void bli_unpackm_blk_var2( obj_t* p, obj_t* c, unpackm_t* cntl ) { num_t dt_cp = bli_obj_datatype( *c ); // Normally we take the parameters from the source argument. But here, // the packm/unpackm framework is not yet solidified enough for us to // assume that at this point struc(P) == struc(C), (ie: since // densification may have marked P's structure as dense when the root // is upper or lower). So, we take the struc field from C, not P. struc_t strucc = bli_obj_struc( *c ); doff_t diagoffc = bli_obj_diag_offset( *c ); diag_t diagc = bli_obj_diag( *c ); uplo_t uploc = bli_obj_uplo( *c ); // Again, normally the trans argument is on the source matrix. But we // know that the packed matrix is not transposed. If there is to be a // transposition, it is because C was originally transposed when packed. // Thus, we query C for the trans status, not P. Also, we only query // the trans status (not the conjugation status), since we probably // don't want to un-conjugate if the original matrix was conjugated // when packed. trans_t transc = bli_obj_onlytrans_status( *c ); dim_t m_c = bli_obj_length( *c ); dim_t n_c = bli_obj_width( *c ); dim_t m_panel = bli_obj_panel_length( *c ); dim_t n_panel = bli_obj_panel_width( *c ); void* buf_p = bli_obj_buffer_at_off( *p ); inc_t rs_p = bli_obj_row_stride( *p ); inc_t cs_p = bli_obj_col_stride( *p ); dim_t pd_p = bli_obj_panel_dim( *p ); inc_t ps_p = bli_obj_panel_stride( *p ); void* buf_c = bli_obj_buffer_at_off( *c ); inc_t rs_c = bli_obj_row_stride( *c ); inc_t cs_c = bli_obj_col_stride( *c ); FUNCPTR_T f; // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_cp]; // Invoke the function. f( strucc, diagoffc, diagc, uploc, transc, m_c, n_c, m_panel, n_panel, buf_p, rs_p, cs_p, pd_p, ps_p, buf_c, rs_c, cs_c ); }
void bli_setid_unb_var1( obj_t* beta, obj_t* x ) { num_t dt_xr = bli_obj_datatype_proj_to_real( *x ); num_t dt_x = bli_obj_datatype( *x ); doff_t diagoffx = bli_obj_diag_offset( *x ); dim_t m = bli_obj_length( *x ); dim_t n = bli_obj_width( *x ); void* buf_beta = bli_obj_buffer_for_1x1( dt_xr, *beta ); void* buf_x = bli_obj_buffer_at_off( *x ); inc_t rs_x = bli_obj_row_stride( *x ); inc_t cs_x = bli_obj_col_stride( *x ); FUNCPTR_T f; // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_x]; // Invoke the function. f( diagoffx, m, n, buf_beta, buf_x, rs_x, cs_x ); }
void bli_norm1m_unb_var1( obj_t* x, obj_t* norm ) { num_t dt_x = bli_obj_datatype( *x ); doff_t diagoffx = bli_obj_diag_offset( *x ); uplo_t diagx = bli_obj_diag( *x ); uplo_t uplox = bli_obj_uplo( *x ); dim_t m = bli_obj_length( *x ); dim_t n = bli_obj_width( *x ); void* buf_x = bli_obj_buffer_at_off( *x ); inc_t rs_x = bli_obj_row_stride( *x ); inc_t cs_x = bli_obj_col_stride( *x ); void* buf_norm = bli_obj_buffer_at_off( *norm ); FUNCPTR_T f; // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_x]; // Invoke the function. f( diagoffx, diagx, uplox, m, n, buf_x, rs_x, cs_x, buf_norm ); }
err_t bli_check_object_diag_offset_equals( obj_t* a, doff_t offset ) { err_t e_val = BLIS_SUCCESS; if ( offset != bli_obj_diag_offset( *a ) ) e_val = BLIS_UNEXPECTED_DIAG_OFFSET; return e_val; }
void bli_packm_unb_var1( obj_t* c, obj_t* p, packm_thrinfo_t* thread ) { num_t dt_cp = bli_obj_datatype( *c ); struc_t strucc = bli_obj_struc( *c ); doff_t diagoffc = bli_obj_diag_offset( *c ); diag_t diagc = bli_obj_diag( *c ); uplo_t uploc = bli_obj_uplo( *c ); trans_t transc = bli_obj_conjtrans_status( *c ); dim_t m_p = bli_obj_length( *p ); dim_t n_p = bli_obj_width( *p ); dim_t m_max_p = bli_obj_padded_length( *p ); dim_t n_max_p = bli_obj_padded_width( *p ); void* buf_c = bli_obj_buffer_at_off( *c ); inc_t rs_c = bli_obj_row_stride( *c ); inc_t cs_c = bli_obj_col_stride( *c ); void* buf_p = bli_obj_buffer_at_off( *p ); inc_t rs_p = bli_obj_row_stride( *p ); inc_t cs_p = bli_obj_col_stride( *p ); void* buf_kappa; FUNCPTR_T f; // This variant assumes that the computational kernel will always apply // the alpha scalar of the higher-level operation. Thus, we use BLIS_ONE // for kappa so that the underlying packm implementation does not scale // during packing. buf_kappa = bli_obj_buffer_for_const( dt_cp, BLIS_ONE ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_cp]; if( thread_am_ochief( thread ) ) { // Invoke the function. f( strucc, diagoffc, diagc, uploc, transc, m_p, n_p, m_max_p, n_max_p, buf_kappa, buf_c, rs_c, cs_c, buf_p, rs_p, cs_p ); } }
void bli_scalm_unb_var1( obj_t* alpha, obj_t* x, cntx_t* cntx ) { num_t dt_x = bli_obj_datatype( *x ); doff_t diagoffx = bli_obj_diag_offset( *x ); uplo_t diagx = bli_obj_diag( *x ); uplo_t uplox = bli_obj_uplo( *x ); dim_t m = bli_obj_length( *x ); dim_t n = bli_obj_width( *x ); void* buf_x = bli_obj_buffer_at_off( *x ); inc_t rs_x = bli_obj_row_stride( *x ); inc_t cs_x = bli_obj_col_stride( *x ); void* buf_alpha; obj_t x_local; FUNCPTR_T f; // Alias x to x_local so we can apply alpha if it is non-unit. bli_obj_alias_to( *x, x_local ); // If alpha is non-unit, apply it to the scalar attached to x. if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { bli_obj_scalar_apply_scalar( alpha, &x_local ); } // Grab the address of the internal scalar buffer for the scalar // attached to x. buf_alpha_x = bli_obj_internal_scalar_buffer( *x ); // Index into the type combination array to extract the correct // function pointer. // NOTE: We use dt_x for both alpha and x because alpha was obtained // from the attached scalar of x, which is guaranteed to be of the // same datatype as x. f = ftypes[dt_x][dt_x]; // Invoke the function. // NOTE: We unconditionally pass in BLIS_NO_CONJUGATE for alpha // because it would have already been conjugated by the front-end. f( BLIS_NO_CONJUGATE, diagoffx, diagx, uplox, m, n, buf_alpha, buf_x, rs_x, cs_x ); }
void bli_trsm_rl_ker_var2( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, trsm_t* cntl ) { num_t dt_exec = bli_obj_execution_datatype( *c ); doff_t diagoffb = bli_obj_diag_offset( *b ); dim_t m = bli_obj_length( *c ); dim_t n = bli_obj_width( *c ); dim_t k = bli_obj_width( *a ); void* buf_a = bli_obj_buffer_at_off( *a ); inc_t rs_a = bli_obj_row_stride( *a ); inc_t cs_a = bli_obj_col_stride( *a ); inc_t ps_a = bli_obj_panel_stride( *a ); void* buf_b = bli_obj_buffer_at_off( *b ); inc_t rs_b = bli_obj_row_stride( *b ); inc_t cs_b = bli_obj_col_stride( *b ); inc_t ps_b = bli_obj_panel_stride( *b ); void* buf_c = bli_obj_buffer_at_off( *c ); inc_t rs_c = bli_obj_row_stride( *c ); inc_t cs_c = bli_obj_col_stride( *c ); num_t dt_alpha; void* buf_alpha; FUNCPTR_T f; // If alpha is a scalar constant, use dt_exec to extract the address of the // corresponding constant value; otherwise, use the datatype encoded // within the alpha object and extract the buffer at the alpha offset. bli_set_scalar_dt_buffer( alpha, dt_exec, dt_alpha, buf_alpha ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. f( diagoffb, m, n, k, buf_alpha, buf_a, rs_a, cs_a, ps_a, buf_b, rs_b, cs_b, ps_b, buf_c, rs_c, cs_c ); }
void bli_obj_print( char* label, obj_t* obj ) { FILE* file = stdout; if ( bli_error_checking_is_enabled() ) bli_obj_print_check( label, obj ); fprintf( file, "\n" ); fprintf( file, "%s\n", label ); fprintf( file, "\n" ); fprintf( file, " m x n %lu x %lu\n", ( unsigned long int )bli_obj_length( *obj ), ( unsigned long int )bli_obj_width( *obj ) ); fprintf( file, "\n" ); fprintf( file, " offm, offn %lu, %lu\n", ( unsigned long int )bli_obj_row_off( *obj ), ( unsigned long int )bli_obj_col_off( *obj ) ); fprintf( file, " diagoff %ld\n", ( signed long int )bli_obj_diag_offset( *obj ) ); fprintf( file, "\n" ); fprintf( file, " buf %p\n", ( void* )bli_obj_buffer( *obj ) ); fprintf( file, " elem size %lu\n", ( unsigned long int )bli_obj_elem_size( *obj ) ); fprintf( file, " rs, cs %ld, %ld\n", ( signed long int )bli_obj_row_stride( *obj ), ( signed long int )bli_obj_col_stride( *obj ) ); fprintf( file, " is %ld\n", ( signed long int )bli_obj_imag_stride( *obj ) ); fprintf( file, " m_padded %lu\n", ( unsigned long int )bli_obj_padded_length( *obj ) ); fprintf( file, " n_padded %lu\n", ( unsigned long int )bli_obj_padded_width( *obj ) ); fprintf( file, " ps %lu\n", ( unsigned long int )bli_obj_panel_stride( *obj ) ); fprintf( file, "\n" ); fprintf( file, " info %lX\n", ( unsigned long int )(*obj).info ); fprintf( file, " - is complex %lu\n", ( unsigned long int )bli_obj_is_complex( *obj ) ); fprintf( file, " - is d. prec %lu\n", ( unsigned long int )bli_obj_is_double_precision( *obj ) ); fprintf( file, " - datatype %lu\n", ( unsigned long int )bli_obj_datatype( *obj ) ); fprintf( file, " - target dt %lu\n", ( unsigned long int )bli_obj_target_datatype( *obj ) ); fprintf( file, " - exec dt %lu\n", ( unsigned long int )bli_obj_execution_datatype( *obj ) ); fprintf( file, " - has trans %lu\n", ( unsigned long int )bli_obj_has_trans( *obj ) ); fprintf( file, " - has conj %lu\n", ( unsigned long int )bli_obj_has_conj( *obj ) ); fprintf( file, " - unit diag? %lu\n", ( unsigned long int )bli_obj_has_unit_diag( *obj ) ); fprintf( file, " - struc type %lu\n", ( unsigned long int )bli_obj_struc( *obj ) >> BLIS_STRUC_SHIFT ); fprintf( file, " - uplo type %lu\n", ( unsigned long int )bli_obj_uplo( *obj ) >> BLIS_UPLO_SHIFT ); fprintf( file, " - is upper %lu\n", ( unsigned long int )bli_obj_is_upper( *obj ) ); fprintf( file, " - is lower %lu\n", ( unsigned long int )bli_obj_is_lower( *obj ) ); fprintf( file, " - is dense %lu\n", ( unsigned long int )bli_obj_is_dense( *obj ) ); fprintf( file, " - pack schema %lu\n", ( unsigned long int )bli_obj_pack_schema( *obj ) >> BLIS_PACK_SCHEMA_SHIFT ); fprintf( file, " - packinv diag? %lu\n", ( unsigned long int )bli_obj_has_inverted_diag( *obj ) ); fprintf( file, " - pack ordifup %lu\n", ( unsigned long int )bli_obj_is_pack_rev_if_upper( *obj ) ); fprintf( file, " - pack ordiflo %lu\n", ( unsigned long int )bli_obj_is_pack_rev_if_lower( *obj ) ); fprintf( file, " - packbuf type %lu\n", ( unsigned long int )bli_obj_pack_buffer_type( *obj ) >> BLIS_PACK_BUFFER_SHIFT ); fprintf( file, "\n" ); }
siz_t bli_thread_get_range_weighted_b2t ( thrinfo_t* thr, obj_t* a, blksz_t* bmult, dim_t* start, dim_t* end ) { siz_t area; // This function assigns area-weighted ranges in the m dimension // where the total range spans 0 to m-1 with 0 at the bottom end and // m-1 at the top end. if ( bli_obj_intersects_diag( *a ) && bli_obj_is_upper_or_lower( *a ) ) { doff_t diagoff = bli_obj_diag_offset( *a ); uplo_t uplo = bli_obj_uplo( *a ); dim_t m = bli_obj_length( *a ); dim_t n = bli_obj_width( *a ); dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); // Support implicit transposition. if ( bli_obj_has_trans( *a ) ) { bli_reflect_about_diag( diagoff, uplo, m, n ); } bli_reflect_about_diag( diagoff, uplo, m, n ); bli_rotate180_trapezoid( diagoff, uplo ); area = bli_thread_get_range_weighted_sub ( thr, diagoff, uplo, m, n, bf, TRUE, start, end ); } else // if dense or zeros { area = bli_thread_get_range_b2t ( thr, a, bmult, start, end ); } return area; }
void bli_axpym_unb_var1( obj_t* alpha, obj_t* x, obj_t* y, cntx_t* cntx ) { num_t dt_x = bli_obj_datatype( *x ); num_t dt_y = bli_obj_datatype( *y ); doff_t diagoffx = bli_obj_diag_offset( *x ); diag_t diagx = bli_obj_diag( *x ); uplo_t uplox = bli_obj_uplo( *x ); trans_t transx = bli_obj_conjtrans_status( *x ); dim_t m = bli_obj_length( *y ); dim_t n = bli_obj_width( *y ); inc_t rs_x = bli_obj_row_stride( *x ); inc_t cs_x = bli_obj_col_stride( *x ); void* buf_x = bli_obj_buffer_at_off( *x ); inc_t rs_y = bli_obj_row_stride( *y ); inc_t cs_y = bli_obj_col_stride( *y ); void* buf_y = bli_obj_buffer_at_off( *y ); num_t dt_alpha; void* buf_alpha; FUNCPTR_T f; // If alpha is a scalar constant, use dt_x to extract the address of the // corresponding constant value; otherwise, use the datatype encoded // within the alpha object and extract the buffer at the alpha offset. bli_set_scalar_dt_buffer( alpha, dt_x, dt_alpha, buf_alpha ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_alpha][dt_x][dt_y]; // Invoke the function. f( diagoffx, diagx, uplox, transx, m, n, buf_alpha, buf_x, rs_x, cs_x, buf_y, rs_y, cs_y ); }
void bli_unpackm_unb_var1 ( obj_t* p, obj_t* c, cntx_t* cntx, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_pc = bli_obj_datatype( *p ); doff_t diagoffp = bli_obj_diag_offset( *p ); uplo_t uplop = bli_obj_uplo( *p ); trans_t transc = bli_obj_onlytrans_status( *c ); dim_t m_c = bli_obj_length( *c ); dim_t n_c = bli_obj_width( *c ); void* buf_p = bli_obj_buffer_at_off( *p ); inc_t rs_p = bli_obj_row_stride( *p ); inc_t cs_p = bli_obj_col_stride( *p ); void* buf_c = bli_obj_buffer_at_off( *c ); inc_t rs_c = bli_obj_row_stride( *c ); inc_t cs_c = bli_obj_col_stride( *c ); FUNCPTR_T f; // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_pc]; // Invoke the function. f( diagoffp, uplop, transc, m_c, n_c, buf_p, rs_p, cs_p, buf_c, rs_c, cs_c, cntx ); }
void bli_addm_unb_var1( obj_t* x, obj_t* y, cntx_t* cntx ) { num_t dt_x = bli_obj_datatype( *x ); num_t dt_y = bli_obj_datatype( *y ); doff_t diagoffx = bli_obj_diag_offset( *x ); diag_t diagx = bli_obj_diag( *x ); uplo_t uplox = bli_obj_uplo( *x ); trans_t transx = bli_obj_conjtrans_status( *x ); dim_t m = bli_obj_length( *y ); dim_t n = bli_obj_width( *y ); inc_t rs_x = bli_obj_row_stride( *x ); inc_t cs_x = bli_obj_col_stride( *x ); void* buf_x = bli_obj_buffer_at_off( *x ); inc_t rs_y = bli_obj_row_stride( *y ); inc_t cs_y = bli_obj_col_stride( *y ); void* buf_y = bli_obj_buffer_at_off( *y ); FUNCPTR_T f; // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_x][dt_y]; // Invoke the function. f( diagoffx, diagx, uplox, transx, m, n, buf_x, rs_x, cs_x, buf_y, rs_y, cs_y ); }
void bli_scald_unb_var1( obj_t* beta, obj_t* x ) { num_t dt_x = bli_obj_datatype( *x ); conj_t conjbeta = bli_obj_conj_status( *beta ); doff_t diagoffx = bli_obj_diag_offset( *x ); dim_t m = bli_obj_length( *x ); dim_t n = bli_obj_width( *x ); void* buf_x = bli_obj_buffer_at_off( *x ); inc_t rs_x = bli_obj_row_stride( *x ); inc_t cs_x = bli_obj_col_stride( *x ); void* buf_beta; num_t dt_beta; FUNCPTR_T f; // If beta is a scalar constant, use dt_x to extract the address of the // corresponding constant value; otherwise, use the datatype encoded // within the beta object and extract the buffer at the beta offset. bli_set_scalar_dt_buffer( beta, dt_x, dt_beta, buf_beta ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_beta][dt_x]; // Invoke the function. f( conjbeta, diagoffx, m, n, buf_beta, buf_x, rs_x, cs_x ); }
void bli_trsm_rl_ker_var2( obj_t* a, obj_t* b, obj_t* c, trsm_t* cntl, trsm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); doff_t diagoffb = bli_obj_diag_offset( *b ); pack_t schema_a = bli_obj_pack_schema( *a ); pack_t schema_b = bli_obj_pack_schema( *b ); dim_t m = bli_obj_length( *c ); dim_t n = bli_obj_width( *c ); dim_t k = bli_obj_width( *a ); void* buf_a = bli_obj_buffer_at_off( *a ); inc_t cs_a = bli_obj_col_stride( *a ); dim_t pd_a = bli_obj_panel_dim( *a ); inc_t ps_a = bli_obj_panel_stride( *a ); void* buf_b = bli_obj_buffer_at_off( *b ); inc_t rs_b = bli_obj_row_stride( *b ); dim_t pd_b = bli_obj_panel_dim( *b ); inc_t ps_b = bli_obj_panel_stride( *b ); void* buf_c = bli_obj_buffer_at_off( *c ); inc_t rs_c = bli_obj_row_stride( *c ); inc_t cs_c = bli_obj_col_stride( *c ); void* buf_alpha1; void* buf_alpha2; FUNCPTR_T f; func_t* gemmtrsm_ukrs; func_t* gemm_ukrs; void* gemmtrsm_ukr; void* gemm_ukr; // Grab the address of the internal scalar buffer for the scalar // attached to A. This will be the alpha scalar used in the gemmtrsm // subproblems (ie: the scalar that would be applied to the packed // copy of A prior to it being updated by the trsm subproblem). This // scalar may be unit, if for example it was applied during packing. buf_alpha1 = bli_obj_internal_scalar_buffer( *a ); // Grab the address of the internal scalar buffer for the scalar // attached to C. This will be the "beta" scalar used in the gemm-only // subproblems that correspond to micro-panels that do not intersect // the diagonal. We need this separate scalar because it's possible // that the alpha attached to B was reset, if it was applied during // packing. buf_alpha2 = bli_obj_internal_scalar_buffer( *c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Extract from the control tree node the func_t objects containing // the gemmtrsm and gemm micro-kernel function addresses, and then // query the function addresses corresponding to the current datatype. gemmtrsm_ukrs = cntl_gemmtrsm_u_ukrs( cntl ); gemm_ukrs = cntl_gemm_ukrs( cntl ); gemmtrsm_ukr = bli_func_obj_query( dt_exec, gemmtrsm_ukrs ); gemm_ukr = bli_func_obj_query( dt_exec, gemm_ukrs ); // Invoke the function. f( diagoffb, schema_a, schema_b, m, n, k, buf_alpha1, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_alpha2, buf_c, rs_c, cs_c, gemmtrsm_ukr, gemm_ukr, thread ); }
void bli_trsm_ru_ker_var2( obj_t* a, obj_t* b, obj_t* c, trsm_t* cntl ) { num_t dt_exec = bli_obj_execution_datatype( *c ); doff_t diagoffb = bli_obj_diag_offset( *b ); dim_t m = bli_obj_length( *c ); dim_t n = bli_obj_width( *c ); dim_t k = bli_obj_width( *a ); void* buf_a = bli_obj_buffer_at_off( *a ); inc_t cs_a = bli_obj_col_stride( *a ); inc_t pd_a = bli_obj_panel_dim( *a ); inc_t ps_a = bli_obj_panel_stride( *a ); void* buf_b = bli_obj_buffer_at_off( *b ); inc_t rs_b = bli_obj_row_stride( *b ); inc_t pd_b = bli_obj_panel_dim( *b ); inc_t ps_b = bli_obj_panel_stride( *b ); void* buf_c = bli_obj_buffer_at_off( *c ); inc_t rs_c = bli_obj_row_stride( *c ); inc_t cs_c = bli_obj_col_stride( *c ); void* buf_alpha1; void* buf_alpha2; FUNCPTR_T f; func_t* gemmtrsm_ukrs; func_t* gemm_ukrs; void* gemmtrsm_ukr; void* gemm_ukr; // Grab the address of the internal scalar buffer for the scalar // attached to A. This will be the alpha scalar used in the gemmtrsm // subproblems (ie: the scalar that would be applied to the packed // copy of A prior to it being updated by the trsm subproblem). This // scalar may be unit, if for example it was applied during packing. buf_alpha1 = bli_obj_internal_scalar_buffer( *a ); // Grab the address of the internal scalar buffer for the scalar // attached to C. This will be the "beta" scalar used in the gemm-only // subproblems that correspond to micro-panels that do not intersect // the diagonal. We need this separate scalar because it's possible // that the alpha attached to B was reset, if it was applied during // packing. buf_alpha2 = bli_obj_internal_scalar_buffer( *c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Adjust cs_a and rs_b if A and B were packed for 4m or 3m. This // is needed because cs_a and rs_b are used to index into the // micro-panels of A and B, respectively, and since the pointer // types in the macro-kernel (scomplex or dcomplex) will result // in pointer arithmetic that moves twice as far as it should, // given the datatypes actually stored (float or double), we must // halve the strides to compensate. if ( bli_obj_is_panel_packed_4m( *a ) || bli_obj_is_panel_packed_3m( *a ) ) { cs_a /= 2; rs_b /= 2; } // Extract from the control tree node the func_t objects containing // the gemmtrsm and gemm micro-kernel function addresses, and then // query the function addresses corresponding to the current datatype. gemmtrsm_ukrs = cntl_gemmtrsm_l_ukrs( cntl ); gemm_ukrs = cntl_gemm_ukrs( cntl ); gemmtrsm_ukr = bli_func_obj_query( dt_exec, gemmtrsm_ukrs ); gemm_ukr = bli_func_obj_query( dt_exec, gemm_ukrs ); // Invoke the function. f( diagoffb, m, n, k, buf_alpha1, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_alpha2, buf_c, rs_c, cs_c, gemmtrsm_ukr, gemm_ukr ); }
void bli_herk_l_ker_var2( obj_t* a, obj_t* b, obj_t* c, gemm_t* cntl, herk_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); doff_t diagoffc = bli_obj_diag_offset( *c ); pack_t schema_a = bli_obj_pack_schema( *a ); pack_t schema_b = bli_obj_pack_schema( *b ); dim_t m = bli_obj_length( *c ); dim_t n = bli_obj_width( *c ); dim_t k = bli_obj_width( *a ); void* buf_a = bli_obj_buffer_at_off( *a ); inc_t cs_a = bli_obj_col_stride( *a ); inc_t pd_a = bli_obj_panel_dim( *a ); inc_t ps_a = bli_obj_panel_stride( *a ); void* buf_b = bli_obj_buffer_at_off( *b ); inc_t rs_b = bli_obj_row_stride( *b ); inc_t pd_b = bli_obj_panel_dim( *b ); inc_t ps_b = bli_obj_panel_stride( *b ); void* buf_c = bli_obj_buffer_at_off( *c ); inc_t rs_c = bli_obj_row_stride( *c ); inc_t cs_c = bli_obj_col_stride( *c ); obj_t scalar_a; obj_t scalar_b; void* buf_alpha; void* buf_beta; FUNCPTR_T f; func_t* gemm_ukrs; void* gemm_ukr; // Detach and multiply the scalars attached to A and B. bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. buf_alpha = bli_obj_internal_scalar_buffer( scalar_b ); buf_beta = bli_obj_internal_scalar_buffer( *c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Extract from the control tree node the func_t object containing // the gemm micro-kernel function addresses, and then query the // function address corresponding to the current datatype. gemm_ukrs = cntl_gemm_ukrs( cntl ); gemm_ukr = bli_func_obj_query( dt_exec, gemm_ukrs ); // Invoke the function. f( diagoffc, schema_a, schema_b, m, n, k, buf_alpha, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, gemm_ukr, thread ); }
void bli_herk_u_ker_var2 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_exec_dt( c ); doff_t diagoffc = bli_obj_diag_offset( c ); pack_t schema_a = bli_obj_pack_schema( a ); pack_t schema_b = bli_obj_pack_schema( b ); dim_t m = bli_obj_length( c ); dim_t n = bli_obj_width( c ); dim_t k = bli_obj_width( a ); void* buf_a = bli_obj_buffer_at_off( a ); inc_t cs_a = bli_obj_col_stride( a ); inc_t is_a = bli_obj_imag_stride( a ); dim_t pd_a = bli_obj_panel_dim( a ); inc_t ps_a = bli_obj_panel_stride( a ); void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b = bli_obj_row_stride( b ); inc_t is_b = bli_obj_imag_stride( b ); dim_t pd_b = bli_obj_panel_dim( b ); inc_t ps_b = bli_obj_panel_stride( b ); void* buf_c = bli_obj_buffer_at_off( c ); inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); obj_t scalar_a; obj_t scalar_b; void* buf_alpha; void* buf_beta; FUNCPTR_T f; // Detach and multiply the scalars attached to A and B. bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); buf_beta = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. f( diagoffc, schema_a, schema_b, m, n, k, buf_alpha, buf_a, cs_a, is_a, pd_a, ps_a, buf_b, rs_b, is_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, cntx, thread ); }
void bli_trsm_rl_ker_var2 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); doff_t diagoffb = bli_obj_diag_offset( *b ); pack_t schema_a = bli_obj_pack_schema( *a ); pack_t schema_b = bli_obj_pack_schema( *b ); dim_t m = bli_obj_length( *c ); dim_t n = bli_obj_width( *c ); dim_t k = bli_obj_width( *a ); void* buf_a = bli_obj_buffer_at_off( *a ); inc_t cs_a = bli_obj_col_stride( *a ); dim_t pd_a = bli_obj_panel_dim( *a ); inc_t ps_a = bli_obj_panel_stride( *a ); void* buf_b = bli_obj_buffer_at_off( *b ); inc_t rs_b = bli_obj_row_stride( *b ); dim_t pd_b = bli_obj_panel_dim( *b ); inc_t ps_b = bli_obj_panel_stride( *b ); void* buf_c = bli_obj_buffer_at_off( *c ); inc_t rs_c = bli_obj_row_stride( *c ); inc_t cs_c = bli_obj_col_stride( *c ); void* buf_alpha1; void* buf_alpha2; FUNCPTR_T f; // Grab the address of the internal scalar buffer for the scalar // attached to A (the non-triangular matrix). This will be the alpha // scalar used in the gemmtrsm subproblems (ie: the scalar that would // be applied to the packed copy of A prior to it being updated by // the trsm subproblem). This scalar may be unit, if for example it // was applied during packing. buf_alpha1 = bli_obj_internal_scalar_buffer( *a ); // Grab the address of the internal scalar buffer for the scalar // attached to C. This will be the "beta" scalar used in the gemm-only // subproblems that correspond to micro-panels that do not intersect // the diagonal. We need this separate scalar because it's possible // that the alpha attached to B was reset, if it was applied during // packing. buf_alpha2 = bli_obj_internal_scalar_buffer( *c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. f( diagoffb, schema_a, schema_b, m, n, k, buf_alpha1, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_alpha2, buf_c, rs_c, cs_c, cntx, thread ); }
void bli_packm_blk_var1( obj_t* c, obj_t* p, packm_thrinfo_t* t ) { num_t dt_cp = bli_obj_datatype( *c ); struc_t strucc = bli_obj_struc( *c ); doff_t diagoffc = bli_obj_diag_offset( *c ); diag_t diagc = bli_obj_diag( *c ); uplo_t uploc = bli_obj_uplo( *c ); trans_t transc = bli_obj_conjtrans_status( *c ); pack_t schema = bli_obj_pack_schema( *p ); bool_t invdiag = bli_obj_has_inverted_diag( *p ); bool_t revifup = bli_obj_is_pack_rev_if_upper( *p ); bool_t reviflo = bli_obj_is_pack_rev_if_lower( *p ); dim_t m_p = bli_obj_length( *p ); dim_t n_p = bli_obj_width( *p ); dim_t m_max_p = bli_obj_padded_length( *p ); dim_t n_max_p = bli_obj_padded_width( *p ); void* buf_c = bli_obj_buffer_at_off( *c ); inc_t rs_c = bli_obj_row_stride( *c ); inc_t cs_c = bli_obj_col_stride( *c ); void* buf_p = bli_obj_buffer_at_off( *p ); inc_t rs_p = bli_obj_row_stride( *p ); inc_t cs_p = bli_obj_col_stride( *p ); inc_t is_p = bli_obj_imag_stride( *p ); dim_t pd_p = bli_obj_panel_dim( *p ); inc_t ps_p = bli_obj_panel_stride( *p ); obj_t kappa; /*---initialize pointer to stop gcc complaining 2-9-16 GH --- */ obj_t* kappa_p = {0}; void* buf_kappa; func_t* packm_kers; void* packm_ker; FUNCPTR_T f; // Treatment of kappa (ie: packing during scaling) depends on // whether we are executing an induced method. if ( bli_is_ind_packed( schema ) ) { // The value for kappa we use will depend on whether the scalar // attached to A has a nonzero imaginary component. If it does, // then we will apply the scalar during packing to facilitate // implementing induced complex domain algorithms in terms of // real domain micro-kernels. (In the aforementioned situation, // applying a real scalar is easy, but applying a complex one is // harder, so we avoid the need altogether with the code below.) if( thread_am_ochief( t ) ) { if ( bli_obj_scalar_has_nonzero_imag( p ) ) { // Detach the scalar. bli_obj_scalar_detach( p, &kappa ); // Reset the attached scalar (to 1.0). bli_obj_scalar_reset( p ); kappa_p = κ } else { // If the internal scalar of A has only a real component, then // we will apply it later (in the micro-kernel), and so we will // use BLIS_ONE to indicate no scaling during packing. kappa_p = &BLIS_ONE; } } kappa_p = thread_obroadcast( t, kappa_p ); // Acquire the buffer to the kappa chosen above. buf_kappa = bli_obj_buffer_for_1x1( dt_cp, *kappa_p ); } else // if ( bli_is_nat_packed( schema ) ) { // This branch if for native execution, where we assume that // the micro-kernel will always apply the alpha scalar of the // higher-level operation. Thus, we use BLIS_ONE for kappa so // that the underlying packm implementation does not perform // any scaling during packing. buf_kappa = bli_obj_buffer_for_const( dt_cp, BLIS_ONE ); } // Choose the correct func_t object based on the pack_t schema. if ( bli_is_4mi_packed( schema ) ) packm_kers = packm_struc_cxk_4mi_kers; else if ( bli_is_3mi_packed( schema ) || bli_is_3ms_packed( schema ) ) packm_kers = packm_struc_cxk_3mis_kers; else if ( bli_is_ro_packed( schema ) || bli_is_io_packed( schema ) || bli_is_rpi_packed( schema ) ) packm_kers = packm_struc_cxk_rih_kers; else packm_kers = packm_struc_cxk_kers; // Query the datatype-specific function pointer from the func_t object. packm_ker = bli_func_obj_query( dt_cp, packm_kers ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_cp]; // Invoke the function. f( strucc, diagoffc, diagc, uploc, transc, schema, invdiag, revifup, reviflo, m_p, n_p, m_max_p, n_max_p, buf_kappa, buf_c, rs_c, cs_c, buf_p, rs_p, cs_p, is_p, pd_p, ps_p, packm_ker, t ); }
void bli_trmm_ru_ker_var2( obj_t* a, obj_t* b, obj_t* c, trmm_t* cntl, trmm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); doff_t diagoffb = bli_obj_diag_offset( *b ); dim_t m = bli_obj_length( *c ); dim_t n = bli_obj_width( *c ); dim_t k = bli_obj_width( *a ); void* buf_a = bli_obj_buffer_at_off( *a ); inc_t cs_a = bli_obj_col_stride( *a ); inc_t pd_a = bli_obj_panel_dim( *a ); inc_t ps_a = bli_obj_panel_stride( *a ); void* buf_b = bli_obj_buffer_at_off( *b ); inc_t rs_b = bli_obj_row_stride( *b ); inc_t pd_b = bli_obj_panel_dim( *b ); inc_t ps_b = bli_obj_panel_stride( *b ); void* buf_c = bli_obj_buffer_at_off( *c ); inc_t rs_c = bli_obj_row_stride( *c ); inc_t cs_c = bli_obj_col_stride( *c ); obj_t scalar_a; obj_t scalar_b; void* buf_alpha; void* buf_beta; FUNCPTR_T f; func_t* gemm_ukrs; void* gemm_ukr; // Detach and multiply the scalars attached to A and B. bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. buf_alpha = bli_obj_internal_scalar_buffer( scalar_b ); buf_beta = bli_obj_internal_scalar_buffer( *c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Adjust cs_a and rs_b if A and B were packed for 4m or 3m. This // is needed because cs_a and rs_b are used to index into the // micro-panels of A and B, respectively, and since the pointer // types in the macro-kernel (scomplex or dcomplex) will result // in pointer arithmetic that moves twice as far as it should, // given the datatypes actually stored (float or double), we must // halve the strides to compensate. if ( bli_obj_is_panel_packed_4m( *a ) || bli_obj_is_panel_packed_3m( *a ) ) { cs_a /= 2; rs_b /= 2; } // Extract from the control tree node the func_t object containing // the gemm micro-kernel function addresses, and then query the // function address corresponding to the current datatype. gemm_ukrs = cntl_gemm_ukrs( cntl ); gemm_ukr = bli_func_obj_query( dt_exec, gemm_ukrs ); // Invoke the function. f( diagoffb, m, n, k, buf_alpha, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, gemm_ukr, thread ); }
void libblis_test_randm_check( obj_t* x, double* resid ) { doff_t diagoffx = bli_obj_diag_offset( *x ); uplo_t uplox = bli_obj_uplo( *x ); dim_t m_x = bli_obj_length( *x ); dim_t n_x = bli_obj_width( *x ); inc_t rs_x = bli_obj_row_stride( *x ); inc_t cs_x = bli_obj_col_stride( *x ); void* buf_x = bli_obj_buffer_at_off( *x ); *resid = 0.0; // // The two most likely ways that randm would fail is if all elements // were zero, or if all elements were greater than or equal to one. // We check both of these conditions by computing the sum of the // absolute values of the elements of x. // if ( bli_obj_is_float( *x ) ) { float sum_x; bli_sabsumm( diagoffx, uplox, m_x, n_x, buf_x, rs_x, cs_x, &sum_x ); if ( sum_x == *bli_s0 ) *resid = 1.0; else if ( sum_x >= 1.0 * m_x * n_x ) *resid = 2.0; } else if ( bli_obj_is_double( *x ) ) { double sum_x; bli_dabsumm( diagoffx, uplox, m_x, n_x, buf_x, rs_x, cs_x, &sum_x ); if ( sum_x == *bli_d0 ) *resid = 1.0; else if ( sum_x >= 1.0 * m_x * n_x ) *resid = 2.0; } else if ( bli_obj_is_scomplex( *x ) ) { float sum_x; bli_cabsumm( diagoffx, uplox, m_x, n_x, buf_x, rs_x, cs_x, &sum_x ); if ( sum_x == *bli_s0 ) *resid = 1.0; else if ( sum_x >= 2.0 * m_x * n_x ) *resid = 2.0; } else // if ( bli_obj_is_dcomplex( *x ) ) { double sum_x; bli_zabsumm( diagoffx, uplox, m_x, n_x, buf_x, rs_x, cs_x, &sum_x ); if ( sum_x == *bli_d0 ) *resid = 1.0; else if ( sum_x >= 2.0 * m_x * n_x ) *resid = 2.0; } }
void bli_trmm_rl_ker_var2( obj_t* a, obj_t* b, obj_t* c, trmm_t* cntl ) { num_t dt_exec = bli_obj_execution_datatype( *c ); doff_t diagoffb = bli_obj_diag_offset( *b ); dim_t m = bli_obj_length( *c ); dim_t n = bli_obj_width( *c ); dim_t k = bli_obj_width( *a ); void* buf_a = bli_obj_buffer_at_off( *a ); inc_t rs_a = bli_obj_row_stride( *a ); inc_t cs_a = bli_obj_col_stride( *a ); inc_t ps_a = bli_obj_panel_stride( *a ); void* buf_b = bli_obj_buffer_at_off( *b ); inc_t rs_b = bli_obj_row_stride( *b ); inc_t cs_b = bli_obj_col_stride( *b ); inc_t ps_b = bli_obj_panel_stride( *b ); void* buf_c = bli_obj_buffer_at_off( *c ); inc_t rs_c = bli_obj_row_stride( *c ); inc_t cs_c = bli_obj_col_stride( *c ); obj_t scalar_a; obj_t scalar_b; void* buf_alpha; void* buf_beta; FUNCPTR_T f; // Detach and multiply the scalars attached to A and B. bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. buf_alpha = bli_obj_internal_scalar_buffer( scalar_b ); buf_beta = bli_obj_internal_scalar_buffer( *c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. f( diagoffb, m, n, k, buf_alpha, buf_a, rs_a, cs_a, ps_a, buf_b, rs_b, cs_b, ps_b, buf_beta, buf_c, rs_c, cs_c ); }