// Object-based front end for unpackm, blocked variant 2.
//
// Reads the structure, stride and panel fields from the packed object p and
// the destination object c, then calls the implementation stored in ftypes
// for the datatype of c. The cntl argument is not referenced here.
void bli_unpackm_blk_var2( obj_t*     p,
                           obj_t*     c,
                           unpackm_t* cntl )
{
    // Packed source P: buffer address, strides and panel geometry.
    void*   p_buf   = bli_obj_buffer_at_off( *p );
    inc_t   p_rs    = bli_obj_row_stride( *p );
    inc_t   p_cs    = bli_obj_col_stride( *p );
    dim_t   p_pd    = bli_obj_panel_dim( *p );
    inc_t   p_ps    = bli_obj_panel_stride( *p );

    // Destination C: buffer address, strides, dimensions and panel
    // dimensions.
    void*   c_buf   = bli_obj_buffer_at_off( *c );
    inc_t   c_rs    = bli_obj_row_stride( *c );
    inc_t   c_cs    = bli_obj_col_stride( *c );
    dim_t   c_m     = bli_obj_length( *c );
    dim_t   c_n     = bli_obj_width( *c );
    dim_t   c_mpan  = bli_obj_panel_length( *c );
    dim_t   c_npan  = bli_obj_panel_width( *c );

    // Such parameters would ordinarily come from the source operand (P).
    // The packm/unpackm framework does not yet guarantee that
    // struc(P) == struc(C) at this point (densification may have marked
    // P as dense when the root is upper or lower), so the structure
    // fields are read from C instead.
    struc_t c_struc = bli_obj_struc( *c );
    doff_t  c_doff  = bli_obj_diag_offset( *c );
    diag_t  c_diag  = bli_obj_diag( *c );
    uplo_t  c_uplo  = bli_obj_uplo( *c );

    // Likewise the trans argument would ordinarily belong to the source,
    // but the packed matrix is known not to be transposed; any
    // transposition stems from C having been transposed when it was
    // packed. Only the transposition bit is queried (not conjugation),
    // since un-conjugating a matrix that was conjugated during packing is
    // probably not wanted.
    trans_t c_trans = bli_obj_onlytrans_status( *c );

    // Select the implementation for C's datatype and run it.
    FUNCPTR_T unpack_fp = ftypes[ bli_obj_datatype( *c ) ];

    unpack_fp( c_struc, c_doff, c_diag, c_uplo, c_trans,
               c_m, c_n, c_mpan, c_npan,
               p_buf, p_rs, p_cs, p_pd, p_ps,
               c_buf, c_rs, c_cs );
}
// Object-based front end for packm, unblocked variant 1.
//
// Gathers the properties of the source c and the pack destination p and
// dispatches to the implementation stored in ftypes for c's datatype. The
// call is made only when thread_am_ochief( thread ) holds.
void bli_packm_unb_var1( obj_t*           c,
                         obj_t*           p,
                         packm_thrinfo_t* thread )
{
    num_t   dt       = bli_obj_datatype( *c );

    // Source matrix C: structure, transposition/conjugation, storage.
    struc_t c_struc  = bli_obj_struc( *c );
    doff_t  c_doff   = bli_obj_diag_offset( *c );
    diag_t  c_diag   = bli_obj_diag( *c );
    uplo_t  c_uplo   = bli_obj_uplo( *c );
    trans_t c_trans  = bli_obj_conjtrans_status( *c );
    void*   c_buf    = bli_obj_buffer_at_off( *c );
    inc_t   c_rs     = bli_obj_row_stride( *c );
    inc_t   c_cs     = bli_obj_col_stride( *c );

    // Destination P: dimensions, padded dimensions, storage.
    dim_t   p_m      = bli_obj_length( *p );
    dim_t   p_n      = bli_obj_width( *p );
    dim_t   p_m_max  = bli_obj_padded_length( *p );
    dim_t   p_n_max  = bli_obj_padded_width( *p );
    void*   p_buf    = bli_obj_buffer_at_off( *p );
    inc_t   p_rs     = bli_obj_row_stride( *p );
    inc_t   p_cs     = bli_obj_col_stride( *p );

    // The computational kernel is assumed to always apply the alpha
    // scalar of the higher-level operation, so kappa is BLIS_ONE and the
    // underlying packm implementation performs no scaling while packing.
    void*   kappa_buf = bli_obj_buffer_for_const( dt, BLIS_ONE );

    // Look up the implementation for this datatype.
    FUNCPTR_T pack_fp = ftypes[dt];

    if ( thread_am_ochief( thread ) )
    {
        pack_fp( c_struc, c_doff, c_diag, c_uplo, c_trans,
                 p_m, p_n, p_m_max, p_n_max,
                 kappa_buf,
                 c_buf, c_rs, c_cs,
                 p_buf, p_rs, p_cs );
    }
}
// Converts a row/column offset (offmn) into an element offset to the
// corresponding panel of the packed object p, according to p's pack schema.
//
// Returns the computed element offset. For the two panel schemas the
// function aborts when offmn is not a multiple of the panel dimension. For
// any other schema it reports BLIS_NOT_YET_IMPLEMENTED through
// bli_check_error_code() and returns 0.
dim_t bli_packm_offset_to_panel_for( dim_t offmn, obj_t* p )
{
    pack_t schema    = bli_obj_pack_schema( *p );
    dim_t  panel_off;

    if ( schema == BLIS_PACKED_ROWS )
    {
        // "Packed rows": each row acts as its own row panel, so the row
        // offset is the panel index and the row stride plays the role of
        // the panel stride.
        panel_off = offmn * bli_obj_row_stride( *p );
    }
    else if ( schema == BLIS_PACKED_COLUMNS )
    {
        // "Packed columns": each column acts as its own column panel, so
        // the column offset is the panel index and the column stride
        // plays the role of the panel stride.
        panel_off = offmn * bli_obj_col_stride( *p );
    }
    else if ( schema == BLIS_PACKED_ROW_PANELS )
    {
        // "Packed row panels": the column stride equals the panel
        // dimension (length). Dividing the row offset by it yields the
        // panel index; scaling by the panel stride yields the offset in
        // elements.
        panel_off = offmn / bli_obj_col_stride( *p );
        panel_off = panel_off * bli_obj_panel_stride( *p );

        // Sanity check: the offset must land on a panel boundary.
        if ( offmn % bli_obj_col_stride( *p ) > 0 )
        {
            bli_abort();
        }
    }
    else if ( schema == BLIS_PACKED_COL_PANELS )
    {
        // "Packed column panels": the row stride equals the panel
        // dimension (width). Dividing the column offset by it yields the
        // panel index; scaling by the panel stride yields the offset in
        // elements.
        panel_off = offmn / bli_obj_row_stride( *p );
        panel_off = panel_off * bli_obj_panel_stride( *p );

        // Sanity check: the offset must land on a panel boundary.
        if ( offmn % bli_obj_row_stride( *p ) > 0 )
        {
            bli_abort();
        }
    }
    else
    {
        // Schema not handled here.
        panel_off = 0;
        bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
    }

    return panel_off;
}
// Object-based front end for the trsm "rl" macro-kernel, variant 2.
//
// Extracts dimensions, buffers and strides from a, b and c, resolves the
// alpha scalar buffer, and dispatches through ftypes using the execution
// datatype of c. The beta and cntl arguments are not referenced here.
void bli_trsm_rl_ker_var2( obj_t*  alpha,
                           obj_t*  a,
                           obj_t*  b,
                           obj_t*  beta,
                           obj_t*  c,
                           trsm_t* cntl )
{
    num_t  exec_dt = bli_obj_execution_datatype( *c );

    // Problem dimensions: m and n from C, k from the width of A.
    dim_t  m_dim   = bli_obj_length( *c );
    dim_t  n_dim   = bli_obj_width( *c );
    dim_t  k_dim   = bli_obj_width( *a );

    // Diagonal offset is taken from B.
    doff_t b_doff  = bli_obj_diag_offset( *b );

    // Operand A.
    void*  a_buf   = bli_obj_buffer_at_off( *a );
    inc_t  a_rs    = bli_obj_row_stride( *a );
    inc_t  a_cs    = bli_obj_col_stride( *a );
    inc_t  a_ps    = bli_obj_panel_stride( *a );

    // Operand B.
    void*  b_buf   = bli_obj_buffer_at_off( *b );
    inc_t  b_rs    = bli_obj_row_stride( *b );
    inc_t  b_cs    = bli_obj_col_stride( *b );
    inc_t  b_ps    = bli_obj_panel_stride( *b );

    // Operand C.
    void*  c_buf   = bli_obj_buffer_at_off( *c );
    inc_t  c_rs    = bli_obj_row_stride( *c );
    inc_t  c_cs    = bli_obj_col_stride( *c );

    // When alpha is a scalar constant, exec_dt selects the address of the
    // matching constant value; otherwise alpha's own datatype is used and
    // the buffer is taken at alpha's offset.
    num_t  alpha_dt;
    void*  alpha_buf;
    bli_set_scalar_dt_buffer( alpha, exec_dt, alpha_dt, alpha_buf );

    // Dispatch on the execution datatype.
    FUNCPTR_T ker_fp = ftypes[exec_dt];

    ker_fp( b_doff,
            m_dim, n_dim, k_dim,
            alpha_buf,
            a_buf, a_rs, a_cs, a_ps,
            b_buf, b_rs, b_cs, b_ps,
            c_buf, c_rs, c_cs );
}
// Object-based front end for axpym, unblocked variant 1.
//
// Collects the structure and storage fields of x and y, resolves alpha's
// datatype and buffer, and dispatches through ftypes indexed by the
// (alpha, x, y) datatype triple. cntx is not referenced here.
void bli_axpym_unb_var1( obj_t*  alpha,
                         obj_t*  x,
                         obj_t*  y,
                         cntx_t* cntx )
{
    // Datatypes of the two matrix operands.
    num_t   x_dt    = bli_obj_datatype( *x );
    num_t   y_dt    = bli_obj_datatype( *y );

    // Structure and transposition/conjugation of x.
    doff_t  x_doff  = bli_obj_diag_offset( *x );
    diag_t  x_diag  = bli_obj_diag( *x );
    uplo_t  x_uplo  = bli_obj_uplo( *x );
    trans_t x_trans = bli_obj_conjtrans_status( *x );

    // Dimensions are those of y.
    dim_t   rows    = bli_obj_length( *y );
    dim_t   cols    = bli_obj_width( *y );

    // Storage of x.
    void*   x_buf   = bli_obj_buffer_at_off( *x );
    inc_t   x_rs    = bli_obj_row_stride( *x );
    inc_t   x_cs    = bli_obj_col_stride( *x );

    // Storage of y.
    void*   y_buf   = bli_obj_buffer_at_off( *y );
    inc_t   y_rs    = bli_obj_row_stride( *y );
    inc_t   y_cs    = bli_obj_col_stride( *y );

    // For a scalar-constant alpha, x_dt picks the address of the matching
    // constant value; otherwise alpha's encoded datatype is used and its
    // buffer is taken at alpha's offset.
    num_t   alpha_dt;
    void*   alpha_buf;
    bli_set_scalar_dt_buffer( alpha, x_dt, alpha_dt, alpha_buf );

    // Dispatch on the datatype combination.
    FUNCPTR_T axpym_fp = ftypes[alpha_dt][x_dt][y_dt];

    axpym_fp( x_doff, x_diag, x_uplo, x_trans,
              rows, cols,
              alpha_buf,
              x_buf, x_rs, x_cs,
              y_buf, y_rs, y_cs );
}
// Object-based front end for setid, unblocked variant 1.
//
// Fetches beta's buffer using the real projection of x's datatype, gathers
// x's diagonal offset, dimensions and storage, and dispatches through
// ftypes indexed by x's datatype.
void bli_setid_unb_var1( obj_t* beta,
                         obj_t* x )
{
    num_t  x_dt      = bli_obj_datatype( *x );
    num_t  x_dt_real = bli_obj_datatype_proj_to_real( *x );

    // beta is read in the real projection of x's datatype.
    void*  beta_buf  = bli_obj_buffer_for_1x1( x_dt_real, *beta );

    doff_t x_doff    = bli_obj_diag_offset( *x );
    dim_t  rows      = bli_obj_length( *x );
    dim_t  cols      = bli_obj_width( *x );

    void*  x_buf     = bli_obj_buffer_at_off( *x );
    inc_t  x_rs      = bli_obj_row_stride( *x );
    inc_t  x_cs      = bli_obj_col_stride( *x );

    // Dispatch on x's (full) datatype.
    FUNCPTR_T setid_fp = ftypes[x_dt];

    setid_fp( x_doff, rows, cols, beta_buf, x_buf, x_rs, x_cs );
}
// Object-based front end for norm1m, unblocked variant 1.
//
// Gathers the structure and storage fields of x, takes the output buffer
// from norm, and dispatches through ftypes indexed by x's datatype.
//
//   x    - input matrix object.
//   norm - object whose buffer (at its offset) is passed to the typed
//          implementation as the output location.
void bli_norm1m_unb_var1( obj_t* x,
                          obj_t* norm )
{
    num_t     dt_x     = bli_obj_datatype( *x );

    doff_t    diagoffx = bli_obj_diag_offset( *x );
    // bli_obj_diag() yields a diag_t (as in the other front ends in this
    // file), not a uplo_t.
    diag_t    diagx    = bli_obj_diag( *x );
    uplo_t    uplox    = bli_obj_uplo( *x );

    dim_t     m        = bli_obj_length( *x );
    dim_t     n        = bli_obj_width( *x );

    void*     buf_x    = bli_obj_buffer_at_off( *x );
    inc_t     rs_x     = bli_obj_row_stride( *x );
    inc_t     cs_x     = bli_obj_col_stride( *x );

    void*     buf_norm = bli_obj_buffer_at_off( *norm );

    FUNCPTR_T f;

    // Index into the type combination array to extract the correct
    // function pointer.
    f = ftypes[dt_x];

    // Invoke the function.
    f( diagoffx,
       diagx,
       uplox,
       m,
       n,
       buf_x, rs_x, cs_x,
       buf_norm );
}
// Object-based front end for gemv, unblocked variant 2.
//
// Extracts datatypes, transposition/conjugation, dimensions and storage
// from a, x and y, resolves the alpha and beta scalar buffers, and
// dispatches through ftypes indexed by the (a, x, y) datatype triple.
// cntl is not referenced here.
void bli_gemv_unb_var2( obj_t*  alpha,
                        obj_t*  a,
                        obj_t*  x,
                        obj_t*  beta,
                        obj_t*  y,
                        gemv_t* cntl )
{
    num_t     dt_a      = bli_obj_datatype( *a );
    num_t     dt_x      = bli_obj_datatype( *x );
    num_t     dt_y      = bli_obj_datatype( *y );

    // The combined conjugation/transposition status is a trans_t (as in
    // the other front ends in this file), not a conj_t.
    trans_t   transa    = bli_obj_conjtrans_status( *a );
    conj_t    conjx     = bli_obj_conj_status( *x );

    dim_t     m         = bli_obj_length( *a );
    dim_t     n         = bli_obj_width( *a );

    void*     buf_a     = bli_obj_buffer_at_off( *a );
    inc_t     rs_a      = bli_obj_row_stride( *a );
    inc_t     cs_a      = bli_obj_col_stride( *a );

    void*     buf_x     = bli_obj_buffer_at_off( *x );
    inc_t     incx      = bli_obj_vector_inc( *x );

    void*     buf_y     = bli_obj_buffer_at_off( *y );
    inc_t     incy      = bli_obj_vector_inc( *y );

    num_t     dt_alpha;
    void*     buf_alpha;

    num_t     dt_beta;
    void*     buf_beta;

    FUNCPTR_T f;

    // The datatype of alpha MUST be the type union of a and x. This is to
    // prevent any unnecessary loss of information during computation.
    dt_alpha  = bli_datatype_union( dt_a, dt_x );
    buf_alpha = bli_obj_scalar_buffer( dt_alpha, *alpha );

    // The datatype of beta MUST be the same as the datatype of y.
    dt_beta   = dt_y;
    buf_beta  = bli_obj_scalar_buffer( dt_beta, *beta );

    // Index into the type combination array to extract the correct
    // function pointer.
    f = ftypes[dt_a][dt_x][dt_y];

    // Invoke the function.
    f( transa,
       conjx,
       m,
       n,
       buf_alpha,
       buf_a, rs_a, cs_a,
       buf_x, incx,
       buf_beta,
       buf_y, incy );
}
// Object-based wrapper around the gemmtrsm micro-kernel.
//
// Gathers the operand buffers and C's strides, fills an auxinfo_t with
// next-panel addresses and panel strides, and dispatches to the lower or
// upper kernel table (ftypes_l / ftypes_u) depending on whether a11 is
// lower, indexed by the datatype of c11.
void bli_gemmtrsm_ukr( obj_t* alpha,
                       obj_t* a1x,
                       obj_t* a11,
                       obj_t* bx1,
                       obj_t* b11,
                       obj_t* c11 )
{
    num_t  dt       = bli_obj_datatype( *c11 );
    dim_t  k_dim    = bli_obj_width( *a1x );

    // alpha is read in the datatype of c11.
    void*  alpha_buf = bli_obj_buffer_for_1x1( dt, *alpha );

    // Operand buffers.
    void*  a1x_buf  = bli_obj_buffer_at_off( *a1x );
    void*  a11_buf  = bli_obj_buffer_at_off( *a11 );
    void*  bx1_buf  = bli_obj_buffer_at_off( *bx1 );
    void*  b11_buf  = bli_obj_buffer_at_off( *b11 );
    void*  c11_buf  = bli_obj_buffer_at_off( *c11 );

    // Output strides and panel strides.
    inc_t  c_rs     = bli_obj_row_stride( *c11 );
    inc_t  c_cs     = bli_obj_col_stride( *c11 );
    inc_t  a_ps     = bli_obj_panel_stride( *a1x );
    inc_t  b_ps     = bli_obj_panel_stride( *bx1 );

    // Populate the auxinfo_t struct in case the micro-kernel consults it.
    // The "next a" address is a1x for the lower case and a11 otherwise.
    auxinfo_t aux;
    void*     next_a = bli_obj_is_lower( *a11 ) ? a1x_buf : a11_buf;

    bli_auxinfo_set_next_a( next_a, aux );
    bli_auxinfo_set_next_b( bx1_buf, aux );
    bli_auxinfo_set_ps_a( a_ps, aux );
    bli_auxinfo_set_ps_b( b_ps, aux );

    // Choose the lower or upper table, then index it by datatype.
    FUNCPTR_T ukr_fp = bli_obj_is_lower( *a11 ) ? ftypes_l[dt]
                                                : ftypes_u[dt];

    ukr_fp( k_dim,
            alpha_buf,
            a1x_buf,
            a11_buf,
            bx1_buf,
            b11_buf,
            c11_buf, c_rs, c_cs,
            &aux );
}
// Object-based wrapper for the dotxf kernel.
//
// Extracts datatypes, conjugation flags, dimensions and storage from a, x
// and y, resolves the alpha and beta buffers, and dispatches through ftypes
// indexed by the (a, x, y) datatype triple.
void bli_dotxf_kernel( obj_t* alpha,
                       obj_t* a,
                       obj_t* x,
                       obj_t* beta,
                       obj_t* y )
{
    num_t  a_dt    = bli_obj_datatype( *a );
    num_t  x_dt    = bli_obj_datatype( *x );
    num_t  y_dt    = bli_obj_datatype( *y );

    conj_t a_conj  = bli_obj_conj_status( *a );
    conj_t x_conj  = bli_obj_conj_status( *x );

    // m is the vector dimension of x; b_n is the vector dimension of y.
    dim_t  m_dim   = bli_obj_vector_dim( *x );
    dim_t  fuse_n  = bli_obj_vector_dim( *y );

    // Matrix operand.
    void*  a_buf   = bli_obj_buffer_at_off( *a );
    inc_t  a_rs    = bli_obj_row_stride( *a );
    inc_t  a_cs    = bli_obj_col_stride( *a );

    // Vector operands.
    void*  x_buf   = bli_obj_buffer_at_off( *x );
    inc_t  x_inc   = bli_obj_vector_inc( *x );
    void*  y_buf   = bli_obj_buffer_at_off( *y );
    inc_t  y_inc   = bli_obj_vector_inc( *y );

    // alpha MUST use the type union of a and x so that no information is
    // lost unnecessarily during computation.
    num_t  alpha_dt  = bli_datatype_union( a_dt, x_dt );
    void*  alpha_buf = bli_obj_buffer_for_1x1( alpha_dt, *alpha );

    // beta MUST use the datatype of y.
    num_t  beta_dt   = y_dt;
    void*  beta_buf  = bli_obj_buffer_for_1x1( beta_dt, *beta );

    // Dispatch on the datatype combination.
    FUNCPTR_T dotxf_fp = ftypes[a_dt][x_dt][y_dt];

    dotxf_fp( a_conj, x_conj,
              m_dim, fuse_n,
              alpha_buf,
              a_buf, a_rs, a_cs,
              x_buf, x_inc,
              beta_buf,
              y_buf, y_inc );
}
// Object-based front end for unpackm, unblocked variant 1.
//
// Reads the diagonal offset, uplo and storage of the packed object p and
// the transposition bit, dimensions and storage of the destination c, then
// dispatches through ftypes indexed by p's datatype. cntx is forwarded to
// the typed implementation; cntl and thread are not referenced here.
void bli_unpackm_unb_var1
     (
       obj_t*     p,
       obj_t*     c,
       cntx_t*    cntx,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    num_t   dt      = bli_obj_datatype( *p );

    // Packed source P.
    doff_t  p_doff  = bli_obj_diag_offset( *p );
    uplo_t  p_uplo  = bli_obj_uplo( *p );
    void*   p_buf   = bli_obj_buffer_at_off( *p );
    inc_t   p_rs    = bli_obj_row_stride( *p );
    inc_t   p_cs    = bli_obj_col_stride( *p );

    // Destination C; only the transposition bit of its status is used.
    trans_t c_trans = bli_obj_onlytrans_status( *c );
    dim_t   c_m     = bli_obj_length( *c );
    dim_t   c_n     = bli_obj_width( *c );
    void*   c_buf   = bli_obj_buffer_at_off( *c );
    inc_t   c_rs    = bli_obj_row_stride( *c );
    inc_t   c_cs    = bli_obj_col_stride( *c );

    // Dispatch on the datatype.
    FUNCPTR_T unpack_fp = ftypes[dt];

    unpack_fp( p_doff, p_uplo, c_trans,
               c_m, c_n,
               p_buf, p_rs, p_cs,
               c_buf, c_rs, c_cs,
               cntx );
}
// Object-based front end for scalm, unblocked variant 1.
//
// Folds alpha into the scalar attached to a local alias of x, then
// dispatches through ftypes so the typed implementation scales x by that
// merged scalar. cntx is not referenced here.
//
//   alpha - scalar to apply; skipped when it equals BLIS_ONE.
//   x     - matrix object to scale.
void bli_scalm_unb_var1( obj_t*  alpha,
                         obj_t*  x,
                         cntx_t* cntx )
{
    num_t     dt_x     = bli_obj_datatype( *x );

    doff_t    diagoffx = bli_obj_diag_offset( *x );
    // bli_obj_diag() yields a diag_t (as in the other front ends in this
    // file), not a uplo_t.
    diag_t    diagx    = bli_obj_diag( *x );
    uplo_t    uplox    = bli_obj_uplo( *x );

    dim_t     m        = bli_obj_length( *x );
    dim_t     n        = bli_obj_width( *x );

    void*     buf_x    = bli_obj_buffer_at_off( *x );
    inc_t     rs_x     = bli_obj_row_stride( *x );
    inc_t     cs_x     = bli_obj_col_stride( *x );

    void*     buf_alpha;

    obj_t     x_local;

    FUNCPTR_T f;

    // Alias x to x_local so we can apply alpha if it is non-unit.
    bli_obj_alias_to( *x, x_local );

    // If alpha is non-unit, apply it to the scalar attached to x_local.
    if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
    {
        bli_obj_scalar_apply_scalar( alpha, &x_local );
    }

    // Grab the address of the internal scalar buffer of x_local. It must
    // be read from the alias (not from *x), because that is where alpha
    // was applied; the result goes into buf_alpha, which is what is
    // passed to the typed implementation below.
    buf_alpha = bli_obj_internal_scalar_buffer( x_local );

    // Index into the type combination array to extract the correct
    // function pointer.
    // NOTE: We use dt_x for both alpha and x because alpha was obtained
    // from the attached scalar of x, which is guaranteed to be of the
    // same datatype as x.
    f = ftypes[dt_x][dt_x];

    // Invoke the function.
    // NOTE: We unconditionally pass in BLIS_NO_CONJUGATE for alpha
    // because it would have already been conjugated by the front-end.
    f( BLIS_NO_CONJUGATE,
       diagoffx,
       diagx,
       uplox,
       m,
       n,
       buf_alpha,
       buf_x, rs_x, cs_x );
}
// Object-based front end for ger, unblocked variant 2.
//
// Extracts conjugation flags, dimensions and storage from x, y and a,
// resolves alpha's buffer, and dispatches through ftypes indexed by the
// datatype of a. cntx is forwarded to the typed implementation; cntl is not
// referenced here.
void bli_ger_unb_var2( obj_t*  alpha,
                       obj_t*  x,
                       obj_t*  y,
                       obj_t*  a,
                       cntx_t* cntx,
                       ger_t*  cntl )
{
    num_t  x_dt   = bli_obj_datatype( *x );
    num_t  y_dt   = bli_obj_datatype( *y );
    num_t  a_dt   = bli_obj_datatype( *a );

    conj_t x_conj = bli_obj_conj_status( *x );
    conj_t y_conj = bli_obj_conj_status( *y );

    // Dimensions come from the matrix operand.
    dim_t  rows   = bli_obj_length( *a );
    dim_t  cols   = bli_obj_width( *a );

    // Vector operands.
    void*  x_buf  = bli_obj_buffer_at_off( *x );
    inc_t  x_inc  = bli_obj_vector_inc( *x );
    void*  y_buf  = bli_obj_buffer_at_off( *y );
    inc_t  y_inc  = bli_obj_vector_inc( *y );

    // Matrix operand.
    void*  a_buf  = bli_obj_buffer_at_off( *a );
    inc_t  a_rs   = bli_obj_row_stride( *a );
    inc_t  a_cs   = bli_obj_col_stride( *a );

    // alpha MUST use the type union of x and y so that no information is
    // lost unnecessarily during computation.
    num_t  alpha_dt  = bli_datatype_union( x_dt, y_dt );
    void*  alpha_buf = bli_obj_buffer_for_1x1( alpha_dt, *alpha );

    // Dispatch on the datatype of a.
    FUNCPTR_T ger_fp = ftypes[a_dt];

    ger_fp( x_conj, y_conj,
            rows, cols,
            alpha_buf,
            x_buf, x_inc,
            y_buf, y_inc,
            a_buf, a_rs, a_cs,
            cntx );
}
// Object-based front end for her2, unblocked variant 1.
//
// Extracts uplo, conjugation flags, dimension and storage from x, y and c,
// resolves alpha's buffer, and dispatches through ftypes indexed by the
// (x, y, c) datatype triple. conjh is forwarded unchanged; alpha_conj and
// cntl are not referenced here.
void bli_her2_unb_var1( conj_t  conjh,
                        obj_t*  alpha,
                        obj_t*  alpha_conj,
                        obj_t*  x,
                        obj_t*  y,
                        obj_t*  c,
                        her2_t* cntl )
{
    num_t  x_dt   = bli_obj_datatype( *x );
    num_t  y_dt   = bli_obj_datatype( *y );
    num_t  c_dt   = bli_obj_datatype( *c );

    uplo_t c_uplo = bli_obj_uplo( *c );
    conj_t x_conj = bli_obj_conj_status( *x );
    conj_t y_conj = bli_obj_conj_status( *y );

    // The dimension is the length of c.
    dim_t  dim    = bli_obj_length( *c );

    // Vector operands.
    void*  x_buf  = bli_obj_buffer_at_off( *x );
    inc_t  x_inc  = bli_obj_vector_inc( *x );
    void*  y_buf  = bli_obj_buffer_at_off( *y );
    inc_t  y_inc  = bli_obj_vector_inc( *y );

    // Matrix operand.
    void*  c_buf  = bli_obj_buffer_at_off( *c );
    inc_t  c_rs   = bli_obj_row_stride( *c );
    inc_t  c_cs   = bli_obj_col_stride( *c );

    // alpha MUST use the type union of the datatypes of x and y.
    num_t  alpha_dt  = bli_datatype_union( x_dt, y_dt );
    void*  alpha_buf = bli_obj_buffer_for_1x1( alpha_dt, *alpha );

    // Dispatch on the datatype combination.
    FUNCPTR_T her2_fp = ftypes[x_dt][y_dt][c_dt];

    her2_fp( c_uplo,
             x_conj, y_conj, conjh,
             dim,
             alpha_buf,
             x_buf, x_inc,
             y_buf, y_inc,
             c_buf, c_rs, c_cs );
}
// Object-based front end for addm, unblocked variant 1.
//
// Collects x's structure and transposition/conjugation status, y's
// dimensions and both operands' storage, then dispatches through ftypes
// indexed by the (x, y) datatype pair. cntx is not referenced here.
void bli_addm_unb_var1( obj_t*  x,
                        obj_t*  y,
                        cntx_t* cntx )
{
    num_t   x_dt    = bli_obj_datatype( *x );
    num_t   y_dt    = bli_obj_datatype( *y );

    // Structure and transposition/conjugation of x.
    doff_t  x_doff  = bli_obj_diag_offset( *x );
    diag_t  x_diag  = bli_obj_diag( *x );
    uplo_t  x_uplo  = bli_obj_uplo( *x );
    trans_t x_trans = bli_obj_conjtrans_status( *x );

    // Dimensions are those of y.
    dim_t   rows    = bli_obj_length( *y );
    dim_t   cols    = bli_obj_width( *y );

    // Storage of x.
    void*   x_buf   = bli_obj_buffer_at_off( *x );
    inc_t   x_rs    = bli_obj_row_stride( *x );
    inc_t   x_cs    = bli_obj_col_stride( *x );

    // Storage of y.
    void*   y_buf   = bli_obj_buffer_at_off( *y );
    inc_t   y_rs    = bli_obj_row_stride( *y );
    inc_t   y_cs    = bli_obj_col_stride( *y );

    // Dispatch on the datatype pair.
    FUNCPTR_T addm_fp = ftypes[x_dt][y_dt];

    addm_fp( x_doff, x_diag, x_uplo, x_trans,
             rows, cols,
             x_buf, x_rs, x_cs,
             y_buf, y_rs, y_cs );
}
// Prints a human-readable dump of an object's fields to stdout.
//
//   label - heading printed before the field list (passed to "%s").
//   obj   - object whose dimensions, offsets, buffer, strides and info
//           bits are printed.
//
// When error checking is enabled, bli_obj_print_check() is called first.
// Every value is cast to (unsigned/signed) long or void* so that it matches
// its printf conversion specifier regardless of the underlying BLIS type.
void bli_obj_print( char* label, obj_t* obj )
{
    FILE* file = stdout;

    if ( bli_error_checking_is_enabled() )
        bli_obj_print_check( label, obj );

    fprintf( file, "\n" );
    fprintf( file, "%s\n", label );
    fprintf( file, "\n" );

    // Dimensions and offsets.
    fprintf( file, " m x n %lu x %lu\n", ( unsigned long int )bli_obj_length( *obj ),
                                         ( unsigned long int )bli_obj_width( *obj ) );
    fprintf( file, "\n" );

    fprintf( file, " offm, offn %lu, %lu\n", ( unsigned long int )bli_obj_row_off( *obj ),
                                             ( unsigned long int )bli_obj_col_off( *obj ) );
    fprintf( file, " diagoff %ld\n", ( signed long int )bli_obj_diag_offset( *obj ) );
    fprintf( file, "\n" );

    // Buffer and storage layout.
    fprintf( file, " buf %p\n", ( void* )bli_obj_buffer( *obj ) );
    fprintf( file, " elem size %lu\n", ( unsigned long int )bli_obj_elem_size( *obj ) );
    fprintf( file, " rs, cs %ld, %ld\n", ( signed long int )bli_obj_row_stride( *obj ),
                                         ( signed long int )bli_obj_col_stride( *obj ) );
    fprintf( file, " is %ld\n", ( signed long int )bli_obj_imag_stride( *obj ) );
    fprintf( file, " m_padded %lu\n", ( unsigned long int )bli_obj_padded_length( *obj ) );
    fprintf( file, " n_padded %lu\n", ( unsigned long int )bli_obj_padded_width( *obj ) );
    fprintf( file, " ps %lu\n", ( unsigned long int )bli_obj_panel_stride( *obj ) );
    fprintf( file, "\n" );

    // Raw info field followed by its decoded components. Multi-bit fields
    // are shifted down by their BLIS_*_SHIFT constant before printing.
    fprintf( file, " info %lX\n", ( unsigned long int )(*obj).info );
    fprintf( file, " - is complex %lu\n", ( unsigned long int )bli_obj_is_complex( *obj ) );
    fprintf( file, " - is d. prec %lu\n", ( unsigned long int )bli_obj_is_double_precision( *obj ) );
    fprintf( file, " - datatype %lu\n", ( unsigned long int )bli_obj_datatype( *obj ) );
    fprintf( file, " - target dt %lu\n", ( unsigned long int )bli_obj_target_datatype( *obj ) );
    fprintf( file, " - exec dt %lu\n", ( unsigned long int )bli_obj_execution_datatype( *obj ) );
    fprintf( file, " - has trans %lu\n", ( unsigned long int )bli_obj_has_trans( *obj ) );
    fprintf( file, " - has conj %lu\n", ( unsigned long int )bli_obj_has_conj( *obj ) );
    // This literal must stay on one physical line: a string literal may
    // not contain a raw line break.
    fprintf( file, " - unit diag? %lu\n", ( unsigned long int )bli_obj_has_unit_diag( *obj ) );
    fprintf( file, " - struc type %lu\n", ( unsigned long int )bli_obj_struc( *obj ) >> BLIS_STRUC_SHIFT );
    fprintf( file, " - uplo type %lu\n", ( unsigned long int )bli_obj_uplo( *obj ) >> BLIS_UPLO_SHIFT );
    fprintf( file, " - is upper %lu\n", ( unsigned long int )bli_obj_is_upper( *obj ) );
    fprintf( file, " - is lower %lu\n", ( unsigned long int )bli_obj_is_lower( *obj ) );
    fprintf( file, " - is dense %lu\n", ( unsigned long int )bli_obj_is_dense( *obj ) );
    fprintf( file, " - pack schema %lu\n", ( unsigned long int )bli_obj_pack_schema( *obj ) >> BLIS_PACK_SCHEMA_SHIFT );
    fprintf( file, " - packinv diag? %lu\n", ( unsigned long int )bli_obj_has_inverted_diag( *obj ) );
    fprintf( file, " - pack ordifup %lu\n", ( unsigned long int )bli_obj_is_pack_rev_if_upper( *obj ) );
    fprintf( file, " - pack ordiflo %lu\n", ( unsigned long int )bli_obj_is_pack_rev_if_lower( *obj ) );
    fprintf( file, " - packbuf type %lu\n", ( unsigned long int )bli_obj_pack_buffer_type( *obj ) >> BLIS_PACK_BUFFER_SHIFT );
    fprintf( file, "\n" );
}
// Prints the matrix object x to file.
//
//   file   - output stream.
//   s1     - text printed before the data.
//   x      - object to print.
//   format - forwarded to the typed implementation.
//   s2     - forwarded to the typed implementation.
//
// When error checking is enabled, bli_fprintm_check() is called first.
// If x has datatype BLIS_CONSTANT, its value is printed once for each of the
// float, double, scomplex, dcomplex and int representations and the
// function returns. Otherwise the call is dispatched through ftypes indexed
// by x's datatype.
void bli_fprintm( FILE* file, char* s1, obj_t* x, char* format, char* s2 )
{
    num_t     dt_x  = bli_obj_datatype( *x );

    dim_t     m     = bli_obj_length( *x );
    dim_t     n     = bli_obj_width( *x );

    inc_t     rs_x  = bli_obj_row_stride( *x );
    inc_t     cs_x  = bli_obj_col_stride( *x );

    void*     buf_x = bli_obj_buffer_at_off( *x );

    FUNCPTR_T f;

    if ( bli_error_checking_is_enabled() )
        bli_fprintm_check( file, s1, x, format, s2 );

    // Handle constants up front.
    if ( dt_x == BLIS_CONSTANT )
    {
        float*    sp = bli_obj_buffer_for_const( BLIS_FLOAT,    *x );
        double*   dp = bli_obj_buffer_for_const( BLIS_DOUBLE,   *x );
        scomplex* cp = bli_obj_buffer_for_const( BLIS_SCOMPLEX, *x );
        dcomplex* zp = bli_obj_buffer_for_const( BLIS_DCOMPLEX, *x );
        gint_t*   ip = bli_obj_buffer_for_const( BLIS_INT,      *x );

        fprintf( file, "%s\n", s1 );
        fprintf( file, " float: %9.2e\n", bli_sreal( *sp ) );
        fprintf( file, " double: %9.2e\n", bli_dreal( *dp ) );
        fprintf( file, " scomplex: %9.2e + %9.2e\n", bli_creal( *cp ), bli_cimag( *cp ) );
        fprintf( file, " dcomplex: %9.2e + %9.2e\n", bli_zreal( *zp ), bli_zimag( *zp ) );
        // gint_t is not necessarily long; cast so the argument matches
        // the %ld conversion (a mismatch is undefined behavior).
        fprintf( file, " int: %ld\n", ( long int )*ip );
        fprintf( file, "\n" );

        return;
    }

    // Index into the type combination array to extract the correct
    // function pointer.
    f = ftypes[dt_x];

    // Invoke the function.
    f( file,
       s1,
       m,
       n,
       buf_x, rs_x, cs_x,
       format,
       s2 );
}
// Object-based front end for trmv, unblocked-fused variant 2.
//
// Extracts uplo, transposition/conjugation, diag, dimension and storage
// from a and x, resolves alpha's buffer, and dispatches through ftypes
// indexed by the (a, x) datatype pair. cntx and cntl are not referenced
// here.
void bli_trmv_unf_var2( obj_t*  alpha,
                        obj_t*  a,
                        obj_t*  x,
                        cntx_t* cntx,
                        trmv_t* cntl )
{
    num_t   a_dt    = bli_obj_datatype( *a );
    num_t   x_dt    = bli_obj_datatype( *x );

    // Properties of the matrix operand.
    uplo_t  a_uplo  = bli_obj_uplo( *a );
    trans_t a_trans = bli_obj_conjtrans_status( *a );
    diag_t  a_diag  = bli_obj_diag( *a );
    dim_t   dim     = bli_obj_length( *a );

    void*   a_buf   = bli_obj_buffer_at_off( *a );
    inc_t   a_rs    = bli_obj_row_stride( *a );
    inc_t   a_cs    = bli_obj_col_stride( *a );

    // Vector operand.
    void*   x_buf   = bli_obj_buffer_at_off( *x );
    inc_t   x_inc   = bli_obj_vector_inc( *x );

    // alpha MUST use the type union of a and x so that no information is
    // lost unnecessarily during computation.
    num_t   alpha_dt  = bli_datatype_union( a_dt, x_dt );
    void*   alpha_buf = bli_obj_buffer_for_1x1( alpha_dt, *alpha );

    // Dispatch on the datatype pair.
    FUNCPTR_T trmv_fp = ftypes[a_dt][x_dt];

    trmv_fp( a_uplo, a_trans, a_diag,
             dim,
             alpha_buf,
             a_buf, a_rs, a_cs,
             x_buf, x_inc );
}
// Object-based front end for scald, unblocked variant 1.
//
// Reads beta's conjugation flag, x's diagonal offset, dimensions and
// storage, resolves beta's datatype and buffer, and dispatches through
// ftypes indexed by the (beta, x) datatype pair.
void bli_scald_unb_var1( obj_t* beta,
                         obj_t* x )
{
    num_t  x_dt      = bli_obj_datatype( *x );
    conj_t beta_conj = bli_obj_conj_status( *beta );

    doff_t x_doff    = bli_obj_diag_offset( *x );
    dim_t  rows      = bli_obj_length( *x );
    dim_t  cols      = bli_obj_width( *x );

    void*  x_buf     = bli_obj_buffer_at_off( *x );
    inc_t  x_rs      = bli_obj_row_stride( *x );
    inc_t  x_cs      = bli_obj_col_stride( *x );

    // For a scalar-constant beta, x_dt picks the address of the matching
    // constant value; otherwise beta's encoded datatype is used and its
    // buffer is taken at beta's offset.
    num_t  beta_dt;
    void*  beta_buf;
    bli_set_scalar_dt_buffer( beta, x_dt, beta_dt, beta_buf );

    // Dispatch on the datatype pair.
    FUNCPTR_T scald_fp = ftypes[beta_dt][x_dt];

    scald_fp( beta_conj,
              x_doff,
              rows, cols,
              beta_buf,
              x_buf, x_rs, x_cs );
}
// Object-based front end for mksymm, unblocked variant 1.
//
// Reads a's uplo, length and storage and dispatches through ftypes indexed
// by a's datatype.
void bli_mksymm_unb_var1( obj_t* a )
{
    num_t  dt     = bli_obj_datatype( *a );
    uplo_t a_uplo = bli_obj_uplo( *a );
    dim_t  dim    = bli_obj_length( *a );

    void*  a_buf  = bli_obj_buffer_at_off( *a );
    inc_t  a_rs   = bli_obj_row_stride( *a );
    inc_t  a_cs   = bli_obj_col_stride( *a );

    // Dispatch on the datatype.
    FUNCPTR_T mksymm_fp = ftypes[dt];

    mksymm_fp( a_uplo, dim, a_buf, a_rs, a_cs );
}
void bli_herk_l_ker_var2( obj_t* a, obj_t* b, obj_t* c, gemm_t* cntl, herk_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); doff_t diagoffc = bli_obj_diag_offset( *c ); pack_t schema_a = bli_obj_pack_schema( *a ); pack_t schema_b = bli_obj_pack_schema( *b ); dim_t m = bli_obj_length( *c ); dim_t n = bli_obj_width( *c ); dim_t k = bli_obj_width( *a ); void* buf_a = bli_obj_buffer_at_off( *a ); inc_t cs_a = bli_obj_col_stride( *a ); inc_t pd_a = bli_obj_panel_dim( *a ); inc_t ps_a = bli_obj_panel_stride( *a ); void* buf_b = bli_obj_buffer_at_off( *b ); inc_t rs_b = bli_obj_row_stride( *b ); inc_t pd_b = bli_obj_panel_dim( *b ); inc_t ps_b = bli_obj_panel_stride( *b ); void* buf_c = bli_obj_buffer_at_off( *c ); inc_t rs_c = bli_obj_row_stride( *c ); inc_t cs_c = bli_obj_col_stride( *c ); obj_t scalar_a; obj_t scalar_b; void* buf_alpha; void* buf_beta; FUNCPTR_T f; func_t* gemm_ukrs; void* gemm_ukr; // Detach and multiply the scalars attached to A and B. bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. buf_alpha = bli_obj_internal_scalar_buffer( scalar_b ); buf_beta = bli_obj_internal_scalar_buffer( *c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Extract from the control tree node the func_t object containing // the gemm micro-kernel function addresses, and then query the // function address corresponding to the current datatype. gemm_ukrs = cntl_gemm_ukrs( cntl ); gemm_ukr = bli_func_obj_query( dt_exec, gemm_ukrs ); // Invoke the function. f( diagoffc, schema_a, schema_b, m, n, k, buf_alpha, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, gemm_ukr, thread ); }
void bli_herk_u_ker_var2 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_exec_dt( c ); doff_t diagoffc = bli_obj_diag_offset( c ); pack_t schema_a = bli_obj_pack_schema( a ); pack_t schema_b = bli_obj_pack_schema( b ); dim_t m = bli_obj_length( c ); dim_t n = bli_obj_width( c ); dim_t k = bli_obj_width( a ); void* buf_a = bli_obj_buffer_at_off( a ); inc_t cs_a = bli_obj_col_stride( a ); inc_t is_a = bli_obj_imag_stride( a ); dim_t pd_a = bli_obj_panel_dim( a ); inc_t ps_a = bli_obj_panel_stride( a ); void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b = bli_obj_row_stride( b ); inc_t is_b = bli_obj_imag_stride( b ); dim_t pd_b = bli_obj_panel_dim( b ); inc_t ps_b = bli_obj_panel_stride( b ); void* buf_c = bli_obj_buffer_at_off( c ); inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); obj_t scalar_a; obj_t scalar_b; void* buf_alpha; void* buf_beta; FUNCPTR_T f; // Detach and multiply the scalars attached to A and B. bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); buf_beta = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. f( diagoffc, schema_a, schema_b, m, n, k, buf_alpha, buf_a, cs_a, is_a, pd_a, ps_a, buf_b, rs_b, is_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, cntx, thread ); }
void bli_trsm_rl_ker_var2 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); doff_t diagoffb = bli_obj_diag_offset( *b ); pack_t schema_a = bli_obj_pack_schema( *a ); pack_t schema_b = bli_obj_pack_schema( *b ); dim_t m = bli_obj_length( *c ); dim_t n = bli_obj_width( *c ); dim_t k = bli_obj_width( *a ); void* buf_a = bli_obj_buffer_at_off( *a ); inc_t cs_a = bli_obj_col_stride( *a ); dim_t pd_a = bli_obj_panel_dim( *a ); inc_t ps_a = bli_obj_panel_stride( *a ); void* buf_b = bli_obj_buffer_at_off( *b ); inc_t rs_b = bli_obj_row_stride( *b ); dim_t pd_b = bli_obj_panel_dim( *b ); inc_t ps_b = bli_obj_panel_stride( *b ); void* buf_c = bli_obj_buffer_at_off( *c ); inc_t rs_c = bli_obj_row_stride( *c ); inc_t cs_c = bli_obj_col_stride( *c ); void* buf_alpha1; void* buf_alpha2; FUNCPTR_T f; // Grab the address of the internal scalar buffer for the scalar // attached to A (the non-triangular matrix). This will be the alpha // scalar used in the gemmtrsm subproblems (ie: the scalar that would // be applied to the packed copy of A prior to it being updated by // the trsm subproblem). This scalar may be unit, if for example it // was applied during packing. buf_alpha1 = bli_obj_internal_scalar_buffer( *a ); // Grab the address of the internal scalar buffer for the scalar // attached to C. This will be the "beta" scalar used in the gemm-only // subproblems that correspond to micro-panels that do not intersect // the diagonal. We need this separate scalar because it's possible // that the alpha attached to B was reset, if it was applied during // packing. buf_alpha2 = bli_obj_internal_scalar_buffer( *c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. f( diagoffb, schema_a, schema_b, m, n, k, buf_alpha1, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_alpha2, buf_c, rs_c, cs_c, cntx, thread ); }
//
// Object-level wrapper for packm_blk_var1.
//
// Reads the structure, dimensions, strides and packing metadata out of the
// source object c and the pack object p, chooses the scalar (kappa) to be
// applied while packing, selects the structure-aware packm kernel table
// that matches the pack schema, and forwards everything to the typed
// implementation that ftypes[] holds for the datatype of c.
//
//   c  - source object (structure, diagonal info, conj/trans status,
//        buffer and strides).
//   p  - pack object (schema, packing flags, padded dimensions, buffer,
//        strides and panel layout). Its attached scalar may be detached
//        and reset to 1.0 by this function (induced-method schemas only).
//   t  - packm thread info; used to pick the chief thread, to broadcast
//        the chosen kappa pointer, and forwarded to the typed function.
//
void bli_packm_blk_var1( obj_t*   c,
                         obj_t*   p,
                         packm_thrinfo_t* t )
{
	num_t     dt_cp     = bli_obj_datatype( *c );

	struc_t   strucc    = bli_obj_struc( *c );
	doff_t    diagoffc  = bli_obj_diag_offset( *c );
	diag_t    diagc     = bli_obj_diag( *c );
	uplo_t    uploc     = bli_obj_uplo( *c );
	trans_t   transc    = bli_obj_conjtrans_status( *c );
	pack_t    schema    = bli_obj_pack_schema( *p );
	bool_t    invdiag   = bli_obj_has_inverted_diag( *p );
	bool_t    revifup   = bli_obj_is_pack_rev_if_upper( *p );
	bool_t    reviflo   = bli_obj_is_pack_rev_if_lower( *p );

	dim_t     m_p       = bli_obj_length( *p );
	dim_t     n_p       = bli_obj_width( *p );
	dim_t     m_max_p   = bli_obj_padded_length( *p );
	dim_t     n_max_p   = bli_obj_padded_width( *p );

	void*     buf_c     = bli_obj_buffer_at_off( *c );
	inc_t     rs_c      = bli_obj_row_stride( *c );
	inc_t     cs_c      = bli_obj_col_stride( *c );

	void*     buf_p     = bli_obj_buffer_at_off( *p );
	inc_t     rs_p      = bli_obj_row_stride( *p );
	inc_t     cs_p      = bli_obj_col_stride( *p );
	inc_t     is_p      = bli_obj_imag_stride( *p );
	dim_t     pd_p      = bli_obj_panel_dim( *p );
	inc_t     ps_p      = bli_obj_panel_stride( *p );

	obj_t     kappa;
	// Only the chief thread assigns kappa_p before the broadcast, so give
	// it a defined (null) starting value; this also keeps gcc from warning
	// about a possibly-uninitialized pointer. The brace-enclosed scalar
	// initializer is equivalent to initializing with a null pointer.
	obj_t*    kappa_p   = {0};
	void*     buf_kappa;

	func_t*   packm_kers;
	void*     packm_ker;

	FUNCPTR_T f;

	// Treatment of kappa (ie: scaling during packing) depends on
	// whether we are executing an induced method.
	if ( bli_is_ind_packed( schema ) )
	{
		// The value for kappa we use will depend on whether the scalar
		// attached to A has a nonzero imaginary component. If it does,
		// then we will apply the scalar during packing to facilitate
		// implementing induced complex domain algorithms in terms of
		// real domain micro-kernels. (In the aforementioned situation,
		// applying a real scalar is easy, but applying a complex one is
		// harder, so we avoid the need altogether with the code below.)
		if ( thread_am_ochief( t ) )
		{
			if ( bli_obj_scalar_has_nonzero_imag( p ) )
			{
				// Detach the scalar.
				bli_obj_scalar_detach( p, &kappa );

				// Reset the attached scalar (to 1.0).
				bli_obj_scalar_reset( p );

				// Scale by the detached value during packing.
				kappa_p = &kappa;
			}
			else
			{
				// If the internal scalar of A has only a real component,
				// then we will apply it later (in the micro-kernel), and
				// so we will use BLIS_ONE to indicate no scaling during
				// packing.
				kappa_p = &BLIS_ONE;
			}
		}

		// Share the chief's choice of kappa with the other threads.
		kappa_p = thread_obroadcast( t, kappa_p );

		// Acquire the buffer to the kappa chosen above.
		buf_kappa = bli_obj_buffer_for_1x1( dt_cp, *kappa_p );
	}
	else // if ( bli_is_nat_packed( schema ) )
	{
		// This branch is for native execution, where we assume that
		// the micro-kernel will always apply the alpha scalar of the
		// higher-level operation. Thus, we use BLIS_ONE for kappa so
		// that the underlying packm implementation does not perform
		// any scaling during packing.
		buf_kappa = bli_obj_buffer_for_const( dt_cp, BLIS_ONE );
	}

	// Choose the correct func_t object based on the pack_t schema.
	if      ( bli_is_4mi_packed( schema ) ) packm_kers = packm_struc_cxk_4mi_kers;
	else if ( bli_is_3mi_packed( schema ) ||
	          bli_is_3ms_packed( schema ) ) packm_kers = packm_struc_cxk_3mis_kers;
	else if ( bli_is_ro_packed( schema ) ||
	          bli_is_io_packed( schema ) ||
	         bli_is_rpi_packed( schema ) ) packm_kers = packm_struc_cxk_rih_kers;
	else                                   packm_kers = packm_struc_cxk_kers;

	// Query the datatype-specific function pointer from the func_t object.
	packm_ker = bli_func_obj_query( dt_cp, packm_kers );

	// Index into the type combination array to extract the correct
	// function pointer.
	f = ftypes[dt_cp];

	// Invoke the function.
	f( strucc,
	   diagoffc,
	   diagc,
	   uploc,
	   transc,
	   schema,
	   invdiag,
	   revifup,
	   reviflo,
	   m_p,
	   n_p,
	   m_max_p,
	   n_max_p,
	   buf_kappa,
	   buf_c, rs_c, cs_c,
	   buf_p, rs_p, cs_p,
	          is_p,
	          pd_p, ps_p,
	   packm_ker,
	   t );
}
void bli_hemv_unf_var3( conj_t conjh, obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y, hemv_t* cntl ) { num_t dt_a = bli_obj_datatype( *a ); num_t dt_x = bli_obj_datatype( *x ); num_t dt_y = bli_obj_datatype( *y ); uplo_t uplo = bli_obj_uplo( *a ); conj_t conja = bli_obj_conj_status( *a ); conj_t conjx = bli_obj_conj_status( *x ); dim_t m = bli_obj_length( *a ); void* buf_a = bli_obj_buffer_at_off( *a ); inc_t rs_a = bli_obj_row_stride( *a ); inc_t cs_a = bli_obj_col_stride( *a ); void* buf_x = bli_obj_buffer_at_off( *x ); inc_t incx = bli_obj_vector_inc( *x ); void* buf_y = bli_obj_buffer_at_off( *y ); inc_t incy = bli_obj_vector_inc( *y ); num_t dt_alpha; void* buf_alpha; num_t dt_beta; void* buf_beta; FUNCPTR_T f; // The datatype of alpha MUST be the type union of a and x. This is to // prevent any unnecessary loss of information during computation. dt_alpha = bli_datatype_union( dt_a, dt_x ); buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha ); // The datatype of beta MUST be the same as the datatype of y. dt_beta = dt_y; buf_beta = bli_obj_buffer_for_1x1( dt_beta, *beta ); #if 0 obj_t x_copy, y_copy; bli_obj_create( dt_x, m, 1, 0, 0, &x_copy ); bli_obj_create( dt_y, m, 1, 0, 0, &y_copy ); bli_copyv( x, &x_copy ); bli_copyv( y, &y_copy ); buf_x = bli_obj_buffer_at_off( x_copy ); buf_y = bli_obj_buffer_at_off( y_copy ); incx = 1; incy = 1; #endif // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_a][dt_x][dt_y]; // Invoke the function. f( uplo, conja, conjx, conjh, m, buf_alpha, buf_a, rs_a, cs_a, buf_x, incx, buf_beta, buf_y, incy ); #if 0 bli_copyv( &y_copy, y ); bli_obj_free( &x_copy ); bli_obj_free( &y_copy ); #endif }
void bli_packm_blk_var1_md ( obj_t* c, obj_t* p, cntx_t* cntx, cntl_t* cntl, thrinfo_t* t ) { num_t dt_c = bli_obj_dt( c ); num_t dt_p = bli_obj_dt( p ); trans_t transc = bli_obj_conjtrans_status( c ); pack_t schema = bli_obj_pack_schema( p ); dim_t m_p = bli_obj_length( p ); dim_t n_p = bli_obj_width( p ); dim_t m_max_p = bli_obj_padded_length( p ); dim_t n_max_p = bli_obj_padded_width( p ); void* buf_c = bli_obj_buffer_at_off( c ); inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); void* buf_p = bli_obj_buffer_at_off( p ); inc_t rs_p = bli_obj_row_stride( p ); inc_t cs_p = bli_obj_col_stride( p ); inc_t is_p = bli_obj_imag_stride( p ); dim_t pd_p = bli_obj_panel_dim( p ); inc_t ps_p = bli_obj_panel_stride( p ); obj_t kappa; void* buf_kappa; FUNCPTR_T f; // Treatment of kappa (ie: packing during scaling) depends on // whether we are executing an induced method. if ( bli_is_nat_packed( schema ) ) { // This branch is for native execution, where we assume that // the micro-kernel will always apply the alpha scalar of the // higher-level operation. Thus, we use BLIS_ONE for kappa so // that the underlying packm implementation does not perform // any scaling during packing. buf_kappa = bli_obj_buffer_for_const( dt_p, &BLIS_ONE ); } else // if ( bli_is_ind_packed( schema ) ) { obj_t* kappa_p; // The value for kappa we use will depend on whether the scalar // attached to A has a nonzero imaginary component. If it does, // then we will apply the scalar during packing to facilitate // implementing induced complex domain algorithms in terms of // real domain micro-kernels. (In the aforementioned situation, // applying a real scalar is easy, but applying a complex one is // harder, so we avoid the need altogether with the code below.) if ( bli_obj_scalar_has_nonzero_imag( p ) ) { // Detach the scalar. bli_obj_scalar_detach( p, &kappa ); // Reset the attached scalar (to 1.0). 
bli_obj_scalar_reset( p ); kappa_p = κ } else { // If the internal scalar of A has only a real component, then // we will apply it later (in the micro-kernel), and so we will // use BLIS_ONE to indicate no scaling during packing. kappa_p = &BLIS_ONE; } // Acquire the buffer to the kappa chosen above. buf_kappa = bli_obj_buffer_for_1x1( dt_p, kappa_p ); } // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_c][dt_p]; // Invoke the function. f( transc, schema, m_p, n_p, m_max_p, n_max_p, buf_kappa, buf_c, rs_c, cs_c, buf_p, rs_p, cs_p, is_p, pd_p, ps_p, cntx, t ); }
//
// Sanity check for the output of randm.
//
// Computes the sum of the absolute values of the stored elements of x and
// classifies it:
//
//   *resid = 1.0  if the sum equals zero (all elements were zero),
//   *resid = 2.0  if the sum is at least 1.0*m*n for real datatypes, or
//                 at least 2.0*m*n for complex datatypes (elements were
//                 too large),
//   *resid = 0.0  otherwise.
//
//   x      - the matrix object to inspect.
//   resid  - output; receives one of the three codes above.
//
void libblis_test_randm_check( obj_t*  x,
                               double* resid )
{
	doff_t diagoffx = bli_obj_diag_offset( *x );
	uplo_t uplox    = bli_obj_uplo( *x );

	dim_t  m_x      = bli_obj_length( *x );
	dim_t  n_x      = bli_obj_width( *x );

	inc_t  rs_x     = bli_obj_row_stride( *x );
	inc_t  cs_x     = bli_obj_col_stride( *x );
	void*  buf_x    = bli_obj_buffer_at_off( *x );

	int    sum_is_zero;  // nonzero when the absolute sum equals zero
	double abs_sum;      // absolute sum, widened to double
	double upper_bound;  // sum at or above this value is a failure

	*resid = 0.0;

	//
	// The two most likely ways that randm would fail is if all elements
	// were zero, or if all elements were greater than or equal to one.
	// Both conditions are detected from the sum of the absolute values
	// of the elements of x, which is gathered per datatype below.
	//

	if ( bli_obj_is_float( *x ) )
	{
		float sum_s;

		bli_sabsumm( diagoffx, uplox, m_x, n_x, buf_x, rs_x, cs_x, &sum_s );

		sum_is_zero = ( sum_s == *bli_s0 );
		abs_sum     = sum_s;
		upper_bound = 1.0 * m_x * n_x;
	}
	else if ( bli_obj_is_double( *x ) )
	{
		double sum_d;

		bli_dabsumm( diagoffx, uplox, m_x, n_x, buf_x, rs_x, cs_x, &sum_d );

		sum_is_zero = ( sum_d == *bli_d0 );
		abs_sum     = sum_d;
		upper_bound = 1.0 * m_x * n_x;
	}
	else if ( bli_obj_is_scomplex( *x ) )
	{
		float sum_s;

		bli_cabsumm( diagoffx, uplox, m_x, n_x, buf_x, rs_x, cs_x, &sum_s );

		sum_is_zero = ( sum_s == *bli_s0 );
		abs_sum     = sum_s;
		upper_bound = 2.0 * m_x * n_x;
	}
	else // if ( bli_obj_is_dcomplex( *x ) )
	{
		double sum_d;

		bli_zabsumm( diagoffx, uplox, m_x, n_x, buf_x, rs_x, cs_x, &sum_d );

		sum_is_zero = ( sum_d == *bli_d0 );
		abs_sum     = sum_d;
		upper_bound = 2.0 * m_x * n_x;
	}

	// Classify the sum; *resid stays 0.0 when neither condition holds.
	if ( sum_is_zero )
	{
		*resid = 1.0;
	}
	else if ( abs_sum >= upper_bound )
	{
		*resid = 2.0;
	}
}
//
// Object-level wrapper for trsm_ru_ker_var2 (control-tree signature).
//
// Reads dimensions, strides, panel layout and the two scalars out of the
// a, b and c objects, looks up the gemmtrsm and gemm micro-kernels in the
// control tree node, and forwards everything to the typed implementation
// that ftypes[] holds for the execution datatype of c.
//
//   a     - operand whose internal scalar becomes alpha1; k is taken from
//           its width.
//   b     - operand that supplies the diagonal offset.
//   c     - supplies m, n, row/column strides and alpha2.
//   cntl  - control tree node holding the micro-kernel func_t objects.
//
void bli_trsm_ru_ker_var2( obj_t*  a,
                           obj_t*  b,
                           obj_t*  c,
                           trsm_t* cntl )
{
	num_t  dt_exec  = bli_obj_execution_datatype( *c );

	doff_t diagoffb = bli_obj_diag_offset( *b );

	dim_t  m        = bli_obj_length( *c );
	dim_t  n        = bli_obj_width( *c );
	dim_t  k        = bli_obj_width( *a );

	// Operand A: buffer and panel layout.
	void*  buf_a    = bli_obj_buffer_at_off( *a );
	inc_t  cs_a     = bli_obj_col_stride( *a );
	inc_t  pd_a     = bli_obj_panel_dim( *a );
	inc_t  ps_a     = bli_obj_panel_stride( *a );

	// Operand B: buffer and panel layout.
	void*  buf_b    = bli_obj_buffer_at_off( *b );
	inc_t  rs_b     = bli_obj_row_stride( *b );
	inc_t  pd_b     = bli_obj_panel_dim( *b );
	inc_t  ps_b     = bli_obj_panel_stride( *b );

	// Matrix C: buffer and strides.
	void*  buf_c    = bli_obj_buffer_at_off( *c );
	inc_t  rs_c     = bli_obj_row_stride( *c );
	inc_t  cs_c     = bli_obj_col_stride( *c );

	// alpha1: the internal scalar attached to A. It is the alpha used by
	// the gemmtrsm subproblems, i.e. the scalar that would be applied to
	// the packed copy of A before the trsm subproblem updates it. It may
	// be unit, for example when it was already applied during packing.
	void*  buf_alpha1 = bli_obj_internal_scalar_buffer( *a );

	// alpha2: the internal scalar attached to C. It plays the role of
	// "beta" in the gemm-only subproblems, i.e. the micro-panels that do
	// not intersect the diagonal. A separate scalar is required because
	// the alpha attached to B may have been reset if it was applied
	// during packing.
	void*  buf_alpha2 = bli_obj_internal_scalar_buffer( *c );

	// Select the typed implementation for the execution datatype.
	FUNCPTR_T typed_fp = ftypes[dt_exec];

	// Halve cs_a and rs_b when A and B were packed for 4m or 3m. These
	// strides index into the micro-panels of A and B, and the pointer
	// types in the macro-kernel (scomplex or dcomplex) make pointer
	// arithmetic move twice as far as it should for the datatypes that
	// are actually stored (float or double). Only A's packing format is
	// inspected; both strides are adjusted together.
	if ( bli_obj_is_panel_packed_4m( *a ) ||
	     bli_obj_is_panel_packed_3m( *a ) )
	{
		cs_a = cs_a / 2;
		rs_b = rs_b / 2;
	}

	// Fetch the func_t objects holding the gemmtrsm and gemm micro-kernel
	// addresses from the control tree node, then resolve each one for the
	// execution datatype.
	// NOTE(review): this "ru" wrapper queries the *_l_* gemmtrsm kernels;
	// presumably intentional (right-side cases handled as the transposed
	// problem) -- confirm against the control tree setup.
	func_t* gemmtrsm_ukrs = cntl_gemmtrsm_l_ukrs( cntl );
	func_t* gemm_ukrs     = cntl_gemm_ukrs( cntl );
	void*   gemmtrsm_ukr  = bli_func_obj_query( dt_exec, gemmtrsm_ukrs );
	void*   gemm_ukr      = bli_func_obj_query( dt_exec, gemm_ukrs );

	// Hand everything to the typed implementation.
	typed_fp( diagoffb,
	          m,
	          n,
	          k,
	          buf_alpha1,
	          buf_a, cs_a, pd_a, ps_a,
	          buf_b, rs_b, pd_b, ps_b,
	          buf_alpha2,
	          buf_c, rs_c, cs_c,
	          gemmtrsm_ukr,
	          gemm_ukr );
}
//
// Object-level wrapper for trsm_rl_ker_var2 (control-tree signature with
// thread info).
//
// Reads dimensions, strides, packing metadata and the two scalars out of
// the a, b and c objects, looks up the gemmtrsm and gemm micro-kernels in
// the control tree node, and forwards everything to the typed
// implementation that ftypes[] holds for the execution datatype of c.
//
//   a       - operand whose internal scalar becomes alpha1; k is taken
//             from its width.
//   b       - operand that supplies the diagonal offset.
//   c       - supplies m, n, row/column strides and alpha2.
//   cntl    - control tree node holding the micro-kernel func_t objects.
//   thread  - forwarded to the typed function.
//
void bli_trsm_rl_ker_var2( obj_t*  a,
                           obj_t*  b,
                           obj_t*  c,
                           trsm_t* cntl,
                           trsm_thrinfo_t* thread )
{
	num_t  dt_exec  = bli_obj_execution_datatype( *c );

	doff_t diagoffb = bli_obj_diag_offset( *b );

	pack_t schema_a = bli_obj_pack_schema( *a );
	pack_t schema_b = bli_obj_pack_schema( *b );

	dim_t  m        = bli_obj_length( *c );
	dim_t  n        = bli_obj_width( *c );
	dim_t  k        = bli_obj_width( *a );

	// Operand A: buffer and panel layout.
	void*  buf_a    = bli_obj_buffer_at_off( *a );
	inc_t  cs_a     = bli_obj_col_stride( *a );
	dim_t  pd_a     = bli_obj_panel_dim( *a );
	inc_t  ps_a     = bli_obj_panel_stride( *a );

	// Operand B: buffer and panel layout.
	void*  buf_b    = bli_obj_buffer_at_off( *b );
	inc_t  rs_b     = bli_obj_row_stride( *b );
	dim_t  pd_b     = bli_obj_panel_dim( *b );
	inc_t  ps_b     = bli_obj_panel_stride( *b );

	// Matrix C: buffer and strides.
	void*  buf_c    = bli_obj_buffer_at_off( *c );
	inc_t  rs_c     = bli_obj_row_stride( *c );
	inc_t  cs_c     = bli_obj_col_stride( *c );

	// alpha1: the internal scalar attached to A. It is the alpha used by
	// the gemmtrsm subproblems, i.e. the scalar that would be applied to
	// the packed copy of A before the trsm subproblem updates it. It may
	// be unit, for example when it was already applied during packing.
	void*  buf_alpha1 = bli_obj_internal_scalar_buffer( *a );

	// alpha2: the internal scalar attached to C. It plays the role of
	// "beta" in the gemm-only subproblems, i.e. the micro-panels that do
	// not intersect the diagonal. A separate scalar is required because
	// the alpha attached to B may have been reset if it was applied
	// during packing.
	void*  buf_alpha2 = bli_obj_internal_scalar_buffer( *c );

	// Select the typed implementation for the execution datatype.
	FUNCPTR_T typed_fp = ftypes[dt_exec];

	// Fetch the func_t objects holding the gemmtrsm and gemm micro-kernel
	// addresses from the control tree node, then resolve each one for the
	// execution datatype.
	// NOTE(review): this "rl" wrapper queries the *_u_* gemmtrsm kernels;
	// presumably intentional (right-side cases handled as the transposed
	// problem) -- confirm against the control tree setup.
	func_t* gemmtrsm_ukrs = cntl_gemmtrsm_u_ukrs( cntl );
	func_t* gemm_ukrs     = cntl_gemm_ukrs( cntl );
	void*   gemmtrsm_ukr  = bli_func_obj_query( dt_exec, gemmtrsm_ukrs );
	void*   gemm_ukr      = bli_func_obj_query( dt_exec, gemm_ukrs );

	// Hand everything to the typed implementation.
	typed_fp( diagoffb,
	          schema_a,
	          schema_b,
	          m,
	          n,
	          k,
	          buf_alpha1,
	          buf_a, cs_a, pd_a, ps_a,
	          buf_b, rs_b, pd_b, ps_b,
	          buf_alpha2,
	          buf_c, rs_c, cs_c,
	          gemmtrsm_ukr,
	          gemm_ukr,
	          thread );
}
//
// Object-level wrapper for trmm_rl_ker_var2.
//
// Reads dimensions, strides and panel strides out of the a, b and c
// objects, merges the scalars attached to a and b into a single alpha,
// and forwards everything to the typed implementation that ftypes[] holds
// for the execution datatype of c.
//
//   a     - operand; k is taken from its width.
//   b     - operand that supplies the diagonal offset.
//   c     - supplies m, n, row/column strides and the beta scalar.
//   cntl  - not referenced by this wrapper.
//
void bli_trmm_rl_ker_var2( obj_t*  a,
                           obj_t*  b,
                           obj_t*  c,
                           trmm_t* cntl )
{
	num_t  dt_exec  = bli_obj_execution_datatype( *c );

	doff_t diagoffb = bli_obj_diag_offset( *b );

	dim_t  m        = bli_obj_length( *c );
	dim_t  n        = bli_obj_width( *c );
	dim_t  k        = bli_obj_width( *a );

	// Operand A: buffer, strides and panel stride.
	void*  buf_a    = bli_obj_buffer_at_off( *a );
	inc_t  rs_a     = bli_obj_row_stride( *a );
	inc_t  cs_a     = bli_obj_col_stride( *a );
	inc_t  ps_a     = bli_obj_panel_stride( *a );

	// Operand B: buffer, strides and panel stride.
	void*  buf_b    = bli_obj_buffer_at_off( *b );
	inc_t  rs_b     = bli_obj_row_stride( *b );
	inc_t  cs_b     = bli_obj_col_stride( *b );
	inc_t  ps_b     = bli_obj_panel_stride( *b );

	// Matrix C: buffer and strides.
	void*  buf_c    = bli_obj_buffer_at_off( *c );
	inc_t  rs_c     = bli_obj_row_stride( *c );
	inc_t  cs_c     = bli_obj_col_stride( *c );

	obj_t  alpha_a;
	obj_t  alpha_ab;

	// Pull the scalars off A and B and fold them together; the merged
	// value is the one read out of alpha_ab below.
	bli_obj_scalar_detach( a, &alpha_a );
	bli_obj_scalar_detach( b, &alpha_ab );
	bli_mulsc( &alpha_a, &alpha_ab );

	// alpha is the merged A*B scalar; beta is the scalar attached to C.
	void*  buf_alpha = bli_obj_internal_scalar_buffer( alpha_ab );
	void*  buf_beta  = bli_obj_internal_scalar_buffer( *c );

	// Select the typed implementation for the execution datatype.
	FUNCPTR_T typed_fp = ftypes[dt_exec];

	// Hand everything to the typed implementation.
	typed_fp( diagoffb,
	          m,
	          n,
	          k,
	          buf_alpha,
	          buf_a, rs_a, cs_a, ps_a,
	          buf_b, rs_b, cs_b, ps_b,
	          buf_beta,
	          buf_c, rs_c, cs_c );
}