// // Define object-based interface. // void bli_scalv( obj_t* alpha, obj_t* x ) { num_t dt = bli_obj_datatype( *x ); dim_t n = bli_obj_vector_dim( *x ); inc_t inc_x = bli_obj_vector_inc( *x ); void* buf_x = bli_obj_buffer_at_off( *x ); obj_t alpha_local; void* buf_alpha; FUNCPTR_T f = ftypes[dt]; if ( bli_error_checking_is_enabled() ) bli_scalv_check( alpha, x ); // Create a local copy-cast of alpha (and apply internal conjugation // if needed). bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, alpha, &alpha_local ); // Extract the scalar buffer. buf_alpha = bli_obj_buffer_for_1x1( dt, alpha_local ); // Invoke the void pointer-based function. f( BLIS_NO_CONJUGATE, // conjugation applied during copy-cast. n, buf_alpha, buf_x, inc_x ); }
// Compute the residual used by the test suite to validate addv.
//
// Pre-conditions:
// - every element of x was set to alpha.
// - every element of y_orig was set to beta.
// Note:
// - alpha and beta should carry non-zero imaginary parts in the complex
//   cases so that the implementation is exercised more fully.
//
// Given those inputs, the operation
//
//   y := y_orig + conjx(x)
//
// is considered correct when
//
//   fnormv(y) - sqrt( absqsc( beta + conjx(alpha) ) * m )
//
// is negligible, where m is the length of x. The real part of that
// difference is stored to *resid.
void libblis_test_addv_check( obj_t* alpha, obj_t* beta, obj_t* x, obj_t* y, double* resid )
{
	num_t  dt_x      = bli_obj_datatype( *x );
	num_t  dt_x_real = bli_obj_datatype_proj_to_real( *x );
	dim_t  len       = bli_obj_vector_dim( *x );
	conj_t conj_of_x = bli_obj_conj_status( *x );

	obj_t  sum_sc;          // accumulates beta + conjx(alpha)
	obj_t  alpha_applied;   // alpha after applying the conj status of x
	obj_t  y_norm;          // fnormv(y), later the residual
	obj_t  len_sc;          // vector length as a real scalar
	obj_t  expected;        // expected value of fnormv(y)
	double imag_ignored;

	// Set up the detached scalars: one in the datatype of x, the rest in
	// its real projection.
	bli_obj_scalar_init_detached( dt_x,      &sum_sc );
	bli_obj_scalar_init_detached( dt_x_real, &expected );
	bli_obj_scalar_init_detached( dt_x_real, &y_norm );
	bli_obj_scalar_init_detached( dt_x_real, &len_sc );

	bli_obj_scalar_init_detached_copy_of( dt_x, conj_of_x, alpha, &alpha_applied );

	// Measure the computed result.
	bli_fnormv( y, &y_norm );

	// sum_sc := beta + conjx(alpha)
	bli_copysc( beta, &sum_sc );
	bli_addsc( &alpha_applied, &sum_sc );

	bli_setsc( ( double )len, 0.0, &len_sc );

	// expected := sqrt( absqsc( sum_sc ) * m )
	bli_absqsc( &sum_sc, &expected );
	bli_mulsc( &len_sc, &expected );
	bli_sqrtsc( &expected, &expected );

	// Residual: measured norm minus expected norm.
	bli_subsc( &expected, &y_norm );

	bli_getsc( &y_norm, resid, &imag_ignored );
}
// Attach scalar alpha to object a as its internal scalar.
//
//   conj  - conjugation to apply to alpha while it is copy-cast.
//   alpha - scalar object to attach.
//   a     - object whose internal scalar is overwritten.
void bli_obj_scalar_attach( conj_t conj, obj_t* alpha, obj_t* a )
{
	obj_t scalar_tmp;

	// Copy-cast alpha into the datatype of a. The copy is where the
	// requested conjugation and any typecast take place.
	bli_obj_scalar_init_detached_copy_of( bli_obj_datatype( *a ),
	                                      conj,
	                                      alpha,
	                                      &scalar_tmp );

	// Store the temporary's internal scalar into a.
	bli_obj_copy_internal_scalar( scalar_tmp, *a );
}
// Multiply the scalar attached to object a by alpha, and store the product
// back into a as its internal scalar.
//
//   alpha - scalar object to apply (copy-cast to a's datatype first).
//   a     - object whose internal scalar is updated.
void bli_obj_scalar_apply_scalar( obj_t* alpha, obj_t* a )
{
	obj_t alpha_typed;
	obj_t attached;

	// Copy-cast alpha into the datatype of a, without conjugation. This is
	// where a typecast of alpha can happen.
	bli_obj_scalar_init_detached_copy_of( bli_obj_datatype( *a ),
	                                      BLIS_NO_CONJUGATE,
	                                      alpha,
	                                      &alpha_typed );

	// Pull the scalar currently attached to a into a standalone object.
	bli_obj_scalar_detach( a, &attached );

	// attached := alpha_typed * attached
	bli_mulsc( &alpha_typed, &attached );

	// Write the scaled value back as a's internal scalar.
	bli_obj_copy_internal_scalar( attached, *a );
}
// // Define object-based interface. // void bli_setm( obj_t* beta, obj_t* x ) { num_t dt_x; obj_t beta_local; if ( bli_error_checking_is_enabled() ) bli_setm_check( beta, x ); // Use the datatype of x as the target type for beta (since we do // not assume mixed domain/type support is enabled). dt_x = bli_obj_datatype( *x ); // Create an object to hold a copy-cast of beta. bli_obj_scalar_init_detached_copy_of( dt_x, BLIS_NO_CONJUGATE, beta, &beta_local ); bli_setm_unb_var1( &beta_local, x ); }
void bli_syr2_front ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* c, cntx_t* cntx ) { her2_t* her2_cntl; num_t dt_targ_x; num_t dt_targ_y; //num_t dt_targ_c; bool_t x_has_unit_inc; bool_t y_has_unit_inc; bool_t c_has_unit_inc; obj_t alpha_local; num_t dt_alpha; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_syr2_check( alpha, x, y, c ); // Query the target datatypes of each object. dt_targ_x = bli_obj_target_dt( x ); dt_targ_y = bli_obj_target_dt( y ); //dt_targ_c = bli_obj_target_dt( c ); // Determine whether each operand with unit stride. x_has_unit_inc = ( bli_obj_vector_inc( x ) == 1 ); y_has_unit_inc = ( bli_obj_vector_inc( y ) == 1 ); c_has_unit_inc = ( bli_obj_is_row_stored( c ) || bli_obj_is_col_stored( c ) ); // Create an object to hold a copy-cast of alpha. Notice that we use // the type union of the datatypes of x and y. dt_alpha = bli_dt_union( dt_targ_x, dt_targ_y ); bli_obj_scalar_init_detached_copy_of( dt_alpha, BLIS_NO_CONJUGATE, alpha, &alpha_local ); // If all operands have unit stride, we choose a control tree for calling // the unblocked implementation directly without any blocking. if ( x_has_unit_inc && y_has_unit_inc && c_has_unit_inc ) { // We use two control trees to handle the four cases corresponding to // combinations of upper/lower triangular storage and row/column-storage. // The row-stored lower triangular and column-stored upper triangular // trees are identical. Same for the remaining two trees. if ( bli_obj_is_lower( c ) ) { if ( bli_obj_is_row_stored( c ) ) her2_cntl = her2_cntl_bs_ke_lrow_ucol; else her2_cntl = her2_cntl_bs_ke_lcol_urow; } else // if ( bli_obj_is_upper( c ) ) { if ( bli_obj_is_row_stored( c ) ) her2_cntl = her2_cntl_bs_ke_lcol_urow; else her2_cntl = her2_cntl_bs_ke_lrow_ucol; } } else { // Mark objects with unit stride as already being packed. This prevents // unnecessary packing from happening within the blocked algorithm. 
if ( x_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, x ); if ( y_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, y ); if ( c_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_UNSPEC, c ); // Here, we make a similar choice as above, except that (1) we look // at storage tilt, and (2) we choose a tree that performs blocking. if ( bli_obj_is_lower( c ) ) { if ( bli_obj_is_row_stored( c ) ) her2_cntl = her2_cntl_ge_lrow_ucol; else her2_cntl = her2_cntl_ge_lcol_urow; } else // if ( bli_obj_is_upper( c ) ) { if ( bli_obj_is_row_stored( c ) ) her2_cntl = her2_cntl_ge_lcol_urow; else her2_cntl = her2_cntl_ge_lrow_ucol; } } // Invoke the internal back-end with the copy-cast scalar and the // chosen control tree. Set conjh to BLIS_NO_CONJUGATE to invoke the // symmetric (and not Hermitian) algorithms. bli_her2_int( BLIS_NO_CONJUGATE, &alpha_local, &alpha_local, x, y, c, cntx, her2_cntl ); }
void bli_her2k_front( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, herk_t* cntl ) { obj_t alpha_conj; obj_t c_local; obj_t a_local; obj_t bh_local; obj_t b_local; obj_t ah_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_her2k_check( alpha, a, b, beta, c ); // If alpha is zero, scale by beta and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); return; } // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *b, b_local ); bli_obj_alias_to( *c, c_local ); bli_obj_set_as_root( c_local ); // For her2k, the first and second right-hand "B" operands are simply B' // and A'. bli_obj_alias_to( *b, bh_local ); bli_obj_induce_trans( bh_local ); bli_obj_toggle_conj( bh_local ); bli_obj_alias_to( *a, ah_local ); bli_obj_induce_trans( ah_local ); bli_obj_toggle_conj( ah_local ); // Initialize a conjugated copy of alpha. bli_obj_scalar_init_detached_copy_of( bli_obj_datatype( *a ), BLIS_CONJUGATE, alpha, &alpha_conj ); // An optimization: If C is row-stored, transpose the entire operation // so as to allow the macro-kernel more favorable access patterns // through C. (The effect of the transposition of A and A' is negligible // because those operands are always packed to contiguous memory.) if ( bli_obj_is_row_stored( c_local ) ) { bli_obj_swap( a_local, bh_local ); bli_obj_swap( b_local, ah_local ); bli_obj_induce_trans( a_local ); bli_obj_induce_trans( bh_local ); bli_obj_induce_trans( b_local ); bli_obj_induce_trans( ah_local ); bli_obj_induce_trans( c_local ); } #if 0 // Invoke the internal back-end. bli_her2k_int( alpha, &a_local, &bh_local, &alpha_conj, &b_local, &ah_local, beta, &c_local, cntl ); #else // Invoke herk twice, using beta only the first time. bli_herk_int( alpha, &a_local, &bh_local, beta, &c_local, cntl ); bli_herk_int( &alpha_conj, &b_local, &ah_local, &BLIS_ONE, &c_local, cntl ); #endif }
void bli_her2k_front( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, gemm_t* cntl ) { obj_t alpha_conj; obj_t c_local; obj_t a_local; obj_t bh_local; obj_t b_local; obj_t ah_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_her2k_check( alpha, a, b, beta, c ); // If alpha is zero, scale by beta, zero the imaginary components of // the diagonal elements, and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); bli_setid( &BLIS_ZERO, c ); return; } // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *b, b_local ); bli_obj_alias_to( *c, c_local ); bli_obj_set_as_root( c_local ); // For her2k, the first and second right-hand "B" operands are simply B' // and A'. bli_obj_alias_to( *b, bh_local ); bli_obj_induce_trans( bh_local ); bli_obj_toggle_conj( bh_local ); bli_obj_alias_to( *a, ah_local ); bli_obj_induce_trans( ah_local ); bli_obj_toggle_conj( ah_local ); // Initialize a conjugated copy of alpha. bli_obj_scalar_init_detached_copy_of( bli_obj_datatype( *a ), BLIS_CONJUGATE, alpha, &alpha_conj ); // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. if ( ( bli_obj_is_row_stored( c_local ) && bli_func_prefers_contig_cols( bli_obj_datatype( c_local ), bli_gemm_cntl_ukrs( cntl ) ) ) || ( bli_obj_is_col_stored( c_local ) && bli_func_prefers_contig_rows( bli_obj_datatype( c_local ), bli_gemm_cntl_ukrs( cntl ) ) ) ) { bli_obj_swap( a_local, bh_local ); bli_obj_swap( b_local, ah_local ); bli_obj_induce_trans( a_local ); bli_obj_induce_trans( bh_local ); bli_obj_induce_trans( b_local ); bli_obj_induce_trans( ah_local ); bli_obj_induce_trans( c_local ); } #if 0 // Invoke the internal back-end. 
bli_her2k_int( alpha, &a_local, &bh_local, &alpha_conj, &b_local, &ah_local, beta, &c_local, cntl ); #else // Invoke herk twice, using beta only the first time. herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths(); dim_t n_threads = thread_num_threads( infos[0] ); // Invoke the internal back-end. bli_level3_thread_decorator( n_threads, (level3_int_t) bli_herk_int, alpha, &a_local, &bh_local, beta, &c_local, (void*) cntl, (void**) infos ); bli_level3_thread_decorator( n_threads, (level3_int_t) bli_herk_int, &alpha_conj, &b_local, &ah_local, &BLIS_ONE, &c_local, (void*) cntl, (void**) infos ); bli_herk_thrinfo_free_paths( infos, n_threads ); #endif // The Hermitian rank-2k product was computed as A*B'+B*A', even for // the diagonal elements. Mathematically, the imaginary components of // diagonal elements of a Hermitian rank-2k product should always be // zero. However, in practice, they sometimes accumulate meaningless // non-zero values. To prevent this, we explicitly set those values // to zero before returning. bli_setid( &BLIS_ZERO, &c_local ); }
// Compute the residual used by the test suite to validate gemv.
//
// Pre-conditions:
// - a holds kappa along its diagonal.
// - x is randomized.
// - y_orig is randomized.
// Note:
// - alpha, beta, and kappa should carry non-zero imaginary parts in the
//   complex cases so that the implementation is exercised more fully.
//
// Given those inputs, the operation
//
//   y := beta * y_orig + alpha * transa(A) * conjx(x)
//
// is considered correct when
//
//   normf( y - z )
//
// is negligible, where
//
//   z = beta * y_orig + alpha * conja(kappa) * x
//
// The comparison is made on the leading min(m_y, n_x) elements only, and
// the real part of the norm is stored to *resid.
// NOTE(review): the subtraction is performed on a subpartition view of y,
// so the caller's y is presumably modified by this check - confirm.
void libblis_test_gemv_check( obj_t* kappa, obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y, obj_t* y_orig, double* resid )
{
	num_t  dt_y      = bli_obj_datatype( *y );
	num_t  dt_y_real = bli_obj_datatype_proj_to_real( *y );

	conj_t conj_of_a = bli_obj_conj_status( *a );

	dim_t  len_x     = bli_obj_vector_dim( *x );
	dim_t  len_y     = bli_obj_vector_dim( *y );
	dim_t  len_top   = bli_min( len_y, len_x );

	obj_t  x_copy, y_copy;
	obj_t  kappa_applied, resid_norm;
	obj_t  x_copy_top, y_copy_top, y_top;

	double imag_ignored;

	// kappa with the conjugation status of a applied, plus a real scalar
	// that will receive the norm.
	bli_obj_scalar_init_detached_copy_of( dt_y, conj_of_a, kappa, &kappa_applied );
	bli_obj_scalar_init_detached( dt_y_real, &resid_norm );

	// Scratch copies of x and y_orig.
	bli_obj_create( dt_y, len_x, 1, 0, 0, &x_copy );
	bli_obj_create( dt_y, len_y, 1, 0, 0, &y_copy );

	bli_copyv( x,      &x_copy );
	bli_copyv( y_orig, &y_copy );

	// Views of the leading len_top elements of each vector.
	bli_acquire_vpart_f2b( BLIS_SUBPART1, 0, len_top, &x_copy, &x_copy_top );
	bli_acquire_vpart_f2b( BLIS_SUBPART1, 0, len_top, &y_copy, &y_copy_top );
	bli_acquire_vpart_f2b( BLIS_SUBPART1, 0, len_top, y,       &y_top );

	// Build z in y_copy_top: scale x by conja(kappa), scale y_orig by beta,
	// then accumulate alpha times the scaled x.
	bli_scalv( &kappa_applied, &x_copy_top );
	bli_scalv( beta, &y_copy_top );
	bli_axpyv( alpha, &x_copy_top, &y_copy_top );

	// Difference between the computed y and z, and its norm.
	bli_subv( &y_copy_top, &y_top );
	bli_normfv( &y_top, &resid_norm );
	bli_getsc( &resid_norm, resid, &imag_ignored );

	bli_obj_free( &x_copy );
	bli_obj_free( &y_copy );
}
// Object-based front-end for hemv. It copy-casts the scalars, selects a
// hemv control tree based on the strides and storage of the operands, and
// then calls bli_hemv_int() with conjh set to BLIS_CONJUGATE so that the
// Hermitian (rather than symmetric) algorithms are used.
//
//   alpha - scalar object.
//   a     - matrix object.
//   x     - input vector object.
//   beta  - scalar object.
//   y     - vector object passed to the back-end as the output operand.
void bli_hemv( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y )
{
	hemv_t* tree;
	num_t   a_dt;
	num_t   x_dt;
	num_t   y_dt;
	bool_t  a_is_unit;
	bool_t  x_is_unit;
	bool_t  y_is_unit;
	obj_t   alpha_cast;
	obj_t   beta_cast;
	num_t   alpha_dt;
	num_t   beta_dt;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
	{
		bli_hemv_check( alpha, a, x, beta, y );
	}

	// Target datatypes of the operands.
	a_dt = bli_obj_target_datatype( *a );
	x_dt = bli_obj_target_datatype( *x );
	y_dt = bli_obj_target_datatype( *y );

	// Which operands have unit stride? For the matrix, that means row or
	// column storage.
	a_is_unit = ( bli_obj_is_row_stored( *a ) ||
	              bli_obj_is_col_stored( *a ) );
	x_is_unit = ( bli_obj_vector_inc( *x ) == 1 );
	y_is_unit = ( bli_obj_vector_inc( *y ) == 1 );

	// alpha is copy-cast to the type union of the target datatypes of a
	// and x, which avoids needless loss of information in the computation.
	alpha_dt = bli_datatype_union( a_dt, x_dt );
	bli_obj_scalar_init_detached_copy_of( alpha_dt,
	                                      BLIS_NO_CONJUGATE,
	                                      alpha,
	                                      &alpha_cast );

	// beta is copy-cast to the datatype of y. If y is real and beta is
	// complex, keeping beta complex would be pointless because the complex
	// part of beta*y is never stored; if y is complex and beta is real,
	// beta is harmlessly promoted.
	beta_dt = y_dt;
	bli_obj_scalar_init_detached_copy_of( beta_dt,
	                                      BLIS_NO_CONJUGATE,
	                                      beta,
	                                      &beta_cast );

	if ( a_is_unit && x_is_unit && y_is_unit )
	{
		// Everything has unit stride: pick a tree that goes straight to the
		// unblocked implementation. Two trees cover the four uplo/storage
		// combinations, because row-stored lower and column-stored upper
		// share one tree, and the other two cases share the other.
		if ( bli_obj_is_lower( *a ) )
		{
			tree = bli_obj_is_row_stored( *a ) ? hemv_cntl_bs_ke_lrow_ucol
			                                   : hemv_cntl_bs_ke_lcol_urow;
		}
		else // upper
		{
			tree = bli_obj_is_row_stored( *a ) ? hemv_cntl_bs_ke_lcol_urow
			                                   : hemv_cntl_bs_ke_lrow_ucol;
		}
	}
	else
	{
		// Operands that do have unit stride are flagged as already packed,
		// so the blocked algorithm does not pack them needlessly.
		if ( a_is_unit ) bli_obj_set_pack_schema( BLIS_PACKED_UNSPEC, *a );
		if ( x_is_unit ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, *x );
		if ( y_is_unit ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, *y );

		// Same pairing of cases as above, except that the decision is based
		// on storage tilt and the chosen trees perform blocking.
		if ( bli_obj_is_lower( *a ) )
		{
			tree = bli_obj_is_row_tilted( *a ) ? hemv_cntl_ge_lrow_ucol
			                                   : hemv_cntl_ge_lcol_urow;
		}
		else // upper
		{
			tree = bli_obj_is_row_tilted( *a ) ? hemv_cntl_ge_lcol_urow
			                                   : hemv_cntl_ge_lrow_ucol;
		}
	}

	// Hand off to the internal back-end with the copy-cast scalars and the
	// chosen tree. BLIS_CONJUGATE selects the Hermitian algorithms.
	bli_hemv_int( BLIS_CONJUGATE,
	              &alpha_cast,
	              a,
	              x,
	              &beta_cast,
	              y,
	              tree );
}
// Compute the residual used by the test suite to validate her2k.
//
// Pre-conditions:
// - a is randomized.
// - b is randomized.
// - c_orig is randomized and Hermitian.
// Note:
// - alpha should carry a non-zero imaginary part in the complex cases so
//   that the implementation is exercised more fully.
// - beta must be real-valued.
//
// Given those inputs, the operation
//
//   C := beta * C_orig + alpha * transa(A) * transb(B)^H
//                      + conj(alpha) * transb(B) * transa(A)^H
//
// is considered correct when
//
//   normf( v - z )
//
// is negligible, where t is a random vector and
//
//   v  = C * t
//   w1 = transb(B)^H * t
//   w2 = transa(A)^H * t
//   z  = beta * C_orig * t + alpha * transa(A) * w1
//                          + conj(alpha) * transb(B) * w2
//
// The real part of the norm is stored to *resid.
void libblis_test_her2k_check
     (
       test_params_t* params,
       obj_t*         alpha,
       obj_t*         a,
       obj_t*         b,
       obj_t*         beta,
       obj_t*         c,
       obj_t*         c_orig,
       double*        resid
     )
{
	num_t  dt_c      = bli_obj_datatype( *c );
	num_t  dt_c_real = bli_obj_datatype_proj_to_real( *c );

	dim_t  dim_m     = bli_obj_length( *c );
	dim_t  dim_k     = bli_obj_width_after_trans( *a );

	obj_t  alpha_bar, a_h, b_h;
	obj_t  resid_norm;
	obj_t  t, v, w1, w2, z;

	double imag_ignored;

	// Conjugate-transposed aliases of a and b.
	bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, *a, a_h );
	bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, *b, b_h );

	bli_obj_scalar_init_detached( dt_c_real, &resid_norm );
	bli_obj_scalar_init_detached_copy_of( dt_c, BLIS_CONJUGATE, alpha, &alpha_bar );

	// Work vectors: t, v and z have length m; w1 and w2 have length k.
	bli_obj_create( dt_c, dim_m, 1, 0, 0, &t );
	bli_obj_create( dt_c, dim_m, 1, 0, 0, &v );
	bli_obj_create( dt_c, dim_k, 1, 0, 0, &w1 );
	bli_obj_create( dt_c, dim_k, 1, 0, 0, &w2 );
	bli_obj_create( dt_c, dim_m, 1, 0, 0, &z );

	libblis_test_vobj_randomize( params, TRUE, &t );

	// v := C * t
	bli_hemv( &BLIS_ONE, c, &t, &BLIS_ZERO, &v );

	// w2 := A^H * t,  w1 := B^H * t
	bli_gemv( &BLIS_ONE, &a_h, &t, &BLIS_ZERO, &w2 );
	bli_gemv( &BLIS_ONE, &b_h, &t, &BLIS_ZERO, &w1 );

	// z := alpha * A * w1 + conj(alpha) * B * w2 + beta * C_orig * t
	bli_gemv( alpha, a, &w1, &BLIS_ZERO, &z );
	bli_gemv( &alpha_bar, b, &w2, &BLIS_ONE, &z );
	bli_hemv( beta, c_orig, &t, &BLIS_ONE, &z );

	// Norm of the difference between v and z.
	bli_subv( &z, &v );
	bli_normfv( &v, &resid_norm );
	bli_getsc( &resid_norm, resid, &imag_ignored );

	bli_obj_free( &t );
	bli_obj_free( &v );
	bli_obj_free( &w1 );
	bli_obj_free( &w2 );
	bli_obj_free( &z );
}
// Front-end for trmv. It copy-casts alpha, selects a trmv control tree
// based on the strides, storage and transposition of the operands, and
// then calls bli_trmv_int().
//
//   alpha - scalar object.
//   a     - matrix object.
//   x     - vector object.
//   cntx  - context, passed through to bli_trmv_int().
void bli_trmv_front
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  x,
       cntx_t* cntx
     )
{
	trmv_t* tree;
	num_t   a_dt;
	num_t   x_dt;
	bool_t  a_is_unit;
	bool_t  x_is_unit;
	obj_t   alpha_cast;
	num_t   alpha_dt;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
	{
		bli_trmv_check( alpha, a, x );
	}

	// Target datatypes of the operands.
	a_dt = bli_obj_target_datatype( *a );
	x_dt = bli_obj_target_datatype( *x );

	// Which operands have unit stride? For the matrix, that means row or
	// column storage.
	a_is_unit = ( bli_obj_is_row_stored( *a ) ||
	              bli_obj_is_col_stored( *a ) );
	x_is_unit = ( bli_obj_vector_inc( *x ) == 1 );

	// alpha is copy-cast to the type union of the target datatypes of a
	// and x, which avoids needless loss of information in the computation.
	alpha_dt = bli_datatype_union( a_dt, x_dt );
	bli_obj_scalar_init_detached_copy_of( alpha_dt,
	                                      BLIS_NO_CONJUGATE,
	                                      alpha,
	                                      &alpha_cast );

	if ( a_is_unit && x_is_unit )
	{
		// Everything has unit stride: pick a tree that goes straight to the
		// unblocked implementation. Two trees cover the four trans/storage
		// combinations, because row-stored without transpose and
		// column-stored with transpose share one tree, and the other two
		// cases share the other.
		if ( bli_obj_has_notrans( *a ) )
		{
			tree = bli_obj_is_row_stored( *a ) ? trmv_cntl_bs_ke_nrow_tcol
			                                   : trmv_cntl_bs_ke_ncol_trow;
		}
		else // transposed
		{
			tree = bli_obj_is_row_stored( *a ) ? trmv_cntl_bs_ke_ncol_trow
			                                   : trmv_cntl_bs_ke_nrow_tcol;
		}
	}
	else
	{
		// Operands that do have unit stride are flagged as already packed,
		// so the blocked algorithm does not pack them needlessly.
		if ( a_is_unit ) bli_obj_set_pack_schema( BLIS_PACKED_UNSPEC, *a );
		if ( x_is_unit ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, *x );

		// Same pairing of cases as above, except that the decision is based
		// on storage tilt and the chosen trees perform blocking.
		if ( bli_obj_has_notrans( *a ) )
		{
			tree = bli_obj_is_row_tilted( *a ) ? trmv_cntl_ge_nrow_tcol
			                                   : trmv_cntl_ge_ncol_trow;
		}
		else // transposed
		{
			tree = bli_obj_is_row_tilted( *a ) ? trmv_cntl_ge_ncol_trow
			                                   : trmv_cntl_ge_nrow_tcol;
		}
	}

	// Hand off to the internal back-end with the copy-cast of alpha and
	// the chosen tree.
	bli_trmv_int( &alpha_cast,
	              a,
	              x,
	              cntx,
	              tree );
}
void bli_her2_int( conj_t conjh, obj_t* alpha, obj_t* alpha_conj, obj_t* x, obj_t* y, obj_t* c, cntx_t* cntx, her2_t* cntl ) { varnum_t n; impl_t i; FUNCPTR_T f; obj_t alpha_local; obj_t alpha_conj_local; obj_t x_local; obj_t y_local; obj_t c_local; // Check parameters. if ( bli_error_checking_is_enabled() ) { if ( bli_is_conj( conjh ) ) bli_her2_check( alpha, x, y, c ); else bli_syr2_check( alpha, x, y, c ); } // If C, x, or y has a zero dimension, return early. if ( bli_obj_has_zero_dim( c ) ) return; if ( bli_obj_has_zero_dim( x ) ) return; if ( bli_obj_has_zero_dim( y ) ) return; // Alias the operands in case we need to apply conjugations. bli_obj_alias_to( x, &x_local ); bli_obj_alias_to( y, &y_local ); bli_obj_alias_to( c, &c_local ); // If matrix C is marked for conjugation, we interpret this as a request // to apply a conjugation to the other operands. if ( bli_obj_has_conj( &c_local ) ) { bli_obj_toggle_conj( &c_local ); bli_obj_toggle_conj( &x_local ); bli_obj_toggle_conj( &y_local ); bli_obj_scalar_init_detached_copy_of( bli_obj_dt( alpha ), BLIS_CONJUGATE, alpha, &alpha_local ); bli_obj_scalar_init_detached_copy_of( bli_obj_dt( alpha_conj ), BLIS_CONJUGATE, alpha_conj, &alpha_conj_local ); } else { bli_obj_alias_to( *alpha, alpha_local ); bli_obj_alias_to( *alpha_conj, alpha_conj_local ); } // Extract the variant number and implementation type. n = bli_cntl_var_num( cntl ); i = bli_cntl_impl_type( cntl ); // Index into the variant array to extract the correct function pointer. f = vars[n][i]; // Invoke the variant. f( conjh, &alpha_local, &alpha_conj_local, &x_local, &y_local, &c_local, cntx, cntl ); }
void bli_her2k_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ) { bli_init_once(); obj_t alpha_conj; obj_t c_local; obj_t a_local; obj_t bh_local; obj_t b_local; obj_t ah_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_her2k_check( alpha, a, b, beta, c, cntx ); // If alpha is zero, scale by beta, zero the imaginary components of // the diagonal elements, and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); bli_setid( &BLIS_ZERO, c ); return; } // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( a, &a_local ); bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); bli_obj_set_as_root( &c_local ); // For her2k, the first and second right-hand "B" operands are simply B' // and A'. bli_obj_alias_to( b, &bh_local ); bli_obj_induce_trans( &bh_local ); bli_obj_toggle_conj( &bh_local ); bli_obj_alias_to( a, &ah_local ); bli_obj_induce_trans( &ah_local ); bli_obj_toggle_conj( &ah_local ); // Initialize a conjugated copy of alpha. bli_obj_scalar_init_detached_copy_of( bli_obj_dt( a ), BLIS_CONJUGATE, alpha, &alpha_conj ); // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_swap( &a_local, &bh_local ); bli_obj_swap( &b_local, &ah_local ); bli_obj_induce_trans( &a_local ); bli_obj_induce_trans( &bh_local ); bli_obj_induce_trans( &b_local ); bli_obj_induce_trans( &ah_local ); bli_obj_induce_trans( &c_local ); } // Record the threading for each level within the context. 
bli_cntx_set_thrloop_from_env( BLIS_HER2K, BLIS_LEFT, cntx, bli_obj_length( &c_local ), bli_obj_width( &c_local ), bli_obj_width( &a_local ) ); // Invoke herk twice, using beta only the first time. // Invoke the internal back-end. bli_l3_thread_decorator ( bli_gemm_int, BLIS_HERK, // operation family id alpha, &a_local, &bh_local, beta, &c_local, cntx, cntl ); bli_l3_thread_decorator ( bli_gemm_int, BLIS_HERK, // operation family id &alpha_conj, &b_local, &ah_local, &BLIS_ONE, &c_local, cntx, cntl ); // The Hermitian rank-2k product was computed as A*B'+B*A', even for // the diagonal elements. Mathematically, the imaginary components of // diagonal elements of a Hermitian rank-2k product should always be // zero. However, in practice, they sometimes accumulate meaningless // non-zero values. To prevent this, we explicitly set those values // to zero before returning. bli_setid( &BLIS_ZERO, &c_local ); }
// // Define object-based interface. // void bli_dotxf( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y ) { num_t dt = bli_obj_datatype( *x ); conj_t conja = bli_obj_conj_status( *a ); conj_t conjx = bli_obj_conj_status( *x ); dim_t m = bli_obj_vector_dim( *y ); dim_t b_n = bli_obj_vector_dim( *x ); void* buf_a = bli_obj_buffer_at_off( *a ); inc_t rs_a = bli_obj_row_stride( *a ); inc_t cs_a = bli_obj_col_stride( *a ); void* buf_x = bli_obj_buffer_at_off( *x ); inc_t inc_x = bli_obj_vector_inc( *x ); void* buf_y = bli_obj_buffer_at_off( *y ); inc_t inc_y = bli_obj_vector_inc( *y ); obj_t alpha_local; void* buf_alpha; obj_t beta_local; void* buf_beta; FUNCPTR_T f = ftypes[dt]; if ( bli_error_checking_is_enabled() ) bli_dotxf_check( alpha, a, x, beta, y ); // Create local copy-casts of the scalars (and apply internal conjugation // if needed). bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, alpha, &alpha_local ); bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, beta, &beta_local ); // Extract the scalar buffers. buf_alpha = bli_obj_buffer_for_1x1( dt, alpha_local ); buf_beta = bli_obj_buffer_for_1x1( dt, beta_local ); // Support cases where matrix A requires a transposition. if ( bli_obj_has_trans( *a ) ) { bli_swap_incs( rs_a, cs_a ); } // Invoke the void pointer-based function. f( conja, conjx, m, b_n, buf_alpha, buf_a, rs_a, cs_a, buf_x, inc_x, buf_beta, buf_y, inc_y ); }
void bli_ger_int( conj_t conjx, conj_t conjy, obj_t* alpha, obj_t* x, obj_t* y, obj_t* a, cntx_t* cntx, ger_t* cntl ) { varnum_t n; impl_t i; FUNCPTR_T f; obj_t alpha_local; obj_t x_local; obj_t y_local; obj_t a_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_ger_check( alpha, x, y, a ); // If A has a zero dimension, return early. if ( bli_obj_has_zero_dim( a ) ) return; // If x or y has a zero dimension, return early. if ( bli_obj_has_zero_dim( x ) || bli_obj_has_zero_dim( y ) ) return; // Alias the objects, applying conjx and conjy to x and y, respectively. bli_obj_alias_with_conj( conjx, x, &x_local ); bli_obj_alias_with_conj( conjy, y, &y_local ); bli_obj_alias_to( a, &a_local ); // If matrix A is marked for conjugation, we interpret this as a request // to apply a conjugation to the other operands. if ( bli_obj_has_conj( &a_local ) ) { bli_obj_toggle_conj( &a_local ); bli_obj_toggle_conj( &x_local ); bli_obj_toggle_conj( &y_local ); bli_obj_scalar_init_detached_copy_of( bli_obj_dt( alpha ), BLIS_CONJUGATE, alpha, &alpha_local ); } else { bli_obj_alias_to( *alpha, alpha_local ); } // If we are about the call a leaf-level implementation, and matrix A // still needs a transposition, then we must induce one by swapping the // strides and dimensions. if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( &a_local ) ) { bli_obj_induce_trans( &a_local ); bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local ); } // Extract the variant number and implementation type. n = bli_cntl_var_num( cntl ); i = bli_cntl_impl_type( cntl ); // Index into the variant array to extract the correct function pointer. f = vars[n][i]; // Invoke the variant. f( &alpha_local, &x_local, &y_local, &a_local, cntx, cntl ); }