void bli_her2k_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ) { bli_init_once(); obj_t alpha_conj; obj_t c_local; obj_t a_local; obj_t bh_local; obj_t b_local; obj_t ah_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_her2k_check( alpha, a, b, beta, c, cntx ); // If alpha is zero, scale by beta, zero the imaginary components of // the diagonal elements, and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); bli_setid( &BLIS_ZERO, c ); return; } // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( a, &a_local ); bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); bli_obj_set_as_root( &c_local ); // For her2k, the first and second right-hand "B" operands are simply B' // and A'. bli_obj_alias_to( b, &bh_local ); bli_obj_induce_trans( &bh_local ); bli_obj_toggle_conj( &bh_local ); bli_obj_alias_to( a, &ah_local ); bli_obj_induce_trans( &ah_local ); bli_obj_toggle_conj( &ah_local ); // Initialize a conjugated copy of alpha. bli_obj_scalar_init_detached_copy_of( bli_obj_dt( a ), BLIS_CONJUGATE, alpha, &alpha_conj ); // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_swap( &a_local, &bh_local ); bli_obj_swap( &b_local, &ah_local ); bli_obj_induce_trans( &a_local ); bli_obj_induce_trans( &bh_local ); bli_obj_induce_trans( &b_local ); bli_obj_induce_trans( &ah_local ); bli_obj_induce_trans( &c_local ); } // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_HER2K, BLIS_LEFT, cntx, bli_obj_length( &c_local ), bli_obj_width( &c_local ), bli_obj_width( &a_local ) ); // Invoke herk twice, using beta only the first time. // Invoke the internal back-end. bli_l3_thread_decorator ( bli_gemm_int, BLIS_HERK, // operation family id alpha, &a_local, &bh_local, beta, &c_local, cntx, cntl ); bli_l3_thread_decorator ( bli_gemm_int, BLIS_HERK, // operation family id &alpha_conj, &b_local, &ah_local, &BLIS_ONE, &c_local, cntx, cntl ); // The Hermitian rank-2k product was computed as A*B'+B*A', even for // the diagonal elements. Mathematically, the imaginary components of // diagonal elements of a Hermitian rank-2k product should always be // zero. However, in practice, they sometimes accumulate meaningless // non-zero values. To prevent this, we explicitly set those values // to zero before returning. bli_setid( &BLIS_ZERO, &c_local ); }
void libblis_test_trsm_check ( test_params_t* params, side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* b_orig, double* resid ) { num_t dt = bli_obj_dt( b ); num_t dt_real = bli_obj_dt_proj_to_real( b ); dim_t m = bli_obj_length( b ); dim_t n = bli_obj_width( b ); obj_t norm; obj_t t, v, w, z; double junk; // // Pre-conditions: // - a is randomized and triangular. // - b_orig is randomized. // Note: // - alpha should have a non-zero imaginary component in the // complex cases in order to more fully exercise the implementation. // // Under these conditions, we assume that the implementation for // // B := alpha * inv(transa(A)) * B_orig (side = left) // B := alpha * B_orig * inv(transa(A)) (side = right) // // is functioning correctly if // // normf( v - z ) // // is negligible, where // // v = B * t // // z = ( alpha * inv(transa(A)) * B ) * t (side = left) // = alpha * inv(transa(A)) * B * t // = alpha * inv(transa(A)) * w // // z = ( alpha * B * inv(transa(A)) ) * t (side = right) // = alpha * B * tinv(ransa(A)) * t // = alpha * B * w bli_obj_scalar_init_detached( dt_real, &norm ); if ( bli_is_left( side ) ) { bli_obj_create( dt, n, 1, 0, 0, &t ); bli_obj_create( dt, m, 1, 0, 0, &v ); bli_obj_create( dt, m, 1, 0, 0, &w ); bli_obj_create( dt, m, 1, 0, 0, &z ); } else // else if ( bli_is_left( side ) ) { bli_obj_create( dt, n, 1, 0, 0, &t ); bli_obj_create( dt, m, 1, 0, 0, &v ); bli_obj_create( dt, n, 1, 0, 0, &w ); bli_obj_create( dt, m, 1, 0, 0, &z ); } libblis_test_vobj_randomize( params, TRUE, &t ); bli_gemv( &BLIS_ONE, b, &t, &BLIS_ZERO, &v ); if ( bli_is_left( side ) ) { bli_gemv( alpha, b_orig, &t, &BLIS_ZERO, &w ); bli_trsv( &BLIS_ONE, a, &w ); bli_copyv( &w, &z ); } else { bli_copyv( &t, &w ); bli_trsv( &BLIS_ONE, a, &w ); bli_gemv( alpha, b_orig, &w, &BLIS_ZERO, &z ); } bli_subv( &z, &v ); bli_normfv( &v, &norm ); bli_getsc( &norm, resid, &junk ); bli_obj_free( &t ); bli_obj_free( &v ); bli_obj_free( &w ); bli_obj_free( &z ); }
void bli_ger_int( conj_t conjx, conj_t conjy, obj_t* alpha, obj_t* x, obj_t* y, obj_t* a, cntx_t* cntx, ger_t* cntl ) { varnum_t n; impl_t i; FUNCPTR_T f; obj_t alpha_local; obj_t x_local; obj_t y_local; obj_t a_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_ger_check( alpha, x, y, a ); // If A has a zero dimension, return early. if ( bli_obj_has_zero_dim( a ) ) return; // If x or y has a zero dimension, return early. if ( bli_obj_has_zero_dim( x ) || bli_obj_has_zero_dim( y ) ) return; // Alias the objects, applying conjx and conjy to x and y, respectively. bli_obj_alias_with_conj( conjx, x, &x_local ); bli_obj_alias_with_conj( conjy, y, &y_local ); bli_obj_alias_to( a, &a_local ); // If matrix A is marked for conjugation, we interpret this as a request // to apply a conjugation to the other operands. if ( bli_obj_has_conj( &a_local ) ) { bli_obj_toggle_conj( &a_local ); bli_obj_toggle_conj( &x_local ); bli_obj_toggle_conj( &y_local ); bli_obj_scalar_init_detached_copy_of( bli_obj_dt( alpha ), BLIS_CONJUGATE, alpha, &alpha_local ); } else { bli_obj_alias_to( *alpha, alpha_local ); } // If we are about the call a leaf-level implementation, and matrix A // still needs a transposition, then we must induce one by swapping the // strides and dimensions. if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( &a_local ) ) { bli_obj_induce_trans( &a_local ); bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local ); } // Extract the variant number and implementation type. n = bli_cntl_var_num( cntl ); i = bli_cntl_impl_type( cntl ); // Index into the variant array to extract the correct function pointer. f = vars[n][i]; // Invoke the variant. f( &alpha_local, &x_local, &y_local, &a_local, cntx, cntl ); }
void libblis_test_setv_check ( test_params_t* params, obj_t* beta, obj_t* x, double* resid ) { num_t dt_x = bli_obj_dt( x ); dim_t m_x = bli_obj_vector_dim( x ); inc_t inc_x = bli_obj_vector_inc( x ); void* buf_x = bli_obj_buffer_at_off( x ); void* buf_beta = bli_obj_buffer_for_1x1( dt_x, beta ); dim_t i; *resid = 0.0; // // The easiest way to check that setv was successful is to confirm // that each element of x is equal to beta. // if ( bli_obj_is_float( x ) ) { float* chi1 = buf_x; float* beta_cast = buf_beta; for ( i = 0; i < m_x; ++i ) { if ( !bli_seq( *chi1, *beta_cast ) ) { *resid = 1.0; return; } chi1 += inc_x; } } else if ( bli_obj_is_double( x ) ) { double* chi1 = buf_x; double* beta_cast = buf_beta; for ( i = 0; i < m_x; ++i ) { if ( !bli_deq( *chi1, *beta_cast ) ) { *resid = 1.0; return; } chi1 += inc_x; } } else if ( bli_obj_is_scomplex( x ) ) { scomplex* chi1 = buf_x; scomplex* beta_cast = buf_beta; for ( i = 0; i < m_x; ++i ) { if ( !bli_ceq( *chi1, *beta_cast ) ) { *resid = 1.0; return; } chi1 += inc_x; } } else // if ( bli_obj_is_dcomplex( x ) ) { dcomplex* chi1 = buf_x; dcomplex* beta_cast = buf_beta; for ( i = 0; i < m_x; ++i ) { if ( !bli_zeq( *chi1, *beta_cast ) ) { *resid = 1.0; return; } chi1 += inc_x; } } }
void libblis_test_syr_check ( test_params_t* params, obj_t* alpha, obj_t* x, obj_t* a, obj_t* a_orig, double* resid ) { num_t dt = bli_obj_dt( a ); num_t dt_real = bli_obj_dt_proj_to_real( a ); dim_t m_a = bli_obj_length( a ); obj_t xt, t, v, w; obj_t rho, norm; double junk; // // Pre-conditions: // - x is randomized. // - a is randomized and symmetric. // Note: // - alpha should have a non-zero imaginary component in the // complex cases in order to more fully exercise the implementation. // // Under these conditions, we assume that the implementation for // // A := A_orig + alpha * conjx(x) * conjx(x)^T // // is functioning correctly if // // normf( v - w ) // // is negligible, where // // v = A * t // w = ( A_orig + alpha * conjx(x) * conjx(x)^T ) * t // = A_orig * t + alpha * conjx(x) * conjx(x)^T * t // = A_orig * t + alpha * conjx(x) * rho // = A_orig * t + w // bli_mksymm( a ); bli_mksymm( a_orig ); bli_obj_set_struc( BLIS_GENERAL, a ); bli_obj_set_struc( BLIS_GENERAL, a_orig ); bli_obj_set_uplo( BLIS_DENSE, a ); bli_obj_set_uplo( BLIS_DENSE, a_orig ); bli_obj_scalar_init_detached( dt, &rho ); bli_obj_scalar_init_detached( dt_real, &norm ); bli_obj_create( dt, m_a, 1, 0, 0, &t ); bli_obj_create( dt, m_a, 1, 0, 0, &v ); bli_obj_create( dt, m_a, 1, 0, 0, &w ); bli_obj_alias_to( x, &xt ); libblis_test_vobj_randomize( params, TRUE, &t ); bli_gemv( &BLIS_ONE, a, &t, &BLIS_ZERO, &v ); bli_dotv( &xt, &t, &rho ); bli_mulsc( alpha, &rho ); bli_scal2v( &rho, x, &w ); bli_gemv( &BLIS_ONE, a_orig, &t, &BLIS_ONE, &w ); bli_subv( &w, &v ); bli_normfv( &v, &norm ); bli_getsc( &norm, resid, &junk ); bli_obj_free( &t ); bli_obj_free( &v ); bli_obj_free( &w ); }
void libblis_test_gemv_check ( test_params_t* params, obj_t* kappa, obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y, obj_t* y_orig, double* resid ) { num_t dt = bli_obj_dt( y ); num_t dt_real = bli_obj_dt_proj_to_real( y ); conj_t conja = bli_obj_conj_status( a ); dim_t n_x = bli_obj_vector_dim( x ); dim_t m_y = bli_obj_vector_dim( y ); dim_t min_m_n = bli_min( m_y, n_x ); obj_t x_temp, y_temp; obj_t kappac, norm; obj_t xT_temp, yT_temp, yT; double junk; // // Pre-conditions: // - a is initialized to kappa along the diagonal. // - x is randomized. // - y_orig is randomized. // Note: // - alpha, beta, and kappa should have non-zero imaginary components in // the complex cases in order to more fully exercise the implementation. // // Under these conditions, we assume that the implementation for // // y := beta * y_orig + alpha * transa(A) * conjx(x) // // is functioning correctly if // // normf( y - z ) // // is negligible, where // // z = beta * y_orig + alpha * conja(kappa) * x // bli_obj_scalar_init_detached_copy_of( dt, conja, kappa, &kappac ); bli_obj_scalar_init_detached( dt_real, &norm ); bli_obj_create( dt, n_x, 1, 0, 0, &x_temp ); bli_obj_create( dt, m_y, 1, 0, 0, &y_temp ); bli_copyv( x, &x_temp ); bli_copyv( y_orig, &y_temp ); bli_acquire_vpart_f2b( BLIS_SUBPART1, 0, min_m_n, &x_temp, &xT_temp ); bli_acquire_vpart_f2b( BLIS_SUBPART1, 0, min_m_n, &y_temp, &yT_temp ); bli_acquire_vpart_f2b( BLIS_SUBPART1, 0, min_m_n, y, &yT ); bli_scalv( &kappac, &xT_temp ); bli_scalv( beta, &yT_temp ); bli_axpyv( alpha, &xT_temp, &yT_temp ); bli_subv( &yT_temp, &yT ); bli_normfv( &yT, &norm ); bli_getsc( &norm, resid, &junk ); bli_obj_free( &x_temp ); bli_obj_free( &y_temp ); }
void bli_packm_unb_var1 ( obj_t* c, obj_t* p, cntx_t* cntx, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_cp = bli_obj_dt( c ); struc_t strucc = bli_obj_struc( c ); doff_t diagoffc = bli_obj_diag_offset( c ); diag_t diagc = bli_obj_diag( c ); uplo_t uploc = bli_obj_uplo( c ); trans_t transc = bli_obj_conjtrans_status( c ); dim_t m_p = bli_obj_length( p ); dim_t n_p = bli_obj_width( p ); dim_t m_max_p = bli_obj_padded_length( p ); dim_t n_max_p = bli_obj_padded_width( p ); void* buf_c = bli_obj_buffer_at_off( c ); inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); void* buf_p = bli_obj_buffer_at_off( p ); inc_t rs_p = bli_obj_row_stride( p ); inc_t cs_p = bli_obj_col_stride( p ); void* buf_kappa; FUNCPTR_T f; // This variant assumes that the computational kernel will always apply // the alpha scalar of the higher-level operation. Thus, we use BLIS_ONE // for kappa so that the underlying packm implementation does not scale // during packing. buf_kappa = bli_obj_buffer_for_const( dt_cp, &BLIS_ONE ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_cp]; if( bli_thread_am_ochief( thread ) ) { // Invoke the function. f ( strucc, diagoffc, diagc, uploc, transc, m_p, n_p, m_max_p, n_max_p, buf_kappa, buf_c, rs_c, cs_c, buf_p, rs_p, cs_p, cntx ); } }