FLA_Error FLA_Tevd_eigval_v_opt_var1( FLA_Obj G, FLA_Obj d, FLA_Obj e, FLA_Obj k ) { FLA_Datatype datatype; int m_A, n_G; int rs_G, cs_G; int inc_d; int inc_e; datatype = FLA_Obj_datatype( d ); m_A = FLA_Obj_vector_dim( d ); n_G = FLA_Obj_width( G ); rs_G = FLA_Obj_row_stride( G ); cs_G = FLA_Obj_col_stride( G ); inc_d = FLA_Obj_vector_inc( d ); inc_e = FLA_Obj_vector_inc( e ); switch ( datatype ) { case FLA_FLOAT: { scomplex* buff_G = FLA_COMPLEX_PTR( G ); float* buff_d = FLA_FLOAT_PTR( d ); float* buff_e = FLA_FLOAT_PTR( e ); int* buff_k = FLA_INT_PTR( k ); FLA_Tevd_eigval_v_ops_var1( m_A, n_G, buff_G, rs_G, cs_G, buff_d, inc_d, buff_e, inc_e, buff_k ); break; } case FLA_DOUBLE: { dcomplex* buff_G = FLA_DOUBLE_COMPLEX_PTR( G ); double* buff_d = FLA_DOUBLE_PTR( d ); double* buff_e = FLA_DOUBLE_PTR( e ); int* buff_k = FLA_INT_PTR( k ); FLA_Tevd_eigval_v_opd_var1( m_A, n_G, buff_G, rs_G, cs_G, buff_d, inc_d, buff_e, inc_e, buff_k ); break; } } return FLA_SUCCESS; }
FLA_Error FLA_Trinv_lu_ops_var2( int mn_A, float* buff_A, int rs_A, int cs_A ) { float* buff_m1 = FLA_FLOAT_PTR( FLA_MINUS_ONE ); int i; for ( i = 0; i < mn_A; ++i ) { float* a21 = buff_A + (i )*cs_A + (i+1)*rs_A; float* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A; int mn_ahead = mn_A - i - 1; /*------------------------------------------------------------*/ // FLA_Trsv_external( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_UNIT_DIAG, A22, a21 ); bl1_strsv( BLIS1_LOWER_TRIANGULAR, BLIS1_NO_TRANSPOSE, BLIS1_UNIT_DIAG, mn_ahead, A22, rs_A, cs_A, a21, rs_A ); // FLA_Scal_external( FLA_MINUS_ONE, a21 ); bl1_sscalv( BLIS1_NO_CONJUGATE, mn_ahead, buff_m1, a21, rs_A ); /*------------------------------------------------------------*/ } return FLA_SUCCESS; }
FLA_Error FLA_Obj_create_complex_constant( double const_real, double const_imag, FLA_Obj *obj ) { int* temp_i; float* temp_s; double* temp_d; scomplex* temp_c; dcomplex* temp_z; if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING ) FLA_Obj_create_complex_constant_check( const_real, const_imag, obj ); FLA_Obj_create( FLA_CONSTANT, 1, 1, 0, 0, obj ); #ifdef FLA_ENABLE_SCC if ( !FLA_is_owner() ) return FLA_SUCCESS; #endif temp_i = FLA_INT_PTR( *obj ); temp_s = FLA_FLOAT_PTR( *obj ); temp_d = FLA_DOUBLE_PTR( *obj ); temp_c = FLA_COMPLEX_PTR( *obj ); temp_z = FLA_DOUBLE_COMPLEX_PTR( *obj ); *temp_i = ( int ) const_real; *temp_s = ( float ) const_real; *temp_d = const_real; temp_c->real = ( float ) const_real; temp_c->imag = ( float ) const_imag; temp_z->real = const_real; temp_z->imag = const_imag; return FLA_SUCCESS; }
FLA_Error FLA_Chol_u_opt_var3( FLA_Obj A ) { FLA_Error r_val = FLA_SUCCESS; FLA_Datatype datatype; int mn_A; int rs_A, cs_A; datatype = FLA_Obj_datatype( A ); mn_A = FLA_Obj_length( A ); rs_A = FLA_Obj_row_stride( A ); cs_A = FLA_Obj_col_stride( A ); switch ( datatype ) { case FLA_FLOAT: { float* buff_A = FLA_FLOAT_PTR( A ); r_val = FLA_Chol_u_ops_var3( mn_A, buff_A, rs_A, cs_A ); break; } case FLA_DOUBLE: { double* buff_A = FLA_DOUBLE_PTR( A ); r_val = FLA_Chol_u_opd_var3( mn_A, buff_A, rs_A, cs_A ); break; } case FLA_COMPLEX: { scomplex* buff_A = FLA_COMPLEX_PTR( A ); r_val = FLA_Chol_u_opc_var3( mn_A, buff_A, rs_A, cs_A ); break; } case FLA_DOUBLE_COMPLEX: { dcomplex* buff_A = FLA_DOUBLE_COMPLEX_PTR( A ); r_val = FLA_Chol_u_opz_var3( mn_A, buff_A, rs_A, cs_A ); break; } } return r_val; }
FLA_Error FLA_Absolute_value( FLA_Obj alpha ) { FLA_Datatype datatype; if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING ) FLA_Absolute_value_check( alpha ); datatype = FLA_Obj_datatype( alpha ); switch ( datatype ){ case FLA_FLOAT: { float *buff_alpha = ( float * ) FLA_FLOAT_PTR( alpha ); *buff_alpha = ( float ) fabs( ( double ) *buff_alpha ); break; } case FLA_DOUBLE: { double *buff_alpha = ( double * ) FLA_DOUBLE_PTR( alpha ); *buff_alpha = fabs( *buff_alpha ); break; } case FLA_COMPLEX: { scomplex *buff_alpha = ( scomplex * ) FLA_COMPLEX_PTR( alpha ); buff_alpha->real = ( float ) sqrt( ( double ) buff_alpha->real * buff_alpha->real + buff_alpha->imag * buff_alpha->imag ); buff_alpha->imag = 0.0F; break; } case FLA_DOUBLE_COMPLEX: { dcomplex *buff_alpha = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( alpha ); buff_alpha->real = sqrt( buff_alpha->real * buff_alpha->real + buff_alpha->imag * buff_alpha->imag ); buff_alpha->imag = 0.0; break; } } return FLA_SUCCESS; }
FLA_Error FLA_Fused_Ahx_Ax_ops_var1( int m_A, int n_A, float* buff_A, int rs_A, int cs_A, float* buff_x, int inc_x, float* buff_v, int inc_v, float* buff_w, int inc_w ) { float* buff_0 = FLA_FLOAT_PTR( FLA_ZERO ); int i; bl1_ssetv( m_A, buff_0, buff_w, inc_w ); for ( i = 0; i < n_A; ++i ) { float* a1 = buff_A + (i )*cs_A + (0 )*rs_A; float* nu1 = buff_v + (i )*inc_v; float* x = buff_x; float* chi1 = buff_x + (i )*inc_x; float* w = buff_w; /*------------------------------------------------------------*/ bl1_sdot( BLIS1_CONJUGATE, m_A, a1, rs_A, x, inc_x, nu1 ); /* *nu1 = F77_sdot( &m_A, a1, &rs_A, x, &inc_x ); */ bl1_saxpyv( BLIS1_NO_CONJUGATE, m_A, chi1, a1, rs_A, w, inc_w ); /* F77_saxpy( &m_A, chi1, a1, &rs_A, w, &inc_w ); */ /*------------------------------------------------------------*/ } return FLA_SUCCESS; }
FLA_Error FLA_Trinv_lu_ops_var3( int mn_A, float* buff_A, int rs_A, int cs_A ) { float* buff_1 = FLA_FLOAT_PTR( FLA_ONE ); float* buff_m1 = FLA_FLOAT_PTR( FLA_MINUS_ONE ); int i; for ( i = 0; i < mn_A; ++i ) { float* a10t = buff_A + (0 )*cs_A + (i )*rs_A; float* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A; float* a21 = buff_A + (i )*cs_A + (i+1)*rs_A; int mn_ahead = mn_A - i - 1; int mn_behind = i; /*------------------------------------------------------------*/ // FLA_Scal_external( FLA_MINUS_ONE, a21 ); bl1_sscalv( BLIS1_NO_CONJUGATE, mn_ahead, buff_m1, a21, rs_A ); // FLA_Ger_external( FLA_ONE, a21, a10t, A20 ); bl1_sger( BLIS1_NO_CONJUGATE, BLIS1_NO_CONJUGATE, mn_ahead, mn_behind, buff_1, a21, rs_A, a10t, cs_A, A20, rs_A, cs_A ); /*------------------------------------------------------------*/ } return FLA_SUCCESS; }
FLA_Error FLA_Chol_u_opc_var3( int mn_A, scomplex* buff_A, int rs_A, int cs_A ) { float* buff_m1 = FLA_FLOAT_PTR( FLA_MINUS_ONE ); int i; FLA_Error e_val; for ( i = 0; i < mn_A; ++i ) { scomplex* alpha11 = buff_A + (i )*cs_A + (i )*rs_A; scomplex* a12t = buff_A + (i+1)*cs_A + (i )*rs_A; scomplex* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A; int mn_ahead = mn_A - i - 1; int mn_behind = i; /*------------------------------------------------------------*/ // r_val = FLA_Sqrt( alpha11 ); // if ( r_val != FLA_SUCCESS ) // return ( FLA_Obj_length( A00 ) + 1 ); bl1_csqrte( alpha11, &e_val ); if ( e_val != FLA_SUCCESS ) return mn_behind; // FLA_Inv_scal_external( alpha11, a12t ); bl1_cinvscalv( BLIS1_NO_CONJUGATE, mn_ahead, alpha11, a12t, cs_A ); // FLA_Herc_external( FLA_UPPER_TRIANGULAR, FLA_MINUS_ONE, a12t, A22 ); bl1_cher( BLIS1_UPPER_TRIANGULAR, BLIS1_CONJUGATE, mn_ahead, buff_m1, a12t, cs_A, A22, rs_A, cs_A ); /*------------------------------------------------------------*/ } return FLA_SUCCESS; }
FLA_Error FLA_Ttmm_l_ops_var3( int mn_A, float* buff_A, int rs_A, int cs_A ) { float* buff_1 = FLA_FLOAT_PTR( FLA_ONE ); int i; for ( i = 0; i < mn_A; ++i ) { float* alpha11 = buff_A + (i )*cs_A + (i )*rs_A; float* a21 = buff_A + (i )*cs_A + (i+1)*rs_A; float* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A; int mn_ahead = mn_A - i - 1; /*------------------------------------------------------------*/ // FLA_Absolute_square( alpha11 ); bl1_sabsqr( alpha11 ); // FLA_Dotcs_external( FLA_CONJUGATE, FLA_ONE, a21, a21, FLA_ONE, alpha11 ); bl1_sdots( BLIS1_CONJUGATE, mn_ahead, buff_1, a21, rs_A, a21, rs_A, buff_1, alpha11 ); // FLA_Trmv_external( FLA_LOWER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, A22, a21 ); bl1_strmv( BLIS1_LOWER_TRIANGULAR, BLIS1_CONJ_TRANSPOSE, BLIS1_NONUNIT_DIAG, mn_ahead, A22, rs_A, cs_A, a21, rs_A ); /*------------------------------------------------------------*/ } return FLA_SUCCESS; }
FLA_Error FLA_Copy_external( FLA_Obj A, FLA_Obj B ) { FLA_Datatype dt_A; FLA_Datatype dt_B; int m_B, n_B; int rs_A, cs_A; int rs_B, cs_B; trans_t blis_trans; if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING ) FLA_Copy_check( A, B ); if ( FLA_Obj_has_zero_dim( A ) ) return FLA_SUCCESS; dt_A = FLA_Obj_datatype( A ); dt_B = FLA_Obj_datatype( B ); rs_A = FLA_Obj_row_stride( A ); cs_A = FLA_Obj_col_stride( A ); m_B = FLA_Obj_length( B ); n_B = FLA_Obj_width( B ); rs_B = FLA_Obj_row_stride( B ); cs_B = FLA_Obj_col_stride( B ); if ( FLA_Obj_is_conformal_to( FLA_NO_TRANSPOSE, A, B ) ) FLA_Param_map_flame_to_blis_trans( FLA_NO_TRANSPOSE, &blis_trans ); else // if ( FLA_Obj_is_conformal_to( FLA_TRANSPOSE, A, B ) ) FLA_Param_map_flame_to_blis_trans( FLA_TRANSPOSE, &blis_trans ); // If A is of type FLA_CONSTANT, then we have to proceed based on the // datatype of B. if ( dt_A == FLA_CONSTANT ) { if ( dt_B == FLA_FLOAT ) { float *buff_A = ( float * ) FLA_FLOAT_PTR( A ); float *buff_B = ( float * ) FLA_FLOAT_PTR( B ); bli_scopymt( blis_trans, m_B, n_B, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B ); } else if ( dt_B == FLA_DOUBLE ) { double *buff_A = ( double * ) FLA_DOUBLE_PTR( A ); double *buff_B = ( double * ) FLA_DOUBLE_PTR( B ); bli_dcopymt( blis_trans, m_B, n_B, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B ); } else if ( dt_B == FLA_COMPLEX ) { scomplex *buff_A = ( scomplex * ) FLA_COMPLEX_PTR( A ); scomplex *buff_B = ( scomplex * ) FLA_COMPLEX_PTR( B ); bli_ccopymt( blis_trans, m_B, n_B, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B ); } else if ( dt_B == FLA_DOUBLE_COMPLEX ) { dcomplex *buff_A = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A ); dcomplex *buff_B = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( B ); bli_zcopymt( blis_trans, m_B, n_B, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B ); } } else if ( dt_A == FLA_INT ) { int* buff_A = ( int * ) FLA_INT_PTR( A ); int* buff_B = ( int * ) FLA_INT_PTR( B ); bli_icopymt( blis_trans, m_B, n_B, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B ); } else if ( dt_A == FLA_FLOAT ) { float *buff_A = ( float * ) FLA_FLOAT_PTR( A ); if ( dt_B == FLA_FLOAT ) { float *buff_B = ( float * ) FLA_FLOAT_PTR( B ); bli_scopymt( blis_trans, m_B, n_B, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B ); } else if ( dt_B == FLA_DOUBLE ) { double *buff_B = ( double * ) FLA_DOUBLE_PTR( B ); bli_sdcopymt( blis_trans, m_B, n_B, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B ); } else if ( dt_B == FLA_COMPLEX ) { scomplex *buff_B = ( scomplex * ) FLA_COMPLEX_PTR( B ); bli_sccopymt( blis_trans, m_B, n_B, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B ); } else if ( dt_B == FLA_DOUBLE_COMPLEX ) { dcomplex *buff_B = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( B ); bli_szcopymt( blis_trans, m_B, n_B, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B ); } } else if ( dt_A == FLA_DOUBLE ) { double *buff_A = ( double * ) FLA_DOUBLE_PTR( A ); if ( dt_B == FLA_FLOAT ) { float *buff_B = ( float * ) FLA_FLOAT_PTR( B ); bli_dscopymt( blis_trans, m_B, n_B, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B ); } else if ( dt_B == FLA_DOUBLE ) { double *buff_B = ( double * ) FLA_DOUBLE_PTR( B ); bli_dcopymt( blis_trans, m_B, n_B, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B ); } else if ( dt_B == FLA_COMPLEX ) { scomplex *buff_B = ( scomplex * ) FLA_COMPLEX_PTR( B ); bli_dccopymt( blis_trans, m_B, n_B, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B ); } else if ( dt_B == FLA_DOUBLE_COMPLEX ) { dcomplex *buff_B = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( B ); bli_dzcopymt( blis_trans, m_B, n_B, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B ); } } else if ( dt_A == FLA_COMPLEX ) { scomplex *buff_A = ( scomplex * ) FLA_COMPLEX_PTR( A ); if ( dt_B == FLA_FLOAT ) { float *buff_B = ( float * ) FLA_FLOAT_PTR( B ); bli_cscopymt( blis_trans, m_B, n_B, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B ); } else if ( dt_B == FLA_DOUBLE ) { double *buff_B = ( double * ) FLA_DOUBLE_PTR( B ); bli_cdcopymt( blis_trans, m_B, n_B, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B ); } else if ( dt_B == FLA_COMPLEX ) { scomplex *buff_B = ( scomplex * ) FLA_COMPLEX_PTR( B ); bli_ccopymt( blis_trans, m_B, n_B, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B ); } else if ( dt_B == FLA_DOUBLE_COMPLEX ) { dcomplex *buff_B = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( B ); bli_czcopymt( blis_trans, m_B, n_B, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B ); } } else if ( dt_A == FLA_DOUBLE_COMPLEX ) { dcomplex *buff_A = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A ); if ( dt_B == FLA_FLOAT ) { float *buff_B = ( float * ) FLA_FLOAT_PTR( B ); bli_zscopymt( blis_trans, m_B, n_B, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B ); } else if ( dt_B == FLA_DOUBLE ) { double *buff_B = ( double * ) FLA_DOUBLE_PTR( B ); bli_zdcopymt( blis_trans, m_B, n_B, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B ); } else if ( dt_B == FLA_COMPLEX ) { scomplex *buff_B = ( scomplex * ) FLA_COMPLEX_PTR( B ); bli_zccopymt( blis_trans, m_B, n_B, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B ); } else if ( dt_B == FLA_DOUBLE_COMPLEX ) { dcomplex *buff_B = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( B ); bli_zcopymt( blis_trans, m_B, n_B, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B ); } } return FLA_SUCCESS; }
FLA_Error FLA_LU_piv_ops_var5( int m_A, int n_A, float* buff_A, int rs_A, int cs_A, int* buff_p, int inc_p ) { FLA_Error r_val = FLA_SUCCESS; float* buff_m1 = FLA_FLOAT_PTR( FLA_MINUS_ONE ); int min_m_n = min( m_A, n_A ); int i; for ( i = 0; i < min_m_n; ++i ) { float pivot_val = fzero; float* a10t = buff_A + (0 )*cs_A + (i )*rs_A; float* alpha11 = buff_A + (i )*cs_A + (i )*rs_A; float* a21 = buff_A + (i )*cs_A + (i+1)*rs_A; float* a12t = buff_A + (i+1)*cs_A + (i )*rs_A; float* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A; int* pi1 = buff_p + i*inc_p; int m_ahead = m_A - i - 1; int n_ahead = n_A - i - 1; /*------------------------------------------------------------*/ // FLA_Merge_2x1( alpha11, // a21, &aB1 ); // FLA_Amax_external( aB1, pi1 ); bl1_samax( m_ahead + 1, alpha11, rs_A, pi1 ); // If a null pivot is encountered, return the index. pivot_val = *(alpha11 + *pi1); if ( pivot_val == fzero ) r_val = ( r_val == FLA_SUCCESS ? i : r_val ); else { // FLA_Merge_1x2( ABL, ABR, &AB ); // FLA_Apply_pivots( FLA_LEFT, FLA_NO_TRANSPOSE, pi1, AB ); FLA_Apply_pivots_ln_ops_var1( n_A, a10t, rs_A, cs_A, 0, 0, pi1, inc_p ); // FLA_Inv_scal_external( alpha11, a21 ); bl1_sinvscalv( BLIS1_NO_CONJUGATE, m_ahead, alpha11, a21, rs_A ); } // FLA_Ger_external( FLA_MINUS_ONE, a21, a12t, A22 ); bl1_sger( BLIS1_NO_CONJUGATE, BLIS1_NO_CONJUGATE, m_ahead, n_ahead, buff_m1, a21, rs_A, a12t, cs_A, A22, rs_A, cs_A ); /*------------------------------------------------------------*/ } return r_val; }
FLA_Error FLA_LU_piv_opt_var5( FLA_Obj A, FLA_Obj p ) { FLA_Error r_val = FLA_SUCCESS; FLA_Datatype datatype; int m_A, n_A; int rs_A, cs_A; int inc_p; datatype = FLA_Obj_datatype( A ); m_A = FLA_Obj_length( A ); n_A = FLA_Obj_width( A ); rs_A = FLA_Obj_row_stride( A ); cs_A = FLA_Obj_col_stride( A ); inc_p = FLA_Obj_vector_inc( p ); switch ( datatype ) { case FLA_FLOAT: { float* buff_A = FLA_FLOAT_PTR( A ); int* buff_p = FLA_INT_PTR( p ); r_val = FLA_LU_piv_ops_var5( m_A, n_A, buff_A, rs_A, cs_A, buff_p, inc_p ); break; } case FLA_DOUBLE: { double* buff_A = FLA_DOUBLE_PTR( A ); int* buff_p = FLA_INT_PTR( p ); r_val = FLA_LU_piv_opd_var5( m_A, n_A, buff_A, rs_A, cs_A, buff_p, inc_p ); break; } case FLA_COMPLEX: { scomplex* buff_A = FLA_COMPLEX_PTR( A ); int* buff_p = FLA_INT_PTR( p ); r_val = FLA_LU_piv_opc_var5( m_A, n_A, buff_A, rs_A, cs_A, buff_p, inc_p ); break; } case FLA_DOUBLE_COMPLEX: { dcomplex* buff_A = FLA_DOUBLE_COMPLEX_PTR( A ); int* buff_p = FLA_INT_PTR( p ); r_val = FLA_LU_piv_opz_var5( m_A, n_A, buff_A, rs_A, cs_A, buff_p, inc_p ); break; } } return r_val; }
FLA_Error FLA_Sort( FLA_Direct direct, FLA_Obj x ) { FLA_Datatype datatype; FLA_Obj x_use; dim_t m_x; dim_t inc_x; if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING ) FLA_Sort_check( direct, x ); datatype = FLA_Obj_datatype( x ); m_x = FLA_Obj_vector_dim( x ); inc_x = FLA_Obj_vector_inc( x ); // If the vector does not have unit stride, copy it to a temporary vector // that does have unit stride. if ( inc_x != 1 ) { FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, x, &x_use ); inc_x = FLA_Obj_vector_inc( x_use ); } else { x_use = x; } switch ( datatype ) { case FLA_FLOAT: { float* x_p = ( float* ) FLA_FLOAT_PTR( x_use ); if ( direct == FLA_FORWARD ) FLA_Sort_f_ops( m_x, x_p, inc_x ); else // if ( direct == FLA_BACKWARD ) FLA_Sort_b_ops( m_x, x_p, inc_x ); break; } case FLA_DOUBLE: { double* x_p = ( double* ) FLA_DOUBLE_PTR( x_use ); if ( direct == FLA_FORWARD ) FLA_Sort_f_opd( m_x, x_p, inc_x ); else // if ( direct == FLA_BACKWARD ) FLA_Sort_b_opd( m_x, x_p, inc_x ); break; } } if ( inc_x != 1 ) { FLA_Copy( x_use, x ); FLA_Obj_free( &x_use ); } return FLA_SUCCESS; }
FLA_Error FLA_Hess_UT_step_ops_var4( int m_A, int m_T, float* buff_A, int rs_A, int cs_A, float* buff_Y, int rs_Y, int cs_Y, float* buff_Z, int rs_Z, int cs_Z, float* buff_T, int rs_T, int cs_T ) { float* buff_2 = FLA_FLOAT_PTR( FLA_TWO ); float* buff_1 = FLA_FLOAT_PTR( FLA_ONE ); float* buff_0 = FLA_FLOAT_PTR( FLA_ZERO ); float* buff_m1 = FLA_FLOAT_PTR( FLA_MINUS_ONE ); float first_elem, last_elem; float dot_product; float beta, conj_beta; float inv_tau11; float minus_inv_tau11; int i; // b_alg = FLA_Obj_length( T ); int b_alg = m_T; // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &d ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &e ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &f ); float* buff_d = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) ); float* buff_e = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) ); float* buff_f = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) ); int inc_d = 1; int inc_e = 1; int inc_f = 1; // FLA_Set( FLA_ZERO, Y ); // FLA_Set( FLA_ZERO, Z ); bl1_ssetm( m_A, b_alg, buff_0, buff_Y, rs_Y, cs_Y ); bl1_ssetm( m_A, b_alg, buff_0, buff_Z, rs_Z, cs_Z ); for ( i = 0; i < b_alg; ++i ) { float* a10t = buff_A + (0 )*cs_A + (i )*rs_A; float* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A; float* alpha11 = buff_A + (i )*cs_A + (i )*rs_A; float* a21 = buff_A + (i )*cs_A + (i+1)*rs_A; float* A02 = buff_A + (i+1)*cs_A + (0 )*rs_A; float* a12t = buff_A + (i+1)*cs_A + (i )*rs_A; float* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A; float* y10t = buff_Y + (0 )*cs_Y + (i )*rs_Y; float* Y20 = buff_Y + (0 )*cs_Y + (i+1)*rs_Y; float* y21 = buff_Y + (i )*cs_Y + (i+1)*rs_Y; float* z10t = buff_Z + (0 )*cs_Z + (i )*rs_Z; float* Z20 = buff_Z + (0 )*cs_Z + (i+1)*rs_Z; float* z21 = buff_Z + (i )*cs_Z + (i+1)*rs_Z; float* t01 = buff_T + (i )*cs_T + (0 )*rs_T; float* tau11 = buff_T + (i )*cs_T + (i )*rs_T; float* d0 = buff_d + (0 )*inc_d; float* e0 = buff_e + (0 )*inc_e; float* f0 = buff_f + (0 )*inc_f; float* a10t_r = a10t + (i-1)*cs_A + (0 )*rs_A; float* a21_t = a21 + (0 )*cs_A + (0 )*rs_A; float* a21_b = a21 + (0 )*cs_A + (1 )*rs_A; float* ABL = a10t; float* ZBL = z10t; float* a2 = alpha11; int m_ahead = m_A - i - 1; int n_ahead = m_A - i - 1; int m_behind = i; int n_behind = i; /*------------------------------------------------------------*/ if ( m_behind > 0 ) { // FLA_Copy( a10t_r, last_elem ); // FLA_Set( FLA_ONE, a10t_r ); last_elem = *a10t_r; *a10t_r = *buff_1; } // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_MINUS_ONE, ABL, y10t, FLA_ONE, a2 ); // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_MINUS_ONE, ZBL, a10t, FLA_ONE, a2 ); bl1_sgemv( BLIS1_NO_TRANSPOSE, BLIS1_CONJUGATE, m_ahead + 1, n_behind, buff_m1, ABL, rs_A, cs_A, y10t, cs_Y, buff_1, a2, rs_A ); bl1_sgemv( BLIS1_NO_TRANSPOSE, BLIS1_CONJUGATE, m_ahead + 1, n_behind, buff_m1, ZBL, rs_Z, cs_Z, a10t, cs_A, buff_1, a2, rs_A ); // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_MINUS_ONE, Y20, a10t, FLA_ONE, a12t ); // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_MINUS_ONE, A20, z10t, FLA_ONE, a12t ); bl1_sgemv( BLIS1_CONJ_NO_TRANSPOSE, BLIS1_NO_CONJUGATE, m_ahead, n_behind, buff_m1, Y20, rs_Y, cs_Y, a10t, cs_A, buff_1, a12t, cs_A ); bl1_sgemv( BLIS1_CONJ_NO_TRANSPOSE, BLIS1_NO_CONJUGATE, m_ahead, n_behind, buff_m1, A20, rs_A, cs_A, z10t, cs_Z, buff_1, a12t, cs_A ); if ( m_behind > 0 ) { // FLA_Copy( last_elem, a10t_r ); *a10t_r = last_elem; } if ( m_ahead > 0 ) { // FLA_Househ2_UT( FLA_LEFT, // a21_t, // a21_b, tau11 ); FLA_Househ2_UT_l_ops( m_ahead - 1, a21_t, a21_b, rs_A, tau11 ); // FLA_Set( FLA_ONE, inv_tau11 ); // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 ); // FLA_Copy( inv_tau11, minus_inv_tau11 ); // FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 ); bl1_sdiv3( buff_1, tau11, &inv_tau11 ); bl1_sneg2( &inv_tau11, &minus_inv_tau11 ); // FLA_Copy( a21_t, first_elem ); // FLA_Set( FLA_ONE, a21_t ); first_elem = *a21_t; *a21_t = *buff_1; // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, y21 ); bl1_sgemv( BLIS1_CONJ_TRANSPOSE, BLIS1_NO_CONJUGATE, m_ahead, n_ahead, buff_1, A22, rs_A, cs_A, a21, rs_A, buff_0, y21, rs_Y ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, z21 ); bl1_sgemv( BLIS1_NO_TRANSPOSE, BLIS1_NO_CONJUGATE, m_ahead, n_ahead, buff_1, A22, rs_A, cs_A, a21, rs_A, buff_0, z21, rs_Z ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, d0 ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, Y20, a21, FLA_ZERO, e0 ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, Z20, a21, FLA_ZERO, f0 ); bl1_sgemv( BLIS1_CONJ_TRANSPOSE, BLIS1_NO_CONJUGATE, m_ahead, n_behind, buff_1, A20, rs_A, cs_A, a21, rs_A, buff_0, d0, inc_d ); bl1_sgemv( BLIS1_CONJ_TRANSPOSE, BLIS1_NO_CONJUGATE, m_ahead, n_behind, buff_1, Y20, rs_Y, cs_Y, a21, rs_A, buff_0, e0, inc_e ); bl1_sgemv( BLIS1_CONJ_TRANSPOSE, BLIS1_NO_CONJUGATE, m_ahead, n_behind, buff_1, Z20, rs_Z, cs_Z, a21, rs_A, buff_0, f0, inc_f ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Y20, d0, FLA_ONE, y21 ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, f0, FLA_ONE, y21 ); bl1_sgemv( BLIS1_NO_TRANSPOSE, BLIS1_NO_CONJUGATE, m_ahead, n_behind, buff_m1, Y20, rs_Y, cs_Y, d0, inc_d, buff_1, y21, rs_Y ); bl1_sgemv( BLIS1_NO_TRANSPOSE, BLIS1_NO_CONJUGATE, m_ahead, n_behind, buff_m1, A20, rs_A, cs_A, f0, inc_f, buff_1, y21, rs_Y ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, e0, FLA_ONE, z21 ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Z20, d0, FLA_ONE, z21 ); bl1_sgemv( BLIS1_NO_TRANSPOSE, BLIS1_NO_CONJUGATE, m_ahead, n_behind, buff_m1, A20, rs_A, cs_A, e0, inc_e, buff_1, z21, rs_Z ); bl1_sgemv( BLIS1_NO_TRANSPOSE, BLIS1_NO_CONJUGATE, m_ahead, n_behind, buff_m1, Z20, rs_Z, cs_Z, d0, inc_d, buff_1, z21, rs_Z ); // FLA_Copy( d0, t01 ); bl1_scopyv( BLIS1_NO_CONJUGATE, n_behind, d0, inc_d, t01, rs_T ); // FLA_Dotc( FLA_CONJUGATE, a21, z21, beta ); // FLA_Inv_scal( FLA_TWO, beta ); // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta ); bl1_sdot( BLIS1_CONJUGATE, m_ahead, a21, rs_A, z21, rs_Z, &beta ); bl1_sinvscals( buff_2, &beta ); bl1_scopyconj( &beta, &conj_beta ); // FLA_Scal( minus_inv_tau11, conj_beta ); // FLA_Axpy( conj_beta, a21, y21 ); // FLA_Scal( inv_tau11, y21 ); bl1_sscals( &minus_inv_tau11, &conj_beta ); bl1_saxpyv( BLIS1_NO_CONJUGATE, m_ahead, &conj_beta, a21, rs_A, y21, rs_Y ); bl1_sscalv( BLIS1_NO_CONJUGATE, m_ahead, &inv_tau11, y21, rs_Y ); // FLA_Scal( minus_inv_tau11, beta ); // FLA_Axpy( beta, a21, z21 ); // FLA_Scal( inv_tau11, z21 ); bl1_sscals( &minus_inv_tau11, &beta ); bl1_saxpyv( BLIS1_NO_CONJUGATE, m_ahead, &beta, a21, rs_A, z21, rs_Z ); bl1_sscalv( BLIS1_NO_CONJUGATE, m_ahead, &inv_tau11, z21, rs_Z ); // FLA_Dot( a12t, a21, dot_product ); // FLA_Scal( minus_inv_tau11, dot_product ); // FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t ); bl1_sdot( BLIS1_NO_CONJUGATE, m_ahead, a12t, cs_A, a21, rs_A, &dot_product ); bl1_sscals( &minus_inv_tau11, &dot_product ); bl1_saxpyv( BLIS1_CONJUGATE, m_ahead, &dot_product, a21, rs_A, a12t, cs_A ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, e0 ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, e0, a21, A02 ); bl1_sgemv( BLIS1_NO_TRANSPOSE, BLIS1_NO_CONJUGATE, m_behind, n_ahead, buff_1, A02, rs_A, cs_A, a21, rs_A, buff_0, e0, inc_e ); bl1_sger( BLIS1_NO_CONJUGATE, BLIS1_CONJUGATE, m_behind, n_ahead, &minus_inv_tau11, e0, inc_e, a21, rs_A, A02, rs_A, cs_A ); // FLA_Copy( first_elem, a21_t ); *a21_t = first_elem; } /*------------------------------------------------------------*/ } // FLA_Obj_free( &d ); // FLA_Obj_free( &e ); // FLA_Obj_free( &f ); FLA_free( buff_d ); FLA_free( buff_e ); FLA_free( buff_f ); return FLA_SUCCESS; }
FLA_Error FLA_Herc_external( FLA_Uplo uplo, FLA_Conj conj, FLA_Obj alpha, FLA_Obj x, FLA_Obj A ) { FLA_Datatype datatype; int m_A; int rs_A, cs_A; int inc_x; uplo_t blis_uplo; conj_t blis_conj; if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING ) FLA_Herc_check( uplo, conj, alpha, x, A ); if ( FLA_Obj_has_zero_dim( A ) ) return FLA_SUCCESS; datatype = FLA_Obj_datatype( A ); m_A = FLA_Obj_length( A ); rs_A = FLA_Obj_row_stride( A ); cs_A = FLA_Obj_col_stride( A ); inc_x = FLA_Obj_vector_inc( x ); FLA_Param_map_flame_to_blis_uplo( uplo, &blis_uplo ); FLA_Param_map_flame_to_blis_conj( conj, &blis_conj ); switch( datatype ){ case FLA_FLOAT: { float *buff_A = ( float * ) FLA_FLOAT_PTR( A ); float *buff_x = ( float * ) FLA_FLOAT_PTR( x ); float *buff_alpha = ( float * ) FLA_FLOAT_PTR( alpha ); bli_ssyr( blis_uplo, m_A, buff_alpha, buff_x, inc_x, buff_A, rs_A, cs_A ); break; } case FLA_DOUBLE: { double *buff_A = ( double * ) FLA_DOUBLE_PTR( A ); double *buff_x = ( double * ) FLA_DOUBLE_PTR( x ); double *buff_alpha = ( double * ) FLA_DOUBLE_PTR( alpha ); bli_dsyr( blis_uplo, m_A, buff_alpha, buff_x, inc_x, buff_A, rs_A, cs_A ); break; } case FLA_COMPLEX: { scomplex *buff_A = ( scomplex * ) FLA_COMPLEX_PTR( A ); scomplex *buff_x = ( scomplex * ) FLA_COMPLEX_PTR( x ); float *buff_alpha = ( float * ) FLA_FLOAT_PTR( alpha ); bli_cher( blis_uplo, blis_conj, m_A, buff_alpha, buff_x, inc_x, buff_A, rs_A, cs_A ); break; } case FLA_DOUBLE_COMPLEX: { dcomplex *buff_A = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A ); dcomplex *buff_x = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( x ); double *buff_alpha = ( double * ) FLA_DOUBLE_PTR( alpha ); bli_zher( blis_uplo, blis_conj, m_A, buff_alpha, buff_x, inc_x, buff_A, rs_A, cs_A ); break; } } return FLA_SUCCESS; }
FLA_Error FLA_Symm_external( FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C ) { FLA_Datatype datatype; int m_C, n_C; int rs_A, cs_A; int rs_B, cs_B; int rs_C, cs_C; side_t blis_side; uplo_t blis_uplo; if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING ) FLA_Symm_check( side, uplo, alpha, A, B, beta, C ); if ( FLA_Obj_has_zero_dim( C ) ) return FLA_SUCCESS; datatype = FLA_Obj_datatype( A ); rs_A = FLA_Obj_row_stride( A ); cs_A = FLA_Obj_col_stride( A ); rs_B = FLA_Obj_row_stride( B ); cs_B = FLA_Obj_col_stride( B ); m_C = FLA_Obj_length( C ); n_C = FLA_Obj_width( C ); rs_C = FLA_Obj_row_stride( C ); cs_C = FLA_Obj_col_stride( C ); FLA_Param_map_flame_to_blis_side( side, &blis_side ); FLA_Param_map_flame_to_blis_uplo( uplo, &blis_uplo ); switch( datatype ){ case FLA_FLOAT: { float *buff_A = ( float * ) FLA_FLOAT_PTR( A ); float *buff_B = ( float * ) FLA_FLOAT_PTR( B ); float *buff_C = ( float * ) FLA_FLOAT_PTR( C ); float *buff_alpha = ( float * ) FLA_FLOAT_PTR( alpha ); float *buff_beta = ( float * ) FLA_FLOAT_PTR( beta ); bli_ssymm( blis_side, blis_uplo, m_C, n_C, buff_alpha, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B, buff_beta, buff_C, rs_C, cs_C ); break; } case FLA_DOUBLE: { double *buff_A = ( double * ) FLA_DOUBLE_PTR( A ); double *buff_B = ( double * ) FLA_DOUBLE_PTR( B ); double *buff_C = ( double * ) FLA_DOUBLE_PTR( C ); double *buff_alpha = ( double * ) FLA_DOUBLE_PTR( alpha ); double *buff_beta = ( double * ) FLA_DOUBLE_PTR( beta ); bli_dsymm( blis_side, blis_uplo, m_C, n_C, buff_alpha, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B, buff_beta, buff_C, rs_C, cs_C ); break; } case FLA_COMPLEX: { scomplex *buff_A = ( scomplex * ) FLA_COMPLEX_PTR( A ); scomplex *buff_B = ( scomplex * ) FLA_COMPLEX_PTR( B ); scomplex *buff_C = ( scomplex * ) FLA_COMPLEX_PTR( C ); scomplex *buff_alpha = ( scomplex * ) FLA_COMPLEX_PTR( alpha ); scomplex *buff_beta = ( scomplex * ) FLA_COMPLEX_PTR( beta ); bli_csymm( blis_side, blis_uplo, m_C, n_C, buff_alpha, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B, buff_beta, buff_C, rs_C, cs_C ); break; } case FLA_DOUBLE_COMPLEX: { dcomplex *buff_A = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A ); dcomplex *buff_B = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( B ); dcomplex *buff_C = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( C ); dcomplex *buff_alpha = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( alpha ); dcomplex *buff_beta = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( beta ); bli_zsymm( blis_side, blis_uplo, m_C, n_C, buff_alpha, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B, buff_beta, buff_C, rs_C, cs_C ); break; } } return FLA_SUCCESS; }
FLA_Bool FLA_Obj_equals( FLA_Obj A, FLA_Obj B ) { FLA_Datatype datatype_A; FLA_Datatype datatype_B; FLA_Datatype datatype; dim_t m, n; dim_t rs_A, cs_A; dim_t rs_B, cs_B; dim_t i, j; if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING ) FLA_Obj_equals_check( A, B ); m = FLA_Obj_length( A ); n = FLA_Obj_width( A ); rs_A = FLA_Obj_row_stride( A ); cs_A = FLA_Obj_col_stride( A ); rs_B = FLA_Obj_row_stride( B ); cs_B = FLA_Obj_col_stride( B ); datatype_A = FLA_Obj_datatype( A ); datatype_B = FLA_Obj_datatype( B ); // If A is a non-FLA_CONSTANT object, then we should proceed based on the // value of datatype_A. In such a situation, either datatype_B is an exact // match and we're fine, or datatype_B is FLA_CONSTANT, in which case we're // also covered since FLA_CONSTANT encompassas all numerical types. // If A is an FLA_CONSTANT object, then we should proceed based on the value // of datatype_B. In this case, datatype_B is either a non-FLA_CONSTANT type, // which mirrors the second sub-case above, or datatype_B is FLA_CONSTANT, // in which case both types are FLA_CONSTANT and therefore we have to handle // that case. Only if both are FLA_CONSTANTs does the FLA_CONSTANT case // statement below execute. if ( datatype_A != FLA_CONSTANT ) datatype = datatype_A; else datatype = datatype_B; switch ( datatype ) { case FLA_CONSTANT: { // We require ALL floating-point fields to be the same. float* buffs_A = ( float * ) FLA_FLOAT_PTR( A ); float* buffs_B = ( float * ) FLA_FLOAT_PTR( B ); double* buffd_A = ( double * ) FLA_DOUBLE_PTR( A ); double* buffd_B = ( double * ) FLA_DOUBLE_PTR( B ); scomplex* buffc_A = ( scomplex * ) FLA_COMPLEX_PTR( A ); scomplex* buffc_B = ( scomplex * ) FLA_COMPLEX_PTR( B ); dcomplex* buffz_A = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A ); dcomplex* buffz_B = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( B ); if ( *buffs_A != *buffs_B || *buffd_A != *buffd_B || buffc_A->real != buffc_B->real || buffc_A->imag != buffc_B->imag || buffz_A->real != buffz_B->real || buffz_A->imag != buffz_B->imag ) { return FALSE; } break; } case FLA_INT: { int *buff_A = ( int * ) FLA_INT_PTR( A ); int *buff_B = ( int * ) FLA_INT_PTR( B ); for ( j = 0; j < n; j++ ) for ( i = 0; i < m; i++ ) if ( buff_A[ j * cs_A + i * rs_A ] != buff_B[ j * cs_B + i * rs_B ] ) { return FALSE; } break; } case FLA_FLOAT: { float *buff_A = ( float * ) FLA_FLOAT_PTR( A ); float *buff_B = ( float * ) FLA_FLOAT_PTR( B ); for ( j = 0; j < n; j++ ) for ( i = 0; i < m; i++ ) if ( buff_A[ j * cs_A + i * rs_A ] != buff_B[ j * cs_B + i * rs_B ] ) { return FALSE; } break; } case FLA_DOUBLE: { double *buff_A = ( double * ) FLA_DOUBLE_PTR( A ); double *buff_B = ( double * ) FLA_DOUBLE_PTR( B ); for ( j = 0; j < n; j++ ) for ( i = 0; i < m; i++ ) if ( buff_A[ j * cs_A + i * rs_A ] != buff_B[ j * cs_B + i * rs_B ] ) { return FALSE; } break; } case FLA_COMPLEX: { scomplex *buff_A = ( scomplex * ) FLA_COMPLEX_PTR( A ); scomplex *buff_B = ( scomplex * ) FLA_COMPLEX_PTR( B ); for ( j = 0; j < n; j++ ) for ( i = 0; i < m; i++ ) if ( buff_A[ j * cs_A + i * rs_A ].real != buff_B[ j * cs_B + i * rs_B ].real || buff_A[ j * cs_A + i * rs_A ].imag != buff_B[ j * cs_B + i * rs_B ].imag ) { return FALSE; } break; } case FLA_DOUBLE_COMPLEX: { dcomplex *buff_A = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A ); dcomplex *buff_B = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( B ); for ( j = 0; j < n; j++ ) for ( i = 0; i < m; i++ ) if ( buff_A[ j * cs_A + i * rs_A ].real != buff_B[ j * cs_B + i * rs_B ].real || buff_A[ j * cs_A + i * rs_A ].imag != buff_B[ j * cs_B + i * rs_B ].imag ) { return FALSE; } break; } } return TRUE; }
FLA_Error FLA_Tridiag_UT_l_step_ops_var2( int m_A, int m_T, float* buff_A, int rs_A, int cs_A, float* buff_T, int rs_T, int cs_T ) { float* buff_2 = FLA_FLOAT_PTR( FLA_TWO ); float* buff_1 = FLA_FLOAT_PTR( FLA_ONE ); float* buff_0 = FLA_FLOAT_PTR( FLA_ZERO ); float* buff_m1 = FLA_FLOAT_PTR( FLA_MINUS_ONE ); float first_elem; float beta; float inv_tau11; float minus_inv_tau11; float minus_upsilon11, minus_conj_upsilon11; float minus_zeta11, minus_conj_zeta11; int i; // b_alg = FLA_Obj_length( T ); int b_alg = m_T; // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &u ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &w ); float* buff_u = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) ); float* buff_z = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) ); float* buff_w = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) ); int inc_u = 1; int inc_z = 1; int inc_w = 1; // Initialize some variables (only to prevent compiler warnings). first_elem = *buff_0; minus_inv_tau11 = *buff_0; for ( i = 0; i < b_alg; ++i ) { float* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A; float* alpha11 = buff_A + (i )*cs_A + (i )*rs_A; float* a21 = buff_A + (i )*cs_A + (i+1)*rs_A; float* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A; float* t01 = buff_T + (i )*cs_T + (0 )*rs_T; float* tau11 = buff_T + (i )*cs_T + (i )*rs_T; float* upsilon11= buff_u + (i )*inc_u; float* u21 = buff_u + (i+1)*inc_u; float* zeta11 = buff_z + (i )*inc_z; float* z21 = buff_z + (i+1)*inc_z; float* w21 = buff_w + (i+1)*inc_w; float* a21_t = a21 + (0 )*cs_A + (0 )*rs_A; float* a21_b = a21 + (0 )*cs_A + (1 )*rs_A; int m_ahead = m_A - i - 1; int m_behind = i; int n_behind = i; /*------------------------------------------------------------*/ if ( m_behind > 0 ) { // FLA_Copy( upsilon11, minus_upsilon11 ); // FLA_Scal( FLA_MINUS_ONE, minus_upsilon11 ); // FLA_Copy( minus_upsilon11, minus_conj_upsilon11 ); bl1_smult3( buff_m1, upsilon11, &minus_upsilon11 ); bl1_scopyconj( &minus_upsilon11, &minus_conj_upsilon11 ); // FLA_Copy( zeta11, minus_zeta11 ); // FLA_Scal( FLA_MINUS_ONE, minus_zeta11 ); // FLA_Copy( minus_zeta11, minus_conj_zeta11 ); bl1_smult3( buff_m1, zeta11, &minus_zeta11 ); bl1_scopyconj( &minus_zeta11, &minus_conj_zeta11 ); // FLA_Axpyt( FLA_CONJ_NO_TRANSPOSE, minus_upsilon11, zeta11, alpha11 ); // FLA_Axpyt( FLA_CONJ_NO_TRANSPOSE, minus_zeta11, upsilon11, alpha11 ); bl1_saxpyv( BLIS1_CONJUGATE, 1, &minus_upsilon11, zeta11, 1, alpha11, 1 ); bl1_saxpyv( BLIS1_CONJUGATE, 1, &minus_zeta11, upsilon11, 1, alpha11, 1 ); // FLA_Axpyt( FLA_NO_TRANSPOSE, minus_conj_zeta11, u21, a21 ); // FLA_Axpyt( FLA_NO_TRANSPOSE, minus_conj_upsilon11, z21, a21 ); bl1_saxpyv( BLIS1_NO_CONJUGATE, m_ahead, &minus_conj_zeta11, u21, inc_u, a21, rs_A ); bl1_saxpyv( BLIS1_NO_CONJUGATE, m_ahead, &minus_conj_upsilon11, z21, inc_z, a21, rs_A ); } if ( m_ahead > 0 ) { // FLA_Househ2_UT( FLA_LEFT, // a21_t, // a21_b, tau11 ); FLA_Househ2_UT_l_ops( m_ahead - 1, a21_t, a21_b, rs_A, tau11 ); // FLA_Set( FLA_ONE, inv_tau11 ); // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 ); // FLA_Copy( inv_tau11, minus_inv_tau11 ); // FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 ); bl1_sdiv3( buff_1, tau11, &inv_tau11 ); bl1_sneg2( &inv_tau11, &minus_inv_tau11 ); // FLA_Copy( a21_t, first_elem ); // FLA_Set( FLA_ONE, a21_t ); first_elem = *a21_t; *a21_t = *buff_1; } if ( m_behind > 0 ) { // FLA_Her2( FLA_LOWER_TRIANGULAR, FLA_MINUS_ONE, u21, z21, A22 ); bl1_ssyr2( BLIS1_LOWER_TRIANGULAR, m_ahead, buff_m1, u21, inc_u, z21, inc_z, A22, rs_A, cs_A ); } if ( m_ahead > 0 ) { // FLA_Hemv( FLA_LOWER_TRIANGULAR, FLA_ONE, A22, a21, FLA_ZERO, w21 ); bl1_ssymv( BLIS1_LOWER_TRIANGULAR, m_ahead, buff_1, A22, rs_A, cs_A, a21, rs_A, buff_0, w21, inc_w ); // FLA_Copy( a21, u21 ); // FLA_Copy( w21, z21 ); bl1_scopyv( BLIS1_NO_CONJUGATE, m_ahead, a21, rs_A, u21, inc_u ); bl1_scopyv( BLIS1_NO_CONJUGATE, m_ahead, w21, inc_w, z21, inc_z ); // FLA_Dotc( FLA_CONJUGATE, a21, z21, beta ); // FLA_Inv_scal( FLA_TWO, beta ); bl1_sdot( BLIS1_CONJUGATE, m_ahead, a21, rs_A, z21, inc_z, &beta ); bl1_sinvscals( buff_2, &beta ); // FLA_Scal( minus_inv_tau11, beta ); // FLA_Axpy( beta, a21, z21 ); // FLA_Scal( inv_tau11, z21 ); bl1_sscals( &minus_inv_tau11, &beta ); bl1_saxpyv( BLIS1_NO_CONJUGATE, m_ahead, &beta, a21, rs_A, z21, inc_z ); bl1_sscalv( BLIS1_NO_CONJUGATE, m_ahead, &inv_tau11, z21, inc_z ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 ); bl1_sgemv( BLIS1_CONJ_TRANSPOSE, BLIS1_NO_CONJUGATE, m_ahead, n_behind, buff_1, A20, rs_A, cs_A, a21, rs_A, buff_0, t01, rs_T ); // FLA_Copy( first_elem, a21_t ); *a21_t = first_elem; } if ( m_behind + 1 == b_alg && m_ahead > 0 ) { // FLA_Her2( FLA_LOWER_TRIANGULAR, FLA_MINUS_ONE, u21, z21, A22 ); bl1_ssyr2( BLIS1_LOWER_TRIANGULAR, m_ahead, buff_m1, u21, inc_u, z21, inc_z, A22, rs_A, cs_A ); } /*------------------------------------------------------------*/ } // FLA_Obj_free( &u ); // FLA_Obj_free( &z ); // FLA_Obj_free( &w ); FLA_free( buff_u ); FLA_free( buff_z ); FLA_free( buff_w ); return FLA_SUCCESS; }
FLA_Error FLA_Househ3UD_UT( FLA_Obj chi_0, FLA_Obj x1, FLA_Obj y2, FLA_Obj tau ) /* Compute an up-and-downdating UT Householder transformation / / 1 0 0 \ / 1 0 0 \ / 1 \ ( 1 u1' v2' ) \ H = | | 0 I 0 | - inv(tau) | 0 I 0 | | u1 | | \ \ 0 0 I / \ 0 0 -I / \ v2 / / by computing tau, u1, and v2 such that the following is satisfied: / chi_0 \ / alpha \ H | x1 | = | 0 | \ y2 / \ 0 / where alpha = - lambda * chi_0 / | chi_0 | lambda = sqrt( conj(chi0) chi0 + x1' x1 - y2' y2 ) / chi_0 \ x = | x1 | \ y2 / tau = ( 1 + u1' u1 - v2' v2 ) / 2 u1 = x1 / ( chi_0 - alpha ) v2 = -y2 / ( chi_0 - alpha ) Upon completion, alpha, u1, and v2 have overwritten objects chi_0, x1, and y2, respectively. -FGVZ */ { FLA_Datatype datatype; int m_x1; int m_y2; int inc_x1; int inc_y2; datatype = FLA_Obj_datatype( x1 ); m_x1 = FLA_Obj_vector_dim( x1 ); m_y2 = FLA_Obj_vector_dim( y2 ); inc_x1 = FLA_Obj_vector_inc( x1 ); inc_y2 = FLA_Obj_vector_inc( y2 ); if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING ) FLA_Househ3UD_UT_check( chi_0, x1, y2, tau ); switch ( datatype ) { case FLA_FLOAT: { float* chi_0_p = ( float* ) FLA_FLOAT_PTR( chi_0 ); float* x1_p = ( float* ) FLA_FLOAT_PTR( x1 ); float* y2_p = ( float* ) FLA_FLOAT_PTR( y2 ); float* tau_p = ( float* ) FLA_FLOAT_PTR( tau ); FLA_Househ3UD_UT_ops( m_x1, m_y2, chi_0_p, x1_p, inc_x1, y2_p, inc_y2, tau_p ); break; } case FLA_DOUBLE: { double* chi_0_p = ( double* ) FLA_DOUBLE_PTR( chi_0 ); double* x1_p = ( double* ) FLA_DOUBLE_PTR( x1 ); double* y2_p = ( double* ) FLA_DOUBLE_PTR( y2 ); double* tau_p = ( double* ) FLA_DOUBLE_PTR( tau ); FLA_Househ3UD_UT_opd( m_x1, m_y2, chi_0_p, x1_p, inc_x1, y2_p, inc_y2, tau_p ); break; } case FLA_COMPLEX: { scomplex* chi_0_p = ( scomplex* ) FLA_COMPLEX_PTR( chi_0 ); scomplex* x1_p = ( scomplex* ) FLA_COMPLEX_PTR( x1 ); scomplex* y2_p = ( scomplex* ) FLA_COMPLEX_PTR( y2 ); scomplex* tau_p = ( scomplex* ) FLA_COMPLEX_PTR( tau ); FLA_Househ3UD_UT_opc( m_x1, m_y2, chi_0_p, x1_p, inc_x1, y2_p, inc_y2, tau_p ); break; } case FLA_DOUBLE_COMPLEX: { dcomplex* chi_0_p = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( chi_0 ); dcomplex* x1_p = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( x1 ); dcomplex* y2_p = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( y2 ); dcomplex* tau_p = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( tau ); FLA_Househ3UD_UT_opz( m_x1, m_y2, chi_0_p, x1_p, inc_x1, y2_p, inc_y2, tau_p ); break; } } return FLA_SUCCESS; }
FLA_Error FLA_Househ3UD_UT_ops( int m_x1, int m_y2, float* chi_0, float* x1, int inc_x1, float* y2, int inc_y2, float* tau ) { float one_half = *FLA_FLOAT_PTR( FLA_ONE_HALF ); float alpha; float chi_0_minus_alpha; float neg_chi_0_minus_alpha; float abs_chi_0; float norm_x_1; float norm_y_2; float lambda; float abs_sq_chi_0_minus_alpha; int i_one = 1; // // Compute the 2-norms of x_1 and y_2: // // norm_x_1 := || x_1 ||_2 // norm_y_2 := || y_2 ||_2 // bl1_snrm2( m_x1, x1, inc_x1, &norm_x_1 ); bl1_snrm2( m_y2, y2, inc_y2, &norm_y_2 ); // // If 2-norms of x_1, y_2 are zero, then return with trivial tau, chi_0 values. // if ( norm_x_1 == 0.0F && norm_y_2 == 0.0F ) { *chi_0 = -(*chi_0); *tau = one_half; return FLA_SUCCESS; } // // Compute the absolute value (magnitude) of chi_0, which equals the 2-norm // of chi_0: // // abs_chi_0 := | chi_0 | = || chi_0 ||_2 // bl1_snrm2( i_one, chi_0, i_one, &abs_chi_0 ); // // Compute lambda: // // lambda := sqrt( conj(chi0) chi0 + x1' x1 - y2' y2 ) // lambda = ( float ) sqrt( abs_chi_0 * abs_chi_0 + norm_x_1 * norm_x_1 - norm_y_2 * norm_y_2 ); // Compute alpha: // // alpha := - lambda * chi_0 / | chi_0 | // = -sign( chi_0 ) * lambda // alpha = -ssign( *chi_0 ) * lambda; // // Overwrite x_1 and y_2 with u_1 and v_2, respectively: // // x_1 := x_1 / ( chi_0 - alpha ) // y_2 := y_2 / -( chi_0 - alpha ) // chi_0_minus_alpha = (*chi_0) - alpha; bl1_sinvscalv( BLIS1_NO_CONJUGATE, m_x1, &chi_0_minus_alpha, x1, inc_x1 ); neg_chi_0_minus_alpha = -chi_0_minus_alpha; bl1_sinvscalv( BLIS1_NO_CONJUGATE, m_y2, &neg_chi_0_minus_alpha, y2, inc_y2 ); // // Compute tau: // // tau := ( 1 + u_1' * u_1 - v_2' * v_2 ) / 2 // = ( ( chi_1 - alpha ) * conj( chi_1 - alpha ) + x_1' * x_1 - y_2' * y_2 ) / // ( 2 * ( chi_1 - alpha ) * conj( chi_1 - alpha ) ) // = ( | chi_1 - alpha |^2 + || x_2 ||_2^2 - || y_2 ||_2^2 ) / // ( 2 * | chi_1 - alpha |^2 ) // abs_sq_chi_0_minus_alpha = chi_0_minus_alpha * chi_0_minus_alpha; *tau = ( abs_sq_chi_0_minus_alpha + norm_x_1 * norm_x_1 - norm_y_2 * norm_y_2 ) / ( 2.0F * abs_sq_chi_0_minus_alpha ); // // Overwrite chi_0 with alpha: // // chi_0 := alpha // *chi_0 = alpha; return FLA_SUCCESS; }
FLA_Bool FLA_Obj_has_nan( FLA_Obj A ) { FLA_Datatype datatype; dim_t i, j, m, n, cs, rs; if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING ) FLA_Obj_has_nan_check( A ); datatype = FLA_Obj_datatype( A ); m = FLA_Obj_length( A ); n = FLA_Obj_width( A ); cs = FLA_Obj_col_stride( A ); rs = FLA_Obj_row_stride( A ); switch ( datatype ) { case FLA_FLOAT: { float *buff = ( float * ) FLA_FLOAT_PTR( A ); for ( j=0; j<n; ++j ) for ( i=0; i<m; ++i ) { float val = buff[i*cs + j*rs]; if ( val != val ) return TRUE; } break; } case FLA_DOUBLE: { double *buff = ( double * ) FLA_DOUBLE_PTR( A ); for ( j=0; j<n; ++j ) for ( i=0; i<m; ++i ) { double val = buff[i*cs + j*rs]; if ( val != val ) return TRUE; } break; } case FLA_COMPLEX: { scomplex *buff = ( scomplex * ) FLA_COMPLEX_PTR( A ); for ( j=0; j<n; ++j ) for ( i=0; i<m; ++i ) { scomplex val = buff[i*cs + j*rs]; if ( val.real != val.real || val.imag != val.imag ) return TRUE; } break; } case FLA_DOUBLE_COMPLEX: { dcomplex *buff = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A ); for ( j=0; j<n; ++j ) for ( i=0; i<m; ++i ) { dcomplex val = buff[i*cs + j*rs]; if ( val.real != val.real || val.imag != val.imag ) return TRUE; } break; } } return FALSE; }
FLA_Error FLA_Bidiag_blk_external( FLA_Obj A, FLA_Obj tu, FLA_Obj tv ) { int info = 0; #ifdef FLA_ENABLE_EXTERNAL_LAPACK_INTERFACES FLA_Datatype datatype; int m_A, n_A, cs_A; int min_m_n, max_m_n; int lwork; FLA_Obj d, e, work_obj; if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING ) FLA_Bidiag_check( A, tu, tv ); if ( FLA_Obj_has_zero_dim( A ) ) return FLA_SUCCESS; datatype = FLA_Obj_datatype( A ); m_A = FLA_Obj_length( A ); n_A = FLA_Obj_width( A ); min_m_n = FLA_Obj_min_dim( A ); max_m_n = FLA_Obj_max_dim( A ); cs_A = FLA_Obj_col_stride( A ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), min_m_n, 1, 0, 0, &d ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), min_m_n - 1, 1, 0, 0, &e ); lwork = (m_A + n_A) * FLA_Query_blocksize( datatype, FLA_DIMENSION_MIN ); FLA_Obj_create( datatype, lwork, 1, 0, 0, &work_obj ); switch( datatype ){ case FLA_FLOAT: { float* buff_A = ( float * ) FLA_FLOAT_PTR( A ); float* buff_d = ( float * ) FLA_FLOAT_PTR( d ); float* buff_e = ( float * ) FLA_FLOAT_PTR( e ); float* buff_tu = ( float * ) FLA_FLOAT_PTR( tu ); float* buff_tv = ( float * ) FLA_FLOAT_PTR( tv ); float* buff_work = ( float * ) FLA_FLOAT_PTR( work_obj ); F77_sgebrd( &m_A, &n_A, buff_A, &cs_A, buff_d, buff_e, buff_tu, buff_tv, buff_work, &lwork, &info ); break; } case FLA_DOUBLE: { double* buff_A = ( double * ) FLA_DOUBLE_PTR( A ); double* buff_d = ( double * ) FLA_DOUBLE_PTR( d ); double* buff_e = ( double * ) FLA_DOUBLE_PTR( e ); double* buff_tu = ( double * ) FLA_DOUBLE_PTR( tu ); double* buff_tv = ( double * ) FLA_DOUBLE_PTR( tv ); double* buff_work = ( double * ) FLA_DOUBLE_PTR( work_obj ); F77_dgebrd( &m_A, &n_A, buff_A, &cs_A, buff_d, buff_e, buff_tu, buff_tv, buff_work, &lwork, &info ); break; } case FLA_COMPLEX: { scomplex* buff_A = ( scomplex * ) FLA_COMPLEX_PTR( A ); float* buff_d = ( float * ) FLA_FLOAT_PTR( d ); float* buff_e = ( float * ) FLA_FLOAT_PTR( e ); scomplex* buff_tu = ( scomplex * ) FLA_COMPLEX_PTR( tu ); scomplex* buff_tv = ( scomplex * ) FLA_COMPLEX_PTR( tv ); scomplex* buff_work = ( scomplex * ) FLA_COMPLEX_PTR( work_obj ); F77_cgebrd( &m_A, &n_A, buff_A, &cs_A, buff_d, buff_e, buff_tu, buff_tv, buff_work, &lwork, &info ); break; } case FLA_DOUBLE_COMPLEX: { dcomplex* buff_A = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A ); double* buff_d = ( double * ) FLA_DOUBLE_PTR( d ); double* buff_e = ( double * ) FLA_DOUBLE_PTR( e ); dcomplex* buff_tu = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( tu ); dcomplex* buff_tv = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( tv ); dcomplex* buff_work = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( work_obj ); F77_zgebrd( &m_A, &n_A, buff_A, &cs_A, buff_d, buff_e, buff_tu, buff_tv, buff_work, &lwork, &info ); break; } } FLA_Obj_free( &d ); FLA_Obj_free( &e ); FLA_Obj_free( &work_obj ); #else FLA_Check_error_code( FLA_EXTERNAL_LAPACK_NOT_IMPLEMENTED ); #endif return info; }
FLA_Bool FLA_Obj_le( FLA_Obj A, FLA_Obj B ) { FLA_Datatype datatype_A; FLA_Datatype datatype_B; FLA_Datatype datatype; if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING ) FLA_Obj_le_check( A, B ); datatype_A = FLA_Obj_datatype( A ); datatype_B = FLA_Obj_datatype( B ); if ( datatype_A != FLA_CONSTANT ) datatype = datatype_A; else datatype = datatype_B; switch ( datatype ) { case FLA_CONSTANT: { // We require ALL floating-point fields to be the same. float* buffs_A = ( float * ) FLA_FLOAT_PTR( A ); float* buffs_B = ( float * ) FLA_FLOAT_PTR( B ); double* buffd_A = ( double * ) FLA_DOUBLE_PTR( A ); double* buffd_B = ( double * ) FLA_DOUBLE_PTR( B ); scomplex* buffc_A = ( scomplex * ) FLA_COMPLEX_PTR( A ); scomplex* buffc_B = ( scomplex * ) FLA_COMPLEX_PTR( B ); dcomplex* buffz_A = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A ); dcomplex* buffz_B = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( B ); if ( !( *buffs_A <= *buffs_B && *buffd_A <= *buffd_B && buffc_A->real <= buffc_B->real && buffc_A->imag <= buffc_B->imag && buffz_A->real <= buffz_B->real && buffz_A->imag <= buffz_B->imag ) ) { return FALSE; } break; } case FLA_INT: { int *buff_A = ( int * ) FLA_INT_PTR( A ); int *buff_B = ( int * ) FLA_INT_PTR( B ); if ( !( *buff_A <= *buff_B ) ) return FALSE; break; } case FLA_FLOAT: { float *buff_A = ( float * ) FLA_FLOAT_PTR( A ); float *buff_B = ( float * ) FLA_FLOAT_PTR( B ); if ( !( *buff_A <= *buff_B ) ) return FALSE; break; } case FLA_DOUBLE: { double *buff_A = ( double * ) FLA_DOUBLE_PTR( A ); double *buff_B = ( double * ) FLA_DOUBLE_PTR( B ); if ( !( *buff_A <= *buff_B ) ) return FALSE; break; } } return TRUE; }
FLA_Error FLA_Apply_G_lf_blk_var3( FLA_Obj G, FLA_Obj A, dim_t b_alg ) { FLA_Datatype datatype; int k_G, m_A, n_A; int rs_G, cs_G; int rs_A, cs_A; datatype = FLA_Obj_datatype( A ); k_G = FLA_Obj_width( G ); rs_G = FLA_Obj_row_stride( G ); cs_G = FLA_Obj_col_stride( G ); n_A = FLA_Obj_length( A ); m_A = FLA_Obj_width( A ); cs_A = FLA_Obj_row_stride( A ); rs_A = FLA_Obj_col_stride( A ); switch ( datatype ) { case FLA_FLOAT: { scomplex* buff_G = ( scomplex* ) FLA_COMPLEX_PTR( G ); float* buff_A = ( float* ) FLA_FLOAT_PTR( A ); FLA_Apply_G_rf_bls_var3( k_G, m_A, n_A, buff_G, rs_G, cs_G, buff_A, rs_A, cs_A, b_alg ); break; } case FLA_DOUBLE: { dcomplex* buff_G = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( G ); double* buff_A = ( double* ) FLA_DOUBLE_PTR( A ); FLA_Apply_G_rf_bld_var3( k_G, m_A, n_A, buff_G, rs_G, cs_G, buff_A, rs_A, cs_A, b_alg ); break; } case FLA_COMPLEX: { scomplex* buff_G = ( scomplex* ) FLA_COMPLEX_PTR( G ); scomplex* buff_A = ( scomplex* ) FLA_COMPLEX_PTR( A ); FLA_Apply_G_rf_blc_var3( k_G, m_A, n_A, buff_G, rs_G, cs_G, buff_A, rs_A, cs_A, b_alg ); break; } case FLA_DOUBLE_COMPLEX: { dcomplex* buff_G = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( G ); dcomplex* buff_A = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( A ); FLA_Apply_G_rf_blz_var3( k_G, m_A, n_A, buff_G, rs_G, cs_G, buff_A, rs_A, cs_A, b_alg ); break; } } return FLA_SUCCESS; }
FLA_Error FLA_Apply_G_rf_asm_var8( FLA_Obj G, FLA_Obj A ) /* Apply k sets of Givens rotations to a matrix A from the right, where each set takes the form: A := A ( G(n-1,k) ... G(1,k) G(0,k) )' = A G(0,k)' G(1,k)' ... G(n-1,k)' where Gik is the ith Givens rotation formed from the kth set, stored in the (i,k) entries of of G: Gik = / gamma_ik -sigma_ik \ \ sigma_ik gamma_ik / This variant iterates in pipelined, overlapping fashion and applies rotations to four columns at a time. -FGVZ */ { FLA_Datatype datatype; int k_G, m_A, n_A; int rs_G, cs_G; int rs_A, cs_A; datatype = FLA_Obj_datatype( A ); k_G = FLA_Obj_width( G ); m_A = FLA_Obj_length( A ); n_A = FLA_Obj_width( A ); rs_G = FLA_Obj_row_stride( G ); cs_G = FLA_Obj_col_stride( G ); rs_A = FLA_Obj_row_stride( A ); cs_A = FLA_Obj_col_stride( A ); switch ( datatype ) { case FLA_FLOAT: { scomplex* buff_G = ( scomplex* ) FLA_COMPLEX_PTR( G ); float* buff_A = ( float* ) FLA_FLOAT_PTR( A ); FLA_Apply_G_rf_ass_var8( k_G, m_A, n_A, buff_G, rs_G, cs_G, buff_A, rs_A, cs_A ); break; } case FLA_DOUBLE: { dcomplex* buff_G = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( G ); double* buff_A = ( double* ) FLA_DOUBLE_PTR( A ); FLA_Apply_G_rf_asd_var8( k_G, m_A, n_A, buff_G, rs_G, cs_G, buff_A, rs_A, cs_A ); break; } case FLA_COMPLEX: { scomplex* buff_G = ( scomplex* ) FLA_COMPLEX_PTR( G ); scomplex* buff_A = ( scomplex* ) FLA_COMPLEX_PTR( A ); FLA_Apply_G_rf_asc_var8( k_G, m_A, n_A, buff_G, rs_G, cs_G, buff_A, rs_A, cs_A ); break; } case FLA_DOUBLE_COMPLEX: { dcomplex* buff_G = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( G ); dcomplex* buff_A = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( A ); FLA_Apply_G_rf_asz_var8( k_G, m_A, n_A, buff_G, rs_G, cs_G, buff_A, rs_A, cs_A ); break; } } return FLA_SUCCESS; }
FLA_Error FLA_Tevd_v_opt_var2( dim_t n_iter_max, FLA_Obj d, FLA_Obj e, FLA_Obj G, FLA_Obj R, FLA_Obj W, FLA_Obj U, dim_t b_alg ) { FLA_Error r_val = FLA_SUCCESS; FLA_Datatype datatype; int m_A, m_U, n_G; int inc_d; int inc_e; int rs_G, cs_G; int rs_R, cs_R; int rs_U, cs_U; int rs_W, cs_W; datatype = FLA_Obj_datatype( U ); m_A = FLA_Obj_vector_dim( d ); m_U = FLA_Obj_length( U ); n_G = FLA_Obj_width( G ); inc_d = FLA_Obj_vector_inc( d ); inc_e = FLA_Obj_vector_inc( e ); rs_G = FLA_Obj_row_stride( G ); cs_G = FLA_Obj_col_stride( G ); rs_R = FLA_Obj_row_stride( R ); cs_R = FLA_Obj_col_stride( R ); rs_W = FLA_Obj_row_stride( W ); cs_W = FLA_Obj_col_stride( W ); rs_U = FLA_Obj_row_stride( U ); cs_U = FLA_Obj_col_stride( U ); switch ( datatype ) { case FLA_FLOAT: { float* buff_d = FLA_FLOAT_PTR( d ); float* buff_e = FLA_FLOAT_PTR( e ); scomplex* buff_G = FLA_COMPLEX_PTR( G ); float* buff_R = FLA_FLOAT_PTR( R ); float* buff_W = FLA_FLOAT_PTR( W ); float* buff_U = FLA_FLOAT_PTR( U ); r_val = FLA_Tevd_v_ops_var2( m_A, m_U, n_G, n_iter_max, buff_d, inc_d, buff_e, inc_e, buff_G, rs_G, cs_G, buff_R, rs_R, cs_R, buff_W, rs_W, cs_W, buff_U, rs_U, cs_U, b_alg ); break; } case FLA_DOUBLE: { double* buff_d = FLA_DOUBLE_PTR( d ); double* buff_e = FLA_DOUBLE_PTR( e ); dcomplex* buff_G = FLA_DOUBLE_COMPLEX_PTR( G ); double* buff_R = FLA_DOUBLE_PTR( R ); double* buff_W = FLA_DOUBLE_PTR( W ); double* buff_U = FLA_DOUBLE_PTR( U ); r_val = FLA_Tevd_v_opd_var2( m_A, m_U, n_G, n_iter_max, buff_d, inc_d, buff_e, inc_e, buff_G, rs_G, cs_G, buff_R, rs_R, cs_R, buff_W, rs_W, cs_W, buff_U, rs_U, cs_U, b_alg ); break; } case FLA_COMPLEX: { float* buff_d = FLA_FLOAT_PTR( d ); float* buff_e = FLA_FLOAT_PTR( e ); scomplex* buff_G = FLA_COMPLEX_PTR( G ); float* buff_R = FLA_FLOAT_PTR( R ); scomplex* buff_W = FLA_COMPLEX_PTR( W ); scomplex* buff_U = FLA_COMPLEX_PTR( U ); r_val = FLA_Tevd_v_opc_var2( m_A, m_U, n_G, n_iter_max, buff_d, inc_d, buff_e, inc_e, buff_G, rs_G, cs_G, buff_R, rs_R, cs_R, buff_W, rs_W, cs_W, buff_U, rs_U, cs_U, b_alg ); break; } case FLA_DOUBLE_COMPLEX: { double* buff_d = FLA_DOUBLE_PTR( d ); double* buff_e = FLA_DOUBLE_PTR( e ); dcomplex* buff_G = FLA_DOUBLE_COMPLEX_PTR( G ); double* buff_R = FLA_DOUBLE_PTR( R ); dcomplex* buff_W = FLA_DOUBLE_COMPLEX_PTR( W ); dcomplex* buff_U = FLA_DOUBLE_COMPLEX_PTR( U ); r_val = FLA_Tevd_v_opz_var2( m_A, m_U, n_G, n_iter_max, buff_d, inc_d, buff_e, inc_e, buff_G, rs_G, cs_G, buff_R, rs_R, cs_R, buff_W, rs_W, cs_W, buff_U, rs_U, cs_U, b_alg ); break; } } return r_val; }
FLA_Error FLA_Add_to_diag( void* diag_value, FLA_Obj A ) { FLA_Datatype datatype; dim_t j, min_m_n; dim_t rs, cs; if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING ) FLA_Add_to_diag_check( diag_value, A ); datatype = FLA_Obj_datatype( A ); min_m_n = FLA_Obj_min_dim( A ); rs = FLA_Obj_row_stride( A ); cs = FLA_Obj_col_stride( A ); switch ( datatype ){ case FLA_FLOAT: { float *buff_A = ( float * ) FLA_FLOAT_PTR( A ); float *value_ptr = ( float * ) diag_value; for ( j = 0; j < min_m_n; j++ ) buff_A[ j*cs + j*rs ] += *value_ptr; break; } case FLA_DOUBLE: { double *buff_A = ( double * ) FLA_DOUBLE_PTR( A ); double *value_ptr = ( double * ) diag_value; for ( j = 0; j < min_m_n; j++ ) buff_A[ j*cs + j*rs ] += *value_ptr; break; } case FLA_COMPLEX: { scomplex *buff_A = ( scomplex * ) FLA_COMPLEX_PTR( A ); scomplex *value_ptr = ( scomplex * ) diag_value; for ( j = 0; j < min_m_n; j++ ) { buff_A[ j*cs + j*rs ].real += value_ptr->real; buff_A[ j*cs + j*rs ].imag += value_ptr->imag; } break; } case FLA_DOUBLE_COMPLEX: { dcomplex *buff_A = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A ); dcomplex *value_ptr = ( dcomplex * ) diag_value; for ( j = 0; j < min_m_n; j++ ) { buff_A[ j*cs + j*rs ].real += value_ptr->real; buff_A[ j*cs + j*rs ].imag += value_ptr->imag; } break; } } return FLA_SUCCESS; }
// According to the sorted order of a given vector s, // U and V are reordered in columns while C is reordered // in rows when they need to be applied. FLA_Error FLA_Sort_bsvd_ext( FLA_Direct direct, FLA_Obj s, FLA_Bool apply_U, FLA_Obj U, FLA_Bool apply_V, FLA_Obj V, FLA_Bool apply_C, FLA_Obj C ) { FLA_Datatype datatype; dim_t m_U, rs_U, cs_U; dim_t m_V, rs_V, cs_V; dim_t n_C, rs_C, cs_C; dim_t m_s, inc_s; //if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING ) // FLA_Sort_bsvd_check( direct, s, // apply_U, U, // apply_V, V, // apply_C, C ); // Sort singular values only; quick sort if ( apply_U == FALSE && apply_V == FALSE ) return FLA_Sort( direct, s ); // s dimensions must be provided. m_s = FLA_Obj_vector_dim( s ); inc_s = FLA_Obj_vector_inc( s ); // Datatype of U, V and C must be consistent and must be defined from one of them. FLA_SORT_BSVD_EXT_DEFINE_OBJ_VARIABLES( U, apply_U, datatype, m_U, FLA_Obj_length, rs_U, cs_U ); FLA_SORT_BSVD_EXT_DEFINE_OBJ_VARIABLES( V, apply_V, datatype, m_V, FLA_Obj_length, rs_V, cs_V ); FLA_SORT_BSVD_EXT_DEFINE_OBJ_VARIABLES( C, apply_C, datatype, n_C, FLA_Obj_width, rs_C, cs_C ); switch ( datatype ) { case FLA_FLOAT: { float* s_p = ( float* ) FLA_FLOAT_PTR( s ); float* U_p = ( apply_U == TRUE ? ( float* ) FLA_FLOAT_PTR( U ) : NULL ); float* V_p = ( apply_V == TRUE ? ( float* ) FLA_FLOAT_PTR( V ) : NULL ); float* C_p = ( apply_C == TRUE ? ( float* ) FLA_FLOAT_PTR( C ) : NULL ); if ( direct == FLA_FORWARD ) FLA_Sort_bsvd_ext_f_ops( m_s, s_p, inc_s, m_U, U_p, rs_U, cs_U, m_V, V_p, rs_V, cs_V, n_C, C_p, rs_C, cs_C ); else // if ( direct == FLA_BACKWARD ) FLA_Sort_bsvd_ext_b_ops( m_s, s_p, inc_s, m_U, U_p, rs_U, cs_U, m_V, V_p, rs_V, cs_V, n_C, C_p, rs_C, cs_C ); break; } case FLA_DOUBLE: { double* s_p = ( double* ) FLA_DOUBLE_PTR( s ); double* U_p = ( apply_U == TRUE ? ( double* ) FLA_DOUBLE_PTR( U ) : NULL ); double* V_p = ( apply_V == TRUE ? ( double* ) FLA_DOUBLE_PTR( V ) : NULL ); double* C_p = ( apply_C == TRUE ? ( double* ) FLA_DOUBLE_PTR( C ) : NULL ); if ( direct == FLA_FORWARD ) FLA_Sort_bsvd_ext_f_opd( m_s, s_p, inc_s, m_U, U_p, rs_U, cs_U, m_V, V_p, rs_V, cs_V, n_C, C_p, rs_C, cs_C ); else // if ( direct == FLA_BACKWARD ) FLA_Sort_bsvd_ext_b_opd( m_s, s_p, inc_s, m_U, U_p, rs_U, cs_U, m_V, V_p, rs_V, cs_V, n_C, C_p, rs_C, cs_C ); break; } case FLA_COMPLEX: { float* s_p = ( float* ) FLA_FLOAT_PTR( s ); scomplex* U_p = ( apply_U == TRUE ? ( scomplex* ) FLA_COMPLEX_PTR( U ) : NULL ); scomplex* V_p = ( apply_V == TRUE ? ( scomplex* ) FLA_COMPLEX_PTR( V ) : NULL ); scomplex* C_p = ( apply_C == TRUE ? ( scomplex* ) FLA_COMPLEX_PTR( C ) : NULL ); if ( direct == FLA_FORWARD ) FLA_Sort_bsvd_ext_f_opc( m_s, s_p, inc_s, m_U, U_p, rs_U, cs_U, m_V, V_p, rs_V, cs_V, n_C, C_p, rs_C, cs_C ); else // if ( direct == FLA_BACKWARD ) FLA_Sort_bsvd_ext_b_opc( m_s, s_p, inc_s, m_U, U_p, rs_U, cs_U, m_V, V_p, rs_V, cs_V, n_C, C_p, rs_C, cs_C ); break; } case FLA_DOUBLE_COMPLEX: { double* s_p = ( double* ) FLA_DOUBLE_PTR( s ); dcomplex* U_p = ( apply_U == TRUE ? ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( U ) : NULL ); dcomplex* V_p = ( apply_V == TRUE ? ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( V ) : NULL ); dcomplex* C_p = ( apply_C == TRUE ? ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( C ) : NULL ); if ( direct == FLA_FORWARD ) FLA_Sort_bsvd_ext_f_opz( m_s, s_p, inc_s, m_U, U_p, rs_U, cs_U, m_V, V_p, rs_V, cs_V, n_C, C_p, rs_C, cs_C ); else // if ( direct == FLA_BACKWARD ) FLA_Sort_bsvd_ext_b_opz( m_s, s_p, inc_s, m_U, U_p, rs_U, cs_U, m_V, V_p, rs_V, cs_V, n_C, C_p, rs_C, cs_C ); break; } } return FLA_SUCCESS; }
FLA_Error FLA_Hess_UT_step_opt_var4( FLA_Obj A, FLA_Obj Y, FLA_Obj Z, FLA_Obj T ) { FLA_Datatype datatype; int m_A, m_T; int rs_A, cs_A; int rs_Y, cs_Y; int rs_Z, cs_Z; int rs_T, cs_T; datatype = FLA_Obj_datatype( A ); m_A = FLA_Obj_length( A ); m_T = FLA_Obj_length( T ); rs_A = FLA_Obj_row_stride( A ); cs_A = FLA_Obj_col_stride( A ); rs_Y = FLA_Obj_row_stride( Y ); cs_Y = FLA_Obj_col_stride( Y ); rs_Z = FLA_Obj_row_stride( Z ); cs_Z = FLA_Obj_col_stride( Z ); rs_T = FLA_Obj_row_stride( T ); cs_T = FLA_Obj_col_stride( T ); switch ( datatype ) { case FLA_FLOAT: { float* buff_A = FLA_FLOAT_PTR( A ); float* buff_Y = FLA_FLOAT_PTR( Y ); float* buff_Z = FLA_FLOAT_PTR( Z ); float* buff_T = FLA_FLOAT_PTR( T ); FLA_Hess_UT_step_ops_var4( m_A, m_T, buff_A, rs_A, cs_A, buff_Y, rs_Y, cs_Y, buff_Z, rs_Z, cs_Z, buff_T, rs_T, cs_T ); break; } case FLA_DOUBLE: { double* buff_A = FLA_DOUBLE_PTR( A ); double* buff_Y = FLA_DOUBLE_PTR( Y ); double* buff_Z = FLA_DOUBLE_PTR( Z ); double* buff_T = FLA_DOUBLE_PTR( T ); FLA_Hess_UT_step_opd_var4( m_A, m_T, buff_A, rs_A, cs_A, buff_Y, rs_Y, cs_Y, buff_Z, rs_Z, cs_Z, buff_T, rs_T, cs_T ); break; } case FLA_COMPLEX: { scomplex* buff_A = FLA_COMPLEX_PTR( A ); scomplex* buff_Y = FLA_COMPLEX_PTR( Y ); scomplex* buff_Z = FLA_COMPLEX_PTR( Z ); scomplex* buff_T = FLA_COMPLEX_PTR( T ); FLA_Hess_UT_step_opc_var4( m_A, m_T, buff_A, rs_A, cs_A, buff_Y, rs_Y, cs_Y, buff_Z, rs_Z, cs_Z, buff_T, rs_T, cs_T ); break; } case FLA_DOUBLE_COMPLEX: { dcomplex* buff_A = FLA_DOUBLE_COMPLEX_PTR( A ); dcomplex* buff_Y = FLA_DOUBLE_COMPLEX_PTR( Y ); dcomplex* buff_Z = FLA_DOUBLE_COMPLEX_PTR( Z ); dcomplex* buff_T = FLA_DOUBLE_COMPLEX_PTR( T ); FLA_Hess_UT_step_opz_var4( m_A, m_T, buff_A, rs_A, cs_A, buff_Y, rs_Y, cs_Y, buff_Z, rs_Z, cs_Z, buff_T, rs_T, cs_T ); break; } } return FLA_SUCCESS; }
FLA_Error FLA_Trmm_external_gpu( FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu ) { FLA_Datatype datatype; int m_B, n_B; int ldim_A; int ldim_B; char blas_side; char blas_uplo; char blas_trans; char blas_diag; if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING ) FLA_Trmm_check( side, uplo, trans, diag, alpha, A, B ); if ( FLA_Obj_has_zero_dim( B ) ) return FLA_SUCCESS; datatype = FLA_Obj_datatype( A ); ldim_A = FLA_Obj_length( A ); m_B = FLA_Obj_length( B ); n_B = FLA_Obj_width( B ); ldim_B = FLA_Obj_length( B ); FLA_Param_map_flame_to_netlib_side( side, &blas_side ); FLA_Param_map_flame_to_netlib_uplo( uplo, &blas_uplo ); FLA_Param_map_flame_to_netlib_trans( trans, &blas_trans ); FLA_Param_map_flame_to_netlib_diag( diag, &blas_diag ); switch( datatype ){ case FLA_FLOAT: { float *buff_alpha = ( float * ) FLA_FLOAT_PTR( alpha ); cublasStrmm( blas_side, blas_uplo, blas_trans, blas_diag, m_B, n_B, *buff_alpha, ( float * ) A_gpu, ldim_A, ( float * ) B_gpu, ldim_B ); break; } case FLA_DOUBLE: { double *buff_alpha = ( double * ) FLA_DOUBLE_PTR( alpha ); cublasDtrmm( blas_side, blas_uplo, blas_trans, blas_diag, m_B, n_B, *buff_alpha, ( double * ) A_gpu, ldim_A, ( double * ) B_gpu, ldim_B ); break; } case FLA_COMPLEX: { cuComplex *buff_alpha = ( cuComplex * ) FLA_COMPLEX_PTR( alpha ); cublasCtrmm( blas_side, blas_uplo, blas_trans, blas_diag, m_B, n_B, *buff_alpha, ( cuComplex * ) A_gpu, ldim_A, ( cuComplex * ) B_gpu, ldim_B ); break; } case FLA_DOUBLE_COMPLEX: { cuDoubleComplex *buff_alpha = ( cuDoubleComplex * ) FLA_DOUBLE_COMPLEX_PTR( alpha ); cublasZtrmm( blas_side, blas_uplo, blas_trans, blas_diag, m_B, n_B, *buff_alpha, ( cuDoubleComplex * ) A_gpu, ldim_A, ( cuDoubleComplex * ) B_gpu, ldim_B ); break; } } return FLA_SUCCESS; }