/*
 * Datatype front end for the FLA_LU_nopiv_op?_var2 kernels.
 *
 * Queries the dimensions and strides of A, then forwards A's buffer to the
 * kernel that matches A's datatype (float, double, scomplex or dcomplex).
 * A datatype without a case below results in no kernel call.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_LU_nopiv_opt_var2( FLA_Obj A )
{
    FLA_Datatype dt_A   = FLA_Obj_datatype( A );
    int          n_rows = FLA_Obj_length( A );
    int          n_cols = FLA_Obj_width( A );
    int          rs     = FLA_Obj_row_stride( A );
    int          cs     = FLA_Obj_col_stride( A );

    switch ( dt_A )
    {
        case FLA_FLOAT:
        {
            float* a = FLA_FLOAT_PTR( A );

            FLA_LU_nopiv_ops_var2( n_rows, n_cols, a, rs, cs );
            break;
        }

        case FLA_DOUBLE:
        {
            double* a = FLA_DOUBLE_PTR( A );

            FLA_LU_nopiv_opd_var2( n_rows, n_cols, a, rs, cs );
            break;
        }

        case FLA_COMPLEX:
        {
            scomplex* a = FLA_COMPLEX_PTR( A );

            FLA_LU_nopiv_opc_var2( n_rows, n_cols, a, rs, cs );
            break;
        }

        case FLA_DOUBLE_COMPLEX:
        {
            dcomplex* a = FLA_DOUBLE_COMPLEX_PTR( A );

            FLA_LU_nopiv_opz_var2( n_rows, n_cols, a, rs, cs );
            break;
        }
    }

    return FLA_SUCCESS;
}
/*
 * Datatype front end for the FLA_Tridiag_UT_l_step_op?_var2 kernels.
 *
 * Reads the length and strides of A and T and hands both buffers to the
 * kernel selected by the datatype of A.  T is accessed with the same
 * datatype as A.  A datatype without a case below results in no kernel call.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Tridiag_UT_l_step_opt_var2( FLA_Obj A, FLA_Obj T )
{
    FLA_Datatype dt_A  = FLA_Obj_datatype( A );
    int          len_A = FLA_Obj_length( A );
    int          len_T = FLA_Obj_length( T );
    int          rs_A  = FLA_Obj_row_stride( A );
    int          cs_A  = FLA_Obj_col_stride( A );
    int          rs_T  = FLA_Obj_row_stride( T );
    int          cs_T  = FLA_Obj_col_stride( T );

    switch ( dt_A )
    {
        case FLA_FLOAT:
        {
            float* a = FLA_FLOAT_PTR( A );
            float* t = FLA_FLOAT_PTR( T );

            FLA_Tridiag_UT_l_step_ops_var2( len_A, len_T,
                                            a, rs_A, cs_A,
                                            t, rs_T, cs_T );
            break;
        }

        case FLA_DOUBLE:
        {
            double* a = FLA_DOUBLE_PTR( A );
            double* t = FLA_DOUBLE_PTR( T );

            FLA_Tridiag_UT_l_step_opd_var2( len_A, len_T,
                                            a, rs_A, cs_A,
                                            t, rs_T, cs_T );
            break;
        }

        case FLA_COMPLEX:
        {
            scomplex* a = FLA_COMPLEX_PTR( A );
            scomplex* t = FLA_COMPLEX_PTR( T );

            FLA_Tridiag_UT_l_step_opc_var2( len_A, len_T,
                                            a, rs_A, cs_A,
                                            t, rs_T, cs_T );
            break;
        }

        case FLA_DOUBLE_COMPLEX:
        {
            dcomplex* a = FLA_DOUBLE_COMPLEX_PTR( A );
            dcomplex* t = FLA_DOUBLE_COMPLEX_PTR( T );

            FLA_Tridiag_UT_l_step_opz_var2( len_A, len_T,
                                            a, rs_A, cs_A,
                                            t, rs_T, cs_T );
            break;
        }
    }

    return FLA_SUCCESS;
}
/*
 * Element-wise exact comparison of two objects.
 *
 * Returns TRUE when every compared element of A equals the corresponding
 * element of B, and FALSE at the first mismatch.  The iteration space is
 * the length and width of A; each object is addressed through its own row
 * and column strides.  Parameter checking (FLA_Obj_equals_check) runs when
 * the error level is at least FLA_MIN_ERROR_CHECKING.
 */
FLA_Bool FLA_Obj_equals( FLA_Obj A, FLA_Obj B )
{
    FLA_Datatype dt_A;
    FLA_Datatype dt_B;
    FLA_Datatype dt_cmp;
    dim_t        n_rows, n_cols;
    dim_t        rs_A, cs_A;
    dim_t        rs_B, cs_B;
    dim_t        row, col;

    if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
        FLA_Obj_equals_check( A, B );

    n_rows = FLA_Obj_length( A );
    n_cols = FLA_Obj_width( A );

    rs_A   = FLA_Obj_row_stride( A );
    cs_A   = FLA_Obj_col_stride( A );

    rs_B   = FLA_Obj_row_stride( B );
    cs_B   = FLA_Obj_col_stride( B );

    dt_A   = FLA_Obj_datatype( A );
    dt_B   = FLA_Obj_datatype( B );

    // Choose the datatype that drives the comparison:
    //  - A is not FLA_CONSTANT: use A's datatype.  B either matches it
    //    exactly or is FLA_CONSTANT, which encompasses all numerical types.
    //  - A is FLA_CONSTANT: use B's datatype.  If B is a regular type this
    //    mirrors the situation above; if B is FLA_CONSTANT as well, the
    //    FLA_CONSTANT case below handles it.  That case therefore executes
    //    only when both operands are FLA_CONSTANT.
    dt_cmp = ( dt_A != FLA_CONSTANT ) ? dt_A : dt_B;

    switch ( dt_cmp )
    {
        case FLA_CONSTANT:
        {
            // ALL floating-point fields of the two constants must agree.
            float*    s_A = ( float * )    FLA_FLOAT_PTR( A );
            float*    s_B = ( float * )    FLA_FLOAT_PTR( B );
            double*   d_A = ( double * )   FLA_DOUBLE_PTR( A );
            double*   d_B = ( double * )   FLA_DOUBLE_PTR( B );
            scomplex* c_A = ( scomplex * ) FLA_COMPLEX_PTR( A );
            scomplex* c_B = ( scomplex * ) FLA_COMPLEX_PTR( B );
            dcomplex* z_A = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A );
            dcomplex* z_B = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( B );

            if ( *s_A != *s_B ) return FALSE;
            if ( *d_A != *d_B ) return FALSE;
            if ( c_A->real != c_B->real ) return FALSE;
            if ( c_A->imag != c_B->imag ) return FALSE;
            if ( z_A->real != z_B->real ) return FALSE;
            if ( z_A->imag != z_B->imag ) return FALSE;

            break;
        }

        case FLA_INT:
        {
            int* a = ( int * ) FLA_INT_PTR( A );
            int* b = ( int * ) FLA_INT_PTR( B );

            for ( col = 0; col < n_cols; col++ )
            {
                for ( row = 0; row < n_rows; row++ )
                {
                    if ( a[ col * cs_A + row * rs_A ] !=
                         b[ col * cs_B + row * rs_B ] )
                        return FALSE;
                }
            }

            break;
        }

        case FLA_FLOAT:
        {
            float* a = ( float * ) FLA_FLOAT_PTR( A );
            float* b = ( float * ) FLA_FLOAT_PTR( B );

            for ( col = 0; col < n_cols; col++ )
            {
                for ( row = 0; row < n_rows; row++ )
                {
                    if ( a[ col * cs_A + row * rs_A ] !=
                         b[ col * cs_B + row * rs_B ] )
                        return FALSE;
                }
            }

            break;
        }

        case FLA_DOUBLE:
        {
            double* a = ( double * ) FLA_DOUBLE_PTR( A );
            double* b = ( double * ) FLA_DOUBLE_PTR( B );

            for ( col = 0; col < n_cols; col++ )
            {
                for ( row = 0; row < n_rows; row++ )
                {
                    if ( a[ col * cs_A + row * rs_A ] !=
                         b[ col * cs_B + row * rs_B ] )
                        return FALSE;
                }
            }

            break;
        }

        case FLA_COMPLEX:
        {
            scomplex* a = ( scomplex * ) FLA_COMPLEX_PTR( A );
            scomplex* b = ( scomplex * ) FLA_COMPLEX_PTR( B );

            for ( col = 0; col < n_cols; col++ )
            {
                for ( row = 0; row < n_rows; row++ )
                {
                    scomplex* pa = &a[ col * cs_A + row * rs_A ];
                    scomplex* pb = &b[ col * cs_B + row * rs_B ];

                    if ( pa->real != pb->real || pa->imag != pb->imag )
                        return FALSE;
                }
            }

            break;
        }

        case FLA_DOUBLE_COMPLEX:
        {
            dcomplex* a = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A );
            dcomplex* b = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( B );

            for ( col = 0; col < n_cols; col++ )
            {
                for ( row = 0; row < n_rows; row++ )
                {
                    dcomplex* pa = &a[ col * cs_A + row * rs_A ];
                    dcomplex* pb = &b[ col * cs_B + row * rs_B ];

                    if ( pa->real != pb->real || pa->imag != pb->imag )
                        return FALSE;
                }
            }

            break;
        }
    }

    return TRUE;
}
FLA_Error FLA_Syr2k_ln_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl ) { FLA_Obj AT, A0, AB, A1, A2; FLA_Obj BT, B0, BB, B1, B2; FLA_Obj CTL, CTR, C00, C01, C02, CBL, CBR, C10, C11, C12, C20, C21, C22; dim_t b; FLA_Scalr_internal( FLA_LOWER_TRIANGULAR, beta, C, FLA_Cntl_sub_scalr( cntl ) ); FLA_Part_2x1( A, &AT, &AB, 0, FLA_TOP ); FLA_Part_2x1( B, &BT, &BB, 0, FLA_TOP ); FLA_Part_2x2( C, &CTL, &CTR, &CBL, &CBR, 0, 0, FLA_TL ); while ( FLA_Obj_length( AT ) < FLA_Obj_length( A ) ){ b = FLA_Determine_blocksize( AB, FLA_BOTTOM, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_2x1_to_3x1( AT, &A0, /* ** */ /* ** */ &A1, AB, &A2, b, FLA_BOTTOM ); FLA_Repart_2x1_to_3x1( BT, &B0, /* ** */ /* ** */ &B1, BB, &B2, b, FLA_BOTTOM ); FLA_Repart_2x2_to_3x3( CTL, /**/ CTR, &C00, /**/ &C01, &C02, /* ************* */ /* ******************** */ &C10, /**/ &C11, &C12, CBL, /**/ CBR, &C20, /**/ &C21, &C22, b, b, FLA_BR ); /*------------------------------------------------------------*/ /* C21 = C21 + A2 * B1' */ FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, alpha, A2, B1, FLA_ONE, C21, FLA_Cntl_sub_gemm1( cntl ) ); /* C21 = C21 + B2 * A1' */ FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, alpha, B2, A1, FLA_ONE, C21, FLA_Cntl_sub_gemm2( cntl ) ); /* C11 = C11 + A1 * B1' + B1 * A1' */ FLA_Syr2k_internal( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, alpha, A1, B1, FLA_ONE, C11, FLA_Cntl_sub_syr2k( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_3x1_to_2x1( &AT, A0, A1, /* ** */ /* ** */ &AB, A2, FLA_TOP ); FLA_Cont_with_3x1_to_2x1( &BT, B0, B1, /* ** */ /* ** */ &BB, B2, FLA_TOP ); FLA_Cont_with_3x3_to_2x2( &CTL, /**/ &CTR, C00, C01, /**/ C02, C10, C11, /**/ C12, /* ************** */ /* ****************** */ &CBL, /**/ &CBR, C20, C21, /**/ C22, FLA_TL ); } return FLA_SUCCESS; }
/*
 * Unblocked Symm, left side / lower-triangular storage, variant 2.
 *
 * C is scaled by beta up front.  The loop then moves through A from
 * top-left to bottom-right and through B and C from top to bottom, one
 * row at a time, updating the exposed row c1t of C with
 *
 *     c1t = c1t + a10t * B0
 *     c1t = c1t + a21' * B2
 *     c1t = c1t + alpha11 * b1t
 *
 * each scaled by alpha.  The two matrix-vector products are issued in
 * transposed form through FLA_Gemv_external.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Symm_ll_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C )
{
    FLA_Obj ATL, ATR, ABL, ABR;
    FLA_Obj A00,  a01,     A02;
    FLA_Obj a10t, alpha11, a12t;
    FLA_Obj A20,  a21,     A22;

    FLA_Obj BT, BB;
    FLA_Obj B0, b1t, B2;

    FLA_Obj CT, CB;
    FLA_Obj C0, c1t, C2;

    FLA_Scal_external( beta, C );

    FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_TL );
    FLA_Part_2x1( B, &BT, &BB, 0, FLA_TOP );
    FLA_Part_2x1( C, &CT, &CB, 0, FLA_TOP );

    while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) )
    {
        /* Expose the current diagonal element and the matching rows. */
        FLA_Repart_2x2_to_3x3( ATL, ATR, &A00,  &a01,     &A02,
                                         &a10t, &alpha11, &a12t,
                               ABL, ABR, &A20,  &a21,     &A22,
                               1, 1, FLA_BR );

        FLA_Repart_2x1_to_3x1( BT, &B0,
                                   &b1t,
                               BB, &B2,
                               1, FLA_BOTTOM );

        FLA_Repart_2x1_to_3x1( CT, &C0,
                                   &c1t,
                               CB, &C2,
                               1, FLA_BOTTOM );

        /* c1t  = c1t  + a10t * B0,  computed as
           c1t' = c1t' + B0' * a10t'            */
        FLA_Gemv_external( FLA_TRANSPOSE, alpha, B0, a10t, FLA_ONE, c1t );

        /* c1t  = c1t  + a21' * B2,  computed as
           c1t' = c1t' + B2' * a21              */
        FLA_Gemv_external( FLA_TRANSPOSE, alpha, B2, a21, FLA_ONE, c1t );

        /* c1t = c1t + alpha11 * b1t */
        FLA_Axpys_external( alpha, alpha11, b1t, FLA_ONE, c1t );

        /* Advance the partitioning by one row/column. */
        FLA_Cont_with_3x3_to_2x2( &ATL, &ATR, A00,  a01,     A02,
                                              a10t, alpha11, a12t,
                                  &ABL, &ABR, A20,  a21,     A22,
                                  FLA_TL );

        FLA_Cont_with_3x1_to_2x1( &BT, B0,
                                       b1t,
                                  &BB, B2,
                                  FLA_TOP );

        FLA_Cont_with_3x1_to_2x1( &CT, C0,
                                       c1t,
                                  &CB, C2,
                                  FLA_TOP );
    }

    return FLA_SUCCESS;
}
/*
 * Apply k sets of Givens rotations to a matrix A from the right, where each
 * set has the form
 *
 *     A := A ( G(n-1,k) ... G(1,k) G(0,k) )'
 *        = A G(0,k)' G(1,k)' ... G(n-1,k)'
 *
 * Gik is the ith Givens rotation of the kth set and is stored in entry
 * (i,k) of G:
 *
 *     Gik = / gamma_ik  -sigma_ik \
 *           \ sigma_ik   gamma_ik /
 *
 * This variant iterates naively and applies rotations to two columns at a
 * time.
 *
 * This function is only the datatype dispatcher: it selects the
 * FLA_Apply_G_rf_as?_var1 kernel from the datatype of A.  G is read as
 * scomplex for single-precision A and as dcomplex for double-precision A.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Apply_G_rf_asm_var1( FLA_Obj G, FLA_Obj A )
{
    FLA_Datatype dt_A   = FLA_Obj_datatype( A );

    int          n_sets = FLA_Obj_width( G );
    int          n_rows = FLA_Obj_length( A );
    int          n_cols = FLA_Obj_width( A );

    int          rs_G   = FLA_Obj_row_stride( G );
    int          cs_G   = FLA_Obj_col_stride( G );

    int          rs_A   = FLA_Obj_row_stride( A );
    int          cs_A   = FLA_Obj_col_stride( A );

    switch ( dt_A )
    {
        case FLA_FLOAT:
        {
            scomplex* g = ( scomplex* ) FLA_COMPLEX_PTR( G );
            float*    a = ( float*    ) FLA_FLOAT_PTR( A );

            FLA_Apply_G_rf_ass_var1( n_sets, n_rows, n_cols,
                                     g, rs_G, cs_G,
                                     a, rs_A, cs_A );
            break;
        }

        case FLA_DOUBLE:
        {
            dcomplex* g = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( G );
            double*   a = ( double*   ) FLA_DOUBLE_PTR( A );

            FLA_Apply_G_rf_asd_var1( n_sets, n_rows, n_cols,
                                     g, rs_G, cs_G,
                                     a, rs_A, cs_A );
            break;
        }

        case FLA_COMPLEX:
        {
            scomplex* g = ( scomplex* ) FLA_COMPLEX_PTR( G );
            scomplex* a = ( scomplex* ) FLA_COMPLEX_PTR( A );

            FLA_Apply_G_rf_asc_var1( n_sets, n_rows, n_cols,
                                     g, rs_G, cs_G,
                                     a, rs_A, cs_A );
            break;
        }

        case FLA_DOUBLE_COMPLEX:
        {
            dcomplex* g = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( G );
            dcomplex* a = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( A );

            FLA_Apply_G_rf_asz_var1( n_sets, n_rows, n_cols,
                                     g, rs_G, cs_G,
                                     a, rs_A, cs_A );
            break;
        }
    }

    return FLA_SUCCESS;
}
/*
 * Eigenvalue decomposition driver (lower-triangular storage, variant 4)
 * that also reports how long its main stages take.
 *
 * Parameters:
 *   n_iter_max  - forwarded unchanged to FLA_Tevd_v_opt_var4.
 *   A           - input matrix; overwritten (Q is formed in place and then
 *                 passed to the tridiagonal EVD and to FLA_Sort_evd).
 *   l           - output vector that receives the eigenvalues.
 *   k_accum     - number of columns of the Givens workspace G.
 *   b_alg       - forwarded unchanged to FLA_Tevd_v_opt_var4.
 *   dtime_tred  - out: elapsed FLA_Clock() time of FLA_Tridiag_UT.
 *   dtime_tevd  - out: elapsed FLA_Clock() time of FLA_Tevd_v_opt_var4.
 *   dtime_appq  - out: elapsed FLA_Clock() time of FLA_Tridiag_UT_form_Q.
 *
 * All three timers are preset to 1 and keep that value on the 1x1 early
 * return.
 *
 * Returns FLA_SUCCESS for the 1x1 case, otherwise the value returned by
 * FLA_Tevd_v_opt_var4.
 */
FLA_Error FLA_Hevd_lv_var4_components( dim_t n_iter_max, FLA_Obj A, FLA_Obj l, dim_t k_accum, dim_t b_alg,
                                       double* dtime_tred, double* dtime_tevd, double* dtime_appq )
{
    FLA_Error    r_val = FLA_SUCCESS;
    FLA_Uplo     uplo  = FLA_LOWER_TRIANGULAR;
    FLA_Datatype dt;
    FLA_Datatype dt_real;
    FLA_Datatype dt_comp;
    FLA_Obj      T, r, d, e, G, R, W;
    FLA_Obj      d0, e0, ls, pu;
    dim_t        mn_A;
    dim_t        n_G = k_accum;
    double       dtime_temp;

    mn_A    = FLA_Obj_length( A );
    dt      = FLA_Obj_datatype( A );
    dt_real = FLA_Obj_datatype_proj_to_real( A );
    dt_comp = FLA_Obj_datatype_proj_to_complex( A );

    *dtime_tred = 1;
    *dtime_tevd = 1;
    *dtime_appq = 1;

    // If the matrix is a scalar, then the EVD is easy.
    if ( mn_A == 1 )
    {
        FLA_Copy( A, l );
        FLA_Set( FLA_ONE, A );

        return FLA_SUCCESS;
    }

    // Create a matrix to hold block Householder transformations.
    FLA_Tridiag_UT_create_T( A, &T );

    // Create a vector to hold the realifying scalars.
    FLA_Obj_create( dt,      mn_A,     1, 0, 0, &r );

    // Create vectors to hold the diagonal and sub-diagonal.
    FLA_Obj_create( dt_real, mn_A,     1, 0, 0, &d );
    FLA_Obj_create( dt_real, mn_A-1,   1, 0, 0, &e );

    // Working copies for the eigenvalue-only pass, plus the remaining
    // workspace objects handed to FLA_Tevd_v_opt_var4.
    FLA_Obj_create( dt_real, mn_A,     1, 0, 0, &d0 );
    FLA_Obj_create( dt_real, mn_A-1,   1, 0, 0, &e0 );
    FLA_Obj_create( dt_real, mn_A,     1, 0, 0, &pu );
    FLA_Obj_create( FLA_INT, mn_A,     1, 0, 0, &ls );
    FLA_Obj_create( dt_comp, mn_A-1, n_G, 0, 0, &G );
    FLA_Obj_create( dt_real, mn_A,  mn_A, 0, 0, &R );
    FLA_Obj_create( dt,      mn_A,  mn_A, 0, 0, &W );

    dtime_temp = FLA_Clock();
    {
        // Reduce the matrix to tridiagonal form.
        FLA_Tridiag_UT( uplo, A, T );
    }
    *dtime_tred = FLA_Clock() - dtime_temp;

    // Apply scalars to rotate elements on the sub-diagonal to the real domain.
    FLA_Tridiag_UT_realify( uplo, A, r );

    // Extract the diagonal and sub-diagonal from A.
    FLA_Tridiag_UT_extract_diagonals( uplo, A, d, e );

    dtime_temp = FLA_Clock();
    {
        // Form Q, overwriting A.
        FLA_Tridiag_UT_form_Q( uplo, A, T );
    }
    *dtime_appq = FLA_Clock() - dtime_temp;

    // Apply the scalars in r to Q.
    FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, r, A );

    // Find the eigenvalues only, working on copies so that d and e stay
    // intact for the full decomposition below.
    FLA_Copy( d, d0 );
    FLA_Copy( e, e0 );
    {
        // LAPACK expects the order as a pointer to a Fortran INTEGER.
        // mn_A is a dim_t, so pass an int copy instead of &mn_A; handing
        // over a pointer to a differently sized/signed object is undefined.
        int     n_lapack = ( int ) mn_A;
        int     info;
        // NOTE(review): d0/e0 are created with dt_real but are read here as
        // double buffers, so this path looks valid only for double-precision
        // A -- confirm against callers.
        double* buff_d   = FLA_DOUBLE_PTR( d0 );
        double* buff_e   = FLA_DOUBLE_PTR( e0 );

        dsterf_( &n_lapack, buff_d, buff_e, &info );
    }
    FLA_Sort( FLA_FORWARD, d0 );

    FLA_Set( FLA_ZERO, ls );
    FLA_Set( FLA_ZERO, pu );

    dtime_temp = FLA_Clock();
    {
        // Perform an eigenvalue decomposition on the tridiagonal matrix.
        r_val = FLA_Tevd_v_opt_var4( n_iter_max, d, e, d0, ls, pu, G, R, W, A, b_alg );
    }
    *dtime_tevd = FLA_Clock() - dtime_temp;

    // Copy the converged eigenvalues to the output vector.
    FLA_Copy( d, l );

    // Sort the eigenvalues and eigenvectors in ascending order.
    FLA_Sort_evd( FLA_FORWARD, l, A );

    FLA_Obj_free( &T );
    FLA_Obj_free( &r );
    FLA_Obj_free( &d );
    FLA_Obj_free( &e );
    FLA_Obj_free( &d0 );
    FLA_Obj_free( &pu );
    FLA_Obj_free( &e0 );
    FLA_Obj_free( &ls );
    FLA_Obj_free( &G );
    FLA_Obj_free( &R );
    FLA_Obj_free( &W );

    return r_val;
}
/*
 * Bidiagonal reduction of A through the external LAPACK ?gebrd routines.
 *
 * When FLA_ENABLE_EXTERNAL_LAPACK_INTERFACES is defined, temporary objects
 * for the diagonal, the off-diagonal and the LAPACK workspace are created,
 * the F77_?gebrd routine matching A's datatype is called with tu and tv as
 * the two tau outputs, and the temporaries are released again.  An object
 * with a zero dimension returns FLA_SUCCESS immediately.  Without that
 * macro the function reports FLA_EXTERNAL_LAPACK_NOT_IMPLEMENTED through
 * FLA_Check_error_code.
 *
 * Returns the LAPACK info value (0 if no LAPACK routine was called).
 */
FLA_Error FLA_Bidiag_blk_external( FLA_Obj A, FLA_Obj tu, FLA_Obj tv )
{
    int          info = 0;
#ifdef FLA_ENABLE_EXTERNAL_LAPACK_INTERFACES
    FLA_Datatype datatype;
    int          m_A, n_A, cs_A;
    int          min_m_n, max_m_n;
    int          lwork;
    FLA_Obj      diag, offdiag, work;

    if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING )
        FLA_Bidiag_check( A, tu, tv );

    if ( FLA_Obj_has_zero_dim( A ) )
        return FLA_SUCCESS;

    datatype = FLA_Obj_datatype( A );

    m_A      = FLA_Obj_length( A );
    n_A      = FLA_Obj_width( A );
    min_m_n  = FLA_Obj_min_dim( A );
    max_m_n  = FLA_Obj_max_dim( A );   /* queried but not used below */
    cs_A     = FLA_Obj_col_stride( A );

    /* Real-valued outputs for the diagonal and off-diagonal of the result. */
    FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), min_m_n,     1, 0, 0, &diag );
    FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), min_m_n - 1, 1, 0, 0, &offdiag );

    /* Workspace sized from the minimum blocksize for this datatype. */
    lwork = ( m_A + n_A ) * FLA_Query_blocksize( datatype, FLA_DIMENSION_MIN );
    FLA_Obj_create( datatype, lwork, 1, 0, 0, &work );

    switch ( datatype )
    {
        case FLA_FLOAT:
        {
            float* buff_A    = ( float * ) FLA_FLOAT_PTR( A );
            float* buff_d    = ( float * ) FLA_FLOAT_PTR( diag );
            float* buff_e    = ( float * ) FLA_FLOAT_PTR( offdiag );
            float* buff_tu   = ( float * ) FLA_FLOAT_PTR( tu );
            float* buff_tv   = ( float * ) FLA_FLOAT_PTR( tv );
            float* buff_work = ( float * ) FLA_FLOAT_PTR( work );

            F77_sgebrd( &m_A, &n_A,
                        buff_A, &cs_A,
                        buff_d, buff_e,
                        buff_tu, buff_tv,
                        buff_work, &lwork,
                        &info );
            break;
        }

        case FLA_DOUBLE:
        {
            double* buff_A    = ( double * ) FLA_DOUBLE_PTR( A );
            double* buff_d    = ( double * ) FLA_DOUBLE_PTR( diag );
            double* buff_e    = ( double * ) FLA_DOUBLE_PTR( offdiag );
            double* buff_tu   = ( double * ) FLA_DOUBLE_PTR( tu );
            double* buff_tv   = ( double * ) FLA_DOUBLE_PTR( tv );
            double* buff_work = ( double * ) FLA_DOUBLE_PTR( work );

            F77_dgebrd( &m_A, &n_A,
                        buff_A, &cs_A,
                        buff_d, buff_e,
                        buff_tu, buff_tv,
                        buff_work, &lwork,
                        &info );
            break;
        }

        case FLA_COMPLEX:
        {
            scomplex* buff_A    = ( scomplex * ) FLA_COMPLEX_PTR( A );
            float*    buff_d    = ( float    * ) FLA_FLOAT_PTR( diag );
            float*    buff_e    = ( float    * ) FLA_FLOAT_PTR( offdiag );
            scomplex* buff_tu   = ( scomplex * ) FLA_COMPLEX_PTR( tu );
            scomplex* buff_tv   = ( scomplex * ) FLA_COMPLEX_PTR( tv );
            scomplex* buff_work = ( scomplex * ) FLA_COMPLEX_PTR( work );

            F77_cgebrd( &m_A, &n_A,
                        buff_A, &cs_A,
                        buff_d, buff_e,
                        buff_tu, buff_tv,
                        buff_work, &lwork,
                        &info );
            break;
        }

        case FLA_DOUBLE_COMPLEX:
        {
            dcomplex* buff_A    = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A );
            double*   buff_d    = ( double   * ) FLA_DOUBLE_PTR( diag );
            double*   buff_e    = ( double   * ) FLA_DOUBLE_PTR( offdiag );
            dcomplex* buff_tu   = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( tu );
            dcomplex* buff_tv   = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( tv );
            dcomplex* buff_work = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( work );

            F77_zgebrd( &m_A, &n_A,
                        buff_A, &cs_A,
                        buff_d, buff_e,
                        buff_tu, buff_tv,
                        buff_work, &lwork,
                        &info );
            break;
        }
    }

    FLA_Obj_free( &diag );
    FLA_Obj_free( &offdiag );
    FLA_Obj_free( &work );
#else
    FLA_Check_error_code( FLA_EXTERNAL_LAPACK_NOT_IMPLEMENTED );
#endif

    return info;
}
FLA_Error FLA_Svd_uv_var2_components( dim_t n_iter_max, dim_t k_accum, dim_t b_alg, FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V, double* dtime_bred, double* dtime_bsvd, double* dtime_appq, double* dtime_qrfa, double* dtime_gemm ) { FLA_Error r_val = FLA_SUCCESS; FLA_Datatype dt; FLA_Datatype dt_real; FLA_Datatype dt_comp; FLA_Obj T, S, rL, rR, d, e, G, H, RG, RH, W; dim_t m_A, n_A; dim_t min_m_n; dim_t n_GH; double crossover_ratio = 17.0 / 9.0; double dtime_temp; n_GH = k_accum; m_A = FLA_Obj_length( A ); n_A = FLA_Obj_width( A ); min_m_n = FLA_Obj_min_dim( A ); dt = FLA_Obj_datatype( A ); dt_real = FLA_Obj_datatype_proj_to_real( A ); dt_comp = FLA_Obj_datatype_proj_to_complex( A ); // If the matrix is a scalar, then the SVD is easy. if ( min_m_n == 1 ) { FLA_Copy( A, s ); FLA_Set_to_identity( U ); FLA_Set_to_identity( V ); return FLA_SUCCESS; } // Create matrices to hold block Householder transformations. FLA_Bidiag_UT_create_T( A, &T, &S ); // Create vectors to hold the realifying scalars. FLA_Obj_create( dt, min_m_n, 1, 0, 0, &rL ); FLA_Obj_create( dt, min_m_n, 1, 0, 0, &rR ); // Create vectors to hold the diagonal and sub-diagonal. FLA_Obj_create( dt_real, min_m_n, 1, 0, 0, &d ); FLA_Obj_create( dt_real, min_m_n-1, 1, 0, 0, &e ); // Create matrices to hold the left and right Givens scalars. FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &G ); FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &H ); // Create matrices to hold the left and right Givens matrices. FLA_Obj_create( dt_real, min_m_n, min_m_n, 0, 0, &RG ); FLA_Obj_create( dt_real, min_m_n, min_m_n, 0, 0, &RH ); FLA_Obj_create( dt, m_A, n_A, 0, 0, &W ); if ( m_A >= n_A ) { if ( m_A < crossover_ratio * n_A ) { dtime_temp = FLA_Clock(); { // Reduce the matrix to bidiagonal form. // Apply scalars to rotate elements on the sub-diagonal to the real domain. // Extract the diagonal and sub-diagonal from A. 
FLA_Bidiag_UT( A, T, S ); FLA_Bidiag_UT_realify( A, rL, rR ); FLA_Bidiag_UT_extract_diagonals( A, d, e ); } *dtime_bred = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Form U and V. FLA_Bidiag_UT_form_U( A, T, U ); FLA_Bidiag_UT_form_V( A, S, V ); } *dtime_appq = FLA_Clock() - dtime_temp; // Apply the realifying scalars in rL and rR to U and V, respectively. { FLA_Obj UL, UR; FLA_Obj VL, VR; FLA_Part_1x2( U, &UL, &UR, min_m_n, FLA_LEFT ); FLA_Part_1x2( V, &VL, &VR, min_m_n, FLA_LEFT ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, rL, UL ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, VL ); } dtime_temp = FLA_Clock(); { // Perform a singular value decomposition on the bidiagonal matrix. r_val = FLA_Bsvd_v_opt_var2( n_iter_max, d, e, G, H, RG, RH, W, U, V, b_alg ); } *dtime_bsvd = FLA_Clock() - dtime_temp; } else // if ( crossover_ratio * n_A <= m_A ) { FLA_Obj TQ, R; FLA_Obj AT, AB; FLA_Obj UL, UR; //FLA_QR_UT_create_T( A, &TQ ); FLA_Obj_create( dt, 32, n_A, 0, 0, &TQ ); dtime_temp = FLA_Clock(); { // Perform a QR factorization on A and form Q in U. FLA_QR_UT( A, TQ ); } *dtime_qrfa = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { FLA_QR_UT_form_Q( A, TQ, U ); } *dtime_appq = FLA_Clock() - dtime_temp; FLA_Obj_free( &TQ ); // Set the lower triangle of R to zero and then copy the upper // triangle of A to R. FLA_Part_2x1( A, &AT, &AB, n_A, FLA_TOP ); FLA_Obj_create( dt, n_A, n_A, 0, 0, &R ); FLA_Setr( FLA_LOWER_TRIANGULAR, FLA_ZERO, R ); FLA_Copyr( FLA_UPPER_TRIANGULAR, AT, R ); dtime_temp = FLA_Clock(); { // Reduce the matrix to bidiagonal form. // Apply scalars to rotate elements on the superdiagonal to the real domain. // Extract the diagonal and superdiagonal from A. FLA_Bidiag_UT( R, T, S ); FLA_Bidiag_UT_realify( R, rL, rR ); FLA_Bidiag_UT_extract_diagonals( R, d, e ); } *dtime_bred = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Form V from right Householder vectors in upper triangle of R. 
FLA_Bidiag_UT_form_V( R, S, V ); // Form U in R. FLA_Bidiag_UT_form_U( R, T, R ); } *dtime_appq += FLA_Clock() - dtime_temp; // Apply the realifying scalars in rL and rR to U and V, respectively. FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, rL, R ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, V ); dtime_temp = FLA_Clock(); { // Perform a singular value decomposition on the bidiagonal matrix. r_val = FLA_Bsvd_v_opt_var2( n_iter_max, d, e, G, H, RG, RH, W, R, V, b_alg ); } *dtime_bsvd = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Multiply R into U, storing the result in A and then copying back // to U. FLA_Part_1x2( U, &UL, &UR, n_A, FLA_LEFT ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, UL, R, FLA_ZERO, A ); FLA_Copy( A, UL ); } *dtime_gemm = FLA_Clock() - dtime_temp; FLA_Obj_free( &R ); } } else // if ( m_A < n_A ) { FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED ); } // Copy the converged eigenvalues to the output vector. FLA_Copy( d, s ); // Sort the singular values and singular vectors in descending order. FLA_Sort_svd( FLA_BACKWARD, s, U, V ); FLA_Obj_free( &T ); FLA_Obj_free( &S ); FLA_Obj_free( &rL ); FLA_Obj_free( &rR ); FLA_Obj_free( &d ); FLA_Obj_free( &e ); FLA_Obj_free( &G ); FLA_Obj_free( &H ); FLA_Obj_free( &RG ); FLA_Obj_free( &RH ); FLA_Obj_free( &W ); return r_val; }
/*
 * Rank-1 update of A with vector x and scalar alpha, carried out by the
 * BLIS wrappers.
 *
 * Real datatypes are routed to bli_ssyr / bli_dsyr; complex datatypes are
 * routed to bli_cher / bli_zher with a no-conjugate flag.  In the complex
 * cases alpha is read through the real-valued pointer of matching
 * precision.  FLA_Her_check runs under full error checking, and an A with
 * a zero dimension returns immediately.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Her_external( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj x, FLA_Obj A )
{
    FLA_Datatype dt_A;
    int          dim_A;
    int          rs_A, cs_A;
    int          inc_x;
    uplo_t       blis_uplo;
    conj_t       blis_conj;

    if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING )
        FLA_Her_check( uplo, alpha, x, A );

    if ( FLA_Obj_has_zero_dim( A ) )
        return FLA_SUCCESS;

    dt_A  = FLA_Obj_datatype( A );

    dim_A = FLA_Obj_length( A );
    rs_A  = FLA_Obj_row_stride( A );
    cs_A  = FLA_Obj_col_stride( A );

    inc_x = FLA_Obj_vector_inc( x );

    FLA_Param_map_flame_to_blis_uplo( uplo, &blis_uplo );
    FLA_Param_map_flame_to_blis_conj( FLA_NO_CONJUGATE, &blis_conj );

    switch ( dt_A )
    {
        case FLA_FLOAT:
        {
            float* a     = ( float * ) FLA_FLOAT_PTR( A );
            float* xv    = ( float * ) FLA_FLOAT_PTR( x );
            float* scale = ( float * ) FLA_FLOAT_PTR( alpha );

            bli_ssyr( blis_uplo,
                      dim_A,
                      scale,
                      xv, inc_x,
                      a, rs_A, cs_A );
            break;
        }

        case FLA_DOUBLE:
        {
            double* a     = ( double * ) FLA_DOUBLE_PTR( A );
            double* xv    = ( double * ) FLA_DOUBLE_PTR( x );
            double* scale = ( double * ) FLA_DOUBLE_PTR( alpha );

            bli_dsyr( blis_uplo,
                      dim_A,
                      scale,
                      xv, inc_x,
                      a, rs_A, cs_A );
            break;
        }

        case FLA_COMPLEX:
        {
            scomplex* a     = ( scomplex * ) FLA_COMPLEX_PTR( A );
            scomplex* xv    = ( scomplex * ) FLA_COMPLEX_PTR( x );
            float*    scale = ( float    * ) FLA_FLOAT_PTR( alpha );

            bli_cher( blis_uplo,
                      blis_conj,
                      dim_A,
                      scale,
                      xv, inc_x,
                      a, rs_A, cs_A );
            break;
        }

        case FLA_DOUBLE_COMPLEX:
        {
            dcomplex* a     = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A );
            dcomplex* xv    = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( x );
            double*   scale = ( double   * ) FLA_DOUBLE_PTR( alpha );

            bli_zher( blis_uplo,
                      blis_conj,
                      dim_A,
                      scale,
                      xv, inc_x,
                      a, rs_A, cs_A );
            break;
        }
    }

    return FLA_SUCCESS;
}
/*
 * Copy A into B through the BLIS bli_??copymt wrappers, converting between
 * datatypes where the pair (datatype of A, datatype of B) has a wrapper.
 *
 * The copy is issued without transposition when A and B are conformal as
 * given, and with transposition otherwise.  Dimensions passed to BLIS are
 * those of B.  Dispatch rules:
 *   - A is FLA_CONSTANT: A is read with B's datatype.
 *   - A is FLA_INT:      an integer copy is issued.
 *   - A is a floating-point type: the wrapper for (A type -> B type) is
 *     used, for B in { float, double, scomplex, dcomplex }.
 * Datatype pairs not listed result in no copy.
 *
 * FLA_Copy_check runs under full error checking, and an A with a zero
 * dimension returns immediately.  Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Copy_external( FLA_Obj A, FLA_Obj B )
{
    FLA_Datatype dt_A;
    FLA_Datatype dt_B;
    int          m_B, n_B;
    int          rs_A, cs_A;
    int          rs_B, cs_B;
    trans_t      blis_trans;

    if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING )
        FLA_Copy_check( A, B );

    if ( FLA_Obj_has_zero_dim( A ) )
        return FLA_SUCCESS;

    dt_A = FLA_Obj_datatype( A );
    dt_B = FLA_Obj_datatype( B );

    rs_A = FLA_Obj_row_stride( A );
    cs_A = FLA_Obj_col_stride( A );

    m_B  = FLA_Obj_length( B );
    n_B  = FLA_Obj_width( B );
    rs_B = FLA_Obj_row_stride( B );
    cs_B = FLA_Obj_col_stride( B );

    // Anything that is not conformal as-is is treated as the transposed
    // case.
    FLA_Param_map_flame_to_blis_trans(
        FLA_Obj_is_conformal_to( FLA_NO_TRANSPOSE, A, B ) ? FLA_NO_TRANSPOSE
                                                          : FLA_TRANSPOSE,
        &blis_trans );

    switch ( dt_A )
    {
        case FLA_CONSTANT:
        {
            // A constant carries a value for every numerical type, so the
            // datatype of B selects which one is read.
            switch ( dt_B )
            {
                case FLA_FLOAT:
                {
                    float* src = ( float * ) FLA_FLOAT_PTR( A );
                    float* dst = ( float * ) FLA_FLOAT_PTR( B );

                    bli_scopymt( blis_trans, m_B, n_B, src, rs_A, cs_A, dst, rs_B, cs_B );
                    break;
                }
                case FLA_DOUBLE:
                {
                    double* src = ( double * ) FLA_DOUBLE_PTR( A );
                    double* dst = ( double * ) FLA_DOUBLE_PTR( B );

                    bli_dcopymt( blis_trans, m_B, n_B, src, rs_A, cs_A, dst, rs_B, cs_B );
                    break;
                }
                case FLA_COMPLEX:
                {
                    scomplex* src = ( scomplex * ) FLA_COMPLEX_PTR( A );
                    scomplex* dst = ( scomplex * ) FLA_COMPLEX_PTR( B );

                    bli_ccopymt( blis_trans, m_B, n_B, src, rs_A, cs_A, dst, rs_B, cs_B );
                    break;
                }
                case FLA_DOUBLE_COMPLEX:
                {
                    dcomplex* src = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A );
                    dcomplex* dst = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( B );

                    bli_zcopymt( blis_trans, m_B, n_B, src, rs_A, cs_A, dst, rs_B, cs_B );
                    break;
                }
            }
            break;
        }

        case FLA_INT:
        {
            int* src = ( int * ) FLA_INT_PTR( A );
            int* dst = ( int * ) FLA_INT_PTR( B );

            bli_icopymt( blis_trans, m_B, n_B, src, rs_A, cs_A, dst, rs_B, cs_B );
            break;
        }

        case FLA_FLOAT:
        {
            float* src = ( float * ) FLA_FLOAT_PTR( A );

            switch ( dt_B )
            {
                case FLA_FLOAT:
                {
                    float* dst = ( float * ) FLA_FLOAT_PTR( B );

                    bli_scopymt( blis_trans, m_B, n_B, src, rs_A, cs_A, dst, rs_B, cs_B );
                    break;
                }
                case FLA_DOUBLE:
                {
                    double* dst = ( double * ) FLA_DOUBLE_PTR( B );

                    bli_sdcopymt( blis_trans, m_B, n_B, src, rs_A, cs_A, dst, rs_B, cs_B );
                    break;
                }
                case FLA_COMPLEX:
                {
                    scomplex* dst = ( scomplex * ) FLA_COMPLEX_PTR( B );

                    bli_sccopymt( blis_trans, m_B, n_B, src, rs_A, cs_A, dst, rs_B, cs_B );
                    break;
                }
                case FLA_DOUBLE_COMPLEX:
                {
                    dcomplex* dst = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( B );

                    bli_szcopymt( blis_trans, m_B, n_B, src, rs_A, cs_A, dst, rs_B, cs_B );
                    break;
                }
            }
            break;
        }

        case FLA_DOUBLE:
        {
            double* src = ( double * ) FLA_DOUBLE_PTR( A );

            switch ( dt_B )
            {
                case FLA_FLOAT:
                {
                    float* dst = ( float * ) FLA_FLOAT_PTR( B );

                    bli_dscopymt( blis_trans, m_B, n_B, src, rs_A, cs_A, dst, rs_B, cs_B );
                    break;
                }
                case FLA_DOUBLE:
                {
                    double* dst = ( double * ) FLA_DOUBLE_PTR( B );

                    bli_dcopymt( blis_trans, m_B, n_B, src, rs_A, cs_A, dst, rs_B, cs_B );
                    break;
                }
                case FLA_COMPLEX:
                {
                    scomplex* dst = ( scomplex * ) FLA_COMPLEX_PTR( B );

                    bli_dccopymt( blis_trans, m_B, n_B, src, rs_A, cs_A, dst, rs_B, cs_B );
                    break;
                }
                case FLA_DOUBLE_COMPLEX:
                {
                    dcomplex* dst = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( B );

                    bli_dzcopymt( blis_trans, m_B, n_B, src, rs_A, cs_A, dst, rs_B, cs_B );
                    break;
                }
            }
            break;
        }

        case FLA_COMPLEX:
        {
            scomplex* src = ( scomplex * ) FLA_COMPLEX_PTR( A );

            switch ( dt_B )
            {
                case FLA_FLOAT:
                {
                    float* dst = ( float * ) FLA_FLOAT_PTR( B );

                    bli_cscopymt( blis_trans, m_B, n_B, src, rs_A, cs_A, dst, rs_B, cs_B );
                    break;
                }
                case FLA_DOUBLE:
                {
                    double* dst = ( double * ) FLA_DOUBLE_PTR( B );

                    bli_cdcopymt( blis_trans, m_B, n_B, src, rs_A, cs_A, dst, rs_B, cs_B );
                    break;
                }
                case FLA_COMPLEX:
                {
                    scomplex* dst = ( scomplex * ) FLA_COMPLEX_PTR( B );

                    bli_ccopymt( blis_trans, m_B, n_B, src, rs_A, cs_A, dst, rs_B, cs_B );
                    break;
                }
                case FLA_DOUBLE_COMPLEX:
                {
                    dcomplex* dst = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( B );

                    bli_czcopymt( blis_trans, m_B, n_B, src, rs_A, cs_A, dst, rs_B, cs_B );
                    break;
                }
            }
            break;
        }

        case FLA_DOUBLE_COMPLEX:
        {
            dcomplex* src = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A );

            switch ( dt_B )
            {
                case FLA_FLOAT:
                {
                    float* dst = ( float * ) FLA_FLOAT_PTR( B );

                    bli_zscopymt( blis_trans, m_B, n_B, src, rs_A, cs_A, dst, rs_B, cs_B );
                    break;
                }
                case FLA_DOUBLE:
                {
                    double* dst = ( double * ) FLA_DOUBLE_PTR( B );

                    bli_zdcopymt( blis_trans, m_B, n_B, src, rs_A, cs_A, dst, rs_B, cs_B );
                    break;
                }
                case FLA_COMPLEX:
                {
                    scomplex* dst = ( scomplex * ) FLA_COMPLEX_PTR( B );

                    bli_zccopymt( blis_trans, m_B, n_B, src, rs_A, cs_A, dst, rs_B, cs_B );
                    break;
                }
                case FLA_DOUBLE_COMPLEX:
                {
                    dcomplex* dst = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( B );

                    bli_zcopymt( blis_trans, m_B, n_B, src, rs_A, cs_A, dst, rs_B, cs_B );
                    break;
                }
            }
            break;
        }
    }

    return FLA_SUCCESS;
}
/*
 * Unblocked Symm, right side / upper-triangular storage, variant 5.
 *
 * C is scaled by beta up front.  The loop then moves through A from
 * bottom-right to top-left and through B and C from right to left, one
 * column at a time, applying
 *
 *     c1t = c1t + b1t * alpha11
 *     c1t = c1t + B2 * a12t'
 *     C2  = C2  + b1t * a12t
 *
 * each scaled by alpha.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Symm_ru_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C )
{
    FLA_Obj ATL, ATR, ABL, ABR;
    FLA_Obj A00,  a01,     A02;
    FLA_Obj a10t, alpha11, a12t;
    FLA_Obj A20,  a21,     A22;

    FLA_Obj BL, BR;
    FLA_Obj B0, b1t, B2;

    FLA_Obj CL, CR;
    FLA_Obj C0, c1t, C2;

    FLA_Scal_external( beta, C );

    FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_BR );
    FLA_Part_1x2( B, &BL, &BR, 0, FLA_RIGHT );
    FLA_Part_1x2( C, &CL, &CR, 0, FLA_RIGHT );

    while ( FLA_Obj_length( ABR ) < FLA_Obj_length( A ) )
    {
        /* Expose the current diagonal element and the matching columns. */
        FLA_Repart_2x2_to_3x3( ATL, ATR, &A00,  &a01,     &A02,
                                         &a10t, &alpha11, &a12t,
                               ABL, ABR, &A20,  &a21,     &A22,
                               1, 1, FLA_TL );

        FLA_Repart_1x2_to_1x3( BL, BR, &B0, &b1t, &B2,
                               1, FLA_LEFT );

        FLA_Repart_1x2_to_1x3( CL, CR, &C0, &c1t, &C2,
                               1, FLA_LEFT );

        /* c1t = c1t + b1t * alpha11 */
        FLA_Axpys_external( alpha, alpha11, b1t, FLA_ONE, c1t );

        /* c1t = c1t + B2 * a12t' */
        FLA_Gemv_external( FLA_NO_TRANSPOSE, alpha, B2, a12t, FLA_ONE, c1t );

        /* C2 = C2 + b1t * a12t */
        FLA_Ger_external( alpha, b1t, a12t, C2 );

        /* Advance the partitioning by one row/column. */
        FLA_Cont_with_3x3_to_2x2( &ATL, &ATR, A00,  a01,     A02,
                                              a10t, alpha11, a12t,
                                  &ABL, &ABR, A20,  a21,     A22,
                                  FLA_BR );

        FLA_Cont_with_1x3_to_1x2( &BL, &BR, B0, b1t, B2,
                                  FLA_RIGHT );

        FLA_Cont_with_1x3_to_1x2( &CL, &CR, C0, c1t, C2,
                                  FLA_RIGHT );
    }

    return FLA_SUCCESS;
}
/*
 * Datatype front end for the FLA_Fused_Gerc2_op?_var1 kernels.
 *
 * Effective computation:
 *
 *     A = A + alpha * ( u * y' + z * v' );
 *
 * The dimensions and strides of A and the increments of the four vectors
 * are queried once; all operands, including alpha, are then accessed with
 * the datatype of A and passed to the matching kernel.  A datatype without
 * a case below results in no kernel call.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Fused_Gerc2_opt_var1( FLA_Obj alpha, FLA_Obj u, FLA_Obj y, FLA_Obj z, FLA_Obj v, FLA_Obj A )
{
    FLA_Datatype dt_A   = FLA_Obj_datatype( A );

    int          n_rows = FLA_Obj_length( A );
    int          n_cols = FLA_Obj_width( A );

    int          rs_A   = FLA_Obj_row_stride( A );
    int          cs_A   = FLA_Obj_col_stride( A );

    int          inc_u  = FLA_Obj_vector_inc( u );
    int          inc_y  = FLA_Obj_vector_inc( y );
    int          inc_z  = FLA_Obj_vector_inc( z );
    int          inc_v  = FLA_Obj_vector_inc( v );

    switch ( dt_A )
    {
        case FLA_FLOAT:
        {
            float* pA     = FLA_FLOAT_PTR( A );
            float* pu     = FLA_FLOAT_PTR( u );
            float* py     = FLA_FLOAT_PTR( y );
            float* pz     = FLA_FLOAT_PTR( z );
            float* pv     = FLA_FLOAT_PTR( v );
            float* palpha = FLA_FLOAT_PTR( alpha );

            FLA_Fused_Gerc2_ops_var1( n_rows, n_cols,
                                      palpha,
                                      pu, inc_u,
                                      py, inc_y,
                                      pz, inc_z,
                                      pv, inc_v,
                                      pA, rs_A, cs_A );
            break;
        }

        case FLA_DOUBLE:
        {
            double* pA     = FLA_DOUBLE_PTR( A );
            double* pu     = FLA_DOUBLE_PTR( u );
            double* py     = FLA_DOUBLE_PTR( y );
            double* pz     = FLA_DOUBLE_PTR( z );
            double* pv     = FLA_DOUBLE_PTR( v );
            double* palpha = FLA_DOUBLE_PTR( alpha );

            FLA_Fused_Gerc2_opd_var1( n_rows, n_cols,
                                      palpha,
                                      pu, inc_u,
                                      py, inc_y,
                                      pz, inc_z,
                                      pv, inc_v,
                                      pA, rs_A, cs_A );
            break;
        }

        case FLA_COMPLEX:
        {
            scomplex* pA     = FLA_COMPLEX_PTR( A );
            scomplex* pu     = FLA_COMPLEX_PTR( u );
            scomplex* py     = FLA_COMPLEX_PTR( y );
            scomplex* pz     = FLA_COMPLEX_PTR( z );
            scomplex* pv     = FLA_COMPLEX_PTR( v );
            scomplex* palpha = FLA_COMPLEX_PTR( alpha );

            FLA_Fused_Gerc2_opc_var1( n_rows, n_cols,
                                      palpha,
                                      pu, inc_u,
                                      py, inc_y,
                                      pz, inc_z,
                                      pv, inc_v,
                                      pA, rs_A, cs_A );
            break;
        }

        case FLA_DOUBLE_COMPLEX:
        {
            dcomplex* pA     = FLA_DOUBLE_COMPLEX_PTR( A );
            dcomplex* pu     = FLA_DOUBLE_COMPLEX_PTR( u );
            dcomplex* py     = FLA_DOUBLE_COMPLEX_PTR( y );
            dcomplex* pz     = FLA_DOUBLE_COMPLEX_PTR( z );
            dcomplex* pv     = FLA_DOUBLE_COMPLEX_PTR( v );
            dcomplex* palpha = FLA_DOUBLE_COMPLEX_PTR( alpha );

            FLA_Fused_Gerc2_opz_var1( n_rows, n_cols,
                                      palpha,
                                      pu, inc_u,
                                      py, inc_y,
                                      pz, inc_z,
                                      pv, inc_v,
                                      pA, rs_A, cs_A );
            break;
        }
    }

    return FLA_SUCCESS;
}
FLA_Error FLA_Hemm_lu_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl ) { FLA_Obj ATL, ATR, A00, A01, A02, ABL, ABR, A10, A11, A12, A20, A21, A22; FLA_Obj BT, B0, BB, B1, B2; FLA_Obj CT, C0, CB, C1, C2; dim_t b; FLA_Scal_internal( beta, C, FLA_Cntl_sub_scal( cntl ) ); FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_BR ); FLA_Part_2x1( B, &BT, &BB, 0, FLA_BOTTOM ); FLA_Part_2x1( C, &CT, &CB, 0, FLA_BOTTOM ); while ( FLA_Obj_length( ABR ) < FLA_Obj_length( A ) ){ b = FLA_Determine_blocksize( ATL, FLA_TL, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, &A01, /**/ &A02, &A10, &A11, /**/ &A12, /* ************* */ /* ******************** */ ABL, /**/ ABR, &A20, &A21, /**/ &A22, b, b, FLA_TL ); FLA_Repart_2x1_to_3x1( BT, &B0, &B1, /* ** */ /* ** */ BB, &B2, b, FLA_TOP ); FLA_Repart_2x1_to_3x1( CT, &C0, &C1, /* ** */ /* ** */ CB, &C2, b, FLA_TOP ); /*------------------------------------------------------------*/ /* C1 = C1 + A01' * B0 */ FLA_Gemm_internal( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, alpha, A01, B0, FLA_ONE, C1, FLA_Cntl_sub_gemm1( cntl ) ); /* C1 = C1 + A11 * B1 */ FLA_Hemm_internal( FLA_LEFT, FLA_UPPER_TRIANGULAR, alpha, A11, B1, FLA_ONE, C1, FLA_Cntl_sub_hemm( cntl ) ); /* C1 = C1 + A12 * B2 */ FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, alpha, A12, B2, FLA_ONE, C1, FLA_Cntl_sub_gemm2( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, /**/ A01, A02, /* ************** */ /* ****************** */ A10, /**/ A11, A12, &ABL, /**/ &ABR, A20, /**/ A21, A22, FLA_BR ); FLA_Cont_with_3x1_to_2x1( &BT, B0, /* ** */ /* ** */ B1, &BB, B2, FLA_BOTTOM ); FLA_Cont_with_3x1_to_2x1( &CT, C0, /* ** */ /* ** */ C1, &CB, C2, FLA_BOTTOM ); } return FLA_SUCCESS; }
FLA_Error FLA_Symm_ru_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl ) { FLA_Obj ATL, ATR, A00, A01, A02, ABL, ABR, A10, A11, A12, A20, A21, A22; FLA_Obj BL, BR, B0, B1, B2; FLA_Obj CL, CR, C0, C1, C2; dim_t b; FLA_Scal_internal( beta, C, FLA_Cntl_sub_scal( cntl ) ); FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_TL ); FLA_Part_1x2( B, &BL, &BR, 0, FLA_LEFT ); FLA_Part_1x2( C, &CL, &CR, 0, FLA_LEFT ); while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) ){ b = FLA_Determine_blocksize( ABR, FLA_BR, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &A01, &A02, /* ************* */ /* ******************** */ &A10, /**/ &A11, &A12, ABL, /**/ ABR, &A20, /**/ &A21, &A22, b, b, FLA_BR ); FLA_Repart_1x2_to_1x3( BL, /**/ BR, &B0, /**/ &B1, &B2, b, FLA_RIGHT ); FLA_Repart_1x2_to_1x3( CL, /**/ CR, &C0, /**/ &C1, &C2, b, FLA_RIGHT ); /*------------------------------------------------------------*/ /* C0 = C0 + B1 * A01' */ FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, alpha, B1, A01, FLA_ONE, C0, FLA_Cntl_sub_gemm1( cntl ) ); /* C1 = C1 + B1 * A11 */ FLA_Symm_internal( FLA_RIGHT, FLA_UPPER_TRIANGULAR, alpha, A11, B1, FLA_ONE, C1, FLA_Cntl_sub_symm( cntl ) ); /* C2 = C2 + B1 * A12 */ FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, alpha, B1, A12, FLA_ONE, C2, FLA_Cntl_sub_gemm2( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, A01, /**/ A02, A10, A11, /**/ A12, /* ************** */ /* ****************** */ &ABL, /**/ &ABR, A20, A21, /**/ A22, FLA_TL ); FLA_Cont_with_1x3_to_1x2( &BL, /**/ &BR, B0, B1, /**/ B2, FLA_LEFT ); FLA_Cont_with_1x3_to_1x2( &CL, /**/ &CR, C0, C1, /**/ C2, FLA_LEFT ); } return FLA_SUCCESS; }
/*
 * FLA_Apply_G_lf_blk_var3
 *
 * Front end that forwards to the FLA_Apply_G_rf_bl?_var3 kernels with the
 * dimensions and strides of A exchanged: the kernel's m / row stride are
 * taken from A's width / column stride, and its n / column stride from A's
 * length / row stride.  In other words the "rf" kernel is driven on the
 * transposed view of A.
 *
 * G is always read through a complex pointer: single-precision complex when
 * A is FLA_FLOAT or FLA_COMPLEX, double-precision complex when A is
 * FLA_DOUBLE or FLA_DOUBLE_COMPLEX.  For any other datatype of A no kernel
 * is called.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Apply_G_lf_blk_var3( FLA_Obj G, FLA_Obj A, dim_t b_alg )
{
  FLA_Datatype dt   = FLA_Obj_datatype( A );

  int          k    = FLA_Obj_width( G );
  int          g_rs = FLA_Obj_row_stride( G );
  int          g_cs = FLA_Obj_col_stride( G );

  /* Dimensions of A, deliberately exchanged. */
  int          n    = FLA_Obj_length( A );
  int          m    = FLA_Obj_width( A );

  /* Strides of A, deliberately exchanged. */
  int          a_cs = FLA_Obj_row_stride( A );
  int          a_rs = FLA_Obj_col_stride( A );

  switch ( dt )
  {
    case FLA_FLOAT:
    {
      scomplex* g_buf = ( scomplex* ) FLA_COMPLEX_PTR( G );
      float*    a_buf = ( float*    ) FLA_FLOAT_PTR( A );

      FLA_Apply_G_rf_bls_var3( k, m, n,
                               g_buf, g_rs, g_cs,
                               a_buf, a_rs, a_cs,
                               b_alg );
      break;
    }

    case FLA_DOUBLE:
    {
      dcomplex* g_buf = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( G );
      double*   a_buf = ( double*   ) FLA_DOUBLE_PTR( A );

      FLA_Apply_G_rf_bld_var3( k, m, n,
                               g_buf, g_rs, g_cs,
                               a_buf, a_rs, a_cs,
                               b_alg );
      break;
    }

    case FLA_COMPLEX:
    {
      scomplex* g_buf = ( scomplex* ) FLA_COMPLEX_PTR( G );
      scomplex* a_buf = ( scomplex* ) FLA_COMPLEX_PTR( A );

      FLA_Apply_G_rf_blc_var3( k, m, n,
                               g_buf, g_rs, g_cs,
                               a_buf, a_rs, a_cs,
                               b_alg );
      break;
    }

    case FLA_DOUBLE_COMPLEX:
    {
      dcomplex* g_buf = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( G );
      dcomplex* a_buf = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( A );

      FLA_Apply_G_rf_blz_var3( k, m, n,
                               g_buf, g_rs, g_cs,
                               a_buf, a_rs, a_cs,
                               b_alg );
      break;
    }
  }

  return FLA_SUCCESS;
}
/*
 * FLA_Herk_un_blk_var3
 *
 * Blocked loop over row panels of A.  For each exposed panel A1 it issues
 * two sub-operations, both of which receive alpha and beta directly (C is
 * not pre-scaled in this routine):
 *
 *   C12 : gemm with A1 and A2' (conjugate transpose)
 *   C11 : herk with A1, FLA_UPPER_TRIANGULAR / FLA_NO_TRANSPOSE
 *
 * A is repartitioned by rows starting with an empty bottom part; C is
 * repartitioned along its diagonal starting with an empty bottom-right
 * quadrant.  The block size for each step comes from the control tree.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Herk_un_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl )
{
  /* Views into A. */
  FLA_Obj AT, AB, A0, A1, A2;

  /* Views into C. */
  FLA_Obj CTL, CTR, CBL, CBR;
  FLA_Obj C00, C01, C02;
  FLA_Obj C10, C11, C12;
  FLA_Obj C20, C21, C22;

  dim_t   blk;

  FLA_Part_2x1( A, &AT,
                   &AB, 0, FLA_BOTTOM );
  FLA_Part_2x2( C, &CTL, &CTR,
                   &CBL, &CBR, 0, 0, FLA_BR );

  while ( FLA_Obj_length( AB ) < FLA_Obj_length( A ) )
  {
    blk = FLA_Determine_blocksize( AT, FLA_TOP, FLA_Cntl_blocksize( cntl ) );

    FLA_Repart_2x1_to_3x1( AT,   &A0,
                                 &A1,
                           AB,   &A2,   blk, FLA_TOP );
    FLA_Repart_2x2_to_3x3( CTL, CTR,   &C00, &C01, &C02,
                                       &C10, &C11, &C12,
                           CBL, CBR,   &C20, &C21, &C22,
                           blk, blk, FLA_TL );

    /*------------------------------------------------------------*/

    /* C12 = C12 + A1 * A2' */
    FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                       alpha, A1, A2, beta, C12,
                       FLA_Cntl_sub_gemm( cntl ) );

    /* C11 = C11 + A1 * A1' */
    FLA_Herk_internal( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE,
                       alpha, A1, beta, C11,
                       FLA_Cntl_sub_herk( cntl ) );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x1_to_2x1( &AT,   A0,
                                     A1,
                              &AB,   A2,   FLA_BOTTOM );
    FLA_Cont_with_3x3_to_2x2( &CTL, &CTR,   C00, C01, C02,
                                            C10, C11, C12,
                              &CBL, &CBR,   C20, C21, C22,
                              FLA_BR );
  }

  return FLA_SUCCESS;
}
/*
 * FLA_Hess_UT_step_unb_var2
 *
 * Unblocked loop that walks the diagonals of A and T together, one element
 * per iteration, until ATL has as many rows as T.  Whenever A22 is non-empty
 * the iteration:
 *
 *   - computes a Householder transform from a21 (tau stored in tau11),
 *   - temporarily overwrites the first element of a21 with one so that a21
 *     can serve as the vector u21,
 *   - forms y21 and z21 and uses them to update a12t, A02 and A22,
 *   - fills t01 with A20' * u21,
 *   - restores the saved first element of a21.
 *
 * Six 1x1 scratch objects and two length-m_A work vectors (y, z) are created
 * on entry and freed before returning.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Hess_UT_step_unb_var2( FLA_Obj A, FLA_Obj T )
{
  /* Views into A. */
  FLA_Obj ATL, ATR, ABL, ABR;
  FLA_Obj A00,  a01,     A02;
  FLA_Obj a10t, alpha11, a12t;
  FLA_Obj A20,  a21,     A22;

  /* Views into T. */
  FLA_Obj TTL, TTR, TBL, TBR;
  FLA_Obj T00,  t01,   T02;
  FLA_Obj t10t, tau11, t12t;
  FLA_Obj T20,  t21,   T22;

  /* Work vectors and their views. */
  FLA_Obj y, z;
  FLA_Obj yT, yB, y0, psi1,  y2;
  FLA_Obj zT, zB, z0, zeta1, z2;

  /* 1x1 scratch objects. */
  FLA_Obj inv_tau11;
  FLA_Obj minus_inv_tau11;
  FLA_Obj first_elem;
  FLA_Obj beta;
  FLA_Obj conj_beta;
  FLA_Obj dot_product;

  /* Top element / remainder of a21. */
  FLA_Obj a21_t, a21_b;

  dim_t        n_steps = FLA_Obj_length( T );
  FLA_Datatype dt_A    = FLA_Obj_datatype( A );
  dim_t        m_A     = FLA_Obj_length( A );

  FLA_Obj_create( dt_A, 1,   1, 0, 0, &inv_tau11 );
  FLA_Obj_create( dt_A, 1,   1, 0, 0, &minus_inv_tau11 );
  FLA_Obj_create( dt_A, 1,   1, 0, 0, &first_elem );
  FLA_Obj_create( dt_A, 1,   1, 0, 0, &beta );
  FLA_Obj_create( dt_A, 1,   1, 0, 0, &conj_beta );
  FLA_Obj_create( dt_A, 1,   1, 0, 0, &dot_product );
  FLA_Obj_create( dt_A, m_A, 1, 0, 0, &y );
  FLA_Obj_create( dt_A, m_A, 1, 0, 0, &z );

  FLA_Part_2x2( A, &ATL, &ATR,
                   &ABL, &ABR, 0, 0, FLA_TL );
  FLA_Part_2x2( T, &TTL, &TTR,
                   &TBL, &TBR, 0, 0, FLA_TL );
  FLA_Part_2x1( y, &yT,
                   &yB, 0, FLA_TOP );
  FLA_Part_2x1( z, &zT,
                   &zB, 0, FLA_TOP );

  while ( FLA_Obj_length( ATL ) < n_steps )
  {
    FLA_Repart_2x2_to_3x3( ATL, ATR,   &A00,  &a01,     &A02,
                                       &a10t, &alpha11, &a12t,
                           ABL, ABR,   &A20,  &a21,     &A22,
                           1, 1, FLA_BR );
    FLA_Repart_2x2_to_3x3( TTL, TTR,   &T00,  &t01,   &T02,
                                       &t10t, &tau11, &t12t,
                           TBL, TBR,   &T20,  &t21,   &T22,
                           1, 1, FLA_BR );
    FLA_Repart_2x1_to_3x1( yT,   &y0,
                                 &psi1,
                           yB,   &y2,   1, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( zT,   &z0,
                                 &zeta1,
                           zB,   &z2,   1, FLA_BOTTOM );

    /*------------------------------------------------------------*/

    if ( FLA_Obj_length( A22 ) > 0 )
    {
      FLA_Part_2x1( a21, &a21_t,
                         &a21_b, 1, FLA_TOP );

      // [ u21, tau11, a21 ] = House( a21 );
      FLA_Househ2_UT( FLA_LEFT, a21_t, a21_b, tau11 );

      // inv_tau11       =  1 / tau11;
      // minus_inv_tau11 = -1 / tau11;
      FLA_Set( FLA_ONE, inv_tau11 );
      FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 );
      FLA_Copy( inv_tau11, minus_inv_tau11 );
      FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 );

      // Stash the leading element of a21 and replace it with one, so that
      // a21 doubles as u21 below.  It is put back at the end of this branch.
      FLA_Copy( a21_t, first_elem );
      FLA_Set( FLA_ONE, a21_t );

      // y21 = A22' * u21;
      FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, y2 );

      // z21 = A22 * u21;
      FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, z2 );

      // beta      = u21' * z21 / 2;
      // conj_beta = conj(beta);
      FLA_Dotc( FLA_CONJUGATE, a21, z2, beta );
      FLA_Inv_scal( FLA_TWO, beta );
      FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta );

      // y21' = ( y21' - beta / tau * u21' ) / tau;
      // y21  = ( y21 - conj(beta) / tau * u21 ) / tau;
      FLA_Scal( minus_inv_tau11, conj_beta );
      FLA_Axpy( conj_beta, a21, y2 );
      FLA_Scal( inv_tau11, y2 );

      // z21 = ( z21 - beta / tau * u21 ) / tau;
      FLA_Scal( minus_inv_tau11, beta );
      FLA_Axpy( beta, a21, z2 );
      FLA_Scal( inv_tau11, z2 );

      // a12t = a12t * ( I - u21 * u21' / tau );
      //      = a12t - ( a12t * u21 ) * u21' / tau;
      FLA_Dot( a12t, a21, dot_product );
      FLA_Scal( minus_inv_tau11, dot_product );
      FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t );

      // A02 = A02 * ( I - u21 * u21' / tau );
      //     = A02 - ( A02 * u21 ) * u21' / tau;
      FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, y0 );
      FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, y0, a21, A02 );

      // A22 = A22 - u21 * y21' - z21 * u21';
      FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, a21, y2, A22 );
      FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z2, a21, A22 );

      // t01 = U20' * u21;
      FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 );

      // Put the stashed leading element of a21 back.
      FLA_Copy( first_elem, a21_t );
    }

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR,   A00,  a01,     A02,
                                            a10t, alpha11, a12t,
                              &ABL, &ABR,   A20,  a21,     A22,
                              FLA_TL );
    FLA_Cont_with_3x3_to_2x2( &TTL, &TTR,   T00,  t01,   T02,
                                            t10t, tau11, t12t,
                              &TBL, &TBR,   T20,  t21,   T22,
                              FLA_TL );
    FLA_Cont_with_3x1_to_2x1( &yT,   y0,
                                     psi1,
                              &yB,   y2,   FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &zT,   z0,
                                     zeta1,
                              &zB,   z2,   FLA_TOP );
  }

  FLA_Obj_free( &inv_tau11 );
  FLA_Obj_free( &minus_inv_tau11 );
  FLA_Obj_free( &first_elem );
  FLA_Obj_free( &beta );
  FLA_Obj_free( &conj_beta );
  FLA_Obj_free( &dot_product );
  FLA_Obj_free( &y );
  FLA_Obj_free( &z );

  return FLA_SUCCESS;
}
/*
 * FLA_LU_piv_opt_var5
 *
 * Object-level front end for the typed FLA_LU_piv_op?_var5 kernels.  It
 * reads the dimensions and strides of A and the increment of p, then calls
 * the kernel matching the datatype of A; p is always accessed as an int
 * buffer.
 *
 * Returns whatever the selected kernel returns, or FLA_SUCCESS when the
 * datatype of A is none of the four handled cases.
 */
FLA_Error FLA_LU_piv_opt_var5( FLA_Obj A, FLA_Obj p )
{
  FLA_Error    status = FLA_SUCCESS;

  FLA_Datatype dt     = FLA_Obj_datatype( A );

  int          m      = FLA_Obj_length( A );
  int          n      = FLA_Obj_width( A );
  int          rs     = FLA_Obj_row_stride( A );
  int          cs     = FLA_Obj_col_stride( A );

  int          p_inc  = FLA_Obj_vector_inc( p );

  switch ( dt )
  {
    case FLA_FLOAT:
    {
      float* a_buf = FLA_FLOAT_PTR( A );
      int*   p_buf = FLA_INT_PTR( p );

      status = FLA_LU_piv_ops_var5( m, n, a_buf, rs, cs, p_buf, p_inc );
      break;
    }

    case FLA_DOUBLE:
    {
      double* a_buf = FLA_DOUBLE_PTR( A );
      int*    p_buf = FLA_INT_PTR( p );

      status = FLA_LU_piv_opd_var5( m, n, a_buf, rs, cs, p_buf, p_inc );
      break;
    }

    case FLA_COMPLEX:
    {
      scomplex* a_buf = FLA_COMPLEX_PTR( A );
      int*      p_buf = FLA_INT_PTR( p );

      status = FLA_LU_piv_opc_var5( m, n, a_buf, rs, cs, p_buf, p_inc );
      break;
    }

    case FLA_DOUBLE_COMPLEX:
    {
      dcomplex* a_buf = FLA_DOUBLE_COMPLEX_PTR( A );
      int*      p_buf = FLA_INT_PTR( p );

      status = FLA_LU_piv_opz_var5( m, n, a_buf, rs, cs, p_buf, p_inc );
      break;
    }
  }

  return status;
}
/*
 * time_Trsm_lun
 *
 * Timing driver for the FLA_Trsm_lun_* implementations.
 *
 *   variant   0 selects REF_Trsm; 1..4 select the matching _var1.._var4
 *             routine.  Any other value runs nothing inside the timed region.
 *   type      FLA_ALG_UNBLOCKED or FLA_ALG_BLOCKED; anything else prints
 *             "trouble".
 *   nrepeats  number of timed repetitions; the smallest time is kept.
 *   nb_alg    block size placed in the control tree for blocked variants.
 *   A, C      operands passed to the solve; C is restored from a private
 *             copy before every repetition and again before returning.
 *   C_ref     for variant 0 it receives the computed C; otherwise it is the
 *             reference that C is compared against.
 *   dtime     out: best time observed (also used as scratch while timing).
 *   diff      out: 0.0 for variant 0, else FLA_Max_elemwise_diff( C, C_ref ).
 *   gflops    out: length(C) * width(C) * width(A) / best time / 1e9.
 *
 * The parameters n and B are not referenced by this routine.
 */
void time_Trsm_lun( int variant, int type, int nrepeats, int n, int nb_alg,
                    FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref,
                    double *dtime, double *diff, double *gflops )
{
  int              rep;
  double           best_time = 1.0e9;
  FLA_Obj          C_saved;

  fla_blocksize_t* blksz;
  fla_gemm_t*      gemm_leaf;
  fla_trsm_t*      trsm_leaf;
  fla_trsm_t*      trsm_tree;

  /* Control tree used by the blocked variants. */
  blksz     = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
  gemm_leaf = FLA_Cntl_gemm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL );
  trsm_leaf = FLA_Cntl_trsm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL );
  trsm_tree = FLA_Cntl_trsm_obj_create( FLA_FLAT, variant, blksz, trsm_leaf, gemm_leaf );

  /* Keep a pristine copy of C so every repetition starts from the same data. */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_saved );
  FLA_Copy_external( C, C_saved );

  for ( rep = 0; rep < nrepeats; rep++ )
  {
    FLA_Copy_external( C_saved, C );

    *dtime = FLA_Clock();

    switch ( variant )
    {
      case 0:
        // Reference implementation
        REF_Trsm( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE,
                  FLA_NONUNIT_DIAG, FLA_ONE, A, C );
        break;

      case 1:
      {
        // Variant 1
        switch ( type )
        {
          case FLA_ALG_UNBLOCKED:
            FLA_Trsm_lun_unb_var1( FLA_NONUNIT_DIAG, FLA_ONE, A, C );
            break;
          case FLA_ALG_BLOCKED:
            FLA_Trsm_lun_blk_var1( FLA_NONUNIT_DIAG, FLA_ONE, A, C, trsm_tree );
            break;
          default:
            printf("trouble\n");
        }
        break;
      }

      case 2:
      {
        // Variant 2
        switch ( type )
        {
          case FLA_ALG_UNBLOCKED:
            FLA_Trsm_lun_unb_var2( FLA_NONUNIT_DIAG, FLA_ONE, A, C );
            break;
          case FLA_ALG_BLOCKED:
            FLA_Trsm_lun_blk_var2( FLA_NONUNIT_DIAG, FLA_ONE, A, C, trsm_tree );
            break;
          default:
            printf("trouble\n");
        }
        break;
      }

      case 3:
      {
        // Variant 3
        switch ( type )
        {
          case FLA_ALG_UNBLOCKED:
            FLA_Trsm_lun_unb_var3( FLA_NONUNIT_DIAG, FLA_ONE, A, C );
            break;
          case FLA_ALG_BLOCKED:
            FLA_Trsm_lun_blk_var3( FLA_NONUNIT_DIAG, FLA_ONE, A, C, trsm_tree );
            break;
          default:
            printf("trouble\n");
        }
        break;
      }

      case 4:
      {
        // Variant 4
        switch ( type )
        {
          case FLA_ALG_UNBLOCKED:
            FLA_Trsm_lun_unb_var4( FLA_NONUNIT_DIAG, FLA_ONE, A, C );
            break;
          case FLA_ALG_BLOCKED:
            FLA_Trsm_lun_blk_var4( FLA_NONUNIT_DIAG, FLA_ONE, A, C, trsm_tree );
            break;
          default:
            printf("trouble\n");
        }
        break;
      }
    }

    *dtime = FLA_Clock() - *dtime;

    best_time = min( *dtime, best_time );
  }

  FLA_Cntl_obj_free( trsm_tree );
  FLA_Cntl_obj_free( trsm_leaf );
  FLA_Cntl_obj_free( gemm_leaf );
  FLA_Blocksize_free( blksz );

  if ( variant == 0 )
  {
    /* The reference run defines C_ref for later comparisons. */
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }

  *gflops = 1.0 *
            FLA_Obj_length( C ) *
            FLA_Obj_width( C ) *
            FLA_Obj_width( A ) /
            best_time /
            1.0e9;

  *dtime = best_time;

  FLA_Copy_external( C_saved, C );
  FLA_Obj_free( &C_saved );
}
FLA_Error FLA_Sylv_nn_blk_var8( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl ) { FLA_Obj ATL, ATR, A00, A01, A02, ABL, ABR, A10, A11, A12, A20, A21, A22; FLA_Obj BTL, BTR, B00, B01, B02, BBL, BBR, B10, B11, B12, B20, B21, B22; FLA_Obj CTL, CTR, C00, C01, C02, CBL, CBR, C10, C11, C12, C20, C21, C22; dim_t b; FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_BR ); FLA_Part_2x2( B, &BTL, &BTR, &BBL, &BBR, 0, 0, FLA_TL ); FLA_Part_2x2( C, &CTL, &CTR, &CBL, &CBR, 0, 0, FLA_BL ); while ( FLA_Obj_length( ABR ) < FLA_Obj_length( A ) ){ b = FLA_Determine_blocksize( CTR, FLA_TR, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, &A01, /**/ &A02, &A10, &A11, /**/ &A12, /* ************* */ /* ******************** */ ABL, /**/ ABR, &A20, &A21, /**/ &A22, b, b, FLA_TL ); FLA_Repart_2x2_to_3x3( BTL, /**/ BTR, &B00, /**/ &B01, &B02, /* ************* */ /* ******************** */ &B10, /**/ &B11, &B12, BBL, /**/ BBR, &B20, /**/ &B21, &B22, b, b, FLA_BR ); FLA_Repart_2x2_to_3x3( CTL, /**/ CTR, &C00, /**/ &C01, &C02, &C10, /**/ &C11, &C12, /* ************* */ /* ******************** */ CBL, /**/ CBR, &C20, /**/ &C21, &C22, b, b, FLA_TR ); // Loop Invariant: // CTL = CTL - ATR * sylv( ABR, BTL, CBL ) // CTR = CTR // CBL = sylv( ABR, BTL, CBL ) // CBR = sylv( ABR, BBR, CBR - sylv( ABR, BTL, CBL ) * BTR ) /*------------------------------------------------------------*/ // C10 = sylv( A11, B00, C10 ); FLA_Sylv_internal( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, isgn, A11, B00, C10, scale, FLA_Cntl_sub_sylv1( cntl ) ); // C00 = C00 - A01 * C10; FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A01, C10, FLA_ONE, C00, FLA_Cntl_sub_gemm1( cntl ) ); // C11 = sylv( A11, B11, C11 - A12 * C21 -/+ C10 * B01 ); FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_NEGATE( isgn ), C10, B01, FLA_ONE, C11, FLA_Cntl_sub_gemm2( cntl ) ); FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A12, C21, FLA_ONE, 
C11, FLA_Cntl_sub_gemm3( cntl ) ); FLA_Sylv_internal( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, isgn, A11, B11, C11, scale, FLA_Cntl_sub_sylv2( cntl ) ); // C01 = C01 - A01 * C11 - A02 * C21; FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A02, C21, FLA_ONE, C01, FLA_Cntl_sub_gemm4( cntl ) ); FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A01, C11, FLA_ONE, C01, FLA_Cntl_sub_gemm5( cntl ) ); // C12 = sylv( A11, B22, C12 - A12 * C22 -/+ C10 * B02 -/+ C11 * B12 ); FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_NEGATE( isgn ), C11, B12, FLA_ONE, C12, FLA_Cntl_sub_gemm6( cntl ) ); FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_NEGATE( isgn ), C10, B02, FLA_ONE, C12, FLA_Cntl_sub_gemm7( cntl ) ); FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A12, C22, FLA_ONE, C12, FLA_Cntl_sub_gemm8( cntl ) ); FLA_Sylv_internal( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, isgn, A11, B22, C12, scale, FLA_Cntl_sub_sylv3( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, /**/ A01, A02, /* ************** */ /* ****************** */ A10, /**/ A11, A12, &ABL, /**/ &ABR, A20, /**/ A21, A22, FLA_BR ); FLA_Cont_with_3x3_to_2x2( &BTL, /**/ &BTR, B00, B01, /**/ B02, B10, B11, /**/ B12, /* ************** */ /* ****************** */ &BBL, /**/ &BBR, B20, B21, /**/ B22, FLA_TL ); FLA_Cont_with_3x3_to_2x2( &CTL, /**/ &CTR, C00, C01, /**/ C02, /* ************** */ /* ****************** */ C10, C11, /**/ C12, &CBL, /**/ &CBR, C20, C21, /**/ C22, FLA_BL ); } return FLA_SUCCESS; }
/*
 * FLA_Svd_uv_unb_var1
 *
 * Driver that computes singular values into s and singular vectors into U
 * and V.  A is overwritten along the way.
 *
 * Outline:
 *   1. Allocate work objects (T, S, rL, rR, d, e, G, H, scale).
 *   2. Compute a scaling factor and, if it differs from FLA_ONE, scale A.
 *   3. If m_A < (17/9) * n_A: bidiagonalize A directly, form U and V, and run
 *      the bidiagonal SVD on them.
 *      Otherwise: QR-factor A first (Q goes to U), bidiagonalize the n_A x n_A
 *      factor R, run the bidiagonal SVD on R and V, then multiply the result
 *      back into the leading n_A columns of U (using A as scratch).
 *   4. Copy d to s, sort, undo the scaling on s, and free the work objects.
 *
 * n_iter_max and b_alg are forwarded to FLA_Bsvd_v_opt_var1; k_accum sets
 * the number of columns of G and H.
 *
 * Returns the value produced by FLA_Bsvd_v_opt_var1.
 */
FLA_Error FLA_Svd_uv_unb_var1( dim_t n_iter_max, FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V, dim_t k_accum, dim_t b_alg )
{
  FLA_Error    status          = FLA_SUCCESS;
  double       crossover_ratio = 17.0 / 9.0;

  dim_t        n_GH            = k_accum;
  dim_t        m_A             = FLA_Obj_length( A );
  dim_t        n_A             = FLA_Obj_width( A );
  dim_t        k               = FLA_Obj_min_dim( A );

  FLA_Datatype dt              = FLA_Obj_datatype( A );
  FLA_Datatype dt_real         = FLA_Obj_datatype_proj_to_real( A );
  FLA_Datatype dt_comp         = FLA_Obj_datatype_proj_to_complex( A );

  FLA_Obj      scale, T, S, rL, rR, d, e, G, H;

  // Block Householder accumulators.
  FLA_Bidiag_UT_create_T( A, &T, &S );

  // Realifying scalars.
  FLA_Obj_create( dt, k, 1, 0, 0, &rL );
  FLA_Obj_create( dt, k, 1, 0, 0, &rR );

  // Diagonal and sub-diagonal.
  FLA_Obj_create( dt_real, k,   1, 0, 0, &d );
  FLA_Obj_create( dt_real, k-1, 1, 0, 0, &e );

  // Left and right Givens scalars.
  FLA_Obj_create( dt_comp, k-1, n_GH, 0, 0, &G );
  FLA_Obj_create( dt_comp, k-1, n_GH, 0, 0, &H );

  // Real scaling factor.
  FLA_Obj_create( dt_real, 1, 1, 0, 0, &scale );

  // Compute the scaling factor, then apply it to A unless it equals one.
  FLA_Svd_compute_scaling( A, scale );

  if ( !FLA_Obj_equals( scale, FLA_ONE ) )
    FLA_Scal( scale, A );

  if ( m_A < crossover_ratio * n_A )
  {
    // Bidiagonalize A, rotate the superdiagonal into the real domain, and
    // pull out the real diagonal and superdiagonal.
    FLA_Bidiag_UT( A, T, S );
    FLA_Bidiag_UT_realify( A, rL, rR );
    FLA_Bidiag_UT_extract_real_diagonals( A, d, e );

    // Form U and V.
    FLA_Bidiag_UT_form_U( A, T, U );
    FLA_Bidiag_UT_form_V( A, S, V );

    // Fold the realifying scalars rL / rR into U / V.
    {
      FLA_Obj UL, UR;
      FLA_Obj VL, VR;

      FLA_Part_1x2( U, &UL, &UR, k, FLA_LEFT );
      FLA_Part_1x2( V, &VL, &VR, k, FLA_LEFT );

      FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE,    rL, UL );
      FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, VL );
    }

    // SVD of the bidiagonal matrix.
    status = FLA_Bsvd_v_opt_var1( n_iter_max, d, e, G, H, U, V, b_alg );
  }
  else // crossover_ratio * n_A <= m_A
  {
    FLA_Obj TQ, R;
    FLA_Obj AT, AB;
    FLA_Obj UL, UR;

    // QR-factor A and form Q in U.
    FLA_QR_UT_create_T( A, &TQ );
    FLA_QR_UT( A, TQ );
    FLA_QR_UT_form_Q( A, TQ, U );
    FLA_Obj_free( &TQ );

    // Build R: zero its lower triangle, then copy in the upper triangle of
    // the top n_A rows of A.
    FLA_Part_2x1( A, &AT,
                     &AB, n_A, FLA_TOP );
    FLA_Obj_create( dt, n_A, n_A, 0, 0, &R );
    FLA_Setr( FLA_LOWER_TRIANGULAR, FLA_ZERO, R );
    FLA_Copyr( FLA_UPPER_TRIANGULAR, AT, R );

    // Bidiagonalize R, rotate the superdiagonal into the real domain, and
    // pull out the real diagonal and superdiagonal.
    FLA_Bidiag_UT( R, T, S );
    FLA_Bidiag_UT_realify( R, rL, rR );
    FLA_Bidiag_UT_extract_real_diagonals( R, d, e );

    // V comes from the right Householder vectors stored in R.
    FLA_Bidiag_UT_form_V( R, S, V );

    // The left factor is formed in place, overwriting R.
    FLA_Bidiag_UT_form_U( R, T, R );

    // Fold the realifying scalars rL / rR into R / V.
    FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE,    rL, R );
    FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, V );

    // SVD of the bidiagonal matrix.
    status = FLA_Bsvd_v_opt_var1( n_iter_max, d, e, G, H, R, V, b_alg );

    // UL := UL * R, computed into A and then copied back.
    FLA_Part_1x2( U, &UL, &UR, n_A, FLA_LEFT );
    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, UL, R, FLA_ZERO, A );
    FLA_Copy( A, UL );

    FLA_Obj_free( &R );
  }

  // Hand the converged values to the caller's vector.
  FLA_Copy( d, s );

  // Sort singular values and vectors.
  FLA_Sort_svd( FLA_BACKWARD, s, U, V );

  // Undo the scaling on the singular values if one was applied.
  if ( !FLA_Obj_equals( scale, FLA_ONE ) )
    FLA_Inv_scal( scale, s );

  FLA_Obj_free( &scale );
  FLA_Obj_free( &T );
  FLA_Obj_free( &S );
  FLA_Obj_free( &rL );
  FLA_Obj_free( &rR );
  FLA_Obj_free( &d );
  FLA_Obj_free( &e );
  FLA_Obj_free( &G );
  FLA_Obj_free( &H );

  return status;
}
/*
 * FLA_Syr2k_ln_unb_var5
 *
 * Unblocked loop that scales the lower triangle of C by beta once and then,
 * for each exposed row pair ( a1t, b1t ), applies the alpha-weighted updates
 *
 *   c21     += A2 * b1t' + B2 * a1t'
 *   gamma11 += a1t * b1t' + b1t * a1t'
 *
 * A and B are repartitioned by rows starting with an empty bottom part; C is
 * repartitioned along its diagonal starting with an empty bottom-right
 * quadrant.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Syr2k_ln_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C )
{
  /* Views into A and B. */
  FLA_Obj AT, AB, A0, a1t, A2;
  FLA_Obj BT, BB, B0, b1t, B2;

  /* Views into C. */
  FLA_Obj CTL, CTR, CBL, CBR;
  FLA_Obj C00,  c01,     C02;
  FLA_Obj c10t, gamma11, c12t;
  FLA_Obj C20,  c21,     C22;

  FLA_Scalr_external( FLA_LOWER_TRIANGULAR, beta, C );

  FLA_Part_2x1( A, &AT,
                   &AB, 0, FLA_BOTTOM );
  FLA_Part_2x1( B, &BT,
                   &BB, 0, FLA_BOTTOM );
  FLA_Part_2x2( C, &CTL, &CTR,
                   &CBL, &CBR, 0, 0, FLA_BR );

  while ( FLA_Obj_length( AB ) < FLA_Obj_length( A ) )
  {
    FLA_Repart_2x1_to_3x1( AT,   &A0,
                                 &a1t,
                           AB,   &A2,   1, FLA_TOP );
    FLA_Repart_2x1_to_3x1( BT,   &B0,
                                 &b1t,
                           BB,   &B2,   1, FLA_TOP );
    FLA_Repart_2x2_to_3x3( CTL, CTR,   &C00,  &c01,     &C02,
                                       &c10t, &gamma11, &c12t,
                           CBL, CBR,   &C20,  &c21,     &C22,
                           1, 1, FLA_TL );

    /*------------------------------------------------------------*/

    /* c21 = c21 + A2 * b1t' */
    FLA_Gemv_external( FLA_NO_TRANSPOSE, alpha, A2, b1t, FLA_ONE, c21 );

    /* c21 = c21 + B2 * a1t' */
    FLA_Gemv_external( FLA_NO_TRANSPOSE, alpha, B2, a1t, FLA_ONE, c21 );

    /* gamma11 = gamma11 + a1t * b1t' + b1t * a1t' */
    FLA_Dot2s_external( alpha, a1t, b1t, FLA_ONE, gamma11 );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x1_to_2x1( &AT,   A0,
                                     a1t,
                              &AB,   A2,   FLA_BOTTOM );
    FLA_Cont_with_3x1_to_2x1( &BT,   B0,
                                     b1t,
                              &BB,   B2,   FLA_BOTTOM );
    FLA_Cont_with_3x3_to_2x2( &CTL, &CTR,   C00,  c01,     C02,
                                            c10t, gamma11, c12t,
                              &CBL, &CBR,   C20,  c21,     C22,
                              FLA_BR );
  }

  return FLA_SUCCESS;
}
/*
 * FLA_Tevd_v_opt_var2
 *
 * Object-level front end for the typed FLA_Tevd_v_op?_var2 kernels.  The
 * kernel is selected by the datatype of U.  In every case d, e and R are
 * accessed as real buffers and G as a complex buffer of matching precision;
 * W and U are real for FLA_FLOAT / FLA_DOUBLE and complex for FLA_COMPLEX /
 * FLA_DOUBLE_COMPLEX.
 *
 * n_iter_max and b_alg are forwarded unchanged.
 *
 * Returns whatever the selected kernel returns, or FLA_SUCCESS when the
 * datatype of U is none of the four handled cases.
 */
FLA_Error FLA_Tevd_v_opt_var2( dim_t n_iter_max, FLA_Obj d, FLA_Obj e, FLA_Obj G, FLA_Obj R, FLA_Obj W, FLA_Obj U, dim_t b_alg )
{
  FLA_Error    status = FLA_SUCCESS;

  FLA_Datatype dt     = FLA_Obj_datatype( U );

  int          m_A    = FLA_Obj_vector_dim( d );
  int          m_U    = FLA_Obj_length( U );
  int          n_G    = FLA_Obj_width( G );

  int          d_inc  = FLA_Obj_vector_inc( d );
  int          e_inc  = FLA_Obj_vector_inc( e );

  int          g_rs   = FLA_Obj_row_stride( G );
  int          g_cs   = FLA_Obj_col_stride( G );

  int          r_rs   = FLA_Obj_row_stride( R );
  int          r_cs   = FLA_Obj_col_stride( R );

  int          w_rs   = FLA_Obj_row_stride( W );
  int          w_cs   = FLA_Obj_col_stride( W );

  int          u_rs   = FLA_Obj_row_stride( U );
  int          u_cs   = FLA_Obj_col_stride( U );

  switch ( dt )
  {
    case FLA_FLOAT:
    {
      float*    d_buf = FLA_FLOAT_PTR( d );
      float*    e_buf = FLA_FLOAT_PTR( e );
      scomplex* g_buf = FLA_COMPLEX_PTR( G );
      float*    r_buf = FLA_FLOAT_PTR( R );
      float*    w_buf = FLA_FLOAT_PTR( W );
      float*    u_buf = FLA_FLOAT_PTR( U );

      status = FLA_Tevd_v_ops_var2( m_A, m_U, n_G, n_iter_max,
                                    d_buf, d_inc,
                                    e_buf, e_inc,
                                    g_buf, g_rs, g_cs,
                                    r_buf, r_rs, r_cs,
                                    w_buf, w_rs, w_cs,
                                    u_buf, u_rs, u_cs,
                                    b_alg );
      break;
    }

    case FLA_DOUBLE:
    {
      double*   d_buf = FLA_DOUBLE_PTR( d );
      double*   e_buf = FLA_DOUBLE_PTR( e );
      dcomplex* g_buf = FLA_DOUBLE_COMPLEX_PTR( G );
      double*   r_buf = FLA_DOUBLE_PTR( R );
      double*   w_buf = FLA_DOUBLE_PTR( W );
      double*   u_buf = FLA_DOUBLE_PTR( U );

      status = FLA_Tevd_v_opd_var2( m_A, m_U, n_G, n_iter_max,
                                    d_buf, d_inc,
                                    e_buf, e_inc,
                                    g_buf, g_rs, g_cs,
                                    r_buf, r_rs, r_cs,
                                    w_buf, w_rs, w_cs,
                                    u_buf, u_rs, u_cs,
                                    b_alg );
      break;
    }

    case FLA_COMPLEX:
    {
      float*    d_buf = FLA_FLOAT_PTR( d );
      float*    e_buf = FLA_FLOAT_PTR( e );
      scomplex* g_buf = FLA_COMPLEX_PTR( G );
      float*    r_buf = FLA_FLOAT_PTR( R );
      scomplex* w_buf = FLA_COMPLEX_PTR( W );
      scomplex* u_buf = FLA_COMPLEX_PTR( U );

      status = FLA_Tevd_v_opc_var2( m_A, m_U, n_G, n_iter_max,
                                    d_buf, d_inc,
                                    e_buf, e_inc,
                                    g_buf, g_rs, g_cs,
                                    r_buf, r_rs, r_cs,
                                    w_buf, w_rs, w_cs,
                                    u_buf, u_rs, u_cs,
                                    b_alg );
      break;
    }

    case FLA_DOUBLE_COMPLEX:
    {
      double*   d_buf = FLA_DOUBLE_PTR( d );
      double*   e_buf = FLA_DOUBLE_PTR( e );
      dcomplex* g_buf = FLA_DOUBLE_COMPLEX_PTR( G );
      double*   r_buf = FLA_DOUBLE_PTR( R );
      dcomplex* w_buf = FLA_DOUBLE_COMPLEX_PTR( W );
      dcomplex* u_buf = FLA_DOUBLE_COMPLEX_PTR( U );

      status = FLA_Tevd_v_opz_var2( m_A, m_U, n_G, n_iter_max,
                                    d_buf, d_inc,
                                    e_buf, e_inc,
                                    g_buf, g_rs, g_cs,
                                    r_buf, r_rs, r_cs,
                                    w_buf, w_rs, w_cs,
                                    u_buf, u_rs, u_cs,
                                    b_alg );
      break;
    }
  }

  return status;
}
FLA_Error FLA_Trmm_lut_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl ) { FLA_Obj ATL, ATR, A00, A01, A02, ABL, ABR, A10, A11, A12, A20, A21, A22; FLA_Obj BT, B0, BB, B1, B2; dim_t b; FLA_Scal_internal( alpha, B, FLA_Cntl_sub_scal( cntl ) ); FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_BR ); FLA_Part_2x1( B, &BT, &BB, 0, FLA_BOTTOM ); while ( FLA_Obj_length( ABR ) < FLA_Obj_length( A ) ){ b = FLA_Determine_blocksize( ATL, FLA_TL, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, &A01, /**/ &A02, &A10, &A11, /**/ &A12, /* ************* */ /* ******************** */ ABL, /**/ ABR, &A20, &A21, /**/ &A22, b, b, FLA_TL ); FLA_Repart_2x1_to_3x1( BT, &B0, &B1, /* ** */ /* ** */ BB, &B2, b, FLA_TOP ); /*------------------------------------------------------------*/ /* B2 = B2 + A12' * B1; */ FLA_Gemm_internal( FLA_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A12, B1, FLA_ONE, B2, FLA_Cntl_sub_gemm( cntl ) ); /* B1 = triu( A11' ) * B1; */ FLA_Trmm_internal( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_TRANSPOSE, diagA, FLA_ONE, A11, B1, FLA_Cntl_sub_trmm( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, /**/ A01, A02, /* ************** */ /* ****************** */ A10, /**/ A11, A12, &ABL, /**/ &ABR, A20, /**/ A21, A22, FLA_BR ); FLA_Cont_with_3x1_to_2x1( &BT, B0, /* ** */ /* ** */ B1, &BB, B2, FLA_BOTTOM ); } return FLA_SUCCESS; }
/*
   Symm_ll1_unb_var2

   Unblocked sweep over A (top-left to bottom-right, one row/column at a
   time) with B and C traversed top to bottom in lock step. In every
   iteration the following updates are applied, where a10t is the part of
   the current row of A left of the diagonal, alpha11 the diagonal element,
   and b1t / c1t the current rows of B / C:

     C0  := C0  + a10t' * b1t
     c1t := c1t + a10t * B0 + alpha11 * b1t

   Only a10t and alpha11 of A are read, i.e. entries on or below the
   diagonal.

   NOTE(review): the updates are expressed with the libflame front-ends
   FLA_Ger, FLA_Gemv and FLA_Axpy. They are assumed to accept row vectors
   (a10t, b1t, c1t) and a 1x1 object as the Axpy scalar - confirm against
   the libflame version in use.

   Parameters:
     A - square matrix; read only.
     B - matrix read row by row.
     C - matrix updated in place.

   Returns FLA_SUCCESS.
*/
int Symm_ll1_unb_var2( FLA_Obj A, FLA_Obj B, FLA_Obj C )
{
  FLA_Obj ATL,   ATR,      A00,  a01,     A02,
          ABL,   ABR,      a10t, alpha11, a12t,
                           A20,  a21,     A22;

  FLA_Obj BT,              B0,
          BB,              b1t,
                           B2;

  FLA_Obj CT,              C0,
          CB,              c1t,
                           C2;

  FLA_Part_2x2( A,    &ATL, &ATR,
                      &ABL, &ABR,     0, 0, FLA_TL );

  FLA_Part_2x1( B,    &BT,
                      &BB,            0, FLA_TOP );

  FLA_Part_2x1( C,    &CT,
                      &CB,            0, FLA_TOP );

  while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) ){

    FLA_Repart_2x2_to_3x3( ATL, /**/ ATR,       &A00,  /**/ &a01,     &A02,
                        /* ************* */   /* ************************** */
                                                &a10t, /**/ &alpha11, &a12t,
                           ABL, /**/ ABR,       &A20,  /**/ &a21,     &A22,
                           1, 1, FLA_BR );

    FLA_Repart_2x1_to_3x1( BT,                &B0,
                        /* ** */            /* *** */
                                              &b1t,
                           BB,                &B2,        1, FLA_BOTTOM );

    FLA_Repart_2x1_to_3x1( CT,                &C0,
                        /* ** */            /* *** */
                                              &c1t,
                           CB,                &C2,        1, FLA_BOTTOM );

    /*------------------------------------------------------------*/

    /* C0 = C0 + a10t' * b1t  (rank-1 update; a no-op while C0 is empty) */
    FLA_Ger( FLA_ONE, a10t, b1t, C0 );

    /* c1t = c1t + a10t * B0, computed as c1t' = c1t' + B0' * a10t' */
    FLA_Gemv( FLA_TRANSPOSE, FLA_ONE, B0, a10t, FLA_ONE, c1t );

    /* c1t = c1t + alpha11 * b1t */
    FLA_Axpy( alpha11, b1t, c1t );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR,       A00,  a01,     /**/ A02,
                                                     a10t, alpha11, /**/ a12t,
                            /* ************** */  /* ************************ */
                              &ABL, /**/ &ABR,       A20,  a21,     /**/ A22,
                              FLA_TL );

    FLA_Cont_with_3x1_to_2x1( &BT,                B0,
                                                  b1t,
                            /* ** */           /* *** */
                              &BB,                B2,     FLA_TOP );

    FLA_Cont_with_3x1_to_2x1( &CT,                C0,
                                                  c1t,
                            /* ** */           /* *** */
                              &CB,                C2,     FLA_TOP );
  }

  return FLA_SUCCESS;
}
/*
   FLA_Obj_has_nan

   Scans every element of A and reports whether any of them is NaN. A value
   is treated as NaN when it compares unequal to itself; for the complex
   datatypes the real and imaginary parts are tested separately.

   Parameter checking (FLA_Obj_has_nan_check) runs when the error-checking
   level is at least FLA_MIN_ERROR_CHECKING.

   Returns TRUE as soon as a NaN is found, FALSE otherwise. Datatypes other
   than the four floating-point cases below fall through the switch and
   yield FALSE.
*/
FLA_Bool FLA_Obj_has_nan( FLA_Obj A )
{
  FLA_Datatype datatype;
  dim_t        i, j, m, n, cs, rs;

  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
    FLA_Obj_has_nan_check( A );

  datatype = FLA_Obj_datatype( A );

  m        = FLA_Obj_length( A );
  n        = FLA_Obj_width( A );
  cs       = FLA_Obj_col_stride( A );
  rs       = FLA_Obj_row_stride( A );

  /* Element (i,j), with i the row index and j the column index, is
     addressed as buff[ i*rs + j*cs ]: the row index advances by the row
     stride and the column index by the column stride. Pairing them the
     other way round visits the wrong elements (and may leave the buffer)
     for non-square objects and sub-matrix views. */
  switch ( datatype )
  {
    case FLA_FLOAT:
    {
      float *buff = ( float * ) FLA_FLOAT_PTR( A );

      for ( j = 0; j < n; ++j )
        for ( i = 0; i < m; ++i )
        {
          float val = buff[ i*rs + j*cs ];
          if ( val != val ) return TRUE;
        }
      break;
    }

    case FLA_DOUBLE:
    {
      double *buff = ( double * ) FLA_DOUBLE_PTR( A );

      for ( j = 0; j < n; ++j )
        for ( i = 0; i < m; ++i )
        {
          double val = buff[ i*rs + j*cs ];
          if ( val != val ) return TRUE;
        }
      break;
    }

    case FLA_COMPLEX:
    {
      scomplex *buff = ( scomplex * ) FLA_COMPLEX_PTR( A );

      for ( j = 0; j < n; ++j )
        for ( i = 0; i < m; ++i )
        {
          scomplex val = buff[ i*rs + j*cs ];
          if ( val.real != val.real || val.imag != val.imag ) return TRUE;
        }
      break;
    }

    case FLA_DOUBLE_COMPLEX:
    {
      dcomplex *buff = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A );

      for ( j = 0; j < n; ++j )
        for ( i = 0; i < m; ++i )
        {
          dcomplex val = buff[ i*rs + j*cs ];
          if ( val.real != val.real || val.imag != val.imag ) return TRUE;
        }
      break;
    }
  }

  return FALSE;
}
FLA_Error FLA_Trmm_external_gpu( FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu ) { FLA_Datatype datatype; int m_B, n_B; int ldim_A; int ldim_B; char blas_side; char blas_uplo; char blas_trans; char blas_diag; if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING ) FLA_Trmm_check( side, uplo, trans, diag, alpha, A, B ); if ( FLA_Obj_has_zero_dim( B ) ) return FLA_SUCCESS; datatype = FLA_Obj_datatype( A ); ldim_A = FLA_Obj_length( A ); m_B = FLA_Obj_length( B ); n_B = FLA_Obj_width( B ); ldim_B = FLA_Obj_length( B ); FLA_Param_map_flame_to_netlib_side( side, &blas_side ); FLA_Param_map_flame_to_netlib_uplo( uplo, &blas_uplo ); FLA_Param_map_flame_to_netlib_trans( trans, &blas_trans ); FLA_Param_map_flame_to_netlib_diag( diag, &blas_diag ); switch( datatype ){ case FLA_FLOAT: { float *buff_alpha = ( float * ) FLA_FLOAT_PTR( alpha ); cublasStrmm( blas_side, blas_uplo, blas_trans, blas_diag, m_B, n_B, *buff_alpha, ( float * ) A_gpu, ldim_A, ( float * ) B_gpu, ldim_B ); break; } case FLA_DOUBLE: { double *buff_alpha = ( double * ) FLA_DOUBLE_PTR( alpha ); cublasDtrmm( blas_side, blas_uplo, blas_trans, blas_diag, m_B, n_B, *buff_alpha, ( double * ) A_gpu, ldim_A, ( double * ) B_gpu, ldim_B ); break; } case FLA_COMPLEX: { cuComplex *buff_alpha = ( cuComplex * ) FLA_COMPLEX_PTR( alpha ); cublasCtrmm( blas_side, blas_uplo, blas_trans, blas_diag, m_B, n_B, *buff_alpha, ( cuComplex * ) A_gpu, ldim_A, ( cuComplex * ) B_gpu, ldim_B ); break; } case FLA_DOUBLE_COMPLEX: { cuDoubleComplex *buff_alpha = ( cuDoubleComplex * ) FLA_DOUBLE_COMPLEX_PTR( alpha ); cublasZtrmm( blas_side, blas_uplo, blas_trans, blas_diag, m_B, n_B, *buff_alpha, ( cuDoubleComplex * ) A_gpu, ldim_A, ( cuDoubleComplex * ) B_gpu, ldim_B ); break; } } return FLA_SUCCESS; }
/*
   FLA_Symm_external

   Front-end that forwards a symmetric matrix-matrix multiply to the
   bli_?symm routine matching the datatype of A.

   Notes grounded in the code below:
     - FLA_Symm_check runs only under FLA_FULL_ERROR_CHECKING.
     - The routine returns immediately when C has a zero dimension.
     - The problem dimensions passed on are those of C; A, B and C are each
       described by their own row and column strides.
     - side and uplo are translated to their BLIS equivalents.

   Parameters:
     side, uplo  - operation flags.
     alpha, beta - scalars, passed by pointer.
     A, B        - input operands.
     C           - operand passed as the output buffer.

   Returns FLA_SUCCESS.
*/
FLA_Error FLA_Symm_external( FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C )
{
  FLA_Datatype dt;
  int          rows_C, cols_C;
  int          rs_A, cs_A;
  int          rs_B, cs_B;
  int          rs_C, cs_C;
  side_t       side_blis;
  uplo_t       uplo_blis;

  if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING )
    FLA_Symm_check( side, uplo, alpha, A, B, beta, C );

  /* Nothing to compute for an empty C. */
  if ( FLA_Obj_has_zero_dim( C ) )
    return FLA_SUCCESS;

  dt     = FLA_Obj_datatype( A );

  rs_A   = FLA_Obj_row_stride( A );
  cs_A   = FLA_Obj_col_stride( A );

  rs_B   = FLA_Obj_row_stride( B );
  cs_B   = FLA_Obj_col_stride( B );

  rows_C = FLA_Obj_length( C );
  cols_C = FLA_Obj_width( C );
  rs_C   = FLA_Obj_row_stride( C );
  cs_C   = FLA_Obj_col_stride( C );

  FLA_Param_map_flame_to_blis_side( side, &side_blis );
  FLA_Param_map_flame_to_blis_uplo( uplo, &uplo_blis );

  switch ( dt )
  {
    case FLA_FLOAT:
    {
      float *a_p     = ( float * ) FLA_FLOAT_PTR( A );
      float *b_p     = ( float * ) FLA_FLOAT_PTR( B );
      float *c_p     = ( float * ) FLA_FLOAT_PTR( C );
      float *alpha_p = ( float * ) FLA_FLOAT_PTR( alpha );
      float *beta_p  = ( float * ) FLA_FLOAT_PTR( beta );

      bli_ssymm( side_blis, uplo_blis,
                 rows_C, cols_C,
                 alpha_p,
                 a_p, rs_A, cs_A,
                 b_p, rs_B, cs_B,
                 beta_p,
                 c_p, rs_C, cs_C );
      break;
    }

    case FLA_DOUBLE:
    {
      double *a_p     = ( double * ) FLA_DOUBLE_PTR( A );
      double *b_p     = ( double * ) FLA_DOUBLE_PTR( B );
      double *c_p     = ( double * ) FLA_DOUBLE_PTR( C );
      double *alpha_p = ( double * ) FLA_DOUBLE_PTR( alpha );
      double *beta_p  = ( double * ) FLA_DOUBLE_PTR( beta );

      bli_dsymm( side_blis, uplo_blis,
                 rows_C, cols_C,
                 alpha_p,
                 a_p, rs_A, cs_A,
                 b_p, rs_B, cs_B,
                 beta_p,
                 c_p, rs_C, cs_C );
      break;
    }

    case FLA_COMPLEX:
    {
      scomplex *a_p     = ( scomplex * ) FLA_COMPLEX_PTR( A );
      scomplex *b_p     = ( scomplex * ) FLA_COMPLEX_PTR( B );
      scomplex *c_p     = ( scomplex * ) FLA_COMPLEX_PTR( C );
      scomplex *alpha_p = ( scomplex * ) FLA_COMPLEX_PTR( alpha );
      scomplex *beta_p  = ( scomplex * ) FLA_COMPLEX_PTR( beta );

      bli_csymm( side_blis, uplo_blis,
                 rows_C, cols_C,
                 alpha_p,
                 a_p, rs_A, cs_A,
                 b_p, rs_B, cs_B,
                 beta_p,
                 c_p, rs_C, cs_C );
      break;
    }

    case FLA_DOUBLE_COMPLEX:
    {
      dcomplex *a_p     = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A );
      dcomplex *b_p     = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( B );
      dcomplex *c_p     = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( C );
      dcomplex *alpha_p = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( alpha );
      dcomplex *beta_p  = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( beta );

      bli_zsymm( side_blis, uplo_blis,
                 rows_C, cols_C,
                 alpha_p,
                 a_p, rs_A, cs_A,
                 b_p, rs_B, cs_B,
                 beta_p,
                 c_p, rs_C, cs_C );
      break;
    }
  }

  return FLA_SUCCESS;
}
FLA_Error FLA_Apply_Q_UT_lnfc_blk_var1( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl ) /* Apply a unitary matrix Q to a matrix B from the left, B := Q B where Q is the forward product of Householder transformations: Q = H(0) H(1) ... H(k-1) where H(i) corresponds to the Householder vector stored below the diagonal in the ith column of A. Thus, the operation becomes: B := Q B = H(0) H(1) ... H(k-1) B From this, we can see that we must move through A from bottom-right to top- left, since the Householder vector for H(k-1) was stored in the last column of A. We intend to apply blocks of reflectors at a time, where a block reflector H of b consecutive Householder transforms may be expressed as: H = ( H(i) H(i+1) ... H(i+b-1) ) = ( I - U inv(T) U' ) where: - U is the strictly lower trapezoidal (with implicit unit diagonal) matrix of Householder vectors, stored below the diagonal of A in columns i through i+b-1, corresponding to H(i) through H(i+b-1). - T is the upper triangular block Householder matrix corresponding to Householder vectors i through i+b-1. Consider applying H to B as an intermediate step towards applying all of Q: B := H B = ( I - U inv(T) U' ) B = B - U inv(T) U' B We must move from bottom-right to top-left. So, we partition: U -> / U11 \ B -> / B1 \ T -> ( T2 T1 ) \ U21 / \ B2 / where: - U11 is stored in strictly lower triangle of A11 with implicit unit diagonal. - U21 is stored in A21. - T1 is an upper triangular block of row-panel matrix T. 
Substituting repartitioned U, B, and T, we have: / B1 \ := / B1 \ - / U11 \ inv(T1) / U11 \' / B1 \ \ B2 / \ B2 / \ U21 / \ U21 / \ B2 / = / B1 \ - / U11 \ inv(T1) ( U11' U21' ) / B1 \ \ B2 / \ U21 / \ B2 / = / B1 \ - / U11 \ inv(T1) ( U11' B1 + U21' B2 ) \ B2 / \ U21 / Thus, B1 is updated as: B1 := B1 - U11 inv(T1) ( U11' B1 + U21' B2 ) And B2 is updated as: B2 := B2 - U21 inv(T1) ( U11' B1 + U21' B2 ) Note that: inv(T1) ( U11' B1 + U21' B2 ) is common to both updates, and thus may be computed and stored in workspace, and then re-used. -FGVZ */ { FLA_Obj ATL, ATR, A00, A01, A02, ABL, ABR, A10, A11, A12, A20, A21, A22; FLA_Obj TL, TR, T0, T1, T2; FLA_Obj T1T, T2B; FLA_Obj WTL, WTR, WBL, WBR; FLA_Obj BT, B0, BB, B1, B2; dim_t b_alg, b; dim_t m_BR, n_BR; // Query the algorithmic blocksize by inspecting the length of T. b_alg = FLA_Obj_length( T ); // If m > n, then we have to initialize our partitionings carefully so // that we begin in the proper location in A and B (since we traverse // matrix A from BR to TL). if ( FLA_Obj_length( A ) > FLA_Obj_width( A ) ) { m_BR = FLA_Obj_length( A ) - FLA_Obj_width( A ); n_BR = 0; } else if ( FLA_Obj_length( A ) < FLA_Obj_width( A ) ) { m_BR = 0; n_BR = FLA_Obj_width( A ) - FLA_Obj_length( A ); } else { m_BR = 0; n_BR = 0; } FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, m_BR, n_BR, FLA_BR ); // A and T are dependent; we determine T matrix w.r.t. A FLA_Part_1x2( T, &TL, &TR, FLA_Obj_min_dim( A ), FLA_LEFT ); FLA_Part_2x1( B, &BT, &BB, m_BR, FLA_BOTTOM ); while ( FLA_Obj_min_dim( ATL ) > 0 ){ b = min( b_alg, FLA_Obj_min_dim( ATL ) ); // Since T was filled from left to right, and since we need to access them // in reverse order, we need to handle the case where the last block is // smaller than the other b x b blocks. 
if ( FLA_Obj_width( TR ) == 0 && FLA_Obj_width( T ) % b_alg > 0 ) b = FLA_Obj_width( T ) % b_alg; FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, &A01, /**/ &A02, &A10, &A11, /**/ &A12, /* ************* */ /* ******************** */ ABL, /**/ ABR, &A20, &A21, /**/ &A22, b, b, FLA_TL ); FLA_Repart_1x2_to_1x3( TL, /**/ TR, &T0, &T1, /**/ &T2, b, FLA_LEFT ); FLA_Repart_2x1_to_3x1( BT, &B0, &B1, /* ** */ /* ** */ BB, &B2, b, FLA_TOP ); /*------------------------------------------------------------*/ FLA_Part_2x1( T1, &T1T, &T2B, b, FLA_TOP ); FLA_Part_2x2( W, &WTL, &WTR, &WBL, &WBR, b, FLA_Obj_width( B1 ), FLA_TL ); // WTL = B1; FLA_Copyt_internal( FLA_NO_TRANSPOSE, B1, WTL, FLA_Cntl_sub_copyt( cntl ) ); // U11 = trilu( A11 ); // U21 = A21; // // WTL = inv( triu(T1T) ) * ( U11' * B1 + U21' * B2 ); FLA_Trmm_internal( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_UNIT_DIAG, FLA_ONE, A11, WTL, FLA_Cntl_sub_trmm1( cntl ) ); FLA_Gemm_internal( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A21, B2, FLA_ONE, WTL, FLA_Cntl_sub_gemm1( cntl ) ); FLA_Trsm_internal( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, T1T, WTL, FLA_Cntl_sub_trsm( cntl ) ); // B2 = B2 - U21 * WTL; // B1 = B1 - U11 * WTL; FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A21, WTL, FLA_ONE, B2, FLA_Cntl_sub_gemm2( cntl ) ); FLA_Trmm_internal( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_UNIT_DIAG, FLA_MINUS_ONE, A11, WTL, FLA_Cntl_sub_trmm2( cntl ) ); FLA_Axpyt_internal( FLA_NO_TRANSPOSE, FLA_ONE, WTL, B1, FLA_Cntl_sub_axpyt( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, /**/ A01, A02, /* ************** */ /* ****************** */ A10, /**/ A11, A12, &ABL, /**/ &ABR, A20, /**/ A21, A22, FLA_BR ); FLA_Cont_with_1x3_to_1x2( &TL, /**/ &TR, T0, /**/ T1, T2, FLA_RIGHT ); FLA_Cont_with_3x1_to_2x1( &BT, B0, /* ** */ /* ** */ B1, &BB, B2, FLA_BOTTOM ); } return FLA_SUCCESS; 
}