/*
 * FLASH_QR_UT_inc_opt1
 *
 * Front end for the incremental QR (UT) factorization of A that dispatches
 * to FLA_QR_UT_inc_blk_var2() with the flash_qrutinc_cntl control tree.
 *
 * A temporary object is built from A with FLASH_Obj_create_diag_panel(),
 * passed to the variant, and released again before returning.
 *
 * Returns the value produced by FLA_QR_UT_inc_blk_var2().
 */
FLA_Error FLASH_QR_UT_inc_opt1( FLA_Obj A, FLA_Obj TW )
{
  FLA_Obj   diag_panel;
  FLA_Error status;

  // Validate the arguments when the error level asks for at least minimal
  // checking.
  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
  {
    FLA_QR_UT_inc_check( A, TW );
  }

  // Temporary matrix that holds copies of the blocks along the diagonal
  // of A.
  FLASH_Obj_create_diag_panel( A, &diag_panel );

  // The factorization runs between the queue begin/end pair (the parallel
  // region).
  FLASH_Queue_begin();
  status = FLA_QR_UT_inc_blk_var2( A, TW, diag_panel, flash_qrutinc_cntl );
  FLASH_Queue_end();

  // The temporary is no longer needed once the region has ended.
  FLASH_Obj_free( &diag_panel );

  return status;
}
/*
 * FLASH_CAQR_UT_inc_noopt
 *
 * Incremental CAQR driver. Within a single queue begin/end pair it:
 *   1. runs incremental QR factorizations on the p partitions of A,
 *   2. copies the triangles of A into R, and
 *   3. runs FLA_CAQR_UT_inc_blk_var1() on R with flash_caqrutinc_cntl.
 *
 * The return values of the helper routines are not inspected; this
 * function always returns FLA_SUCCESS.
 */
FLA_Error FLASH_CAQR_UT_inc_noopt( dim_t p, FLA_Obj A, FLA_Obj ATW, FLA_Obj R, FLA_Obj RTW )
{
  dim_t blocks_per_part;

  // Validate the arguments when the error level asks for at least minimal
  // checking.
  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
  {
    FLA_CAQR_UT_inc_check( p, A, ATW, R, RTW );
  }

  // Partition length implied by the requested number of partitions.
  blocks_per_part = FLA_CAQR_UT_inc_compute_blocks_per_part( p, A );

  FLASH_Queue_begin();

  // Step 1: incremental QR on each of the p partitions.
  FLA_CAQR_UT_inc_factorize_panels( blocks_per_part, A, ATW );

  // Step 2: move the triangles of A into R.
  FLA_CAQR_UT_inc_copy_triangles( blocks_per_part, A, R );

  // Step 3: incremental CAQR on the upper triangular factors now in R.
  FLA_CAQR_UT_inc_blk_var1( R, RTW, flash_caqrutinc_cntl );

  FLASH_Queue_end();

  return FLA_SUCCESS;
}
/*
 * FLASH_LU_piv
 *
 * Front end for LU factorization with pivoting of A (pivots in p), using
 * FLA_LU_piv_internal() with the flash_lu_piv_cntl control tree.
 *
 * Only matrices whose FLASH_Obj_depth() is 1 are accepted; any other depth
 * prints a message and calls FLA_Abort().
 *
 * Returns FLA_SUCCESS, or -- when error checking is enabled -- the result
 * of FLASH_LU_find_zero_on_diagonal( A ). The value returned by
 * FLA_LU_piv_internal() is not used.
 */
FLA_Error FLASH_LU_piv( FLA_Obj A, FLA_Obj p )
{
  FLA_Error status = FLA_SUCCESS;

  // Validate the arguments when the error level asks for at least minimal
  // checking.
  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
  {
    FLA_LU_piv_check( A, p );
  }

  // *** The LU_piv algorithm currently implemented assumes a hierarchical
  // depth of 1. The restriction is enforced explicitly so that it is not
  // forgotten when a more general algorithm replaces this one. ***
  if ( FLASH_Obj_depth( A ) != 1 )
  {
    FLA_Print_message( "FLASH_LU_piv() currently only supports matrices of depth 1",
                       __FILE__, __LINE__ );
    FLA_Abort();
  }

  // Run FLA_LU_piv_internal() with the large control tree inside the
  // parallel region.
  FLASH_Queue_begin();
  FLA_LU_piv_internal( A, p, flash_lu_piv_cntl );
  FLASH_Queue_end();

  // Singularity check, performed only when error checking is enabled.
  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
  {
    status = FLASH_LU_find_zero_on_diagonal( A );
  }

  return status;
}
/*
 * FLASH_Apply_CAQ_UT_inc
 *
 * Applies the Q from an incremental CAQR factorization to B. Within one
 * queue begin/end pair it:
 *   1. applies the individual Q's from the per-partition incremental QR
 *      factorizations stored in A / ATW, and
 *   2. applies the Q from the factorization of the upper triangular R's
 *      (R / RTW) through FLA_Apply_CAQ_UT_inc_internal() with the
 *      flash_apcaqutinc_cntl control tree, using the top partition of W
 *      as workspace.
 *
 * Returns the value produced by FLA_Apply_CAQ_UT_inc_internal().
 */
FLA_Error FLASH_Apply_CAQ_UT_inc( dim_t p,
                                  FLA_Side side, FLA_Trans trans,
                                  FLA_Direct direct, FLA_Store storev,
                                  FLA_Obj A, FLA_Obj ATW,
                                  FLA_Obj R, FLA_Obj RTW,
                                  FLA_Obj W, FLA_Obj B )
{
  FLA_Error r_val;
  dim_t     nb_part;
  FLA_Obj   WT, WB;

  // Check parameters.
  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
    FLA_Apply_CAQ_UT_inc_check( side, trans, direct, storev, A, ATW, R, RTW, W, B );

  // Compute the partition length from the number of partitions. The length
  // is derived from A because nb_part is used below to partition the panels
  // of A, and because FLASH_CAQR_UT_inc_noopt() derives it from A when the
  // factorization is computed; both steps must agree on the partitioning.
  nb_part = FLA_CAQR_UT_inc_compute_blocks_per_part( p, A );

  // Begin a parallel region.
  FLASH_Queue_begin();

  // Apply the individual Q's from the incremental QR factorizations.
  FLA_Apply_CAQ_UT_inc_apply_panels( nb_part, A, ATW, W, B );

  // Split off the top partition of W (size argument 1); only WT is used
  // below, WB is left untouched.
  FLA_Part_2x1( W,   &WT,
                     &WB,    1, FLA_TOP );

  // Apply the Q from the factorization of the upper triangular R's.
  r_val = FLA_Apply_CAQ_UT_inc_internal( side, trans, direct, storev,
                                         R, RTW, WT, B, flash_apcaqutinc_cntl );

  // End the parallel region.
  FLASH_Queue_end();

  return r_val;
}
/*
 * FLASH_Her2k
 *
 * Front end for the Her2k operation on hierarchical objects. After the
 * optional parameter check, the work is handed to FLA_Her2k_internal()
 * with the flash_her2k_cntl_mm control tree between FLASH_Queue_begin()
 * and FLASH_Queue_end().
 *
 * Returns the value produced by FLA_Her2k_internal().
 */
FLA_Error FLASH_Her2k( FLA_Uplo uplo, FLA_Trans trans,
                       FLA_Obj alpha, FLA_Obj A, FLA_Obj B,
                       FLA_Obj beta, FLA_Obj C )
{
  FLA_Error status;

  // Validate the arguments when the error level asks for at least minimal
  // checking.
  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
  {
    FLA_Her2k_check( uplo, trans, alpha, A, B, beta, C );
  }

  // Tasks are enqueued through the SuperMatrix-aware control tree inside
  // the parallel region.
  FLASH_Queue_begin();
  status = FLA_Her2k_internal( uplo, trans, alpha, A, B, beta, C,
                               flash_her2k_cntl_mm );
  FLASH_Queue_end();

  return status;
}
/*
 * FLASH_Lyap
 *
 * Front end for the Lyap operation on hierarchical objects. After the
 * optional parameter check, the work is handed to FLA_Lyap_internal() with
 * the flash_lyap_cntl control tree between FLASH_Queue_begin() and
 * FLASH_Queue_end().
 *
 * Returns the value produced by FLA_Lyap_internal().
 */
FLA_Error FLASH_Lyap( FLA_Trans trans, FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj scale )
{
  FLA_Error status;

  // Validate the arguments when the error level asks for at least minimal
  // checking.
  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
  {
    FLA_Lyap_check( trans, isgn, A, C, scale );
  }

  // Tasks are enqueued through the SuperMatrix-aware control tree inside
  // the parallel region.
  FLASH_Queue_begin();
  status = FLA_Lyap_internal( trans, isgn, A, C, scale, flash_lyap_cntl );
  FLASH_Queue_end();

  return status;
}
/*
 * FLASH_Copy
 *
 * Front end for copying A into B on hierarchical objects. After the
 * optional parameter check, the work is handed to FLA_Copy_internal() with
 * the flash_copy_cntl control tree between FLASH_Queue_begin() and
 * FLASH_Queue_end().
 *
 * Returns the value produced by FLA_Copy_internal().
 */
FLA_Error FLASH_Copy( FLA_Obj A, FLA_Obj B )
{
  FLA_Error status;

  // Validate the arguments when the error level asks for at least minimal
  // checking.
  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
  {
    FLA_Copy_check( A, B );
  }

  // Execute the copy tasks inside the parallel region.
  FLASH_Queue_begin();
  status = FLA_Copy_internal( A, B, flash_copy_cntl );
  FLASH_Queue_end();

  return status;
}
/*
 * FLASH_Trsm
 *
 * Front end for the Trsm operation on hierarchical objects. After the
 * optional parameter check, the work is handed to FLA_Trsm_internal() with
 * the flash_trsm_cntl_mm control tree between FLASH_Queue_begin() and
 * FLASH_Queue_end().
 *
 * Returns the value produced by FLA_Trsm_internal().
 */
FLA_Error FLASH_Trsm( FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag,
                      FLA_Obj alpha, FLA_Obj A, FLA_Obj B )
{
  FLA_Error status;

  // Validate the arguments when the error level asks for at least minimal
  // checking.
  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
  {
    FLA_Trsm_check( side, uplo, trans, diag, alpha, A, B );
  }

  // Tasks are enqueued through the SuperMatrix-aware control tree inside
  // the parallel region.
  FLASH_Queue_begin();
  status = FLA_Trsm_internal( side, uplo, trans, diag, alpha, A, B,
                              flash_trsm_cntl_mm );
  FLASH_Queue_end();

  return status;
}
/*
 * FLASH_Chol
 *
 * Front end for the Cholesky factorization of A on hierarchical objects.
 * After the optional parameter check, the work is handed to
 * FLA_Chol_internal() with the flash_chol_cntl control tree between
 * FLASH_Queue_begin() and FLASH_Queue_end().
 *
 * Returns the value produced by FLA_Chol_internal().
 */
FLA_Error FLASH_Chol( FLA_Uplo uplo, FLA_Obj A )
{
  FLA_Error status;

  // Validate the arguments when the error level asks for at least minimal
  // checking.
  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
  {
    FLA_Chol_check( uplo, A );
  }

  // Tasks are enqueued through the SuperMatrix-aware control tree inside
  // the parallel region.
  FLASH_Queue_begin();
  status = FLA_Chol_internal( uplo, A, flash_chol_cntl );
  FLASH_Queue_end();

  return status;
}
/*
 * FLASH_Apply_Q2_UT
 *
 * Front end for the Apply_Q2_UT operation on hierarchical objects. After
 * the optional parameter check, the work is handed to
 * FLA_Apply_Q2_UT_internal() with the flash_apq2ut_cntl control tree
 * between FLASH_Queue_begin() and FLASH_Queue_end().
 *
 * Returns the value produced by FLA_Apply_Q2_UT_internal().
 */
FLA_Error FLASH_Apply_Q2_UT( FLA_Side side, FLA_Trans trans,
                             FLA_Direct direct, FLA_Store storev,
                             FLA_Obj D, FLA_Obj T, FLA_Obj W,
                             FLA_Obj C, FLA_Obj E )
{
  FLA_Error status;

  // Validate the arguments when the error level asks for at least minimal
  // checking.
  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
  {
    FLA_Apply_Q2_UT_check( side, trans, direct, storev, D, T, W, C, E );
  }

  // Run FLA_Apply_Q2_UT_internal() with the standard control tree inside
  // the parallel region.
  FLASH_Queue_begin();
  status = FLA_Apply_Q2_UT_internal( side, trans, direct, storev,
                                     D, T, W, C, E, flash_apq2ut_cntl );
  FLASH_Queue_end();

  return status;
}
/*
 * FLASH_Apply_Q_UT
 *
 * Front end for the traditional (non-incremental) Apply_Q_UT operation on
 * hierarchical objects.
 *
 * The blocksize is read from the top-left block of T. If that block's
 * scalar length and scalar width differ, a message is printed and
 * FLA_Abort() is called. When fla_apqut_cntl_leaf carries a blocksize
 * object, it is reset to this blocksize before the operation is run via
 * FLA_Apply_Q_UT_internal() with the flash_apqut_cntl_blas control tree.
 *
 * Returns the value produced by FLA_Apply_Q_UT_internal().
 */
FLA_Error FLASH_Apply_Q_UT( FLA_Side side, FLA_Trans trans,
                            FLA_Direct direct, FLA_Store storev,
                            FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B )
{
  FLA_Error status;
  dim_t     blocksize;

  // Validate the arguments when the error level asks for at least minimal
  // checking.
  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
  {
    FLA_Apply_Q_UT_check( side, trans, direct, storev, A, T, W, B );
  }

  // The length of the top-left block of T gives the blocksize used by the
  // QR/LQ factorization; it becomes the inner blocksize for Apply_Q_UT.
  blocksize = FLASH_Obj_scalar_length_tl( T );

  // The non-incremental algorithm-by-blocks needs the algorithmic
  // blocksize to match the storage blocksize.
  if ( FLASH_Obj_scalar_width_tl( T ) != blocksize )
  {
    FLA_Print_message( "FLASH_Apply_Q_UT() requires that b_alg == b_store",
                       __FILE__, __LINE__ );
    FLA_Abort();
  }

  // Propagate the blocksize to the control tree node used for the flat
  // subproblem, if that node has a blocksize object at all.
  if ( FLA_Cntl_blocksize( fla_apqut_cntl_leaf ) != NULL )
  {
    FLA_Blocksize_set( FLA_Cntl_blocksize( fla_apqut_cntl_leaf ),
                       blocksize, blocksize, blocksize, blocksize );
  }

  // Run FLA_Apply_Q_UT_internal() with the standard control tree inside
  // the parallel region.
  FLASH_Queue_begin();
  status = FLA_Apply_Q_UT_internal( side, trans, direct, storev,
                                    A, T, W, B, flash_apqut_cntl_blas );
  FLASH_Queue_end();

  return status;
}
/*
 * FLASH_LU_nopiv
 *
 * Front end for LU factorization without pivoting of A on hierarchical
 * objects, using FLA_LU_nopiv_internal() with the flash_lu_nopiv_cntl
 * control tree between FLASH_Queue_begin() and FLASH_Queue_end().
 *
 * Returns the value produced by FLA_LU_nopiv_internal() when error
 * checking is disabled; otherwise that value is replaced by the result of
 * FLASH_LU_find_zero_on_diagonal( A ).
 */
FLA_Error FLASH_LU_nopiv( FLA_Obj A )
{
  FLA_Error status;

  // Validate the argument when the error level asks for at least minimal
  // checking.
  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
  {
    FLA_LU_nopiv_check( A );
  }

  // Tasks are enqueued through the SuperMatrix-aware control tree inside
  // the parallel region.
  FLASH_Queue_begin();
  status = FLA_LU_nopiv_internal( A, flash_lu_nopiv_cntl );
  FLASH_Queue_end();

  // Singularity check; note that it overwrites the status obtained above.
  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
  {
    status = FLASH_LU_find_zero_on_diagonal( A );
  }

  return status;
}