FLA_Error LU_blk_var4( FLA_Obj A, int nb_alg ) { FLA_Obj ATL, ATR, A00, A01, A02, ABL, ABR, A10, A11, A12, A20, A21, A22; dim_t b; FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_TL ); while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) ){ b = min( FLA_Obj_length( ABR ), nb_alg ); FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &A01, &A02, /* ************* */ /* ******************** */ &A10, /**/ &A11, &A12, ABL, /**/ ABR, &A20, /**/ &A21, &A22, b, b, FLA_BR ); /*------------------------------------------------------------*/ /* A11 = A11 - A10 * A01 ); */ FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A10, A01, FLA_ONE, A11 ); /* A11 = LU( A11 ); */ LU_unb_var4( A11 ); /* A12 = A12 - A10 * A02; */ FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A10, A02, FLA_ONE, A12 ); /* A12 = inv( trilu( A11 ) ) * A12; */ FLA_Trsm( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_UNIT_DIAG, FLA_ONE, A11, A12 ); /* A21 = A21 - A20 * A01; */ FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, A01, FLA_ONE, A21 ); /* A21 = A21 * inv( triu( A11 ) ); */ FLA_Trsm( FLA_RIGHT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, A11, A21 ); /*------------------------------------------------------------*/ FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, A01, /**/ A02, A10, A11, /**/ A12, /* ************** */ /* ****************** */ &ABL, /**/ &ABR, A20, A21, /**/ A22, FLA_TL ); } return FLA_SUCCESS; }
int LU_blk_var3( FLA_Obj A, int nb_alg ) { FLA_Obj ATL, ATR, A00, A01, A02, ABL, ABR, A10, A11, A12, A20, A21, A22; int b; FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_TL ); while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) ){ b = min( FLA_Obj_length( ABR ), nb_alg ); FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &A01, &A02, /* ************* */ /* ******************** */ &A10, /**/ &A11, &A12, ABL, /**/ ABR, &A20, /**/ &A21, &A22, b, b, FLA_BR ); /*------------------------------------------------------------*/ // A01 := inv(L00) * A01 FLA_Trsm( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_UNIT_DIAG, FLA_ONE, A00, A01 ); // A11 := LU(A11 - A10 * A01) FLA_Gemm(FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A10, A01, FLA_ONE, A11); LU_unb_var3(A11); // A21 := (A21 - A20 * A01) * inv(U11) FLA_Gemm(FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, A01, FLA_ONE, A21); FLA_Trsm( FLA_RIGHT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, A11, A21); /*------------------------------------------------------------*/ FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, A01, /**/ A02, A10, A11, /**/ A12, /* ************** */ /* ****************** */ &ABL, /**/ &ABR, A20, A21, /**/ A22, FLA_TL ); } return FLA_SUCCESS; }
void libfla_test_eig_gest_experiment( test_params_t params, unsigned int var, char* sc_str, FLA_Datatype datatype, unsigned int p_cur, unsigned int pci, unsigned int n_repeats, signed int impl, double* perf, double* residual ) { dim_t b_flash = params.b_flash; dim_t b_alg_flat = params.b_alg_flat; double time_min = 1e9; double time; unsigned int i; unsigned int m; signed int m_input = -1; FLA_Uplo inv; FLA_Uplo uplo; FLA_Obj A, B, Y, norm; FLA_Obj A_save, B_save; FLA_Obj A_test, B_test, Y_test; // Determine the dimensions. if ( m_input < 0 ) m = p_cur / abs(m_input); else m = p_cur; // Translate parameter characters to libflame constants. FLA_Param_map_char_to_flame_inv( &pc_str[pci][0], &inv ); FLA_Param_map_char_to_flame_uplo( &pc_str[pci][1], &uplo ); if ( inv == FLA_NO_INVERSE && ( ( impl == FLA_TEST_FLAT_UNB_VAR && var == 3 ) || ( impl == FLA_TEST_FLAT_OPT_VAR && var == 3 ) || ( impl == FLA_TEST_FLAT_BLK_VAR && var == 3 ) ) ) { *perf = 0.0; *residual = 0.0; return; } // Create the matrices for the current operation. libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, m, &A ); libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, m, &Y ); libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], m, m, &B ); // Initialize the test matrices. FLA_Random_spd_matrix( uplo, A ); FLA_Scalr( uplo, FLA_TWO, A ); FLA_Hermitianize( uplo, A ); FLA_Random_spd_matrix( uplo, B ); FLA_Scalr( uplo, FLA_TWO, B ); FLA_Chol( uplo, B ); // Save the original object contents in a temporary object. FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_save ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, B, &B_save ); // Create a real scalar object to hold the norm of A. FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); // Use hierarchical matrices if we're testing the FLASH front-end. if ( impl == FLA_TEST_HIER_FRONT_END ) { FLASH_Obj_create_hier_copy_of_flat( A, 1, &b_flash, &A_test ); FLASH_Obj_create_hier_copy_of_flat( Y, 1, &b_flash, &Y_test ); FLASH_Obj_create_hier_copy_of_flat( B, 1, &b_flash, &B_test ); } else { A_test = A; Y_test = Y; B_test = B; } // Create a control tree for the individual variants. if ( impl == FLA_TEST_FLAT_UNB_VAR || impl == FLA_TEST_FLAT_OPT_VAR || impl == FLA_TEST_FLAT_BLK_VAR ) libfla_test_eig_gest_cntl_create( var, b_alg_flat ); // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { if ( impl == FLA_TEST_HIER_FRONT_END ) { FLASH_Obj_hierarchify( A_save, A_test ); FLASH_Obj_hierarchify( B_save, B_test ); } else { FLA_Copy_external( A_save, A_test ); FLA_Copy_external( B_save, B_test ); } time = FLA_Clock(); libfla_test_eig_gest_impl( impl, inv, uplo, A_test, Y_test, B_test ); time = FLA_Clock() - time; time_min = min( time_min, time ); } // Check our solution. if ( impl == FLA_TEST_HIER_FRONT_END ) { FLA_Trans trans_left, trans_right; FLASH_Hermitianize( uplo, A_test ); if ( ( inv == FLA_NO_INVERSE && uplo == FLA_LOWER_TRIANGULAR ) || ( inv == FLA_INVERSE && uplo == FLA_UPPER_TRIANGULAR ) ) { trans_left = FLA_CONJ_TRANSPOSE; trans_right = FLA_NO_TRANSPOSE; } else { trans_left = FLA_NO_TRANSPOSE; trans_right = FLA_CONJ_TRANSPOSE; } if ( inv == FLA_NO_INVERSE ) { FLASH_Trsm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG, FLA_ONE, B_test, A_test ); FLASH_Trsm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG, FLA_ONE, B_test, A_test ); } else // if ( inv == FLA_INVERSE ) { FLASH_Trmm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG, FLA_ONE, B_test, A_test ); FLASH_Trmm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG, FLA_ONE, B_test, A_test ); } FLASH_Obj_flatten( A_test, A ); } else { FLA_Trans trans_left, trans_right; FLA_Hermitianize( uplo, A_test ); if ( ( inv == FLA_NO_INVERSE && uplo == FLA_LOWER_TRIANGULAR ) || ( inv == FLA_INVERSE && uplo == FLA_UPPER_TRIANGULAR ) ) { trans_left = FLA_CONJ_TRANSPOSE; trans_right = FLA_NO_TRANSPOSE; } else { trans_left = FLA_NO_TRANSPOSE; trans_right = FLA_CONJ_TRANSPOSE; } if ( inv == FLA_NO_INVERSE ) { FLA_Trsm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG, FLA_ONE, B_test, A_test ); FLA_Trsm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG, FLA_ONE, B_test, A_test ); } else // if ( inv == FLA_INVERSE ) { FLA_Trmm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG, FLA_ONE, B_test, A_test ); FLA_Trmm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG, FLA_ONE, B_test, A_test ); } } // Free the hierarchical matrices if we're testing the FLASH front-end. if ( impl == FLA_TEST_HIER_FRONT_END ) { FLASH_Obj_free( &A_test ); FLASH_Obj_free( &Y_test ); FLASH_Obj_free( &B_test ); } // Free the control trees if we're testing the variants. if ( impl == FLA_TEST_FLAT_UNB_VAR || impl == FLA_TEST_FLAT_OPT_VAR || impl == FLA_TEST_FLAT_BLK_VAR ) libfla_test_eig_gest_cntl_free(); // Compute the performance of the best experiment repeat. *perf = 1.0 * m * m * m / time_min / FLOPS_PER_UNIT_PERF; if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0; // Compute the residual. FLA_Axpy_external( FLA_MINUS_ONE, A_save, A ); FLA_Norm1( A, norm ); FLA_Obj_extract_real_scalar( norm, residual ); // Free the supporting flat objects. FLA_Obj_free( &norm ); FLA_Obj_free( &A_save ); FLA_Obj_free( &B_save ); // Free the flat test matrices. FLA_Obj_free( &A ); FLA_Obj_free( &Y ); FLA_Obj_free( &B ); }