void time_Tevd_v( int variant, int type, int n_repeats, int m, int k_accum, int b_alg, int n_iter_max, FLA_Obj A_orig, FLA_Obj d, FLA_Obj e, FLA_Obj G, FLA_Obj R, FLA_Obj W, FLA_Obj A, FLA_Obj l, double *dtime, double *diff1, double* diff2, double *gflops ) { int irep; double k, dtime_old = 1.0e9; FLA_Obj A_save, G_save, d_save, e_save; if ( //( variant == 0 ) || //( variant == 1 && type == FLA_ALG_UNB_OPT ) || //( variant == 2 && type == FLA_ALG_UNB_OPT ) || FALSE ) { *dtime = 0.0; *gflops = 0.0; *diff1 = 0.0; *diff2 = 0.0; return; } FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, G, &G_save ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, d, &d_save ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, e, &e_save ); FLA_Copy_external( A, A_save ); FLA_Copy_external( G, G_save ); FLA_Copy_external( d, d_save ); FLA_Copy_external( e, e_save ); for ( irep = 0 ; irep < n_repeats; irep++ ){ FLA_Copy_external( A_save, A ); FLA_Copy_external( G_save, G ); FLA_Copy_external( d_save, d ); FLA_Copy_external( e_save, e ); *dtime = FLA_Clock(); switch( variant ){ case 0: REF_Tevd_v( d, e, A ); break; // Time variant 1 case 1: { switch( type ){ case FLA_ALG_UNB_OPT: FLA_Tevd_v_opt_var1( n_iter_max, d, e, G, A, b_alg ); break; } break; } // Time variant 2 case 2: { switch( type ){ case FLA_ALG_UNB_OPT: FLA_Tevd_v_opt_var2( n_iter_max, d, e, G, R, W, A, b_alg ); break; } break; } } *dtime = FLA_Clock() - *dtime; dtime_old = min( *dtime, dtime_old ); } { FLA_Obj V, A_rev_evd, norm, eye; FLA_Copy( d, l ); //FLA_Obj_show( "A_save", A_save, "%9.2e + %9.2e ", "" ); //FLA_Obj_show( "A_evd", A, "%9.2e + %9.2e ", "" ); FLA_Sort_evd( FLA_FORWARD, l, A ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &V ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_rev_evd ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &eye ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, l, A ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, V, FLA_ZERO, A_rev_evd ); FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A_rev_evd ); /* FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, D, FLA_ZERO, A_rev_evd ); FLA_Copy( A_rev_evd, D ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, D, V, FLA_ZERO, A_rev_evd ); FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A_rev_evd ); */ //FLA_Obj_show( "A_rev_evd", A_rev_evd, "%9.2e + %9.2e ", "" ); FLA_Axpy( FLA_MINUS_ONE, A_orig, A_rev_evd ); FLA_Norm_frob( A_rev_evd, norm ); FLA_Obj_extract_real_scalar( norm, diff1 ); //*diff = FLA_Max_elemwise_diff( A_orig, A_rev_evd ); FLA_Set_to_identity( eye ); FLA_Copy( V, A_rev_evd ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, V, A_rev_evd, FLA_MINUS_ONE, eye ); FLA_Norm_frob( eye, norm ); FLA_Obj_extract_real_scalar( norm, diff2 ); /* FLA_Obj_free( &EL ); FLA_Obj_free( &EU ); FLA_Obj_free( &D ); FLA_Obj_free( &dc ); FLA_Obj_free( &ec ); */ FLA_Obj_free( &V ); FLA_Obj_free( &A_rev_evd ); FLA_Obj_free( &eye ); FLA_Obj_free( &norm ); } k = 2.00; if ( FLA_Obj_is_complex( A ) ) { *gflops = ( ( 4.5 * k * m * m ) + 2.0 * ( 3.0 * k * m * m * m ) ) / dtime_old / 1e9; } else { *gflops = ( ( 4.5 * k * m * m ) + 1.0 * ( 3.0 * k * m * m * m ) ) / dtime_old / 1e9; } *dtime = dtime_old; FLA_Copy_external( A_save, A ); FLA_Copy_external( G_save, G ); FLA_Copy_external( d_save, d ); FLA_Copy_external( e_save, e ); FLA_Obj_free( &A_save ); FLA_Obj_free( &G_save ); FLA_Obj_free( &d_save ); FLA_Obj_free( &e_save ); }
FLA_Error FLA_Svd_ext_u_unb_var1( FLA_Svd_type jobu, FLA_Svd_type jobv, dim_t n_iter_max, FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V, dim_t k_accum, dim_t b_alg ) { FLA_Error r_val = FLA_SUCCESS; FLA_Datatype dt; FLA_Datatype dt_real; FLA_Datatype dt_comp; FLA_Obj scale, T, S, rL, rR, d, e, G, H, C; // C is dummy. dim_t m_A, n_A, min_m_n; dim_t n_GH; double crossover_ratio = 17.0 / 9.0; FLA_Bool u_is_formed = FALSE, v_is_formed = FALSE; int apply_scale; n_GH = k_accum; m_A = FLA_Obj_length( A ); n_A = FLA_Obj_width( A ); min_m_n = min( m_A, n_A ); dt = FLA_Obj_datatype( A ); dt_real = FLA_Obj_datatype_proj_to_real( A ); dt_comp = FLA_Obj_datatype_proj_to_complex( A ); // Create matrices to hold block Householder transformations. FLA_Bidiag_UT_create_T( A, &T, &S ); // Create vectors to hold the realifying scalars. if ( FLA_Obj_is_complex( A ) ) { FLA_Obj_create( dt, min_m_n, 1, 0, 0, &rL ); FLA_Obj_create( dt, min_m_n, 1, 0, 0, &rR ); } // Create vectors to hold the diagonal and sub-diagonal. FLA_Obj_create( dt_real, min_m_n, 1, 0, 0, &d ); FLA_Obj_create( dt_real, min_m_n-1, 1, 0, 0, &e ); // Create matrices to hold the left and right Givens scalars. FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &G ); FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &H ); // Create a real scaling factor. FLA_Obj_create( dt_real, 1, 1, 0, 0, &scale ); // Scale matrix A if necessary. FLA_Max_abs_value( A, scale ); apply_scale = ( FLA_Obj_gt( scale, FLA_OVERFLOW_SQUARE_THRES ) == TRUE ) - ( FLA_Obj_lt( scale, FLA_UNDERFLOW_SQUARE_THRES ) == TRUE ); if ( apply_scale ) FLA_Scal( apply_scale > 0 ? FLA_SAFE_MIN : FLA_SAFE_INV_MIN, A ); if ( m_A < crossover_ratio * n_A ) { // Reduce the matrix to bidiagonal form. // Apply scalars to rotate elements on the superdiagonal to the real domain. // Extract the diagonal and superdiagonal from A. FLA_Bidiag_UT( A, T, S ); if ( FLA_Obj_is_complex( A ) ) FLA_Bidiag_UT_realify( A, rL, rR ); FLA_Bidiag_UT_extract_real_diagonals( A, d, e ); // Form U and V. if ( u_is_formed == FALSE ) { switch ( jobu ) { case FLA_SVD_VECTORS_MIN_OVERWRITE: if ( jobv != FLA_SVD_VECTORS_NONE ) FLA_Bidiag_UT_form_V_ext( FLA_UPPER_TRIANGULAR, A, S, FLA_NO_TRANSPOSE, V ); v_is_formed = TRUE; // For this case, V should be formed here. U = A; case FLA_SVD_VECTORS_ALL: case FLA_SVD_VECTORS_MIN_COPY: FLA_Bidiag_UT_form_U_ext( FLA_UPPER_TRIANGULAR, A, T, FLA_NO_TRANSPOSE, U ); u_is_formed = TRUE; break; case FLA_SVD_VECTORS_NONE: // Do nothing break; } } if ( v_is_formed == FALSE ) { if ( jobv == FLA_SVD_VECTORS_MIN_OVERWRITE ) { FLA_Bidiag_UT_form_V_ext( FLA_UPPER_TRIANGULAR, A, S, FLA_CONJ_TRANSPOSE, A ); v_is_formed = TRUE; /* and */ V = A; // This V is actually V^H. // V^H -> V FLA_Obj_flip_base( &V ); FLA_Obj_flip_view( &V ); if ( FLA_Obj_is_complex( A ) ) FLA_Conjugate( V ); } else if ( jobv != FLA_SVD_VECTORS_NONE ) { FLA_Bidiag_UT_form_V_ext( FLA_UPPER_TRIANGULAR, A, S, FLA_NO_TRANSPOSE, V ); v_is_formed = TRUE; } } // For complex matrices, apply realification transformation. if ( FLA_Obj_is_complex( A ) && jobu != FLA_SVD_VECTORS_NONE ) { FLA_Obj UL, UR; FLA_Part_1x2( U, &UL, &UR, min_m_n, FLA_LEFT ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, rL, UL ); } if ( FLA_Obj_is_complex( A ) && jobv != FLA_SVD_VECTORS_NONE ) { FLA_Obj VL, VR; FLA_Part_1x2( V, &VL, &VR, min_m_n, FLA_LEFT ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, VL ); } // Perform a singular value decomposition on the upper bidiagonal matrix. r_val = FLA_Bsvd_ext_opt_var1( n_iter_max, d, e, G, H, jobu, U, jobv, V, FALSE, C, // C is not referenced b_alg ); } else // if ( crossover_ratio * n_A <= m_A ) { FLA_Obj TQ, R; FLA_Obj AT, AB; // Perform a QR factorization on A. FLA_QR_UT_create_T( A, &TQ ); FLA_QR_UT( A, TQ ); // Set the lower triangle of R to zero and then copy the upper // triangle of A to R. FLA_Part_2x1( A, &AT, &AB, n_A, FLA_TOP ); FLA_Obj_create( dt, n_A, n_A, 0, 0, &R ); FLA_Setr( FLA_LOWER_TRIANGULAR, FLA_ZERO, R ); FLA_Copyr( FLA_UPPER_TRIANGULAR, AT, R ); // Form U; if necessary overwrite on A. if ( u_is_formed == FALSE ) { switch ( jobu ) { case FLA_SVD_VECTORS_MIN_OVERWRITE: U = A; case FLA_SVD_VECTORS_ALL: case FLA_SVD_VECTORS_MIN_COPY: FLA_QR_UT_form_Q( A, TQ, U ); u_is_formed = TRUE; break; case FLA_SVD_VECTORS_NONE: // Do nothing break; } } FLA_Obj_free( &TQ ); // Reduce the matrix to bidiagonal form. // Apply scalars to rotate elements on the superdiagonal to the real domain. // Extract the diagonal and superdiagonal from A. FLA_Bidiag_UT( R, T, S ); if ( FLA_Obj_is_complex( R ) ) FLA_Bidiag_UT_realify( R, rL, rR ); FLA_Bidiag_UT_extract_real_diagonals( R, d, e ); if ( v_is_formed == FALSE ) { if ( jobv == FLA_SVD_VECTORS_MIN_OVERWRITE ) { FLA_Bidiag_UT_form_V_ext( FLA_UPPER_TRIANGULAR, R, S, FLA_CONJ_TRANSPOSE, AT ); v_is_formed = TRUE; /* and */ V = AT; // This V is actually V^H. // V^H -> V FLA_Obj_flip_base( &V ); FLA_Obj_flip_view( &V ); if ( FLA_Obj_is_complex( A ) ) FLA_Conjugate( V ); } else if ( jobv != FLA_SVD_VECTORS_NONE ) { FLA_Bidiag_UT_form_V_ext( FLA_UPPER_TRIANGULAR, R, S, FLA_NO_TRANSPOSE, V ); v_is_formed = TRUE; } } // Apply householder vectors U in R. FLA_Bidiag_UT_form_U_ext( FLA_UPPER_TRIANGULAR, R, T, FLA_NO_TRANSPOSE, R ); // Apply the realifying scalars in rL and rR to U and V, respectively. if ( FLA_Obj_is_complex( A ) && jobu != FLA_SVD_VECTORS_NONE ) { FLA_Obj RL, RR; FLA_Part_1x2( R, &RL, &RR, min_m_n, FLA_LEFT ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, rL, RL ); } if ( FLA_Obj_is_complex( A ) && jobv != FLA_SVD_VECTORS_NONE ) { FLA_Obj VL, VR; FLA_Part_1x2( V, &VL, &VR, min_m_n, FLA_LEFT ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, VL ); } // Perform a singular value decomposition on the bidiagonal matrix. r_val = FLA_Bsvd_ext_opt_var1( n_iter_max, d, e, G, H, jobu, R, jobv, V, FALSE, C, b_alg ); // Multiply R into U, storing the result in A and then copying back // to U. if ( jobu != FLA_SVD_VECTORS_NONE ) { FLA_Obj UL, UR; FLA_Part_1x2( U, &UL, &UR, min_m_n, FLA_LEFT ); if ( jobu == FLA_SVD_VECTORS_MIN_OVERWRITE || jobv == FLA_SVD_VECTORS_MIN_OVERWRITE ) { FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, UL, &C ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, UL, R, FLA_ZERO, C ); FLA_Copy( C, UL ); FLA_Obj_free( &C ); } else { FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, UL, R, FLA_ZERO, A ); FLA_Copy( A, UL ); } } FLA_Obj_free( &R ); } // Copy the converged eigenvalues to the output vector. FLA_Copy( d, s ); // No sort is required as it is applied on FLA_Bsvd. if ( apply_scale ) FLA_Scal( apply_scale < 0 ? FLA_SAFE_MIN : FLA_SAFE_INV_MIN, s ); // When V is overwritten, flip it again. if ( jobv == FLA_SVD_VECTORS_MIN_OVERWRITE ) { // Always apply conjugation first wrt dimensions used; then, flip base. if ( FLA_Obj_is_complex( V ) ) FLA_Conjugate( V ); FLA_Obj_flip_base( &V ); } FLA_Obj_free( &scale ); FLA_Obj_free( &T ); FLA_Obj_free( &S ); if ( FLA_Obj_is_complex( A ) ) { FLA_Obj_free( &rL ); FLA_Obj_free( &rR ); } FLA_Obj_free( &d ); FLA_Obj_free( &e ); FLA_Obj_free( &G ); FLA_Obj_free( &H ); return r_val; }
int main(int argc, char *argv[]) { int m, n, k, nfirst, nlast, ninc, i, irep, nrepeats, nb_alg, check;; double dtime, dtime_best, gflops, max_gflops, diff, d_n; FLA_Obj A, B, C, Cref, Cold; /* Initialize FLAME */ FLA_Init( ); /* Every time trial is repeated "repeat" times */ printf( "%% number of repeats:" ); scanf( "%d", &nrepeats ); printf( "%% %d\n", nrepeats ); /* Enter the max GFLOPS attainable */ printf( "%% enter max GFLOPS:" ); scanf( "%lf", &max_gflops ); printf( "%% %lf\n", max_gflops ); /* Enter the algorithmic block size */ printf( "%% enter nb_alg:" ); scanf( "%d", &nb_alg ); printf( "%% %d\n", nb_alg ); /* Timing trials for matrix sizes n=nfirst to nlast in increments of ninc will be performed */ printf( "%% enter nfirst, nlast, ninc:" ); scanf( "%d%d%d", &nfirst, &nlast, &ninc ); printf( "%% %d %d %d\n", nfirst, nlast, ninc ); i = 1; for ( n=nfirst; n<= nlast; n+=ninc ){ /* Allocate space for the matrices */ FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &A ); FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &B ); FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &C ); FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &Cref ); FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &Cold ); /* Generate random matrices L and B */ FLA_Random_matrix( A ); FLA_Random_matrix( B ); FLA_Random_matrix( Cold ); gflops = 2.0 * n * n * n * 1.0e-09; /* Time FLA_Symm */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, Cref ); dtime = FLA_Clock(); FLA_Symm( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_ONE, A, B, FLA_ONE, Cref ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } printf( "data_FLAME( %d, 1:2 ) = [ %d %le ];\n", i, n, gflops / dtime_best ); fflush( stdout ); /* Time the your implementations */ #if TEST_UNB_VAR1==TRUE /* Variant 1 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_unb_var1( A, B, C ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_unb_var1( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_BLK_VAR1==TRUE /* Variant 1 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_blk_var1( A, B, C, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_blk_var1( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_UNB_VAR2==TRUE /* Variant 2 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_unb_var2( A, B, C ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_unb_var2( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_BLK_VAR2==TRUE /* Variant 2 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_blk_var2( A, B, C, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_blk_var2( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_UNB_VAR3==TRUE /* Variant 3 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_unb_var3( A, B, C ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_unb_var3( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_BLK_VAR3==TRUE /* Variant 3 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_blk_var3( A, B, C, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_blk_var3( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_UNB_VAR4==TRUE /* Variant 4 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_unb_var4( A, B, C ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_unb_var4( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_BLK_VAR4==TRUE /* Variant 4 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_blk_var4( A, B, C, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_blk_var4( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_UNB_VAR5==TRUE /* Variant 5 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_unb_var5( A, B, C ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_unb_var5( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_BLK_VAR5==TRUE /* Variant 5 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_blk_var5( A, B, C, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_blk_var5( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_UNB_VAR6==TRUE /* Variant 6 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_unb_var6( A, B, C ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_unb_var6( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_BLK_VAR6==TRUE /* Variant 6 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_blk_var6( A, B, C, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_blk_var6( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_UNB_VAR7==TRUE /* Variant 7 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_unb_var7( A, B, C ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_unb_var7( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_BLK_VAR7==TRUE /* Variant 4 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_blk_var7( A, B, C, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_blk_var7( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_UNB_VAR8==TRUE /* Variant 8 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_unb_var8( A, B, C ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_unb_var8( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif #if TEST_BLK_VAR8==TRUE /* Variant 4 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Cold, C ); dtime = FLA_Clock(); Symm_blk_var8( A, B, C, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( C, Cref ); printf( "data_blk_var8( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); #endif FLA_Obj_free( &A ); FLA_Obj_free( &B ); FLA_Obj_free( &C ); FLA_Obj_free( &Cref ); FLA_Obj_free( &Cold ); printf( "\n" ); i++; } /* Print the MATLAB commands to plot the data */ /* Delete all existing figures */ printf( "close all\n" ); /* Plot the performance of FLAME */ printf( "plot( data_FLAME( :,1 ), data_FLAME( :, 2 ), 'k--' ); \n" ); /* Indicate that you want to add to the existing plot */ printf( "hold on\n" ); /* Plot the performance of the reference implementation */ // printf( "plot( data_REF( :,1 ), data_REF( :, 2 ), 'k-' ); \n" ); /* Plot the performance of your implementations */ #if TEST_UNB_VAR1==TRUE printf( "plot( data_unb_var1( :,1 ), data_unb_var1( :, 2 ), 'r-.' ); \n" ); #endif #if TEST_UNB_VAR2==TRUE printf( "plot( data_unb_var2( :,1 ), data_unb_var2( :, 2 ), 'g-.' ); \n" ); #endif #if TEST_UNB_VAR3==TRUE printf( "plot( data_unb_var3( :,1 ), data_unb_var3( :, 2 ), 'b-.' ); \n" ); #endif #if TEST_UNB_VAR4==TRUE printf( "plot( data_unb_var4( :,1 ), data_unb_var4( :, 2 ), 'm-.' ); \n" ); #endif #if TEST_UNB_VAR5==TRUE printf( "plot( data_unb_var5( :,1 ), data_unb_var5( :, 2 ), 'c-.' ); \n" ); #endif #if TEST_UNB_VAR6==TRUE printf( "plot( data_unb_var6( :,1 ), data_unb_var6( :, 2 ), 'y-.' ); \n" ); #endif #if TEST_UNB_VAR7==TRUE printf( "plot( data_unb_var7( :,1 ), data_unb_var7( :, 2 ), 'k-.' ); \n" ); #endif #if TEST_UNB_VAR8==TRUE printf( "plot( data_unb_var8( :,1 ), data_unb_var8( :, 2 ), 'm:' ); \n" ); #endif #if TEST_BLK_VAR1==TRUE printf( "plot( data_blk_var1( :,1 ), data_blk_var1( :, 2 ), 'r--' ); \n" ); #endif #if TEST_BLK_VAR2==TRUE printf( "plot( data_blk_var2( :,1 ), data_blk_var2( :, 2 ), 'g--' ); \n" ); #endif #if TEST_BLK_VAR3==TRUE printf( "plot( data_blk_var3( :,1 ), data_blk_var3( :, 2 ), 'b--' ); \n" ); #endif #if TEST_BLK_VAR4==TRUE printf( "plot( data_blk_var4( :,1 ), data_blk_var4( :, 2 ), 'm--' ); \n" ); #endif #if TEST_BLK_VAR5==TRUE printf( "plot( data_blk_var5( :,1 ), data_blk_var5( :, 2 ), 'c--' ); \n" ); #endif #if TEST_BLK_VAR6==TRUE printf( "plot( data_blk_var6( :,1 ), data_blk_var6( :, 2 ), 'y--' ); \n" ); #endif #if TEST_BLK_VAR7==TRUE printf( "plot( data_blk_var7( :,1 ), data_blk_var7( :, 2 ), 'k--' ); \n" ); #endif #if TEST_BLK_VAR8==TRUE printf( "plot( data_blk_var8( :,1 ), data_blk_var8( :, 2 ), 'm-' ); \n" ); #endif printf( "hold on \n"); printf( "xlabel( 'matrix dimension m=n' );\n"); printf( "ylabel( 'GFLOPS/sec.' );\n"); // printf( "axis( [ 0 %d 0 %3.1f ] ); \n", nlast, max_gflops ); printf( "legend( 'FLA Trsm', ...\n"); #if TEST_UNB_VAR1==TRUE printf( " 'unb var1', ...\n"); #endif #if TEST_UNB_VAR2==TRUE printf( " 'unb var2', ...\n"); #endif #if TEST_UNB_VAR3==TRUE printf( " 'unb var3', ...\n"); #endif #if TEST_UNB_VAR4==TRUE printf( " 'unb var4', ...\n"); #endif #if TEST_UNB_VAR5==TRUE printf( " 'unb var5', ...\n"); #endif #if TEST_UNB_VAR6==TRUE printf( " 'unb var6', ...\n"); #endif #if TEST_UNB_VAR7==TRUE printf( " 'unb var7', ...\n"); #endif #if TEST_UNB_VAR8==TRUE printf( " 'unb var8', ...\n"); #endif #if TEST_BLK_VAR1==TRUE printf( " 'blk var1', ...\n"); #endif #if TEST_BLK_VAR2==TRUE printf( " 'blk var2', ...\n"); #endif #if TEST_BLK_VAR3==TRUE printf( " 'blk var3', ...\n"); #endif #if TEST_BLK_VAR4==TRUE printf( " 'blk var4', ...\n"); #endif #if TEST_BLK_VAR5==TRUE printf( " 'blk var5', ...\n"); #endif #if TEST_BLK_VAR6==TRUE printf( " 'blk var6', ...\n"); #endif #if TEST_BLK_VAR7==TRUE printf( " 'blk var7', ...\n"); #endif #if TEST_BLK_VAR8==TRUE printf( " 'blk var8', ...\n"); #endif printf( " 2 );\n"); FLA_Finalize( ); }
FLA_Error FLA_Svd_uv_unb_var1( dim_t n_iter_max, FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V, dim_t k_accum, dim_t b_alg ) { FLA_Error r_val = FLA_SUCCESS; FLA_Datatype dt; FLA_Datatype dt_real; FLA_Datatype dt_comp; FLA_Obj scale, T, S, rL, rR, d, e, G, H; dim_t m_A, n_A; dim_t min_m_n; dim_t n_GH; double crossover_ratio = 17.0 / 9.0; n_GH = k_accum; m_A = FLA_Obj_length( A ); n_A = FLA_Obj_width( A ); min_m_n = FLA_Obj_min_dim( A ); dt = FLA_Obj_datatype( A ); dt_real = FLA_Obj_datatype_proj_to_real( A ); dt_comp = FLA_Obj_datatype_proj_to_complex( A ); // Create matrices to hold block Householder transformations. FLA_Bidiag_UT_create_T( A, &T, &S ); // Create vectors to hold the realifying scalars. FLA_Obj_create( dt, min_m_n, 1, 0, 0, &rL ); FLA_Obj_create( dt, min_m_n, 1, 0, 0, &rR ); // Create vectors to hold the diagonal and sub-diagonal. FLA_Obj_create( dt_real, min_m_n, 1, 0, 0, &d ); FLA_Obj_create( dt_real, min_m_n-1, 1, 0, 0, &e ); // Create matrices to hold the left and right Givens scalars. FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &G ); FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &H ); // Create a real scaling factor. FLA_Obj_create( dt_real, 1, 1, 0, 0, &scale ); // Compute a scaling factor; If none is needed, sigma will be set to one. FLA_Svd_compute_scaling( A, scale ); // Scale the matrix if scale is non-unit. if ( !FLA_Obj_equals( scale, FLA_ONE ) ) FLA_Scal( scale, A ); if ( m_A < crossover_ratio * n_A ) { // Reduce the matrix to bidiagonal form. // Apply scalars to rotate elements on the superdiagonal to the real domain. // Extract the diagonal and superdiagonal from A. FLA_Bidiag_UT( A, T, S ); FLA_Bidiag_UT_realify( A, rL, rR ); FLA_Bidiag_UT_extract_real_diagonals( A, d, e ); // Form U and V. FLA_Bidiag_UT_form_U( A, T, U ); FLA_Bidiag_UT_form_V( A, S, V ); // Apply the realifying scalars in rL and rR to U and V, respectively. { FLA_Obj UL, UR; FLA_Obj VL, VR; FLA_Part_1x2( U, &UL, &UR, min_m_n, FLA_LEFT ); FLA_Part_1x2( V, &VL, &VR, min_m_n, FLA_LEFT ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, rL, UL ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, VL ); } // Perform a singular value decomposition on the bidiagonal matrix. r_val = FLA_Bsvd_v_opt_var1( n_iter_max, d, e, G, H, U, V, b_alg ); } else // if ( crossover_ratio * n_A <= m_A ) { FLA_Obj TQ, R; FLA_Obj AT, AB; FLA_Obj UL, UR; // Perform a QR factorization on A and form Q in U. FLA_QR_UT_create_T( A, &TQ ); FLA_QR_UT( A, TQ ); FLA_QR_UT_form_Q( A, TQ, U ); FLA_Obj_free( &TQ ); // Set the lower triangle of R to zero and then copy the upper // triangle of A to R. FLA_Part_2x1( A, &AT, &AB, n_A, FLA_TOP ); FLA_Obj_create( dt, n_A, n_A, 0, 0, &R ); FLA_Setr( FLA_LOWER_TRIANGULAR, FLA_ZERO, R ); FLA_Copyr( FLA_UPPER_TRIANGULAR, AT, R ); // Reduce the matrix to bidiagonal form. // Apply scalars to rotate elements on the superdiagonal to the real domain. // Extract the diagonal and superdiagonal from A. FLA_Bidiag_UT( R, T, S ); FLA_Bidiag_UT_realify( R, rL, rR ); FLA_Bidiag_UT_extract_real_diagonals( R, d, e ); // Form V from right Householder vectors in upper triangle of R. FLA_Bidiag_UT_form_V( R, S, V ); // Form U in R. FLA_Bidiag_UT_form_U( R, T, R ); // Apply the realifying scalars in rL and rR to U and V, respectively. FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, rL, R ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, V ); // Perform a singular value decomposition on the bidiagonal matrix. r_val = FLA_Bsvd_v_opt_var1( n_iter_max, d, e, G, H, R, V, b_alg ); // Multiply R into U, storing the result in A and then copying back // to U. FLA_Part_1x2( U, &UL, &UR, n_A, FLA_LEFT ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, UL, R, FLA_ZERO, A ); FLA_Copy( A, UL ); FLA_Obj_free( &R ); } // Copy the converged eigenvalues to the output vector. FLA_Copy( d, s ); // Sort the singular values and singular vectors in descending order. FLA_Sort_svd( FLA_BACKWARD, s, U, V ); // If the matrix was scaled, rescale the singular values. if ( !FLA_Obj_equals( scale, FLA_ONE ) ) FLA_Inv_scal( scale, s ); FLA_Obj_free( &scale ); FLA_Obj_free( &T ); FLA_Obj_free( &S ); FLA_Obj_free( &rL ); FLA_Obj_free( &rR ); FLA_Obj_free( &d ); FLA_Obj_free( &e ); FLA_Obj_free( &G ); FLA_Obj_free( &H ); return r_val; }
FLA_Error FLA_Sort( FLA_Direct direct, FLA_Obj x ) { FLA_Datatype datatype; FLA_Obj x_use; dim_t m_x; dim_t inc_x; if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING ) FLA_Sort_check( direct, x ); datatype = FLA_Obj_datatype( x ); m_x = FLA_Obj_vector_dim( x ); inc_x = FLA_Obj_vector_inc( x ); // If the vector does not have unit stride, copy it to a temporary vector // that does have unit stride. if ( inc_x != 1 ) { FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, x, &x_use ); inc_x = FLA_Obj_vector_inc( x_use ); } else { x_use = x; } switch ( datatype ) { case FLA_FLOAT: { float* x_p = ( float* ) FLA_FLOAT_PTR( x_use ); if ( direct == FLA_FORWARD ) FLA_Sort_f_ops( m_x, x_p, inc_x ); else // if ( direct == FLA_BACKWARD ) FLA_Sort_b_ops( m_x, x_p, inc_x ); break; } case FLA_DOUBLE: { double* x_p = ( double* ) FLA_DOUBLE_PTR( x_use ); if ( direct == FLA_FORWARD ) FLA_Sort_f_opd( m_x, x_p, inc_x ); else // if ( direct == FLA_BACKWARD ) FLA_Sort_b_opd( m_x, x_p, inc_x ); break; } } if ( inc_x != 1 ) { FLA_Copy( x_use, x ); FLA_Obj_free( &x_use ); } return FLA_SUCCESS; }
void time_Apply_G_rf( int variant, int type, int n_repeats, int m, int k, int n, int b_alg, FLA_Obj A, FLA_Obj A_ref, FLA_Obj G, FLA_Obj P, double *dtime, double *diff, double *gflops ) { int irep; double dtime_old = 1.0e9; FLA_Obj A_save, G_save, norm; if ( FLA_Obj_is_real( A ) ) { if ( //( variant == 1 && type == FLA_ALG_UNB_OPT ) || //( variant == 1 && type == FLA_ALG_UNB_ASM ) || //( variant == 1 && type == FLA_ALG_BLOCKED ) || //( variant == 2 && type == FLA_ALG_UNB_OPT ) || //( variant == 2 && type == FLA_ALG_UNB_ASM ) || //( variant == 2 && type == FLA_ALG_BLOCKED ) || //( variant == 3 && type == FLA_ALG_UNB_OPT ) || //( variant == 3 && type == FLA_ALG_UNB_ASM ) || //( variant == 3 && type == FLA_ALG_BLOCKED ) || //( variant == 6 && type == FLA_ALG_UNB_OPT ) || //( variant == 6 && type == FLA_ALG_UNB_ASM ) || //( variant == 6 && type == FLA_ALG_BLOCKED ) || //( variant == 9 && type == FLA_ALG_UNB_OPT ) || //( variant == 9 && type == FLA_ALG_UNB_ASM ) || //( variant == 9 && type == FLA_ALG_BLOCKED ) || ( variant == 4 ) || ( variant == 5 ) || ( variant == 7 ) || ( variant == 8 ) || FALSE ) { *gflops = 0.0; *diff = 0.0; return; } } else if ( FLA_Obj_is_complex( A ) ) { if ( //( variant == 1 && type == FLA_ALG_UNB_OPT ) || //( variant == 1 && type == FLA_ALG_UNB_ASM ) || //( variant == 1 && type == FLA_ALG_BLOCKED ) || //( variant == 2 && type == FLA_ALG_UNB_OPT ) || //( variant == 2 && type == FLA_ALG_UNB_ASM ) || //( variant == 2 && type == FLA_ALG_BLOCKED ) || //( variant == 3 && type == FLA_ALG_UNB_OPT ) || //( variant == 3 && type == FLA_ALG_UNB_ASM ) || //( variant == 3 && type == FLA_ALG_BLOCKED ) || //( variant == 6 && type == FLA_ALG_UNB_OPT ) || //( variant == 6 && type == FLA_ALG_UNB_ASM ) || //( variant == 6 && type == FLA_ALG_BLOCKED ) || //( variant == 9 && type == FLA_ALG_UNB_OPT ) || //( variant == 9 && type == FLA_ALG_UNB_ASM ) || //( variant == 9 && type == FLA_ALG_BLOCKED ) || ( variant == 4 ) || ( variant == 5 ) || ( variant == 7 ) || ( variant == 8 ) || FALSE ) { *gflops = 0.0; *diff = 0.0; return; } } FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, G, &G_save ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); //dim_t b_flash_m = b_alg; //dim_t b_flash_n = n; //FLASH_Obj_create_hier_copy_of_flat_ext( A, 1, &b_flash_m, &b_flash_n, &AH ); //printf ( "flash dims: %d x %d\n", FLA_Obj_length( AH ), FLA_Obj_width( AH ) ); FLA_Copy_external( A, A_save ); FLA_Copy_external( G, G_save ); for ( irep = 0 ; irep < n_repeats; irep++ ){ FLA_Copy_external( A_save, A ); FLA_Copy_external( G_save, G ); //FLASH_Obj_hierarchify( A_save, AH ); *dtime = FLA_Clock(); switch( variant ){ case 0: break; // Time variant 1 case 1: { switch( type ){ case FLA_ALG_UNB_OPT: FLA_Apply_G_rf_opt_var1( G, A ); break; case FLA_ALG_UNB_ASM: FLA_Apply_G_rf_asm_var1( G, A ); break; case FLA_ALG_BLOCKED: FLA_Apply_G_rf_blk_var1( G, A, b_alg ); break; } break; } // Time variant 2 case 2: { switch( type ){ case FLA_ALG_UNB_OPT: FLA_Apply_G_rf_opt_var2( G, A ); break; case FLA_ALG_UNB_ASM: FLA_Apply_G_rf_asm_var2( G, A ); break; case FLA_ALG_BLOCKED: FLA_Apply_G_rf_blk_var2( G, A, b_alg ); break; } break; } // Time variant 3 case 3: { switch( type ){ case FLA_ALG_UNB_OPT: FLA_Apply_G_rf_opt_var3( G, A ); break; case FLA_ALG_UNB_ASM: FLA_Apply_G_rf_asm_var3( G, A ); break; case FLA_ALG_BLOCKED: FLA_Apply_G_rf_blk_var3( G, A, b_alg ); break; } break; } // Time variant 6 case 6: { switch( type ){ case FLA_ALG_UNB_OPT: FLA_Apply_G_rf_opt_var6( G, A ); break; case FLA_ALG_UNB_ASM: FLA_Apply_G_rf_asm_var6( G, A ); break; case FLA_ALG_BLOCKED: FLA_Apply_G_rf_blk_var6( G, A, b_alg ); break; } break; } // Time variant 9 case 9: { switch( type ){ case FLA_ALG_UNB_OPT: FLA_Apply_G_rf_opt_var9( G, A ); break; case FLA_ALG_UNB_ASM: FLA_Apply_G_rf_asm_var9( G, A ); break; case FLA_ALG_BLOCKED: FLA_Apply_G_rf_blk_var9( G, A, b_alg ); break; } break; } } *dtime = FLA_Clock() - *dtime; dtime_old = min( *dtime, dtime_old ); } if ( variant == 1 && type == FLA_ALG_UNB_OPT ) { //FLA_Obj_show( "A_ref", A, "%9.2e + %9.2e ", "" ); //FLA_Obj_show( "A", A, "%9.2e ", "" ); FLA_Copy( A, A_ref ); *diff = 0.0; } else { //FLA_Obj_show( "A", A, "%9.2e + %9.2e ", "" ); //if ( variant == 7 && type == FLA_ALG_UNB_ASM ) //FLA_Obj_show( "A", A, "%9.2e", "" ); //if ( variant == 9 ) FLASH_Obj_flatten( AH, A ); FLA_Axpy( FLA_MINUS_ONE, A_ref, A ); FLA_Norm_frob( A, norm ); FLA_Obj_extract_real_scalar( norm, diff ); //*diff = FLA_Max_elemwise_diff( A_ref, A ); } *gflops = 6.0 * k * m * ( n - 1 ) / dtime_old / 1e9; if ( FLA_Obj_is_complex( A ) ) *gflops *= 2.0; *dtime = dtime_old; FLA_Copy_external( A_save, A ); FLA_Copy_external( G_save, G ); //FLASH_Obj_free( &AH ); FLA_Obj_free( &A_save ); FLA_Obj_free( &G_save ); FLA_Obj_free( &norm ); }
FLA_Error FLA_Svd_uv_var2_components( dim_t n_iter_max, dim_t k_accum, dim_t b_alg, FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V, double* dtime_bred, double* dtime_bsvd, double* dtime_appq, double* dtime_qrfa, double* dtime_gemm ) { FLA_Error r_val = FLA_SUCCESS; FLA_Datatype dt; FLA_Datatype dt_real; FLA_Datatype dt_comp; FLA_Obj T, S, rL, rR, d, e, G, H, RG, RH, W; dim_t m_A, n_A; dim_t min_m_n; dim_t n_GH; double crossover_ratio = 17.0 / 9.0; double dtime_temp; n_GH = k_accum; m_A = FLA_Obj_length( A ); n_A = FLA_Obj_width( A ); min_m_n = FLA_Obj_min_dim( A ); dt = FLA_Obj_datatype( A ); dt_real = FLA_Obj_datatype_proj_to_real( A ); dt_comp = FLA_Obj_datatype_proj_to_complex( A ); // If the matrix is a scalar, then the SVD is easy. if ( min_m_n == 1 ) { FLA_Copy( A, s ); FLA_Set_to_identity( U ); FLA_Set_to_identity( V ); return FLA_SUCCESS; } // Create matrices to hold block Householder transformations. FLA_Bidiag_UT_create_T( A, &T, &S ); // Create vectors to hold the realifying scalars. FLA_Obj_create( dt, min_m_n, 1, 0, 0, &rL ); FLA_Obj_create( dt, min_m_n, 1, 0, 0, &rR ); // Create vectors to hold the diagonal and sub-diagonal. FLA_Obj_create( dt_real, min_m_n, 1, 0, 0, &d ); FLA_Obj_create( dt_real, min_m_n-1, 1, 0, 0, &e ); // Create matrices to hold the left and right Givens scalars. FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &G ); FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &H ); // Create matrices to hold the left and right Givens matrices. FLA_Obj_create( dt_real, min_m_n, min_m_n, 0, 0, &RG ); FLA_Obj_create( dt_real, min_m_n, min_m_n, 0, 0, &RH ); FLA_Obj_create( dt, m_A, n_A, 0, 0, &W ); if ( m_A >= n_A ) { if ( m_A < crossover_ratio * n_A ) { dtime_temp = FLA_Clock(); { // Reduce the matrix to bidiagonal form. // Apply scalars to rotate elements on the sub-diagonal to the real domain. // Extract the diagonal and sub-diagonal from A. FLA_Bidiag_UT( A, T, S ); FLA_Bidiag_UT_realify( A, rL, rR ); FLA_Bidiag_UT_extract_diagonals( A, d, e ); } *dtime_bred = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Form U and V. FLA_Bidiag_UT_form_U( A, T, U ); FLA_Bidiag_UT_form_V( A, S, V ); } *dtime_appq = FLA_Clock() - dtime_temp; // Apply the realifying scalars in rL and rR to U and V, respectively. { FLA_Obj UL, UR; FLA_Obj VL, VR; FLA_Part_1x2( U, &UL, &UR, min_m_n, FLA_LEFT ); FLA_Part_1x2( V, &VL, &VR, min_m_n, FLA_LEFT ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, rL, UL ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, VL ); } dtime_temp = FLA_Clock(); { // Perform a singular value decomposition on the bidiagonal matrix. r_val = FLA_Bsvd_v_opt_var2( n_iter_max, d, e, G, H, RG, RH, W, U, V, b_alg ); } *dtime_bsvd = FLA_Clock() - dtime_temp; } else // if ( crossover_ratio * n_A <= m_A ) { FLA_Obj TQ, R; FLA_Obj AT, AB; FLA_Obj UL, UR; //FLA_QR_UT_create_T( A, &TQ ); FLA_Obj_create( dt, 32, n_A, 0, 0, &TQ ); dtime_temp = FLA_Clock(); { // Perform a QR factorization on A and form Q in U. FLA_QR_UT( A, TQ ); } *dtime_qrfa = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { FLA_QR_UT_form_Q( A, TQ, U ); } *dtime_appq = FLA_Clock() - dtime_temp; FLA_Obj_free( &TQ ); // Set the lower triangle of R to zero and then copy the upper // triangle of A to R. FLA_Part_2x1( A, &AT, &AB, n_A, FLA_TOP ); FLA_Obj_create( dt, n_A, n_A, 0, 0, &R ); FLA_Setr( FLA_LOWER_TRIANGULAR, FLA_ZERO, R ); FLA_Copyr( FLA_UPPER_TRIANGULAR, AT, R ); dtime_temp = FLA_Clock(); { // Reduce the matrix to bidiagonal form. // Apply scalars to rotate elements on the superdiagonal to the real domain. // Extract the diagonal and superdiagonal from A. FLA_Bidiag_UT( R, T, S ); FLA_Bidiag_UT_realify( R, rL, rR ); FLA_Bidiag_UT_extract_diagonals( R, d, e ); } *dtime_bred = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Form V from right Householder vectors in upper triangle of R. FLA_Bidiag_UT_form_V( R, S, V ); // Form U in R. FLA_Bidiag_UT_form_U( R, T, R ); } *dtime_appq += FLA_Clock() - dtime_temp; // Apply the realifying scalars in rL and rR to U and V, respectively. FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, rL, R ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, V ); dtime_temp = FLA_Clock(); { // Perform a singular value decomposition on the bidiagonal matrix. r_val = FLA_Bsvd_v_opt_var2( n_iter_max, d, e, G, H, RG, RH, W, R, V, b_alg ); } *dtime_bsvd = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Multiply R into U, storing the result in A and then copying back // to U. FLA_Part_1x2( U, &UL, &UR, n_A, FLA_LEFT ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, UL, R, FLA_ZERO, A ); FLA_Copy( A, UL ); } *dtime_gemm = FLA_Clock() - dtime_temp; FLA_Obj_free( &R ); } } else // if ( m_A < n_A ) { FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED ); } // Copy the converged eigenvalues to the output vector. FLA_Copy( d, s ); // Sort the singular values and singular vectors in descending order. FLA_Sort_svd( FLA_BACKWARD, s, U, V ); FLA_Obj_free( &T ); FLA_Obj_free( &S ); FLA_Obj_free( &rL ); FLA_Obj_free( &rR ); FLA_Obj_free( &d ); FLA_Obj_free( &e ); FLA_Obj_free( &G ); FLA_Obj_free( &H ); FLA_Obj_free( &RG ); FLA_Obj_free( &RH ); FLA_Obj_free( &W ); return r_val; }
void time_Hess_UT( int variant, int type, int nrepeats, int m, FLA_Obj A, FLA_Obj A_ref, FLA_Obj t, FLA_Obj T, FLA_Obj W, double *dtime, double *diff, double *gflops ) { int irep; double dtime_old = 1.0e9; FLA_Obj A_save, norm; FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); FLA_Copy_external( A, A_save ); for ( irep = 0 ; irep < nrepeats; irep++ ){ FLA_Copy_external( A_save, A ); *dtime = FLA_Clock(); switch( variant ){ case 0:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Hess_UT( A, t ); break; case FLA_ALG_FRONT: FLA_Hess_UT( A, T ); break; default: printf("trouble\n"); } break; } } *dtime = FLA_Clock() - *dtime; dtime_old = min( *dtime, dtime_old ); } //if ( type == FLA_ALG_REFERENCE ) //{ // ; //} //else { FLA_Obj AT, AB; FLA_Obj Q, QT, QB; FLA_Obj E, ET, EB; FLA_Obj F; dim_t m_A, m_T; m_A = FLA_Obj_length( A ); m_T = FLA_Obj_length( T ); FLA_Obj_create( FLA_Obj_datatype( A ), m_A, m_A, 0, 0, &Q ); FLA_Set_to_identity( Q ); FLA_Part_2x1( Q, &QT, &QB, 1, FLA_TOP ); FLA_Part_2x1( A, &AT, &AB, 1, FLA_TOP ); if ( type == FLA_ALG_REFERENCE ) { if ( FLA_Obj_is_real( A ) ) FLA_Apply_Q_blk_external( FLA_LEFT, FLA_TRANSPOSE, FLA_COLUMNWISE, AB, t, QB ); else FLA_Apply_Q_blk_external( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_COLUMNWISE, AB, t, QB ); } else FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, AB, T, W, QB ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &E ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &F ); FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A_save, Q, FLA_ZERO, E ); FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, Q, E, FLA_ZERO, F ); FLA_Copy( A, E ); FLA_Part_2x1( E, &ET, &EB, 1, FLA_TOP ); FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, EB ); *diff = FLA_Max_elemwise_diff( E, F ); FLA_Obj_free( &Q ); FLA_Obj_free( &E ); FLA_Obj_free( &F ); } *gflops = ( 10.0 / 3.0 * m * m * m ) / dtime_old / 1e9; if ( FLA_Obj_is_complex( A ) ) *gflops *= 4.0; *dtime = dtime_old; FLA_Copy_external( A_save, A ); FLA_Obj_free( &A_save ); FLA_Obj_free( &norm ); }
FLA_Error FLA_Fill_with_logarithmic_dist( FLA_Obj alpha, FLA_Obj x ) { FLA_Obj lT, l0, lB, lambda1, l2; FLA_Obj l, k, alpha2; FLA_Datatype dt_real; dim_t n_x; if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING ) FLA_Fill_with_logarithmic_dist_check( alpha, x ); dt_real = FLA_Obj_datatype_proj_to_real( x ); n_x = FLA_Obj_vector_dim( x ); // Create a local counter to increment as we create the distribution. FLA_Obj_create( dt_real, 1, 1, 0, 0, &k ); // Create a local vector l. We will work with this vector, which is // the same length as x, so that we can use vertical partitioning. FLA_Obj_create( dt_real, n_x, 1, 0, 0, &l ); // Create a local real scalar alpha2 of the same precision as // alpha. Then copy alpha to alpha2, which will convert the // complex value to real, if necessary (ie: if alpha is complex). FLA_Obj_create( dt_real, 1, 1, 0, 0, &alpha2 ); FLA_Copy( alpha, alpha2 ); // Initialize k to 0. FLA_Set( FLA_ZERO, k ); FLA_Part_2x1( l, &lT, &lB, 0, FLA_TOP ); while ( FLA_Obj_length( lB ) > 0 ) { FLA_Repart_2x1_to_3x1( lT, &l0, /* ** */ /* ******* */ &lambda1, lB, &l2, 1, FLA_BOTTOM ); /*------------------------------------------------------------*/ // lambda1 = alpha^k; FLA_Pow( alpha2, k, lambda1 ); // k = k + 1; FLA_Mult_add( FLA_ONE, FLA_ONE, k ); /*------------------------------------------------------------*/ FLA_Cont_with_3x1_to_2x1( &lT, l0, lambda1, /* ** */ /* ******* */ &lB, l2, FLA_TOP ); } // Normalize by last element. FLA_Part_2x1( l, &lT, &lB, 1, FLA_BOTTOM ); FLA_Inv_scal( lB, l ); // Overwrite x with the distribution we created in l. FLA_Copy( l, x ); FLA_Obj_free( &l ); FLA_Obj_free( &k ); FLA_Obj_free( &alpha2 ); return FLA_SUCCESS; }
void time_Bidiag_UT( int param_combo, int type, int nrepeats, int m, int n, FLA_Obj A, FLA_Obj tu, FLA_Obj tv, FLA_Obj TU, FLA_Obj TV, double *dtime, double *diff, double *gflops ) { int irep; double dtime_old = 1.0e9; FLA_Obj A_save, norm; FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); FLA_Copy_external( A, A_save ); for ( irep = 0 ; irep < nrepeats; irep++ ){ FLA_Copy_external( A_save, A ); *dtime = FLA_Clock(); switch( param_combo ){ case 0: { switch( type ) { case FLA_ALG_REFERENCE: REF_Bidiag_UT( A, tu, tv ); break; case FLA_ALG_FRONT: FLA_Bidiag_UT( A, TU, TV ); break; } break; } } *dtime = FLA_Clock() - *dtime; dtime_old = min( *dtime, dtime_old ); } { FLA_Obj AL, AR; FLA_Obj ATL, ATR, ABL, ABR; FLA_Obj QU; FLA_Obj QV, QVL, QVR; FLA_Obj E, EL, ER; FLA_Obj F; FLA_Obj WU, WV, eye; FLA_Obj tvT, tvB; dim_t m_A, n_A, m_TU; //FLA_Obj_show( "A_save", A_save, "%10.3e", "" ); m_A = FLA_Obj_length( A ); n_A = FLA_Obj_width( A ); m_TU = FLA_Obj_length( TU ); FLA_Obj_create( FLA_Obj_datatype( A ), m_A, m_A, 0, 0, &QU ); FLA_Obj_create( FLA_Obj_datatype( A ), n_A, n_A, 0, 0, &QV ); FLA_Obj_create( FLA_Obj_datatype( A ), m_TU, m_A, 0, 0, &WU ); FLA_Obj_create( FLA_Obj_datatype( A ), m_TU, n_A, 0, 0, &WV ); FLA_Set_to_identity( QU ); FLA_Set_to_identity( QV ); FLA_Part_1x2( QV, &QVL, &QVR, 1, FLA_LEFT ); FLA_Part_1x2( A, &AL, &AR, 1, FLA_LEFT ); FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 1, 1, FLA_BL ); FLA_Part_2x1( tv, &tvT, &tvB, 1, FLA_BOTTOM ); if ( type == FLA_ALG_REFERENCE ) { if ( FLA_Obj_is_real( A ) ) FLA_Apply_Q_blk_external( FLA_LEFT, FLA_TRANSPOSE, FLA_COLUMNWISE, A, tu, QU ); else FLA_Apply_Q_blk_external( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_COLUMNWISE, A, tu, QU ); //FLA_Apply_Q_blk_external( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_ROWWISE, AR, tv, QVR ); // // Need to apply backwards transformation, since vectors are stored columnwise. // QL? RQ? // //FLA_Apply_Q_blk_external( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_ROWWISE, ATR, tvT, QVR ); //FLA_Apply_Q_blk_external( FLA_RIGHT, FLA_CONJ_TRANSPOSE, FLA_ROWWISE, ATR, tvT, QVR ); //FLA_Apply_Q_blk_external( FLA_RIGHT, FLA_CONJ_TRANSPOSE, FLA_ROWWISE, AR, tvT, QVR ); } else { FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, A, TU, WU, QU ); FLA_Apply_Q_UT( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, AR, TV, WV, QVR ); } /* FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &eye ); FLA_Set_to_identity( eye ); //FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, // FLA_ONE, QV, QV, FLA_MINUS_ONE, eye ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, QU, QU, FLA_MINUS_ONE, eye ); FLA_Obj_show( "eye", eye, "%10.3e", "" ); FLA_Norm_frob( eye, norm ); FLA_Obj_extract_real_scalar( norm, diff ); FLA_Obj_free( &eye ); */ FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &E ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &F ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A_save, QV, FLA_ZERO, E ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, QU, E, FLA_ZERO, F ); //FLA_Obj_show( "A_save", A_save, "%10.3e", "" ); FLA_Copy( A, E ); FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, E ); FLA_Part_1x2( E, &EL, &ER, 1, FLA_LEFT ); FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, ER ); //FLA_Obj_show( "B", E, "%10.3e", "" ); //FLA_Obj_show( "Q'AV", F, "%10.3e", "" ); //FLA_Obj_show( "B", E, "%10.3e + %10.3e ", "" ); //FLA_Obj_show( "Q'AV", F, "%10.3e + %10.3e ", "" ); *diff = FLA_Max_elemwise_diff( E, F ); FLA_Obj_free( &E ); FLA_Obj_free( &F ); FLA_Obj_free( &QU ); FLA_Obj_free( &QV ); FLA_Obj_free( &WU ); FLA_Obj_free( &WV ); } *gflops = 4.0 * n * n * ( m - n / 3.0 ) / dtime_old / 1e9; if ( FLA_Obj_is_complex( A ) ) *gflops *= 4.0; *dtime = dtime_old; FLA_Copy_external( A_save, A ); FLA_Obj_free( &A_save ); FLA_Obj_free( &norm ); }
int main(int argc, char *argv[]) { int m_input, m, p_first, p_last, p_inc, p, b_alg, variant, n_repeats, i, datatype, n_variants = 1; char *colors = "brkgmcbrkg"; char *ticks = "o+*xso+*xs"; char m_dim_desc[14]; char m_dim_tag[10]; double max_gflops=6.0; double safemin; double dtime, gflops, diff; FLA_Obj A, l, Q, T, W; FLA_Init(); fprintf( stdout, "%c number of repeats:", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c Enter blocking size:", '%' ); scanf( "%d", &b_alg ); fprintf( stdout, "%c %d\n", '%', b_alg ); fprintf( stdout, "%c enter problem size first, last, inc:", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter m (-1 means bind to problem size): ", '%' ); scanf( "%d", &m_input ); fprintf( stdout, "%c %d\n", '%', m_input ); fprintf( stdout, "\n" ); if ( m_input > 0 ) { sprintf( m_dim_desc, "m = %d", m_input ); sprintf( m_dim_tag, "m%dc", m_input); } else if( m_input < -1 ) { sprintf( m_dim_desc, "m = p/%d", -m_input ); sprintf( m_dim_tag, "m%dp", -m_input ); } else if( m_input == -1 ) { sprintf( m_dim_desc, "m = p" ); sprintf( m_dim_tag, "m%dp", 1 ); } /* char ch = 's'; safemin = dlamch_( &ch ); printf( "safemin = %23.15e\n", safemin ); ch = 'e'; double eps = dlamch_( &ch ); printf( "eps dla = %23.15e\n", eps ); printf( "eps fla = %23.15e\n", FLA_EPSILON_D ); */ for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { m = m_input; if( m < 0 ) m = p / f2c_abs(m_input); //datatype = FLA_FLOAT; //datatype = FLA_DOUBLE; //datatype = FLA_COMPLEX; datatype = FLA_DOUBLE_COMPLEX; FLA_Obj_create( datatype, m, m, 0, 0, &A ); FLA_Obj_create( datatype, m, m, 0, 0, &Q ); FLA_Obj_create( datatype, 32, m, 0, 0, &T ); FLA_Obj_create( datatype, 32, m, 0, 0, &W ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), m, 1, 0, 0, &l ); //FLA_Random_herm_matrix( FLA_LOWER_TRIANGULAR, A ); //FLA_Random_spd_matrix( FLA_LOWER_TRIANGULAR, A ); FLA_Random_matrix( A ); FLA_Obj_set_to_identity( Q ); FLA_QR_UT( A, T ); FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, A, T, W, Q ); fill_eigenvalues( l ); //FLA_Obj_show( "eig", l, "%9.2e ", "" ); FLA_Apply_diag_matrix( FLA_LEFT, FLA_NO_CONJUGATE, l, Q ); FLA_Apply_Q_UT( FLA_LEFT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, A, T, W, Q ); FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, Q ); FLA_Copy( Q, A ); time_Hevd_ln( 0, FLA_ALG_REFERENCE, n_repeats, m, b_alg, A, l, &dtime, &diff, &gflops ); fprintf( stdout, "data_REFs( %d, 1:2 ) = [ %d %6.3lf %6.2le ]; \n", i, p, gflops, diff ); fflush( stdout ); time_Hevd_ln( -1, FLA_ALG_REFERENCE, n_repeats, m, b_alg, A, l, &dtime, &diff, &gflops ); fprintf( stdout, "data_REFd( %d, 1:2 ) = [ %d %6.3lf %6.2le ]; \n", i, p, gflops, diff ); fflush( stdout ); for ( variant = 1; variant <= n_variants; variant++ ){ fprintf( stdout, "data_var%d( %d, 1:9 ) = [ %d ", variant, i, p ); fflush( stdout ); time_Hevd_ln( variant, FLA_ALG_UNBLOCKED, n_repeats, m, b_alg, A, l, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); //time_Hevd_ln( variant, FLA_ALG_UNB_OPT, n_repeats, m, b_alg, // A, l, &dtime, &diff, &gflops ); //fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); //fflush( stdout ); fprintf( stdout, "];\n" ); fflush( stdout ); } fprintf( stdout, "\n" ); FLA_Obj_free( &A ); FLA_Obj_free( &T ); FLA_Obj_free( &W ); FLA_Obj_free( &Q ); FLA_Obj_free( &l ); } /* fprintf( stdout, "figure;\n" ); fprintf( stdout, "plot( data_REF( :,1 ), data_REF( :, 2 ), '-' ); \n" ); fprintf( stdout, "hold on;\n" ); for ( i = 1; i <= n_variants; i++ ) { fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 2 ), '%c:%c' ); \n", i, i, colors[ i-1 ], ticks[ i-1 ] ); fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 4 ), '%c-.%c' ); \n", i, i, colors[ i-1 ], ticks[ i-1 ] ); } fprintf( stdout, "legend( ... \n" ); fprintf( stdout, "'Reference', ... \n" ); for ( i = 1; i < n_variants; i++ ) fprintf( stdout, "'unb\\_var%d', 'blk\\_var%d', ... \n", i, i ); fprintf( stdout, "'unb\\_var%d', 'blk\\_var%d' ); \n", i, i ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops ); fprintf( stdout, "title( 'FLAME Hevd_ln performance (%s, %s)' );\n", m_dim_desc, n_dim_desc ); fprintf( stdout, "print -depsc tridiag_%s_%s.eps\n", m_dim_tag, n_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); */ FLA_Finalize( ); return 0; }
void time_QR2_UT( int param_combo, int type, int nrepeats, int m, int n, FLA_Obj A_flat, FLA_Obj A_flat_ref, FLA_Obj B_flat, FLA_Obj D_flat, FLA_Obj T_flat, double *dtime, double *diff, double *gflops ) { int irep; double dtime_old = 1.0e9; FLA_Obj A_flat_save; FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A_flat, &A_flat_save ); FLA_Copy( A_flat, A_flat_save ); for ( irep = 0 ; irep < nrepeats; irep++ ) { FLA_Copy( A_flat_save, A_flat ); *dtime = FLA_Clock(); switch( param_combo ){ // Time parameter combination 0 case 0:{ switch( type ){ case FLA_ALG_UNBLOCKED: FLA_QR2_UT_unb_var1( B_flat, D_flat, T_flat ); break; case FLA_ALG_UNB_OPT: FLA_QR2_UT_opt_var1( B_flat, D_flat, T_flat ); break; default: printf("trouble\n"); } break; } } *dtime = FLA_Clock() - *dtime; dtime_old = min( *dtime, dtime_old ); } if ( type == FLA_ALG_UNBLOCKED ) { FLA_Copy( A_flat, A_flat_ref ); //FLA_Copy( T_flat, A_flat_ref ); *diff = 0.0; } else { *diff = FLA_Max_elemwise_diff( A_flat, A_flat_ref ); //*diff = FLA_Max_elemwise_diff( T_flat, A_flat_ref ); } *gflops = 2.0 * n * n * (m - n/3.0) / dtime_old / 1.0e9; if ( FLA_Obj_is_complex( A_flat ) ) *gflops *= 4.0; *dtime = dtime_old; FLA_Copy( A_flat_save, A_flat ); FLA_Obj_free( &A_flat_save ); }
void time_LU( int pivot_combo, int type, int nrepeats, int m, int n, dim_t nb_alg, dim_t nb_flash, FLA_Obj A, FLA_Obj p, FLA_Obj x, FLA_Obj b, FLA_Obj norm, double *dtime, double *diff, double *gflops ) { int irep; double dtime_old = 1.0e9; FLA_Obj AH_save, b_save; FLA_Obj AH, pH, bH, LH; FLASH_LU_incpiv_create_hier_matrices( A, 1, &nb_flash, nb_alg, &AH, &pH, &LH ); FLASH_Obj_create_hier_copy_of_flat( b, 1, &nb_flash, &bH ); FLASH_Obj_create_copy_of( FLA_NO_TRANSPOSE, AH, &AH_save ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, b, &b_save ); for ( irep = 0 ; irep < nrepeats; irep++ ) { FLASH_Copy( AH_save, AH ); *dtime = FLA_Clock(); switch( pivot_combo ){ case 0: { switch( type ) { case FLA_ALG_FRONT_OPT0: FLASH_LU_incpiv_noopt( AH, pH, LH ); break; case FLA_ALG_FRONT_OPT1: FLASH_LU_incpiv_opt1( AH, pH, LH ); break; default: printf("trouble\n"); } break; } } *dtime = FLA_Clock() - *dtime; dtime_old = min( *dtime, dtime_old ); } { FLASH_FS_incpiv( AH, pH, LH, bH ); FLASH_Trsv( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, AH, bH ); FLASH_Obj_flatten( bH, x ); FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, A, x, FLA_MINUS_ONE, b ); FLA_Nrm2_external( b, norm ); FLA_Obj_extract_real_scalar( norm, diff ); } *gflops = 2.0 / 3.0 * m * m * n / dtime_old / 1e9; if ( FLA_Obj_is_complex( A ) ) *gflops *= 4.0; *dtime = dtime_old; FLA_Copy( b_save, b ); FLASH_Obj_free( &AH ); FLASH_Obj_free( &pH ); FLASH_Obj_free( &bH ); FLASH_Obj_free( &LH ); FLA_Obj_free( &b_save ); FLASH_Obj_free( &AH_save ); }
int main(int argc, char *argv[]) { int n, nfirst, nlast, ninc, i, irep, nrepeats, nb_alg; double dtime, dtime_best, gflops, max_gflops, diff, d_n; FLA_Obj A, Aref, Aold, delta; /* Initialize FLAME */ FLA_Init( ); /* Every time trial is repeated "repeat" times and the fastest run in recorded */ printf( "%% number of repeats:" ); scanf( "%d", &nrepeats ); printf( "%% %d\n", nrepeats ); /* Enter the max GFLOPS attainable This is used to set the y-axis range for the graphs. Here is how you figure out what to enter (on Linux machines): 1) more /proc/cpuinfo (this lists the contents of this file). 2) read through this and figure out the clock rate of the machine (in GHz). 3) Find out (from an expert of from the web) the number of floating point instructions that can be performed per core per clock cycle. 4) Figure out if you are using "multithreaded BLAS" which automatically parallelize calls to the Basic Linear Algebra Subprograms. If so, check how many cores are available. 5) Multiply 2) x 3) x 4) and enter this in response to the below. */ printf( "%% enter max GFLOPS:" ); scanf( "%lf", &max_gflops ); printf( "%% %lf\n", max_gflops ); /* Enter the algorithmic block size */ printf( "%% enter nb_alg:" ); scanf( "%d", &nb_alg ); printf( "%% %d\n", nb_alg ); /* Turn on parameter checking */ FLA_Check_error_level_set( FLA_FULL_ERROR_CHECKING ); /* Timing trials for matrix sizes n=nfirst to nlast in increments of ninc will be performed */ printf( "%% enter nfirst, nlast, ninc:" ); scanf( "%d%d%d", &nfirst, &nlast, &ninc ); printf( "%% %d %d %d\n", nfirst, nlast, ninc ); i = 1; for ( n=nfirst; n<= nlast; n+=ninc ){ /* Allocate space for the matrices */ FLA_Obj_create( FLA_DOUBLE, n, n, 0, 0, &A ); FLA_Obj_create( FLA_DOUBLE, n, n, 0, 0, &Aref ); FLA_Obj_create( FLA_DOUBLE, n, n, 0, 0, &Aold ); FLA_Obj_create( FLA_DOUBLE, 1, 1, 0, 0, &delta ); /* Generate random matrix A and save in Aold */ FLA_Random_matrix( Aold ); /* Add something large to the diagonal to make sure it isn't ill-conditionsed */ d_n = ( double ) n; *( ( double * ) FLA_Obj_buffer( delta ) ) = d_n; FLA_Shift_diag( FLA_NO_CONJUGATE, delta, Aold ); /* Set gflops = billions of floating point operations that will be performed */ gflops = 2.0/3.0 * n * n * n * 1.0e-09; /* Time the reference implementation */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, Aref ); dtime = FLA_Clock(); REF_LU( Aref ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } printf( "data_REF( %d, 1:2 ) = [ %d %le ];\n", i, n, gflops / dtime_best ); fflush( stdout ); /* Time FLA_LU */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); FLA_LU_nopiv( A ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } printf( "data_FLAME( %d, 1:2 ) = [ %d %le ];\n", i, n, gflops / dtime_best ); fflush( stdout ); /* Time the your implementations */ /* Variant 1 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); LU_unb_var1( A ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( A, Aref ); printf( "data_unb_var1( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); /* Variant 1 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); LU_blk_var1( A, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( A, Aref ); printf( "data_blk_var1( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); /* Variant 2 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); LU_unb_var2( A ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( A, Aref ); printf( "data_unb_var2( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); /* Variant 2 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); LU_blk_var2( A, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( A, Aref ); printf( "data_blk_var2( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); /* Variant 3 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); LU_unb_var3( A ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( A, Aref ); printf( "data_unb_var3( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); /* Variant 3 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); LU_blk_var3( A, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( A, Aref ); printf( "data_blk_var3( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); /* Variant 4 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); LU_unb_var4( A ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( A, Aref ); printf( "data_unb_var4( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); /* Variant 4 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); LU_blk_var4( A, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( A, Aref ); printf( "data_blk_var4( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); /* Variant 5 unblocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); LU_unb_var5( A ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( A, Aref ); printf( "data_unb_var5( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); /* Variant 5 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); LU_blk_var5( A, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( A, Aref ); printf( "data_blk_var5( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); FLA_Obj_free( &A ); FLA_Obj_free( &Aold ); FLA_Obj_free( &Aref ); FLA_Obj_free( &delta ); printf( "\n" ); i++; } /* Print the MATLAB commands to plot the data */ /* Delete all existing figures */ printf( "close all\n" ); /* Plot the performance of FLAME */ printf( "plot( data_FLAME( :,1 ), data_FLAME( :, 2 ), 'k--' ); \n" ); /* Indicate that you want to add to the existing plot */ printf( "hold on\n" ); /* Plot the performance of the reference implementation */ printf( "plot( data_REF( :,1 ), data_REF( :, 2 ), 'k-' ); \n" ); /* Plot the performance of your implementations */ printf( "plot( data_unb_var1( :,1 ), data_unb_var1( :, 2 ), 'r-.' ); \n" ); printf( "plot( data_unb_var2( :,1 ), data_unb_var2( :, 2 ), 'g-.' ); \n" ); printf( "plot( data_unb_var3( :,1 ), data_unb_var3( :, 2 ), 'b-.' ); \n" ); printf( "plot( data_unb_var4( :,1 ), data_unb_var4( :, 2 ), 'm-.' ); \n" ); printf( "plot( data_unb_var5( :,1 ), data_unb_var5( :, 2 ), 'c-.' ); \n" ); printf( "plot( data_blk_var1( :,1 ), data_blk_var1( :, 2 ), 'r--' ); \n" ); printf( "plot( data_blk_var2( :,1 ), data_blk_var2( :, 2 ), 'g--' ); \n" ); printf( "plot( data_blk_var3( :,1 ), data_blk_var3( :, 2 ), 'b--' ); \n" ); printf( "plot( data_blk_var4( :,1 ), data_blk_var4( :, 2 ), 'm--' ); \n" ); printf( "plot( data_blk_var5( :,1 ), data_blk_var5( :, 2 ), 'c--' ); \n" ); printf( "hold on \n"); printf( "xlabel( 'matrix dimension m=n' );\n"); printf( "ylabel( 'GFLOPS/sec.' );\n"); printf( "axis( [ 0 %d 0 %3.1f ] ); \n", nlast, max_gflops ); printf( "legend( 'FLA LU nopiv', ...\n"); printf( " 'Simple loops', ...\n"); printf( " 'unb var1', ...\n"); printf( " 'unb var2', ...\n"); printf( " 'unb var3', ...\n"); printf( " 'unb var4', ...\n"); printf( " 'unb var5', ...\n"); printf( " 'blk var1', ...\n"); printf( " 'blk var2', ...\n"); printf( " 'blk var3', ...\n"); printf( " 'blk var4', ...\n"); printf( " 'blk var5', 2);\n"); printf( "print -r100 -depsc LU.eps\n"); FLA_Finalize( ); return 0; }
FLA_Error REF_Svdd_uv_components( FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V, double* dtime_bred, double* dtime_bsvd, double* dtime_appq, double* dtime_qrfa, double* dtime_gemm ) /* { *dtime_bred = 1; *dtime_bsvd = 1; *dtime_appq = 1; *dtime_qrfa = 1; *dtime_gemm = 1; return FLA_Svdd_external( FLA_SVD_VECTORS_ALL, A, s, U, V ); } */ { FLA_Datatype dt_A; FLA_Datatype dt_A_real; dim_t m_A, n_A; dim_t min_m_n; FLA_Obj tq, tu, tv, d, e, Ur, Vr, W; FLA_Obj eT, epsilonB; FLA_Uplo uplo = FLA_UPPER_TRIANGULAR; double crossover_ratio = 16.0 / 10.0; double dtime_temp; dt_A = FLA_Obj_datatype( A ); dt_A_real = FLA_Obj_datatype_proj_to_real( A ); m_A = FLA_Obj_length( A ); n_A = FLA_Obj_width( A ); min_m_n = FLA_Obj_min_dim( A ); FLA_Obj_create( dt_A, min_m_n, 1, 0, 0, &tq ); FLA_Obj_create( dt_A, min_m_n, 1, 0, 0, &tu ); FLA_Obj_create( dt_A, min_m_n, 1, 0, 0, &tv ); FLA_Obj_create( dt_A_real, min_m_n, 1, 0, 0, &d ); FLA_Obj_create( dt_A_real, min_m_n, 1, 0, 0, &e ); FLA_Obj_create( dt_A_real, n_A, n_A, 0, 0, &Ur ); FLA_Obj_create( dt_A_real, n_A, n_A, 0, 0, &Vr ); FLA_Part_2x1( e, &eT, &epsilonB, 1, FLA_BOTTOM ); if ( m_A >= n_A ) { if ( m_A < crossover_ratio * n_A ) { dtime_temp = FLA_Clock(); { // Reduce to bidiagonal form. FLA_Bidiag_blk_external( A, tu, tv ); FLA_Bidiag_UT_extract_diagonals( A, d, eT ); } *dtime_bred = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Divide-and-conquor algorithm. FLA_Bsvdd_external( uplo, d, e, Ur, Vr ); } *dtime_bsvd = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Form U. FLA_Copy_external( Ur, U ); FLA_Bidiag_apply_U_external( FLA_LEFT, FLA_NO_TRANSPOSE, A, tu, U ); // Form V. FLA_Copy_external( Vr, V ); FLA_Bidiag_apply_V_external( FLA_RIGHT, FLA_CONJ_TRANSPOSE, A, tv, V ); } *dtime_appq = FLA_Clock() - dtime_temp; *dtime_qrfa = 0.0; *dtime_gemm = 0.0; } else { FLA_Obj AT, AB; FLA_Obj UL, UR; FLA_Part_2x1( A, &AT, &AB, n_A, FLA_TOP ); FLA_Part_1x2( U, &UL, &UR, n_A, FLA_LEFT ); // Create a temporary n-by-n matrix R. FLA_Obj_create( dt_A, n_A, n_A, 0, 0, &W ); dtime_temp = FLA_Clock(); { // Perform a QR factorization. FLA_QR_blk_external( A, tq ); FLA_Copyr_external( FLA_LOWER_TRIANGULAR, A, UL ); FLA_Setr( FLA_LOWER_TRIANGULAR, FLA_ZERO, A ); } *dtime_qrfa = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Form Q. FLA_QR_form_Q_external( U, tq ); } *dtime_appq = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Reduce R to bidiagonal form. FLA_Bidiag_blk_external( AT, tu, tv ); FLA_Bidiag_UT_extract_diagonals( A, d, eT ); } *dtime_bred = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Divide-and-conquor algorithm. FLA_Bsvdd_external( uplo, d, e, Ur, Vr ); } *dtime_bsvd = FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Form U in W. FLA_Copy_external( Ur, W ); FLA_Bidiag_apply_U_external( FLA_LEFT, FLA_NO_TRANSPOSE, AT, tu, W ); // Form V. FLA_Copy_external( Vr, V ); FLA_Bidiag_apply_V_external( FLA_RIGHT, FLA_CONJ_TRANSPOSE, AT, tv, V ); } *dtime_appq += FLA_Clock() - dtime_temp; dtime_temp = FLA_Clock(); { // Multiply R into U, storing the result in A and then copying // back to U. FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, UL, W, FLA_ZERO, A ); FLA_Copy( A, UL ); } *dtime_gemm = FLA_Clock() - dtime_temp; // Free R. FLA_Obj_free( &W ); } } else { FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED ); } // Copy singular values to output vector. FLA_Copy( d, s ); // Sort singular values and vectors. FLA_Sort_svd( FLA_BACKWARD, s, U, V ); FLA_Obj_free( &tq ); FLA_Obj_free( &tu ); FLA_Obj_free( &tv ); FLA_Obj_free( &d ); FLA_Obj_free( &e ); FLA_Obj_free( &Ur ); FLA_Obj_free( &Vr ); return FLA_SUCCESS; }
int main( int argc, char** argv ) { FLA_Datatype datatype = TESTTYPE; FLA_Obj A, Ak, T, Tk, D, Dk, A_copy, A_recovered, L, Q, Qk, W, x, y, z; dim_t m, n, k; dim_t min_m_n; FLA_Error init_result; double residual_A, residual_Axy; int use_form_q = 1; if ( argc == 4 ) { m = atoi(argv[1]); n = atoi(argv[2]); k = atoi(argv[3]); min_m_n = min(m,n); } else { fprintf(stderr, " \n"); fprintf(stderr, "Usage: %s m n k\n", argv[0]); fprintf(stderr, " m : matrix length\n"); fprintf(stderr, " n : matrix width\n"); fprintf(stderr, " k : number of house holder vectors applied for testing\n"); fprintf(stderr, " \n"); return -1; } if ( m == 0 || n == 0 ) return 0; FLA_Init_safe( &init_result ); // FLAME LQ^H setup FLA_Obj_create( datatype, m, n, 0, 0, &A ); FLA_LQ_UT_create_T( A, &T ); // Rand A and create A_copy. FLA_Random_matrix( A ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_copy ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_recovered ); FLA_Copy( A, A_copy ); // LQ test ( A = L Q^H ) FLA_LQ_UT( A, T ); // Create Q (identity), L (A_copy) FLA_Obj_create( datatype, m, n, 0, 0, &Q ); FLA_Set_to_identity( Q ); FLA_Obj_create( datatype, m, m, 0, 0, &D ); FLA_Obj_create( datatype, k, n, 0, 0, &Qk ); FLA_Set_to_identity( Qk ); FLA_Obj_create( datatype, k, k, 0, 0, &Dk ); FLA_Obj_create( datatype, m, m, 0, 0, &L ); // Q^H := I H_{0}^H ... H_{k-1}^H if ( use_form_q ) { FLA_LQ_UT_form_Q( A, T, Q ); } else { FLA_Apply_Q_UT_create_workspace_side( FLA_RIGHT, T, Q, &W ); FLA_Apply_Q_UT( FLA_RIGHT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, A, T, W, Q ); FLA_Obj_free( &W ); } // D := Q^T Q FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, Q, Q, FLA_ZERO, D ); // Qk := I H0 ... Hk FLA_Part_1x2( T, &Tk, &W, k, FLA_LEFT ); FLA_Part_2x1( A, &Ak, &W, k, FLA_TOP ); if ( use_form_q ) { // Overwrite the result to test FLAME API FLA_Set( FLA_ZERO, Qk ); FLA_Copy( Ak, Qk ); FLA_LQ_UT_form_Q( Ak, Tk, Qk ); } else { FLA_Apply_Q_UT_create_workspace( Tk, Qk, &W ); FLA_Apply_Q_UT( FLA_LEFT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, Ak, Tk, W, Qk ); FLA_Obj_free( &W ); } // Dk := Qk^T Qk FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, Qk, Qk, FLA_ZERO, Dk ); // L := A (Q^H)^H if ( use_form_q ) { // Note that the formed Q is actually Q^H; transb should be carefully assigned. FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A_copy, Q, FLA_ZERO, L ); } else { FLA_Apply_Q_UT_create_workspace( T, L, &W ); FLA_Apply_Q_UT( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, A, T, W, L ); FLA_Obj_free( &W ); } FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, L, Q, FLA_ZERO, A_recovered ); // Create vectors for testing FLA_Obj_create( datatype, n, 1, 0, 0, &x ); FLA_Set( FLA_ZERO, x ); FLA_Obj_create( datatype, m, 1, 0, 0, &y ); FLA_Set( FLA_ZERO, y ); FLA_Obj_create( datatype, m, 1, 0, 0, &z ); FLA_Set( FLA_ZERO, z ); // x is given FLA_Set( FLA_ONE, x ); // y := Ax FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, A_copy, x, FLA_ZERO, y ); // z := L (Q^H) x , libflame FLA_Apply_Q_UT_create_workspace( T, x, &W ); FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, A, T, W, x ); FLA_Obj_free( &W ); if ( m < n ) FLA_Part_2x1( x, &x, &W, m, FLA_TOP ); else FLA_Part_1x2( L, &L, &W, n, FLA_LEFT ); FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, L, x, FLA_ZERO, z ); // Comapre (A_copy, A_recovered), (y,z) and (y,w) residual_A = FLA_Max_elemwise_diff( A_copy, A_recovered ); residual_Axy = FLA_Max_elemwise_diff( y, z ); if ( 1 || residual_A > EPS || residual_Axy > EPS ) { FLA_Obj_fshow( stdout, " - Given - ", A_copy, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Factor - ", A, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - T - ", T, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Q - ", Q, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - D = Q^T Q - ", D, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Qk - ", Qk, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Dk = Qk^T Qk - ", Dk, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - L - ", L, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Recovered A - ", A_recovered, "% 6.4e", "------"); fprintf( stdout, "lapack2flame: %lu x %lu, %lu: ", m, n, k); fprintf( stdout, "| A - A_recovered | = %12.10e, | Ax - y | = %12.10e\n\n", residual_A, residual_Axy ) ; } FLA_Obj_free( &A ); FLA_Obj_free( &T ); FLA_Obj_free( &A_copy ); FLA_Obj_free( &A_recovered ); FLA_Obj_free( &L ); FLA_Obj_free( &Q ); FLA_Obj_free( &Qk ); FLA_Obj_free( &D ); FLA_Obj_free( &Dk ); FLA_Obj_free( &x ); FLA_Obj_free( &y ); FLA_Obj_free( &z ); FLA_Finalize_safe( init_result ); }
int main(int argc, char *argv[]) { int n, nfirst, nlast, ninc, nlast_unb, i, irep, nrepeats, nb_alg; double dtime, dtime_best, gflops, max_gflops, diff, d_n; FLA_Obj A, Aref, Aold, delta; /* Initialize FLAME */ FLA_Init( ); /* Every time trial is repeated "repeat" times and the fastest run in recorded */ printf( "%% number of repeats:" ); scanf( "%d", &nrepeats ); printf( "%% %d\n", nrepeats ); /* Enter the max GFLOPS attainable This is used to set the y-axis range for the graphs. Here is how you figure out what to enter (on Linux machines): 1) more /proc/cpuinfo (this lists the contents of this file). 2) read through this and figure out the clock rate of the machine (in GHz). 3) Find out (from an expert of from the web) the number of floating point instructions that can be performed per core per clock cycle. 4) Figure out if you are using "multithreaded BLAS" which automatically parallelize calls to the Basic Linear Algebra Subprograms. If so, check how many cores are available. 5) Multiply 2) x 3) x 4) and enter this in response to the below. If you enter a value for max GFLOPS that is lower that the maximum that is observed in the experiments, then the top of the graph is set to the observed maximum. Thus, one possibility is to simply set this to 0.0. */ printf( "%% enter max GFLOPS:" ); scanf( "%lf", &max_gflops ); printf( "%% %lf\n", max_gflops ); /* Enter the algorithmic block size */ printf( "%% enter nb_alg:" ); scanf( "%d", &nb_alg ); printf( "%% %d\n", nb_alg ); /* Timing trials for matrix sizes n=nfirst to nlast in increments of ninc will be performed. Unblocked versions are only tested to nlast_unb */ printf( "%% enter nfirst, nlast, ninc, nlast_unb:" ); scanf( "%d%d%d%d", &nfirst, &nlast, &ninc, &nlast_unb ); printf( "%% %d %d %d %d\n", nfirst, nlast, ninc, nlast_unb ); i = 1; for ( n=nfirst; n<= nlast; n+=ninc ){ /* Allocate space for the matrices */ FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &A ); FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &Aref ); FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &Aold ); FLA_Obj_create( FLA_DOUBLE, 1, 1, 1, 1, &delta ); /* Generate random matrix A and save in Aold */ FLA_Random_matrix( Aold ); /* Add something large to the diagonal to make sure it isn't ill-conditionsed */ d_n = ( double ) n; *( ( double * ) FLA_Obj_buffer_at_view( delta ) ) = d_n; FLA_Shift_diag( FLA_NO_CONJUGATE, delta, Aold ); /* Set gflops = billions of floating point operations that will be performed */ gflops = 1.0/3.0 * n * n * n * 1.0e-09; /* Time the reference implementation */ #if TIME_LAPACK == TRUE #else // if ( n <= nlast_unb ) #endif { for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, Aref ); dtime = FLA_Clock(); REF_Chol( TIME_LAPACK, Aref, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } printf( "data_REF( %d, 1:2 ) = [ %d %le ];\n", i, n, gflops / dtime_best ); fflush( stdout ); } /* Time FLA_Chol */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); FLA_Chol( FLA_LOWER_TRIANGULAR, A ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } printf( "data_FLAME( %d, 1:2 ) = [ %d %le ];\n", i, n, gflops / dtime_best ); if ( gflops / dtime_best > max_gflops ) max_gflops = gflops / dtime_best; fflush( stdout ); /* Time the your implementations */ /* Variant 1 unblocked */ if ( n <= nlast_unb ){ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); #if TIME_UNB_VAR1 == TRUE Chol_unb_var1( A ); #else REF_Chol( TIME_LAPACK, A, nb_alg ); #endif dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( A, Aref ); printf( "data_unb_var1( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); } /* Variant 1 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); #if TIME_BLK_VAR1 == TRUE Chol_blk_var1( A, nb_alg ); #else REF_Chol( TIME_LAPACK, A, nb_alg ); #endif dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( A, Aref ); printf( "data_blk_var1( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); /* Variant 2 unblocked */ if ( n <= nlast_unb ){ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); #if TIME_UNB_VAR2 == TRUE Chol_unb_var2( A ); #else REF_Chol( TIME_LAPACK, A, nb_alg ); #endif dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( A, Aref ); printf( "data_unb_var2( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); } /* Variant 2 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); #if TIME_BLK_VAR2 == TRUE Chol_blk_var2( A, nb_alg ); #else REF_Chol( TIME_LAPACK, A, nb_alg ); #endif dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( A, Aref ); printf( "data_blk_var2( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); /* Variant 3 unblocked */ if ( n <= nlast_unb ){ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); #if TIME_UNB_VAR3 == TRUE Chol_unb_var3( A ); #else REF_Chol( TIME_LAPACK, A, nb_alg ); #endif dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( A, Aref ); printf( "data_unb_var3( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); } /* Variant 3 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); #if TIME_BLK_VAR3 == TRUE Chol_blk_var3( A, nb_alg ); #else REF_Chol( TIME_LAPACK, A, nb_alg ); #endif dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( A, Aref ); printf( "data_blk_var3( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); FLA_Obj_free( &A ); FLA_Obj_free( &Aold ); FLA_Obj_free( &Aref ); FLA_Obj_free( &delta ); printf( "\n" ); i++; } /* Print the MATLAB commands to plot the data */ /* Delete all existing figures */ printf( "close all\n" ); #if OCTAVE == TRUE /* Plot the performance of FLAME */ printf( "plot( data_FLAME( :,1 ), data_FLAME( :, 2 ), '-k;libflame;' ); \n" ); /* Indicate that you want to add to the existing plot */ printf( "hold on\n" ); /* Plot the performance of the reference implementation */ printf( "plot( data_REF( :,1 ), data_REF( :, 2 ), '-m;reference;' ); \n" ); /* Plot the performance of your implementations */ printf( "plot( data_unb_var1( :,1 ), data_unb_var1( :, 2 ), \"-rx;UnbVar1;\" ); \n" ); printf( "plot( data_unb_var2( :,1 ), data_unb_var2( :, 2 ), \"-go;UnbVar2;\" ); \n" ); printf( "plot( data_unb_var3( :,1 ), data_unb_var3( :, 2 ), \"-b*;UnbVar3;\" ); \n" ); printf( "plot( data_blk_var1( :,1 ), data_blk_var1( :, 2 ), \"-rx;BlkVar1;\", \"markersize\", 3 ); \n" ); printf( "plot( data_blk_var2( :,1 ), data_blk_var2( :, 2 ), \"-go;BlkVar2;\", \"markersize\", 3 ); \n" ); printf( "plot( data_blk_var3( :,1 ), data_blk_var3( :, 2 ), \"-b*;BlkVar3;\", \"markersize\", 3 ); \n" ); #else /* Plot the performance of FLAME */ printf( "plot( data_FLAME( :,1 ), data_FLAME( :, 2 ), 'k--' ); \n" ); /* Indicate that you want to add to the existing plot */ printf( "hold on\n" ); /* Plot the performance of the reference implementation */ printf( "plot( data_REF( :,1 ), data_REF( :, 2 ), 'k-' ); \n" ); /* Plot the performance of your implementations */ printf( "plot( data_unb_var1( :,1 ), data_unb_var1( :, 2 ), 'r-.x' ); \n" ); printf( "plot( data_unb_var2( :,1 ), data_unb_var2( :, 2 ), 'g-.o' ); \n" ); printf( "plot( data_unb_var3( :,1 ), data_unb_var3( :, 2 ), 'b-.*' ); \n" ); printf( "plot( data_blk_var1( :,1 ), data_blk_var1( :, 2 ), 'r-x'); \n" ); printf( "plot( data_blk_var2( :,1 ), data_blk_var2( :, 2 ), 'g-o'); \n" ); printf( "plot( data_blk_var3( :,1 ), data_blk_var3( :, 2 ), 'b-*'); \n" ); #endif printf( "hold off \n"); printf( "xlabel( 'matrix dimension m=n' );\n"); printf( "ylabel( 'GFLOPS/sec.' );\n"); printf( "axis( [ 0 %d 0 %3.1f ] ); \n", nlast, max_gflops ); #if OCTAVE == TRUE printf( "legend( 2 ); \n" ); printf(" print -landscape -solid -color -deps -F:24 Chol.eps\n" ); #else printf( "legend( 'FLA Chol', ...\n"); printf( " 'Simple loops', ...\n"); printf( " 'unb var1', ...\n"); printf( " 'unb var2', ...\n"); printf( " 'unb var3', ...\n"); printf( " 'blk var1', ...\n"); printf( " 'blk var2', ...\n"); printf( " 'blk var3', 2);\n"); printf( "print -r100 -dpdf Chol.pdf\n"); #endif FLA_Finalize( ); exit( 0 ); }
int main( int argc, char** argv ) { FLA_Datatype comptype = COMPTYPE; FLA_Datatype realtype = REALTYPE; dim_t m; FLA_Obj a, aT, aB, a0, a1, a2; FLA_Obj v, vT, vB, v0, v1, v2; FLA_Error init_result; int use_abs = 1; if ( argc == 3 ) { m = atoi(argv[1]); use_abs = atoi(argv[2]); } else { fprintf(stderr, " \n"); fprintf(stderr, "Usage: %s m use_abs\n", argv[0]); fprintf(stderr, " m : test vector length\n"); fprintf(stderr, " use_abs : 0 - norm (realtype), 1 - abs (complex type)\n"); fprintf(stderr, " \n"); return -1; } if ( m == 0 ) return 0; FLA_Init_safe( &init_result ); FLA_Obj_create( comptype, m, 1, 0, 0, &a ); FLA_Obj_create( use_abs ? comptype : realtype, m, 1, 0, 0, &v ); FLA_Random_matrix( a ); FLA_Set( FLA_ZERO, v ); FLA_Obj_fshow( stdout, "- a -", a, "% 6.4e", "--" ); // Normalize a vector FLA_Part_2x1( a, &aT, &aB, 0, FLA_TOP ); FLA_Part_2x1( v, &vT, &vB, 0, FLA_TOP ); while ( FLA_Obj_length( aB ) > 0 ) { FLA_Repart_2x1_to_3x1( aT, &a0, &a1, aB, &a2, 1, FLA_BOTTOM ); FLA_Repart_2x1_to_3x1( vT, &v0, &v1, vB, &v2, 1, FLA_BOTTOM ); // -------------------------------------------- if ( use_abs ) { // a and v are complex datatype FLA_Copy( a1, v1 ); FLA_Absolute_value( v1 ); } else { // v is real datatype FLA_Nrm2( a1, v1 ); } if ( FLA_Obj_equals( v1, FLA_ZERO ) ) printf( " ZERO DETECTED\n" ); else FLA_Inv_scal( v1, a1 ); // Normalize the scalar // -------------------------------------------- FLA_Cont_with_3x1_to_2x1( &aT, a0, a1, &aB, a2, FLA_TOP ); FLA_Cont_with_3x1_to_2x1( &vT, v0, v1, &vB, v2, FLA_TOP ); } FLA_Obj_fshow( stdout, "- a -", a, "% 6.4e", "--" ); FLA_Obj_fshow( stdout, "- v -", v, "% 6.4e", "--" ); // Check whether it is normalized FLA_Part_2x1( a, &aT, &aB, 0, FLA_TOP ); FLA_Part_2x1( v, &vT, &vB, 0, FLA_TOP ); while ( FLA_Obj_length( aB ) > 0 ) { FLA_Repart_2x1_to_3x1( aT, &a0, &a1, aB, &a2, 1, FLA_BOTTOM ); FLA_Repart_2x1_to_3x1( vT, &v0, &v1, vB, &v2, 1, FLA_BOTTOM ); // -------------------------------------------- if ( use_abs ) { // a and v are same datatype FLA_Copy( a1, v1 ); FLA_Absolute_value( v1 ); } else { // v is realdatatype FLA_Nrm2( a1, v1 ); } // -------------------------------------------- FLA_Cont_with_3x1_to_2x1( &aT, a0, a1, &aB, a2, FLA_TOP ); FLA_Cont_with_3x1_to_2x1( &vT, v0, v1, &vB, v2, FLA_TOP ); } FLA_Obj_fshow( stdout, " - all should be one - ", v, "% 6.4e", "--"); FLA_Obj_free( &a ); FLA_Obj_free( &v ); FLA_Finalize_safe( init_result ); }
FLA_Error FLA_Svd_compute_scaling( FLA_Obj A, FLA_Obj sigma ) { FLA_Datatype dt_real; FLA_Obj norm; FLA_Obj safmin; FLA_Obj prec; FLA_Obj rmin; FLA_Obj rmax; if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING ) FLA_Svd_compute_scaling_check( A, sigma ); dt_real = FLA_Obj_datatype_proj_to_real( A ); FLA_Obj_create( dt_real, 1, 1, 0, 0, &norm ); FLA_Obj_create( dt_real, 1, 1, 0, 0, &prec ); FLA_Obj_create( dt_real, 1, 1, 0, 0, &safmin ); FLA_Obj_create( dt_real, 1, 1, 0, 0, &rmin ); FLA_Obj_create( dt_real, 1, 1, 0, 0, &rmax ); // Query safmin, precision. FLA_Mach_params( FLA_MACH_PREC, prec ); FLA_Mach_params( FLA_MACH_SFMIN, safmin ); //FLA_Obj_show( "safmin", safmin, "%20.12e", "" ); //FLA_Obj_show( "prec", prec, "%20.12e", "" ); // rmin = sqrt( safmin ) / prec; FLA_Copy( safmin, rmin ); FLA_Sqrt( rmin ); FLA_Inv_scal( prec, rmin ); // rmax = 1 / rmin; FLA_Copy( rmin, rmax ); FLA_Invert( FLA_NO_CONJUGATE, rmax ); //FLA_Obj_show( "rmin", rmin, "%20.12e", "" ); //FLA_Obj_show( "rmax", rmax, "%20.12e", "" ); // Find the maximum absolute value of A. FLA_Max_abs_value( A, norm ); if ( FLA_Obj_gt( norm, FLA_ZERO ) && FLA_Obj_lt( norm, rmin ) ) { // sigma = rmin / norm; FLA_Copy( rmin, sigma ); FLA_Inv_scal( norm, sigma ); } else if ( FLA_Obj_gt( norm, rmax ) ) { // sigma = rmax / norm; FLA_Copy( rmax, sigma ); FLA_Inv_scal( norm, sigma ); } else { // sigma = 1.0; FLA_Copy( FLA_ONE, sigma ); } FLA_Obj_free( &norm ); FLA_Obj_free( &prec ); FLA_Obj_free( &safmin ); FLA_Obj_free( &rmin ); FLA_Obj_free( &rmax ); return FLA_SUCCESS; }
int main( int argc, char** argv ) { FLA_Datatype datatype = TESTTYPE; FLA_Obj A, A_flame, A_lapack, C; int m; FLA_Error init_result; FLA_Obj TU, TV, U_flame, V_flame, d_flame, e_flame, B_flame; FLA_Obj tauq, taup, d_lapack, e_lapack, U_lapack, V_lapack, W, B_lapack; testtype *buff_tauq, *buff_taup, *buff_d_lapack, *buff_e_lapack, *buff_W, *buff_A_lapack, *buff_U_lapack, *buff_V_lapack; int lwork, info, is_flame; if ( argc == 3 ) { m = atoi(argv[1]); is_flame = atoi(argv[2]); } else { fprintf(stderr, " \n"); fprintf(stderr, "Usage: %s m is_flame\n", argv[0]); fprintf(stderr, " m : matrix length\n"); fprintf(stderr, " is_flame : 1 yes, 0 no\n"); fprintf(stderr, " \n"); return -1; } if ( m == 0 ) return 0; FLA_Init_safe( &init_result ); fprintf( stdout, "lapack2flame: %d x %d: \n", m, m); FLA_Obj_create( datatype, m, m, 0, 0, &A ); FLA_Random_matrix( A ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_flame ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_lapack ); FLA_Obj_create( datatype, m, m, 0, 0, &C ); FLA_Random_matrix( C ); if ( is_flame ) { fprintf( stdout, " flame executed\n"); FLA_Bidiag_UT_create_T( A_flame, &TU, &TV ); FLA_Bidiag_UT( A_flame, TU, TV ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A_flame, &U_flame ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A_flame, &V_flame ); FLA_Bidiag_UT_form_U( U_flame, TU, U_flame ); FLA_Bidiag_UT_form_V( V_flame, TV, V_flame ); FLA_Obj_create( datatype, m, 1, 0, 0, &d_flame ); FLA_Obj_create( datatype, m - 1, 1, 0, 0, &e_flame ); FLA_Bidiag_UT_extract_diagonals( A_flame, d_flame, e_flame ); FLA_Obj_create( datatype, m, m, 0, 0, &B_flame ); FLA_Set( FLA_ZERO, B_flame ); { FLA_Obj BTL, BTR, BBL, BBR; FLA_Part_2x2( B_flame, &BTL, &BTR, &BBL, &BBR, 1,1, FLA_BL ); FLA_Set_diagonal_matrix( d_flame, B_flame ); FLA_Set_diagonal_matrix( e_flame, BTR ); } if (1) { fprintf( stdout, " - FLAME ----------\n"); FLA_Obj_fshow( stdout, " - Given A - ", A, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - A - ", A_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - U - ", U_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - V - ", V_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - d - ", d_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - e - ", e_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - B - ", B_flame, "% 6.4e", "------"); } } else { fprintf( stdout, " lapack executed\n"); FLA_Obj_create( datatype, m, 1, 0, 0, &tauq ); FLA_Obj_create( datatype, m, 1, 0, 0, &taup ); FLA_Obj_create( datatype, m, 1, 0, 0, &d_lapack ); FLA_Obj_create( datatype, m - 1, 1, 0, 0, &e_lapack ); buff_A_lapack = (testtype*)FLA_Obj_buffer_at_view( A_lapack ); buff_tauq = (testtype*)FLA_Obj_buffer_at_view( tauq ); buff_taup = (testtype*)FLA_Obj_buffer_at_view( taup ); buff_d_lapack = (testtype*)FLA_Obj_buffer_at_view( d_lapack ); buff_e_lapack = (testtype*)FLA_Obj_buffer_at_view( e_lapack ); lwork = 32*m; FLA_Obj_create( datatype, lwork, 1, 0, 0, &W ); buff_W = (testtype*)FLA_Obj_buffer_at_view( W ); sgebrd_( &m, &m, buff_A_lapack, &m, buff_d_lapack, buff_e_lapack, buff_tauq, buff_taup, buff_W, &lwork, &info ); FLA_Obj_create( datatype, m, m, 0, 0, &U_lapack ); FLA_Obj_create( datatype, m, m, 0, 0, &V_lapack ); FLA_Copy( A_lapack, U_lapack ); FLA_Copy( A_lapack, V_lapack ); buff_U_lapack = (testtype*)FLA_Obj_buffer_at_view( U_lapack ); buff_V_lapack = (testtype*)FLA_Obj_buffer_at_view( V_lapack ); sorgbr_( "Q", &m, &m, &m, buff_U_lapack, &m, buff_tauq, buff_W, &lwork, &info ); sorgbr_( "P", &m, &m, &m, buff_V_lapack, &m, buff_taup, buff_W, &lwork, &info ); FLA_Obj_create( datatype, m, m, 0, 0, &B_lapack ); FLA_Set( FLA_ZERO, B_lapack ); { FLA_Obj BTL, BTR, BBL, BBR; FLA_Part_2x2( B_lapack, &BTL, &BTR, &BBL, &BBR, 1,1, FLA_BL ); FLA_Set_diagonal_matrix( d_lapack, B_lapack ); FLA_Set_diagonal_matrix( e_lapack, BTR ); } FLA_Obj_free( &W ); if (1) { fprintf( stdout, " - LAPACK ----------\n"); FLA_Obj_fshow( stdout, " - Given A - ", A, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - A - ", A_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - U - ", U_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - V - ", V_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - d - ", d_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - e - ", e_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - B - ", B_lapack, "% 6.4e", "------"); } } { testtype dummy; int zero = 0, one = 1; FLA_Obj D_lapack; FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &D_lapack ); FLA_Set( FLA_ZERO, D_lapack ); if ( is_flame ) { buff_d_lapack = (testtype*)FLA_Obj_buffer_at_view( d_flame ); buff_e_lapack = (testtype*)FLA_Obj_buffer_at_view( e_flame ); buff_U_lapack = (testtype*)FLA_Obj_buffer_at_view( U_flame ); buff_V_lapack = (testtype*)FLA_Obj_buffer_at_view( V_flame ); } FLA_Obj_create( datatype, 4*m, 1, 0, 0, &W ); buff_W = (testtype*)FLA_Obj_buffer_at_view( W ); sbdsqr_( "U", &m, &m, &m, &zero, buff_d_lapack, buff_e_lapack, buff_V_lapack, &m, buff_U_lapack, &m, &dummy, &one, buff_W, &info ); FLA_Obj_free( &W ); if (info != 0) printf( " Error info = %d\n", info ); if ( is_flame ) FLA_Set_diagonal_matrix( d_flame, D_lapack ); else FLA_Set_diagonal_matrix( d_lapack, D_lapack ); if ( is_flame ) { fprintf( stdout, " - FLAME ----------\n"); FLA_Obj_fshow( stdout, " - U - ", U_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - V - ", V_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - d - ", d_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - e - ", e_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - D - ", D_lapack, "% 6.4e", "------"); } else { fprintf( stdout, " - LAPACK ----------\n"); FLA_Obj_fshow( stdout, " - U - ", U_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - V - ", V_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - d - ", d_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - e - ", e_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - D - ", D_lapack, "% 6.4e", "------"); } FLA_Obj_free( &D_lapack ); } if ( is_flame ) { FLA_Obj_free( &TU ); FLA_Obj_free( &TV ); FLA_Obj_free( &U_flame ); FLA_Obj_free( &V_flame ); FLA_Obj_free( &d_flame ); FLA_Obj_free( &e_flame ); FLA_Obj_free( &B_flame ); } else { FLA_Obj_free( &tauq ); FLA_Obj_free( &taup ); FLA_Obj_free( &d_lapack ); FLA_Obj_free( &e_lapack ); FLA_Obj_free( &U_lapack ); FLA_Obj_free( &V_lapack ); FLA_Obj_free( &B_lapack ); } FLA_Obj_free( &A ); FLA_Obj_free( &A_flame ); FLA_Obj_free( &A_lapack ); FLA_Obj_free( &C ); FLA_Finalize_safe( init_result ); }
FLA_Error FLA_Hess_UT_step_unb_var2( FLA_Obj A, FLA_Obj T ) { FLA_Obj ATL, ATR, A00, a01, A02, ABL, ABR, a10t, alpha11, a12t, A20, a21, A22; FLA_Obj TTL, TTR, T00, t01, T02, TBL, TBR, t10t, tau11, t12t, T20, t21, T22; FLA_Obj yT, y0, yB, psi1, y2; FLA_Obj zT, z0, zB, zeta1, z2; FLA_Obj y, z; FLA_Obj inv_tau11; FLA_Obj minus_inv_tau11; FLA_Obj first_elem; FLA_Obj beta; FLA_Obj conj_beta; FLA_Obj dot_product; FLA_Obj a21_t, a21_b; FLA_Datatype datatype_A; dim_t m_A; dim_t b_alg; b_alg = FLA_Obj_length( T ); datatype_A = FLA_Obj_datatype( A ); m_A = FLA_Obj_length( A ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &inv_tau11 ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &minus_inv_tau11 ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &first_elem ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &beta ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &conj_beta ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &dot_product ); FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &y ); FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z ); FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_TL ); FLA_Part_2x2( T, &TTL, &TTR, &TBL, &TBR, 0, 0, FLA_TL ); FLA_Part_2x1( y, &yT, &yB, 0, FLA_TOP ); FLA_Part_2x1( z, &zT, &zB, 0, FLA_TOP ); while ( FLA_Obj_length( ATL ) < b_alg ) { FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &a01, &A02, /* ************* */ /* ************************** */ &a10t, /**/ &alpha11, &a12t, ABL, /**/ ABR, &A20, /**/ &a21, &A22, 1, 1, FLA_BR ); FLA_Repart_2x2_to_3x3( TTL, /**/ TTR, &T00, /**/ &t01, &T02, /* ************* */ /* ************************** */ &t10t, /**/ &tau11, &t12t, TBL, /**/ TBR, &T20, /**/ &t21, &T22, 1, 1, FLA_BR ); FLA_Repart_2x1_to_3x1( yT, &y0, /* ** */ /* **** */ &psi1, yB, &y2, 1, FLA_BOTTOM ); FLA_Repart_2x1_to_3x1( zT, &z0, /* ** */ /* ***** */ &zeta1, zB, &z2, 1, FLA_BOTTOM ); /*------------------------------------------------------------*/ if ( FLA_Obj_length( A22 ) > 0 ) { FLA_Part_2x1( a21, &a21_t, &a21_b, 1, FLA_TOP ); // [ u21, tau11, a21 ] = House( a21 ); FLA_Househ2_UT( FLA_LEFT, a21_t, a21_b, tau11 ); // inv_tau11 = 1 / tau11; // minus_inv_tau11 = -1 / tau11; FLA_Set( FLA_ONE, inv_tau11 ); FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 ); FLA_Copy( inv_tau11, minus_inv_tau11 ); FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 ); // Save first element of a21_t and set it to one so we can use a21 as // u21 in subsequent computations. We will restore a21_t later on. FLA_Copy( a21_t, first_elem ); FLA_Set( FLA_ONE, a21_t ); // y21 = A22' * u21; FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, y2 ); // z21 = A22 * u21; FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, z2 ); // beta = u21' * z21 / 2; // conj_beta = conj(beta); FLA_Dotc( FLA_CONJUGATE, a21, z2, beta ); FLA_Inv_scal( FLA_TWO, beta ); FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta ); // y21' = ( y21' - beta / tau * u21' ) / tau; // y21 = ( y21 - conj(beta) / tau * u21 ) / tau; FLA_Scal( minus_inv_tau11, conj_beta ); FLA_Axpy( conj_beta, a21, y2 ); FLA_Scal( inv_tau11, y2 ); // z21 = ( z21 - beta / tau * u21 ) / tau; FLA_Scal( minus_inv_tau11, beta ); FLA_Axpy( beta, a21, z2 ); FLA_Scal( inv_tau11, z2 ); // a12t = a12t * ( I - u21 * u21' / tau ); // = a12t - ( a12t * u21 ) * u21' / tau; FLA_Dot( a12t, a21, dot_product ); FLA_Scal( minus_inv_tau11, dot_product ); FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t ); // A02 = A02 * ( I - u21 * u21' / tau ); // = A02 - ( A02 * u21 ) * u21' / tau; FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, y0 ); FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, y0, a21, A02 ); // A22 = A22 - u21 * y21' - z21 * u21'; FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, a21, y2, A22 ); FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z2, a21, A22 ); // t01 = U20' * u21; FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 ); // Restore first element of a21. FLA_Copy( first_elem, a21_t ); } /*------------------------------------------------------------*/ FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, a01, /**/ A02, a10t, alpha11, /**/ a12t, /* ************** */ /* ************************ */ &ABL, /**/ &ABR, A20, a21, /**/ A22, FLA_TL ); FLA_Cont_with_3x3_to_2x2( &TTL, /**/ &TTR, T00, t01, /**/ T02, t10t, tau11, /**/ t12t, /* ************** */ /* ************************ */ &TBL, /**/ &TBR, T20, t21, /**/ T22, FLA_TL ); FLA_Cont_with_3x1_to_2x1( &yT, y0, psi1, /* ** */ /* **** */ &yB, y2, FLA_TOP ); FLA_Cont_with_3x1_to_2x1( &zT, z0, zeta1, /* ** */ /* ***** */ &zB, z2, FLA_TOP ); } FLA_Obj_free( &inv_tau11 ); FLA_Obj_free( &minus_inv_tau11 ); FLA_Obj_free( &first_elem ); FLA_Obj_free( &beta ); FLA_Obj_free( &conj_beta ); FLA_Obj_free( &dot_product ); FLA_Obj_free( &y ); FLA_Obj_free( &z ); return FLA_SUCCESS; }
void time_Hevd_lv_components( int variant, int type, int n_repeats, int m, int n_iter_max, int k_accum, int b_alg, FLA_Obj A, FLA_Obj l, double* dtime, double* diff1, double* diff2, double* gflops, double* dtime_tred, double* gflops_tred, double* dtime_tevd, double* gflops_tevd, double* dtime_appq, double* gflops_appq, int* k_perf ) { int i; double k; double dtime_save = 1.0e9; double dtime_tred_save = 1.0e9; double dtime_tevd_save = 1.0e9; double dtime_appq_save = 1.0e9; double flops_tred; double flops_tevd; double flops_appq; double mult_tred; double mult_tevd; double mult_appq; FLA_Obj A_save, Z; if ( ( variant == -3 ) || ( variant == -4 ) || ( variant == -5 ) || //( variant == 0 ) || //( variant == -1 ) || //( variant == -2 ) || //( variant == 1 ) || //( variant == 2 ) || //( variant == 3 ) || //( variant == 4 ) || FALSE ) { *gflops = 0.0; *dtime = 0.0; *diff1 = 0.0; *diff2 = 0.0; *dtime_tred = 0.0; *dtime_tevd = 0.0; *dtime_appq = 0.0; *gflops_tred = 0.0; *gflops_tevd = 0.0; *gflops_appq = 0.0; *k_perf = 0; return; } FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Z ); FLA_Copy_external( A, A_save ); for ( i = 0 ; i < n_repeats; i++ ){ FLA_Copy_external( A_save, A ); *dtime = FLA_Clock(); switch( variant ){ case -3: { *k_perf = 0; REF_Hevd_lv( A, l, dtime_tred, dtime_tevd, dtime_appq ); break; } case -4: { *k_perf = 0; REF_Hevdd_lv( A, l, dtime_tred, dtime_tevd, dtime_appq ); break; } case -5: { *k_perf = 0; REF_Hevdr_lv( A, l, Z, dtime_tred, dtime_tevd, dtime_appq ); break; } case 0: { *k_perf = 0; REF_Hevd_lv_components( A, l, dtime_tred, dtime_tevd, dtime_appq ); break; } case -1: { *k_perf = 0; REF_Hevdd_lv_components( A, l, dtime_tred, dtime_tevd, dtime_appq ); break; } case -2: { *k_perf = 0; REF_Hevdr_lv_components( A, l, Z, dtime_tred, dtime_tevd, dtime_appq ); break; } // Time variant 1 case 1: { *k_perf = FLA_Hevd_lv_var1_components( n_iter_max, A, l, k_accum, b_alg, dtime_tred, dtime_tevd, dtime_appq ); break; } // Time variant 2 case 2: { *k_perf = FLA_Hevd_lv_var2_components( n_iter_max, A, l, k_accum, b_alg, dtime_tred, dtime_tevd, dtime_appq ); break; } } *dtime = FLA_Clock() - *dtime; if ( *dtime < dtime_save ) { dtime_save = *dtime; dtime_tred_save = *dtime_tred; dtime_tevd_save = *dtime_tevd; dtime_appq_save = *dtime_appq; } } *dtime = dtime_save; *dtime_tred = dtime_tred_save; *dtime_tevd = dtime_tevd_save; *dtime_appq = dtime_appq_save; //if ( variant == -3 || variant == 0 ) //printf( "\ndtime is %9.3e\n", *dtime ); { FLA_Obj V, A_rev_evd, norm, eye; if ( variant == -2 || variant == -5 ) FLA_Copy( Z, A ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &V ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_rev_evd ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &eye ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, l, A ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, V, FLA_ZERO, A_rev_evd ); FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A_rev_evd ); //FLA_Obj_show( "A_rev_evd", A_rev_evd, "%9.2e + %9.2e ", "" ); FLA_Axpy( FLA_MINUS_ONE, A_save, A_rev_evd ); FLA_Norm_frob( A_rev_evd, norm ); FLA_Obj_extract_real_scalar( norm, diff1 ); FLA_Set_to_identity( eye ); FLA_Copy( V, A_rev_evd ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, V, A_rev_evd, FLA_MINUS_ONE, eye ); FLA_Norm_frob( eye, norm ); FLA_Obj_extract_real_scalar( norm, diff2 ); FLA_Obj_free( &V ); FLA_Obj_free( &A_rev_evd ); FLA_Obj_free( &eye ); FLA_Obj_free( &norm ); } k = 2.00; flops_tred = ( ( 4.0 / 3.0 ) * m * m * m ); flops_tevd = ( 4.5 * k * m * m + 3.0 * k * m * m * m ); if ( variant == -1 || variant == -2 || variant == -4 || variant == -5 ) flops_appq = ( 2.0 * m * m * m ); else flops_appq = ( 4.0 / 3.0 * m * m * m ); /* if ( FLA_Obj_is_complex( A ) ) { *gflops = ( 4.0 * flops_tred + 2.0 * flops_tevd + 4.0 * flops_appq ) / *dtime / 1e9; *gflops_tred = ( 4.0 * flops_tred ) / *dtime_tred / 1e9; *gflops_tevd = ( 2.0 * flops_tevd ) / *dtime_tevd / 1e9; *gflops_appq = ( 4.0 * flops_appq ) / *dtime_appq / 1e9; } else { *gflops = ( 1.0 * flops_tred + 1.0 * flops_tevd + 1.0 * flops_appq ) / *dtime / 1e9; *gflops_tred = ( 1.0 * flops_tred ) / *dtime_tred / 1e9; *gflops_tevd = ( 1.0 * flops_tevd ) / *dtime_tevd / 1e9; *gflops_appq = ( 1.0 * flops_appq ) / *dtime_appq / 1e9; } */ if ( FLA_Obj_is_complex( A ) ) { mult_tred = 4.0; mult_tevd = 2.0; mult_appq = 4.0; } else { mult_tred = 1.0; mult_tevd = 1.0; mult_appq = 1.0; } *gflops = ( mult_tred * flops_tred + mult_tevd * flops_tevd + mult_appq * flops_appq ) / *dtime / 1e9; *gflops_tred = ( mult_tred * flops_tred ) / *dtime_tred / 1e9; *gflops_tevd = ( mult_tevd * flops_tevd ) / *dtime_tevd / 1e9; *gflops_appq = ( mult_appq * flops_appq ) / *dtime_appq / 1e9; FLA_Copy_external( A_save, A ); FLA_Obj_free( &A_save ); FLA_Obj_free( &Z ); }
int main(int argc, char *argv[]) { int m_input, m, p_first, p_last, p_inc, p, k_accum, b_alg, n_iter_max, variant, n_repeats, i, n_variants = 2; char *colors = "brkgmcbrkg"; char *ticks = "o+*xso+*xs"; char m_dim_desc[14]; char m_dim_tag[10]; double max_gflops=6.0; double dtime, gflops, diff1, diff2; FLA_Datatype datatype, dt_real; FLA_Obj A, l, Q, Ql, TT, r, d, e, A_orig, G, R, W2, de, alpha; FLA_Init(); fprintf( stdout, "%c number of repeats:", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c enter n_iter_max (per eigenvalue): ", '%' ); scanf( "%d", &n_iter_max ); fprintf( stdout, "%c %d\n", '%', n_iter_max ); fprintf( stdout, "%c enter number of sets of Givens rotations to accumulate:", '%' ); scanf( "%d", &k_accum ); fprintf( stdout, "%c %d\n", '%', k_accum ); fprintf( stdout, "%c enter blocking size for application of G:", '%' ); scanf( "%d", &b_alg ); fprintf( stdout, "%c %d\n", '%', b_alg ); fprintf( stdout, "%c enter problem size first, last, inc:", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter m (-1 means bind to problem size): ", '%' ); scanf( "%d", &m_input ); fprintf( stdout, "%c %d\n", '%', m_input ); fprintf( stdout, "\n" ); if ( m_input > 0 ) { sprintf( m_dim_desc, "m = %d", m_input ); sprintf( m_dim_tag, "m%dc", m_input); } else if( m_input < -1 ) { sprintf( m_dim_desc, "m = p/%d", -m_input ); sprintf( m_dim_tag, "m%dp", -m_input ); } else if( m_input == -1 ) { sprintf( m_dim_desc, "m = p" ); sprintf( m_dim_tag, "m%dp", 1 ); } for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { m = m_input; if( m < 0 ) m = p / abs(m_input); //datatype = FLA_FLOAT; //datatype = FLA_DOUBLE; //datatype = FLA_COMPLEX; datatype = FLA_DOUBLE_COMPLEX; FLA_Obj_create( datatype, m, m, 0, 0, &A ); FLA_Obj_create( datatype, m, m, 0, 0, &A_orig ); FLA_Obj_create( datatype, m, m, 0, 0, &Q ); FLA_Obj_create( datatype, m, m, 0, 0, &Ql ); FLA_Obj_create( datatype, m, 1, 0, 0, &r ); FLA_Obj_create( datatype, m, m, 0, 0, &W2 ); FLA_Obj_create( datatype, m-1, k_accum, 0, 0, &G ); dt_real = FLA_Obj_datatype_proj_to_real( A ); FLA_Obj_create( dt_real, m, 1, 0, 0, &l ); FLA_Obj_create( dt_real, m, 1, 0, 0, &d ); FLA_Obj_create( dt_real, m-1, 1, 0, 0, &e ); FLA_Obj_create( dt_real, m, m, 0, 0, &R ); FLA_Obj_create( dt_real, 1, 1, 0, 0, &alpha ); *FLA_DOUBLE_PTR( alpha ) = 1.0 / ( sqrt( sqrt( (double) m ) ) ); FLA_Random_unitary_matrix( Q ); //FLA_Fill_with_uniform_dist( FLA_ONE, l ); //FLA_Fill_with_inverse_dist( FLA_ONE, l ); FLA_Fill_with_geometric_dist( alpha, l ); { FLA_Copy( Q, Ql ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, l, Ql ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, Ql, Q, FLA_ZERO, A ); FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A ); FLA_Copy( A, A_orig ); } FLA_Set( FLA_ZERO, l ); FLA_Set( FLA_ZERO, Q ); FLA_Tridiag_UT_create_T( A, &TT ); FLA_Tridiag_UT( FLA_LOWER_TRIANGULAR, A, TT ); FLA_Tridiag_UT_realify( FLA_LOWER_TRIANGULAR, A, r ); FLA_Tridiag_UT_extract_diagonals( FLA_LOWER_TRIANGULAR, A, d, e ); FLA_Tridiag_UT_form_Q( FLA_LOWER_TRIANGULAR, A, TT ); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, r, A ); FLA_Obj_free( &TT ); time_Tevd_v( 0, FLA_ALG_REFERENCE, n_repeats, m, k_accum, b_alg, n_iter_max, A_orig, d, e, G, R, W2, A, l, &dtime, &diff1, &diff2, &gflops ); fprintf( stdout, "data_REFq( %d, 1:3 ) = [ %d %6.3lf %9.2e %6.2le %6.2le ]; \n", i, p, gflops, dtime, diff1, diff2 ); fflush( stdout ); for ( variant = 1; variant <= n_variants; variant++ ){ fprintf( stdout, "data_var%d( %d, 1:3 ) = [ %d ", variant, i, p ); fflush( stdout ); time_Tevd_v( variant, FLA_ALG_UNB_OPT, n_repeats, m, k_accum, b_alg, n_iter_max, A_orig, d, e, G, R, W2, A, l, &dtime, &diff1, &diff2, &gflops ); fprintf( stdout, "%6.3lf %9.2e %6.2le %6.2le ", gflops, dtime, diff1, diff2 ); fflush( stdout ); fprintf( stdout, "];\n" ); fflush( stdout ); } fprintf( stdout, "\n" ); FLA_Obj_free( &A ); FLA_Obj_free( &A_orig ); FLA_Obj_free( &Q ); FLA_Obj_free( &Ql ); FLA_Obj_free( &G ); FLA_Obj_free( &W2 ); FLA_Obj_free( &r ); FLA_Obj_free( &l ); FLA_Obj_free( &d ); FLA_Obj_free( &e ); FLA_Obj_free( &R ); FLA_Obj_free( &alpha ); } /* fprintf( stdout, "figure;\n" ); fprintf( stdout, "plot( data_REF( :,1 ), data_REF( :, 2 ), '-' ); \n" ); fprintf( stdout, "hold on;\n" ); for ( i = 1; i <= n_variants; i++ ) { fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 2 ), '%c:%c' ); \n", i, i, colors[ i-1 ], ticks[ i-1 ] ); fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 4 ), '%c-.%c' ); \n", i, i, colors[ i-1 ], ticks[ i-1 ] ); } fprintf( stdout, "legend( ... \n" ); fprintf( stdout, "'Reference', ... \n" ); for ( i = 1; i < n_variants; i++ ) fprintf( stdout, "'unb\\_var%d', 'blk\\_var%d', ... \n", i, i ); fprintf( stdout, "'unb\\_var%d', 'blk\\_var%d' ); \n", i, i ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops ); fprintf( stdout, "title( 'FLAME Hevd_lv performance (%s, %s)' );\n", m_dim_desc, n_dim_desc ); fprintf( stdout, "print -depsc tridiag_%s_%s.eps\n", m_dim_tag, n_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); */ FLA_Finalize( ); return 0; }
void time_Lyap_h( int variant, int type, int n_repeats, int m, int nb_alg, FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj C_ref, FLA_Obj scale, double *dtime, double *diff, double *gflops ) { int irep; double dtime_old = 1.0e9; FLA_Obj C_save, norm; fla_blocksize_t* bp; fla_lyap_t* cntl_lyap_unb; fla_lyap_t* cntl_lyap_opt; fla_lyap_t* cntl_lyap_blk; if ( type == FLA_ALG_UNB_OPT && variant > 4 ) { *gflops = 0.0; *diff = 0.0; return; } bp = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg ); cntl_lyap_unb = FLA_Cntl_lyap_obj_create( FLA_FLAT, FLA_UNB_VAR_OFFSET + variant, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL ); cntl_lyap_opt = FLA_Cntl_lyap_obj_create( FLA_FLAT, FLA_OPT_VAR_OFFSET + variant, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL ); cntl_lyap_blk = FLA_Cntl_lyap_obj_create( FLA_FLAT, FLA_BLK_VAR_OFFSET + variant, bp, fla_scal_cntl_blas, fla_lyap_cntl_leaf, fla_sylv_cntl, fla_gemm_cntl_blas, fla_gemm_cntl_blas, fla_hemm_cntl_blas, fla_her2k_cntl_blas ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_save ); FLA_Obj_create( FLA_Obj_datatype_proj_to_real( C ), 1, 1, 0, 0, &norm ); FLA_Copy_external( C, C_save ); for ( irep = 0 ; irep < n_repeats; irep++ ) { FLA_Copy_external( C_save, C ); *dtime = FLA_Clock(); switch( variant ) { case 0: REF_Lyap_h( isgn, A, C, scale ); break; case 1: { switch( type ) { case FLA_ALG_UNBLOCKED: FLA_Lyap_h_unb_var1( isgn, A, C ); break; case FLA_ALG_UNB_OPT: FLA_Lyap_h_opt_var1( isgn, A, C ); break; case FLA_ALG_BLOCKED: FLA_Lyap_h_blk_var1( isgn, A, C, scale, cntl_lyap_blk ); break; } break; } case 2: { switch( type ) { case FLA_ALG_UNBLOCKED: FLA_Lyap_h_unb_var2( isgn, A, C ); break; case FLA_ALG_UNB_OPT: FLA_Lyap_h_opt_var2( isgn, A, C ); break; case FLA_ALG_BLOCKED: FLA_Lyap_h_blk_var2( isgn, A, C, scale, cntl_lyap_blk ); break; } break; } case 3: { switch( type ) { case FLA_ALG_UNBLOCKED: FLA_Lyap_h_unb_var3( isgn, A, C ); break; case FLA_ALG_UNB_OPT: FLA_Lyap_h_opt_var3( isgn, A, C ); break; case FLA_ALG_BLOCKED: FLA_Lyap_h_blk_var3( isgn, A, C, scale, cntl_lyap_blk ); break; } break; } case 4: { switch( type ) { case FLA_ALG_UNBLOCKED: FLA_Lyap_h_unb_var4( isgn, A, C ); break; case FLA_ALG_UNB_OPT: FLA_Lyap_h_opt_var4( isgn, A, C ); break; case FLA_ALG_BLOCKED: FLA_Lyap_h_blk_var4( isgn, A, C, scale, cntl_lyap_blk ); break; } break; } } *dtime = FLA_Clock() - *dtime; dtime_old = min( *dtime, dtime_old ); } FLA_Blocksize_free( bp ); FLA_Cntl_obj_free( cntl_lyap_unb ); FLA_Cntl_obj_free( cntl_lyap_opt ); FLA_Cntl_obj_free( cntl_lyap_blk ); /* if ( variant == 0 ) { FLA_Copy_external( C, C_ref ); *diff = 0.0; } else { FLA_Hermitianize( FLA_UPPER_TRIANGULAR, C ); *diff = FLA_Max_elemwise_diff( C, C_ref ); } */ { FLA_Obj X, W; FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &X ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &W ); FLA_Copy( C, X ); FLA_Hermitianize( FLA_UPPER_TRIANGULAR, X ); FLA_Gemm( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, X, FLA_ZERO, W ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, X, A, FLA_ONE, W ); FLA_Scal( isgn, W ); /* if ( variant == 3 && type == FLA_ALG_UNBLOCKED ) { FLA_Obj_show( "W", W, "%10.3e + %10.3e ", "" ); FLA_Obj_show( "C_save", C_save, "%10.3e + %10.3e ", "" ); } */ FLA_Axpy( FLA_MINUS_ONE, C_save, W ); FLA_Norm1( W, norm ); FLA_Obj_extract_real_scalar( norm, diff ); FLA_Obj_free( &X ); FLA_Obj_free( &W ); } *gflops = ( 2.0 / 3.0 ) * ( m * m * m ) / dtime_old / 1e9; if ( FLA_Obj_is_complex( C ) ) *gflops *= 4.0; *dtime = dtime_old; FLA_Copy_external( C_save, C ); FLA_Obj_free( &C_save ); FLA_Obj_free( &norm ); }
FLA_Error FLA_Hevd_lv_var4_components( dim_t n_iter_max, FLA_Obj A, FLA_Obj l, dim_t k_accum, dim_t b_alg, double* dtime_tred, double* dtime_tevd, double* dtime_appq ) { FLA_Error r_val = FLA_SUCCESS; FLA_Uplo uplo = FLA_LOWER_TRIANGULAR; FLA_Datatype dt; FLA_Datatype dt_real; FLA_Datatype dt_comp; FLA_Obj T, r, d, e, G, R, W; FLA_Obj d0, e0, ls, pu; dim_t mn_A; dim_t n_G = k_accum; double dtime_temp; mn_A = FLA_Obj_length( A ); dt = FLA_Obj_datatype( A ); dt_real = FLA_Obj_datatype_proj_to_real( A ); dt_comp = FLA_Obj_datatype_proj_to_complex( A ); *dtime_tred = 1; *dtime_tevd = 1; *dtime_appq = 1; // If the matrix is a scalar, then the EVD is easy. if ( mn_A == 1 ) { FLA_Copy( A, l ); FLA_Set( FLA_ONE, A ); return FLA_SUCCESS; } // Create a matrix to hold block Householder transformations. FLA_Tridiag_UT_create_T( A, &T ); // Create a vector to hold the realifying scalars. FLA_Obj_create( dt, mn_A, 1, 0, 0, &r ); // Create vectors to hold the diagonal and sub-diagonal. FLA_Obj_create( dt_real, mn_A, 1, 0, 0, &d ); FLA_Obj_create( dt_real, mn_A-1, 1, 0, 0, &e ); FLA_Obj_create( dt_real, mn_A, 1, 0, 0, &d0 ); FLA_Obj_create( dt_real, mn_A-1, 1, 0, 0, &e0 ); FLA_Obj_create( dt_real, mn_A, 1, 0, 0, &pu ); FLA_Obj_create( FLA_INT, mn_A, 1, 0, 0, &ls ); FLA_Obj_create( dt_comp, mn_A-1, n_G, 0, 0, &G ); FLA_Obj_create( dt_real, mn_A, mn_A, 0, 0, &R ); FLA_Obj_create( dt, mn_A, mn_A, 0, 0, &W ); dtime_temp = FLA_Clock(); { // Reduce the matrix to tridiagonal form. FLA_Tridiag_UT( uplo, A, T ); } *dtime_tred = FLA_Clock() - dtime_temp; // Apply scalars to rotate elements on the sub-diagonal to the real domain. FLA_Tridiag_UT_realify( uplo, A, r ); // Extract the diagonal and sub-diagonal from A. FLA_Tridiag_UT_extract_diagonals( uplo, A, d, e ); dtime_temp = FLA_Clock(); { // Form Q, overwriting A. FLA_Tridiag_UT_form_Q( uplo, A, T ); } *dtime_appq = FLA_Clock() - dtime_temp; // Apply the scalars in r to Q. FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, r, A ); // Find the eigenvalues only. FLA_Copy( d, d0 ); FLA_Copy( e, e0 ); //r_val = FLA_Tevd_n_opt_var1( n_iter_max, d0, e0, G, A ); { int info; double* buff_d = FLA_DOUBLE_PTR( d0 ); double* buff_e = FLA_DOUBLE_PTR( e0 ); dsterf_( &mn_A, buff_d, buff_e, &info ); } FLA_Sort( FLA_FORWARD, d0 ); FLA_Set( FLA_ZERO, ls ); FLA_Set( FLA_ZERO, pu ); dtime_temp = FLA_Clock(); { // Perform an eigenvalue decomposition on the tridiagonal matrix. r_val = FLA_Tevd_v_opt_var4( n_iter_max, d, e, d0, ls, pu, G, R, W, A, b_alg ); } *dtime_tevd = FLA_Clock() - dtime_temp; // Copy the converged eigenvalues to the output vector. FLA_Copy( d, l ); // Sort the eigenvalues and eigenvectors in ascending order. FLA_Sort_evd( FLA_FORWARD, l, A ); FLA_Obj_free( &T ); FLA_Obj_free( &r ); FLA_Obj_free( &d ); FLA_Obj_free( &e ); FLA_Obj_free( &d0 ); FLA_Obj_free( &pu ); FLA_Obj_free( &e0 ); FLA_Obj_free( &ls ); FLA_Obj_free( &G ); FLA_Obj_free( &R ); FLA_Obj_free( &W ); return r_val; }
int main(int argc, char *argv[]) { int m_input, m, p_first, p_last, p_inc, p, nb_alg, variant, uplo, n_repeats, i, j, datatype, n_variants = 3; int blocksize[16]; char *colors = "brkgmcbrkg"; char *ticks = "o+*xso+*xs"; char m_dim_desc[14]; char m_dim_tag[10]; double max_gflops=6.0; double dtime, gflops, diff; FLA_Obj A, b, b_orig, norm; /* Initialize FLAME */ FLA_Init(); fprintf( stdout, "%c number of repeats:", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c Enter blocking size:", '%' ); scanf( "%d", &nb_alg ); fprintf( stdout, "%c %d\n", '%', nb_alg ); fprintf( stdout, "%c enter problem size first, last, inc:", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter m (-1 means bind to problem size): ", '%' ); scanf( "%d", &m_input ); fprintf( stdout, "%c %d\n", '%', m_input ); /* Delete all existing data structures */ fprintf( stdout, "\nclear all;\n\n" ); if ( m_input > 0 ) { sprintf( m_dim_desc, "m = %d", m_input ); sprintf( m_dim_tag, "m%dc", m_input); } else if( m_input < -1 ) { sprintf( m_dim_desc, "m = p/%d", -m_input ); sprintf( m_dim_tag, "m%dp", -m_input ); } else if( m_input == -1 ) { sprintf( m_dim_desc, "m = p" ); sprintf( m_dim_tag, "m%dp", 1 ); } // ---------------------- /* FLA_Obj R, C, R_conf; m = 3; datatype = FLA_DOUBLE_COMPLEX; FLA_Obj_create( datatype, m, m, 1, m, &C ); FLA_Obj_create( datatype, m, m, m, 1, &R ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, R, &R_conf ); //printf( "R rs, cs = %u, %u\n", FLA_Obj_row_stride( R ), FLA_Obj_col_stride( R ) ); //printf( "Rconf rs, cs = %u, %u\n", FLA_Obj_row_stride( R_conf ), FLA_Obj_col_stride( R_conf ) ); //FLA_Random_tri_matrix( FLA_LOWER_TRIANGULAR, FLA_UNIT_DIAG, C ); //FLA_Copy( C, R ); //FLA_Random_tri_matrix( FLA_LOWER_TRIANGULAR, FLA_UNIT_DIAG, R ); FLA_Random_spd_matrix( FLA_LOWER_TRIANGULAR, R ); FLA_Copy( R, C ); FLA_Obj_show( "C", C, "%10.3e + %10.3e ", "" ); FLA_Obj_show( "R", R, "%10.3e + %10.3e ", "" ); exit(1); */ // ---------------------- for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { m = m_input; if( m < 0 ) m = p / abs(m_input); //datatype = FLA_FLOAT; //datatype = FLA_DOUBLE; //datatype = FLA_COMPLEX; datatype = FLA_DOUBLE_COMPLEX; FLA_Obj_create( datatype, m, m, 0, 0, &A ); FLA_Obj_create( datatype, m, 1, 0, 0, &b ); FLA_Obj_create( datatype, m, 1, 0, 0, &b_orig ); /* FLA_Obj_create( datatype, m, m, m, 1, &A ); FLA_Obj_create( datatype, m, 1, 1, 1, &b ); FLA_Obj_create( datatype, m, 1, 1, 1, &b_orig ); */ if ( FLA_Obj_is_single_precision( A ) ) FLA_Obj_create( FLA_FLOAT, 1, 1, 0, 0, &norm ); else FLA_Obj_create( FLA_DOUBLE, 1, 1, 0, 0, &norm ); FLA_Random_spd_matrix( FLA_LOWER_TRIANGULAR, A ); FLA_Random_matrix( b ); FLA_Copy( b, b_orig ); /* time_Chol_l( 0, FLA_ALG_REFERENCE, n_repeats, p, nb_alg, A, b, b_orig, norm, &dtime, &diff, &gflops ); fprintf( stdout, "data_REF( %d, 1:2 ) = [ %d %6.3lf ]; \n", i, p, gflops ); fflush( stdout ); */ for ( variant = 1; variant <= n_variants; variant++ ){ fprintf( stdout, "data_var%d( %d, 1:5 ) = [ %d ", variant, i, p ); fflush( stdout ); time_Chol_l( variant, FLA_ALG_UNBLOCKED, n_repeats, p, nb_alg, A, b, b_orig, norm, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); time_Chol_l( variant, FLA_ALG_UNB_OPT, n_repeats, p, nb_alg, A, b, b_orig, norm, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); time_Chol_l( variant, FLA_ALG_BLOCKED, n_repeats, p, nb_alg, A, b, b_orig, norm, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); fprintf( stdout, " ]; \n" ); fflush( stdout ); } FLA_Obj_free( &A ); FLA_Obj_free( &b ); FLA_Obj_free( &b_orig ); FLA_Obj_free( &norm ); fprintf( stdout, "\n" ); } /* // Print the MATLAB commands to plot the data // Delete all existing figures fprintf( stdout, "figure;\n" ); // Plot the performance of the reference implementation fprintf( stdout, "plot( data_REF( :,1 ), data_REF( :, 2 ), '-' ); \n" ); // Indicate that you want to add to the existing plot fprintf( stdout, "hold on;\n" ); // Plot the data for the other numbers of threads for ( i = 1; i <= n_variants; i++ ){ fprintf( stdout, "plot( data_var%d( :,1 ), data_var%d( :, 2 ), '%c:%c' ); \n", i, i, colors[ i-1 ], ticks[ i-1 ] ); } fprintf( stdout, "legend( ... \n" ); fprintf( stdout, "'Reference', ... \n" ); for ( i = 1; i <= n_variants; i++ ) fprintf( stdout, "'FLAME var%d', ... \n", i ); fprintf( stdout, "'Location', 'SouthEast' ); \n" ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops ); fprintf( stdout, "title( 'FLAME chol\\_l performance (%s)' );\n", m_dim_desc ); fprintf( stdout, "print -depsc chol_l_%s.eps\n", m_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); */ FLA_Finalize( ); }