/*
 * FLA_Hess_UT_opt_var4
 *
 * Convenience wrapper around FLA_Hess_UT_step_opt_var4(): creates the two
 * temporary objects the step routine takes (each conformal to A, no
 * transposition), runs the step routine on (A, Y, Z, T), releases the
 * temporaries and hands back whatever the step routine returned.
 *
 * A - matrix object passed through to the step routine.
 * T - object passed through to the step routine as its last argument.
 */
FLA_Error FLA_Hess_UT_opt_var4( FLA_Obj A, FLA_Obj T )
{
  FLA_Obj   Y_tmp;
  FLA_Obj   Z_tmp;
  FLA_Error status;

  /* Scratch storage with the same shape as A. */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Y_tmp );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Z_tmp );

  status = FLA_Hess_UT_step_opt_var4( A, Y_tmp, Z_tmp, T );

  FLA_Obj_free( &Y_tmp );
  FLA_Obj_free( &Z_tmp );

  return status;
}
/*
 * FLA_Tridiag_UT_l_ofu_var3
 *
 * Convenience wrapper around FLA_Tridiag_UT_l_step_ofu_var3(): creates the
 * single temporary object the step routine takes (conformal to A, no
 * transposition), runs the step routine on (A, Z, T), releases the
 * temporary and returns the step routine's result unchanged.
 */
FLA_Error FLA_Tridiag_UT_l_ofu_var3( FLA_Obj A, FLA_Obj T )
{
  FLA_Obj   Z_tmp;
  FLA_Error status;

  /* Scratch storage with the same shape as A. */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Z_tmp );

  status = FLA_Tridiag_UT_l_step_ofu_var3( A, Z_tmp, T );

  FLA_Obj_free( &Z_tmp );

  return status;
}
/*
 * FLA_Obj_create_copy_of
 *
 * Creates *obj_new conformal to obj_cur (under the given trans) and then
 * fills it with the contents of obj_cur via FLA_Copyt_external().
 *
 * When FLA_ENABLE_SCC is defined, the copy is performed only if
 * FLA_is_owner() reports true; the object is still created either way.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Obj_create_copy_of( FLA_Trans trans, FLA_Obj obj_cur, FLA_Obj *obj_new )
{
  /* Step 1: allocate the destination with a matching (possibly transposed) shape. */
  FLA_Obj_create_conf_to( trans, obj_cur, obj_new );

  /* Step 2: populate it.  Under SCC only the owner performs the copy. */
#ifdef FLA_ENABLE_SCC
  if ( FLA_is_owner() )
#endif
  {
    FLA_Copyt_external( trans, obj_cur, *obj_new );
  }

  return FLA_SUCCESS;
}
/*
 * FLA_Syrk_ln_omp1t_var5_fc
 *
 * Task-parallel symmetric rank-k style update built on the Intel
 * "taskq" OpenMP extension.  A is walked left to right in column panels
 * A1; for each panel a task is queued that
 *   1. creates a private object MyC conformal to C and zeroes it,
 *   2. calls FLA_Syrk() (lower triangular, no transpose) with A1 into MyC,
 *   3. adds MyC into the shared C via FLA_Axpy_sync_circular(), and
 *   4. frees MyC.
 *
 * A      - input matrix, partitioned by columns.
 * C      - shared output object that every task accumulates into.
 * nb_alg - overwritten at the top of every loop iteration with
 *          width(A) / omp_get_num_threads() + 1, so the value supplied by
 *          the caller is never read.
 *
 * Always returns FLA_SUCCESS.
 *
 * NOTE(review): the synchronisation needed for the concurrent updates of C
 * is presumably provided inside FLA_Axpy_sync_circular() -- confirm.
 */
FLA_Error FLA_Syrk_ln_omp1t_var5_fc( FLA_Obj A, FLA_Obj C, int nb_alg )
{
  FLA_Obj AL,    AR,       A0,  A1,  A2;

  /* Per-task accumulator; declared private on the task pragma below. */
  FLA_Obj MyC;

  int b;

  /* Start with an empty left part: nothing of A has been processed yet. */
  FLA_Part_1x2( A,    &AL,  &AR,      0, FLA_LEFT );

  #pragma intel omp parallel taskq
  {
  while ( FLA_Obj_width( AL ) < FLA_Obj_width( A ) ){

    /* Panel width derived from the thread count; replaces the argument. */
    nb_alg = FLA_Obj_width( A )/omp_get_num_threads() + 1;

    /* Never take more columns than remain in AR. */
    b = min( FLA_Obj_width( AR ), nb_alg );

    FLA_Repart_1x2_to_1x3( AL,  /**/ AR,        &A0, /**/ &A1, &A2,
                           b, FLA_RIGHT );

    /*------------------------------------------------------------*/

    /* A1 is captured by value when the task is queued, so the
       repartitioning done by later iterations does not affect it. */
    #pragma intel omp task captureprivate(A1) private(MyC)
    {
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &MyC );
    FLA_Obj_set_to_zero( MyC );

    /* MyC := A1 * A1' */
    FLA_Syrk( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_ONE, A1, FLA_ZERO, MyC );

    /* Add this task's contribution MyC into the shared C. */
    FLA_Axpy_sync_circular( FLA_ONE, MyC, C );

    FLA_Obj_free( &MyC );
    }

    /*------------------------------------------------------------*/

    /* Move A1 into the processed (left) part and continue. */
    FLA_Cont_with_1x3_to_1x2( &AL,  /**/ &AR,        A0, A1, /**/ A2,
                              FLA_LEFT );
  }
  }

  return FLA_SUCCESS;
}
/*
 * FLA_Syrk_ln_omp1t_var5
 *
 * Task-parallel symmetric rank-k style update built on the Intel
 * "taskq" OpenMP extension.  A is walked left to right in column panels
 * A1 whose width comes from FLA_Task_compute_blocksize(); for each panel a
 * task is queued that
 *   1. creates a private object MyC conformal to C and zeroes it,
 *   2. calls FLA_Syrk_external() (lower triangular, no transpose) with A1
 *      into MyC,
 *   3. adds MyC into the shared C via FLA_Axpy_sync_pipeline(), and
 *   4. frees MyC.
 *
 * A - input matrix, partitioned by columns.
 * C - shared output object that every task accumulates into.
 *
 * Always returns FLA_SUCCESS.
 *
 * NOTE(review): the synchronisation needed for the concurrent updates of C
 * is presumably provided inside FLA_Axpy_sync_pipeline() -- confirm.
 */
FLA_Error FLA_Syrk_ln_omp1t_var5( FLA_Obj A, FLA_Obj C )
{
  FLA_Obj AL,    AR,       A0,  A1,  A2;

  /* Per-task accumulator; declared private on the task pragma below. */
  FLA_Obj MyC;

  int b;

  /* Start with an empty left part: nothing of A has been processed yet. */
  FLA_Part_1x2( A,    &AL,  &AR,      0, FLA_LEFT );

  #pragma intel omp parallel taskq
  {
  while ( FLA_Obj_width( AL ) < FLA_Obj_width( A ) ){

    /* Width of the next column panel. */
    b = FLA_Task_compute_blocksize( 0, A, AL, FLA_LEFT );

    FLA_Repart_1x2_to_1x3( AL,  /**/ AR,        &A0, /**/ &A1, &A2,
                           b, FLA_RIGHT );

    /*------------------------------------------------------------*/

    /* A1 is captured by value when the task is queued, so the
       repartitioning done by later iterations does not affect it. */
    #pragma intel omp task captureprivate(A1) private(MyC)
    {
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &MyC );
    FLA_Obj_set_to_zero( MyC );

    /* MyC := A1 * A1' */
    FLA_Syrk_external( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_ONE, A1, FLA_ZERO, MyC );

    /* Add this task's contribution MyC into the shared C. */
    FLA_Axpy_sync_pipeline( FLA_ONE, MyC, C );

    FLA_Obj_free( &MyC );
    }

    /*------------------------------------------------------------*/

    /* Move A1 into the processed (left) part and continue. */
    FLA_Cont_with_1x3_to_1x2( &AL,  /**/ &AR,        A0, A1, /**/ A2,
                              FLA_LEFT );
  }
  }

  return FLA_SUCCESS;
}
/*
 * FLA_Trmvsx_external
 *
 * Triangular matrix-vector multiply that writes into a separate, scaled
 * output vector.  The input x is left untouched: the product is formed in
 * a temporary copy of x, y is scaled by beta, and alpha times the
 * temporary is added to y.  Assuming the usual BLAS meaning of the
 * Trmv / Scal / Axpy calls, the net effect is
 *
 *     y := beta * y + alpha * op( A ) * x
 *
 * Parameter checking (FLA_Trmvsx_check) runs only at full error-checking
 * level.  If A has a zero dimension the routine returns immediately and
 * y is not modified (it is not scaled by beta either).
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Trmvsx_external( FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y )
{
  FLA_Obj Ax;

  if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING )
  {
    FLA_Trmvsx_check( uplo, transa, diag, alpha, A, x, beta, y );
  }

  if ( !FLA_Obj_has_zero_dim( A ) )
  {
    /* Work on a private copy so the caller's x survives the in-place Trmv. */
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, x, &Ax );
    FLA_Copy_external( x, Ax );

    FLA_Trmv_external( uplo, transa, diag, A, Ax );

    /* Scale the destination first, then accumulate the scaled product. */
    FLA_Scal_external( beta, y );
    FLA_Axpy_external( alpha, Ax, y );

    FLA_Obj_free( &Ax );
  }

  return FLA_SUCCESS;
}
/*
 * time_Gemm_pp_nn
 *
 * Timing driver: runs the selected Gemm implementation nrepeats times on
 * a fresh copy of C and reports the best (minimum) wall-clock time.
 *
 * variant  - 0 selects the reference implementation (REF_Gemm); 1 selects
 *            variant 1, refined by type.  Any other value runs nothing.
 * type     - for variant 1: FLA_ALG_UNBLOCKED calls FLA_Gemm_pp_nn_var1(),
 *            FLA_ALG_BLOCKED calls REF_Gemm(); anything else prints
 *            "trouble".
 * nrepeats - number of timed repetitions.
 * n        - unused here.
 * nb_alg   - forwarded to FLA_Gemm_pp_nn_var1().
 * A, B, C  - operands; C is restored to its original contents on exit.
 * Cref     - for variant 0, receives the computed C; otherwise it is the
 *            reference that C is compared against.
 * dtime    - out: best time observed.
 * diff     - out: 0.0 for variant 0, else FLA_Max_elemwise_diff( C, Cref ).
 * mflops   - out: 2 * length(C) * width(C) * width(A) / time / 1e6.
 *
 * The best time starts at the 1.0e9 sentinel so that it holds a defined
 * value even when nrepeats <= 0 (previously it was read uninitialized in
 * that case).
 *
 * NOTE(review): REF_Gemm is passed `ONE` for alpha but `FLA_ONE` for beta,
 * and the FLA_ALG_BLOCKED branch times the reference routine -- both are
 * kept exactly as found; confirm they are intentional.
 */
void time_Gemm_pp_nn( int variant, int type, int nrepeats, int n, int nb_alg,
                      FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj Cref,
                      double *dtime, double *diff, double *mflops )
{
  int     irep;
  double  dtime_old = 1.0e9;
  FLA_Obj Cold;

  /* Keep a pristine copy of C so every repetition starts from the same data. */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &Cold );
  FLA_Copy_external( C, Cold );

  for ( irep = 0; irep < nrepeats; irep++ )
  {
    FLA_Copy_external( Cold, C );

    *dtime = FLA_Clock();

    switch ( variant )
    {
      case 0:
        /* Time reference implementation */
        REF_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, ONE, A, B, FLA_ONE, C );
        break;

      case 1:
      {
        /* Time variant 1 */
        switch ( type )
        {
          case FLA_ALG_UNBLOCKED:
            FLA_Gemm_pp_nn_var1( FLA_ONE, A, B, C, nb_alg );
            break;
          case FLA_ALG_BLOCKED:
            REF_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, ONE, A, B, FLA_ONE, C );
            break;
          default:
            printf("trouble\n");
        }
        break;
      }
    }

    /* Keep the fastest repetition. */
    *dtime    = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  if ( variant == 0 )
  {
    /* The reference run defines the expected result. */
    FLA_Copy_external( C, Cref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLA_Max_elemwise_diff( C, Cref );
  }

  *mflops = 2.0 *
            FLA_Obj_length( C ) *
            FLA_Obj_width( C ) *
            FLA_Obj_width( A ) /
            dtime_old /
            1000000;

  *dtime = dtime_old;

  /* Leave C as the caller supplied it. */
  FLA_Copy_external( Cold, C );

  FLA_Obj_free( &Cold );
}
/*
 * time_Copyt
 *
 * Timing driver for the Copyt operation.  Each repetition restores C from
 * a saved copy, runs one Copyt call selected by (type, param_combo) and
 * records the elapsed time; the minimum over all repetitions is reported.
 *
 * param_combo - 0: FLA_NO_TRANSPOSE, 1: FLA_TRANSPOSE,
 *               2: FLA_CONJ_NO_TRANSPOSE, 3: FLA_CONJ_TRANSPOSE.
 *               Any other value runs nothing.
 * type        - FLA_ALG_REFERENCE calls REF_Copyt(), FLA_ALG_FRONT calls
 *               FLA_Copyt(); any other type prints "trouble" (only when
 *               param_combo is one of 0..3).
 * nrepeats    - number of timed repetitions.
 * m, n        - dimensions used in the rate formula.
 * A, C        - source and destination; C is restored on exit.
 * C_ref       - receives C for the reference type, otherwise compared
 *               against C.
 * dtime       - out: best time observed.
 * diff        - out: 0.0 for the reference type, otherwise
 *               FLA_Max_elemwise_diff( C, C_ref ).
 * gflops      - out: 2 * m * n / time / 1e9, times 4 if A is complex.
 */
void time_Copyt( int param_combo, int type, int nrepeats, int m, int n,
                 FLA_Obj A, FLA_Obj C, FLA_Obj C_ref,
                 double *dtime, double *diff, double *gflops )
{
  double  best_time = 1.0e9;
  int     rep;
  FLA_Obj C_backup;

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_backup );
  FLA_Copy_external( C, C_backup );

  for ( rep = 0; rep < nrepeats; rep++ )
  {
    FLA_Copy_external( C_backup, C );

    *dtime = FLA_Clock();

    /* Dispatch on the implementation first, then on the transposition. */
    switch ( type )
    {
      case FLA_ALG_REFERENCE:
        switch ( param_combo )
        {
          case 0: REF_Copyt( FLA_NO_TRANSPOSE,      A, C ); break;
          case 1: REF_Copyt( FLA_TRANSPOSE,         A, C ); break;
          case 2: REF_Copyt( FLA_CONJ_NO_TRANSPOSE, A, C ); break;
          case 3: REF_Copyt( FLA_CONJ_TRANSPOSE,    A, C ); break;
        }
        break;

      case FLA_ALG_FRONT:
        switch ( param_combo )
        {
          case 0: FLA_Copyt( FLA_NO_TRANSPOSE,      A, C ); break;
          case 1: FLA_Copyt( FLA_TRANSPOSE,         A, C ); break;
          case 2: FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, A, C ); break;
          case 3: FLA_Copyt( FLA_CONJ_TRANSPOSE,    A, C ); break;
        }
        break;

      default:
        /* Unknown type: complain, but only for a recognised combination. */
        if ( param_combo >= 0 && param_combo <= 3 )
          printf("trouble\n");
        break;
    }

    *dtime    = FLA_Clock() - *dtime;
    best_time = min( *dtime, best_time );
  }

  if ( type != FLA_ALG_REFERENCE )
  {
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }
  else
  {
    /* The reference run defines the expected result. */
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }

  *gflops = 2.0 * m * n / best_time / 1.0e9;
  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 4.0;

  *dtime = best_time;

  /* Hand C back unchanged. */
  FLA_Copy_external( C_backup, C );
  FLA_Obj_free( &C_backup );
}
/*
 * time_Syrk_ln
 *
 * Timing driver for the reference Syrk (lower, no-transpose) routine.
 * Each repetition restores C from a saved copy and times one call; the
 * minimum time over all repetitions is reported.
 *
 * variant   - 0 times REF_Syrk_ln(); any other value prints "trouble".
 * type, n, nb_alg, B - unused here.
 * nrepeats  - number of timed repetitions.
 * A, C      - operands; C is restored to its original contents on exit.
 * Cref      - for variant 0, receives the computed C; otherwise compared
 *             against C.
 * dtime     - out: best time observed.
 * diff      - out: 0.0 for variant 0, else FLA_Max_elemwise_diff( C, Cref ).
 * gflops    - out: length(A)^2 * width(A) / time / 1e9.
 *
 * The best time starts at the 1.0e9 sentinel so that it holds a defined
 * value even when nrepeats <= 0 (previously it was read uninitialized in
 * that case).
 */
void time_Syrk_ln( int variant, int type, int nrepeats, int n, int nb_alg,
                   FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj Cref,
                   double *dtime, double *diff, double *gflops )
{
  int     irep;
  double  dtime_old = 1.0e9;
  FLA_Obj Cold;

  /* Keep a pristine copy of C so every repetition starts from the same data. */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &Cold );
  FLA_Copy_external( C, Cold );

  for ( irep = 0; irep < nrepeats; irep++ )
  {
    FLA_Copy_external( Cold, C );

    *dtime = FLA_Clock();

    switch ( variant )
    {
      case 0:
        /* Time reference implementation */
        REF_Syrk_ln( FLA_ONE, A, FLA_ONE, C );
        break;

      default:
        printf("trouble\n");
        break;
    }

    /* Keep the fastest repetition. */
    *dtime    = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  if ( variant == 0 )
  {
    /* The reference run defines the expected result. */
    FLA_Copy_external( C, Cref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLA_Max_elemwise_diff( C, Cref );
  }

  *gflops = 1.0 *
            FLA_Obj_length( A ) *
            FLA_Obj_length( A ) *
            FLA_Obj_width( A ) /
            dtime_old /
            1e9;

  *dtime = dtime_old;

  /* Leave C as the caller supplied it. */
  FLA_Copy_external( Cold, C );

  FLA_Obj_free( &Cold );
}
/*
 * Expands to the `case N:` arm of the variant switch in time_Her2k_ln():
 * FLA_ALG_UNBLOCKED runs FLA_Her2k_ln_unb_var<N>, FLA_ALG_BLOCKED runs
 * FLA_Her2k_ln_blk_var<N> with the variant control tree, and any other
 * type prints "trouble".  Uses the enclosing function's type, A, B, C and
 * her2k_var_cntl; undefined again right after the function.
 */
#define TIME_HER2K_LN_VARIANT_CASE( N )                                        \
      case N:                                                                  \
      {                                                                        \
        switch ( type )                                                        \
        {                                                                      \
          case FLA_ALG_UNBLOCKED:                                              \
            FLA_Her2k_ln_unb_var##N( FLA_ONE, A, B, FLA_ONE, C );              \
            break;                                                             \
          case FLA_ALG_BLOCKED:                                                \
            FLA_Her2k_ln_blk_var##N( FLA_ONE, A, B, FLA_ONE, C,                \
                                     her2k_var_cntl );                         \
            break;                                                             \
          default:                                                             \
            printf("trouble\n");                                               \
        }                                                                      \
        break;                                                                 \
      }

/*
 * time_Her2k_ln
 *
 * Timing driver for the Her2k (lower, no-transpose) variants.  Each
 * repetition restores C from a saved copy, runs the routine selected by
 * (variant, type) and records the elapsed time; the minimum over all
 * repetitions is reported.
 *
 * variant  - 0 times REF_Her2k(); 1..10 select variant N (see the macro
 *            above).  Any other value runs nothing.
 * type     - FLA_ALG_UNBLOCKED or FLA_ALG_BLOCKED for variants 1..10.
 * nrepeats - number of timed repetitions.
 * n        - unused here.
 * nb_alg   - block size used for all four entries of the blocksize object.
 * A, B, C  - operands; C is restored to its original contents on exit.
 * C_ref    - for variant 0, receives the computed C; otherwise compared
 *            against C.
 * dtime    - out: best time observed.
 * diff     - out: 0.0 for variant 0, else FLA_Max_elemwise_diff( C, C_ref ).
 * gflops   - out: 2 * length(C) * width(C) * width(A) / time / 1e9.
 */
void time_Her2k_ln( int variant, int type, int nrepeats, int n, int nb_alg,
                    FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref,
                    double *dtime, double *diff, double *gflops )
{
  double           best_time = 1.0e9;
  int              rep;
  FLA_Obj          C_backup;
  fla_blocksize_t* blksz;
  fla_gemm_t*      gemm_blas_cntl;
  fla_her2k_t*     her2k_blas_cntl;
  fla_her2k_t*     her2k_var_cntl;

  /* Control trees: BLAS leaves for gemm/her2k plus the variant under test. */
  blksz           = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
  gemm_blas_cntl  = FLA_Cntl_gemm_obj_create( FLA_FLAT,
                                              FLA_SUBPROBLEM,
                                              NULL,
                                              NULL );
  her2k_blas_cntl = FLA_Cntl_her2k_obj_create( FLA_FLAT,
                                               FLA_SUBPROBLEM,
                                               NULL,
                                               NULL,
                                               NULL,
                                               NULL );
  her2k_var_cntl  = FLA_Cntl_her2k_obj_create( FLA_FLAT,
                                               variant,
                                               blksz,
                                               her2k_blas_cntl,
                                               gemm_blas_cntl,
                                               gemm_blas_cntl );

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_backup );
  FLA_Copy_external( C, C_backup );

  for ( rep = 0; rep < nrepeats; rep++ )
  {
    FLA_Copy_external( C_backup, C );

    *dtime = FLA_Clock();

    switch ( variant )
    {
      case 0:
        /* Reference implementation. */
        REF_Her2k( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ONE, C );
        break;

      TIME_HER2K_LN_VARIANT_CASE( 1 )
      TIME_HER2K_LN_VARIANT_CASE( 2 )
      TIME_HER2K_LN_VARIANT_CASE( 3 )
      TIME_HER2K_LN_VARIANT_CASE( 4 )
      TIME_HER2K_LN_VARIANT_CASE( 5 )
      TIME_HER2K_LN_VARIANT_CASE( 6 )
      TIME_HER2K_LN_VARIANT_CASE( 7 )
      TIME_HER2K_LN_VARIANT_CASE( 8 )
      TIME_HER2K_LN_VARIANT_CASE( 9 )
      TIME_HER2K_LN_VARIANT_CASE( 10 )
    }

    *dtime    = FLA_Clock() - *dtime;
    best_time = min( *dtime, best_time );
  }

  FLA_Cntl_obj_free( her2k_var_cntl );
  FLA_Cntl_obj_free( her2k_blas_cntl );
  FLA_Cntl_obj_free( gemm_blas_cntl );
  FLA_Blocksize_free( blksz );

  if ( variant != 0 )
  {
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }
  else
  {
    /* The reference run defines the expected result. */
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }

  *gflops = 2.0 *
            FLA_Obj_length( C ) *
            FLA_Obj_width( C ) *
            FLA_Obj_width( A ) /
            best_time /
            1e9;

  *dtime = best_time;

  /* Hand C back unchanged. */
  FLA_Copy_external( C_backup, C );
  FLA_Obj_free( &C_backup );
}

#undef TIME_HER2K_LN_VARIANT_CASE
/*
 * time_Lyap_h
 *
 * Timing driver for the Lyap_h variants.  Each repetition restores C from
 * a saved copy, runs the routine selected by (variant, type) and records
 * the elapsed time; the minimum over all repetitions is reported.
 *
 * variant   - 0 times REF_Lyap_h(); 1..4 select FLA_Lyap_h_*_var<N>.
 *             Any other value runs nothing.
 * type      - FLA_ALG_UNBLOCKED, FLA_ALG_UNB_OPT or FLA_ALG_BLOCKED for
 *             variants 1..4; other types run nothing (no message).
 * n_repeats - number of timed repetitions.
 * m         - problem dimension used in the rate formula.
 * nb_alg    - block size used for all four entries of the blocksize object.
 * isgn      - sign scalar passed to the solver and used in the residual.
 * A, C      - operands; C is restored to its original contents on exit.
 * C_ref     - unused (the comparison against it is disabled).
 * scale     - passed to the reference and blocked routines.
 * dtime     - out: best time observed.
 * diff      - out: 1-norm of  isgn * ( A' * X + X * A ) - C_orig,  where X
 *             is the computed C after FLA_Hermitianize( upper ).
 * gflops    - out: (2/3) * m^3 / time / 1e9, times 4 if C is complex.
 *
 * For type == FLA_ALG_UNB_OPT with variant > 4 the routine returns early
 * with *gflops = *diff = 0.0; *dtime is left untouched in that case.
 *
 * The m^3 term is evaluated in double precision: as an int product it
 * overflows (undefined behaviour) once m exceeds 1290.
 *
 * NOTE(review): cntl_lyap_unb and cntl_lyap_opt are created and freed but
 * never passed to any routine here.
 */
void time_Lyap_h( int variant, int type, int n_repeats, int m, int nb_alg,
                  FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj C_ref, FLA_Obj scale,
                  double *dtime, double *diff, double *gflops )
{
  int              irep;
  double           dtime_old = 1.0e9;
  FLA_Obj          C_save, norm;
  fla_blocksize_t* bp;
  fla_lyap_t*      cntl_lyap_unb;
  fla_lyap_t*      cntl_lyap_opt;
  fla_lyap_t*      cntl_lyap_blk;

  /* Combinations that are not timed at all. */
  if ( type == FLA_ALG_UNB_OPT && variant > 4 )
  {
    *gflops = 0.0;
    *diff   = 0.0;
    return;
  }

  bp            = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
  cntl_lyap_unb = FLA_Cntl_lyap_obj_create( FLA_FLAT,
                                            FLA_UNB_VAR_OFFSET + variant,
                                            NULL,
                                            NULL,
                                            NULL,
                                            NULL,
                                            NULL,
                                            NULL,
                                            NULL,
                                            NULL );
  cntl_lyap_opt = FLA_Cntl_lyap_obj_create( FLA_FLAT,
                                            FLA_OPT_VAR_OFFSET + variant,
                                            NULL,
                                            NULL,
                                            NULL,
                                            NULL,
                                            NULL,
                                            NULL,
                                            NULL,
                                            NULL );
  cntl_lyap_blk = FLA_Cntl_lyap_obj_create( FLA_FLAT,
                                            FLA_BLK_VAR_OFFSET + variant,
                                            bp,
                                            fla_scal_cntl_blas,
                                            fla_lyap_cntl_leaf,
                                            fla_sylv_cntl,
                                            fla_gemm_cntl_blas,
                                            fla_gemm_cntl_blas,
                                            fla_hemm_cntl_blas,
                                            fla_her2k_cntl_blas );

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_save );
  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( C ), 1, 1, 0, 0, &norm );

  /* Keep a pristine copy of C so every repetition starts from the same data. */
  FLA_Copy_external( C, C_save );

  for ( irep = 0; irep < n_repeats; irep++ )
  {
    FLA_Copy_external( C_save, C );

    *dtime = FLA_Clock();

    switch ( variant )
    {
      case 0:
        REF_Lyap_h( isgn, A, C, scale );
        break;

      case 1:
      {
        switch ( type )
        {
          case FLA_ALG_UNBLOCKED:
            FLA_Lyap_h_unb_var1( isgn, A, C );
            break;
          case FLA_ALG_UNB_OPT:
            FLA_Lyap_h_opt_var1( isgn, A, C );
            break;
          case FLA_ALG_BLOCKED:
            FLA_Lyap_h_blk_var1( isgn, A, C, scale, cntl_lyap_blk );
            break;
        }
        break;
      }

      case 2:
      {
        switch ( type )
        {
          case FLA_ALG_UNBLOCKED:
            FLA_Lyap_h_unb_var2( isgn, A, C );
            break;
          case FLA_ALG_UNB_OPT:
            FLA_Lyap_h_opt_var2( isgn, A, C );
            break;
          case FLA_ALG_BLOCKED:
            FLA_Lyap_h_blk_var2( isgn, A, C, scale, cntl_lyap_blk );
            break;
        }
        break;
      }

      case 3:
      {
        switch ( type )
        {
          case FLA_ALG_UNBLOCKED:
            FLA_Lyap_h_unb_var3( isgn, A, C );
            break;
          case FLA_ALG_UNB_OPT:
            FLA_Lyap_h_opt_var3( isgn, A, C );
            break;
          case FLA_ALG_BLOCKED:
            FLA_Lyap_h_blk_var3( isgn, A, C, scale, cntl_lyap_blk );
            break;
        }
        break;
      }

      case 4:
      {
        switch ( type )
        {
          case FLA_ALG_UNBLOCKED:
            FLA_Lyap_h_unb_var4( isgn, A, C );
            break;
          case FLA_ALG_UNB_OPT:
            FLA_Lyap_h_opt_var4( isgn, A, C );
            break;
          case FLA_ALG_BLOCKED:
            FLA_Lyap_h_blk_var4( isgn, A, C, scale, cntl_lyap_blk );
            break;
        }
        break;
      }
    }

    /* Keep the fastest repetition. */
    *dtime    = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  FLA_Blocksize_free( bp );
  FLA_Cntl_obj_free( cntl_lyap_unb );
  FLA_Cntl_obj_free( cntl_lyap_opt );
  FLA_Cntl_obj_free( cntl_lyap_blk );

  /*
   * Accuracy check.  Instead of comparing against C_ref, form the residual
   * of the equation with the computed solution X (C made Hermitian from
   * its upper triangle):
   *     W := isgn * ( A' * X + X * A ) - C_save
   * and report its 1-norm.
   */
  {
    FLA_Obj X, W;

    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &X );
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &W );

    FLA_Copy( C, X );
    FLA_Hermitianize( FLA_UPPER_TRIANGULAR, X );

    /* W := A' * X, then W := W + X * A. */
    FLA_Gemm( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, X, FLA_ZERO, W );
    FLA_Gemm( FLA_NO_TRANSPOSE,   FLA_NO_TRANSPOSE, FLA_ONE, X, A, FLA_ONE,  W );
    FLA_Scal( isgn, W );

    /* W := W - C_save */
    FLA_Axpy( FLA_MINUS_ONE, C_save, W );

    FLA_Norm1( W, norm );
    FLA_Obj_extract_real_scalar( norm, diff );

    FLA_Obj_free( &X );
    FLA_Obj_free( &W );
  }

  /* Cube m in double precision; the int product overflows for m > 1290. */
  *gflops = ( 2.0 / 3.0 ) * ( (double) m * m * m ) /
            dtime_old / 1e9;

  if ( FLA_Obj_is_complex( C ) )
    *gflops *= 4.0;

  *dtime = dtime_old;

  /* Leave C as the caller supplied it. */
  FLA_Copy_external( C_save, C );

  FLA_Obj_free( &C_save );
  FLA_Obj_free( &norm );
}
/*
 * FLA_Gemm_nn_omp_var5
 *
 * Task-parallel matrix-matrix multiply (no-transpose / no-transpose) built
 * on the Intel "taskq" OpenMP extension.  A is walked left to right in
 * column panels A1 and B top to bottom in matching row panels B1; for each
 * pair a task is queued that
 *   1. creates a private object C_local conformal to C and zeroes it,
 *   2. calls FLA_Gemm_external() with alpha, A1 and B1 into C_local,
 *   3. adds C_local into the shared C via REF_Axpy_sync_circular(), and
 *   4. frees C_local.
 *
 * alpha - scalar passed to the Gemm call.
 * A, B  - input operands.
 * C     - shared output object that every task accumulates into.
 * cntl  - control object; only its blocksize is read here.
 *
 * Always returns FLA_SUCCESS.
 *
 * NOTE(review): the synchronisation needed for the concurrent updates of C
 * is presumably provided inside REF_Axpy_sync_circular() -- confirm.
 */
FLA_Error FLA_Gemm_nn_omp_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj C, fla_gemm_t* cntl )
{
  FLA_Obj AL,    AR,       A0,  A1,  A2;

  FLA_Obj BT,              B0,
          BB,              B1,
                           B2;

  /* Per-task accumulator; declared private on the task pragma below. */
  FLA_Obj C_local;

  int b;

  /* Start with empty processed parts of A (left) and B (top). */
  FLA_Part_1x2( A,    &AL,  &AR,      0, FLA_LEFT );

  FLA_Part_2x1( B,    &BT,
                      &BB,            0, FLA_TOP );

  #pragma intel omp parallel taskq
  {
  while ( FLA_Obj_width( AL ) < FLA_Obj_width( A ) ){

    /* Panel width for this step, taken from the control object's blocksize. */
    b = FLA_Determine_blocksize( A, AL, FLA_LEFT, FLA_Cntl_blocksize( cntl ) );
    //b = min( FLA_Obj_width( AR ), nb_alg );

    FLA_Repart_1x2_to_1x3( AL,  /**/ AR,        &A0, /**/ &A1, &A2,
                           b, FLA_RIGHT );

    FLA_Repart_2x1_to_3x1( BT,                &B0,
                        /* ** */            /* ** */
                                              &B1,
                           BB,                &B2,        b, FLA_BOTTOM );

    /*------------------------------------------------------------*/

    /* A1 and B1 are captured by value when the task is queued, so the
       repartitioning done by later iterations does not affect them. */
    #pragma intel omp task captureprivate(A1,B1) private(C_local)
    {
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_local );
    FLA_Obj_set_to_zero( C_local );

    /* C_local := alpha * A1 * B1 (C_local was just zeroed). */
    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                       alpha, A1, B1, FLA_ONE, C_local );

    /* Add this task's contribution into the shared C. */
    REF_Axpy_sync_circular( FLA_ONE, C_local, C );

    FLA_Obj_free( &C_local );
    }

    /*------------------------------------------------------------*/

    /* Move A1 / B1 into the processed parts and continue. */
    FLA_Cont_with_1x3_to_1x2( &AL,  /**/ &AR,        A0, A1, /**/ A2,
                              FLA_LEFT );

    FLA_Cont_with_3x1_to_2x1( &BT,                B0,
                                                  B1,
                            /* ** */           /* ** */
                              &BB,                B2,     FLA_TOP );
  }
  }

  return FLA_SUCCESS;
}
/*
 * time_QR_UT
 *
 * Timing driver for the QR_UT factorization.  Each repetition restores A
 * from a saved copy, runs the factorization and records the elapsed time;
 * the minimum over all repetitions is reported.  Afterwards the factors
 * are used to solve a system with right-hand side b and an accuracy
 * measure is computed.
 *
 * variant  - only 0 is handled; any other value runs nothing.
 * type     - FLA_ALG_REFERENCE calls REF_QR_UT( A, t ), FLA_ALG_FRONT
 *            calls FLA_QR_UT( A, T ); other types print "trouble".
 * nrepeats - number of timed repetitions.
 * m, n     - dimensions used in the rate formula; n also sizes the
 *            work vectors.
 * A        - matrix to factor; restored on exit.
 * A_ref, W - unused here.
 * t, T     - factor outputs of the reference / front-end routine.
 * b        - right-hand side; restored on exit.
 * b_orig   - overwritten with a copy of b and then with the residual.
 * dtime    - out: best time observed.
 * diff     - out: 2-norm of  A_orig' * ( b - A_orig * x ).
 * gflops   - out: ( 2 m n^2 - (2/3) n^3 ) / time / 1e9, times 4 if A is
 *            complex.
 */
void time_QR_UT( int variant, int type, int nrepeats, int m, int n,
                 FLA_Obj A, FLA_Obj A_ref, FLA_Obj t, FLA_Obj T, FLA_Obj W, FLA_Obj b, FLA_Obj b_orig,
                 double *dtime, double *diff, double *gflops )
{
  double  best_time = 1.0e9;
  int     rep;
  FLA_Obj A_backup, b_backup, nrm;

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_backup );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, b, &b_backup );

  /* The norm scalar is real, in the same precision as A. */
  FLA_Obj_create( FLA_Obj_is_single_precision( A ) ? FLA_FLOAT : FLA_DOUBLE,
                  1, 1, 0, 0, &nrm );

  FLA_Copy_external( A, A_backup );
  FLA_Copy_external( b, b_backup );

  for ( rep = 0; rep < nrepeats; rep++ )
  {
    FLA_Copy_external( A_backup, A );

    *dtime = FLA_Clock();

    if ( variant == 0 )
    {
      switch ( type )
      {
        case FLA_ALG_REFERENCE:
          REF_QR_UT( A, t );
          break;
        case FLA_ALG_FRONT:
          FLA_QR_UT( A, T );
          break;
        default:
          printf("trouble\n");
      }
    }

    *dtime    = FLA_Clock() - *dtime;
    best_time = min( *dtime, best_time );
  }

  if ( type == FLA_ALG_REFERENCE )
  {
    /* Reference path: apply Q' to b, then back-substitute with the top
       width(A) rows of the factored A. */
    FLA_Obj A_top, A_bot;
    FLA_Obj b_top, b_bot;
    FLA_Obj resid_proj;

    FLA_Obj_create( FLA_Obj_datatype( b ), n, 1, 0, 0, &resid_proj );

    FLA_Copy_external( b, b_orig );

    FLA_Apply_Q_blk_external( FLA_LEFT,
                              FLA_Obj_is_real( A ) ? FLA_TRANSPOSE
                                                   : FLA_CONJ_TRANSPOSE,
                              FLA_COLUMNWISE, A, t, b );

    FLA_Part_2x1( A,   &A_top,
                       &A_bot,    FLA_Obj_width( A ), FLA_TOP );
    FLA_Part_2x1( b,   &b_top,
                       &b_bot,    FLA_Obj_width( A ), FLA_TOP );

    FLA_Trsm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE,
                       FLA_NONUNIT_DIAG, FLA_ONE, A_top, b_top );

    /* b_orig := b_orig - A_orig * x;  resid_proj := A_orig' * b_orig. */
    FLA_Gemv_external( FLA_NO_TRANSPOSE,   FLA_MINUS_ONE, A_backup, b_top,  FLA_ONE,  b_orig );
    FLA_Gemv_external( FLA_CONJ_TRANSPOSE, FLA_ONE,       A_backup, b_orig, FLA_ZERO, resid_proj );

    FLA_Nrm2_external( resid_proj, nrm );
    FLA_Obj_extract_real_scalar( nrm, diff );

    FLA_Obj_free( &resid_proj );
  }
  else
  {
    /* Front-end path: let FLA_QR_UT_solve() produce the solution. */
    FLA_Obj soln, resid_proj;

    FLA_Obj_create( FLA_Obj_datatype( b ), n, 1, 0, 0, &resid_proj );
    FLA_Obj_create( FLA_Obj_datatype( b ), n, 1, 0, 0, &soln );

    FLA_Copy_external( b, b_orig );

    FLA_QR_UT_solve( A, T, b, soln );

    /* b_orig := b_orig - A_orig * x;  resid_proj := A_orig' * b_orig. */
    FLA_Gemv_external( FLA_NO_TRANSPOSE,   FLA_MINUS_ONE, A_backup, soln,   FLA_ONE,  b_orig );
    FLA_Gemv_external( FLA_CONJ_TRANSPOSE, FLA_ONE,       A_backup, b_orig, FLA_ZERO, resid_proj );

    FLA_Nrm2_external( resid_proj, nrm );
    FLA_Obj_extract_real_scalar( nrm, diff );

    FLA_Obj_free( &soln );
    FLA_Obj_free( &resid_proj );
  }

  *gflops = ( 2.0 * m * n * n - ( 2.0 / 3.0 ) * n * n * n ) /
            best_time / 1e9;
  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 4.0;

  *dtime = best_time;

  /* Hand A and b back unchanged. */
  FLA_Copy_external( A_backup, A );
  FLA_Copy_external( b_backup, b );

  FLA_Obj_free( &A_backup );
  FLA_Obj_free( &b_backup );
  FLA_Obj_free( &nrm );
}
/*
 * time_Sylv
 *
 * Timing driver for the Sylv operation.  Each repetition restores C from
 * a saved copy, runs one call selected by (param_combo, type) and records
 * the elapsed time; the minimum over all repetitions is reported.
 *
 * param_combo - transposition of (A, B):
 *                 0: (no-trans, no-trans)   1: (no-trans, trans)
 *                 2: (trans,    no-trans)   3: (trans,    trans)
 *               Any other value runs nothing.
 * type        - FLA_ALG_REFERENCE calls REF_Sylv(), FLA_ALG_FRONT calls
 *               FLA_Sylv(); other types print "trouble".
 * nrepeats    - number of timed repetitions.
 * m, n        - dimensions used in the rate formula.
 * isgn, A, B, C, scale - passed to the solver; C is restored on exit.
 * C_ref       - receives C for the reference type, otherwise compared
 *               against C.
 * dtime       - out: best time observed.
 * diff        - out: 0.0 for the reference type, otherwise
 *               FLA_Max_elemwise_diff( C, C_ref ).
 * gflops      - out: ( m^2 n + n^2 m ) / time / 1e9, times 4 if C is
 *               complex.
 *
 * The operation count is evaluated in double precision: as an int
 * expression it overflows (undefined behaviour) for sizes only slightly
 * above 1000.
 */
void time_Sylv( int param_combo, int type, int nrepeats, int m, int n,
                FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref, FLA_Obj scale,
                double *dtime, double *diff, double *gflops )
{
  int     irep;
  double  dtime_old = 1.0e9;
  FLA_Obj C_old;

  /* Keep a pristine copy of C so every repetition starts from the same data. */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_old );
  FLA_Copy_external( C, C_old );

  for ( irep = 0; irep < nrepeats; irep++ )
  {
    FLA_Copy_external( C_old, C );

    *dtime = FLA_Clock();

    switch ( param_combo )
    {
      case 0:
      {
        switch ( type )
        {
          case FLA_ALG_REFERENCE:
            REF_Sylv( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, isgn, A, B, C, scale );
            break;
          case FLA_ALG_FRONT:
            FLA_Sylv( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, isgn, A, B, C, scale );
            break;
          default:
            printf("trouble\n");
        }
        break;
      }

      case 1:
      {
        switch ( type )
        {
          case FLA_ALG_REFERENCE:
            REF_Sylv( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, isgn, A, B, C, scale );
            break;
          case FLA_ALG_FRONT:
            FLA_Sylv( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, isgn, A, B, C, scale );
            break;
          default:
            printf("trouble\n");
        }
        break;
      }

      case 2:
      {
        switch ( type )
        {
          case FLA_ALG_REFERENCE:
            REF_Sylv( FLA_TRANSPOSE, FLA_NO_TRANSPOSE, isgn, A, B, C, scale );
            break;
          case FLA_ALG_FRONT:
            FLA_Sylv( FLA_TRANSPOSE, FLA_NO_TRANSPOSE, isgn, A, B, C, scale );
            break;
          default:
            printf("trouble\n");
        }
        break;
      }

      case 3:
      {
        switch ( type )
        {
          case FLA_ALG_REFERENCE:
            REF_Sylv( FLA_TRANSPOSE, FLA_TRANSPOSE, isgn, A, B, C, scale );
            break;
          case FLA_ALG_FRONT:
            FLA_Sylv( FLA_TRANSPOSE, FLA_TRANSPOSE, isgn, A, B, C, scale );
            break;
          default:
            printf("trouble\n");
        }
        break;
      }
    }

    /* Keep the fastest repetition. */
    *dtime    = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  if ( type == FLA_ALG_REFERENCE )
  {
    /* The reference run defines the expected result. */
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }

  /* Promote to double before multiplying to avoid int overflow. */
  *gflops = ( (double) m * m * n + (double) n * n * m ) /
            dtime_old / 1e9;

  if ( FLA_Obj_is_complex( C ) )
    *gflops *= 4.0;

  *dtime = dtime_old;

  /* Leave C as the caller supplied it. */
  FLA_Copy_external( C_old, C );

  FLA_Obj_free( &C_old );
}
/*
 * time_Tevd_v
 *
 * Timing driver for the Tevd_v routines.  Each repetition restores A, G,
 * d and e from saved copies, runs the routine selected by (variant, type)
 * and records the elapsed time; the minimum over all repetitions is
 * reported.  Two accuracy measures are then computed from the results of
 * the last repetition.
 *
 * variant    - 0: REF_Tevd_v( d, e, A )
 *              1: FLA_Tevd_v_opt_var1() when type == FLA_ALG_UNB_OPT
 *              2: FLA_Tevd_v_opt_var2() when type == FLA_ALG_UNB_OPT
 *              Anything else (or another type) runs nothing.
 * n_repeats  - number of timed repetitions.
 * m          - problem dimension used in the rate formula.
 * k_accum    - unused here.
 * b_alg, n_iter_max - forwarded to the FLA_Tevd_v_opt_* routines.
 * A_orig     - matrix the reconstruction is compared against.
 * d, e, G, A - inputs/outputs of the solver; all restored on exit.
 * R, W       - extra arguments for variant 2.
 * l          - receives a copy of d, reordered by FLA_Sort_evd().
 * dtime      - out: best time observed.
 * diff1      - out: Frobenius norm of  tril( V * diag(l) * V' ) - A_orig.
 * diff2      - out: Frobenius norm of  V * V' - I.
 * gflops     - out: ( 4.5 k m^2 + f * 3 k m^3 ) / time / 1e9 with k = 2
 *              and f = 2 for complex A, 1 otherwise.
 */
void time_Tevd_v( int variant, int type, int n_repeats, int m, int k_accum, int b_alg, int n_iter_max,
                  FLA_Obj A_orig, FLA_Obj d, FLA_Obj e, FLA_Obj G, FLA_Obj R, FLA_Obj W, FLA_Obj A, FLA_Obj l,
                  double *dtime, double *diff1, double* diff2, double *gflops )
{
  double  best_time = 1.0e9;
  double  sweeps;
  double  cplx_factor;
  int     rep;
  FLA_Obj A_backup, G_backup, d_backup, e_backup;

  /* Hook for disabling individual (variant, type) combinations; no
     combination is disabled at present. */
  if ( FALSE )
  {
    *dtime  = 0.0;
    *gflops = 0.0;
    *diff1  = 0.0;
    *diff2  = 0.0;
    return;
  }

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_backup );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, G, &G_backup );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, d, &d_backup );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, e, &e_backup );

  FLA_Copy_external( A, A_backup );
  FLA_Copy_external( G, G_backup );
  FLA_Copy_external( d, d_backup );
  FLA_Copy_external( e, e_backup );

  for ( rep = 0; rep < n_repeats; rep++ )
  {
    FLA_Copy_external( A_backup, A );
    FLA_Copy_external( G_backup, G );
    FLA_Copy_external( d_backup, d );
    FLA_Copy_external( e_backup, e );

    *dtime = FLA_Clock();

    switch ( variant )
    {
      case 0:
        REF_Tevd_v( d, e, A );
        break;

      case 1:
        if ( type == FLA_ALG_UNB_OPT )
          FLA_Tevd_v_opt_var1( n_iter_max, d, e, G, A, b_alg );
        break;

      case 2:
        if ( type == FLA_ALG_UNB_OPT )
          FLA_Tevd_v_opt_var2( n_iter_max, d, e, G, R, W, A, b_alg );
        break;
    }

    *dtime    = FLA_Clock() - *dtime;
    best_time = min( *dtime, best_time );
  }

  /* Accuracy checks on the output of the final repetition. */
  {
    FLA_Obj evecs, rebuilt, nrm, orth_resid;

    FLA_Copy( d, l );
    FLA_Sort_evd( FLA_FORWARD, l, A );

    FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &evecs );
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &rebuilt );
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &orth_resid );
    FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &nrm );

    /* rebuilt := tril( ( A * diag(l) ) * evecs' ) - A_orig */
    FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, l, A );
    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, evecs, FLA_ZERO, rebuilt );
    FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, rebuilt );
    FLA_Axpy( FLA_MINUS_ONE, A_orig, rebuilt );

    FLA_Norm_frob( rebuilt, nrm );
    FLA_Obj_extract_real_scalar( nrm, diff1 );

    /* orth_resid := evecs * evecs' - I  (rebuilt is reused as a copy of evecs) */
    FLA_Set_to_identity( orth_resid );
    FLA_Copy( evecs, rebuilt );
    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, evecs, rebuilt, FLA_MINUS_ONE, orth_resid );

    FLA_Norm_frob( orth_resid, nrm );
    FLA_Obj_extract_real_scalar( nrm, diff2 );

    FLA_Obj_free( &evecs );
    FLA_Obj_free( &rebuilt );
    FLA_Obj_free( &orth_resid );
    FLA_Obj_free( &nrm );
  }

  sweeps      = 2.00;
  cplx_factor = FLA_Obj_is_complex( A ) ? 2.0 : 1.0;

  *gflops = ( ( 4.5 * sweeps * m * m ) +
              cplx_factor * ( 3.0 * sweeps * m * m * m ) ) /
            best_time / 1e9;

  *dtime = best_time;

  /* Hand every modified operand back unchanged. */
  FLA_Copy_external( A_backup, A );
  FLA_Copy_external( G_backup, G );
  FLA_Copy_external( d_backup, d );
  FLA_Copy_external( e_backup, e );

  FLA_Obj_free( &A_backup );
  FLA_Obj_free( &G_backup );
  FLA_Obj_free( &d_backup );
  FLA_Obj_free( &e_backup );
}
/*
 * time_Gemm_nn
 *
 * Timing driver for the OpenMP Gemm (no-transpose / no-transpose)
 * variants.  Each repetition restores C from a saved copy, runs the
 * routine selected by (variant, type) and records the elapsed time; the
 * minimum over all repetitions is reported.
 *
 * variant  - 0 times REF_Gemm().
 *            1..6 time FLA_Gemm_nn_omp_var<N> and require
 *            type == FLA_ALG_OPENMP_BVAR.
 *            13, 15, 31, 35, 51, 53 time the corresponding combined
 *            variant and require type == FLA_ALG_OPENMP_CVAR.
 *            A recognised variant with the wrong type prints "trouble";
 *            an unrecognised variant runs nothing.
 * nrepeats - number of timed repetitions.
 * n        - unused here.
 * nb_alg   - forwarded to the variant routines.
 * A, B, C  - operands; C is restored to its original contents on exit.
 * Cref     - for variant 0, receives the computed C; otherwise compared
 *            against C.
 * dtime    - out: best time observed.
 * diff     - out: 0.0 for variant 0, else FLA_Max_elemwise_diff( C, Cref ).
 * gflops   - out: 2 * length(C) * width(C) * width(A) / time / 1e9.
 *
 * The best time starts at the 1.0e9 sentinel so that it holds a defined
 * value even when nrepeats <= 0 (previously it was read uninitialized in
 * that case).
 */
void time_Gemm_nn( int variant, int type, int nrepeats, int n, int nb_alg,
                   FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj Cref,
                   double *dtime, double *diff, double *gflops )
{
  int     irep;
  double  dtime_old = 1.0e9;
  FLA_Obj Cold;

  /* Keep a pristine copy of C so every repetition starts from the same data. */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &Cold );
  FLA_Copy_external( C, Cold );

  for ( irep = 0; irep < nrepeats; irep++ )
  {
    FLA_Copy_external( Cold, C );

    *dtime = FLA_Clock();

    switch ( variant )
    {
      case 0:
        /* Time reference implementation */
        REF_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ONE, C );
        break;

      /* Basic variants 1..6. */
      case 1:
        if ( type == FLA_ALG_OPENMP_BVAR ) FLA_Gemm_nn_omp_var1( FLA_ONE, A, B, C, nb_alg );
        else                               printf("trouble\n");
        break;
      case 2:
        if ( type == FLA_ALG_OPENMP_BVAR ) FLA_Gemm_nn_omp_var2( FLA_ONE, A, B, C, nb_alg );
        else                               printf("trouble\n");
        break;
      case 3:
        if ( type == FLA_ALG_OPENMP_BVAR ) FLA_Gemm_nn_omp_var3( FLA_ONE, A, B, C, nb_alg );
        else                               printf("trouble\n");
        break;
      case 4:
        if ( type == FLA_ALG_OPENMP_BVAR ) FLA_Gemm_nn_omp_var4( FLA_ONE, A, B, C, nb_alg );
        else                               printf("trouble\n");
        break;
      case 5:
        if ( type == FLA_ALG_OPENMP_BVAR ) FLA_Gemm_nn_omp_var5( FLA_ONE, A, B, C, nb_alg );
        else                               printf("trouble\n");
        break;
      case 6:
        if ( type == FLA_ALG_OPENMP_BVAR ) FLA_Gemm_nn_omp_var6( FLA_ONE, A, B, C, nb_alg );
        else                               printf("trouble\n");
        break;

      /* Combined variants X->Y, numbered XY. */
      case 13:
        if ( type == FLA_ALG_OPENMP_CVAR ) FLA_Gemm_nn_omp_var13( FLA_ONE, A, B, C, nb_alg );
        else                               printf("trouble\n");
        break;
      case 15:
        if ( type == FLA_ALG_OPENMP_CVAR ) FLA_Gemm_nn_omp_var15( FLA_ONE, A, B, C, nb_alg );
        else                               printf("trouble\n");
        break;
      case 31:
        if ( type == FLA_ALG_OPENMP_CVAR ) FLA_Gemm_nn_omp_var31( FLA_ONE, A, B, C, nb_alg );
        else                               printf("trouble\n");
        break;
      case 35:
        if ( type == FLA_ALG_OPENMP_CVAR ) FLA_Gemm_nn_omp_var35( FLA_ONE, A, B, C, nb_alg );
        else                               printf("trouble\n");
        break;
      case 51:
        if ( type == FLA_ALG_OPENMP_CVAR ) FLA_Gemm_nn_omp_var51( FLA_ONE, A, B, C, nb_alg );
        else                               printf("trouble\n");
        break;
      case 53:
        if ( type == FLA_ALG_OPENMP_CVAR ) FLA_Gemm_nn_omp_var53( FLA_ONE, A, B, C, nb_alg );
        else                               printf("trouble\n");
        break;
    }

    /* Keep the fastest repetition. */
    *dtime    = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  if ( variant == 0 )
  {
    /* The reference run defines the expected result. */
    FLA_Copy_external( C, Cref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLA_Max_elemwise_diff( C, Cref );
  }

  *gflops = 2.0 *
            FLA_Obj_length( C ) *
            FLA_Obj_width( C ) *
            FLA_Obj_width( A ) /
            dtime_old /
            1e9;

  *dtime = dtime_old;

  /* Leave C as the caller supplied it. */
  FLA_Copy_external( Cold, C );

  FLA_Obj_free( &Cold );
}
/*
 * FLA_Gemm_nn_omp_var15
 *
 * Task-parallel matrix-matrix multiply (no-transpose / no-transpose) built
 * on the Intel "taskq" OpenMP extension, using two nested partitionings:
 *
 *   outer loop - walks A and C top to bottom in row panels A1 / C1 of
 *                height b_m;
 *   inner loop - walks A1 left to right (blocks A11) and B top to bottom
 *                (row panels B1) in steps of b_k.
 *
 * For each (A11, B1) pair a task is queued that computes the product into
 * a private, zeroed object C1_local conformal to C1 and then adds it into
 * C1 while holding the OpenMP lock fla_omp_lock[lock_i].
 *
 * alpha - scalar passed to the Gemm call.
 * A, B  - input operands.
 * C     - output object, updated one row panel at a time.
 * cntl  - control object; only its blocksize is read here.
 *
 * Always returns FLA_SUCCESS.
 *
 * NOTE(review): lock_i is computed as length(CT) / b_m.  The original
 * author already flagged this ("FIX THIS") -- if the final row panel is
 * shorter than the others the quotient is not the panel's ordinal index;
 * verify it stays within the bounds of fla_omp_lock.
 * NOTE(review): i, j and lock_ldim are only referenced from commented-out
 * code.
 */
FLA_Error FLA_Gemm_nn_omp_var15( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj C, fla_gemm_t* cntl )
{
  FLA_Obj AT,              A0,
          AB,              A1,
                           A2;

  FLA_Obj CT,              C0,
          CB,              C1,
                           C2;

  FLA_Obj AL,    AR,       A10,  A11,  A12;

  FLA_Obj BT,              B0,
          BB,              B1,
                           B2;

  /* Per-task accumulator; declared private on the task pragma below. */
  FLA_Obj C1_local;

  int i, j, lock_ldim, lock_i;
  int b_m, b_k;

  /* Start with empty processed (top) parts of A and C. */
  FLA_Part_2x1( A,    &AT,
                      &AB,            0, FLA_TOP );

  FLA_Part_2x1( C,    &CT,
                      &CB,            0, FLA_TOP );

  #pragma intel omp parallel taskq
  {
  while ( FLA_Obj_length( AT ) < FLA_Obj_length( A ) )
  {
    /* Height of the next row panel. */
    b_m = FLA_Determine_blocksize( A, AT, FLA_TOP, FLA_Cntl_blocksize( cntl ) );

    FLA_Repart_2x1_to_3x1( AT,                &A0,
                        /* ** */            /* ** */
                                              &A1,
                           AB,                &A2,        b_m, FLA_BOTTOM );

    FLA_Repart_2x1_to_3x1( CT,                &C0,
                        /* ** */            /* ** */
                                              &C1,
                           CB,                &C2,        b_m, FLA_BOTTOM );

    /*------------------------------------------------------------*/

    /* C1 = alpha * A1 * B + C1; */

    /* Restart the inner walk over the columns of A1 and the rows of B. */
    FLA_Part_1x2( A1,    &AL,  &AR,      0, FLA_LEFT );

    FLA_Part_2x1( B,    &BT,
                        &BB,            0, FLA_TOP );

    while ( FLA_Obj_width( AL ) < FLA_Obj_width( A ) )
    {
      /* Width of the next block along the inner dimension. */
      b_k = FLA_Determine_blocksize( A, AL, FLA_LEFT, FLA_Cntl_blocksize( cntl ) );

      // Get the index of the current partition.
      // FIX THIS: need + b_m - 1 or something like this
      //j = FLA_Obj_length( CT ) / b_m;
      //i = FLA_Obj_width( AL ) / b_k;
      //lock_ldim = FLA_get_num_threads_in_m_dim(omp_get_num_threads());

      /* Index of the lock guarding the current C1 (see NOTE above). */
      lock_i = FLA_Obj_length( CT ) / b_m;

      FLA_Repart_1x2_to_1x3( AL,  /**/ AR,        &A10, /**/ &A11, &A12,
                             b_k, FLA_RIGHT );

      FLA_Repart_2x1_to_3x1( BT,                &B0,
                          /* ** */            /* ** */
                                                &B1,
                             BB,                &B2,        b_k, FLA_BOTTOM );

      /*------------------------------------------------------------*/

      /* C1 = alpha * A11 * B1 + C1; */
      //// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
      ////           alpha, A11, B1, FLA_ONE, C1 );

      /* lock_i, A11, B1 and C1 are captured by value when the task is
         queued, so later repartitioning does not affect the task. */
      #pragma intel omp task captureprivate( lock_i, A11, B1, C1 ), private( C1_local )
      {
      FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C1, &C1_local );
      FLA_Obj_set_to_zero( C1_local );

      /* C1_local := alpha * A11 * B1 (C1_local was just zeroed). */
      FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                         alpha, A11, B1, FLA_ONE, C1_local );

      // Acquire lock[i] (the lock for C1).
      omp_set_lock( &fla_omp_lock[lock_i] );

      /* C1 += C1_local */
      FLA_Axpy_external( FLA_ONE, C1_local, C1 );
      //FLA_Axpy_sync_pipeline2( j*lock_ldim, FLA_ONE, C1_local, C1 );
      //FLA_Axpy_sync_circular2( j*lock_ldim, i, FLA_ONE, C1_local, C1 );
      //REF_Axpy_sync_circular2( j*lock_ldim, i, FLA_ONE, C1_local, C1 );

      // Release lock[i] (the lock for C1).
      omp_unset_lock( &fla_omp_lock[lock_i] );

      FLA_Obj_free( &C1_local );
      }

      /*------------------------------------------------------------*/

      /* Advance the inner partitioning. */
      FLA_Cont_with_1x3_to_1x2( &AL,  /**/ &AR,        A10, A11, /**/ A12,
                                FLA_LEFT );

      FLA_Cont_with_3x1_to_2x1( &BT,                B0,
                                                    B1,
                              /* ** */           /* ** */
                                &BB,                B2,     FLA_TOP );
    }

    /*------------------------------------------------------------*/

    /* Advance the outer partitioning. */
    FLA_Cont_with_3x1_to_2x1( &AT,                A0,
                                                  A1,
                            /* ** */           /* ** */
                              &AB,                A2,     FLA_TOP );

    FLA_Cont_with_3x1_to_2x1( &CT,                C0,
                                                  C1,
                            /* ** */           /* ** */
                              &CB,                C2,     FLA_TOP );
  }
  }

  return FLA_SUCCESS;
}
/*
 * time_Syrk_ln
 *
 * Times one Syrk "ln" implementation, selected by (variant, type), over
 * nrepeats runs and reports the fastest run.
 *
 *   variant  - 0 runs REF_Syrk_ln; 1..5 select FLA_Syrk_ln_*_varN.
 *   type     - FLA_ALG_OPENMP_* selector used for variants 1..5.
 *   nrepeats - number of timed runs.
 *   n, nb_alg, B - accepted but not used in this routine.
 *   A, C     - operands; C is restored to its entry contents before return.
 *   C_ref    - written when variant == 0, compared against otherwise.
 *   dtime    - out: best elapsed time.
 *   diff     - out: 0.0 for variant 0, else FLA_Max_elemwise_diff( C, C_ref ).
 *   gflops   - out: length(A)^2 * width(A) / best time / 1e9.
 *
 * The best time starts from a 1.0e9 sentinel so that it is defined even when
 * the loop body never executes (nrepeats <= 0); previously it was read
 * uninitialized in that case.
 */
void time_Syrk_ln( int variant, int type, int nrepeats, int n, int nb_alg,
                   FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref,
                   double *dtime, double *diff, double *gflops )
{
  int irep;
  double dtime_old = 1.0e9;
  FLA_Obj C_old;

  /* Keep a pristine copy of C so every run starts from the same data. */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_old );
  FLA_Copy_external( C, C_old );

  for ( irep = 0; irep < nrepeats; irep++ )
  {
    FLA_Copy_external( C_old, C );

    *dtime = FLA_Clock();

    switch ( variant )
    {
      case 0:
        /* Reference implementation. */
        REF_Syrk_ln( FLA_ONE, A, FLA_ONE, C );
        break;

      case 1:
        switch ( type )
        {
          case FLA_ALG_OPENMP_1TASK:  FLA_Syrk_ln_omp1t_var1( A, C ); break;
          case FLA_ALG_OPENMP_2TASKS: FLA_Syrk_ln_omp2t_var1( A, C ); break;
          case FLA_ALG_OPENMP_2LOOPS: FLA_Syrk_ln_omp2l_var1( A, C ); break;
          default: printf("trouble\n");
        }
        break;

      case 2:
        switch ( type )
        {
          case FLA_ALG_OPENMP_1TASK:      FLA_Syrk_ln_omp1t_var2( A, C ); break;
          case FLA_ALG_OPENMP_2TASKS:     FLA_Syrk_ln_omp2t_var2( A, C ); break;
          case FLA_ALG_OPENMP_2LOOPS:     FLA_Syrk_ln_omp2l_var2( A, C ); break;
          case FLA_ALG_OPENMP_2LOOPSPLUS: FLA_Syrk_ln_omp2x_var2( A, C ); break;
          default: printf("trouble\n");
        }
        break;

      case 3:
        switch ( type )
        {
          case FLA_ALG_OPENMP_1TASK:  FLA_Syrk_ln_omp1t_var3( A, C ); break;
          case FLA_ALG_OPENMP_2TASKS: FLA_Syrk_ln_omp2t_var3( A, C ); break;
          case FLA_ALG_OPENMP_2LOOPS: FLA_Syrk_ln_omp2l_var3( A, C ); break;
          default: printf("trouble\n");
        }
        break;

      case 4:
        switch ( type )
        {
          case FLA_ALG_OPENMP_1TASK:  FLA_Syrk_ln_omp1t_var4( A, C ); break;
          case FLA_ALG_OPENMP_2TASKS: FLA_Syrk_ln_omp2t_var4( A, C ); break;
          case FLA_ALG_OPENMP_2LOOPS: FLA_Syrk_ln_omp2l_var4( A, C ); break;
          default: printf("trouble\n");
        }
        break;

      case 5:
        switch ( type )
        {
          case FLA_ALG_OPENMP_1TASK: FLA_Syrk_ln_omp1t_var5( A, C ); break;
          default: printf("trouble\n");
        }
        break;
    }

    /* Same bookkeeping as the other timing routines: keep the minimum. */
    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  if ( variant == 0 )
  {
    /* The reference run defines the expected result. */
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }

  *gflops = 1.0 *
            FLA_Obj_length( A ) *
            FLA_Obj_length( A ) *
            FLA_Obj_width( A ) /
            dtime_old / 1e9;

  *dtime = dtime_old;

  /* Leave C as the caller passed it in. */
  FLA_Copy_external( C_old, C );

  FLA_Obj_free( &C_old );
}
/*
 * time_Hevd_lv_components
 *
 * Times one Hevd "lv" implementation selected by `variant`, keeping the
 * fastest of n_repeats runs together with the three component timings that
 * the selected routine reported for that run.  After timing it computes two
 * residuals from the contents left in A and l, then derives GFLOPS figures.
 *
 * Variants -3, -4 and -5 are currently disabled: every output is zeroed and
 * the routine returns immediately.  (Their dispatch entries are still
 * present below so the guard can be relaxed without further edits.)
 *
 * Outputs:
 *   dtime, dtime_tred, dtime_tevd, dtime_appq - timings of the best run.
 *   diff1 - Frobenius norm of ( lower triangle of (A*diag(l)) * V^H ) - A_orig,
 *           where V is a copy of A taken before the diagonal scaling.
 *   diff2 - Frobenius norm of V * V^H - I.
 *   gflops, gflops_tred, gflops_tevd, gflops_appq - flop estimates / time / 1e9.
 *   k_perf - value returned by the FLA variants, 0 for the REF variants.
 *
 * `type` is accepted but not consulted.  A is restored to its entry contents
 * before returning.
 */
void time_Hevd_lv_components( int variant, int type, int n_repeats, int m, int n_iter_max, int k_accum, int b_alg,
                              FLA_Obj A, FLA_Obj l,
                              double* dtime, double* diff1, double* diff2, double* gflops,
                              double* dtime_tred, double* gflops_tred,
                              double* dtime_tevd, double* gflops_tevd,
                              double* dtime_appq, double* gflops_appq,
                              int* k_perf )
{
  int     rep;
  double  k_factor;
  double  best_total = 1.0e9;
  double  best_tred  = 1.0e9;
  double  best_tevd  = 1.0e9;
  double  best_appq  = 1.0e9;
  double  flops_tred, flops_tevd, flops_appq;
  double  work_tred, work_tevd, work_appq;
  FLA_Obj A_orig, Z;
  FLA_Obj V, A_rebuilt, norm, eye;

  // Disabled variants: report zeros and do nothing else.
  switch ( variant )
  {
    case -3:
    case -4:
    case -5:
      *gflops      = 0.0;
      *dtime       = 0.0;
      *diff1       = 0.0;
      *diff2       = 0.0;
      *dtime_tred  = 0.0;
      *dtime_tevd  = 0.0;
      *dtime_appq  = 0.0;
      *gflops_tred = 0.0;
      *gflops_tevd = 0.0;
      *gflops_appq = 0.0;
      *k_perf      = 0;
      return;

    default:
      break;
  }

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_orig );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Z );

  FLA_Copy_external( A, A_orig );

  for ( rep = 0; rep < n_repeats; rep++ )
  {
    // Every run starts from the original matrix.
    FLA_Copy_external( A_orig, A );

    *dtime = FLA_Clock();

    switch ( variant )
    {
      case -3:
        *k_perf = 0;
        REF_Hevd_lv( A, l, dtime_tred, dtime_tevd, dtime_appq );
        break;

      case -4:
        *k_perf = 0;
        REF_Hevdd_lv( A, l, dtime_tred, dtime_tevd, dtime_appq );
        break;

      case -5:
        *k_perf = 0;
        REF_Hevdr_lv( A, l, Z, dtime_tred, dtime_tevd, dtime_appq );
        break;

      case 0:
        *k_perf = 0;
        REF_Hevd_lv_components( A, l, dtime_tred, dtime_tevd, dtime_appq );
        break;

      case -1:
        *k_perf = 0;
        REF_Hevdd_lv_components( A, l, dtime_tred, dtime_tevd, dtime_appq );
        break;

      case -2:
        *k_perf = 0;
        REF_Hevdr_lv_components( A, l, Z, dtime_tred, dtime_tevd, dtime_appq );
        break;

      case 1:
        *k_perf = FLA_Hevd_lv_var1_components( n_iter_max, A, l, k_accum, b_alg,
                                               dtime_tred, dtime_tevd, dtime_appq );
        break;

      case 2:
        *k_perf = FLA_Hevd_lv_var2_components( n_iter_max, A, l, k_accum, b_alg,
                                               dtime_tred, dtime_tevd, dtime_appq );
        break;
    }

    *dtime = FLA_Clock() - *dtime;

    // A faster run replaces all four saved timings together.
    if ( *dtime < best_total )
    {
      best_total = *dtime;
      best_tred  = *dtime_tred;
      best_tevd  = *dtime_tevd;
      best_appq  = *dtime_appq;
    }
  }

  *dtime      = best_total;
  *dtime_tred = best_tred;
  *dtime_tevd = best_tevd;
  *dtime_appq = best_appq;

  // ---- Residuals ---------------------------------------------------------

  // Variants -2 and -5 pass Z to the routine; its contents are moved into A
  // before the checks.
  if ( variant == -2 || variant == -5 )
    FLA_Copy( Z, A );

  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &V );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_rebuilt );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &eye );
  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

  // diff1: || tril( (A * diag(l)) * V^H ) - A_orig ||_F
  FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, l, A );
  FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
            FLA_ONE, A, V, FLA_ZERO, A_rebuilt );
  FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A_rebuilt );
  FLA_Axpy( FLA_MINUS_ONE, A_orig, A_rebuilt );
  FLA_Norm_frob( A_rebuilt, norm );
  FLA_Obj_extract_real_scalar( norm, diff1 );

  // diff2: || V * V^H - I ||_F  (A_rebuilt is reused as a second copy of V)
  FLA_Set_to_identity( eye );
  FLA_Copy( V, A_rebuilt );
  FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
            FLA_ONE, V, A_rebuilt, FLA_MINUS_ONE, eye );
  FLA_Norm_frob( eye, norm );
  FLA_Obj_extract_real_scalar( norm, diff2 );

  FLA_Obj_free( &V );
  FLA_Obj_free( &A_rebuilt );
  FLA_Obj_free( &eye );
  FLA_Obj_free( &norm );

  // ---- Flop estimates ----------------------------------------------------

  k_factor = 2.00;

  flops_tred = ( ( 4.0 / 3.0 ) * m * m * m );
  flops_tevd = ( 4.5 * k_factor * m * m +
                 3.0 * k_factor * m * m * m );

  if ( variant == -1 || variant == -2 ||
       variant == -4 || variant == -5 )
    flops_appq = ( 2.0 * m * m * m );
  else
    flops_appq = ( 4.0 / 3.0 * m * m * m );

  // Complex data scales the component counts by 4, 2 and 4; real data by 1.
  if ( FLA_Obj_is_complex( A ) )
  {
    work_tred = 4.0 * flops_tred;
    work_tevd = 2.0 * flops_tevd;
    work_appq = 4.0 * flops_appq;
  }
  else
  {
    work_tred = 1.0 * flops_tred;
    work_tevd = 1.0 * flops_tevd;
    work_appq = 1.0 * flops_appq;
  }

  *gflops      = ( work_tred + work_tevd + work_appq ) / *dtime / 1e9;
  *gflops_tred = ( work_tred ) / *dtime_tred / 1e9;
  *gflops_tevd = ( work_tevd ) / *dtime_tevd / 1e9;
  *gflops_appq = ( work_appq ) / *dtime_appq / 1e9;

  // Hand A back unchanged.
  FLA_Copy_external( A_orig, A );

  FLA_Obj_free( &A_orig );
  FLA_Obj_free( &Z );
}
/*
 * time_Transpose
 *
 * Times one in-place transpose implementation, selected by (variant, type),
 * over nrepeats runs and reports the fastest run.
 *
 *   variant - 0 runs FLA_Transpose; 1 and 2 select FLA_Transpose_*_varN.
 *             It is also passed as the variant of the blocked control tree.
 *   type    - FLA_ALG_UNBLOCKED or FLA_ALG_BLOCKED (variants 1 and 2).
 *   n       - problem size used only for the gflops figure.
 *   nb_alg  - blocksize used to build the control trees.
 *   A       - matrix to transpose; restored to its entry contents on return.
 *   A_ref   - written when variant == 0, compared against otherwise.
 *   dtime   - out: best elapsed time.
 *   diff    - out: 0.0 for variant 0, else FLA_Max_elemwise_diff( A, A_ref ).
 *   gflops  - out: 4 * n * n / best time / 1e9.
 *
 * The gflops numerator is evaluated in double precision; the previous
 * all-int product 4 * n * n overflowed for large n before being converted.
 */
void time_Transpose( int variant, int type, int nrepeats, int n, int nb_alg,
                     FLA_Obj A, FLA_Obj A_ref,
                     double *dtime, double *diff, double *gflops )
{
  int irep;
  double dtime_old = 1.0e9;
  FLA_Obj A_old;

  fla_blocksize_t* bp;
  fla_transpose_t* cntl_trans_var_unb;
  fla_transpose_t* cntl_trans_var_blk;
  fla_swap_t*      cntl_swap_var_blk;
  fla_swap_t*      cntl_swap_blas;

  /* Control trees for the blocked variants. */
  bp                 = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
  cntl_swap_blas     = FLA_Cntl_swap_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL );
  cntl_swap_var_blk  = FLA_Cntl_swap_obj_create( FLA_FLAT, FLA_UNBLOCKED_VARIANT1, bp, cntl_swap_blas );
  cntl_trans_var_unb = FLA_Cntl_transpose_obj_create( FLA_FLAT, FLA_UNBLOCKED_VARIANT1, NULL, NULL, NULL );
  cntl_trans_var_blk = FLA_Cntl_transpose_obj_create( FLA_FLAT, variant, bp, cntl_trans_var_unb, cntl_swap_var_blk );

  /* Keep a pristine copy of A so every run starts from the same data. */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_old );
  FLA_Copy_external( A, A_old );

  for ( irep = 0; irep < nrepeats; irep++ )
  {
    FLA_Copy_external( A_old, A );

    *dtime = FLA_Clock();

    switch ( variant )
    {
      case 0:
        /* Reference implementation. */
        FLA_Transpose( A );
        break;

      case 1:
        switch ( type )
        {
          case FLA_ALG_UNBLOCKED:
            FLA_Transpose_unb_var1( A );
            break;
          case FLA_ALG_BLOCKED:
            FLA_Transpose_blk_var1( A, cntl_trans_var_blk );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 2:
        switch ( type )
        {
          case FLA_ALG_UNBLOCKED:
            FLA_Transpose_unb_var2( A );
            break;
          case FLA_ALG_BLOCKED:
            FLA_Transpose_blk_var2( A, cntl_trans_var_blk );
            break;
          default:
            printf("trouble\n");
        }
        break;
    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  FLA_Cntl_obj_free( cntl_trans_var_blk );
  FLA_Cntl_obj_free( cntl_trans_var_unb );
  FLA_Cntl_obj_free( cntl_swap_var_blk );
  FLA_Cntl_obj_free( cntl_swap_blas );
  FLA_Blocksize_free( bp );

  if ( variant == 0 )
  {
    /* The reference run defines the expected result. */
    FLA_Copy_external( A, A_ref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLA_Max_elemwise_diff( A, A_ref );
  }

  /* 4.0 forces the whole product into double arithmetic. */
  *gflops = 4.0 * n * n / dtime_old / 1e9;

  *dtime = dtime_old;

  /* Leave A as the caller passed it in. */
  FLA_Copy_external( A_old, A );

  FLA_Obj_free( &A_old );
}
/*
 * time_Apply_G_rf
 *
 * Times one Apply_G "rf" implementation, selected by (variant, type), over
 * n_repeats runs and reports the fastest run.
 *
 * For real or complex A, variants 4, 5, 7 and 8 are disabled: *gflops and
 * *diff are set to 0.0 and the routine returns at once.
 * NOTE(review): *dtime is not written on that early-return path.
 *
 *   variant  - 0 times an empty body; 1, 2, 3, 6, 9 select the routine family.
 *   type     - FLA_ALG_UNB_OPT, FLA_ALG_UNB_ASM or FLA_ALG_BLOCKED; any other
 *              value runs nothing for that variant.
 *   m, k, n  - sizes used only in the gflops formula.
 *   b_alg    - blocksize handed to the blocked routines.
 *   A, G     - operands; both are restored to their entry contents on return.
 *   A_ref    - written when variant == 1 and type == FLA_ALG_UNB_OPT,
 *              subtracted from A otherwise.
 *   P        - accepted but not used.
 *   dtime    - out: best elapsed time.
 *   diff     - out: 0.0 for the reference combination, else the Frobenius
 *              norm of A - A_ref.
 *   gflops   - out: 6 * k * m * (n - 1) / best time / 1e9, doubled for
 *              complex A.
 */
void time_Apply_G_rf( int variant, int type, int n_repeats, int m, int k, int n, int b_alg,
                      FLA_Obj A, FLA_Obj A_ref, FLA_Obj G, FLA_Obj P,
                      double *dtime, double *diff, double *gflops )
{
  int     rep;
  double  best = 1.0e9;
  FLA_Obj A_orig, G_orig, norm;

  // The real and complex cases share one list of disabled variants.
  if ( ( FLA_Obj_is_real( A ) || FLA_Obj_is_complex( A ) ) &&
       ( variant == 4 || variant == 5 || variant == 7 || variant == 8 ) )
  {
    *gflops = 0.0;
    *diff   = 0.0;
    return;
  }

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_orig );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, G, &G_orig );
  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

  FLA_Copy_external( A, A_orig );
  FLA_Copy_external( G, G_orig );

  for ( rep = 0; rep < n_repeats; rep++ )
  {
    // Every run starts from the original operands.
    FLA_Copy_external( A_orig, A );
    FLA_Copy_external( G_orig, G );

    *dtime = FLA_Clock();

    switch ( variant )
    {
      case 0:
        break;

      case 1:
        if      ( type == FLA_ALG_UNB_OPT ) FLA_Apply_G_rf_opt_var1( G, A );
        else if ( type == FLA_ALG_UNB_ASM ) FLA_Apply_G_rf_asm_var1( G, A );
        else if ( type == FLA_ALG_BLOCKED ) FLA_Apply_G_rf_blk_var1( G, A, b_alg );
        break;

      case 2:
        if      ( type == FLA_ALG_UNB_OPT ) FLA_Apply_G_rf_opt_var2( G, A );
        else if ( type == FLA_ALG_UNB_ASM ) FLA_Apply_G_rf_asm_var2( G, A );
        else if ( type == FLA_ALG_BLOCKED ) FLA_Apply_G_rf_blk_var2( G, A, b_alg );
        break;

      case 3:
        if      ( type == FLA_ALG_UNB_OPT ) FLA_Apply_G_rf_opt_var3( G, A );
        else if ( type == FLA_ALG_UNB_ASM ) FLA_Apply_G_rf_asm_var3( G, A );
        else if ( type == FLA_ALG_BLOCKED ) FLA_Apply_G_rf_blk_var3( G, A, b_alg );
        break;

      case 6:
        if      ( type == FLA_ALG_UNB_OPT ) FLA_Apply_G_rf_opt_var6( G, A );
        else if ( type == FLA_ALG_UNB_ASM ) FLA_Apply_G_rf_asm_var6( G, A );
        else if ( type == FLA_ALG_BLOCKED ) FLA_Apply_G_rf_blk_var6( G, A, b_alg );
        break;

      case 9:
        if      ( type == FLA_ALG_UNB_OPT ) FLA_Apply_G_rf_opt_var9( G, A );
        else if ( type == FLA_ALG_UNB_ASM ) FLA_Apply_G_rf_asm_var9( G, A );
        else if ( type == FLA_ALG_BLOCKED ) FLA_Apply_G_rf_blk_var9( G, A, b_alg );
        break;
    }

    *dtime = FLA_Clock() - *dtime;
    best   = min( *dtime, best );
  }

  if ( variant != 1 || type != FLA_ALG_UNB_OPT )
  {
    // diff := || A - A_ref ||_F  (A is overwritten by the difference)
    FLA_Axpy( FLA_MINUS_ONE, A_ref, A );
    FLA_Norm_frob( A, norm );
    FLA_Obj_extract_real_scalar( norm, diff );
  }
  else
  {
    // Variant 1 / UNB_OPT supplies the reference result.
    FLA_Copy( A, A_ref );
    *diff = 0.0;
  }

  *gflops = 6.0 * k * m * ( n - 1 ) / best / 1e9;
  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 2.0;

  *dtime = best;

  // Hand the operands back unchanged.
  FLA_Copy_external( A_orig, A );
  FLA_Copy_external( G_orig, G );

  FLA_Obj_free( &A_orig );
  FLA_Obj_free( &G_orig );
  FLA_Obj_free( &norm );
}
int main( int argc, char** argv ) { FLA_Datatype datatype = TESTTYPE; FLA_Obj A, Ak, T, Tk, D, Dk, A_copy, A_recovered, L, Q, Qk, W, x, y, z; dim_t m, n, k; dim_t min_m_n; FLA_Error init_result; double residual_A, residual_Axy; int use_form_q = 1; if ( argc == 4 ) { m = atoi(argv[1]); n = atoi(argv[2]); k = atoi(argv[3]); min_m_n = min(m,n); } else { fprintf(stderr, " \n"); fprintf(stderr, "Usage: %s m n k\n", argv[0]); fprintf(stderr, " m : matrix length\n"); fprintf(stderr, " n : matrix width\n"); fprintf(stderr, " k : number of house holder vectors applied for testing\n"); fprintf(stderr, " \n"); return -1; } if ( m == 0 || n == 0 ) return 0; FLA_Init_safe( &init_result ); // FLAME LQ^H setup FLA_Obj_create( datatype, m, n, 0, 0, &A ); FLA_LQ_UT_create_T( A, &T ); // Rand A and create A_copy. FLA_Random_matrix( A ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_copy ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_recovered ); FLA_Copy( A, A_copy ); // LQ test ( A = L Q^H ) FLA_LQ_UT( A, T ); // Create Q (identity), L (A_copy) FLA_Obj_create( datatype, m, n, 0, 0, &Q ); FLA_Set_to_identity( Q ); FLA_Obj_create( datatype, m, m, 0, 0, &D ); FLA_Obj_create( datatype, k, n, 0, 0, &Qk ); FLA_Set_to_identity( Qk ); FLA_Obj_create( datatype, k, k, 0, 0, &Dk ); FLA_Obj_create( datatype, m, m, 0, 0, &L ); // Q^H := I H_{0}^H ... H_{k-1}^H if ( use_form_q ) { FLA_LQ_UT_form_Q( A, T, Q ); } else { FLA_Apply_Q_UT_create_workspace_side( FLA_RIGHT, T, Q, &W ); FLA_Apply_Q_UT( FLA_RIGHT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, A, T, W, Q ); FLA_Obj_free( &W ); } // D := Q^T Q FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, Q, Q, FLA_ZERO, D ); // Qk := I H0 ... 
Hk FLA_Part_1x2( T, &Tk, &W, k, FLA_LEFT ); FLA_Part_2x1( A, &Ak, &W, k, FLA_TOP ); if ( use_form_q ) { // Overwrite the result to test FLAME API FLA_Set( FLA_ZERO, Qk ); FLA_Copy( Ak, Qk ); FLA_LQ_UT_form_Q( Ak, Tk, Qk ); } else { FLA_Apply_Q_UT_create_workspace( Tk, Qk, &W ); FLA_Apply_Q_UT( FLA_LEFT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, Ak, Tk, W, Qk ); FLA_Obj_free( &W ); } // Dk := Qk^T Qk FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, Qk, Qk, FLA_ZERO, Dk ); // L := A (Q^H)^H if ( use_form_q ) { // Note that the formed Q is actually Q^H; transb should be carefully assigned. FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A_copy, Q, FLA_ZERO, L ); } else { FLA_Apply_Q_UT_create_workspace( T, L, &W ); FLA_Apply_Q_UT( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, A, T, W, L ); FLA_Obj_free( &W ); } FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, L, Q, FLA_ZERO, A_recovered ); // Create vectors for testing FLA_Obj_create( datatype, n, 1, 0, 0, &x ); FLA_Set( FLA_ZERO, x ); FLA_Obj_create( datatype, m, 1, 0, 0, &y ); FLA_Set( FLA_ZERO, y ); FLA_Obj_create( datatype, m, 1, 0, 0, &z ); FLA_Set( FLA_ZERO, z ); // x is given FLA_Set( FLA_ONE, x ); // y := Ax FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, A_copy, x, FLA_ZERO, y ); // z := L (Q^H) x , libflame FLA_Apply_Q_UT_create_workspace( T, x, &W ); FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, A, T, W, x ); FLA_Obj_free( &W ); if ( m < n ) FLA_Part_2x1( x, &x, &W, m, FLA_TOP ); else FLA_Part_1x2( L, &L, &W, n, FLA_LEFT ); FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, L, x, FLA_ZERO, z ); // Comapre (A_copy, A_recovered), (y,z) and (y,w) residual_A = FLA_Max_elemwise_diff( A_copy, A_recovered ); residual_Axy = FLA_Max_elemwise_diff( y, z ); if ( 1 || residual_A > EPS || residual_Axy > EPS ) { FLA_Obj_fshow( stdout, " - Given - ", A_copy, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Factor - ", A, "% 6.4e", "------"); 
FLA_Obj_fshow( stdout, " - T - ", T, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Q - ", Q, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - D = Q^T Q - ", D, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Qk - ", Qk, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Dk = Qk^T Qk - ", Dk, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - L - ", L, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Recovered A - ", A_recovered, "% 6.4e", "------"); fprintf( stdout, "lapack2flame: %lu x %lu, %lu: ", m, n, k); fprintf( stdout, "| A - A_recovered | = %12.10e, | Ax - y | = %12.10e\n\n", residual_A, residual_Axy ) ; } FLA_Obj_free( &A ); FLA_Obj_free( &T ); FLA_Obj_free( &A_copy ); FLA_Obj_free( &A_recovered ); FLA_Obj_free( &L ); FLA_Obj_free( &Q ); FLA_Obj_free( &Qk ); FLA_Obj_free( &D ); FLA_Obj_free( &Dk ); FLA_Obj_free( &x ); FLA_Obj_free( &y ); FLA_Obj_free( &z ); FLA_Finalize_safe( init_result ); }
/*
 * time_Her2k
 *
 * Times a Her2k front end for one parameter combination over nrepeats runs
 * and reports the fastest run.
 *
 *   param_combo - 0: lower / conj-transpose   1: lower / no-transpose
 *                 2: upper / conj-transpose   3: upper / no-transpose
 *                 Any other value runs nothing inside the timed region.
 *   type        - FLA_ALG_REFERENCE (REF_Her2k) or FLA_ALG_FRONT (FLA_Her2k);
 *                 anything else prints "trouble" for a valid param_combo.
 *   m, k        - sizes used only in the gflops formula.
 *   A, B, C     - operands; C is restored to its entry contents on return.
 *   C_ref       - written for FLA_ALG_REFERENCE, compared against otherwise.
 *   dtime       - out: best elapsed time.
 *   diff        - out: 0.0 for the reference, else
 *                 FLA_Max_elemwise_diff( C, C_ref ).
 *   gflops      - out: 4 * 2 * m * m * k / best time / 1e9.
 */
void time_Her2k( int param_combo, int type, int nrepeats, int m, int k,
                 FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref,
                 double *dtime, double *diff, double *gflops )
{
  int     rep;
  double  best = 1.0e9;
  FLA_Obj C_orig;

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_orig );
  FLA_Copy_external( C, C_orig );

  for ( rep = 0; rep < nrepeats; rep++ )
  {
    // Every run starts from the original C.
    FLA_Copy_external( C_orig, C );

    *dtime = FLA_Clock();

    if ( 0 <= param_combo && param_combo <= 3 )
    {
      if ( type == FLA_ALG_REFERENCE )
      {
        switch ( param_combo )
        {
          case 0:
            REF_Her2k( FLA_LOWER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case 1:
            REF_Her2k( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case 2:
            REF_Her2k( FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case 3:
            REF_Her2k( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          default:
            break;
        }
      }
      else if ( type == FLA_ALG_FRONT )
      {
        switch ( param_combo )
        {
          case 0:
            FLA_Her2k( FLA_LOWER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case 1:
            FLA_Her2k( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case 2:
            FLA_Her2k( FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case 3:
            FLA_Her2k( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          default:
            break;
        }
      }
      else
      {
        printf("trouble\n");
      }
    }

    *dtime = FLA_Clock() - *dtime;
    best   = min( *dtime, best );
  }

  if ( type != FLA_ALG_REFERENCE )
  {
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }
  else
  {
    // The reference run defines the expected result.
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }

  *gflops = 4.0 * 2.0 * m * m * k / best / 1.0e9;

  *dtime = best;

  // Hand C back unchanged.
  FLA_Copy_external( C_orig, C );

  FLA_Obj_free( &C_orig );
}
/*
 * time_Gemm
 *
 * Times a Gemm front end for one parameter combination over nrepeats runs
 * and reports the fastest run.
 *
 * Only param_combo == 4 (no-transpose / no-transpose) is currently enabled;
 * every other value sets *gflops and *diff to 0.0 and returns at once.  The
 * dispatch entries for the remaining combinations are retained so the guard
 * can be relaxed without further edits.
 *
 *   param_combo - 0..8, the transposition pair for A and B:
 *                 0 C/C  1 C/N  2 C/T  3 N/C  4 N/N  5 N/T  6 T/C  7 T/N  8 T/T
 *                 (C = conj-transpose, N = no-transpose, T = transpose).
 *   type        - FLA_ALG_REFERENCE (REF_Gemm) or FLA_ALG_FRONT; anything
 *                 else prints "trouble".  For combination 4 the FRONT path
 *                 calls FLA_Gemm_external with beta = FLA_ONE, whereas every
 *                 other call uses beta = FLA_ZERO.
 *   m, k, n     - sizes used only in the gflops formula.
 *   A, B, C     - operands; C is restored to its entry contents on return.
 *   C_ref       - accepted but not used.
 *   dtime       - out: best elapsed time.
 *   gflops      - out: 2 * m * k * n / best time / 1e9, times 4 for
 *                 combinations 0, 1, 2, 3 and 6.
 *
 * NOTE(review): *diff is written only on the early-return path; the timed
 * path leaves it untouched.  *dtime is not written on the early-return path.
 */
void time_Gemm( int param_combo, int type, int nrepeats, int m, int k, int n,
                FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref,
                double *dtime, double *diff, double *gflops )
{
  int     rep;
  double  best = 1.0e9;
  FLA_Obj C_orig;

  if ( param_combo != 4 )
  {
    *gflops = 0.0;
    *diff   = 0.0;
    return;
  }

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_orig );
  FLA_Copy_external( C, C_orig );

  for ( rep = 0; rep < nrepeats; rep++ )
  {
    // Every run starts from the original C.
    FLA_Copy_external( C_orig, C );

    *dtime = FLA_Clock();

    if ( 0 <= param_combo && param_combo <= 8 )
    {
      if ( type == FLA_ALG_REFERENCE )
      {
        switch ( param_combo )
        {
          case 0:
            REF_Gemm( FLA_CONJ_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case 1:
            REF_Gemm( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case 2:
            REF_Gemm( FLA_CONJ_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case 3:
            REF_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case 4:
            REF_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case 5:
            REF_Gemm( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case 6:
            REF_Gemm( FLA_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case 7:
            REF_Gemm( FLA_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case 8:
            REF_Gemm( FLA_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          default:
            break;
        }
      }
      else if ( type == FLA_ALG_FRONT )
      {
        switch ( param_combo )
        {
          case 0:
            FLA_Gemm( FLA_CONJ_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case 1:
            FLA_Gemm( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case 2:
            FLA_Gemm( FLA_CONJ_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case 3:
            FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case 4:
            // Differs from the other entries: external kernel, beta = FLA_ONE.
            FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ONE, C );
            break;
          case 5:
            FLA_Gemm( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case 6:
            FLA_Gemm( FLA_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case 7:
            FLA_Gemm( FLA_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          case 8:
            FLA_Gemm( FLA_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
            break;
          default:
            break;
        }
      }
      else
      {
        printf("trouble\n");
      }
    }

    *dtime = FLA_Clock() - *dtime;
    best   = min( *dtime, best );
  }

  *gflops = 2.0 * m * k * n / best / 1.0e9;

  switch ( param_combo )
  {
    case 0:
    case 1:
    case 2:
    case 3:
    case 6:
      *gflops *= 4.0;
      break;
    default:
      break;
  }

  *dtime = best;

  // Hand C back unchanged.
  FLA_Copy_external( C_orig, C );

  FLA_Obj_free( &C_orig );
}
/*
 * time_Gemm_hh
 *
 * Times one Gemm "hh" (conj-transpose / conj-transpose) implementation,
 * selected by (variant, type), over nrepeats runs and reports the fastest.
 *
 *   variant - 0 runs REF_Gemm; 1..6 select FLA_Gemm_hh_*_varN.  Any other
 *             value runs nothing inside the timed region.  The value is also
 *             passed as the variant of the blocked control tree.
 *   type    - FLA_ALG_UNBLOCKED or FLA_ALG_BLOCKED for variants 1..6;
 *             anything else prints "trouble".
 *   n       - accepted but not used.
 *   nb_alg  - blocksize used to build the control tree.
 *   A, B, C - operands; C is restored to its entry contents on return.
 *   C_ref   - written when variant == 0, compared against otherwise.
 *   dtime   - out: best elapsed time.
 *   diff    - out: 0.0 for variant 0, else FLA_Max_elemwise_diff( C, C_ref ).
 *   gflops  - out: 2 * length(C) * width(C) * width(A) / best time / 1e9,
 *             times 4 when C is complex.
 */
void time_Gemm_hh( int variant, int type, int nrepeats, int n, int nb_alg,
                   FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref,
                   double *dtime, double *diff, double *gflops )
{
  int              rep;
  double           best = 1.0e9;
  FLA_Obj          C_orig;
  fla_blocksize_t* bp;
  fla_gemm_t*      cntl_gemm_blas;
  fla_gemm_t*      cntl_gemm_var;

  // Control tree for the blocked variants.
  bp             = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
  cntl_gemm_blas = FLA_Cntl_gemm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL );
  cntl_gemm_var  = FLA_Cntl_gemm_obj_create( FLA_FLAT, variant, bp, cntl_gemm_blas );

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_orig );
  FLA_Copy_external( C, C_orig );

  for ( rep = 0; rep < nrepeats; rep++ )
  {
    // Every run starts from the original C.
    FLA_Copy_external( C_orig, C );

    *dtime = FLA_Clock();

    if ( variant == 0 )
    {
      // Reference implementation.
      REF_Gemm( FLA_CONJ_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ONE, C );
    }
    else if ( 1 <= variant && variant <= 6 )
    {
      if ( type == FLA_ALG_UNBLOCKED )
      {
        switch ( variant )
        {
          case 1: FLA_Gemm_hh_unb_var1( FLA_ONE, A, B, FLA_ONE, C ); break;
          case 2: FLA_Gemm_hh_unb_var2( FLA_ONE, A, B, FLA_ONE, C ); break;
          case 3: FLA_Gemm_hh_unb_var3( FLA_ONE, A, B, FLA_ONE, C ); break;
          case 4: FLA_Gemm_hh_unb_var4( FLA_ONE, A, B, FLA_ONE, C ); break;
          case 5: FLA_Gemm_hh_unb_var5( FLA_ONE, A, B, FLA_ONE, C ); break;
          case 6: FLA_Gemm_hh_unb_var6( FLA_ONE, A, B, FLA_ONE, C ); break;
          default: break;
        }
      }
      else if ( type == FLA_ALG_BLOCKED )
      {
        switch ( variant )
        {
          case 1: FLA_Gemm_hh_blk_var1( FLA_ONE, A, B, FLA_ONE, C, cntl_gemm_var ); break;
          case 2: FLA_Gemm_hh_blk_var2( FLA_ONE, A, B, FLA_ONE, C, cntl_gemm_var ); break;
          case 3: FLA_Gemm_hh_blk_var3( FLA_ONE, A, B, FLA_ONE, C, cntl_gemm_var ); break;
          case 4: FLA_Gemm_hh_blk_var4( FLA_ONE, A, B, FLA_ONE, C, cntl_gemm_var ); break;
          case 5: FLA_Gemm_hh_blk_var5( FLA_ONE, A, B, FLA_ONE, C, cntl_gemm_var ); break;
          case 6: FLA_Gemm_hh_blk_var6( FLA_ONE, A, B, FLA_ONE, C, cntl_gemm_var ); break;
          default: break;
        }
      }
      else
      {
        printf("trouble\n");
      }
    }

    *dtime = FLA_Clock() - *dtime;
    best   = min( *dtime, best );
  }

  FLA_Cntl_obj_free( cntl_gemm_var );
  FLA_Cntl_obj_free( cntl_gemm_blas );
  FLA_Blocksize_free( bp );

  if ( variant != 0 )
  {
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }
  else
  {
    // The reference run defines the expected result.
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }

  *gflops = 2.0 *
            FLA_Obj_length( C ) *
            FLA_Obj_width( C ) *
            FLA_Obj_width( A ) /
            best / 1.0e9;

  if ( FLA_Obj_is_complex( C ) )
    *gflops *= 4.0;

  *dtime = best;

  // Hand C back unchanged.
  FLA_Copy_external( C_orig, C );

  FLA_Obj_free( &C_orig );
}
int main( int argc, char** argv ) { FLA_Datatype datatype = TESTTYPE; FLA_Obj A, A_flame, A_lapack, C; int m; FLA_Error init_result; FLA_Obj TU, TV, U_flame, V_flame, d_flame, e_flame, B_flame; FLA_Obj tauq, taup, d_lapack, e_lapack, U_lapack, V_lapack, W, B_lapack; testtype *buff_tauq, *buff_taup, *buff_d_lapack, *buff_e_lapack, *buff_W, *buff_A_lapack, *buff_U_lapack, *buff_V_lapack; int lwork, info, is_flame; if ( argc == 3 ) { m = atoi(argv[1]); is_flame = atoi(argv[2]); } else { fprintf(stderr, " \n"); fprintf(stderr, "Usage: %s m is_flame\n", argv[0]); fprintf(stderr, " m : matrix length\n"); fprintf(stderr, " is_flame : 1 yes, 0 no\n"); fprintf(stderr, " \n"); return -1; } if ( m == 0 ) return 0; FLA_Init_safe( &init_result ); fprintf( stdout, "lapack2flame: %d x %d: \n", m, m); FLA_Obj_create( datatype, m, m, 0, 0, &A ); FLA_Random_matrix( A ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_flame ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_lapack ); FLA_Obj_create( datatype, m, m, 0, 0, &C ); FLA_Random_matrix( C ); if ( is_flame ) { fprintf( stdout, " flame executed\n"); FLA_Bidiag_UT_create_T( A_flame, &TU, &TV ); FLA_Bidiag_UT( A_flame, TU, TV ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A_flame, &U_flame ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A_flame, &V_flame ); FLA_Bidiag_UT_form_U( U_flame, TU, U_flame ); FLA_Bidiag_UT_form_V( V_flame, TV, V_flame ); FLA_Obj_create( datatype, m, 1, 0, 0, &d_flame ); FLA_Obj_create( datatype, m - 1, 1, 0, 0, &e_flame ); FLA_Bidiag_UT_extract_diagonals( A_flame, d_flame, e_flame ); FLA_Obj_create( datatype, m, m, 0, 0, &B_flame ); FLA_Set( FLA_ZERO, B_flame ); { FLA_Obj BTL, BTR, BBL, BBR; FLA_Part_2x2( B_flame, &BTL, &BTR, &BBL, &BBR, 1,1, FLA_BL ); FLA_Set_diagonal_matrix( d_flame, B_flame ); FLA_Set_diagonal_matrix( e_flame, BTR ); } if (1) { fprintf( stdout, " - FLAME ----------\n"); FLA_Obj_fshow( stdout, " - Given A - ", A, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - A - ", A_flame, "% 6.4e", 
"------"); FLA_Obj_fshow( stdout, " - U - ", U_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - V - ", V_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - d - ", d_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - e - ", e_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - B - ", B_flame, "% 6.4e", "------"); } } else { fprintf( stdout, " lapack executed\n"); FLA_Obj_create( datatype, m, 1, 0, 0, &tauq ); FLA_Obj_create( datatype, m, 1, 0, 0, &taup ); FLA_Obj_create( datatype, m, 1, 0, 0, &d_lapack ); FLA_Obj_create( datatype, m - 1, 1, 0, 0, &e_lapack ); buff_A_lapack = (testtype*)FLA_Obj_buffer_at_view( A_lapack ); buff_tauq = (testtype*)FLA_Obj_buffer_at_view( tauq ); buff_taup = (testtype*)FLA_Obj_buffer_at_view( taup ); buff_d_lapack = (testtype*)FLA_Obj_buffer_at_view( d_lapack ); buff_e_lapack = (testtype*)FLA_Obj_buffer_at_view( e_lapack ); lwork = 32*m; FLA_Obj_create( datatype, lwork, 1, 0, 0, &W ); buff_W = (testtype*)FLA_Obj_buffer_at_view( W ); sgebrd_( &m, &m, buff_A_lapack, &m, buff_d_lapack, buff_e_lapack, buff_tauq, buff_taup, buff_W, &lwork, &info ); FLA_Obj_create( datatype, m, m, 0, 0, &U_lapack ); FLA_Obj_create( datatype, m, m, 0, 0, &V_lapack ); FLA_Copy( A_lapack, U_lapack ); FLA_Copy( A_lapack, V_lapack ); buff_U_lapack = (testtype*)FLA_Obj_buffer_at_view( U_lapack ); buff_V_lapack = (testtype*)FLA_Obj_buffer_at_view( V_lapack ); sorgbr_( "Q", &m, &m, &m, buff_U_lapack, &m, buff_tauq, buff_W, &lwork, &info ); sorgbr_( "P", &m, &m, &m, buff_V_lapack, &m, buff_taup, buff_W, &lwork, &info ); FLA_Obj_create( datatype, m, m, 0, 0, &B_lapack ); FLA_Set( FLA_ZERO, B_lapack ); { FLA_Obj BTL, BTR, BBL, BBR; FLA_Part_2x2( B_lapack, &BTL, &BTR, &BBL, &BBR, 1,1, FLA_BL ); FLA_Set_diagonal_matrix( d_lapack, B_lapack ); FLA_Set_diagonal_matrix( e_lapack, BTR ); } FLA_Obj_free( &W ); if (1) { fprintf( stdout, " - LAPACK ----------\n"); FLA_Obj_fshow( stdout, " - Given A - ", A, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - A - ", 
A_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - U - ", U_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - V - ", V_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - d - ", d_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - e - ", e_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - B - ", B_lapack, "% 6.4e", "------"); } } { testtype dummy; int zero = 0, one = 1; FLA_Obj D_lapack; FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &D_lapack ); FLA_Set( FLA_ZERO, D_lapack ); if ( is_flame ) { buff_d_lapack = (testtype*)FLA_Obj_buffer_at_view( d_flame ); buff_e_lapack = (testtype*)FLA_Obj_buffer_at_view( e_flame ); buff_U_lapack = (testtype*)FLA_Obj_buffer_at_view( U_flame ); buff_V_lapack = (testtype*)FLA_Obj_buffer_at_view( V_flame ); } FLA_Obj_create( datatype, 4*m, 1, 0, 0, &W ); buff_W = (testtype*)FLA_Obj_buffer_at_view( W ); sbdsqr_( "U", &m, &m, &m, &zero, buff_d_lapack, buff_e_lapack, buff_V_lapack, &m, buff_U_lapack, &m, &dummy, &one, buff_W, &info ); FLA_Obj_free( &W ); if (info != 0) printf( " Error info = %d\n", info ); if ( is_flame ) FLA_Set_diagonal_matrix( d_flame, D_lapack ); else FLA_Set_diagonal_matrix( d_lapack, D_lapack ); if ( is_flame ) { fprintf( stdout, " - FLAME ----------\n"); FLA_Obj_fshow( stdout, " - U - ", U_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - V - ", V_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - d - ", d_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - e - ", e_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - D - ", D_lapack, "% 6.4e", "------"); } else { fprintf( stdout, " - LAPACK ----------\n"); FLA_Obj_fshow( stdout, " - U - ", U_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - V - ", V_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - d - ", d_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - e - ", e_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - D - ", D_lapack, "% 6.4e", "------"); } FLA_Obj_free( &D_lapack ); } if ( is_flame ) { 
FLA_Obj_free( &TU ); FLA_Obj_free( &TV ); FLA_Obj_free( &U_flame ); FLA_Obj_free( &V_flame ); FLA_Obj_free( &d_flame ); FLA_Obj_free( &e_flame ); FLA_Obj_free( &B_flame ); } else { FLA_Obj_free( &tauq ); FLA_Obj_free( &taup ); FLA_Obj_free( &d_lapack ); FLA_Obj_free( &e_lapack ); FLA_Obj_free( &U_lapack ); FLA_Obj_free( &V_lapack ); FLA_Obj_free( &B_lapack ); } FLA_Obj_free( &A ); FLA_Obj_free( &A_flame ); FLA_Obj_free( &A_lapack ); FLA_Obj_free( &C ); FLA_Finalize_safe( init_result ); }
/*
 * time_Sylv_nn
 *
 * Times one Sylvester-solver implementation (the reference code or one of
 * FLAME variants 1..18) and reports the best wall-clock time, the
 * element-wise difference against a reference result, and the GFLOPS rate.
 *
 * Parameters
 *   variant    0 selects REF_Sylv_nn; 1..18 select FLA_Sylv_nn_{opt,blk}_varN.
 *              Any other value invokes no solver, but the timing loop still
 *              runs.
 *   type       FLA_ALG_UNB_OPT or FLA_ALG_BLOCKED for variants 1..18; any
 *              other value prints "trouble".
 *   n_repeats  Number of timed repetitions; the minimum time is kept.  If it
 *              is <= 0 the reported time stays at the 1.0e9 sentinel.
 *   m, n       Problem dimensions; used here only for the flop count.
 *   nb_alg     Block size placed in all four slots of the blocksize object
 *              handed to the blocked variants.
 *   isgn, A, B, C, scale
 *              Operands forwarded unchanged to the solver.  C is restored to
 *              its entry contents before returning.
 *   C_ref      Reference result: overwritten with C when variant == 0,
 *              otherwise compared against C.
 *   dtime      [out] best time over all repetitions.
 *   diff       [out] 0.0 for the reference run, otherwise
 *              FLA_Max_elemwise_diff( C, C_ref ).
 *   gflops     [out] ( m*m*n + n*n*m ) / time / 1e9, times 4 for complex C.
 */
void time_Sylv_nn( int variant, int type, int n_repeats, int m, int n, int nb_alg,
                   FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref, FLA_Obj scale,
                   double *dtime, double *diff, double *gflops )
{
  int irep;
  double dtime_old = 1.0e9;
  FLA_Obj C_old;
  fla_blocksize_t* bp;
  fla_sylv_t* cntl_sylv_var;
  fla_sylv_t* cntl_sylv_unb;
  fla_gemm_t* cntl_gemm_blas;

/*
  if( type == FLA_ALG_UNBLOCKED && n > 400 )
  {
    *gflops = 0.0;
    *diff   = 0.0;
    return;
  }
*/

  /* Build the control tree used by the blocked variants. */
  bp = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
  cntl_sylv_unb = FLA_Cntl_sylv_obj_create( FLA_FLAT, FLA_UNB_OPT_VARIANT1,
                                            NULL, NULL, NULL, NULL, NULL, NULL,
                                            NULL, NULL, NULL, NULL, NULL, NULL );
  cntl_gemm_blas = FLA_Cntl_gemm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL );
  cntl_sylv_var = FLA_Cntl_sylv_obj_create( FLA_FLAT, variant, bp,
                                            cntl_sylv_unb, cntl_sylv_unb, cntl_sylv_unb,
                                            cntl_gemm_blas, cntl_gemm_blas, cntl_gemm_blas,
                                            cntl_gemm_blas, cntl_gemm_blas, cntl_gemm_blas,
                                            cntl_gemm_blas, cntl_gemm_blas );

  /* Keep a pristine copy of C so every repetition starts from the same data. */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_old );
  FLA_Copy_external( C, C_old );

  for ( irep = 0 ; irep < n_repeats; irep++ ){
    FLA_Copy_external( C_old, C );

    *dtime = FLA_Clock();

    switch( variant ){
    case 0:
      /* Time reference implementation */
      REF_Sylv_nn( isgn, A, B, C, scale );
      break;

    case 1:
      switch( type ){
      case FLA_ALG_UNB_OPT: FLA_Sylv_nn_opt_var1( isgn, A, B, C, scale ); break;
      case FLA_ALG_BLOCKED: FLA_Sylv_nn_blk_var1( isgn, A, B, C, scale, cntl_sylv_var ); break;
      default: printf("trouble\n");
      }
      break;

    case 2:
      switch( type ){
      case FLA_ALG_UNB_OPT: FLA_Sylv_nn_opt_var2( isgn, A, B, C, scale ); break;
      case FLA_ALG_BLOCKED: FLA_Sylv_nn_blk_var2( isgn, A, B, C, scale, cntl_sylv_var ); break;
      default: printf("trouble\n");
      }
      break;

    case 3:
      switch( type ){
      case FLA_ALG_UNB_OPT: FLA_Sylv_nn_opt_var3( isgn, A, B, C, scale ); break;
      case FLA_ALG_BLOCKED: FLA_Sylv_nn_blk_var3( isgn, A, B, C, scale, cntl_sylv_var ); break;
      default: printf("trouble\n");
      }
      break;

    case 4:
      switch( type ){
      case FLA_ALG_UNB_OPT: FLA_Sylv_nn_opt_var4( isgn, A, B, C, scale ); break;
      case FLA_ALG_BLOCKED: FLA_Sylv_nn_blk_var4( isgn, A, B, C, scale, cntl_sylv_var ); break;
      default: printf("trouble\n");
      }
      break;

    case 5:
      switch( type ){
      case FLA_ALG_UNB_OPT: FLA_Sylv_nn_opt_var5( isgn, A, B, C, scale ); break;
      case FLA_ALG_BLOCKED: FLA_Sylv_nn_blk_var5( isgn, A, B, C, scale, cntl_sylv_var ); break;
      default: printf("trouble\n");
      }
      break;

    case 6:
      switch( type ){
      case FLA_ALG_UNB_OPT: FLA_Sylv_nn_opt_var6( isgn, A, B, C, scale ); break;
      case FLA_ALG_BLOCKED: FLA_Sylv_nn_blk_var6( isgn, A, B, C, scale, cntl_sylv_var ); break;
      default: printf("trouble\n");
      }
      break;

    case 7:
      switch( type ){
      case FLA_ALG_UNB_OPT: FLA_Sylv_nn_opt_var7( isgn, A, B, C, scale ); break;
      case FLA_ALG_BLOCKED: FLA_Sylv_nn_blk_var7( isgn, A, B, C, scale, cntl_sylv_var ); break;
      default: printf("trouble\n");
      }
      break;

    case 8:
      switch( type ){
      case FLA_ALG_UNB_OPT: FLA_Sylv_nn_opt_var8( isgn, A, B, C, scale ); break;
      case FLA_ALG_BLOCKED: FLA_Sylv_nn_blk_var8( isgn, A, B, C, scale, cntl_sylv_var ); break;
      default: printf("trouble\n");
      }
      break;

    case 9:
      switch( type ){
      case FLA_ALG_UNB_OPT: FLA_Sylv_nn_opt_var9( isgn, A, B, C, scale ); break;
      case FLA_ALG_BLOCKED: FLA_Sylv_nn_blk_var9( isgn, A, B, C, scale, cntl_sylv_var ); break;
      default: printf("trouble\n");
      }
      break;

    case 10:
      switch( type ){
      case FLA_ALG_UNB_OPT: FLA_Sylv_nn_opt_var10( isgn, A, B, C, scale ); break;
      case FLA_ALG_BLOCKED: FLA_Sylv_nn_blk_var10( isgn, A, B, C, scale, cntl_sylv_var ); break;
      default: printf("trouble\n");
      }
      break;

    case 11:
      switch( type ){
      case FLA_ALG_UNB_OPT: FLA_Sylv_nn_opt_var11( isgn, A, B, C, scale ); break;
      case FLA_ALG_BLOCKED: FLA_Sylv_nn_blk_var11( isgn, A, B, C, scale, cntl_sylv_var ); break;
      default: printf("trouble\n");
      }
      break;

    case 12:
      switch( type ){
      case FLA_ALG_UNB_OPT: FLA_Sylv_nn_opt_var12( isgn, A, B, C, scale ); break;
      case FLA_ALG_BLOCKED: FLA_Sylv_nn_blk_var12( isgn, A, B, C, scale, cntl_sylv_var ); break;
      default: printf("trouble\n");
      }
      break;

    case 13:
      switch( type ){
      case FLA_ALG_UNB_OPT: FLA_Sylv_nn_opt_var13( isgn, A, B, C, scale ); break;
      case FLA_ALG_BLOCKED: FLA_Sylv_nn_blk_var13( isgn, A, B, C, scale, cntl_sylv_var ); break;
      default: printf("trouble\n");
      }
      break;

    case 14:
      switch( type ){
      case FLA_ALG_UNB_OPT: FLA_Sylv_nn_opt_var14( isgn, A, B, C, scale ); break;
      case FLA_ALG_BLOCKED: FLA_Sylv_nn_blk_var14( isgn, A, B, C, scale, cntl_sylv_var ); break;
      default: printf("trouble\n");
      }
      break;

    case 15:
      switch( type ){
      case FLA_ALG_UNB_OPT: FLA_Sylv_nn_opt_var15( isgn, A, B, C, scale ); break;
      case FLA_ALG_BLOCKED: FLA_Sylv_nn_blk_var15( isgn, A, B, C, scale, cntl_sylv_var ); break;
      default: printf("trouble\n");
      }
      break;

    case 16:
      switch( type ){
      case FLA_ALG_UNB_OPT: FLA_Sylv_nn_opt_var16( isgn, A, B, C, scale ); break;
      case FLA_ALG_BLOCKED: FLA_Sylv_nn_blk_var16( isgn, A, B, C, scale, cntl_sylv_var ); break;
      default: printf("trouble\n");
      }
      break;

    case 17:
      switch( type ){
      case FLA_ALG_UNB_OPT: FLA_Sylv_nn_opt_var17( isgn, A, B, C, scale ); break;
      case FLA_ALG_BLOCKED: FLA_Sylv_nn_blk_var17( isgn, A, B, C, scale, cntl_sylv_var ); break;
      default: printf("trouble\n");
      }
      break;

    case 18:
      switch( type ){
      case FLA_ALG_UNB_OPT: FLA_Sylv_nn_opt_var18( isgn, A, B, C, scale ); break;
      case FLA_ALG_BLOCKED: FLA_Sylv_nn_blk_var18( isgn, A, B, C, scale, cntl_sylv_var ); break;
      default: printf("trouble\n");
      }
      break;
    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  FLA_Cntl_obj_free( cntl_sylv_var );
  FLA_Cntl_obj_free( cntl_sylv_unb );
  FLA_Cntl_obj_free( cntl_gemm_blas );
  FLA_Blocksize_free( bp );

  if ( variant == 0 ){
    /* The reference run defines the expected answer for later variants. */
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }
  else{
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }

  /* Accumulate the flop count in double: the int product m*m*n + n*n*m
     overflows (undefined behaviour) as soon as m = n >= 1024. */
  *gflops = ( (double) m * m * n + (double) n * n * m ) / dtime_old / 1e9;

  if ( FLA_Obj_is_complex( C ) )
    *gflops *= 4.0;

  *dtime = dtime_old;

  /* Hand C back to the caller with its original contents. */
  FLA_Copy_external( C_old, C );
  FLA_Obj_free( &C_old );
}
/*
 * libfla_test_apqut_experiment
 *
 * Runs one Apply_Q_UT experiment: builds a random matrix A, factors it with
 * QR_UT (column-wise storage) or LQ_UT (row-wise storage), applies Q to an
 * identity matrix n_repeats times through libfla_test_apqut_impl(), and
 * reports the best performance and the 1-norm residual of the result times
 * its conjugate transpose, minus the identity.
 *
 * params     supplies b_flash (hierarchical block size) and b_alg_flat
 *            (row dimension of T).
 * var        not used by this routine.
 * sc_str     sc_str[0..2] are passed to libfla_test_obj_create() for A, T, B.
 * datatype   datatype of the test matrices.
 * p_cur      current problem size.
 * pci        index into pc_str (not declared in this function) selecting the
 *            side / trans / direct / storev characters.
 * n_repeats  number of timed repetitions; the minimum time is used.
 * impl       FLA_TEST_HIER_FRONT_END selects the FLASH path; anything else
 *            takes the flat path.
 * perf       [out] performance of the fastest repetition.
 * residual   [out] real scalar extracted from the 1-norm computed above.
 */
void libfla_test_apqut_experiment( test_params_t params,
                                   unsigned int  var,
                                   char*         sc_str,
                                   FLA_Datatype  datatype,
                                   unsigned int  p_cur,
                                   unsigned int  pci,
                                   unsigned int  n_repeats,
                                   signed int    impl,
                                   double*       perf,
                                   double*       residual )
{
	dim_t        b_flash    = params.b_flash;
	dim_t        b_alg_flat = params.b_alg_flat;
	double       best_time  = 1e9;
	double       elapsed;
	unsigned int rep;
	unsigned int m, n;
	unsigned int min_m_n;
	unsigned int b_dim;
	signed int   m_input;
	signed int   n_input;
	int          use_hier;
	int          colwise;
	FLA_Side     side;
	FLA_Trans    trans;
	FLA_Direct   direct;
	FLA_Store    storev;
	FLA_Obj      A, T, W, B, eye, norm;
	FLA_Obj      B_save;
	FLA_Obj      A_test, T_test, W_test, B_test;

	/* Map the four parameter characters onto libflame constants. */
	FLA_Param_map_char_to_flame_side( &pc_str[pci][0], &side );
	FLA_Param_map_char_to_flame_trans( &pc_str[pci][1], &trans );
	FLA_Param_map_char_to_flame_direct( &pc_str[pci][2], &direct );
	FLA_Param_map_char_to_flame_storev( &pc_str[pci][3], &storev );

	use_hier = ( impl == FLA_TEST_HIER_FRONT_END );
	colwise  = ( storev == FLA_COLUMNWISE );

	/* Dimension multipliers.  A negative value scales p_cur by its magnitude.
	   Both storage schemes currently use -1 for each, so m = n = p_cur. */
	m_input = -1;
	n_input = -1;

	m = ( m_input < 0 ) ? p_cur * abs(m_input) : p_cur;
	n = ( n_input < 0 ) ? p_cur * abs(n_input) : p_cur;

	min_m_n = min( m, n );

	/* B is square: m x m for column-wise storage, n x n for row-wise. */
	b_dim = colwise ? m : n;

	/* Allocate the flat operands. */
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, n, &A );
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], b_alg_flat, min_m_n, &T );
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], b_dim, b_dim, &B );

	FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, B, &eye );

	FLA_Apply_Q_UT_create_workspace( T, B, &W );

	/* 1x1 real scalar that will receive the residual norm. */
	FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

	/* Fill in the inputs: random A, identity B and eye. */
	FLA_Random_matrix( A );
	FLA_Set_to_identity( B );
	FLA_Set_to_identity( eye );

	/* Snapshot B so each repetition can start from the same contents. */
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, B, &B_save );

	/* Pick the objects actually handed to the routine under test. */
	if ( use_hier )
	{
		if ( colwise )
			FLASH_QR_UT_create_hier_matrices( A, 1, &b_flash, &A_test, &T_test );
		else
			FLASH_LQ_UT_create_hier_matrices( A, 1, &b_flash, &A_test, &T_test );

		FLASH_Obj_create_hier_copy_of_flat( B, 1, &b_flash, &B_test );
		FLASH_Apply_Q_UT_create_workspace( T_test, B_test, &W_test );
	}
	else
	{
		A_test = A;
		T_test = T;
		W_test = W;
		B_test = B;
	}

	/* Householder factorization of A (QR for column-wise, LQ for row-wise). */
	if ( use_hier )
	{
		if ( colwise ) FLASH_QR_UT( A_test, T_test );
		else           FLASH_LQ_UT( A_test, T_test );
	}
	else
	{
		if ( colwise ) FLA_QR_UT( A_test, T_test );
		else           FLA_LQ_UT( A_test, T_test );
	}

	/* Timed repetitions; keep the fastest. */
	for ( rep = 0; rep < n_repeats; ++rep )
	{
		if ( use_hier )
			FLASH_Obj_hierarchify( B_save, B_test );
		else
			FLA_Copy_external( B_save, B_test );

		elapsed = FLA_Clock();

		libfla_test_apqut_impl( impl, side, trans, direct, storev,
		                        A_test, T_test, W_test, B_test );

		elapsed = FLA_Clock() - elapsed;
		best_time = min( best_time, elapsed );
	}

	/* eye := B * B' - eye.  The hierarchical result is flattened into B first;
	   on the flat path B_test already is B. */
	if ( use_hier )
		FLASH_Obj_flatten( B_test, B );

	FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
	                   FLA_ONE, B, B, FLA_MINUS_ONE, eye );

	/* The hierarchical objects are only owned on the FLASH path. */
	if ( use_hier )
	{
		FLASH_Obj_free( &A_test );
		FLASH_Obj_free( &T_test );
		FLASH_Obj_free( &W_test );
		FLASH_Obj_free( &B_test );
	}

	/* Residual = 1-norm of eye. */
	FLA_Norm1( eye, norm );
	FLA_Obj_extract_real_scalar( norm, residual );

	/* Performance of the best repetition; complex arithmetic counts 4x. */
	*perf = (  4.0 *       m * min_m_n * n -
	           2.0 * min_m_n * min_m_n * n ) / best_time / FLOPS_PER_UNIT_PERF;
	if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0;

	/* Release the saved copy, then the flat test objects. */
	FLA_Obj_free( &B_save );

	FLA_Obj_free( &A );
	FLA_Obj_free( &T );
	FLA_Obj_free( &W );
	FLA_Obj_free( &B );
	FLA_Obj_free( &eye );
	FLA_Obj_free( &norm );
}
/*
 * FLA_Lyap_n_unb_var4
 *
 * Unblocked Lyapunov solve (no-transpose case), variant 4.  C is first scaled
 * by isgn and then updated in place while A and C are traversed from the
 * bottom-right corner to the top-left, one row/column per iteration.  A
 * workspace W conformal to A holds a shifted copy of the upper triangle of
 * A00, and a 1x1 scalar holds alpha11 + conj(alpha11).
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Lyap_n_unb_var4( FLA_Obj isgn, FLA_Obj A, FLA_Obj C )
{
  /* Views of A. */
  FLA_Obj ATL, ATR,    A00,  a01,     A02,
          ABL, ABR,    a10t, alpha11, a12t,
                       A20,  a21,     A22;

  /* Views of C. */
  FLA_Obj CTL, CTR,    C00,  c01,     C02,
          CBL, CBR,    c10t, gamma11, c12t,
                       C20,  c21,     C22;

  /* Views of the workspace W. */
  FLA_Obj WTL, WTR,    W00,  w01,     W02,
          WBL, WBR,    w10t, omega11, w12t,
                       W20,  w21,     W22;

  FLA_Obj W;      /* workspace, same shape as A     */
  FLA_Obj denom;  /* 1x1 scalar: alpha11 + alpha11' */

  FLA_Scal( isgn, C );

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &W );
  FLA_Obj_create( FLA_Obj_datatype( A ), 1, 1, 0, 0, &denom );

  /* Start with an empty bottom-right quadrant in every operand. */
  FLA_Part_2x2( A, &ATL, &ATR,
                   &ABL, &ABR, 0, 0, FLA_BR );
  FLA_Part_2x2( C, &CTL, &CTR,
                   &CBL, &CBR, 0, 0, FLA_BR );
  FLA_Part_2x2( W, &WTL, &WTR,
                   &WBL, &WBR, 0, 0, FLA_BR );

  while ( 0 < FLA_Obj_length( CTL ) )
  {
    /* Expose the next 1x1 diagonal element from the top-left quadrant. */
    FLA_Repart_2x2_to_3x3( ATL, ATR,   &A00,  &a01,     &A02,
                                       &a10t, &alpha11, &a12t,
                           ABL, ABR,   &A20,  &a21,     &A22,
                           1, 1, FLA_TL );
    FLA_Repart_2x2_to_3x3( CTL, CTR,   &C00,  &c01,     &C02,
                                       &c10t, &gamma11, &c12t,
                           CBL, CBR,   &C20,  &c21,     &C22,
                           1, 1, FLA_TL );
    FLA_Repart_2x2_to_3x3( WTL, WTR,   &W00,  &w01,     &W02,
                                       &w10t, &omega11, &w12t,
                           WBL, WBR,   &W20,  &w21,     &W22,
                           1, 1, FLA_TL );

    /* ---- update ---------------------------------------------------- */

    /* gamma11 := gamma11 / ( alpha11 + alpha11' ) */
    FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, alpha11, denom );
    FLA_Mult_add( FLA_ONE, alpha11, denom );
    FLA_Inv_scal( denom, gamma11 );

    /* c01 := c01 - a01 * gamma11 */
    FLA_Axpys( FLA_MINUS_ONE, gamma11, a01, FLA_ONE, c01 );

    /* c01 := inv( triu(A00) + conj(alpha11) * I ) * c01, using W00 as the
       shifted triangular matrix so that A00 itself is left untouched. */
    FLA_Copyrt( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, A00, W00 );
    FLA_Shift_diag( FLA_CONJUGATE, alpha11, W00 );
    FLA_Trsv( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, W00, c01 );

    /* C00 := C00 - a01 * c01' - c01 * a01' */
    FLA_Her2( FLA_UPPER_TRIANGULAR, FLA_MINUS_ONE, a01, c01, C00 );

    /* ---------------------------------------------------------------- */

    /* Fold the processed element into the bottom-right quadrant. */
    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR,   A00,  a01,     A02,
                                            a10t, alpha11, a12t,
                              &ABL, &ABR,   A20,  a21,     A22,
                              FLA_BR );
    FLA_Cont_with_3x3_to_2x2( &CTL, &CTR,   C00,  c01,     C02,
                                            c10t, gamma11, c12t,
                              &CBL, &CBR,   C20,  c21,     C22,
                              FLA_BR );
    FLA_Cont_with_3x3_to_2x2( &WTL, &WTR,   W00,  w01,     W02,
                                            w10t, omega11, w12t,
                              &WBL, &WBR,   W20,  w21,     W22,
                              FLA_BR );
  }

  FLA_Obj_free( &W );
  FLA_Obj_free( &denom );

  return FLA_SUCCESS;
}
/*
 * time_Chol_u
 *
 * Times one upper-triangular Cholesky implementation (the reference code or
 * FLAME variants 1..3), then checks the factor by solving with it and
 * measuring the residual.
 *
 * Parameters
 *   variant   0 selects REF_Chol_u; 1..3 select FLA_Chol_u_{unb,opt,blk}_varN.
 *             Any other value invokes no factorization.
 *   type      FLA_ALG_UNBLOCKED, FLA_ALG_UNB_OPT or FLA_ALG_BLOCKED for
 *             variants 1..3; any other value prints "trouble".
 *   nrepeats  Number of timed repetitions; the minimum time is kept.
 *   n         Not used by this routine (the flop count uses FLA_Obj_length).
 *   nb_alg    Block size placed in all four slots of the blocksize object.
 *   A         Matrix to factor.  Restored to its entry contents on return.
 *   b, b_orig Right-hand side and its reference copy; both are overwritten
 *             during the residual check and restored on return.
 *   norm      Scalar object that receives the 2-norm of the residual.
 *   dtime     [out] best time over all repetitions.
 *   diff      [out] value copied out of norm.
 *   gflops    [out] (1/3) * len(A)^3 / time / 1e9, times 4 for complex A.
 */
void time_Chol_u( int variant, int type, int nrepeats, int n, int nb_alg,
                  FLA_Obj A, FLA_Obj b, FLA_Obj b_orig, FLA_Obj norm,
                  double *dtime, double *diff, double *gflops )
{
  int irep;
  double dtime_save = 1.0e9;
  FLA_Obj A_save, b_save, b_orig_save;
  fla_blocksize_t* bp;
  fla_chol_t* cntl_chol_var;
  fla_chol_t* cntl_chol_unb;
  fla_syrk_t* cntl_syrk_blas;
  fla_herk_t* cntl_herk_blas;
  fla_trsm_t* cntl_trsm_blas;
  fla_gemm_t* cntl_gemm_blas;

/*
  if( type == FLA_ALG_UNBLOCKED && n > 400 )
  {
    *gflops = 0.0;
    *diff   = 0.0;
    return;
  }
*/

  /* Build the control tree used by the blocked variants. */
  bp = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
  cntl_chol_unb  = FLA_Cntl_chol_obj_create( FLA_FLAT, FLA_UNB_OPT_VARIANT2,
                                             NULL, NULL, NULL, NULL, NULL, NULL );
  cntl_syrk_blas = FLA_Cntl_syrk_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL );
  cntl_herk_blas = FLA_Cntl_herk_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL );
  cntl_trsm_blas = FLA_Cntl_trsm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL );
  cntl_gemm_blas = FLA_Cntl_gemm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL );
  cntl_chol_var  = FLA_Cntl_chol_obj_create( FLA_FLAT, variant, bp,
                                             cntl_chol_unb, cntl_syrk_blas, cntl_herk_blas,
                                             cntl_trsm_blas, cntl_gemm_blas );

  /* Snapshot every operand the routine overwrites. */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, b, &b_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, b_orig, &b_orig_save );

  FLA_Copy_external( A, A_save );
  FLA_Copy_external( b, b_save );
  FLA_Copy_external( b_orig, b_orig_save );

  for ( irep = 0 ; irep < nrepeats; irep++ ){
    FLA_Copy_external( A_save, A );

    *dtime = FLA_Clock();

    switch( variant ){
    case 0:
      REF_Chol_u( A );
      break;

    case 1:
      switch( type ){
      case FLA_ALG_UNBLOCKED: FLA_Chol_u_unb_var1( A ); break;
      case FLA_ALG_UNB_OPT:   FLA_Chol_u_opt_var1( A ); break;
      case FLA_ALG_BLOCKED:   FLA_Chol_u_blk_var1( A, cntl_chol_var ); break;
      default: printf("trouble\n");
      }
      break;

    case 2:
      switch( type ){
      case FLA_ALG_UNBLOCKED: FLA_Chol_u_unb_var2( A ); break;
      case FLA_ALG_UNB_OPT:   FLA_Chol_u_opt_var2( A ); break;
      case FLA_ALG_BLOCKED:   FLA_Chol_u_blk_var2( A, cntl_chol_var ); break;
      default: printf("trouble\n");
      }
      break;

    case 3:
      switch( type ){
      case FLA_ALG_UNBLOCKED: FLA_Chol_u_unb_var3( A ); break;
      case FLA_ALG_UNB_OPT:   FLA_Chol_u_opt_var3( A ); break;
      case FLA_ALG_BLOCKED:   FLA_Chol_u_blk_var3( A, cntl_chol_var ); break;
      default: printf("trouble\n");
      }
      break;
    }

    *dtime = FLA_Clock() - *dtime;
    dtime_save = min( *dtime, dtime_save );
  }

  FLA_Cntl_obj_free( cntl_chol_var );
  FLA_Cntl_obj_free( cntl_chol_unb );
  FLA_Cntl_obj_free( cntl_syrk_blas );
  FLA_Cntl_obj_free( cntl_herk_blas );
  FLA_Cntl_obj_free( cntl_trsm_blas );
  FLA_Cntl_obj_free( cntl_gemm_blas );
  FLA_Blocksize_free( bp );

  /* Residual check, identical for the reference and the FLAME variants
     (the original had two byte-identical branches on type).
     Solve U' * U * x = b with the computed factor U held in the upper
     triangle of A.  Both solves must honour U's diagonal: a Cholesky factor
     is not unit-triangular, so FLA_UNIT_DIAG in the first solve would
     discard the diagonal and corrupt the residual. */
  FLA_Trsv_external( FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE,
                     FLA_NONUNIT_DIAG, A, b );
  FLA_Trsv_external( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE,
                     FLA_NONUNIT_DIAG, A, b );

  /* b_orig := A_save * x - b_orig, then take its 2-norm. */
  FLA_Hemv_external( FLA_UPPER_TRIANGULAR, FLA_ONE, A_save, b,
                     FLA_MINUS_ONE, b_orig );
  FLA_Nrm2_external( b_orig, norm );
  FLA_Copy_object_to_buffer( FLA_NO_TRANSPOSE, 0, 0, norm, 1, 1, diff, 1, 1 );

  *gflops = 1.0 / 3.0 *
            FLA_Obj_length( A ) *
            FLA_Obj_length( A ) *
            FLA_Obj_length( A ) /
            dtime_save / 1e9;

  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 4.0;

  *dtime = dtime_save;

  /* Restore the caller's operands and drop the snapshots. */
  FLA_Copy_external( A_save, A );
  FLA_Copy_external( b_save, b );
  FLA_Copy_external( b_orig_save, b_orig );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &b_save );
  FLA_Obj_free( &b_orig_save );
}