/*
 * FLA_Axpy_sync_circular
 *
 * Performs B := alpha * X + B one column panel at a time. Each panel update
 * is bracketed by acquiring and releasing the matching entry of
 * fla_omp_lock[]. The calling thread begins at the panel whose index equals
 * its OpenMP thread number, moves right one panel per stage, and wraps back
 * to the first panel once the right edge of X has been reached.
 *
 * Exactly FLA_omp_get_num_stages() panel updates are attempted.
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Axpy_sync_circular( FLA_Obj alpha, FLA_Obj X, FLA_Obj B )
{
  FLA_Obj XL, XR, X0, X1, X2;
  FLA_Obj BL, BR, B0, B1, B2;

  int n_stages    = FLA_omp_get_num_stages();
  int stage_width = FLA_omp_compute_stage_width( X );
  int thread_num  = omp_get_thread_num();
  int stage;
  int panel_width;
  int lock_index;

  /* Position this thread at the left edge of "its" panel. */
  FLA_Part_1x2( X, &XL, &XR, stage_width * thread_num, FLA_LEFT );
  FLA_Part_1x2( B, &BL, &BR, stage_width * thread_num, FLA_LEFT );

  for ( stage = 0; stage < n_stages; ++stage )
  {
    /* The right-most panel may be narrower than a full stage. */
    panel_width = min( FLA_Obj_width( XR ), stage_width );

    FLA_Repart_1x2_to_1x3( XL, XR, &X0, &X1, &X2, panel_width, FLA_RIGHT );
    FLA_Repart_1x2_to_1x3( BL, BR, &B0, &B1, &B2, panel_width, FLA_RIGHT );

    /* The panel index doubles as the index of the lock guarding X1/B1. */
    lock_index = FLA_Obj_width( XL ) / stage_width;

    omp_set_lock( &fla_omp_lock[lock_index] );

    /* B1 := alpha * X1 + B1 */
    FLA_Axpy_external( alpha, X1, B1 );

    omp_unset_lock( &fla_omp_lock[lock_index] );

    FLA_Cont_with_1x3_to_1x2( &XL, &XR, X0, X1, X2, FLA_LEFT );
    FLA_Cont_with_1x3_to_1x2( &BL, &BR, B0, B1, B2, FLA_LEFT );

    /* Past the last panel: restart from the left edge for the next stage. */
    if ( FLA_Obj_width( XL ) == FLA_Obj_width( X ) )
    {
      FLA_Part_1x2( X, &XL, &XR, 0, FLA_LEFT );
      FLA_Part_1x2( B, &BL, &BR, 0, FLA_LEFT );
    }
  }

  return FLA_SUCCESS;
}
/*
 * FLA_Axpy_sync_pipeline
 *
 * Performs B := alpha * X + B by sweeping the column panels of X and B from
 * left to right. Every panel update is done while holding the entry of
 * fla_omp_lock[] whose index equals the panel index.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Axpy_sync_pipeline( FLA_Obj alpha, FLA_Obj X, FLA_Obj B )
{
  FLA_Obj XL, XR, X0, X1, X2;
  FLA_Obj BL, BR, B0, B1, B2;

  int nb_alg;
  int panel_width;
  int lock_index;

  FLA_Part_1x2( X, &XL, &XR, 0, FLA_LEFT );
  FLA_Part_1x2( B, &BL, &BR, 0, FLA_LEFT );

  /* Width of one lockable panel. */
  nb_alg = FLA_omp_compute_stage_width( X );

  while ( FLA_Obj_width( XL ) < FLA_Obj_width( X ) )
  {
    /* Clamp to whatever is left so the final panel may be narrower. */
    panel_width = min( FLA_Obj_width( XR ), nb_alg );

    FLA_Repart_1x2_to_1x3( XL, XR, &X0, &X1, &X2, panel_width, FLA_RIGHT );
    FLA_Repart_1x2_to_1x3( BL, BR, &B0, &B1, &B2, panel_width, FLA_RIGHT );

    /* Index of the current panel, which is also the index of its lock. */
    lock_index = FLA_Obj_width( XL ) / nb_alg;

    omp_set_lock( &fla_omp_lock[lock_index] );

    /* B1 := alpha * X1 + B1 */
    FLA_Axpy_external( alpha, X1, B1 );

    omp_unset_lock( &fla_omp_lock[lock_index] );

    FLA_Cont_with_1x3_to_1x2( &XL, &XR, X0, X1, X2, FLA_LEFT );
    FLA_Cont_with_1x3_to_1x2( &BL, &BR, B0, B1, B2, FLA_LEFT );
  }

  return FLA_SUCCESS;
}
/*
 * FLA_Trmvsx_external
 *
 * Computes y := beta * y + alpha * op(A) * x, where the triangular product
 * op(A) * x is formed in a temporary copy of x so that x itself is left
 * untouched.
 *
 * Parameter checking (FLA_Trmvsx_check) runs only when the library is at
 * FLA_FULL_ERROR_CHECKING. When A has a zero dimension the function returns
 * immediately without modifying y.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Trmvsx_external( FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag,
                               FLA_Obj alpha, FLA_Obj A, FLA_Obj x,
                               FLA_Obj beta, FLA_Obj y )
{
  FLA_Obj x_copy;

  if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING )
  {
    FLA_Trmvsx_check( uplo, transa, diag, alpha, A, x, beta, y );
  }

  /* Nothing to do for an empty A. */
  if ( FLA_Obj_has_zero_dim( A ) )
  {
    return FLA_SUCCESS;
  }

  /* x_copy := op(A) * x, computed in scratch storage shaped like x. */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, x, &x_copy );
  FLA_Copy_external( x, x_copy );
  FLA_Trmv_external( uplo, transa, diag, A, x_copy );

  /* y := beta * y + alpha * x_copy */
  FLA_Scal_external( beta, y );
  FLA_Axpy_external( alpha, x_copy, y );

  FLA_Obj_free( &x_copy );

  return FLA_SUCCESS;
}
/*
 * FLA_Eig_gest_il_unb_var5
 *
 * Unblocked sweep down the diagonals of A and B, exposing one diagonal
 * element (alpha11 / beta11) and the column below it (a21 / b21) per
 * iteration and updating alpha11, a21 and the lower triangle of A22 in
 * place. B is only read.
 *
 * Y serves purely as scratch: only its top-left 1x1 element (psi11) is
 * written, holding -1/2 * alpha11 for the current iteration.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Eig_gest_il_unb_var5( FLA_Obj A, FLA_Obj Y, FLA_Obj B )
{
  FLA_Obj ATL, ATR,   A00,  a01,     A02,
          ABL, ABR,   a10t, alpha11, a12t,
                      A20,  a21,     A22;

  FLA_Obj BTL, BTR,   B00,  b01,     B02,
          BBL, BBR,   b10t, beta11,  b12t,
                      B20,  b21,     B22;

  /* Only psi11 is used; the other three quadrants of Y are never touched. */
  FLA_Obj psi11, y12t,
          y21,   Y22;

  FLA_Part_2x2( A, &ATL, &ATR,
                   &ABL, &ABR, 0, 0, FLA_TL );

  FLA_Part_2x2( B, &BTL, &BTR,
                   &BBL, &BBR, 0, 0, FLA_TL );

  FLA_Part_2x2( Y, &psi11, &y12t,
                   &y21,   &Y22, 1, 1, FLA_TL );

  while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) )
  {
    FLA_Repart_2x2_to_3x3( ATL, ATR,   &A00,  &a01,     &A02,
                                       &a10t, &alpha11, &a12t,
                           ABL, ABR,   &A20,  &a21,     &A22,
                           1, 1, FLA_BR );

    FLA_Repart_2x2_to_3x3( BTL, BTR,   &B00,  &b01,    &B02,
                                       &b10t, &beta11, &b12t,
                           BBL, BBR,   &B20,  &b21,    &B22,
                           1, 1, FLA_BR );

    /* alpha11 := alpha11 / beta11 / beta11 */
    FLA_Inv_scal_external( beta11, alpha11 );
    FLA_Inv_scal_external( beta11, alpha11 );

    /* psi11 := -1/2 * alpha11 */
    FLA_Copy_external( alpha11, psi11 );
    FLA_Scal_external( FLA_MINUS_ONE_HALF, psi11 );

    /* a21 := a21 / beta11 */
    FLA_Inv_scal_external( beta11, a21 );

    /* a21 := a21 + psi11 * b21   (first half of the correction) */
    FLA_Axpy_external( psi11, b21, a21 );

    /* A22 := A22 - a21 * b21' - b21 * a21'   (lower triangle) */
    FLA_Her2c_external( FLA_LOWER_TRIANGULAR, FLA_NO_CONJUGATE,
                        FLA_MINUS_ONE, a21, b21, A22 );

    /* a21 := a21 + psi11 * b21   (second half of the correction) */
    FLA_Axpy_external( psi11, b21, a21 );

    /* a21 := inv( tril( B22 ) ) * a21 */
    FLA_Trsv_external( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE,
                       FLA_NONUNIT_DIAG, B22, a21 );

    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR,   A00,  a01,     A02,
                                            a10t, alpha11, a12t,
                              &ABL, &ABR,   A20,  a21,     A22,
                              FLA_TL );

    FLA_Cont_with_3x3_to_2x2( &BTL, &BTR,   B00,  b01,    B02,
                                            b10t, beta11, b12t,
                              &BBL, &BBR,   B20,  b21,    B22,
                              FLA_TL );
  }

  return FLA_SUCCESS;
}
/*
 * FLA_Eig_gest_nl_unb_var4
 *
 * Unblocked sweep down the diagonals of A and B. Each iteration exposes the
 * current diagonal element (alpha11 / beta11), the row to its left
 * (a10t / b10t) and the column below it (a21), and updates A00 (lower
 * triangle), a10t, alpha11, A20 and a21 in place. B is only read.
 *
 * Y serves purely as scratch: only its top-left 1x1 element (psi11) is
 * written, holding 1/2 * alpha11 for the current iteration.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Eig_gest_nl_unb_var4( FLA_Obj A, FLA_Obj Y, FLA_Obj B )
{
  FLA_Obj ATL, ATR,   A00,  a01,     A02,
          ABL, ABR,   a10t, alpha11, a12t,
                      A20,  a21,     A22;

  FLA_Obj BTL, BTR,   B00,  b01,     B02,
          BBL, BBR,   b10t, beta11,  b12t,
                      B20,  b21,     B22;

  /* Only psi11 is used; the other three quadrants of Y are never touched. */
  FLA_Obj psi11, y12t,
          y21,   Y22;

  FLA_Part_2x2( A, &ATL, &ATR,
                   &ABL, &ABR, 0, 0, FLA_TL );

  FLA_Part_2x2( B, &BTL, &BTR,
                   &BBL, &BBR, 0, 0, FLA_TL );

  FLA_Part_2x2( Y, &psi11, &y12t,
                   &y21,   &Y22, 1, 1, FLA_TL );

  while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) )
  {
    FLA_Repart_2x2_to_3x3( ATL, ATR,   &A00,  &a01,     &A02,
                                       &a10t, &alpha11, &a12t,
                           ABL, ABR,   &A20,  &a21,     &A22,
                           1, 1, FLA_BR );

    FLA_Repart_2x2_to_3x3( BTL, BTR,   &B00,  &b01,    &B02,
                                       &b10t, &beta11, &b12t,
                           BBL, BBR,   &B20,  &b21,    &B22,
                           1, 1, FLA_BR );

    /* psi11 := 1/2 * alpha11 */
    FLA_Copy_external( alpha11, psi11 );
    FLA_Scal_external( FLA_ONE_HALF, psi11 );

    /* a10t := a10t + psi11 * b10t   (first half of the correction) */
    FLA_Axpy_external( psi11, b10t, a10t );

    /* A00 := A00 + a10t' * b10t + b10t' * a10t   (lower triangle) */
    FLA_Her2c_external( FLA_LOWER_TRIANGULAR, FLA_CONJUGATE,
                        FLA_ONE, a10t, b10t, A00 );

    /* a10t := a10t + psi11 * b10t   (second half of the correction) */
    FLA_Axpy_external( psi11, b10t, a10t );

    /* a10t := beta11 * a10t */
    FLA_Scal_external( beta11, a10t );

    /* alpha11 := beta11 * alpha11 * beta11 */
    FLA_Scal_external( beta11, alpha11 );
    FLA_Scal_external( beta11, alpha11 );

    /* A20 := A20 + a21 * b10t */
    FLA_Ger_external( FLA_ONE, a21, b10t, A20 );

    /* a21 := a21 * beta11 */
    FLA_Scal_external( beta11, a21 );

    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR,   A00,  a01,     A02,
                                            a10t, alpha11, a12t,
                              &ABL, &ABR,   A20,  a21,     A22,
                              FLA_TL );

    FLA_Cont_with_3x3_to_2x2( &BTL, &BTR,   B00,  b01,    B02,
                                            b10t, beta11, b12t,
                              &BBL, &BBR,   B20,  b21,    B22,
                              FLA_TL );
  }

  return FLA_SUCCESS;
}
FLA_Error FLA_Gemm_nn_omp_var15( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj C, fla_gemm_t* cntl ) { FLA_Obj AT, A0, AB, A1, A2; FLA_Obj CT, C0, CB, C1, C2; FLA_Obj AL, AR, A10, A11, A12; FLA_Obj BT, B0, BB, B1, B2; FLA_Obj C1_local; int i, j, lock_ldim, lock_i; int b_m, b_k; FLA_Part_2x1( A, &AT, &AB, 0, FLA_TOP ); FLA_Part_2x1( C, &CT, &CB, 0, FLA_TOP ); #pragma intel omp parallel taskq { while ( FLA_Obj_length( AT ) < FLA_Obj_length( A ) ) { b_m = FLA_Determine_blocksize( A, AT, FLA_TOP, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_2x1_to_3x1( AT, &A0, /* ** */ /* ** */ &A1, AB, &A2, b_m, FLA_BOTTOM ); FLA_Repart_2x1_to_3x1( CT, &C0, /* ** */ /* ** */ &C1, CB, &C2, b_m, FLA_BOTTOM ); /*------------------------------------------------------------*/ /* C1 = alpha * A1 * B + C1; */ FLA_Part_1x2( A1, &AL, &AR, 0, FLA_LEFT ); FLA_Part_2x1( B, &BT, &BB, 0, FLA_TOP ); while ( FLA_Obj_width( AL ) < FLA_Obj_width( A ) ) { b_k = FLA_Determine_blocksize( A, AL, FLA_LEFT, FLA_Cntl_blocksize( cntl ) ); // Get the index of the current partition. // FIX THIS: need + b_m - 1 or something like this //j = FLA_Obj_length( CT ) / b_m; //i = FLA_Obj_width( AL ) / b_k; //lock_ldim = FLA_get_num_threads_in_m_dim(omp_get_num_threads()); lock_i = FLA_Obj_length( CT ) / b_m; FLA_Repart_1x2_to_1x3( AL, /**/ AR, &A10, /**/ &A11, &A12, b_k, FLA_RIGHT ); FLA_Repart_2x1_to_3x1( BT, &B0, /* ** */ /* ** */ &B1, BB, &B2, b_k, FLA_BOTTOM ); /*------------------------------------------------------------*/ /* C1 = alpha * A11 * B1 + C1; */ //// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, //// alpha, A11, B1, FLA_ONE, C1 ); #pragma intel omp task captureprivate( lock_i, A11, B1, C1 ), private( C1_local ) { FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C1, &C1_local ); FLA_Obj_set_to_zero( C1_local ); /* C1_local = alpha * A1 * B11 + C1_local; */ FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, alpha, A11, B1, FLA_ONE, C1_local ); // Acquire lock[i] (the lock for C1). 
omp_set_lock( &fla_omp_lock[lock_i] ); /* C1 += C1_local */ FLA_Axpy_external( FLA_ONE, C1_local, C1 ); //FLA_Axpy_sync_pipeline2( j*lock_ldim, FLA_ONE, C1_local, C1 ); //FLA_Axpy_sync_circular2( j*lock_ldim, i, FLA_ONE, C1_local, C1 ); //REF_Axpy_sync_circular2( j*lock_ldim, i, FLA_ONE, C1_local, C1 ); // Release lock[i] (the lock for C1). omp_unset_lock( &fla_omp_lock[lock_i] ); FLA_Obj_free( &C1_local ); } /*------------------------------------------------------------*/ FLA_Cont_with_1x3_to_1x2( &AL, /**/ &AR, A10, A11, /**/ A12, FLA_LEFT ); FLA_Cont_with_3x1_to_2x1( &BT, B0, B1, /* ** */ /* ** */ &BB, B2, FLA_TOP ); } /*------------------------------------------------------------*/ FLA_Cont_with_3x1_to_2x1( &AT, A0, A1, /* ** */ /* ** */ &AB, A2, FLA_TOP ); FLA_Cont_with_3x1_to_2x1( &CT, C0, C1, /* ** */ /* ** */ &CB, C2, FLA_TOP ); } } return FLA_SUCCESS; }
/*
 * FLA_Eig_gest_iu_unb_var2
 *
 * Unblocked sweep down the diagonals of A and B. Each iteration exposes the
 * current diagonal element (alpha11 / beta11), the column above it
 * (a01 / b01) and the row to its right (a12t), and updates a01, alpha11 and
 * a12t in place. A00 and A02 are only read, as is B.
 *
 * Y provides scratch space: it is traversed top to bottom alongside A, and
 * the first column of its already-traversed part (y01_l) holds
 * 1/2 * A00 * b01 for the current iteration.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Eig_gest_iu_unb_var2( FLA_Obj A, FLA_Obj Y, FLA_Obj B )
{
  FLA_Obj ATL, ATR,   A00,  a01,     A02,
          ABL, ABR,   a10t, alpha11, a12t,
                      A20,  a21,     A22;

  FLA_Obj BTL, BTR,   B00,  b01,     B02,
          BBL, BBR,   b10t, beta11,  b12t,
                      B20,  b21,     B22;

  FLA_Obj yT,         y01,
          yB,         psi11,
                      y21;

  FLA_Obj y01_l, y01_r;

  FLA_Part_2x2( A, &ATL, &ATR,
                   &ABL, &ABR, 0, 0, FLA_TL );

  FLA_Part_2x2( B, &BTL, &BTR,
                   &BBL, &BBR, 0, 0, FLA_TL );

  FLA_Part_2x1( Y, &yT,
                   &yB, 0, FLA_TOP );

  while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) )
  {
    FLA_Repart_2x2_to_3x3( ATL, ATR,   &A00,  &a01,     &A02,
                                       &a10t, &alpha11, &a12t,
                           ABL, ABR,   &A20,  &a21,     &A22,
                           1, 1, FLA_BR );

    FLA_Repart_2x2_to_3x3( BTL, BTR,   &B00,  &b01,    &B02,
                                       &b10t, &beta11, &b12t,
                           BBL, BBR,   &B20,  &b21,    &B22,
                           1, 1, FLA_BR );

    FLA_Repart_2x1_to_3x1( yT,   &y01,
                                 &psi11,
                           yB,   &y21, 1, FLA_BOTTOM );

    /* Use just the left-most column of y01 as the work vector. */
    FLA_Part_1x2( y01, &y01_l, &y01_r, 1, FLA_LEFT );

    /* y01_l := 1/2 * A00 * b01   (A00 referenced through its upper triangle) */
    FLA_Hemvc_external( FLA_UPPER_TRIANGULAR, FLA_NO_CONJUGATE,
                        FLA_ONE_HALF, A00, b01, FLA_ZERO, y01_l );

    /* a01 := a01 - y01_l   (first half of the correction) */
    FLA_Axpy_external( FLA_MINUS_ONE, y01_l, a01 );

    /* alpha11 := alpha11 - a01' * b01 - b01' * a01 */
    FLA_Dot2cs_external( FLA_CONJUGATE, FLA_MINUS_ONE, a01, b01,
                         FLA_ONE, alpha11 );

    /* alpha11 := alpha11 / beta11 / beta11 */
    FLA_Inv_scal_external( beta11, alpha11 );
    FLA_Inv_scal_external( beta11, alpha11 );

    /* a12t := a12t - b01' * A02, expressed as a transposed matrix-vector
       product with conjugated b01. */
    FLA_Gemvc_external( FLA_TRANSPOSE, FLA_CONJUGATE,
                        FLA_MINUS_ONE, A02, b01, FLA_ONE, a12t );

    /* a12t := a12t / beta11 */
    FLA_Inv_scal_external( beta11, a12t );

    /* a01 := a01 - y01_l   (second half of the correction) */
    FLA_Axpy_external( FLA_MINUS_ONE, y01_l, a01 );

    /* a01 := a01 / beta11 */
    FLA_Inv_scal_external( beta11, a01 );

    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR,   A00,  a01,     A02,
                                            a10t, alpha11, a12t,
                              &ABL, &ABR,   A20,  a21,     A22,
                              FLA_TL );

    FLA_Cont_with_3x3_to_2x2( &BTL, &BTR,   B00,  b01,    B02,
                                            b10t, beta11, b12t,
                              &BBL, &BBR,   B20,  b21,    B22,
                              FLA_TL );

    FLA_Cont_with_3x1_to_2x1( &yT,   y01,
                                     psi11,
                              &yB,   y21, FLA_TOP );
  }

  return FLA_SUCCESS;
}
/*
 * time_Trinv_un
 *
 * Timing/accuracy driver for the upper-triangular, non-unit-diagonal
 * triangular-inversion routines.
 *
 *   variant   0 selects REF_Trinv_un; 1..4 select FLA_Trinv_un_*_var1..4.
 *             Any other value runs nothing inside the timed region.
 *   type      FLA_ALG_UNBLOCKED, FLA_ALG_UNB_OPT or FLA_ALG_BLOCKED; any
 *             other value prints "trouble".
 *   nrepeats  number of timed repetitions; the minimum time is reported.
 *   m         not referenced by this function.
 *   nb_alg    block size used to build the control tree for blocked runs.
 *   A         matrix to invert; restored to its original contents on exit.
 *   b, b_orig vectors used for the accuracy check; restored on exit.
 *   norm      scalar object receiving the 2-norm of the check vector.
 *   dtime     out: best (minimum) time over all repetitions.
 *   diff      out: contents of norm copied to a plain buffer.
 *   gflops    out: (1/3) n^3 / time / 1e9, multiplied by 4 for complex A.
 */
void time_Trinv_un( int variant, int type, int nrepeats, int m, int nb_alg,
                    FLA_Obj A, FLA_Obj b, FLA_Obj b_orig, FLA_Obj norm,
                    double *dtime, double *diff, double *gflops )
{
  int     rep;
  double  elapsed;
  double  best_time = 1.0e9;

  FLA_Obj A_save, b_save, b_orig_save;

  fla_blocksize_t* bp;
  fla_trinv_t*     cntl_trinv_var;
  fla_trinv_t*     cntl_trinv_unb;
  fla_gemm_t*      cntl_gemm_blas;
  fla_trmm_t*      cntl_trmm_blas;
  fla_trsm_t*      cntl_trsm_blas;

  /* Build the control tree handed to the blocked variants. */
  bp = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );

  cntl_trinv_unb = FLA_Cntl_trinv_obj_create( FLA_FLAT,
                                              FLA_UNB_OPT_VARIANT3,
                                              NULL, NULL, NULL,
                                              NULL, NULL, NULL );
  cntl_trmm_blas = FLA_Cntl_trmm_obj_create( FLA_FLAT, FLA_SUBPROBLEM,
                                             NULL, NULL, NULL );
  cntl_trsm_blas = FLA_Cntl_trsm_obj_create( FLA_FLAT, FLA_SUBPROBLEM,
                                             NULL, NULL, NULL );
  cntl_gemm_blas = FLA_Cntl_gemm_obj_create( FLA_FLAT, FLA_SUBPROBLEM,
                                             NULL, NULL );
  cntl_trinv_var = FLA_Cntl_trinv_obj_create( FLA_FLAT,
                                              variant,
                                              bp,
                                              cntl_trinv_unb,
                                              cntl_trmm_blas,
                                              cntl_trsm_blas,
                                              cntl_trsm_blas,
                                              cntl_gemm_blas );

  /* Snapshot the inputs so they can be restored later. */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A,      &A_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, b,      &b_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, b_orig, &b_orig_save );

  FLA_Copy_external( A,      A_save );
  FLA_Copy_external( b,      b_save );
  FLA_Copy_external( b_orig, b_orig_save );

  for ( rep = 0; rep < nrepeats; rep++ )
  {
    /* Every repetition starts from the original A. */
    FLA_Copy_external( A_save, A );

    elapsed = FLA_Clock();

    switch ( variant )
    {
      case 0:
        /* Reference implementation. */
        REF_Trinv_un( A );
        break;

      case 1:
        switch ( type )
        {
          case FLA_ALG_UNBLOCKED:
            FLA_Trinv_un_unb_var1( A );
            break;
          case FLA_ALG_UNB_OPT:
            FLA_Trinv_un_opt_var1( A );
            break;
          case FLA_ALG_BLOCKED:
            FLA_Trinv_un_blk_var1( A, cntl_trinv_var );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 2:
        switch ( type )
        {
          case FLA_ALG_UNBLOCKED:
            FLA_Trinv_un_unb_var2( A );
            break;
          case FLA_ALG_UNB_OPT:
            FLA_Trinv_un_opt_var2( A );
            break;
          case FLA_ALG_BLOCKED:
            FLA_Trinv_un_blk_var2( A, cntl_trinv_var );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 3:
        switch ( type )
        {
          case FLA_ALG_UNBLOCKED:
            FLA_Trinv_un_unb_var3( A );
            break;
          case FLA_ALG_UNB_OPT:
            FLA_Trinv_un_opt_var3( A );
            break;
          case FLA_ALG_BLOCKED:
            FLA_Trinv_un_blk_var3( A, cntl_trinv_var );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 4:
        switch ( type )
        {
          case FLA_ALG_UNBLOCKED:
            FLA_Trinv_un_unb_var4( A );
            break;
          case FLA_ALG_UNB_OPT:
            FLA_Trinv_un_opt_var4( A );
            break;
          case FLA_ALG_BLOCKED:
            FLA_Trinv_un_blk_var4( A, cntl_trinv_var );
            break;
          default:
            printf("trouble\n");
        }
        break;
    }

    elapsed   = FLA_Clock() - elapsed;
    best_time = min( elapsed, best_time );
  }

  FLA_Cntl_obj_free( cntl_trinv_var );
  FLA_Cntl_obj_free( cntl_trinv_unb );
  FLA_Cntl_obj_free( cntl_gemm_blas );
  FLA_Cntl_obj_free( cntl_trmm_blas );
  FLA_Cntl_obj_free( cntl_trsm_blas );
  FLA_Blocksize_free( bp );

  /* Accuracy check: b := A_save * ( A * b ) - b_orig, then take its 2-norm.
     A holds the result of the last timed repetition at this point. */
  FLA_Trmv_external( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE,
                     FLA_NONUNIT_DIAG, A, b );
  FLA_Trmv_external( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE,
                     FLA_NONUNIT_DIAG, A_save, b );
  FLA_Axpy_external( FLA_MINUS_ONE, b_orig, b );
  FLA_Nrm2_external( b, norm );
  FLA_Copy_object_to_buffer( FLA_NO_TRANSPOSE, 0, 0, norm, 1, 1, diff, 1, 1 );

  *gflops = 1.0 / 3.0 *
            FLA_Obj_length( A ) *
            FLA_Obj_length( A ) *
            FLA_Obj_length( A ) /
            best_time / 1e9;

  if ( FLA_Obj_is_complex( A ) )
  {
    *gflops *= 4.0;
  }

  *dtime = best_time;

  /* Hand the caller back its original operands and drop the snapshots. */
  FLA_Copy_external( A_save,      A );
  FLA_Copy_external( b_save,      b );
  FLA_Copy_external( b_orig_save, b_orig );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &b_save );
  FLA_Obj_free( &b_orig_save );
}
/*
 * libfla_test_eig_gest_experiment
 *
 * Runs one timed Eig_gest experiment and reports performance and residual.
 *
 *   params     supplies b_flash (hierarchical block size) and b_alg_flat
 *              (block size for the flat variant control tree).
 *   var        variant number under test.
 *   sc_str     storage characters for A, Y and B (indices 0, 1, 2).
 *   datatype   datatype of the test matrices.
 *   p_cur      current problem size; all matrices are p_cur x p_cur.
 *   pci        index into the file-level pc_str table; character 0 selects
 *              inverse / no-inverse, character 1 selects uplo.
 *   n_repeats  number of timed repetitions; the minimum time is kept.
 *   impl       which implementation family to exercise.
 *   perf       out: m^3 / time_min / FLOPS_PER_UNIT_PERF (x4 if complex).
 *   residual   out: 1-norm of (recovered A - original A).
 *
 * For the no-inverse case with var == 3 on any of the flat variant
 * implementations, the experiment is skipped and both outputs are set to 0.
 */
void libfla_test_eig_gest_experiment( test_params_t params,
                                      unsigned int  var,
                                      char*         sc_str,
                                      FLA_Datatype  datatype,
                                      unsigned int  p_cur,
                                      unsigned int  pci,
                                      unsigned int  n_repeats,
                                      signed int    impl,
                                      double*       perf,
                                      double*       residual )
{
  dim_t        b_flash    = params.b_flash;
  dim_t        b_alg_flat = params.b_alg_flat;
  double       time_min   = 1e9;
  double       elapsed;
  unsigned int rep;
  unsigned int m;
  signed int   m_input    = -1;
  FLA_Uplo     inv;
  FLA_Uplo     uplo;
  FLA_Trans    trans_left, trans_right;
  FLA_Obj      A, B, Y, norm;
  FLA_Obj      A_save, B_save;
  FLA_Obj      A_test, B_test, Y_test;

  /* Classify the implementation once; impl never changes below. */
  int          is_hier         = ( impl == FLA_TEST_HIER_FRONT_END );
  int          is_flat_variant = ( impl == FLA_TEST_FLAT_UNB_VAR ||
                                   impl == FLA_TEST_FLAT_OPT_VAR ||
                                   impl == FLA_TEST_FLAT_BLK_VAR );

  /* Determine the dimensions. */
  m = ( m_input < 0 ) ? p_cur / abs( m_input ) : p_cur;

  /* Translate parameter characters to libflame constants. */
  FLA_Param_map_char_to_flame_inv( &pc_str[pci][0], &inv );
  FLA_Param_map_char_to_flame_uplo( &pc_str[pci][1], &uplo );

  /* This combination is not run; report zeros. */
  if ( inv == FLA_NO_INVERSE && is_flat_variant && var == 3 )
  {
    *perf     = 0.0;
    *residual = 0.0;
    return;
  }

  /* Create the matrices for the current operation. */
  libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, m, &A );
  libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, m, &Y );
  libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], m, m, &B );

  /* A: random SPD, triangle scaled by two, then made Hermitian. */
  FLA_Random_spd_matrix( uplo, A );
  FLA_Scalr( uplo, FLA_TWO, A );
  FLA_Hermitianize( uplo, A );

  /* B: random SPD, triangle scaled by two, then Cholesky-factored. */
  FLA_Random_spd_matrix( uplo, B );
  FLA_Scalr( uplo, FLA_TWO, B );
  FLA_Chol( uplo, B );

  /* Keep pristine copies to reset the operands before each repetition. */
  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_save );
  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, B, &B_save );

  /* Real scalar that will receive the norm. */
  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

  /* The FLASH front-end works on hierarchical copies; everything else
     operates directly on the flat objects. */
  if ( is_hier )
  {
    FLASH_Obj_create_hier_copy_of_flat( A, 1, &b_flash, &A_test );
    FLASH_Obj_create_hier_copy_of_flat( Y, 1, &b_flash, &Y_test );
    FLASH_Obj_create_hier_copy_of_flat( B, 1, &b_flash, &B_test );
  }
  else
  {
    A_test = A;
    Y_test = Y;
    B_test = B;
  }

  /* The individual variants need a control tree. */
  if ( is_flat_variant )
  {
    libfla_test_eig_gest_cntl_create( var, b_alg_flat );
  }

  /* Timed repetitions; keep the fastest. */
  for ( rep = 0; rep < n_repeats; ++rep )
  {
    if ( is_hier )
    {
      FLASH_Obj_hierarchify( A_save, A_test );
      FLASH_Obj_hierarchify( B_save, B_test );
    }
    else
    {
      FLA_Copy_external( A_save, A_test );
      FLA_Copy_external( B_save, B_test );
    }

    elapsed = FLA_Clock();

    libfla_test_eig_gest_impl( impl, inv, uplo, A_test, Y_test, B_test );

    elapsed  = FLA_Clock() - elapsed;
    time_min = min( time_min, elapsed );
  }

  /* Check the solution by undoing the operation: pick which side gets the
     conjugate-transposed factor, then apply the opposite of what was timed
     (solves for no-inverse, multiplies for inverse). */
  if ( ( inv == FLA_NO_INVERSE && uplo == FLA_LOWER_TRIANGULAR ) ||
       ( inv == FLA_INVERSE    && uplo == FLA_UPPER_TRIANGULAR ) )
  {
    trans_left  = FLA_CONJ_TRANSPOSE;
    trans_right = FLA_NO_TRANSPOSE;
  }
  else
  {
    trans_left  = FLA_NO_TRANSPOSE;
    trans_right = FLA_CONJ_TRANSPOSE;
  }

  if ( is_hier )
  {
    FLASH_Hermitianize( uplo, A_test );

    if ( inv == FLA_NO_INVERSE )
    {
      FLASH_Trsm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG,
                  FLA_ONE, B_test, A_test );
      FLASH_Trsm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG,
                  FLA_ONE, B_test, A_test );
    }
    else
    {
      FLASH_Trmm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG,
                  FLA_ONE, B_test, A_test );
      FLASH_Trmm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG,
                  FLA_ONE, B_test, A_test );
    }

    /* Bring the hierarchical result back into the flat A. */
    FLASH_Obj_flatten( A_test, A );
  }
  else
  {
    FLA_Hermitianize( uplo, A_test );

    if ( inv == FLA_NO_INVERSE )
    {
      FLA_Trsm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG,
                FLA_ONE, B_test, A_test );
      FLA_Trsm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG,
                FLA_ONE, B_test, A_test );
    }
    else
    {
      FLA_Trmm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG,
                FLA_ONE, B_test, A_test );
      FLA_Trmm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG,
                FLA_ONE, B_test, A_test );
    }
  }

  /* Release the hierarchical copies, if any were made. */
  if ( is_hier )
  {
    FLASH_Obj_free( &A_test );
    FLASH_Obj_free( &Y_test );
    FLASH_Obj_free( &B_test );
  }

  /* Release the control tree, if one was built. */
  if ( is_flat_variant )
  {
    libfla_test_eig_gest_cntl_free();
  }

  /* Performance of the fastest repetition. */
  *perf = 1.0 * m * m * m / time_min / FLOPS_PER_UNIT_PERF;
  if ( FLA_Obj_is_complex( A ) )
  {
    *perf *= 4.0;
  }

  /* Residual: || A - A_save ||_1 */
  FLA_Axpy_external( FLA_MINUS_ONE, A_save, A );
  FLA_Norm1( A, norm );
  FLA_Obj_extract_real_scalar( norm, residual );

  /* Free the supporting flat objects. */
  FLA_Obj_free( &norm );
  FLA_Obj_free( &A_save );
  FLA_Obj_free( &B_save );

  /* Free the flat test matrices. */
  FLA_Obj_free( &A );
  FLA_Obj_free( &Y );
  FLA_Obj_free( &B );
}
/*
 * FLASH_Axpy_hierarchy
 *
 * Recursively walks the hierarchical matrix *H together with the flat
 * matrix F, pairing every leaf-level submatrix of *H with the region of F
 * it corresponds to, and performs an axpy between the two.
 *
 *   direction  FLA_FLAT_TO_HIER: leaf := alpha * F_region + leaf
 *              FLA_HIER_TO_FLAT: F_region := alpha * leaf + F_region
 *              Any other value leaves the leaf pair untouched.
 *   alpha      scalar multiplier.
 *   F          flat matrix (or the region of it matched to *H).
 *   H          pointer to the hierarchical matrix (or sub-block).
 *
 * When FLA_ENABLE_SCC is defined, the leaf axpy is executed only if
 * FLA_is_owner() is true.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLASH_Axpy_hierarchy( int direction, FLA_Obj alpha, FLA_Obj F, FLA_Obj* H )
{
  FLA_Obj HL,  HR,       H0,  H1,  H2;
  FLA_Obj FL,  FR,       F0,  F1,  F2;

  FLA_Obj H1T,           H01,
          H1B,           H11,
                         H21;
  FLA_Obj F1T,           F01,
          F1B,           F11,
                         F21;

  dim_t   b_m;
  dim_t   b_n;

  /* Base case: the elements of *H are scalars, so this is a leaf block. */
  if ( FLA_Obj_elemtype( *H ) == FLA_SCALAR )
  {
    if ( direction == FLA_FLAT_TO_HIER )
    {
#ifdef FLA_ENABLE_SCC
      if ( FLA_is_owner() )
#endif
      FLA_Axpy_external( alpha, F, *H );
    }
    else if ( direction == FLA_HIER_TO_FLAT )
    {
#ifdef FLA_ENABLE_SCC
      if ( FLA_is_owner() )
#endif
      FLA_Axpy_external( alpha, *H, F );
    }

    return FLA_SUCCESS;
  }

  /* Recursive case: visit the blocks of *H column by column, and within
     each column top to bottom, carving the matching region out of F. */
  FLA_Part_1x2( *H,   &HL,  &HR,      0, FLA_LEFT );
  FLA_Part_1x2(  F,   &FL,  &FR,      0, FLA_LEFT );

  while ( FLA_Obj_width( HL ) < FLA_Obj_width( *H ) )
  {
    FLA_Repart_1x2_to_1x3( HL,  /**/ HR,        &H0, /**/ &H1, &H2,
                           1, FLA_RIGHT );

    /* F1 must be as wide (in scalars) as the block column H1. */
    b_n = FLASH_Obj_scalar_width( H1 );

    FLA_Repart_1x2_to_1x3( FL,  /**/ FR,        &F0, /**/ &F1, &F2,
                           b_n, FLA_RIGHT );

    FLA_Part_2x1( H1,   &H1T,
                        &H1B,       0, FLA_TOP );
    FLA_Part_2x1( F1,   &F1T,
                        &F1B,       0, FLA_TOP );

    while ( FLA_Obj_length( H1T ) < FLA_Obj_length( H1 ) )
    {
      FLA_Repart_2x1_to_3x1( H1T,               &H01,
                          /* ** */            /* *** */
                                                &H11,
                             H1B,               &H21,        1, FLA_BOTTOM );

      /* F11 must be as tall (in scalars) as the block H11. */
      b_m = FLASH_Obj_scalar_length( H11 );

      FLA_Repart_2x1_to_3x1( F1T,               &F01,
                          /* ** */            /* *** */
                                                &F11,
                             F1B,               &F21,        b_m, FLA_BOTTOM );

      /* Descend into the child object stored at H11. */
      FLASH_Axpy_hierarchy( direction, alpha, F11,
                            FLASH_OBJ_PTR_AT( H11 ) );

      FLA_Cont_with_3x1_to_2x1( &H1T,               H01,
                                                    H11,
                              /* ** */           /* *** */
                                &H1B,               H21,     FLA_TOP );
      FLA_Cont_with_3x1_to_2x1( &F1T,               F01,
                                                    F11,
                              /* ** */           /* *** */
                                &F1B,               F21,     FLA_TOP );
    }

    FLA_Cont_with_1x3_to_1x2( &HL,  /**/ &HR,        H0, H1, /**/ H2,
                              FLA_LEFT );
    FLA_Cont_with_1x3_to_1x2( &FL,  /**/ &FR,        F0, F1, /**/ F2,
                              FLA_LEFT );
  }

  return FLA_SUCCESS;
}