/*
 * Object-level wrapper around the blocked, variant-6 FLA_Apply_G_rf kernels.
 *
 * The dimensions, strides and data pointers of G and A are extracted from
 * the objects and passed, together with the blocksize b_alg, to the kernel
 * selected by the datatype of A:
 *
 *   FLA_FLOAT          -> FLA_Apply_G_rf_bls_var6
 *   FLA_DOUBLE         -> FLA_Apply_G_rf_bld_var6
 *   FLA_COMPLEX        -> FLA_Apply_G_rf_blc_var6
 *   FLA_DOUBLE_COMPLEX -> FLA_Apply_G_rf_blz_var6
 *
 * The buffer of G is always accessed as complex data (scomplex for the
 * single-precision cases, dcomplex for the double-precision cases), even
 * when A itself is real.  If A has any other datatype, no kernel is called.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Apply_G_rf_blk_var6( FLA_Obj G, FLA_Obj A, dim_t b_alg )
{
  const FLA_Datatype dt_A   = FLA_Obj_datatype( A );

  // Only the width of G is needed; both dimensions of A are forwarded.
  const int          G_cols = FLA_Obj_width( G );
  const int          A_rows = FLA_Obj_length( A );
  const int          A_cols = FLA_Obj_width( A );

  const int          G_rs   = FLA_Obj_row_stride( G );
  const int          G_cs   = FLA_Obj_col_stride( G );
  const int          A_rs   = FLA_Obj_row_stride( A );
  const int          A_cs   = FLA_Obj_col_stride( A );

  if ( dt_A == FLA_FLOAT )
  {
    FLA_Apply_G_rf_bls_var6( G_cols, A_rows, A_cols,
                             ( scomplex* ) FLA_COMPLEX_PTR( G ), G_rs, G_cs,
                             ( float* ) FLA_FLOAT_PTR( A ), A_rs, A_cs,
                             b_alg );
  }
  else if ( dt_A == FLA_DOUBLE )
  {
    FLA_Apply_G_rf_bld_var6( G_cols, A_rows, A_cols,
                             ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( G ), G_rs, G_cs,
                             ( double* ) FLA_DOUBLE_PTR( A ), A_rs, A_cs,
                             b_alg );
  }
  else if ( dt_A == FLA_COMPLEX )
  {
    FLA_Apply_G_rf_blc_var6( G_cols, A_rows, A_cols,
                             ( scomplex* ) FLA_COMPLEX_PTR( G ), G_rs, G_cs,
                             ( scomplex* ) FLA_COMPLEX_PTR( A ), A_rs, A_cs,
                             b_alg );
  }
  else if ( dt_A == FLA_DOUBLE_COMPLEX )
  {
    FLA_Apply_G_rf_blz_var6( G_cols, A_rows, A_cols,
                             ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( G ), G_rs, G_cs,
                             ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( A ), A_rs, A_cs,
                             b_alg );
  }

  return FLA_SUCCESS;
}
/*
 * Apply k sets of Givens rotations to a matrix A from the right, where
 * each set takes the form:
 *
 *   A := A ( G(n-1,k) ... G(1,k) G(0,k) )'
 *      = A   G(0,k)' G(1,k)' ... G(n-1,k)'
 *
 * where Gik is the ith Givens rotation formed from the kth set, stored in
 * the (i,k) entry of G:
 *
 *   Gik = / gamma_ik  -sigma_ik \
 *         \ sigma_ik   gamma_ik /
 *
 * This variant iterates in pipelined, overlapping fashion and applies
 * rotations to two columns at a time.
 *
 * This function is only the object-level front end: it unpacks dimensions,
 * strides and buffers and calls FLA_Apply_G_rf_as{s,d,c,z}_var2 according
 * to the datatype of A.  G's buffer is always accessed as complex data of
 * the matching precision.  An unrecognized datatype results in no work.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Apply_G_rf_asm_var2( FLA_Obj G, FLA_Obj A )
{
  const int G_cols = FLA_Obj_width( G );
  const int A_rows = FLA_Obj_length( A );
  const int A_cols = FLA_Obj_width( A );

  const int G_rs   = FLA_Obj_row_stride( G );
  const int G_cs   = FLA_Obj_col_stride( G );
  const int A_rs   = FLA_Obj_row_stride( A );
  const int A_cs   = FLA_Obj_col_stride( A );

  switch ( FLA_Obj_datatype( A ) )
  {
    case FLA_FLOAT:
      FLA_Apply_G_rf_ass_var2( G_cols, A_rows, A_cols,
                               ( scomplex* ) FLA_COMPLEX_PTR( G ), G_rs, G_cs,
                               ( float* ) FLA_FLOAT_PTR( A ), A_rs, A_cs );
      break;

    case FLA_DOUBLE:
      FLA_Apply_G_rf_asd_var2( G_cols, A_rows, A_cols,
                               ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( G ), G_rs, G_cs,
                               ( double* ) FLA_DOUBLE_PTR( A ), A_rs, A_cs );
      break;

    case FLA_COMPLEX:
      FLA_Apply_G_rf_asc_var2( G_cols, A_rows, A_cols,
                               ( scomplex* ) FLA_COMPLEX_PTR( G ), G_rs, G_cs,
                               ( scomplex* ) FLA_COMPLEX_PTR( A ), A_rs, A_cs );
      break;

    case FLA_DOUBLE_COMPLEX:
      FLA_Apply_G_rf_asz_var2( G_cols, A_rows, A_cols,
                               ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( G ), G_rs, G_cs,
                               ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( A ), A_rs, A_cs );
      break;
  }

  return FLA_SUCCESS;
}
/*
 * Singular value decomposition driver (extended interface, variant 1).
 *
 * Outline of what the routine does:
 *   1. Allocates workspace: T/S for the bidiagonal reduction, the
 *      realifying vectors rL/rR (complex A only), the real diagonals d/e,
 *      the Givens scalar matrices G/H (k_accum columns each) and a 1x1
 *      scaling object.
 *   2. Scales A by FLA_SAFE_MIN or FLA_SAFE_INV_MIN when its largest
 *      absolute value is above FLA_OVERFLOW_SQUARE_THRES or below
 *      FLA_UNDERFLOW_SQUARE_THRES, respectively.
 *   3. If m < (17/9) n, reduces A directly to bidiagonal form; otherwise
 *      performs a QR factorization first and reduces the n-by-n factor R.
 *   4. Forms U and/or V as requested by jobu/jobv, applies the realifying
 *      scalars for complex data, and calls FLA_Bsvd_ext_opt_var1 on (d, e).
 *   5. Copies d into s, undoes the scaling on s, and releases workspace.
 *
 * jobu, jobv   one of FLA_SVD_VECTORS_ALL, FLA_SVD_VECTORS_MIN_COPY,
 *              FLA_SVD_VECTORS_MIN_OVERWRITE, FLA_SVD_VECTORS_NONE.
 *              With MIN_OVERWRITE the corresponding vectors are written
 *              into A.
 * n_iter_max   forwarded to FLA_Bsvd_ext_opt_var1.
 * A            input matrix; overwritten.
 * s            receives the contents of d after the bidiagonal SVD.
 * U, V         output objects for the singular vectors (see jobu/jobv).
 * k_accum      number of columns allocated for G and H.
 * b_alg        forwarded to FLA_Bsvd_ext_opt_var1.
 *
 * Returns the value returned by FLA_Bsvd_ext_opt_var1.
 */
FLA_Error FLA_Svd_ext_u_unb_var1( FLA_Svd_type jobu, FLA_Svd_type jobv,
                                  dim_t n_iter_max,
                                  FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V,
                                  dim_t k_accum,
                                  dim_t b_alg )
{
  const double       crossover_ratio = 17.0 / 9.0;

  const dim_t        m_A          = FLA_Obj_length( A );
  const dim_t        n_A          = FLA_Obj_width( A );
  const dim_t        min_m_n      = min( m_A, n_A );

  const FLA_Datatype dt_A         = FLA_Obj_datatype( A );
  const FLA_Datatype dt_real      = FLA_Obj_datatype_proj_to_real( A );
  const FLA_Datatype dt_comp      = FLA_Obj_datatype_proj_to_complex( A );

  // The datatype of A never changes below, so query it once.
  const FLA_Bool     A_is_complex = FLA_Obj_is_complex( A );

  FLA_Error          r_val        = FLA_SUCCESS;
  FLA_Bool           u_is_formed  = FALSE;
  FLA_Bool           v_is_formed  = FALSE;
  int                scale_mode;
  int                too_large, too_small;

  FLA_Obj            scale, T, S, rL, rR, d, e, G, H;
  FLA_Obj            C; // Placeholder; only materialized in the QR path.

  // Workspace for the block Householder transformations.
  FLA_Bidiag_UT_create_T( A, &T, &S );

  // Vectors for the realifying scalars are only needed for complex data.
  if ( A_is_complex )
  {
    FLA_Obj_create( dt_A, min_m_n, 1, 0, 0, &rL );
    FLA_Obj_create( dt_A, min_m_n, 1, 0, 0, &rR );
  }

  // Real vectors for the diagonal and the off-diagonal.
  FLA_Obj_create( dt_real, min_m_n,   1, 0, 0, &d );
  FLA_Obj_create( dt_real, min_m_n-1, 1, 0, 0, &e );

  // Storage for the left and right Givens scalars.
  FLA_Obj_create( dt_comp, min_m_n-1, k_accum, 0, 0, &G );
  FLA_Obj_create( dt_comp, min_m_n-1, k_accum, 0, 0, &H );

  // Real 1x1 object that receives max |A(i,j)|.
  FLA_Obj_create( dt_real, 1, 1, 0, 0, &scale );

  // scale_mode is +1 when A is too large, -1 when it is too small and
  // 0 when no scaling is required.
  FLA_Max_abs_value( A, scale );
  too_large  = ( FLA_Obj_gt( scale, FLA_OVERFLOW_SQUARE_THRES ) == TRUE );
  too_small  = ( FLA_Obj_lt( scale, FLA_UNDERFLOW_SQUARE_THRES ) == TRUE );
  scale_mode = too_large - too_small;

  if ( scale_mode )
    FLA_Scal( scale_mode > 0 ? FLA_SAFE_MIN : FLA_SAFE_INV_MIN, A );

  if ( m_A < crossover_ratio * n_A )
  {
    // ---- Direct path: bidiagonalize A itself. --------------------------

    // Reduce to bidiagonal form, rotate the superdiagonal into the real
    // domain (complex only) and pull out the two real diagonals.
    FLA_Bidiag_UT( A, T, S );
    if ( A_is_complex )
      FLA_Bidiag_UT_realify( A, rL, rR );
    FLA_Bidiag_UT_extract_real_diagonals( A, d, e );

    // Form U (and, when U overwrites A, V as well).
    if ( !u_is_formed )
    {
      switch ( jobu )
      {
        case FLA_SVD_VECTORS_MIN_OVERWRITE:
          // U is about to overwrite A, so V has to be formed first while
          // the Householder vectors in A are still intact.
          if ( jobv != FLA_SVD_VECTORS_NONE )
            FLA_Bidiag_UT_form_V_ext( FLA_UPPER_TRIANGULAR, A, S,
                                      FLA_NO_TRANSPOSE, V );
          v_is_formed = TRUE;
          U = A;
          /* fallthrough */
        case FLA_SVD_VECTORS_ALL:
        case FLA_SVD_VECTORS_MIN_COPY:
          FLA_Bidiag_UT_form_U_ext( FLA_UPPER_TRIANGULAR, A, T,
                                    FLA_NO_TRANSPOSE, U );
          u_is_formed = TRUE;
          break;
        case FLA_SVD_VECTORS_NONE:
          // Nothing to form.
          break;
      }
    }

    if ( !v_is_formed )
    {
      if ( jobv == FLA_SVD_VECTORS_MIN_OVERWRITE )
      {
        // V^H is formed in A; V then aliases A.
        FLA_Bidiag_UT_form_V_ext( FLA_UPPER_TRIANGULAR, A, S,
                                  FLA_CONJ_TRANSPOSE, A );
        v_is_formed = TRUE;
        V = A;

        // V^H -> V
        FLA_Obj_flip_base( &V );
        FLA_Obj_flip_view( &V );
        if ( A_is_complex )
          FLA_Conjugate( V );
      }
      else if ( jobv != FLA_SVD_VECTORS_NONE )
      {
        FLA_Bidiag_UT_form_V_ext( FLA_UPPER_TRIANGULAR, A, S,
                                  FLA_NO_TRANSPOSE, V );
        v_is_formed = TRUE;
      }
    }

    // Complex data: fold the realifying scalars into U and V.
    if ( A_is_complex )
    {
      if ( jobu != FLA_SVD_VECTORS_NONE )
      {
        FLA_Obj UL, UR;

        FLA_Part_1x2( U, &UL, &UR, min_m_n, FLA_LEFT );
        FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, rL, UL );
      }
      if ( jobv != FLA_SVD_VECTORS_NONE )
      {
        FLA_Obj VL, VR;

        FLA_Part_1x2( V, &VL, &VR, min_m_n, FLA_LEFT );
        FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, VL );
      }
    }

    // SVD of the upper bidiagonal matrix; C is not referenced.
    r_val = FLA_Bsvd_ext_opt_var1( n_iter_max,
                                   d, e, G, H,
                                   jobu, U, jobv, V,
                                   FALSE, C,
                                   b_alg );
  }
  else // crossover_ratio * n_A <= m_A
  {
    // ---- QR path: factor A = QR, then bidiagonalize R. -----------------
    FLA_Obj TQ, R;
    FLA_Obj AT, AB;

    FLA_QR_UT_create_T( A, &TQ );
    FLA_QR_UT( A, TQ );

    // R := upper triangle of the top n-by-n part of A, zero below.
    FLA_Part_2x1( A, &AT, &AB, n_A, FLA_TOP );
    FLA_Obj_create( dt_A, n_A, n_A, 0, 0, &R );
    FLA_Setr( FLA_LOWER_TRIANGULAR, FLA_ZERO, R );
    FLA_Copyr( FLA_UPPER_TRIANGULAR, AT, R );

    // Form Q in U; for MIN_OVERWRITE, U aliases A.
    if ( !u_is_formed )
    {
      switch ( jobu )
      {
        case FLA_SVD_VECTORS_MIN_OVERWRITE:
          U = A;
          /* fallthrough */
        case FLA_SVD_VECTORS_ALL:
        case FLA_SVD_VECTORS_MIN_COPY:
          FLA_QR_UT_form_Q( A, TQ, U );
          u_is_formed = TRUE;
          break;
        case FLA_SVD_VECTORS_NONE:
          // Nothing to form.
          break;
      }
    }
    FLA_Obj_free( &TQ );

    // Reduce R to bidiagonal form, realify it (complex only) and extract
    // the two real diagonals.
    FLA_Bidiag_UT( R, T, S );
    if ( FLA_Obj_is_complex( R ) )
      FLA_Bidiag_UT_realify( R, rL, rR );
    FLA_Bidiag_UT_extract_real_diagonals( R, d, e );

    if ( !v_is_formed )
    {
      if ( jobv == FLA_SVD_VECTORS_MIN_OVERWRITE )
      {
        // V^H is formed in the top part of A; V then aliases it.
        FLA_Bidiag_UT_form_V_ext( FLA_UPPER_TRIANGULAR, R, S,
                                  FLA_CONJ_TRANSPOSE, AT );
        v_is_formed = TRUE;
        V = AT;

        // V^H -> V
        FLA_Obj_flip_base( &V );
        FLA_Obj_flip_view( &V );
        if ( A_is_complex )
          FLA_Conjugate( V );
      }
      else if ( jobv != FLA_SVD_VECTORS_NONE )
      {
        FLA_Bidiag_UT_form_V_ext( FLA_UPPER_TRIANGULAR, R, S,
                                  FLA_NO_TRANSPOSE, V );
        v_is_formed = TRUE;
      }
    }

    // Form the left Householder factor of the bidiagonalization in R.
    FLA_Bidiag_UT_form_U_ext( FLA_UPPER_TRIANGULAR, R, T,
                              FLA_NO_TRANSPOSE, R );

    // Complex data: apply rL to R (standing in for U) and rR to V.
    if ( A_is_complex )
    {
      if ( jobu != FLA_SVD_VECTORS_NONE )
      {
        FLA_Obj RL, RR;

        FLA_Part_1x2( R, &RL, &RR, min_m_n, FLA_LEFT );
        FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, rL, RL );
      }
      if ( jobv != FLA_SVD_VECTORS_NONE )
      {
        FLA_Obj VL, VR;

        FLA_Part_1x2( V, &VL, &VR, min_m_n, FLA_LEFT );
        FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, VL );
      }
    }

    // SVD of the bidiagonal matrix; left rotations accumulate into R.
    r_val = FLA_Bsvd_ext_opt_var1( n_iter_max,
                                   d, e, G, H,
                                   jobu, R, jobv, V,
                                   FALSE, C,
                                   b_alg );

    // U := U * R.  A can serve as scratch space unless one of the vector
    // sets lives in A, in which case a temporary C is allocated.
    if ( jobu != FLA_SVD_VECTORS_NONE )
    {
      FLA_Obj UL, UR;

      FLA_Part_1x2( U, &UL, &UR, min_m_n, FLA_LEFT );

      if ( jobu == FLA_SVD_VECTORS_MIN_OVERWRITE ||
           jobv == FLA_SVD_VECTORS_MIN_OVERWRITE )
      {
        FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, UL, &C );
        FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                  FLA_ONE, UL, R, FLA_ZERO, C );
        FLA_Copy( C, UL );
        FLA_Obj_free( &C );
      }
      else
      {
        FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                  FLA_ONE, UL, R, FLA_ZERO, A );
        FLA_Copy( A, UL );
      }
    }

    FLA_Obj_free( &R );
  }

  // Hand the converged values in d to the caller.
  FLA_Copy( d, s );

  // No sort is needed here; FLA_Bsvd takes care of it.  Undo the scaling
  // that was applied to A (inverse factor of the one used above).
  if ( scale_mode )
    FLA_Scal( scale_mode < 0 ? FLA_SAFE_MIN : FLA_SAFE_INV_MIN, s );

  // When V was requested as an overwrite, flip it again.  Conjugation is
  // applied first, with respect to the current dimensions, then the base
  // is flipped.
  if ( jobv == FLA_SVD_VECTORS_MIN_OVERWRITE )
  {
    if ( FLA_Obj_is_complex( V ) )
      FLA_Conjugate( V );
    FLA_Obj_flip_base( &V );
  }

  // Release workspace.
  FLA_Obj_free( &scale );
  FLA_Obj_free( &T );
  FLA_Obj_free( &S );
  if ( A_is_complex )
  {
    FLA_Obj_free( &rL );
    FLA_Obj_free( &rR );
  }
  FLA_Obj_free( &d );
  FLA_Obj_free( &e );
  FLA_Obj_free( &G );
  FLA_Obj_free( &H );

  return r_val;
}
FLA_Error FLA_Syr2k_ut_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl ) { FLA_Obj AL, AR, A0, A1, A2; FLA_Obj BL, BR, B0, B1, B2; FLA_Obj CTL, CTR, C00, C01, C02, CBL, CBR, C10, C11, C12, C20, C21, C22; dim_t b; FLA_Scalr_internal( FLA_UPPER_TRIANGULAR, beta, C, FLA_Cntl_sub_scalr( cntl ) ); FLA_Part_1x2( A, &AL, &AR, 0, FLA_LEFT ); FLA_Part_1x2( B, &BL, &BR, 0, FLA_LEFT ); FLA_Part_2x2( C, &CTL, &CTR, &CBL, &CBR, 0, 0, FLA_TL ); while ( FLA_Obj_width( AL ) < FLA_Obj_width( A ) ){ b = FLA_Determine_blocksize( AR, FLA_RIGHT, FLA_Cntl_blocksize( cntl ) ); FLA_Repart_1x2_to_1x3( AL, /**/ AR, &A0, /**/ &A1, &A2, b, FLA_RIGHT ); FLA_Repart_1x2_to_1x3( BL, /**/ BR, &B0, /**/ &B1, &B2, b, FLA_RIGHT ); FLA_Repart_2x2_to_3x3( CTL, /**/ CTR, &C00, /**/ &C01, &C02, /* ************* */ /* ******************** */ &C10, /**/ &C11, &C12, CBL, /**/ CBR, &C20, /**/ &C21, &C22, b, b, FLA_BR ); /*------------------------------------------------------------*/ /* C01 = C01 + B0' * A1 */ FLA_Gemm_internal( FLA_TRANSPOSE, FLA_NO_TRANSPOSE, alpha, B0, A1, FLA_ONE, C01, FLA_Cntl_sub_gemm1( cntl ) ); /* C12 = C12 + A1' * B2 */ FLA_Gemm_internal( FLA_TRANSPOSE, FLA_NO_TRANSPOSE, alpha, A1, B2, FLA_ONE, C12, FLA_Cntl_sub_gemm2( cntl ) ); /* C11 = C11 + A1' * B1 + B1' * A1 */ FLA_Syr2k_internal( FLA_UPPER_TRIANGULAR, FLA_TRANSPOSE, alpha, A1, B1, FLA_ONE, C11, FLA_Cntl_sub_syr2k( cntl ) ); /*------------------------------------------------------------*/ FLA_Cont_with_1x3_to_1x2( &AL, /**/ &AR, A0, A1, /**/ A2, FLA_LEFT ); FLA_Cont_with_1x3_to_1x2( &BL, /**/ &BR, B0, B1, /**/ B2, FLA_LEFT ); FLA_Cont_with_3x3_to_2x2( &CTL, /**/ &CTR, C00, C01, /**/ C02, C10, C11, /**/ C12, /* ************** */ /* ****************** */ &CBL, /**/ &CBR, C20, C21, /**/ C22, FLA_TL ); } return FLA_SUCCESS; }
/*
 * Times one Syrk "ln" implementation (OpenMP variants) and reports the
 * best run.
 *
 * variant   0 selects REF_Syrk_ln; 1..5 select FLA_Syrk_ln_*_var<variant>.
 *           Any other value runs nothing (only the timing overhead is
 *           measured).
 * type      for variants 1..5: FLA_ALG_OPENMP_1TASK, FLA_ALG_OPENMP_2TASKS
 *           or FLA_ALG_OPENMP_2LOOPS (variant 5 only supports 1TASK).
 *           An unsupported type prints "trouble".
 * nrepeats  number of timed repetitions; the minimum time is kept.
 * n, nb_alg, B   unused by this driver.
 * A, C      operands; C is restored to its original contents on return.
 * C_ref     for variant 0 it receives the computed C; for all other
 *           variants it is the reference that C is compared against.
 * dtime     out: best elapsed time.
 * diff      out: 0.0 for variant 0, otherwise FLA_Max_elemwise_diff( C, C_ref ).
 * gflops    out: length(A)^2 * width(A) / time / 1e9.
 *
 * dtime_old starts at 1.0e9 (same sentinel as the other timing drivers)
 * so that it is never read uninitialized when nrepeats <= 0.
 */
void time_Syrk_ln( int variant, int type, int nrepeats, int n, int nb_alg, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref, double *dtime, double *diff, double *gflops )
{
  int     irep;
  double  dtime_old = 1.0e9;
  FLA_Obj C_old;

  // Keep a pristine copy of C so every repetition starts from the same data.
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_old );
  FLA_Copy_external( C, C_old );

  for ( irep = 0; irep < nrepeats; irep++ )
  {
    FLA_Copy_external( C_old, C );

    *dtime = FLA_Clock();

    switch ( variant )
    {
      case 0:
        // Time reference implementation
        REF_Syrk_ln( FLA_ONE, A, FLA_ONE, C );
        break;

      case 1:
      {
        // Time variant 1
        switch ( type )
        {
          case FLA_ALG_OPENMP_1TASK:
            FLA_Syrk_ln_omp1t_var1( A, C );
            break;
          case FLA_ALG_OPENMP_2TASKS:
            FLA_Syrk_ln_omp2t_var1( A, C );
            break;
          case FLA_ALG_OPENMP_2LOOPS:
            FLA_Syrk_ln_omp2l_var1( A, C );
            break;
          default:
            printf("trouble\n");
        }
        break;
      }

      case 2:
      {
        // Time variant 2
        switch ( type )
        {
          case FLA_ALG_OPENMP_1TASK:
            FLA_Syrk_ln_omp1t_var2( A, C );
            break;
          case FLA_ALG_OPENMP_2TASKS:
            FLA_Syrk_ln_omp2t_var2( A, C );
            break;
          case FLA_ALG_OPENMP_2LOOPS:
            FLA_Syrk_ln_omp2l_var2( A, C );
            break;
          default:
            printf("trouble\n");
        }
        break;
      }

      case 3:
      {
        // Time variant 3
        switch ( type )
        {
          case FLA_ALG_OPENMP_1TASK:
            FLA_Syrk_ln_omp1t_var3( A, C );
            break;
          case FLA_ALG_OPENMP_2TASKS:
            FLA_Syrk_ln_omp2t_var3( A, C );
            break;
          case FLA_ALG_OPENMP_2LOOPS:
            FLA_Syrk_ln_omp2l_var3( A, C );
            break;
          default:
            printf("trouble\n");
        }
        break;
      }

      case 4:
      {
        // Time variant 4
        switch ( type )
        {
          case FLA_ALG_OPENMP_1TASK:
            FLA_Syrk_ln_omp1t_var4( A, C );
            break;
          case FLA_ALG_OPENMP_2TASKS:
            FLA_Syrk_ln_omp2t_var4( A, C );
            break;
          case FLA_ALG_OPENMP_2LOOPS:
            FLA_Syrk_ln_omp2l_var4( A, C );
            break;
          default:
            printf("trouble\n");
        }
        break;
      }

      case 5:
      {
        // Time variant 5
        switch ( type )
        {
          case FLA_ALG_OPENMP_1TASK:
            FLA_Syrk_ln_omp1t_var5( A, C );
            break;
          default:
            printf("trouble\n");
        }
        break;
      }
    }

    // Keep the fastest repetition.
    *dtime    = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  if ( variant == 0 )
  {
    // The reference run defines the expected result.
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }

  *gflops = 1.0 *
            FLA_Obj_length( A ) *
            FLA_Obj_length( A ) *
            FLA_Obj_width( A ) /
            dtime_old / 1e9;

  *dtime = dtime_old;

  // Leave C as it was on entry.
  FLA_Copy_external( C_old, C );
  FLA_Obj_free( &C_old );
}
/*
 * Times one Trmm "rln" implementation and reports the best run.
 *
 * variant   0 selects REF_Trmm (right side, lower triangular, no
 *           transpose, non-unit diagonal); 1..4 select
 *           FLA_Trmm_rln_{unb,blk}_var<variant>.  Other values run nothing.
 * type      for variants 1..4: FLA_ALG_UNBLOCKED or FLA_ALG_BLOCKED;
 *           anything else prints "trouble".
 * nrepeats  number of timed repetitions; the minimum time is kept
 *           (1.0e9 if no repetition is run).
 * n, B      unused by this driver.
 * nb_alg    blocksize used to build the control tree for the blocked
 *           variants.
 * A, C      operands; C is restored to its original contents on return.
 * C_ref     for variant 0 it receives the computed C; otherwise it is the
 *           reference that C is compared against.
 * dtime     out: best elapsed time.
 * diff      out: 0.0 for variant 0, otherwise FLA_Max_elemwise_diff( C, C_ref ).
 * gflops    out: length(C) * width(C) * width(A) / time / 1e9.
 */
void time_Trmm_rln( int variant, int type, int nrepeats, int n, int nb_alg, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref, double *dtime, double *diff, double *gflops )
{
  int              irep;
  double           best_time = 1.0e9;
  FLA_Obj          C_saved;

  fla_blocksize_t* blocksize;
  fla_gemm_t*      gemm_leaf;
  fla_trmm_t*      trmm_leaf;
  fla_trmm_t*      trmm_tree;

  // Control tree: one blocked level on top of flat subproblem leaves.
  blocksize = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
  gemm_leaf = FLA_Cntl_gemm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL );
  trmm_leaf = FLA_Cntl_trmm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL );
  trmm_tree = FLA_Cntl_trmm_obj_create( FLA_FLAT, variant, blocksize, trmm_leaf, gemm_leaf );

  // Keep a pristine copy of C so every repetition starts from the same data.
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_saved );
  FLA_Copy_external( C, C_saved );

  for ( irep = 0; irep < nrepeats; irep++ )
  {
    FLA_Copy_external( C_saved, C );

    *dtime = FLA_Clock();

    if ( variant == 0 )
    {
      // Time reference implementation
      REF_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE,
                FLA_NONUNIT_DIAG, FLA_ONE, A, C );
    }
    else if ( 1 <= variant && variant <= 4 )
    {
      // Time variant 1..4, unblocked or blocked
      if ( type == FLA_ALG_UNBLOCKED )
      {
        if ( variant == 1 )
          FLA_Trmm_rln_unb_var1( FLA_NONUNIT_DIAG, FLA_ONE, A, C );
        else if ( variant == 2 )
          FLA_Trmm_rln_unb_var2( FLA_NONUNIT_DIAG, FLA_ONE, A, C );
        else if ( variant == 3 )
          FLA_Trmm_rln_unb_var3( FLA_NONUNIT_DIAG, FLA_ONE, A, C );
        else
          FLA_Trmm_rln_unb_var4( FLA_NONUNIT_DIAG, FLA_ONE, A, C );
      }
      else if ( type == FLA_ALG_BLOCKED )
      {
        if ( variant == 1 )
          FLA_Trmm_rln_blk_var1( FLA_NONUNIT_DIAG, FLA_ONE, A, C, trmm_tree );
        else if ( variant == 2 )
          FLA_Trmm_rln_blk_var2( FLA_NONUNIT_DIAG, FLA_ONE, A, C, trmm_tree );
        else if ( variant == 3 )
          FLA_Trmm_rln_blk_var3( FLA_NONUNIT_DIAG, FLA_ONE, A, C, trmm_tree );
        else
          FLA_Trmm_rln_blk_var4( FLA_NONUNIT_DIAG, FLA_ONE, A, C, trmm_tree );
      }
      else
      {
        printf("trouble\n");
      }
    }

    // Keep the fastest repetition.
    *dtime    = FLA_Clock() - *dtime;
    best_time = min( *dtime, best_time );
  }

  FLA_Cntl_obj_free( trmm_tree );
  FLA_Cntl_obj_free( trmm_leaf );
  FLA_Cntl_obj_free( gemm_leaf );
  FLA_Blocksize_free( blocksize );

  if ( variant == 0 )
  {
    // The reference run defines the expected result.
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }

  *gflops = 1.0 *
            FLA_Obj_length( C ) *
            FLA_Obj_width( C ) *
            FLA_Obj_width( A ) /
            best_time / 1.0e9;

  *dtime = best_time;

  // Leave C as it was on entry.
  FLA_Copy_external( C_saved, C );
  FLA_Obj_free( &C_saved );
}
/*
 * Object-level wrapper around the optimized, variant-2 LQ_UT kernels.
 *
 * Extracts the dimensions of A and the strides and buffers of A and T and
 * calls FLA_LQ_UT_op{s,d,c,z}_var2 according to the datatype of A.  The
 * buffer of T is accessed with the same datatype as A.  If A has an
 * unrecognized datatype, no kernel is called.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_LQ_UT_opt_var2( FLA_Obj A, FLA_Obj T )
{
  const FLA_Datatype dt_A   = FLA_Obj_datatype( A );

  const int          A_rows = FLA_Obj_length( A );
  const int          A_cols = FLA_Obj_width( A );

  const int          A_rs   = FLA_Obj_row_stride( A );
  const int          A_cs   = FLA_Obj_col_stride( A );
  const int          T_rs   = FLA_Obj_row_stride( T );
  const int          T_cs   = FLA_Obj_col_stride( T );

  if ( dt_A == FLA_FLOAT )
  {
    float* A_buf = FLA_FLOAT_PTR( A );
    float* T_buf = FLA_FLOAT_PTR( T );

    FLA_LQ_UT_ops_var2( A_rows, A_cols,
                        A_buf, A_rs, A_cs,
                        T_buf, T_rs, T_cs );
  }
  else if ( dt_A == FLA_DOUBLE )
  {
    double* A_buf = FLA_DOUBLE_PTR( A );
    double* T_buf = FLA_DOUBLE_PTR( T );

    FLA_LQ_UT_opd_var2( A_rows, A_cols,
                        A_buf, A_rs, A_cs,
                        T_buf, T_rs, T_cs );
  }
  else if ( dt_A == FLA_COMPLEX )
  {
    scomplex* A_buf = FLA_COMPLEX_PTR( A );
    scomplex* T_buf = FLA_COMPLEX_PTR( T );

    FLA_LQ_UT_opc_var2( A_rows, A_cols,
                        A_buf, A_rs, A_cs,
                        T_buf, T_rs, T_cs );
  }
  else if ( dt_A == FLA_DOUBLE_COMPLEX )
  {
    dcomplex* A_buf = FLA_DOUBLE_COMPLEX_PTR( A );
    dcomplex* T_buf = FLA_DOUBLE_COMPLEX_PTR( T );

    FLA_LQ_UT_opz_var2( A_rows, A_cols,
                        A_buf, A_rs, A_cs,
                        T_buf, T_rs, T_cs );
  }

  return FLA_SUCCESS;
}
/*
 * Herk on device buffers via cuBLAS.
 *
 * The host objects A and C supply the datatype and dimensions, alpha and
 * beta supply the scalars, and A_gpu / C_gpu are the buffers handed to
 * cuBLAS.  Real datatypes are dispatched to cublas{S,D}syrk and complex
 * datatypes to cublas{C,Z}herk; alpha and beta are read as real scalars
 * in all four cases.  The leading dimensions passed to cuBLAS are the
 * lengths of A and C, not their strides.
 *
 * When full error checking is enabled, FLA_Herk_check is called first.
 * If C has a zero dimension the routine returns without calling cuBLAS.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Herk_external_gpu( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu )
{
  FLA_Datatype dt_A;
  int          inner_dim;
  int          A_rows, A_cols;
  int          C_rows;
  int          A_ld;
  int          C_ld;
  char         uplo_ch;
  char         trans_ch;

  if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING )
    FLA_Herk_check( uplo, trans, alpha, A, beta, C );

  // Nothing to update.
  if ( FLA_Obj_has_zero_dim( C ) ) return FLA_SUCCESS;

  dt_A   = FLA_Obj_datatype( A );

  A_rows = FLA_Obj_length( A );
  A_cols = FLA_Obj_width( A );
  A_ld   = FLA_Obj_length( A );

  C_rows = FLA_Obj_length( C );
  C_ld   = FLA_Obj_length( C );

  // The dimension of A that is summed over depends on trans.
  inner_dim = ( trans == FLA_NO_TRANSPOSE ) ? A_cols : A_rows;

  FLA_Param_map_flame_to_netlib_uplo( uplo, &uplo_ch );
  FLA_Param_map_flame_to_netlib_trans( trans, &trans_ch );

  if ( dt_A == FLA_FLOAT )
  {
    float* alpha_p = ( float * ) FLA_FLOAT_PTR( alpha );
    float* beta_p  = ( float * ) FLA_FLOAT_PTR( beta );

    cublasSsyrk( uplo_ch, trans_ch,
                 C_rows, inner_dim,
                 *alpha_p,
                 ( float * ) A_gpu, A_ld,
                 *beta_p,
                 ( float * ) C_gpu, C_ld );
  }
  else if ( dt_A == FLA_DOUBLE )
  {
    double* alpha_p = ( double * ) FLA_DOUBLE_PTR( alpha );
    double* beta_p  = ( double * ) FLA_DOUBLE_PTR( beta );

    cublasDsyrk( uplo_ch, trans_ch,
                 C_rows, inner_dim,
                 *alpha_p,
                 ( double * ) A_gpu, A_ld,
                 *beta_p,
                 ( double * ) C_gpu, C_ld );
  }
  else if ( dt_A == FLA_COMPLEX )
  {
    float* alpha_p = ( float * ) FLA_FLOAT_PTR( alpha );
    float* beta_p  = ( float * ) FLA_FLOAT_PTR( beta );

    cublasCherk( uplo_ch, trans_ch,
                 C_rows, inner_dim,
                 *alpha_p,
                 ( cuComplex * ) A_gpu, A_ld,
                 *beta_p,
                 ( cuComplex * ) C_gpu, C_ld );
  }
  else if ( dt_A == FLA_DOUBLE_COMPLEX )
  {
    double* alpha_p = ( double * ) FLA_DOUBLE_PTR( alpha );
    double* beta_p  = ( double * ) FLA_DOUBLE_PTR( beta );

    cublasZherk( uplo_ch, trans_ch,
                 C_rows, inner_dim,
                 *alpha_p,
                 ( cuDoubleComplex * ) A_gpu, A_ld,
                 *beta_p,
                 ( cuDoubleComplex * ) C_gpu, C_ld );
  }

  return FLA_SUCCESS;
}
/*
 * Gemv front end that forwards to the bl1_?gemv kernels.
 *
 * Dimensions and strides are taken from A, increments from x and y, and
 * the kernel is chosen by the datatype of A.  x is always passed with the
 * "no conjugate" flag.  alpha, beta, x and y are accessed with the same
 * datatype as A.
 *
 * When full error checking is enabled, FLA_Gemv_check is called first.
 * If A has a zero dimension, y is scaled by beta through
 * FLA_Scal_external and no kernel is called.  If A has an unrecognized
 * datatype, no kernel is called either.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Gemv_external( FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y )
{
  FLA_Datatype dt_A;
  int          A_rows, A_cols;
  int          A_rs, A_cs;
  int          x_inc;
  int          y_inc;
  trans1_t     bl1_transa;
  conj1_t      bl1_conjx;

  if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING )
    FLA_Gemv_check( transa, alpha, A, x, beta, y );

  // With an empty A only the beta * y part of the update remains.
  if ( FLA_Obj_has_zero_dim( A ) )
  {
    FLA_Scal_external( beta, y );
    return FLA_SUCCESS;
  }

  dt_A   = FLA_Obj_datatype( A );

  A_rows = FLA_Obj_length( A );
  A_cols = FLA_Obj_width( A );
  A_rs   = FLA_Obj_row_stride( A );
  A_cs   = FLA_Obj_col_stride( A );

  x_inc  = FLA_Obj_vector_inc( x );
  y_inc  = FLA_Obj_vector_inc( y );

  FLA_Param_map_flame_to_blis_trans( transa, &bl1_transa );
  FLA_Param_map_flame_to_blis_conj( FLA_NO_CONJUGATE, &bl1_conjx );

  if ( dt_A == FLA_FLOAT )
  {
    bl1_sgemv( bl1_transa, bl1_conjx,
               A_rows, A_cols,
               ( float * ) FLA_FLOAT_PTR( alpha ),
               ( float * ) FLA_FLOAT_PTR( A ), A_rs, A_cs,
               ( float * ) FLA_FLOAT_PTR( x ), x_inc,
               ( float * ) FLA_FLOAT_PTR( beta ),
               ( float * ) FLA_FLOAT_PTR( y ), y_inc );
  }
  else if ( dt_A == FLA_DOUBLE )
  {
    bl1_dgemv( bl1_transa, bl1_conjx,
               A_rows, A_cols,
               ( double * ) FLA_DOUBLE_PTR( alpha ),
               ( double * ) FLA_DOUBLE_PTR( A ), A_rs, A_cs,
               ( double * ) FLA_DOUBLE_PTR( x ), x_inc,
               ( double * ) FLA_DOUBLE_PTR( beta ),
               ( double * ) FLA_DOUBLE_PTR( y ), y_inc );
  }
  else if ( dt_A == FLA_COMPLEX )
  {
    bl1_cgemv( bl1_transa, bl1_conjx,
               A_rows, A_cols,
               ( scomplex * ) FLA_COMPLEX_PTR( alpha ),
               ( scomplex * ) FLA_COMPLEX_PTR( A ), A_rs, A_cs,
               ( scomplex * ) FLA_COMPLEX_PTR( x ), x_inc,
               ( scomplex * ) FLA_COMPLEX_PTR( beta ),
               ( scomplex * ) FLA_COMPLEX_PTR( y ), y_inc );
  }
  else if ( dt_A == FLA_DOUBLE_COMPLEX )
  {
    bl1_zgemv( bl1_transa, bl1_conjx,
               A_rows, A_cols,
               ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( alpha ),
               ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A ), A_rs, A_cs,
               ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( x ), x_inc,
               ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( beta ),
               ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( y ), y_inc );
  }

  return FLA_SUCCESS;
}
/*
 * Times the reference Syrk "ln" implementation and reports the best run.
 *
 * variant   only 0 (REF_Syrk_ln) is supported; any other value prints
 *           "trouble" on every repetition.
 * type, n, nb_alg, B   unused by this driver.
 * nrepeats  number of timed repetitions; the minimum time is kept.
 * A, C      operands; C is restored to its original contents on return.
 * Cref      for variant 0 it receives the computed C; otherwise it is the
 *           reference that C is compared against.
 * dtime     out: best elapsed time.
 * diff      out: 0.0 for variant 0, otherwise FLA_Max_elemwise_diff( C, Cref ).
 * gflops    out: length(A)^2 * width(A) / time / 1e9.
 *
 * dtime_old starts at 1.0e9 (same sentinel as the other timing drivers)
 * so that it is never read uninitialized when nrepeats <= 0.
 */
void time_Syrk_ln( int variant, int type, int nrepeats, int n, int nb_alg, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj Cref, double *dtime, double *diff, double *gflops )
{
  int     irep;
  double  dtime_old = 1.0e9;
  FLA_Obj Cold;

  // Keep a pristine copy of C so every repetition starts from the same data.
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &Cold );
  FLA_Copy_external( C, Cold );

  for ( irep = 0; irep < nrepeats; irep++ )
  {
    FLA_Copy_external( Cold, C );

    *dtime = FLA_Clock();

    switch ( variant )
    {
      case 0:
        // Time reference implementation
        REF_Syrk_ln( FLA_ONE, A, FLA_ONE, C );
        break;
      default:
        printf("trouble\n");
        break;
    }

    // Keep the fastest repetition.
    *dtime    = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  if ( variant == 0 )
  {
    // The reference run defines the expected result.
    FLA_Copy_external( C, Cref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLA_Max_elemwise_diff( C, Cref );
  }

  *gflops = 1.0 *
            FLA_Obj_length( A ) *
            FLA_Obj_length( A ) *
            FLA_Obj_width( A ) /
            dtime_old / 1e9;

  *dtime = dtime_old;

  // Leave C as it was on entry.
  FLA_Copy_external( Cold, C );
  FLA_Obj_free( &Cold );
}
/*
 * Times Symm for one side/uplo combination and reports the best run.
 *
 * param_combo  selects the operation parameters:
 *                0: FLA_LEFT,  FLA_LOWER_TRIANGULAR
 *                1: FLA_LEFT,  FLA_UPPER_TRIANGULAR
 *                2: FLA_RIGHT, FLA_LOWER_TRIANGULAR
 *                3: FLA_RIGHT, FLA_UPPER_TRIANGULAR
 *              Any other value runs nothing.
 * type         FLA_ALG_REFERENCE (REF_Symm) or FLA_ALG_FRONT (FLA_Symm);
 *              for a valid param_combo any other type prints "trouble".
 * nrepeats     number of timed repetitions; the minimum time is kept
 *              (1.0e9 if no repetition is run).
 * m, n         unused by this driver.
 * A, B, C      operands; the call computes with alpha = FLA_ONE and
 *              beta = FLA_ZERO.  C is restored to its original contents
 *              on return.
 * C_ref        for FLA_ALG_REFERENCE it receives the computed C;
 *              otherwise it is the reference that C is compared against.
 * dtime        out: best elapsed time.
 * diff         out: 0.0 for the reference run, otherwise
 *              FLA_Max_elemwise_diff( C, C_ref ).
 * gflops       out: 2 * length(C) * width(C) * width(A) / time / 1e9,
 *              multiplied by 4 when C is complex.
 */
void time_Symm( int param_combo, int type, int nrepeats, int m, int n, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref, double *dtime, double *diff, double *gflops )
{
  int     irep;
  double  best_time = 1.0e9;
  FLA_Obj C_saved;

  // Keep a pristine copy of C so every repetition starts from the same data.
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_saved );
  FLA_Copy_external( C, C_saved );

  for ( irep = 0; irep < nrepeats; irep++ )
  {
    FLA_Copy_external( C_saved, C );

    *dtime = FLA_Clock();

    // Combinations 0 and 1 use the left side, 2 and 3 the right side;
    // even combinations use the lower triangle, odd ones the upper.
    if ( 0 <= param_combo && param_combo <= 3 )
    {
      if ( type == FLA_ALG_REFERENCE )
      {
        REF_Symm( param_combo < 2 ? FLA_LEFT : FLA_RIGHT,
                  param_combo % 2 == 0 ? FLA_LOWER_TRIANGULAR : FLA_UPPER_TRIANGULAR,
                  FLA_ONE, A, B, FLA_ZERO, C );
      }
      else if ( type == FLA_ALG_FRONT )
      {
        FLA_Symm( param_combo < 2 ? FLA_LEFT : FLA_RIGHT,
                  param_combo % 2 == 0 ? FLA_LOWER_TRIANGULAR : FLA_UPPER_TRIANGULAR,
                  FLA_ONE, A, B, FLA_ZERO, C );
      }
      else
      {
        printf("trouble\n");
      }
    }

    // Keep the fastest repetition.
    *dtime    = FLA_Clock() - *dtime;
    best_time = min( *dtime, best_time );
  }

  if ( type == FLA_ALG_REFERENCE )
  {
    // The reference run defines the expected result.
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }

  *gflops = 2.0 *
            FLA_Obj_length( C ) *
            FLA_Obj_width( C ) *
            FLA_Obj_width( A ) /
            best_time / 1.0e9;

  if ( FLA_Obj_is_complex( C ) )
    *gflops *= 4.0;

  *dtime = best_time;

  // Leave C as it was on entry.
  FLA_Copy_external( C_saved, C );
  FLA_Obj_free( &C_saved );
}
/*
 * Axpy front end that forwards to the bl1_?axpymt kernels.
 *
 * The dimensions passed to the kernel are those of B; strides come from
 * A and B.  If A is conformal to B without transposition the kernel is
 * called with "no transpose", otherwise with "transpose".  The kernel is
 * chosen by the datatype of A, and alpha and B are accessed with that
 * same datatype.
 *
 * When full error checking is enabled, FLA_Axpy_check is called first.
 * If A has a zero dimension, or an unrecognized datatype, no kernel is
 * called.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Axpy_external( FLA_Obj alpha, FLA_Obj A, FLA_Obj B )
{
  FLA_Datatype dt_A;
  int          B_rows, B_cols;
  int          A_rs, A_cs;
  int          B_rs, B_cs;
  trans1_t     bl1_trans;

  if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING )
    FLA_Axpy_check( alpha, A, B );

  // Nothing to add.
  if ( FLA_Obj_has_zero_dim( A ) ) return FLA_SUCCESS;

  dt_A   = FLA_Obj_datatype( A );

  A_rs   = FLA_Obj_row_stride( A );
  A_cs   = FLA_Obj_col_stride( A );

  B_rows = FLA_Obj_length( B );
  B_cols = FLA_Obj_width( B );
  B_rs   = FLA_Obj_row_stride( B );
  B_cs   = FLA_Obj_col_stride( B );

  // If A does not match B as-is, it is taken to match B when transposed.
  FLA_Param_map_flame_to_blis_trans(
      FLA_Obj_is_conformal_to( FLA_NO_TRANSPOSE, A, B ) ? FLA_NO_TRANSPOSE
                                                        : FLA_TRANSPOSE,
      &bl1_trans );

  if ( dt_A == FLA_FLOAT )
  {
    bl1_saxpymt( bl1_trans,
                 B_rows, B_cols,
                 ( float * ) FLA_FLOAT_PTR( alpha ),
                 ( float * ) FLA_FLOAT_PTR( A ), A_rs, A_cs,
                 ( float * ) FLA_FLOAT_PTR( B ), B_rs, B_cs );
  }
  else if ( dt_A == FLA_DOUBLE )
  {
    bl1_daxpymt( bl1_trans,
                 B_rows, B_cols,
                 ( double * ) FLA_DOUBLE_PTR( alpha ),
                 ( double * ) FLA_DOUBLE_PTR( A ), A_rs, A_cs,
                 ( double * ) FLA_DOUBLE_PTR( B ), B_rs, B_cs );
  }
  else if ( dt_A == FLA_COMPLEX )
  {
    bl1_caxpymt( bl1_trans,
                 B_rows, B_cols,
                 ( scomplex * ) FLA_COMPLEX_PTR( alpha ),
                 ( scomplex * ) FLA_COMPLEX_PTR( A ), A_rs, A_cs,
                 ( scomplex * ) FLA_COMPLEX_PTR( B ), B_rs, B_cs );
  }
  else if ( dt_A == FLA_DOUBLE_COMPLEX )
  {
    bl1_zaxpymt( bl1_trans,
                 B_rows, B_cols,
                 ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( alpha ),
                 ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A ), A_rs, A_cs,
                 ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( B ), B_rs, B_cs );
  }

  return FLA_SUCCESS;
}
FLA_Error FLA_Trsmsx_external( FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C ) { FLA_Datatype datatype; int m_B, n_B; int rs_A, cs_A; int rs_B, cs_B; int rs_C, cs_C; side_t blis_side; uplo_t blis_uplo; trans_t blis_trans; diag_t blis_diag; if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING ) FLA_Trsmsx_check( side, uplo, trans, diag, alpha, A, B, beta, C ); if ( FLA_Obj_has_zero_dim( B ) ) return FLA_SUCCESS; datatype = FLA_Obj_datatype( A ); rs_A = FLA_Obj_row_stride( A ); cs_A = FLA_Obj_col_stride( A ); m_B = FLA_Obj_length( B ); n_B = FLA_Obj_width( B ); rs_B = FLA_Obj_row_stride( B ); cs_B = FLA_Obj_col_stride( B ); rs_C = FLA_Obj_row_stride( C ); cs_C = FLA_Obj_col_stride( C ); FLA_Param_map_flame_to_blis_side( side, &blis_side ); FLA_Param_map_flame_to_blis_uplo( uplo, &blis_uplo ); FLA_Param_map_flame_to_blis_trans( trans, &blis_trans ); FLA_Param_map_flame_to_blis_diag( diag, &blis_diag ); switch( datatype ){ case FLA_FLOAT: { float *buff_A = ( float * ) FLA_FLOAT_PTR( A ); float *buff_B = ( float * ) FLA_FLOAT_PTR( B ); float *buff_C = ( float * ) FLA_FLOAT_PTR( C ); float *buff_alpha = ( float * ) FLA_FLOAT_PTR( alpha ); float *buff_beta = ( float * ) FLA_FLOAT_PTR( beta ); bli_strsmsx( blis_side, blis_uplo, blis_trans, blis_diag, m_B, n_B, buff_alpha, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B, buff_beta, buff_C, rs_C, cs_C ); break; } case FLA_DOUBLE: { double *buff_A = ( double * ) FLA_DOUBLE_PTR( A ); double *buff_B = ( double * ) FLA_DOUBLE_PTR( B ); double *buff_C = ( double * ) FLA_DOUBLE_PTR( C ); double *buff_alpha = ( double * ) FLA_DOUBLE_PTR( alpha ); double *buff_beta = ( double * ) FLA_DOUBLE_PTR( beta ); bli_dtrsmsx( blis_side, blis_uplo, blis_trans, blis_diag, m_B, n_B, buff_alpha, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B, buff_beta, buff_C, rs_C, cs_C ); break; } case FLA_COMPLEX: { scomplex *buff_A = ( scomplex * ) FLA_COMPLEX_PTR( A ); scomplex *buff_B = ( 
scomplex * ) FLA_COMPLEX_PTR( B ); scomplex *buff_C = ( scomplex * ) FLA_COMPLEX_PTR( C ); scomplex *buff_alpha = ( scomplex * ) FLA_COMPLEX_PTR( alpha ); scomplex *buff_beta = ( scomplex * ) FLA_COMPLEX_PTR( beta ); bli_ctrsmsx( blis_side, blis_uplo, blis_trans, blis_diag, m_B, n_B, buff_alpha, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B, buff_beta, buff_C, rs_C, cs_C ); break; } case FLA_DOUBLE_COMPLEX: { dcomplex *buff_A = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A ); dcomplex *buff_B = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( B ); dcomplex *buff_C = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( C ); dcomplex *buff_alpha = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( alpha ); dcomplex *buff_beta = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( beta ); bli_ztrsmsx( blis_side, blis_uplo, blis_trans, blis_diag, m_B, n_B, buff_alpha, buff_A, rs_A, cs_A, buff_B, rs_B, cs_B, buff_beta, buff_C, rs_C, cs_C ); break; } } return FLA_SUCCESS; }
int main(int argc, char *argv[]) { int datatype, n_blocks_m, n_threads, m_input, n_input, m, n, p_first, p_last, p_inc, p, n_repeats, param_combo, i, n_param_combos = N_PARAM_COMBOS; dim_t nb_flash, nb_alg; char *colors = "brkgmcbrkgmcbrkgmc"; char *ticks = "o+*xso+*xso+*xso+*xs"; char m_dim_desc[14]; char n_dim_desc[14]; char m_dim_tag[10]; char n_dim_tag[10]; double max_gflops=6.0; double dtime, gflops, diff; FLA_Obj A, A_flat_ref, A_flat, B, B_flat, D, D_flat, t, T, T_flat; FLA_Init( ); fprintf( stdout, "%c number of repeats: ", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c enter algorithmic blocksize: ", '%' ); scanf( "%u", &nb_alg ); fprintf( stdout, "%c %u\n", '%', nb_alg ); fprintf( stdout, "%c enter problem size first, last, inc: ", '%' ); scanf( "%d%d%d", &p_first, &p_last, &p_inc ); fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc ); fprintf( stdout, "%c enter m n (-1 means bind to problem size): ", '%' ); scanf( "%d%d", &m_input, &n_input ); fprintf( stdout, "%c %d %d\n", '%', m_input, n_input ); fprintf( stdout, "%c enter the number of SuperMatrix threads: ", '%' ); scanf( "%d", &n_threads ); fprintf( stdout, "%c %d\n", '%', n_threads ); fprintf( stdout, "\nclear all;\n\n" ); if ( m_input > 0 ) { sprintf( m_dim_desc, "m = %d", m_input ); sprintf( m_dim_tag, "m%dc", m_input); } else if( m_input < -1 ) { sprintf( m_dim_desc, "m = p/%d", -m_input ); sprintf( m_dim_tag, "m%dp", -m_input ); } else if( m_input == -1 ) { sprintf( m_dim_desc, "m = p" ); sprintf( m_dim_tag, "m%dp", 1 ); } if ( n_input > 0 ) { sprintf( n_dim_desc, "n = %d", n_input ); sprintf( n_dim_tag, "n%dc", n_input); } else if( n_input < -1 ) { sprintf( n_dim_desc, "n = p/%d", -n_input ); sprintf( n_dim_tag, "n%dp", -n_input ); } else if( n_input == -1 ) { sprintf( n_dim_desc, "n = p" ); sprintf( n_dim_tag, "n%dp", 1 ); } //datatype = FLA_FLOAT; //datatype = FLA_DOUBLE; //datatype = FLA_COMPLEX; datatype = 
FLA_DOUBLE_COMPLEX; FLASH_Queue_set_num_threads( n_threads ); for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 ) { m = m_input; n = n_input; if( m < 0 ) m = p / abs(m_input); if( n < 0 ) n = p / abs(n_input); nb_flash = n; for ( param_combo = 0; param_combo < n_param_combos; param_combo++ ) { FLA_Obj_create( datatype, m, nb_flash, &A_flat ); FLA_Obj_create( datatype, m, nb_flash, &A_flat_ref ); FLA_Obj_create( datatype, m, nb_flash, &T_flat ); FLA_Obj_create( datatype, nb_flash, 1, &t ); FLASH_Obj_create( datatype, m, nb_flash, 1, &nb_flash, &A ); n_blocks_m = FLA_Obj_length( A ); FLASH_Obj_create_ext( datatype, nb_alg * n_blocks_m, nb_flash, 1, &nb_alg, &nb_flash, &T ); FLA_Set( FLA_ZERO, T_flat ); FLASH_Set( FLA_ZERO, T ); FLASH_Random_matrix( A ); FLASH_Obj_flatten( A, A_flat ); FLA_Part_2x1( A, &B, &D, 1, FLA_TOP ); FLA_Part_2x1( A_flat, &B_flat, &D_flat, FLA_Obj_width( A_flat ), FLA_TOP ); FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, *(FLASH_OBJ_PTR_AT(B)) ); FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, B_flat ); fprintf( stdout, "data_qr2ut_%s( %d, 1:5 ) = [ %d ", pc_str[param_combo], i, p ); fflush( stdout ); time_QR2_UT( param_combo, FLA_ALG_REFERENCE, n_repeats, m, n, A, A_flat_ref, B, B_flat, D, D_flat, A_flat, t, T, T_flat, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); time_QR2_UT( param_combo, FLA_ALG_FRONT, n_repeats, m, n, A, A_flat_ref, B, B_flat, D, D_flat, A_flat, t, T, T_flat, &dtime, &diff, &gflops ); fprintf( stdout, "%6.3lf %6.2le ", gflops, diff ); fflush( stdout ); fprintf( stdout, " ]; \n" ); fflush( stdout ); FLA_Obj_free( &A_flat ); FLA_Obj_free( &A_flat_ref ); FLA_Obj_free( &T_flat ); FLA_Obj_free( &t ); FLASH_Obj_free( &A ); FLASH_Obj_free( &T ); } fprintf( stdout, "\n" ); } fprintf( stdout, "figure;\n" ); fprintf( stdout, "hold on;\n" ); for ( i = 0; i < n_param_combos; i++ ) { fprintf( stdout, "plot( data_qr2ut_%s( :,1 ), data_qr2ut_%s( :, 2 ), 
'%c:%c' ); \n", pc_str[i], pc_str[i], colors[ i ], ticks[ i ] ); fprintf( stdout, "plot( data_qr2ut_%s( :,1 ), data_qr2ut_%s( :, 4 ), '%c-.%c' ); \n", pc_str[i], pc_str[i], colors[ i ], ticks[ i ] ); } fprintf( stdout, "legend( ... \n" ); for ( i = 0; i < n_param_combos; i++ ) fprintf( stdout, "'ref\\_qr2ut\\_%s', 'fla\\_qr2ut\\_%s', ... \n", pc_str[i], pc_str[i] ); fprintf( stdout, "'Location', 'SouthEast' ); \n" ); fprintf( stdout, "xlabel( 'problem size p' );\n" ); fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" ); fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops ); fprintf( stdout, "title( 'FLAME qr2ut front-end performance (%s, %s)' );\n", m_dim_desc, n_dim_desc ); fprintf( stdout, "print -depsc qr2ut_front_%s_%s.eps\n", m_dim_tag, n_dim_tag ); fprintf( stdout, "hold off;\n"); fflush( stdout ); FLA_Finalize( ); return 0; }
/*
 * FLA_Scale_diag
 *
 * Object-based front end for the typed bli_?scalediag kernels, which are
 * invoked on A with the scalar alpha and a diagonal offset argument of 0.
 *
 *   conj    FLAME conjugation flag, mapped to its BLIS counterpart.
 *   alpha   scalar object. For complex A it may be complex or real; the
 *           mixed-domain kernels (bli_csscalediag / bli_zdscalediag) are
 *           selected whenever alpha's datatype differs from A's.
 *   A       matrix object; its datatype selects the kernel.
 *
 * Arguments are validated when error checking is at least at the minimum
 * level. Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Scale_diag( FLA_Conj conj, FLA_Obj alpha, FLA_Obj A )
{
  FLA_Datatype dt_mat;
  FLA_Datatype dt_scalar;
  dim_t        m, n;
  dim_t        rs, cs;
  conj_t       conj_blis;

  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
    FLA_Scale_diag_check( conj, alpha, A );

  dt_mat    = FLA_Obj_datatype( A );
  dt_scalar = FLA_Obj_datatype( alpha );

  m  = FLA_Obj_length( A );
  n  = FLA_Obj_width( A );
  rs = FLA_Obj_row_stride( A );
  cs = FLA_Obj_col_stride( A );

  FLA_Param_map_flame_to_blis_conj( conj, &conj_blis );

  switch ( dt_mat )
  {
    case FLA_FLOAT:
    {
      float *a_ptr     = ( float * ) FLA_FLOAT_PTR( A );
      float *alpha_ptr = ( float * ) FLA_FLOAT_PTR( alpha );

      bli_sscalediag( conj_blis, 0, m, n, alpha_ptr, a_ptr, rs, cs );
      break;
    }

    case FLA_DOUBLE:
    {
      double *a_ptr     = ( double * ) FLA_DOUBLE_PTR( A );
      double *alpha_ptr = ( double * ) FLA_DOUBLE_PTR( alpha );

      bli_dscalediag( conj_blis, 0, m, n, alpha_ptr, a_ptr, rs, cs );
      break;
    }

    case FLA_COMPLEX:
    {
      scomplex *a_ptr = ( scomplex * ) FLA_COMPLEX_PTR( A );

      if ( dt_scalar == FLA_COMPLEX )
      {
        scomplex *alpha_ptr = ( scomplex * ) FLA_COMPLEX_PTR( alpha );

        bli_cscalediag( conj_blis, 0, m, n, alpha_ptr, a_ptr, rs, cs );
      }
      else
      {
        // Real scalar applied to a complex matrix.
        float *alpha_ptr = ( float * ) FLA_FLOAT_PTR( alpha );

        bli_csscalediag( conj_blis, 0, m, n, alpha_ptr, a_ptr, rs, cs );
      }
      break;
    }

    case FLA_DOUBLE_COMPLEX:
    {
      dcomplex *a_ptr = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A );

      if ( dt_scalar == FLA_DOUBLE_COMPLEX )
      {
        dcomplex *alpha_ptr = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( alpha );

        bli_zscalediag( conj_blis, 0, m, n, alpha_ptr, a_ptr, rs, cs );
      }
      else
      {
        // Real scalar applied to a complex matrix.
        double *alpha_ptr = ( double * ) FLA_DOUBLE_PTR( alpha );

        bli_zdscalediag( conj_blis, 0, m, n, alpha_ptr, a_ptr, rs, cs );
      }
      break;
    }
  }

  return FLA_SUCCESS;
}
/*
 * FLA_Tridiag_apply_Q_external
 *
 * Wrapper around the Fortran routines sormtr / dormtr / cunmtr / zunmtr,
 * selected by the datatype of A.
 *
 *   side, uplo, trans   FLAME parameters, mapped to their netlib character
 *                       equivalents.
 *   A, t                matrix and vector objects passed through to the
 *                       external routine.
 *   B                   matrix object passed through to the external
 *                       routine; its dimensions are the m and n handed
 *                       to that routine.
 *
 * The external routine is invoked twice: first with lwork = -1 as a
 * workspace query, then with a workspace of the size it reported.
 *
 * Returns the info value produced by the external routine (0 if A has a
 * zero dimension). When external LAPACK interfaces are not enabled the
 * function only raises FLA_EXTERNAL_LAPACK_NOT_IMPLEMENTED.
 */
FLA_Error FLA_Tridiag_apply_Q_external( FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Obj A, FLA_Obj t, FLA_Obj B )
{
  int          info = 0;
#ifdef FLA_ENABLE_EXTERNAL_LAPACK_INTERFACES
  FLA_Datatype dt;
  int          m, n;
  int          lda;
  int          ldb;
  int          k_t;
  int          lwork;
  char         side_c;
  char         uplo_c;
  char         trans_c;
  FLA_Obj      work;
  int          pass;

  if ( FLA_Obj_has_zero_dim( A ) ) return FLA_SUCCESS;

  dt  = FLA_Obj_datatype( A );

  lda = FLA_Obj_col_stride( A );

  m   = FLA_Obj_length( B );
  n   = FLA_Obj_width( B );
  ldb = FLA_Obj_col_stride( B );

  k_t = FLA_Obj_vector_dim( t );

  FLA_Param_map_flame_to_netlib_side( side, &side_c );
  FLA_Param_map_flame_to_netlib_uplo( uplo, &uplo_c );
  FLA_Param_map_flame_to_netlib_trans( trans, &trans_c );

  // Pass 0 is a workspace query (lwork == -1) against a one-element work
  // array; the external routine reports an ideal workspace size, based on
  // its internal block size, in that element.
  lwork = -1;
  FLA_Obj_create( dt, 1, 1, 0, 0, &work );

  for ( pass = 0; pass < 2; ++pass )
  {
    if ( pass == 1 )
    {
      // Pick up the size reported by the query, then replace the
      // one-element work object by one of that size.
      if      ( dt == FLA_FLOAT || dt == FLA_COMPLEX )
        lwork = ( int ) *FLA_FLOAT_PTR( work );
      else if ( dt == FLA_DOUBLE || dt == FLA_DOUBLE_COMPLEX )
        lwork = ( int ) *FLA_DOUBLE_PTR( work );

      FLA_Obj_free( &work );
      FLA_Obj_create( dt, lwork, 1, 0, 0, &work );
    }

    switch ( dt )
    {
      case FLA_FLOAT:
      {
        float *a_ptr    = ( float * ) FLA_FLOAT_PTR( A );
        float *t_ptr    = ( float * ) FLA_FLOAT_PTR( t );
        float *b_ptr    = ( float * ) FLA_FLOAT_PTR( B );
        float *work_ptr = ( float * ) FLA_FLOAT_PTR( work );

        F77_sormtr( &side_c, &uplo_c, &trans_c,
                    &m, &n,
                    a_ptr, &lda,
                    t_ptr,
                    b_ptr, &ldb,
                    work_ptr, &lwork,
                    &info );
        break;
      }

      case FLA_DOUBLE:
      {
        double *a_ptr    = ( double * ) FLA_DOUBLE_PTR( A );
        double *t_ptr    = ( double * ) FLA_DOUBLE_PTR( t );
        double *b_ptr    = ( double * ) FLA_DOUBLE_PTR( B );
        double *work_ptr = ( double * ) FLA_DOUBLE_PTR( work );

        F77_dormtr( &side_c, &uplo_c, &trans_c,
                    &m, &n,
                    a_ptr, &lda,
                    t_ptr,
                    b_ptr, &ldb,
                    work_ptr, &lwork,
                    &info );
        break;
      }

      case FLA_COMPLEX:
      {
        scomplex *a_ptr    = ( scomplex * ) FLA_COMPLEX_PTR( A );
        scomplex *t_ptr    = ( scomplex * ) FLA_COMPLEX_PTR( t );
        scomplex *b_ptr    = ( scomplex * ) FLA_COMPLEX_PTR( B );
        scomplex *work_ptr = ( scomplex * ) FLA_COMPLEX_PTR( work );

        F77_cunmtr( &side_c, &uplo_c, &trans_c,
                    &m, &n,
                    a_ptr, &lda,
                    t_ptr,
                    b_ptr, &ldb,
                    work_ptr, &lwork,
                    &info );
        break;
      }

      case FLA_DOUBLE_COMPLEX:
      {
        dcomplex *a_ptr    = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A );
        dcomplex *t_ptr    = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( t );
        dcomplex *b_ptr    = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( B );
        dcomplex *work_ptr = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( work );

        F77_zunmtr( &side_c, &uplo_c, &trans_c,
                    &m, &n,
                    a_ptr, &lda,
                    t_ptr,
                    b_ptr, &ldb,
                    work_ptr, &lwork,
                    &info );
        break;
      }
    }
  }

  FLA_Obj_free( &work );
#else
  FLA_Check_error_code( FLA_EXTERNAL_LAPACK_NOT_IMPLEMENTED );
#endif

  return info;
}
/*
 * REF_Svd_uv_components
 *
 * Reference SVD assembled from the *_external building blocks, with the
 * wall-clock time of each stage reported separately.
 *
 *   A            input matrix object (overwritten by the factorizations).
 *   s            output vector receiving the singular values.
 *   U, V         output matrices receiving the singular vectors.
 *   dtime_bred   time spent reducing to bidiagonal form.
 *   dtime_bsvd   time spent in the bidiagonal SVD.
 *   dtime_appq   time spent forming U, V (and Q in the QR path).
 *   dtime_qrfa   time spent in the initial QR factorization (0 when the
 *                direct path is taken).
 *   dtime_gemm   time spent in the final matrix product (0 when the direct
 *                path is taken).
 *
 * Two paths exist for m >= n: a direct bidiagonalization of A when
 * m < 1.6 n, otherwise a QR factorization followed by bidiagonalization
 * of the n-by-n factor R. The case m < n raises FLA_NOT_YET_IMPLEMENTED.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error REF_Svd_uv_components( FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V, double* dtime_bred, double* dtime_bsvd, double* dtime_appq, double* dtime_qrfa, double* dtime_gemm )
{
  FLA_Datatype dt_A;
  FLA_Datatype dt_A_real;
  dim_t        m_A, n_A;
  dim_t        min_m_n;
  FLA_Obj      tq, tu, tv, d, e, R;
  FLA_Obj      eT, epsilonB;
  FLA_Uplo     uplo = FLA_UPPER_TRIANGULAR;
  double       crossover_ratio = 16.0 / 10.0;
  double       t_start;

  dt_A      = FLA_Obj_datatype( A );
  dt_A_real = FLA_Obj_datatype_proj_to_real( A );

  m_A       = FLA_Obj_length( A );
  n_A       = FLA_Obj_width( A );
  min_m_n   = FLA_Obj_min_dim( A );

  // Householder scalar vectors (same datatype as A) and the real
  // diagonal / off-diagonal vectors.
  FLA_Obj_create( dt_A,      min_m_n, 1, 0, 0, &tq );
  FLA_Obj_create( dt_A,      min_m_n, 1, 0, 0, &tu );
  FLA_Obj_create( dt_A,      min_m_n, 1, 0, 0, &tv );
  FLA_Obj_create( dt_A_real, min_m_n, 1, 0, 0, &d );
  FLA_Obj_create( dt_A_real, min_m_n, 1, 0, 0, &e );

  // eT is e without its last element.
  FLA_Part_2x1( e,   &eT,
                     &epsilonB,    1, FLA_BOTTOM );

  if ( m_A >= n_A )
  {
    if ( m_A < crossover_ratio * n_A )
    {
      /* Direct path: bidiagonalize A itself. */

      t_start = FLA_Clock();
      {
        FLA_Bidiag_blk_external( A, tu, tv );
        FLA_Bidiag_UT_extract_real_diagonals( A, d, eT );
      }
      *dtime_bred = FLA_Clock() - t_start;

      t_start = FLA_Clock();
      {
        // U is formed from the lower triangle of A.
        FLA_Copyr_external( FLA_LOWER_TRIANGULAR, A, U );
        FLA_Bidiag_form_U_external( U, tu );

        // V is formed from the upper triangle of A.
        FLA_Copyr_external( FLA_UPPER_TRIANGULAR, A, V );
        FLA_Bidiag_form_V_external( V, tv );
      }
      *dtime_appq = FLA_Clock() - t_start;

      t_start = FLA_Clock();
      {
        // Bidiagonal SVD (QR algorithm).
        FLA_Bsvd_external( uplo, d, e, U, V );
      }
      *dtime_bsvd = FLA_Clock() - t_start;

      *dtime_qrfa = 0.0;
      *dtime_gemm = 0.0;
    }
    else
    {
      /* QR path: factor A = Q R first, then work on the n-by-n R. */

      FLA_Obj AT,
              AB;
      FLA_Obj UL, UR;

      FLA_Part_2x1( A,   &AT,
                         &AB,        n_A, FLA_TOP );
      FLA_Part_1x2( U,   &UL, &UR,   n_A, FLA_LEFT );

      // Temporary n-by-n matrix R.
      FLA_Obj_create( dt_A, n_A, n_A, 0, 0, &R );

      t_start = FLA_Clock();
      {
        // QR factorization; R gets the upper triangle of the result with
        // a zeroed lower triangle, UL gets the lower triangle.
        FLA_QR_blk_external( A, tq );
        FLA_Setr( FLA_LOWER_TRIANGULAR, FLA_ZERO, R );
        FLA_Copyr_external( FLA_UPPER_TRIANGULAR, AT, R );
        FLA_Copyr_external( FLA_LOWER_TRIANGULAR, A, UL );
      }
      *dtime_qrfa = FLA_Clock() - t_start;

      t_start = FLA_Clock();
      {
        // Form Q in U.
        FLA_QR_form_Q_external( U, tq );
      }
      *dtime_appq = FLA_Clock() - t_start;

      t_start = FLA_Clock();
      {
        // Bidiagonalize R.
        FLA_Bidiag_blk_external( R, tu, tv );
        FLA_Bidiag_UT_extract_real_diagonals( R, d, eT );
      }
      *dtime_bred = FLA_Clock() - t_start;

      t_start = FLA_Clock();
      {
        // The upper triangle of R is saved to V before R is overwritten
        // by its own U factor; V is then formed in place.
        FLA_Copyr_external( FLA_UPPER_TRIANGULAR, R, V );
        FLA_Bidiag_form_U_external( R, tu );
        FLA_Bidiag_form_V_external( V, tv );
      }
      *dtime_appq += FLA_Clock() - t_start;

      t_start = FLA_Clock();
      {
        // Bidiagonal SVD (QR algorithm).
        FLA_Bsvd_external( uplo, d, e, R, V );
      }
      *dtime_bsvd = FLA_Clock() - t_start;

      t_start = FLA_Clock();
      {
        // UL * R is accumulated in A and copied back into UL.
        FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                           FLA_ONE, UL, R, FLA_ZERO, A );
        FLA_Copy( A, UL );
      }
      *dtime_gemm = FLA_Clock() - t_start;

      FLA_Obj_free( &R );
    }
  }
  else
  {
    FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED );
  }

  // Hand the singular values to the caller, then sort values and vectors.
  FLA_Copy( d, s );
  FLA_Sort_svd( FLA_BACKWARD, s, U, V );

  FLA_Obj_free( &tq );
  FLA_Obj_free( &tu );
  FLA_Obj_free( &tv );
  FLA_Obj_free( &d );
  FLA_Obj_free( &e );

  return FLA_SUCCESS;
}
/*
 * FLA_Svd_uv_var1_components
 *
 * SVD built from libflame's native building blocks (FLA_Bidiag_UT,
 * FLA_QR_UT, FLA_Bsvd_v_opt_var1), with the wall-clock time of each stage
 * reported separately.
 *
 *   n_iter_max   iteration limit forwarded to FLA_Bsvd_v_opt_var1.
 *   k_accum      number of columns of the Givens-scalar matrices G and H.
 *   b_alg        blocksize forwarded to FLA_Bsvd_v_opt_var1.
 *   A            input matrix object (overwritten).
 *   s            output vector receiving the singular values.
 *   U, V         output matrices receiving the singular vectors.
 *   dtime_*      per-stage timings; all five are preset to 1 and keep
 *                that value on the scalar early-exit path.
 *
 * Two paths exist for m >= n: a direct bidiagonalization of A when
 * m < (17/9) n, otherwise a QR factorization followed by
 * bidiagonalization of the n-by-n factor R. The case m < n raises
 * FLA_NOT_YET_IMPLEMENTED.
 *
 * Returns FLA_SUCCESS on the early-exit path, otherwise the value returned
 * by FLA_Bsvd_v_opt_var1.
 */
FLA_Error FLA_Svd_uv_var1_components( dim_t n_iter_max, dim_t k_accum, dim_t b_alg, FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V, double* dtime_bred, double* dtime_bsvd, double* dtime_appq, double* dtime_qrfa, double* dtime_gemm )
{
  FLA_Error    r_val = FLA_SUCCESS;
  FLA_Datatype dt;
  FLA_Datatype dt_real;
  FLA_Datatype dt_comp;
  FLA_Obj      T, S, rL, rR, d, e, G, H;
  dim_t        m_A, n_A;
  dim_t        min_m_n;
  dim_t        n_GH;
  double       crossover_ratio = 17.0 / 9.0;
  double       t_start;

  *dtime_bred = 1;
  *dtime_bsvd = 1;
  *dtime_appq = 1;
  *dtime_qrfa = 1;
  *dtime_gemm = 1;

  n_GH    = k_accum;

  m_A     = FLA_Obj_length( A );
  n_A     = FLA_Obj_width( A );
  min_m_n = FLA_Obj_min_dim( A );
  dt      = FLA_Obj_datatype( A );
  dt_real = FLA_Obj_datatype_proj_to_real( A );
  dt_comp = FLA_Obj_datatype_proj_to_complex( A );

  // Trivial case (smaller dimension is 1): copy A to s and use identity
  // singular vectors. Nothing has been allocated yet at this point.
  if ( min_m_n == 1 )
  {
    FLA_Copy( A, s );
    FLA_Set_to_identity( U );
    FLA_Set_to_identity( V );

    return FLA_SUCCESS;
  }

  // Block Householder transformation matrices (32 rows each).
  FLA_Obj_create( dt, 32, n_A, 0, 0, &T );
  FLA_Obj_create( dt, 32, n_A, 0, 0, &S );

  // Realifying scalars.
  FLA_Obj_create( dt, min_m_n, 1, 0, 0, &rL );
  FLA_Obj_create( dt, min_m_n, 1, 0, 0, &rR );

  // Diagonal and off-diagonal of the bidiagonal matrix.
  FLA_Obj_create( dt_real, min_m_n,   1, 0, 0, &d );
  FLA_Obj_create( dt_real, min_m_n-1, 1, 0, 0, &e );

  // Left and right Givens scalars.
  FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &G );
  FLA_Obj_create( dt_comp, min_m_n-1, n_GH, 0, 0, &H );

  if ( m_A >= n_A )
  {
    if ( m_A < crossover_ratio * n_A )
    {
      /* Direct path: bidiagonalize A itself. */

      t_start = FLA_Clock();
      {
        // Reduce to bidiagonal form, rotate the off-diagonal into the
        // real domain, and extract both diagonals from A.
        FLA_Bidiag_UT( A, T, S );
        FLA_Bidiag_UT_realify( A, rL, rR );
        FLA_Bidiag_UT_extract_diagonals( A, d, e );
      }
      *dtime_bred = FLA_Clock() - t_start;

      t_start = FLA_Clock();
      {
        FLA_Bidiag_UT_form_U( A, T, U );
        FLA_Bidiag_UT_form_V( A, S, V );
      }
      *dtime_appq = FLA_Clock() - t_start;

      // Fold the realifying scalars into the leading min(m,n) columns of
      // U and V (this step is not timed).
      {
        FLA_Obj UL, UR;
        FLA_Obj VL, VR;

        FLA_Part_1x2( U,   &UL, &UR,   min_m_n, FLA_LEFT );
        FLA_Part_1x2( V,   &VL, &VR,   min_m_n, FLA_LEFT );

        FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE,    rL, UL );
        FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, VL );
      }

      t_start = FLA_Clock();
      {
        // SVD of the bidiagonal matrix.
        r_val = FLA_Bsvd_v_opt_var1( n_iter_max, d, e, G, H, U, V, b_alg );
      }
      *dtime_bsvd = FLA_Clock() - t_start;

      *dtime_qrfa = 0.0;
      *dtime_gemm = 0.0;
    }
    else
    {
      /* QR path ( crossover_ratio * n_A <= m_A ). */

      FLA_Obj TQ, R;
      FLA_Obj AT,
              AB;
      FLA_Obj UL, UR;

      FLA_Obj_create( dt, 32, n_A, 0, 0, &TQ );

      t_start = FLA_Clock();
      {
        // QR factorization of A.
        FLA_QR_UT( A, TQ );
      }
      *dtime_qrfa = FLA_Clock() - t_start;

      t_start = FLA_Clock();
      {
        // Form Q in U.
        FLA_QR_UT_form_Q( A, TQ, U );
      }
      *dtime_appq = FLA_Clock() - t_start;

      FLA_Obj_free( &TQ );

      // R := upper triangle of the top n-by-n part of A, with a zeroed
      // lower triangle.
      FLA_Part_2x1( A,   &AT,
                         &AB,   n_A, FLA_TOP );
      FLA_Obj_create( dt, n_A, n_A, 0, 0, &R );
      FLA_Setr( FLA_LOWER_TRIANGULAR, FLA_ZERO, R );
      FLA_Copyr( FLA_UPPER_TRIANGULAR, AT, R );

      t_start = FLA_Clock();
      {
        // Reduce R to bidiagonal form, rotate the off-diagonal into the
        // real domain, and extract both diagonals from R.
        FLA_Bidiag_UT( R, T, S );
        FLA_Bidiag_UT_realify( R, rL, rR );
        FLA_Bidiag_UT_extract_diagonals( R, d, e );
      }
      *dtime_bred = FLA_Clock() - t_start;

      t_start = FLA_Clock();
      {
        // V must be formed first: it reads the right Householder vectors
        // from R, which the next call overwrites with U.
        FLA_Bidiag_UT_form_V( R, S, V );
        FLA_Bidiag_UT_form_U( R, T, R );
      }
      *dtime_appq += FLA_Clock() - t_start;

      // Fold the realifying scalars into R (holding U) and V.
      FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE,    rL, R );
      FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, rR, V );

      t_start = FLA_Clock();
      {
        // SVD of the bidiagonal matrix.
        r_val = FLA_Bsvd_v_opt_var1( n_iter_max, d, e, G, H, R, V, b_alg );
      }
      *dtime_bsvd = FLA_Clock() - t_start;

      FLA_Part_1x2( U,   &UL, &UR,   n_A, FLA_LEFT );

      t_start = FLA_Clock();
      {
        // UL * R is accumulated in A and copied back into UL.
        FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                  FLA_ONE, UL, R, FLA_ZERO, A );
        FLA_Copy( A, UL );
      }
      *dtime_gemm = FLA_Clock() - t_start;

      FLA_Obj_free( &R );
    }
  }
  else
  {
    /* m_A < n_A */
    FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED );
  }

  // Hand the converged singular values to the caller and sort values and
  // vectors.
  FLA_Copy( d, s );
  FLA_Sort_svd( FLA_BACKWARD, s, U, V );

  FLA_Obj_free( &T );
  FLA_Obj_free( &S );
  FLA_Obj_free( &rL );
  FLA_Obj_free( &rR );
  FLA_Obj_free( &d );
  FLA_Obj_free( &e );
  FLA_Obj_free( &G );
  FLA_Obj_free( &H );

  return r_val;
}
FLA_Error FLA_QR_UT_piv_blk_var2( FLA_Obj A, FLA_Obj T, FLA_Obj w, FLA_Obj p, fla_qrut_t* cntl ) { FLA_Obj ATL, ATR, A00, A01, A02, ABL, ABR, A10, A11, A12, A20, A21, A22; FLA_Obj TL, TR, T0, T1, W12; FLA_Obj TT, TB; FLA_Obj pT, p0, pB, p1, p2; FLA_Obj wT, w0, wB, w1, w2; dim_t b_alg, b; // Query the algorithmic blocksize by inspecting the length of T. b_alg = FLA_Obj_length( T ); FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_TL ); FLA_Part_1x2( T, &TL, &TR, 0, FLA_LEFT ); FLA_Part_2x1( p, &pT, &pB, 0, FLA_TOP ); FLA_Part_2x1( w, &wT, &wB, 0, FLA_TOP ); while ( FLA_Obj_min_dim( ABR ) > 0 ){ b = min( b_alg, FLA_Obj_min_dim( ABR ) ); FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &A01, &A02, /* ************* */ /* ******************** */ &A10, /**/ &A11, &A12, ABL, /**/ ABR, &A20, /**/ &A21, &A22, b, b, FLA_BR ); FLA_Repart_1x2_to_1x3( TL, /**/ TR, &T0, /**/ &T1, &W12, b, FLA_RIGHT ); FLA_Repart_2x1_to_3x1( pT, &p0, /* ** */ /* ** */ &p1, pB, &p2, b, FLA_BOTTOM ); FLA_Repart_2x1_to_3x1( wT, &w0, /* ** */ /* ** */ &w1, wB, &w2, b, FLA_BOTTOM ); /*------------------------------------------------------------*/ // ** Reshape T matrices to match the blocksize b FLA_Part_2x1( TR, &TT, &TB, b, FLA_TOP ); // ** Perform a unblocked (BLAS2-oriented) QR factorization // with pivoting via the UT transform on ABR: // // ABR -> QB1 R11 // // where: // - QB1 is formed from UB1 (which is stored column-wise below the // diagonal of ( A11 A21 )^T and the upper-triangle of T1. // - R11 is stored to ( A11 A12 ). // - W12 stores T and partial updates for FLA_Apply_Q_UT_piv_var. FLA_QR_UT_piv_internal( ABR, TT, wB, p1, FLA_Cntl_sub_qrut( cntl ) ); if ( FLA_Obj_width( A12 ) > 0 ) { // ** Block update FLA_Part_2x1( W12, &TT, &TB, b, FLA_TOP ); FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A21, TT, FLA_ONE, A22 ); } // ** Apply pivots to previous columns. 
FLA_Apply_pivots( FLA_RIGHT, FLA_TRANSPOSE, p1, ATR ); /*------------------------------------------------------------*/ FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, A01, /**/ A02, A10, A11, /**/ A12, /* ************** */ /* ****************** */ &ABL, /**/ &ABR, A20, A21, /**/ A22, FLA_TL ); FLA_Cont_with_1x3_to_1x2( &TL, /**/ &TR, T0, T1, /**/ W12, FLA_LEFT ); FLA_Cont_with_3x1_to_2x1( &pT, p0, p1, /* ** */ /* ** */ &pB, p2, FLA_TOP ); FLA_Cont_with_3x1_to_2x1( &wT, w0, w1, /* ** */ /* ** */ &wB, w2, FLA_TOP ); } return FLA_SUCCESS; }
/*
 * FLA_LU_piv_unb_var5
 *
 * Unblocked LU factorization with pivoting, variant 5. A is overwritten
 * in place and p receives one pivot entry per processed column.
 *
 * The loop advances one row and one column at a time from the top-left
 * corner and stops as soon as either dimension of A is exhausted.
 *
 * Always returns FLA_SUCCESS; no check on the pivot value is made before
 * the column is scaled by its inverse.
 */
FLA_Error FLA_LU_piv_unb_var5( FLA_Obj A, FLA_Obj p )
{
  FLA_Obj ATL, ATR, A00,  a01,     A02,
          ABL, ABR, a10t, alpha11, a12t,
                    A20,  a21,     A22;
  FLA_Obj pT, p0,
          pB, pi1,
              p2;
  FLA_Obj AB0, aB1, AB2;

  FLA_Part_2x2( A, &ATL, &ATR,
                   &ABL, &ABR, 0, 0, FLA_TL );
  FLA_Part_2x1( p, &pT,
                   &pB, 0, FLA_TOP );

  while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) &&
          FLA_Obj_width( ATL )  < FLA_Obj_width( A ) )
  {
    FLA_Repart_2x2_to_3x3( ATL, ATR,
                           &A00,  &a01,     &A02,
                           &a10t, &alpha11, &a12t,
                           ABL, ABR,
                           &A20,  &a21,     &A22,
                           1, 1, FLA_BR );
    FLA_Repart_2x1_to_3x1( pT,
                           &p0,
                           &pi1,
                           pB,
                           &p2,
                           1, FLA_BOTTOM );

    /* ---- loop body ------------------------------------------------ */

    /* aB1 := alpha11 stacked on top of a21 (the current column). */
    FLA_Merge_2x1( alpha11,
                   a21,      &aB1 );

    /* Determine the pivot index within the current column ... */
    FLA_Amax_external( aB1, pi1 );

    /* ... and apply the pivot to that column. */
    FLA_Apply_pivots( FLA_LEFT, FLA_NO_TRANSPOSE, pi1, aB1 );

    /* a21 := a21 / alpha11 */
    FLA_Inv_scal_external( alpha11, a21 );

    /* AB0 := a10t stacked on top of A20; pivot the previous columns. */
    FLA_Merge_2x1( a10t,
                   A20,      &AB0 );
    FLA_Apply_pivots( FLA_LEFT, FLA_NO_TRANSPOSE, pi1, AB0 );

    /* AB2 := a12t stacked on top of A22; pivot the remaining columns. */
    FLA_Merge_2x1( a12t,
                   A22,      &AB2 );
    FLA_Apply_pivots( FLA_LEFT, FLA_NO_TRANSPOSE, pi1, AB2 );

    /* A22 := A22 - a21 * a12t */
    FLA_Ger_external( FLA_MINUS_ONE, a21, a12t, A22 );

    /* ---- end of loop body ----------------------------------------- */

    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR,
                              A00,  a01,     A02,
                              a10t, alpha11, a12t,
                              &ABL, &ABR,
                              A20,  a21,     A22,
                              FLA_TL );
    FLA_Cont_with_3x1_to_2x1( &pT,
                              p0,
                              pi1,
                              &pB,
                              p2,
                              FLA_TOP );
  }

  return FLA_SUCCESS;
}
/*
 * FLA_Her2k_uh_unb_var6
 *
 * Unblocked Hermitian rank-2k update, upper-triangular / conjugate-
 * transpose case, variant 6.
 *
 *   alpha   scalar applied to every update term.
 *   A, B    input matrices, traversed one column at a time from right to
 *           left.
 *   beta    scalar used to pre-scale the upper triangle of C.
 *   C       matrix being updated; traversed from the bottom-right corner
 *           toward the top-left.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Her2k_uh_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C )
{
  FLA_Obj AL, AR, A0, a1, A2;
  FLA_Obj BL, BR, B0, b1, B2;
  FLA_Obj CTL, CTR, C00,  c01,     C02,
          CBL, CBR, c10t, gamma11, c12t,
                    C20,  c21,     C22;

  // Scale the upper triangle of C by beta once, up front.
  FLA_Scalr_external( FLA_UPPER_TRIANGULAR, beta, C );

  FLA_Part_1x2( A, &AL, &AR, 0, FLA_RIGHT );
  FLA_Part_1x2( B, &BL, &BR, 0, FLA_RIGHT );
  FLA_Part_2x2( C, &CTL, &CTR,
                   &CBL, &CBR, 0, 0, FLA_BR );

  while ( FLA_Obj_width( AR ) < FLA_Obj_width( A ) )
  {
    FLA_Repart_1x2_to_1x3( AL, AR,
                           &A0, &a1, &A2,
                           1, FLA_LEFT );
    FLA_Repart_1x2_to_1x3( BL, BR,
                           &B0, &b1, &B2,
                           1, FLA_LEFT );
    FLA_Repart_2x2_to_3x3( CTL, CTR,
                           &C00,  &c01,     &C02,
                           &c10t, &gamma11, &c12t,
                           CBL, CBR,
                           &C20,  &c21,     &C22,
                           1, 1, FLA_TL );

    /* ---- loop body ------------------------------------------------ */

    // c01 = c01 + alpha * A0' * b1
    FLA_Gemv_external( FLA_CONJ_TRANSPOSE, alpha, A0, b1, FLA_ONE, c01 );

    // c12t = c12t + alpha * b1' * A2
    FLA_Gemvc_external( FLA_TRANSPOSE, FLA_CONJUGATE, alpha, A2, b1, FLA_ONE, c12t );

    // gamma11 = gamma11 + alpha * ( a1' * b1 + b1' * a1 )
    // NOTE(review): exact placement/conjugation of alpha inside
    // FLA_Dot2cs_external is not visible here -- confirm.
    FLA_Dot2cs_external( FLA_CONJUGATE, alpha, a1, b1, FLA_ONE, gamma11 );

    /* ---- end of loop body ----------------------------------------- */

    FLA_Cont_with_1x3_to_1x2( &AL, &AR,
                              A0, a1, A2,
                              FLA_RIGHT );
    FLA_Cont_with_1x3_to_1x2( &BL, &BR,
                              B0, b1, B2,
                              FLA_RIGHT );
    FLA_Cont_with_3x3_to_2x2( &CTL, &CTR,
                              C00,  c01,     C02,
                              c10t, gamma11, c12t,
                              &CBL, &CBR,
                              C20,  c21,     C22,
                              FLA_BR );
  }

  return FLA_SUCCESS;
}
/*
 * FLA_Setr
 *
 * Object-based front end for the typed bl1_?setmr kernels, which are
 * invoked on A with the scalar alpha and the triangle selector uplo.
 *
 *   uplo    FLAME triangle selector, mapped to its BLIS counterpart.
 *   alpha   scalar object; read through a pointer of A's datatype.
 *   A       matrix object; its datatype selects the kernel.
 *
 * Arguments are validated only when full error checking is enabled.
 * Returns FLA_SUCCESS; the call is a no-op when A has a zero dimension.
 */
FLA_Error FLA_Setr( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A )
{
  FLA_Datatype dt;
  int          m, n;
  int          rs, cs;
  uplo1_t      uplo_blis;

  if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING )
    FLA_Setr_check( uplo, alpha, A );

  if ( FLA_Obj_has_zero_dim( A ) )
    return FLA_SUCCESS;

  dt = FLA_Obj_datatype( A );

  m  = FLA_Obj_length( A );
  n  = FLA_Obj_width( A );
  rs = FLA_Obj_row_stride( A );
  cs = FLA_Obj_col_stride( A );

  FLA_Param_map_flame_to_blis_uplo( uplo, &uplo_blis );

  if ( dt == FLA_FLOAT )
  {
    float *alpha_ptr = ( float * ) FLA_FLOAT_PTR( alpha );
    float *a_ptr     = ( float * ) FLA_FLOAT_PTR( A );

    bl1_ssetmr( uplo_blis, m, n, alpha_ptr, a_ptr, rs, cs );
  }
  else if ( dt == FLA_DOUBLE )
  {
    double *alpha_ptr = ( double * ) FLA_DOUBLE_PTR( alpha );
    double *a_ptr     = ( double * ) FLA_DOUBLE_PTR( A );

    bl1_dsetmr( uplo_blis, m, n, alpha_ptr, a_ptr, rs, cs );
  }
  else if ( dt == FLA_COMPLEX )
  {
    scomplex *alpha_ptr = ( scomplex * ) FLA_COMPLEX_PTR( alpha );
    scomplex *a_ptr     = ( scomplex * ) FLA_COMPLEX_PTR( A );

    bl1_csetmr( uplo_blis, m, n, alpha_ptr, a_ptr, rs, cs );
  }
  else if ( dt == FLA_DOUBLE_COMPLEX )
  {
    dcomplex *alpha_ptr = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( alpha );
    dcomplex *a_ptr     = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A );

    bl1_zsetmr( uplo_blis, m, n, alpha_ptr, a_ptr, rs, cs );
  }

  return FLA_SUCCESS;
}
/*
 * FLA_Max_abs_value_herm
 *
 * Object-based front end for the typed bl1_?maxabsmr kernels, which
 * receive the triangle selector, A's buffer and strides, and a pointer to
 * the result.
 *
 *   uplo     FLAME triangle selector, mapped to its BLIS counterpart.
 *   A        matrix object; only its width is queried, and that value is
 *            passed as both dimensions.
 *   maxabs   scalar object receiving the result; accessed as a real
 *            value of matching precision, also for complex A.
 *
 * Arguments are validated when error checking is at least at the minimum
 * level. Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Max_abs_value_herm( FLA_Uplo uplo, FLA_Obj A, FLA_Obj maxabs )
{
  FLA_Datatype dt;
  dim_t        n;
  dim_t        rs, cs;
  uplo1_t      uplo_blis;

  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
    FLA_Max_abs_value_herm_check( uplo, A, maxabs );

  dt = FLA_Obj_datatype( A );

  n  = FLA_Obj_width( A );
  rs = FLA_Obj_row_stride( A );
  cs = FLA_Obj_col_stride( A );

  FLA_Param_map_flame_to_blis_uplo( uplo, &uplo_blis );

  if ( dt == FLA_FLOAT )
  {
    float *a_ptr   = ( float * ) FLA_FLOAT_PTR( A );
    float *out_ptr = ( float * ) FLA_FLOAT_PTR( maxabs );

    bl1_smaxabsmr( uplo_blis, n, n, a_ptr, rs, cs, out_ptr );
  }
  else if ( dt == FLA_DOUBLE )
  {
    double *a_ptr   = ( double * ) FLA_DOUBLE_PTR( A );
    double *out_ptr = ( double * ) FLA_DOUBLE_PTR( maxabs );

    bl1_dmaxabsmr( uplo_blis, n, n, a_ptr, rs, cs, out_ptr );
  }
  else if ( dt == FLA_COMPLEX )
  {
    scomplex *a_ptr   = ( scomplex * ) FLA_COMPLEX_PTR( A );
    float    *out_ptr = ( float * ) FLA_FLOAT_PTR( maxabs );

    bl1_cmaxabsmr( uplo_blis, n, n, a_ptr, rs, cs, out_ptr );
  }
  else if ( dt == FLA_DOUBLE_COMPLEX )
  {
    dcomplex *a_ptr   = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( A );
    double   *out_ptr = ( double * ) FLA_DOUBLE_PTR( maxabs );

    bl1_zmaxabsmr( uplo_blis, n, n, a_ptr, rs, cs, out_ptr );
  }

  return FLA_SUCCESS;
}
/*
 * FLA_Sylv_nn_opt_var1
 *
 * Object-based front end for the typed FLA_Sylv_nn_op?_var1 kernels.
 *
 *   isgn    integer scalar object; its value is converted to a real
 *           of the working precision and passed by value as the sign.
 *   A, B    coefficient matrix objects (only their strides are queried).
 *   C       matrix object; supplies the problem dimensions m and n.
 *   scale   scalar object passed to the kernel by pointer.
 *
 * The datatype of A selects the kernel. Always returns FLA_SUCCESS; the
 * info value produced by the kernel is not propagated.
 */
FLA_Error FLA_Sylv_nn_opt_var1( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale )
{
  FLA_Datatype dt;
  int          m, n;
  int          a_rs, a_cs;
  int          b_rs, b_cs;
  int          c_rs, c_cs;
  int          info;

  dt   = FLA_Obj_datatype( A );

  a_rs = FLA_Obj_row_stride( A );
  a_cs = FLA_Obj_col_stride( A );

  b_rs = FLA_Obj_row_stride( B );
  b_cs = FLA_Obj_col_stride( B );

  m    = FLA_Obj_length( C );
  n    = FLA_Obj_width( C );
  c_rs = FLA_Obj_row_stride( C );
  c_cs = FLA_Obj_col_stride( C );

  if ( dt == FLA_FLOAT )
  {
    int*   isgn_ptr  = FLA_INT_PTR( isgn );
    float* a_ptr     = FLA_FLOAT_PTR( A );
    float* b_ptr     = FLA_FLOAT_PTR( B );
    float* c_ptr     = FLA_FLOAT_PTR( C );
    float* scale_ptr = FLA_FLOAT_PTR( scale );
    float  sign      = ( float ) *isgn_ptr;

    FLA_Sylv_nn_ops_var1( sign,
                          m, n,
                          a_ptr, a_rs, a_cs,
                          b_ptr, b_rs, b_cs,
                          c_ptr, c_rs, c_cs,
                          scale_ptr,
                          &info );
  }
  else if ( dt == FLA_DOUBLE )
  {
    int*    isgn_ptr  = FLA_INT_PTR( isgn );
    double* a_ptr     = FLA_DOUBLE_PTR( A );
    double* b_ptr     = FLA_DOUBLE_PTR( B );
    double* c_ptr     = FLA_DOUBLE_PTR( C );
    double* scale_ptr = FLA_DOUBLE_PTR( scale );
    double  sign      = ( double ) *isgn_ptr;

    FLA_Sylv_nn_opd_var1( sign,
                          m, n,
                          a_ptr, a_rs, a_cs,
                          b_ptr, b_rs, b_cs,
                          c_ptr, c_rs, c_cs,
                          scale_ptr,
                          &info );
  }
  else if ( dt == FLA_COMPLEX )
  {
    int*      isgn_ptr  = FLA_INT_PTR( isgn );
    scomplex* a_ptr     = FLA_COMPLEX_PTR( A );
    scomplex* b_ptr     = FLA_COMPLEX_PTR( B );
    scomplex* c_ptr     = FLA_COMPLEX_PTR( C );
    scomplex* scale_ptr = FLA_COMPLEX_PTR( scale );
    float     sign      = ( float ) *isgn_ptr;

    FLA_Sylv_nn_opc_var1( sign,
                          m, n,
                          a_ptr, a_rs, a_cs,
                          b_ptr, b_rs, b_cs,
                          c_ptr, c_rs, c_cs,
                          scale_ptr,
                          &info );
  }
  else if ( dt == FLA_DOUBLE_COMPLEX )
  {
    int*      isgn_ptr  = FLA_INT_PTR( isgn );
    dcomplex* a_ptr     = FLA_DOUBLE_COMPLEX_PTR( A );
    dcomplex* b_ptr     = FLA_DOUBLE_COMPLEX_PTR( B );
    dcomplex* c_ptr     = FLA_DOUBLE_COMPLEX_PTR( C );
    dcomplex* scale_ptr = FLA_DOUBLE_COMPLEX_PTR( scale );
    double    sign      = ( double ) *isgn_ptr;

    FLA_Sylv_nn_opz_var1( sign,
                          m, n,
                          a_ptr, a_rs, a_cs,
                          b_ptr, b_rs, b_cs,
                          c_ptr, c_rs, c_cs,
                          scale_ptr,
                          &info );
  }

  return FLA_SUCCESS;
}