FLA_Error FLA_Hemm_lu_unb_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C ) { FLA_Obj BL, BR, B0, b1t, B2; FLA_Obj CL, CR, C0, c1t, C2; FLA_Part_1x2( B, &BL, &BR, 0, FLA_RIGHT ); FLA_Part_1x2( C, &CL, &CR, 0, FLA_RIGHT ); while ( FLA_Obj_width( BR ) < FLA_Obj_width( B ) ){ FLA_Repart_1x2_to_1x3( BL, /**/ BR, &B0, &b1t, /**/ &B2, 1, FLA_LEFT ); FLA_Repart_1x2_to_1x3( CL, /**/ CR, &C0, &c1t, /**/ &C2, 1, FLA_LEFT ); /*------------------------------------------------------------*/ /* c1t = c1t + A * b1t */ FLA_Hemv_external( FLA_UPPER_TRIANGULAR, alpha, A, b1t, beta, c1t ); /*------------------------------------------------------------*/ FLA_Cont_with_1x3_to_1x2( &BL, /**/ &BR, B0, /**/ b1t, B2, FLA_RIGHT ); FLA_Cont_with_1x3_to_1x2( &CL, /**/ &CR, C0, /**/ c1t, C2, FLA_RIGHT ); } return FLA_SUCCESS; }
void libfla_test_hemm_experiment( test_params_t params, unsigned int var, char* sc_str, FLA_Datatype datatype, unsigned int p_cur, unsigned int pci, unsigned int n_repeats, signed int impl, double* perf, double* residual ) { dim_t b_flash = params.b_flash; dim_t b_alg_flat = params.b_alg_flat; double time_min = 1e9; double time; unsigned int i; unsigned int m; signed int m_input = -2; unsigned int n; signed int n_input = -1; FLA_Side side; FLA_Uplo uplo; FLA_Obj A, B, C, x, y, z, w, norm; FLA_Obj alpha, beta; FLA_Obj C_save; FLA_Obj A_test, B_test, C_test; // Determine the dimensions. if ( m_input < 0 ) m = p_cur / abs(m_input); else m = p_cur; if ( n_input < 0 ) n = p_cur / abs(n_input); else n = p_cur; // Translate parameter characters to libflame constants. FLA_Param_map_char_to_flame_side( &pc_str[pci][0], &side ); FLA_Param_map_char_to_flame_uplo( &pc_str[pci][1], &uplo ); // Create the matrices for the current operation. if ( side == FLA_LEFT ) { libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, m, &A ); // Create vectors for use in test. FLA_Obj_create( datatype, n, 1, 0, 0, &x ); FLA_Obj_create( datatype, m, 1, 0, 0, &y ); FLA_Obj_create( datatype, m, 1, 0, 0, &z ); FLA_Obj_create( datatype, m, 1, 0, 0, &w ); } else { libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], n, n, &A ); // Create vectors for use in test. FLA_Obj_create( datatype, n, 1, 0, 0, &x ); FLA_Obj_create( datatype, m, 1, 0, 0, &y ); FLA_Obj_create( datatype, m, 1, 0, 0, &z ); FLA_Obj_create( datatype, n, 1, 0, 0, &w ); } libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, n, &B ); libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], m, n, &C ); // Create a norm scalar. FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm ); // Initialize the test matrices. FLA_Random_herm_matrix( uplo, A ); FLA_Random_matrix( B ); FLA_Random_matrix( C ); // Initialize the test vectors. FLA_Random_matrix( x ); FLA_Set( FLA_ZERO, y ); FLA_Set( FLA_ZERO, z ); FLA_Set( FLA_ZERO, w ); // Set constants. alpha = FLA_TWO; beta = FLA_MINUS_ONE; // Save the original object contents in a temporary object. FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, C, &C_save ); // Use hierarchical matrices if we're testing the FLASH front-end. if ( impl == FLA_TEST_HIER_FRONT_END ) { FLASH_Obj_create_hier_copy_of_flat( A, 1, &b_flash, &A_test ); FLASH_Obj_create_hier_copy_of_flat( B, 1, &b_flash, &B_test ); FLASH_Obj_create_hier_copy_of_flat( C, 1, &b_flash, &C_test ); } else { A_test = A; B_test = B; C_test = C; } // Create a control tree for the individual variants. if ( impl == FLA_TEST_FLAT_UNB_VAR || impl == FLA_TEST_FLAT_OPT_VAR || impl == FLA_TEST_FLAT_BLK_VAR || impl == FLA_TEST_FLAT_UNB_EXT || impl == FLA_TEST_FLAT_BLK_EXT ) libfla_test_hemm_cntl_create( var, b_alg_flat ); // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { if ( impl == FLA_TEST_HIER_FRONT_END ) FLASH_Obj_hierarchify( C_save, C_test ); else FLA_Copy_external( C_save, C_test ); time = FLA_Clock(); libfla_test_hemm_impl( impl, side, uplo, alpha, A_test, B_test, beta, C_test ); time = FLA_Clock() - time; time_min = min( time_min, time ); } // Copy the solution to flat matrix X. if ( impl == FLA_TEST_HIER_FRONT_END ) { FLASH_Obj_flatten( C_test, C ); } else { // No action needed since C_test and C refer to the same object. } // Free the hierarchical matrices if we're testing the FLASH front-end. if ( impl == FLA_TEST_HIER_FRONT_END ) { FLASH_Obj_free( &A_test ); FLASH_Obj_free( &B_test ); FLASH_Obj_free( &C_test ); } // Free the control trees if we're testing the variants. if ( impl == FLA_TEST_FLAT_UNB_VAR || impl == FLA_TEST_FLAT_OPT_VAR || impl == FLA_TEST_FLAT_BLK_VAR || impl == FLA_TEST_FLAT_UNB_EXT || impl == FLA_TEST_FLAT_BLK_EXT ) libfla_test_hemm_cntl_free(); // Compute the performance of the best experiment repeat. if ( side == FLA_LEFT ) *perf = ( 1 * m * m * n ) / time_min / FLOPS_PER_UNIT_PERF; else *perf = ( 1 * m * n * n ) / time_min / FLOPS_PER_UNIT_PERF; if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0; // Compute: // y = C * x // and compare to // z = ( beta * C_orig + alpha * A * B ) x (side = left) // z = ( beta * C_orig + alpha * B * A ) x (side = right) FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, C, x, FLA_ZERO, y ); if ( side == FLA_LEFT ) { FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, B, x, FLA_ZERO, w ); FLA_Hemv_external( uplo, alpha, A, w, FLA_ZERO, z ); } else { FLA_Hemv_external( uplo, FLA_ONE, A, x, FLA_ZERO, w ); FLA_Gemv_external( FLA_NO_TRANSPOSE, alpha, B, w, FLA_ZERO, z ); } FLA_Gemv_external( FLA_NO_TRANSPOSE, beta, C_save, x, FLA_ONE, z ); // Compute || y - z ||. //FLA_Axpy_external( FLA_MINUS_ONE, y, z ); //FLA_Nrm2_external( z, norm ); //FLA_Obj_extract_real_scalar( norm, residual ); *residual = FLA_Max_elemwise_diff( y, z ); // Free the supporting flat objects. FLA_Obj_free( &C_save ); // Free the flat test matrices. FLA_Obj_free( &A ); FLA_Obj_free( &B ); FLA_Obj_free( &C ); FLA_Obj_free( &x ); FLA_Obj_free( &y ); FLA_Obj_free( &z ); FLA_Obj_free( &w ); FLA_Obj_free( &norm ); }
void time_Chol_u( int variant, int type, int nrepeats, int n, int nb_alg, FLA_Obj A, FLA_Obj b, FLA_Obj b_orig, FLA_Obj norm, double *dtime, double *diff, double *gflops ) { int irep; double dtime_save = 1.0e9; FLA_Obj A_save, b_save, b_orig_save; fla_blocksize_t* bp; fla_chol_t* cntl_chol_var; fla_chol_t* cntl_chol_unb; fla_syrk_t* cntl_syrk_blas; fla_herk_t* cntl_herk_blas; fla_trsm_t* cntl_trsm_blas; fla_gemm_t* cntl_gemm_blas; /* if( type == FLA_ALG_UNBLOCKED && n > 400 ) { *gflops = 0.0; *diff = 0.0; return; } */ bp = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg ); cntl_chol_unb = FLA_Cntl_chol_obj_create( FLA_FLAT, FLA_UNB_OPT_VARIANT2, NULL, NULL, NULL, NULL, NULL, NULL ); cntl_syrk_blas = FLA_Cntl_syrk_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL ); cntl_herk_blas = FLA_Cntl_herk_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL ); cntl_trsm_blas = FLA_Cntl_trsm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL ); cntl_gemm_blas = FLA_Cntl_gemm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL ); cntl_chol_var = FLA_Cntl_chol_obj_create( FLA_FLAT, variant, bp, cntl_chol_unb, cntl_syrk_blas, cntl_herk_blas, cntl_trsm_blas, cntl_gemm_blas ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, b, &b_save ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, b_orig, &b_orig_save ); FLA_Copy_external( A, A_save ); FLA_Copy_external( b, b_save ); FLA_Copy_external( b_orig, b_orig_save ); for ( irep = 0 ; irep < nrepeats; irep++ ){ FLA_Copy_external( A_save, A ); *dtime = FLA_Clock(); switch( variant ){ case 0: REF_Chol_u( A ); break; case 1:{ // Time variant 1 switch( type ){ case FLA_ALG_UNBLOCKED: FLA_Chol_u_unb_var1( A ); break; case FLA_ALG_UNB_OPT: FLA_Chol_u_opt_var1( A ); break; case FLA_ALG_BLOCKED: FLA_Chol_u_blk_var1( A, cntl_chol_var ); break; default: printf("trouble\n"); } break; } case 2:{ // Time variant 2 switch( type ){ case FLA_ALG_UNBLOCKED: FLA_Chol_u_unb_var2( A ); break; case FLA_ALG_UNB_OPT: FLA_Chol_u_opt_var2( A ); break; case FLA_ALG_BLOCKED: FLA_Chol_u_blk_var2( A, cntl_chol_var ); break; default: printf("trouble\n"); } break; } case 3:{ // Time variant 3 switch( type ){ case FLA_ALG_UNBLOCKED: FLA_Chol_u_unb_var3( A ); break; case FLA_ALG_UNB_OPT: FLA_Chol_u_opt_var3( A ); break; case FLA_ALG_BLOCKED: FLA_Chol_u_blk_var3( A, cntl_chol_var ); break; default: printf("trouble\n"); } break; } } *dtime = FLA_Clock() - *dtime; dtime_save = min( *dtime, dtime_save ); } FLA_Cntl_obj_free( cntl_chol_var ); FLA_Cntl_obj_free( cntl_chol_unb ); FLA_Cntl_obj_free( cntl_syrk_blas ); FLA_Cntl_obj_free( cntl_herk_blas ); FLA_Cntl_obj_free( cntl_trsm_blas ); FLA_Cntl_obj_free( cntl_gemm_blas ); FLA_Blocksize_free( bp ); if ( type == FLA_ALG_REFERENCE ) { FLA_Trsv_external( FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_UNIT_DIAG, A, b ); FLA_Trsv_external( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, A, b ); FLA_Hemv_external( FLA_UPPER_TRIANGULAR, FLA_ONE, A_save, b, FLA_MINUS_ONE, b_orig ); FLA_Nrm2_external( b_orig, norm ); FLA_Copy_object_to_buffer( FLA_NO_TRANSPOSE, 0, 0, norm, 1, 1, diff, 1, 1 ); } else { FLA_Trsv_external( FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_UNIT_DIAG, A, b ); FLA_Trsv_external( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, A, b ); FLA_Hemv_external( FLA_UPPER_TRIANGULAR, FLA_ONE, A_save, b, FLA_MINUS_ONE, b_orig ); FLA_Nrm2_external( b_orig, norm ); FLA_Copy_object_to_buffer( FLA_NO_TRANSPOSE, 0, 0, norm, 1, 1, diff, 1, 1 ); } *gflops = 1.0 / 3.0 * FLA_Obj_length( A ) * FLA_Obj_length( A ) * FLA_Obj_length( A ) / dtime_save / 1e9; if ( FLA_Obj_is_complex( A ) ) *gflops *= 4.0; *dtime = dtime_save; FLA_Copy_external( A_save, A ); FLA_Copy_external( b_save, b ); FLA_Copy_external( b_orig_save, b_orig ); FLA_Obj_free( &A_save ); FLA_Obj_free( &b_save ); FLA_Obj_free( &b_orig_save ); }
FLA_Error FLA_Hemv( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y ) { return FLA_Hemv_external( uplo, alpha, A, x, beta, y ); }