void time_Trmm( int param_combo, int type, int nrepeats, int m, int n, FLA_Obj A, FLA_Obj C, FLA_Obj C_ref, double *dtime, double *diff, double *gflops ) { int irep; double dtime_old = 1.0e9; FLA_Obj C_old, A_flat, C_flat; FLASH_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_old ); FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, A, &A_flat ); FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, C, &C_flat ); FLASH_Copy( C, C_old ); for ( irep = 0 ; irep < nrepeats; irep++ ) { FLASH_Copy( C_old, C ); FLASH_Obj_flatten( A, A_flat ); FLASH_Obj_flatten( C, C_flat ); *dtime = FLA_Clock(); switch( param_combo ){ // Time parameter combination 0 case 0:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Trmm( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A_flat, C_flat ); break; case FLA_ALG_FRONT: FLASH_Trmm( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A, C ); break; default: printf("trouble\n"); } break; } // Time parameter combination 1 case 1:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Trmm( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A_flat, C_flat ); break; case FLA_ALG_FRONT: FLASH_Trmm( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A, C ); break; default: printf("trouble\n"); } break; } // Time parameter combination 2 case 2:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Trmm( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A_flat, C_flat ); break; case FLA_ALG_FRONT: FLASH_Trmm( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A, C ); break; default: printf("trouble\n"); } break; } // Time parameter combination 3 case 3:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Trmm( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A_flat, C_flat ); break; case FLA_ALG_FRONT: FLASH_Trmm( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A, 
C ); break; default: printf("trouble\n"); } break; } // Time parameter combination 4 case 4:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Trmm( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A_flat, C_flat ); break; case FLA_ALG_FRONT: FLASH_Trmm( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A, C ); break; default: printf("trouble\n"); } break; } // Time parameter combination 5 case 5:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Trmm( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A_flat, C_flat ); break; case FLA_ALG_FRONT: FLASH_Trmm( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A, C ); break; default: printf("trouble\n"); } break; } // Time parameter combination 6 case 6:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A_flat, C_flat ); break; case FLA_ALG_FRONT: FLASH_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A, C ); break; default: printf("trouble\n"); } break; } // Time parameter combination 7 case 7:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A_flat, C_flat ); break; case FLA_ALG_FRONT: FLASH_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A, C ); break; default: printf("trouble\n"); } break; } // Time parameter combination 8 case 8:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR, FLA_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A_flat, C_flat ); break; case FLA_ALG_FRONT: FLASH_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR, FLA_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A, C ); break; default: printf("trouble\n"); } break; } // Time parameter combination 9 case 9:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Trmm( FLA_RIGHT, FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, 
FLA_NONUNIT_DIAG, FLA_TWO, A_flat, C_flat ); break; case FLA_ALG_FRONT: FLASH_Trmm( FLA_RIGHT, FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A, C ); break; default: printf("trouble\n"); } break; } // Time parameter combination 10 case 10:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Trmm( FLA_RIGHT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A_flat, C_flat ); break; case FLA_ALG_FRONT: FLASH_Trmm( FLA_RIGHT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A, C ); break; default: printf("trouble\n"); } break; } // Time parameter combination 11 case 11:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Trmm( FLA_RIGHT, FLA_UPPER_TRIANGULAR, FLA_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A_flat, C_flat ); break; case FLA_ALG_FRONT: FLASH_Trmm( FLA_RIGHT, FLA_UPPER_TRIANGULAR, FLA_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A, C ); break; default: printf("trouble\n"); } break; } } *dtime = FLA_Clock() - *dtime; dtime_old = min( *dtime, dtime_old ); } if ( type == FLA_ALG_REFERENCE ) { FLASH_Obj_hierarchify( C_flat, C_ref ); *diff = 0.0; } else { *diff = FLASH_Max_elemwise_diff( C, C_ref ); } *gflops = 1.0 * FLASH_Obj_scalar_length( C ) * FLASH_Obj_scalar_width( C ) * FLASH_Obj_scalar_width( A ) / dtime_old / 1.0e9; if ( param_combo == 0 || param_combo == 3 || param_combo == 6 || param_combo == 9 ) *gflops *= 4.0; *dtime = dtime_old; FLASH_Copy( C_old, C ); FLASH_Obj_free( &C_old ); FLASH_Obj_free( &A_flat ); FLASH_Obj_free( &C_flat ); }
// Run one eig_gest experiment for a single problem size and parameter
// combination, reporting the best performance and a residual.
//
//   params     supplies b_flash (hierarchical blocksize) and b_alg_flat
//              (blocksize passed to the control-tree setup)
//   var        variant number
//   sc_str     storage characters for A, Y and B, in that order
//   datatype   datatype of the test matrices
//   p_cur      current problem size
//   pci        index into pc_str, which is not a parameter and is expected
//              to be defined at file scope outside this block
//   n_repeats  number of timed repetitions; the minimum time is used
//   impl       which implementation to exercise (FLA_TEST_* constant)
//   perf       out: m^3 / best time / FLOPS_PER_UNIT_PERF, times 4 when A
//              is complex
//   residual   out: real scalar extracted from FLA_Norm1 of the recovered A
//              after A_save has been subtracted via FLA_Axpy_external
//
// When inv is FLA_NO_INVERSE and variant 3 of a flat unb/opt/blk
// implementation is requested, nothing is run and both outputs are set to 0.
void libfla_test_eig_gest_experiment( test_params_t params,
                                      unsigned int  var,
                                      char*         sc_str,
                                      FLA_Datatype  datatype,
                                      unsigned int  p_cur,
                                      unsigned int  pci,
                                      unsigned int  n_repeats,
                                      signed int    impl,
                                      double*       perf,
                                      double*       residual )
{
	// Implementation-kind predicates, evaluated once.
	const int    use_hier     = ( impl == FLA_TEST_HIER_FRONT_END );
	const int    use_flat_var = ( impl == FLA_TEST_FLAT_UNB_VAR ||
	                              impl == FLA_TEST_FLAT_OPT_VAR ||
	                              impl == FLA_TEST_FLAT_BLK_VAR );

	dim_t        blocksize_hier = params.b_flash;
	dim_t        blocksize_flat = params.b_alg_flat;
	double       best_time      = 1e9;
	double       elapsed;
	unsigned int rep;
	unsigned int m;
	signed int   m_input        = -1;
	int          conj_on_left;
	FLA_Uplo     inv;
	FLA_Uplo     uplo;
	FLA_Trans    trans_left, trans_right;
	FLA_Obj      A, B, Y, norm;
	FLA_Obj      A_save, B_save;
	FLA_Obj      A_test, B_test, Y_test;

	// Determine the dimension: a negative m_input acts as a divisor of p_cur.
	m = ( m_input < 0 ) ? p_cur / abs(m_input) : p_cur;

	// Translate parameter characters to libflame constants.
	FLA_Param_map_char_to_flame_inv( &pc_str[pci][0], &inv );
	FLA_Param_map_char_to_flame_uplo( &pc_str[pci][1], &uplo );

	// Variant 3 of the flat variants is skipped for the no-inverse case.
	if ( inv == FLA_NO_INVERSE && use_flat_var && var == 3 )
	{
		*perf     = 0.0;
		*residual = 0.0;
		return;
	}

	// Create the matrices for the current operation.
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, m, &A );
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, m, &Y );
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], m, m, &B );

	// Initialize A (random SPD, scaled, hermitianized).
	FLA_Random_spd_matrix( uplo, A );
	FLA_Scalr( uplo, FLA_TWO, A );
	FLA_Hermitianize( uplo, A );

	// Initialize B (random SPD, scaled, then Cholesky-factored in place).
	FLA_Random_spd_matrix( uplo, B );
	FLA_Scalr( uplo, FLA_TWO, B );
	FLA_Chol( uplo, B );

	// Keep pristine copies so each repetition can start from the same data.
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_save );
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, B, &B_save );

	// Real 1x1 object that will receive the norm.
	FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

	// The FLASH front-end works on hierarchical copies; everything else
	// operates directly on the flat objects.
	if ( use_hier )
	{
		FLASH_Obj_create_hier_copy_of_flat( A, 1, &blocksize_hier, &A_test );
		FLASH_Obj_create_hier_copy_of_flat( Y, 1, &blocksize_hier, &Y_test );
		FLASH_Obj_create_hier_copy_of_flat( B, 1, &blocksize_hier, &B_test );
	}
	else
	{
		A_test = A;
		Y_test = Y;
		B_test = B;
	}

	// The individual variants need a control tree.
	if ( use_flat_var )
		libfla_test_eig_gest_cntl_create( var, blocksize_flat );

	// Timed repetitions; keep the fastest one.
	for ( rep = 0; rep < n_repeats; ++rep )
	{
		// Restore the inputs before each run.
		if ( use_hier )
		{
			FLASH_Obj_hierarchify( A_save, A_test );
			FLASH_Obj_hierarchify( B_save, B_test );
		}
		else
		{
			FLA_Copy_external( A_save, A_test );
			FLA_Copy_external( B_save, B_test );
		}

		elapsed = FLA_Clock();

		libfla_test_eig_gest_impl( impl, inv, uplo, A_test, Y_test, B_test );

		elapsed = FLA_Clock() - elapsed;

		best_time = min( best_time, elapsed );
	}

	// Check our solution by applying the opposite operation with B on both
	// sides. Which side gets the conjugate transpose depends on inv/uplo.
	conj_on_left = ( inv == FLA_NO_INVERSE && uplo == FLA_LOWER_TRIANGULAR ) ||
	               ( inv == FLA_INVERSE    && uplo == FLA_UPPER_TRIANGULAR );

	trans_left  = conj_on_left ? FLA_CONJ_TRANSPOSE : FLA_NO_TRANSPOSE;
	trans_right = conj_on_left ? FLA_NO_TRANSPOSE   : FLA_CONJ_TRANSPOSE;

	if ( use_hier )
	{
		FLASH_Hermitianize( uplo, A_test );

		if ( inv == FLA_NO_INVERSE )
		{
			FLASH_Trsm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG,
			            FLA_ONE, B_test, A_test );
			FLASH_Trsm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG,
			            FLA_ONE, B_test, A_test );
		}
		else
		{
			FLASH_Trmm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG,
			            FLA_ONE, B_test, A_test );
			FLASH_Trmm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG,
			            FLA_ONE, B_test, A_test );
		}

		// Bring the hierarchical result back into the flat A.
		FLASH_Obj_flatten( A_test, A );
	}
	else
	{
		FLA_Hermitianize( uplo, A_test );

		if ( inv == FLA_NO_INVERSE )
		{
			FLA_Trsm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG,
			          FLA_ONE, B_test, A_test );
			FLA_Trsm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG,
			          FLA_ONE, B_test, A_test );
		}
		else
		{
			FLA_Trmm( FLA_LEFT, uplo, trans_left, FLA_NONUNIT_DIAG,
			          FLA_ONE, B_test, A_test );
			FLA_Trmm( FLA_RIGHT, uplo, trans_right, FLA_NONUNIT_DIAG,
			          FLA_ONE, B_test, A_test );
		}
	}

	// Release the hierarchical copies, if any were made.
	if ( use_hier )
	{
		FLASH_Obj_free( &A_test );
		FLASH_Obj_free( &Y_test );
		FLASH_Obj_free( &B_test );
	}

	// Release the control trees, if any were made.
	if ( use_flat_var )
		libfla_test_eig_gest_cntl_free();

	// Performance of the best repetition.
	*perf = 1.0 * m * m * m / best_time / FLOPS_PER_UNIT_PERF;
	if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0;

	// Residual: subtract the saved A from the recovered A and take the norm.
	FLA_Axpy_external( FLA_MINUS_ONE, A_save, A );
	FLA_Norm1( A, norm );
	FLA_Obj_extract_real_scalar( norm, residual );

	// Free the supporting flat objects.
	FLA_Obj_free( &norm );
	FLA_Obj_free( &A_save );
	FLA_Obj_free( &B_save );

	// Free the flat test matrices.
	FLA_Obj_free( &A );
	FLA_Obj_free( &Y );
	FLA_Obj_free( &B );
}