/*
 * Unblocked variant 10 of the symmetric matrix-matrix multiply for the
 * "left side, upper triangle" case.
 *
 * B and C are traversed in lock step, one column per iteration.  The initial
 * partition puts a zero-width part on the right, and every iteration peels
 * one column off the left part and hands it to the right part, so the sweep
 * runs from the last column towards the first.
 *
 * For each exposed column pair the update
 *     c_col := beta * c_col + alpha * A * b_col
 * is delegated to FLA_Symv_external with FLA_UPPER_TRIANGULAR; alpha, A and
 * beta are forwarded unchanged.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Symm_lu_unb_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C )
{
  /* "todo" is the part still to be visited, "done" the part already updated. */
  FLA_Obj B_todo, B_done, B_left, b_col, B_right;
  FLA_Obj C_todo, C_done, C_left, c_col, C_right;

  FLA_Part_1x2( B, &B_todo, &B_done, 0, FLA_RIGHT );
  FLA_Part_1x2( C, &C_todo, &C_done, 0, FLA_RIGHT );

  /* Stop once the processed part spans every column of B. */
  while ( FLA_Obj_width( B_done ) < FLA_Obj_width( B ) )
  {
    /* Expose the next single column of B and of C. */
    FLA_Repart_1x2_to_1x3( B_todo, B_done,
                           &B_left, &b_col, &B_right,
                           1, FLA_LEFT );
    FLA_Repart_1x2_to_1x3( C_todo, C_done,
                           &C_left, &c_col, &C_right,
                           1, FLA_LEFT );

    /* Symmetric matrix-vector update of the exposed column of C. */
    FLA_Symv_external( FLA_UPPER_TRIANGULAR, alpha, A, b_col, beta, c_col );

    /* Fold the column just handled into the processed (right) part. */
    FLA_Cont_with_1x3_to_1x2( &B_todo, &B_done,
                              B_left, b_col, B_right,
                              FLA_RIGHT );
    FLA_Cont_with_1x3_to_1x2( &C_todo, &C_done,
                              C_left, c_col, C_right,
                              FLA_RIGHT );
  }

  return FLA_SUCCESS;
}
/*
 * Unblocked variant 10 of the symmetric matrix-matrix multiply for the
 * "right side, upper triangle" case.
 *
 * B and C are traversed in lock step, one row per iteration.  The initial
 * partition puts a zero-length part at the bottom, and every iteration peels
 * one row off the top part and hands it to the bottom part, so the sweep
 * runs from the last row towards the first.
 *
 * For each exposed row pair the update
 *     c_row := beta * c_row + alpha * b_row * A
 * is carried out in its transposed form (A is symmetric),
 *     c_row' := beta * c_row' + alpha * A * b_row',
 * by a single FLA_Symv_external call with FLA_UPPER_TRIANGULAR; alpha, A and
 * beta are forwarded unchanged.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Symm_ru_unb_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C )
{
  /* "todo" is the part still to be visited, "done" the part already updated. */
  FLA_Obj B_todo, B_above, B_done, b_row, B_below;
  FLA_Obj C_todo, C_above, C_done, c_row, C_below;

  FLA_Part_2x1( B, &B_todo, &B_done, 0, FLA_BOTTOM );
  FLA_Part_2x1( C, &C_todo, &C_done, 0, FLA_BOTTOM );

  /* Stop once the processed part spans every row of B. */
  while ( FLA_Obj_length( B_done ) < FLA_Obj_length( B ) )
  {
    /* Expose the next single row of B and of C. */
    FLA_Repart_2x1_to_3x1( B_todo, &B_above,
                                   &b_row,
                           B_done, &B_below,
                           1, FLA_TOP );
    FLA_Repart_2x1_to_3x1( C_todo, &C_above,
                                   &c_row,
                           C_done, &C_below,
                           1, FLA_TOP );

    /* Symmetric matrix-vector update of the exposed row of C. */
    FLA_Symv_external( FLA_UPPER_TRIANGULAR, alpha, A, b_row, beta, c_row );

    /* Fold the row just handled into the processed (bottom) part. */
    FLA_Cont_with_3x1_to_2x1( &B_todo, B_above,
                                       b_row,
                              &B_done, B_below,
                              FLA_BOTTOM );
    FLA_Cont_with_3x1_to_2x1( &C_todo, C_above,
                                       c_row,
                              &C_done, C_below,
                              FLA_BOTTOM );
  }

  return FLA_SUCCESS;
}
/*
 * Run one symm timing/accuracy experiment.
 *
 * params     supplies b_flash and b_alg_flat (block sizes used below).
 * var        variant number forwarded to libfla_test_symm_cntl_create().
 * sc_str     three storage characters, used for A, B and C respectively.
 * datatype   datatype of all matrices and vectors created here.
 * p_cur      current problem size; both m and n are derived from it.
 * pci        index into pc_str selecting the side/uplo character pair.
 * n_repeats  number of timed repetitions; the minimum time is kept.
 * impl       implementation selector (FLA_TEST_* constant).
 * perf       out: performance figure of the fastest repetition.
 * residual   out: FLA_Max_elemwise_diff between C*x computed from the
 *            result and the same product computed from the operands.
 */
void libfla_test_symm_experiment( test_params_t params, unsigned int var, char* sc_str, FLA_Datatype datatype, unsigned int p_cur, unsigned int pci, unsigned int n_repeats, signed int impl, double* perf, double* residual )
{
  dim_t        b_flash    = params.b_flash;
  dim_t        b_alg_flat = params.b_alg_flat;
  double       time_min   = 1e9;
  double       time;
  unsigned int i;
  unsigned int m;
  signed int   m_input    = -1;
  unsigned int n;
  signed int   n_input    = -1;
  unsigned int a_dim;
  int          uses_cntl;
  FLA_Side     side;
  FLA_Uplo     uplo;
  FLA_Obj      A, B, C, x, y, z, w, norm;
  FLA_Obj      alpha, beta;
  FLA_Obj      C_save;
  FLA_Obj      A_test, B_test, C_test;

  // Determine the dimensions. A negative *_input means "p_cur divided by
  // its magnitude"; a non-negative one means "p_cur itself".
  if ( m_input < 0 ) m = p_cur / abs( m_input );
  else               m = p_cur;
  if ( n_input < 0 ) n = p_cur / abs( n_input );
  else               n = p_cur;

  // Translate parameter characters to libflame constants.
  FLA_Param_map_char_to_flame_side( &pc_str[pci][0], &side );
  FLA_Param_map_char_to_flame_uplo( &pc_str[pci][1], &uplo );

  // A is square: m x m when applied from the left, n x n from the right.
  // The scratch vector w has the same length as A's dimension.
  a_dim = ( side == FLA_LEFT ) ? m : n;

  // Create the matrices for the current operation.
  libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], a_dim, a_dim, &A );

  // Create vectors for use in test.
  FLA_Obj_create( datatype, n,     1, 0, 0, &x );
  FLA_Obj_create( datatype, m,     1, 0, 0, &y );
  FLA_Obj_create( datatype, m,     1, 0, 0, &z );
  FLA_Obj_create( datatype, a_dim, 1, 0, 0, &w );

  libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, n, &B );
  libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], m, n, &C );

  // Create a norm scalar. It is only needed by the 2-norm residual
  // alternative mentioned below, but is still created and freed here.
  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

  // Initialize the test matrices.
  FLA_Random_symm_matrix( uplo, A );
  FLA_Random_matrix( B );
  FLA_Random_matrix( C );

  // Initialize the test vectors.
  FLA_Random_matrix( x );
  FLA_Set( FLA_ZERO, y );
  FLA_Set( FLA_ZERO, z );
  FLA_Set( FLA_ZERO, w );

  // Set constants.
  alpha = FLA_TWO;
  beta  = FLA_MINUS_ONE;

  // Save the original object contents in a temporary object.
  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, C, &C_save );

  // Use hierarchical matrices if we're testing the FLASH front-end.
  if ( impl == FLA_TEST_HIER_FRONT_END )
  {
    FLASH_Obj_create_hier_copy_of_flat( A, 1, &b_flash, &A_test );
    FLASH_Obj_create_hier_copy_of_flat( B, 1, &b_flash, &B_test );
    FLASH_Obj_create_hier_copy_of_flat( C, 1, &b_flash, &C_test );
  }
  else
  {
    A_test = A;
    B_test = B;
    C_test = C;
  }

  // The individual variants need a control tree; remember the decision so
  // creation and release below are guaranteed to agree.
  uses_cntl = ( impl == FLA_TEST_FLAT_UNB_VAR ||
                impl == FLA_TEST_FLAT_OPT_VAR ||
                impl == FLA_TEST_FLAT_BLK_VAR ||
                impl == FLA_TEST_FLAT_UNB_EXT ||
                impl == FLA_TEST_FLAT_BLK_EXT );

  // Create a control tree for the individual variants.
  if ( uses_cntl )
    libfla_test_symm_cntl_create( var, b_alg_flat );

  // Repeat the experiment n_repeats times and record results.
  for ( i = 0; i < n_repeats; ++i )
  {
    // Restore C to its original contents before every timed run.
    if ( impl == FLA_TEST_HIER_FRONT_END )
      FLASH_Obj_hierarchify( C_save, C_test );
    else
      FLA_Copy_external( C_save, C_test );

    time = FLA_Clock();

    libfla_test_symm_impl( impl, side, uplo, alpha, A_test, B_test, beta, C_test );

    time = FLA_Clock() - time;
    time_min = min( time_min, time );
  }

  if ( impl == FLA_TEST_HIER_FRONT_END )
  {
    // Copy the solution to flat matrix C.
    FLASH_Obj_flatten( C_test, C );

    // Free the hierarchical matrices.
    FLASH_Obj_free( &A_test );
    FLASH_Obj_free( &B_test );
    FLASH_Obj_free( &C_test );
  }
  // Otherwise no copy is needed: C_test and C refer to the same object.

  // Free the control trees if we're testing the variants.
  if ( uses_cntl )
    libfla_test_symm_cntl_free();

  // Compute the performance of the best experiment repeat.
  // The product is formed in double precision: in unsigned int arithmetic
  // m*m*n wraps around once it exceeds UINT_MAX, which silently corrupted
  // the reported figure for larger problem sizes.
  // NOTE(review): the leading factor is 1 as in the original; confirm this
  // is the intended flop-count convention for symm.
  if ( side == FLA_LEFT )
    *perf = ( 1.0 * m * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
  else
    *perf = ( 1.0 * m * n * n ) / time_min / FLOPS_PER_UNIT_PERF;
  if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0;

  // Compute:
  //   y = C * x
  // and compare to
  //   z = ( beta * C_orig + alpha * A * B ) x    (side = left)
  //   z = ( beta * C_orig + alpha * B * A ) x    (side = right)
  FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, C, x, FLA_ZERO, y );

  if ( side == FLA_LEFT )
  {
    // w = B * x, then z = alpha * A * w.
    FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, B, x, FLA_ZERO, w );
    FLA_Symv_external( uplo, alpha, A, w, FLA_ZERO, z );
  }
  else
  {
    // w = A * x, then z = alpha * B * w.
    FLA_Symv_external( uplo, FLA_ONE, A, x, FLA_ZERO, w );
    FLA_Gemv_external( FLA_NO_TRANSPOSE, alpha, B, w, FLA_ZERO, z );
  }
  // z = z + beta * C_orig * x.
  FLA_Gemv_external( FLA_NO_TRANSPOSE, beta, C_save, x, FLA_ONE, z );

  // Residual is the largest element-wise difference between y and z.
  // (An alternative is || y - z || via FLA_Axpy_external / FLA_Nrm2_external
  // into the norm scalar.)
  *residual = FLA_Max_elemwise_diff( y, z );

  // Free the supporting flat objects.
  FLA_Obj_free( &C_save );

  // Free the flat test matrices.
  FLA_Obj_free( &A );
  FLA_Obj_free( &B );
  FLA_Obj_free( &C );
  FLA_Obj_free( &x );
  FLA_Obj_free( &y );
  FLA_Obj_free( &z );
  FLA_Obj_free( &w );
  FLA_Obj_free( &norm );
}
int main( int argc, char *argv[] ) { int i, j, n_threads, n_repeats, n_trials, increment, begin, sorting, caching, work_stealing, data_affinity; dim_t size, nb_alg; FLA_Datatype datatype = FLA_DOUBLE; FLA_Inv inv = FLA_NO_INVERSE; FLA_Uplo uplo = FLA_LOWER_TRIANGULAR; FLA_Obj A, B, x, b, b_norm, AH, BH; double length, b_norm_value = 0.0, dtime, *dtimes, *flops; #ifndef FLA_ENABLE_WINDOWS_BUILD char output_file_m[100]; FILE *fpp; #endif fprintf( stdout, "%c Enter number of repeats: ", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c Enter blocksize: ", '%' ); scanf( "%u", &nb_alg ); fprintf( stdout, "%c %u\n", '%', nb_alg ); fprintf( stdout, "%c Enter problem size parameters: first, inc, num: ", '%' ); scanf( "%d%d%d", &begin, &increment, &n_trials ); fprintf( stdout, "%c %d %d %d\n", '%', begin, increment, n_trials ); fprintf( stdout, "%c Enter number of threads: ", '%' ); scanf( "%d", &n_threads ); fprintf( stdout, "%c %d\n", '%', n_threads ); fprintf( stdout, "%c Enter SuperMatrix parameters: sorting, caching, work stealing, data affinity: ", '%' ); scanf( "%d%d%d%d", &sorting, &caching, &work_stealing, &data_affinity ); fprintf( stdout, "%c %s %s %s %s\n\n", '%', ( sorting ? "TRUE" : "FALSE" ), ( caching ? "TRUE" : "FALSE" ), ( work_stealing ? "TRUE" : "FALSE" ), ( data_affinity ? ( data_affinity == 1 ? 
"FLASH_QUEUE_AFFINITY_2D_BLOCK_CYCLIC" : "FLASH_QUEUE_AFFINITY_OTHER" ) : "FLASH_QUEUE_AFFINITY_NONE" ) ); #ifdef FLA_ENABLE_WINDOWS_BUILD fprintf( stdout, "%s_%u = [\n", OUTPUT_FILE, nb_alg ); #else sprintf( output_file_m, "%s/%s_output.m", OUTPUT_PATH, OUTPUT_FILE ); fpp = fopen( output_file_m, "a" ); fprintf( fpp, "%%\n" ); fprintf( fpp, "%% | Matrix Size | FLASH |\n" ); fprintf( fpp, "%% | n x n | GFlops |\n" ); fprintf( fpp, "%% -----------------------------\n" ); fprintf( fpp, "%s_%u = [\n", OUTPUT_FILE, nb_alg ); #endif FLA_Init(); dtimes = ( double * ) FLA_malloc( n_repeats * sizeof( double ) ); flops = ( double * ) FLA_malloc( n_trials * sizeof( double ) ); FLASH_Queue_set_num_threads( n_threads ); FLASH_Queue_set_sorting( sorting ); FLASH_Queue_set_caching( caching ); FLASH_Queue_set_work_stealing( work_stealing ); FLASH_Queue_set_data_affinity( data_affinity ); for ( i = 0; i < n_trials; i++ ) { size = begin + i * increment; FLA_Obj_create( datatype, size, size, 0, 0, &A ); FLA_Obj_create( datatype, size, size, 0, 0, &B ); FLA_Obj_create( datatype, size, 1, 0, 0, &x ); FLA_Obj_create( datatype, size, 1, 0, 0, &b ); FLA_Obj_create( datatype, 1, 1, 0, 0, &b_norm ); for ( j = 0; j < n_repeats; j++ ) { FLA_Random_matrix( A ); FLA_Random_matrix( B ); FLA_Random_matrix( x ); FLA_Random_matrix( b ); FLA_Symmetrize( uplo, A ); FLA_Symmetrize( uplo, B ); length = ( double ) FLA_Obj_length( B ); FLA_Add_to_diag( &length, B ); FLA_Symv_external( uplo, FLA_ONE, B, x, FLA_ZERO, b ); FLASH_Obj_create_hier_copy_of_flat( A, 1, &nb_alg, &AH ); FLASH_Obj_create_hier_copy_of_flat( B, 1, &nb_alg, &BH ); FLASH_Chol( uplo, BH ); dtime = FLA_Clock(); FLASH_Eig_gest( inv, uplo, AH, BH ); dtime = FLA_Clock() - dtime; dtimes[j] = dtime; FLASH_Obj_free( &AH ); FLASH_Obj_free( &BH ); } dtime = dtimes[0]; for ( j = 1; j < n_repeats; j++ ) dtime = min( dtime, dtimes[j] ); flops[i] = 1.0 * size * size * size / dtime / 1e9; #ifdef FLA_ENABLE_WINDOWS_BUILD fprintf( stdout, " %d %6.3f 
%le\n", size, flops[i], b_norm_value ); #else fprintf( fpp, " %d %6.3f\n", size, flops[i] ); fprintf( stdout, "Time: %e | GFlops: %6.3f\n", dtime, flops[i] ); fprintf( stdout, "Matrix size: %u x %u | nb_alg: %u\n", size, size, nb_alg ); fprintf( stdout, "Norm of difference: %le\n\n", b_norm_value ); #endif FLA_Obj_free( &A ); FLA_Obj_free( &B ); FLA_Obj_free( &x ); FLA_Obj_free( &b ); FLA_Obj_free( &b_norm ); } #ifdef FLA_ENABLE_WINDOWS_BUILD fprintf( stdout, "];\n\n" ); #else fprintf( fpp, "];\n" ); fflush( fpp ); fclose( fpp ); #endif FLA_free( dtimes ); FLA_free( flops ); FLA_Finalize(); return 0; }
/*
 * Symmetric matrix-vector multiply front-end.
 *
 * Forwards every argument, unchanged and in the same order, to
 * FLA_Symv_external and returns that routine's status code.
 */
FLA_Error FLA_Symv( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y )
{
  FLA_Error status;

  status = FLA_Symv_external( uplo, alpha, A, x, beta, y );

  return status;
}