FLA_Error FLA_Check_submatrix_dims_and_offset( dim_t m, dim_t n, dim_t i, dim_t j, FLA_Obj A ) { FLA_Error e_val = FLA_SUCCESS; dim_t m_A, n_A; if ( FLA_Obj_elemtype( A ) == FLA_MATRIX ) { m_A = FLASH_Obj_scalar_length( A ); n_A = FLASH_Obj_scalar_width( A ); } else { m_A = FLA_Obj_length( A ); n_A = FLA_Obj_width( A ); } if ( i > m_A || j > n_A ) e_val = FLA_INVALID_SUBMATRIX_OFFSET; else if ( i + m > m_A || j + n > n_A ) e_val = FLA_INVALID_SUBMATRIX_DIMS; return e_val; }
void time_Apply_Q( int param_combo, int type, int nrepeats, int m, int n, FLA_Obj A, FLA_Obj B, FLA_Obj B_ref, FLA_Obj t, FLA_Obj T, FLA_Obj W, double *dtime, double *diff, double *gflops ) { int irep; double dtime_old = 1.0e9; FLA_Obj B_save, A_flat, B_flat; FLASH_Obj_create_conf_to( FLA_NO_TRANSPOSE, B, &B_save ); FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, A, &A_flat ); FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, B, &B_flat ); FLASH_Copy( B, B_save ); for ( irep = 0 ; irep < nrepeats; irep++ ) { FLASH_Copy( B_save, B ); FLASH_Obj_flatten( A, A_flat ); FLASH_Obj_flatten( B, B_flat ); *dtime = FLA_Clock(); switch( param_combo ){ // Time parameter combination 0 case 0:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Apply_Q( FLA_LEFT, FLA_TRANSPOSE, FLA_COLUMNWISE, A_flat, t, B_flat ); break; case FLA_ALG_FRONT: //printf("\n"); FLASH_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, A, T, W, B ); break; default: printf("trouble\n"); } break; } } *dtime = FLA_Clock() - *dtime; dtime_old = min( *dtime, dtime_old ); } if ( type == FLA_ALG_REFERENCE ) { FLA_Trsm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, A_flat, B_flat ); FLASH_Obj_hierarchify( B_flat, B_ref ); *diff = 0.0; } else { FLASH_Trsm( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, A, B ); *diff = FLASH_Max_elemwise_diff( B, B_ref ); } *gflops = 2.0 * FLASH_Obj_scalar_length( A ) * FLASH_Obj_scalar_width( A ) * FLASH_Obj_scalar_width( B ) / dtime_old / 1.0e9; if ( FLA_Obj_is_complex( A ) ) *gflops *= 4.0; *dtime = dtime_old; FLASH_Copy( B_save, B ); FLASH_Obj_free( &B_save ); FLASH_Obj_free( &A_flat ); FLASH_Obj_free( &B_flat ); }
void time_Trmm( int param_combo, int type, int nrepeats, int m, int n, FLA_Obj A, FLA_Obj C, FLA_Obj C_ref, double *dtime, double *diff, double *gflops ) { int irep; double dtime_old = 1.0e9; FLA_Obj C_old, A_flat, C_flat; FLASH_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_old ); FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, A, &A_flat ); FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, C, &C_flat ); FLASH_Copy( C, C_old ); for ( irep = 0 ; irep < nrepeats; irep++ ) { FLASH_Copy( C_old, C ); FLASH_Obj_flatten( A, A_flat ); FLASH_Obj_flatten( C, C_flat ); *dtime = FLA_Clock(); switch( param_combo ){ // Time parameter combination 0 case 0:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Trmm( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A_flat, C_flat ); break; case FLA_ALG_FRONT: FLASH_Trmm( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A, C ); break; default: printf("trouble\n"); } break; } // Time parameter combination 1 case 1:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Trmm( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A_flat, C_flat ); break; case FLA_ALG_FRONT: FLASH_Trmm( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A, C ); break; default: printf("trouble\n"); } break; } // Time parameter combination 2 case 2:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Trmm( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A_flat, C_flat ); break; case FLA_ALG_FRONT: FLASH_Trmm( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A, C ); break; default: printf("trouble\n"); } break; } // Time parameter combination 3 case 3:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Trmm( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A_flat, C_flat ); break; case FLA_ALG_FRONT: FLASH_Trmm( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A, C ); break; default: printf("trouble\n"); } break; } // Time parameter combination 4 case 4:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Trmm( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A_flat, C_flat ); break; case FLA_ALG_FRONT: FLASH_Trmm( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A, C ); break; default: printf("trouble\n"); } break; } // Time parameter combination 5 case 5:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Trmm( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A_flat, C_flat ); break; case FLA_ALG_FRONT: FLASH_Trmm( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A, C ); break; default: printf("trouble\n"); } break; } // Time parameter combination 6 case 6:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A_flat, C_flat ); break; case FLA_ALG_FRONT: FLASH_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A, C ); break; default: printf("trouble\n"); } break; } // Time parameter combination 7 case 7:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A_flat, C_flat ); break; case FLA_ALG_FRONT: FLASH_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A, C ); break; default: printf("trouble\n"); } break; } // Time parameter combination 8 case 8:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR, FLA_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A_flat, C_flat ); break; case FLA_ALG_FRONT: FLASH_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR, FLA_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A, C ); break; default: printf("trouble\n"); } break; } // Time parameter combination 9 case 9:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Trmm( FLA_RIGHT, FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A_flat, C_flat ); break; case FLA_ALG_FRONT: FLASH_Trmm( FLA_RIGHT, FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A, C ); break; default: printf("trouble\n"); } break; } // Time parameter combination 10 case 10:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Trmm( FLA_RIGHT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A_flat, C_flat ); break; case FLA_ALG_FRONT: FLASH_Trmm( FLA_RIGHT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A, C ); break; default: printf("trouble\n"); } break; } // Time parameter combination 11 case 11:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Trmm( FLA_RIGHT, FLA_UPPER_TRIANGULAR, FLA_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A_flat, C_flat ); break; case FLA_ALG_FRONT: FLASH_Trmm( FLA_RIGHT, FLA_UPPER_TRIANGULAR, FLA_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_TWO, A, C ); break; default: printf("trouble\n"); } break; } } *dtime = FLA_Clock() - *dtime; dtime_old = min( *dtime, dtime_old ); } if ( type == FLA_ALG_REFERENCE ) { FLASH_Obj_hierarchify( C_flat, C_ref ); *diff = 0.0; } else { *diff = FLASH_Max_elemwise_diff( C, C_ref ); } *gflops = 1.0 * FLASH_Obj_scalar_length( C ) * FLASH_Obj_scalar_width( C ) * FLASH_Obj_scalar_width( A ) / dtime_old / 1.0e9; if ( param_combo == 0 || param_combo == 3 || param_combo == 6 || param_combo == 9 ) *gflops *= 4.0; *dtime = dtime_old; FLASH_Copy( C_old, C ); FLASH_Obj_free( &C_old ); FLASH_Obj_free( &A_flat ); FLASH_Obj_free( &C_flat ); }
void time_SPDinv( int param_combo, int type, int nrepeats, int m, FLA_Obj C, FLA_Obj C_ref, double *dtime, double *diff, double *gflops ) { int irep; double dtime_old = 1.0e9; FLA_Obj C_old, C_flat; FLASH_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_old ); FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, C, &C_flat ); FLASH_Copy( C, C_old ); for ( irep = 0 ; irep < nrepeats; irep++ ) { FLASH_Copy( C_old, C ); FLASH_Obj_flatten( C, C_flat ); *dtime = FLA_Clock(); switch( param_combo ){ case 0:{ switch( type ){ case FLA_ALG_REFERENCE: REF_SPDinv( FLA_LOWER_TRIANGULAR, C_flat ); break; case FLA_ALG_FRONT: FLASH_SPDinv( FLA_LOWER_TRIANGULAR, C ); break; default: printf("trouble\n"); } break; } case 1:{ switch( type ){ case FLA_ALG_REFERENCE: REF_SPDinv( FLA_UPPER_TRIANGULAR, C_flat ); break; case FLA_ALG_FRONT: FLASH_SPDinv( FLA_UPPER_TRIANGULAR, C ); break; default: printf("trouble\n"); } break; } } *dtime = FLA_Clock() - *dtime; dtime_old = min( *dtime, dtime_old ); } if ( type == FLA_ALG_REFERENCE ){ FLASH_Obj_hierarchify( C_flat, C_ref ); *diff = 0.0; } else{ *diff = FLASH_Max_elemwise_diff( C, C_ref ); } *gflops = 1.0 * FLASH_Obj_scalar_length( C ) * FLASH_Obj_scalar_length( C ) * FLASH_Obj_scalar_length( C ) / dtime_old / 1e9; *dtime = dtime_old; FLASH_Copy( C_old, C ); FLASH_Obj_free( &C_old ); FLASH_Obj_free( &C_flat ); }
FLA_Error FLASH_Axpy_hierarchy( int direction, FLA_Obj alpha, FLA_Obj F, FLA_Obj* H ) { // Once we get down to a submatrix whose elements are scalars, we are down // to our base case. if ( FLA_Obj_elemtype( *H ) == FLA_SCALAR ) { // Depending on which top-level function invoked us, we either axpy // the source data in the flat matrix to the leaf-level submatrix of // the hierarchical matrix, or axpy the data in the hierarchical // submatrix to the flat matrix. if ( direction == FLA_FLAT_TO_HIER ) { #ifdef FLA_ENABLE_SCC if ( FLA_is_owner() ) #endif FLA_Axpy_external( alpha, F, *H ); } else if ( direction == FLA_HIER_TO_FLAT ) { #ifdef FLA_ENABLE_SCC if ( FLA_is_owner() ) #endif FLA_Axpy_external( alpha, *H, F ); } } else { FLA_Obj HL, HR, H0, H1, H2; FLA_Obj FL, FR, F0, F1, F2; FLA_Obj H1T, H01, H1B, H11, H21; FLA_Obj F1T, F01, F1B, F11, F21; dim_t b_m; dim_t b_n; FLA_Part_1x2( *H, &HL, &HR, 0, FLA_LEFT ); FLA_Part_1x2( F, &FL, &FR, 0, FLA_LEFT ); while ( FLA_Obj_width( HL ) < FLA_Obj_width( *H ) ) { FLA_Repart_1x2_to_1x3( HL, /**/ HR, &H0, /**/ &H1, &H2, 1, FLA_RIGHT ); // Get the scalar width of H1 and use that to determine the // width of F1. b_n = FLASH_Obj_scalar_width( H1 ); FLA_Repart_1x2_to_1x3( FL, /**/ FR, &F0, /**/ &F1, &F2, b_n, FLA_RIGHT ); // ------------------------------------------------------------- FLA_Part_2x1( H1, &H1T, &H1B, 0, FLA_TOP ); FLA_Part_2x1( F1, &F1T, &F1B, 0, FLA_TOP ); while ( FLA_Obj_length( H1T ) < FLA_Obj_length( H1 ) ) { FLA_Repart_2x1_to_3x1( H1T, &H01, /* ** */ /* *** */ &H11, H1B, &H21, 1, FLA_BOTTOM ); // Get the scalar length of H11 and use that to determine the // length of F11. b_m = FLASH_Obj_scalar_length( H11 ); FLA_Repart_2x1_to_3x1( F1T, &F01, /* ** */ /* *** */ &F11, F1B, &F21, b_m, FLA_BOTTOM ); // ------------------------------------------------------------- // Recursively axpy between F11 and H11. FLASH_Axpy_hierarchy( direction, alpha, F11, FLASH_OBJ_PTR_AT( H11 ) ); // ------------------------------------------------------------- FLA_Cont_with_3x1_to_2x1( &H1T, H01, H11, /* ** */ /* *** */ &H1B, H21, FLA_TOP ); FLA_Cont_with_3x1_to_2x1( &F1T, F01, F11, /* ** */ /* *** */ &F1B, F21, FLA_TOP ); } // ------------------------------------------------------------- FLA_Cont_with_1x3_to_1x2( &HL, /**/ &HR, H0, H1, /**/ H2, FLA_LEFT ); FLA_Cont_with_1x3_to_1x2( &FL, /**/ &FR, F0, F1, /**/ F2, FLA_LEFT ); } } return FLA_SUCCESS; }
void time_Symm( int param_combo, int type, int nrepeats, int m, int n, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref, double *dtime, double *diff, double *gflops ) { int irep; double dtime_old = 1.0e9; FLA_Obj C_old, A_flat, B_flat, C_flat; FLASH_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_old ); FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, A, &A_flat ); FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, B, &B_flat ); FLASH_Obj_create_flat_conf_to_hier( FLA_NO_TRANSPOSE, C, &C_flat ); FLASH_Copy( C, C_old ); for ( irep = 0 ; irep < nrepeats; irep++ ) { FLASH_Copy( C_old, C ); FLASH_Obj_flatten( A, A_flat ); FLASH_Obj_flatten( B, B_flat ); FLASH_Obj_flatten( C, C_flat ); *dtime = FLA_Clock(); switch( param_combo ){ // Time parameter combination 0 case 0:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Symm( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_ONE, A_flat, B_flat, FLA_ZERO, C_flat ); break; case FLA_ALG_FRONT: FLASH_Symm( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_ONE, A, B, FLA_ZERO, C ); break; default: printf("trouble\n"); } break; } // Time parameter combination 1 case 1:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Symm( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_ONE, A_flat, B_flat, FLA_ZERO, C_flat ); break; case FLA_ALG_FRONT: FLASH_Symm( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_ONE, A, B, FLA_ZERO, C ); break; default: printf("trouble\n"); } break; } // Time parameter combination 2 case 2:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Symm( FLA_RIGHT, FLA_LOWER_TRIANGULAR, FLA_ONE, A_flat, B_flat, FLA_ZERO, C_flat ); break; case FLA_ALG_FRONT: FLASH_Symm( FLA_RIGHT, FLA_LOWER_TRIANGULAR, FLA_ONE, A, B, FLA_ZERO, C ); break; default: printf("trouble\n"); } break; } // Time parameter combination 3 case 3:{ switch( type ){ case FLA_ALG_REFERENCE: REF_Symm( FLA_RIGHT, FLA_UPPER_TRIANGULAR, FLA_ONE, A_flat, B_flat, FLA_ZERO, C_flat ); break; case FLA_ALG_FRONT: FLASH_Symm( FLA_RIGHT, FLA_UPPER_TRIANGULAR, FLA_ONE, A, B, FLA_ZERO, C ); break; default: printf("trouble\n"); } break; } } *dtime = FLA_Clock() - *dtime; dtime_old = min( *dtime, dtime_old ); } if ( type == FLA_ALG_REFERENCE ) { FLASH_Obj_hierarchify( C_flat, C_ref ); *diff = 0.0; } else { *diff = FLASH_Max_elemwise_diff( C, C_ref ); } *gflops = 2.0 * FLASH_Obj_scalar_length( C ) * FLASH_Obj_scalar_width( C ) * FLASH_Obj_scalar_width( A ) / dtime_old / 1.0e9; if ( FLA_Obj_is_complex( C ) ) *gflops *= 4.0; *dtime = dtime_old; FLASH_Copy( C_old, C ); FLASH_Obj_free( &C_old ); FLASH_Obj_free( &A_flat ); FLASH_Obj_free( &B_flat ); FLASH_Obj_free( &C_flat ); }