void FLA_Trsm_cntl_init() { // Set blocksizes with default values for conventional storage. fla_trsm_var2_bsize = FLA_Query_blocksizes( FLA_DIMENSION_MIN ); fla_trsm_var3_bsize = FLA_Query_blocksizes( FLA_DIMENSION_MIN ); // Create a control tree that assumes A and B are b x b blocks. fla_trsm_cntl_blas = FLA_Cntl_trsm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL, NULL ); // Create a control tree that assumes A is a block and B is a panel. fla_trsm_cntl_bp = FLA_Cntl_trsm_obj_create( FLA_FLAT, FLA_BLOCKED_VARIANT3, fla_trsm_var3_bsize, fla_scal_cntl_blas, fla_trsm_cntl_blas, NULL ); // Create a control tree that assumes A is large and B is a panel. fla_trsm_cntl_mp = FLA_Cntl_trsm_obj_create( FLA_FLAT, FLA_BLOCKED_VARIANT2, fla_trsm_var2_bsize, fla_scal_cntl_blas, fla_trsm_cntl_blas, fla_gemm_cntl_blas ); // Create a control tree that assumes A and B are both large. fla_trsm_cntl_mm = FLA_Cntl_trsm_obj_create( FLA_FLAT, FLA_BLOCKED_VARIANT3, fla_trsm_var3_bsize, fla_scal_cntl_blas, fla_trsm_cntl_mp, NULL ); }
void FLASH_Trsm_cntl_init() { // Set trsm blocksize for hierarchical storage. flash_trsm_bsize = FLA_Blocksize_create( 1, 1, 1, 1 ); // Create a control tree that assumes A and B are b x b blocks. flash_trsm_cntl_blas = FLA_Cntl_trsm_obj_create( FLA_HIER, FLA_SUBPROBLEM, NULL, NULL, NULL, NULL ); // Create a control tree that assumes A is a block and B is a panel. flash_trsm_cntl_bp = FLA_Cntl_trsm_obj_create( FLA_HIER, FLA_BLOCKED_VARIANT3, flash_trsm_bsize, flash_scal_cntl, flash_trsm_cntl_blas, NULL ); // Create a control tree that assumes A is large and B is a panel. flash_trsm_cntl_mp = FLA_Cntl_trsm_obj_create( FLA_HIER, FLA_BLOCKED_VARIANT2, flash_trsm_bsize, flash_scal_cntl, flash_trsm_cntl_blas, flash_gemm_cntl_op_bp ); // Create a control tree that assumes A and B are both large. flash_trsm_cntl_mm = FLA_Cntl_trsm_obj_create( FLA_HIER, FLA_BLOCKED_VARIANT3, flash_trsm_bsize, flash_scal_cntl, flash_trsm_cntl_mp, NULL ); }
/*
 * time_Trsm_lun
 *
 * Times one implementation of the left / upper-triangular / no-transpose
 * triangular solve applied to C (in place), and reports the best time over
 * nrepeats runs.
 *
 *   variant   0 runs REF_Trsm; 1..4 run FLA_Trsm_lun_{unb,blk}_var<variant>.
 *             Any other value runs nothing inside the timed region.
 *   type      FLA_ALG_UNBLOCKED or FLA_ALG_BLOCKED (ignored for variant 0);
 *             any other value prints "trouble" for variants 1..4.
 *   nrepeats  number of timed repetitions.
 *   n         not used by this routine.
 *   nb_alg    value used for all four entries of the algorithmic blocksize.
 *   A         triangular operand handed to the solve.
 *   B         not used by this routine.
 *   C         right-hand side; overwritten by each run, restored on exit.
 *   C_ref     for variant 0 it receives the computed result; for other
 *             variants the result is compared against it.
 *   dtime     out: smallest elapsed time observed (stays 1.0e9 if no
 *             repetition ran).
 *   diff      out: 0.0 for variant 0, otherwise FLA_Max_elemwise_diff(C, C_ref).
 *   gflops    out: length(C) * width(C) * width(A) / best time / 1e9.
 *
 * NOTE(review): the control-tree constructors are called here with fewer
 * arguments than the six-argument FLA_Cntl_trsm_obj_create calls seen in the
 * library-side init routines - confirm against the declaration in use.
 */
void time_Trsm_lun( int variant, int type, int nrepeats, int n, int nb_alg,
                    FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref,
                    double *dtime, double *diff, double *gflops )
{
  double           best_time = 1.0e9;
  double           t_start;
  double           rows_c, cols_c, cols_a;
  int              rep;
  FLA_Obj          C_backup;
  fla_blocksize_t* blocksize;
  fla_gemm_t*      gemm_leaf;
  fla_trsm_t*      trsm_leaf;
  fla_trsm_t*      trsm_tree;

  /* Control tree for the blocked variants: blocked node over BLAS leaves. */
  blocksize = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
  gemm_leaf = FLA_Cntl_gemm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL );
  trsm_leaf = FLA_Cntl_trsm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL );
  trsm_tree = FLA_Cntl_trsm_obj_create( FLA_FLAT, variant, blocksize,
                                        trsm_leaf, gemm_leaf );

  /* Keep a pristine copy of C so every repetition starts from the same data. */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_backup );
  FLA_Copy_external( C, C_backup );

  for ( rep = 0; rep < nrepeats; ++rep )
  {
    FLA_Copy_external( C_backup, C );

    t_start = FLA_Clock();

    if ( variant == 0 )
    {
      /* Reference implementation. */
      REF_Trsm( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE,
                FLA_NONUNIT_DIAG, FLA_ONE, A, C );
    }
    else if ( variant >= 1 && variant <= 4 )
    {
      if ( type == FLA_ALG_UNBLOCKED )
      {
        switch ( variant )
        {
          case 1: FLA_Trsm_lun_unb_var1( FLA_NONUNIT_DIAG, FLA_ONE, A, C ); break;
          case 2: FLA_Trsm_lun_unb_var2( FLA_NONUNIT_DIAG, FLA_ONE, A, C ); break;
          case 3: FLA_Trsm_lun_unb_var3( FLA_NONUNIT_DIAG, FLA_ONE, A, C ); break;
          case 4: FLA_Trsm_lun_unb_var4( FLA_NONUNIT_DIAG, FLA_ONE, A, C ); break;
          default: break;
        }
      }
      else if ( type == FLA_ALG_BLOCKED )
      {
        switch ( variant )
        {
          case 1: FLA_Trsm_lun_blk_var1( FLA_NONUNIT_DIAG, FLA_ONE, A, C, trsm_tree ); break;
          case 2: FLA_Trsm_lun_blk_var2( FLA_NONUNIT_DIAG, FLA_ONE, A, C, trsm_tree ); break;
          case 3: FLA_Trsm_lun_blk_var3( FLA_NONUNIT_DIAG, FLA_ONE, A, C, trsm_tree ); break;
          case 4: FLA_Trsm_lun_blk_var4( FLA_NONUNIT_DIAG, FLA_ONE, A, C, trsm_tree ); break;
          default: break;
        }
      }
      else
      {
        printf("trouble\n");
      }
    }

    *dtime    = FLA_Clock() - t_start;
    best_time = min( *dtime, best_time );
  }

  FLA_Cntl_obj_free( trsm_tree );
  FLA_Cntl_obj_free( trsm_leaf );
  FLA_Cntl_obj_free( gemm_leaf );
  FLA_Blocksize_free( blocksize );

  if ( variant != 0 )
  {
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }
  else
  {
    /* The reference run defines the expected answer for later variants. */
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }

  rows_c = FLA_Obj_length( C );
  cols_c = FLA_Obj_width( C );
  cols_a = FLA_Obj_width( A );

  *gflops = 1.0 * rows_c * cols_c * cols_a / best_time / 1.0e9;
  *dtime  = best_time;

  /* Hand C back to the caller in its original state. */
  FLA_Copy_external( C_backup, C );
  FLA_Obj_free( &C_backup );
}
/*
 * time_Chol_u
 *
 * Times one implementation of the upper-triangular Cholesky factorization of
 * A (in place), reports the best time over nrepeats runs, and computes a
 * residual-based accuracy figure.
 *
 *   variant   0 runs REF_Chol_u; 1..3 run FLA_Chol_u_{unb,opt,blk}_var<variant>.
 *             Any other value runs nothing inside the timed region.
 *   type      FLA_ALG_UNBLOCKED, FLA_ALG_UNB_OPT or FLA_ALG_BLOCKED (ignored
 *             for variant 0); any other value prints "trouble" for
 *             variants 1..3.
 *   nrepeats  number of timed repetitions.
 *   n         not used by this routine.
 *   nb_alg    value used for all four entries of the algorithmic blocksize.
 *   A         matrix to factor; overwritten during timing, restored on exit.
 *   b, b_orig work vectors for the residual check; both are overwritten
 *             during the check and restored on exit.
 *   norm      object that receives the residual norm.
 *   dtime     out: smallest elapsed time observed (stays 1.0e9 if no
 *             repetition ran).
 *   diff      out: value of norm copied into a double.
 *   gflops    out: (1/3) * length(A)^3 / best time / 1e9, multiplied by 4
 *             when A is complex.
 *
 * Accuracy check: with the factor left in A by the last repetition, b is
 * solved in place against U^H and then U, after which
 * FLA_Hemv_external( ..., FLA_ONE, A_save, b, FLA_MINUS_ONE, b_orig ) is
 * expected to leave A_save * b - b_orig in b_orig (TODO confirm the
 * alpha/beta convention), whose 2-norm is reported.
 *
 * NOTE(review): the control-tree constructors are called here with fewer
 * arguments than the six-argument FLA_Cntl_trsm_obj_create calls seen in the
 * library-side init routines - confirm against the declaration in use.
 */
void time_Chol_u( int variant, int type, int nrepeats, int n, int nb_alg,
                  FLA_Obj A, FLA_Obj b, FLA_Obj b_orig, FLA_Obj norm,
                  double *dtime, double *diff, double *gflops )
{
  int              irep;
  double           dtime_save = 1.0e9;
  double           n_rows;
  FLA_Obj          A_save, b_save, b_orig_save;
  fla_blocksize_t* bp;
  fla_chol_t*      cntl_chol_var;
  fla_chol_t*      cntl_chol_unb;
  fla_syrk_t*      cntl_syrk_blas;
  fla_herk_t*      cntl_herk_blas;
  fla_trsm_t*      cntl_trsm_blas;
  fla_gemm_t*      cntl_gemm_blas;

  /* Control tree for the blocked variants: blocked node over leaf nodes. */
  bp             = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
  cntl_chol_unb  = FLA_Cntl_chol_obj_create( FLA_FLAT, FLA_UNB_OPT_VARIANT2,
                                             NULL, NULL, NULL, NULL, NULL, NULL );
  cntl_syrk_blas = FLA_Cntl_syrk_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL );
  cntl_herk_blas = FLA_Cntl_herk_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL );
  cntl_trsm_blas = FLA_Cntl_trsm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL );
  cntl_gemm_blas = FLA_Cntl_gemm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL );
  cntl_chol_var  = FLA_Cntl_chol_obj_create( FLA_FLAT, variant, bp,
                                             cntl_chol_unb,
                                             cntl_syrk_blas,
                                             cntl_herk_blas,
                                             cntl_trsm_blas,
                                             cntl_gemm_blas );

  /* Snapshot every operand that is overwritten below. */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, b, &b_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, b_orig, &b_orig_save );

  FLA_Copy_external( A, A_save );
  FLA_Copy_external( b, b_save );
  FLA_Copy_external( b_orig, b_orig_save );

  for ( irep = 0; irep < nrepeats; irep++ )
  {
    /* Each repetition factors a fresh copy of the original matrix. */
    FLA_Copy_external( A_save, A );

    *dtime = FLA_Clock();

    switch ( variant )
    {
      case 0:
        /* Reference implementation. */
        REF_Chol_u( A );
        break;

      case 1:
        switch ( type )
        {
          case FLA_ALG_UNBLOCKED: FLA_Chol_u_unb_var1( A ); break;
          case FLA_ALG_UNB_OPT:   FLA_Chol_u_opt_var1( A ); break;
          case FLA_ALG_BLOCKED:   FLA_Chol_u_blk_var1( A, cntl_chol_var ); break;
          default:                printf("trouble\n");
        }
        break;

      case 2:
        switch ( type )
        {
          case FLA_ALG_UNBLOCKED: FLA_Chol_u_unb_var2( A ); break;
          case FLA_ALG_UNB_OPT:   FLA_Chol_u_opt_var2( A ); break;
          case FLA_ALG_BLOCKED:   FLA_Chol_u_blk_var2( A, cntl_chol_var ); break;
          default:                printf("trouble\n");
        }
        break;

      case 3:
        switch ( type )
        {
          case FLA_ALG_UNBLOCKED: FLA_Chol_u_unb_var3( A ); break;
          case FLA_ALG_UNB_OPT:   FLA_Chol_u_opt_var3( A ); break;
          case FLA_ALG_BLOCKED:   FLA_Chol_u_blk_var3( A, cntl_chol_var ); break;
          default:                printf("trouble\n");
        }
        break;
    }

    *dtime = FLA_Clock() - *dtime;

    dtime_save = min( *dtime, dtime_save );
  }

  FLA_Cntl_obj_free( cntl_chol_var );
  FLA_Cntl_obj_free( cntl_chol_unb );
  FLA_Cntl_obj_free( cntl_syrk_blas );
  FLA_Cntl_obj_free( cntl_herk_blas );
  FLA_Cntl_obj_free( cntl_trsm_blas );
  FLA_Cntl_obj_free( cntl_gemm_blas );
  FLA_Blocksize_free( bp );

  /*
   * Residual check, identical for the reference and FLAME implementations.
   * Both triangular solves must use FLA_NONUNIT_DIAG: the diagonal of a
   * Cholesky factor is not unit, so treating it as such in the U^H solve
   * would ignore the actual diagonal entries and yield a wrong solution
   * (and therefore a misleading diff).
   */
  FLA_Trsv_external( FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE,
                     FLA_NONUNIT_DIAG, A, b );
  FLA_Trsv_external( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE,
                     FLA_NONUNIT_DIAG, A, b );
  FLA_Hemv_external( FLA_UPPER_TRIANGULAR, FLA_ONE,
                     A_save, b, FLA_MINUS_ONE, b_orig );
  FLA_Nrm2_external( b_orig, norm );
  FLA_Copy_object_to_buffer( FLA_NO_TRANSPOSE, 0, 0, norm, 1, 1, diff, 1, 1 );

  n_rows = FLA_Obj_length( A );

  *gflops = 1.0 / 3.0 * n_rows * n_rows * n_rows / dtime_save / 1e9;

  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 4.0;

  *dtime = dtime_save;

  /* Restore the caller's operands and release the snapshots. */
  FLA_Copy_external( A_save, A );
  FLA_Copy_external( b_save, b );
  FLA_Copy_external( b_orig_save, b_orig );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &b_save );
  FLA_Obj_free( &b_orig_save );
}
/*
 * time_Trinv_un
 *
 * Times one implementation of the upper-triangular, non-unit-diagonal
 * triangular inversion of A (in place), reports the best time over nrepeats
 * runs, and computes an accuracy figure.
 *
 *   variant   0 runs REF_Trinv_un; 1..4 run
 *             FLA_Trinv_un_{unb,opt,blk}_var<variant>.  Any other value runs
 *             nothing inside the timed region.
 *   type      FLA_ALG_UNBLOCKED, FLA_ALG_UNB_OPT or FLA_ALG_BLOCKED (ignored
 *             for variant 0); any other value prints "trouble" for
 *             variants 1..4.
 *   nrepeats  number of timed repetitions.
 *   m         not used by this routine.
 *   nb_alg    value used for all four entries of the algorithmic blocksize.
 *   A         matrix to invert; overwritten during timing, restored on exit.
 *   b, b_orig work vectors for the accuracy check; restored on exit.
 *   norm      object that receives the norm computed by the check.
 *   dtime     out: smallest elapsed time observed (stays 1.0e9 if no
 *             repetition ran).
 *   diff      out: value of norm copied into a double.
 *   gflops    out: (1/3) * length(A)^3 / best time / 1e9, multiplied by 4
 *             when A is complex.
 *
 * Accuracy check: b is multiplied by the computed result in A, then by the
 * original matrix A_save, then FLA_Axpy_external( FLA_MINUS_ONE, b_orig, b )
 * is applied (presumably b := b - b_orig; TODO confirm) and the 2-norm of b
 * is reported.  This presumably relies on b and b_orig holding the same
 * data on entry - verify against the caller.
 *
 * NOTE(review): the control-tree constructors are called here with fewer
 * arguments than the six-argument FLA_Cntl_trsm_obj_create calls seen in the
 * library-side init routines - confirm against the declaration in use.
 */
void time_Trinv_un( int variant, int type, int nrepeats, int m, int nb_alg,
                    FLA_Obj A, FLA_Obj b, FLA_Obj b_orig, FLA_Obj norm,
                    double *dtime, double *diff, double *gflops )
{
  double           best_time = 1.0e9;
  double           t_start;
  double           n_rows;
  int              rep;
  FLA_Obj          A_save, b_save, b_orig_save;
  fla_blocksize_t* blocksize;
  fla_trinv_t*     trinv_tree;
  fla_trinv_t*     trinv_leaf;
  fla_gemm_t*      gemm_leaf;
  fla_trmm_t*      trmm_leaf;
  fla_trsm_t*      trsm_leaf;

  /* Control tree for the blocked variants: blocked node over leaf nodes. */
  blocksize  = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
  trinv_leaf = FLA_Cntl_trinv_obj_create( FLA_FLAT, FLA_UNB_OPT_VARIANT3,
                                          NULL, NULL, NULL, NULL, NULL, NULL );
  trmm_leaf  = FLA_Cntl_trmm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL );
  trsm_leaf  = FLA_Cntl_trsm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL );
  gemm_leaf  = FLA_Cntl_gemm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL );
  /* The same trsm leaf is supplied for both trsm sub-tree arguments. */
  trinv_tree = FLA_Cntl_trinv_obj_create( FLA_FLAT, variant, blocksize,
                                          trinv_leaf,
                                          trmm_leaf,
                                          trsm_leaf,
                                          trsm_leaf,
                                          gemm_leaf );

  /* Snapshot every operand that is overwritten below. */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, b, &b_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, b_orig, &b_orig_save );

  FLA_Copy_external( A, A_save );
  FLA_Copy_external( b, b_save );
  FLA_Copy_external( b_orig, b_orig_save );

  for ( rep = 0; rep < nrepeats; ++rep )
  {
    /* Each repetition inverts a fresh copy of the original matrix. */
    FLA_Copy_external( A_save, A );

    t_start = FLA_Clock();

    if ( variant == 0 )
    {
      /* Reference implementation. */
      REF_Trinv_un( A );
    }
    else if ( variant >= 1 && variant <= 4 )
    {
      if ( type == FLA_ALG_UNBLOCKED )
      {
        switch ( variant )
        {
          case 1: FLA_Trinv_un_unb_var1( A ); break;
          case 2: FLA_Trinv_un_unb_var2( A ); break;
          case 3: FLA_Trinv_un_unb_var3( A ); break;
          case 4: FLA_Trinv_un_unb_var4( A ); break;
          default: break;
        }
      }
      else if ( type == FLA_ALG_UNB_OPT )
      {
        switch ( variant )
        {
          case 1: FLA_Trinv_un_opt_var1( A ); break;
          case 2: FLA_Trinv_un_opt_var2( A ); break;
          case 3: FLA_Trinv_un_opt_var3( A ); break;
          case 4: FLA_Trinv_un_opt_var4( A ); break;
          default: break;
        }
      }
      else if ( type == FLA_ALG_BLOCKED )
      {
        switch ( variant )
        {
          case 1: FLA_Trinv_un_blk_var1( A, trinv_tree ); break;
          case 2: FLA_Trinv_un_blk_var2( A, trinv_tree ); break;
          case 3: FLA_Trinv_un_blk_var3( A, trinv_tree ); break;
          case 4: FLA_Trinv_un_blk_var4( A, trinv_tree ); break;
          default: break;
        }
      }
      else
      {
        printf("trouble\n");
      }
    }

    *dtime    = FLA_Clock() - t_start;
    best_time = min( *dtime, best_time );
  }

  FLA_Cntl_obj_free( trinv_tree );
  FLA_Cntl_obj_free( trinv_leaf );
  FLA_Cntl_obj_free( gemm_leaf );
  FLA_Cntl_obj_free( trmm_leaf );
  FLA_Cntl_obj_free( trsm_leaf );
  FLA_Blocksize_free( blocksize );

  /* Accuracy check: apply the computed result, then the original matrix. */
  FLA_Trmv_external( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE,
                     FLA_NONUNIT_DIAG, A, b );
  FLA_Trmv_external( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE,
                     FLA_NONUNIT_DIAG, A_save, b );
  FLA_Axpy_external( FLA_MINUS_ONE, b_orig, b );
  FLA_Nrm2_external( b, norm );
  FLA_Copy_object_to_buffer( FLA_NO_TRANSPOSE, 0, 0, norm, 1, 1, diff, 1, 1 );

  n_rows = FLA_Obj_length( A );

  *gflops = 1.0 / 3.0 * n_rows * n_rows * n_rows / best_time / 1e9;

  if ( FLA_Obj_is_complex( A ) )
  {
    *gflops *= 4.0;
  }

  *dtime = best_time;

  /* Restore the caller's operands and release the snapshots. */
  FLA_Copy_external( A_save, A );
  FLA_Copy_external( b_save, b );
  FLA_Copy_external( b_orig_save, b_orig );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &b_save );
  FLA_Obj_free( &b_orig_save );
}