void FLASH_Syrk_cntl_init() { // Set syrk blocksize for hierarchical storage. flash_syrk_bsize = FLA_Blocksize_create( 1, 1, 1, 1 ); // Create a control tree that assumes A is a b x b block. flash_syrk_cntl_blas = FLA_Cntl_syrk_obj_create( FLA_HIER, FLA_SUBPROBLEM, NULL, NULL, NULL, NULL ); // Create a control tree that assumes A * A' forms an inner panel product. flash_syrk_cntl_ip = FLA_Cntl_syrk_obj_create( FLA_HIER, FLA_BLOCKED_VARIANT5, flash_syrk_bsize, flash_scalr_cntl, flash_syrk_cntl_blas, NULL ); // Create a control tree that assumes A * A' forms an outer panel product. flash_syrk_cntl_op = FLA_Cntl_syrk_obj_create( FLA_HIER, FLA_BLOCKED_VARIANT2, flash_syrk_bsize, flash_scalr_cntl, flash_syrk_cntl_blas, flash_gemm_cntl_pb_bb ); // Create a control tree that assumes A is large. flash_syrk_cntl_mm = FLA_Cntl_syrk_obj_create( FLA_HIER, FLA_BLOCKED_VARIANT5, flash_syrk_bsize, flash_scalr_cntl, flash_syrk_cntl_op, NULL ); }
/*
 * time_Syrk_ln
 *
 * Times one implementation of the "ln" symmetric rank-k update on A and C,
 * called with both scalars equal to FLA_ONE, and reports the best wall-clock
 * time over nrepeats runs.
 *
 * Parameters
 *   variant   0 selects REF_Syrk_ln; 1..6 select FLA_Syrk_ln_{unb,blk}_varN.
 *             Any other value runs no kernel (only the timer overhead is
 *             measured).
 *   type      FLA_ALG_UNBLOCKED or FLA_ALG_BLOCKED; anything else prints
 *             "trouble" for variants 1..6.
 *   nrepeats  number of timed repetitions.
 *   n         unused by this routine.
 *   nb_alg    algorithmic blocksize used for the blocked variants.
 *   A         input matrix.
 *   B         unused by this routine.
 *   C         matrix that is updated; restored to its entry contents on return.
 *   Cref      reference result: written when variant == 0, compared against
 *             otherwise.
 *   dtime     out: minimum elapsed time over all repetitions.
 *   diff      out: 0.0 for variant 0, otherwise FLA_Max_elemwise_diff(C, Cref).
 *   gflops    out: length(C) * width(C) * width(A) / time / 1e9.
 *
 * If nrepeats <= 0 no run is timed and *dtime is reported as the 1.0e9
 * sentinel (previously an uninitialised value was read in that case).
 */
void time_Syrk_ln( int variant, int type, int nrepeats, int n, int nb_alg,
                   FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj Cref,
                   double *dtime, double *diff, double *gflops )
{
  int irep;
  /* Sentinel larger than any realistic run time; keeps the running minimum
     well defined even when the loop body never executes. */
  double dtime_old = 1.0e9;
  FLA_Obj C_old;
  fla_blocksize_t* bp;
  fla_gemm_t* cntl_gemm_blas;
  fla_syrk_t* cntl_syrk_blas;
  fla_syrk_t* cntl_syrk_var;

  /* Flat control tree: the requested variant at the top, subproblem leaves
     for the nested syrk and gemm calls. */
  bp = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
  cntl_gemm_blas = FLA_Cntl_gemm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL );
  cntl_syrk_blas = FLA_Cntl_syrk_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL );
  cntl_syrk_var = FLA_Cntl_syrk_obj_create( FLA_FLAT, variant, bp, cntl_syrk_blas, cntl_gemm_blas );

  /* Keep a pristine copy of C so that every repetition starts from the same
     data and the caller gets C back unchanged. */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_old );
  FLA_Copy_external( C, C_old );

  for ( irep = 0 ; irep < nrepeats; irep++ )
  {
    FLA_Copy_external( C_old, C );

    *dtime = FLA_Clock();

    switch( variant )
    {
      case 0:
        /* Time reference implementation */
        REF_Syrk_ln( FLA_ONE, A, FLA_ONE, C );
        break;

      case 1:
        /* Time variant 1 */
        switch( type )
        {
          case FLA_ALG_UNBLOCKED:
            FLA_Syrk_ln_unb_var1( FLA_ONE, A, FLA_ONE, C );
            break;
          case FLA_ALG_BLOCKED:
            FLA_Syrk_ln_blk_var1( FLA_ONE, A, FLA_ONE, C, cntl_syrk_var );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 2:
        /* Time variant 2 */
        switch( type )
        {
          case FLA_ALG_UNBLOCKED:
            FLA_Syrk_ln_unb_var2( FLA_ONE, A, FLA_ONE, C );
            break;
          case FLA_ALG_BLOCKED:
            FLA_Syrk_ln_blk_var2( FLA_ONE, A, FLA_ONE, C, cntl_syrk_var );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 3:
        /* Time variant 3 */
        switch( type )
        {
          case FLA_ALG_UNBLOCKED:
            FLA_Syrk_ln_unb_var3( FLA_ONE, A, FLA_ONE, C );
            break;
          case FLA_ALG_BLOCKED:
            FLA_Syrk_ln_blk_var3( FLA_ONE, A, FLA_ONE, C, cntl_syrk_var );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 4:
        /* Time variant 4 */
        switch( type )
        {
          case FLA_ALG_UNBLOCKED:
            FLA_Syrk_ln_unb_var4( FLA_ONE, A, FLA_ONE, C );
            break;
          case FLA_ALG_BLOCKED:
            FLA_Syrk_ln_blk_var4( FLA_ONE, A, FLA_ONE, C, cntl_syrk_var );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 5:
        /* Time variant 5 */
        switch( type )
        {
          case FLA_ALG_UNBLOCKED:
            FLA_Syrk_ln_unb_var5( FLA_ONE, A, FLA_ONE, C );
            break;
          case FLA_ALG_BLOCKED:
            FLA_Syrk_ln_blk_var5( FLA_ONE, A, FLA_ONE, C, cntl_syrk_var );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 6:
        /* Time variant 6 */
        switch( type )
        {
          case FLA_ALG_UNBLOCKED:
            FLA_Syrk_ln_unb_var6( FLA_ONE, A, FLA_ONE, C );
            break;
          case FLA_ALG_BLOCKED:
            FLA_Syrk_ln_blk_var6( FLA_ONE, A, FLA_ONE, C, cntl_syrk_var );
            break;
          default:
            printf("trouble\n");
        }
        break;

      default:
        /* Unknown variant: nothing is executed, as before. */
        break;
    }

    /* Track the fastest repetition seen so far. */
    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  FLA_Cntl_obj_free( cntl_syrk_var );
  FLA_Cntl_obj_free( cntl_syrk_blas );
  FLA_Cntl_obj_free( cntl_gemm_blas );
  FLA_Blocksize_free( bp );

  if ( variant == 0 )
  {
    /* The reference run defines the expected result for later calls. */
    FLA_Copy_external( C, Cref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLA_Max_elemwise_diff( C, Cref );
  }

  *gflops = 1.0 *
            FLA_Obj_length( C ) *
            FLA_Obj_width( C ) *
            FLA_Obj_width( A ) /
            dtime_old /
            1.0e9;

  *dtime = dtime_old;

  /* Hand C back to the caller with its original contents. */
  FLA_Copy_external( C_old, C );

  FLA_Obj_free( &C_old );
}
/*
 * time_Chol_u
 *
 * Times one implementation of the upper-triangular Cholesky factorization of
 * A, reports the best wall-clock time over nrepeats runs, and checks the
 * factor by solving a linear system with it.
 *
 * Parameters
 *   variant   0 selects REF_Chol_u; 1..3 select FLA_Chol_u_{unb,opt,blk}_varN.
 *             Any other value runs no kernel.
 *   type      FLA_ALG_UNBLOCKED, FLA_ALG_UNB_OPT or FLA_ALG_BLOCKED; anything
 *             else prints "trouble" for variants 1..3.
 *   nrepeats  number of timed repetitions.
 *   n         unused by this routine.
 *   nb_alg    algorithmic blocksize used for the blocked variants.
 *   A         matrix to factor; restored to its entry contents on return.
 *   b         right-hand side; restored on return.
 *   b_orig    copy of the right-hand side used for the residual; restored on
 *             return.
 *   norm      scratch object that receives the residual norm.
 *   dtime     out: minimum elapsed time over all repetitions (1.0e9 if none
 *             were run).
 *   diff      out: 2-norm of A_save * x - b_orig, where x is the solution
 *             obtained from the computed factor.
 *   gflops    out: length(A)^3 / 3 / time / 1e9, multiplied by 4 when A is
 *             complex.
 */
void time_Chol_u( int variant, int type, int nrepeats, int n, int nb_alg,
                  FLA_Obj A, FLA_Obj b, FLA_Obj b_orig, FLA_Obj norm,
                  double *dtime, double *diff, double *gflops )
{
  int irep;
  double dtime_save = 1.0e9;
  FLA_Obj A_save, b_save, b_orig_save;
  fla_blocksize_t* bp;
  fla_chol_t* cntl_chol_var;
  fla_chol_t* cntl_chol_unb;
  fla_syrk_t* cntl_syrk_blas;
  fla_herk_t* cntl_herk_blas;
  fla_trsm_t* cntl_trsm_blas;
  fla_gemm_t* cntl_gemm_blas;

  /* Flat control tree: the requested variant at the top, an optimized
     unblocked Cholesky and subproblem leaves underneath. */
  bp = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
  cntl_chol_unb = FLA_Cntl_chol_obj_create( FLA_FLAT, FLA_UNB_OPT_VARIANT2, NULL, NULL, NULL, NULL, NULL, NULL );
  cntl_syrk_blas = FLA_Cntl_syrk_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL );
  cntl_herk_blas = FLA_Cntl_herk_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL );
  cntl_trsm_blas = FLA_Cntl_trsm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL );
  cntl_gemm_blas = FLA_Cntl_gemm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL );
  cntl_chol_var = FLA_Cntl_chol_obj_create( FLA_FLAT, variant, bp, cntl_chol_unb,
                                            cntl_syrk_blas, cntl_herk_blas,
                                            cntl_trsm_blas, cntl_gemm_blas );

  /* Snapshot every operand that is overwritten below so it can be restored
     before returning. */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, b, &b_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, b_orig, &b_orig_save );

  FLA_Copy_external( A, A_save );
  FLA_Copy_external( b, b_save );
  FLA_Copy_external( b_orig, b_orig_save );

  for ( irep = 0 ; irep < nrepeats; irep++ )
  {
    /* Each repetition factors the same, unfactored matrix. */
    FLA_Copy_external( A_save, A );

    *dtime = FLA_Clock();

    switch( variant )
    {
      case 0:
        /* Time reference implementation */
        REF_Chol_u( A );
        break;

      case 1:
        /* Time variant 1 */
        switch( type )
        {
          case FLA_ALG_UNBLOCKED:
            FLA_Chol_u_unb_var1( A );
            break;
          case FLA_ALG_UNB_OPT:
            FLA_Chol_u_opt_var1( A );
            break;
          case FLA_ALG_BLOCKED:
            FLA_Chol_u_blk_var1( A, cntl_chol_var );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 2:
        /* Time variant 2 */
        switch( type )
        {
          case FLA_ALG_UNBLOCKED:
            FLA_Chol_u_unb_var2( A );
            break;
          case FLA_ALG_UNB_OPT:
            FLA_Chol_u_opt_var2( A );
            break;
          case FLA_ALG_BLOCKED:
            FLA_Chol_u_blk_var2( A, cntl_chol_var );
            break;
          default:
            printf("trouble\n");
        }
        break;

      case 3:
        /* Time variant 3 */
        switch( type )
        {
          case FLA_ALG_UNBLOCKED:
            FLA_Chol_u_unb_var3( A );
            break;
          case FLA_ALG_UNB_OPT:
            FLA_Chol_u_opt_var3( A );
            break;
          case FLA_ALG_BLOCKED:
            FLA_Chol_u_blk_var3( A, cntl_chol_var );
            break;
          default:
            printf("trouble\n");
        }
        break;

      default:
        /* Unknown variant: nothing is executed, as before. */
        break;
    }

    /* Track the fastest repetition seen so far. */
    *dtime = FLA_Clock() - *dtime;
    dtime_save = min( *dtime, dtime_save );
  }

  FLA_Cntl_obj_free( cntl_chol_var );
  FLA_Cntl_obj_free( cntl_chol_unb );
  FLA_Cntl_obj_free( cntl_syrk_blas );
  FLA_Cntl_obj_free( cntl_herk_blas );
  FLA_Cntl_obj_free( cntl_trsm_blas );
  FLA_Cntl_obj_free( cntl_gemm_blas );
  FLA_Blocksize_free( bp );

  /* Residual check, identical for every algorithm type.  With A = U' * U,
     solve U' y = b and then U x = y in place in b.  The Cholesky factor has a
     general (non-unit) diagonal, so BOTH triangular solves must use
     FLA_NONUNIT_DIAG; treating the diagonal as unit in the first solve would
     yield a wrong x and therefore a meaningless residual. */
  FLA_Trsv_external( FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, A, b );
  FLA_Trsv_external( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, A, b );

  /* b_orig := A_save * x - b_orig, then take its 2-norm as the error. */
  FLA_Hemv_external( FLA_UPPER_TRIANGULAR, FLA_ONE, A_save, b, FLA_MINUS_ONE, b_orig );
  FLA_Nrm2_external( b_orig, norm );
  FLA_Copy_object_to_buffer( FLA_NO_TRANSPOSE, 0, 0, norm, 1, 1, diff, 1, 1 );

  *gflops = 1.0 / 3.0 *
            FLA_Obj_length( A ) *
            FLA_Obj_length( A ) *
            FLA_Obj_length( A ) /
            dtime_save / 1e9;

  /* Complex arithmetic is counted as four times the real flop count. */
  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 4.0;

  *dtime = dtime_save;

  /* Restore the caller's operands and release the snapshots. */
  FLA_Copy_external( A_save, A );
  FLA_Copy_external( b_save, b );
  FLA_Copy_external( b_orig_save, b_orig );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &b_save );
  FLA_Obj_free( &b_orig_save );
}