Beispiel #1
0
// Populate the trsm blocksizes and control trees used with conventional
// (flat) storage. Everything assigned here is defined outside this function.
// The trees are created in dependency order: fla_trsm_cntl_mm is handed
// fla_trsm_cntl_mp, and the bp/mp trees are handed fla_trsm_cntl_blas.
void FLA_Trsm_cntl_init()
{
	// Default blocksizes for the variant-2 and variant-3 trees.
	fla_trsm_var2_bsize = FLA_Query_blocksizes( FLA_DIMENSION_MIN );
	fla_trsm_var3_bsize = FLA_Query_blocksizes( FLA_DIMENSION_MIN );

	// Subproblem tree: A and B are assumed to be b x b blocks.
	fla_trsm_cntl_blas = FLA_Cntl_trsm_obj_create(
		FLA_FLAT, FLA_SUBPROBLEM,
		NULL, NULL, NULL, NULL );

	// Block-panel tree: A is assumed to be a block, B a panel.
	fla_trsm_cntl_bp = FLA_Cntl_trsm_obj_create(
		FLA_FLAT, FLA_BLOCKED_VARIANT3,
		fla_trsm_var3_bsize,
		fla_scal_cntl_blas, fla_trsm_cntl_blas, NULL );

	// Matrix-panel tree: A is assumed to be large, B a panel.
	fla_trsm_cntl_mp = FLA_Cntl_trsm_obj_create(
		FLA_FLAT, FLA_BLOCKED_VARIANT2,
		fla_trsm_var2_bsize,
		fla_scal_cntl_blas, fla_trsm_cntl_blas, fla_gemm_cntl_blas );

	// Matrix-matrix tree: A and B are both assumed to be large.
	fla_trsm_cntl_mm = FLA_Cntl_trsm_obj_create(
		FLA_FLAT, FLA_BLOCKED_VARIANT3,
		fla_trsm_var3_bsize,
		fla_scal_cntl_blas, fla_trsm_cntl_mp, NULL );
}
// Populate the trsm blocksize and control trees used with hierarchical
// storage. Everything assigned here is defined outside this function.
// The trees are created in dependency order: flash_trsm_cntl_mm is handed
// flash_trsm_cntl_mp, and the bp/mp trees are handed flash_trsm_cntl_blas.
void FLASH_Trsm_cntl_init()
{
	// A single blocksize object, created with all four values set to 1,
	// is shared by every blocked tree below.
	flash_trsm_bsize = FLA_Blocksize_create( 1, 1, 1, 1 );

	// Subproblem tree: A and B are assumed to be b x b blocks.
	flash_trsm_cntl_blas = FLA_Cntl_trsm_obj_create(
		FLA_HIER, FLA_SUBPROBLEM,
		NULL, NULL, NULL, NULL );

	// Block-panel tree: A is assumed to be a block, B a panel.
	flash_trsm_cntl_bp = FLA_Cntl_trsm_obj_create(
		FLA_HIER, FLA_BLOCKED_VARIANT3,
		flash_trsm_bsize,
		flash_scal_cntl, flash_trsm_cntl_blas, NULL );

	// Matrix-panel tree: A is assumed to be large, B a panel.
	flash_trsm_cntl_mp = FLA_Cntl_trsm_obj_create(
		FLA_HIER, FLA_BLOCKED_VARIANT2,
		flash_trsm_bsize,
		flash_scal_cntl, flash_trsm_cntl_blas, flash_gemm_cntl_op_bp );

	// Matrix-matrix tree: A and B are both assumed to be large.
	flash_trsm_cntl_mm = FLA_Cntl_trsm_obj_create(
		FLA_HIER, FLA_BLOCKED_VARIANT3,
		flash_trsm_bsize,
		flash_scal_cntl, flash_trsm_cntl_mp, NULL );
}
Beispiel #3
0
// Time one trsm implementation (left side, upper triangular, no transpose,
// non-unit diagonal, alpha = FLA_ONE) and report the results.
//
//   variant   0 runs REF_Trsm; 1..4 select FLA_Trsm_lun_*_var1..var4.
//             Any other value times an empty body.
//   type      FLA_ALG_UNBLOCKED or FLA_ALG_BLOCKED for variants 1..4;
//             any other value prints "trouble".
//   nrepeats  number of timed runs; the smallest time is kept.
//   n, B      not used by this routine.
//   nb_alg    value passed four times to FLA_Blocksize_create.
//   A, C      operands; C is overwritten during timing and restored to its
//             entry contents before returning.
//   C_ref     for variant 0 it receives a copy of the computed C; otherwise
//             the computed C is compared against it.
//   dtime     out: best time over the repeats (1.0e9 if nrepeats <= 0).
//   diff      out: 0.0 for variant 0, else FLA_Max_elemwise_diff( C, C_ref ).
//   gflops    out: length(C) * width(C) * width(A) / best time / 1e9.
void time_Trsm_lun( 
               int variant, int type, int nrepeats, int n, int nb_alg,
               FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref,
               double *dtime, double *diff, double *gflops )
{
  int     rep;
  double  best_time = 1.0e9;
  double  op_count;
  FLA_Obj C_backup;

  // Control objects: a blocksize, two subproblem leaves, and the tree for
  // the requested variant that is handed to the blocked implementations.
  fla_blocksize_t* blocksize = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
  fla_gemm_t*      gemm_leaf = FLA_Cntl_gemm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL );
  fla_trsm_t*      trsm_leaf = FLA_Cntl_trsm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL );
  fla_trsm_t*      trsm_tree = FLA_Cntl_trsm_obj_create( FLA_FLAT, variant, blocksize, trsm_leaf, gemm_leaf );

  // Keep the caller's C so every repeat starts from the same data.
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_backup );
  FLA_Copy_external( C, C_backup );

  for ( rep = 0; rep < nrepeats; ++rep )
  {
    FLA_Copy_external( C_backup, C );

    *dtime = FLA_Clock();

    switch ( variant )
    {
      case 0:
        // Reference implementation.
        REF_Trsm( FLA_LEFT, FLA_UPPER_TRIANGULAR,
                  FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
                  FLA_ONE, A, C );
        break;

      case 1:
        if ( type == FLA_ALG_UNBLOCKED )
        {
          FLA_Trsm_lun_unb_var1( FLA_NONUNIT_DIAG, FLA_ONE, A, C );
        }
        else if ( type == FLA_ALG_BLOCKED )
        {
          FLA_Trsm_lun_blk_var1( FLA_NONUNIT_DIAG, FLA_ONE, A, C, trsm_tree );
        }
        else
        {
          printf("trouble\n");
        }
        break;

      case 2:
        if ( type == FLA_ALG_UNBLOCKED )
        {
          FLA_Trsm_lun_unb_var2( FLA_NONUNIT_DIAG, FLA_ONE, A, C );
        }
        else if ( type == FLA_ALG_BLOCKED )
        {
          FLA_Trsm_lun_blk_var2( FLA_NONUNIT_DIAG, FLA_ONE, A, C, trsm_tree );
        }
        else
        {
          printf("trouble\n");
        }
        break;

      case 3:
        if ( type == FLA_ALG_UNBLOCKED )
        {
          FLA_Trsm_lun_unb_var3( FLA_NONUNIT_DIAG, FLA_ONE, A, C );
        }
        else if ( type == FLA_ALG_BLOCKED )
        {
          FLA_Trsm_lun_blk_var3( FLA_NONUNIT_DIAG, FLA_ONE, A, C, trsm_tree );
        }
        else
        {
          printf("trouble\n");
        }
        break;

      case 4:
        if ( type == FLA_ALG_UNBLOCKED )
        {
          FLA_Trsm_lun_unb_var4( FLA_NONUNIT_DIAG, FLA_ONE, A, C );
        }
        else if ( type == FLA_ALG_BLOCKED )
        {
          FLA_Trsm_lun_blk_var4( FLA_NONUNIT_DIAG, FLA_ONE, A, C, trsm_tree );
        }
        else
        {
          printf("trouble\n");
        }
        break;
    }

    *dtime = FLA_Clock() - *dtime;
    best_time = min( *dtime, best_time );
  }

  FLA_Cntl_obj_free( trsm_tree );
  FLA_Cntl_obj_free( trsm_leaf );
  FLA_Cntl_obj_free( gemm_leaf );
  FLA_Blocksize_free( blocksize );

  if ( variant != 0 )
  {
    // Compare this run's result with the stored reference result.
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }
  else
  {
    // The reference run defines C_ref for later comparisons.
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }

  op_count = 1.0 *
             FLA_Obj_length( C ) *
             FLA_Obj_width( C ) *
             FLA_Obj_width( A );

  *gflops = op_count / best_time / 1.0e9;
  *dtime  = best_time;

  // Hand C back exactly as it was received.
  FLA_Copy_external( C_backup, C );
  FLA_Obj_free( &C_backup );
}
Beispiel #4
0
// Time one upper-triangular Cholesky factorization implementation and
// report the best time, a residual norm, and a GFLOPS figure.
//
//   variant   0 runs REF_Chol_u; 1..3 select FLA_Chol_u_*_var1..var3.
//             Any other value times an empty body.
//   type      FLA_ALG_UNBLOCKED, FLA_ALG_UNB_OPT or FLA_ALG_BLOCKED for
//             variants 1..3; any other value prints "trouble".
//   nrepeats  number of timed runs; the smallest time is kept.
//   n         not used by this routine.
//   nb_alg    value passed four times to FLA_Blocksize_create.
//   A         matrix to factor; restored to its entry contents on return.
//   b, b_orig vectors used for the residual check; both are restored to
//             their entry contents on return.
//   norm      object that receives the residual 2-norm.
//   dtime     out: best time over the repeats (1.0e9 if nrepeats <= 0).
//   diff      out: the residual norm copied out of `norm`.
//   gflops    out: (1/3) * length(A)^3 / best time / 1e9, multiplied by 4
//             when A is complex.
void time_Chol_u(
                  int variant, int type, int nrepeats, int n, int nb_alg,
                  FLA_Obj A, FLA_Obj b, FLA_Obj b_orig, FLA_Obj norm,
                  double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_save = 1.0e9;

  FLA_Obj
    A_save, b_save, b_orig_save;

  fla_blocksize_t*
    bp;
  fla_chol_t*
    cntl_chol_var;
  fla_chol_t*
    cntl_chol_unb;
  fla_syrk_t*
    cntl_syrk_blas;
  fla_herk_t*
    cntl_herk_blas;
  fla_trsm_t*
    cntl_trsm_blas;
  fla_gemm_t*
    cntl_gemm_blas;

  // Control objects handed to the blocked implementations.
  bp               = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
  cntl_chol_unb    = FLA_Cntl_chol_obj_create( FLA_FLAT, FLA_UNB_OPT_VARIANT2, NULL, NULL, NULL, NULL, NULL, NULL );
  cntl_syrk_blas   = FLA_Cntl_syrk_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL );
  cntl_herk_blas   = FLA_Cntl_herk_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL );
  cntl_trsm_blas   = FLA_Cntl_trsm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL );
  cntl_gemm_blas   = FLA_Cntl_gemm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL );
  cntl_chol_var    = FLA_Cntl_chol_obj_create( FLA_FLAT, variant, bp,
                                               cntl_chol_unb,
                                               cntl_syrk_blas,
                                               cntl_herk_blas,
                                               cntl_trsm_blas,
                                               cntl_gemm_blas );

  // Save the caller's operands so they can be restored before returning.
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, b, &b_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, b_orig, &b_orig_save );

  FLA_Copy_external( A, A_save );
  FLA_Copy_external( b, b_save );
  FLA_Copy_external( b_orig, b_orig_save );


  for ( irep = 0 ; irep < nrepeats; irep++ ){

    // Every repeat factors a fresh copy of the original matrix.
    FLA_Copy_external( A_save, A );

    *dtime = FLA_Clock();

    switch( variant ){

    case 0:

      // Time reference implementation
      REF_Chol_u( A );

      break;

    case 1:{

      // Time variant 1
      switch( type ){
      case FLA_ALG_UNBLOCKED:
        FLA_Chol_u_unb_var1( A );
        break;
      case FLA_ALG_UNB_OPT:
        FLA_Chol_u_opt_var1( A );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Chol_u_blk_var1( A, cntl_chol_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 2:{

      // Time variant 2
      switch( type ){
      case FLA_ALG_UNBLOCKED:
        FLA_Chol_u_unb_var2( A );
        break;
      case FLA_ALG_UNB_OPT:
        FLA_Chol_u_opt_var2( A );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Chol_u_blk_var2( A, cntl_chol_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    } 
    case 3:{

      // Time variant 3 
      switch( type ){
      case FLA_ALG_UNBLOCKED:
        FLA_Chol_u_unb_var3( A );
        break;
      case FLA_ALG_UNB_OPT:
        FLA_Chol_u_opt_var3( A );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Chol_u_blk_var3( A, cntl_chol_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }
    }

    *dtime = FLA_Clock() - *dtime;
    dtime_save = min( *dtime, dtime_save );
  }

  FLA_Cntl_obj_free( cntl_chol_var );
  FLA_Cntl_obj_free( cntl_chol_unb );
  FLA_Cntl_obj_free( cntl_syrk_blas );
  FLA_Cntl_obj_free( cntl_herk_blas );
  FLA_Cntl_obj_free( cntl_trsm_blas );
  FLA_Cntl_obj_free( cntl_gemm_blas );
  FLA_Blocksize_free( bp );

  // Residual check. The same steps apply to the reference and to every
  // other implementation, so there is no branch on `type` here.
  //
  // A is expected to hold the factor U of A_save = U^H U at this point, so
  // the system is solved as U^H y = b followed by U x = y. Both solves must
  // read the stored diagonal: a Cholesky factor does not have a unit
  // diagonal, so FLA_UNIT_DIAG would yield a wrong x and a wrong residual.
  FLA_Trsv_external( FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE,
                     FLA_NONUNIT_DIAG, A, b );
  FLA_Trsv_external( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE,
                     FLA_NONUNIT_DIAG, A, b );

  // b_orig := A_save * x - b_orig, using the upper triangle of A_save.
  FLA_Hemv_external( FLA_UPPER_TRIANGULAR,
                     FLA_ONE, A_save, b, FLA_MINUS_ONE, b_orig );

  // Report the 2-norm of that residual through *diff.
  FLA_Nrm2_external( b_orig, norm );
  FLA_Copy_object_to_buffer( FLA_NO_TRANSPOSE, 0, 0, norm,
                             1, 1, diff, 1, 1 );

  *gflops = 1.0 / 3.0 * 
            FLA_Obj_length( A ) * 
            FLA_Obj_length( A ) * 
            FLA_Obj_length( A ) / 
            dtime_save / 1e9;

  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 4.0;

  *dtime = dtime_save;

  // Hand the operands back exactly as they were received.
  FLA_Copy_external( A_save, A );
  FLA_Copy_external( b_save, b );
  FLA_Copy_external( b_orig_save, b_orig );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &b_save );
  FLA_Obj_free( &b_orig_save );
}
Beispiel #5
0
// Time one triangular-inversion implementation (the "un" routines; the
// residual check treats A as upper triangular with a non-unit diagonal)
// and report the best time, a residual norm, and a GFLOPS figure.
//
//   variant   0 runs REF_Trinv_un; 1..4 select FLA_Trinv_un_*_var1..var4.
//             Any other value times an empty body.
//   type      FLA_ALG_UNBLOCKED, FLA_ALG_UNB_OPT or FLA_ALG_BLOCKED for
//             variants 1..4; any other value prints "trouble".
//   nrepeats  number of timed runs; the smallest time is kept.
//   m         not used by this routine.
//   nb_alg    value passed four times to FLA_Blocksize_create.
//   A         matrix to invert; restored to its entry contents on return.
//   b, b_orig vectors used for the residual check; both are restored to
//             their entry contents on return.
//   norm      object that receives the residual 2-norm.
//   dtime     out: best time over the repeats (1.0e9 if nrepeats <= 0).
//   diff      out: the residual norm copied out of `norm`.
//   gflops    out: (1/3) * length(A)^3 / best time / 1e9, multiplied by 4
//             when A is complex.
void time_Trinv_un(
                  int variant, int type, int nrepeats, int m, int nb_alg,
                  FLA_Obj A, FLA_Obj b, FLA_Obj b_orig, FLA_Obj norm,
                  double *dtime, double *diff, double *gflops )
{
  int     rep;
  double  best_time = 1.0e9;
  double  cubed_len;
  FLA_Obj A_backup, b_backup, b_orig_backup;

  // Control objects handed to the blocked implementations. The same trsm
  // leaf is passed twice when building the variant tree.
  fla_blocksize_t* blocksize  = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
  fla_trinv_t*     trinv_leaf = FLA_Cntl_trinv_obj_create( FLA_FLAT, FLA_UNB_OPT_VARIANT3, NULL, NULL, NULL, NULL, NULL, NULL );
  fla_trmm_t*      trmm_leaf  = FLA_Cntl_trmm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL );
  fla_trsm_t*      trsm_leaf  = FLA_Cntl_trsm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL );
  fla_gemm_t*      gemm_leaf  = FLA_Cntl_gemm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL );
  fla_trinv_t*     trinv_tree = FLA_Cntl_trinv_obj_create( FLA_FLAT, variant, blocksize,
                                                           trinv_leaf, trmm_leaf,
                                                           trsm_leaf, trsm_leaf,
                                                           gemm_leaf );

  // Save the caller's operands so they can be restored before returning.
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_backup );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, b, &b_backup );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, b_orig, &b_orig_backup );

  FLA_Copy_external( A, A_backup );
  FLA_Copy_external( b, b_backup );
  FLA_Copy_external( b_orig, b_orig_backup );

  for ( rep = 0; rep < nrepeats; ++rep )
  {
    // Every repeat inverts a fresh copy of the original matrix.
    FLA_Copy_external( A_backup, A );

    *dtime = FLA_Clock();

    switch ( variant )
    {
      case 0:
        // Reference implementation.
        REF_Trinv_un( A );
        break;

      case 1:
        if ( type == FLA_ALG_UNBLOCKED )
        {
          FLA_Trinv_un_unb_var1( A );
        }
        else if ( type == FLA_ALG_UNB_OPT )
        {
          FLA_Trinv_un_opt_var1( A );
        }
        else if ( type == FLA_ALG_BLOCKED )
        {
          FLA_Trinv_un_blk_var1( A, trinv_tree );
        }
        else
        {
          printf("trouble\n");
        }
        break;

      case 2:
        if ( type == FLA_ALG_UNBLOCKED )
        {
          FLA_Trinv_un_unb_var2( A );
        }
        else if ( type == FLA_ALG_UNB_OPT )
        {
          FLA_Trinv_un_opt_var2( A );
        }
        else if ( type == FLA_ALG_BLOCKED )
        {
          FLA_Trinv_un_blk_var2( A, trinv_tree );
        }
        else
        {
          printf("trouble\n");
        }
        break;

      case 3:
        if ( type == FLA_ALG_UNBLOCKED )
        {
          FLA_Trinv_un_unb_var3( A );
        }
        else if ( type == FLA_ALG_UNB_OPT )
        {
          FLA_Trinv_un_opt_var3( A );
        }
        else if ( type == FLA_ALG_BLOCKED )
        {
          FLA_Trinv_un_blk_var3( A, trinv_tree );
        }
        else
        {
          printf("trouble\n");
        }
        break;

      case 4:
        if ( type == FLA_ALG_UNBLOCKED )
        {
          FLA_Trinv_un_unb_var4( A );
        }
        else if ( type == FLA_ALG_UNB_OPT )
        {
          FLA_Trinv_un_opt_var4( A );
        }
        else if ( type == FLA_ALG_BLOCKED )
        {
          FLA_Trinv_un_blk_var4( A, trinv_tree );
        }
        else
        {
          printf("trouble\n");
        }
        break;
    }

    *dtime = FLA_Clock() - *dtime;
    best_time = min( *dtime, best_time );
  }

  FLA_Cntl_obj_free( trinv_tree );
  FLA_Cntl_obj_free( trinv_leaf );
  FLA_Cntl_obj_free( gemm_leaf );
  FLA_Cntl_obj_free( trmm_leaf );
  FLA_Cntl_obj_free( trsm_leaf );
  FLA_Blocksize_free( blocksize );

  // Residual check: multiply b by the contents of A left by the timed runs,
  // then by the original matrix, subtract b_orig, and take the 2-norm.
  FLA_Trmv_external( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE,
                     FLA_NONUNIT_DIAG, A, b );
  FLA_Trmv_external( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE,
                     FLA_NONUNIT_DIAG, A_backup, b );
  FLA_Axpy_external( FLA_MINUS_ONE, b_orig, b );

  FLA_Nrm2_external( b, norm );
  FLA_Copy_object_to_buffer( FLA_NO_TRANSPOSE, 0, 0, norm,
                             1, 1, diff, 1, 1 );

  cubed_len = 1.0 / 3.0 *
              FLA_Obj_length( A ) *
              FLA_Obj_length( A ) *
              FLA_Obj_length( A );

  *gflops = cubed_len / best_time / 1e9;

  if ( FLA_Obj_is_complex( A ) )
  {
    *gflops *= 4.0;
  }

  *dtime = best_time;

  // Hand the operands back exactly as they were received.
  FLA_Copy_external( A_backup, A );
  FLA_Copy_external( b_backup, b );
  FLA_Copy_external( b_orig_backup, b_orig );

  FLA_Obj_free( &A_backup );
  FLA_Obj_free( &b_backup );
  FLA_Obj_free( &b_orig_backup );
}