コード例 #1
0
ファイル: FLA_Gemm_nn_blk_var1.c プロジェクト: flame/libflame
FLA_Error FLA_Gemm_nt_blk_var1_ht( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, FLA_Gemm_t* cntl )
{
  FLA_Obj AT,              A0,
          AB,              A1,
                           A2;

  FLA_Obj CT,              C0,
          CB,              C1,
                           C2;

  int b;

  FLA_Part_2x1( A,    &AT, 
                      &AB,            0, FLA_TOP );

  FLA_Part_2x1( C,    &CT, 
                      &CB,            0, FLA_TOP );

  while ( FLA_Obj_length( AT ) < FLA_Obj_length( A ) ){

    b = 1;

    FLA_Repart_2x1_to_3x1( AT,                &A0, 
                        /* ** */            /* ** */
                                              &A1, 
                           AB,                &A2,        b, FLA_BOTTOM );

    FLA_Repart_2x1_to_3x1( CT,                &C0, 
                        /* ** */            /* ** */
                                              &C1, 
                           CB,                &C2,        b, FLA_BOTTOM );

    /*------------------------------------------------------------*/

    /* C1 = alpha * A1 * B' + C1;   */
    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, 
                       alpha, *FLASH_OBJ_PTR_AT( A1 ), *FLASH_OBJ_PTR_AT( B ), beta, *FLASH_OBJ_PTR_AT( C1 ), 
                       NULL );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x1_to_2x1( &AT,                A0, 
                                                  A1, 
                            /* ** */           /* ** */
                              &AB,                A2,     FLA_TOP );

    FLA_Cont_with_3x1_to_2x1( &CT,                C0, 
                                                  C1, 
                            /* ** */           /* ** */
                              &CB,                C2,     FLA_TOP );

  }

  return FLA_SUCCESS;
}
コード例 #2
0
ファイル: FLA_Gemm.c プロジェクト: pgawron/tlash
FLA_Error FLA_Gemm( FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C )
{
  FLA_Error r_val = FLA_SUCCESS;
  
  // Check parameters.
  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
    FLA_Gemm_check( transa, transb, alpha, A, B, beta, C );

#ifdef FLA_ENABLE_BLAS3_FRONT_END_CNTL_TREES
  r_val = FLA_Gemm_internal( transa, transb, alpha, A, B, beta, C, fla_gemm_cntl_mm_op );
#else
  r_val = FLA_Gemm_external( transa, transb, alpha, A, B, beta, C );
#endif

  return r_val;
}
コード例 #3
0
FLA_Error FLA_Syrk_ln_omp2l_var1( FLA_Obj A, FLA_Obj C )
{
  FLA_Obj AT,              A0,
          AB,              A1,
                           A2;

  FLA_Obj CTL,   CTR,      C00, C01, C02, 
          CBL,   CBR,      C10, C11, C12,
                           C20, C21, C22;

  int b;

  FLA_Part_2x1( A,    &AT, 
                      &AB,            0, FLA_TOP );

  FLA_Part_2x2( C,    &CTL, &CTR,
                      &CBL, &CBR,     0, 0, FLA_TL );

  #pragma intel omp parallel taskq
  {
  while ( FLA_Obj_length( AT ) < FLA_Obj_length( A ) ){

    b = FLA_Task_compute_blocksize( 0, A, AT, FLA_TOP );

    FLA_Repart_2x1_to_3x1( AT,                &A0, 
                        /* ** */            /* ** */
                                              &A1, 
                           AB,                &A2,        b, FLA_BOTTOM );

    FLA_Repart_2x2_to_3x3( CTL, /**/ CTR,       &C00, /**/ &C01, &C02,
                        /* ************* */   /* ******************** */
                                                &C10, /**/ &C11, &C12,
                           CBL, /**/ CBR,       &C20, /**/ &C21, &C22,
                           b, b, FLA_BR );

    /*------------------------------------------------------------*/

    #pragma intel omp task captureprivate(A1, A0, C10)
    {
    /* C10 = C10 + A1 * A0' */
    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, A1, A0, FLA_ONE, C10 );
    }

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x1_to_2x1( &AT,                A0, 
                                                  A1, 
                            /* ** */           /* ** */
                              &AB,                A2,     FLA_TOP );

    FLA_Cont_with_3x3_to_2x2( &CTL, /**/ &CTR,       C00, C01, /**/ C02,
                                                     C10, C11, /**/ C12,
                            /* ************** */  /* ****************** */
                              &CBL, /**/ &CBR,       C20, C21, /**/ C22,
                              FLA_TL );

  }

  FLA_Part_2x1( A,    &AT, 
                      &AB,            0, FLA_TOP );

  FLA_Part_2x2( C,    &CTL, &CTR,
                      &CBL, &CBR,     0, 0, FLA_TL );

  while ( FLA_Obj_length( AT ) < FLA_Obj_length( A ) ){

    b = FLA_Task_compute_blocksize( 0, A, AT, FLA_TOP );

    FLA_Repart_2x1_to_3x1( AT,                &A0, 
                        /* ** */            /* ** */
                                              &A1, 
                           AB,                &A2,        b, FLA_BOTTOM );

    FLA_Repart_2x2_to_3x3( CTL, /**/ CTR,       &C00, /**/ &C01, &C02,
                        /* ************* */   /* ******************** */
                                                &C10, /**/ &C11, &C12,
                           CBL, /**/ CBR,       &C20, /**/ &C21, &C22,
                           b, b, FLA_BR );

    /*------------------------------------------------------------*/

    #pragma intel omp task captureprivate(C11, A1)
    {
    /* C11 = C11 + A1 * A1' */
    FLA_Syrk_external( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_ONE, A1, FLA_ONE, C11 );
    }

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x1_to_2x1( &AT,                A0, 
                                                  A1, 
                            /* ** */           /* ** */
                              &AB,                A2,     FLA_TOP );

    FLA_Cont_with_3x3_to_2x2( &CTL, /**/ &CTR,       C00, C01, /**/ C02,
                                                     C10, C11, /**/ C12,
                            /* ************** */  /* ****************** */
                              &CBL, /**/ &CBR,       C20, C21, /**/ C22,
                              FLA_TL );

  }
  }
  return FLA_SUCCESS;
}
コード例 #4
0
FLA_Error REF_Svdd_uv_components( FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V,
                                  double* dtime_bred, double* dtime_bsvd, double* dtime_appq,
                                  double* dtime_qrfa, double* dtime_gemm )
/*
{
  *dtime_bred = 1;
  *dtime_bsvd = 1;
  *dtime_appq = 1;
  *dtime_qrfa = 1;
  *dtime_gemm = 1;

  return FLA_Svdd_external( FLA_SVD_VECTORS_ALL, A, s, U, V );
}
*/

{
  FLA_Datatype dt_A;
  FLA_Datatype dt_A_real;
  dim_t        m_A, n_A;
  dim_t        min_m_n;
  FLA_Obj      tq, tu, tv, d, e, Ur, Vr, W;
  FLA_Obj      eT, epsilonB;
  FLA_Uplo     uplo = FLA_UPPER_TRIANGULAR;
  double       crossover_ratio = 16.0 / 10.0;
  double       dtime_temp;

  dt_A      = FLA_Obj_datatype( A );
  dt_A_real = FLA_Obj_datatype_proj_to_real( A );
  m_A       = FLA_Obj_length( A );
  n_A       = FLA_Obj_width( A );

  min_m_n   = FLA_Obj_min_dim( A );

  FLA_Obj_create( dt_A,      min_m_n, 1,   0, 0, &tq );
  FLA_Obj_create( dt_A,      min_m_n, 1,   0, 0, &tu );
  FLA_Obj_create( dt_A,      min_m_n, 1,   0, 0, &tv );
  FLA_Obj_create( dt_A_real, min_m_n, 1,   0, 0, &d );
  FLA_Obj_create( dt_A_real, min_m_n, 1,   0, 0, &e );
  FLA_Obj_create( dt_A_real, n_A,     n_A, 0, 0, &Ur );
  FLA_Obj_create( dt_A_real, n_A,     n_A, 0, 0, &Vr );


  FLA_Part_2x1( e,   &eT,
                     &epsilonB,    1, FLA_BOTTOM );

  if ( m_A >= n_A )
  {
    if ( m_A < crossover_ratio * n_A )
    {
      dtime_temp = FLA_Clock();
      {
        // Reduce to bidiagonal form.
        FLA_Bidiag_blk_external( A, tu, tv );
        FLA_Bidiag_UT_extract_diagonals( A, d, eT );
      }
      *dtime_bred = FLA_Clock() - dtime_temp;


      dtime_temp = FLA_Clock();
      {
        // Divide-and-conquor algorithm.
        FLA_Bsvdd_external( uplo, d, e, Ur, Vr );
      }
      *dtime_bsvd = FLA_Clock() - dtime_temp;


      dtime_temp = FLA_Clock();
      {
        // Form U.
        FLA_Copy_external( Ur, U );
        FLA_Bidiag_apply_U_external( FLA_LEFT, FLA_NO_TRANSPOSE, A, tu, U );

        // Form V.
        FLA_Copy_external( Vr, V );
        FLA_Bidiag_apply_V_external( FLA_RIGHT, FLA_CONJ_TRANSPOSE, A, tv, V );
      }
      *dtime_appq = FLA_Clock() - dtime_temp;


      *dtime_qrfa = 0.0;
      *dtime_gemm = 0.0;
    }
    else
    {
      FLA_Obj AT,
              AB;
      FLA_Obj UL, UR;

      FLA_Part_2x1( A,   &AT,
                         &AB,        n_A, FLA_TOP );
      FLA_Part_1x2( U,   &UL, &UR,   n_A, FLA_LEFT );

      // Create a temporary n-by-n matrix R.
      FLA_Obj_create( dt_A, n_A, n_A, 0, 0, &W );

      dtime_temp = FLA_Clock();
      {
        // Perform a QR factorization.
        FLA_QR_blk_external( A, tq );
        FLA_Copyr_external( FLA_LOWER_TRIANGULAR, A, UL );
        FLA_Setr( FLA_LOWER_TRIANGULAR, FLA_ZERO, A );
      }
      *dtime_qrfa = FLA_Clock() - dtime_temp;


      dtime_temp = FLA_Clock();
      {
        // Form Q.
        FLA_QR_form_Q_external( U, tq );
      }
      *dtime_appq = FLA_Clock() - dtime_temp;


      dtime_temp = FLA_Clock();
      {
        // Reduce R to bidiagonal form.
        FLA_Bidiag_blk_external( AT, tu, tv );
        FLA_Bidiag_UT_extract_diagonals( A, d, eT );
      }
      *dtime_bred = FLA_Clock() - dtime_temp;


      dtime_temp = FLA_Clock();
      {
        // Divide-and-conquor algorithm.
        FLA_Bsvdd_external( uplo, d, e, Ur, Vr );
      }
      *dtime_bsvd = FLA_Clock() - dtime_temp;


      dtime_temp = FLA_Clock();
      {
        // Form U in W.
        FLA_Copy_external( Ur, W );
        FLA_Bidiag_apply_U_external( FLA_LEFT, FLA_NO_TRANSPOSE, AT, tu, W );

        // Form V.
        FLA_Copy_external( Vr, V );
        FLA_Bidiag_apply_V_external( FLA_RIGHT, FLA_CONJ_TRANSPOSE, AT, tv, V );
      }
      *dtime_appq += FLA_Clock() - dtime_temp;


      dtime_temp = FLA_Clock();
      {
        // Multiply R into U, storing the result in A and then copying
        // back to U.
        FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                           FLA_ONE, UL, W, FLA_ZERO, A );
        FLA_Copy( A, UL );
      }
      *dtime_gemm = FLA_Clock() - dtime_temp;


      // Free R.
      FLA_Obj_free( &W );
    }
  }
  else
  {
    FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED );
  }

  // Copy singular values to output vector.
  FLA_Copy( d, s );

  // Sort singular values and vectors.
  FLA_Sort_svd( FLA_BACKWARD, s, U, V );

  FLA_Obj_free( &tq );
  FLA_Obj_free( &tu );
  FLA_Obj_free( &tv );
  FLA_Obj_free( &d );
  FLA_Obj_free( &e );
  FLA_Obj_free( &Ur );
  FLA_Obj_free( &Vr );

  return FLA_SUCCESS;
}
コード例 #5
0
ファイル: REF_Gemm_tt.c プロジェクト: anaptyxis/libflame
FLA_Error REF_Gemm( FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C )
{
  FLA_Gemm_external( transa, transb, alpha, A, B, beta, C );

  return 0;
}
コード例 #6
0
ファイル: time_Gemm.c プロジェクト: anaptyxis/libflame
void time_Gemm( 
               int param_combo, int type, int nrepeats, int m, int k, int n,
               FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref,
               double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_old = 1.0e9;

  FLA_Obj
    C_old;

  if ( param_combo != 4 )
  {
    *gflops = 0.0;
    *diff   = 0.0;
    return;
  }

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_old );

  FLA_Copy_external( C, C_old );


  for ( irep = 0 ; irep < nrepeats; irep++ ){
    FLA_Copy_external( C_old, C );

    *dtime = FLA_Clock();

    switch( param_combo ){

    // Time parameter combination 0
    case 0:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Gemm( FLA_CONJ_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      case FLA_ALG_FRONT:
        FLA_Gemm( FLA_CONJ_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    // Time parameter combination 1
    case 1:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Gemm( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      case FLA_ALG_FRONT:
        FLA_Gemm( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    // Time parameter combination 2
    case 2:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Gemm( FLA_CONJ_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      case FLA_ALG_FRONT:
        FLA_Gemm( FLA_CONJ_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    // Time parameter combination 3
    case 3:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      case FLA_ALG_FRONT:
        FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    // Time parameter combination 4
    case 4:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      case FLA_ALG_FRONT:
        //FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        //FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ONE, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    // Time parameter combination 5
    case 5:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Gemm( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      case FLA_ALG_FRONT:
        FLA_Gemm( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    // Time parameter combination 6
    case 6:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Gemm( FLA_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      case FLA_ALG_FRONT:
        FLA_Gemm( FLA_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    // Time parameter combination 7
    case 7:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Gemm( FLA_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      case FLA_ALG_FRONT:
        FLA_Gemm( FLA_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    // Time parameter combination 8
    case 8:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Gemm( FLA_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      case FLA_ALG_FRONT:
        FLA_Gemm( FLA_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    }
	
    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }


/*
  if ( type == FLA_ALG_REFERENCE )
  {
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }
*/

  *gflops = 2.0 * m * k * n / 
            dtime_old / 
            1.0e9;

  if ( param_combo == 0 ||
       param_combo == 1 ||
       param_combo == 2 ||
       param_combo == 3 ||
       param_combo == 6 )
  *gflops *= 4.0;

  *dtime = dtime_old;

  FLA_Copy_external( C_old, C );

  FLA_Obj_free( &C_old );
}
コード例 #7
0
FLA_Error FLA_Gemm_nn_omp_var15( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj C, fla_gemm_t* cntl )
{
  FLA_Obj AT,              A0,
          AB,              A1,
                           A2;

  FLA_Obj CT,              C0,
          CB,              C1,
                           C2;

  FLA_Obj AL,    AR,       A10,  A11,  A12;

  FLA_Obj BT,              B0,
          BB,              B1,
                           B2;
  FLA_Obj C1_local;

  int i, j, lock_ldim, lock_i;
  int b_m, b_k;

  FLA_Part_2x1( A,    &AT, 
                      &AB,            0, FLA_TOP );

  FLA_Part_2x1( C,    &CT, 
                      &CB,            0, FLA_TOP );

  #pragma intel omp parallel taskq
  {
  while ( FLA_Obj_length( AT ) < FLA_Obj_length( A ) )
  {
    b_m = FLA_Determine_blocksize( A, AT, FLA_TOP, FLA_Cntl_blocksize( cntl ) );

    FLA_Repart_2x1_to_3x1( AT,                &A0, 
                        /* ** */            /* ** */
                                              &A1, 
                           AB,                &A2,        b_m, FLA_BOTTOM );

    FLA_Repart_2x1_to_3x1( CT,                &C0, 
                        /* ** */            /* ** */
                                              &C1, 
                           CB,                &C2,        b_m, FLA_BOTTOM );

    /*------------------------------------------------------------*/

    /* C1 = alpha * A1 * B + C1; */
  
    FLA_Part_1x2( A1,   &AL,  &AR,      0, FLA_LEFT );

    FLA_Part_2x1( B,    &BT, 
                        &BB,            0, FLA_TOP );
  
    while ( FLA_Obj_width( AL ) < FLA_Obj_width( A ) )
    {
      b_k = FLA_Determine_blocksize( A, AL, FLA_LEFT, FLA_Cntl_blocksize( cntl ) );

      // Get the index of the current partition.
      // FIX THIS: need + b_m - 1 or something like this
      //j = FLA_Obj_length( CT ) / b_m;
      //i = FLA_Obj_width( AL ) / b_k;
      //lock_ldim = FLA_get_num_threads_in_m_dim(omp_get_num_threads());
      lock_i = FLA_Obj_length( CT ) / b_m;
  
      FLA_Repart_1x2_to_1x3( AL,  /**/ AR,        &A10, /**/ &A11, &A12,
                             b_k, FLA_RIGHT );

      FLA_Repart_2x1_to_3x1( BT,                &B0, 
                          /* ** */            /* ** */
                                                &B1, 
                             BB,                &B2,        b_k, FLA_BOTTOM );
  
      /*------------------------------------------------------------*/
  
      /*    C1 = alpha * A11 * B1 + C1; */
      //// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
      ////           alpha, A11, B1, FLA_ONE, C1 );

      #pragma intel omp task captureprivate( lock_i, A11, B1, C1 ), private( C1_local )
      {
      FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C1, &C1_local );
      FLA_Obj_set_to_zero( C1_local );

      /*    C1_local = alpha * A1 * B11 + C1_local; */
      FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                         alpha, A11, B1, FLA_ONE, C1_local );

      // Acquire lock[i] (the lock for C1).
      omp_set_lock( &fla_omp_lock[lock_i] );

      /* C1 += C1_local */
      FLA_Axpy_external( FLA_ONE, C1_local, C1 );
      //FLA_Axpy_sync_pipeline2( j*lock_ldim, FLA_ONE, C1_local, C1 );
      //FLA_Axpy_sync_circular2( j*lock_ldim, i, FLA_ONE, C1_local, C1 );
      //REF_Axpy_sync_circular2( j*lock_ldim, i, FLA_ONE, C1_local, C1 );

      // Release lock[i] (the lock for C1).
      omp_unset_lock( &fla_omp_lock[lock_i] );

      FLA_Obj_free( &C1_local );
      }
  
      /*------------------------------------------------------------*/
  
      FLA_Cont_with_1x3_to_1x2( &AL,  /**/ &AR,        A10, A11, /**/ A12,
                                FLA_LEFT );

      FLA_Cont_with_3x1_to_2x1( &BT,                B0, 
                                                    B1, 
                              /* ** */           /* ** */
                                &BB,                B2,     FLA_TOP );
    }
  
    /*------------------------------------------------------------*/

    FLA_Cont_with_3x1_to_2x1( &AT,                A0, 
                                                  A1, 
                            /* ** */           /* ** */
                              &AB,                A2,     FLA_TOP );

    FLA_Cont_with_3x1_to_2x1( &CT,                C0, 
                                                  C1, 
                            /* ** */           /* ** */
                              &CB,                C2,     FLA_TOP );
  }
  }

  return FLA_SUCCESS;
}
コード例 #8
0
ファイル: FLA_Gemm_task.c プロジェクト: anaptyxis/libflame
FLA_Error FLA_Gemm_task( FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl )
{
  return FLA_Gemm_external( transa, transb, alpha, A, B, beta, C );
}
コード例 #9
0
FLA_Error FLA_Gemm_nn_omp_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj C, fla_gemm_t* cntl )
{
  FLA_Obj AL,    AR,       A0,  A1,  A2;

  FLA_Obj BT,              B0,
          BB,              B1,
                           B2;
  FLA_Obj C_local;

  int b;

  FLA_Part_1x2( A,    &AL,  &AR,      0, FLA_LEFT );

  FLA_Part_2x1( B,    &BT, 
                      &BB,            0, FLA_TOP );

  #pragma intel omp parallel taskq
  {
  while ( FLA_Obj_width( AL ) < FLA_Obj_width( A ) ){

    b = FLA_Determine_blocksize( A, AL, FLA_LEFT, FLA_Cntl_blocksize( cntl ) );
    //b = min( FLA_Obj_width( AR ), nb_alg );

    FLA_Repart_1x2_to_1x3( AL,  /**/ AR,        &A0, /**/ &A1, &A2,
                           b, FLA_RIGHT );

    FLA_Repart_2x1_to_3x1( BT,                &B0, 
                        /* ** */            /* ** */
                                              &B1, 
                           BB,                &B2,        b, FLA_BOTTOM );

    /*------------------------------------------------------------*/

    #pragma intel omp task captureprivate(A1,B1) private(C_local)
    {
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_local );
    FLA_Obj_set_to_zero( C_local );

    /* C = alpha * A1 * B1 + C; */
    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                       alpha, A1, B1, FLA_ONE, C_local );

    REF_Axpy_sync_circular( FLA_ONE, C_local, C );

    FLA_Obj_free( &C_local );
    }

    /*------------------------------------------------------------*/

    FLA_Cont_with_1x3_to_1x2( &AL,  /**/ &AR,        A0, A1, /**/ A2,
                              FLA_LEFT );

    FLA_Cont_with_3x1_to_2x1( &BT,                B0, 
                                                  B1, 
                            /* ** */           /* ** */
                              &BB,                B2,     FLA_TOP );

  }
  }

  return FLA_SUCCESS;
}
コード例 #10
0
ファイル: lq.c プロジェクト: anaptyxis/libflame
int main( int argc, char** argv ) {
  FLA_Datatype datatype = TESTTYPE;
  FLA_Obj      A, Ak, T, Tk, D, Dk, A_copy, A_recovered, L, Q, Qk, W, x, y, z;
  dim_t        m, n, k;
  dim_t        min_m_n;
  FLA_Error    init_result; 
  double       residual_A, residual_Axy;
  int          use_form_q = 1;

  if ( argc == 4 ) {
    m = atoi(argv[1]);
    n = atoi(argv[2]);
    k = atoi(argv[3]);
    min_m_n = min(m,n);
  } else {
    fprintf(stderr, "       \n");
    fprintf(stderr, "Usage: %s m n k\n", argv[0]);
    fprintf(stderr, "       m : matrix length\n");
    fprintf(stderr, "       n : matrix width\n");
    fprintf(stderr, "       k : number of house holder vectors applied for testing\n");
    fprintf(stderr, "       \n");
    return -1;
  }
  if ( m == 0 || n == 0 )
    return 0;

  FLA_Init_safe( &init_result );          

  // FLAME LQ^H setup
  FLA_Obj_create( datatype, m, n, 0, 0, &A );
  FLA_LQ_UT_create_T( A, &T );
  
  // Rand A and create A_copy.
  FLA_Random_matrix( A ); 

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_copy );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_recovered );
  FLA_Copy( A, A_copy );

  // LQ test ( A = L Q^H )
  FLA_LQ_UT( A, T );

  // Create Q (identity), L (A_copy)
  FLA_Obj_create( datatype, m, n, 0, 0, &Q  ); FLA_Set_to_identity( Q  );
  FLA_Obj_create( datatype, m, m, 0, 0, &D  );

  FLA_Obj_create( datatype, k, n, 0, 0, &Qk ); FLA_Set_to_identity( Qk );
  FLA_Obj_create( datatype, k, k, 0, 0, &Dk  );

  
  FLA_Obj_create( datatype, m, m, 0, 0, &L );

  // Q^H := I H_{0}^H ... H_{k-1}^H
  if ( use_form_q ) {
    FLA_LQ_UT_form_Q( A, T, Q );
  } else {
    FLA_Apply_Q_UT_create_workspace_side( FLA_RIGHT, T, Q, &W );
    FLA_Apply_Q_UT( FLA_RIGHT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE,
                    A, T, W, Q );
    FLA_Obj_free( &W );
  }

  // D := Q^T Q
  FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, 
                     FLA_ONE, Q, Q, FLA_ZERO, D ); 

  // Qk := I H0 ... Hk
  FLA_Part_1x2( T, &Tk, &W, k, FLA_LEFT );
  FLA_Part_2x1( A, &Ak, &W, k, FLA_TOP );

  if ( use_form_q ) {
    // Overwrite the result to test FLAME API
    FLA_Set( FLA_ZERO, Qk );
    FLA_Copy( Ak, Qk );
    FLA_LQ_UT_form_Q( Ak, Tk, Qk );
  } else {
    FLA_Apply_Q_UT_create_workspace( Tk, Qk, &W );
    FLA_Apply_Q_UT( FLA_LEFT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE,
                    Ak, Tk, W, Qk );
    FLA_Obj_free( &W );
  }

  // Dk := Qk^T Qk
  FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, 
                     FLA_ONE, Qk, Qk, FLA_ZERO, Dk ); 

  // L := A (Q^H)^H
  if ( use_form_q ) {
    // Note that the formed Q is actually Q^H; transb should be carefully assigned.
    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, 
                       FLA_ONE, A_copy, Q, FLA_ZERO, L );
  } else {
    FLA_Apply_Q_UT_create_workspace( T, L, &W );
    FLA_Apply_Q_UT( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE,
                    A, T, W, L );
    FLA_Obj_free( &W );
  }
  FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, 
                     FLA_ONE, L, Q, FLA_ZERO, A_recovered ); 

  // Create vectors for testing
  FLA_Obj_create( datatype, n, 1, 0, 0, &x ); FLA_Set( FLA_ZERO, x );
  FLA_Obj_create( datatype, m, 1, 0, 0, &y ); FLA_Set( FLA_ZERO, y );
  FLA_Obj_create( datatype, m, 1, 0, 0, &z ); FLA_Set( FLA_ZERO, z );

  // x is given
  FLA_Set( FLA_ONE, x );

  // y := Ax
  FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, A_copy, x, FLA_ZERO, y );
  
  // z := L (Q^H) x , libflame
  FLA_Apply_Q_UT_create_workspace( T, x, &W );
  FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE,
                  A, T, W, x );
  FLA_Obj_free( &W );

  if ( m < n )
    FLA_Part_2x1( x, &x, &W, m, FLA_TOP );
  else 
    FLA_Part_1x2( L, &L, &W, n, FLA_LEFT );    

  FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, L, x, FLA_ZERO, z );
  
  // Comapre (A_copy, A_recovered), (y,z) and (y,w)
  residual_A    = FLA_Max_elemwise_diff( A_copy, A_recovered );
  residual_Axy  = FLA_Max_elemwise_diff( y, z );

  if ( 1 || residual_A > EPS || residual_Axy > EPS ) {
    FLA_Obj_fshow( stdout, " - Given - ", A_copy, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Factor - ", A, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - T - ", T, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Q - ", Q, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - D = Q^T Q - ", D, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Qk - ", Qk, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Dk = Qk^T Qk - ", Dk, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - L - ", L, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Recovered A - ", A_recovered, "% 6.4e", "------");
    fprintf( stdout, "lapack2flame: %lu x %lu, %lu: ", m, n, k);
    fprintf( stdout, "| A - A_recovered | = %12.10e, | Ax - y | = %12.10e\n\n",
             residual_A, residual_Axy ) ;
  }
  
  FLA_Obj_free( &A );
  FLA_Obj_free( &T );

  FLA_Obj_free( &A_copy );
  FLA_Obj_free( &A_recovered );
  FLA_Obj_free( &L );
  FLA_Obj_free( &Q );
  FLA_Obj_free( &Qk );
  FLA_Obj_free( &D );
  FLA_Obj_free( &Dk );

  FLA_Obj_free( &x );
  FLA_Obj_free( &y );
  FLA_Obj_free( &z );
                   
  FLA_Finalize_safe( init_result );     
}
コード例 #11
0
ファイル: time_Hess_UT.c プロジェクト: anaptyxis/libflame
void time_Hess_UT(
                 int variant, int type, int nrepeats, int m,
                 FLA_Obj A, FLA_Obj A_ref, FLA_Obj t, FLA_Obj T, FLA_Obj W,
                 double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_old = 1.0e9;

  FLA_Obj
    A_save, norm;


  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );

  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

  FLA_Copy_external( A, A_save );

  for ( irep = 0 ; irep < nrepeats; irep++ ){

    FLA_Copy_external( A_save, A );

    *dtime = FLA_Clock();

    switch( variant ){

    case 0:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Hess_UT( A, t );
        break;
      case FLA_ALG_FRONT:
        FLA_Hess_UT( A, T );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );

  }

  //if ( type == FLA_ALG_REFERENCE )
  //{
  //  ;
  //}
  //else
  {
    FLA_Obj    AT, AB;
    FLA_Obj Q, QT, QB;
    FLA_Obj E, ET, EB;
    FLA_Obj F;
    dim_t   m_A, m_T;

    m_A = FLA_Obj_length( A );
    m_T = FLA_Obj_length( T );

    FLA_Obj_create( FLA_Obj_datatype( A ), m_A, m_A, 0, 0, &Q );
    FLA_Set_to_identity( Q );

    FLA_Part_2x1( Q,    &QT,
                        &QB,    1, FLA_TOP );
    FLA_Part_2x1( A,    &AT,
                        &AB,    1, FLA_TOP );

    if ( type == FLA_ALG_REFERENCE )
    {
      if ( FLA_Obj_is_real( A ) )
        FLA_Apply_Q_blk_external( FLA_LEFT, FLA_TRANSPOSE, FLA_COLUMNWISE, AB, t, QB );
      else
        FLA_Apply_Q_blk_external( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_COLUMNWISE, AB, t, QB );
    }
    else
      FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, AB, T, W, QB );

    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &E );
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &F );

    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                       FLA_ONE, A_save, Q, FLA_ZERO, E );
    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                       FLA_ONE, Q, E, FLA_ZERO, F );

    FLA_Copy( A, E );
    FLA_Part_2x1( E,    &ET,
                        &EB,    1, FLA_TOP );
    FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, EB );

    *diff = FLA_Max_elemwise_diff( E, F );

    FLA_Obj_free( &Q );
    FLA_Obj_free( &E );
    FLA_Obj_free( &F );
  }

  *gflops = ( 10.0 / 3.0 * m * m * m ) /
            dtime_old / 1e9;
  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 4.0;

  *dtime = dtime_old;

  FLA_Copy_external( A_save, A );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &norm );
}
コード例 #12
0
void time_Apply_Q_UT_lnfc(
               int variant, int type, int n_repeats, int m, int n, int nb_alg,
               FLA_Obj A, FLA_Obj A_orig, FLA_Obj t, FLA_Obj T, FLA_Obj s, FLA_Obj S, FLA_Obj B,
               double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_old = 1.0e9;

  FLA_Obj
    A_save, A_orig_save, B_save, norm;

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_orig_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, B, &B_save );

  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

  FLA_Copy_external( A, A_save );
  FLA_Copy_external( A, A_orig_save );
  FLA_Copy_external( B, B_save );

  for ( irep = 0 ; irep < n_repeats; irep++ ){

    FLA_Copy_external( A_save, A );
    FLA_Copy_external( A_orig_save, A_orig );
    FLA_Copy_external( B_save, B );

    *dtime = FLA_Clock();

    switch( variant )
    {

    case 0:
      REF_Apply_Q_UT_lnfc( A, t, B );
      //REF_Bidiag_form_U_blk_external( FLA_LEFT, FLA_NO_TRANSPOSE, A, t, B );
      //FLA_Bidiag_blk_external( A_orig, t, s );
      //REF_Bidiag_form_U_blk_external( FLA_LEFT, FLA_NO_TRANSPOSE, A_orig, t, B );
      break;

    case 1:
    {
      // Time variant 1 
      switch( type ){
      case FLA_ALG_BLOCKED:
        //FLA_Apply_Q_UT( FLA_LEFT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, A, T, W, B );
        FLA_QR_UT_form_Q( A, T, B );
        //FLA_Bidiag_UT_form_U( A, T, B );
        //FLA_Bidiag_UT( A_orig, T, S );
        //FLA_Bidiag_UT_form_U( A_orig, T, B );
        break;
      }

      break;
    }


    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );

  }



/*
  if ( variant == 0 )
  {
    FLA_Copy_external( b, b_ref );
    if ( FLA_Obj_is_real( A ) )
      FLA_Apply_Q_blk_external( FLA_LEFT, FLA_TRANSPOSE, FLA_COLUMNWISE, A, t, b );
    else
      FLA_Apply_Q_blk_external( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_COLUMNWISE, A, t, b );
    FLA_Trsm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE,
                       FLA_NONUNIT_DIAG, FLA_ONE, A, b );
    FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A_save, b, FLA_ONE, b_ref );
    FLA_Nrm2_external( b_ref, norm );
    if ( FLA_Obj_is_single_precision( A ) )
      *diff = *(FLA_FLOAT_PTR(norm));
    else
      *diff = *(FLA_DOUBLE_PTR(norm));
  }
  else
*/
  {
    FLA_Obj_set_to_identity( A );
//FLA_Obj_show( "B", B, "%8.1e %8.1e ", "" );
    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                       FLA_ONE, B, B, FLA_MINUS_ONE, A );
    FLA_Norm_frob( A, norm );
    FLA_Obj_extract_real_scalar( norm, diff );

  }
/*
  *gflops = 2.0 * n * n * ( m - n / 3.0 ) /
            dtime_old / 1e9;
  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 4.0;
*/

    *gflops = ( 4.0 * ( 2.0           * m * n * n - 2.0 / 3.0 * n * n * n ) +
                4.0 * ( 4.0 / 3.0     * m * m * m ) +
                4.0 * ( 4.0 / 3.0     * n * n * n ) +
                      ( 13.0      * 2 * m * m     ) +
                2.0 * (       3.0 * 2 * m * m * m ) +
                2.0 * (       3.0 * 2 * n * n * n ) ) /
              dtime_old / 1e9;

  *dtime = dtime_old;

  FLA_Copy_external( A_save, A );
  FLA_Copy_external( A_orig_save, A_orig );
  FLA_Copy_external( B_save, B );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &A_orig_save );
  FLA_Obj_free( &B_save );
  FLA_Obj_free( &norm );
}
コード例 #13
0
FLA_Error FLA_QR_UT_piv_blk_var2( FLA_Obj A, FLA_Obj T, FLA_Obj w, FLA_Obj p, fla_qrut_t* cntl )
{
  FLA_Obj ATL,   ATR,      A00, A01, A02, 
          ABL,   ABR,      A10, A11, A12,
                           A20, A21, A22;

  FLA_Obj TL,    TR,       T0,  T1,  W12;
  FLA_Obj TT,    TB;

  FLA_Obj pT,              p0,
          pB,              p1,
                           p2;

  FLA_Obj wT,              w0,
          wB,              w1,
                           w2;

  dim_t   b_alg, b;

  // Query the algorithmic blocksize by inspecting the length of T.
  b_alg = FLA_Obj_length( T );

  FLA_Part_2x2( A,    &ATL, &ATR,
                      &ABL, &ABR,     0, 0, FLA_TL );

  FLA_Part_1x2( T,    &TL,  &TR,      0, FLA_LEFT );

  FLA_Part_2x1( p,    &pT, 
                      &pB,            0, FLA_TOP );

  FLA_Part_2x1( w,    &wT, 
                      &wB,            0, FLA_TOP );

  while ( FLA_Obj_min_dim( ABR ) > 0 ){

    b = min( b_alg, FLA_Obj_min_dim( ABR ) );

    FLA_Repart_2x2_to_3x3( ATL, /**/ ATR,       &A00, /**/ &A01, &A02,
                        /* ************* */   /* ******************** */
                                                &A10, /**/ &A11, &A12,
                           ABL, /**/ ABR,       &A20, /**/ &A21, &A22,
                           b, b, FLA_BR );

    FLA_Repart_1x2_to_1x3( TL,  /**/ TR,        &T0, /**/ &T1, &W12,
                           b, FLA_RIGHT );

    FLA_Repart_2x1_to_3x1( pT,                &p0, 
                        /* ** */            /* ** */
                                              &p1, 
                           pB,                &p2,        b, FLA_BOTTOM );

    FLA_Repart_2x1_to_3x1( wT,                &w0, 
                        /* ** */            /* ** */
                                              &w1, 
                           wB,                &w2,        b, FLA_BOTTOM );

    /*------------------------------------------------------------*/

    // ** Reshape T matrices to match the blocksize b
    FLA_Part_2x1( TR,   &TT, 
                        &TB,    b, FLA_TOP );

    // ** Perform a unblocked (BLAS2-oriented) QR factorization 
    // with pivoting via the UT transform on ABR:
    //
    //   ABR  -> QB1 R11
    //
    // where:
    //  - QB1 is formed from UB1 (which is stored column-wise below the
    //    diagonal of ( A11 A21 )^T and the upper-triangle of T1. 
    //  - R11 is stored to ( A11 A12 ).
    //  - W12 stores  T and partial updates for FLA_Apply_Q_UT_piv_var.
    FLA_QR_UT_piv_internal( ABR, TT, wB, p1, 
                            FLA_Cntl_sub_qrut( cntl ) );

    if ( FLA_Obj_width( A12 ) > 0 )
    {
      // ** Block update
      FLA_Part_2x1( W12,  &TT, 
                          &TB,    b, FLA_TOP );
 
      FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, 
                         FLA_MINUS_ONE, A21, TT, FLA_ONE, A22 );
    }

    // ** Apply pivots to previous columns.
    FLA_Apply_pivots( FLA_RIGHT, FLA_TRANSPOSE, p1, ATR );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR,       A00, A01, /**/ A02,
                                                     A10, A11, /**/ A12,
                            /* ************** */  /* ****************** */
                              &ABL, /**/ &ABR,       A20, A21, /**/ A22,
                              FLA_TL );

    FLA_Cont_with_1x3_to_1x2( &TL,  /**/ &TR,        T0, T1, /**/ W12,
                              FLA_LEFT );

    FLA_Cont_with_3x1_to_2x1( &pT,                p0, 
                                                  p1, 
                            /* ** */           /* ** */
                              &pB,                p2,     FLA_TOP );

    FLA_Cont_with_3x1_to_2x1( &wT,                w0, 
                                                  w1, 
                            /* ** */           /* ** */
                              &wB,                w2,     FLA_TOP );
  }

  return FLA_SUCCESS;
}
コード例 #14
0
ファイル: bidiag_ext.c プロジェクト: anaptyxis/libflame
int main( int argc, char** argv ) {
  FLA_Datatype datatype = TESTTYPE;
  FLA_Datatype realtype = REALTYPE;
  FLA_Obj      
    A, TU, TV, 
    A_copy, A_recovered,
    U, V, Vb, B, Be, d, e, 
    DU, DV;

  FLA_Obj     
    ATL, ATR,
    ABL, ABR, Ae;

  FLA_Uplo     uplo;
  dim_t        m, n, min_m_n;
  FLA_Error    init_result; 

  double       residual_A = 0.0;

  if ( argc == 3 ) {
    m = atoi(argv[1]);
    n = atoi(argv[2]);
    min_m_n = min(m,n);
  } else {
    fprintf(stderr, "       \n");
    fprintf(stderr, "Usage: %s m n\n", argv[0]);
    fprintf(stderr, "       m : matrix length\n");
    fprintf(stderr, "       n : matrix width\n");
    fprintf(stderr, "       \n");
    return -1;
  }
  if ( m == 0 || n == 0 )
    return 0;

  FLA_Init_safe( &init_result );          

  // FLAME Bidiag setup
  FLA_Obj_create( datatype, m, n, 0, 0, &A );
  FLA_Bidiag_UT_create_T( A, &TU, &TV );

  // Rand A and create A_copy.
  FLA_Random_matrix( A ); 
  {
    scomplex *buff_A = FLA_Obj_buffer_at_view( A );
    buff_A[0].real = 4.4011e-01; buff_A[0].imag = -4.0150e-09; buff_A[2].real = -2.2385e-01; buff_A[2].imag = -1.5546e-01; buff_A[4].real = -6.3461e-02; buff_A[4].imag = 2.7892e-01; buff_A[6].real = -1.3197e-01; buff_A[6].imag = 5.0888e-01;  
    buff_A[1].real = 3.3352e-01; buff_A[1].imag = -6.6346e-02; buff_A[3].real = -1.9307e-01; buff_A[3].imag = -8.4066e-02; buff_A[5].real = -6.0446e-03; buff_A[5].imag = 2.2094e-01; buff_A[7].real = -2.3299e-02; buff_A[7].imag = 4.0553e-01;
  }

  //FLA_Set_to_identity( A );
  //FLA_Scal( FLA_MINUS_ONE, A );

  if ( m >= n ) {
    uplo = FLA_UPPER_TRIANGULAR;
    FLA_Part_2x2( A, &ATL, &ATR,
                     &ABL, &ABR, min_m_n - 1, 1, FLA_TL );
    Ae = ATR; 
  } else {
    uplo = FLA_LOWER_TRIANGULAR;
    FLA_Part_2x2( A, &ATL, &ATR,
                     &ABL, &ABR, 1, min_m_n - 1, FLA_TL );
    Ae = ABL;
  }

  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_copy );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_recovered );

  // Bidiag test
  {
    FLA_Obj      norm;
    FLA_Bool     apply_scale;

    FLA_Obj_create( realtype, 1,1, 0,0, &norm );

    FLA_Max_abs_value( A, norm );
    apply_scale = FLA_Obj_gt( norm, FLA_OVERFLOW_SQUARE_THRES ); 

    if ( apply_scale ) FLA_Scal( FLA_SAFE_MIN, A );
    FLA_Bidiag_UT( A, TU, TV );
    if ( apply_scale ) FLA_Bidiag_UT_scale_diagonals( FLA_SAFE_INV_MIN, A ); 

    FLA_Obj_free( &norm );
  }


  // Orthonomal basis U, V. 
  FLA_Obj_create( datatype, m, min_m_n, 0, 0, &U ); FLA_Set( FLA_ZERO, U );
  FLA_Obj_create( datatype, min_m_n, n, 0, 0, &V ); FLA_Set( FLA_ZERO, V );

  FLA_Bidiag_UT_form_U_ext( uplo, A, TU, FLA_NO_TRANSPOSE,   U );
  FLA_Bidiag_UT_form_V_ext( uplo, A, TV, FLA_CONJ_TRANSPOSE, V ); 

  if ( FLA_Obj_is_complex( A ) ){
    FLA_Obj rL, rR;
    
    FLA_Obj_create( datatype, min_m_n, 1, 0, 0, &rL );
    FLA_Obj_create( datatype, min_m_n, 1, 0, 0, &rR );

    FLA_Obj_fshow( stdout, " - Factor no realified - ", A, "% 6.4e", "------");
    FLA_Bidiag_UT_realify( A, rL, rR );
    FLA_Obj_fshow( stdout, " - Factor    realified - ", A, "% 6.4e", "------");

    FLA_Obj_fshow( stdout, " - rL - ", rL, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - rR - ", rR, "% 6.4e", "------");

    FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, rL, U );
    FLA_Apply_diag_matrix( FLA_LEFT,  FLA_CONJUGATE, rR, V );

    FLA_Obj_free( &rL );
    FLA_Obj_free( &rR );
  }

  // U^H U
  FLA_Obj_create( datatype, min_m_n, min_m_n, 0, 0, &DU );
  FLA_Gemm_external( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, 
                     FLA_ONE, U, U, FLA_ZERO, DU );

  // V^H V
  FLA_Obj_create( datatype, min_m_n, min_m_n, 0, 0, &DV );
  FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, 
                     FLA_ONE, V, V, FLA_ZERO, DV );
  
  // Recover the matrix
  FLA_Obj_create( datatype, min_m_n, min_m_n, 0, 0, &B );
  FLA_Set( FLA_ZERO, B );

  // Set B
  FLA_Obj_create( datatype, min_m_n, 1, 0, 0, &d );  
  FLA_Set_diagonal_vector( A, d );
  FLA_Set_diagonal_matrix( d, B );
  FLA_Obj_free( &d );

  if ( min_m_n > 1 ) {
    FLA_Obj_create( datatype, min_m_n - 1 , 1, 0, 0, &e );  
    FLA_Set_diagonal_vector( Ae, e );
    if ( uplo == FLA_UPPER_TRIANGULAR ) {
      FLA_Part_2x2( B, &ATL, &ATR,
                    &ABL, &ABR, min_m_n - 1, 1, FLA_TL );
      Be = ATR;
    } else {
      FLA_Part_2x2( B, &ATL, &ATR,
                    &ABL, &ABR, 1, min_m_n - 1, FLA_TL );
      Be = ABL;
    }
    FLA_Set_diagonal_matrix( e, Be );
    FLA_Obj_free( &e );
  }

  // Vb := B (V^H)
  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, V, &Vb );
  FLA_Trmm_external( FLA_LEFT, uplo, FLA_NO_TRANSPOSE,
                     FLA_NONUNIT_DIAG, FLA_ONE, B, Vb );

  // A := U Vb
  FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                     FLA_ONE, U, Vb, FLA_ZERO, A_recovered );

  residual_A    = FLA_Max_elemwise_diff( A_copy, A_recovered );

  if (1) {
    FLA_Obj_fshow( stdout, " - Given - ", A_copy, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Factor - ", A, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - TU - ", TU, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - TV - ", TV, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - B - ", B, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - U - ", U, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - V - ", V, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Vb - ", Vb, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - U'U - ", DU,  "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - VV' - ", DV,  "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Recovered A - ", A_recovered, "% 6.4e", "------");
    fprintf( stdout, "lapack2flame: %lu x %lu: ", m, n);
    fprintf( stdout, "recovery A = %12.10e\n\n", residual_A ) ;
  }
  
  FLA_Obj_free( &A );
  FLA_Obj_free( &TU );
  FLA_Obj_free( &TV );

  FLA_Obj_free( &B );

  FLA_Obj_free( &U );
  FLA_Obj_free( &V );
  FLA_Obj_free( &Vb );

  FLA_Obj_free( &DU );
  FLA_Obj_free( &DV );

  FLA_Obj_free( &A_copy );
  FLA_Obj_free( &A_recovered );


  FLA_Finalize_safe( init_result );     
}
コード例 #15
0
ファイル: FLA_Gemm_task.c プロジェクト: anaptyxis/libflame
FLA_Error FLA_Gemm_tt_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl )
{
  return FLA_Gemm_external( FLA_TRANSPOSE, FLA_TRANSPOSE, alpha, A, B, beta, C );
}
コード例 #16
0
ファイル: test_apqut.c プロジェクト: flame/libflame
void libfla_test_apqut_experiment( test_params_t params,
                                   unsigned int  var,
                                   char*         sc_str,
                                   FLA_Datatype  datatype,
                                   unsigned int  p_cur,
                                   unsigned int  pci,
                                   unsigned int  n_repeats,
                                   signed int    impl,
                                   double*       perf,
                                   double*       residual )
{
	dim_t        b_flash    = params.b_flash;
	dim_t        b_alg_flat = params.b_alg_flat;
	double       time_min   = 1e9;
	double       time;
	unsigned int i;
	unsigned int m, n;
	unsigned int min_m_n;
	signed int   m_input;
	signed int   n_input;
	FLA_Side     side;
	FLA_Trans    trans;
	FLA_Direct   direct;
	FLA_Store    storev;
	FLA_Obj      A, T, W, B, eye, norm;
	FLA_Obj      B_save;
	FLA_Obj      A_test, T_test, W_test, B_test;

	// Translate parameter characters to libflame constants.
	FLA_Param_map_char_to_flame_side( &pc_str[pci][0], &side );
	FLA_Param_map_char_to_flame_trans( &pc_str[pci][1], &trans );
	FLA_Param_map_char_to_flame_direct( &pc_str[pci][2], &direct );
	FLA_Param_map_char_to_flame_storev( &pc_str[pci][3], &storev );

	// We want to make sure the Apply_Q_UT routines work with rectangular
	// matrices. So we use m > n when testing with column-wise storage (via
	// QR factorization) and m < n when testing with row-wise storage (via
	// LQ factorization).
	if ( storev == FLA_COLUMNWISE )
	{
		m_input = -1;
		n_input = -1;
		//m_input = -1;
		//n_input = -1;
	}
	else // if ( storev == FLA_ROWWISE )
	{
		m_input = -1;
		n_input = -1;
		//m_input = -1;
		//n_input = -1;
	}

	// Determine the dimensions.
	if ( m_input < 0 ) m = p_cur * abs(m_input);
	else               m = p_cur;
	if ( n_input < 0 ) n = p_cur * abs(n_input);
	else               n = p_cur;

	// Compute the minimum dimension.
	min_m_n = min( m, n );

	// Create the matrices for the current operation.
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, n, &A );
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], b_alg_flat, min_m_n, &T );
	if ( storev == FLA_COLUMNWISE )
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], m, m, &B );
	else
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], n, n, &B );

	FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, B, &eye );

	FLA_Apply_Q_UT_create_workspace( T, B, &W );

	// Create a real scalar object to hold the norm of A.
	FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

	// Initialize the test matrices.
	FLA_Random_matrix( A );
	FLA_Set_to_identity( B );
	FLA_Set_to_identity( eye );

	// Save the original object contents in a temporary object.
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, B, &B_save );

	// Use hierarchical matrices if we're testing the FLASH front-end.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		if ( storev == FLA_COLUMNWISE )
			FLASH_QR_UT_create_hier_matrices( A, 1, &b_flash, &A_test, &T_test );
		else // if ( storev == FLA_ROWWISE )
			FLASH_LQ_UT_create_hier_matrices( A, 1, &b_flash, &A_test, &T_test );
		FLASH_Obj_create_hier_copy_of_flat( B, 1, &b_flash, &B_test );
		FLASH_Apply_Q_UT_create_workspace( T_test, B_test, &W_test );
	}
	else // if ( impl == FLA_TEST_FLAT_FRONT_END )
	{
		A_test = A;
		T_test = T;
		W_test = W;
		B_test = B;
	}

	// Compute a Householder factorization.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		if ( storev == FLA_COLUMNWISE ) FLASH_QR_UT( A_test, T_test );
		else                            FLASH_LQ_UT( A_test, T_test );
	}
	else // if ( impl == FLA_TEST_FLAT_FRONT_END )
	{
		if ( storev == FLA_COLUMNWISE ) FLA_QR_UT( A_test, T_test );
		else                            FLA_LQ_UT( A_test, T_test );
	}

	// Repeat the experiment n_repeats times and record results.
	for ( i = 0; i < n_repeats; ++i )
	{
		if ( impl == FLA_TEST_HIER_FRONT_END )
			FLASH_Obj_hierarchify( B_save, B_test );
		else
			FLA_Copy_external( B_save, B_test );

		time = FLA_Clock();

		libfla_test_apqut_impl( impl, side, trans, direct, storev,
		                        A_test, T_test, W_test, B_test );
		
		time = FLA_Clock() - time;
		time_min = min( time_min, time );
	}

	// Multiply by its conjugate-transpose to get what should be (near) identity
	// and then subtract from actual identity to get what should be (near) zero.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		FLASH_Obj_flatten( B_test, B );
		FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
		                   FLA_ONE, B, B, FLA_MINUS_ONE, eye );
	}
	else // if ( impl == FLA_TEST_FLAT_FRONT_END )
	{
		FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
		                   FLA_ONE, B, B, FLA_MINUS_ONE, eye );
	}

	// Free the hierarchical matrices if we're testing the FLASH front-end.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		FLASH_Obj_free( &A_test );
		FLASH_Obj_free( &T_test );
		FLASH_Obj_free( &W_test );
		FLASH_Obj_free( &B_test );
	}

	// Compute the norm of eye, which contains I - Q * Q'.
	FLA_Norm1( eye, norm );
	FLA_Obj_extract_real_scalar( norm, residual );

	// Compute the performance of the best experiment repeat.
	*perf = (  4.0 *       m * min_m_n * n -
	           2.0 * min_m_n * min_m_n * n ) / time_min / FLOPS_PER_UNIT_PERF;
	if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0;

	// Free the supporting flat objects.
	FLA_Obj_free( &B_save );

	// Free the flat test matrices.
	FLA_Obj_free( &A );
	FLA_Obj_free( &T );
	FLA_Obj_free( &W );
	FLA_Obj_free( &B );
	FLA_Obj_free( &eye );
	FLA_Obj_free( &norm );
}
コード例 #17
0
FLA_Error FLA_Hess_UT_blk_var4( FLA_Obj A, FLA_Obj T )
{
  FLA_Obj  ATL,   ATR,      A00, A01, A02, 
           ABL,   ABR,      A10, A11, A12,
                            A20, A21, A22;
  FLA_Obj  UT,              U0,
           UB,              U1,
                            U2;
  FLA_Obj  YT,              Y0,
           YB,              Y1,
                            Y2;
  FLA_Obj  ZT,              Z0,
           ZB,              Z1,
                            Z2;
  FLA_Obj  TL,    TR,       T0, T1, T2; 

  FLA_Obj  U, Y, Z;
  FLA_Obj  ABR_l;
  FLA_Obj  UB_l, U2_l;
  FLA_Obj  YB_l, Y2_l;
  FLA_Obj  ZB_l, Z2_l;
  FLA_Obj  WT_l;
  FLA_Obj  T1_tl;
  FLA_Obj  none, none2, none3;
  FLA_Obj  UB_tl,
           UB_bl;
  FLA_Datatype datatype_A;
  dim_t        m_A;
  dim_t        b_alg, b, bb;

  b_alg      = FLA_Obj_length( T );

  datatype_A = FLA_Obj_datatype( A );
  m_A        = FLA_Obj_length( A );

  FLA_Obj_create( datatype_A, m_A,    b_alg, 0, 0, &U );
  FLA_Obj_create( datatype_A, m_A,    b_alg, 0, 0, &Y );
  FLA_Obj_create( datatype_A, m_A,    b_alg, 0, 0, &Z );

  FLA_Part_2x2( A,    &ATL, &ATR,
                      &ABL, &ABR,     0, 0, FLA_TL );
  FLA_Part_2x1( U,    &UT, 
                      &UB,            0, FLA_TOP );
  FLA_Part_2x1( Y,    &YT, 
                      &YB,            0, FLA_TOP );
  FLA_Part_2x1( Z,    &ZT, 
                      &ZB,            0, FLA_TOP );
  FLA_Part_1x2( T,    &TL,  &TR,      0, FLA_LEFT ); 

  while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) )
  {
    b = min( FLA_Obj_length( ABR ), b_alg );

    FLA_Repart_2x2_to_3x3( ATL, /**/ ATR,       &A00, /**/ &A01, &A02,
                        /* ************* */   /* ******************** */
                                                &A10, /**/ &A11, &A12,
                           ABL, /**/ ABR,       &A20, /**/ &A21, &A22,
                           b, b, FLA_BR );
    FLA_Repart_2x1_to_3x1( UT,                &U0, 
                        /* ** */            /* ** */
                                              &U1, 
                           UB,                &U2,        b, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( YT,                &Y0, 
                        /* ** */            /* ** */
                                              &Y1, 
                           YB,                &Y2,        b, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( ZT,                &Z0, 
                        /* ** */            /* ** */
                                              &Z1, 
                           ZB,                &Z2,        b, FLA_BOTTOM );
    FLA_Repart_1x2_to_1x3( TL,  /**/ TR,        &T0, /**/ &T1, &T2,
                           b, FLA_RIGHT );

    /*------------------------------------------------------------*/

    FLA_Part_2x2( T1,     &T1_tl, &none,   
                          &none2, &none3,   b, b, FLA_TL ); 

    bb = min( FLA_Obj_length( ABR ) - 1, b_alg );

    FLA_Part_1x2( ABR,    &ABR_l, &none,    bb, FLA_LEFT ); 
    FLA_Part_1x2( UB,     &UB_l,  &none,    bb, FLA_LEFT ); 
    FLA_Part_1x2( YB,     &YB_l,  &none,    bb, FLA_LEFT ); 
    FLA_Part_1x2( ZB,     &ZB_l,  &none,    bb, FLA_LEFT ); 

    FLA_Part_2x1( UB_l,   &none,
                          &U2_l,             b, FLA_TOP );
    FLA_Part_2x1( YB_l,   &none,
                          &Y2_l,             b, FLA_TOP );
    FLA_Part_2x1( ZB_l,   &none,
                          &Z2_l,             b, FLA_TOP );

    // [ ABR, YB, ZB, T1 ] = FLA_Hess_UT_step_unb_var4( ABR, YB, ZB, T1, b );
    //FLA_Hess_UT_step_unb_var4( ABR, YB, ZB, T1_tl );
    //FLA_Hess_UT_step_ofu_var4( ABR, YB, ZB, T1_tl );
    FLA_Hess_UT_step_opt_var4( ABR, YB, ZB, T1_tl );

    // Build UB from ABR, with explicit unit subdiagonal and zeros.
    FLA_Copy_external( ABR_l, UB_l );
    FLA_Part_2x1( UB_l,   &UB_tl, 
                          &UB_bl,            1, FLA_TOP );
    FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_UNIT_DIAG, UB_bl );
    FLA_Set( FLA_ZERO, UB_tl );

    // ATR = ATR - ATR * UB * inv( triu( T ) ) * UB' );
    if ( FLA_Obj_length( ATR ) > 0 )
    {
      // NOTE: We use ZT as temporary workspace.
      FLA_Part_1x2( ZT,     &WT_l,  &none,    bb, FLA_LEFT ); 
      FLA_Part_2x2( T1,     &T1_tl, &none,   
                            &none2, &none3,   bb, bb, FLA_TL ); 

      // WT_l = ATR * UB_l * inv( triu( T ) ).
      FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                         FLA_ONE, ATR, UB_l, FLA_ZERO, WT_l );
      FLA_Trsm_external( FLA_RIGHT, FLA_UPPER_TRIANGULAR, 
                         FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, T1_tl, WT_l );

      // ATR = ATR - WT_l * UB_l'
      FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                         FLA_MINUS_ONE, WT_l, UB_l, FLA_ONE, ATR );
    }

    // A22 = A22 - U2 * Y2' - Z2 * U2';
    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                       FLA_MINUS_ONE, U2_l, Y2_l, FLA_ONE, A22 );
    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                       FLA_MINUS_ONE, Z2_l, U2_l, FLA_ONE, A22 );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR,       A00, A01, /**/ A02,
                                                     A10, A11, /**/ A12,
                            /* ************** */  /* ****************** */
                              &ABL, /**/ &ABR,       A20, A21, /**/ A22,
                              FLA_TL );
    FLA_Cont_with_3x1_to_2x1( &UT,                U0, 
                                                  U1, 
                            /* ** */           /* ** */
                              &UB,                U2,     FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &YT,                Y0, 
                                                  Y1, 
                            /* ** */           /* ** */
                              &YB,                Y2,     FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &ZT,                Z0, 
                                                  Z1, 
                            /* ** */           /* ** */
                              &ZB,                Z2,     FLA_TOP );
    FLA_Cont_with_1x3_to_1x2( &TL,  /**/ &TR,        T0, T1, /**/ T2,
                              FLA_LEFT );
  }

  FLA_Obj_free( &U );
  FLA_Obj_free( &Y );
  FLA_Obj_free( &Z );

  return FLA_SUCCESS;
}
コード例 #18
0
ファイル: test_gemm.c プロジェクト: anaptyxis/libflame
int test_gemm( FILE* stream, param_t param, result_t *result)
{
    FLA_Datatype datatype = param.datatype;
    FLA_Trans    transa = param.trans[0], transb = param.trans[1];
    FLA_Obj      A, B, C, x, y, z, w;
    FLA_Obj      alpha, beta;
    double       time, time_min =  MAX_TIME_VALUE;
    unsigned int i,
             m = param.dims[0],
             n = param.dims[1],
             k = param.dims[2],
             repeat = param.repeat;
    int          is_trans, is_complex;

    // Create matrices.
    is_trans = (transa == FLA_NO_TRANSPOSE);
    FLA_Obj_create( datatype, (is_trans ? m:k), (is_trans ? k:m), 0,0, &A );
    is_trans = (transb == FLA_NO_TRANSPOSE);
    FLA_Obj_create( datatype, (is_trans ? k:n), (is_trans ? n:k), 0,0, &B );
    FLA_Obj_create( datatype,                m,                n, 0,0, &C );

    FLA_Obj_create( datatype, n, 1, 0, 0, &x );
    FLA_Obj_create( datatype, m, 1, 0, 0, &y );
    FLA_Obj_create( datatype, m, 1, 0, 0, &z );
    FLA_Obj_create( datatype, k, 1, 0, 0, &w );

    // Initialize the test matrices.
    FLA_Random_matrix( A );
    FLA_Random_matrix( B );
    FLA_Random_matrix( C );

    FLA_Random_matrix( x );
    FLA_Set( FLA_ZERO, y );
    FLA_Set( FLA_ZERO, w );
    FLA_Set( FLA_ZERO, z );

    // Constants.
    alpha = FLA_MINUS_ONE;
    beta  = FLA_ZERO;

    // Repeat the experiment repeat times and record results.
    for ( i = 0; i < repeat; ++i )
    {
        time = FLA_Clock();
        FLA_Gemm_external( transa, transb, alpha, A, B, beta, C );
        time = FLA_Clock() - time;

        time_min = min( time_min, time );
    }

    is_complex = FLA_Obj_is_complex( C );
    result->performance = ( FMULS * FP_PER_MUL(is_complex) +
                            FADDS * FP_PER_ADD(is_complex) )/time_min/FLOPS_PER_UNIT_PERF;

    FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, C, x, FLA_ZERO, y );
    FLA_Gemv_external( transb,           FLA_ONE, B, x, FLA_ZERO, w );
    FLA_Gemv_external( transa,           alpha,   A, w, FLA_ZERO, z );

    result->residual = FLA_Max_elemwise_diff( y, z );

    FLA_Obj_free( &A );
    FLA_Obj_free( &B );
    FLA_Obj_free( &C );
    FLA_Obj_free( &x );
    FLA_Obj_free( &y );
    FLA_Obj_free( &z );
    FLA_Obj_free( &w );

    return 0;
}