예제 #1
0
double FLASH_Max_elemwise_diff( FLA_Obj A, FLA_Obj B )
{
	FLA_Obj A_flat, B_flat;
	double  max_diff;

	// Exit early if one dimension is zero.
	if ( FLA_Obj_has_zero_dim( A ) ) return -1.0;

	// Create a temporary flat copy of the hierarchical objects.
	FLASH_Obj_create_flat_copy_of_hier( A, &A_flat );
	FLASH_Obj_create_flat_copy_of_hier( B, &B_flat );

	// Get the maximum element-wise diff.
	max_diff = FLA_Max_elemwise_diff( A_flat, B_flat );
	
	// Free the temporary flat objects.
	FLA_Obj_free( &A_flat );
	FLA_Obj_free( &B_flat );
	
	return max_diff;
}
예제 #2
0
void time_Sylv(
                int param_combo, int type, int nrepeats, int m, int n,
                FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref, FLA_Obj scale,
                double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_old = 1.0e9;

  FLA_Obj
    C_old;

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_old );

  FLA_Copy_external( C, C_old );


  for ( irep = 0 ; irep < nrepeats; irep++ ){
    FLA_Copy_external( C_old, C );

    *dtime = FLA_Clock();

    switch( param_combo ){

    case 0:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Sylv( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, isgn, A, B, C, scale );
        break;
      case FLA_ALG_FRONT:
        FLA_Sylv( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, isgn, A, B, C, scale );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 1:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Sylv( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, isgn, A, B, C, scale );
        break;
      case FLA_ALG_FRONT:
        FLA_Sylv( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, isgn, A, B, C, scale );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 2:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Sylv( FLA_TRANSPOSE, FLA_NO_TRANSPOSE, isgn, A, B, C, scale );
        break;
      case FLA_ALG_FRONT:
        FLA_Sylv( FLA_TRANSPOSE, FLA_NO_TRANSPOSE, isgn, A, B, C, scale );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 3:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Sylv( FLA_TRANSPOSE, FLA_TRANSPOSE, isgn, A, B, C, scale );
        break;
      case FLA_ALG_FRONT:
        FLA_Sylv( FLA_TRANSPOSE, FLA_TRANSPOSE, isgn, A, B, C, scale );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  if ( type == FLA_ALG_REFERENCE ){
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }
  else{
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }

  *gflops = ( m * m * n + n * n * m ) / 
            dtime_old / 1e9;

  if ( FLA_Obj_is_complex( C ) )
    *gflops *= 4.0;

  *dtime = dtime_old;

  FLA_Copy_external( C_old, C );

  FLA_Obj_free( &C_old );
}
예제 #3
0
void time_Gemm_nn( 
               int variant, int type, int nrepeats, int n, int nb_alg,
               FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj Cref,
               double *dtime, double *diff, double *gflops )
{
  int
    irep,
    info, lwork;

  double
    dtime_old,
    d_minus_one = -1.0, d_one = 1.0;

  FLA_Obj
    Cold;

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &Cold );

  FLA_Copy_external( C, Cold );

  for ( irep = 0 ; irep < nrepeats; irep++ ){
    FLA_Copy_external( Cold, C );

    *dtime = FLA_Clock();

    switch( variant ){
    case 0:
      // Time reference implementation
      REF_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, 
                FLA_ONE, A, B, FLA_ONE, C );
      break;

    case 1:{
      // Time variant 1
      switch( type ){
      case FLA_ALG_OPENMP_BVAR:
        FLA_Gemm_nn_omp_var1( FLA_ONE, A, B, C, nb_alg );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }
    case 2:{
      // Time variant 2
      switch( type ){
      case FLA_ALG_OPENMP_BVAR:
        FLA_Gemm_nn_omp_var2( FLA_ONE, A, B, C, nb_alg );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }
    case 3:{
      // Time variant 3
      switch( type ){
      case FLA_ALG_OPENMP_BVAR:
        FLA_Gemm_nn_omp_var3( FLA_ONE, A, B, C, nb_alg );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }
    case 4:{
      // Time variant 4
      switch( type ){
      case FLA_ALG_OPENMP_BVAR:
        FLA_Gemm_nn_omp_var4( FLA_ONE, A, B, C, nb_alg );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }
    case 5:{
      // Time variant 5
      switch( type ){
      case FLA_ALG_OPENMP_BVAR:
        FLA_Gemm_nn_omp_var5( FLA_ONE, A, B, C, nb_alg );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }
    case 6:{
      // Time variant 6
      switch( type ){
      case FLA_ALG_OPENMP_BVAR:
        FLA_Gemm_nn_omp_var6( FLA_ONE, A, B, C, nb_alg );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }
    case 13:{
      // Time variant 1->3
      switch( type ){
      case FLA_ALG_OPENMP_CVAR:
        FLA_Gemm_nn_omp_var13( FLA_ONE, A, B, C, nb_alg );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }
    case 15:{
      // Time variant 1->5
      switch( type ){
      case FLA_ALG_OPENMP_CVAR:
        FLA_Gemm_nn_omp_var15( FLA_ONE, A, B, C, nb_alg );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }
    case 31:{
      // Time variant 3->1 
      switch( type ){
      case FLA_ALG_OPENMP_CVAR:
        FLA_Gemm_nn_omp_var31( FLA_ONE, A, B, C, nb_alg );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }
    case 35:{
      // Time variant 3->5 
      switch( type ){
      case FLA_ALG_OPENMP_CVAR:
        FLA_Gemm_nn_omp_var35( FLA_ONE, A, B, C, nb_alg );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }
    case 51:{
      // Time variant 5->1 
      switch( type ){
      case FLA_ALG_OPENMP_CVAR:
        FLA_Gemm_nn_omp_var51( FLA_ONE, A, B, C, nb_alg );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }
    case 53:{
      // Time variant 5->3 
      switch( type ){
      case FLA_ALG_OPENMP_CVAR:
        FLA_Gemm_nn_omp_var53( FLA_ONE, A, B, C, nb_alg );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }
    }

    if ( irep == 0 )
      dtime_old = FLA_Clock() - *dtime;
    else{
      *dtime = FLA_Clock() - *dtime;
      dtime_old = min( *dtime, dtime_old );
    }
  }


  if ( variant == 0 ){
    FLA_Copy_external( C, Cref );
    *diff = 0.0;
  }
  else{
    *diff = FLA_Max_elemwise_diff( C, Cref );
    //FLA_Obj_show( "C:", C, "%f", "\n");
  }

  *gflops = 2.0 * 
            FLA_Obj_length( C ) * 
            FLA_Obj_width( C ) * 
            FLA_Obj_width( A ) / 
            dtime_old / 
            1e9;

  *dtime = dtime_old;

  FLA_Copy_external( Cold, C );

  FLA_Obj_free( &Cold );
}
예제 #4
0
int main( int argc, char** argv ) {
  FLA_Datatype datatype = TESTTYPE;
  FLA_Datatype realtype = REALTYPE;
  FLA_Obj      
    A, TU, TV, 
    A_copy, A_recovered,
    U, V, Vb, B, Be, d, e, 
    DU, DV;

  FLA_Obj     
    ATL, ATR,
    ABL, ABR, Ae;

  FLA_Uplo     uplo;
  dim_t        m, n, min_m_n;
  FLA_Error    init_result; 

  double       residual_A = 0.0;

  if ( argc == 3 ) {
    m = atoi(argv[1]);
    n = atoi(argv[2]);
    min_m_n = min(m,n);
  } else {
    fprintf(stderr, "       \n");
    fprintf(stderr, "Usage: %s m n\n", argv[0]);
    fprintf(stderr, "       m : matrix length\n");
    fprintf(stderr, "       n : matrix width\n");
    fprintf(stderr, "       \n");
    return -1;
  }
  if ( m == 0 || n == 0 )
    return 0;

  FLA_Init_safe( &init_result );          

  // FLAME Bidiag setup
  FLA_Obj_create( datatype, m, n, 0, 0, &A );
  FLA_Bidiag_UT_create_T( A, &TU, &TV );

  // Rand A and create A_copy.
  FLA_Random_matrix( A ); 
  {
    scomplex *buff_A = FLA_Obj_buffer_at_view( A );
    buff_A[0].real = 4.4011e-01; buff_A[0].imag = -4.0150e-09; buff_A[2].real = -2.2385e-01; buff_A[2].imag = -1.5546e-01; buff_A[4].real = -6.3461e-02; buff_A[4].imag = 2.7892e-01; buff_A[6].real = -1.3197e-01; buff_A[6].imag = 5.0888e-01;  
    buff_A[1].real = 3.3352e-01; buff_A[1].imag = -6.6346e-02; buff_A[3].real = -1.9307e-01; buff_A[3].imag = -8.4066e-02; buff_A[5].real = -6.0446e-03; buff_A[5].imag = 2.2094e-01; buff_A[7].real = -2.3299e-02; buff_A[7].imag = 4.0553e-01;
  }

  //FLA_Set_to_identity( A );
  //FLA_Scal( FLA_MINUS_ONE, A );

  if ( m >= n ) {
    uplo = FLA_UPPER_TRIANGULAR;
    FLA_Part_2x2( A, &ATL, &ATR,
                     &ABL, &ABR, min_m_n - 1, 1, FLA_TL );
    Ae = ATR; 
  } else {
    uplo = FLA_LOWER_TRIANGULAR;
    FLA_Part_2x2( A, &ATL, &ATR,
                     &ABL, &ABR, 1, min_m_n - 1, FLA_TL );
    Ae = ABL;
  }

  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_copy );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_recovered );

  // Bidiag test
  {
    FLA_Obj      norm;
    FLA_Bool     apply_scale;

    FLA_Obj_create( realtype, 1,1, 0,0, &norm );

    FLA_Max_abs_value( A, norm );
    apply_scale = FLA_Obj_gt( norm, FLA_OVERFLOW_SQUARE_THRES ); 

    if ( apply_scale ) FLA_Scal( FLA_SAFE_MIN, A );
    FLA_Bidiag_UT( A, TU, TV );
    if ( apply_scale ) FLA_Bidiag_UT_scale_diagonals( FLA_SAFE_INV_MIN, A ); 

    FLA_Obj_free( &norm );
  }


  // Orthonomal basis U, V. 
  FLA_Obj_create( datatype, m, min_m_n, 0, 0, &U ); FLA_Set( FLA_ZERO, U );
  FLA_Obj_create( datatype, min_m_n, n, 0, 0, &V ); FLA_Set( FLA_ZERO, V );

  FLA_Bidiag_UT_form_U_ext( uplo, A, TU, FLA_NO_TRANSPOSE,   U );
  FLA_Bidiag_UT_form_V_ext( uplo, A, TV, FLA_CONJ_TRANSPOSE, V ); 

  if ( FLA_Obj_is_complex( A ) ){
    FLA_Obj rL, rR;
    
    FLA_Obj_create( datatype, min_m_n, 1, 0, 0, &rL );
    FLA_Obj_create( datatype, min_m_n, 1, 0, 0, &rR );

    FLA_Obj_fshow( stdout, " - Factor no realified - ", A, "% 6.4e", "------");
    FLA_Bidiag_UT_realify( A, rL, rR );
    FLA_Obj_fshow( stdout, " - Factor    realified - ", A, "% 6.4e", "------");

    FLA_Obj_fshow( stdout, " - rL - ", rL, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - rR - ", rR, "% 6.4e", "------");

    FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, rL, U );
    FLA_Apply_diag_matrix( FLA_LEFT,  FLA_CONJUGATE, rR, V );

    FLA_Obj_free( &rL );
    FLA_Obj_free( &rR );
  }

  // U^H U
  FLA_Obj_create( datatype, min_m_n, min_m_n, 0, 0, &DU );
  FLA_Gemm_external( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, 
                     FLA_ONE, U, U, FLA_ZERO, DU );

  // V^H V
  FLA_Obj_create( datatype, min_m_n, min_m_n, 0, 0, &DV );
  FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, 
                     FLA_ONE, V, V, FLA_ZERO, DV );
  
  // Recover the matrix
  FLA_Obj_create( datatype, min_m_n, min_m_n, 0, 0, &B );
  FLA_Set( FLA_ZERO, B );

  // Set B
  FLA_Obj_create( datatype, min_m_n, 1, 0, 0, &d );  
  FLA_Set_diagonal_vector( A, d );
  FLA_Set_diagonal_matrix( d, B );
  FLA_Obj_free( &d );

  if ( min_m_n > 1 ) {
    FLA_Obj_create( datatype, min_m_n - 1 , 1, 0, 0, &e );  
    FLA_Set_diagonal_vector( Ae, e );
    if ( uplo == FLA_UPPER_TRIANGULAR ) {
      FLA_Part_2x2( B, &ATL, &ATR,
                    &ABL, &ABR, min_m_n - 1, 1, FLA_TL );
      Be = ATR;
    } else {
      FLA_Part_2x2( B, &ATL, &ATR,
                    &ABL, &ABR, 1, min_m_n - 1, FLA_TL );
      Be = ABL;
    }
    FLA_Set_diagonal_matrix( e, Be );
    FLA_Obj_free( &e );
  }

  // Vb := B (V^H)
  FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, V, &Vb );
  FLA_Trmm_external( FLA_LEFT, uplo, FLA_NO_TRANSPOSE,
                     FLA_NONUNIT_DIAG, FLA_ONE, B, Vb );

  // A := U Vb
  FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                     FLA_ONE, U, Vb, FLA_ZERO, A_recovered );

  residual_A    = FLA_Max_elemwise_diff( A_copy, A_recovered );

  if (1) {
    FLA_Obj_fshow( stdout, " - Given - ", A_copy, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Factor - ", A, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - TU - ", TU, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - TV - ", TV, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - B - ", B, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - U - ", U, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - V - ", V, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Vb - ", Vb, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - U'U - ", DU,  "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - VV' - ", DV,  "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Recovered A - ", A_recovered, "% 6.4e", "------");
    fprintf( stdout, "lapack2flame: %lu x %lu: ", m, n);
    fprintf( stdout, "recovery A = %12.10e\n\n", residual_A ) ;
  }
  
  FLA_Obj_free( &A );
  FLA_Obj_free( &TU );
  FLA_Obj_free( &TV );

  FLA_Obj_free( &B );

  FLA_Obj_free( &U );
  FLA_Obj_free( &V );
  FLA_Obj_free( &Vb );

  FLA_Obj_free( &DU );
  FLA_Obj_free( &DV );

  FLA_Obj_free( &A_copy );
  FLA_Obj_free( &A_recovered );


  FLA_Finalize_safe( init_result );     
}
예제 #5
0
파일: test_symm.c 프로젝트: flame/libflame
void libfla_test_symm_experiment( test_params_t params,
                                  unsigned int  var,
                                  char*         sc_str,
                                  FLA_Datatype  datatype,
                                  unsigned int  p_cur,
                                  unsigned int  pci,
                                  unsigned int  n_repeats,
                                  signed int    impl,
                                  double*       perf,
                                  double*       residual )
{
	dim_t        b_flash    = params.b_flash;
	dim_t        b_alg_flat = params.b_alg_flat;
	double       time_min   = 1e9;
	double       time;
	unsigned int i;
	unsigned int m;
	signed int   m_input    = -1;
	unsigned int n;
	signed int   n_input    = -1;
	FLA_Side     side;
	FLA_Uplo     uplo;
	FLA_Obj      A, B, C, x, y, z, w, norm;
	FLA_Obj      alpha, beta;
	FLA_Obj      C_save;
	FLA_Obj      A_test, B_test, C_test;

	// Determine the dimensions.
	if ( m_input < 0 ) m = p_cur / abs(m_input);
	else               m = p_cur;
	if ( n_input < 0 ) n = p_cur / abs(n_input);
	else               n = p_cur;

	// Translate parameter characters to libflame constants.
	FLA_Param_map_char_to_flame_side( &pc_str[pci][0], &side );
	FLA_Param_map_char_to_flame_uplo( &pc_str[pci][1], &uplo );

	// Create the matrices for the current operation.
	if ( side == FLA_LEFT )
	{
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, m, &A );

		// Create vectors for use in test.
		FLA_Obj_create( datatype, n, 1, 0, 0, &x );
		FLA_Obj_create( datatype, m, 1, 0, 0, &y );
		FLA_Obj_create( datatype, m, 1, 0, 0, &z );
		FLA_Obj_create( datatype, m, 1, 0, 0, &w );
	}
	else
	{
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], n, n, &A );

		// Create vectors for use in test.
		FLA_Obj_create( datatype, n, 1, 0, 0, &x );
		FLA_Obj_create( datatype, m, 1, 0, 0, &y );
		FLA_Obj_create( datatype, m, 1, 0, 0, &z );
		FLA_Obj_create( datatype, n, 1, 0, 0, &w );
	}
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, n, &B );
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[2], m, n, &C );

	// Create a norm scalar.
	FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

	// Initialize the test matrices.
	FLA_Random_symm_matrix( uplo, A );
	FLA_Random_matrix( B );
	FLA_Random_matrix( C );

	// Initialize the test vectors.
	FLA_Random_matrix( x );
    FLA_Set( FLA_ZERO, y );
    FLA_Set( FLA_ZERO, z );
    FLA_Set( FLA_ZERO, w );

	// Set constants.
	alpha = FLA_TWO;
	beta  = FLA_MINUS_ONE;

	// Save the original object contents in a temporary object.
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, C, &C_save );

	// Use hierarchical matrices if we're testing the FLASH front-end.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		FLASH_Obj_create_hier_copy_of_flat( A, 1, &b_flash, &A_test );
		FLASH_Obj_create_hier_copy_of_flat( B, 1, &b_flash, &B_test );
		FLASH_Obj_create_hier_copy_of_flat( C, 1, &b_flash, &C_test );
	}
	else
	{
		A_test = A;
		B_test = B;
		C_test = C;
	}

	// Create a control tree for the individual variants.
	if ( impl == FLA_TEST_FLAT_UNB_VAR ||
	     impl == FLA_TEST_FLAT_OPT_VAR ||
	     impl == FLA_TEST_FLAT_BLK_VAR ||
	     impl == FLA_TEST_FLAT_UNB_EXT ||
	     impl == FLA_TEST_FLAT_BLK_EXT )
		libfla_test_symm_cntl_create( var, b_alg_flat );

	// Repeat the experiment n_repeats times and record results.
	for ( i = 0; i < n_repeats; ++i )
	{
		if ( impl == FLA_TEST_HIER_FRONT_END )
			FLASH_Obj_hierarchify( C_save, C_test );
		else
			FLA_Copy_external( C_save, C_test );
		
		time = FLA_Clock();

		libfla_test_symm_impl( impl, side, uplo, alpha, A_test, B_test, beta, C_test );
		
		time = FLA_Clock() - time;
		time_min = min( time_min, time );
	}

	// Copy the solution to flat matrix X.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		FLASH_Obj_flatten( C_test, C );
	}
	else
    {
		// No action needed since C_test and C refer to the same object.
	}

	// Free the hierarchical matrices if we're testing the FLASH front-end.
	if ( impl == FLA_TEST_HIER_FRONT_END )
	{
		FLASH_Obj_free( &A_test );
		FLASH_Obj_free( &B_test );
		FLASH_Obj_free( &C_test );
	}

	// Free the control trees if we're testing the variants.
	if ( impl == FLA_TEST_FLAT_UNB_VAR ||
	     impl == FLA_TEST_FLAT_OPT_VAR ||
	     impl == FLA_TEST_FLAT_BLK_VAR ||
	     impl == FLA_TEST_FLAT_UNB_EXT ||
	     impl == FLA_TEST_FLAT_BLK_EXT )
		libfla_test_symm_cntl_free();

	// Compute the performance of the best experiment repeat.
	if ( side == FLA_LEFT )
		*perf = ( 1 * m * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
	else
		*perf = ( 1 * m * n * n ) / time_min / FLOPS_PER_UNIT_PERF;
	if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0;

	// Compute:
	//   y = C * x
	// and compare to
	//   z = ( beta * C_orig + alpha * A * B ) x      (side = left)
	//   z = ( beta * C_orig + alpha * B * A ) x      (side = right)
	FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, C, x, FLA_ZERO, y );

	if ( side == FLA_LEFT )
	{
		FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, B, x, FLA_ZERO, w );
		FLA_Symv_external( uplo,             alpha,   A, w, FLA_ZERO, z );
	}
	else
	{
		FLA_Symv_external( uplo,             FLA_ONE, A, x, FLA_ZERO, w );
		FLA_Gemv_external( FLA_NO_TRANSPOSE, alpha,   B, w, FLA_ZERO, z );
	}
	FLA_Gemv_external( FLA_NO_TRANSPOSE, beta, C_save, x, FLA_ONE, z );

	// Compute || y - z ||.
	//FLA_Axpy_external( FLA_MINUS_ONE, y, z );
	//FLA_Nrm2_external( z, norm );
	//FLA_Obj_extract_real_scalar( norm, residual );
	*residual = FLA_Max_elemwise_diff( y, z );

	// Free the supporting flat objects.
	FLA_Obj_free( &C_save );

	// Free the flat test matrices.
	FLA_Obj_free( &A );
	FLA_Obj_free( &B );
	FLA_Obj_free( &C );
	FLA_Obj_free( &x );
	FLA_Obj_free( &y );
	FLA_Obj_free( &z );
	FLA_Obj_free( &w );
	FLA_Obj_free( &norm );
}
예제 #6
0
void time_Gemm_hh( 
               int variant, int type, int nrepeats, int n, int nb_alg,
               FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref,
               double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_old = 1.0e9;

  FLA_Obj
    C_old;

  fla_blocksize_t*
    bp;
  fla_gemm_t*
    cntl_gemm_blas;
  fla_gemm_t*
    cntl_gemm_var;

  bp             = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
  cntl_gemm_blas = FLA_Cntl_gemm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL );
  cntl_gemm_var  = FLA_Cntl_gemm_obj_create( FLA_FLAT, variant, bp, cntl_gemm_blas );

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_old );

  FLA_Copy_external( C, C_old );


  for ( irep = 0 ; irep < nrepeats; irep++ ){
    FLA_Copy_external( C_old, C );

    *dtime = FLA_Clock();

    switch( variant ){
    // Time reference implementation
    case 0:
      REF_Gemm( FLA_CONJ_TRANSPOSE, FLA_CONJ_TRANSPOSE, 
                FLA_ONE, A, B, FLA_ONE, C );
      break;

    // Time variant 1
    case 1:{
      switch( type ){
      case FLA_ALG_UNBLOCKED:
        FLA_Gemm_hh_unb_var1( FLA_ONE, A, B, FLA_ONE, C );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Gemm_hh_blk_var1( FLA_ONE, A, B, FLA_ONE, C, cntl_gemm_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    // Time variant 2
    case 2:{
      switch( type ){
      case FLA_ALG_UNBLOCKED:
        FLA_Gemm_hh_unb_var2( FLA_ONE, A, B, FLA_ONE, C );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Gemm_hh_blk_var2( FLA_ONE, A, B, FLA_ONE, C, cntl_gemm_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    // Time variant 3
    case 3:{
      switch( type ){
      case FLA_ALG_UNBLOCKED:
        FLA_Gemm_hh_unb_var3( FLA_ONE, A, B, FLA_ONE, C );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Gemm_hh_blk_var3( FLA_ONE, A, B, FLA_ONE, C, cntl_gemm_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    // Time variant 4
    case 4:{
      switch( type ){
      case FLA_ALG_UNBLOCKED:
        FLA_Gemm_hh_unb_var4( FLA_ONE, A, B, FLA_ONE, C );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Gemm_hh_blk_var4( FLA_ONE, A, B, FLA_ONE, C, cntl_gemm_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    // Time variant 5
    case 5:{
      switch( type ){
      case FLA_ALG_UNBLOCKED:
        FLA_Gemm_hh_unb_var5( FLA_ONE, A, B, FLA_ONE, C );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Gemm_hh_blk_var5( FLA_ONE, A, B, FLA_ONE, C, cntl_gemm_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    // Time variant 6
    case 6:{
      switch( type ){
      case FLA_ALG_UNBLOCKED:
        FLA_Gemm_hh_unb_var6( FLA_ONE, A, B, FLA_ONE, C );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Gemm_hh_blk_var6( FLA_ONE, A, B, FLA_ONE, C, cntl_gemm_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    }
	
    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  FLA_Cntl_obj_free( cntl_gemm_var );
  FLA_Cntl_obj_free( cntl_gemm_blas );
  FLA_Blocksize_free( bp );

  if ( variant == 0 )
  {
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }

  *gflops = 2.0 * 
            FLA_Obj_length( C ) * 
            FLA_Obj_width( C ) * 
            FLA_Obj_width( A ) / 
            dtime_old / 
            1.0e9;

  if ( FLA_Obj_is_complex( C ) )
    *gflops *= 4.0;

  *dtime = dtime_old;

  FLA_Copy_external( C_old, C );

  FLA_Obj_free( &C_old );
}
예제 #7
0
파일: time_Her2k.c 프로젝트: pgawron/tlash
void time_Her2k( 
               int param_combo, int type, int nrepeats, int m, int k,
               FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref,
               double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_old = 1.0e9;

  FLA_Obj
    C_old;

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_old );

  FLA_Copy_external( C, C_old );


  for ( irep = 0 ; irep < nrepeats; irep++ ){
    FLA_Copy_external( C_old, C );

    *dtime = FLA_Clock();

    switch( param_combo ){

    // Time parameter combination 0
    case 0:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Her2k( FLA_LOWER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      case FLA_ALG_FRONT:
        FLA_Her2k( FLA_LOWER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    // Time parameter combination 1
    case 1:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Her2k( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      case FLA_ALG_FRONT:
        FLA_Her2k( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    // Time parameter combination 2
    case 2:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Her2k( FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      case FLA_ALG_FRONT:
        FLA_Her2k( FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    // Time parameter combination 3
    case 3:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Her2k( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      case FLA_ALG_FRONT:
        FLA_Her2k( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ZERO, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    }
	
    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }


  if ( type == FLA_ALG_REFERENCE )
  {
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }

  *gflops = 4.0 * 2.0 * m * m * k /
            dtime_old / 
            1.0e9;

  *dtime = dtime_old;

  FLA_Copy_external( C_old, C );

  FLA_Obj_free( &C_old );
}
예제 #8
0
void time_Gemm_pp_nn( 
		     int variant, int type, int nrepeats, int n, int nb_alg,
		     FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj Cref,
		     double *dtime, double *diff, double *mflops )
{
  int
    irep,
    info, lwork;

  double
    dtime_old,
    d_minus_one = -1.0, d_one = 1.0;

  FLA_Obj
    Cold;

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &Cold );

  FLA_Copy_external( C, Cold );

  for ( irep = 0 ; irep < nrepeats; irep++ ){
    FLA_Copy_external( Cold, C );

    *dtime = FLA_Clock();

    switch( variant ){
    case 0:
      // Time reference implementation
      REF_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, 
		ONE, A, B, FLA_ONE, C );
      break;

    case 1:{
      // Time variant 1
      switch( type ){
      case FLA_ALG_UNBLOCKED:
	FLA_Gemm_pp_nn_var1( FLA_ONE, A, B, C, nb_alg );
	break;
      case FLA_ALG_BLOCKED:
        REF_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, 
		  ONE, A, B, FLA_ONE, C );
	break;
      default:
	printf("trouble\n");
      }

      break;
    }
    }

    if ( irep == 0 )
      dtime_old = FLA_Clock() - *dtime;
    else{
      *dtime = FLA_Clock() - *dtime;
      dtime_old = min( *dtime, dtime_old );
    }
  }

  if ( variant == 0 ){
    FLA_Copy_external( C, Cref );
    *diff = 0.0;
  }
  else{
    *diff = FLA_Max_elemwise_diff( C, Cref );
  }

  *mflops = 2.0 * 
            FLA_Obj_length( C ) * 
            FLA_Obj_width( C ) * 
            FLA_Obj_width( A ) / 
            dtime_old / 
            1000000;

  *dtime = dtime_old;

  FLA_Copy_external( Cold, C );

  FLA_Obj_free( &Cold );
}
예제 #9
0
void time_Hess_UT(
                 int variant, int type, int nrepeats, int m,
                 FLA_Obj A, FLA_Obj A_ref, FLA_Obj t, FLA_Obj T, FLA_Obj W,
                 double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_old = 1.0e9;

  FLA_Obj
    A_save, norm;


  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );

  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

  FLA_Copy_external( A, A_save );

  for ( irep = 0 ; irep < nrepeats; irep++ ){

    FLA_Copy_external( A_save, A );

    *dtime = FLA_Clock();

    switch( variant ){

    case 0:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Hess_UT( A, t );
        break;
      case FLA_ALG_FRONT:
        FLA_Hess_UT( A, T );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );

  }

  //if ( type == FLA_ALG_REFERENCE )
  //{
  //  ;
  //}
  //else
  {
    FLA_Obj    AT, AB;
    FLA_Obj Q, QT, QB;
    FLA_Obj E, ET, EB;
    FLA_Obj F;
    dim_t   m_A, m_T;

    m_A = FLA_Obj_length( A );
    m_T = FLA_Obj_length( T );

    FLA_Obj_create( FLA_Obj_datatype( A ), m_A, m_A, 0, 0, &Q );
    FLA_Set_to_identity( Q );

    FLA_Part_2x1( Q,    &QT,
                        &QB,    1, FLA_TOP );
    FLA_Part_2x1( A,    &AT,
                        &AB,    1, FLA_TOP );

    if ( type == FLA_ALG_REFERENCE )
    {
      if ( FLA_Obj_is_real( A ) )
        FLA_Apply_Q_blk_external( FLA_LEFT, FLA_TRANSPOSE, FLA_COLUMNWISE, AB, t, QB );
      else
        FLA_Apply_Q_blk_external( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_COLUMNWISE, AB, t, QB );
    }
    else
      FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, AB, T, W, QB );

    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &E );
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &F );

    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                       FLA_ONE, A_save, Q, FLA_ZERO, E );
    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                       FLA_ONE, Q, E, FLA_ZERO, F );

    FLA_Copy( A, E );
    FLA_Part_2x1( E,    &ET,
                        &EB,    1, FLA_TOP );
    FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, EB );

    *diff = FLA_Max_elemwise_diff( E, F );

    FLA_Obj_free( &Q );
    FLA_Obj_free( &E );
    FLA_Obj_free( &F );
  }

  *gflops = ( 10.0 / 3.0 * m * m * m ) /
            dtime_old / 1e9;
  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 4.0;

  *dtime = dtime_old;

  FLA_Copy_external( A_save, A );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &norm );
}
예제 #10
0
void time_Bidiag_UT(
                 int param_combo, int type, int nrepeats, int m, int n,
                 FLA_Obj A, FLA_Obj tu, FLA_Obj tv, FLA_Obj TU, FLA_Obj TV,
                 double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_old = 1.0e9;

  FLA_Obj
    A_save, norm;


  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );

  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

  FLA_Copy_external( A, A_save );

  for ( irep = 0 ; irep < nrepeats; irep++ ){

    FLA_Copy_external( A_save, A );

    *dtime = FLA_Clock();

    switch( param_combo ){

    case 0:
    {
      switch( type )
      {
      case FLA_ALG_REFERENCE:
        REF_Bidiag_UT( A, tu, tv );
        break;
      case FLA_ALG_FRONT:
        FLA_Bidiag_UT( A, TU, TV );
        break;
      }

      break;
    }

    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );

  }

  {
    FLA_Obj AL, AR;
    FLA_Obj ATL, ATR,
            ABL, ABR;
    FLA_Obj QU;
    FLA_Obj QV, QVL, QVR;
    FLA_Obj E, EL, ER;
    FLA_Obj F;
    FLA_Obj WU, WV, eye;
    FLA_Obj tvT,
            tvB;
    dim_t   m_A, n_A, m_TU;


//FLA_Obj_show( "A_save", A_save, "%10.3e", "" );

    m_A = FLA_Obj_length( A );
    n_A = FLA_Obj_width( A );
    m_TU = FLA_Obj_length( TU );

    FLA_Obj_create( FLA_Obj_datatype( A ), m_A,  m_A, 0, 0, &QU );
    FLA_Obj_create( FLA_Obj_datatype( A ), n_A,  n_A, 0, 0, &QV );

    FLA_Obj_create( FLA_Obj_datatype( A ), m_TU,  m_A, 0, 0, &WU );
    FLA_Obj_create( FLA_Obj_datatype( A ), m_TU,  n_A, 0, 0, &WV );

    FLA_Set_to_identity( QU );
    FLA_Set_to_identity( QV );

    FLA_Part_1x2( QV,   &QVL, &QVR,   1, FLA_LEFT );
    FLA_Part_1x2( A,    &AL,  &AR,    1, FLA_LEFT );
    FLA_Part_2x2( A,    &ATL, &ATR,
                        &ABL, &ABR,   1, 1, FLA_BL );
    FLA_Part_2x1( tv,   &tvT,
                        &tvB,   1, FLA_BOTTOM );

    if ( type == FLA_ALG_REFERENCE )
    {
      if ( FLA_Obj_is_real( A ) )
        FLA_Apply_Q_blk_external( FLA_LEFT, FLA_TRANSPOSE, FLA_COLUMNWISE, A, tu, QU );
      else
        FLA_Apply_Q_blk_external( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_COLUMNWISE, A, tu, QU );
      //FLA_Apply_Q_blk_external( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_ROWWISE, AR, tv, QVR );
      //
      // Need to apply backwards transformation, since vectors are stored columnwise.
      // QL? RQ?
      //
      //FLA_Apply_Q_blk_external( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_ROWWISE, ATR, tvT, QVR );
      //FLA_Apply_Q_blk_external( FLA_RIGHT, FLA_CONJ_TRANSPOSE, FLA_ROWWISE, ATR, tvT, QVR );
      //FLA_Apply_Q_blk_external( FLA_RIGHT, FLA_CONJ_TRANSPOSE, FLA_ROWWISE, AR, tvT, QVR );
    }
    else
    {
      FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, A, TU, WU, QU );
      FLA_Apply_Q_UT( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, AR, TV, WV, QVR );
    }

/*
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &eye );     
    FLA_Set_to_identity( eye );

    //FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
    //          FLA_ONE, QV, QV, FLA_MINUS_ONE, eye );
    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
              FLA_ONE, QU, QU, FLA_MINUS_ONE, eye );

FLA_Obj_show( "eye", eye, "%10.3e", "" );
    FLA_Norm_frob( eye, norm );
    FLA_Obj_extract_real_scalar( norm, diff );
    FLA_Obj_free( &eye );
*/

    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &E );
    FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &F );

    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
              FLA_ONE, A_save, QV, FLA_ZERO, E );
    FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
              FLA_ONE, QU, E, FLA_ZERO, F );

//FLA_Obj_show( "A_save", A_save, "%10.3e", "" );

    FLA_Copy( A, E );
    FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, E );
    FLA_Part_1x2( E,    &EL, &ER,      1, FLA_LEFT );
    FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, ER );

//FLA_Obj_show( "B", E, "%10.3e", "" );
//FLA_Obj_show( "Q'AV", F, "%10.3e", "" );
//FLA_Obj_show( "B", E, "%10.3e + %10.3e ", "" );
//FLA_Obj_show( "Q'AV", F, "%10.3e + %10.3e ", "" );

    *diff = FLA_Max_elemwise_diff( E, F );
    FLA_Obj_free( &E );
    FLA_Obj_free( &F );

    FLA_Obj_free( &QU );
    FLA_Obj_free( &QV );
    FLA_Obj_free( &WU );
    FLA_Obj_free( &WV );
  }

  *gflops = 4.0 * n * n * ( m - n / 3.0 ) /
            dtime_old / 1e9;
  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 4.0;

  *dtime = dtime_old;

  FLA_Copy_external( A_save, A );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &norm );
}
예제 #11
0
void time_QR2_UT(
               int param_combo, int type, int nrepeats, int m, int n,
               FLA_Obj A_flat, FLA_Obj A_flat_ref, FLA_Obj B_flat, FLA_Obj D_flat, FLA_Obj T_flat,
               double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_old = 1.0e9;

  FLA_Obj
    A_flat_save;

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A_flat, &A_flat_save );

  FLA_Copy( A_flat, A_flat_save );

  for ( irep = 0 ; irep < nrepeats; irep++ )
  {
    FLA_Copy( A_flat_save, A_flat );

    *dtime = FLA_Clock();

    switch( param_combo ){

    // Time parameter combination 0
    case 0:{
      switch( type ){
      case FLA_ALG_UNBLOCKED:
        FLA_QR2_UT_unb_var1( B_flat,
                                       D_flat, T_flat );
        break;
      case FLA_ALG_UNB_OPT:
        FLA_QR2_UT_opt_var1( B_flat,
                                       D_flat, T_flat );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    }
	
    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }


  if ( type == FLA_ALG_UNBLOCKED )
  {
    FLA_Copy( A_flat, A_flat_ref );
    //FLA_Copy( T_flat, A_flat_ref );

    *diff = 0.0;
  }
  else
  {
    *diff = FLA_Max_elemwise_diff( A_flat, A_flat_ref );
    //*diff = FLA_Max_elemwise_diff( T_flat, A_flat_ref );
  }

  *gflops = 2.0 * n * n * (m - n/3.0) /
            dtime_old / 
            1.0e9;

  if ( FLA_Obj_is_complex( A_flat ) )
    *gflops *= 4.0;

  *dtime = dtime_old;

  FLA_Copy( A_flat_save, A_flat );

  FLA_Obj_free( &A_flat_save );
}
예제 #12
0
void time_Eig_gest_nu(
               int variant, int type, int n_repeats, int n, int b_alg,
               FLA_Inv inv, FLA_Uplo uplo, FLA_Obj A, FLA_Obj Y, FLA_Obj B,
               double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_save = 1.0e9;

  FLA_Obj
    A_save, B_save, norm;

  fla_blocksize_t* bp;
  fla_eig_gest_t*  cntl_eig_gest_var;
  fla_eig_gest_t*  cntl_eig_gest_unb;


  if ( ( type == FLA_ALG_UNBLOCKED || type == FLA_ALG_UNB_OPT ) &&
       n > 300 )
  {
    *gflops = 0.0;
    *diff   = 0.0;
    return;
  }

  if ( variant == 3 )
  {
    *gflops = 0.0;
    *diff   = 0.0;
    return;
  }


  bp                = FLA_Blocksize_create( b_alg, b_alg, b_alg, b_alg );
  cntl_eig_gest_unb = FLA_Cntl_eig_gest_obj_create( FLA_FLAT,
                                                    //FLA_UNBLOCKED_VARIANT1,
                                                    FLA_UNB_OPT_VARIANT1,
                                                    NULL,
                                                    NULL,
                                                    NULL,
                                                    NULL,
                                                    NULL,
                                                    NULL,
                                                    NULL,
                                                    NULL,
                                                    NULL,
                                                    NULL,
                                                    NULL,
                                                    NULL,
                                                    NULL );
  cntl_eig_gest_var = FLA_Cntl_eig_gest_obj_create( FLA_FLAT,
                                                    variant,
                                                    bp,
                                                    cntl_eig_gest_unb,
                                                    fla_axpy_cntl_blas,
                                                    fla_axpy_cntl_blas,
                                                    fla_gemm_cntl_blas,
                                                    fla_gemm_cntl_blas,
                                                    fla_gemm_cntl_blas,
                                                    fla_hemm_cntl_blas,
                                                    fla_her2k_cntl_blas,
                                                    fla_trmm_cntl_blas,
                                                    fla_trmm_cntl_blas,
                                                    fla_trsm_cntl_blas,
                                                    fla_trsm_cntl_blas );

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, B, &B_save );

  FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

  FLA_Copy_external( A, A_save );
  FLA_Copy_external( B, B_save );


  for ( irep = 0 ; irep < n_repeats; irep++ ){

    FLA_Copy_external( A_save, A );
    FLA_Copy_external( B_save, B );

    *dtime = FLA_Clock();

    switch( variant ){

    case 0:
      REF_Eig_gest_nu( A, B );
      break;

    case 1:
    {
      // Time variant 1
      switch( type )
      {
      case FLA_ALG_UNBLOCKED:
        FLA_Eig_gest_nu_unb_var1( A, Y, B );
        break;
      case FLA_ALG_UNB_OPT:
        FLA_Eig_gest_nu_opt_var1( A, Y, B );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Eig_gest_nu_blk_var1( A, Y, B, cntl_eig_gest_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 2:
    {
      // Time variant 2
      switch( type )
      {
      case FLA_ALG_UNBLOCKED:
        FLA_Eig_gest_nu_unb_var2( A, Y, B );
        break;
      case FLA_ALG_UNB_OPT:
        FLA_Eig_gest_nu_opt_var2( A, Y, B );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Eig_gest_nu_blk_var2( A, Y, B, cntl_eig_gest_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 3:
    {
      // Time variant 3
      switch( type )
      {
      case FLA_ALG_UNBLOCKED:
        //FLA_Eig_gest_nu_unb_var3( A, Y, B );
        break;
      case FLA_ALG_UNB_OPT:
        //FLA_Eig_gest_nu_opt_var3( A, Y, B );
        break;
      case FLA_ALG_BLOCKED:
        //FLA_Eig_gest_nu_blk_var3( A, Y, B, cntl_eig_gest_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 4:
    {
      // Time variant 4
      switch( type )
      {
      case FLA_ALG_UNBLOCKED:
        FLA_Eig_gest_nu_unb_var4( A, Y, B );
        break;
      case FLA_ALG_UNB_OPT:
        FLA_Eig_gest_nu_opt_var4( A, Y, B );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Eig_gest_nu_blk_var4( A, Y, B, cntl_eig_gest_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 5:
    {
      // Time variant 5
      switch( type )
      {
      case FLA_ALG_UNBLOCKED:
        FLA_Eig_gest_nu_unb_var5( A, Y, B );
        break;
      case FLA_ALG_UNB_OPT:
        FLA_Eig_gest_nu_opt_var5( A, Y, B );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Eig_gest_nu_blk_var5( A, Y, B, cntl_eig_gest_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    }

    *dtime = FLA_Clock() - *dtime;
    dtime_save = min( *dtime, dtime_save );
  }

  FLA_Cntl_obj_free( cntl_eig_gest_var );
  FLA_Cntl_obj_free( cntl_eig_gest_unb );
  FLA_Blocksize_free( bp );

  // Recover A.
  if ( inv == FLA_NO_INVERSE )
  {
    if ( uplo == FLA_LOWER_TRIANGULAR )
    {
      // A = L' * A_orig * L
      // A_orig = inv(L') * A * inv(L)
      FLA_Hermitianize( FLA_LOWER_TRIANGULAR, A );
      FLA_Trsm_external( FLA_LEFT, FLA_LOWER_TRIANGULAR,
                         FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG,
                         FLA_ONE, B, A );
      FLA_Trsm_external( FLA_RIGHT, FLA_LOWER_TRIANGULAR,
                         FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
                         FLA_ONE, B, A );
    }
    else // if ( uplo == FLA_UPPER_TRIANGULAR )
    {
      // A = U * A_orig * U'
      // A_orig = inv(U) * A * inv(U')
      FLA_Hermitianize( FLA_UPPER_TRIANGULAR, A );
      FLA_Trsm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR,
                         FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
                         FLA_ONE, B, A );
      FLA_Trsm_external( FLA_RIGHT, FLA_UPPER_TRIANGULAR,
                         FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG,
                         FLA_ONE, B, A );
    }
  }
  else // if ( inv == FLA_INVERSE )
  {
    if ( uplo == FLA_LOWER_TRIANGULAR )
    {
      // A = inv(L) * A_orig * inv(L')
      // A_orig = L * A * L'
      FLA_Hermitianize( FLA_LOWER_TRIANGULAR, A );
      FLA_Trmm_external( FLA_LEFT, FLA_LOWER_TRIANGULAR,
                         FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
                         FLA_ONE, B, A );
      FLA_Trmm_external( FLA_RIGHT, FLA_LOWER_TRIANGULAR,
                         FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG,
                         FLA_ONE, B, A );
    }
    else // if ( uplo == FLA_UPPER_TRIANGULAR )
    {
      // A = inv(U') * A_orig * inv(U)
      // A_orig = U' * A * U
      FLA_Hermitianize( FLA_UPPER_TRIANGULAR, A );
      FLA_Trmm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR,
                         FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG,
                         FLA_ONE, B, A );
      FLA_Trmm_external( FLA_RIGHT, FLA_UPPER_TRIANGULAR,
                         FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
                         FLA_ONE, B, A );
    }
  }

  *diff = FLA_Max_elemwise_diff( A, A_save );

/*
if ( type == FLA_ALG_UNBLOCKED )
{
FLA_Obj_show( "A", A, "%10.3e", "" );
FLA_Obj_show( "A_orig", A_save, "%10.3e", "" );
}
*/

  *gflops = 1.0 * 
            FLA_Obj_length( A ) * 
            FLA_Obj_length( A ) * 
            FLA_Obj_length( A ) / 
            dtime_save / 1e9;

  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 4.0;

  *dtime = dtime_save;

  FLA_Copy_external( A_save, A );
  FLA_Copy_external( B_save, B );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &B_save );
  FLA_Obj_free( &norm );
}
예제 #13
0
void time_Chol(
                int param_combo, int type, int nrepeats, int m, int n,
                FLA_Obj A, FLA_Obj A_ref, FLA_Obj T, FLA_Obj t_ref, FLA_Obj B, FLA_Obj B_ref, FLA_Obj X, FLA_Obj X_ref, FLA_Obj W,
                double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_old = 1.0e9;

  FLA_Obj
    B_save;
  FLA_Obj
    normx;

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, B, &B_save );

  if ( FLA_Obj_is_single_precision( A ) )
    FLA_Obj_create( FLA_FLOAT, 1, 1, 0, 0, &normx );
  else
    FLA_Obj_create( FLA_DOUBLE, 1, 1, 0, 0, &normx );

  FLA_Copy_external( B, B_save );

  for ( irep = 0 ; irep < nrepeats; irep++ )
  {
    FLA_Copy_external( B_save, B );
    FLA_Copy_external( B_save, B_ref );

    *dtime = FLA_Clock();

    switch( param_combo ){

    case 0:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        FLA_Copy_external( B_ref, X_ref );
        //REF_Apply_Q( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_COLUMNWISE, A_ref, t_ref, X_ref );
        REF_Apply_Q( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_COLUMNWISE, A_ref, t_ref, X_ref );
        FLA_Trsm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE,
                           FLA_NONUNIT_DIAG, FLA_ONE, A_ref, X_ref );
        break;
      case FLA_ALG_FRONT:
        FLA_Copy_external( B, X );
        FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE, A, T, W, X );
        FLA_Trsm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE,
                           FLA_NONUNIT_DIAG, FLA_ONE, A, X );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 1:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        FLA_Copy_external( B_ref, X_ref );
        FLA_Trsm_external( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE,
                           FLA_NONUNIT_DIAG, FLA_ONE, A_ref, X_ref );
        //REF_Apply_Q( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_ROWWISE, A_ref, t_ref, X_ref );
        REF_Apply_Q( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_ROWWISE, A_ref, t_ref, X_ref );
        break;
      case FLA_ALG_FRONT:
        FLA_Copy_external( B, X );
        FLA_Trsm_external( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE,
                           FLA_NONUNIT_DIAG, FLA_ONE, A, X );
        FLA_Apply_Q_UT( FLA_LEFT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE, A, T, W, X );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  if ( type == FLA_ALG_REFERENCE )
  {
    //FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A_ref, X_ref, FLA_ONE, B_ref );
    //FLA_Nrm2_external( B_ref, normx );
    //FLA_Copy_object_to_buffer( FLA_NO_TRANSPOSE, 0, 0, normx, 1, 1, diff, 1, 1 );

    //FLA_Obj_show( "X_ref:", X_ref, "%12.4e", "" );

    *diff = 0.0;
  }
  else
  {
    //FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A, X, FLA_ONE, B );
    //FLA_Nrm2_external( B, normx );
    //FLA_Copy_object_to_buffer( FLA_NO_TRANSPOSE, 0, 0, normx, 1, 1, diff, 1, 1 );

    //FLA_Obj_show( "X_fla:", X, "%12.4e", "" );

    *diff = FLA_Max_elemwise_diff( X, X_ref );
  }

  *gflops = 1.0 / 3.0  * 
            FLA_Obj_length( A ) * 
            FLA_Obj_length( B ) * 
            FLA_Obj_width( B ) / 
            dtime_old / 1e9;

  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 4.0;

  *dtime = dtime_old;

  FLA_Copy_external( B_save, B );
  FLA_Copy_external( B_save, B_ref );

  FLA_Obj_free( &B_save );
  FLA_Obj_free( &normx );
}
예제 #14
0
파일: driver.c 프로젝트: pgawron/tlash
int main(int argc, char *argv[])
{
  int 
    n, nfirst, nlast, ninc, i, irep,
    nrepeats, nb_alg;

  double
    dtime,
    dtime_best,
    gflops,
    max_gflops,
    diff,
    d_n;

  FLA_Obj
    A, Aref, Aold, delta;
  
  /* Initialize FLAME */
  FLA_Init( );

  /* Every time trial is repeated "repeat" times and the fastest run in recorded */
  printf( "%% number of repeats:" );
  scanf( "%d", &nrepeats );
  printf( "%% %d\n", nrepeats );

  /* Enter the max GFLOPS attainable 
     This is used to set the y-axis range for the graphs. Here is how
     you figure out what to enter (on Linux machines):
     1) more /proc/cpuinfo   (this lists the contents of this file).
     2) read through this and figure out the clock rate of the machine (in GHz).
     3) Find out (from an expert of from the web) the number of floating point
        instructions that can be performed per core per clock cycle.
     4) Figure out if you are using "multithreaded BLAS" which automatically
        parallelize calls to the Basic Linear Algebra Subprograms.  If so,
        check how many cores are available.
     5) Multiply 2) x 3) x 4) and enter this in response to the below.
  */

  printf( "%% enter max GFLOPS:" );
  scanf( "%lf", &max_gflops );
  printf( "%% %lf\n", max_gflops );

  /* Enter the algorithmic block size */
  printf( "%% enter nb_alg:" );
  scanf( "%d", &nb_alg );
  printf( "%% %d\n", nb_alg );

  /* Turn on parameter checking */
  FLA_Check_error_level_set( FLA_FULL_ERROR_CHECKING );

  /* Timing trials for matrix sizes n=nfirst to nlast in increments 
     of ninc will be performed */
  printf( "%% enter nfirst, nlast, ninc:" );
  scanf( "%d%d%d", &nfirst, &nlast, &ninc );
  printf( "%% %d %d %d\n", nfirst, nlast, ninc );

  i = 1;
  for ( n=nfirst; n<= nlast; n+=ninc ){
   
    /* Allocate space for the matrices */
    FLA_Obj_create( FLA_DOUBLE, n, n, 0, 0, &A );
    FLA_Obj_create( FLA_DOUBLE, n, n, 0, 0, &Aref );
    FLA_Obj_create( FLA_DOUBLE, n, n, 0, 0, &Aold );
    FLA_Obj_create( FLA_DOUBLE, 1, 1, 0, 0, &delta );

    /* Generate random matrix A and save in Aold */
    FLA_Random_matrix( Aold );

    /* Add something large to the diagonal to make sure it isn't ill-conditionsed */
    d_n = ( double ) n;
    *( ( double * ) FLA_Obj_buffer( delta ) ) = d_n;
    FLA_Shift_diag( FLA_NO_CONJUGATE, delta, Aold );
    
    /* Set gflops = billions of floating point operations that will be performed */
    gflops = 2.0/3.0 * n * n * n * 1.0e-09;

    /* Time the reference implementation */

    for ( irep=0; irep<nrepeats; irep++ ){
      FLA_Copy( Aold, Aref );

      dtime = FLA_Clock();

      REF_LU( Aref );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
        dtime_best = dtime;
      else
        dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }

    printf( "data_REF( %d, 1:2 ) = [ %d %le ];\n", i, n,
            gflops / dtime_best );
    fflush( stdout );


    /* Time FLA_LU */

    for ( irep=0; irep<nrepeats; irep++ ){
      FLA_Copy( Aold, A );

      dtime = FLA_Clock();

      FLA_LU_nopiv( A );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
        dtime_best = dtime;
      else
        dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }

    printf( "data_FLAME( %d, 1:2 ) = [ %d %le ];\n", i, n,
            gflops / dtime_best );
    fflush( stdout );


    /* Time the your implementations */


    /* Variant 1 unblocked */

    for ( irep=0; irep<nrepeats; irep++ ){

      FLA_Copy( Aold, A );
    
      dtime = FLA_Clock();

      LU_unb_var1( A );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
        dtime_best = dtime;
      else
        dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }    

    diff = FLA_Max_elemwise_diff( A, Aref );

    printf( "data_unb_var1( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );

    /* Variant 1 blocked */

    for ( irep=0; irep<nrepeats; irep++ ){
      FLA_Copy( Aold, A );
    
      dtime = FLA_Clock();

      LU_blk_var1( A, nb_alg );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
        dtime_best = dtime;
      else
        dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }

    diff = FLA_Max_elemwise_diff( A, Aref );

    printf( "data_blk_var1( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );


    /* Variant 2 unblocked */

    for ( irep=0; irep<nrepeats; irep++ ){

      FLA_Copy( Aold, A );
    
      dtime = FLA_Clock();

      LU_unb_var2( A );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
        dtime_best = dtime;
      else
        dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }    

    diff = FLA_Max_elemwise_diff( A, Aref );

    printf( "data_unb_var2( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );

    /* Variant 2 blocked */

    for ( irep=0; irep<nrepeats; irep++ ){
      FLA_Copy( Aold, A );
    
      dtime = FLA_Clock();

      LU_blk_var2( A, nb_alg );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
        dtime_best = dtime;
      else
        dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }

    diff = FLA_Max_elemwise_diff( A, Aref );

    printf( "data_blk_var2( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );

    /* Variant 3 unblocked */

    for ( irep=0; irep<nrepeats; irep++ ){

      FLA_Copy( Aold, A );
    
      dtime = FLA_Clock();

      LU_unb_var3( A );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
        dtime_best = dtime;
      else
        dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }    

    diff = FLA_Max_elemwise_diff( A, Aref );

    printf( "data_unb_var3( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );

    /* Variant 3 blocked */

    for ( irep=0; irep<nrepeats; irep++ ){
      FLA_Copy( Aold, A );
    
      dtime = FLA_Clock();

      LU_blk_var3( A, nb_alg );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
        dtime_best = dtime;
      else
        dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }

    diff = FLA_Max_elemwise_diff( A, Aref );

    printf( "data_blk_var3( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );

    /* Variant 4 unblocked */

    for ( irep=0; irep<nrepeats; irep++ ){

      FLA_Copy( Aold, A );
    
      dtime = FLA_Clock();

      LU_unb_var4( A );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
        dtime_best = dtime;
      else
        dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }    

    diff = FLA_Max_elemwise_diff( A, Aref );

    printf( "data_unb_var4( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );

    /* Variant 4 blocked */

    for ( irep=0; irep<nrepeats; irep++ ){
      FLA_Copy( Aold, A );
    
      dtime = FLA_Clock();

      LU_blk_var4( A, nb_alg );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
        dtime_best = dtime;
      else
        dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }

    diff = FLA_Max_elemwise_diff( A, Aref );

    printf( "data_blk_var4( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );

    /* Variant 5 unblocked */

    for ( irep=0; irep<nrepeats; irep++ ){

      FLA_Copy( Aold, A );
    
      dtime = FLA_Clock();

      LU_unb_var5( A );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
        dtime_best = dtime;
      else
        dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }    

    diff = FLA_Max_elemwise_diff( A, Aref );

    printf( "data_unb_var5( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );

    /* Variant 5 blocked */

    for ( irep=0; irep<nrepeats; irep++ ){
      FLA_Copy( Aold, A );
    
      dtime = FLA_Clock();

      LU_blk_var5( A, nb_alg );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
        dtime_best = dtime;
      else
        dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }

    diff = FLA_Max_elemwise_diff( A, Aref );

    printf( "data_blk_var5( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );

    FLA_Obj_free( &A );
    FLA_Obj_free( &Aold );
    FLA_Obj_free( &Aref );
    FLA_Obj_free( &delta );
    printf( "\n" );

    i++;
  }

  /* Print the MATLAB commands to plot the data */

  /* Delete all existing figures */
  printf( "close all\n" );

  /* Plot the performance of FLAME */
  printf( "plot( data_FLAME( :,1 ), data_FLAME( :, 2 ), 'k--' ); \n" );

  /* Indicate that you want to add to the existing plot */
  printf( "hold on\n" );

  /* Plot the performance of the reference implementation */
  printf( "plot( data_REF( :,1 ), data_REF( :, 2 ), 'k-' ); \n" );

  /* Plot the performance of your implementations */

  printf( "plot( data_unb_var1( :,1 ), data_unb_var1( :, 2 ), 'r-.' ); \n" );
  printf( "plot( data_unb_var2( :,1 ), data_unb_var2( :, 2 ), 'g-.' ); \n" );
  printf( "plot( data_unb_var3( :,1 ), data_unb_var3( :, 2 ), 'b-.' ); \n" );
  printf( "plot( data_unb_var4( :,1 ), data_unb_var4( :, 2 ), 'm-.' ); \n" );
  printf( "plot( data_unb_var5( :,1 ), data_unb_var5( :, 2 ), 'c-.' ); \n" );
  printf( "plot( data_blk_var1( :,1 ), data_blk_var1( :, 2 ), 'r--' ); \n" );
  printf( "plot( data_blk_var2( :,1 ), data_blk_var2( :, 2 ), 'g--' ); \n" );
  printf( "plot( data_blk_var3( :,1 ), data_blk_var3( :, 2 ), 'b--' ); \n" );
  printf( "plot( data_blk_var4( :,1 ), data_blk_var4( :, 2 ), 'm--' ); \n" );
  printf( "plot( data_blk_var5( :,1 ), data_blk_var5( :, 2 ), 'c--' ); \n" );

  printf( "hold on \n");

  printf( "xlabel( 'matrix dimension m=n' );\n");
  printf( "ylabel( 'GFLOPS/sec.' );\n");
  printf( "axis( [ 0 %d 0 %3.1f ] ); \n", nlast, max_gflops );
  printf( "legend( 'FLA LU nopiv', ...\n");
  printf( "        'Simple loops', ...\n");
  printf( "        'unb var1', ...\n");
  printf( "        'unb var2', ...\n");
  printf( "        'unb var3', ...\n");
  printf( "        'unb var4', ...\n");
  printf( "        'unb var5', ...\n");
  printf( "        'blk var1', ...\n");
  printf( "        'blk var2', ...\n");
  printf( "        'blk var3', ...\n");
  printf( "        'blk var4', ...\n");
  printf( "        'blk var5', 2);\n");

  printf( "print -r100 -depsc LU.eps\n");

  FLA_Finalize( );

  return 0;
}
예제 #15
0
void libfla_test_hessut_experiment( test_params_t params,
                                    unsigned int  var,
                                    char*         sc_str,
                                    FLA_Datatype  datatype,
                                    unsigned int  p_cur,
                                    unsigned int  pci,
                                    unsigned int  n_repeats,
                                    signed int    impl,
                                    double*       perf,
                                    double*       residual )
{
	dim_t        b_alg_flat = params.b_alg_flat;
	double       time_min   = 1e9;
	double       time;
	unsigned int i;
	unsigned int m;
	signed int   m_input    = -1;
	FLA_Obj      A, T, W, Qh, AQ, QhAQ, norm;
	FLA_Obj      AT, AB;
	FLA_Obj      QhT, QhB;
	FLA_Obj      A_save;

	// Determine the dimensions.
	if ( m_input < 0 ) m = p_cur * abs(m_input);
	else               m = p_cur;

	// Create the matrices for the current operation.
	libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[0], m, m, &A );

	if ( impl == FLA_TEST_FLAT_FRONT_END ||
	     impl == FLA_TEST_FLAT_BLK_VAR )
	{
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], b_alg_flat, m, &T );
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], b_alg_flat, m, &W );
	}
	else
	{
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, m, &T );
		libfla_test_obj_create( datatype, FLA_NO_TRANSPOSE, sc_str[1], m, m, &W );
	}

	// Initialize the test matrices.
	FLA_Random_matrix( A );

	// Save the original object contents in a temporary object.
	FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_save );

	// Create auxiliary matrices to be used when checking the result.
	FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Qh );
	FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &AQ );
	FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &QhAQ );

	// Create a real scalar object to hold the norm of A.
	FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );

	// Create a control tree for the individual variants.
	if ( impl == FLA_TEST_FLAT_UNB_VAR ||
	     impl == FLA_TEST_FLAT_OPT_VAR ||
	     impl == FLA_TEST_FLAT_BLK_VAR )
		libfla_test_hessut_cntl_create( var, b_alg_flat );

	// Repeat the experiment n_repeats times and record results.
	for ( i = 0; i < n_repeats; ++i )
	{
		FLA_Copy_external( A_save, A );
		
		time = FLA_Clock();

		libfla_test_hessut_impl( impl, A, T );
		
		time = FLA_Clock() - time;
		time_min = min( time_min, time );
	}

	// Free the control trees if we're testing the variants.
	if ( impl == FLA_TEST_FLAT_UNB_VAR ||
	     impl == FLA_TEST_FLAT_OPT_VAR ||
	     impl == FLA_TEST_FLAT_BLK_VAR )
		libfla_test_hessut_cntl_free();

	// Compute the performance of the best experiment repeat.
	*perf = ( 10.0 / 3.0 * m * m * m ) / time_min / FLOPS_PER_UNIT_PERF;
	if ( FLA_Obj_is_complex( A ) ) *perf *= 4.0;

	// Check the result by computing R - Q' A_orig Q.
	FLA_Set_to_identity( Qh );
	FLA_Part_2x1( Qh,   &QhT,
	                    &QhB,   1, FLA_TOP );
	FLA_Part_2x1( A,    &AT,
	                    &AB,    1, FLA_TOP );
	FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_COLUMNWISE,
	                AB, T, W, QhB );
	FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
	          FLA_ONE, A_save, Qh, FLA_ZERO, AQ );
	FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
	          FLA_ONE, Qh, AQ, FLA_ZERO, QhAQ );
	FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, AB );
	*residual = FLA_Max_elemwise_diff( A, QhAQ );

	// Free the supporting flat objects.
	FLA_Obj_free( &W );
	FLA_Obj_free( &Qh );
	FLA_Obj_free( &AQ );
	FLA_Obj_free( &QhAQ );
	FLA_Obj_free( &norm );
	FLA_Obj_free( &A_save );

	// Free the flat test matrices.
	FLA_Obj_free( &A );
	FLA_Obj_free( &T );
}
예제 #16
0
void time_Copyt( 
               int param_combo, int type, int nrepeats, int m, int n,
               FLA_Obj A, FLA_Obj C, FLA_Obj C_ref,
               double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_old = 1.0e9;

  FLA_Obj
    C_old;

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_old );

  FLA_Copy_external( C, C_old );


  for ( irep = 0 ; irep < nrepeats; irep++ ){
    FLA_Copy_external( C_old, C );

    *dtime = FLA_Clock();

    switch( param_combo ){

    // Time parameter combination 0
    case 0:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Copyt( FLA_NO_TRANSPOSE, A, C );
        break;
      case FLA_ALG_FRONT:
        FLA_Copyt( FLA_NO_TRANSPOSE, A, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 1:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Copyt( FLA_TRANSPOSE, A, C );
        break;
      case FLA_ALG_FRONT:
        FLA_Copyt( FLA_TRANSPOSE, A, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 2:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Copyt( FLA_CONJ_NO_TRANSPOSE, A, C );
        break;
      case FLA_ALG_FRONT:
        FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, A, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 3:{
      switch( type ){
      case FLA_ALG_REFERENCE:
        REF_Copyt( FLA_CONJ_TRANSPOSE, A, C );
        break;
      case FLA_ALG_FRONT:
        FLA_Copyt( FLA_CONJ_TRANSPOSE, A, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    }
	
    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }


  if ( type == FLA_ALG_REFERENCE )
  {
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }

  *gflops = 2.0 * m * n / 
            dtime_old / 
            1.0e9;

  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 4.0;

  *dtime = dtime_old;

  FLA_Copy_external( C_old, C );

  FLA_Obj_free( &C_old );
}
예제 #17
0
void time_Syrk_ln( 
	       int variant, int type, int nrepeats, int n, int nb_alg,
	       FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj Cref,
	       double *dtime, double *diff, double *gflops )
{
  int
    irep,
    info, lwork;

  double
    dtime_old,
    d_minus_one = -1.0, d_one = 1.0;

  FLA_Obj
    Cold;

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &Cold );

  FLA_Copy_external( C, Cold );

  for ( irep = 0 ; irep < nrepeats; irep++ ){
    FLA_Copy_external( Cold, C );

    *dtime = FLA_Clock();

    switch( variant ){
    case 0:
      // Time reference implementation
      REF_Syrk_ln( FLA_ONE, A, FLA_ONE, C );
      break;

    default:
	 printf("trouble\n");
      break;
    }

    if ( irep == 0 )
      dtime_old = FLA_Clock() - *dtime;
    else{
      *dtime = FLA_Clock() - *dtime;
      dtime_old = min( *dtime, dtime_old );
    }
  }


  if ( variant == 0 ){
    FLA_Copy_external( C, Cref );
    *diff = 0.0;
  }
  else{
    *diff = FLA_Max_elemwise_diff( C, Cref );
  }

  *gflops = 1.0 * 
            FLA_Obj_length( A ) * 
            FLA_Obj_length( A ) * 
            FLA_Obj_width( A ) / 
            dtime_old / 
            1e9;

  *dtime = dtime_old;

  FLA_Copy_external( Cold, C );

  FLA_Obj_free( &Cold );
}
예제 #18
0
파일: lq.c 프로젝트: anaptyxis/libflame
int main( int argc, char** argv ) {
  FLA_Datatype datatype = TESTTYPE;
  FLA_Obj      A, Ak, T, Tk, D, Dk, A_copy, A_recovered, L, Q, Qk, W, x, y, z;
  dim_t        m, n, k;
  dim_t        min_m_n;
  FLA_Error    init_result; 
  double       residual_A, residual_Axy;
  int          use_form_q = 1;

  if ( argc == 4 ) {
    m = atoi(argv[1]);
    n = atoi(argv[2]);
    k = atoi(argv[3]);
    min_m_n = min(m,n);
  } else {
    fprintf(stderr, "       \n");
    fprintf(stderr, "Usage: %s m n k\n", argv[0]);
    fprintf(stderr, "       m : matrix length\n");
    fprintf(stderr, "       n : matrix width\n");
    fprintf(stderr, "       k : number of house holder vectors applied for testing\n");
    fprintf(stderr, "       \n");
    return -1;
  }
  if ( m == 0 || n == 0 )
    return 0;

  FLA_Init_safe( &init_result );          

  // FLAME LQ^H setup
  FLA_Obj_create( datatype, m, n, 0, 0, &A );
  FLA_LQ_UT_create_T( A, &T );
  
  // Rand A and create A_copy.
  FLA_Random_matrix( A ); 

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_copy );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_recovered );
  FLA_Copy( A, A_copy );

  // LQ test ( A = L Q^H )
  FLA_LQ_UT( A, T );

  // Create Q (identity), L (A_copy)
  FLA_Obj_create( datatype, m, n, 0, 0, &Q  ); FLA_Set_to_identity( Q  );
  FLA_Obj_create( datatype, m, m, 0, 0, &D  );

  FLA_Obj_create( datatype, k, n, 0, 0, &Qk ); FLA_Set_to_identity( Qk );
  FLA_Obj_create( datatype, k, k, 0, 0, &Dk  );

  
  FLA_Obj_create( datatype, m, m, 0, 0, &L );

  // Q^H := I H_{0}^H ... H_{k-1}^H
  if ( use_form_q ) {
    FLA_LQ_UT_form_Q( A, T, Q );
  } else {
    FLA_Apply_Q_UT_create_workspace_side( FLA_RIGHT, T, Q, &W );
    FLA_Apply_Q_UT( FLA_RIGHT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE,
                    A, T, W, Q );
    FLA_Obj_free( &W );
  }

  // D := Q^T Q
  FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, 
                     FLA_ONE, Q, Q, FLA_ZERO, D ); 

  // Qk := I H0 ... Hk
  FLA_Part_1x2( T, &Tk, &W, k, FLA_LEFT );
  FLA_Part_2x1( A, &Ak, &W, k, FLA_TOP );

  if ( use_form_q ) {
    // Overwrite the result to test FLAME API
    FLA_Set( FLA_ZERO, Qk );
    FLA_Copy( Ak, Qk );
    FLA_LQ_UT_form_Q( Ak, Tk, Qk );
  } else {
    FLA_Apply_Q_UT_create_workspace( Tk, Qk, &W );
    FLA_Apply_Q_UT( FLA_LEFT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE,
                    Ak, Tk, W, Qk );
    FLA_Obj_free( &W );
  }

  // Dk := Qk^T Qk
  FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, 
                     FLA_ONE, Qk, Qk, FLA_ZERO, Dk ); 

  // L := A (Q^H)^H
  if ( use_form_q ) {
    // Note that the formed Q is actually Q^H; transb should be carefully assigned.
    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, 
                       FLA_ONE, A_copy, Q, FLA_ZERO, L );
  } else {
    FLA_Apply_Q_UT_create_workspace( T, L, &W );
    FLA_Apply_Q_UT( FLA_RIGHT, FLA_NO_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE,
                    A, T, W, L );
    FLA_Obj_free( &W );
  }
  FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, 
                     FLA_ONE, L, Q, FLA_ZERO, A_recovered ); 

  // Create vectors for testing
  FLA_Obj_create( datatype, n, 1, 0, 0, &x ); FLA_Set( FLA_ZERO, x );
  FLA_Obj_create( datatype, m, 1, 0, 0, &y ); FLA_Set( FLA_ZERO, y );
  FLA_Obj_create( datatype, m, 1, 0, 0, &z ); FLA_Set( FLA_ZERO, z );

  // x is given
  FLA_Set( FLA_ONE, x );

  // y := Ax
  FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, A_copy, x, FLA_ZERO, y );
  
  // z := L (Q^H) x , libflame
  FLA_Apply_Q_UT_create_workspace( T, x, &W );
  FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE, FLA_FORWARD, FLA_ROWWISE,
                  A, T, W, x );
  FLA_Obj_free( &W );

  if ( m < n )
    FLA_Part_2x1( x, &x, &W, m, FLA_TOP );
  else 
    FLA_Part_1x2( L, &L, &W, n, FLA_LEFT );    

  FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, L, x, FLA_ZERO, z );
  
  // Comapre (A_copy, A_recovered), (y,z) and (y,w)
  residual_A    = FLA_Max_elemwise_diff( A_copy, A_recovered );
  residual_Axy  = FLA_Max_elemwise_diff( y, z );

  if ( 1 || residual_A > EPS || residual_Axy > EPS ) {
    FLA_Obj_fshow( stdout, " - Given - ", A_copy, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Factor - ", A, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - T - ", T, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Q - ", Q, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - D = Q^T Q - ", D, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Qk - ", Qk, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Dk = Qk^T Qk - ", Dk, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - L - ", L, "% 6.4e", "------");
    FLA_Obj_fshow( stdout, " - Recovered A - ", A_recovered, "% 6.4e", "------");
    fprintf( stdout, "lapack2flame: %lu x %lu, %lu: ", m, n, k);
    fprintf( stdout, "| A - A_recovered | = %12.10e, | Ax - y | = %12.10e\n\n",
             residual_A, residual_Axy ) ;
  }
  
  FLA_Obj_free( &A );
  FLA_Obj_free( &T );

  FLA_Obj_free( &A_copy );
  FLA_Obj_free( &A_recovered );
  FLA_Obj_free( &L );
  FLA_Obj_free( &Q );
  FLA_Obj_free( &Qk );
  FLA_Obj_free( &D );
  FLA_Obj_free( &Dk );

  FLA_Obj_free( &x );
  FLA_Obj_free( &y );
  FLA_Obj_free( &z );
                   
  FLA_Finalize_safe( init_result );     
}
예제 #19
0
int main(int argc, char *argv[])
{
  int n, nfirst, nlast, ninc, nlast_unb, i, irep,
    nrepeats, nb_alg;

  double
    dtime, dtime_best, 
    gflops, max_gflops,
    diff, d_n;

  FLA_Obj
    A, Aref, Aold, delta;
  
  /* Initialize FLAME */
  FLA_Init( );

  /* Every time trial is repeated "repeat" times and the fastest run in recorded */
  printf( "%% number of repeats:" );
  scanf( "%d", &nrepeats );
  printf( "%% %d\n", nrepeats );

  /* Enter the max GFLOPS attainable 
     This is used to set the y-axis range for the graphs. Here is how
     you figure out what to enter (on Linux machines):
     1) more /proc/cpuinfo   (this lists the contents of this file).
     2) read through this and figure out the clock rate of the machine (in GHz).
     3) Find out (from an expert of from the web) the number of floating point
        instructions that can be performed per core per clock cycle.
     4) Figure out if you are using "multithreaded BLAS" which automatically
        parallelize calls to the Basic Linear Algebra Subprograms.  If so,
        check how many cores are available.
     5) Multiply 2) x 3) x 4) and enter this in response to the below.

     If you enter a value for max GFLOPS that is lower that the maximum that
     is observed in the experiments, then the top of the graph is set to the 
     observed maximum.  Thus, one possibility is to simply set this to 0.0.
  */

  printf( "%% enter max GFLOPS:" );
  scanf( "%lf", &max_gflops );
  printf( "%% %lf\n", max_gflops );

  /* Enter the algorithmic block size */
  printf( "%% enter nb_alg:" );
  scanf( "%d", &nb_alg );
  printf( "%% %d\n", nb_alg );

  /* Timing trials for matrix sizes n=nfirst to nlast in increments 
     of ninc will be performed.  Unblocked versions are only tested to
     nlast_unb */
  printf( "%% enter nfirst, nlast, ninc, nlast_unb:" );
  scanf( "%d%d%d%d", &nfirst, &nlast, &ninc, &nlast_unb );
  printf( "%% %d %d %d %d\n", nfirst, nlast, ninc, nlast_unb );

  i = 1;
  for ( n=nfirst; n<= nlast; n+=ninc ){
   
    /* Allocate space for the matrices */
    FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &A );
    FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &Aref );
    FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &Aold );
    FLA_Obj_create( FLA_DOUBLE, 1, 1, 1, 1, &delta );

    /* Generate random matrix A and save in Aold */
    FLA_Random_matrix( Aold );

    /* Add something large to the diagonal to make sure it isn't ill-conditionsed */
    d_n = ( double ) n;
    *( ( double * ) FLA_Obj_buffer_at_view( delta ) ) = d_n;
    FLA_Shift_diag( FLA_NO_CONJUGATE, delta, Aold );
    
    /* Set gflops = billions of floating point operations that will be performed */
    gflops = 1.0/3.0 * n * n * n * 1.0e-09;

    /* Time the reference implementation */
#if TIME_LAPACK == TRUE

#else
    //    if ( n <= nlast_unb )
#endif
    {
      for ( irep=0; irep<nrepeats; irep++ ){
	FLA_Copy( Aold, Aref );
    
	dtime = FLA_Clock();
    
	REF_Chol( TIME_LAPACK, Aref, nb_alg );
    
	dtime = FLA_Clock() - dtime;
    
	if ( irep == 0 ) 
	  dtime_best = dtime;
	else
	  dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
      }
  
      printf( "data_REF( %d, 1:2 ) = [ %d %le ];\n", i, n,
	      gflops / dtime_best );
      fflush( stdout );
    }  

    /* Time FLA_Chol */

    for ( irep=0; irep<nrepeats; irep++ ){
      FLA_Copy( Aold, A );

      dtime = FLA_Clock();

      FLA_Chol( FLA_LOWER_TRIANGULAR, A );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
	dtime_best = dtime;
      else
	dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }

    printf( "data_FLAME( %d, 1:2 ) = [ %d %le ];\n", i, n,
            gflops / dtime_best );

    if ( gflops / dtime_best > max_gflops ) 
      max_gflops = gflops / dtime_best;

    fflush( stdout );


    /* Time the your implementations */


    /* Variant 1 unblocked */
    
    if ( n <= nlast_unb ){
      for ( irep=0; irep<nrepeats; irep++ ){

	FLA_Copy( Aold, A );
    
	dtime = FLA_Clock();

#if TIME_UNB_VAR1 == TRUE
	Chol_unb_var1( A );
#else
	REF_Chol( TIME_LAPACK, A, nb_alg );
#endif


	dtime = FLA_Clock() - dtime;

	if ( irep == 0 ) 
	  dtime_best = dtime;
	else
	  dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
      }    

      diff = FLA_Max_elemwise_diff( A, Aref );

      printf( "data_unb_var1( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
	      gflops / dtime_best, diff );
      fflush( stdout );
    }

    /* Variant 1 blocked */

    for ( irep=0; irep<nrepeats; irep++ ){
      FLA_Copy( Aold, A );
    
      dtime = FLA_Clock();

#if TIME_BLK_VAR1 == TRUE
      Chol_blk_var1( A, nb_alg );
#else
      REF_Chol( TIME_LAPACK, A, nb_alg );
#endif

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
	dtime_best = dtime;
      else
	dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }

    diff = FLA_Max_elemwise_diff( A, Aref );

    printf( "data_blk_var1( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );


    /* Variant 2 unblocked */
    if ( n <= nlast_unb ){
      for ( irep=0; irep<nrepeats; irep++ ){
	
	FLA_Copy( Aold, A );
	
	dtime = FLA_Clock();
	

#if TIME_UNB_VAR2 == TRUE
	Chol_unb_var2( A );
#else	
      REF_Chol( TIME_LAPACK, A, nb_alg );
#endif

	dtime = FLA_Clock() - dtime;
	
	if ( irep == 0 ) 
	  dtime_best = dtime;
	else
	  dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
      }    
      
      diff = FLA_Max_elemwise_diff( A, Aref );
      
      printf( "data_unb_var2( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
	      gflops / dtime_best, diff );
      fflush( stdout );
    }

    /* Variant 2 blocked */

    for ( irep=0; irep<nrepeats; irep++ ){
      FLA_Copy( Aold, A );
    
      dtime = FLA_Clock();

#if TIME_BLK_VAR2 == TRUE
      Chol_blk_var2( A, nb_alg );
#else
      REF_Chol( TIME_LAPACK, A, nb_alg );
#endif

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
	dtime_best = dtime;
      else
	dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }

    diff = FLA_Max_elemwise_diff( A, Aref );

    printf( "data_blk_var2( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );

    /* Variant 3 unblocked */
    if ( n <= nlast_unb ){
      for ( irep=0; irep<nrepeats; irep++ ){
	
	FLA_Copy( Aold, A );
	
	dtime = FLA_Clock();
	
#if TIME_UNB_VAR3 == TRUE
	Chol_unb_var3( A );
#else
      REF_Chol( TIME_LAPACK, A, nb_alg );
#endif

	dtime = FLA_Clock() - dtime;
	
	if ( irep == 0 ) 
	  dtime_best = dtime;
	else
	  dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
      }    
      
      diff = FLA_Max_elemwise_diff( A, Aref );
      
      printf( "data_unb_var3( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
	      gflops / dtime_best, diff );
      fflush( stdout );
    }

    /* Variant 3 blocked */

    for ( irep=0; irep<nrepeats; irep++ ){
      FLA_Copy( Aold, A );
    
      dtime = FLA_Clock();

#if TIME_BLK_VAR3 == TRUE
      Chol_blk_var3( A, nb_alg );
#else
      REF_Chol( TIME_LAPACK, A, nb_alg );
#endif

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
	dtime_best = dtime;
      else
	dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }

    diff = FLA_Max_elemwise_diff( A, Aref );

    printf( "data_blk_var3( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );

    FLA_Obj_free( &A );
    FLA_Obj_free( &Aold );
    FLA_Obj_free( &Aref );
    FLA_Obj_free( &delta );
    printf( "\n" );

    i++;
  }

  /* Print the MATLAB commands to plot the data */

  /* Delete all existing figures */
  printf( "close all\n" );


#if OCTAVE == TRUE
  /* Plot the performance of FLAME */
  printf( "plot( data_FLAME( :,1 ), data_FLAME( :, 2 ), '-k;libflame;' ); \n" );

  /* Indicate that you want to add to the existing plot */
  printf( "hold on\n" );

  /* Plot the performance of the reference implementation */
  printf( "plot( data_REF( :,1 ), data_REF( :, 2 ), '-m;reference;' ); \n" );

  /* Plot the performance of your implementations */
  printf( "plot( data_unb_var1( :,1 ), data_unb_var1( :, 2 ), \"-rx;UnbVar1;\" ); \n" );
  printf( "plot( data_unb_var2( :,1 ), data_unb_var2( :, 2 ), \"-go;UnbVar2;\" ); \n" );
  printf( "plot( data_unb_var3( :,1 ), data_unb_var3( :, 2 ), \"-b*;UnbVar3;\" ); \n" );
  printf( "plot( data_blk_var1( :,1 ), data_blk_var1( :, 2 ), \"-rx;BlkVar1;\", \"markersize\", 3 ); \n" );
  printf( "plot( data_blk_var2( :,1 ), data_blk_var2( :, 2 ), \"-go;BlkVar2;\", \"markersize\", 3  ); \n" );
  printf( "plot( data_blk_var3( :,1 ), data_blk_var3( :, 2 ), \"-b*;BlkVar3;\", \"markersize\", 3  ); \n" );

#else

  /* Plot the performance of FLAME */
  printf( "plot( data_FLAME( :,1 ), data_FLAME( :, 2 ), 'k--' ); \n" );

  /* Indicate that you want to add to the existing plot */
  printf( "hold on\n" );

  /* Plot the performance of the reference implementation */
  printf( "plot( data_REF( :,1 ), data_REF( :, 2 ), 'k-' ); \n" );

  /* Plot the performance of your implementations */
  printf( "plot( data_unb_var1( :,1 ), data_unb_var1( :, 2 ), 'r-.x' ); \n" );
  printf( "plot( data_unb_var2( :,1 ), data_unb_var2( :, 2 ), 'g-.o' ); \n" );
  printf( "plot( data_unb_var3( :,1 ), data_unb_var3( :, 2 ), 'b-.*' ); \n" );
  printf( "plot( data_blk_var1( :,1 ), data_blk_var1( :, 2 ), 'r-x'); \n" );
  printf( "plot( data_blk_var2( :,1 ), data_blk_var2( :, 2 ), 'g-o'); \n" );
  printf( "plot( data_blk_var3( :,1 ), data_blk_var3( :, 2 ), 'b-*'); \n" );
#endif

  printf( "hold off \n");

  printf( "xlabel( 'matrix dimension m=n' );\n");
  printf( "ylabel( 'GFLOPS/sec.' );\n");
  printf( "axis( [ 0 %d 0 %3.1f ] ); \n", nlast, max_gflops );

#if OCTAVE == TRUE
  printf( "legend( 2 ); \n" );

  printf(" print -landscape -solid -color -deps -F:24 Chol.eps\n" );
#else
  printf( "legend( 'FLA Chol', ...\n");
  printf( "        'Simple loops', ...\n");
  printf( "        'unb var1', ...\n");
  printf( "        'unb var2', ...\n");
  printf( "        'unb var3', ...\n");
  printf( "        'blk var1', ...\n");
  printf( "        'blk var2', ...\n");
  printf( "        'blk var3', 2);\n");

  printf( "print -r100 -dpdf Chol.pdf\n");
#endif

  FLA_Finalize( );

  exit( 0 );
}
예제 #20
0
void time_Transpose(
                  int variant, int type, int nrepeats, int n, int nb_alg,
                  FLA_Obj A, FLA_Obj A_ref,
                  double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_old = 1.0e9;

  FLA_Obj
    A_old, A_tmp;

  fla_blocksize_t*
    bp;
  fla_transpose_t*
    cntl_trans_var_unb;
  fla_transpose_t*
    cntl_trans_var_blk;
  fla_swap_t*
    cntl_swap_var_blk;
  fla_swap_t*
    cntl_swap_blas;


  bp                 = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
  cntl_swap_blas     = FLA_Cntl_swap_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL );
  cntl_swap_var_blk  = FLA_Cntl_swap_obj_create( FLA_FLAT, FLA_UNBLOCKED_VARIANT1, bp, cntl_swap_blas );
  cntl_trans_var_unb = FLA_Cntl_transpose_obj_create( FLA_FLAT, FLA_UNBLOCKED_VARIANT1, NULL, NULL, NULL );
  cntl_trans_var_blk = FLA_Cntl_transpose_obj_create( FLA_FLAT, variant, bp, cntl_trans_var_unb, cntl_swap_var_blk );

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_old );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_tmp );

  FLA_Copy_external( A, A_old );


  for ( irep = 0 ; irep < nrepeats; irep++ ){
    FLA_Copy_external( A_old, A );

    *dtime = FLA_Clock();

    switch( variant ){
    case 0:

      //FLA_Copyt_external( FLA_TRANSPOSE, A, A_tmp );
      //FLA_Set( FLA_ZERO, A );
      //FLA_Copyt_external( FLA_NO_TRANSPOSE, A_tmp, A );
      FLA_Transpose( A );

      break;

    case 1:{

      /* Time variant 1 */
      switch( type ){
      case FLA_ALG_UNBLOCKED:
        FLA_Transpose_unb_var1( A );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Transpose_blk_var1( A, cntl_trans_var_blk );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 2:{

      /* Time variant 2 */
      switch( type ){
      case FLA_ALG_UNBLOCKED:
        FLA_Transpose_unb_var2( A );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Transpose_blk_var2( A, cntl_trans_var_blk );
        break;
      default:
        printf("trouble\n");
      }

      break;
    } 

    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  FLA_Cntl_obj_free( cntl_trans_var_blk );
  FLA_Cntl_obj_free( cntl_trans_var_unb );
  FLA_Cntl_obj_free( cntl_swap_var_blk );
  FLA_Cntl_obj_free( cntl_swap_blas );
  FLA_Blocksize_free( bp );

  if ( variant == 0 ){
    FLA_Copy_external( A, A_ref );
    *diff = 0.0;
  }
  else{
    *diff = FLA_Max_elemwise_diff( A, A_ref );
  }

  *gflops = 4 * n * n /
            dtime_old / 1e9;

  *dtime = dtime_old;

  FLA_Copy_external( A_old, A );

  FLA_Obj_free( &A_old );
  FLA_Obj_free( &A_tmp );
}
예제 #21
0
int main(int argc, char *argv[])
{
  int m, n, k, nfirst, nlast, ninc, i, irep,
    nrepeats, nb_alg, check;;

  double
    dtime,
    dtime_best,
    gflops,
    max_gflops,
    diff,
    d_n;

  FLA_Obj
    A, B, C, Cref, Cold;
  
  /* Initialize FLAME */
  FLA_Init( );

  /* Every time trial is repeated "repeat" times */
  printf( "%% number of repeats:" );
  scanf( "%d", &nrepeats );
  printf( "%% %d\n", nrepeats );

  /* Enter the max GFLOPS attainable */
  printf( "%% enter max GFLOPS:" );
  scanf( "%lf", &max_gflops );
  printf( "%% %lf\n", max_gflops );

  /* Enter the algorithmic block size */
  printf( "%% enter nb_alg:" );
  scanf( "%d", &nb_alg );
  printf( "%% %d\n", nb_alg );

  /* Timing trials for matrix sizes n=nfirst to nlast in increments 
     of ninc will be performed */
  printf( "%% enter nfirst, nlast, ninc:" );
  scanf( "%d%d%d", &nfirst, &nlast, &ninc );
  printf( "%% %d %d %d\n", nfirst, nlast, ninc );

  i = 1;
  for ( n=nfirst; n<= nlast; n+=ninc ){
   
    /* Allocate space for the matrices */

    FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &A );
    FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &B );
    FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &C );
    FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &Cref );
    FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &Cold );

    /* Generate random matrices L and B */
    FLA_Random_matrix( A );
    FLA_Random_matrix( B );
    FLA_Random_matrix( Cold );

    gflops = 2.0 * n * n * n * 1.0e-09;

    /* Time FLA_Symm */

    for ( irep=0; irep<nrepeats; irep++ ){
      FLA_Copy( Cold, Cref );

      dtime = FLA_Clock();

      FLA_Symm( FLA_LEFT, FLA_LOWER_TRIANGULAR, 
		FLA_ONE, A, B, FLA_ONE, Cref );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
	dtime_best = dtime;
      else
	dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }

    printf( "data_FLAME( %d, 1:2 ) = [ %d %le ];\n", i, n,
            gflops / dtime_best );
    fflush( stdout );


    /* Time the your implementations */


#if TEST_UNB_VAR1==TRUE
    /* Variant 1 unblocked */

    for ( irep=0; irep<nrepeats; irep++ ){

      FLA_Copy( Cold, C );
    
      dtime = FLA_Clock();

      Symm_unb_var1( A, B, C );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
	dtime_best = dtime;
      else
	dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }    

    diff = FLA_Max_elemwise_diff( C, Cref );

    printf( "data_unb_var1( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );
#endif

#if TEST_BLK_VAR1==TRUE
    /* Variant 1 blocked */

    for ( irep=0; irep<nrepeats; irep++ ){
      FLA_Copy( Cold, C );
    
      dtime = FLA_Clock();

      Symm_blk_var1( A, B, C, nb_alg );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
	dtime_best = dtime;
      else
	dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }

    diff = FLA_Max_elemwise_diff( C, Cref );

    printf( "data_blk_var1( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );
#endif

#if TEST_UNB_VAR2==TRUE
    /* Variant 2 unblocked */

    for ( irep=0; irep<nrepeats; irep++ ){

      FLA_Copy( Cold, C );
    
      dtime = FLA_Clock();

      Symm_unb_var2( A, B, C );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
	dtime_best = dtime;
      else
	dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }    

    diff = FLA_Max_elemwise_diff( C, Cref );

    printf( "data_unb_var2( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );
#endif

#if TEST_BLK_VAR2==TRUE
    /* Variant 2 blocked */

    for ( irep=0; irep<nrepeats; irep++ ){
      FLA_Copy( Cold, C );
    
      dtime = FLA_Clock();

      Symm_blk_var2( A, B, C, nb_alg );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
	dtime_best = dtime;
      else
	dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }

    diff = FLA_Max_elemwise_diff( C, Cref );

    printf( "data_blk_var2( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );
#endif

#if TEST_UNB_VAR3==TRUE
    /* Variant 3 unblocked */

    for ( irep=0; irep<nrepeats; irep++ ){

      FLA_Copy( Cold, C );
    
      dtime = FLA_Clock();

      Symm_unb_var3( A, B, C );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
	dtime_best = dtime;
      else
	dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }    

    diff = FLA_Max_elemwise_diff( C, Cref );

    printf( "data_unb_var3( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );
#endif

#if TEST_BLK_VAR3==TRUE
    /* Variant 3 blocked */

    for ( irep=0; irep<nrepeats; irep++ ){
      FLA_Copy( Cold, C );
    
      dtime = FLA_Clock();

      Symm_blk_var3( A, B, C, nb_alg );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
	dtime_best = dtime;
      else
	dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }

    diff = FLA_Max_elemwise_diff( C, Cref );

    printf( "data_blk_var3( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );
#endif

#if TEST_UNB_VAR4==TRUE
    /* Variant 4 unblocked */

    for ( irep=0; irep<nrepeats; irep++ ){

      FLA_Copy( Cold, C );
    
      dtime = FLA_Clock();

      Symm_unb_var4( A, B, C );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
	dtime_best = dtime;
      else
	dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }    

    diff = FLA_Max_elemwise_diff( C, Cref );

    printf( "data_unb_var4( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );
#endif

#if TEST_BLK_VAR4==TRUE
    /* Variant 4 blocked */

    for ( irep=0; irep<nrepeats; irep++ ){
      FLA_Copy( Cold, C );
    
      dtime = FLA_Clock();

      Symm_blk_var4( A, B, C, nb_alg );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
	dtime_best = dtime;
      else
	dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }

    diff = FLA_Max_elemwise_diff( C, Cref );

    printf( "data_blk_var4( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );
#endif


#if TEST_UNB_VAR5==TRUE
    /* Variant 5 unblocked */

    for ( irep=0; irep<nrepeats; irep++ ){

      FLA_Copy( Cold, C );
    
      dtime = FLA_Clock();

      Symm_unb_var5( A, B, C );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
	dtime_best = dtime;
      else
	dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }    

    diff = FLA_Max_elemwise_diff( C, Cref );

    printf( "data_unb_var5( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );
#endif

#if TEST_BLK_VAR5==TRUE
    /* Variant 5 blocked */

    for ( irep=0; irep<nrepeats; irep++ ){
      FLA_Copy( Cold, C );
    
      dtime = FLA_Clock();

      Symm_blk_var5( A, B, C, nb_alg );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
	dtime_best = dtime;
      else
	dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }

    diff = FLA_Max_elemwise_diff( C, Cref );

    printf( "data_blk_var5( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );
#endif


#if TEST_UNB_VAR6==TRUE
    /* Variant 6 unblocked */

    for ( irep=0; irep<nrepeats; irep++ ){

      FLA_Copy( Cold, C );
    
      dtime = FLA_Clock();

      Symm_unb_var6( A, B, C );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
	dtime_best = dtime;
      else
	dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }    

    diff = FLA_Max_elemwise_diff( C, Cref );

    printf( "data_unb_var6( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );
#endif

#if TEST_BLK_VAR6==TRUE
    /* Variant 6 blocked */

    for ( irep=0; irep<nrepeats; irep++ ){
      FLA_Copy( Cold, C );
    
      dtime = FLA_Clock();

      Symm_blk_var6( A, B, C, nb_alg );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
	dtime_best = dtime;
      else
	dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }

    diff = FLA_Max_elemwise_diff( C, Cref );

    printf( "data_blk_var6( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );
#endif


#if TEST_UNB_VAR7==TRUE
    /* Variant 7 unblocked */

    for ( irep=0; irep<nrepeats; irep++ ){

      FLA_Copy( Cold, C );
    
      dtime = FLA_Clock();

      Symm_unb_var7( A, B, C );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
	dtime_best = dtime;
      else
	dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }    

    diff = FLA_Max_elemwise_diff( C, Cref );

    printf( "data_unb_var7( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );
#endif

#if TEST_BLK_VAR7==TRUE
    /* Variant 4 blocked */

    for ( irep=0; irep<nrepeats; irep++ ){
      FLA_Copy( Cold, C );
    
      dtime = FLA_Clock();

      Symm_blk_var7( A, B, C, nb_alg );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
	dtime_best = dtime;
      else
	dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }

    diff = FLA_Max_elemwise_diff( C, Cref );

    printf( "data_blk_var7( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );
#endif


#if TEST_UNB_VAR8==TRUE
    /* Variant 8 unblocked */

    for ( irep=0; irep<nrepeats; irep++ ){

      FLA_Copy( Cold, C );
    
      dtime = FLA_Clock();

      Symm_unb_var8( A, B, C );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
	dtime_best = dtime;
      else
	dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }    

    diff = FLA_Max_elemwise_diff( C, Cref );

    printf( "data_unb_var8( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );
#endif

#if TEST_BLK_VAR8==TRUE
    /* Variant 4 blocked */

    for ( irep=0; irep<nrepeats; irep++ ){
      FLA_Copy( Cold, C );
    
      dtime = FLA_Clock();

      Symm_blk_var8( A, B, C, nb_alg );

      dtime = FLA_Clock() - dtime;

      if ( irep == 0 ) 
	dtime_best = dtime;
      else
	dtime_best = ( dtime < dtime_best ? dtime : dtime_best );
    }

    diff = FLA_Max_elemwise_diff( C, Cref );

    printf( "data_blk_var8( %d, 1:3 ) = [ %d %le  %le];\n", i, n,
            gflops / dtime_best, diff );
    fflush( stdout );
#endif


    FLA_Obj_free( &A );
    FLA_Obj_free( &B );
    FLA_Obj_free( &C );
    FLA_Obj_free( &Cref );
    FLA_Obj_free( &Cold );
    printf( "\n" );

    i++;
  }

  /* Print the MATLAB commands to plot the data */

  /* Delete all existing figures */
  printf( "close all\n" );

  /* Plot the performance of FLAME */
  printf( "plot( data_FLAME( :,1 ), data_FLAME( :, 2 ), 'k--' ); \n" );

  /* Indicate that you want to add to the existing plot */
  printf( "hold on\n" );

  /* Plot the performance of the reference implementation */
  //  printf( "plot( data_REF( :,1 ), data_REF( :, 2 ), 'k-' ); \n" );

  /* Plot the performance of your implementations */

#if TEST_UNB_VAR1==TRUE
  printf( "plot( data_unb_var1( :,1 ), data_unb_var1( :, 2 ), 'r-.' ); \n" );
#endif
#if TEST_UNB_VAR2==TRUE
  printf( "plot( data_unb_var2( :,1 ), data_unb_var2( :, 2 ), 'g-.' ); \n" );
#endif
#if TEST_UNB_VAR3==TRUE
  printf( "plot( data_unb_var3( :,1 ), data_unb_var3( :, 2 ), 'b-.' ); \n" );
#endif
#if TEST_UNB_VAR4==TRUE
  printf( "plot( data_unb_var4( :,1 ), data_unb_var4( :, 2 ), 'm-.' ); \n" );
#endif
#if TEST_UNB_VAR5==TRUE
  printf( "plot( data_unb_var5( :,1 ), data_unb_var5( :, 2 ), 'c-.' ); \n" );
#endif
#if TEST_UNB_VAR6==TRUE
  printf( "plot( data_unb_var6( :,1 ), data_unb_var6( :, 2 ), 'y-.' ); \n" );
#endif
#if TEST_UNB_VAR7==TRUE
  printf( "plot( data_unb_var7( :,1 ), data_unb_var7( :, 2 ), 'k-.' ); \n" );
#endif
#if TEST_UNB_VAR8==TRUE
  printf( "plot( data_unb_var8( :,1 ), data_unb_var8( :, 2 ), 'm:' ); \n" );
#endif
#if TEST_BLK_VAR1==TRUE
  printf( "plot( data_blk_var1( :,1 ), data_blk_var1( :, 2 ), 'r--' ); \n" );
#endif
#if TEST_BLK_VAR2==TRUE
  printf( "plot( data_blk_var2( :,1 ), data_blk_var2( :, 2 ), 'g--' ); \n" );
#endif
#if TEST_BLK_VAR3==TRUE
  printf( "plot( data_blk_var3( :,1 ), data_blk_var3( :, 2 ), 'b--' ); \n" );
#endif
#if TEST_BLK_VAR4==TRUE
  printf( "plot( data_blk_var4( :,1 ), data_blk_var4( :, 2 ), 'm--' ); \n" );
#endif
#if TEST_BLK_VAR5==TRUE
  printf( "plot( data_blk_var5( :,1 ), data_blk_var5( :, 2 ), 'c--' ); \n" );
#endif
#if TEST_BLK_VAR6==TRUE
  printf( "plot( data_blk_var6( :,1 ), data_blk_var6( :, 2 ), 'y--' ); \n" );
#endif
#if TEST_BLK_VAR7==TRUE
  printf( "plot( data_blk_var7( :,1 ), data_blk_var7( :, 2 ), 'k--' ); \n" );
#endif
#if TEST_BLK_VAR8==TRUE
  printf( "plot( data_blk_var8( :,1 ), data_blk_var8( :, 2 ), 'm-' ); \n" );
#endif

  printf( "hold on \n");

  printf( "xlabel( 'matrix dimension m=n' );\n");
  printf( "ylabel( 'GFLOPS/sec.' );\n");
  //  printf( "axis( [ 0 %d 0 %3.1f ] ); \n", nlast, max_gflops );
  printf( "legend( 'FLA Trsm', ...\n");
#if TEST_UNB_VAR1==TRUE
  printf( "        'unb var1', ...\n");
#endif
#if TEST_UNB_VAR2==TRUE
  printf( "        'unb var2', ...\n");
#endif
#if TEST_UNB_VAR3==TRUE
  printf( "        'unb var3', ...\n");
#endif
#if TEST_UNB_VAR4==TRUE
  printf( "        'unb var4', ...\n");
#endif
#if TEST_UNB_VAR5==TRUE
  printf( "        'unb var5', ...\n");
#endif
#if TEST_UNB_VAR6==TRUE
  printf( "        'unb var6', ...\n");
#endif
#if TEST_UNB_VAR7==TRUE
  printf( "        'unb var7', ...\n");
#endif
#if TEST_UNB_VAR8==TRUE
  printf( "        'unb var8', ...\n");
#endif
#if TEST_BLK_VAR1==TRUE
  printf( "        'blk var1', ...\n");
#endif
#if TEST_BLK_VAR2==TRUE
  printf( "        'blk var2', ...\n");
#endif
#if TEST_BLK_VAR3==TRUE
  printf( "        'blk var3', ...\n");
#endif
#if TEST_BLK_VAR4==TRUE
  printf( "        'blk var4', ...\n");
#endif
#if TEST_BLK_VAR5==TRUE
  printf( "        'blk var5', ...\n");
#endif
#if TEST_BLK_VAR6==TRUE
  printf( "        'blk var6', ...\n");
#endif
#if TEST_BLK_VAR7==TRUE
  printf( "        'blk var7', ...\n");
#endif
#if TEST_BLK_VAR8==TRUE
  printf( "        'blk var8', ...\n");
#endif
  printf( "         2 );\n");

  FLA_Finalize( );
}
예제 #22
0
void time_Syrk_ln( 
               int variant, int type, int nrepeats, int n, int nb_alg,
               FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref,
               double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_old;

  FLA_Obj
    C_old;

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_old );

  FLA_Copy_external( C, C_old );

  for ( irep = 0 ; irep < nrepeats; irep++ ){

    FLA_Copy_external( C_old, C );

    *dtime = FLA_Clock();

    switch( variant ){
    case 0:
      // Time reference implementation
      REF_Syrk_ln( FLA_ONE, A, FLA_ONE, C );
      break;

    case 1:{
      // Time variant 1
      switch( type ){
      case FLA_ALG_OPENMP_1TASK:
        FLA_Syrk_ln_omp1t_var1( A, C );
        break;
      case FLA_ALG_OPENMP_2TASKS:
        FLA_Syrk_ln_omp2t_var1( A, C );
        break;
      case FLA_ALG_OPENMP_2LOOPS:
        FLA_Syrk_ln_omp2l_var1( A, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }
    case 2:{
      // Time variant 2
      switch( type ){
      case FLA_ALG_OPENMP_1TASK:
        FLA_Syrk_ln_omp1t_var2( A, C );
        break;
      case FLA_ALG_OPENMP_2TASKS:
        FLA_Syrk_ln_omp2t_var2( A, C );
        break;
      case FLA_ALG_OPENMP_2LOOPS:
        FLA_Syrk_ln_omp2l_var2( A, C );
        break;
      case FLA_ALG_OPENMP_2LOOPSPLUS:
        FLA_Syrk_ln_omp2x_var2( A, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    } 
    case 3:{
      // Time variant 3 
      switch( type ){
      case FLA_ALG_OPENMP_1TASK:
        FLA_Syrk_ln_omp1t_var3( A, C );
        break;
      case FLA_ALG_OPENMP_2TASKS:
        FLA_Syrk_ln_omp2t_var3( A, C );
        break;
      case FLA_ALG_OPENMP_2LOOPS:
        FLA_Syrk_ln_omp2l_var3( A, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }
    case 4:{
      // Time variant 4
      switch( type ){
      case FLA_ALG_OPENMP_1TASK:
        FLA_Syrk_ln_omp1t_var4( A, C );
        break;
      case FLA_ALG_OPENMP_2TASKS:
        FLA_Syrk_ln_omp2t_var4( A, C );
        break;
      case FLA_ALG_OPENMP_2LOOPS:
        FLA_Syrk_ln_omp2l_var4( A, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }
    case 5:{
      // Time variant 5
      switch( type ){
      case FLA_ALG_OPENMP_1TASK:
        FLA_Syrk_ln_omp1t_var5( A, C );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }
    }

    if ( irep == 0 )
      dtime_old = FLA_Clock() - *dtime;
    else{
      *dtime = FLA_Clock() - *dtime;
      dtime_old = min( *dtime, dtime_old );
    }
  }


  if ( variant == 0 ){
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }
  else{
    *diff = FLA_Max_elemwise_diff( C, C_ref );
    //FLA_Obj_show( "C:", C, "%f", "\n");
  }

  *gflops = 1.0 * 
            FLA_Obj_length( A ) * 
            FLA_Obj_length( A ) * 
            FLA_Obj_width( A ) / 
            dtime_old / 
            1e9;

  *dtime = dtime_old;

  FLA_Copy_external( C_old, C );

  FLA_Obj_free( &C_old );
}
예제 #23
0
void time_Sylv_nn(
                   int variant, int type, int n_repeats, int m, int n, int nb_alg,
                   FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref, FLA_Obj scale,
                   double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_old = 1.0e9;

  FLA_Obj
    C_old;

  fla_blocksize_t*
    bp;
  fla_sylv_t*
    cntl_sylv_var;
  fla_sylv_t*
    cntl_sylv_unb;
  fla_gemm_t*
    cntl_gemm_blas;

/*
  if( type == FLA_ALG_UNBLOCKED && n > 400 )
  {
    *gflops = 0.0;
    *diff   = 0.0;
    return;
  }
*/

  bp               = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
  cntl_sylv_unb    = FLA_Cntl_sylv_obj_create( FLA_FLAT, FLA_UNB_OPT_VARIANT1, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL );
  cntl_gemm_blas   = FLA_Cntl_gemm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL );
  cntl_sylv_var    = FLA_Cntl_sylv_obj_create( FLA_FLAT, variant, bp, cntl_sylv_unb, cntl_sylv_unb, cntl_sylv_unb, cntl_gemm_blas, cntl_gemm_blas, cntl_gemm_blas, cntl_gemm_blas, cntl_gemm_blas, cntl_gemm_blas, cntl_gemm_blas, cntl_gemm_blas );

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_old );

  FLA_Copy_external( C, C_old );


  for ( irep = 0 ; irep < n_repeats; irep++ ){
    FLA_Copy_external( C_old, C );

    *dtime = FLA_Clock();

    switch( variant ){
    case 0:
      /* Time reference implementation */
      REF_Sylv_nn( isgn, A, B, C, scale );

      break;

    case 1:{

      /* Time variant 1 */
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Sylv_nn_opt_var1( isgn, A, B, C, scale );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Sylv_nn_blk_var1( isgn, A, B, C, scale, cntl_sylv_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 2:{

      /* Time variant 2 */
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Sylv_nn_opt_var2( isgn, A, B, C, scale );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Sylv_nn_blk_var2( isgn, A, B, C, scale, cntl_sylv_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 3:{

      /* Time variant 3 */
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Sylv_nn_opt_var3( isgn, A, B, C, scale );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Sylv_nn_blk_var3( isgn, A, B, C, scale, cntl_sylv_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 4:{

      /* Time variant 4 */
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Sylv_nn_opt_var4( isgn, A, B, C, scale );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Sylv_nn_blk_var4( isgn, A, B, C, scale, cntl_sylv_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 5:{

      /* Time variant 5 */
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Sylv_nn_opt_var5( isgn, A, B, C, scale );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Sylv_nn_blk_var5( isgn, A, B, C, scale, cntl_sylv_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 6:{

      /* Time variant 6 */
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Sylv_nn_opt_var6( isgn, A, B, C, scale );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Sylv_nn_blk_var6( isgn, A, B, C, scale, cntl_sylv_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 7:{

      /* Time variant 7 */
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Sylv_nn_opt_var7( isgn, A, B, C, scale );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Sylv_nn_blk_var7( isgn, A, B, C, scale, cntl_sylv_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 8:{

      /* Time variant 8 */
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Sylv_nn_opt_var8( isgn, A, B, C, scale );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Sylv_nn_blk_var8( isgn, A, B, C, scale, cntl_sylv_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 9:{

      /* Time variant 9 */
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Sylv_nn_opt_var9( isgn, A, B, C, scale );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Sylv_nn_blk_var9( isgn, A, B, C, scale, cntl_sylv_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 10:{

      /* Time variant 10 */
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Sylv_nn_opt_var10( isgn, A, B, C, scale );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Sylv_nn_blk_var10( isgn, A, B, C, scale, cntl_sylv_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 11:{

      /* Time variant 11 */
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Sylv_nn_opt_var11( isgn, A, B, C, scale );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Sylv_nn_blk_var11( isgn, A, B, C, scale, cntl_sylv_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 12:{

      /* Time variant 12 */
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Sylv_nn_opt_var12( isgn, A, B, C, scale );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Sylv_nn_blk_var12( isgn, A, B, C, scale, cntl_sylv_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 13:{

      /* Time variant 13 */
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Sylv_nn_opt_var13( isgn, A, B, C, scale );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Sylv_nn_blk_var13( isgn, A, B, C, scale, cntl_sylv_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 14:{

      /* Time variant 14 */
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Sylv_nn_opt_var14( isgn, A, B, C, scale );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Sylv_nn_blk_var14( isgn, A, B, C, scale, cntl_sylv_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 15:{

      /* Time variant 15 */
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Sylv_nn_opt_var15( isgn, A, B, C, scale );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Sylv_nn_blk_var15( isgn, A, B, C, scale, cntl_sylv_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 16:{

      /* Time variant 16 */
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Sylv_nn_opt_var16( isgn, A, B, C, scale );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Sylv_nn_blk_var16( isgn, A, B, C, scale, cntl_sylv_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 17:{

      /* Time variant 17 */
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Sylv_nn_opt_var17( isgn, A, B, C, scale );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Sylv_nn_blk_var17( isgn, A, B, C, scale, cntl_sylv_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 18:{

      /* Time variant 18 */
      switch( type ){
      case FLA_ALG_UNB_OPT:
        FLA_Sylv_nn_opt_var18( isgn, A, B, C, scale );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Sylv_nn_blk_var18( isgn, A, B, C, scale, cntl_sylv_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  FLA_Cntl_obj_free( cntl_sylv_var );
  FLA_Cntl_obj_free( cntl_sylv_unb );
  FLA_Cntl_obj_free( cntl_gemm_blas );
  FLA_Blocksize_free( bp );

  if ( variant == 0 ){
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }
  else{
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }

  *gflops = ( m * m * n + n * n * m ) / 
            dtime_old / 1e9;

  if ( FLA_Obj_is_complex( C ) )
    *gflops *= 4.0;

  *dtime = dtime_old;

  FLA_Copy_external( C_old, C );

  FLA_Obj_free( &C_old );
}
예제 #24
0
int main( int argc, char** argv ) {
  FLA_Datatype testtype = TESTTYPE;
  dim_t        m;
  FLA_Obj      A;
  FLA_Obj      a1, b1, r1;
  FLA_Obj      a2, b2, r2;
  FLA_Uplo     uplo;
  FLA_Error    init_result; 

  if ( argc == 3 ) {
    m = atoi(argv[1]);
    uplo = ( atoi(argv[2]) == 1  ? FLA_UPPER_TRIANGULAR : FLA_LOWER_TRIANGULAR );
  } else {
    fprintf(stderr, "       \n");
    fprintf(stderr, "Usage: %s m uplo\n", argv[0]);
    fprintf(stderr, "       m    : test matrix length\n");
    fprintf(stderr, "       uplo : 0) lower, 1) upper\n");
    fprintf(stderr, "       \n");
    return -1;
  }
  if ( m == 0 )
    return 0;

  FLA_Init_safe( &init_result );          

  // Test matrix A 
  FLA_Obj_create( testtype, m, m, 0, 0, &A );
  FLA_Random_spd_matrix( uplo, A );
  FLA_Hermitianize( uplo, A );
  FLA_Obj_fshow( stdout,  "- A -", A, "% 6.4e", "--" );

  FLA_Obj_create( testtype, m, 1, 0, 0, &a1 );
  FLA_Obj_create( testtype, m, 1, 0, 0, &a2 );

  if ( m > 1 ) {
    FLA_Obj_create( testtype, m-1, 1, 0, 0, &b1 );
    FLA_Obj_create( testtype, m-1, 1, 0, 0, &b2 );
  }
  
  FLA_Obj_create( testtype, m, 1, 0, 0, &r1 );
  FLA_Obj_create( testtype, m, 1, 0, 0, &r2 );

  // Mine 
  FLA_Tridiag_UT_extract_diagonals( uplo, A, a1, b1 );
  FLA_Obj_fshow( stdout,  "- a1 -", a1, "% 6.4e", "--" );  
  if ( m > 1 ) FLA_Obj_fshow( stdout,  "- b1 -", b1, "% 6.4e", "--" );  

  FLA_Tridiag_UT_realify_subdiagonal( b1, r1 );
  if ( m > 1 ) FLA_Obj_fshow( stdout,  "- b1 realified -", b1, "% 6.4e", "--" );  
  FLA_Obj_fshow( stdout,  "- r1 -", r1, "% 6.4e", "--" );  

  
  // Field
  FLA_Tridiag_UT_realify( uplo, A, r2 );
  FLA_Tridiag_UT_extract_diagonals( uplo, A, a2, b2 );
  FLA_Obj_fshow( stdout,  "- a2  -", a2, "% 6.4e", "--" );  
  if ( m > 1 ) FLA_Obj_fshow( stdout,  "- b2 realified -", b2, "% 6.4e", "--" );  
  FLA_Obj_fshow( stdout,  "- r2 -", r2, "% 6.4e", "--" );  

  printf(" diff_a  = %e\n", FLA_Max_elemwise_diff( a1, a2 ));
  if ( m > 1 ) printf(" diff_b  = %e\n", FLA_Max_elemwise_diff( b1, b2 ));
  printf(" diff_rL = %e\n", FLA_Max_elemwise_diff( r1, r2 ));

  FLA_Obj_fshow( stdout,  "- A realified-", A, "% 6.4e", "--" );

  FLA_Obj_free( &r2 );
  FLA_Obj_free( &r1 );

  if ( m > 1 ) {
    FLA_Obj_free( &b2 );
    FLA_Obj_free( &b1 );
  }

  FLA_Obj_free( &a2 );
  FLA_Obj_free( &a1 );

  FLA_Obj_free( &A );

  FLA_Finalize_safe( init_result );     
}
예제 #25
0
void time_Her2k_ln( 
               int variant, int type, int nrepeats, int n, int nb_alg,
               FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj C_ref,
               double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_old = 1.0e9; 

  FLA_Obj
    C_old;

  fla_blocksize_t*
    bp;
  fla_gemm_t*
    cntl_gemm_blas;
  fla_her2k_t*
    cntl_her2k_blas;
  fla_her2k_t*
    cntl_her2k_var;

  bp              = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
  cntl_gemm_blas  = FLA_Cntl_gemm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL );
  cntl_her2k_blas = FLA_Cntl_her2k_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL, NULL );
  cntl_her2k_var  = FLA_Cntl_her2k_obj_create( FLA_FLAT, variant, bp, cntl_her2k_blas, cntl_gemm_blas, cntl_gemm_blas );

  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C, &C_old );

  FLA_Copy_external( C, C_old );


  for ( irep = 0 ; irep < nrepeats; irep++ )
  {

    FLA_Copy_external( C_old, C );

    *dtime = FLA_Clock();

    switch( variant ){
    case 0:
      // Time reference implementation
      REF_Her2k( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_ONE, A, B, FLA_ONE, C );
      break;

    case 1:{
      // Time variant 1
      switch( type ){
      case FLA_ALG_UNBLOCKED:
        FLA_Her2k_ln_unb_var1( FLA_ONE, A, B, FLA_ONE, C );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Her2k_ln_blk_var1( FLA_ONE, A, B, FLA_ONE, C, cntl_her2k_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 2:{
      // Time variant 2
      switch( type ){
      case FLA_ALG_UNBLOCKED:
        FLA_Her2k_ln_unb_var2( FLA_ONE, A, B, FLA_ONE, C );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Her2k_ln_blk_var2( FLA_ONE, A, B, FLA_ONE, C, cntl_her2k_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 3:{
      // Time variant 3
      switch( type ){
      case FLA_ALG_UNBLOCKED:
        FLA_Her2k_ln_unb_var3( FLA_ONE, A, B, FLA_ONE, C );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Her2k_ln_blk_var3( FLA_ONE, A, B, FLA_ONE, C, cntl_her2k_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 4:{
      // Time variant 4
      switch( type ){
      case FLA_ALG_UNBLOCKED:
        FLA_Her2k_ln_unb_var4( FLA_ONE, A, B, FLA_ONE, C );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Her2k_ln_blk_var4( FLA_ONE, A, B, FLA_ONE, C, cntl_her2k_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 5:{
      // Time variant 5
      switch( type ){
      case FLA_ALG_UNBLOCKED:
        FLA_Her2k_ln_unb_var5( FLA_ONE, A, B, FLA_ONE, C );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Her2k_ln_blk_var5( FLA_ONE, A, B, FLA_ONE, C, cntl_her2k_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 6:{
      // Time variant 6
      switch( type ){
      case FLA_ALG_UNBLOCKED:
        FLA_Her2k_ln_unb_var6( FLA_ONE, A, B, FLA_ONE, C );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Her2k_ln_blk_var6( FLA_ONE, A, B, FLA_ONE, C, cntl_her2k_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 7:{
      // Time variant 7
      switch( type ){
      case FLA_ALG_UNBLOCKED:
        FLA_Her2k_ln_unb_var7( FLA_ONE, A, B, FLA_ONE, C );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Her2k_ln_blk_var7( FLA_ONE, A, B, FLA_ONE, C, cntl_her2k_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 8:{
      // Time variant 8
      switch( type ){
      case FLA_ALG_UNBLOCKED:
        FLA_Her2k_ln_unb_var8( FLA_ONE, A, B, FLA_ONE, C );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Her2k_ln_blk_var8( FLA_ONE, A, B, FLA_ONE, C, cntl_her2k_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 9:{
      // Time variant 9
      switch( type ){
      case FLA_ALG_UNBLOCKED:
        FLA_Her2k_ln_unb_var9( FLA_ONE, A, B, FLA_ONE, C );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Her2k_ln_blk_var9( FLA_ONE, A, B, FLA_ONE, C, cntl_her2k_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    case 10:{
      // Time variant 10
      switch( type ){
      case FLA_ALG_UNBLOCKED:
        FLA_Her2k_ln_unb_var10( FLA_ONE, A, B, FLA_ONE, C );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Her2k_ln_blk_var10( FLA_ONE, A, B, FLA_ONE, C, cntl_her2k_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  FLA_Cntl_obj_free( cntl_her2k_var );
  FLA_Cntl_obj_free( cntl_her2k_blas );
  FLA_Cntl_obj_free( cntl_gemm_blas );
  FLA_Blocksize_free( bp );

  if ( variant == 0 )
  {
    FLA_Copy_external( C, C_ref );
    *diff = 0.0;
  }
  else
  {
    *diff = FLA_Max_elemwise_diff( C, C_ref );
  }

  *gflops = 2.0 * 
            FLA_Obj_length( C ) * 
            FLA_Obj_width( C ) * 
            FLA_Obj_width( A ) / 
            dtime_old / 
            1e9;

  *dtime = dtime_old;

  FLA_Copy_external( C_old, C );

  FLA_Obj_free( &C_old );
}
예제 #26
0
int test_gemm( FILE* stream, param_t param, result_t *result)
{
    FLA_Datatype datatype = param.datatype;
    FLA_Trans    transa = param.trans[0], transb = param.trans[1];
    FLA_Obj      A, B, C, x, y, z, w;
    FLA_Obj      alpha, beta;
    double       time, time_min =  MAX_TIME_VALUE;
    unsigned int i,
             m = param.dims[0],
             n = param.dims[1],
             k = param.dims[2],
             repeat = param.repeat;
    int          is_trans, is_complex;

    // Create matrices.
    is_trans = (transa == FLA_NO_TRANSPOSE);
    FLA_Obj_create( datatype, (is_trans ? m:k), (is_trans ? k:m), 0,0, &A );
    is_trans = (transb == FLA_NO_TRANSPOSE);
    FLA_Obj_create( datatype, (is_trans ? k:n), (is_trans ? n:k), 0,0, &B );
    FLA_Obj_create( datatype,                m,                n, 0,0, &C );

    FLA_Obj_create( datatype, n, 1, 0, 0, &x );
    FLA_Obj_create( datatype, m, 1, 0, 0, &y );
    FLA_Obj_create( datatype, m, 1, 0, 0, &z );
    FLA_Obj_create( datatype, k, 1, 0, 0, &w );

    // Initialize the test matrices.
    FLA_Random_matrix( A );
    FLA_Random_matrix( B );
    FLA_Random_matrix( C );

    FLA_Random_matrix( x );
    FLA_Set( FLA_ZERO, y );
    FLA_Set( FLA_ZERO, w );
    FLA_Set( FLA_ZERO, z );

    // Constants.
    alpha = FLA_MINUS_ONE;
    beta  = FLA_ZERO;

    // Repeat the experiment repeat times and record results.
    for ( i = 0; i < repeat; ++i )
    {
        time = FLA_Clock();
        FLA_Gemm_external( transa, transb, alpha, A, B, beta, C );
        time = FLA_Clock() - time;

        time_min = min( time_min, time );
    }

    is_complex = FLA_Obj_is_complex( C );
    result->performance = ( FMULS * FP_PER_MUL(is_complex) +
                            FADDS * FP_PER_ADD(is_complex) )/time_min/FLOPS_PER_UNIT_PERF;

    FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, C, x, FLA_ZERO, y );
    FLA_Gemv_external( transb,           FLA_ONE, B, x, FLA_ZERO, w );
    FLA_Gemv_external( transa,           alpha,   A, w, FLA_ZERO, z );

    result->residual = FLA_Max_elemwise_diff( y, z );

    FLA_Obj_free( &A );
    FLA_Obj_free( &B );
    FLA_Obj_free( &C );
    FLA_Obj_free( &x );
    FLA_Obj_free( &y );
    FLA_Obj_free( &z );
    FLA_Obj_free( &w );

    return 0;
}