Beispiel #1
0
FLA_Error FLA_Symm_internal( FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl )
{
	FLA_Error r_val = FLA_SUCCESS;

	if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING )
		FLA_Symm_internal_check( side, uplo, alpha, A, B, beta, C, cntl );

	if      ( FLA_Cntl_matrix_type( cntl ) == FLA_HIER &&
	          FLA_Obj_elemtype( A ) == FLA_MATRIX &&
	          FLA_Cntl_variant( cntl ) == FLA_SUBPROBLEM )
	{
		// Recurse
		r_val = FLA_Symm_internal( side,
		                           uplo,
		                           alpha,
		                           *FLASH_OBJ_PTR_AT( A ),
		                           *FLASH_OBJ_PTR_AT( B ),
		                           beta,
		                           *FLASH_OBJ_PTR_AT( C ),
		                           flash_symm_cntl_mm );
	}
	else if ( FLA_Cntl_matrix_type( cntl ) == FLA_HIER &&
	          FLA_Obj_elemtype( A ) == FLA_SCALAR &&
	          FLASH_Queue_get_enabled( ) )
	{
		// Enqueue
		ENQUEUE_FLASH_Symm( side, uplo, alpha, A, B, beta, C, cntl );
	}
	else
	{
		if ( FLA_Cntl_matrix_type( cntl ) == FLA_HIER &&
		     FLA_Obj_elemtype( A ) == FLA_SCALAR &&
		     !FLASH_Queue_get_enabled( ) )
		{
			// Execute leaf
			cntl = flash_symm_cntl_blas;
		}

		// Parameter combinations
		if      ( side == FLA_LEFT )
		{
			if      ( uplo == FLA_LOWER_TRIANGULAR )
				r_val = FLA_Symm_ll( alpha, A, B, beta, C, cntl );
			else if ( uplo == FLA_UPPER_TRIANGULAR )
				r_val = FLA_Symm_lu( alpha, A, B, beta, C, cntl );
		}
		else if ( side == FLA_RIGHT )
		{
			if      ( uplo == FLA_LOWER_TRIANGULAR )
				r_val = FLA_Symm_rl( alpha, A, B, beta, C, cntl );
			else if ( uplo == FLA_UPPER_TRIANGULAR )
				r_val = FLA_Symm_ru( alpha, A, B, beta, C, cntl );
		}
	}

	return r_val;
}
Beispiel #2
0
FLA_Error FLA_Gemm_nt_blk_var1_ht( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, FLA_Gemm_t* cntl )
{
  FLA_Obj AT,              A0,
          AB,              A1,
                           A2;

  FLA_Obj CT,              C0,
          CB,              C1,
                           C2;

  int b;

  FLA_Part_2x1( A,    &AT, 
                      &AB,            0, FLA_TOP );

  FLA_Part_2x1( C,    &CT, 
                      &CB,            0, FLA_TOP );

  while ( FLA_Obj_length( AT ) < FLA_Obj_length( A ) ){

    b = 1;

    FLA_Repart_2x1_to_3x1( AT,                &A0, 
                        /* ** */            /* ** */
                                              &A1, 
                           AB,                &A2,        b, FLA_BOTTOM );

    FLA_Repart_2x1_to_3x1( CT,                &C0, 
                        /* ** */            /* ** */
                                              &C1, 
                           CB,                &C2,        b, FLA_BOTTOM );

    /*------------------------------------------------------------*/

    /* C1 = alpha * A1 * B' + C1;   */
    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, 
                       alpha, *FLASH_OBJ_PTR_AT( A1 ), *FLASH_OBJ_PTR_AT( B ), beta, *FLASH_OBJ_PTR_AT( C1 ), 
                       NULL );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x1_to_2x1( &AT,                A0, 
                                                  A1, 
                            /* ** */           /* ** */
                              &AB,                A2,     FLA_TOP );

    FLA_Cont_with_3x1_to_2x1( &CT,                C0, 
                                                  C1, 
                            /* ** */           /* ** */
                              &CB,                C2,     FLA_TOP );

  }

  return FLA_SUCCESS;
}
Beispiel #3
0
FLA_Error FLA_Copyt_internal( FLA_Trans trans, FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl )
{
	FLA_Error r_val = FLA_SUCCESS;

	if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING )
		FLA_Copyt_internal_check( trans, A, B, cntl );

	if      ( FLA_Cntl_matrix_type( cntl ) == FLA_HIER &&
	          FLA_Obj_elemtype( A ) == FLA_MATRIX &&
	          FLA_Cntl_variant( cntl ) == FLA_SUBPROBLEM )
	{
		// Recurse
		r_val = FLA_Copyt_internal( trans,
		                            *FLASH_OBJ_PTR_AT( A ),
		                            *FLASH_OBJ_PTR_AT( B ),
		                            flash_copyt_cntl );
	}
	else if ( FLA_Cntl_matrix_type( cntl ) == FLA_HIER &&
	          FLA_Obj_elemtype( A ) == FLA_SCALAR &&
	          FLASH_Queue_get_enabled( ) )
	{
		// Enqueue
		ENQUEUE_FLASH_Copyt( trans, A, B, cntl );
	}
	else
	{
		if ( FLA_Cntl_matrix_type( cntl ) == FLA_HIER &&
		     FLA_Obj_elemtype( A ) == FLA_SCALAR &&
		     !FLASH_Queue_get_enabled( ) )
		{
			// Execute leaf
			cntl = flash_copyt_cntl_blas;
		}
		
		// Parameter combinations
		if      ( trans == FLA_NO_TRANSPOSE )
		{
			r_val = FLA_Copyt_n( A, B, cntl );
		}
		else if ( trans == FLA_TRANSPOSE )
		{
			r_val = FLA_Copyt_t( A, B, cntl );
		}
		else if ( trans == FLA_CONJ_NO_TRANSPOSE )
		{
			r_val = FLA_Copyt_c( A, B, cntl );
		}
		else if ( trans == FLA_CONJ_TRANSPOSE )
		{
			r_val = FLA_Copyt_h( A, B, cntl );
		}
	}

	return r_val;
}
FLA_Error FLA_Apply_pivots_internal( FLA_Side side, FLA_Trans trans, FLA_Obj p, FLA_Obj A, fla_appiv_t* cntl )
{
   FLA_Error r_val = FLA_SUCCESS;

   if ( FLA_Cntl_matrix_type( cntl ) == FLA_HIER &&
        FLA_Cntl_variant( cntl ) == FLA_SUBPROBLEM )
   {
      if ( FLASH_Queue_get_enabled( ) )
      {
         // Enqueue
         ENQUEUE_FLASH_Apply_pivots_macro( side, trans, *FLASH_OBJ_PTR_AT( p ), A, cntl );
      }
      else
      {
         // Execute leaf
         r_val = FLA_Apply_pivots_macro_task( side, trans, *FLASH_OBJ_PTR_AT( p ), A, cntl );
      }
   }
   else
   {
      // Parameter combinations
      if ( trans == FLA_NO_TRANSPOSE )
      {
         if      ( side == FLA_LEFT )
         {
            r_val = FLA_Apply_pivots_ln( p, A, cntl );
         }
         else if ( side == FLA_RIGHT )
         {
            r_val = FLA_Apply_pivots_rn( p, A, cntl );
         }
      }
      else if ( trans == FLA_TRANSPOSE )
      {
         if      ( side == FLA_LEFT )
         {
            r_val = FLA_Apply_pivots_lt( p, A, cntl );
         }
         else if ( side == FLA_RIGHT )
         {
            r_val = FLA_Apply_pivots_rt( p, A, cntl );
         }
      }
   }   

   return r_val;
}
Beispiel #5
0
FLA_Error FLA_Trsm_rlt_blk_var3_ht( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Trsm_t* cntl )
{
  FLA_Obj BT,              B0,
          BB,              B1,
                           B2;

  int b;

  FLA_Part_2x1( B,    &BT, 
                      &BB,            0, FLA_TOP );

  while ( FLA_Obj_length( BT ) < FLA_Obj_length( B ) ){

    b = 1;

    FLA_Repart_2x1_to_3x1( BT,                &B0, 
                        /* ** */            /* ** */
                                              &B1, 
                           BB,                &B2,        b, FLA_BOTTOM );

    /*------------------------------------------------------------*/

    /* B1 = B1 / tril( A'); */
    FLA_Trsm_external( FLA_RIGHT, FLA_LOWER_TRIANGULAR, FLA_TRANSPOSE, diagA,
                       alpha, *FLASH_OBJ_PTR_AT( A ), *FLASH_OBJ_PTR_AT( B1 ),
                       NULL );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x1_to_2x1( &BT,                B0, 
                                                  B1, 
                            /* ** */           /* ** */
                              &BB,                B2,     FLA_TOP );

  }

  return FLA_SUCCESS;
}
FLA_Error FLA_Trinv_internal( FLA_Uplo uplo, FLA_Diag diag, FLA_Obj A, fla_trinv_t* cntl )
{
	FLA_Error r_val = FLA_SUCCESS;

	if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING )
		FLA_Trinv_internal_check( uplo, diag, A, cntl );

	if      ( FLA_Cntl_matrix_type( cntl ) == FLA_HIER &&
	          FLA_Obj_elemtype( A ) == FLA_MATRIX &&
	          FLA_Cntl_variant( cntl ) == FLA_SUBPROBLEM )
	{
		// Recurse
		r_val = FLA_Trinv_internal( uplo,
		                            diag,
		                            *FLASH_OBJ_PTR_AT( A ),
		                            flash_trinv_cntl );
	}
	else if ( FLA_Cntl_matrix_type( cntl ) == FLA_HIER &&
	          FLA_Obj_elemtype( A ) == FLA_SCALAR &&
	          FLASH_Queue_get_enabled( ) )
	{
		// Enqueue
		ENQUEUE_FLASH_Trinv( uplo, diag, A, cntl );
	}
	else
	{
		if ( FLA_Cntl_matrix_type( cntl ) == FLA_HIER &&
		     FLA_Obj_elemtype( A ) == FLA_SCALAR &&
		     !FLASH_Queue_get_enabled( ) )
		{
			// Execute leaf
			cntl = fla_trinv_cntl_leaf;
		}

		// Parameter combinations
		if      ( uplo == FLA_LOWER_TRIANGULAR )
		{
			if      ( diag == FLA_NONUNIT_DIAG )
			{
				r_val = FLA_Trinv_ln( A, cntl );
			}
			else if ( diag == FLA_UNIT_DIAG )
			{
				r_val = FLA_Trinv_lu( A, cntl );
			}
		}
		else if ( uplo == FLA_UPPER_TRIANGULAR )
		{
			if      ( diag == FLA_NONUNIT_DIAG )
			{
				r_val = FLA_Trinv_un( A, cntl );
			}
			else if ( diag == FLA_UNIT_DIAG )
			{
				r_val = FLA_Trinv_uu( A, cntl );
			}
		}
	}

	return r_val;
}
Beispiel #7
0
FLA_Error FLA_Gemm_internal( FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl )
{
	FLA_Error r_val = FLA_SUCCESS;

	if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING )
		FLA_Gemm_internal_check( transa, transb, alpha, A, B, beta, C, cntl );

	if      ( FLA_Cntl_matrix_type( cntl ) == FLA_HIER &&
	          (FLA_Obj_elemtype( A ) == FLA_TENSOR || FLA_Obj_elemtype( A ) == FLA_MATRIX) &&
	          FLA_Cntl_variant( cntl ) == FLA_SUBPROBLEM )
	{
		// Recurse
		r_val = FLA_Gemm_internal( transa, 
		                           transb, 
		                           alpha, 
		                           *FLASH_OBJ_PTR_AT( A ), 
		                           *FLASH_OBJ_PTR_AT( B ), 
		                           beta, 
		                           *FLASH_OBJ_PTR_AT( C ), 
		                           flash_gemm_cntl_mm_op );
	}
	else if ( FLA_Cntl_matrix_type( cntl ) == FLA_HIER &&
	          FLA_Obj_elemtype( A ) == FLA_SCALAR &&
	          FLASH_Queue_get_enabled( ) )
	{
		// Enqueue
		ENQUEUE_FLASH_Gemm( transa, transb, alpha, A, B, beta, C, cntl );
	}
	else
	{
		if ( FLA_Cntl_matrix_type( cntl ) == FLA_HIER &&
		     FLA_Obj_elemtype( A ) == FLA_SCALAR &&
		     !FLASH_Queue_get_enabled( ) )
		{
			// Execute leaf
			cntl = flash_gemm_cntl_blas;
		}

		// Parameter combinations
		if      ( transa == FLA_NO_TRANSPOSE )
		{
			if      ( transb == FLA_NO_TRANSPOSE )
				r_val = FLA_Gemm_nn( alpha, A, B, beta, C, cntl );
			else if ( transb == FLA_TRANSPOSE )
				r_val = FLA_Gemm_nt( alpha, A, B, beta, C, cntl );
			else if ( transb == FLA_CONJ_NO_TRANSPOSE )
				r_val = FLA_Gemm_nc( alpha, A, B, beta, C, cntl );
			else if ( transb == FLA_CONJ_TRANSPOSE )
				r_val = FLA_Gemm_nh( alpha, A, B, beta, C, cntl );
		}
		else if ( transa == FLA_TRANSPOSE )
		{
			if      ( transb == FLA_NO_TRANSPOSE )
				r_val = FLA_Gemm_tn( alpha, A, B, beta, C, cntl );
			else if ( transb == FLA_TRANSPOSE )
				r_val = FLA_Gemm_tt( alpha, A, B, beta, C, cntl );
			else if ( transb == FLA_CONJ_NO_TRANSPOSE )
				r_val = FLA_Gemm_tc( alpha, A, B, beta, C, cntl );
			else if ( transb == FLA_CONJ_TRANSPOSE )
				r_val = FLA_Gemm_th( alpha, A, B, beta, C, cntl );
		}
		else if ( transa == FLA_CONJ_NO_TRANSPOSE )
		{
			if      ( transb == FLA_NO_TRANSPOSE )
				r_val = FLA_Gemm_cn( alpha, A, B, beta, C, cntl );
			else if ( transb == FLA_TRANSPOSE )
				r_val = FLA_Gemm_ct( alpha, A, B, beta, C, cntl );
			else if ( transb == FLA_CONJ_NO_TRANSPOSE )
				r_val = FLA_Gemm_cc( alpha, A, B, beta, C, cntl );
			else if ( transb == FLA_CONJ_TRANSPOSE )
				r_val = FLA_Gemm_ch( alpha, A, B, beta, C, cntl );
		}
		else if ( transa == FLA_CONJ_TRANSPOSE )
		{
			if      ( transb == FLA_NO_TRANSPOSE )
				r_val = FLA_Gemm_hn( alpha, A, B, beta, C, cntl );
			else if ( transb == FLA_TRANSPOSE )
				r_val = FLA_Gemm_ht( alpha, A, B, beta, C, cntl );
			else if ( transb == FLA_CONJ_NO_TRANSPOSE )
				r_val = FLA_Gemm_hc( alpha, A, B, beta, C, cntl );
			else if ( transb == FLA_CONJ_TRANSPOSE )
				r_val = FLA_Gemm_hh( alpha, A, B, beta, C, cntl );
		}
	}

	return r_val;
}
Beispiel #8
0
FLA_Error FLASH_SA_LU( FLA_Obj B, FLA_Obj C, 
                       FLA_Obj D, FLA_Obj E, FLA_Obj p, FLA_Obj L, dim_t nb_alg, fla_lu_t* cntl )
{
   FLA_Obj DT,              D0,
           DB,              D1,
                            D2;

   FLA_Obj ET,              E0,
           EB,              E1,
                            E2;

   FLA_Obj pT,              p0,
           pB,              p1,
                            p2;

   FLA_Obj LT,              L0,
           LB,              L1,
                            L2;

   FLA_Part_2x1( D,    &DT,
                       &DB,            0, FLA_TOP );

   FLA_Part_2x1( E,    &ET,
                       &EB,            0, FLA_TOP );

   FLA_Part_2x1( p,    &pT,
                       &pB,            0, FLA_TOP );

   FLA_Part_2x1( L,    &LT,
                       &LB,            0, FLA_TOP );

   while ( FLA_Obj_length( DT ) < FLA_Obj_length( D ) )
   {
      FLA_Repart_2x1_to_3x1( DT,                &D0,
                          /* ** */            /* ** */
                                                &D1,
                             DB,                &D2,        1, FLA_BOTTOM );

      FLA_Repart_2x1_to_3x1( ET,                &E0,
                          /* ** */            /* ** */
                                                &E1,
                             EB,                &E2,        1, FLA_BOTTOM );

      FLA_Repart_2x1_to_3x1( pT,                &p0,
                          /* ** */            /* ** */
                                                &p1,
                             pB,                &p2,        1, FLA_BOTTOM );

      FLA_Repart_2x1_to_3x1( LT,                &L0,
                          /* ** */            /* ** */
                                                &L1,
                             LB,                &L2,        1, FLA_BOTTOM );

      /*------------------------------------------------------------*/

      if ( FLASH_Queue_get_enabled( ) )
      {
         // Enqueue
         ENQUEUE_FLASH_SA_LU( *FLASH_OBJ_PTR_AT( B ),
                              *FLASH_OBJ_PTR_AT( D1 ),
                              *FLASH_OBJ_PTR_AT( p1 ),
                              *FLASH_OBJ_PTR_AT( L1 ),
                              nb_alg,
                              FLA_Cntl_sub_lu( cntl ) );
      }
      else
      {
         // Execute leaf
         FLA_SA_LU_task( *FLASH_OBJ_PTR_AT( B ),
                         *FLASH_OBJ_PTR_AT( D1 ),
                         *FLASH_OBJ_PTR_AT( p1 ),
                         *FLASH_OBJ_PTR_AT( L1 ),
                         nb_alg,
                         FLA_Cntl_sub_lu( cntl ) );
      }
      
      FLASH_SA_FS( L1,
                   D1, p1, C,
                           E1, nb_alg, FLA_Cntl_sub_gemm1( cntl ) );

      /*------------------------------------------------------------*/

      FLA_Cont_with_3x1_to_2x1( &DT,                D0,
                                                    D1,
                              /* ** */           /* ** */
                                &DB,                D2,     FLA_TOP );

      FLA_Cont_with_3x1_to_2x1( &ET,                E0,
                                                    E1,
                              /* ** */           /* ** */
                                &EB,                E2,     FLA_TOP );

      FLA_Cont_with_3x1_to_2x1( &pT,                p0,
                                                    p1,
                              /* ** */           /* ** */
                                &pB,                p2,     FLA_TOP );

      FLA_Cont_with_3x1_to_2x1( &LT,                L0,
                                                    L1,
                              /* ** */           /* ** */
                                &LB,                L2,     FLA_TOP );
   }
   
   return FLA_SUCCESS;
}
FLA_Error FLA_Apply_Q_UT_internal( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl )
{
	FLA_Error r_val = FLA_SUCCESS;

	if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING )
		FLA_Apply_Q_UT_internal_check( side, trans, direct, storev, A, T, W, B, cntl );

	if      ( FLA_Cntl_matrix_type( cntl ) == FLA_HIER &&
	          FLA_Obj_elemtype( A ) == FLA_MATRIX &&
	          FLA_Cntl_variant( cntl ) == FLA_SUBPROBLEM )
	{
		// Recurse
		r_val = FLA_Apply_Q_UT_internal( side,
		                                 trans,
		                                 direct,
		                                 storev,
		                                 *FLASH_OBJ_PTR_AT( A ),
		                                 *FLASH_OBJ_PTR_AT( T ),
		                                 *FLASH_OBJ_PTR_AT( W ),
		                                 *FLASH_OBJ_PTR_AT( B ),
		                                 flash_apqut_cntl );
	}
	else if ( FLA_Cntl_matrix_type( cntl ) == FLA_HIER &&
	          FLA_Obj_elemtype( A ) == FLA_SCALAR &&
	          FLASH_Queue_get_enabled( ) )
	{
		// Enqueue
		ENQUEUE_FLASH_Apply_Q_UT( side, trans, direct, storev, A, T, W, B, cntl );
	}
	else
	{
		if ( FLA_Cntl_matrix_type( cntl ) == FLA_HIER &&
		     FLA_Obj_elemtype( A ) == FLA_SCALAR &&
		     !FLASH_Queue_get_enabled( ) )
		{
			// Execute leaf.
			cntl = fla_apqut_cntl_leaf;
		}

		if      ( side == FLA_LEFT )
		{
			if      ( trans == FLA_NO_TRANSPOSE )
			{
				if      ( direct == FLA_FORWARD )
				{
					if      ( storev == FLA_COLUMNWISE )
						r_val = FLA_Apply_Q_UT_lnfc( A, T, W, B, cntl );
					else if ( storev == FLA_ROWWISE )
						r_val = FLA_Apply_Q_UT_lnfr( A, T, W, B, cntl );
				}
				else if ( direct == FLA_BACKWARD )
				{
					if      ( storev == FLA_COLUMNWISE )
						r_val = FLA_Apply_Q_UT_lnbc( A, T, W, B, cntl );
					else if ( storev == FLA_ROWWISE )
						r_val = FLA_Apply_Q_UT_lnbr( A, T, W, B, cntl );
				}
			}
			else if ( trans == FLA_TRANSPOSE || trans == FLA_CONJ_TRANSPOSE )
			{
				if      ( direct == FLA_FORWARD )
				{
					if      ( storev == FLA_COLUMNWISE )
						r_val = FLA_Apply_Q_UT_lhfc( A, T, W, B, cntl );
					else if ( storev == FLA_ROWWISE )
						r_val = FLA_Apply_Q_UT_lhfr( A, T, W, B, cntl );
				}
				else if ( direct == FLA_BACKWARD )
				{
					if      ( storev == FLA_COLUMNWISE )
						r_val = FLA_Apply_Q_UT_lhbc( A, T, W, B, cntl );
					else if ( storev == FLA_ROWWISE )
						r_val = FLA_Apply_Q_UT_lhbr( A, T, W, B, cntl );
				}
			}
		}
		else if ( side == FLA_RIGHT )
		{
			if      ( trans == FLA_NO_TRANSPOSE )
			{
				if      ( direct == FLA_FORWARD )
				{
					if      ( storev == FLA_COLUMNWISE )
						r_val = FLA_Apply_Q_UT_rnfc( A, T, W, B, cntl );
					else if ( storev == FLA_ROWWISE )
						r_val = FLA_Apply_Q_UT_rnfr( A, T, W, B, cntl );
				}
				else if ( direct == FLA_BACKWARD )
				{
					if      ( storev == FLA_COLUMNWISE )
						r_val = FLA_Apply_Q_UT_rnbc( A, T, W, B, cntl );
					else if ( storev == FLA_ROWWISE )
						r_val = FLA_Apply_Q_UT_rnbr( A, T, W, B, cntl );
				}
			}
			else if ( trans == FLA_TRANSPOSE || trans == FLA_CONJ_TRANSPOSE )
			{
				if      ( direct == FLA_FORWARD )
				{
					if      ( storev == FLA_COLUMNWISE )
						r_val = FLA_Apply_Q_UT_rhfc( A, T, W, B, cntl );
					else if ( storev == FLA_ROWWISE )
						r_val = FLA_Apply_Q_UT_rhfr( A, T, W, B, cntl );
				}
				else if ( direct == FLA_BACKWARD )
				{
					if      ( storev == FLA_COLUMNWISE )
						r_val = FLA_Apply_Q_UT_rhbc( A, T, W, B, cntl );
					else if ( storev == FLA_ROWWISE )
						r_val = FLA_Apply_Q_UT_rhbr( A, T, W, B, cntl );
				}
			}
		}
	}

	return r_val;
}
Beispiel #10
0
FLA_Error FLA_Copy_internal( FLA_Obj A, FLA_Obj B, fla_copy_t* cntl )
{
	FLA_Error r_val = FLA_SUCCESS;

	if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING )
		FLA_Copy_internal_check( A, B, cntl );

	if      ( FLA_Cntl_matrix_type( cntl ) == FLA_HIER &&
	          FLA_Obj_elemtype( A ) == FLA_MATRIX &&
	          FLA_Cntl_variant( cntl ) == FLA_SUBPROBLEM )
	{
		// Recurse
		r_val = FLA_Copy_internal( *FLASH_OBJ_PTR_AT( A ),
		                           *FLASH_OBJ_PTR_AT( B ),
		                           flash_copy_cntl );
	}
	else if ( FLA_Cntl_matrix_type( cntl ) == FLA_HIER &&
	          FLA_Obj_elemtype( A ) == FLA_SCALAR &&
	          FLASH_Queue_get_enabled( ) )
	{
		// Enqueue
		ENQUEUE_FLASH_Copy( A, B, cntl );
	}
	else
	{
		if ( FLA_Cntl_matrix_type( cntl ) == FLA_HIER &&
		     FLA_Obj_elemtype( A ) == FLA_SCALAR &&
		     !FLASH_Queue_get_enabled( ) )
		{
			// Execute leaf
			cntl = flash_copy_cntl_blas;
		}
		
		// Parameter combinations
		if      ( FLA_Cntl_variant( cntl ) == FLA_SUBPROBLEM )
		{
			r_val = FLA_Copy_task( A, B, cntl );
		}
		else if ( FLA_Cntl_variant( cntl ) == FLA_BLOCKED_VARIANT1 )
		{
			r_val = FLA_Copy_blk_var1( A, B, cntl );
		}
#ifdef FLA_ENABLE_NON_CRITICAL_CODE
		else if ( FLA_Cntl_variant( cntl ) == FLA_BLOCKED_VARIANT2 )
		{
			r_val = FLA_Copy_blk_var2( A, B, cntl );
		}
#endif
		else if ( FLA_Cntl_variant( cntl ) == FLA_BLOCKED_VARIANT3 )
		{
			r_val = FLA_Copy_blk_var3( A, B, cntl );
		}
#ifdef FLA_ENABLE_NON_CRITICAL_CODE
		else if ( FLA_Cntl_variant( cntl ) == FLA_BLOCKED_VARIANT4 )
		{
			r_val = FLA_Copy_blk_var4( A, B, cntl );
		}
#endif
		else
		{
			r_val = FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED );
		}
	}

	return r_val;
}
Beispiel #11
0
FLA_Error FLA_Syrk_ln_blk_var2_ht( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, FLA_Syrk_t* cntl )
{
  FLA_Obj AT,              A0,
          AB,              A1,
                           A2;

  FLA_Obj CTL,   CTR,      C00, C01, C02, 
          CBL,   CBR,      C10, C11, C12,
                           C20, C21, C22;

  int b;

  FLA_Part_2x1( A,    &AT, 
                      &AB,            0, FLA_TOP );

  FLA_Part_2x2( C,    &CTL, &CTR,
                      &CBL, &CBR,     0, 0, FLA_TL );

  while ( FLA_Obj_length( AT ) < FLA_Obj_length( A ) ){

    b = 1;

    FLA_Repart_2x1_to_3x1( AT,                &A0, 
                        /* ** */            /* ** */
                                              &A1, 
                           AB,                &A2,        b, FLA_BOTTOM );

    FLA_Repart_2x2_to_3x3( CTL, /**/ CTR,       &C00, /**/ &C01, &C02,
                        /* ************* */   /* ******************** */
                                                &C10, /**/ &C11, &C12,
                           CBL, /**/ CBR,       &C20, /**/ &C21, &C22,
                           b, b, FLA_BR );

    /*------------------------------------------------------------*/

    /* C21 = C21 + A2 * A1' */
    FLA_Gemm_nt_blk_var1_ht( alpha, A2, A1, beta, C21,
                          NULL );

    /* C11 = C11 + A1 * A1' */
    FLA_Syrk_external( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, 
                       alpha, *FLASH_OBJ_PTR_AT( A1 ), beta, *FLASH_OBJ_PTR_AT( C11 ),
                       NULL );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x1_to_2x1( &AT,                A0, 
                                                  A1, 
                            /* ** */           /* ** */
                              &AB,                A2,     FLA_TOP );

    FLA_Cont_with_3x3_to_2x2( &CTL, /**/ &CTR,       C00, C01, /**/ C02,
                                                     C10, C11, /**/ C12,
                            /* ************** */  /* ****************** */
                              &CBL, /**/ &CBR,       C20, C21, /**/ C22,
                              FLA_TL );

  }

  return FLA_SUCCESS;
}
Beispiel #12
0
FLA_Error FLA_Trsm_internal( FLA_Side side, FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl )
{
    FLA_Error r_val = FLA_SUCCESS;

    if ( FLA_Check_error_level() == FLA_FULL_ERROR_CHECKING )
        FLA_Trsm_internal_check( side, uplo, transa, diag, alpha, A, B, cntl );

    if      ( FLA_Cntl_matrix_type( cntl ) == FLA_HIER &&
              FLA_Obj_elemtype( A ) == FLA_MATRIX &&
              FLA_Cntl_variant( cntl ) == FLA_SUBPROBLEM )
    {
        // Recurse
        r_val = FLA_Trsm_internal( side,
                                   uplo,
                                   transa,
                                   diag,
                                   alpha,
                                   *FLASH_OBJ_PTR_AT( A ),
                                   *FLASH_OBJ_PTR_AT( B ),
                                   flash_trsm_cntl_mm );
    }
    else if ( FLA_Cntl_matrix_type( cntl ) == FLA_HIER &&
              FLA_Obj_elemtype( A ) == FLA_SCALAR &&
              FLASH_Queue_get_enabled( ) )
    {
        // Enqueue
        ENQUEUE_FLASH_Trsm( side, uplo, transa, diag, alpha, A, B, cntl );
    }
    else
    {
        if ( FLA_Cntl_matrix_type( cntl ) == FLA_HIER &&
                FLA_Obj_elemtype( A ) == FLA_SCALAR &&
                !FLASH_Queue_get_enabled( ) )
        {
            // Execute leaf
            cntl = flash_trsm_cntl_blas;
        }

        // Parameter combinations
        if      ( side == FLA_LEFT )
        {
            if      ( uplo == FLA_LOWER_TRIANGULAR )
            {
                if      ( transa == FLA_NO_TRANSPOSE )
                    r_val = FLA_Trsm_lln( diag, alpha, A, B, cntl );
                else if ( transa == FLA_TRANSPOSE )
                    r_val = FLA_Trsm_llt( diag, alpha, A, B, cntl );
                else if ( transa == FLA_CONJ_NO_TRANSPOSE )
                    r_val = FLA_Trsm_llc( diag, alpha, A, B, cntl );
                else if ( transa == FLA_CONJ_TRANSPOSE )
                    r_val = FLA_Trsm_llh( diag, alpha, A, B, cntl );
            }
            else if ( uplo == FLA_UPPER_TRIANGULAR )
            {
                if      ( transa == FLA_NO_TRANSPOSE )
                    r_val = FLA_Trsm_lun( diag, alpha, A, B, cntl );
                else if ( transa == FLA_TRANSPOSE )
                    r_val = FLA_Trsm_lut( diag, alpha, A, B, cntl );
                else if ( transa == FLA_CONJ_NO_TRANSPOSE )
                    r_val = FLA_Trsm_luc( diag, alpha, A, B, cntl );
                else if ( transa == FLA_CONJ_TRANSPOSE )
                    r_val = FLA_Trsm_luh( diag, alpha, A, B, cntl );
            }
        }
        else if ( side == FLA_RIGHT )
        {
            if      ( uplo == FLA_LOWER_TRIANGULAR )
            {
                if      ( transa == FLA_NO_TRANSPOSE )
                    r_val = FLA_Trsm_rln( diag, alpha, A, B, cntl );
                else if ( transa == FLA_TRANSPOSE )
                    r_val = FLA_Trsm_rlt( diag, alpha, A, B, cntl );
                else if ( transa == FLA_CONJ_NO_TRANSPOSE )
                    r_val = FLA_Trsm_rlc( diag, alpha, A, B, cntl );
                else if ( transa == FLA_CONJ_TRANSPOSE )
                    r_val = FLA_Trsm_rlh( diag, alpha, A, B, cntl );
            }
            else if ( uplo == FLA_UPPER_TRIANGULAR )
            {
                if      ( transa == FLA_NO_TRANSPOSE )
                    r_val = FLA_Trsm_run( diag, alpha, A, B, cntl );
                else if ( transa == FLA_TRANSPOSE )
                    r_val = FLA_Trsm_rut( diag, alpha, A, B, cntl );
                else if ( transa == FLA_CONJ_NO_TRANSPOSE )
                    r_val = FLA_Trsm_ruc( diag, alpha, A, B, cntl );
                else if ( transa == FLA_CONJ_TRANSPOSE )
                    r_val = FLA_Trsm_ruh( diag, alpha, A, B, cntl );
            }
        }
    }

    return r_val;
}
Beispiel #13
0
FLA_Error FLASH_Axpy_hierarchy( int direction, FLA_Obj alpha, FLA_Obj F, FLA_Obj* H )
{
	// Once we get down to a submatrix whose elements are scalars, we are down
	// to our base case.
	if ( FLA_Obj_elemtype( *H ) == FLA_SCALAR )
	{
		// Depending on which top-level function invoked us, we either axpy
		// the source data in the flat matrix to the leaf-level submatrix of
		// the hierarchical matrix, or axpy the data in the hierarchical
		// submatrix to the flat matrix.
		if      ( direction == FLA_FLAT_TO_HIER )
		{
#ifdef FLA_ENABLE_SCC
			if ( FLA_is_owner() )
#endif
			FLA_Axpy_external( alpha, F, *H );
		}
		else if ( direction == FLA_HIER_TO_FLAT )
		{
#ifdef FLA_ENABLE_SCC
			if ( FLA_is_owner() )
#endif
			FLA_Axpy_external( alpha, *H, F );
		}
	}
	else
	{
		FLA_Obj HL,  HR,       H0,  H1,  H2;
		FLA_Obj FL,  FR,       F0,  F1,  F2;

		FLA_Obj H1T,           H01,
		        H1B,           H11,
		                       H21;
		FLA_Obj F1T,           F01,
		        F1B,           F11,
		                       F21;

		dim_t b_m;
		dim_t b_n;

		FLA_Part_1x2( *H,    &HL,  &HR,      0, FLA_LEFT );
		FLA_Part_1x2(  F,    &FL,  &FR,      0, FLA_LEFT );

		while ( FLA_Obj_width( HL ) < FLA_Obj_width( *H ) )
		{
			FLA_Repart_1x2_to_1x3( HL,  /**/ HR,        &H0, /**/ &H1, &H2,
			                       1, FLA_RIGHT );

			// Get the scalar width of H1 and use that to determine the
			// width of F1.
			b_n = FLASH_Obj_scalar_width( H1 );

			FLA_Repart_1x2_to_1x3( FL,  /**/ FR,        &F0, /**/ &F1, &F2,
			                       b_n, FLA_RIGHT );

			// -------------------------------------------------------------

			FLA_Part_2x1( H1,    &H1T,
			                     &H1B,       0, FLA_TOP );
			FLA_Part_2x1( F1,    &F1T,
			                     &F1B,       0, FLA_TOP );

			while ( FLA_Obj_length( H1T ) < FLA_Obj_length( H1 ) )
			{
				FLA_Repart_2x1_to_3x1( H1T,               &H01,
				                    /* ** */            /* *** */
				                                          &H11,
				                       H1B,               &H21,        1, FLA_BOTTOM );

				// Get the scalar length of H11 and use that to determine the
				// length of F11.
				b_m = FLASH_Obj_scalar_length( H11 );

				FLA_Repart_2x1_to_3x1( F1T,               &F01,
				                    /* ** */            /* *** */
				                                          &F11,
				                       F1B,               &F21,        b_m, FLA_BOTTOM );
				// -------------------------------------------------------------

				// Recursively axpy between F11 and H11.
				FLASH_Axpy_hierarchy( direction, alpha, F11,
				                      FLASH_OBJ_PTR_AT( H11 ) );

				// -------------------------------------------------------------

				FLA_Cont_with_3x1_to_2x1( &H1T,               H01,
				                                              H11,
				                        /* ** */           /* *** */
				                          &H1B,               H21,     FLA_TOP );
				FLA_Cont_with_3x1_to_2x1( &F1T,               F01,
				                                              F11,
				                        /* ** */           /* *** */
				                          &F1B,               F21,     FLA_TOP );
			}

			// -------------------------------------------------------------

			FLA_Cont_with_1x3_to_1x2( &HL,  /**/ &HR,        H0, H1, /**/ H2,
			                          FLA_LEFT );
			FLA_Cont_with_1x3_to_1x2( &FL,  /**/ &FR,        F0, F1, /**/ F2,
			                          FLA_LEFT );
		}
	}

	return FLA_SUCCESS;
}
Beispiel #14
0
int main(int argc, char *argv[])
{
  int 
    datatype,
    n_blocks_m,
    n_threads,
    m_input, n_input,
    m, n,
    p_first, p_last, p_inc,
    p,
    n_repeats,
    param_combo,
    i,
    n_param_combos = N_PARAM_COMBOS;
  
  dim_t
    nb_flash, nb_alg;

  char *colors = "brkgmcbrkgmcbrkgmc";
  char *ticks  = "o+*xso+*xso+*xso+*xs";
  char m_dim_desc[14];
  char n_dim_desc[14];
  char m_dim_tag[10];
  char n_dim_tag[10];

  double max_gflops=6.0;

  double
    dtime,
    gflops,
    diff;

  FLA_Obj
    A, A_flat_ref, A_flat, B, B_flat, D, D_flat, t, T, T_flat;
  
  FLA_Init( );


  fprintf( stdout, "%c number of repeats: ", '%' );
  scanf( "%d", &n_repeats );
  fprintf( stdout, "%c %d\n", '%', n_repeats );

  fprintf( stdout, "%c enter algorithmic blocksize: ", '%' );
  scanf( "%u", &nb_alg );
  fprintf( stdout, "%c %u\n", '%', nb_alg );

  fprintf( stdout, "%c enter problem size first, last, inc: ", '%' );
  scanf( "%d%d%d", &p_first, &p_last, &p_inc );
  fprintf( stdout, "%c %d %d %d\n", '%', p_first, p_last, p_inc );

  fprintf( stdout, "%c enter m n (-1 means bind to problem size): ", '%' );
  scanf( "%d%d", &m_input, &n_input );
  fprintf( stdout, "%c %d %d\n", '%', m_input, n_input );

  fprintf( stdout, "%c enter the number of SuperMatrix threads: ", '%' );
  scanf( "%d", &n_threads );
  fprintf( stdout, "%c %d\n", '%', n_threads );


  fprintf( stdout, "\nclear all;\n\n" );


  if     ( m_input >  0 ) {
    sprintf( m_dim_desc, "m = %d", m_input );
    sprintf( m_dim_tag,  "m%dc", m_input);
  }
  else if( m_input <  -1 ) {
    sprintf( m_dim_desc, "m = p/%d", -m_input );
    sprintf( m_dim_tag,  "m%dp", -m_input );
  }
  else if( m_input == -1 ) {
    sprintf( m_dim_desc, "m = p" );
    sprintf( m_dim_tag,  "m%dp", 1 );
  }
  if     ( n_input >  0 ) {
    sprintf( n_dim_desc, "n = %d", n_input );
    sprintf( n_dim_tag,  "n%dc", n_input);
  }
  else if( n_input <  -1 ) {
    sprintf( n_dim_desc, "n = p/%d", -n_input );
    sprintf( n_dim_tag,  "n%dp", -n_input );
  }
  else if( n_input == -1 ) {
    sprintf( n_dim_desc, "n = p" );
    sprintf( n_dim_tag,  "n%dp", 1 );
  }

  //datatype = FLA_FLOAT;
  //datatype = FLA_DOUBLE;
  //datatype = FLA_COMPLEX;
  datatype = FLA_DOUBLE_COMPLEX;

  FLASH_Queue_set_num_threads( n_threads );

  for ( p = p_first, i = 1; p <= p_last; p += p_inc, i += 1 )
  {
    m = m_input;
    n = n_input;

    if( m < 0 ) m = p / abs(m_input);
    if( n < 0 ) n = p / abs(n_input);

    nb_flash = n;

    for ( param_combo = 0; param_combo < n_param_combos; param_combo++ )
    {

      FLA_Obj_create( datatype, m,          nb_flash, &A_flat );
      FLA_Obj_create( datatype, m,          nb_flash, &A_flat_ref );
      FLA_Obj_create( datatype, m,          nb_flash, &T_flat );
      FLA_Obj_create( datatype, nb_flash,   1,        &t );

      FLASH_Obj_create( datatype, m, nb_flash, 1, &nb_flash, &A );
      n_blocks_m = FLA_Obj_length( A );
      FLASH_Obj_create_ext( datatype, nb_alg * n_blocks_m, nb_flash, 1, &nb_alg, &nb_flash, &T );

      FLA_Set( FLA_ZERO, T_flat );
      FLASH_Set( FLA_ZERO, T );

      FLASH_Random_matrix( A );
      FLASH_Obj_flatten( A, A_flat );

      FLA_Part_2x1( A,   &B,
                         &D,   1, FLA_TOP );

      FLA_Part_2x1( A_flat,   &B_flat,
                              &D_flat,   FLA_Obj_width( A_flat ), FLA_TOP );

      FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, *(FLASH_OBJ_PTR_AT(B)) );
      FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, B_flat );


      fprintf( stdout, "data_qr2ut_%s( %d, 1:5 ) = [ %d  ", pc_str[param_combo], i, p );
      fflush( stdout );

      time_QR2_UT( param_combo, FLA_ALG_REFERENCE, n_repeats, m, n,
                    A, A_flat_ref, B, B_flat, D, D_flat, A_flat, t, T, T_flat, &dtime, &diff, &gflops );

      fprintf( stdout, "%6.3lf %6.2le ", gflops, diff );
      fflush( stdout );

      time_QR2_UT( param_combo, FLA_ALG_FRONT, n_repeats, m, n,
                    A, A_flat_ref, B, B_flat, D, D_flat, A_flat, t, T, T_flat, &dtime, &diff, &gflops );

      fprintf( stdout, "%6.3lf %6.2le ", gflops, diff );
      fflush( stdout );


      fprintf( stdout, " ]; \n" );
      fflush( stdout );

      FLA_Obj_free( &A_flat );
      FLA_Obj_free( &A_flat_ref );
      FLA_Obj_free( &T_flat );
      FLA_Obj_free( &t );

      FLASH_Obj_free( &A );
      FLASH_Obj_free( &T );
    }

    fprintf( stdout, "\n" );
  }

  fprintf( stdout, "figure;\n" );

  fprintf( stdout, "hold on;\n" );

  for ( i = 0; i < n_param_combos; i++ ) {
    fprintf( stdout, "plot( data_qr2ut_%s( :,1 ), data_qr2ut_%s( :, 2 ), '%c:%c' ); \n",
            pc_str[i], pc_str[i], colors[ i ], ticks[ i ] );
    fprintf( stdout, "plot( data_qr2ut_%s( :,1 ), data_qr2ut_%s( :, 4 ), '%c-.%c' ); \n",
            pc_str[i], pc_str[i], colors[ i ], ticks[ i ] );
  }

  fprintf( stdout, "legend( ... \n" );

  for ( i = 0; i < n_param_combos; i++ )
    fprintf( stdout, "'ref\\_qr2ut\\_%s', 'fla\\_qr2ut\\_%s', ... \n", pc_str[i], pc_str[i] );

  fprintf( stdout, "'Location', 'SouthEast' ); \n" );


  fprintf( stdout, "xlabel( 'problem size p' );\n" );
  fprintf( stdout, "ylabel( 'GFLOPS/sec.' );\n" );
  fprintf( stdout, "axis( [ 0 %d 0 %.2f ] ); \n", p_last, max_gflops );
  fprintf( stdout, "title( 'FLAME qr2ut front-end performance (%s, %s)' );\n",
           m_dim_desc, n_dim_desc );
  fprintf( stdout, "print -depsc qr2ut_front_%s_%s.eps\n", m_dim_tag, n_dim_tag );
  fprintf( stdout, "hold off;\n");
  fflush( stdout );

  FLA_Finalize( );

  return 0;
}