void MatrixSymDiagStd::Mp_StM(
  DMatrixSlice* gms_lhs, value_type alpha, BLAS_Cpp::Transp trans_rhs) const
{
  using DenseLinAlgPack::Vp_StV;
  // Assert that the dimensions match up.
  DenseLinAlgPack::Mp_M_assert_sizes( gms_lhs->rows(), gms_lhs->cols(), BLAS_Cpp::no_trans
    , rows(), cols(), trans_rhs );
  // Add to the diagonal
  Vp_StV( &gms_lhs->diag(), alpha, diag_ );
}
inline
void Vp_MtV(DVectorSlice* vs_lhs, const M& M_rhs1, BLAS_Cpp::Transp trans_rhs1
  , const V& V_rhs2, value_type beta)
{
  Vp_StMtV(vs_lhs,1.0,M_rhs1,trans_rhs1,V_rhs2,beta);
}
inline
void Mp_MtM(DMatrixSlice* gms_lhs, const M1& M1_rhs1, BLAS_Cpp::Transp trans_rhs1
  , const M2& M2_rhs2, BLAS_Cpp::Transp trans_rhs2, value_type beta)
{
  Mp_StMtM(gms_lhs,1.0,M1_rhs1,trans_rhs1,M2_rhs2,trans_rhs2,beta);
}
inline
void Vp_V(DVectorSlice* vs_lhs, const V& V_rhs) {
  Vp_StV(vs_lhs,1.0,V_rhs);
}
inline
void Mp_M(DMatrixSlice* gms_lhs, const M& M_rhs, BLAS_Cpp::Transp trans_rhs) {
  Mp_StM(gms_lhs,1.0,M_rhs,trans_rhs);
}
bool MoochoPack::LineSearchWatchDog_Step::do_step(Algorithm& _algo
  , poss_type step_poss, IterationPack::EDoStepType type, poss_type assoc_step_poss)
{
  using DenseLinAlgPack::norm_inf;
  using DenseLinAlgPack::V_VpV;
  using DenseLinAlgPack::Vp_StV;
  using DenseLinAlgPack::Vt_S;

  using LinAlgOpPack::Vp_V;
  using LinAlgOpPack::V_MtV;

  using ConstrainedOptPack::print_vector_change_stats;

  NLPAlgo	&algo	= rsqp_algo(_algo);
  NLPAlgoState	&s		= algo.rsqp_state();
  NLP			&nlp	= algo.nlp();

  EJournalOutputLevel olevel = algo.algo_cntr().journal_output_level();
  std::ostream& out = algo.track().journal_out();
  out << std::boolalpha;

  // print step header.
  if( (int)olevel >= (int)PRINT_ALGORITHM_STEPS ) {
    using IterationPack::print_algorithm_step;
    print_algorithm_step( algo, step_poss, type, assoc_step_poss, out );
  }

  // /////////////////////////////////////////
  // Set references to iteration quantities
  //
  // Set k+1 first then go back to get k to ensure
  // we have backward storage.
  
  DVector
    &x_kp1 = s.x().set_k(+1).v();
  value_type
    &f_kp1 = s.f().set_k(+1);
  DVector
    &c_kp1 = s.c().set_k(+1).v();

  const value_type
    &f_k = s.f().get_k(0);
  const DVector
    &c_k = s.c().get_k(0).v();
  const DVector
    &x_k = s.x().get_k(0).v();
  const DVector
    &d_k = s.d().get_k(0).v();
  value_type
    &alpha_k = s.alpha().get_k(0);

  // /////////////////////////////////////
  // Compute Dphi_k, phi_kp1 and phi_k

  // Dphi_k
  const value_type
    Dphi_k = merit_func().deriv();
  if( Dphi_k >= 0 ) {
    throw LineSearchFailure( "LineSearch2ndOrderCorrect_Step::do_step(...) : " 
      "Error, d_k is not a descent direction for the merit function " );
  }

  // ph_kp1
  value_type
    &phi_kp1 = s.phi().set_k(+1) = merit_func().value( f_kp1, c_kp1 );

  // Must compute phi(x) at the base point x_k since the penalty parameter may have changed.
  const value_type
    &phi_k = s.phi().set_k(0) = merit_func().value( f_k, c_k );

  // //////////////////////////////////////
  // Setup the calculation merit function

  // Here f_kp1, and c_kp1 are updated at the same time the
  // line search is being performed.
  nlp.set_f( &f_kp1 );
  nlp.set_c( &c_kp1 );
  MeritFuncCalcNLP
    phi_calc( &merit_func(), &nlp );

  // ////////////////////////////////
  // Use Watchdog near the solution

  if( watch_k_ == NORMAL_LINE_SEARCH ) {
    const value_type
      opt_kkt_err_k	= s.opt_kkt_err().get_k(0),
      feas_kkt_err_k	= s.feas_kkt_err().get_k(0);
    if( opt_kkt_err_k <= opt_kkt_err_threshold() && feas_kkt_err_k <= feas_kkt_err_threshold() ) {
      if( (int)olevel >= (int)PRINT_ALGORITHM_STEPS ) {
        out	<< "\nopt_kkt_err_k = " << opt_kkt_err_k << " <= opt_kkt_err_threshold = "
            << opt_kkt_err_threshold() << std::endl
          << "\nfeas_kkt_err_k = " << feas_kkt_err_k << " <= feas_kkt_err_threshold = "
            << feas_kkt_err_threshold() << std::endl
          << "\nSwitching to watchdog linesearch ...\n";
      }
      watch_k_ = 0;
    }
  }

  if( (int)olevel >= (int)PRINT_ALGORITHM_STEPS ) {
    out	<< "\nTrial point:\n"
      << "phi_k   = " << phi_k << std::endl
      << "Dphi_k  = " << Dphi_k << std::endl
      << "phi_kp1 = " << phi_kp1 << std::endl;
  }

  bool	ls_success = true,
      step_return = true;

  switch( watch_k_ ) {
    case 0:
    {
      // Take  a full step
      const value_type phi_cord = phi_k + eta() * Dphi_k;
      const bool accept_step = phi_kp1 <= phi_cord;

      if( (int)olevel >= (int)PRINT_ALGORITHM_STEPS ) {
        out	<< "\n*** Zeroth watchdog iteration:\n"
          << "\nphi_kp1 = " << phi_kp1 << ( accept_step ? " <= " : " > " )
            << "phi_k + eta * Dphi_k = " << phi_cord << std::endl;
      }

      if( phi_kp1 > phi_cord ) {
        if( (int)olevel >= (int)PRINT_ALGORITHM_STEPS ) {
          out	<< "\nAccept this increase for now but watch out next iteration!\n";
        }
        // Save this initial point
        xo_		= x_k;
        fo_		= f_k;
        nrm_co_	= norm_inf( c_k );
        do_		= d_k;
        phio_	= phi_k;
        Dphio_	= Dphi_k;
        phiop1_	= phi_kp1;
        // Slip the update of the penalty parameter
        const value_type mu_k = s.mu().get_k(0);
        s.mu().set_k(+1) = mu_k;
        // Move on to the next step in the watchdog procedure
        watch_k_ = 1;
      }
      else {
        if( (int)olevel >= (int)PRINT_ALGORITHM_STEPS ) {
          out	<< "\nAll is good!\n";
        }
        // watch_k_ stays 0
      }
      step_return = true;
      break;
    }
    case 1:
    {
      if( (int)olevel >= (int)PRINT_ALGORITHM_STEPS ) {
        out	<< "\n*** First watchdog iteration:\n"
          << "\nDo a line search to determine x_kp1 = x_k + alpha_k * d_k ...\n";
      }
      // Now do a line search but and we require some type of reduction
      const DVectorSlice xd[2] = { x_k(), d_k() };
      MeritFuncCalc1DQuadratic phi_calc_1d( phi_calc, 1, xd, &x_kp1() );
      ls_success = direct_line_search().do_line_search( phi_calc_1d, phi_k
        , &alpha_k, &phi_kp1
        , (int)olevel >= (int)PRINT_ALGORITHM_STEPS ?
          &out : static_cast<std::ostream*>(0)	);

      // If the linesearch failed then the rest of the tests will catch this.

      value_type phi_cord = 0;
      bool test1, test2;

      if(		( test1 = ( phi_k <= phio_ ) )
        || ( test2 = phi_kp1 <= ( phi_cord = phio_ + eta() * Dphio_ ) )		)
      {
        // We will accept this step and and move on.
        if( (int)olevel >= (int)PRINT_ALGORITHM_STEPS ) {
          out
            << "\nphi_k = " << phi_k << ( test1 ? " <= " : " > " )
              << "phi_km1 = " << phio_ << std::endl
            << "phi_kp1 = " << phi_kp1 << ( test2 ? " <= " : " > " )
              << "phi_km1 + eta * Dphi_km1 = " << phi_cord << std::endl
            << "This is a sufficent reduction so reset watchdog.\n";
        }
        watch_k_ = 0;
        step_return = true;
      }
      else if ( ! ( test1 = ( phi_kp1 <= phio_ ) ) ) {
        // Even this reduction is no good!
        // Go back to original point and do a linesearch from there.
        if( (int)olevel >= (int)PRINT_ALGORITHM_STEPS ) {
          out
            << "\nphi_kp1 = " << phi_kp1 << " > phi_km1 = " << phio_ << std::endl
            << "This is not good reduction in phi so do linesearch from x_km1\n"
            << "\n* Go back to x_km1: x_kp1 = x_k - alpha_k * d_k ...\n";
        }

        // Go back from x_k to x_km1 for iteration k:
        //
        // x_kp1 = x_km1
        // x_kp1 = x_k - alpha_km1 * d_km1
        //
        // A negative sign for alpha is an indication that we are backtracking.
        //
        s.alpha().set_k(0)			= -1.0;
        s.d().set_k(0).v()			= do_;
        s.f().set_k(+1)				= fo_;

        if( (int)olevel >= (int)PRINT_ALGORITHM_STEPS ) {
          out << "Output iteration k ...\n"
            << "k = k+1\n";
        }

        // Output these iteration quantities
        algo.track().output_iteration( algo );	// k
        // Transition to iteration k+1
        s.next_iteration();

        // Take the step from x_k = x_km2 to x_kp1 for iteration k (k+1):
        //
        // x_kp1 = x_km2 + alpha_n * d_km2
        // x_kp1 = x_k   + alpha_n * d_km1
        // x_kp1 = x_n
        //				
        if( (int)olevel >= (int)PRINT_ALGORITHM_STEPS ) {
          out << "\n* Take the step from x_k = x_km2 to x_kp1 for iteration k (k+1)\n"
            << "Find: x_kp1 = x_k + alpha_k * d_k = x_km2 + alpha_k * d_km2\n ...\n";
        }

        // alpha_k = 1.0
        value_type &alpha_k = s.alpha().set_k(0) = 1.0;
        
        // /////////////////////////////////////
        // Compute Dphi_k and phi_k

        // x_k
        const DVector &x_k								= xo_;

        // d_k
        const DVector &d_k = s.d().set_k(0).v()			= do_;

        // Dphi_k
        const value_type &Dphi_k						= Dphio_;

        // phi_k
        const value_type &phi_k = s.phi().set_k(0)		= phio_;

        // Here f_kp1, and c_kp1 are updated at the same time the
        // line search is being performed.
        algo.nlp().set_f( &s.f().set_k(+1) );
        algo.nlp().set_c( &s.c().set_k(+1).v() );
        phi_calc.set_nlp( algo.get_nlp() );

        // ////////////////////////////////////////
        // Compute x_xp1 and ph_kp1 for full step

        // x_kp1 = x_k + alpha_k * d_k
        DVector &x_kp1 = s.x().set_k(+1).v();
        V_VpV( &x_kp1, x_k, d_k );

        // phi_kp1
        value_type &phi_kp1 = s.phi().set_k(+1)			= phiop1_;

        const DVectorSlice xd[2] = { x_k(), d_k() };
        MeritFuncCalc1DQuadratic phi_calc_1d( phi_calc, 1, xd, &x_kp1() );
        ls_success = direct_line_search().do_line_search(
            phi_calc_1d, phi_k
          , &alpha_k, &phi_kp1
          , (int)olevel >= (int)PRINT_ALGORITHM_STEPS ?
            &out : static_cast<std::ostream*>(0)	);

        if( (int)olevel >= (int)PRINT_ALGORITHM_STEPS ) {
          out << "\nOutput iteration k (k+1) ...\n"
            << "k = k+1 (k+2)\n"
            << "Reinitialize watchdog algorithm\n";
        }

        // Output these iteration quantities
        algo.track().output_iteration( algo );	// (k+1)
        // Transition to iteration k+1 (k+2)
        s.next_iteration();

        watch_k_ = 0; // Reinitialize the watchdog

        // Any update for k (k+2) should use the last updated value
        // which was for k-2 (k) since there is not much info for k-1 (k+1).
        // Be careful here and make sure this is square with other steps.

        algo.do_step_next( EvalNewPoint_name );
        step_return = false;	// Redirect control
      }
      else {
        if( (int)olevel >= (int)PRINT_ALGORITHM_STEPS ) {
          out
            << "phi_kp1 = " << phi_kp1 << " <= phi_km1 = " << phio_ << std::endl
            << "\nAccept this step but do a linesearch next iteration!\n";
        }
        // Slip the update of the penalty parameter
        const value_type mu_k = s.mu().get_k(0);
        s.mu().set_k(+1) = mu_k;
        // Do the last stage of the watchdog procedure next iteration.
        watch_k_ = 2;
        step_return = true;
      }
      break;
    }
    case NORMAL_LINE_SEARCH:
    case 2:
    {
      if( (int)olevel >= (int)PRINT_ALGORITHM_STEPS ) {
        if( watch_k_ == 2 ) {
          out	<< "\n*** Second watchdog iteration:\n"
            << "Do a line search to determine x_kp1 = x_k + alpha_k * d_k ...\n";
        }
        else {
          out	<< "\n*** Normal linesearch:\n"
            << "Do a line search to determine x_kp1 = x_k + alpha_k * d_k ...\n";
        }
      }

      const DVectorSlice xd[2] = { x_k(), d_k() };
      MeritFuncCalc1DQuadratic phi_calc_1d( phi_calc, 1, xd, &x_kp1() );
      ls_success = direct_line_search().do_line_search( phi_calc_1d, phi_k
        , &alpha_k, &phi_kp1
        , (int)olevel >= (int)PRINT_ALGORITHM_STEPS ?
          &out : static_cast<std::ostream*>(0)	);

      if( watch_k_ == 2 )
        watch_k_ = 0;

      step_return = true;
      break;
    }
    default:
      TEST_FOR_EXCEPT(true);	// Only local programming error
  }

  if( static_cast<int>(olevel) >= static_cast<int>(PRINT_ALGORITHM_STEPS) ) {
    out	<< "\nalpha    = " << s.alpha().get_k(0) << "\n";
    out << "\nphi_kp1 = " << s.phi().get_k(+1) << "\n";
  }

  if( static_cast<int>(olevel) >= static_cast<int>(PRINT_VECTORS) ) {
    out << "\nd_k = \n" << s.d().get_k(0)();
    out << "\nx_kp1 = \n" << s.x().get_k(+1)();
  }

  if( !ls_success )
    throw LineSearchFailure("LineSearchWatchDog_Step::do_step(): Line search failure");

  return step_return;

}
QPSolverStats::ESolutionType
QPSolverRelaxedQPKWIK::imp_solve_qp(
  std::ostream* out, EOutputLevel olevel, ERunTests test_what
  ,const Vector& g, const MatrixSymOp& G
  ,value_type etaL
  ,const Vector* dL, const Vector* dU
  ,const MatrixOp* E, BLAS_Cpp::Transp trans_E, const Vector* b
  ,const Vector* eL, const Vector* eU
  ,const MatrixOp* F, BLAS_Cpp::Transp trans_F, const Vector* f
  ,value_type* obj_d
  ,value_type* eta, VectorMutable* d
  ,VectorMutable* nu
  ,VectorMutable* mu, VectorMutable* Ed
  ,VectorMutable* lambda, VectorMutable* Fd
  )
{
  using Teuchos::dyn_cast;
  using DenseLinAlgPack::nonconst_tri_ele;
  using LinAlgOpPack::dot;
  using LinAlgOpPack::V_StV;
  using LinAlgOpPack::assign;
  using LinAlgOpPack::V_StV;
  using LinAlgOpPack::V_MtV;
  using AbstractLinAlgPack::EtaVector;
  using AbstractLinAlgPack::transVtMtV;
  using AbstractLinAlgPack::num_bounded;
  using ConstrainedOptPack::MatrixExtractInvCholFactor;

  // /////////////////////////
  // Map to QPKWIK input

  // Validate that rHL is of the proper type.
  const MatrixExtractInvCholFactor &cG
    = dyn_cast<const MatrixExtractInvCholFactor>(G);

  // Determine the number of sparse bounds on variables and inequalities.
  // By default set for the dense case
  const value_type inf = this->infinite_bound();
  const size_type
    nd              = d->dim(),
    m_in            = E  ? b->dim()                  : 0,
    m_eq            = F  ? f->dim()                  : 0,
    nvarbounds      = dL ? num_bounded(*dL,*dU,inf)  : 0,
    ninequbounds    = E  ? num_bounded(*eL,*eU,inf)  : 0,
    nequalities     = F  ? f->dim()                  : 0;

  // Determine if this is a QP with a structure different from the
  // one just solved.
  
  const bool same_qp_struct = (  N_ == nd && M1_ == nvarbounds && M2_ == ninequbounds && M3_ == nequalities );

  /////////////////////////////////////////////////////////////////
  // Set the input parameters to be sent to QPKWIKNEW

  // N
  N_ = nd;

  // M1
  M1_ = nvarbounds;

  // M2
  M2_ = ninequbounds;

  // M3
  M3_ = nequalities;

  // GRAD
  GRAD_ = VectorDenseEncap(g)();

  // UINV_AUG
  //
  // UINV_AUG = [ sqrt(bigM)  0  ]
  //            [ 0           L' ]
  //
  UINV_AUG_.resize(N_+1,N_+1);
  cG.extract_inv_chol( &nonconst_tri_ele( UINV_AUG_(2,N_+1,2,N_+1), BLAS_Cpp::upper ) );
  UINV_AUG_(1,1) = 1.0 / ::sqrt( NUMPARAM_[2] );
  UINV_AUG_.col(1)(2,N_+1) = 0.0;
  UINV_AUG_.row(1)(2,N_+1) = 0.0;

  // LDUINV_AUG
  LDUINV_AUG_ = UINV_AUG_.rows();

  // IBND, BL , BU, A, LDA, YPY

  IBND_INV_.resize( nd + m_in);
  std::fill( IBND_INV_.begin(), IBND_INV_.end(), 0 ); // Initialize the zero
  IBND_.resize( my_max( 1, M1_ + M2_ ) );
  BL_.resize( my_max( 1, M1_ + M2_ ) );
  BU_.resize( my_max( 1, M1_ + M2_ + M3_ ) );
  LDA_ = my_max( 1, M2_ + M3_ );
  A_.resize( LDA_, (  M2_ + M3_ > 0 ? N_ : 1 ) );
  YPY_.resize( my_max( 1, M1_ + M2_ ) );
  if(M1_)
    YPY_(1,M1_) = 0.0; // Must be for this QP interface

  // Initialize variable bound constraints
  if( dL ) {
    VectorDenseEncap dL_de(*dL);
    VectorDenseEncap dU_de(*dU);
    // read iterators
    AbstractLinAlgPack::sparse_bounds_itr
      dLU_itr( dL_de().begin(), dL_de().end()
          ,dU_de().begin(), dU_de().end()
          ,inf );
    // written iterators
    IBND_t::iterator
      IBND_itr = IBND_.begin(),
      IBND_end = IBND_.begin() + M1_;
    DVector::iterator
      BL_itr = BL_.begin(),
      BU_itr = BU_.begin(),
      YPY_itr = YPY_.begin();
    // Loop
    for( size_type ibnd_i = 1; IBND_itr != IBND_end; ++ibnd_i, ++dLU_itr ) {
      IBND_INV_[dLU_itr.index()-1] = ibnd_i;
      *IBND_itr++ = dLU_itr.index();
      *BL_itr++	= dLU_itr.lbound();
      *BU_itr++	= dLU_itr.ubound();
      *YPY_itr++	= 0.0; // Must be zero with this QP interface
    }
  }

  // Initialize inequality constraints
  
  if(M2_) {
    VectorDenseEncap eL_de(*eL);
    VectorDenseEncap eU_de(*eU);
    VectorDenseEncap b_de(*b);
    AbstractLinAlgPack::sparse_bounds_itr
      eLU_itr( eL_de().begin(), eL_de().end()
          ,eU_de().begin(), eU_de().end()
          ,inf );
    if( M2_ < m_in ) {
      // Initialize BL, BU, YPY and A for sparse bounds on general inequalities
      // written iterators
      DVector::iterator
        BL_itr		= BL_.begin() + M1_,
        BU_itr		= BU_.begin() + M1_,
        YPY_itr		= YPY_.begin() + M1_;
      IBND_t::iterator
        ibnds_itr	= IBND_.begin() + M1_;
      // loop
      for(size_type i = 1; i <= M2_; ++i, ++eLU_itr, ++ibnds_itr ) {
        TEST_FOR_EXCEPT( !( !eLU_itr.at_end() ) );
        const size_type k      = eLU_itr.index();
        *BL_itr++              = eLU_itr.lbound();
        *BU_itr++              = eLU_itr.ubound();
        *YPY_itr++             = b_de()(k);
        *ibnds_itr             = k;  // Only for my record, not used by QPKWIK
        IBND_INV_[nd+k-1]      = M1_ + i;
        // Add the corresponding row of op(E) to A
        // y == A.row(i)'
        // y' = e_k' * op(E) => y = op(E')*e_k
        DVectorSlice y = A_.row(i);
        EtaVector e_k(k,eL_de().dim());
        V_MtV( &y( 1, N_ ), *E, BLAS_Cpp::trans_not(trans_E), e_k() ); // op(E')*e_k
      }
    }
    else {
      //
      // Initialize BL, BU, YPY and A for dense bounds on general inequalities
      //
      // Initialize BL(M1+1:M1+M2), BU(M1+1:M1+M2)
      // and IBND(M1+1:M1+M2) = identity (only for my record, not used by QPKWIK)
      DVector::iterator
        BL_itr		= BL_.begin() + M1_,
        BU_itr		= BU_.begin() + M1_;
      IBND_t::iterator
        ibnds_itr	= IBND_.begin() + M1_;
      for(size_type i = 1; i <= m_in; ++i ) {
        if( !eLU_itr.at_end() && eLU_itr.index() == i ) {
          *BL_itr++ = eLU_itr.lbound();
          *BU_itr++ = eLU_itr.ubound();
          ++eLU_itr;
        }
        else {
          *BL_itr++ = -inf;
          *BU_itr++ = +inf;
        }
        *ibnds_itr++     = i;
        IBND_INV_[nd+i-1]= M1_ + i;
      }
      // A(1:M2,1:N) = op(E)
      assign( &A_(1,M2_,1,N_), *E, trans_E );
      // YPY
      YPY_(M1_+1,M1_+M2_) = b_de();
    }
  }

  // Initialize equalities

  if(M3_) {
    V_StV( &BU_( M1_ + M2_ + 1, M1_ + M2_ + M3_ ), -1.0, VectorDenseEncap(*f)() );
    assign( &A_( M2_ + 1, M2_ + M3_, 1, N_ ), *F, trans_F );
  }

  // IYPY
  IYPY_ = 1; // ???

  // WARM
  WARM_ = 0; // Cold start by default

  // MAX_ITER
  MAX_ITER_ = static_cast<f_int>(max_qp_iter_frac() * N_);

  // INF
  INF_ = ( same_qp_struct ? 1 : 0 );
  
  // Initilize output, internal state and workspace quantities.
  if(!same_qp_struct) {
    X_.resize(N_);
    NACTSTORE_ = 0;
    IACTSTORE_.resize(N_+1);
    IACT_.resize(N_+1);
    UR_.resize(N_+1);
    ISTATE_.resize( QPKWIKNEW_CppDecl::qpkwiknew_listate(N_,M1_,M2_,M3_) );
    LRW_ = QPKWIKNEW_CppDecl::qpkwiknew_lrw(N_,M1_,M2_,M3_);
    RW_.resize(LRW_);
  }

  // /////////////////////////////////////////////
  // Setup a warm start form the input arguments
  //
  // Interestingly enough, QPKWIK sorts all of the
  // constraints according to scaled multiplier values
  // and mixes equality with inequality constriants.
  // It seems to me that you should start with equality
  // constraints first.

  WARM_      = 0;
  NACTSTORE_ = 0;

  if( m_eq ) {
    // Add equality constraints first since we know these will
    // be part of the active set.
    for( size_type j = 1; j <= m_eq; ++j ) {
      IACTSTORE_[NACTSTORE_] = 2*M1_ + 2*M2_ + j;
      ++NACTSTORE_;
    }
  }
  if( ( nu && nu->nz() ) || ( mu && mu->nz() ) ) {
    // Add inequality constraints
    const size_type
      nu_nz = nu ? nu->nz() : 0,
      mu_nz = mu ? mu->nz() : 0;
    // Combine all the multipliers for the bound and general inequality
    // constraints and sort them from the largest to the smallest.  Hopefully
    // the constraints with the larger multiplier values will not be dropped
    // from the active set.
    SpVector gamma( nd + 1 + m_in , nu_nz + mu_nz );
    typedef SpVector::element_type ele_t;
    if(nu && nu_nz) {
      VectorDenseEncap nu_de(*nu);
      DVectorSlice::const_iterator
        nu_itr = nu_de().begin(),
        nu_end = nu_de().end();
      index_type i = 1;
      while( nu_itr != nu_end ) {
        for( ; *nu_itr == 0.0; ++nu_itr, ++i );
        gamma.add_element(ele_t(i,*nu_itr));
        ++nu_itr; ++i;
      }
    }
    if(mu && mu_nz) {
      VectorDenseEncap mu_de(*mu);
      DVectorSlice::const_iterator
        mu_itr = mu_de().begin(),
        mu_end = mu_de().end();
      index_type i = 1;
      while( mu_itr != mu_end ) {
        for( ; *mu_itr == 0.0; ++mu_itr, ++i );
        gamma.add_element(ele_t(i+nd,*mu_itr));
        ++mu_itr; ++i;
      }
    }
    std::sort( gamma.begin(), gamma.end()
      , AbstractLinAlgPack::SortByDescendingAbsValue() );
    // Now add the inequality constraints in decreasing order
    const SpVector::difference_type o = gamma.offset();
    for( SpVector::const_iterator itr = gamma.begin(); itr != gamma.end(); ++itr ) {
      const size_type  j   = itr->index() + o;
      const value_type val = itr->value();
      if( j <= nd ) { // Variable bound
        const size_type ibnd_i = IBND_INV_[j-1];
        TEST_FOR_EXCEPT( !( ibnd_i ) );
        IACTSTORE_[NACTSTORE_]
          = (val < 0.0
             ? ibnd_i               // lower bound (see IACT(*))
             : M1_ + M2_ + ibnd_i   // upper bound (see IACT(*))
            );
        ++NACTSTORE_;
      }
      else if( j <= nd + m_in ) { // General inequality constraint
        const size_type ibnd_i = IBND_INV_[j-1]; // offset into M1_ + ibnd_j
        TEST_FOR_EXCEPT( !( ibnd_i ) );
        IACTSTORE_[NACTSTORE_]
          = (val < 0.0
             ? ibnd_i               // lower bound (see IACT(*))
             : M1_ + M2_ + ibnd_i   // upper bound (see IACT(*))
            );
        ++NACTSTORE_;
      }
    }
  }
  if( NACTSTORE_ > 0 )
    WARM_ = 1;

  // /////////////////////////
  // Call QPKWIK

  if( out && olevel > PRINT_NONE ) {
    *out
      << "\nCalling QPKWIK to solve QP problem ...\n";
  }

  QPKWIKNEW_CppDecl::qpkwiknew(
    N_, M1_, M2_, M3_, &GRAD_(1), &UINV_AUG_(1,1), LDUINV_AUG_, &IBND_[0]
    ,&BL_(1), &BU_(1), &A_(1,1), LDA_, &YPY_(1), IYPY_, WARM_, NUMPARAM_, MAX_ITER_, &X_(1)
    ,&NACTSTORE_, &IACTSTORE_[0], &INF_, &NACT_, &IACT_[0], &UR_[0], &EXTRA_
    ,&ITER_, &NUM_ADDS_, &NUM_DROPS_, &ISTATE_[0], LRW_, &RW_[0]
    );

  // ////////////////////////
  // Map from QPKWIK output

  // eta
  *eta = EXTRA_;
  // d
  (VectorDenseMutableEncap(*d))() = X_();
  // nu (simple variable bounds) and mu (general inequalities)
  if(nu) *nu = 0.0;
  if(mu) *mu = 0.0;
  // ToDo: Create VectorDenseMutableEncap views for faster access!
  {for(size_type i = 1; i <= NACT_; ++i) {
    size_type j = IACT_[i-1];
    EConstraintType type = constraint_type(M1_,M2_,M3_,j);
    FortranTypes::f_int idc = constraint_index(M1_,M2_,M3_,&IBND_[0],type,j);
    switch(type) {
      case NU_L:
        nu->set_ele( idc , -UR_(i) );
        break;
      case GAMA_L:
        mu->set_ele( IBND_[ M1_ + idc - 1 ], -UR_(i) );
        break;
      case NU_U:
        nu->set_ele( idc, UR_(i)) ;
        break;
      case GAMA_U:
        mu->set_ele( IBND_[ M1_ + idc - 1 ], UR_(i) );
        break;
      case LAMBDA:
        lambda->set_ele( idc, UR_(i) );
        break;
    }
  }}
  // obj_d (This could be updated within QPKWIK in the future)
  if(obj_d) {
    // obj_d = g'*d + 1/2 * d' * G * g
    *obj_d = dot(g,*d) + 0.5 * transVtMtV(*d,G,BLAS_Cpp::no_trans,*d);
  }
  // Ed (This could be updated within QPKWIK in the future)
  if(Ed) {
    V_MtV( Ed, *E, trans_E, *d );
  }
  // Fd (This could be updated within QPKWIK in the future)
  if(Fd) {
    V_MtV( Fd, *F, trans_F, *d );
  }
  // Set the QP statistics
  QPSolverStats::ESolutionType solution_type;
  if( INF_ >= 0 ) {
    solution_type = QPSolverStats::OPTIMAL_SOLUTION;
  }
  else if( INF_ == -1 ) { // Infeasible constraints
    TEST_FOR_EXCEPTION(
      true, QPSolverRelaxed::Infeasible
      ,"QPSolverRelaxedQPKWIK::solve_qp(...) : Error, QP is infeasible" );
  }
  else if( INF_ == -2 ) { // LRW too small
    TEST_FOR_EXCEPT( !( INF_ != -2 ) );  // Local programming error?
  }
  else if( INF_ == -3 ) { // Max iterations exceeded
    solution_type = QPSolverStats::DUAL_FEASIBLE_POINT;
  }
  else {
    TEST_FOR_EXCEPT(true); // Unknown return value!
  }
  qp_stats_.set_stats(
    solution_type, QPSolverStats::CONVEX
    ,ITER_, NUM_ADDS_, NUM_DROPS_
    ,WARM_==1, *eta > 0.0 );

  return qp_stats_.solution_type();
}
bool MeritFunc_PenaltyParamsUpdateWithMult_AddedStep::do_step(Algorithm& _algo
  , poss_type step_poss, IterationPack::EDoStepType type
  , poss_type assoc_step_poss)
{
  using DenseLinAlgPack::norm_inf;

  NLPAlgo	&algo	= rsqp_algo(_algo);
  NLPAlgoState	&s		= algo.rsqp_state();
  
  EJournalOutputLevel olevel = algo.algo_cntr().journal_output_level();
  std::ostream& out = algo.track().journal_out();

  // print step header.
  if( static_cast<int>(olevel) >= static_cast<int>(PRINT_ALGORITHM_STEPS) ) {
    using IterationPack::print_algorithm_step;
    print_algorithm_step( algo, step_poss, type, assoc_step_poss, out );
  }

  MeritFuncPenaltyParams
    *params = dynamic_cast<MeritFuncPenaltyParams*>(&merit_func());
  if( !params ) {
    std::ostringstream omsg;
    omsg
      << "MeritFunc_PenaltyParamsUpdateWithMult_AddedStep::do_step(...), Error "
      << "The class " << typeName(&merit_func()) << " does not support the "
      << "MeritFuncPenaltyParams iterface\n";
    out << omsg.str();
    throw std::logic_error( omsg.str() );
  }

  MeritFuncNLPDirecDeriv
    *direc_deriv = dynamic_cast<MeritFuncNLPDirecDeriv*>(&merit_func());
  if( !direc_deriv ) {
    std::ostringstream omsg;
    omsg
      << "MeritFunc_PenaltyParamsUpdateWithMult_AddedStep::do_step(...), Error "
      << "The class " << typeName(&merit_func()) << " does not support the "
      << "MeritFuncNLPDirecDeriv iterface\n";
    out << omsg.str();
    throw std::logic_error( omsg.str() );
  }

  bool perform_update = true;

  if( s.mu().updated_k(0) ) {
    if( (int)olevel >= (int)PRINT_ALGORITHM_STEPS ) {
      out << "\nmu_k is already updated by someone else?\n";
    }
    const value_type mu_k = s.mu().get_k(0);
    if( mu_k == norm_inf_mu_last_ ) {
      if( (int)olevel >= (int)PRINT_ALGORITHM_STEPS ) {
        out << "\nmu_k " << mu_k << " == norm_inf_mu_last = " << norm_inf_mu_last_
          << "\nso we will take this as a signal to skip the update.\n";
      }
      perform_update = false;
    }
    else {
      if( (int)olevel >= (int)PRINT_ALGORITHM_STEPS ) {
        out << "\nmu_k " << mu_k << " != norm_inf_mu_last = " << norm_inf_mu_last_
          << "\nso we will ignore this and perform the update anyway.\n";
      }
    }		
  }
  if(perform_update) {

    if ( s.lambda().updated_k(0) ) {

      if( (int)olevel >= (int)PRINT_ALGORITHM_STEPS ) {
        out << "\nUpdate the penalty parameter...\n";
      }

      const DVector
        &lambda_k = s.lambda().get_k(0).cv();

      if( params->mu().size() != lambda_k.size() )
        params->resize( lambda_k.size() );
      DVectorSlice
        mu = params->mu();

      const value_type
        max_lambda	= norm_inf( lambda_k() ),
        mult_fact	= (1.0 + mult_factor_);

      if(near_solution_) {
        if( (int)olevel >= (int)PRINT_ALGORITHM_STEPS ) {
          out << "\nNear solution, forcing mu(j) >= mu_old(j)...\n";
        }
        DVector::const_iterator	lb_itr = lambda_k.begin();
        DVectorSlice::iterator	mu_itr = mu.begin();
        for( ; lb_itr != lambda_k.end(); ++mu_itr, ++ lb_itr )
          *mu_itr = max( max( *mu_itr, mult_fact * ::fabs(*lb_itr) ), small_mu_ );
      }
      else {
        if( (int)olevel >= (int)PRINT_ALGORITHM_STEPS ) {
          out << "\nNot near solution, allowing reduction in mu(j) ...\n";
        }
        DVector::const_iterator	lb_itr = lambda_k.begin();
        DVectorSlice::iterator	mu_itr = mu.begin();
        for( ; lb_itr != lambda_k.end(); ++mu_itr, ++ lb_itr ) {
          const value_type lb_j = ::fabs(*lb_itr);
          *mu_itr = max(
                  (3.0 * (*mu_itr) + lb_j) / 4.0	
                , max( mult_fact * lb_j, small_mu_ )
                );
        }
        value_type kkt_error = s.opt_kkt_err().get_k(0) + s.feas_kkt_err().get_k(0);
        if(kkt_error <= kkt_near_sol_) {
          if( (int)olevel >= (int)PRINT_ALGORITHM_STEPS ) {
            out << "\nkkt_error = " << kkt_error << " <= kkt_near_sol = "
                << kkt_near_sol_ << std::endl
              << "Switching to forcing mu_k >= mu_km1 in the future\n";
          }
          near_solution_ = true;
        }
      }

      // Force the ratio
      const value_type
          max_mu	= norm_inf( mu() ),
          min_mu	= min_mu_ratio_ * max_mu;
      for(DVectorSlice::iterator mu_itr = mu.begin(); mu_itr != mu.end(); ++mu_itr)
        *mu_itr = max( (*mu_itr), min_mu );	

      s.mu().set_k(0) = norm_inf_mu_last_ = max_mu;

      if( (int)olevel >= (int)PRINT_ALGORITHM_STEPS ) {
        out << "\nmax(|mu(j)|) = " << (*std::max_element( mu.begin(), mu.end() ))
          << "\nmin(|mu(j)|) = " << (*std::min_element( mu.begin(), mu.end() ))
            << std::endl;
      }

      if( (int)olevel >= (int)PRINT_VECTORS ) {
        out << "\nmu = \n" << mu;
      }
    }
    else {
      if( (int)olevel >= (int)PRINT_ALGORITHM_STEPS ) {
        out << "\nDon't have the info to update penalty parameter so just use the last updated...\n";
      }
    }
  }

  // In addition also compute the directional derivative
  direc_deriv->calc_deriv( s.Gf().get_k(0)(), s.c().get_k(0)(), s.d().get_k(0)() );

  if( (int)olevel >= (int)PRINT_ALGORITHM_STEPS ) {
    out << "\nmu_k = " << s.mu().get_k(0) << "\n";
  }

  return true;
}
void dense_Vp_StPtMtV(
  DVectorSlice                 *y
  ,value_type                 a
  ,const GenPermMatrixSlice   &P
  ,BLAS_Cpp::Transp           P_trans
  ,const M_t                  &M
  ,BLAS_Cpp::Transp           M_trans
  ,const V_t                  &x
  ,value_type                 b
  )
{
  using BLAS_Cpp::no_trans;
  using BLAS_Cpp::trans;
  using BLAS_Cpp::trans_not;
  using BLAS_Cpp::rows;
  using BLAS_Cpp::cols;
  using DenseLinAlgPack::dot;
  using DenseLinAlgPack::DVector;
  using DenseLinAlgPack::Vt_S;
  using AbstractLinAlgPack::dot;
  using AbstractLinAlgPack::Vp_StMtV;
  using AbstractLinAlgPack::GenPermMatrixSlice;
  typedef AbstractLinAlgPack::EtaVector eta;
  using Teuchos::Workspace;
  Teuchos::WorkspaceStore* wss = Teuchos::get_default_workspace_store().get();
  
  // Validate the sizes
  // 
  // y += op(P)*op(M)*x
  // 
  const DenseLinAlgPack::size_type
    ny = y->size(),
    nx = x.size(),
    opM_rows = rows( M.rows(), M.cols(), M_trans ),
    opM_cols = cols( M.rows(), M.cols(), M_trans );
  if(    ny != rows( P.rows(), P.cols(), P_trans )
       || nx != opM_cols
       || cols( P.rows(), P.cols(), P_trans ) != opM_rows )
    throw std::length_error( "MatrixOp::Vp_StPtMtV(...) : Error, "
      "sizes of arguments does not match up" );
  //
  // Compute y = b*y + a*op(P)*op(M)*x in a resonably efficient manner.  This
  // implementation will assume that M is stored as a dense matrix.  Either
  // t = op(M)*x is computed first (O(opM_rows*nx) flops) then y = b*y + a*op(P)*t
  // (O(ny) + O(P_nz) flops) or each row of t' = e(j)' * op(M) (O(nx) flops)
  // is computed one at a time and then y(i) = b*y(i) + a*t'*x (O(nx)  flops)
  // where op(P)(i,j) = 1.0.  In the latter case, there are P_nz rows
  // of op(M) that have to be generated so the total cost is O(P_nz*nx).
  // Therefore, we will do the former if opM_rows < P_nz and the latter otherwise.
  //
  if( !P.nz() ) {
    // y = b*y
    if(b==0.0)       *y = 0.0;
    else if(b!=1.0)  Vt_S(y,b);
  }
  else if( opM_rows > P.nz() || P.is_identity() ) {
    // t = op(M)*x
    Workspace<value_type> t_ws(wss,opM_rows);
    DVectorSlice t(&t_ws[0],t_ws.size());
    LinAlgOpPack::V_MtV( &t, M, M_trans, x );
    // y = b*y + a*op(P)*t
    Vp_StMtV( y, a, P, P_trans, t(), b );
  }
  else {
    // y = b*y
    if(b==0.0)       *y = 0.0;
    else if(b!=1.0)  Vt_S(y,b);
    // Compute t' = e(j)' * op(M) then y(i) += a*t'*x where op(P)(i,j) = 1.0
    Workspace<value_type> t_ws(wss,opM_cols);
    DVectorSlice t(&t_ws[0],t_ws.size());
    if( P.is_identity() ) {
      for( size_type k = 1; k <= P.nz(); ++k ) {
        const size_type
          i = k,
          j = k;
        // t = op(M') * e(j)			
        LinAlgOpPack::V_MtV( &t, M, trans_not(M_trans), eta(j,opM_rows)() );
        // y(i) += a*t'*x
        (*y)(i) += a * dot( t(), x );
      }
    }
    else {
      for( GenPermMatrixSlice::const_iterator itr = P.begin(); itr != P.end(); ++itr ) {
        const DenseLinAlgPack::size_type
          i = P_trans == no_trans ? itr->row_i() : itr->col_j(),
          j = P_trans == no_trans ? itr->col_j() : itr->row_i();
        // t = op(M') * e(j)			
        LinAlgOpPack::V_MtV( &t, M, trans_not(M_trans), eta(j,opM_rows)() );
        // y(i) += a*t'*x
        (*y)(i) += a * dot( t(), x );
      }
    }
  }
}
bool ReducedHessianSecantUpdateLPBFGS_Strategy::perform_update(
  DVectorSlice* s_bfgs, DVectorSlice* y_bfgs, bool first_update
  ,std::ostream& out, EJournalOutputLevel olevel, NLPAlgo *algo, NLPAlgoState *s
  ,MatrixOp *rHL_k
  )
{
  using std::setw;
  using std::endl;
  using std::right;
  using Teuchos::dyn_cast;
  namespace rcp = MemMngPack;
  using Teuchos::RCP;
  using LinAlgOpPack::V_MtV;
  using DenseLinAlgPack::dot;
  using AbstractLinAlgPack::norm_inf;
  using AbstractLinAlgPack::transVtMtV;
  typedef ConstrainedOptPack::MatrixHessianSuperBasic MHSB_t;
  using Teuchos::Workspace;
  Teuchos::WorkspaceStore* wss = Teuchos::get_default_workspace_store().get();

  if( static_cast<int>(olevel) >= static_cast<int>(PRINT_ALGORITHM_STEPS) ) {
    out << "\n*** (LPBFGS) Full limited memory BFGS to projected BFGS ...\n";
  }

#ifdef _WINDOWS
  MHSB_t &rHL_super = dynamic_cast<MHSB_t&>(*rHL_k);
#else
  MHSB_t &rHL_super = dyn_cast<MHSB_t>(*rHL_k);
#endif

  const size_type
    n    = algo->nlp().n(),
    r    = algo->nlp().r(),
    n_pz = n-r;

  bool do_projected_rHL_RR = false;

  // See if we still have a limited memory BFGS update matrix
  RCP<MatrixSymPosDefLBFGS> // We don't want this to be deleted until we are done with it
    lbfgs_rHL_RR = Teuchos::rcp_const_cast<MatrixSymPosDefLBFGS>(
      Teuchos::rcp_dynamic_cast<const MatrixSymPosDefLBFGS>(rHL_super.B_RR_ptr()) );

  if( lbfgs_rHL_RR.get() && rHL_super.Q_R().is_identity()  ) {
    //
    // We have a limited memory BFGS matrix and have not started projected BFGS updating
    // yet so lets determine if it is time to consider switching
    //
    // Check that the current update is sufficiently p.d. before we do anything
    const value_type
      sTy = dot(*s_bfgs,*y_bfgs),
      yTy = dot(*y_bfgs,*y_bfgs);
    if( !ConstrainedOptPack::BFGS_sTy_suff_p_d(
      *s_bfgs,*y_bfgs,&sTy
      ,  int(olevel) >= int(PRINT_ALGORITHM_STEPS) ? &out : NULL )
      && !proj_bfgs_updater().bfgs_update().use_dampening()
      )
    {
      if( static_cast<int>(olevel) >= static_cast<int>(PRINT_ALGORITHM_STEPS) ) {
        out	<< "\nWarning!  use_damening == false so there is no way we can perform any"
            " kind of BFGS update (projected or not) so we will skip it!\n";
      }
      quasi_newton_stats_(*s).set_k(0).set_updated_stats(
        QuasiNewtonStats::INDEF_SKIPED );
      return true;
    }
    // Consider if we can even look at the active set yet.
    const bool consider_switch  = lbfgs_rHL_RR->num_secant_updates() >= min_num_updates_proj_start();
    if( static_cast<int>(olevel) >= static_cast<int>(PRINT_ALGORITHM_STEPS) ) {
      out << "\nnum_previous_updates = " << lbfgs_rHL_RR->num_secant_updates()
        << ( consider_switch ? " >= " : " < " )
        << "min_num_updates_proj_start = " << min_num_updates_proj_start()
        << ( consider_switch
           ? "\nConsidering if we should switch to projected BFGS updating of superbasics ...\n"
           : "\nNot time to consider switching to projected BFGS updating of superbasics yet!" );
    }
    if( consider_switch ) {
      // 
      // Our job here is to determine if it is time to switch to projected projected
      // BFGS updating.
      //
      if( act_set_stats_(*s).updated_k(-1) ) {
        if( static_cast<int>(olevel) >= static_cast<int>(PRINT_ALGORITHM_STEPS) ) {
          out	<< "\nDetermining if projected BFGS updating of superbasics should begin ...\n";
        }
        // Determine if the active set has calmed down enough
        const SpVector
          &nu_km1 = s->nu().get_k(-1);
        const SpVectorSlice
          nu_indep = nu_km1(s->var_indep());
        const bool 
          act_set_calmed_down
          = PBFGSPack::act_set_calmed_down(
            act_set_stats_(*s).get_k(-1)
            ,proj_bfgs_updater().act_set_frac_proj_start()
            ,olevel,out
            ),
          max_num_updates_exceeded
          = lbfgs_rHL_RR->m_bar() >= max_num_updates_proj_start();
        do_projected_rHL_RR = act_set_calmed_down || max_num_updates_exceeded;
        if( static_cast<int>(olevel) >= static_cast<int>(PRINT_ALGORITHM_STEPS) ) {
          if( act_set_calmed_down ) {
            out << "\nThe active set has calmed down enough so lets further consider switching to\n"
              << "projected BFGS updating of superbasic variables ...\n";
          }
          else if( max_num_updates_exceeded ) {
            out << "\nThe active set has not calmed down enough but num_previous_updates = "
              << lbfgs_rHL_RR->m_bar() << " >= max_num_updates_proj_start = "	<< max_num_updates_proj_start()
              << "\nso we will further consider switching to projected BFGS updating of superbasic variables ...\n";
          }
          else {
            out << "\nIt is not time to switch to projected BFGS so just keep performing full limited memory BFGS for now ...\n";
          }
        }
        if( do_projected_rHL_RR ) {
          //
          // Determine the set of initially fixed and free independent variables.
          //
          typedef Workspace<size_type>                              i_x_t;
          typedef Workspace<ConstrainedOptPack::EBounds>   bnd_t;
          i_x_t   i_x_free(wss,n_pz);
          i_x_t   i_x_fixed(wss,n_pz);
          bnd_t   bnd_fixed(wss,n_pz);
          i_x_t   l_x_fixed_sorted(wss,n_pz);
          size_type n_pz_X = 0, n_pz_R = 0;
          // rHL = rHL_scale * I
          value_type
            rHL_scale = sTy > 0.0 ? yTy/sTy : 1.0;
          if( static_cast<int>(olevel) >= static_cast<int>(PRINT_ALGORITHM_STEPS) ) {
            out	<< "\nScaling for diagonal intitial rHL = rHL_scale*I, rHL_scale = " << rHL_scale << std::endl;
          }
          value_type sRTBRRsR = 0.0, sRTyR = 0.0, sXTBXXsX = 0.0, sXTyX = 0.0;
          // Get the elements in i_x_free[] for variables that are definitely free
          // and initialize s_R'*B_RR*s_R and s_R'*y_R
          PBFGSPack::init_i_x_free_sRTsR_sRTyR(
            nu_indep, *s_bfgs, *y_bfgs
            , &n_pz_R, &i_x_free[0], &sRTBRRsR, &sRTyR );
          sRTBRRsR *= rHL_scale;
          Workspace<value_type> rHL_XX_diag_ws(wss,nu_indep.nz());
          DVectorSlice rHL_XX_diag(&rHL_XX_diag_ws[0],rHL_XX_diag_ws.size());
          rHL_XX_diag = rHL_scale;
          // Sort fixed variables according to |s_X(i)^2*B_XX(i,i)|/|sRTBRRsR| + |s_X(i)*y_X(i)|/|sRTyR|
          // and initialize s_X'*B_XX*s_X and s_X*y_X
          PBFGSPack::sort_fixed_max_cond_viol(
            nu_indep,*s_bfgs,*y_bfgs,rHL_XX_diag,sRTBRRsR,sRTyR
            ,&sXTBXXsX,&sXTyX,&l_x_fixed_sorted[0]);
          // Pick initial set of i_x_free[] and i_x_fixed[] (sorted!)
          PBFGSPack::choose_fixed_free(
            proj_bfgs_updater().project_error_tol()
            ,proj_bfgs_updater().super_basic_mult_drop_tol(),nu_indep
            ,*s_bfgs,*y_bfgs,rHL_XX_diag,&l_x_fixed_sorted[0]
            ,olevel,out,&sRTBRRsR,&sRTyR,&sXTBXXsX,&sXTyX
            ,&n_pz_X,&n_pz_R,&i_x_free[0],&i_x_fixed[0],&bnd_fixed[0] );
          if( n_pz_R < n_pz ) {
            //
            // We are ready to transition to projected BFGS updating!
            //
            // Determine if we are be using dense or limited memory BFGS?
            const bool
              low_num_super_basics = n_pz_R <= num_superbasics_switch_dense();
            if( static_cast<int>(olevel) >= static_cast<int>(PRINT_BASIC_ALGORITHM_INFO) ) {
              out	<< "\nSwitching to projected BFGS updating ..."
                << "\nn_pz_R = " << n_pz_R << ( low_num_super_basics ? " <= " : " > " )
                << " num_superbasics_switch_dense = " << num_superbasics_switch_dense()
                << ( low_num_super_basics
                   ? "\nThere are not too many superbasic variables so use dense projected BFGS ...\n"
                   : "\nThere are too many superbasic variables so use limited memory projected BFGS ...\n"
                  );
            }
            // Create new matrix to use for rHL_RR initialized to rHL_RR = rHL_scale*I
            RCP<MatrixSymSecant>
              rHL_RR = NULL;
            if( low_num_super_basics ) {
              rHL_RR = new MatrixSymPosDefCholFactor(
                NULL    // Let it allocate its own memory
                ,NULL   // ...
                ,n_pz   // maximum size
                ,lbfgs_rHL_RR->maintain_original()
                ,lbfgs_rHL_RR->maintain_inverse()
                );
            }
            else {
              rHL_RR = new MatrixSymPosDefLBFGS(
                n_pz, lbfgs_rHL_RR->m()
                ,lbfgs_rHL_RR->maintain_original()
                ,lbfgs_rHL_RR->maintain_inverse()
                ,lbfgs_rHL_RR->auto_rescaling()
                );
            }
            rHL_RR->init_identity( n_pz_R, rHL_scale );
            if( static_cast<int>(olevel) >= static_cast<int>(PRINT_ITERATION_QUANTITIES) ) {
              out << "\nrHL_RR after intialized to rHL_RR = rHL_scale*I but before performing current BFGS update:\nrHL_RR =\n"
                << dynamic_cast<MatrixOp&>(*rHL_RR); // I know this is okay!
            }
            // Initialize rHL_XX = rHL_scale*I
#ifdef _WINDOWS
            MatrixSymInitDiag
              &rHL_XX = dynamic_cast<MatrixSymInitDiag&>(
                const_cast<MatrixSymOp&>(*rHL_super.B_XX_ptr()));
#else
            MatrixSymInitDiag
              &rHL_XX = dyn_cast<MatrixSymInitDiag>(
                const_cast<MatrixSymOp&>(*rHL_super.B_XX_ptr()));
#endif
            rHL_XX.init_identity( n_pz_X, rHL_scale );
            // Reinitialize rHL
            rHL_super.initialize(
              n_pz, n_pz_R, &i_x_free[0], &i_x_fixed[0], &bnd_fixed[0]
              ,Teuchos::rcp_const_cast<const MatrixSymWithOpFactorized>(
                Teuchos::rcp_dynamic_cast<MatrixSymWithOpFactorized>(rHL_RR))
              ,NULL,BLAS_Cpp::no_trans,rHL_super.B_XX_ptr()
              );
            //
            // Perform the current BFGS update first
            //
            MatrixSymOp
              &rHL_RR_op = dynamic_cast<MatrixSymOp&>(*rHL_RR);
            const GenPermMatrixSlice
              &Q_R = rHL_super.Q_R(),
              &Q_X = rHL_super.Q_X();
            // Get projected BFGS update vectors y_bfgs_R, s_bfgs_R
            Workspace<value_type>
              y_bfgs_R_ws(wss,Q_R.cols()),
              s_bfgs_R_ws(wss,Q_R.cols()),
              y_bfgs_X_ws(wss,Q_X.cols()),
              s_bfgs_X_ws(wss,Q_X.cols());
            DVectorSlice y_bfgs_R(&y_bfgs_R_ws[0],y_bfgs_R_ws.size());
            DVectorSlice s_bfgs_R(&s_bfgs_R_ws[0],s_bfgs_R_ws.size());
            DVectorSlice y_bfgs_X(&y_bfgs_X_ws[0],y_bfgs_X_ws.size());
            DVectorSlice s_bfgs_X(&s_bfgs_X_ws[0],s_bfgs_X_ws.size());
            V_MtV( &y_bfgs_R, Q_R, BLAS_Cpp::trans, *y_bfgs );  // y_bfgs_R = Q_R'*y_bfgs
            V_MtV( &s_bfgs_R, Q_R, BLAS_Cpp::trans, *s_bfgs );  // s_bfgs_R = Q_R'*s_bfgs
            V_MtV( &y_bfgs_X, Q_X, BLAS_Cpp::trans, *y_bfgs );  // y_bfgs_X = Q_X'*y_bfgs
            V_MtV( &s_bfgs_X, Q_X, BLAS_Cpp::trans, *s_bfgs );  // s_bfgs_X = Q_X'*s_bfgs
            // Update rHL_RR
            if( static_cast<int>(olevel) >= static_cast<int>(PRINT_ALGORITHM_STEPS) ) {
              out << "\nPerform current BFGS update on " << n_pz_R << " x " << n_pz_R << " projected "
                << "reduced Hessian for the superbasic variables where B = rHL_RR...\n";
            }
            QuasiNewtonStats quasi_newton_stats;
            proj_bfgs_updater().bfgs_update().perform_update(
              &s_bfgs_R(),&y_bfgs_R(),false,out,olevel,algo->algo_cntr().check_results()
              ,&rHL_RR_op, &quasi_newton_stats );
            // Perform previous updates if possible
            if( lbfgs_rHL_RR->m_bar() ) {
              const size_type num_add_updates = std::_MIN(num_add_recent_updates(),lbfgs_rHL_RR->m_bar());
              if( static_cast<int>(olevel) >= static_cast<int>(PRINT_ALGORITHM_STEPS) ) {
                out	<< "\nAdd the min(num_previous_updates,num_add_recent_updates) = min(" << lbfgs_rHL_RR->m_bar()
                  << "," << num_add_recent_updates() << ") = " << num_add_updates << " most recent BFGS updates if possible ...\n";
              }
              // Now add previous update vectors
              const value_type
                project_error_tol = proj_bfgs_updater().project_error_tol();
              const DMatrixSlice
                S = lbfgs_rHL_RR->S(),
                Y = lbfgs_rHL_RR->Y();
              size_type k = lbfgs_rHL_RR->k_bar();  // Location in S and Y of most recent update vectors
              for( size_type l = 1; l <= num_add_updates; ++l, --k ) {
                if(k == 0) k = lbfgs_rHL_RR->m_bar();  // see MatrixSymPosDefLBFGS
                // Check to see if this update satisfies the required conditions.
                // Start with the condition on s'*y since this are cheap to check.
                V_MtV( &s_bfgs_X(), Q_X, BLAS_Cpp::trans, S.col(k) ); // s_bfgs_X = Q_X'*s_bfgs
                V_MtV( &y_bfgs_X(), Q_X, BLAS_Cpp::trans, Y.col(k) ); // y_bfgs_X = Q_X'*y_bfgs
                sRTyR    = dot( s_bfgs_R, y_bfgs_R );
                sXTyX    = dot( s_bfgs_X, y_bfgs_X );
                bool
                  sXTyX_cond    = ::fabs(sXTyX/sRTyR) <= project_error_tol,
                  do_update     = sXTyX_cond,
                  sXTBXXsX_cond = false;
                if( sXTyX_cond ) {
                  // Check the second more expensive condition
                  V_MtV( &s_bfgs_R(), Q_R, BLAS_Cpp::trans, S.col(k) ); // s_bfgs_R = Q_R'*s_bfgs
                  V_MtV( &y_bfgs_R(), Q_R, BLAS_Cpp::trans, Y.col(k) ); // y_bfgs_R = Q_R'*y_bfgs
                  sRTBRRsR = transVtMtV( s_bfgs_R, rHL_RR_op, BLAS_Cpp::no_trans, s_bfgs_R );
                  sXTBXXsX = rHL_scale * dot( s_bfgs_X, s_bfgs_X );
                  sXTBXXsX_cond = sXTBXXsX/sRTBRRsR <= project_error_tol;
                  do_update     = sXTBXXsX_cond && sXTyX_cond;
                }
                if( static_cast<int>(olevel) >= static_cast<int>(PRINT_ALGORITHM_STEPS) ) {
                  out << "\n---------------------------------------------------------------------"
                    << "\nprevious update " << l
                    << "\n\nChecking projection error:\n"
                    << "\n|s_X'*y_X|/|s_R'*y_R| = |" << sXTyX << "|/|" << sRTyR
                    << "| = " << ::fabs(sXTyX/sRTyR)
                    << ( sXTyX_cond ? " <= " : " > " ) << " project_error_tol = "
                    << project_error_tol;
                  if( sXTyX_cond ) {
                    out	<< "\n(s_X'*rHL_XX*s_X/s_R'*rHL_RR*s_R) = (" << sXTBXXsX
                        << ") = " << (sXTBXXsX/sRTBRRsR)
                        << ( sXTBXXsX_cond ? " <= " : " > " ) << " project_error_tol = "
                        << project_error_tol;
                  }
                  out << ( do_update
                    ? "\n\nAttemping to add this previous update where B = rHL_RR ...\n"
                    : "\n\nCan not add this previous update ...\n" );
                }
                if( do_update ) {
                  // ( rHL_RR, s_bfgs_R, y_bfgs_R ) -> rHL_RR (this should not throw an exception!)
                  try {
                    proj_bfgs_updater().bfgs_update().perform_update(
                      &s_bfgs_R(),&y_bfgs_R(),false,out,olevel,algo->algo_cntr().check_results()
                      ,&rHL_RR_op, &quasi_newton_stats );
                  }
                  catch( const MatrixSymSecant::UpdateSkippedException& excpt ) {
                    if( static_cast<int>(olevel) >= static_cast<int>(PRINT_ALGORITHM_STEPS) ) {
                      out	<< "\nOops!  The " << l << "th most recent BFGS update was rejected?:\n"
                        << excpt.what() << std::endl;
                    }
                  }
                }
              }
              if( static_cast<int>(olevel) >= static_cast<int>(PRINT_ALGORITHM_STEPS) ) {
                out << "\n---------------------------------------------------------------------\n";
                }
              if( static_cast<int>(olevel) >= static_cast<int>(PRINT_ITERATION_QUANTITIES) ) {
                out << "\nrHL_RR after adding previous BFGS updates:\nrHL_BRR =\n"
                  << dynamic_cast<MatrixOp&>(*rHL_RR);
              }
            }
            else {
              if( static_cast<int>(olevel) >= static_cast<int>(PRINT_ALGORITHM_STEPS) ) {
                out	<< "\nThere were no previous update vectors to add!\n";
              }
            }
            if( static_cast<int>(olevel) >= static_cast<int>(PRINT_ITERATION_QUANTITIES) ) {
              out << "\nFull rHL after complete reinitialization:\nrHL =\n" << *rHL_k;
            }
            quasi_newton_stats_(*s).set_k(0).set_updated_stats(
              QuasiNewtonStats::REINITIALIZED );
            return true;
          }
          else {
            if( static_cast<int>(olevel) >= static_cast<int>(PRINT_ALGORITHM_STEPS) ) {
              out << "\nn_pz_R == n_pz = " << n_pz_R << ", No variables would be removed from "
                << "the superbasis so just keep on performing limited memory BFGS for now ...\n";
            }
            do_projected_rHL_RR = false;
          }
        }
      }
    }
    // If we have not switched to PBFGS then just update the full limited memory BFGS matrix
    if(!do_projected_rHL_RR) {
      if( static_cast<int>(olevel) >= static_cast<int>(PRINT_ALGORITHM_STEPS) ) {
        out << "\nPerform current BFGS update on " << n_pz << " x " << n_pz << " full "
          << "limited memory reduced Hessian B = rHL ...\n";
      }
      proj_bfgs_updater().bfgs_update().perform_update(
        s_bfgs,y_bfgs,first_update,out,olevel,algo->algo_cntr().check_results()
        ,lbfgs_rHL_RR.get()
        ,&quasi_newton_stats_(*s).set_k(0)
        );
      return true;
    }
  }
  else {
    if( static_cast<int>(olevel) >= static_cast<int>(PRINT_ALGORITHM_STEPS) ) {
      out	<< "\nWe have already switched to projected BFGS updating ...\n";
    }
  }
  //
  // If we get here then we must have switched to
  // projected updating so lets just pass it on!
  //
  return proj_bfgs_updater().perform_update(
    s_bfgs,y_bfgs,first_update,out,olevel,algo,s,rHL_k);
}
void MatrixSymPosDefLBFGS::update_Q() const
{
  using DenseLinAlgPack::tri;
  using DenseLinAlgPack::tri_ele;
  using DenseLinAlgPack::Mp_StM;

  //
  // We need update the factorizations to solve for:
  //
  // x = inv(Q) * y
  //
  //	[ y1 ]	=	[ (1/gk)*S'S	 L	] * [ x1 ]
  //	[ y2 ]		[      L'		-D	]   [ x2 ]
  //
  // We will solve the above system using the schur complement:
  //
  // C = (1/gk)*S'S + L*inv(D)*L'
  //
  // According to the referenced paper, C is p.d. so:
  //
  // C = J*J'
  //
  // We then compute the solution as:
  //
  // x1 = inv(C) * ( y1 + L*inv(D)*y2 )
  // x2 = - inv(D) * ( y2 - L'*x1 )
  //
  // Therefore we will just update the factorization C = J*J'
  //

  // Form the upper triangular part of C which will become J
  // which we are using storage of QJ

  if( QJ_.rows() < m_ )
    QJ_.resize( m_, m_ );

  const size_type
    mb = m_bar_;

  DMatrixSlice
    C = QJ_(1,mb,1,mb);

  // C = L * inv(D) * L'
  //
  // Here L is a strictly lower triangular (zero diagonal) matrix where:
  //
  // L = [ 0  0 ]
  //     [ Lb 0 ]
  //
  // Lb is lower triangular (nonzero diagonal)
  //
  // Therefore we compute L*inv(D)*L' as:
  //
  // C = [ 0	0 ] * [ Db  0  ] * [ 0  Lb' ]
  //	   [ Lb 0 ]   [ 0   db ]   [ 0   0  ]
  //
  //   = [ 0  0  ] = [ 0      0     ]
  //     [ 0  Cb ]   [ 0  Lb*Db*Lb' ]
  //
  // We need to compute the upper triangular part of Cb.

  C.row(1) = 0.0;
  if( mb > 1 )
    comp_Cb( STY_(2,mb,1,mb-1), STY_.diag(0)(1,mb-1), &C(2,mb,2,mb) );

  // C += (1/gk)*S'S

  const DMatrixSliceSym &STS = this->STS();
  Mp_StM( &C, (1/gamma_k_), tri( STS.gms(), STS.uplo(), BLAS_Cpp::nonunit )
    , BLAS_Cpp::trans );

  // Now perform a cholesky factorization of C
  // After this factorization the upper triangular part of QJ
  // (through C) will contain the cholesky factor.

  DMatrixSliceTriEle C_upper = tri_ele( C, BLAS_Cpp::upper );
  try {
    DenseLinAlgLAPack::potrf( &C_upper );
  }
  catch( const DenseLinAlgLAPack::FactorizationException &fe ) {
    TEUCHOS_TEST_FOR_EXCEPTION(
      true, UpdateFailedException
      ,"Error, the factorization of Q which should be s.p.d. failed with"
      " the error message: {" << fe.what() << "}";
      );
  }
void MatrixSymPosDefLBFGS::V_InvMtV(
  VectorMutable* y, BLAS_Cpp::Transp trans_rhs1
  , const Vector& x
  ) const
{
  using AbstractLinAlgPack::Vp_StMtV;
  using DenseLinAlgPack::V_InvMtV;
  using LinAlgOpPack::V_mV;
  using LinAlgOpPack::V_StV;
  using LinAlgOpPack::Vp_V;
  using LinAlgOpPack::V_MtV;
  using LinAlgOpPack::V_StMtV;
  using LinAlgOpPack::Vp_MtV;
  using DenseLinAlgPack::Vp_StMtV;
  typedef VectorDenseEncap         vde;
  typedef VectorDenseMutableEncap  vdme;
  using Teuchos::Workspace;
  Teuchos::WorkspaceStore* wss = Teuchos::get_default_workspace_store().get();

  assert_initialized();

  TEUCHOS_TEST_FOR_EXCEPT( !(  inverse_is_updated_  ) ); // For now just always update

  // y = inv(Bk) * x = Hk * x
  //
  // = gk*x + [S gk*Y] * [ inv(R')*(D+gk*Y'Y)*inv(R)     -inv(R') ] * [   S'  ] * x
  //                     [            -inv(R)                0    ]   [ gk*Y' ]
  //
  // Perform in the following (in order):
  //
  // y = gk*x
  //
  // t1 = [   S'*x  ]					<: R^(2*m)
  //      [ gk*Y'*x ]
  //
  // t2 = inv(R) * t1(1:m)			<: R^(m)
  //
  // t3 = - inv(R') * t1(m+1,2*m)		<: R^(m)
  //
  // t4 = gk * Y'Y * t2				<: R^(m)
  //
  // t4 += D*t2
  //
  // t5 = inv(R') * t4				<: R^(m)
  //
  // t5 += t3
  //
  // y += S*t5
  //
  // y += -gk*Y*t2

  // y = gk*x
  V_StV( y, gamma_k_, x );

  const size_type
    mb = m_bar_;	
  
  if( !mb )
    return;	// No updates have been performed.

  const multi_vec_ptr_t
    S = this->S(),
    Y = this->Y();

  // Get workspace

  Workspace<value_type>    t1_ws(wss,2*mb);
  DVectorSlice                   t1(&t1_ws[0],t1_ws.size());
  Workspace<value_type>    t2_ws(wss,mb);
  DVectorSlice                   t2(&t2_ws[0],t2_ws.size());
  Workspace<value_type>    t3_ws(wss,mb);
  DVectorSlice                   t3(&t3_ws[0],t3_ws.size());
  Workspace<value_type>    t4_ws(wss,mb);
  DVectorSlice                   t4(&t4_ws[0],t4_ws.size());
  Workspace<value_type>    t5_ws(wss,mb);
  DVectorSlice                   t5(&t5_ws[0],t5_ws.size());

  VectorSpace::vec_mut_ptr_t
    t = S->space_rows().create_member();

  const DMatrixSliceTri
    &R = this->R();

  const DMatrixSliceSym
    &YTY = this->YTY();

  // t1 = [   S'*x  ]
  //      [ gk*Y'*x ]
  V_MtV( t.get(), *S, BLAS_Cpp::trans, x );
  t1(1,mb) = vde(*t)();
  V_StMtV( t.get(), gamma_k_, *Y, BLAS_Cpp::trans, x );
  t1(mb+1,2*mb) = vde(*t)();

  // t2 = inv(R) * t1(1:m)
  V_InvMtV( &t2, R, BLAS_Cpp::no_trans, t1(1,mb) );

  // t3 = - inv(R') * t1(m+1,2*m)
  V_mV( &t3, t1(mb+1,2*mb) );
  V_InvMtV( &t3, R, BLAS_Cpp::trans, t3 );

  // t4 = gk * Y'Y * t2
  V_StMtV( &t4, gamma_k_, YTY, BLAS_Cpp::no_trans, t2 );

  // t4 += D*t2
  Vp_DtV( &t4, t2 );

  // t5 = inv(R') * t4
  V_InvMtV( &t5, R, BLAS_Cpp::trans, t4 );

  // t5 += t3
  Vp_V( &t5, t3 );

  // y += S*t5
  (vdme(*t)() = t5);
  Vp_MtV( y, *S, BLAS_Cpp::no_trans, *t );

  // y += -gk*Y*t2
  (vdme(*t)() = t2);
  Vp_StMtV( y, -gamma_k_, *Y, BLAS_Cpp::no_trans, *t );

}