/** evaluate everything before optimization */
  void 
  QMCCostFunctionSingle::checkConfigurations() {

    dG.resize(W.getTotalNum());
    dL.resize(W.getTotalNum());
    int numLocWalkers=W.getActiveWalkers();
    Records.resize(numLocWalkers,6);

    typedef MCWalkerConfiguration::Walker_t Walker_t;
    MCWalkerConfiguration::iterator it(W.begin()); 
    MCWalkerConfiguration::iterator it_end(W.end()); 
    int nat = W.getTotalNum();
    int iw=0;
    int totalElements=W.getTotalNum()*OHMMS_DIM;
    Etarget=0.0;
    while(it != it_end) {

      Walker_t& thisWalker(**it);

      //clean-up DataSet to save re-used values
      thisWalker.DataSet.clear();
      //rewind the counter
      thisWalker.DataSet.rewind();
      //MCWalkerConfiguraton::registerData add distance-table data
      W.registerData(thisWalker,thisWalker.DataSet);

      Return_t*  saved=Records[iw];
#if defined(QMC_COMPLEX)
      app_error() << " Optimization is not working with complex wavefunctions yet" << endl;
      app_error() << "  Needs to fix TrialWaveFunction::evaluateDeltaLog " << endl;
      Psi.evaluateDeltaLog(W, saved[LOGPSI_FIXED], saved[LOGPSI_FREE], dG, dL);
      thisWalker.DataSet.add(&(dG[0][0]),&(dG[0][0])+totalElements);
#else
      Psi.evaluateDeltaLog(W, saved[LOGPSI_FIXED], saved[LOGPSI_FREE], thisWalker.Drift, dL);
#endif
      thisWalker.DataSet.add(dL.first_address(),dL.last_address());
      Etarget += saved[ENERGY_TOT] = H.evaluate(W);
      saved[ENERGY_FIXED] = H.getInvariantEnergy();

      ++it;
      ++iw;
    }

    //Need to sum over the processors
    vector<Return_t> etemp(2);
    etemp[0]=Etarget;
    etemp[1]=static_cast<Return_t>(numLocWalkers);
    myComm->allreduce(etemp);
    Etarget = static_cast<Return_t>(etemp[0]/etemp[1]);
    NumSamples = static_cast<int>(etemp[1]);

    setTargetEnergy(Etarget);

    ReportCounter=0;
  }
Esempio n. 2
0
/** evaluate everything before optimization */
void QMCCostFunctionOMP::checkConfigurations()
{
  /* mmorales:
     Since there are cases when memory is an issue (too many dets), the use of a buffer
     is decoupled from the use of includeNonlocalH in the cost function. Without a buffer,
     everything is recalculated.
  Options:
  - "yes" or "all"  : store everything
  - "minimum"       : store orbitals and inverses only, recalculate dets
  FIX FIX FIX: right now, there is no way to include the nonlocalH in the cost function
  without using a buffer because evaluateLog needs to be called to get the inverse of psiM
  This implies that isOptimizable must be set to true, which is risky. Fix this somehow
  */
  StoreDerivInfo=false;
  DerivStorageLevel=-1;
  if(usebuffer == "yes" || usebuffer == "all")
  {
    StoreDerivInfo=true;
    if(includeNonlocalH!="no")
      DerivStorageLevel=0;
    else
      DerivStorageLevel=1;
    app_log() <<"Using buffers for temporary storage in QMCCostFunction.\n" <<endl;
  }
  else if (usebuffer == "minimum")
  {
    StoreDerivInfo=true;
    // in this case the use of nonlocalH is irrelevant, since the same inf is enough for both cases
    DerivStorageLevel=2;
    app_log() <<"Using minimum storage for determinant evaluation. \n";
  }
  else
  {
    if(includeNonlocalH!="no")
    {
      APP_ABORT("Need to enable the use of includeNonlocalH=='name' without a buffer.");
    }
  }
  int numW = 0;
  for(int i=0; i<wClones.size(); i++)
    numW += wClones[i]->getActiveWalkers();
  app_log() <<"Memory usage: " <<endl;
  app_log() <<"Linear method (approx matrix usage: 4*N^2): " <<NumParams()*NumParams()*sizeof(QMCTraits::RealType)*4.0/1.0e6  <<" MB" <<endl; // assuming 4 matrices
  app_log() <<"Deriv,HDerivRecord:      " <<numW*NumOptimizables*sizeof(QMCTraits::RealType)*3.0/1.0e6 <<" MB" <<endl;
  if(StoreDerivInfo)
  {
    MCWalkerConfiguration& dummy(*wClones[0]);
    long memorb=0,meminv=0,memdets=0,memorbs_only=0;
    Psi.memoryUsage_DataForDerivatives(dummy,memorbs_only,memorb,meminv,memdets);
    memorbs_only*=sizeof(QMCTraits::RealType);
    memorb*=sizeof(QMCTraits::RealType);
    meminv*=sizeof(QMCTraits::RealType);
    memdets*=sizeof(QMCTraits::RealType);
    app_log() <<"Buffer memory cost:     MB/walker       MB/total " <<endl;
    app_log() <<"Orbitals only:           " <<memorbs_only/1.0e6 <<"      " <<memorbs_only*numW/1.0e6 <<endl;
    app_log() <<"Orbitals + dervs:        " <<memorb/1.0e6 <<"      " <<memorb*numW/1.0e6 <<endl;
    app_log() <<"Inverse:                 " <<meminv/1.0e6 <<"      " <<meminv*numW/1.0e6 <<endl;
    app_log() <<"Determinants:            " <<memdets/1.0e6 <<"      " <<memdets*numW/1.0e6 <<endl;
  }

  app_log().flush();

  RealType et_tot=0.0;
  RealType e2_tot=0.0;
#pragma omp parallel reduction(+:et_tot,e2_tot)
  {
    int ip = omp_get_thread_num();
    MCWalkerConfiguration& wRef(*wClones[ip]);
    if (RecordsOnNode[ip] ==0)
    {
      RecordsOnNode[ip]=new Matrix<Return_t>;
      RecordsOnNode[ip]->resize(wRef.getActiveWalkers(),SUM_INDEX_SIZE);
      if (needGrads)
      {
        DerivRecords[ip]=new Matrix<Return_t>;
        DerivRecords[ip]->resize(wRef.getActiveWalkers(),NumOptimizables);
        HDerivRecords[ip]=new Matrix<Return_t>;
        HDerivRecords[ip]->resize(wRef.getActiveWalkers(),NumOptimizables);
      }
    }
    else if (RecordsOnNode[ip]->size1()!=wRef.getActiveWalkers())
    {
      RecordsOnNode[ip]->resize(wRef.getActiveWalkers(),SUM_INDEX_SIZE);
      if (needGrads)
      {
        DerivRecords[ip]->resize(wRef.getActiveWalkers(),NumOptimizables);
        HDerivRecords[ip]->resize(wRef.getActiveWalkers(),NumOptimizables);
      }
    }
    QMCHamiltonianBase* nlpp = (includeNonlocalH =="no")?  0: hClones[ip]->getHamiltonian(includeNonlocalH.c_str());
    //set the optimization mode for the trial wavefunction
    psiClones[ip]->startOptimization();
    //    synchronize the random number generator with the node
    (*MoverRng[ip]) = (*RngSaved[ip]);
    hClones[ip]->setRandomGenerator(MoverRng[ip]);
    //int nat = wRef.getTotalNum();
    //int totalElements=W.getTotalNum()*OHMMS_DIM;
    typedef MCWalkerConfiguration::Walker_t Walker_t;
    Return_t e0=0.0;
    //       Return_t ef=0.0;
    Return_t e2=0.0;
    for (int iw=0, iwg=wPerNode[ip]; iw<wRef.getActiveWalkers(); ++iw,++iwg)
    {
      ParticleSet::Walker_t& thisWalker(*wRef[iw]);
      wRef.R=thisWalker.R;
      wRef.update();
      Return_t* restrict saved=(*RecordsOnNode[ip])[iw];
      //          Return_t logpsi(0);
      //          psiClones[ip]->evaluateDeltaLog(wRef, saved[LOGPSI_FIXED], saved[LOGPSI_FREE], *dLogPsi[iwg],*d2LogPsi[iwg]);
      // buffer for MultiSlaterDet data
      //          if((usebuffer=="yes")||(includeNonlocalH=="yes"))
      if(StoreDerivInfo)
      {
        psiClones[ip]->registerDataForDerivatives(wRef, thisWalker.DataSetForDerivatives,DerivStorageLevel);
        psiClones[ip]->evaluateDeltaLog(wRef, saved[LOGPSI_FIXED], saved[LOGPSI_FREE], *dLogPsi[iwg], *d2LogPsi[iwg], thisWalker.DataSetForDerivatives);
        //            logpsi = saved[LOGPSI_FIXED] + saved[LOGPSI_FREE];
      }
      else
      {
        psiClones[ip]->evaluateDeltaLog(wRef, saved[LOGPSI_FIXED], saved[LOGPSI_FREE], *dLogPsi[iwg], *d2LogPsi[iwg]);
        //            logpsi = psiClones[ip]->evaluateLog(wRef);
      }
      //          if(includeNonlocalH!="no") logpsi = saved[LOGPSI_FIXED] + saved[LOGPSI_FREE];
      Return_t x= hClones[ip]->evaluate(wRef);
      e0 += saved[ENERGY_TOT] = x;
      e2 += x*x;
      saved[ENERGY_FIXED] = hClones[ip]->getLocalPotential();
      if(nlpp)
        saved[ENERGY_FIXED] -= nlpp->Value;
      //if (includeNonlocalH!="no")
      //  saved[ENERGY_FIXED] = hClones[ip]->getLocalPotential() - (*(hClones[ip]->getHamiltonian(includeNonlocalH.c_str()))).Value;
      //else
      //  saved[ENERGY_FIXED] = hClones[ip]->getLocalPotential();
      //           ef += saved[ENERGY_FIXED];
      saved[REWEIGHT]=thisWalker.Weight=1.0;
      //          thisWalker.resetProperty(logpsi,psiClones[ip]->getPhase(),x);
      if (needGrads)
      {
        //allocate vector
        vector<Return_t> Dsaved(NumOptimizables,0.0);
        vector<Return_t> HDsaved(NumOptimizables,0.0);
        psiClones[ip]->evaluateDerivatives(wRef, OptVariablesForPsi, Dsaved, HDsaved);
        std::copy(Dsaved.begin(),Dsaved.end(),(*DerivRecords[ip])[iw]);
        std::copy(HDsaved.begin(),HDsaved.end(),(*HDerivRecords[ip])[iw]);
      }
    }
    //add them all using reduction
    et_tot+=e0;
    e2_tot+=e2;
    // #pragma omp atomic
    //       eft_tot+=ef;
  }
  OptVariablesForPsi.setComputed();
  //     app_log() << "  VMC Efavg = " << eft_tot/static_cast<Return_t>(wPerNode[NumThreads]) << endl;
  //Need to sum over the processors
  vector<Return_t> etemp(3);
  etemp[0]=et_tot;
  etemp[1]=static_cast<Return_t>(wPerNode[NumThreads]);
  etemp[2]=e2_tot;
  myComm->allreduce(etemp);
  Etarget = static_cast<Return_t>(etemp[0]/etemp[1]);
  NumSamples = static_cast<int>(etemp[1]);
  app_log() << "  VMC Eavg = " << Etarget << endl;
  app_log() << "  VMC Evar = " << etemp[2]/etemp[1]-Etarget*Etarget << endl;
  app_log() << "  Total weights = " << etemp[1] << endl;

  app_log().flush();

  setTargetEnergy(Etarget);
  ReportCounter=0;
}
Esempio n. 3
0
File: bdsvd.cpp Progetto: 0004c/VTK
/*************************************************************************
Internal working subroutine for bidiagonal decomposition
*************************************************************************/
bool bidiagonalsvddecompositioninternal(ap::real_1d_array& d,
     ap::real_1d_array e,
     int n,
     bool isupper,
     bool isfractionalaccuracyrequired,
     ap::real_2d_array& u,
     int ustart,
     int nru,
     ap::real_2d_array& c,
     int cstart,
     int ncc,
     ap::real_2d_array& vt,
     int vstart,
     int ncvt)
{
    bool result;
    int i;
    int idir;
    int isub;
    int iter;
    int j;
    int ll = 0; // Eliminate compiler warning.
    int lll;
    int m;
    int maxit;
    int oldll;
    int oldm;
    double abse;
    double abss;
    double cosl;
    double cosr;
    double cs;
    double eps;
    double f;
    double g;
    double h;
    double mu;
    double oldcs;
    double oldsn = 0.; // Eliminate compiler warning.
    double r;
    double shift;
    double sigmn;
    double sigmx;
    double sinl;
    double sinr;
    double sll;
    double smax;
    double smin;
    double sminl;
    double sminoa;
    double sn;
    double thresh;
    double tol;
    double tolmul;
    double unfl;
    ap::real_1d_array work0;
    ap::real_1d_array work1;
    ap::real_1d_array work2;
    ap::real_1d_array work3;
    int maxitr;
    bool matrixsplitflag;
    bool iterflag;
    ap::real_1d_array utemp;
    ap::real_1d_array vttemp;
    ap::real_1d_array ctemp;
    ap::real_1d_array etemp;
    bool fwddir;
    double tmp;
    int mm1;
    int mm0;
    bool bchangedir;
    int uend;
    int cend;
    int vend;

    result = true;
    if( n==0 )
    {
        return result;
    }
    if( n==1 )
    {
        if( d(1)<0 )
        {
            d(1) = -d(1);
            if( ncvt>0 )
            {
                ap::vmul(&vt(vstart, vstart), ap::vlen(vstart,vstart+ncvt-1), -1);
            }
        }
        return result;
    }
    
    //
    // init
    //
    work0.setbounds(1, n-1);
    work1.setbounds(1, n-1);
    work2.setbounds(1, n-1);
    work3.setbounds(1, n-1);
    uend = ustart+ap::maxint(nru-1, 0);
    vend = vstart+ap::maxint(ncvt-1, 0);
    cend = cstart+ap::maxint(ncc-1, 0);
    utemp.setbounds(ustart, uend);
    vttemp.setbounds(vstart, vend);
    ctemp.setbounds(cstart, cend);
    maxitr = 12;
    fwddir = true;
    
    //
    // resize E from N-1 to N
    //
    etemp.setbounds(1, n);
    for(i = 1; i <= n-1; i++)
    {
        etemp(i) = e(i);
    }
    e.setbounds(1, n);
    for(i = 1; i <= n-1; i++)
    {
        e(i) = etemp(i);
    }
    e(n) = 0;
    idir = 0;
    
    //
    // Get machine constants
    //
    eps = ap::machineepsilon;
    unfl = ap::minrealnumber;
    
    //
    // If matrix lower bidiagonal, rotate to be upper bidiagonal
    // by applying Givens rotations on the left
    //
    if( !isupper )
    {
        for(i = 1; i <= n-1; i++)
        {
            generaterotation(d(i), e(i), cs, sn, r);
            d(i) = r;
            e(i) = sn*d(i+1);
            d(i+1) = cs*d(i+1);
            work0(i) = cs;
            work1(i) = sn;
        }
        
        //
        // Update singular vectors if desired
        //
        if( nru>0 )
        {
            applyrotationsfromtheright(fwddir, ustart, uend, 1+ustart-1, n+ustart-1, work0, work1, u, utemp);
        }
        if( ncc>0 )
        {
            applyrotationsfromtheleft(fwddir, 1+cstart-1, n+cstart-1, cstart, cend, work0, work1, c, ctemp);
        }
    }
    
    //
    // Compute singular values to relative accuracy TOL
    // (By setting TOL to be negative, algorithm will compute
    // singular values to absolute accuracy ABS(TOL)*norm(input matrix))
    //
    tolmul = ap::maxreal(double(10), ap::minreal(double(100), pow(eps, -0.125)));
    tol = tolmul*eps;
    if( !isfractionalaccuracyrequired )
    {
        tol = -tol;
    }
    
    //
    // Compute approximate maximum, minimum singular values
    //
    smax = 0;
    for(i = 1; i <= n; i++)
    {
        smax = ap::maxreal(smax, fabs(d(i)));
    }
    for(i = 1; i <= n-1; i++)
    {
        smax = ap::maxreal(smax, fabs(e(i)));
    }
    sminl = 0;
    if( tol>=0 )
    {
        
        //
        // Relative accuracy desired
        //
        sminoa = fabs(d(1));
        if( sminoa!=0 )
        {
            mu = sminoa;
            for(i = 2; i <= n; i++)
            {
                mu = fabs(d(i))*(mu/(mu+fabs(e(i-1))));
                sminoa = ap::minreal(sminoa, mu);
                if( sminoa==0 )
                {
                    break;
                }
            }
        }
        sminoa = sminoa/sqrt(double(n));
        thresh = ap::maxreal(tol*sminoa, maxitr*n*n*unfl);
    }
    else
    {
        
        //
        // Absolute accuracy desired
        //
        thresh = ap::maxreal(fabs(tol)*smax, maxitr*n*n*unfl);
    }
    
    //
    // Prepare for main iteration loop for the singular values
    // (MAXIT is the maximum number of passes through the inner
    // loop permitted before nonconvergence signalled.)
    //
    maxit = maxitr*n*n;
    iter = 0;
    oldll = -1;
    oldm = -1;
    
    //
    // M points to last element of unconverged part of matrix
    //
    m = n;
    
    //
    // Begin main iteration loop
    //
    while(true)
    {
        
        //
        // Check for convergence or exceeding iteration count
        //
        if( m<=1 )
        {
            break;
        }
        if( iter>maxit )
        {
            result = false;
            return result;
        }
        
        //
        // Find diagonal block of matrix to work on
        //
        if( tol<0&&fabs(d(m))<=thresh )
        {
            d(m) = 0;
        }
        smax = fabs(d(m));
        smin = smax;
        matrixsplitflag = false;
        for(lll = 1; lll <= m-1; lll++)
        {
            ll = m-lll;
            abss = fabs(d(ll));
            abse = fabs(e(ll));
            if( tol<0&&abss<=thresh )
            {
                d(ll) = 0;
            }
            if( abse<=thresh )
            {
                matrixsplitflag = true;
                break;
            }
            smin = ap::minreal(smin, abss);
            smax = ap::maxreal(smax, ap::maxreal(abss, abse));
        }
        if( !matrixsplitflag )
        {
            ll = 0;
        }
        else
        {
            
            //
            // Matrix splits since E(LL) = 0
            //
            e(ll) = 0;
            if( ll==m-1 )
            {
                
                //
                // Convergence of bottom singular value, return to top of loop
                //
                m = m-1;
                continue;
            }
        }
        ll = ll+1;
        
        //
        // E(LL) through E(M-1) are nonzero, E(LL-1) is zero
        //
        if( ll==m-1 )
        {
            
            //
            // 2 by 2 block, handle separately
            //
            svdv2x2(d(m-1), e(m-1), d(m), sigmn, sigmx, sinr, cosr, sinl, cosl);
            d(m-1) = sigmx;
            e(m-1) = 0;
            d(m) = sigmn;
            
            //
            // Compute singular vectors, if desired
            //
            if( ncvt>0 )
            {
                mm0 = m+(vstart-1);
                mm1 = m-1+(vstart-1);
                ap::vmove(&vttemp(vstart), &vt(mm1, vstart), ap::vlen(vstart,vend), cosr);
                ap::vadd(&vttemp(vstart), &vt(mm0, vstart), ap::vlen(vstart,vend), sinr);
                ap::vmul(&vt(mm0, vstart), ap::vlen(vstart,vend), cosr);
                ap::vsub(&vt(mm0, vstart), &vt(mm1, vstart), ap::vlen(vstart,vend), sinr);
                ap::vmove(&vt(mm1, vstart), &vttemp(vstart), ap::vlen(vstart,vend));
            }
            if( nru>0 )
            {
                mm0 = m+ustart-1;
                mm1 = m-1+ustart-1;
                ap::vmove(utemp.getvector(ustart, uend), u.getcolumn(mm1, ustart, uend), cosl);
                ap::vadd(utemp.getvector(ustart, uend), u.getcolumn(mm0, ustart, uend), sinl);
                ap::vmul(u.getcolumn(mm0, ustart, uend), cosl);
                ap::vsub(u.getcolumn(mm0, ustart, uend), u.getcolumn(mm1, ustart, uend), sinl);
                ap::vmove(u.getcolumn(mm1, ustart, uend), utemp.getvector(ustart, uend));
            }
            if( ncc>0 )
            {
                mm0 = m+cstart-1;
                mm1 = m-1+cstart-1;
                ap::vmove(&ctemp(cstart), &c(mm1, cstart), ap::vlen(cstart,cend), cosl);
                ap::vadd(&ctemp(cstart), &c(mm0, cstart), ap::vlen(cstart,cend), sinl);
                ap::vmul(&c(mm0, cstart), ap::vlen(cstart,cend), cosl);
                ap::vsub(&c(mm0, cstart), &c(mm1, cstart), ap::vlen(cstart,cend), sinl);
                ap::vmove(&c(mm1, cstart), &ctemp(cstart), ap::vlen(cstart,cend));
            }
            m = m-2;
            continue;
        }
        
        //
        // If working on new submatrix, choose shift direction
        // (from larger end diagonal element towards smaller)
        //
        // Previously was
        //     "if (LL>OLDM) or (M<OLDLL) then"
        // fixed thanks to Michael Rolle < *****@*****.** >
        // Very strange that LAPACK still contains it.
        //
        bchangedir = false;
        if( idir==1&&fabs(d(ll))<1.0E-3*fabs(d(m)) )
        {
            bchangedir = true;
        }
        if( idir==2&&fabs(d(m))<1.0E-3*fabs(d(ll)) )
        {
            bchangedir = true;
        }
        if( ll!=oldll||m!=oldm||bchangedir )
        {
            if( fabs(d(ll))>=fabs(d(m)) )
            {
                
                //
                // Chase bulge from top (big end) to bottom (small end)
                //
                idir = 1;
            }
            else
            {
                
                //
                // Chase bulge from bottom (big end) to top (small end)
                //
                idir = 2;
            }
        }
        
        //
        // Apply convergence tests
        //
        if( idir==1 )
        {
            
            //
            // Run convergence test in forward direction
            // First apply standard test to bottom of matrix
            //
            if( (fabs(e(m-1))<=fabs(tol)*fabs(d(m)))||(tol<0&&fabs(e(m-1))<=thresh) )
            {
                e(m-1) = 0;
                continue;
            }
            if( tol>=0 )
            {
                
                //
                // If relative accuracy desired,
                // apply convergence criterion forward
                //
                mu = fabs(d(ll));
                sminl = mu;
                iterflag = false;
                for(lll = ll; lll <= m-1; lll++)
                {
                    if( fabs(e(lll))<=tol*mu )
                    {
                        e(lll) = 0;
                        iterflag = true;
                        break;
                    }
                    mu = fabs(d(lll+1))*(mu/(mu+fabs(e(lll))));
                    sminl = ap::minreal(sminl, mu);
                }
                if( iterflag )
                {
                    continue;
                }
            }
        }
        else
        {
            
            //
            // Run convergence test in backward direction
            // First apply standard test to top of matrix
            //
            if( (fabs(e(ll))<=fabs(tol)*fabs(d(ll)))||(tol<0&&fabs(e(ll))<=thresh) )
            {
                e(ll) = 0;
                continue;
            }
            if( tol>=0 )
            {
                
                //
                // If relative accuracy desired,
                // apply convergence criterion backward
                //
                mu = fabs(d(m));
                sminl = mu;
                iterflag = false;
                for(lll = m-1; lll >= ll; lll--)
                {
                    if( fabs(e(lll))<=tol*mu )
                    {
                        e(lll) = 0;
                        iterflag = true;
                        break;
                    }
                    mu = fabs(d(lll))*(mu/(mu+fabs(e(lll))));
                    sminl = ap::minreal(sminl, mu);
                }
                if( iterflag )
                {
                    continue;
                }
            }
        }
        oldll = ll;
        oldm = m;
        
        //
        // Compute shift.  First, test if shifting would ruin relative
        // accuracy, and if so set the shift to zero.
        //
        if( tol>=0&&n*tol*(sminl/smax)<=ap::maxreal(eps, 0.01*tol) )
        {
            
            //
            // Use a zero shift to avoid loss of relative accuracy
            //
            shift = 0;
        }
        else
        {
            
            //
            // Compute the shift from 2-by-2 block at end of matrix
            //
            if( idir==1 )
            {
                sll = fabs(d(ll));
                svd2x2(d(m-1), e(m-1), d(m), shift, r);
            }
            else
            {
                sll = fabs(d(m));
                svd2x2(d(ll), e(ll), d(ll+1), shift, r);
            }
            
            //
            // Test if shift negligible, and if so set to zero
            //
            if( sll>0 )
            {
                if( ap::sqr(shift/sll)<eps )
                {
                    shift = 0;
                }
            }
        }
        
        //
        // Increment iteration count
        //
        iter = iter+m-ll;
        
        //
        // If SHIFT = 0, do simplified QR iteration
        //
        if( shift==0 )
        {
            if( idir==1 )
            {
                
                //
                // Chase bulge from top to bottom
                // Save cosines and sines for later singular vector updates
                //
                cs = 1;
                oldcs = 1;
                for(i = ll; i <= m-1; i++)
                {
                    generaterotation(d(i)*cs, e(i), cs, sn, r);
                    if( i>ll )
                    {
                        e(i-1) = oldsn*r;
                    }
                    generaterotation(oldcs*r, d(i+1)*sn, oldcs, oldsn, tmp);
                    d(i) = tmp;
                    work0(i-ll+1) = cs;
                    work1(i-ll+1) = sn;
                    work2(i-ll+1) = oldcs;
                    work3(i-ll+1) = oldsn;
                }
                h = d(m)*cs;
                d(m) = h*oldcs;
                e(m-1) = h*oldsn;
                
                //
                // Update singular vectors
                //
                if( ncvt>0 )
                {
                    applyrotationsfromtheleft(fwddir, ll+vstart-1, m+vstart-1, vstart, vend, work0, work1, vt, vttemp);
                }
                if( nru>0 )
                {
                    applyrotationsfromtheright(fwddir, ustart, uend, ll+ustart-1, m+ustart-1, work2, work3, u, utemp);
                }
                if( ncc>0 )
                {
                    applyrotationsfromtheleft(fwddir, ll+cstart-1, m+cstart-1, cstart, cend, work2, work3, c, ctemp);
                }
                
                //
                // Test convergence
                //
                if( fabs(e(m-1))<=thresh )
                {
                    e(m-1) = 0;
                }
            }
            else
            {
                
                //
                // Chase bulge from bottom to top
                // Save cosines and sines for later singular vector updates
                //
                cs = 1;
                oldcs = 1;
                for(i = m; i >= ll+1; i--)
                {
                    generaterotation(d(i)*cs, e(i-1), cs, sn, r);
                    if( i<m )
                    {
                        e(i) = oldsn*r;
                    }
                    generaterotation(oldcs*r, d(i-1)*sn, oldcs, oldsn, tmp);
                    d(i) = tmp;
                    work0(i-ll) = cs;
                    work1(i-ll) = -sn;
                    work2(i-ll) = oldcs;
                    work3(i-ll) = -oldsn;
                }
                h = d(ll)*cs;
                d(ll) = h*oldcs;
                e(ll) = h*oldsn;
                
                //
                // Update singular vectors
                //
                if( ncvt>0 )
                {
                    applyrotationsfromtheleft(!fwddir, ll+vstart-1, m+vstart-1, vstart, vend, work2, work3, vt, vttemp);
                }
                if( nru>0 )
                {
                    applyrotationsfromtheright(!fwddir, ustart, uend, ll+ustart-1, m+ustart-1, work0, work1, u, utemp);
                }
                if( ncc>0 )
                {
                    applyrotationsfromtheleft(!fwddir, ll+cstart-1, m+cstart-1, cstart, cend, work0, work1, c, ctemp);
                }
                
                //
                // Test convergence
                //
                if( fabs(e(ll))<=thresh )
                {
                    e(ll) = 0;
                }
            }
        }
        else
        {
            
            //
            // Use nonzero shift
            //
            if( idir==1 )
            {
                
                //
                // Chase bulge from top to bottom
                // Save cosines and sines for later singular vector updates
                //
                f = (fabs(d(ll))-shift)*(extsignbdsqr(double(1), d(ll))+shift/d(ll));
                g = e(ll);
                for(i = ll; i <= m-1; i++)
                {
                    generaterotation(f, g, cosr, sinr, r);
                    if( i>ll )
                    {
                        e(i-1) = r;
                    }
                    f = cosr*d(i)+sinr*e(i);
                    e(i) = cosr*e(i)-sinr*d(i);
                    g = sinr*d(i+1);
                    d(i+1) = cosr*d(i+1);
                    generaterotation(f, g, cosl, sinl, r);
                    d(i) = r;
                    f = cosl*e(i)+sinl*d(i+1);
                    d(i+1) = cosl*d(i+1)-sinl*e(i);
                    if( i<m-1 )
                    {
                        g = sinl*e(i+1);
                        e(i+1) = cosl*e(i+1);
                    }
                    work0(i-ll+1) = cosr;
                    work1(i-ll+1) = sinr;
                    work2(i-ll+1) = cosl;
                    work3(i-ll+1) = sinl;
                }
                e(m-1) = f;
                
                //
                // Update singular vectors
                //
                if( ncvt>0 )
                {
                    applyrotationsfromtheleft(fwddir, ll+vstart-1, m+vstart-1, vstart, vend, work0, work1, vt, vttemp);
                }
                if( nru>0 )
                {
                    applyrotationsfromtheright(fwddir, ustart, uend, ll+ustart-1, m+ustart-1, work2, work3, u, utemp);
                }
                if( ncc>0 )
                {
                    applyrotationsfromtheleft(fwddir, ll+cstart-1, m+cstart-1, cstart, cend, work2, work3, c, ctemp);
                }
                
                //
                // Test convergence
                //
                if( fabs(e(m-1))<=thresh )
                {
                    e(m-1) = 0;
                }
            }
            else
            {
                
                //
                // Chase bulge from bottom to top
                // Save cosines and sines for later singular vector updates
                //
                f = (fabs(d(m))-shift)*(extsignbdsqr(double(1), d(m))+shift/d(m));
                g = e(m-1);
                for(i = m; i >= ll+1; i--)
                {
                    generaterotation(f, g, cosr, sinr, r);
                    if( i<m )
                    {
                        e(i) = r;
                    }
                    f = cosr*d(i)+sinr*e(i-1);
                    e(i-1) = cosr*e(i-1)-sinr*d(i);
                    g = sinr*d(i-1);
                    d(i-1) = cosr*d(i-1);
                    generaterotation(f, g, cosl, sinl, r);
                    d(i) = r;
                    f = cosl*e(i-1)+sinl*d(i-1);
                    d(i-1) = cosl*d(i-1)-sinl*e(i-1);
                    if( i>ll+1 )
                    {
                        g = sinl*e(i-2);
                        e(i-2) = cosl*e(i-2);
                    }
                    work0(i-ll) = cosr;
                    work1(i-ll) = -sinr;
                    work2(i-ll) = cosl;
                    work3(i-ll) = -sinl;
                }
                e(ll) = f;
                
                //
                // Test convergence
                //
                if( fabs(e(ll))<=thresh )
                {
                    e(ll) = 0;
                }
                
                //
                // Update singular vectors if desired
                //
                if( ncvt>0 )
                {
                    applyrotationsfromtheleft(!fwddir, ll+vstart-1, m+vstart-1, vstart, vend, work2, work3, vt, vttemp);
                }
                if( nru>0 )
                {
                    applyrotationsfromtheright(!fwddir, ustart, uend, ll+ustart-1, m+ustart-1, work0, work1, u, utemp);
                }
                if( ncc>0 )
                {
                    applyrotationsfromtheleft(!fwddir, ll+cstart-1, m+cstart-1, cstart, cend, work0, work1, c, ctemp);
                }
            }
        }
        
        //
        // QR iteration finished, go back and check convergence
        //
        continue;
    }
    
    //
    // All singular values converged, so make them positive
    //
    for(i = 1; i <= n; i++)
    {
        if( d(i)<0 )
        {
            d(i) = -d(i);
            
            //
            // Change sign of singular vectors, if desired
            //
            if( ncvt>0 )
            {
                ap::vmul(&vt(i+vstart-1, vstart), ap::vlen(vstart,vend), -1);
            }
        }
    }
    
    //
    // Sort the singular values into decreasing order (insertion sort on
    // singular values, but only one transposition per singular vector)
    //
    for(i = 1; i <= n-1; i++)
    {
        
        //
        // Scan for smallest D(I)
        //
        isub = 1;
        smin = d(1);
        for(j = 2; j <= n+1-i; j++)
        {
            if( d(j)<=smin )
            {
                isub = j;
                smin = d(j);
            }
        }
        if( isub!=n+1-i )
        {
            
            //
            // Swap singular values and vectors
            //
            d(isub) = d(n+1-i);
            d(n+1-i) = smin;
            if( ncvt>0 )
            {
                j = n+1-i;
                ap::vmove(&vttemp(vstart), &vt(isub+vstart-1, vstart), ap::vlen(vstart,vend));
                ap::vmove(&vt(isub+vstart-1, vstart), &vt(j+vstart-1, vstart), ap::vlen(vstart,vend));
                ap::vmove(&vt(j+vstart-1, vstart), &vttemp(vstart), ap::vlen(vstart,vend));
            }
            if( nru>0 )
            {
                j = n+1-i;
                ap::vmove(utemp.getvector(ustart, uend), u.getcolumn(isub+ustart-1, ustart, uend));
                ap::vmove(u.getcolumn(isub+ustart-1, ustart, uend), u.getcolumn(j+ustart-1, ustart, uend));
                ap::vmove(u.getcolumn(j+ustart-1, ustart, uend), utemp.getvector(ustart, uend));
            }
            if( ncc>0 )
            {
                j = n+1-i;
                ap::vmove(&ctemp(cstart), &c(isub+cstart-1, cstart), ap::vlen(cstart,cend));
                ap::vmove(&c(isub+cstart-1, cstart), &c(j+cstart-1, cstart), ap::vlen(cstart,cend));
                ap::vmove(&c(j+cstart-1, cstart), &ctemp(cstart), ap::vlen(cstart,cend));
            }
        }
    }
    return result;
}