예제 #1
0
파일: gslMatrixUtils.C 프로젝트: mmm/byrd
// Computes basis * sqrt(diag(basis^dagger * mat * basis)) * basis^dagger for a
// square complex matrix, passing the basis and the result through nan_to_zero().
//
// NOTE(review): the Hermitian eigen-solve below is commented out, so the
// "basis" is just nan_to_zero(mat) rather than a set of eigenvectors.
// Presumably the solver was meant to overwrite it in place - confirm before
// relying on this as a true matrix square root.
//
// Throws a string literal ("Matrices not square") when mat is not square.
// Element access through operator() is 1-based in this function.
const Matrix<complex<double> >
gslMatrixSqrt( const Matrix<complex<double> >& mat ) {
    typedef Matrix<complex<double> > ComplexMatrix;

    if( mat.num_rows() != mat.num_cols() ) {
        throw("Matrices not square");
    }

    // Storage intended for the eigenvalues (unused while the solve is disabled).
    Vector<double> eigVals( mat.num_rows() );
    ComplexMatrix basis = nan_to_zero(mat);
//    eigVals = Hermitian_eigenvalue_solve( basis );

    // Rotate the input into the basis; only its diagonal is used afterwards.
    ComplexMatrix rotated = dagger(basis)*mat*basis;

    // Diagonal matrix holding the square roots of the rotated diagonal.
    const complex<double> zero(0.0,0.0);
    ComplexMatrix sqrtDiag(mat.num_rows(), mat.num_rows(), zero);
    int k = 1;
    while( k <= mat.num_rows() ) {
        sqrtDiag(k,k) = sqrt( rotated(k,k) );
        k++;
    }

    // Rotate back and scrub any NaNs from the result.
    return nan_to_zero(basis*sqrtDiag*dagger(basis));
}
void SpeciesInfo::augmentDensityGridGrad(const ScalarFieldArray& E_n, std::vector<vector3<> >* forces)
{	static StopWatch watch("augmentDensityGridGrad"); watch.start();
	augmentDensityGrid_COMMON_INIT
	if(!nAug) augmentDensityInit();
	const GridInfo &gInfo = e->gInfo;
	double dGinv = 1./gInfo.dGradial;
	matrix E_nAugRadial = zeroes(nCoeffHlf, e->eInfo.nDensities * atpos.size() * Nlm);
	double* E_nAugRadialData = (double*)E_nAugRadial.dataPref();
	matrix nAugRadial; const double* nAugRadialData=0;
	if(forces)
	{	matrix nAugTot = nAug; nAugTot.allReduce(MPIUtil::ReduceSum);
		nAugRadial = QradialMat * nAugTot;
		nAugRadialData = (const double*)nAugRadial.dataPref();
	}
	VectorFieldTilde E_atpos; if(forces) nullToZero(E_atpos, gInfo);
	for(unsigned s=0; s<E_n.size(); s++)
	{	ScalarFieldTilde ccE_n = Idag(E_n[s]);
		for(unsigned atom=0; atom<atpos.size(); atom++)
		{	int atomOffs = nCoeff * Nlm * (atom + atpos.size()*s);
			if(forces) initZero(E_atpos);
			callPref(nAugmentGrad)(Nlm, gInfo.S, gInfo.G, nCoeff, dGinv, forces? (nAugRadialData+atomOffs) :0, atpos[atom],
				ccE_n->dataPref(), E_nAugRadialData+atomOffs, forces ? E_atpos.dataPref() : vector3<complex*>(), nagIndex, nagIndexPtr);
			if(forces) for(int k=0; k<3; k++) (*forces)[atom][k] -= sum(E_atpos[k]);
		}
	}
	E_nAug = dagger(QradialMat) * E_nAugRadial;  //propagate from spline coeffs to radial functions
	E_nAug.allReduce(MPIUtil::ReduceSum);
	watch.stop();
}
//Gather-accumulate column bD of C_D into column bC of C_C, scaled by alpha.
//Every (source spinor sD, destination spinor sC) pair is handled by one
//eblas_gather_zaxpy call, weighted by the matching entry of the spinor rotation
//(its transpose when invert<0, its dagger otherwise).
void ColumnBundleTransform::gatherAxpy(complex alpha, const ColumnBundle& C_D, int bD, ColumnBundle& C_C, int bC) const
{
	//Column lengths must match the respective bases, and band indices must be in range:
	myassert(C_C.colLength() == nSpinor*basisC.nbasis);
	myassert(bC >= 0 && bC < C_C.nCols());
	myassert(C_D.colLength() == nSpinor*basisD.nbasis);
	myassert(bD >= 0 && bD < C_D.nCols());
	
	const bool negInvert = (invert<0); //also forwarded as the final flag of the gather kernel
	matrix spinorRotInv = negInvert ? transpose(spinorRot) : dagger(spinorRot);
	
	for(int sD=0; sD<nSpinor; sD++)
	{	//Start of source spinor component sD within column bD:
		const complex* src = C_D.dataPref() + C_D.index(bD, sD*C_D.basis->nbasis);
		for(int sC=0; sC<nSpinor; sC++)
		{	//Start of destination spinor component sC within column bC:
			complex* dst = C_C.dataPref() + C_C.index(bC, sC*C_C.basis->nbasis);
			callPref(eblas_gather_zaxpy)(index.size(), alpha*spinorRotInv(sC,sD), indexPref, src, dst, negInvert);
		}
	}
}
예제 #4
0
//Return half the sum of A and dagger(A)
matrix dagger_symmetrize(const scaled<matrix> &A)
{
	const double half = 0.5;
	return half * (dagger(A) + A);
}
예제 #5
0
//Set up the phonon calculation. In order, this:
// - parses the input into the unit cell (e) and, silently, into the supercell template (eSupTemplate)
// - validates the supercell / k-point folding and runs the unit cell calculation
// - makes the unit cell state available on all processes
// - builds the supercell grid, atoms and symmetries
// - picks an orthogonal displacement basis, lists all modes and reduces the perturbations by symmetry
// - computes the rotations of unit cell states under the supercell symmetries (stateRot)
//printDefaults is forwarded to the first parse() call only.
void Phonon::setup(bool printDefaults)
{
	//Parse input to initialize unit cell:
	parse(input, e, printDefaults);
	logSuspend();
	parse(input, eSupTemplate); //silently create a copy by re-parsing input (Everything is not trivially copyable)
	logResume();
	
	//Ensure phonon command specified:
	if(!sup.length())
		die("phonon supercell must be specified using the phonon command.\n");
	if(!e.gInfo.S.length_squared())
		die("Manual fftbox setting required for phonon. If supercell grid\n"
			"initialization fails, specify slightly larger manual fftbox.\n");
	//Check kpoint and supercell compatibility:
	if(e.eInfo.qnums.size()>1 || e.eInfo.qnums[0].k.length_squared())
		die("phonon requires a Gamma-centered uniform kpoint mesh.\n");
	for(int j=0; j<3; j++)
	{	if(!sup[j] || e.eInfo.kfold[j] % sup[j])
		{	die("kpoint folding %d is not a multiple of supercell count %d for lattice direction %d.\n",
				e.eInfo.kfold[j], sup[j], j);
		}
		//Supercell template gets the k-point folding divided by the supercell count:
		eSupTemplate.eInfo.kfold[j] = e.eInfo.kfold[j] / sup[j];
	}
	
	logPrintf("########### Unit cell calculation #############\n");
	//Assign the same constraint (moveScale 0, type None) to every atom of every species:
	SpeciesInfo::Constraint constraintFull;
	constraintFull.moveScale = 0;
	constraintFull.type = SpeciesInfo::Constraint::None;
	for(size_t sp=0; sp<e.iInfo.species.size(); sp++)
		e.iInfo.species[sp]->constraints.assign(e.iInfo.species[sp]->atpos.size(), constraintFull);
	e.setup();
	if(!e.coulombParams.supercell) e.updateSupercell(true); //force supercell generation

	nSpins = e.eInfo.spinType==SpinZ ? 2 : 1;
	nSpinor = e.eInfo.spinorLength();

	//Initialize state of unit cell:
	if(e.cntrl.dumpOnly)
	{	//Single energy calculation so that all dependent quantities have been initialized:
		logPrintf("\n----------- Energy evaluation at fixed state -------------\n"); logFlush();
		e.eVars.elecEnergyAndGrad(e.ener, 0, 0, true);
	}
	else elecFluidMinimize(e);
	logPrintf("# Energy components:\n"); e.ener.print(); logPrintf("\n");

	//Determine optimum number of bands for supercell calculation:
	//(per state, the position returned by upper_bound for Fcut under descending order; maximized over states and processes)
	nBandsOpt = 0;
	for(int q=e.eInfo.qStart; q<e.eInfo.qStop; q++)
	{	int nBands_q = std::upper_bound(e.eVars.F[q].begin(), e.eVars.F[q].end(), Fcut, std::greater<double>()) - e.eVars.F[q].begin();
		nBandsOpt = std::max(nBandsOpt, nBands_q);
	}
	mpiUtil->allReduce(nBandsOpt, MPIUtil::ReduceMax);
	logPrintf("Fcut=%lg reduced nBands from %d to %d per unit cell.\n", Fcut, e.eInfo.nBands, nBandsOpt);

	//Make unit cell state available on all processes
	//(since MPI division of qSup and q are different and independent of the map)
	for(int q=0; q<e.eInfo.nStates; q++)
	{	//Allocate:
		if(!e.eInfo.isMine(q))
		{	e.eVars.C[q].init(e.eInfo.nBands, e.basis[q].nbasis * e.eInfo.spinorLength(), &e.basis[q], &e.eInfo.qnums[q]);
			e.eVars.F[q].resize(e.eInfo.nBands);
			e.eVars.Hsub_eigs[q].resize(e.eInfo.nBands);
			if(e.eInfo.fillingsUpdate==ElecInfo::FermiFillingsAux)
				e.eVars.B[q].init(e.eInfo.nBands, e.eInfo.nBands);
		}
		//Broadcast from owner:
		int qSrc = e.eInfo.whose(q);
		e.eVars.C[q].bcast(qSrc);
		e.eVars.F[q].bcast(qSrc);
		e.eVars.Hsub_eigs[q].bcast(qSrc);
		if(e.eInfo.fillingsUpdate==ElecInfo::FermiFillingsAux)
			e.eVars.B[q].bcast(qSrc);
	}

	logPrintf("\n------- Configuring supercell and perturbation modes -------\n");
	
	//Grid:
	eSupTemplate.gInfo.S = Diag(sup) * e.gInfo.S; //ensure exact supercell
	eSupTemplate.gInfo.R = e.gInfo.R * Diag(sup);
	prodSup = sup[0] * sup[1] * sup[2];
	
	//Replicate atoms (and related properties):
	for(size_t sp=0; sp<e.iInfo.species.size(); sp++)
	{	const SpeciesInfo& spIn = *(e.iInfo.species[sp]);
		SpeciesInfo& spOut = *(eSupTemplate.iInfo.species[sp]);
		spOut.atpos.clear();
		spOut.initialMagneticMoments.clear();
		matrix3<> invSup = inv(Diag(vector3<>(sup)));
		vector3<int> iR;
		//Loop over unit cell offsets within the supercell; positions are rescaled by invSup:
		for(iR[0]=0; iR[0]<sup[0]; iR[0]++)
		for(iR[1]=0; iR[1]<sup[1]; iR[1]++)
		for(iR[2]=0; iR[2]<sup[2]; iR[2]++)
		{	for(vector3<> pos: spIn.atpos)
				spOut.atpos.push_back(invSup * (pos + iR));
			for(vector3<> M: spIn.initialMagneticMoments)
				spOut.initialMagneticMoments.push_back(M); //needed only to determine supercell symmetries
		}
		spOut.constraints.assign(spOut.atpos.size(), constraintFull);
	}
	
	//Supercell symmetries:
	eSupTemplate.symm.setup(eSupTemplate);
	const std::vector< matrix3<int> >& symSup = eSupTemplate.symm.getMatrices();
	symSupCart.clear();
	eSupTemplate.gInfo.invR = inv(eSupTemplate.gInfo.R);
	//Conjugate each supercell symmetry matrix with R and invR of the supercell:
	for(const matrix3<int>& m: symSup)
		symSupCart.push_back(eSupTemplate.gInfo.R * m * eSupTemplate.gInfo.invR);
	
	//Pick maximally symmetric orthogonal basis:
	logPrintf("\nFinding maximally-symmetric orthogonal basis for displacements:\n");
	std::vector< vector3<> > dirBasis;
	{	std::multimap<int, vector3<> > dirList; //directions indexed by their stabilizer group cardinality
		vector3<int> iR;
		//NOTE(review): first index runs over 0..+1 only, others over -1..+1; presumably because n and -n are equivalent directions - confirm
		for(iR[0]=0; iR[0]<=+1; iR[0]++)
		for(iR[1]=-1; iR[1]<=+1; iR[1]++)
		for(iR[2]=-1; iR[2]<=+1; iR[2]++)
			if(iR.length_squared())
			{	//Try low-order lattice vector linear combination:
				vector3<> n = eSupTemplate.gInfo.R * iR; n *= (1./n.length());
				dirList.insert(std::make_pair(nStabilizer(n, symSupCart), n));
				//Try low-order reciprocal lattice vector linear combination:
				n = iR * eSupTemplate.gInfo.invR; n *= (1./n.length());
				dirList.insert(std::make_pair(nStabilizer(n, symSupCart), n));
			}
		//First direction: last entry of the multimap, i.e. one with the largest stabilizer count
		dirBasis.push_back(dirList.rbegin()->second);
		//Pick second direction orthogonal to first:
		std::multimap<int, vector3<> > dirList2;
		for(auto entry: dirList)
		{	vector3<> n = entry.second;
			n -= dot(n, dirBasis[0]) * dirBasis[0];
			if(n.length_squared() < symmThresholdSq) continue; //skip directions with negligible remainder after projection
			n *= (1./n.length());
			dirList2.insert(std::make_pair(nStabilizer(n, symSupCart), n));
		}
		dirBasis.push_back(dirList2.rbegin()->second);
		dirBasis.push_back(cross(dirBasis[0], dirBasis[1])); //third direction constrained by orthogonality
	}
	for(const vector3<>& n: dirBasis)
		logPrintf(" [ %+lf %+lf %+lf ] |Stabilizer|: %d\n", n[0], n[1], n[2], nStabilizer(n,symSupCart));
	
	//List all modes:
	modes.clear();
	for(size_t sp=0; sp<e.iInfo.species.size(); sp++)
		for(size_t at=0; at<e.iInfo.species[sp]->atpos.size(); at++) //only need to move atoms in first unit cell
			for(int iDir=0; iDir<3; iDir++)
			{	Mode mode;
				mode.sp = sp;
				mode.at = at;
				mode.dir[iDir] = 1.;
				modes.push_back(mode);
			}

	//Find irreducible modes:
	perturbations.clear();
	for(unsigned sp=0; sp<e.iInfo.species.size(); sp++)
	{	int nAtoms = e.iInfo.species[sp]->atpos.size();
		int nPert = nAtoms * dirBasis.size();
		//generate all perturbations first:
		std::vector<Perturbation> pertSp(nPert); //perturbations of this species
		std::vector<matrix> proj(nPert); //projection operator into subspace spanned by star of current perturbation
		matrix projTot;
		const auto& atomMap = eSupTemplate.symm.getAtomMap()[sp];
		for(int iPert=0; iPert<nPert; iPert++)
		{	pertSp[iPert].sp = sp;
			pertSp[iPert].at = iPert / dirBasis.size();
			pertSp[iPert].dir = dirBasis[iPert % dirBasis.size()];
			pertSp[iPert].weight = 1./symSupCart.size();
			//Accumulate weighted outer products of the symmetry images of this perturbation:
			for(unsigned iSym=0; iSym<symSupCart.size(); iSym++)
			{	int at = atomMap[pertSp[iPert].at][iSym] % nAtoms; //map back to first cell
				vector3<> dir = symSupCart[iSym] * pertSp[iPert].dir;
				matrix nHat = zeroes(nPert,1);
				for(int iDir=0; iDir<3; iDir++)
					nHat.set(at*3+iDir,0, dir[iDir]);
				proj[iPert] += pertSp[iPert].weight * nHat * dagger(nHat);
			}
			projTot += proj[iPert];
		}
		//Sanity check: the projectors of all perturbations must add up to the identity
		myassert(nrm2(projTot - eye(nPert)) < symmThreshold);
		//only select perturbations with distinct subspace projections:
		std::vector<bool> irred(nPert, true); //whether each perturbation is in irreducible set
		for(int iPert=0; iPert<nPert; iPert++)
		{	for(int jPert=0; jPert<iPert; jPert++)
				if(irred[jPert] && nrm2(proj[iPert]-proj[jPert])<symmThreshold)
				{	pertSp[jPert].weight += pertSp[iPert].weight; //send weight of current mode to its image in irreducible set
					irred[iPert] = false; //this mode will be accounted for upon symmetrization
					break;
				}
		}
		for(int iPert=0; iPert<nPert; iPert++)
			if(irred[iPert])
				perturbations.push_back(pertSp[iPert]);
	}
	logPrintf("\n%d perturbations of the unit cell reduced to %d under symmetries:\n", int(modes.size()), int(perturbations.size()));
	for(const Perturbation& pert: perturbations)
		logPrintf("%s %d  [ %+lf %+lf %+lf ] %lf\n", e.iInfo.species[pert.sp]->name.c_str(),
			pert.at, pert.dir[0], pert.dir[1], pert.dir[2], pert.weight*symSupCart.size());
	
	//Determine wavefunction unitary rotations:
	logPrintf("\nCalculating unitary rotations of unit cell states under symmetries:\n");
	stateRot.resize(nSpins);
	double unitarityErr = 0.; //accumulates squared deviations from unitarity (converted to RMS at the end)
	for(int iSpin=0; iSpin<nSpins; iSpin++)
	{	//Find states involved in the supercell Gamma-point:
		struct Kpoint : public Supercell::KmeshTransform
		{	vector3<> k; //also store k-point for convenience (KmeshTransform doesn't have it)
		};
		std::vector<Kpoint> kpoints; kpoints.reserve(prodSup);
		const Supercell& supercell = *(e.coulombParams.supercell);
		for(unsigned ik=0; ik<supercell.kmesh.size(); ik++)
		{	double kSupErr; round(matrix3<>(Diag(sup)) * supercell.kmesh[ik], &kSupErr);
			if(kSupErr < symmThreshold) //maps to Gamma point
			{	Kpoint kpoint;
				(Supercell::KmeshTransform&)kpoint = supercell.kmeshTransform[ik]; //copy base class
				kpoint.k = supercell.kmesh[ik];
				kpoint.iReduced += iSpin*(e.eInfo.nStates/nSpins); //point to source k-point with appropriate spin
				kpoints.push_back(kpoint);
			}
		}
		myassert(int(kpoints.size()) == prodSup);
		//Initialize basis and qnum for these states:
		std::vector<QuantumNumber> qnums(prodSup);
		std::vector<Basis> basis(prodSup);
		logSuspend();
		for(int ik=0; ik<prodSup; ik++)
		{	qnums[ik].k = kpoints[ik].k;
			qnums[ik].spin = (nSpins==1 ? 0 : (iSpin ? +1 : -1));
			qnums[ik].weight = 1./prodSup;
			basis[ik].setup(e.gInfo, e.iInfo, e.cntrl.Ecut, kpoints[ik].k);
		}
		logResume();
		//Get wavefunctions for all these k-points:
		#define whose_ik(ik) (((ik) * mpiUtil->nProcesses())/prodSup) //local MPI division
		std::vector<ColumnBundle> C(prodSup);
		std::vector<std::shared_ptr<ColumnBundleTransform::BasisWrapper> > basisWrapper(prodSup);
		auto sym = e.symm.getMatrices(); //unit cell symmetries
		for(int ik=0; ik<prodSup; ik++)
		{	C[ik].init(e.eInfo.nBands, basis[ik].nbasis*nSpinor, &basis[ik], &qnums[ik], isGpuEnabled());
			if(whose_ik(ik) == mpiUtil->iProcess())
			{	int q = kpoints[ik].iReduced;
				C[ik].zero();
				basisWrapper[ik] = std::make_shared<ColumnBundleTransform::BasisWrapper>(basis[ik]);
				//Transform the source state q into this k-point's basis:
				ColumnBundleTransform(e.eInfo.qnums[q].k, e.basis[q], qnums[ik].k, *(basisWrapper[ik]),
					nSpinor, sym[kpoints[ik].iSym], kpoints[ik].invert).scatterAxpy(1., e.eVars.C[q], C[ik],0,1);
			}
		}
		for(int ik=0; ik<prodSup; ik++) C[ik].bcast(whose_ik(ik)); //make available on all processes
		//Determine max eigenvalue:
		int nBands = e.eInfo.nBands;
		double Emax = -INFINITY;
		for(int q=e.eInfo.qStart; q<e.eInfo.qStop; q++)
			Emax = std::max(Emax, e.eVars.Hsub_eigs[q].back());
		mpiUtil->allReduce(Emax, MPIUtil::MPIUtil::ReduceMax);
		double EmaxValid = +INFINITY; //stays infinite unless some rotation has an incomplete unitary subspace
		//Loop over supercell symmetry operations:
		PeriodicLookup<QuantumNumber> plook(qnums, e.gInfo.GGT);
		stateRot[iSpin].resize(symSupCart.size());
		for(size_t iSym=0; iSym<symSupCart.size(); iSym++)
		{	matrix3<> symUnitTmp = e.gInfo.invR * symSupCart[iSym] * e.gInfo.R; //in unit cell lattice coordinates
			#define SymmErrMsg \
				"Supercell symmetries do not map unit cell k-point mesh onto itself.\n" \
				"This implies that the supercell is more symmetric than the unit cell!\n" \
				"Please check to make sure that you have used the minimal unit cell.\n\n"
			//Round to an integer matrix, aborting if any entry is not integral within symmThreshold:
			matrix3<int> symUnit;
			for(int j1=0; j1<3; j1++)
				for(int j2=0; j2<3; j2++)
				{	symUnit(j1,j2) = round(symUnitTmp(j1,j2));
					if(fabs(symUnit(j1,j2) - symUnitTmp(j1,j2)) > symmThreshold)
						die(SymmErrMsg)
				}
			//Find image kpoints under rotation: (do this for all k-points so that all processes exit together if necessary)
			std::vector<int> ikRot(prodSup);
			for(int ik=0; ik<prodSup; ik++)
			{	size_t ikRotCur = plook.find(qnums[ik].k * symUnit);
				if(ikRotCur==string::npos) die(SymmErrMsg)
				ikRot[ik] = ikRotCur;
			}
			#undef SymmErrMsg
			//Calculate unitary transformation matrix:
			stateRot[iSpin][iSym].init(prodSup, nBands);
			for(int ik=0; ik<prodSup; ik++)
				if(whose_ik(ikRot[ik]) == mpiUtil->iProcess()) //MPI division by target k-point
				{	ColumnBundle Crot = C[ikRot[ik]].similar();
					Crot.zero();
					ColumnBundleTransform(qnums[ik].k, basis[ik], qnums[ikRot[ik]].k, *(basisWrapper[ikRot[ik]]),
						nSpinor, symUnit, +1).scatterAxpy(1., C[ik], Crot,0,1);
					matrix Urot = Crot ^ O(C[ikRot[ik]]); //will be unitary if Crot is a strict unitary rotation of C[ikRot[ik]]
					//Check maximal subspace that is unitary: (remainder must be incomplete degenerate subspace)
					int nBandsValid = nBands;
					while(nBandsValid && !isUnitary(Urot(0,nBandsValid, 0,nBandsValid)))
						nBandsValid--;
					if(nBandsValid<nBands)
					{	//Update energy range of validity:
						EmaxValid = std::min(EmaxValid, e.eVars.Hsub_eigs[kpoints[ik].iReduced][nBandsValid]);
						//Make valid subspace exactly unitary:
						matrix UrotSub = Urot(0,nBandsValid, 0,nBandsValid);
						matrix UrotOverlap = dagger(UrotSub) * UrotSub;
						UrotSub = UrotSub * invsqrt(UrotOverlap); //make exactly unitary
						unitarityErr += std::pow(nrm2(UrotOverlap - eye(nBandsValid)), 2);
						//Zero out invalid subspace:
						Urot.zero();
						Urot.set(0,nBandsValid, 0,nBandsValid, UrotSub);
					}
					stateRot[iSpin][iSym].set(ik, ikRot[ik], Urot);
				}
			//Combine the blocks set by each process:
			stateRot[iSpin][iSym].allReduce();
		}
		#undef whose_ik
		mpiUtil->allReduce(EmaxValid, MPIUtil::ReduceMin);
		if(nSpins>1) logPrintf("\tSpin %+d: ", iSpin ? +1 : -1);  else logPrintf("\t");
		logPrintf("Matrix elements valid for ");
		if(std::isfinite(EmaxValid)) logPrintf("E < %+.6lf (Emax = %+.6lf) due to incomplete degenerate subspaces.\n", EmaxValid, Emax);
		else logPrintf("all available states (all degenerate subspaces are complete).\n");
	}
	//Convert the accumulated squared errors into an RMS value over spins, k-points and symmetries:
	mpiUtil->allReduce(unitarityErr, MPIUtil::ReduceSum);
	unitarityErr = sqrt(unitarityErr / (nSpins * prodSup * symSupCart.size()));
	logPrintf("\tRMS unitarity error in valid subspaces: %le\n", unitarityErr);
}
예제 #6
0
inline bool isUnitary(const matrix& U) { return nrm2(U*dagger(U) - eye(U.nCols())) < symmThreshold; }
void SpeciesInfo::augmentDensitySpherical(const QuantumNumber& qnum, const diagMatrix& Fq, const matrix& VdagCq)
{	static StopWatch watch("augmentDensitySpherical"); watch.start(); 
	augmentDensity_COMMON_INIT
	int nProj = MnlAll.nRows();
	const GridInfo &gInfo = e->gInfo;
	complex* nAugData = nAug.data();
	
	//Loop over atoms:
	for(unsigned atom=0; atom<atpos.size(); atom++)
	{	//Get projections and calculate density matrix at this atom:
		matrix atomVdagC = VdagCq(atom*nProj,(atom+1)*nProj, 0,VdagCq.nCols());
		matrix RhoAll = atomVdagC * Fq * dagger(atomVdagC); //density matrix in projector basis on this atom
		if(isRelativistic()) RhoAll = fljAll * RhoAll * fljAll; //transformation for relativistic pseudopotential
		std::vector<matrix> Rho(e->eInfo.nDensities); //RhoAll split by spin(-density-matrix) components
		if(e->eInfo.isNoncollinear())
		{	matrix RhoUp = RhoAll(0,2,nProj, 0,2,nProj);
			matrix RhoDn = RhoAll(1,2,nProj, 1,2,nProj);
			if(Rho.size()==1)
				Rho[0] = RhoUp + RhoDn; //unpolarized noncollinear mode
			else
			{	matrix RhoUpDn = RhoAll(0,2,nProj, 1,2,nProj);
				matrix RhoDnUp = RhoAll(1,2,nProj, 0,2,nProj);
				Rho[0] = RhoUp;
				Rho[1] = RhoDn;
				Rho[2] = (RhoUpDn + RhoDnUp) * 0.5; //'real part' of UpDn
				Rho[3] = (RhoUpDn - RhoDnUp) * complex(0,-0.5); //'imaginary part' of UpDn
			}
		}
		else std::swap(Rho[qnum.index()], RhoAll); //in this case each qnum contributes to a specific spin component
		
		//Calculate spherical function contributions from density matrix:
		for(size_t s=0; s<Rho.size(); s++) if(Rho[s])
		{	int atomOffs = Nlm*(atom + s*atpos.size());
			//Triple loop over first projector:
			int i1 = 0;
			for(int l1=0; l1<int(VnlRadial.size()); l1++)
			for(int p1=0; p1<int(VnlRadial[l1].size()); p1++)
			for(int m1=-l1; m1<=l1; m1++)
			{	//Triple loop over second projector:
				int i2 = 0;
				for(int l2=0; l2<int(VnlRadial.size()); l2++)
				for(int p2=0; p2<int(VnlRadial[l2].size()); p2++)
				for(int m2=-l2; m2<=l2; m2++)
				{	if(i2<=i1) //rest handled by i1<->i2 symmetry
					{	std::vector<YlmProdTerm> terms = expandYlmProd(l1,m1, l2,m2);
						double prefac = qnum.weight * ((i1==i2 ? 1 : 2)/gInfo.detR)
									* (Rho[s].data()[Rho[s].index(i2,i1)] * cis(0.5*M_PI*(l2-l1))).real();
						for(const YlmProdTerm& term: terms)
						{	QijIndex qIndex = { l1, p1, l2, p2, term.l };
							auto Qijl = Qradial.find(qIndex);
							if(Qijl==Qradial.end()) continue; //no entry at this l
							nAugData[nAug.index(Qijl->first.index, atomOffs + term.l*(term.l+1) + term.m)] += term.coeff * prefac;
						}
					}
					i2++;
				}
				i1++;
			}
		}
	}
	watch.stop();
}
void ElectronScattering::dump(const Everything& everything)
{	Everything& e = (Everything&)everything; //may modify everything to save memory / optimize
	this->e = &everything;
	nBands = e.eInfo.nBands;
	nSpinor = e.eInfo.spinorLength();
	
	logPrintf("\n----- Electron-electron scattering Im(Sigma) -----\n"); logFlush();

	//Update default parameters:
	if(!eta)
	{	eta = e.eInfo.kT;
		if(!eta) die("eta must be specified explicitly since electronic temperature is zero.\n");
	}
	if(!Ecut) Ecut = e.cntrl.Ecut;
	double oMin = DBL_MAX, oMax = -DBL_MAX; //occupied energy range
	double uMin = DBL_MAX, uMax = -DBL_MAX; //unoccupied energy range
	for(int q=e.eInfo.qStart; q<e.eInfo.qStop; q++)
		for(int b=0; b<nBands; b++)
		{	double E = e.eVars.Hsub_eigs[q][b];
			double f = e.eVars.F[q][b];
			if(f > fCut) //sufficiently occupied
			{	oMin = std::min(oMin, E);
				oMax = std::max(oMax, E);
			}
			if(f < 1.-fCut) //sufficiently unoccupied
			{	uMin = std::min(uMin, E);
				uMax = std::max(uMax, E);
			}
		}
	mpiUtil->allReduce(oMin, MPIUtil::ReduceMin);
	mpiUtil->allReduce(oMax, MPIUtil::ReduceMax);
	mpiUtil->allReduce(uMin, MPIUtil::ReduceMin);
	mpiUtil->allReduce(uMax, MPIUtil::ReduceMax);
	if(!omegaMax) omegaMax = std::max(uMax-uMin, oMax-oMin);
	Emin = uMin - omegaMax;
	Emax = oMax + omegaMax;
	//--- print selected values after fixing defaults:
	logPrintf("Frequency resolution:    %lg\n", eta);
	logPrintf("Dielectric matrix Ecut:  %lg\n", Ecut);
	logPrintf("Maximum energy transfer: %lg\n", omegaMax);
	
	//Initialize frequency grid:
	diagMatrix omegaGrid, wOmega;
	omegaGrid.push_back(0.);
	wOmega.push_back(0.5*eta); //integration weight (halved at endpoint)
	while(omegaGrid.back()<omegaMax + 10*eta) //add margin for covering enough of the Lorentzians
	{	omegaGrid.push_back(omegaGrid.back() + eta);
		wOmega.push_back(eta);
	}
	int iOmegaStart, iOmegaStop; //split dielectric computation over frequency grid
	TaskDivision omegaDiv(omegaGrid.size(), mpiUtil);
	omegaDiv.myRange(iOmegaStart, iOmegaStop);
	logPrintf("Initialized frequency grid with resolution %lg and %d points.\n", eta, omegaGrid.nRows());

	//Make necessary quantities available on all processes:
	C.resize(e.eInfo.nStates);
	E.resize(e.eInfo.nStates);
	F.resize(e.eInfo.nStates);
	for(int q=0; q<e.eInfo.nStates; q++)
	{	int procSrc = e.eInfo.whose(q);
		if(procSrc == mpiUtil->iProcess())
		{	std::swap(C[q], e.eVars.C[q]);
			std::swap(E[q], e.eVars.Hsub_eigs[q]);
			std::swap(F[q], e.eVars.F[q]);
		}
		else
		{	C[q].init(nBands, e.basis[q].nbasis * nSpinor, &e.basis[q], &e.eInfo.qnums[q]);
			E[q].resize(nBands);
			F[q].resize(nBands);
		}
		C[q].bcast(procSrc);
		E[q].bcast(procSrc);
		F[q].bcast(procSrc);
	}
	
	//Randomize supercell to improve load balancing on k-mesh:
	{	std::vector< vector3<> >& kmesh = e.coulombParams.supercell->kmesh;
		std::vector<Supercell::KmeshTransform>& kmeshTransform = e.coulombParams.supercell->kmeshTransform;
		for(size_t ik=0; ik<kmesh.size()-1; ik++)
		{	size_t jk = ik + floor(Random::uniform(kmesh.size()-ik));
			mpiUtil->bcast(jk);
			if(jk !=ik && jk < kmesh.size())
			{	std::swap(kmesh[ik], kmesh[jk]);
				std::swap(kmeshTransform[ik], kmeshTransform[jk]);
			}
		}
	}
	
	//Report maximum nearest-neighbour eigenvalue change (to guide choice of eta)
	supercell = e.coulombParams.supercell;
	matrix3<> kBasisT = inv(supercell->Rsuper) * e.gInfo.R;
	vector3<> kBasis[3]; for(int j=0; j<3; j++) kBasis[j] = kBasisT.row(j);
	plook = std::make_shared< PeriodicLookup< vector3<> > >(supercell->kmesh, e.gInfo.GGT);
	size_t ikStart, ikStop;
	TaskDivision(supercell->kmesh.size(), mpiUtil).myRange(ikStart, ikStop);
	double dEmax = 0.;
	for(size_t ik=ikStart; ik<ikStop; ik++)
	{	const diagMatrix& Ei = E[supercell->kmeshTransform[ik].iReduced];
		for(int j=0; j<3; j++)
		{	size_t jk = plook->find(supercell->kmesh[ik] + kBasis[j]);
			myassert(jk != string::npos);
			const diagMatrix& Ej = E[supercell->kmeshTransform[jk].iReduced];
			for(int b=0; b<nBands; b++)
				if(Emin <= Ei[b] && Ei[b] <= Emax)
					dEmax = std::max(dEmax, fabs(Ej[b]-Ei[b]));
		}
	}
	mpiUtil->allReduce(dEmax, MPIUtil::ReduceMax);
	logPrintf("Maximum k-neighbour dE: %lg (guide for selecting eta)\n", dEmax);
	
	//Initialize reduced q-Mesh:
	//--- q-mesh is a k-point dfference mesh, which could differ from k-mesh for off-Gamma meshes
	qmesh.resize(supercell->kmesh.size());
	for(size_t iq=0; iq<qmesh.size(); iq++)
	{	qmesh[iq].k = supercell->kmesh[iq] - supercell->kmesh[0]; //k-difference
		qmesh[iq].weight = 1./qmesh.size(); //uniform mesh
		qmesh[iq].spin = 0;
	}
	logPrintf("Symmetries reduced momentum transfers (q-mesh) from %d to ", int(qmesh.size()));
	qmesh = e.symm.reduceKmesh(qmesh);
	logPrintf("%d entries\n", int(qmesh.size())); logFlush();
	
	//Initialize polarizability/dielectric bases corresponding to qmesh:
	logPrintf("Setting up reduced polarizability bases at Ecut = %lg: ", Ecut); logFlush();
	basisChi.resize(qmesh.size());
	double avg_nbasis = 0.;
	const GridInfo& gInfoBasis = e.gInfoWfns ? *e.gInfoWfns : e.gInfo;
	logSuspend();
	for(size_t iq=0; iq<qmesh.size(); iq++)
	{	basisChi[iq].setup(gInfoBasis, e.iInfo, Ecut, qmesh[iq].k);
		avg_nbasis += qmesh[iq].weight * basisChi[iq].nbasis;
	}
	logResume();
	logPrintf("nbasis = %.2lf average, %.2lf ideal\n", avg_nbasis, pow(sqrt(2*Ecut),3)*(e.gInfo.detR/(6*M_PI*M_PI)));
	logFlush();


	//Initialize common wavefunction basis and ColumnBundle transforms for full k-mesh:
	logPrintf("Setting up k-mesh wavefunction transforms ... "); logFlush();
	double kMaxSq = 0;
	for(const vector3<>& k: supercell->kmesh)
	{	kMaxSq = std::max(kMaxSq, e.gInfo.GGT.metric_length_squared(k));
		for(const QuantumNumber& qnum: qmesh)
			kMaxSq = std::max(kMaxSq, e.gInfo.GGT.metric_length_squared(k + qnum.k));
	}
	double kWeight = double(e.eInfo.spinWeight) / supercell->kmesh.size();
	double GmaxEff = sqrt(2.*e.cntrl.Ecut) + sqrt(kMaxSq);
	double EcutEff = 0.5*GmaxEff*GmaxEff * (1.+symmThreshold); //add some margin for round-off error safety
	logSuspend();
	basis.setup(e.gInfo, e.iInfo, EcutEff, vector3<>());
	logResume();
	ColumnBundleTransform::BasisWrapper basisWrapper(basis);
	std::vector<matrix3<int>> sym = e.symm.getMatrices();
	for(size_t ik=ikStart; ik<ikStop; ik++)
	{	const vector3<>& k = supercell->kmesh[ik];
		for(const QuantumNumber& qnum: qmesh)
		{	vector3<> k2 = k + qnum.k; double roundErr;
			vector3<int> k2sup = round((k2 - supercell->kmesh[0]) * supercell->super, &roundErr);
			myassert(roundErr < symmThreshold);
			auto iter = transform.find(k2sup);
			if(iter == transform.end())
			{	size_t ik2 = plook->find(k2); myassert(ik2 != string::npos);
				const Supercell::KmeshTransform& kTransform = supercell->kmeshTransform[ik2];
				const Basis& basisC = e.basis[kTransform.iReduced];
				const vector3<>& kC = e.eInfo.qnums[kTransform.iReduced].k;
				transform[k2sup] = std::make_shared<ColumnBundleTransform>(kC, basisC, k2, basisWrapper,
					nSpinor, sym[kTransform.iSym], kTransform.invert);
				//Initialize corresponding quantum number:
				QuantumNumber qnum;
				qnum.k = k2;
				qnum.spin = 0;
				qnum.weight = kWeight;
				qnumMesh[k2sup] = qnum;
			}
		}
	}
	logPrintf("done.\n"); logFlush();

	//Main loop over momentum transfers:
	diagMatrix ImKscrHead(omegaGrid.size(), 0.);
	std::vector<diagMatrix> ImSigma(e.eInfo.nStates, diagMatrix(nBands,0.));
	diagMatrix cedaNum(nBands, 0.), cedaDen(nBands, 0.);
	for(size_t iq=0; iq<qmesh.size(); iq++)
	{	logPrintf("\nMomentum transfer %d of %d: q = ", int(iq+1), int(qmesh.size()));
		qmesh[iq].k.print(globalLog, " %+.5lf ");
		int nbasis = basisChi[iq].nbasis;
		
		//Construct Coulomb operator (regularizes G=0 using the tricks developed for EXX):
		matrix invKq = inv(coulombMatrix(iq));
		
		//Calculate chi_KS:
		std::vector<matrix> chiKS(omegaGrid.nRows()); CEDA ceda(nBands, nbasis);
		logPrintf("\tComputing chi_KS ...  "); logFlush(); 
		size_t nkMine = ikStop-ikStart;
		int ikInterval = std::max(1, int(round(nkMine/20.))); //interval for reporting progress
		for(size_t ik=ikStart; ik<ikStop; ik++)
		{	//Report progress:
			size_t ikDone = ik-ikStart+1;
			if(ikDone % ikInterval == 0)
			{	logPrintf("%d%% ", int(round(ikDone*100./nkMine)));
				logFlush();
			}
			//Get events:
			size_t jk; matrix nij;
			std::vector<Event> events = getEvents(true, ik, iq, jk, nij, &ceda);
			if(!events.size()) continue;
			//Collect contributions for each frequency:
			for(int iOmega=0; iOmega<omegaGrid.nRows(); iOmega++)
			{	double omega = omegaGrid[iOmega];
				complex omegaTilde(omega, 2*eta);
				complex one(1,0);
				std::vector<complex> Xks; Xks.reserve(events.size());
				for(const Event& event: events)
					Xks.push_back(-e.gInfo.detR * kWeight * event.fWeight
						* (one/(event.Eji - omegaTilde) + one/(event.Eji + omegaTilde)) );
				chiKS[iOmega] += (nij * Xks) * dagger(nij);
			}
		}
		for(int iOmega=0; iOmega<omegaGrid.nRows(); iOmega++)
			chiKS[iOmega].allReduce(MPIUtil::ReduceSum);
		logPrintf("done.\n"); logFlush();
		diagMatrix chiKS0diag = diag(chiKS[0]); //static neglecting local-fields (for CEDA)
		
		//Figure out head entry index:
		int iHead = -1;
		for(int n=0; n<nbasis; n++)
			if(!basisChi[iq].iGarr[n].length_squared())
			{	iHead = n;
				break;
			}
		myassert(iHead >= 0);
		
		//Calculate Im(screened Coulomb operator):
		logPrintf("\tComputing Im(Kscreened) ... "); logFlush();
		std::vector<matrix> ImKscr(omegaGrid.nRows(), zeroes(nbasis, nbasis));
		for(int iOmega=iOmegaStart; iOmega<iOmegaStop; iOmega++)
		{	ImKscr[iOmega] = imag(inv(invKq - chiKS[iOmega]));
			chiKS[iOmega] = 0; //free to save memory
			ImKscrHead[iOmega] += qmesh[iq].weight * ImKscr[iOmega](iHead,iHead).real(); //accumulate head of ImKscr
		}
		for(int iOmega=0; iOmega<omegaGrid.nRows(); iOmega++)
			ImKscr[iOmega].bcast(omegaDiv.whose(iOmega));
		chiKS.clear();
		logPrintf("done.\n"); logFlush();
		
		//Collect CEDA contributions:
		ceda.collect(*this, iq, chiKS0diag, cedaNum, cedaDen);
		
		//Calculate ImSigma contributions:
		logPrintf("\tComputing ImSigma ... "); logFlush(); 
		for(size_t ik=ikStart; ik<ikStop; ik++)
		{	//Report progress:
			size_t ikDone = ik-ikStart+1;
			if(ikDone % ikInterval == 0)
			{	logPrintf("%d%% ", int(round(ikDone*100./nkMine)));
				logFlush();
			}
			//Get events:
			size_t jk; matrix nij;
			std::vector<Event> events = getEvents(false, ik, iq, jk, nij);
			if(!events.size()) continue;
			//Integrate over frequency for event contributions to linewidth:
			diagMatrix eventContrib(events.size(), 0);
			for(int iOmega=0; iOmega<omegaGrid.nRows(); iOmega++)
			{	//Construct energy conserving delta-function:
				double omega = omegaGrid[iOmega];
				complex omegaTilde(omega, 2*eta);
				diagMatrix delta; delta.reserve(events.size());
				for(const Event& event: events)
					delta.push_back(e.gInfo.detR * event.fWeight //overlap and sign for electron / hole
						* (2*eta/M_PI) * ( 1./(event.Eji - omegaTilde).norm() - 1./(event.Eji + omegaTilde).norm()) ); //Normalized Lorentzians
				eventContrib += wOmega[iOmega] * delta * diag(dagger(nij) * ImKscr[iOmega] * nij);
			}
			//Accumulate contributions to linewidth:
			int iReduced = supercell->kmeshTransform[ik].iReduced; //directly collect to reduced k-point
			double symFactor = e.eInfo.spinWeight / (supercell->kmesh.size() * e.eInfo.qnums[iReduced].weight); //symmetrization factor = 1 / |orbit of iReduced|
			double qWeight = qmesh[iq].weight;
			for(size_t iEvent=0; iEvent<events.size(); iEvent++)
			{	const Event& event = events[iEvent];
				ImSigma[iReduced][event.i] += symFactor * qWeight * eventContrib[iEvent];
			}
		}
		logPrintf("done.\n"); logFlush();
	}
	logPrintf("\n");
	
	ImKscrHead.allReduce(MPIUtil::ReduceSum);
	for(diagMatrix& IS: ImSigma)
		IS.allReduce(MPIUtil::ReduceSum);
	for(int q=0; q<e.eInfo.nStates; q++)
		for(int b=0; b<nBands; b++)
		{	double Eqb = E[q][b];
			if(Eqb<Emin || Eqb>Emax)
				ImSigma[q][b] = NAN; //clearly mark as invalid
		}
	
	string fname = e.dump.getFilename("ImSigma_ee");
	logPrintf("Dumping %s ... ", fname.c_str()); logFlush();
	e.eInfo.write(ImSigma, fname.c_str());
	logPrintf("done.\n");

	fname = e.dump.getFilename("ImKscrHead");
	logPrintf("Dumping %s ... ", fname.c_str()); logFlush();
	if(mpiUtil->isHead())
	{	FILE* fp = fopen(fname.c_str(), "w");
		for(int iOmega=0; iOmega<omegaGrid.nRows(); iOmega++)
			fprintf(fp, "%lf %le\n", omegaGrid[iOmega], ImKscrHead[iOmega]);
		fclose(fp);
	}
	logPrintf("done.\n");

	fname = e.dump.getFilename("CEDA");
	logPrintf("Dumping %s ... ", fname.c_str()); logFlush();
	if(mpiUtil->isHead())
	{	FILE* fp = fopen(fname.c_str(), "w");
		if(!fp) die("Could not open '%s' for writing.\n", fname.c_str());
		(cedaNum * inv(cedaDen)).print(fp, "%19.12le\n");
		fclose(fp);
	}
	logPrintf("done.\n");

	logPrintf("\n"); logFlush();
}
std::vector<ElectronScattering::Event> ElectronScattering::getEvents(bool chiMode, size_t ik, size_t iq, size_t& jk, matrix& nij, ElectronScattering::CEDA* ceda) const
{	static StopWatch watchI("ElectronScattering::getEventsI"), watchJ("ElectronScattering::getEventsJ"), watchCEDA("ElectronScattering::CEDA");
	//Find target k-point:
	const vector3<>& ki = supercell->kmesh[ik];
	const vector3<> kj = ki + qmesh[iq].k;
	jk = plook->find(kj);
	myassert(jk != string::npos);
	
	//Compile list of events:
	int iReduced = supercell->kmeshTransform[ik].iReduced;
	int jReduced = supercell->kmeshTransform[jk].iReduced;
	const diagMatrix &Ei = E[iReduced], &Fi = F[iReduced];
	const diagMatrix &Ej = E[jReduced], &Fj = F[jReduced];
	std::vector<Event> events, eventsCEDA; events.reserve((nBands*nBands)/2);
	std::vector<bool> iUsed(nBands,false), jUsed(nBands,false); //sets of i and j actually referenced
	Event event;
	for(event.i=0; event.i<nBands; event.i++)
	for(event.j=0; event.j<nBands; event.j++)
	{	event.fWeight = chiMode ? 0.5*(Fi[event.i] - Fj[event.j]) : (1. - Fi[event.i] - Fj[event.j]);
		double Eii = Ei[event.i];
		double Ejj = Ej[event.j];
		event.Eji = Ejj - Eii;
		if(!chiMode)
		{	if(Eii<Emin || Eii>Emax) event.fWeight = 0.; //state out of relevant range
			if(event.fWeight * (Eii-Ejj) <= 0) event.fWeight = 0; //wrong sign for energy transfer
		}
		bool needEvent = (fabs(event.fWeight) > fCut);
		bool needCEDA = ceda && ((Fi[event.i]>fCut) || (Fj[event.j]>fCut)); //additionally need occupied-occupied combinations for CEDA
		if(needEvent || needCEDA)
		{	(needEvent ? events : eventsCEDA).push_back(event);
			iUsed[event.i] = true;
			jUsed[event.j] = true;
		}
	}
	if(!events.size()) return events;
	std::vector<Event> eventsAll = events;
	eventsAll.insert(eventsAll.end(), eventsCEDA.begin(), eventsCEDA.end());
	
	//Get wavefunctions in real space:
	ColumnBundle Ci = getWfns(ik, ki), Cj = getWfns(jk, kj);
	std::vector< std::vector<complexScalarField> > conjICi(nBands), ICj(nBands);
	watchI.start();
	for(int i=0; i<nBands; i++) if(iUsed[i])
	{	conjICi[i].resize(nSpinor);
		for(int s=0; s<nSpinor; s++)
			conjICi[i][s] = conj(I(Ci.getColumn(i,s))); 
	}
	for(int j=0; j<nBands; j++) if(jUsed[j])
	{	ICj[j].resize(nSpinor);
		for(int s=0; s<nSpinor; s++)
			ICj[j][s] = I(Cj.getColumn(j,s));
	}
	watchI.stop();
	
	//Initialize pair densities:
	watchJ.start();
	const Basis& basis_q = basisChi[iq];
	int nbasis = basis_q.nbasis;
	nij = zeroes(nbasis, eventsAll.size());
	complex* nijData = nij.dataPref();
	for(const Event& event: eventsAll)
	{	complexScalarField Inij;
		for(int s=0; s<nSpinor; s++)
			Inij += conjICi[event.i][s] * ICj[event.j][s];
		callPref(eblas_gather_zdaxpy)(nbasis, 1., basis_q.indexPref, J(Inij)->dataPref(), nijData);
		nijData += nbasis;
	}
	watchJ.stop();
	
	//CEDA plasma-frequency sum rule contributions:
	if(ceda)
	{	myassert(chiMode);
		watchCEDA.start();
		//Single loop quantities:
		for(int i=0; i<nBands; i++)
		{	ceda->Fsum[i] += Fi[i];
			ceda->FEsum[i] += Fi[i] * Ei[i];
		}
		//Double loop quantities:
		const complex* nijData = nij.data();
		for(const Event& event: eventsAll)
		{	//Compute elementwise nij^2:
			diagMatrix nijSq(nbasis, 0.);
			eblas_accumNorm(nbasis, 1., nijData, nijSq.data());
			nijData += nbasis;
			//Accumulate to appropriate entries of oNum and oDen:
			double numWeight = 0.5*(Fi[event.i]*Ej[event.j] + Fj[event.j]*Ei[event.i]);
			double denWeight = 0.5*(Fi[event.i] + Fj[event.j]);
			int ijMax = std::max(event.i,event.j);
			ceda->oNum[ijMax] += numWeight * nijSq;
			ceda->oDen[ijMax] += denWeight * nijSq;
		}
		//Nonlocal corrections:
		//--- get DFT+U matrices:
		std::vector<matrix> Urho;
		std::vector<ColumnBundle> psi;
		if(e->eInfo.hasU)
			e->iInfo.rhoAtom_getV(Cj, e->eVars.U_rhoAtom, psi, Urho); //get atomic orbitals at kj
		for(size_t sp=0; sp<e->iInfo.species.size(); sp++)
		{	//get nonlocal psp matrices and projectors:
			matrix Mnl;
			std::shared_ptr<ColumnBundle> V = e->iInfo.species[sp]->getV(Cj, &Mnl); //get projectors at kj
			bool hasNL = Mnl.nRows();
			bool hasU = e->eInfo.hasU && Urho[sp].nRows();
			if(!(hasNL || hasU)) continue;
			//Put projectors and orbitals in real space:
			std::vector<complexScalarField> IV;
			std::vector< std::vector<complexScalarField> > Ipsi;
			diagMatrix diagNL, diagU; //(q+G)-diagonal contributions
			if(hasNL)
			{	myassert(Mnl.nRows() == V->nCols()*nSpinor);
				IV.resize(V->nCols());
				for(int v=0; v<V->nCols(); v++)
					IV[v] = I(V->getColumn(v,0)); //NL projectors are always non-spinorial
				matrix CjDagV = Cj ^ (*V);
				diagNL = diag(CjDagV * Mnl * dagger(CjDagV));
			}
			if(hasU)
			{	myassert(Urho[sp].nRows() == psi[sp].nCols());
				Ipsi.resize(psi[sp].nCols());
				for(int n=0; n<psi[sp].nCols(); n++)
				{	Ipsi[n].resize(nSpinor);
					for(int s=0; s<nSpinor; s++)
						Ipsi[n][s] = I(psi[sp].getColumn(n,s)); //atomic orbitals will be spinorial in noncollinear modes
				}
				matrix CjDagPsi = Cj ^ psi[sp];
				diagU = diag(CjDagPsi * Urho[sp] * dagger(CjDagPsi));
			}
			//Diagonal terms (computed at j, since those projectors have been retrieved):
			for(int j=0; j<nBands; j++) if(Fj[j] > fCut)
			{	double diag_j = (hasNL ? diagNL[j] : 0.) + (hasU ? diagU[j] : 0.);
				ceda->FNLsum[j] -= (Fj[j] * diag_j) * eye(nbasis);
			}
			//Off-diagonal terms (coupling i and j):
			for(int i=0; i<nBands; i++) if(Fi[i] > fCut)
			{	myassert(iUsed[i]);
				if(hasNL)
				{	//Put pair densities with projectors in reciprocal space:
					matrix niV = zeroes(nbasis, Mnl.nRows()); //anologous to nij above, but with V instead
					complex* niVdata = niV.dataPref();
					for(int v=0; v<V->nCols(); v++)
						for(int s=0; s<nSpinor; s++)
						{	callPref(eblas_gather_zdaxpy)(nbasis, 1., basis_q.indexPref, J(conjICi[i][s] * IV[v])->dataPref(), niVdata);
							niVdata += nbasis;
						}
					//Accumulate correction:
					ceda->FNLsum[i] += Fi[i] * diagouter(niV * Mnl, niV);
				}
				if(hasU)
				{	//Put pair densities with orbitals in reciprocal space:
					matrix niPsi = zeroes(nbasis, Urho[sp].nRows()); //anologous to nij above, but with psi instead
					complex* niPsiData = niPsi.dataPref();
					for(int n=0; n<psi[sp].nCols(); n++)
					{	complexScalarField IniPsi;
						for(int s=0; s<nSpinor; s++)
							IniPsi += conjICi[i][s] * Ipsi[n][s];
						callPref(eblas_gather_zdaxpy)(nbasis, 1., basis_q.indexPref, J(IniPsi)->dataPref(), niPsiData);
						niPsiData += nbasis;
					}
					//Accumulate correction:
					ceda->FNLsum[i] += Fi[i] * diagouter(niPsi * Urho[sp], niPsi);
				}
			}
		}
		watchCEDA.stop();
	}
	
	//Trim extra columns in matrix (which were needed only for CEDA):
	if(eventsCEDA.size())
		nij = nij(0,nij.nRows(), 0,events.size());
	
	return events;
}