/**
 * Evaluates the Householder matrix H = I - coeff * w * wT for the input x.
 *
 * w is x divided by u0 = x(0) - normX with its first element forced to 1,
 * and coeff = -u0 / normX. When normX^2 - x(0)^2 is not larger than
 * DataTypeUtils::min<T>(), w is zeroed (apart from w(0) = 1) and coeff is 0,
 * so the returned matrix is the identity.
 *
 * @param x  input array; must report isVector() or isScalar()
 * @return   square array of side x.lengthOf()
 * @throws   a string literal (const char*) when x is neither vector nor scalar
 */
NDArray<T> Householder<T>::evalHHmatrix(const NDArray<T>& x) {

    // reject anything that is neither a vector nor a scalar
    if(!(x.isVector() || x.isScalar()))
        throw "ops::helpers::Householder::evalHHmatrix method: input array must be vector or scalar!";

    const int len = (int)x.lengthOf();

    NDArray<T> w(len, 1, x.ordering(), x.getWorkspace());     // column-vector
    NDArray<T> wT(1, len, x.ordering(), x.getWorkspace());    // row-vector (transposed w)

    T coeff;
    T normX = x.template reduceNumber<simdOps::Norm2<T>>();
    const T min = DataTypeUtils::min<T>();

    const bool tailNegligible = (normX*normX - x(0)*x(0) <= min);

    if(!tailNegligible) {

        // pick the sign opposite to x(0) to lessen roundoff error
        if(x(0) >= (T)0.)
            normX = -normX;

        T u0 = x(0) - normX;
        coeff = -u0 / normX;
        w.assign(x / u0);
    }
    else {

        // nothing to reflect apart from the leading element
        normX = x(0);
        coeff = (T)0.;
        w = (T)0.;
    }

    w(0) = (T)1.;
    wT.assign(&w);

    NDArray<T> identity(len, len, x.ordering(), x.getWorkspace());
    identity.setIdentity();

    return identity - mmul(w, wT) * coeff;
}
// Solve for psi with a preconditioned bi-conjugate gradient iteration in
// which the scalar products and the alpha/beta coefficients are formed
// per component (gSumCmptProd / cmptDivide / cmptMultiply).
//
// psi is used as the initial guess and is updated in place.
// Returns the SolverPerformance record holding the initial/final normalised
// residuals and the iteration count.
//
// Iteration stops when checkConvergence(tolerance_, relTol_) succeeds, when
// nIterations() reaches maxIter_, or when checkSingularity() reports that
// wApT/normFactor is singular (in which case nIterations() is not
// incremented for that pass).
Foam::SolverPerformance<Type>
Foam::PBiCICG<Type, DType, LUType>::solve(Field<Type>& psi) const
{
    word preconditionerName(this->controlDict_.lookup("preconditioner"));

    // --- Setup class containing solver performance data
    SolverPerformance<Type> solverPerf
    (
        preconditionerName + typeName,
        this->fieldName_
    );

    // Note: the 'register' storage-class specifier was dropped here and on
    // the loop counters below; it is deprecated since C++11 and ill-formed
    // in C++17, and never affected the generated results.
    label nCells = psi.size();

    Type* __restrict__ psiPtr = psi.begin();

    Field<Type> pA(nCells);
    Type* __restrict__ pAPtr = pA.begin();

    Field<Type> pT(nCells, pTraits<Type>::zero);
    Type* __restrict__ pTPtr = pT.begin();

    Field<Type> wA(nCells);
    Type* __restrict__ wAPtr = wA.begin();

    Field<Type> wT(nCells);
    Type* __restrict__ wTPtr = wT.begin();

    Type wArT = solverPerf.great_*pTraits<Type>::one;
    Type wArTold = wArT;

    // --- Calculate A.psi and T.psi
    this->matrix_.Amul(wA, psi);
    this->matrix_.Tmul(wT, psi);

    // --- Calculate initial residual and transpose residual fields
    Field<Type> rA(this->matrix_.source() - wA);
    Field<Type> rT(this->matrix_.source() - wT);
    Type* __restrict__ rAPtr = rA.begin();
    Type* __restrict__ rTPtr = rT.begin();

    // --- Calculate normalisation factor
    Type normFactor = this->normFactor(psi, wA, pA);

    if (LduMatrix<Type, DType, LUType>::debug >= 2)
    {
        Info<< " Normalisation factor = " << normFactor << endl;
    }

    // --- Calculate normalised residual norm
    solverPerf.initialResidual() = cmptDivide(gSumCmptMag(rA), normFactor);
    solverPerf.finalResidual() = solverPerf.initialResidual();

    // --- Check convergence, solve if not converged
    if (!solverPerf.checkConvergence(this->tolerance_, this->relTol_))
    {
        // --- Select and construct the preconditioner
        autoPtr<typename LduMatrix<Type, DType, LUType>::preconditioner>
        preconPtr = LduMatrix<Type, DType, LUType>::preconditioner::New
        (
            *this,
            this->controlDict_
        );

        // --- Solver iteration
        do
        {
            // --- Store previous wArT
            wArTold = wArT;

            // --- Precondition residuals
            preconPtr->precondition(wA, rA);
            preconPtr->preconditionT(wT, rT);

            // --- Update search directions:
            wArT = gSumCmptProd(wA, rT);

            if (solverPerf.nIterations() == 0)
            {
                // First pass: pA is still uninitialised, so the search
                // directions are simply the preconditioned residuals
                for (label cell=0; cell<nCells; cell++)
                {
                    pAPtr[cell] = wAPtr[cell];
                    pTPtr[cell] = wTPtr[cell];
                }
            }
            else
            {
                // stabilise() keeps the denominator away from zero
                Type beta = cmptDivide
                (
                    wArT,
                    stabilise(wArTold, solverPerf.vsmall_)
                );

                for (label cell=0; cell<nCells; cell++)
                {
                    pAPtr[cell] = wAPtr[cell] + cmptMultiply(beta, pAPtr[cell]);
                    pTPtr[cell] = wTPtr[cell] + cmptMultiply(beta, pTPtr[cell]);
                }
            }

            // --- Update preconditioned residuals
            this->matrix_.Amul(wA, pA);
            this->matrix_.Tmul(wT, pT);

            Type wApT = gSumCmptProd(wA, pT);

            // --- Test for singularity
            if
            (
                solverPerf.checkSingularity
                (
                    cmptDivide(cmptMag(wApT), normFactor)
                )
            )
            {
                break;
            }

            // --- Update solution and residual:
            Type alpha = cmptDivide
            (
                wArT,
                stabilise(wApT, solverPerf.vsmall_)
            );

            for (label cell=0; cell<nCells; cell++)
            {
                psiPtr[cell] += cmptMultiply(alpha, pAPtr[cell]);
                rAPtr[cell] -= cmptMultiply(alpha, wAPtr[cell]);
                rTPtr[cell] -= cmptMultiply(alpha, wTPtr[cell]);
            }

            solverPerf.finalResidual() =
                cmptDivide(gSumCmptMag(rA), normFactor);

        } while
        (
            // The post-increment counts the pass that has just finished
            solverPerf.nIterations()++ < this->maxIter_
         && !(solverPerf.checkConvergence(this->tolerance_, this->relTol_))
        );
    }

    return solverPerf;
}
// Solve the scalar system for x with right-hand side b using the
// preconditioned bi-conjugate gradient iteration.
//
// x     initial guess on entry, updated in place
// b     source (right-hand side)
// cmpt  component index forwarded to Amul/Tmul, the preconditioner and
//       normFactor
//
// Returns the lduSolverPerformance record with the initial/final normalised
// residuals and the iteration count. Iteration ends when stop(solverPerf)
// returns true or when checkSingularity() flags wApT/normFactor (in which
// case nIterations() is not incremented for that pass).
Foam::lduSolverPerformance Foam::PBiCG::solve
(
    scalarField& x,
    const scalarField& b,
    const direction cmpt
) const
{
    // --- Setup class containing solver performance data
    lduSolverPerformance solverPerf
    (
        lduMatrix::preconditioner::getName(dict()) + typeName,
        fieldName()
    );

    // Note: the 'register' storage-class specifier was dropped here and on
    // the loop counters below; it is deprecated since C++11 and ill-formed
    // in C++17, and never affected the generated results.
    label nCells = x.size();

    scalar* __restrict__ xPtr = x.begin();

    scalarField pA(nCells);
    scalar* __restrict__ pAPtr = pA.begin();

    scalarField pT(nCells, 0.0);
    scalar* __restrict__ pTPtr = pT.begin();

    scalarField wA(nCells);
    scalar* __restrict__ wAPtr = wA.begin();

    scalarField wT(nCells);
    scalar* __restrict__ wTPtr = wT.begin();

    scalar wArT = matrix_.great_;
    scalar wArTold = wArT;

    // Calculate A.x and T.x
    matrix_.Amul(wA, x, coupleBouCoeffs_, interfaces_, cmpt);
    matrix_.Tmul(wT, x, coupleIntCoeffs_, interfaces_, cmpt);

    // Calculate initial residual and transpose residual fields
    scalarField rA(b - wA);
    scalarField rT(b - wT);
    scalar* __restrict__ rAPtr = rA.begin();
    scalar* __restrict__ rTPtr = rT.begin();

    // Calculate normalisation factor
    scalar normFactor = this->normFactor(x, b, wA, pA, cmpt);

    if (lduMatrix::debug >= 2)
    {
        Info<< " Normalisation factor = " << normFactor << endl;
    }

    // Calculate normalised residual norm
    solverPerf.initialResidual() = gSumMag(rA)/normFactor;
    solverPerf.finalResidual() = solverPerf.initialResidual();

    // Check convergence, solve if not converged
    if (!stop(solverPerf))
    {
        // Select and construct the preconditioner
        autoPtr<lduPreconditioner> preconPtr;

        preconPtr =
            lduPreconditioner::New
            (
                matrix_,
                coupleBouCoeffs_,
                coupleIntCoeffs_,
                interfaces_,
                dict()
            );

        // Solver iteration
        do
        {
            // Store previous wArT
            wArTold = wArT;

            // Precondition residuals
            preconPtr->precondition(wA, rA, cmpt);
            preconPtr->preconditionT(wT, rT, cmpt);

            // Update search directions:
            wArT = gSumProd(wA, rT);

            if (solverPerf.nIterations() == 0)
            {
                // First pass: pA is still uninitialised, so the search
                // directions are simply the preconditioned residuals
                for (label cell=0; cell<nCells; cell++)
                {
                    pAPtr[cell] = wAPtr[cell];
                    pTPtr[cell] = wTPtr[cell];
                }
            }
            else
            {
                scalar beta = wArT/wArTold;

                for (label cell=0; cell<nCells; cell++)
                {
                    pAPtr[cell] = wAPtr[cell] + beta*pAPtr[cell];
                    pTPtr[cell] = wTPtr[cell] + beta*pTPtr[cell];
                }
            }

            // Update preconditioned residuals
            matrix_.Amul(wA, pA, coupleBouCoeffs_, interfaces_, cmpt);
            matrix_.Tmul(wT, pT, coupleIntCoeffs_, interfaces_, cmpt);

            scalar wApT = gSumProd(wA, pT);

            // Test for singularity (guards the division by wApT below)
            if (solverPerf.checkSingularity(mag(wApT)/normFactor)) break;

            // Update solution and residual:
            scalar alpha = wArT/wApT;

            for (label cell=0; cell<nCells; cell++)
            {
                xPtr[cell] += alpha*pAPtr[cell];
                rAPtr[cell] -= alpha*wAPtr[cell];
                rTPtr[cell] -= alpha*wTPtr[cell];
            }

            solverPerf.finalResidual() = gSumMag(rA)/normFactor;
            solverPerf.nIterations()++;
        } while (!stop(solverPerf));
    }

    return solverPerf;
}
/**
 Trains the RBM weights and biases on the rows of data using mini-batch updates with momentum.

 When useScaling is true the input matrix is rescaled IN PLACE to [0 1] using the per-column
 ranges returned by data.getRanges(). When randomizeWeightsForTraining is false the existing
 weights matrix and bias vectors are reused and must already match numHiddenUnits and the
 number of columns in data.

 Training stops after maxNumEpochs epochs, or earlier once the change in the average
 reconstruction error has stayed below minChange for minNumEpochs consecutive epochs.

 @param data: the training data, one sample per row (modified in place when useScaling is true)
 @return true if training completed; false if batchSize or batchStepSize is zero, or if the
 existing weights/bias sizes do not match the model dimensions
 */
bool BernoulliRBM::train_(MatrixFloat &data){

    //A zero batch size would divide by zero when computing the number of batches, and a zero
    //batch step size would stop the batch loop from ever advancing
    if( batchSize < 1 ){
        errorLog << "train_(MatrixFloat &data) - The batch size must be greater than zero!" << std::endl;
        return false;
    }
    if( batchStepSize < 1 ){
        errorLog << "train_(MatrixFloat &data) - The batch step size must be greater than zero!" << std::endl;
        return false;
    }

    const UINT numTrainingSamples = data.getNumRows();
    numInputDimensions = data.getNumCols();
    numOutputDimensions = numHiddenUnits;
    numVisibleUnits = numInputDimensions;

    trainingLog << "NumInputDimensions: " << numInputDimensions << std::endl;
    trainingLog << "NumOutputDimensions: " << numOutputDimensions << std::endl;

    if( randomizeWeightsForTraining ){

        //Init the weights matrix
        weightsMatrix.resize(numHiddenUnits, numVisibleUnits);

        Float a = 1.0 / numVisibleUnits;
        for(UINT i=0; i<numHiddenUnits; i++) {
            for(UINT j=0; j<numVisibleUnits; j++) {
                weightsMatrix[i][j] = rand.getRandomNumberUniform(-a, a);
            }
        }

        //Init the bias units
        visibleLayerBias.resize( numVisibleUnits );
        hiddenLayerBias.resize( numHiddenUnits );
        std::fill(visibleLayerBias.begin(),visibleLayerBias.end(),0);
        std::fill(hiddenLayerBias.begin(),hiddenLayerBias.end(),0);

    }else{
        //Reusing an existing model, so every stored dimension has to match
        if( weightsMatrix.getNumRows() != numHiddenUnits ){
            errorLog << "train_(MatrixFloat &data) - Weights matrix row size does not match the number of hidden units!" << std::endl;
            return false;
        }
        if( weightsMatrix.getNumCols() != numVisibleUnits ){
            errorLog << "train_(MatrixFloat &data) - Weights matrix column size does not match the number of visible units!" << std::endl;
            return false;
        }
        if( visibleLayerBias.size() != numVisibleUnits ){
            errorLog << "train_(MatrixFloat &data) - Visible layer bias size does not match the number of visible units!" << std::endl;
            return false;
        }
        if( hiddenLayerBias.size() != numHiddenUnits ){
            errorLog << "train_(MatrixFloat &data) - Hidden layer bias size does not match the number of hidden units!" << std::endl;
            return false;
        }
    }

    //Flag the model has been trained in case the user wants to save the model during a training iteration using an observer
    trained = true;

    //Make sure the data is scaled between [0 1]
    ranges = data.getRanges();
    if( useScaling ){
        for(UINT i=0; i<numTrainingSamples; i++){
            for(UINT j=0; j<numInputDimensions; j++){
                data[i][j] = grt_scale(data[i][j], ranges[j].minValue, ranges[j].maxValue, 0.0, 1.0);
            }
        }
    }

    const UINT numBatches = static_cast<UINT>( ceil( Float(numTrainingSamples)/batchSize ) );

    //Setup the batch indexs
    Vector< BatchIndexs > batchIndexs( numBatches );
    UINT startIndex = 0;
    for(UINT i=0; i<numBatches; i++){
        batchIndexs[i].startIndex = startIndex;
        batchIndexs[i].endIndex = startIndex + batchSize;

        //Make sure the last batch end index is not larger than the number of training examples
        if( batchIndexs[i].endIndex >= numTrainingSamples ){
            batchIndexs[i].endIndex = numTrainingSamples;
        }

        //Get the batch size
        batchIndexs[i].batchSize = batchIndexs[i].endIndex - batchIndexs[i].startIndex;

        //Set the start index for the next batch
        startIndex = batchIndexs[i].endIndex;
    }

    Timer timer;
    UINT i = 0;
    UINT j = 0;
    UINT n = 0;
    UINT epoch = 0;
    UINT noChangeCounter = 0;
    Float startTime = 0;
    Float alpha = learningRate;
    Float error = 0;
    Float err = 0;
    Float delta = 0;
    Float lastError = 0;
    Vector< UINT > indexList(numTrainingSamples);
    TrainingResult trainingResult;
    MatrixFloat wT( numVisibleUnits, numHiddenUnits );       //Stores a transposed copy of the weights vector
    MatrixFloat vW( numHiddenUnits, numVisibleUnits );       //Stores the weight velocity updates
    MatrixFloat v1( batchSize, numVisibleUnits );            //Stores the real batch data during a batch update
    MatrixFloat v2( batchSize, numVisibleUnits );            //Stores the sampled batch data during a batch update
    MatrixFloat h1( batchSize, numHiddenUnits );             //Stores the hidden states given v1 and the current weightsMatrix
    MatrixFloat h2( batchSize, numHiddenUnits );             //Stores the sampled hidden states given v2 and the current weightsMatrix
    MatrixFloat c1( numHiddenUnits, numVisibleUnits );       //Stores h1' * v1
    MatrixFloat c2( numHiddenUnits, numVisibleUnits );       //Stores h2' * v2
    MatrixFloat vDiff( batchSize, numVisibleUnits );         //Stores the difference between v1-v2
    MatrixFloat hDiff( batchSize, numHiddenUnits );          //Stores the difference between h1-h2
    MatrixFloat cDiff( numHiddenUnits, numVisibleUnits );    //Stores the difference between c1-c2
    VectorFloat vDiffSum( numVisibleUnits );                 //Stores the column sum of vDiff
    VectorFloat hDiffSum( numHiddenUnits );                  //Stores the column sum of hDiff
    VectorFloat visibleLayerBiasVelocity( numVisibleUnits ); //Stores the velocity update of the visibleLayerBias
    VectorFloat hiddenLayerBiasVelocity( numHiddenUnits );   //Stores the velocity update of the hiddenLayerBias

    //Set all the velocity weights to zero
    vW.setAllValues( 0 );
    std::fill(visibleLayerBiasVelocity.begin(),visibleLayerBiasVelocity.end(),0);
    std::fill(hiddenLayerBiasVelocity.begin(),hiddenLayerBiasVelocity.end(),0);

    //Randomize the order that the training samples will be used in
    //NOTE(review): std::random_shuffle was removed in C++17; moving to std::shuffle needs <random>
    //and would change the shuffle sequence, so it is left as-is here
    for(UINT s=0; s<numTrainingSamples; s++) indexList[s] = s;
    if( randomiseTrainingOrder ){
        std::random_shuffle(indexList.begin(), indexList.end());
    }

    //Start the main training loop
    timer.start();
    for(epoch=0; epoch<maxNumEpochs; epoch++) {
        startTime = timer.getMilliSeconds();
        error = 0;

        //Randomize the batch order
        std::random_shuffle(batchIndexs.begin(),batchIndexs.end());

        //Run each of the batch updates
        for(UINT k=0; k<numBatches; k+=batchStepSize){

            //Resize the data matrices, the matrices will only be resized if the rows cols are different
            v1.resize( batchIndexs[k].batchSize, numVisibleUnits );
            h1.resize( batchIndexs[k].batchSize, numHiddenUnits );
            v2.resize( batchIndexs[k].batchSize, numVisibleUnits );
            h2.resize( batchIndexs[k].batchSize, numHiddenUnits );

            //Setup the data pointers, using data pointers saves a few ms on large matrix updates
            Float **w_p = weightsMatrix.getDataPointer();
            Float **wT_p = wT.getDataPointer();
            Float **vW_p = vW.getDataPointer();
            Float **data_p = data.getDataPointer();
            Float **v1_p = v1.getDataPointer();
            Float **v2_p = v2.getDataPointer();
            Float **h1_p = h1.getDataPointer();
            Float **h2_p = h2.getDataPointer();
            Float *vlb_p = &visibleLayerBias[0];
            Float *hlb_p = &hiddenLayerBias[0];

            //Get the batch data
            UINT index = 0;
            for(i=batchIndexs[k].startIndex; i<batchIndexs[k].endIndex; i++){
                for(j=0; j<numVisibleUnits; j++){
                    v1_p[index][j] = data_p[ indexList[i] ][j];
                }
                index++;
            }

            //Copy a transposed version of the weights matrix, this is used to compute h1 and h2
            for(i=0; i<numHiddenUnits; i++)
                for(j=0; j<numVisibleUnits; j++)
                    wT_p[j][i] = w_p[i][j];

            //Compute h1
            h1.multiple(v1, wT);
            for(n=0; n<batchIndexs[k].batchSize; n++){
                for(i=0; i<numHiddenUnits; i++){
                    h1_p[n][i] = sigmoidRandom( h1_p[n][i] + hlb_p[i] );
                }
            }

            //Compute v2
            v2.multiple(h1, weightsMatrix);
            for(n=0; n<batchIndexs[k].batchSize; n++){
                for(i=0; i<numVisibleUnits; i++){
                    v2_p[n][i] = sigmoidRandom( v2_p[n][i] + vlb_p[i] );
                }
            }

            //Compute h2
            h2.multiple(v2,wT);
            for(n=0; n<batchIndexs[k].batchSize; n++){
                for(i=0; i<numHiddenUnits; i++){
                    h2_p[n][i] = grt_sigmoid( h2_p[n][i] + hlb_p[i] );
                }
            }

            //Compute c1, c2 and the difference between v1-v2
            c1.multiple(h1,v1,true);
            c2.multiple(h2,v2,true);
            vDiff.subtract(v1, v2);

            //Compute the sum of vdiff
            for(j=0; j<numVisibleUnits; j++){
                vDiffSum[j] = 0;
                for(i=0; i<batchIndexs[k].batchSize; i++){
                    vDiffSum[j] += vDiff[i][j];
                }
            }

            //Compute the difference between h1 and h2
            hDiff.subtract(h1, h2);
            for(j=0; j<numHiddenUnits; j++){
                hDiffSum[j] = 0;
                for(i=0; i<batchIndexs[k].batchSize; i++){
                    hDiffSum[j] += hDiff[i][j];
                }
            }

            //Compute the difference between c1 and c2
            cDiff.subtract(c1,c2);

            //Update the weight velocities
            for(i=0; i<numHiddenUnits; i++){
                for(j=0; j<numVisibleUnits; j++){
                    vW_p[i][j] = ((momentum * vW_p[i][j]) + (alpha * cDiff[i][j])) / batchIndexs[k].batchSize;
                }
            }
            for(i=0; i<numVisibleUnits; i++){
                visibleLayerBiasVelocity[i] = ((momentum * visibleLayerBiasVelocity[i]) + (alpha * vDiffSum[i])) / batchIndexs[k].batchSize;
            }
            for(i=0; i<numHiddenUnits; i++){
                hiddenLayerBiasVelocity[i] = ((momentum * hiddenLayerBiasVelocity[i]) + (alpha * hDiffSum[i])) / batchIndexs[k].batchSize;
            }

            //Update the weights
            weightsMatrix.add( vW );

            //Update the bias for the visible layer
            for(i=0; i<numVisibleUnits; i++){
                visibleLayerBias[i] += visibleLayerBiasVelocity[i];
            }

            //Update the bias for the hidden layer
            for(i=0; i<numHiddenUnits; i++){
                hiddenLayerBias[i] += hiddenLayerBiasVelocity[i];
            }

            //Compute the reconstruction error
            err = 0;
            for(i=0; i<batchIndexs[k].batchSize; i++){
                for(j=0; j<numVisibleUnits; j++){
                    err += SQR( v1[i][j] - v2[i][j] );
                }
            }

            error += err / batchIndexs[k].batchSize;
        }
        error /= numBatches;
        delta = lastError - error;
        lastError = error;

        trainingLog << "Epoch: " << epoch+1 << "/" << maxNumEpochs;
        trainingLog << " Epoch time: " << (timer.getMilliSeconds()-startTime)/1000.0 << " seconds";
        trainingLog << " Learning rate: " << alpha;
        trainingLog << " Momentum: " << momentum;
        trainingLog << " Average reconstruction error: " << error;
        trainingLog << " Delta: " << delta << std::endl;

        //Update the learning rate
        alpha *= learningRateUpdate;

        trainingResult.setClassificationResult(epoch, error, this);
        trainingResults.push_back(trainingResult);
        trainingResultsObserverManager.notifyObservers( trainingResult );

        //Check for convergence
        if( fabs(delta) < minChange ){
            if( ++noChangeCounter >= minNumEpochs ){
                trainingLog << "Stopping training. MinChange limit reached!" << std::endl;
                break;
            }
        }else noChangeCounter = 0;

    }
    trainingLog << "Training complete after " << epoch << " epochs. Total training time: " << timer.getMilliSeconds()/1000.0 << " seconds" << std::endl;

    trained = true;

    return true;
}
// Preconditioned bi-conjugate gradient solve of the scalar system for psi.
//
// psi     initial guess on entry, overwritten with the updated solution
// source  right-hand side
// cmpt    component index passed to Amul/Tmul and the preconditioner
//
// Returns the solverPerformance record (initial/final normalised residual
// and iteration count). At least minIter_ passes are attempted; beyond that
// the loop runs until checkConvergence(tolerance_, relTol_) succeeds or
// maxIter_ is reached. A singular wApT aborts the loop immediately, without
// incrementing nIterations() for that pass.
Foam::solverPerformance Foam::PBiCG::solve
(
    scalarField& psi,
    const scalarField& source,
    const direction cmpt
) const
{
    // Performance record, named after the preconditioner and solver type
    solverPerformance solverPerf
    (
        lduMatrix::preconditioner::getName(controlDict_) + typeName,
        fieldName_
    );

    const label nCells = psi.size();

    // Work fields; only pT is given an initial value
    scalarField pA(nCells);
    scalarField pT(nCells, 0.0);
    scalarField wA(nCells);
    scalarField wT(nCells);

    scalar* __restrict__ psiPtr = psi.begin();
    scalar* __restrict__ pAPtr = pA.begin();
    scalar* __restrict__ pTPtr = pT.begin();
    scalar* __restrict__ wAPtr = wA.begin();
    scalar* __restrict__ wTPtr = wT.begin();

    scalar wArT = solverPerf.great_;
    scalar wArTold = wArT;

    // wA = A.psi and wT = T.psi
    matrix_.Amul(wA, psi, interfaceBouCoeffs_, interfaces_, cmpt);
    matrix_.Tmul(wT, psi, interfaceIntCoeffs_, interfaces_, cmpt);

    // Residual and transpose residual of the initial guess
    scalarField rA(source - wA);
    scalarField rT(source - wT);
    scalar* __restrict__ rAPtr = rA.begin();
    scalar* __restrict__ rTPtr = rT.begin();

    // Normalisation factor used for every residual norm below
    scalar normFactor = this->normFactor(psi, source, wA, pA);

    if (lduMatrix::debug >= 2)
    {
        Info<< " Normalisation factor = " << normFactor << endl;
    }

    // Normalised residual norm of the initial guess
    solverPerf.initialResidual() =
        gSumMag(rA, matrix().mesh().comm())
       /normFactor;
    solverPerf.finalResidual() = solverPerf.initialResidual();

    // Nothing to do when no minimum iteration count is requested and the
    // initial guess already satisfies the tolerances
    if
    (
        !(minIter_ > 0)
     && solverPerf.checkConvergence(tolerance_, relTol_)
    )
    {
        return solverPerf;
    }

    // Select and construct the preconditioner
    autoPtr<lduMatrix::preconditioner> preconPtr =
    lduMatrix::preconditioner::New
    (
        *this,
        controlDict_
    );

    do
    {
        // Remember the previous wArT for the beta coefficient
        wArTold = wArT;

        // Precondition both residuals
        preconPtr->precondition(wA, rA, cmpt);
        preconPtr->preconditionT(wT, rT, cmpt);

        wArT = gSumProd(wA, rT, matrix().mesh().comm());

        // Update the search directions
        if (solverPerf.nIterations() != 0)
        {
            const scalar beta = wArT/wArTold;

            for (label celli = 0; celli < nCells; ++celli)
            {
                pAPtr[celli] = wAPtr[celli] + beta*pAPtr[celli];
                pTPtr[celli] = wTPtr[celli] + beta*pTPtr[celli];
            }
        }
        else
        {
            // First pass: start from the preconditioned residuals
            for (label celli = 0; celli < nCells; ++celli)
            {
                pAPtr[celli] = wAPtr[celli];
                pTPtr[celli] = wTPtr[celli];
            }
        }

        // wA = A.pA and wT = T.pT
        matrix_.Amul(wA, pA, interfaceBouCoeffs_, interfaces_, cmpt);
        matrix_.Tmul(wT, pT, interfaceIntCoeffs_, interfaces_, cmpt);

        const scalar wApT = gSumProd(wA, pT, matrix().mesh().comm());

        // Stop before dividing by a singular wApT
        if (solverPerf.checkSingularity(mag(wApT)/normFactor))
        {
            break;
        }

        // Advance the solution and both residuals
        const scalar alpha = wArT/wApT;

        for (label celli = 0; celli < nCells; ++celli)
        {
            psiPtr[celli] += alpha*pAPtr[celli];
            rAPtr[celli] -= alpha*wAPtr[celli];
            rTPtr[celli] -= alpha*wTPtr[celli];
        }

        solverPerf.finalResidual() =
            gSumMag(rA, matrix().mesh().comm())
           /normFactor;

    } while
    (
        (
            solverPerf.nIterations()++ < maxIter_
         && !solverPerf.checkConvergence(tolerance_, relTol_)
        )
     || solverPerf.nIterations() < minIter_
    );

    return solverPerf;
}
// Solve for psi with a preconditioned bi-conjugate gradient iteration on
// gpuField data, using scalar alpha/beta coefficients (gSumProd) and
// thrust::transform for the element-wise vector updates.
//
// psi is used as the initial guess and is updated in place.
// Returns the SolverPerformance record holding the initial/final normalised
// residuals and the iteration count.
//
// At least minIter_ passes are attempted; beyond that the loop runs until
// checkConvergence(tolerance_, relTol_) succeeds or maxIter_ is reached.
// A singular wApT aborts the loop immediately, without incrementing
// nIterations() for that pass.
Foam::SolverPerformance<Type>
Foam::PBiCCCG<Type, DType, LUType>::solve
(
    gpuField<Type>& psi
) const
{
    word preconditionerName(this->controlDict_.lookup("preconditioner"));

    // --- Setup class containing solver performance data
    SolverPerformance<Type> solverPerf
    (
        preconditionerName + typeName,
        this->fieldName_
    );

    // Note: the 'register' storage-class specifier was dropped here; it is
    // deprecated since C++11 and ill-formed in C++17, and never affected
    // the generated results.
    label nCells = psi.size();

    gpuField<Type> pA(nCells);

    gpuField<Type> pT(nCells, pTraits<Type>::zero);

    gpuField<Type> wA(nCells);

    gpuField<Type> wT(nCells);

    scalar wArT = 1e15; //this->matrix_.great_;
    scalar wArTold = wArT;

    // --- Calculate A.psi and T.psi
    this->matrix_.Amul(wA, psi);
    this->matrix_.Tmul(wT, psi);

    // --- Calculate initial residual and transpose residual fields
    gpuField<Type> rA(this->matrix_.source() - wA);
    gpuField<Type> rT(this->matrix_.source() - wT);

    // --- Calculate normalisation factor
    Type normFactor = this->normFactor(psi, wA, pA);

    if (LduMatrix<Type, DType, LUType>::debug >= 2)
    {
        Info<< " Normalisation factor = " << normFactor << endl;
    }

    // --- Calculate normalised residual norm
    solverPerf.initialResidual() = cmptDivide(gSumCmptMag(rA), normFactor);
    solverPerf.finalResidual() = solverPerf.initialResidual();

    // --- Check convergence, solve if not converged
    if
    (
        this->minIter_ > 0
     || !solverPerf.checkConvergence(this->tolerance_, this->relTol_)
    )
    {
        // --- Select and construct the preconditioner
        autoPtr<typename LduMatrix<Type, DType, LUType>::preconditioner>
        preconPtr = LduMatrix<Type, DType, LUType>::preconditioner::New
        (
            *this,
            this->controlDict_
        );

        // --- Solver iteration
        do
        {
            // --- Store previous wArT
            wArTold = wArT;

            // --- Precondition residuals
            preconPtr->precondition(wA, rA);
            preconPtr->preconditionT(wT, rT);

            // --- Update search directions:
            wArT = gSumProd(wA, rT);

            if (solverPerf.nIterations() == 0)
            {
                // First pass: pA is still uninitialised, so the search
                // directions are simply the preconditioned residuals
                thrust::copy(wA.begin(),wA.end(),pA.begin());
                thrust::copy(wT.begin(),wT.end(),pT.begin());
            }
            else
            {
                scalar beta = wArT/wArTold;

                // pA = wA + beta*pA
                thrust::transform
                (
                    wA.begin(),
                    wA.end(),
                    thrust::make_transform_iterator
                    (
                        pA.begin(),
                        multiplyOperatorSFFunctor<scalar,Type,Type>(beta)
                    ),
                    pA.begin(),
                    addOperatorFunctor<Type,Type,Type>()
                );

                // pT = wT + beta*pT
                thrust::transform
                (
                    wT.begin(),
                    wT.end(),
                    thrust::make_transform_iterator
                    (
                        pT.begin(),
                        multiplyOperatorSFFunctor<scalar,Type,Type>(beta)
                    ),
                    pT.begin(),
                    addOperatorFunctor<Type,Type,Type>()
                );
            }

            // --- Update preconditioned residuals
            this->matrix_.Amul(wA, pA);
            this->matrix_.Tmul(wT, pT);

            scalar wApT = gSumProd(wA, pT);

            // --- Test for singularity (guards the division by wApT below)
            if
            (
                solverPerf.checkSingularity
                (
                    cmptDivide(pTraits<Type>::one*mag(wApT), normFactor)
                )
            )
            {
                break;
            }

            // --- Update solution and residual:
            scalar alpha = wArT/wApT;

            // psi = psi + alpha*pA
            thrust::transform
            (
                psi.begin(),
                psi.end(),
                thrust::make_transform_iterator
                (
                    pA.begin(),
                    multiplyOperatorSFFunctor<scalar,Type,Type>(alpha)
                ),
                psi.begin(),
                addOperatorFunctor<Type,Type,Type>()
            );

            // rA = rA - alpha*wA
            thrust::transform
            (
                rA.begin(),
                rA.end(),
                thrust::make_transform_iterator
                (
                    wA.begin(),
                    multiplyOperatorSFFunctor<scalar,Type,Type>(alpha)
                ),
                rA.begin(),
                subtractOperatorFunctor<Type,Type,Type>()
            );

            // rT = rT - alpha*wT
            thrust::transform
            (
                rT.begin(),
                rT.end(),
                thrust::make_transform_iterator
                (
                    wT.begin(),
                    multiplyOperatorSFFunctor<scalar,Type,Type>(alpha)
                ),
                rT.begin(),
                subtractOperatorFunctor<Type,Type,Type>()
            );

            solverPerf.finalResidual() =
                cmptDivide(gSumCmptMag(rA), normFactor);

        } while
        (
            (
                // The post-increment counts the pass that has just finished
                solverPerf.nIterations()++ < this->maxIter_
             && !solverPerf.checkConvergence(this->tolerance_, this->relTol_)
            )
         || solverPerf.nIterations() < this->minIter_
        );
    }

    return solverPerf;
}
inline void ReformHermitianMatrix ( UpperOrLower uplo, DistMatrix<R,MC,MR>& A, const DistMatrix<R,VR,STAR>& w, const DistMatrix<R,MC,MR>& Z, const RealFunctor& f ) { #ifndef RELEASE PushCallStack("hermitian_function::ReformHermitianMatrix"); #endif const Grid& g = A.Grid(); DistMatrix<R,MC,MR> ZL(g), ZR(g), Z0(g), Z1(g), Z2(g); DistMatrix<R,VR,STAR> wT(g), w0(g), wB(g), w1(g), w2(g); DistMatrix<R,MC, STAR> Z1_MC_STAR(g); DistMatrix<R,VR, STAR> Z1_VR_STAR(g); DistMatrix<R,STAR,MR > Z1Trans_STAR_MR(g); DistMatrix<R,STAR,STAR> w1_STAR_STAR(g); if( uplo == LOWER ) MakeTrapezoidal( LEFT, UPPER, 1, A ); else MakeTrapezoidal( LEFT, LOWER, -1, A ); LockedPartitionRight( Z, ZL, ZR, 0 ); LockedPartitionDown ( w, wT, wB, 0 ); while( ZL.Width() < Z.Width() ) { LockedRepartitionRight ( ZL, /**/ ZR, Z0, /**/ Z1, Z2 ); LockedRepartitionDown ( wT, w0, /**/ /**/ w1, wB, w2 ); Z1_MC_STAR.AlignWith( A ); Z1_VR_STAR.AlignWith( A ); Z1Trans_STAR_MR.AlignWith( A ); //--------------------------------------------------------------------// Z1_MC_STAR = Z1; Z1_VR_STAR = Z1_MC_STAR; w1_STAR_STAR = w1; // Scale Z1[VR,* ] with the modified eigenvalues const int width = Z1_VR_STAR.Width(); const int localHeight = Z1_VR_STAR.LocalHeight(); for( int j=0; j<width; ++j ) { const R omega = f(w1_STAR_STAR.GetLocalEntry(j,0)); R* buffer = Z1_VR_STAR.LocalBuffer(0,j); for( int iLocal=0; iLocal<localHeight; ++iLocal ) buffer[iLocal] *= omega; } Z1Trans_STAR_MR.TransposeFrom( Z1_VR_STAR ); internal::LocalTrrk( uplo, (R)1, Z1_MC_STAR, Z1Trans_STAR_MR, (R)1, A ); //--------------------------------------------------------------------// Z1Trans_STAR_MR.FreeAlignments(); Z1_VR_STAR.FreeAlignments(); Z1_MC_STAR.FreeAlignments(); SlideLockedPartitionDown ( wT, w0, w1, /**/ /**/ wB, w2 ); SlideLockedPartitionRight ( ZL, /**/ ZR, Z0, Z1, /**/ Z2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void ReformNormalMatrix ( DistMatrix<Complex<R>,MC,MR >& A, const DistMatrix<R, VR,STAR>& w, const DistMatrix<Complex<R>,MC,MR >& Z, const ComplexFunctor& f ) { #ifndef RELEASE PushCallStack("hermitian_function::ReformNormalMatrix"); #endif const Grid& g = A.Grid(); typedef Complex<R> C; DistMatrix<C,MC,MR> ZL(g), ZR(g), Z0(g), Z1(g), Z2(g); DistMatrix<R,VR,STAR> wT(g), w0(g), wB(g), w1(g), w2(g); DistMatrix<C,MC, STAR> Z1_MC_STAR(g); DistMatrix<C,VR, STAR> Z1_VR_STAR(g); DistMatrix<C,STAR,MR > Z1Adj_STAR_MR(g); DistMatrix<R,STAR,STAR> w1_STAR_STAR(g); Zero( A ); LockedPartitionRight( Z, ZL, ZR, 0 ); LockedPartitionDown ( w, wT, wB, 0 ); while( ZL.Width() < Z.Width() ) { LockedRepartitionRight ( ZL, /**/ ZR, Z0, /**/ Z1, Z2 ); LockedRepartitionDown ( wT, w0, /**/ /**/ w1, wB, w2 ); Z1_MC_STAR.AlignWith( A ); Z1_VR_STAR.AlignWith( A ); Z1Adj_STAR_MR.AlignWith( A ); //--------------------------------------------------------------------// Z1_MC_STAR = Z1; Z1_VR_STAR = Z1_MC_STAR; w1_STAR_STAR = w1; // Scale Z1[VR,* ] with the modified eigenvalues const int width = Z1_VR_STAR.Width(); const int localHeight = Z1_VR_STAR.LocalHeight(); for( int j=0; j<width; ++j ) { const C conjOmega = Conj(f(w1_STAR_STAR.GetLocalEntry(j,0))); C* buffer = Z1_VR_STAR.LocalBuffer(0,j); for( int iLocal=0; iLocal<localHeight; ++iLocal ) buffer[iLocal] *= conjOmega; } Z1Adj_STAR_MR.AdjointFrom( Z1_VR_STAR ); internal::LocalGemm ( NORMAL, NORMAL, (C)1, Z1_MC_STAR, Z1Adj_STAR_MR, (C)1, A ); //--------------------------------------------------------------------// Z1Adj_STAR_MR.FreeAlignments(); Z1_VR_STAR.FreeAlignments(); Z1_MC_STAR.FreeAlignments(); SlideLockedPartitionDown ( wT, w0, w1, /**/ /**/ wB, w2 ); SlideLockedPartitionRight ( ZL, /**/ ZR, Z0, Z1, /**/ Z2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void HermitianFromEVD ( UpperOrLower uplo, DistMatrix<F>& A, const DistMatrix<BASE(F),VR,STAR>& w, const DistMatrix<F>& Z ) { #ifndef RELEASE CallStackEntry entry("HermitianFromEVD"); #endif const Grid& g = A.Grid(); typedef BASE(F) R; DistMatrix<F> ZL(g), ZR(g), Z0(g), Z1(g), Z2(g); DistMatrix<R,VR,STAR> wT(g), w0(g), wB(g), w1(g), w2(g); DistMatrix<F,MC, STAR> Z1_MC_STAR(g); DistMatrix<F,VR, STAR> Z1_VR_STAR(g); DistMatrix<F,STAR,MR > Z1Adj_STAR_MR(g); DistMatrix<R,STAR,STAR> w1_STAR_STAR(g); A.ResizeTo( Z.Height(), Z.Height() ); if( uplo == LOWER ) MakeTrapezoidal( UPPER, A, 1 ); else MakeTrapezoidal( LOWER, A, -1 ); LockedPartitionRight( Z, ZL, ZR, 0 ); LockedPartitionDown ( w, wT, wB, 0 ); while( ZL.Width() < Z.Width() ) { LockedRepartitionRight ( ZL, /**/ ZR, Z0, /**/ Z1, Z2 ); LockedRepartitionDown ( wT, w0, /**/ /**/ w1, wB, w2 ); Z1_MC_STAR.AlignWith( A ); Z1_VR_STAR.AlignWith( A ); Z1Adj_STAR_MR.AlignWith( A ); //--------------------------------------------------------------------// Z1_MC_STAR = Z1; Z1_VR_STAR = Z1_MC_STAR; w1_STAR_STAR = w1; DiagonalScale( RIGHT, NORMAL, w1_STAR_STAR, Z1_VR_STAR ); Z1Adj_STAR_MR.AdjointFrom( Z1_VR_STAR ); LocalTrrk( uplo, F(1), Z1_MC_STAR, Z1Adj_STAR_MR, F(1), A ); //--------------------------------------------------------------------// SlideLockedPartitionDown ( wT, w0, w1, /**/ /**/ wB, w2 ); SlideLockedPartitionRight ( ZL, /**/ ZR, Z0, Z1, /**/ Z2 ); } }