// Find a 1:1 mapping from rows to columns of the specified square matrix such that the total cost is minimized. Returns a // vector V such that V[i] = j maps rows i to columns j. static std::vector<long> findMinimumAssignment(const DistanceMatrix &matrix) { #ifdef ROSE_HAVE_DLIB ASSERT_forbid(matrix.size() == 0); ASSERT_require(matrix.nr() == matrix.nc()); // We can avoid the O(n^3) Kuhn-Munkres algorithm if all values of the matrix are the same. double minValue, maxValue; dlib::find_min_and_max(matrix, minValue /*out*/, maxValue /*out*/); if (minValue == maxValue) { std::vector<long> ident; ident.reserve(matrix.nr()); for (long i=0; i<matrix.nr(); ++i) ident.push_back(i); return ident; } // Dlib's Kuhn-Munkres finds the *maximum* mapping over *integers*, so we negate everything to find the minumum, and we map // the doubles onto a reasonably large interval of integers. The interval should be large enough to have some precision, // but not so large that things might overflow. const int iGreatest = 1000000; // arbitrary upper bound for integer interval dlib::matrix<long> intMatrix(matrix.nr(), matrix.nc()); for (long i=0; i<matrix.nr(); ++i) { for (long j=0; j<matrix.nc(); ++j) intMatrix(i, j) = round(-iGreatest * (matrix(i, j) - minValue) / (maxValue - minValue)); } return dlib::max_cost_assignment(intMatrix); #else throw FunctionSimilarity::Exception("dlib support is necessary for FunctionSimilarity analysis" "; see ROSE installation instructions"); #endif }
void testUpdateDistanceMatrix() { PeakListCollection PLC = SamplePeakListCollection(); DistanceMatrix DM = PLC.buildDistanceMatrix_(200., 0.5, 0, 0.); DistanceMatrix Ref = PLC.buildDistanceMatrix_(200., 0.5, 0, 0.); DM.addElement(99.); //Add a dummy element, we will delete it again unsigned int merged_lower = 4; //last two elements unsigned int merged_upper = 5; PLC.plContent_.clear(); PLC.plContent_.resize(6,1); PLC.updateDistanceMatrix_(DM, merged_lower, merged_upper, 200., 0.5, 0, 0.); //Size should still be 5 shouldEqual((int)DM.size(),5); //Matrix should be the same as the original matrix Ref for(unsigned int i = 0; i < 5; i++){ for(unsigned int j = 0; j < i; j++){ shouldEqualTolerance(DM(i,j),Ref(i,j),TOL); } } return; }
DBL_MATRIX PeakListCollection::mergeAll(double drt, double dmz, double dz, double dint) { //preprocessing: delete empty PeakLists from c_ for(unsigned int i = 0; i < c_.size(); i++){ if(c_[i].size() == 0){ c_.erase(c_.begin() + i); } } unsigned int oldSize = c_.size(); // initialization //fill correspondenceMap with rt-values correspondenceMap_.clear(); correspondenceMap_.resize(oldSize); for(unsigned int i = 0; i < oldSize; i++){ correspondenceMap_[i].resize(c_[i].size()); for(unsigned int j = 0; j < c_[i].size(); j++){ //create map_item to write to correspondenceMap map_item tempItem; //origin information contains the PeakList index and rt value originInformation o; o.rt = c_[i][j].getRt(); o.mz = c_[i][j].getMz(); o.intensity = c_[i][j].getAbundance(); o.originPeakList = i; o.originPeak = j; tempItem.push_back( o ); correspondenceMap_[i][j] = tempItem; } } //PeakLists are not merged in the beginning --> fill with 1 plContent_.clear(); plContent_.resize(oldSize,1); //build distance matrix DistanceMatrix D = buildDistanceMatrix_(drt,dmz,dz,dint); //merge, until there is only one PeakList left while(D.size() > 1){ //find cheapest assignment unsigned int merge_lower = D.getMin_LowerIndex(); unsigned int merge_upper = D.getMin_HigherIndex(); //merge peaklists into a new one mergePeakLists_(merge_lower, merge_upper, c_, drt, dmz, dz, dint); //update distance matrix updateDistanceMatrix_(D, merge_lower, merge_upper, drt, dmz, dz, dint); } return calculateRtCorrespondencesFromCorrespondenceMap_(oldSize); }
void testBuildDistanceMatrix() { PeakListCollection PLC = SamplePeakListCollection(); DistanceMatrix DM = PLC.buildDistanceMatrix_(200., 0.5, 0, 0.); //PeakList 4 should have no Peaks in common with the other lists --> Cost == DBL_MAX for(unsigned int i = 0; i < DM.size()-1; i++){ shouldEqual(DM(i,4),DBL_MAX); } //distance PL0 - PL2 should be sqrt(0.1^2/0.5^2 + 30^2/200^2) = 0.25 shouldEqualTolerance(DM(0,2),0.25,TOL); shouldEqualTolerance(DM(2,0),0.25,TOL); return; }
//Update the Distance Matrix void PeakListCollection::updateDistanceMatrix_(DistanceMatrix& D, unsigned int merged_lower, unsigned int merged_upper, double drt, double dmz, double dz, double dint = 0.) { //remove distances of merged PeakLists from distance matrix D.deleteElement(merged_upper); D.deleteElement(merged_lower); //add a new column for the merged PeakList D.addElement(); //fill distances unsigned int sz = D.size(); for(unsigned int i = 0; i<sz-1; i++){ StableMarriage sm(c_[i],c_[sz-1],drt,dmz,dz,dint); sm.setLimit(1.); D(i,sz-1) = sm.getCost(); } return; }
DBL_MATRIX PeakListCollection::getAlignment(double drt, double dmz, double dz, double dint, AccelerationFlag flag = NORMAL, int param = 0) { //Create a copy of c_ to work on std::vector<PeakList> pls = c_; unsigned int oldSize = pls.size(); // initialization //fill correspondenceMap with rt-values correspondenceMap_.clear(); correspondenceMap_.resize(oldSize); for(unsigned int i = 0; i < oldSize; i++){ correspondenceMap_[i].resize(pls[i].size()); for(unsigned int j = 0; j < pls[i].size(); j++){ //create map_item to write to correspondenceMap map_item tempItem; //origin information contains the PeakList index and rt value originInformation o; o.rt = c_[i][j].getRt(); o.mz = c_[i][j].getMz(); o.intensity = c_[i][j].getAbundance(); o.originPeakList = i; o.originPeak = j; tempItem.push_back( o ); correspondenceMap_[i][j] = tempItem; } } //PeakLists are not merged in the beginning --> fill with 1 plContent_.clear(); plContent_.resize(oldSize,1); DistanceMatrix D; if(flag == NORMAL){ //build distance matrix D = buildDistanceMatrix_(drt,dmz,dz,dint); } if(flag == REFERENCE){ //first, merge PeakList n with one other --> last PeakList contains PeakList n if(param==0){ //select first two peaklists unsigned int merge_lower = param; unsigned int merge_upper = 1; //merge peaklists into a new one mergePeakLists_(merge_lower, merge_upper, pls, drt, dmz, dz, dint); }else{ //select first and n'th PeakList unsigned int merge_lower = 0; unsigned int merge_upper = param; //merge peaklists into a new one mergePeakLists_(merge_lower, merge_upper, pls, drt, dmz, dz, dint); } } //merge, until there is only one PeakList left bool exit = false; while(!exit){ //find cheapest assignment unsigned int merge_lower = 0; unsigned int merge_upper = 1; switch(flag){ case NORMAL: merge_lower = D.getMin_LowerIndex(); merge_upper = D.getMin_HigherIndex(); break; case FAST: //select last two PeakList merge_lower = pls.size()-2; merge_upper = pls.size()-1; break; case REFERENCE: //select last two PeakList merge_lower = pls.size()-2; merge_upper = pls.size()-1; break; } //merge peaklists into a new one mergePeakLists_(merge_lower, merge_upper, pls, drt, dmz, dz, dint); switch(flag){ case NORMAL: //update distance matrix updateDistanceMatrix_(D, merge_lower, merge_upper, drt, dmz, dz, dint); if(D.size()==1){ exit = true; } break; case FAST: if(pls.size()==1){ exit = true; } break; case REFERENCE: if(pls.size()==1){ exit = true; } break; } } DBL_MATRIX rtc = calculateRtCorrespondencesFromCorrespondenceMap_(oldSize); return rtc; }
TreeTemplate<Node>* OptimizationTools::buildDistanceTree( DistanceEstimation& estimationMethod, AgglomerativeDistanceMethod& reconstructionMethod, const ParameterList& parametersToIgnore, bool optimizeBrLen, const std::string& param, double tolerance, unsigned int tlEvalMax, OutputStream* profiler, OutputStream* messenger, unsigned int verbose) throw (Exception) { estimationMethod.resetAdditionalParameters(); estimationMethod.setVerbose(verbose); if (param == DISTANCEMETHOD_PAIRWISE) { ParameterList tmp = estimationMethod.getSubstitutionModel().getIndependentParameters(); tmp.addParameters(estimationMethod.getRateDistribution().getIndependentParameters()); tmp.deleteParameters(parametersToIgnore.getParameterNames()); estimationMethod.setAdditionalParameters(tmp); } TreeTemplate<Node>* tree = NULL; TreeTemplate<Node>* previousTree = NULL; bool test = true; while (test) { // Compute matrice: if (verbose > 0) ApplicationTools::displayTask("Estimating distance matrix", true); estimationMethod.computeMatrix(); DistanceMatrix* matrix = estimationMethod.getMatrix(); if (verbose > 0) ApplicationTools::displayTaskDone(); // Compute tree: if (matrix->size() == 2) { //Special case, there is only one possible tree: Node* n1 = new Node(0); Node* n2 = new Node(1, matrix->getName(0)); n2->setDistanceToFather((*matrix)(0,0) / 2.); Node* n3 = new Node(2, matrix->getName(1)); n3->setDistanceToFather((*matrix)(0,0) / 2.); n1->addSon(n2); n1->addSon(n3); tree = new TreeTemplate<Node>(n1); break; } if (verbose > 0) ApplicationTools::displayTask("Building tree"); reconstructionMethod.setDistanceMatrix(*matrix); reconstructionMethod.computeTree(); previousTree = tree; delete matrix; tree = dynamic_cast<TreeTemplate<Node>*>(reconstructionMethod.getTree()); if (verbose > 0) ApplicationTools::displayTaskDone(); if (previousTree && verbose > 0) { int rf = TreeTools::robinsonFouldsDistance(*previousTree, *tree, false); ApplicationTools::displayResult("Topo. distance with previous iteration", TextTools::toString(rf)); test = (rf == 0); delete previousTree; } if (param != DISTANCEMETHOD_ITERATIONS) break; // Ends here. // Now, re-estimate parameters: auto_ptr<SubstitutionModel> model(estimationMethod.getSubstitutionModel().clone()); auto_ptr<DiscreteDistribution> rdist(estimationMethod.getRateDistribution().clone()); DRHomogeneousTreeLikelihood tl(*tree, *estimationMethod.getData(), model.get(), rdist.get(), true, verbose > 1); tl.initialize(); ParameterList parameters = tl.getParameters(); if (!optimizeBrLen) { vector<string> vs = tl.getBranchLengthsParameters().getParameterNames(); parameters.deleteParameters(vs); } parameters.deleteParameters(parametersToIgnore.getParameterNames()); optimizeNumericalParameters(&tl, parameters, NULL, 0, tolerance, tlEvalMax, messenger, profiler, verbose > 0 ? verbose - 1 : 0); if (verbose > 0) { ParameterList tmp = tl.getSubstitutionModelParameters(); for (unsigned int i = 0; i < tmp.size(); i++) { ApplicationTools::displayResult(tmp[i].getName(), TextTools::toString(tmp[i].getValue())); } tmp = tl.getRateDistributionParameters(); for (unsigned int i = 0; i < tmp.size(); i++) { ApplicationTools::displayResult(tmp[i].getName(), TextTools::toString(tmp[i].getValue())); } } } return tree; }
string Alignment::_computeTree(DistanceMatrix dists, DistanceMatrix vars) throw (Exception) { // Initialization: std::map<size_t, Node*> currentNodes_; std::vector<double> sumDist_(dists.size()); double lambda_; for (size_t i = 0; i < dists.size(); i++) { currentNodes_[i] = new Node(static_cast<int>(i), dists.getName(i)); } int idNextNode = dists.size(); vector<double> newDist(dists.size()); vector<double> newVar(dists.size()); // Build tree: while (currentNodes_.size() > 3) { // get best pair for (std::map<size_t, Node*>::iterator i = currentNodes_.begin(); i != currentNodes_.end(); i++) { size_t id = i->first; sumDist_[id] = 0; for (map<size_t, Node*>::iterator j = currentNodes_.begin(); j != currentNodes_.end(); j++) { size_t jd = j->first; sumDist_[id] += dists(id, jd); } } vector<size_t> bestPair(2); double critMax = std::log(0.); for (map<size_t, Node*>::iterator i = currentNodes_.begin(); i != currentNodes_.end(); i++) { size_t id = i->first; map<size_t, Node*>::iterator j = i; j++; for ( ; j != currentNodes_.end(); j++) { size_t jd = j->first; double crit = sumDist_[id] + sumDist_[jd] - static_cast<double>(currentNodes_.size() - 2) * dists(id, jd); // cout << "\t" << id << "\t" << jd << "\t" << crit << endl; if (crit > critMax) { critMax = crit; bestPair[0] = id; bestPair[1] = jd; } } } if (critMax == std::log(0.)) throw Exception("Unexpected error: no maximum criterium found."); // get branch lengths for pair double ratio = (sumDist_[bestPair[0]] - sumDist_[bestPair[1]]) / static_cast<double>(currentNodes_.size() - 2); vector<double> d(2); d[0] = std::max(.5 * (dists(bestPair[0], bestPair[1]) + ratio), MIN_BRANCH_LENGTH); d[1] = std::max(.5 * (dists(bestPair[0], bestPair[1]) - ratio), MIN_BRANCH_LENGTH); Node* best1 = currentNodes_[bestPair[0]]; Node* best2 = currentNodes_[bestPair[1]]; // Distances may be used by getParentNodes (PGMA for instance). best1->setDistanceToFather(d[0]); best2->setDistanceToFather(d[1]); Node* parent = new Node(idNextNode++); parent->addSon(best1); parent->addSon(best2); // compute lambda lambda_ = 0; if (vars(bestPair[0], bestPair[1]) == 0) lambda_ = .5; else { for (map<size_t, Node*>::iterator i = currentNodes_.begin(); i != currentNodes_.end(); i++) { size_t id = i->first; if (id != bestPair[0] && id != bestPair[1]) lambda_ += (vars(bestPair[1], id) - vars(bestPair[0], id)); } double div = 2 * static_cast<double>(currentNodes_.size() - 2) * vars(bestPair[0], bestPair[1]); lambda_ /= div; lambda_ += .5; } if (lambda_ < 0.) lambda_ = 0.; if (lambda_ > 1.) lambda_ = 1.; for (map<size_t, Node*>::iterator i = currentNodes_.begin(); i != currentNodes_.end(); i++) { size_t id = i->first; if (id != bestPair[0] && id != bestPair[1]) { newDist[id] = std::max(lambda_ * (dists(bestPair[0], id) - d[0]) + (1 - lambda_) * (dists(bestPair[1], id) - d[1]), 0.); newVar[id] = lambda_ * vars(bestPair[0], id) + (1 - lambda_) * vars(bestPair[1], id) - lambda_ * (1 - lambda_) * vars(bestPair[0], bestPair[1]); } else newDist[id] = 0; } // Actualize currentNodes_: currentNodes_[bestPair[0]] = parent; currentNodes_.erase(bestPair[1]); for (map<size_t, Node*>::iterator i = currentNodes_.begin(); i != currentNodes_.end(); i++) { size_t id = i->first; dists(bestPair[0], id) = dists(id, bestPair[0]) = newDist[id]; vars(bestPair[0], id) = vars(id, bestPair[0]) = newVar[id]; } } // final step Node* root = new Node(idNextNode); map<size_t, Node* >::iterator it = currentNodes_.begin(); size_t i1 = it->first; Node* n1 = it->second; it++; size_t i2 = it->first; Node* n2 = it->second; if (currentNodes_.size() == 2) { // Rooted double d = dists(i1, i2) / 2; root->addSon(n1); root->addSon(n2); n1->setDistanceToFather(d); n2->setDistanceToFather(d); } else { // Unrooted it++; size_t i3 = it->first; Node* n3 = it->second; double d1 = std::max(dists(i1, i2) + dists(i1, i3) - dists(i2, i3), MIN_BRANCH_LENGTH); double d2 = std::max(dists(i2, i1) + dists(i2, i3) - dists(i1, i3), MIN_BRANCH_LENGTH); double d3 = std::max(dists(i3, i1) + dists(i3, i2) - dists(i1, i2), MIN_BRANCH_LENGTH); root->addSon(n1); root->addSon(n2); root->addSon(n3); n1->setDistanceToFather(d1 / 2.); n2->setDistanceToFather(d2 / 2.); n3->setDistanceToFather(d3 / 2.); } Tree *tree_ = new TreeTemplate<Node>(root); stringstream ss; Newick treeWriter; if (!tree_) throw Exception("The tree is empty"); treeWriter.write(*tree_, ss); delete tree_; string s{ss.str()}; s.erase(s.find_last_not_of(" \n\r\t")+1); return s; }