void Mase::writeHeader_(std::ostream& output, const MaseHeader& header) const { //Write trees: vector<string> treeNames = header.getTreeNames(); for (size_t i = 0; i < treeNames.size(); ++i) { output << ";;$ " + treeNames[i] << endl; output << ";;" + header.getTree(treeNames[i]); output << endl; } //Write site selections: vector<string> siteSelectionNames = header.getSiteSelectionNames(); for (size_t i = 0; i < siteSelectionNames.size(); ++i) { MultiRange<size_t> ranges = header.getSiteSelection(siteSelectionNames[i]); output << ";;Site selection " << siteSelectionNames[i] << " (" << ranges.totalLength() << " sites)" << endl; output << ";;# of segments=" << ranges.size() << " " << siteSelectionNames[i] << endl; output << ";;"; for (size_t j = 0; j < ranges.size(); ++j) { output << " " << (ranges.getRange(j).begin() + 1) << "," << ranges.getRange(j).end(); if ((j + 1) % 10 == 0) output << endl << ";;"; } output << endl; } //Write sequence selections: vector<string> sequenceSelectionNames = header.getSequenceSelectionNames(); for (size_t i = 0; i < sequenceSelectionNames.size(); ++i) { vector<size_t> set = header.getSequenceSelection(sequenceSelectionNames[i]); output << ";;@ of species=" << set.size() << " " << sequenceSelectionNames[i] << endl; output << ";;"; for (unsigned int j = 0; j < set.size(); ++j) { output << " " << set[j]; if ((j + 1) % 10 == 0) output << endl << ";;"; } output << endl; } }
void Mase::readHeader_(std::istream& input, MaseHeader& header) const throw (Exception) { do { //Check if the line is a header line: if (input.peek() == ';') { char c; input.get(c); if (input.peek() == ';') { input.get(c); string line = FileTools::getNextLine(input); //Check the type of line... //Site selection: string::size_type index = line.find("# of"); if (index < line.npos) { StringTokenizer st(string(line.begin() + static_cast<ptrdiff_t>(index + 4), line.end()), " \t=;"); st.nextToken(); //skip next word: may be 'regions' or 'segments' or else ;-) unsigned int numberOfSegments = TextTools::to<unsigned int>(st.nextToken()); string name = st.unparseRemainingTokens(); //Then look for the set definition: MultiRange<size_t> siteSelection; while (siteSelection.size() < numberOfSegments) { line = FileTools::getNextLine(input); if (line[0] != ';' || line[1] != ';') throw Exception("Mase::readHeader_(): corrupted file, site selection " + name + " is incomplete. Aborting."); line = line.substr(2); StringTokenizer st2(line); while (st2.hasMoreToken()) { StringTokenizer st3(st2.nextToken(), ","); unsigned int begin = TextTools::to<unsigned int>(st3.nextToken()); unsigned int end = TextTools::to<unsigned int>(st3.nextToken()); //WARNING!!! In the mase+ format, sites numerotation is 1-based, including, while ranges are 0-based, [a, b[: siteSelection.addRange(Range<size_t>(begin - 1, end)); } if (siteSelection.size() > numberOfSegments) throw Exception("Mase::readHeader_(): incorrected file, found " + TextTools::toString(siteSelection.size()) + "segments while expected " + TextTools::toString(numberOfSegments)); } header.setSiteSelection(name, siteSelection); } else { //Sequence selection: index = line.find("@ of"); if (index < line.npos) { StringTokenizer st(line.substr(index + 4), " \t=;"); st.nextToken(); //skip next word: may be 'sequences' or else ;-) unsigned int numberOfSequences = TextTools::to<unsigned int>(st.nextToken()); string name = st.unparseRemainingTokens(); //The look for the set definition: vector<size_t> sequenceSelection; while (sequenceSelection.size() < numberOfSequences) { line = FileTools::getNextLine(input); if (line[0] != ';' || line[1] != ';') throw Exception("Mase::readHeader_(): corrupted file, sequence selection " + name + " is incomplete. Aborting."); line = line.substr(2); StringTokenizer st2(line, ", "); while (st2.hasMoreToken()) { unsigned int pos = TextTools::to<unsigned int>(st2.nextToken()); //WARNING!!! In the mase+ format, sequence numerotation is 1-based sequenceSelection.push_back(pos); } if (sequenceSelection.size() > numberOfSequences) throw Exception("Mase::readHeader_(): incorrected file, found " + TextTools::toString(sequenceSelection.size()) + "sequences while expected " + TextTools::toString(numberOfSequences)); } header.setSequenceSelection(name, sequenceSelection); } else { //Tree: index = line.find("$"); if (index < line.npos) { string name = TextTools::removeSurroundingWhiteSpaces(line.substr(index + 1)); //Here we stop if the line ends with a ";" string tree = ""; do { line = FileTools::getNextLine(input); if (line[0] != ';' || line[1] != ';') throw Exception("Mase::readHeader_(): corrupted file, tree " + name + " is incomplete. Aborting."); line = TextTools::removeSurroundingWhiteSpaces(line.substr(2)); tree += line; } while (! TextTools::endsWith(line, ";")); header.setTree(name, tree); } } } } else { input.putback(c); break; } } } while (true); }
int main(int args, char** argv) { cout << "******************************************************************" << endl; cout << "* Bio++ Alignment Score, version 2.3.0 *" << endl; cout << "* Author: J. Dutheil Last Modif. 25/11/14 *" << endl; cout << "******************************************************************" << endl; cout << endl; if (args == 1) { help(); return 0; } try { BppApplication bppalnscore(args, argv, "BppAlnScore"); bppalnscore.startTimer(); // Get alphabet Alphabet* alphabet = SequenceApplicationTools::getAlphabet(bppalnscore.getParams(), "", false, true, true); // Get the test alignment: auto_ptr<SiteContainer> sitesTest(SequenceApplicationTools::getSiteContainer(alphabet, bppalnscore.getParams(), ".test", false, true)); // Get the reference alignment: auto_ptr<SiteContainer> sitesRef(SequenceApplicationTools::getSiteContainer(alphabet, bppalnscore.getParams(), ".ref", false, true)); // We check if the two alignments are compatible: vector<string> namesTest = sitesTest->getSequencesNames(); vector<string> namesRef = sitesRef->getSequencesNames(); if (namesTest != namesRef) { ApplicationTools::displayTask("Reorder sequences in ref. alignment", true); auto_ptr<AlignedSequenceContainer> tmp(new AlignedSequenceContainer(sitesRef->getAlphabet())); for (size_t i = 0; i < namesTest.size(); ++i) { ApplicationTools::displayGauge(i, namesTest.size() - 1); try { tmp->addSequence(sitesRef->getSequence(namesTest[i])); } catch (SequenceNotFoundException& ex) { throw Exception("ERROR!!! Reference alignment should contain the same sequences as the test alignment!"); } } ApplicationTools::displayTaskDone(); sitesRef = tmp; } // Build alignment indexes: RowMatrix<size_t> indexTest, indexRef; SiteContainerTools::getSequencePositions(*sitesTest, indexTest); SiteContainerTools::getSequencePositions(*sitesRef, indexRef); // Now build scores: int na = ApplicationTools::getIntParameter("score.na", bppalnscore.getParams(), 0); ApplicationTools::displayResult("NA value to used", na); vector<int> cs = SiteContainerTools::getColumnScores(indexTest, indexRef, na); vector<double> sps = SiteContainerTools::getSumOfPairsScores(indexTest, indexRef, static_cast<double>(na)); // Should scores be averaged for words? size_t wsize = ApplicationTools::getParameter<size_t>("score.word_size", bppalnscore.getParams(), 1); size_t phase = 0; if (wsize > 1) { ApplicationTools::displayResult("Scores uniformized for words of size", wsize); string phaseOpt = ApplicationTools::getStringParameter("score.phase", bppalnscore.getParams(), "1"); if (TextTools::isDecimalInteger(phaseOpt)) { phase = TextTools::to<size_t>(phaseOpt); if (phase == 0) throw Exception("ERROR: positions are 1-based."); phase--; } else { // We look for the first occurrence of the given motif: try { BasicSequence motif("motif", phaseOpt, sitesTest->getAlphabet()); ApplicationTools::displayResult("Phase based on 1st occurence of", motif.toString()); size_t pos = sitesTest->getNumberOfSites(); for (size_t i = 0; i < sitesTest->getNumberOfSequences(); ++i) { size_t p = SequenceTools::findFirstOf(sitesTest->getSequence(i), motif); if (p < pos) pos = p; } phase = pos; } catch (Exception& ex) { throw Exception("Error, unvalid motif specified for phase option."); } } ApplicationTools::displayResult("First word starts at", phase + 1); // Now perform the smoothing: size_t i; for (i = 0; i < phase; ++i) { cs[i] = 0; sps[i] = 0; } for ( ; i + wsize <= cs.size(); i += wsize) { // First compute minimum criterion: int csmin = 1; double spsmin = 1; for (size_t j = i; j < i + wsize; ++j) { if (cs[j] < csmin) csmin = cs[j]; if (sps[j] < spsmin) spsmin = sps[j]; } // Assign min to all positions in word: for (size_t j = i; j < i + wsize; ++j) { cs[j] = csmin; sps[j] = spsmin; } } for ( ; i < cs.size(); ++i) { cs[i] = 0; sps[i] = 0; } } // Output scores to file: string outputScores = ApplicationTools::getAFilePath("output.scores", bppalnscore.getParams(), false, false); if (outputScores != "none") { ApplicationTools::displayResult("Output scores to", outputScores); ofstream output(outputScores.c_str(), ios::out); output << "Site\tColumnScore\tSumOfPairsScore" << endl; for (size_t i = 0; i < cs.size(); ++i) { output << sitesTest->getSite(i).getPosition() << "\t" << cs[i] << "\t" << sps[i] << endl; } output.close(); } // Create a sequence filter: string outputFilter = ApplicationTools::getAFilePath("output.mase", bppalnscore.getParams(), false, false); if (outputFilter != "none") { ApplicationTools::displayResult("Output mase with site filter to", outputFilter); double spsThreshold = ApplicationTools::getDoubleParameter("output.sps_thresholds", bppalnscore.getParams(), 0.8); ApplicationTools::displayResult("Threshold for SPS", spsThreshold); MultiRange<size_t> csRanges; MultiRange<size_t> spsRanges; size_t csBeg = 0, spsBeg = 0, csEnd = 0, spsEnd = 0; size_t s = alphabet->getStateCodingSize(); for (size_t i = 0; i < cs.size(); ++i) { if (cs[i] == 1 && i > 0 && cs[i-1] != 1) csBeg = i; if (cs[i] != 1 && i > 0 && cs[i-1] == 1) { csEnd = i; csRanges.addRange(Range<size_t>(csBeg * s, csEnd * s)); } if (sps[i] >= spsThreshold && i > 0 && sps[i-1] < spsThreshold) spsBeg = i; if (sps[i] < spsThreshold && i > 0 && sps[i-1] >= spsThreshold) { spsEnd = i; spsRanges.addRange(Range<size_t>(spsBeg * s, spsEnd * s)); } } //Add the last range if any: if (cs.back() == 1) csRanges.addRange(Range<size_t>(csBeg * s, cs.size() * s)); if (sps.back() >= spsThreshold) spsRanges.addRange(Range<size_t>(spsBeg * s, sps.size() * s)); MaseHeader header; header.setSiteSelection("CS", csRanges); header.setSiteSelection("SPS", spsRanges); Mase writer; writer.writeMeta(outputFilter, *sitesTest, header); } // We're done! bppalnscore.done(); } catch (exception& e) { cout << e.what() << endl; return 1; } return 0; }