Beispiel #1
0
void Mase::writeHeader_(std::ostream& output, const MaseHeader& header) const
{
  //Write trees:
  vector<string> treeNames = header.getTreeNames();
  for (size_t i = 0; i < treeNames.size(); ++i) {
    output << ";;$ " + treeNames[i] << endl;
    output << ";;" + header.getTree(treeNames[i]);
    output << endl;
  }

  //Write site selections:
  vector<string> siteSelectionNames = header.getSiteSelectionNames();
  for (size_t i = 0; i < siteSelectionNames.size(); ++i) {
    MultiRange<size_t> ranges = header.getSiteSelection(siteSelectionNames[i]);
    output << ";;Site selection " << siteSelectionNames[i] << " (" << ranges.totalLength() << " sites)" << endl;
    output << ";;# of segments=" << ranges.size() << " " << siteSelectionNames[i] << endl;
    output << ";;";
    for (size_t j = 0; j < ranges.size(); ++j) {
      output << " " << (ranges.getRange(j).begin() + 1) << "," << ranges.getRange(j).end();
      if ((j + 1) % 10 == 0)
        output << endl << ";;";
    }
    output << endl;
  }

  //Write sequence selections:
  vector<string> sequenceSelectionNames = header.getSequenceSelectionNames();
  for (size_t i = 0; i < sequenceSelectionNames.size(); ++i) {
    vector<size_t> set = header.getSequenceSelection(sequenceSelectionNames[i]);
    output << ";;@ of species=" << set.size() << " " << sequenceSelectionNames[i] << endl;
    output << ";;";
    for (unsigned int j = 0; j < set.size(); ++j) {
      output << " " << set[j];
      if ((j + 1) % 10 == 0)
        output << endl << ";;";
    }
    output << endl;
  }
}
Beispiel #2
0
void Mase::readHeader_(std::istream& input, MaseHeader& header) const throw (Exception)
{
  do {
    //Check if the line is a header line:
    if (input.peek() == ';') {
      char c;
      input.get(c);
      if (input.peek() == ';') {
        input.get(c);
        string line = FileTools::getNextLine(input);
        
        //Check the type of line...

        //Site selection:
        string::size_type index = line.find("# of");
        if (index < line.npos) {
          StringTokenizer st(string(line.begin() + static_cast<ptrdiff_t>(index + 4), line.end()), " \t=;");
          st.nextToken(); //skip next word: may be 'regions' or 'segments' or else ;-)
          unsigned int numberOfSegments = TextTools::to<unsigned int>(st.nextToken());
          string name = st.unparseRemainingTokens();
          //Then look for the set definition:
          MultiRange<size_t> siteSelection;
          while (siteSelection.size() < numberOfSegments) {
            line = FileTools::getNextLine(input);
            if (line[0] != ';' || line[1] != ';')
              throw Exception("Mase::readHeader_(): corrupted file, site selection " + name + " is incomplete. Aborting.");
            line = line.substr(2);
            StringTokenizer st2(line);
            while (st2.hasMoreToken()) {
              StringTokenizer st3(st2.nextToken(), ",");
              unsigned int begin = TextTools::to<unsigned int>(st3.nextToken());
              unsigned int end   = TextTools::to<unsigned int>(st3.nextToken());
              //WARNING!!! In the mase+ format, sites numerotation is 1-based, including, while ranges are 0-based, [a, b[:
              siteSelection.addRange(Range<size_t>(begin - 1, end));
            }
            if (siteSelection.size() > numberOfSegments)
              throw Exception("Mase::readHeader_(): incorrected file, found " + TextTools::toString(siteSelection.size()) + "segments while expected " + TextTools::toString(numberOfSegments));
          }
          header.setSiteSelection(name, siteSelection);
        } else {
          //Sequence selection:
          index = line.find("@ of");
          if (index < line.npos) {
            StringTokenizer st(line.substr(index + 4), " \t=;");
            st.nextToken(); //skip next word: may be 'sequences' or else ;-)
            unsigned int numberOfSequences = TextTools::to<unsigned int>(st.nextToken());
            string name = st.unparseRemainingTokens();
            //The look for the set definition:
            vector<size_t> sequenceSelection;
            while (sequenceSelection.size() < numberOfSequences) {
              line = FileTools::getNextLine(input);
              if (line[0] != ';' || line[1] != ';')
                throw Exception("Mase::readHeader_(): corrupted file, sequence selection " + name + " is incomplete. Aborting.");
              line = line.substr(2);
              StringTokenizer st2(line, ", ");
              while (st2.hasMoreToken()) {
                unsigned int pos = TextTools::to<unsigned int>(st2.nextToken());
                //WARNING!!! In the mase+ format, sequence numerotation is 1-based 
                sequenceSelection.push_back(pos);
            }
            if (sequenceSelection.size() > numberOfSequences)
              throw Exception("Mase::readHeader_(): incorrected file, found " + TextTools::toString(sequenceSelection.size()) + "sequences while expected " + TextTools::toString(numberOfSequences));
            }
            header.setSequenceSelection(name, sequenceSelection);
          } else {
            //Tree:
            index = line.find("$");
            if (index < line.npos) {
              string name = TextTools::removeSurroundingWhiteSpaces(line.substr(index + 1));
              //Here we stop if the line ends with a ";"
              string tree = "";
              do {
                line = FileTools::getNextLine(input);
                if (line[0] != ';' || line[1] != ';')
                  throw Exception("Mase::readHeader_(): corrupted file, tree " + name + " is incomplete. Aborting.");
                line = TextTools::removeSurroundingWhiteSpaces(line.substr(2));
                tree += line;
              } while (! TextTools::endsWith(line, ";"));
              header.setTree(name, tree);
            }
          }
        }
      } else {
        input.putback(c);
        break;
      }
    }
  } while (true);
}
Beispiel #3
0
/**
 * @brief Entry point of the alignment scoring program.
 *
 * Reads a test and a reference alignment, computes per-site column scores and
 * sum-of-pairs scores of the test alignment against the reference, optionally
 * replaces the scores of each word of 'score.word_size' sites by the minimum
 * over the word, and writes the scores to a tabulated file ('output.scores')
 * and/or site selections to a mase file ('output.mase').
 *
 * @param args Number of command line arguments.
 * @param argv Command line arguments.
 * @return 0 on success or when only the help is displayed, 1 if an exception was caught.
 */
int main(int args, char** argv)
{
  cout << "******************************************************************" << endl;
  cout << "*              Bio++ Alignment Score, version 2.3.0              *" << endl;
  cout << "* Author: J. Dutheil                        Last Modif. 25/11/14 *" << endl;
  cout << "******************************************************************" << endl;
  cout << endl;

  // No argument given: only print the help.
  if (args == 1)
  {
    help();
    return 0;
  }

  try
  {
    BppApplication bppalnscore(args, argv, "BppAlnScore");
    bppalnscore.startTimer();

    // Get alphabet
    // NOTE(review): this raw pointer is never released in this function --
    // confirm who owns the alphabet before adding a delete or a smart pointer.
    Alphabet* alphabet = SequenceApplicationTools::getAlphabet(bppalnscore.getParams(), "", false, true, true);

    // Get the test alignment:
    auto_ptr<SiteContainer> sitesTest(SequenceApplicationTools::getSiteContainer(alphabet, bppalnscore.getParams(), ".test", false, true));

    // Get the reference alignment:
    auto_ptr<SiteContainer> sitesRef(SequenceApplicationTools::getSiteContainer(alphabet, bppalnscore.getParams(), ".ref", false, true));

    // We check if the two alignments are compatible:
    vector<string> namesTest = sitesTest->getSequencesNames();
    vector<string> namesRef  = sitesRef->getSequencesNames();
    if (namesTest != namesRef)
    {
      // Rebuild the reference alignment with its sequences in the order of the test alignment.
      ApplicationTools::displayTask("Reorder sequences in ref. alignment", true);
      auto_ptr<AlignedSequenceContainer> tmp(new AlignedSequenceContainer(sitesRef->getAlphabet()));
      for (size_t i = 0; i < namesTest.size(); ++i)
      {
        ApplicationTools::displayGauge(i, namesTest.size() - 1);
        try
        {
          tmp->addSequence(sitesRef->getSequence(namesTest[i]));
        }
        catch (SequenceNotFoundException& ex)
        {
          throw Exception("ERROR!!! Reference alignment should contain the same sequences as the test alignment!");
        }
      }
      ApplicationTools::displayTaskDone();
      sitesRef = tmp;
    }

    // Build alignment indexes:
    RowMatrix<size_t> indexTest, indexRef;
    SiteContainerTools::getSequencePositions(*sitesTest, indexTest);
    SiteContainerTools::getSequencePositions(*sitesRef,  indexRef);

    // Now build scores:
    int na = ApplicationTools::getIntParameter("score.na", bppalnscore.getParams(), 0);
    ApplicationTools::displayResult("NA value to used", na);
    vector<int> cs = SiteContainerTools::getColumnScores(indexTest, indexRef, na);
    vector<double> sps = SiteContainerTools::getSumOfPairsScores(indexTest, indexRef, static_cast<double>(na));

    // Should scores be averaged for words?
    size_t wsize = ApplicationTools::getParameter<size_t>("score.word_size", bppalnscore.getParams(), 1);
    size_t phase = 0;
    if (wsize > 1)
    {
      ApplicationTools::displayResult("Scores uniformized for words of size", wsize);
      string phaseOpt = ApplicationTools::getStringParameter("score.phase", bppalnscore.getParams(), "1");
      if (TextTools::isDecimalInteger(phaseOpt))
      {
        // The phase is given as a 1-based position and converted to a 0-based index.
        phase = TextTools::to<size_t>(phaseOpt);
        if (phase == 0)
          throw Exception("ERROR: positions are 1-based.");
        phase--;
      }
      else
      {
        // We look for the first occurrence of the given motif:
        try
        {
          BasicSequence motif("motif", phaseOpt, sitesTest->getAlphabet());
          ApplicationTools::displayResult("Phase based on 1st occurence of", motif.toString());
          // Keep the smallest position found over all sequences.
          size_t pos = sitesTest->getNumberOfSites();
          for (size_t i = 0; i < sitesTest->getNumberOfSequences(); ++i)
          {
            size_t p = SequenceTools::findFirstOf(sitesTest->getSequence(i), motif);
            if (p < pos)
              pos = p;
          }
          phase = pos;
        }
        catch (Exception& ex)
        {
          throw Exception("Error, unvalid motif specified for phase option.");
        }
      }
      ApplicationTools::displayResult("First word starts at", phase + 1);

      // A phase beyond the last score would make the loop below write past the
      // end of the score vectors.
      if (phase > cs.size())
        throw Exception("ERROR: phase " + TextTools::toString(phase + 1) + " is beyond the end of the alignment.");

      // Now perform the smoothing:
      // 1) sites before the first word get a null score,
      size_t i;
      for (i = 0; i < phase; ++i)
      {
        cs[i] = 0;
        sps[i] = 0;
      }
      // 2) every complete word gets the minimum score found over its sites,
      for ( ; i + wsize <= cs.size(); i += wsize)
      {
        // First compute minimum criterion:
        int csmin = 1;
        double spsmin = 1;
        for (size_t j = i; j < i + wsize; ++j)
        {
          if (cs[j] < csmin)
            csmin = cs[j];
          if (sps[j] < spsmin)
            spsmin = sps[j];
        }
        // Assign min to all positions in word:
        for (size_t j = i; j < i + wsize; ++j)
        {
          cs[j] = csmin;
          sps[j] = spsmin;
        }
      }
      // 3) sites of the trailing, incomplete word get a null score.
      for ( ; i < cs.size(); ++i)
      {
        cs[i] = 0;
        sps[i] = 0;
      }
    }

    // Output scores to file:
    string outputScores = ApplicationTools::getAFilePath("output.scores", bppalnscore.getParams(), false, false);
    if (outputScores != "none")
    {
      ApplicationTools::displayResult("Output scores to", outputScores);
      ofstream output(outputScores.c_str(), ios::out);
      // Report an unwritable file instead of silently producing nothing.
      if (!output)
        throw Exception("ERROR: unable to open file " + outputScores + " for writing.");
      output << "Site\tColumnScore\tSumOfPairsScore" << endl;
      for (size_t i = 0; i < cs.size(); ++i)
      {
        output << sitesTest->getSite(i).getPosition() << "\t" << cs[i] << "\t" << sps[i] << endl;
      }
      output.close();
    }

    // Create a sequence filter:
    string outputFilter = ApplicationTools::getAFilePath("output.mase", bppalnscore.getParams(), false, false);
    if (outputFilter != "none")
    {
      ApplicationTools::displayResult("Output mase with site filter to", outputFilter);
      double spsThreshold = ApplicationTools::getDoubleParameter("output.sps_thresholds", bppalnscore.getParams(), 0.8);
      ApplicationTools::displayResult("Threshold for SPS", spsThreshold);

      // Collect the runs of consecutive sites with a column score of 1 (CS)
      // and with a sum-of-pairs score of at least the threshold (SPS).
      // Range bounds are multiplied by the state coding size of the alphabet.
      MultiRange<size_t> csRanges;
      MultiRange<size_t> spsRanges;
      size_t csBeg = 0, spsBeg = 0, csEnd = 0, spsEnd = 0;
      size_t s = alphabet->getStateCodingSize();
      for (size_t i = 0; i < cs.size(); ++i)
      {
        // A run starts when the score becomes 1 and ends when it stops being 1.
        // A run starting at site 0 relies on csBeg being initialized to 0.
        if (cs[i] == 1 && i > 0 && cs[i-1] != 1)
          csBeg = i;
        if (cs[i] != 1 && i > 0 && cs[i-1] == 1) {
          csEnd = i;
          csRanges.addRange(Range<size_t>(csBeg * s, csEnd * s));
        }

        // Same logic for the sum-of-pairs scores, using the threshold.
        if (sps[i] >= spsThreshold && i > 0 && sps[i-1] < spsThreshold)
          spsBeg = i;
        if (sps[i] < spsThreshold && i > 0 && sps[i-1] >= spsThreshold) {
          spsEnd = i;
          spsRanges.addRange(Range<size_t>(spsBeg * s, spsEnd * s));
        }
      }
      //Add the last range if any (back() is undefined on an empty vector):
      if (!cs.empty() && cs.back() == 1)
        csRanges.addRange(Range<size_t>(csBeg * s, cs.size() * s));
      if (!sps.empty() && sps.back() >= spsThreshold)
        spsRanges.addRange(Range<size_t>(spsBeg * s, sps.size() * s));

      MaseHeader header;
      header.setSiteSelection("CS", csRanges);
      header.setSiteSelection("SPS", spsRanges);
      Mase writer;
      writer.writeMeta(outputFilter, *sitesTest, header);
    }

    // We're done!
    bppalnscore.done();
  }
  catch (const exception& e)
  {
    cout << e.what() << endl;
    return 1;
  }

  return 0;
}