Example #1
0
/**
 * Build a new alignment holding the sequences of `sites` ordered by name.
 *
 * @param sites     Source alignment; it is only read, never modified.
 * @param ascending If true, names are sorted with the default string order;
 *                  otherwise they are sorted with greater<string>.
 * @return A newly allocated VectorSiteContainer built from the sorted sequences.
 */
shared_ptr<VectorSiteContainer> SiteContainerBuilder::construct_sorted_alignment(VectorSiteContainer *sites,
        bool ascending) {
    // Automatic storage: the temporary container is released on every exit
    // path, including when getSequence()/addSequence()/make_shared throw.
    VectorSequenceContainer sorted(sites->getAlphabet());

    vector<string> names = sites->getSequencesNames();
    if (ascending) sort(names.begin(), names.end());
    else sort(names.begin(), names.end(), greater<string>());

    for (const string &name : names) {
        sorted.addSequence(sites->getSequence(name));
    }
    return make_shared<VectorSiteContainer>(sorted);
}
Example #2
0
/**
 * Read GenBank-formatted records from a stream and append them to a container.
 *
 * For each record, the first token following the "ACCESSION" keyword is used
 * as the sequence name, and the lines between "ORIGIN" and the "//" terminator
 * (or a line of two characters or fewer) are concatenated, from column 10
 * onward and with white spaces removed, to form the sequence.
 *
 * @param input Stream to read from.
 * @param vsc   Container receiving the sequences; its alphabet is used to
 *              build them.
 * @throw IOException If the stream is not in a good state on entry.
 * @throw Exception   If an ORIGIN section is met with no ACCESSION name set.
 */
void GenBank::appendFromStream(istream & input, VectorSequenceContainer & vsc) const throw (Exception)
{
    if (!input) {
        throw IOException ("GenBank::read: fail to open file");
    }

    string temp, name, sequence;

    // Main loop: iterate on successful reads only, so that a stream which
    // fails without reaching EOF cannot cause an endless loop.
    while (getline(input, temp, '\n'))
    {
        if (temp.compare(0, 9, "ACCESSION") == 0)
        {
            // The accession starts after the keyword; a bare keyword line
            // yields an empty name instead of an out-of-range substr().
            string accession;
            if (temp.size() > 10)
                accession = TextTools::removeSurroundingWhiteSpaces(temp.substr(10));
            if (!accession.empty())
            {
                StringTokenizer st(accession, " ");
                name = st.nextToken();
            }
            else
            {
                name = "";
            }
        }
        if (temp.compare(0, 6, "ORIGIN") == 0)
        {
            sequence = "";
            // Consume sequence lines until the "//" terminator, a too-short
            // line, or the end of the stream. Testing the read itself (rather
            // than eof()) keeps a last line that has no trailing newline.
            while (getline(input, temp, '\n') && temp.size() > 2 && temp.compare(0, 2, "//") != 0)
            {
                // The first 10 columns hold the position counter; lines too
                // short to carry residues are skipped.
                if (temp.size() > 10)
                    sequence += TextTools::removeWhiteSpaces(temp.substr(10));
            }
            if(name == "") throw Exception("GenBank::read(). Sequence with no ACCESSION number!");
            // The container stores its own copy, so an automatic object is
            // enough (a heap-allocated one would never be freed).
            Sequence seq(name, sequence, vsc.getAlphabet());
            vsc.addSequence(seq);
            name = "";
        }
    }
}
Example #3
0
int main(int args, char** argv)
{
  cout << "******************************************************************" << endl;
  cout << "*           Bio++ Sequence Manipulator, version 2.3.0.           *" << endl;
  cout << "* Author: J. Dutheil                        Last Modif. 25/11/14 *" << endl;
  cout << "******************************************************************" << endl;
  cout << endl;
  
  if (args == 1)
  {
    help();
    return 0;
  }
  
  try {

  BppApplication bppseqman(args, argv, "BppSeqMan");
  bppseqman.startTimer();
  
  // Get alphabet
  Alphabet* alphabet = SequenceApplicationTools::getAlphabet(bppseqman.getParams(), "", false, true, true);
  unique_ptr<GeneticCode> gCode;
  CodonAlphabet* codonAlphabet = dynamic_cast<CodonAlphabet*>(alphabet);

  // Get sequences:
  bool aligned = ApplicationTools::getBooleanParameter("input.alignment", bppseqman.getParams(), false, "", true, 1);
  OrderedSequenceContainer* sequences = 0;

  if (aligned) {
    VectorSiteContainer* allSites = SequenceApplicationTools::getSiteContainer(alphabet, bppseqman.getParams());
    sequences = SequenceApplicationTools::getSitesToAnalyse(*allSites, bppseqman.getParams(), "", true, false);
    delete allSites;
  } else {
    SequenceContainer* tmp = SequenceApplicationTools::getSequenceContainer(alphabet, bppseqman.getParams(), "", true, true);
    sequences = new VectorSequenceContainer(*tmp);
    delete tmp;
  }

  ApplicationTools::displayResult("Number of sequences", sequences->getNumberOfSequences());
  
  // Perform manipulations
  
  vector<string> actions = ApplicationTools::getVectorParameter<string>("sequence.manip", bppseqman.getParams(), ',', "", "", false, 1);
  

  for (size_t a = 0; a < actions.size(); a++)
  {
    string cmdName;
    map<string, string> cmdArgs;
    KeyvalTools::parseProcedure(actions[a], cmdName, cmdArgs);
    ApplicationTools::displayResult("Performing action", cmdName);

    // +-----------------+
    // | Complementation |
    // +-----------------+
    if (cmdName == "Complement")
    {
      OrderedSequenceContainer* sc = 0;
      if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet());
      else         sc = new VectorSequenceContainer(sequences->getAlphabet());
      for (size_t i = 0; i < sequences->getNumberOfSequences(); i++)
      {
        Sequence* seq = SequenceTools::getComplement(sequences->getSequence(i));
        sc->addSequence(*seq, false);
        delete seq;
      }
      delete sequences;
      sequences = sc;
    }
    // +------------------------+
    // | (Reverse)Transcription |
    // +------------------------+
    else if (cmdName == "Transcript")
    {
      if (sequences->getAlphabet()->getAlphabetType() == AlphabetTools::DNA_ALPHABET.getAlphabetType())
      {
        OrderedSequenceContainer* sc = 0;
        if (aligned) sc = new VectorSiteContainer(&AlphabetTools::RNA_ALPHABET);
        else         sc = new VectorSequenceContainer(&AlphabetTools::RNA_ALPHABET);
        for (unsigned int i = 0; i < sequences->getNumberOfSequences(); i++)
        {
          Sequence* seq = SequenceTools::transcript(sequences->getSequence(i));
          sc->addSequence(*seq, false);
          delete seq;
        }
        delete sequences;
        sequences = sc;
      }
      else if (sequences->getAlphabet()->getAlphabetType() == AlphabetTools::RNA_ALPHABET.getAlphabetType())
      {
        OrderedSequenceContainer* sc = 0;
        if (aligned) sc = new VectorSiteContainer(&AlphabetTools::DNA_ALPHABET);
        else         sc = new VectorSequenceContainer(&AlphabetTools::DNA_ALPHABET);
        for (unsigned int i = 0; i < sequences->getNumberOfSequences(); i++)
        {
          Sequence* seq = SequenceTools::reverseTranscript(sequences->getSequence(i));
          sc->addSequence(*seq, false);
          delete seq;
        }
        delete sequences;
        sequences = sc;
      }
      else throw Exception("Transcription error: input alphabet must be of type 'nucleic'.");
    }
    // +-------------------------------+
    // | Switching nucleotide alphabet |
    // +-------------------------------+
    else if (cmdName == "Switch")
    {
      const Alphabet* alpha = 0;
      if (sequences->getAlphabet()->getAlphabetType() == AlphabetTools::DNA_ALPHABET.getAlphabetType())
      {
        alpha = &AlphabetTools::RNA_ALPHABET;
      }
      else if (sequences->getAlphabet()->getAlphabetType() == AlphabetTools::RNA_ALPHABET.getAlphabetType())
      {
        alpha = &AlphabetTools::DNA_ALPHABET;
      }
      else throw Exception("Cannot switch alphabet type, alphabet is not of type 'nucleic'.");
      OrderedSequenceContainer* sc = 0;
      if (aligned) sc = new VectorSiteContainer(alpha);
      else         sc = new VectorSequenceContainer(alpha);
      for (size_t i = 0; i < sequences->getNumberOfSequences(); i++)
      {
        const Sequence* old = &sequences->getSequence(i);
        vector<int> content(old->size());
        for (size_t j = 0; j < old->size(); ++j)
          content[j] = (*old)[j];
        Sequence* seq = new BasicSequence(old->getName(), content, old->getComments(), alpha);
        sc->addSequence(*seq, false);
        delete seq;
      }
      delete sequences;
      sequences = sc;
    }
    // +-------------+
    // | Translation |
    // +-------------+
    else if (cmdName == "Translate")
    {
      if (!AlphabetTools::isCodonAlphabet(sequences->getAlphabet()))
        throw Exception("Error in translation: alphabet is not of type 'codon'.");
      if (cmdArgs["code"] != "")
        throw Exception("ERROR: 'code' argument is deprecated. The genetic code to use for translation is now set by the top-level argument 'genetic_code'.");
      if (!gCode.get()) {
        string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppseqman.getParams(), "Standard", "", true, 1);
        ApplicationTools::displayResult("Genetic Code", codeDesc);
        gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc));
      }

      OrderedSequenceContainer* sc = 0;
      if (aligned) sc = new VectorSiteContainer(&AlphabetTools::PROTEIN_ALPHABET);
      else         sc = new VectorSequenceContainer(&AlphabetTools::PROTEIN_ALPHABET);
      for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i)
      {
        Sequence* seq = gCode->translate(sequences->getSequence(i));
        sc->addSequence(*seq, false);
        delete seq;
      }
      delete sequences;
      sequences = sc;      
    }
    // +-------------+
    // | Remove gaps |
    // +-------------+
    else if (cmdName == "RemoveGaps")
    {
      VectorSequenceContainer* sc = new VectorSequenceContainer(sequences->getAlphabet());
      for (size_t i = 0; i < sequences->getNumberOfSequences(); i++)
      {
        unique_ptr<Sequence> seq(sequences->getSequence(i).clone());
        SequenceTools::removeGaps(*seq);
        sc->addSequence(*seq);
      }
      delete sequences;
      sequences = sc;
      aligned = false;
    }
    // +---------------------------+
    // | Change gaps to unresolved |
    // +---------------------------+
    else if (cmdName == "GapToUnknown")
    {
      OrderedSequenceContainer* sc = 0;
      if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet());
      else         sc = new VectorSequenceContainer(sequences->getAlphabet());
      for (size_t i = 0; i < sequences->getNumberOfSequences(); i++)
      {
        Sequence* seq = new BasicSequence(sequences->getSequence(i));
        SymbolListTools::changeGapsToUnknownCharacters(*seq);
        sc->addSequence(*seq, false);
        delete seq;
      }
      delete sequences;
      sequences = sc;
    }
    // +---------------------------+
    // | Change unresolved to gaps |
    // +---------------------------+
    else if (cmdName == "UnknownToGap")
    {
      OrderedSequenceContainer* sc = 0;
      if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet());
      else         sc = new VectorSequenceContainer(sequences->getAlphabet());
      for (size_t i = 0; i < sequences->getNumberOfSequences(); i++)
      {
        Sequence* seq = new BasicSequence(sequences->getSequence(i));
        SymbolListTools::changeUnresolvedCharactersToGaps(*seq);
        sc->addSequence(*seq, false);
        delete seq;
      }
      delete sequences;
      sequences = sc;
    }
    
    // +--------------+
    // | Remove stops |
    // +--------------+
    else if (cmdName == "RemoveStops")
    {
      if (!gCode.get()) {
        string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppseqman.getParams(), "Standard", "", true, 1);
        ApplicationTools::displayResult("Genetic Code", codeDesc);
        gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc));
      }
      SiteContainer* sites = dynamic_cast<SiteContainer*>(sequences);
      if (!sites)
      {
        VectorSequenceContainer* sc = new VectorSequenceContainer(sequences->getAlphabet());
        for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i)
        {
          unique_ptr<Sequence> seq(sequences->getSequence(i).clone());
          SequenceTools::removeStops(*seq, *gCode);
          sc->addSequence(*seq);
        }
        delete sequences;
        sequences = sc;
      } else {
        VectorSiteContainer* sc = new VectorSiteContainer(sequences->getAlphabet());
        for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i)
        {
          unique_ptr<Sequence> seq(sequences->getSequence(i).clone());
          SequenceTools::replaceStopsWithGaps(*seq, *gCode);
          sc->addSequence(*seq);
        }
        delete sequences;
        sequences = sc;
      }
    }

    // +--------------+
    // | Remove stops |
    // +--------------+
    else if (cmdName == "RemoveColumnsWithStops")
    {
      SiteContainer* sites = dynamic_cast<SiteContainer*>(sequences);
      if (!sites)
      {
        throw Exception("'RemoveColumnsWithStops' can only be used on alignment. You may consider using the 'CoerceToAlignment' command.");
      }
      if (!gCode.get()) {
        string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppseqman.getParams(), "Standard", "", true, 1);
        ApplicationTools::displayResult("Genetic Code", codeDesc);
        gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc));
      }

      for (size_t i = sites->getNumberOfSites(); i > 0; i--)
      {
        if (CodonSiteTools::hasStop(sites->getSite(i-1), *gCode))
          sites->deleteSite(i - 1);
      }
    }

    // +---------+
    // | Get CDS |
    // +---------+
    else if (cmdName == "GetCDS")
    {
      if (!gCode.get()) {
        string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppseqman.getParams(), "Standard", "", true, 1);
        ApplicationTools::displayResult("Genetic Code", codeDesc);
        gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc));
      }
      OrderedSequenceContainer* sc = 0;
      if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet());
      else         sc = new VectorSequenceContainer(sequences->getAlphabet());
      for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i)
      {
        BasicSequence seq = sequences->getSequence(i);
        size_t len = seq.size();
        SequenceTools::getCDS(seq, *gCode, false, true, true, false);
        if (aligned) {
          for (size_t c = seq.size(); c < len; ++c)
            seq.addElement(seq.getAlphabet()->getGapCharacterCode());
        }
        sc->addSequence(seq, false);
      }
      delete sequences;
      sequences = sc;
    }

    // +--------------------------+
    // | Resolve dotted alignment |
    // +--------------------------+
    else if (actions[a] == "CoerceToAlignment")
    {
      SiteContainer* sites = dynamic_cast<SiteContainer*>(sequences);
      if(! sites)
      {
        sites = new VectorSiteContainer(*sequences);
        delete sequences;
        sequences = sites;
      }
      aligned = true;
    }
    else if (actions[a] == "ResolvedDotted")
    {
      SiteContainer* sites = dynamic_cast<SiteContainer *>(sequences);
      if (!sites)
      {
        throw Exception("'ResolvedDotted' can only be used on alignment. You may consider using the 'CoerceToAlignment' command.");
      }

      const Alphabet* alpha = 0;
      string alphastr = ApplicationTools::getStringParameter("alphabet", cmdArgs, "DNA", "", false, 1);
      if (alphastr == "DNA") alpha = &AlphabetTools::DNA_ALPHABET;
      else if (alphastr == "RNA") alpha = &AlphabetTools::RNA_ALPHABET;
      else if (alphastr == "Protein") alpha = &AlphabetTools::PROTEIN_ALPHABET;
      else throw Exception("Resolved alphabet must be one of [DNA|RNA|Protein] for solving dotted alignment.");
      OrderedSequenceContainer* resolvedCont = SiteContainerTools::resolveDottedAlignment(*sites, alpha);
      delete sequences;
      sequences = resolvedCont;
    }
    // +---------------------+
    // | Keep complete sites |
    // +---------------------+
    else if (cmdName == "KeepComplete")
    {
      SiteContainer* sites = dynamic_cast<SiteContainer *>(sequences);
      if (!sites)
      {
        throw Exception("'KeepComplete' can only be used on alignment. You may consider using the 'CoerceToAlignment' command.");
      }

      string maxGapOption = ApplicationTools::getStringParameter("maxGapAllowed", cmdArgs, "100%", "", false, 1);
      if (maxGapOption[maxGapOption.size()-1] == '%')
      {
        double gapFreq = TextTools::toDouble(maxGapOption.substr(0, maxGapOption.size()-1)) / 100.;
        for (size_t i = sites->getNumberOfSites(); i > 0; i--)
        {
          map<int, double> freqs;
          SiteTools::getFrequencies(sites->getSite(i - 1), freqs);
          if (freqs[-1] > gapFreq) sites->deleteSite(i - 1);
        }
      }
      else
      {
        size_t gapNum = TextTools::to<size_t>(maxGapOption);
        for (size_t i = sites->getNumberOfSites(); i > 0; i--)
        {
          map<int, size_t> counts;
          SiteTools::getCounts(sites->getSite(i - 1), counts);
          counts[-1]; //Needed in case this entry does not exist in the map. This will set it to 0.
          if (counts[-1] > gapNum) sites->deleteSite(i-1);
        }
      }
    }
    // +-----------------+
    // | Invert sequence |
    // +-----------------+
    else if (cmdName == "Invert")
    {
      OrderedSequenceContainer* sc = 0;
      if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet());
      else         sc = new VectorSequenceContainer(sequences->getAlphabet());
      for (size_t i = 0; i < sequences->getNumberOfSequences(); i++)
      {
        const Sequence* old = &sequences->getSequence(i);
        Sequence* seq = SequenceTools::getInvert(*old);
        sc->addSequence(*seq, false);
        delete seq;
      }
      delete sequences;
      sequences = sc;
    }
    // +------------------+
    // | GetCodonPosition |
    // +------------------+
    else if (cmdName == "GetCodonPosition")
    {
      unsigned int pos = ApplicationTools::getParameter<unsigned int>("position", cmdArgs, 3, "", false, 1);
      OrderedSequenceContainer* sc = dynamic_cast<OrderedSequenceContainer*>(SequenceContainerTools::getCodonPosition(*sequences, pos - 1));
      delete sequences;
      if (aligned) {
        sequences = new VectorSiteContainer(*sc);
        delete sc;
      } else {
        sequences = sc;
      }
    }
    // +-----------------+
    // | FilterFromTree |
    // +-----------------+
    else if (cmdName == "FilterFromTree")
    {
      unique_ptr<Tree> tree(PhylogeneticsApplicationTools::getTree(cmdArgs, ""));
      vector<string> names = tree->getLeavesNames();
      OrderedSequenceContainer* reorderedSequences = 0;
      if (aligned) {
        reorderedSequences = new VectorSiteContainer(sequences->getAlphabet());
      } else {
        reorderedSequences = new VectorSequenceContainer(sequences->getAlphabet());
      }
      for (size_t i = 0; i < names.size(); ++i) {
        reorderedSequences->addSequence(sequences->getSequence(names[i]), false);
      }
      delete sequences;
      sequences = reorderedSequences;
    }
    // +----------------------+
    // | RemoveEmptySequences |
    // +----------------------+
    else if (cmdName == "RemoveEmptySequences")
    {
      OrderedSequenceContainer* sc = 0;
      if (aligned) sc = new VectorSiteContainer(sequences->getAlphabet());
      else         sc = new VectorSequenceContainer(sequences->getAlphabet());
      for (size_t i = 0; i < sequences->getNumberOfSequences(); ++i)
      {
        if (SequenceTools::getNumberOfSites(sequences->getSequence(i))!=0)
          sc->addSequence(sequences->getSequence(i), false);
      }
      delete sequences;
      sequences = sc;
    }

    else throw Exception("Unknown action: " + cmdName);
  }
  
  // Write sequences
  ApplicationTools::displayBooleanResult("Final sequences are aligned", aligned);
  if (aligned)
  {
    SequenceApplicationTools::writeAlignmentFile(*dynamic_cast<SiteContainer*>(sequences), bppseqman.getParams(), "", true, 1);
  }
  else
  {
    SequenceApplicationTools::writeSequenceFile(*sequences, bppseqman.getParams(), "", true, 1);
  }

  delete alphabet;
  delete sequences;

  bppseqman.done();

  } catch(exception & e) {
    cout << e.what() << endl;
    return 1;
  }

  return 0;
}
Example #4
0
/**
 * Read Mase-formatted sequences from a stream and append them to a container.
 *
 * Lines starting with ";;" are appended to the general comments of the
 * container. Lines starting with a single ";" are sequence comments; the first
 * line that follows a run of sequence comments is taken as the sequence name,
 * and the subsequent lines are concatenated to form the sequence.
 *
 * @param input Stream to read from.
 * @param vsc   Container receiving the sequences and the general comments;
 *              its alphabet is used to build the sequences.
 * @throw IOException If the stream is not in a good state on entry.
 */
void Mase::appendFromStream(istream & input, VectorSequenceContainer & vsc) const throw (Exception)
{
	if (!input) { throw IOException ("Mase::read : fail to open file"); }

	// Initialization
	Comments seqComments, fileComments;
	string temp, name, sequence;
	bool comments = false;

	// Start from the general comments already stored in the container
	fileComments = vsc.getGeneralComments();

	// Main loop: iterate on successful reads only, so that a stream which
	// fails without reaching EOF cannot cause an endless loop.
	while (getline(input, temp, '\n'))
	{
		const bool startsWithSemicolon = !temp.empty() && temp[0] == ';';
		if (startsWithSemicolon)
		{
			if (temp.size() > 1 && temp[1] == ';')
			{
				// File comment: strip the leading ";;"
				temp.erase(0, 2);
				if (temp != "") fileComments.push_back(temp);
			}
			else
			{
				// A sequence comment starts a new record: flush the
				// previous one if both its name and content were found.
				if ((name != "") && (sequence != ""))
				{
					vsc.addSequence(Sequence(name, sequence, seqComments, vsc.getAlphabet()), _checkNames);
					name = "";
					sequence = "";
					seqComments.clear();
				}

				// Sequence comment: strip the leading ";"
				temp.erase(temp.begin());
				if (temp != "") seqComments.push_back(temp);
				comments = true;
			}
		}
		else
		{
			if (comments)
			{
				// The first line after the sequence comments is the name
				name = temp;
				comments = false;
			}
			else sequence += temp;  // Any other line is sequence content
		}
	}

	// Addition of the last sequence in file
	if ((name != "") && (sequence != ""))
	{
		vsc.addSequence(Sequence(name, sequence, seqComments, vsc.getAlphabet()), _checkNames);
	}

	// Set new general comments in VectorSequenceContainer (old + new comments)
	vsc.setGeneralComments(fileComments);
}