// Truncates `sequence` (and this object's quality scores) just before the
// first base whose quality score is below qThreshold.
//
//   sequence   - read to trim; its unaligned string is replaced by the kept prefix.
//   qThreshold - minimum acceptable per-base score.
//
// Always returns 1 (true). A name mismatch between the sequence and the quality
// record is only reported, not treated as an error. Any std::exception is
// reported through m->errorOut and terminates the program.
bool QualityScores::stripQualThreshold(Sequence& sequence, double qThreshold){
	try {
		string rawSequence = sequence.getUnaligned();
		int seqLength = sequence.getNumBases();

		if(seqName != sequence.getName()){
			m->mothurOut("sequence name mismatch btwn fasta: " + sequence.getName() + " and qual file: " + seqName);
			m->mothurOutEndLine();
		}

		// Start from "every score passed" and only move `end` when a base actually
		// fails. This keeps `end` defined for empty sequences and stops a failing
		// LAST base from being mistaken for a fully passing read.
		int end = seqLength;
		for(int i=0;i<seqLength;i++){
			if(qScores[i] < qThreshold){
				end = i;
				break;
			}
		}

		sequence.setUnaligned(rawSequence.substr(0,end));
		trimQScores(-1, end);

		return 1;
	}
	catch(exception& e) {
		m->errorOut(e, "QualityScores", "stripQualThreshold");
		exit(1);
	}
}
// Builds a fastq record from a sequence and its matching quality scores.
//
//   s - sequence supplying name, comment and unaligned bases
//   q - quality scores; must carry the same name as s
//   f - format tag, stored as given
//
// When the names differ an error is printed and m->control_pressed is raised;
// the record fields are then left unset.
FastqRead::FastqRead(Sequence s, QualityScores q, string f) {
	try {
		m = MothurOut::getInstance();
		format = f;

		// Conversion tables, solexa -> sanger (fq_all2std.pl was the reference).
		// One entry per solexa value in [-64, 64]; the same rounded value is
		// stored as a char in convertTable and as a number in convertBackTable.
		for (int solexa = -64; solexa < 65; solexa++) {
			int sanger = (int)(33 + 10*log(1+pow(10,(solexa/10.0)))/log(10)+0.499);
			convertTable.push_back((char) sanger);
			convertBackTable.push_back(sanger);
		}

		if (s.getName() == q.getName()) {
			name = s.getName();
			comment = s.getComment();
			sequence = s.getUnaligned();
			scores = q.getScores();
			scoreString = convertQual(scores);
		}else {
			m->mothurOut("[ERROR]: sequence name does not match quality score name. Cannot construct fastq object.\n");
			m->control_pressed = true;
		}
	}
	catch(exception& e) {
		m->errorOut(e, "FastqRead", "FastqRead");
		exit(1);
	}
}
int ChimeraPintailCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, vector<unsigned long long>& MPIPos){ try { MPI_Status status; int pid; MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are for(int i=0;i<num;i++){ if (m->control_pressed) { return 1; } //read next sequence int length = MPIPos[start+i+1] - MPIPos[start+i]; char* buf4 = new char[length]; MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status); string tempBuf = buf4; if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); } istringstream iss (tempBuf,istringstream::in); delete buf4; Sequence* candidateSeq = new Sequence(iss); m->gobble(iss); if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file if (candidateSeq->getAligned().length() != templateSeqsLength) { //chimeracheck does not require seqs to be aligned m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); m->mothurOutEndLine(); }else{ //find chimeras chimera->getChimeras(candidateSeq); if (m->control_pressed) { delete candidateSeq; return 1; } //print results chimera->print(outMPI, outAccMPI); } } delete candidateSeq; //report progress if((i+1) % 100 == 0){ cout << "Processing sequence: " << (i+1) << endl; } } //report progress if(num % 100 != 0){ cout << "Processing sequence: " << num << endl; } return 0; } catch(exception& e) { m->errorOut(e, "ChimeraPintailCommand", "driverMPI"); exit(1); } }
// Trims `sequence` using a sliding window over the quality scores: windows of
// `windowSize` scores are averaged, advancing by `stepSize`, and the read is
// cut at the first window whose average is below qThreshold.
//
//   sequence   - read to trim; its unaligned string is replaced by the kept prefix
//   stepSize   - distance between consecutive window starts
//   windowSize - number of scores per window
//   qThreshold - minimum acceptable window average
//
// Returns 0 when the read is shorter than one window or the cut point falls
// before the end of the first window; otherwise trims and returns 1.
// NOTE(review): stepSize <= 0 is not guarded here - confirm callers validate it.
bool QualityScores::stripQualWindowAverage(Sequence& sequence, int stepSize, int windowSize, double qThreshold){
	try {
		string rawSequence = sequence.getUnaligned();
		int seqLength = sequence.getNumBases();

		if(seqName != sequence.getName()){
			m->mothurOut("sequence name mismatch between fasta: " + sequence.getName() + " and qual file: " + seqName);
			m->mothurOutEndLine();
		}

		int end = windowSize;
		int start = 0;

		if(seqLength < windowSize) {	return 0;	}

		while((start+windowSize) < seqLength){
			double windowSum = 0.0000;

			for(int i=start;i<end;i++){
				windowSum += qScores[i];
			}
			double windowAverage = windowSum / (double)(end-start);

			if(windowAverage < qThreshold){
				// back up by one step from the end of the failing window
				end = end - stepSize;
				break;
			}

			start += stepSize;
			end = start + windowSize;

			if(end >= seqLength){	end = seqLength;	}
		}

		// The former "if(end == -1){ end = seqLength; }" was dropped: `end` can only
		// be -1 here when the first window failed with stepSize == windowSize+1,
		// and that case must be rejected below, not turned into "keep everything".

		//failed first window
		if (end < windowSize) { return 0; }

		sequence.setUnaligned(rawSequence.substr(0,end));
		trimQScores(-1, end);

		return 1;
	}
	catch(exception& e) {
		m->errorOut(e, "QualityScores", "stripQualWindowAverage");
		exit(1);
	}
}
void SequenceTools::getPutativeHaplotypes(const Sequence& seq, std::vector<Sequence*>& hap, unsigned int level) { vector< vector< int > > states(seq.size()); list<Sequence*> t_hap; const Alphabet* alpha = seq.getAlphabet(); unsigned int hap_count = 1; // Vector of available states at each position for (size_t i = 0; i < seq.size(); i++) { vector<int> st = alpha->getAlias(seq[i]); if (!st.size()) { st.push_back(alpha->getGapCharacterCode()); } if (st.size() <= level) { states[i] = st; } else { states[i] = vector<int>(1, seq[i]); } } // Combinatorial haplotypes building (the use of tree may be more accurate) t_hap.push_back(new BasicSequence(seq.getName() + "_hap" + TextTools::toString(hap_count++), "", alpha)); for (size_t i = 0; i < states.size(); i++) { for (list<Sequence*>::iterator it = t_hap.begin(); it != t_hap.end(); it++) { for (unsigned int j = 0; j < states[i].size(); j++) { Sequence* tmp_seq = new BasicSequence(seq.getName() + "_hap", (**it).getContent(), alpha); if (j < states[i].size() - 1) { tmp_seq->setName(tmp_seq->getName() + TextTools::toString(hap_count++)); tmp_seq->addElement(states[i][j]); t_hap.insert(it, tmp_seq); } else { (**it).addElement(states[i][j]); } } } } for (list<Sequence*>::reverse_iterator it = t_hap.rbegin(); it != t_hap.rend(); it++) { hap.push_back(*it); } }
unsigned int SequenceFeatureTools::getOrfs(const Sequence& seq, SequenceFeatureSet& featSet, const GeneticCode& gCode) { if (! AlphabetTools::isNucleicAlphabet(seq.getAlphabet())) { throw AlphabetException("SequenceFeatureTools::getOrfs: Sequence alphabet must be nucleic!", seq.getAlphabet()); } unsigned int orfCpt = 0; const CodonAlphabet* codonAlpha = gCode.getSourceAlphabet(); std::vector< std::vector<size_t> > starts(3), stops(3); size_t phase = 0; for (size_t p = 0 ; p < seq.size() - 2 ; p++) { phase = p % 3; if (gCode.isStart(codonAlpha->getCodon(seq.getValue(p), seq.getValue(p + 1), seq.getValue(p + 2)))) { starts[phase].push_back(p); //std::cerr << "Start: " << p << " (" << phase << ")" << std::endl; } else if (gCode.isStop(codonAlpha->getCodon(seq.getValue(p), seq.getValue(p + 1), seq.getValue(p + 2)))) { stops[phase].push_back(p); //std::cerr << "Stop: " << p << " (" << phase << ")" << std::endl; } } for (size_t i = 0 ; i < 3 ; ++i) { std::vector< size_t >::iterator start(starts[i].begin()), stop(stops[i].begin()); while (stop != stops[i].end() && start != starts[i].end()) { if (*stop < *start) { stop++; } else { orfCpt++; //std::cerr << "ORF: " << *start << " - " << *stop + 2 << " (" << i << ")" << std::endl; bpp::BasicSequenceFeature feat("", seq.getName(), "Bio++", "CDS", *start, *stop + 2, '+'); featSet.addFeature(feat); start++; } } } return orfCpt; }
int ChimeraCheckCommand::driver(linePair* filePos, string outputFName, string filename){ try { ofstream out; m->openOutputFile(outputFName, out); ofstream out2; ifstream inFASTA; m->openInputFile(filename, inFASTA); inFASTA.seekg(filePos->start); bool done = false; int count = 0; while (!done) { if (m->control_pressed) { return 1; } Sequence* candidateSeq = new Sequence(inFASTA); m->gobble(inFASTA); if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file //find chimeras chimera->getChimeras(candidateSeq); if (m->control_pressed) { delete candidateSeq; return 1; } //print results chimera->print(out, out2); count++; } delete candidateSeq; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) unsigned long long pos = inFASTA.tellg(); if ((pos == -1) || (pos >= filePos->end)) { break; } #else if (inFASTA.eof()) { break; } #endif //report progress if((count) % 100 == 0){ m->mothurOutJustToScreen("Processing sequence: " + toString(count) + "\n"); } } //report progress if((count) % 100 != 0){ m->mothurOutJustToScreen("Processing sequence: " + toString(count) + "\n"); } out.close(); inFASTA.close(); return count; } catch(exception& e) { m->errorOut(e, "ChimeraCheckCommand", "driver"); exit(1); } }
void VectorSiteContainer::setSequence(size_t pos, const Sequence& sequence, bool checkNames) throw (Exception) { if (pos >= getNumberOfSequences()) throw IndexOutOfBoundsException("VectorSiteContainer::setSequence", pos, 0, getNumberOfSequences() - 1); // New sequence's alphabet and site container's alphabet matching verification if (sequence.getAlphabet()->getAlphabetType() != getAlphabet()->getAlphabetType()) throw AlphabetMismatchException("VectorSiteContainer::addSite", getAlphabet(), sequence.getAlphabet()); // If the container has only one sequence, we set the size to the size of this sequence: if (getNumberOfSequences() == 1) realloc(sequence.size()); if (sequence.size() != sites_.size()) throw SequenceException("VectorSiteContainer::setSequence. Sequence has not the appropriate length.", &sequence); if (checkNames) { for (size_t i = 0; i < names_.size(); i++) { if (i != pos && sequence.getName() == names_[i]) throw SequenceException("VectorSiteContainer::settSequence. Name already exists in container.", &sequence); } } // Update name: names_[pos] = sequence.getName(); // Update elements at each site: for (size_t i = 0; i < sites_.size(); i++) { sites_[i]->setElement(pos, sequence.getValue(i)); } // Update comments: if (comments_[pos]) delete comments_[pos]; comments_[pos] = new Comments(sequence.getComments()); // Update sequences: if (sequences_[pos]) delete sequences_[pos]; sequences_[pos] = 0; }
// Tells whether this read's average quality score reaches qAverage.
//
//   sequence - only its name is used, to warn when it differs from the name of
//              the quality record
//   qAverage - minimum acceptable average score
//
// Returns true when calculateAverage() >= qAverage, false otherwise.
bool QualityScores::cullQualAverage(Sequence& sequence, double qAverage){
	try {
		// (the former unused copy of the unaligned sequence string was removed)
		if(seqName != sequence.getName())	{
			m->mothurOut("sequence name mismatch btwn fasta: " + sequence.getName() + " and qual file: " + seqName);
			m->mothurOutEndLine();
		}

		double aveQScore = calculateAverage();

		return aveQScore >= qAverage;
	}
	catch(exception& e) {
		m->errorOut(e, "QualityScores", "cullQualAverage");
		exit(1);
	}
}
// Truncates `sequence` (and this object's quality scores) at the first position
// where the running average of the scores seen so far drops below qThreshold.
//
//   sequence   - read to trim; its unaligned string is replaced by the kept prefix
//   qThreshold - minimum acceptable running average
//
// Always returns 1 (true). The whole read is kept when the running average
// never drops below the threshold.
bool QualityScores::stripQualRollingAverage(Sequence& sequence, double qThreshold){
	try {
		string rawSequence = sequence.getUnaligned();
		int seqLength = sequence.getNumBases();

		if(seqName != sequence.getName()){
			m->mothurOut("sequence name mismatch btwn fasta: " + sequence.getName() + " and qual file: " + seqName);
			m->mothurOutEndLine();
		}

		int end = -1;   // -1 = no position failed yet
		double rollingSum = 0.0000;

		for(int i=0;i<seqLength;i++){
			rollingSum += (double)qScores[i];

			// average over the i+1 scores consumed so far
			if(rollingSum / (double)(i+1) < qThreshold){
				end = i;
				break;
			}
		}

		if(end == -1){	end = seqLength;	}

		sequence.setUnaligned(rawSequence.substr(0,end));
		trimQScores(-1, end);

		return 1;
	}
	catch(exception& e) {
		// report the real method name (was "flipQScores")
		m->errorOut(e, "QualityScores", "stripQualRollingAverage");
		exit(1);
	}
}
void VectorSiteContainer::addSequence(const Sequence& sequence, bool checkNames) throw (Exception) { // If the container has no sequence, we set the size to the size of this sequence: if (getNumberOfSequences() == 0) realloc(sequence.size()); // New sequence's alphabet and site container's alphabet matching verification if (sequence.getAlphabet()->getAlphabetType() != getAlphabet()->getAlphabetType()) throw AlphabetMismatchException("VectorSiteContainer::addSequence", getAlphabet(), sequence.getAlphabet()); if (sequence.size() != sites_.size()) throw SequenceException("VectorSiteContainer::addSequence. Sequence has not the appropriate length: " + TextTools::toString(sequence.size()) + ", should be " + TextTools::toString(sites_.size()) + ".", &sequence); if (checkNames) { for (size_t i = 0; i < names_.size(); i++) { if (sequence.getName() == names_[i]) throw SequenceException("VectorSiteContainer::addSequence. Name already exists in container.", &sequence); } } // Append name: names_.push_back(sequence.getName()); // Append elements at each site: for (size_t i = 0; i < sites_.size(); i++) { sites_[i]->addElement(sequence.getValue(i)); } // Append comments: comments_.push_back(new Comments(sequence.getComments())); // Sequences pointers: sequences_.push_back(0); }
void VectorSiteContainer::addSequence( const Sequence& sequence, size_t pos, bool checkNames) throw (Exception) { if (pos >= getNumberOfSequences()) throw IndexOutOfBoundsException("VectorSiteContainer::addSequence.", pos, 0, getNumberOfSequences() - 1); if (sequence.size() != sites_.size()) throw SequenceNotAlignedException("VectorSiteContainer::setSequence", &sequence); // New sequence's alphabet and site container's alphabet matching verification if (sequence.getAlphabet()->getAlphabetType() != getAlphabet()->getAlphabetType()) { throw AlphabetMismatchException("VectorSiteContainer::addSite", getAlphabet(), sequence.getAlphabet()); } if (checkNames) { for (size_t i = 0; i < names_.size(); i++) { if (sequence.getName() == names_[i]) throw SequenceException("VectorSiteContainer::addSequence. Name already exists in container.", &sequence); } } for (size_t i = 0; i < sites_.size(); i++) { // For each site: sites_[i]->addElement(pos, sequence.getValue(i)); } // Actualize names and comments: names_.insert(names_.begin() + pos, sequence.getName()); comments_.insert(comments_.begin() + pos, new Comments(sequence.getComments())); sequences_.insert(sequences_.begin() + pos, 0); }
// Adds a template sequence to the database.
//
// A sequence that fails isAligned() clears the templateAligned flag and is
// reported, but is still stored. The first sequence seen while
// templateSeqsLength is 0 sets that length from its aligned string.
void DistanceDB::addSequence(Sequence seq) {
	try {
		bool seqIsAligned = isAligned(seq.getAligned());

		if (seqIsAligned == false) {
			templateAligned = false;
			m->mothurOut(seq.getName() + " is not aligned. Sequences must be aligned to use the distance method.");
			m->mothurOutEndLine();
		}

		// remember the template length the first time through
		if (templateSeqsLength == 0) {
			templateSeqsLength = seq.getAligned().length();
		}

		data.push_back(seq);
	}
	catch(exception& e) {
		m->errorOut(e, "DistanceDB", "addSequence");
		exit(1);
	}
}
//********************************************************************************************************************** void driverClassifier(classifyData* params){ try { ifstream inFASTA; params->util.openInputFile(params->filename, inFASTA); string taxonomy; inFASTA.seekg(params->start); bool done = false; string taxBuffer = ""; string taxTBuffer = ""; string accnosBuffer = ""; while (!done) { if (params->m->getControl_pressed()) { break; } Sequence* candidateSeq = new Sequence(inFASTA); params->util.gobble(inFASTA); if (candidateSeq->getName() != "") { string simpleTax = ""; bool flipped = false; taxonomy = params->classify->getTaxonomy(candidateSeq, simpleTax, flipped); if (params->m->getControl_pressed()) { delete candidateSeq; break; } if (taxonomy == "unknown;") { params->m->mothurOut("[WARNING]: " + candidateSeq->getName() + " could not be classified. You can use the remove.lineage command with taxon=unknown; to remove such sequences.\n"); } //output confidence scores or not if (params->probs) { taxBuffer += candidateSeq->getName() + '\t' + taxonomy + '\n'; } else { taxBuffer += candidateSeq->getName() + '\t' + simpleTax + '\n'; } if (flipped) { accnosBuffer += candidateSeq->getName() + '\n'; } taxTBuffer = candidateSeq->getName() + '\t' + simpleTax + '\n'; params->count++; } delete candidateSeq; //report progress if((params->count) % 100 == 0){ params->m->mothurOutJustToScreen(toString(params->count) +"\n"); params->taxTWriter->write(taxTBuffer); taxTBuffer = ""; params->taxWriter->write(taxBuffer); taxBuffer = ""; if (accnosBuffer != "") { params->accnosWriter->write(accnosBuffer); accnosBuffer = ""; } } #if defined NON_WINDOWS unsigned long long pos = inFASTA.tellg(); if ((pos == -1) || (pos >= params->end)) { break; } #else if (params->count == params->end) { break; } #endif } //report progress if((params->count) % 100 != 0){ params->m->mothurOutJustToScreen(toString(params->count)+"\n"); params->taxTWriter->write(taxTBuffer); taxTBuffer = ""; 
params->taxWriter->write(taxBuffer); taxBuffer = ""; if (accnosBuffer != "") { params->accnosWriter->write(accnosBuffer); accnosBuffer = ""; } } inFASTA.close(); } catch(exception& e) { params->m->errorOut(e, "ClassifySeqsCommand", "driver"); exit(1); } }
//*************************************************************************************************************** //gets closest matches to each end, since chimeras will most likely have different parents on each end vector<Sequence> DeCalculator::findClosest(Sequence querySeq, vector<Sequence*>& thisTemplate, vector<Sequence*>& thisFilteredTemplate, int numWanted, int minSim) { try { //indexes.clear(); vector<Sequence> seqsMatches; vector<SeqDist> distsLeft; vector<SeqDist> distsRight; Dist* distcalculator = new eachGapDist(); string queryUnAligned = querySeq.getUnaligned(); int numBases = int(queryUnAligned.length() * 0.33); string leftQuery = ""; //first 1/3 of the sequence string rightQuery = ""; //last 1/3 of the sequence string queryAligned = querySeq.getAligned(); //left side bool foundFirstBase = false; int baseCount = 0; int leftSpot = 0; int firstBaseSpot = 0; for (int i = 0; i < queryAligned.length(); i++) { //if you are a base if (isalpha(queryAligned[i])) { baseCount++; if (!foundFirstBase) { foundFirstBase = true; firstBaseSpot = i; } } //eliminate opening .'s if (foundFirstBase) { leftQuery += queryAligned[i]; } //if you have 1/3 if (baseCount >= numBases) { leftSpot = i; break; } //first 1/3 } //right side - count through another 1/3, so you are at last third baseCount = 0; int rightSpot = 0; for (int i = leftSpot; i < queryAligned.length(); i++) { //if you are a base if (isalpha(queryAligned[i])) { baseCount++; } //if you have 1/3 if (baseCount > numBases + 1) { rightSpot = i; break; } //last 1/3 } //trim end //find last position in query that is a non gap character int lastBaseSpot = queryAligned.length()-1; for (int j = queryAligned.length()-1; j >= 0; j--) { if (isalpha(queryAligned[j])) { lastBaseSpot = j; break; } } rightQuery = queryAligned.substr(rightSpot, (lastBaseSpot-rightSpot+1)); //sequence from pos spot to end Sequence queryLeft(querySeq.getName(), leftQuery); Sequence queryRight(querySeq.getName(), rightQuery); //cout << 
querySeq->getName() << '\t' << leftSpot << '\t' << rightSpot << '\t' << firstBaseSpot << '\t' << lastBaseSpot << endl; //cout << queryUnAligned.length() << '\t' << queryLeft.getUnaligned().length() << '\t' << queryRight.getUnaligned().length() << endl; for(int j = 0; j < thisFilteredTemplate.size(); j++){ string dbAligned = thisFilteredTemplate[j]->getAligned(); string leftDB = dbAligned.substr(firstBaseSpot, (leftSpot-firstBaseSpot+1)); //first 1/3 of the sequence string rightDB = dbAligned.substr(rightSpot, (lastBaseSpot-rightSpot+1)); //last 1/3 of the sequence Sequence dbLeft(thisFilteredTemplate[j]->getName(), leftDB); Sequence dbRight(thisFilteredTemplate[j]->getName(), rightDB); distcalculator->calcDist(queryLeft, dbLeft); float distLeft = distcalculator->getDist(); distcalculator->calcDist(queryRight, dbRight); float distRight = distcalculator->getDist(); SeqDist subjectLeft; subjectLeft.seq = NULL; subjectLeft.dist = distLeft; subjectLeft.index = j; distsLeft.push_back(subjectLeft); SeqDist subjectRight; subjectRight.seq = NULL; subjectRight.dist = distRight; subjectRight.index = j; distsRight.push_back(subjectRight); } delete distcalculator; //sort by smallest distance sort(distsRight.begin(), distsRight.end(), compareSeqDist); sort(distsLeft.begin(), distsLeft.end(), compareSeqDist); //merge results map<string, string> seen; map<string, string>::iterator it; vector<SeqDist> dists; float lastRight = distsRight[0].dist; float lastLeft = distsLeft[0].dist; float maxDist = 1.0 - (minSim / 100.0); for (int i = 0; i < numWanted+1; i++) { if (m->control_pressed) { return seqsMatches; } //add left if you havent already it = seen.find(thisTemplate[distsLeft[i].index]->getName()); if (it == seen.end() && distsLeft[i].dist <= maxDist) { dists.push_back(distsLeft[i]); seen[thisTemplate[distsLeft[i].index]->getName()] = thisTemplate[distsLeft[i].index]->getName(); lastLeft = distsLeft[i].dist; // cout << "loop-left\t" << db[distsLeft[i].index]->getName() << '\t' << 
distsLeft[i].dist << endl; } //add right if you havent already it = seen.find(thisTemplate[distsRight[i].index]->getName()); if (it == seen.end() && distsRight[i].dist <= maxDist) { dists.push_back(distsRight[i]); seen[thisTemplate[distsRight[i].index]->getName()] = thisTemplate[distsRight[i].index]->getName(); lastRight = distsRight[i].dist; // cout << "loop-right\t" << db[distsRight[i].index]->getName() << '\t' << distsRight[i].dist << endl; } if (i == numWanted) { break; } } //are we still above the minimum similarity cutoff if ((lastLeft >= minSim) || (lastRight >= minSim)) { //add in ties from left int i = numWanted; while (i < distsLeft.size()) { if (distsLeft[i].dist == lastLeft) { dists.push_back(distsLeft[i]); } else { break; } i++; } //add in ties from right i = numWanted; while (i < distsRight.size()) { if (distsRight[i].dist == lastRight) { dists.push_back(distsRight[i]); } else { break; } i++; } } //cout << numWanted << endl; for (int i = 0; i < dists.size(); i++) { // cout << db[dists[i].index]->getName() << '\t' << dists[i].dist << endl; if ((thisTemplate[dists[i].index]->getName() != querySeq.getName()) && (((1.0-dists[i].dist)*100) >= minSim)) { Sequence temp(thisTemplate[dists[i].index]->getName(), thisTemplate[dists[i].index]->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother. //cout << querySeq->getName() << '\t' << thisTemplate[dists[i].index]->getName() << '\t' << dists[i].dist << endl; seqsMatches.push_back(temp); } } return seqsMatches; } catch(exception& e) { m->errorOut(e, "DeCalculator", "findClosest"); exit(1); } }
int AlignCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& alignFile, MPI_File& reportFile, MPI_File& accnosFile, vector<unsigned long long>& MPIPos){ try { string outputString = ""; MPI_Status statusReport; MPI_Status statusAlign; MPI_Status statusAccnos; MPI_Status status; int pid; MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are NastReport report; if (pid == 0) { outputString = report.getHeaders(); int length = outputString.length(); char* buf = new char[length]; memcpy(buf, outputString.c_str(), length); MPI_File_write_shared(reportFile, buf, length, MPI_CHAR, &statusReport); delete buf; } Alignment* alignment; int longestBase = templateDB->getLongestBase(); if(align == "gotoh") { alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, longestBase); } else if(align == "needleman") { alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase); } else if(align == "blast") { alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch); } else if(align == "noalign") { alignment = new NoAlign(); } else { m->mothurOut(align + " is not a valid alignment option. 
I will run the command using needleman."); m->mothurOutEndLine(); alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase); } for(int i=0;i<num;i++){ if (m->control_pressed) { delete alignment; return 0; } //read next sequence int length = MPIPos[start+i+1] - MPIPos[start+i]; char* buf4 = new char[length]; //memcpy(buf4, outputString.c_str(), length); MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status); string tempBuf = buf4; delete buf4; if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); } istringstream iss (tempBuf,istringstream::in); Sequence* candidateSeq = new Sequence(iss); report.setCandidate(candidateSeq); int origNumBases = candidateSeq->getNumBases(); string originalUnaligned = candidateSeq->getUnaligned(); int numBasesNeeded = origNumBases * threshold; if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file if (candidateSeq->getUnaligned().length() > alignment->getnRows()) { alignment->resize(candidateSeq->getUnaligned().length()+1); } Sequence temp = templateDB->findClosestSequence(candidateSeq); Sequence* templateSeq = &temp; float searchScore = templateDB->getSearchScore(); Nast* nast = new Nast(alignment, candidateSeq, templateSeq); Sequence* copy; Nast* nast2; bool needToDeleteCopy = false; //this is needed in case you have you enter the ifs below //since nast does not make a copy of hte sequence passed, and it is used by the reporter below //you can't delete the copy sequence til after you report, but you may choose not to create it in the first place //so this bool tells you if you need to delete it //if there is a possibility that this sequence should be reversed if (candidateSeq->getNumBases() < numBasesNeeded) { string wasBetter = ""; //if the user wants you to try the reverse if (flip) { //get reverse compliment copy = new Sequence(candidateSeq->getName(), originalUnaligned); copy->reverseComplement(); //rerun alignment Sequence temp2 = 
templateDB->findClosestSequence(copy); Sequence* templateSeq2 = &temp2; searchScore = templateDB->getSearchScore(); nast2 = new Nast(alignment, copy, templateSeq2); //check if any better if (copy->getNumBases() > candidateSeq->getNumBases()) { candidateSeq->setAligned(copy->getAligned()); //use reverse compliments alignment since its better templateSeq = templateSeq2; delete nast; nast = nast2; needToDeleteCopy = true; wasBetter = "\treverse complement produced a better alignment, so mothur used the reverse complement."; }else{ wasBetter = "\treverse complement did NOT produce a better alignment, please check sequence."; delete nast2; delete copy; } } //create accnos file with names outputString = candidateSeq->getName() + wasBetter + "\n"; //send results to parent int length = outputString.length(); char* buf = new char[length]; memcpy(buf, outputString.c_str(), length); MPI_File_write_shared(accnosFile, buf, length, MPI_CHAR, &statusAccnos); delete buf; MPIWroteAccnos = true; } report.setTemplate(templateSeq); report.setSearchParameters(search, searchScore); report.setAlignmentParameters(align, alignment); report.setNastParameters(*nast); outputString = ">" + candidateSeq->getName() + "\n" + candidateSeq->getAligned() + "\n"; //send results to parent int length = outputString.length(); char* buf2 = new char[length]; memcpy(buf2, outputString.c_str(), length); MPI_File_write_shared(alignFile, buf2, length, MPI_CHAR, &statusAlign); delete buf2; outputString = report.getReport(); //send results to parent length = outputString.length(); char* buf3 = new char[length]; memcpy(buf3, outputString.c_str(), length); MPI_File_write_shared(reportFile, buf3, length, MPI_CHAR, &statusReport); delete buf3; delete nast; if (needToDeleteCopy) { delete copy; } } delete candidateSeq; //report progress if((i+1) % 100 == 0){ cout << (toString(i+1)) << endl; } } //report progress if((num) % 100 != 0){ cout << (toString(num)) << endl; } return 1; } catch(exception& e) { m->errorOut(e, 
"AlignCommand", "driverMPI"); exit(1); } }
//*************************************************************************************************************** vector<sim> ChimeraCheckRDP::findIS() { try { vector< map<int, int> > queryKmerInfo; //vector of maps - each entry in the vector is a map of the kmers up to that spot in the unaligned seq //example: seqKmerInfo[50] = map containing the kmers found in the first 50 + kmersize characters of ecoli. //i chose to store the kmers numbers in a map so you wouldn't have to check for dupilcate entries and could easily find the //kmers 2 seqs had in common. There may be a better way to do this thats why I am leaving so many comments... vector< map<int, int> > subjectKmerInfo; vector<sim> isValues; string queryName = querySeq->getName(); string seq = querySeq->getUnaligned(); queryKmerInfo = kmer->getKmerCounts(seq); subjectKmerInfo = kmer->getKmerCounts(closest.getUnaligned()); //find total kmers you have in common with closest[query] by looking at the last entry in the vector of maps for each int nTotal = calcKmers(queryKmerInfo[(queryKmerInfo.size()-1)], subjectKmerInfo[(subjectKmerInfo.size()-1)]); //you don't want the starting point to be virtually at hte end so move it in 10% int start = seq.length() / 10; //for each window for (int f = start; f < (seq.length() - start); f+=increment) { if (m->control_pressed) { return isValues; } if ((f - kmerSize) < 0) { m->mothurOut("Your sequence is too short for your kmerSize."); m->mothurOutEndLine(); exit(1); } sim temp; string fragLeft = seq.substr(0, f); //left side of breakpoint string fragRight = seq.substr(f); //right side of breakpoint //make a sequence of the left side and right side Sequence* left = new Sequence(queryName, fragLeft); Sequence* right = new Sequence(queryName, fragRight); //find seqs closest to each fragment Sequence closestLeft = templateDB->findClosestSequence(left); Sequence closestRight = templateDB->findClosestSequence(right); //get kmerinfo for the closest left vector< map<int, int> > 
closeLeftKmerInfo = kmer->getKmerCounts(closestLeft.getUnaligned()); //get kmerinfo for the closest right vector< map<int, int> > closeRightKmerInfo = kmer->getKmerCounts(closestRight.getUnaligned()); //right side is tricky - since the counts grow on eachother to find the correct counts of only the right side you must subtract the counts of the left side //iterate through left sides map to subtract the number of times you saw things before you got the the right side map<int, int> rightside = queryKmerInfo[queryKmerInfo.size()-1]; for (map<int, int>::iterator itleft = queryKmerInfo[f-kmerSize].begin(); itleft != queryKmerInfo[f-kmerSize].end(); itleft++) { int howManyTotal = queryKmerInfo[queryKmerInfo.size()-1][itleft->first]; //times that kmer was seen in total //itleft->second is times it was seen in left side, so howmanytotal - leftside should give you right side int howmanyright = howManyTotal - itleft->second; //if any were seen just on the left erase if (howmanyright == 0) { rightside.erase(itleft->first); } } map<int, int> closerightside = closeRightKmerInfo[closeRightKmerInfo.size()-1]; for (map<int, int>::iterator itright = closeRightKmerInfo[f-kmerSize].begin(); itright != closeRightKmerInfo[f-kmerSize].end(); itright++) { int howManyTotal = closeRightKmerInfo[(closeRightKmerInfo.size()-1)][itright->first]; //times that kmer was seen in total //itleft->second is times it was seen in left side, so howmanytotal - leftside should give you right side int howmanyright = howManyTotal - itright->second; //if any were seen just on the left erase if (howmanyright == 0) { closerightside.erase(itright->first); } } int nLeft = calcKmers(closeLeftKmerInfo[f-kmerSize], queryKmerInfo[f-kmerSize]); int nRight = calcKmers(closerightside, rightside); int is = nLeft + nRight - nTotal; //save IS, leftparent, rightparent, breakpoint temp.leftParent = closestLeft.getName(); temp.rightParent = closestRight.getName(); temp.score = is; temp.midpoint = f; isValues.push_back(temp); 
delete left; delete right; } return isValues; } catch(exception& e) { m->errorOut(e, "ChimeraCheckRDP", "findIS"); exit(1); } }
//********************************************************************************************************************** void alignDriver(alignStruct* params) { try { NastReport report; ifstream inFASTA; params->util.openInputFile(params->inputFilename, inFASTA); inFASTA.seekg(params->filePos.start); bool done = false; long long count = 0; long long numFlipped_0 = 0; long long numFlipped_1 = 0; //moved this into driver to avoid deep copies in windows paralellized version Alignment* alignment; int longestBase = params->templateDB->getLongestBase(); if (params->m->getDebug()) { params->m->mothurOut("[DEBUG]: template longest base = " + toString(longestBase) + " \n"); } if(params->alignMethod == "gotoh") { alignment = new GotohOverlap(params->gapOpen, params->gapExtend, params->match, params->misMatch, longestBase); } else if(params->alignMethod == "needleman") { alignment = new NeedlemanOverlap(params->gapOpen, params->match, params->misMatch, longestBase); } else if(params->alignMethod == "blast") { alignment = new BlastAlignment(params->gapOpen, params->gapExtend, params->match, params->misMatch); } else if(params->alignMethod == "noalign") { alignment = new NoAlign(); } else { params->m->mothurOut(params->alignMethod + " is not a valid alignment option. 
I will run the command using needleman."); params->m->mothurOutEndLine(); alignment = new NeedlemanOverlap(params->gapOpen, params->match, params->misMatch, longestBase); } while (!done) { if (params->m->getControl_pressed()) { break; } Sequence* candidateSeq = new Sequence(inFASTA); params->util.gobble(inFASTA); report.setCandidate(candidateSeq); int origNumBases = candidateSeq->getNumBases(); string originalUnaligned = candidateSeq->getUnaligned(); int numBasesNeeded = origNumBases * params->threshold; if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file if (candidateSeq->getUnaligned().length()+1 > alignment->getnRows()) { if (params->m->getDebug()) { params->m->mothurOut("[DEBUG]: " + candidateSeq->getName() + " " + toString(candidateSeq->getUnaligned().length()) + " " + toString(alignment->getnRows()) + " \n"); } alignment->resize(candidateSeq->getUnaligned().length()+2); } float searchScore; Sequence temp = params->templateDB->findClosestSequence(candidateSeq, searchScore); Sequence* templateSeq = new Sequence(temp.getName(), temp.getAligned()); Nast* nast = new Nast(alignment, candidateSeq, templateSeq); Sequence* copy; Nast* nast2; bool needToDeleteCopy = false; //this is needed in case you have you enter the ifs below //since nast does not make a copy of hte sequence passed, and it is used by the reporter below //you can't delete the copy sequence til after you report, but you may choose not to create it in the first place //so this bool tells you if you need to delete it //if there is a possibility that this sequence should be reversed if (candidateSeq->getNumBases() < numBasesNeeded) { numFlipped_1++; string wasBetter = ""; //if the user wants you to try the reverse if (params->flip) { //get reverse compliment copy = new Sequence(candidateSeq->getName(), originalUnaligned); copy->reverseComplement(); if (params->m->getDebug()) { params->m->mothurOut("[DEBUG]: flipping " + candidateSeq->getName() + " \n"); } 
//rerun alignment Sequence temp2 = params->templateDB->findClosestSequence(copy, searchScore); Sequence* templateSeq2 = new Sequence(temp2.getName(), temp2.getAligned()); if (params->m->getDebug()) { params->m->mothurOut("[DEBUG]: closest template " + temp2.getName() + " \n"); } nast2 = new Nast(alignment, copy, templateSeq2); if (params->m->getDebug()) { params->m->mothurOut("[DEBUG]: completed Nast2 " + candidateSeq->getName() + " flipped numBases = " + toString(copy->getNumBases()) + " old numbases = " + toString(candidateSeq->getNumBases()) +" \n"); } //check if any better if (copy->getNumBases() > candidateSeq->getNumBases()) { candidateSeq->setAligned(copy->getAligned()); //use reverse compliments alignment since its better delete templateSeq; templateSeq = templateSeq2; delete nast; nast = nast2; needToDeleteCopy = true; wasBetter = "\treverse complement produced a better alignment, so mothur used the reverse complement."; numFlipped_0++; }else{ wasBetter = "\treverse complement did NOT produce a better alignment so it was not used, please check sequence."; delete nast2; delete templateSeq2; delete copy; } if (params->m->getDebug()) { params->m->mothurOut("[DEBUG]: done.\n"); } } //create accnos file with names params->accnosWriter->write(candidateSeq->getName() + wasBetter + "\n"); } report.setTemplate(templateSeq); report.setSearchParameters(params->search, searchScore); report.setAlignmentParameters(params->alignMethod, alignment); report.setNastParameters(*nast); params->alignWriter->write('>' + candidateSeq->getName() + '\n' + candidateSeq->getAligned() + '\n'); params->reportWriter->write(report.getReport()); delete nast; delete templateSeq; if (needToDeleteCopy) { delete copy; } count++; } delete candidateSeq; #if defined NON_WINDOWS unsigned long long pos = inFASTA.tellg(); if ((pos == -1) || (pos >= params->filePos.end)) { break; } #else if (count == params->filePos.end) { break; } #endif //report progress if((count) % 1000 == 0){ 
params->m->mothurOutJustToScreen(toString(count) + "\n"); } } //report progress if((count) % 1000 != 0){ params->m->mothurOutJustToScreen(toString(count) + "\n"); } params->numSeqs += count; params->flippedResults[0] += numFlipped_0; params->flippedResults[1] += numFlipped_1; delete alignment; inFASTA.close(); } catch(exception& e) { params->m->errorOut(e, "AlignCommand", "driver"); exit(1); } }
//********************************************************************************************************************** int AlignCommand::driver(linePair* filePos, string alignFName, string reportFName, string accnosFName, string filename){ try { ofstream alignmentFile; m->openOutputFile(alignFName, alignmentFile); ofstream accnosFile; m->openOutputFile(accnosFName, accnosFile); NastReport report(reportFName); ifstream inFASTA; m->openInputFile(filename, inFASTA); inFASTA.seekg(filePos->start); bool done = false; int count = 0; //moved this into driver to avoid deep copies in windows paralellized version Alignment* alignment; int longestBase = templateDB->getLongestBase(); if (m->debug) { m->mothurOut("[DEBUG]: template longest base = " + toString(templateDB->getLongestBase()) + " \n"); } if(align == "gotoh") { alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, longestBase); } else if(align == "needleman") { alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase); } else if(align == "blast") { alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch); } else if(align == "noalign") { alignment = new NoAlign(); } else { m->mothurOut(align + " is not a valid alignment option. 
I will run the command using needleman."); m->mothurOutEndLine(); alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase); } while (!done) { if (m->control_pressed) { break; } Sequence* candidateSeq = new Sequence(inFASTA); m->gobble(inFASTA); report.setCandidate(candidateSeq); int origNumBases = candidateSeq->getNumBases(); string originalUnaligned = candidateSeq->getUnaligned(); int numBasesNeeded = origNumBases * threshold; if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file if (candidateSeq->getUnaligned().length()+1 > alignment->getnRows()) { if (m->debug) { m->mothurOut("[DEBUG]: " + candidateSeq->getName() + " " + toString(candidateSeq->getUnaligned().length()) + " " + toString(alignment->getnRows()) + " \n"); } alignment->resize(candidateSeq->getUnaligned().length()+2); } Sequence temp = templateDB->findClosestSequence(candidateSeq); Sequence* templateSeq = new Sequence(temp.getName(), temp.getAligned()); float searchScore = templateDB->getSearchScore(); Nast* nast = new Nast(alignment, candidateSeq, templateSeq); Sequence* copy; Nast* nast2; bool needToDeleteCopy = false; //this is needed in case you have you enter the ifs below //since nast does not make a copy of hte sequence passed, and it is used by the reporter below //you can't delete the copy sequence til after you report, but you may choose not to create it in the first place //so this bool tells you if you need to delete it //if there is a possibility that this sequence should be reversed if (candidateSeq->getNumBases() < numBasesNeeded) { string wasBetter = ""; //if the user wants you to try the reverse if (flip) { //get reverse compliment copy = new Sequence(candidateSeq->getName(), originalUnaligned); copy->reverseComplement(); if (m->debug) { m->mothurOut("[DEBUG]: flipping " + candidateSeq->getName() + " \n"); } //rerun alignment Sequence temp2 = templateDB->findClosestSequence(copy); Sequence* templateSeq2 = new 
Sequence(temp2.getName(), temp2.getAligned()); if (m->debug) { m->mothurOut("[DEBUG]: closest template " + temp2.getName() + " \n"); } searchScore = templateDB->getSearchScore(); nast2 = new Nast(alignment, copy, templateSeq2); if (m->debug) { m->mothurOut("[DEBUG]: completed Nast2 " + candidateSeq->getName() + " flipped numBases = " + toString(copy->getNumBases()) + " old numbases = " + toString(candidateSeq->getNumBases()) +" \n"); } //check if any better if (copy->getNumBases() > candidateSeq->getNumBases()) { candidateSeq->setAligned(copy->getAligned()); //use reverse compliments alignment since its better delete templateSeq; templateSeq = templateSeq2; delete nast; nast = nast2; needToDeleteCopy = true; wasBetter = "\treverse complement produced a better alignment, so mothur used the reverse complement."; }else{ wasBetter = "\treverse complement did NOT produce a better alignment so it was not used, please check sequence."; delete nast2; delete templateSeq2; delete copy; } if (m->debug) { m->mothurOut("[DEBUG]: done.\n"); } } //create accnos file with names accnosFile << candidateSeq->getName() << wasBetter << endl; } report.setTemplate(templateSeq); report.setSearchParameters(search, searchScore); report.setAlignmentParameters(align, alignment); report.setNastParameters(*nast); alignmentFile << '>' << candidateSeq->getName() << '\n' << candidateSeq->getAligned() << endl; report.print(); delete nast; delete templateSeq; if (needToDeleteCopy) { delete copy; } count++; } delete candidateSeq; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) unsigned long long pos = inFASTA.tellg(); if ((pos == -1) || (pos >= filePos->end)) { break; } #else if (inFASTA.eof()) { break; } #endif //report progress if((count) % 100 == 0){ m->mothurOutJustToScreen(toString(count) + "\n"); } } //report progress if((count) % 100 != 0){ m->mothurOutJustToScreen(toString(count) + "\n"); } delete alignment; alignmentFile.close(); 
inFASTA.close(); accnosFile.close(); return count; } catch(exception& e) { m->errorOut(e, "AlignCommand", "driver"); exit(1); } }
/**
 * Builds a BasicSequence from an arbitrary Sequence.
 *
 * The source object is passed to the BasicSymbolList base-class constructor, and the
 * name and comments members are initialised from s.getName() and s.getComments().
 *
 * @param s  The sequence to copy from.
 */
BasicSequence::BasicSequence(const Sequence& s) : BasicSymbolList(s), name_(s.getName()), comments_(s.getComments()) {}
//*************************************************************************************************************** vector<Sequence*> MothurChimera::readSeqs(string file) { try { vector<Sequence*> container; int count = 0; length = 0; unaligned = false; ReferenceDB* rdb = ReferenceDB::getInstance(); if (file == "saved") { m->mothurOutEndLine(); m->mothurOut("Using sequences from " + rdb->getSavedReference() + " that are saved in memory."); m->mothurOutEndLine(); for (int i = 0; i < rdb->referenceSeqs.size(); i++) { Sequence* temp = new Sequence(rdb->referenceSeqs[i].getName(), rdb->referenceSeqs[i].getAligned()); if (count == 0) { length = temp->getAligned().length(); count++; } //gets first seqs length else if (length != temp->getAligned().length()) { unaligned = true; } if (temp->getName() != "") { container.push_back(temp); } } templateFileName = rdb->getSavedReference(); }else { m->mothurOut("Reading sequences from " + file + "..."); cout.flush(); ifstream in; m->openInputFile(file, in); //read in seqs and store in vector while(!in.eof()){ if (m->control_pressed) { return container; } Sequence* current = new Sequence(in); m->gobble(in); if (count == 0) { length = current->getAligned().length(); count++; } //gets first seqs length else if (length != current->getAligned().length()) { unaligned = true; } if (current->getName() != "") { container.push_back(current); if (rdb->save) { rdb->referenceSeqs.push_back(*current); } } } in.close(); m->mothurOut("Done."); m->mothurOutEndLine(); filterString = (string(container[0]->getAligned().length(), '1')); } return container; } catch(exception& e) { m->errorOut(e, "MothurChimera", "readSeqs"); exit(1); } }
//********************************************************************************************************************** string ChopSeqsCommand::getChopped(Sequence seq, string& qualValues) { try { string temp = seq.getAligned(); string tempUnaligned = seq.getUnaligned(); if (countGaps) { //if needed trim sequence if (keep == "front") {//you want to keep the beginning int tempLength = temp.length(); if (tempLength > numbases) { //you have enough bases to remove some int stopSpot = 0; int numBasesCounted = 0; for (int i = 0; i < temp.length(); i++) { //eliminate N's if (!keepN) { if (toupper(temp[i]) == 'N') { temp[i] = '.'; } } numBasesCounted++; if (numBasesCounted >= numbases) { stopSpot = i; break; } } if (stopSpot == 0) { temp = ""; } else { temp = temp.substr(0, stopSpot+1); } }else { if (!Short) { temp = ""; } //sequence too short } }else { //you are keeping the back int tempLength = temp.length(); if (tempLength > numbases) { //you have enough bases to remove some int stopSpot = 0; int numBasesCounted = 0; for (int i = (temp.length()-1); i >= 0; i--) { //eliminate N's if (!keepN) { if (toupper(temp[i]) == 'N') { temp[i] = '.'; } } numBasesCounted++; if (numBasesCounted >= numbases) { stopSpot = i; break; } } if (stopSpot == 0) { temp = ""; } else { temp = temp.substr(stopSpot+1); } }else { if (!Short) { temp = ""; } //sequence too short } } }else{ //if needed trim sequence if (keep == "front") {//you want to keep the beginning int tempLength = tempUnaligned.length(); if (tempLength > numbases) { //you have enough bases to remove some int stopSpot = 0; int numBasesCounted = 0; for (int i = 0; i < temp.length(); i++) { //eliminate N's if (!keepN) { if (toupper(temp[i]) == 'N') { temp[i] = '.'; tempLength--; if (tempLength < numbases) { stopSpot = 0; break; } } } if(isalpha(temp[i])) { numBasesCounted++; } if (numBasesCounted >= numbases) { stopSpot = i; break; } } if (stopSpot == 0) { temp = ""; } else { temp = temp.substr(0, stopSpot+1); } qualValues = 
seq.getName() +'\t' + toString(0) + '\t' + toString(stopSpot+1) + '\n'; }else { if (!Short) { temp = ""; qualValues = seq.getName() +'\t' + toString(0) + '\t' + toString(0) + '\n'; } //sequence too short else { qualValues = seq.getName() +'\t' + toString(0) + '\t' + toString(tempLength) + '\n'; } } }else { //you are keeping the back int tempLength = tempUnaligned.length(); if (tempLength > numbases) { //you have enough bases to remove some int stopSpot = 0; int numBasesCounted = 0; for (int i = (temp.length()-1); i >= 0; i--) { if (!keepN) { //eliminate N's if (toupper(temp[i]) == 'N') { temp[i] = '.'; tempLength--; if (tempLength < numbases) { stopSpot = 0; break; } } } if(isalpha(temp[i])) { numBasesCounted++; } if (numBasesCounted >= numbases) { stopSpot = i; break; } } if (stopSpot == 0) { temp = ""; } else { temp = temp.substr(stopSpot); } qualValues = seq.getName() +'\t' + toString(stopSpot) + '\t' + toString(temp.length()-1) + '\n'; }else { if (!Short) { temp = ""; qualValues = seq.getName() +'\t' + toString(0) + '\t' + toString(0) + '\n'; } //sequence too short else { qualValues = seq.getName() +'\t' + toString(0) + '\t' + toString(tempLength) + '\n'; } } } } return temp; } catch(exception& e) { m->errorOut(e, "ChopSeqsCommand", "getChopped"); exit(1); } }