bool Seq::HasGap() const { for (CharVect::const_iterator p = begin(); p != end(); ++p) { char c = *p; if (IsGapChar(c)) return true; } return false; }
// Perecent identity of a pair of sequences. // Positions with one or both gapped are ignored. double MSA::GetPctIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const { const unsigned uColCount = GetColCount(); unsigned uPosCount = 0; unsigned uSameCount = 0; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const char c1 = GetChar(uSeqIndex1, uColIndex); const char c2 = GetChar(uSeqIndex2, uColIndex); if (IsGapChar(c1) || IsGapChar(c2)) continue; if (c1 == c2) ++uSameCount; ++uPosCount; } if (0 == uPosCount) return 0; return (double) uSameCount / (double) uPosCount; }
unsigned Seq::GetUngappedLength() const { unsigned uUngappedLength = 0; for (CharVect::const_iterator p = begin(); p != end(); ++p) { char c = *p; if (!IsGapChar(c)) ++uUngappedLength; } return uUngappedLength; }
void Seq::StripGapsAndWhitespace() { for (CharVect::iterator p = begin(); p != end(); ) { char c = *p; if (isspace(c) || IsGapChar(c)) erase(p); else ++p; } }
void Seq::StripGaps() { for (CharVect::iterator p = begin(); p != end(); ) { char c = *p; if (IsGapChar(c)) erase(p); else ++p; } }
bool Seq::EqIgnoreCase(const Seq &s) const { const unsigned n = Length(); if (n != s.Length()) return false; for (unsigned i = 0; i < n; ++i) { const char c1 = at(i); const char c2 = s.at(i); if (IsGapChar(c1)) { if (!IsGapChar(c2)) return false; } else { if (toupper(c1) != toupper(c2)) return false; } } return true; }
static void SeqFromMSACols(const MSA &msa, unsigned uSeqIndex, unsigned uColFrom, unsigned uColTo, Seq &s) { s.Clear(); s.SetName(msa.GetSeqName(uSeqIndex)); s.SetId(msa.GetSeqId(uSeqIndex)); for (unsigned uColIndex = uColFrom; uColIndex <= uColTo; ++uColIndex) { char c = msa.GetChar(uSeqIndex, uColIndex); if (!IsGapChar(c)) s.AppendChar(c); } }
void Seq::ExtractUngapped(MSA &msa) const { msa.Clear(); unsigned uColCount = Length(); msa.SetSize(1, 1); unsigned uUngappedPos = 0; for (unsigned n = 0; n < uColCount; ++n) { char c = at(n); if (!IsGapChar(c)) msa.SetChar(0, uUngappedPos++, c); } msa.SetSeqName(0, m_ptrName); }
ALPHA SeqVect::GuessAlpha() const { // If at least MIN_NUCLEO_PCT of the first CHAR_COUNT non-gap // letters belong to the nucleotide alphabet, guess nucleo. // Otherwise amino. const unsigned CHAR_COUNT = 100; const unsigned MIN_NUCLEO_PCT = 95; const unsigned uSeqCount = GetSeqCount(); if (0 == uSeqCount) return ALPHA_Amino; unsigned uSeqIndex = 0; unsigned uPos = 0; unsigned uSeqLength = GetSeqLength(0); unsigned uDNACount = 0; unsigned uRNACount = 0; unsigned uTotal = 0; const Seq *ptrSeq = &GetSeq(0); for (;;) { while (uPos >= uSeqLength) { ++uSeqIndex; if (uSeqIndex >= uSeqCount) break; ptrSeq = &GetSeq(uSeqIndex); uSeqLength = ptrSeq->Length(); uPos = 0; } if (uSeqIndex >= uSeqCount) break; char c = ptrSeq->at(uPos++); if (IsGapChar(c)) continue; if (IsDNA(c)) ++uDNACount; if (IsRNA(c)) ++uRNACount; ++uTotal; if (uTotal >= CHAR_COUNT) break; } if (uTotal != 0 && ((uDNACount*100)/uTotal) >= MIN_NUCLEO_PCT) return ALPHA_DNA; if (uTotal != 0 && ((uRNACount*100)/uTotal) >= MIN_NUCLEO_PCT) return ALPHA_RNA; return ALPHA_Amino; }
// Return true on end-of-file bool Seq::FromFASTAFile(TextFile &File) { Clear(); char szLine[MAX_FASTA_LINE]; bool bEof = File.GetLine(szLine, sizeof(szLine)); if (bEof) return true; if ('>' != szLine[0]) Quit("Expecting '>' in FASTA file %s line %u", File.GetFileName(), File.GetLineNr()); size_t n = strlen(szLine); if (1 == n) Quit("Missing annotation following '>' in FASTA file %s line %u", File.GetFileName(), File.GetLineNr()); m_ptrName = new char[n]; strcpy(m_ptrName, szLine + 1); TEXTFILEPOS Pos = File.GetPos(); for (;;) { bEof = File.GetLine(szLine, sizeof(szLine)); if (bEof) { if (0 == size()) { Quit("Empty sequence in FASTA file %s line %u", File.GetFileName(), File.GetLineNr()); return true; } return false; } if ('>' == szLine[0]) { if (0 == size()) Quit("Empty sequence in FASTA file %s line %u", File.GetFileName(), File.GetLineNr()); // Rewind to beginning of this line, it's the start of the // next sequence. File.SetPos(Pos); return false; } const char *ptrChar = szLine; while (char c = *ptrChar++) { if (isspace(c)) continue; if (IsGapChar(c)) continue; if (!IsResidueChar(c)) { if (isprint(c)) { char w = GetWildcardChar(); Warning("Invalid residue '%c' in FASTA file %s line %d, replaced by '%c'", c, File.GetFileName(), File.GetLineNr(), w); c = w; } else Quit("Invalid byte hex %02x in FASTA file %s line %d", (unsigned char) c, File.GetFileName(), File.GetLineNr()); } c = toupper(c); push_back(c); } Pos = File.GetPos(); } }
bool Seq::EqIgnoreCaseAndGaps(const Seq &s) const { const unsigned uThisLength = Length(); const unsigned uOtherLength = s.Length(); unsigned uThisPos = 0; unsigned uOtherPos = 0; int cThis; int cOther; for (;;) { if (uThisPos == uThisLength && uOtherPos == uOtherLength) break; // Set cThis to next non-gap character in this string // or -1 if end-of-string. for (;;) { if (uThisPos == uThisLength) { cThis = -1; break; } else { cThis = at(uThisPos); ++uThisPos; if (!IsGapChar(cThis)) { cThis = toupper(cThis); break; } } } // Set cOther to next non-gap character in s // or -1 if end-of-string. for (;;) { if (uOtherPos == uOtherLength) { cOther = -1; break; } else { cOther = s.at(uOtherPos); ++uOtherPos; if (!IsGapChar(cOther)) { cOther = toupper(cOther); break; } } } // Compare characters are corresponding ungapped position if (cThis != cOther) return false; } return true; }