double MSA::GetCons(unsigned uColIndex) const { unsigned Counts[MAX_ALPHA]; for (unsigned uLetter = 0; uLetter < g_AlphaSize.get(); ++uLetter) Counts[uLetter] = 0; unsigned uMaxCount = 0; for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) { if (IsGap(uSeqIndex, uColIndex)) continue; char c = GetChar(uSeqIndex, uColIndex); c = toupper(c); if ('X' == c || 'B' == c || 'Z' == c) continue; unsigned uLetter = GetLetter(uSeqIndex, uColIndex); unsigned uCount = Counts[uLetter] + 1; if (uCount > uMaxCount) uMaxCount = uCount; Counts[uLetter] = uCount; } // Cons is undefined for all-gap column if (0 == uMaxCount) { // assert(false); return 1; } double dCons = (double) uMaxCount / (double) GetSeqCount(); assert(dCons > 0 && dCons <= 1); return dCons; }
void MSA::SetClustalWWeights(const Tree &tree) { const unsigned uSeqCount = GetSeqCount(); const unsigned uLeafCount = tree.GetLeafCount(); WEIGHT *Weights = new WEIGHT[uSeqCount]; CalcClustalWWeights(tree, Weights); for (unsigned n = 0; n < uLeafCount; ++n) { const WEIGHT w = Weights[n]; const unsigned uLeafNodeIndex = tree.LeafIndexToNodeIndex(n); const unsigned uId = tree.GetLeafId(uLeafNodeIndex); const unsigned uSeqIndex = GetSeqIndex(uId); #if DEBUG if (GetSeqName(uSeqIndex) != tree.GetLeafName(uLeafNodeIndex)) Quit("MSA::SetClustalWWeights: names don't match"); #endif SetSeqWeight(uSeqIndex, w); } NormalizeWeights((WEIGHT) 1.0); delete[] Weights; }
// Writes the alignment to File in FASTA format: for each sequence a
// ">name" header line followed by its characters, at most FASTA_BLOCK
// per line.
// An empty alignment (zero columns) violates the asserted precondition. In
// builds where assert is compiled out, the old "(GetColCount() - 1)"
// computation wrapped around for zero columns and led to out-of-range
// GetChar calls; now only the header lines are written in that case.
void MSA::ToFASTAFile(TextFile &File) const
	{
	const unsigned uColCount = GetColCount();
	assert(uColCount > 0);

	const unsigned uSeqCount = GetSeqCount();
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		File.PutString(">");
		File.PutString(GetSeqName(uSeqIndex));
		File.PutString("\n");

	// Emit the row in chunks of up to FASTA_BLOCK characters.
		unsigned uColIndex = 0;
		while (uColIndex < uColCount)
			{
			unsigned uLetters = uColCount - uColIndex;
			if (uLetters > FASTA_BLOCK)
				uLetters = FASTA_BLOCK;
			for (unsigned i = 0; i < uLetters; ++i)
				{
				File.PutChar(GetChar(uSeqIndex, uColIndex));
				++uColIndex;
				}
			File.PutChar('\n');
			}
		}
	}
void MSA::SetHenikoffWeights() const { const unsigned uColCount = GetColCount(); const unsigned uSeqCount = GetSeqCount(); if (0 == uSeqCount) return; else if (1 == uSeqCount) { m_Weights[0] = (WEIGHT) 1.0; return; } else if (2 == uSeqCount) { m_Weights[0] = (WEIGHT) 0.5; m_Weights[1] = (WEIGHT) 0.5; return; } for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) m_Weights[uSeqIndex] = 0.0; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) CalcHenikoffWeightsCol(uColIndex); // Set all-gap seqs weight to 0 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGapSeq(uSeqIndex)) m_Weights[uSeqIndex] = 0.0; Normalize(m_Weights, uSeqCount); }
void MSA::CalcHenikoffWeightsCol(unsigned uColIndex) const { const unsigned uSeqCount = GetSeqCount(); // Compute letter counts in this column unsigned uLetterCount[MAX_ALPHA]; memset(uLetterCount, 0, sizeof(uLetterCount)); unsigned uDifferentLetterCount = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex); if (uLetter >= 20) continue; unsigned uNewCount = uLetterCount[uLetter] + 1; uLetterCount[uLetter] = uNewCount; if (1 == uNewCount) ++uDifferentLetterCount; } // Compute weight contributions for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex); if (uLetter >= 20) continue; const unsigned uCount = uLetterCount[uLetter]; unsigned uDenom = uCount*uDifferentLetterCount; if (uDenom == 0) continue; m_Weights[uSeqIndex] += (WEIGHT) (1.0/uDenom); } }
// Returns the first sequence whose id equals uId.
// Calls Quit if no sequence has that id.
Seq &SeqVect::GetSeqById(unsigned uId)
	{
	const unsigned uSeqCount = GetSeqCount();
	for (unsigned i = 0; i < uSeqCount; ++i)
		{
		if (GetSeqId(i) == uId)
			return GetSeq(i);
		}

// The message used to name a non-existent function ("GetSeqIdByUd") and
// printed the unsigned id with %d.
	Quit("SeqVect::GetSeqById(%u): not found", uId);

// NOTE(review): Quit presumably does not return; this statement only exists
// to satisfy the return type and must never actually execute.
	return (Seq &) *((Seq *) 0);
	}
unsigned SeqVect::GetSeqIdFromName(const char *Name) const { const unsigned uSeqCount = GetSeqCount(); for (unsigned i = 0; i < uSeqCount; ++i) { if (!strcmp(Name, GetSeqName(i))) return GetSeqId(i); } Quit("SeqVect::GetSeqIdFromName(%s): not found", Name); return 0; }
void MSA::ListWeights() const { const unsigned uSeqCount = GetSeqCount(); Log("Weights:\n"); WEIGHT wTotal = 0; for (unsigned n = 0; n < uSeqCount; ++n) { wTotal += GetSeqWeight(n); Log("%6.3f %s\n", GetSeqWeight(n), GetSeqName(n)); } Log("Total weights = %6.3f, should be 1.0\n", wTotal); }
// Writes the alignment to File in interleaved PHYLIP layout.
// The first line holds the sequence count and column count. Columns are then
// written in blocks: the first block starts each row with the sequence name
// (truncated to 10 characters, passed through FixName, padded to 10) and
// holds up to BLOCKSIZE-10 columns; later blocks hold up to BLOCKSIZE
// columns. A space separates groups of 10 columns, letters are upper-cased,
// and blocks are separated by an empty line.
void MSA::ToPhyInterleavedFile(TextFile &File) const
	{
	const unsigned SeqCount = GetSeqCount();
	const unsigned ColCount = GetColCount();

// Both counts are unsigned, so print them with %u.
	File.PutFormat("%u %u\n", SeqCount, ColCount);

// Nothing more to write for an empty alignment. The sequence-count test is
// required: Col only advances inside the per-sequence loop, so with zero
// sequences and a non-zero column count the block loop never terminated.
	if (0 == ColCount || 0 == SeqCount)
		return;

	unsigned Col = 0;
	for (;;)
		{
		const unsigned ColBlockStart = Col;
		const bool bFirstBlock = (0 == ColBlockStart);
		const unsigned MaxCols = bFirstBlock ? (BLOCKSIZE - 10) : BLOCKSIZE;

		for (unsigned SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex)
			{
			if (bFirstBlock)
				{
				char Name[11];
				const char *ptrName = GetSeqName(SeqIndex);
				size_t n = strlen(ptrName);
				if (n > 10)
					n = 10;
				memcpy(Name, ptrName, n);
				Name[n] = 0;
				FixName(Name);
				File.PutFormat("%-10.10s", Name);
				}

		// Every row of the block restarts at the block's first column.
			Col = ColBlockStart;
			for (unsigned ColsThisBlock = 0; ColsThisBlock < MaxCols; ++ColsThisBlock)
				{
				if (Col == ColCount)
					break;
				if (ColsThisBlock%10 == 0 && (bFirstBlock || ColsThisBlock > 0))
					File.PutChar(' ');
				char c = GetChar(SeqIndex, Col);
			// <cctype> functions need an unsigned char value; a negative
			// plain char is undefined behavior.
				if (isalpha((unsigned char) c))
					c = (char) toupper((unsigned char) c);
				File.PutChar(c);
				++Col;
				}
			File.PutChar('\n');
			}

		if (Col == ColCount)
			break;
		File.PutChar('\n');
		}
	}
// Guesses the alphabet from the leading residues of the collection.
// Sequences are scanned in order, gap characters are skipped, and at most
// CHAR_COUNT characters are examined. If at least MIN_NUCLEO_PCT percent of
// them satisfy IsDNA the result is ALPHA_DNA; otherwise, if at least
// MIN_NUCLEO_PCT percent satisfy IsRNA, ALPHA_RNA; otherwise ALPHA_Amino
// (also returned when there are no sequences or no non-gap characters).
ALPHA SeqVect::GuessAlpha() const
	{
	const unsigned CHAR_COUNT = 100;
	const unsigned MIN_NUCLEO_PCT = 95;

	const unsigned uSeqs = GetSeqCount();
	if (0 == uSeqs)
		return ALPHA_Amino;

	unsigned uDNA = 0;
	unsigned uRNA = 0;
	unsigned uSeen = 0;

	for (unsigned uSeq = 0; uSeq < uSeqs && uSeen < CHAR_COUNT; ++uSeq)
		{
		const Seq &s = GetSeq(uSeq);
	// As before, the first sequence's length comes from GetSeqLength and
	// the others' from Seq::Length.
		const unsigned uLen = (0 == uSeq) ? GetSeqLength(0) : s.Length();

		for (unsigned uPos = 0; uPos < uLen && uSeen < CHAR_COUNT; ++uPos)
			{
			const char c = s.at(uPos);
			if (IsGapChar(c))
				continue;
			if (IsDNA(c))
				++uDNA;
			if (IsRNA(c))
				++uRNA;
			++uSeen;
			}
		}

	if (0 == uSeen)
		return ALPHA_Amino;
	if ((uDNA*100)/uSeen >= MIN_NUCLEO_PCT)
		return ALPHA_DNA;
	if ((uRNA*100)/uSeen >= MIN_NUCLEO_PCT)
		return ALPHA_RNA;
	return ALPHA_Amino;
	}
// Mean of GetCons over all columns for which IsGapColumn is false.
// Asserts that the alignment has at least one sequence, that at least one
// such column exists, and that the mean lies in (0, 1].
double MSA::GetAvgCons() const
	{
	assert(GetSeqCount() > 0);

	double dTotal = 0;
	unsigned uCounted = 0;
	for (unsigned uCol = 0; uCol < GetColCount(); ++uCol)
		{
		if (IsGapColumn(uCol))
			continue;
		dTotal += GetCons(uCol);
		++uCounted;
		}

	assert(uCounted > 0);
	const double dMean = dTotal / uCounted;
	assert(dMean > 0 && dMean <= 1);
	return dMean;
	}
// Return value is the group count, i.e. the effective number // of distinctly different sequences. unsigned MSA::CalcBLOSUMWeights(ClusterTree &BlosumCluster) const { // Build distance matrix DistFunc DF; unsigned uSeqCount = GetSeqCount(); DF.SetCount(uSeqCount); for (unsigned i = 0; i < uSeqCount; ++i) for (unsigned j = i+1; j < uSeqCount; ++j) { double dDist = GetPctIdentityPair(i, j); assert(dDist >= 0.0 && dDist <= 1.0); DF.SetDist(i, j, (float) (1.0 - dDist)); } // Cluster based on the distance function BlosumCluster.Create(DF); // Return value is HMMer's "effective sequence count". return SetBLOSUMNodeWeight(BlosumCluster.GetRoot(), 1.0 - BLOSUM_DIST); }
void MSA::GetFractionalWeightedCounts(unsigned uColIndex, bool bNormalize, FCOUNT fcCounts[], FCOUNT *ptrfcGapStart, FCOUNT *ptrfcGapEnd, FCOUNT *ptrfcGapExtend, FCOUNT *ptrfOcc, FCOUNT *ptrfcLL, FCOUNT *ptrfcLG, FCOUNT *ptrfcGL, FCOUNT *ptrfcGG) const { const unsigned uSeqCount = GetSeqCount(); const unsigned uColCount = GetColCount(); const char* seqName; memset(fcCounts, 0, g_AlphaSize*sizeof(FCOUNT)); WEIGHT wTotal = 0; FCOUNT fGap = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const WEIGHT w = GetSeqWeight(uSeqIndex); if (IsGap(uSeqIndex, uColIndex)) { fGap += w; continue; } else if (IsWildcard(uSeqIndex, uColIndex)) { const unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex); switch (g_Alpha) { case ALPHA_Amino: switch (uLetter) { case AX_B: // D or N fcCounts[AX_D] += w/2; fcCounts[AX_N] += w/2; break; case AX_Z: // E or Q fcCounts[AX_E] += w/2; fcCounts[AX_Q] += w/2; break; default: // any { const FCOUNT f = w/20; for (unsigned uLetter = 0; uLetter < 20; ++uLetter) fcCounts[uLetter] += f; break; } } break; case ALPHA_DNA: case ALPHA_RNA: switch (uLetter) { case AX_R: // G or A fcCounts[NX_G] += w/2; fcCounts[NX_A] += w/2; break; case AX_Y: // C or T/U fcCounts[NX_C] += w/2; fcCounts[NX_T] += w/2; break; default: // any const FCOUNT f = w/20; for (unsigned uLetter = 0; uLetter < 4; ++uLetter) fcCounts[uLetter] += f; break; } break; default: Quit("Alphabet %d not supported", g_Alpha); } continue; } unsigned uLetter = GetLetter(uSeqIndex, uColIndex); //BEGIN MODIFICATIONS TO MUSCLE int original=0; for(unsigned i=0; i<uColIndex; i++){ if (i >= this->GetColCount()){break;} ++original; char c = GetChar(uSeqIndex, i); if(c== '-'){ original--; } } seqName = this->GetSeqName(uSeqIndex); int compositeVectPosition; compositeVectPosition = atoi(seqName); CompositeVect CV = *CVLocation; Composite* CVL = CV[compositeVectPosition]; Composite C = *CVL; for(int j=0; j<21; j++){ fcCounts[j] = w*C[original][j]; wTotal = w*C[original][j]; } //ORIGINAL 
MUSLCE LINE WAS: //fcCounts[uLetter] += w; //wTotal += w; //END MODIFICATIONS TO MUSCLE } *ptrfOcc = (float) (1.0 - fGap); if (bNormalize && wTotal > 0) { if (wTotal > 1.001) Quit("wTotal=%g\n", wTotal); for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter) fcCounts[uLetter] /= wTotal; // AssertNormalized(fcCounts); } FCOUNT fcStartCount = 0; if (uColIndex == 0) { for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex)) fcStartCount += GetSeqWeight(uSeqIndex); } else { for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex - 1)) fcStartCount += GetSeqWeight(uSeqIndex); } FCOUNT fcEndCount = 0; if (uColCount - 1 == uColIndex) { for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex)) fcEndCount += GetSeqWeight(uSeqIndex); } else { for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex + 1)) fcEndCount += GetSeqWeight(uSeqIndex); } FCOUNT LL = 0; FCOUNT LG = 0; FCOUNT GL = 0; FCOUNT GG = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { WEIGHT w = GetSeqWeight(uSeqIndex); bool bLetterHere = !IsGap(uSeqIndex, uColIndex); bool bLetterPrev = (uColIndex == 0 || !IsGap(uSeqIndex, uColIndex - 1)); if (bLetterHere) { if (bLetterPrev) LL += w; else GL += w; } else { if (bLetterPrev) LG += w; else GG += w; } } FCOUNT fcExtendCount = 0; if (uColIndex > 0 && uColIndex < GetColCount() - 1) for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex) && IsGap(uSeqIndex, uColIndex - 1) && IsGap(uSeqIndex, uColIndex + 1)) fcExtendCount += GetSeqWeight(uSeqIndex); *ptrfcLL = LL; *ptrfcLG = LG; *ptrfcGL = GL; *ptrfcGG = GG; *ptrfcGapStart = fcStartCount; *ptrfcGapEnd = fcEndCount; *ptrfcGapExtend = fcExtendCount; }
void MSA::GetFractionalWeightedCounts(unsigned uColIndex, bool bNormalize, FCOUNT fcCounts[], FCOUNT *ptrfcGapStart, FCOUNT *ptrfcGapEnd, FCOUNT *ptrfcGapExtend, FCOUNT *ptrfOcc, FCOUNT *ptrfcLL, FCOUNT *ptrfcLG, FCOUNT *ptrfcGL, FCOUNT *ptrfcGG) const { const unsigned uSeqCount = GetSeqCount(); const unsigned uColCount = GetColCount(); memset(fcCounts, 0, g_AlphaSize*sizeof(FCOUNT)); WEIGHT wTotal = 0; FCOUNT fGap = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const WEIGHT w = GetSeqWeight(uSeqIndex); if (IsGap(uSeqIndex, uColIndex)) { fGap += w; continue; } else if (IsWildcard(uSeqIndex, uColIndex)) { const unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex); switch (g_Alpha) { case ALPHA_Amino: switch (uLetter) { case AX_B: // D or N fcCounts[AX_D] += w/2; fcCounts[AX_N] += w/2; break; case AX_Z: // E or Q fcCounts[AX_E] += w/2; fcCounts[AX_Q] += w/2; break; default: // any { const FCOUNT f = w/20; for (unsigned uLetter = 0; uLetter < 20; ++uLetter) fcCounts[uLetter] += f; break; } } break; case ALPHA_DNA: case ALPHA_RNA: switch (uLetter) { case AX_R: // G or A fcCounts[NX_G] += w/2; fcCounts[NX_A] += w/2; break; case AX_Y: // C or T/U fcCounts[NX_C] += w/2; fcCounts[NX_T] += w/2; break; default: // any const FCOUNT f = w/20; for (unsigned uLetter = 0; uLetter < 4; ++uLetter) fcCounts[uLetter] += f; break; } break; default: Quit("Alphabet %d not supported", g_Alpha); } continue; } unsigned uLetter = GetLetter(uSeqIndex, uColIndex); fcCounts[uLetter] += w; wTotal += w; } *ptrfOcc = (float) (1.0 - fGap); if (bNormalize && wTotal > 0) { if (wTotal > 1.001) Quit("wTotal=%g\n", wTotal); for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter) fcCounts[uLetter] /= wTotal; // AssertNormalized(fcCounts); } FCOUNT fcStartCount = 0; if (uColIndex == 0) { for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex)) fcStartCount += GetSeqWeight(uSeqIndex); } else { for (unsigned uSeqIndex = 0; uSeqIndex < 
uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex - 1)) fcStartCount += GetSeqWeight(uSeqIndex); } FCOUNT fcEndCount = 0; if (uColCount - 1 == uColIndex) { for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex)) fcEndCount += GetSeqWeight(uSeqIndex); } else { for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex + 1)) fcEndCount += GetSeqWeight(uSeqIndex); } FCOUNT LL = 0; FCOUNT LG = 0; FCOUNT GL = 0; FCOUNT GG = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { WEIGHT w = GetSeqWeight(uSeqIndex); bool bLetterHere = !IsGap(uSeqIndex, uColIndex); bool bLetterPrev = (uColIndex == 0 || !IsGap(uSeqIndex, uColIndex - 1)); if (bLetterHere) { if (bLetterPrev) LL += w; else GL += w; } else { if (bLetterPrev) LG += w; else GG += w; } } FCOUNT fcExtendCount = 0; if (uColIndex > 0 && uColIndex < GetColCount() - 1) for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex) && IsGap(uSeqIndex, uColIndex - 1) && IsGap(uSeqIndex, uColIndex + 1)) fcExtendCount += GetSeqWeight(uSeqIndex); *ptrfcLL = LL; *ptrfcLG = LG; *ptrfcGL = GL; *ptrfcGG = GG; *ptrfcGapStart = fcStartCount; *ptrfcGapEnd = fcEndCount; *ptrfcGapExtend = fcExtendCount; }
// Writes the alignment to File in CLUSTAL (.aln) layout.
// A header line is written first (the CLUSTAL W banner when g_bClwStrict is
// set, otherwise the MUSCLE banner plus an empty line). The columns are then
// written in blocks of uCharsPerLine: one row per sequence, consisting of
// the name (cut at the first blank, padded or truncated to the common name
// width) followed by the upper-cased characters, and finally a consensus
// row produced by GetAlnConsensusChar.
// The name width is the longest name, clamped to [MIN_NAME, MAX_NAME].
void MSA::ToAlnFile(TextFile &File) const
	{
	if (getMuscleContext()->params.g_bClwStrict)
		File.PutString("CLUSTAL W (1.81) multiple sequence alignment\n");
	else
		{
		File.PutString("MUSCLE (" MUSCLE_MAJOR_VERSION "." MUSCLE_MINOR_VERSION ")" " multiple sequence alignment\n");
		File.PutString("\n");
		}

	const unsigned uSeqCount = GetSeqCount();
	const unsigned uColCount = GetColCount();

	int iLongestNameLength = 0;
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		const char *ptrName = GetSeqName(uSeqIndex);
		const char *ptrBlank = strchr(ptrName, ' ');
		int iLength;
		if (0 != ptrBlank)
			iLength = (int) (ptrBlank - ptrName);
		else
			iLength = (int) strlen(ptrName);
		if (iLength > iLongestNameLength)
			iLongestNameLength = iLength;
		}
	if (iLongestNameLength > MAX_NAME)
		iLongestNameLength = MAX_NAME;
	if (iLongestNameLength < MIN_NAME)
		iLongestNameLength = MIN_NAME;

// With zero columns "(GetColCount() - 1)" wrapped around, producing a huge
// block count and out-of-range column indexes. An empty alignment now
// produces the header only.
	const unsigned uLineCount =
	  (0 == uColCount) ? 0 : (uColCount - 1)/uCharsPerLine + 1;

	for (unsigned uLineIndex = 0; uLineIndex < uLineCount; ++uLineIndex)
		{
		File.PutString("\n");
		unsigned uStartColIndex = uLineIndex*uCharsPerLine;
		unsigned uEndColIndex = uStartColIndex + uCharsPerLine - 1;
		if (uEndColIndex >= uColCount)
			uEndColIndex = uColCount - 1;

		char Name[MAX_NAME+1];
		for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
			{
			const char *ptrName = GetSeqName(uSeqIndex);
			const char *ptrBlank = strchr(ptrName, ' ');
			int iLength;
			if (0 != ptrBlank)
				iLength = (int) (ptrBlank - ptrName);
			else
				iLength = (int) strlen(ptrName);
			if (iLength > MAX_NAME)
				iLength = MAX_NAME;

		// Blank-fill, copy the name, then cut at the common width.
			memset(Name, ' ', MAX_NAME);
			memcpy(Name, ptrName, iLength);
			Name[iLongestNameLength] = 0;

			File.PutFormat("%s ", Name);
			for (unsigned uColIndex = uStartColIndex; uColIndex <= uEndColIndex;
			  ++uColIndex)
				{
				const char c = GetChar(uSeqIndex, uColIndex);
			// toupper needs an unsigned char value; a negative plain char
			// is undefined behavior.
				File.PutFormat("%c", toupper((unsigned char) c));
				}
			File.PutString("\n");
			}

	// Consensus row: blank name column followed by one symbol per column.
		memset(Name, ' ', MAX_NAME);
		Name[iLongestNameLength] = 0;
		File.PutFormat("%s ", Name);
		for (unsigned uColIndex = uStartColIndex; uColIndex <= uEndColIndex;
		  ++uColIndex)
			{
			const char c = GetAlnConsensusChar(*this, uColIndex);
			File.PutChar(c);
			}
		File.PutString("\n");
		}
	}