Example #1
0
// Perecent group identity of a pair of sequences.
// Positions with one or both gapped are ignored.
double MSA::GetPctGroupIdentityPair(unsigned uSeqIndex1,
  unsigned uSeqIndex2) const
	{

	const unsigned uColCount = GetColCount();
	unsigned uPosCount = 0;
	unsigned uSameCount = 0;
	for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
		{
		if (IsGap(uSeqIndex1, uColIndex))
			continue;
		if (IsGap(uSeqIndex2, uColIndex))
			continue;
		if (IsWildcard(uSeqIndex1, uColIndex))
			continue;
		if (IsWildcard(uSeqIndex2, uColIndex))
			continue;

		const unsigned uLetter1 = GetLetter(uSeqIndex1, uColIndex);
		const unsigned uLetter2 = GetLetter(uSeqIndex2, uColIndex);
		const unsigned uGroup1 = ResidueGroup[uLetter1];
		const unsigned uGroup2 = ResidueGroup[uLetter2];
		if (uGroup1 == uGroup2)
			++uSameCount;
		++uPosCount;
		}
	if (0 == uPosCount)
		return 0;
	return (double) uSameCount / (double) uPosCount;
	}
Example #2
0
double MSA::GetCons(unsigned uColIndex) const
	{
	unsigned Counts[MAX_ALPHA];
	for (unsigned uLetter = 0; uLetter < g_AlphaSize.get(); ++uLetter)
		Counts[uLetter] = 0;

	unsigned uMaxCount = 0;
	for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)
		{
		if (IsGap(uSeqIndex, uColIndex))
			continue;
		char c = GetChar(uSeqIndex, uColIndex);
		c = toupper(c);
		if ('X' == c || 'B' == c || 'Z' == c)
			continue;
		unsigned uLetter = GetLetter(uSeqIndex, uColIndex);
		unsigned uCount = Counts[uLetter] + 1;
		if (uCount > uMaxCount)
			uMaxCount = uCount;
		Counts[uLetter] = uCount;
		}

// Cons is undefined for all-gap column
	if (0 == uMaxCount)
		{
//		assert(false);
		return 1;
		}

	double dCons = (double) uMaxCount / (double) GetSeqCount();
	assert(dCons > 0 && dCons <= 1);
	return dCons;
	}
Example #3
0
// Summarizes one alignment column as sequence-weighted letter counts and
// sequence-weighted gap statistics. This is the composite-aware variant:
// a plain letter does not add its weight to a single letter, it adds the
// sequence weight multiplied by a per-residue distribution looked up in
// *CVLocation.
//
//   uColIndex       column to summarize.
//   bNormalize      if true and the accumulated plain-letter weight is > 0,
//                   fcCounts is divided by that weight. The function Quits
//                   if that weight exceeds 1.001.
//   fcCounts        out: first g_AlphaSize entries are zeroed, then filled.
//   ptrfcGapStart   out: weight of sequences gapped here but not in the
//                   previous column (column 0: every gapped sequence).
//   ptrfcGapEnd     out: weight of sequences gapped here but not in the
//                   next column (last column: every gapped sequence).
//   ptrfcGapExtend  out: weight of sequences gapped here and in both
//                   neighbouring columns.
//   ptrfOcc         out: 1 - (weight of sequences gapped here).
//   ptrfcLL, ptrfcLG, ptrfcGL, ptrfcGG
//                   out: weight split by (previous column, this column)
//                   state, L = letter, G = gap. Column 0 counts as being
//                   preceded by a letter.
//
// Wildcards are spread evenly over the letters they may stand for and do
// not contribute to the normalization weight.
void MSA::GetFractionalWeightedCounts(unsigned uColIndex, bool bNormalize,
  FCOUNT fcCounts[], FCOUNT *ptrfcGapStart, FCOUNT *ptrfcGapEnd,
  FCOUNT *ptrfcGapExtend, FCOUNT *ptrfOcc,
  FCOUNT *ptrfcLL, FCOUNT *ptrfcLG, FCOUNT *ptrfcGL, FCOUNT *ptrfcGG) const
	{
	const unsigned uSeqCount = GetSeqCount();
	const unsigned uColCount = GetColCount();

	// The composite loop originally ran over 21 entries per residue.
	// fcCounts is only zeroed and normalized over g_AlphaSize entries, so
	// never write past that.
	// NOTE(review): confirm what entry 20 of a composite row represents and
	// whether it should take part in wTotal.
	const int nCompositeLetters = (g_AlphaSize < 21u) ? (int) g_AlphaSize : 21;

	memset(fcCounts, 0, g_AlphaSize*sizeof(FCOUNT));
	WEIGHT wTotal = 0;
	FCOUNT fGap = 0;
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		const WEIGHT w = GetSeqWeight(uSeqIndex);
		if (IsGap(uSeqIndex, uColIndex))
			{
			fGap += w;
			continue;
			}
		else if (IsWildcard(uSeqIndex, uColIndex))
			{
			const unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex);
			switch (g_Alpha)
				{
			case ALPHA_Amino:
				switch (uLetter)
					{
				case AX_B:		// D or N
					fcCounts[AX_D] += w/2;
					fcCounts[AX_N] += w/2;
					break;
				case AX_Z:		// E or Q
					fcCounts[AX_E] += w/2;
					fcCounts[AX_Q] += w/2;
					break;
				default:		// any
					{
					const FCOUNT f = w/20;
					for (unsigned i = 0; i < 20; ++i)
						fcCounts[i] += f;
					break;
					}
					}
				break;

			case ALPHA_DNA:
			case ALPHA_RNA:
				switch (uLetter)
					{
				case AX_R:	// G or A
					fcCounts[NX_G] += w/2;
					fcCounts[NX_A] += w/2;
					break;
				case AX_Y:	// C or T/U
					fcCounts[NX_C] += w/2;
					fcCounts[NX_T] += w/2;
					break;
				default:	// any
					{
					// Four candidate letters, so each gets a quarter of the
					// weight (was w/20, which distributed only w/5 in total).
					const FCOUNT f = w/4;
					for (unsigned i = 0; i < 4; ++i)
						fcCounts[i] += f;
					break;
					}
					}
				break;

			default:
				Quit("Alphabet %d not supported", g_Alpha);
				}
			continue;
			}

		// Result unused here; the call is kept in case GetLetter validates
		// the character. TODO confirm and remove if it has no such effect.
		(void) GetLetter(uSeqIndex, uColIndex);

		//BEGIN MODIFICATIONS TO MUSCLE

		// Position of this column in the ungapped sequence: the number of
		// non-'-' characters to the left of uColIndex.
		int nResidueIndex = 0;
		for (unsigned i = 0; i < uColIndex && i < uColCount; ++i)
			if ('-' != GetChar(uSeqIndex, i))
				++nResidueIndex;

		// The sequence name is parsed as the index of its composite.
		// NOTE(review): atoi yields 0 for a non-numeric name and nothing here
		// range-checks the index or nResidueIndex - confirm callers guarantee
		// both.
		const int nCompositeIndex = atoi(GetSeqName(uSeqIndex));

		// Bind by pointer/reference; copying the whole vector and composite
		// for every sequence of every column is unnecessary.
		Composite *ptrComposite = (*CVLocation)[nCompositeIndex];
		Composite &Comp = *ptrComposite;

		// Accumulate across sequences (the original "=" discarded every
		// sequence but the last and left wTotal holding a single product).
		for (int j = 0; j < nCompositeLetters; ++j)
			{
			fcCounts[j] += w*Comp[nResidueIndex][j];
			wTotal += w*Comp[nResidueIndex][j];
			}

		//ORIGINAL MUSLCE LINE WAS:
		//fcCounts[uLetter] += w;
		//wTotal += w;
		//END MODIFICATIONS TO MUSCLE

		}
	*ptrfOcc = (float) (1.0 - fGap);

	if (bNormalize && wTotal > 0)
		{
		if (wTotal > 1.001)
			Quit("wTotal=%g\n", wTotal);
		for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter)
			fcCounts[uLetter] /= wTotal;
//		AssertNormalized(fcCounts);
		}

	// Gap statistics, gathered in a single pass over the sequences.
	const bool bFirstCol = (0 == uColIndex);
	const bool bLastCol = (uColCount - 1 == uColIndex);

	FCOUNT fcStartCount = 0;
	FCOUNT fcEndCount = 0;
	FCOUNT fcExtendCount = 0;
	FCOUNT LL = 0;
	FCOUNT LG = 0;
	FCOUNT GL = 0;
	FCOUNT GG = 0;
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		const WEIGHT w = GetSeqWeight(uSeqIndex);
		// There is no previous column at index 0; treat it as a letter.
		const bool bGapPrev = !bFirstCol && IsGap(uSeqIndex, uColIndex - 1);

		if (!IsGap(uSeqIndex, uColIndex))
			{
			if (bGapPrev)
				GL += w;
			else
				LL += w;
			continue;
			}

		// Gapped here. There is no next column after the last one; treat
		// it as a letter so the gap counts as ending.
		const bool bGapNext = !bLastCol && IsGap(uSeqIndex, uColIndex + 1);
		if (bGapPrev)
			GG += w;
		else
			{
			LG += w;
			fcStartCount += w;
			}
		if (!bGapNext)
			fcEndCount += w;
		if (bGapPrev && bGapNext)
			fcExtendCount += w;
		}

	*ptrfcLL = LL;
	*ptrfcLG = LG;
	*ptrfcGL = GL;
	*ptrfcGG = GG;
	*ptrfcGapStart = fcStartCount;
	*ptrfcGapEnd = fcEndCount;
	*ptrfcGapExtend = fcExtendCount;
	}
Example #4
0
// Summarizes one alignment column as sequence-weighted letter counts and
// sequence-weighted gap statistics.
//
//   uColIndex       column to summarize.
//   bNormalize      if true and the total weight of plain (non-gap,
//                   non-wildcard) letters is > 0, fcCounts is divided by
//                   that weight. The function Quits if that weight exceeds
//                   1.001.
//   fcCounts        out: first g_AlphaSize entries are zeroed, then filled.
//   ptrfcGapStart   out: weight of sequences gapped here but not in the
//                   previous column (column 0: every gapped sequence).
//   ptrfcGapEnd     out: weight of sequences gapped here but not in the
//                   next column (last column: every gapped sequence).
//   ptrfcGapExtend  out: weight of sequences gapped here and in both
//                   neighbouring columns.
//   ptrfOcc         out: 1 - (weight of sequences gapped here).
//   ptrfcLL, ptrfcLG, ptrfcGL, ptrfcGG
//                   out: weight split by (previous column, this column)
//                   state, L = letter, G = gap. Column 0 counts as being
//                   preceded by a letter.
//
// Wildcards are spread evenly over the letters they may stand for and do
// not contribute to the normalization weight.
void MSA::GetFractionalWeightedCounts(unsigned uColIndex, bool bNormalize,
  FCOUNT fcCounts[], FCOUNT *ptrfcGapStart, FCOUNT *ptrfcGapEnd,
  FCOUNT *ptrfcGapExtend, FCOUNT *ptrfOcc,
  FCOUNT *ptrfcLL, FCOUNT *ptrfcLG, FCOUNT *ptrfcGL, FCOUNT *ptrfcGG) const
	{
	const unsigned uSeqCount = GetSeqCount();
	const unsigned uColCount = GetColCount();

	memset(fcCounts, 0, g_AlphaSize*sizeof(FCOUNT));
	WEIGHT wTotal = 0;
	FCOUNT fGap = 0;
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		const WEIGHT w = GetSeqWeight(uSeqIndex);
		if (IsGap(uSeqIndex, uColIndex))
			{
			fGap += w;
			continue;
			}
		else if (IsWildcard(uSeqIndex, uColIndex))
			{
			const unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex);
			switch (g_Alpha)
				{
			case ALPHA_Amino:
				switch (uLetter)
					{
				case AX_B:		// D or N
					fcCounts[AX_D] += w/2;
					fcCounts[AX_N] += w/2;
					break;
				case AX_Z:		// E or Q
					fcCounts[AX_E] += w/2;
					fcCounts[AX_Q] += w/2;
					break;
				default:		// any
					{
					const FCOUNT f = w/20;
					for (unsigned i = 0; i < 20; ++i)
						fcCounts[i] += f;
					break;
					}
					}
				break;

			case ALPHA_DNA:
			case ALPHA_RNA:
				switch (uLetter)
					{
				case AX_R:	// G or A
					fcCounts[NX_G] += w/2;
					fcCounts[NX_A] += w/2;
					break;
				case AX_Y:	// C or T/U
					fcCounts[NX_C] += w/2;
					fcCounts[NX_T] += w/2;
					break;
				default:	// any
					{
					// Four candidate letters, so each gets a quarter of the
					// weight (was w/20, which distributed only w/5 in total).
					const FCOUNT f = w/4;
					for (unsigned i = 0; i < 4; ++i)
						fcCounts[i] += f;
					break;
					}
					}
				break;

			default:
				Quit("Alphabet %d not supported", g_Alpha);
				}
			continue;
			}

		// Plain letter: full weight goes to that letter.
		const unsigned uLetter = GetLetter(uSeqIndex, uColIndex);
		fcCounts[uLetter] += w;
		wTotal += w;
		}
	*ptrfOcc = (float) (1.0 - fGap);

	if (bNormalize && wTotal > 0)
		{
		if (wTotal > 1.001)
			Quit("wTotal=%g\n", wTotal);
		for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter)
			fcCounts[uLetter] /= wTotal;
//		AssertNormalized(fcCounts);
		}

	// Gap statistics, gathered in a single pass over the sequences.
	const bool bFirstCol = (0 == uColIndex);
	const bool bLastCol = (uColCount - 1 == uColIndex);

	FCOUNT fcStartCount = 0;
	FCOUNT fcEndCount = 0;
	FCOUNT fcExtendCount = 0;
	FCOUNT LL = 0;
	FCOUNT LG = 0;
	FCOUNT GL = 0;
	FCOUNT GG = 0;
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		const WEIGHT w = GetSeqWeight(uSeqIndex);
		// There is no previous column at index 0; treat it as a letter.
		const bool bGapPrev = !bFirstCol && IsGap(uSeqIndex, uColIndex - 1);

		if (!IsGap(uSeqIndex, uColIndex))
			{
			if (bGapPrev)
				GL += w;
			else
				LL += w;
			continue;
			}

		// Gapped here. There is no next column after the last one; treat
		// it as a letter so the gap counts as ending.
		const bool bGapNext = !bLastCol && IsGap(uSeqIndex, uColIndex + 1);
		if (bGapPrev)
			GG += w;
		else
			{
			LG += w;
			fcStartCount += w;
			}
		if (!bGapNext)
			fcEndCount += w;
		if (bGapPrev && bGapNext)
			fcExtendCount += w;
		}

	*ptrfcLL = LL;
	*ptrfcLG = LG;
	*ptrfcGL = GL;
	*ptrfcGG = GG;
	*ptrfcGapStart = fcStartCount;
	*ptrfcGapEnd = fcEndCount;
	*ptrfcGapExtend = fcExtendCount;
	}