C++ (Cpp) SeqVect::Length примеры использования

Пример #1

0

Показать файл

Файл: seqvect.cpp Проект: cran/muscle

void SeqVect::Copy(const SeqVect &rhs)
	{
	clear();
	unsigned uSeqCount = rhs.Length();
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		Seq *ptrSeq = rhs.at(uSeqIndex);
		Seq *ptrSeqCopy = new Seq;
		ptrSeqCopy->Copy(*ptrSeq);
		push_back(ptrSeqCopy);
		}
	}

Пример #2

0

Показать файл

Файл: fastdist.cpp Проект: Unode/ext_apps

void DistUnaligned(const SeqVect &v, DISTANCE DistMethod, DistFunc &DF)
	{
	const unsigned uSeqCount = v.Length();

	switch (DistMethod)
		{
	case DISTANCE_Kmer6_6:
		DistKmer6_6(v, DF);
		break;

	case DISTANCE_Kmer20_3:
		DistKmer20_3(v, DF);
		break;

	case DISTANCE_Kmer20_4:
		FastDistKmer(v, DF);
		break;

	case DISTANCE_Kbit20_3:
		DistKbit20_3(v, DF);
		break;

	case DISTANCE_Kmer4_6:
		DistKmer4_6(v, DF);
		break;

	case DISTANCE_PWKimura:
		DistPWKimura(v, DF);
		break;

	case DISTANCE_PWScoreDist:
		DistPWScoreDist(v, DF);
		break;

	default:
		Quit("DistUnaligned, unsupported distance method %d", DistMethod);
		}

//	const char **SeqNames = (const char **) malloc(uSeqCount*sizeof(char *));
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		const Seq &s = *(v[uSeqIndex]);

		const char *ptrName = s.GetName();
		unsigned uId = s.GetId();

		DF.SetName(uSeqIndex, ptrName);
		DF.SetId(uSeqIndex, uId);
		}
	}

Пример #3

0

Показать файл

Файл: mhack.cpp Проект: Unode/ext_apps

void MHackStart(SeqVect &v)
	{
	if (ALPHA_Amino != g_Alpha)
		return;

	const unsigned uSeqCount = v.Length();
	M = new bool[uSeqCount];
	memset(M, 0, uSeqCount*sizeof(bool));
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		Seq &s = v.GetSeq(uSeqIndex);
		if (0 == s.Length())
			continue;
		unsigned uId = s.GetId();
		if (s[0] == 'M' || s[0] == 'm')
			{
			M[uId] = true;
			s[0] = 'X';
			}
		}
	}

Пример #4

0

Показать файл

Файл: scoredist.cpp Проект: Unode/ext_apps

void DistPWScoreDist(const SeqVect &v, DistFunc &DF)
	{
	SEQWEIGHT SeqWeightSave = GetSeqWeightMethod();
	SetSeqWeightMethod(SEQWEIGHT_Henikoff);

	const unsigned uSeqCount = v.Length();
	DF.SetCount(uSeqCount);

	const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2;
	unsigned uCount = 0;
	SetProgressDesc("PW ScoreDist");
	for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1)
		{
		const Seq &s1 = v.GetSeq(uSeqIndex1);
		MSA msa1;
		msa1.FromSeq(s1);
		for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqIndex1; ++uSeqIndex2)
			{
			if (0 == uCount%20)
				Progress(uCount, uPairCount);
			++uCount;
			const Seq &s2 = v.GetSeq(uSeqIndex2);
			MSA msa2;
			msa2.FromSeq(s2);
		
			PWPath Path;
			MSA msaOut;
			AlignTwoMSAs(msa1, msa2, msaOut, Path, false, false);

			float d = (float) GetScoreDist(msaOut, 0, 1);
			DF.SetDist(uSeqIndex1, uSeqIndex2, d);
			}
		}
	ProgressStepsDone();

	SetSeqWeightMethod(SeqWeightSave);
	}

Пример #5

0

Показать файл

Файл: domuscle.cpp Проект: Wyss/mauve-py

void DoMuscle()
	{
	SetOutputFileName(g_pstrOutFileName.get());
	SetInputFileName(g_pstrInFileName.get());

	SetMaxIters(g_uMaxIters.get());
	SetSeqWeightMethod(g_SeqWeight1.get());

	TextFile fileIn(g_pstrInFileName.get());
	SeqVect v;
	v.FromFASTAFile(fileIn);
	const unsigned uSeqCount = v.Length();

	if (0 == uSeqCount)
		Quit("No sequences in input file");

	ALPHA Alpha = ALPHA_Undefined;
	switch (g_SeqType.get())
		{
	case SEQTYPE_Auto:
		Alpha = v.GuessAlpha();
		break;

	case SEQTYPE_Protein:
		Alpha = ALPHA_Amino;
		break;

	case SEQTYPE_DNA:
		Alpha = ALPHA_DNA;
		break;

	case SEQTYPE_RNA:
		Alpha = ALPHA_RNA;
		break;

	default:
		Quit("Invalid seq type");
		}
	SetAlpha(Alpha);
	v.FixAlpha();

//
// AED 21/12/06: Moved matrix loading code inside the PP param function so it gets called for all alignment types
//
	SetPPScore();


	unsigned uMaxL = 0;
	unsigned uTotL = 0;
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		unsigned L = v.GetSeq(uSeqIndex).Length();
		uTotL += L;
		if (L > uMaxL)
			uMaxL = L;
		}

	SetIter(1);
	g_bDiags.get() = g_bDiags1.get();
	SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount);

	SetMuscleSeqVect(v);

	MSA::SetIdCount(uSeqCount);

// Initialize sequence ids.
// From this point on, ids must somehow propogate from here.
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		v.SetSeqId(uSeqIndex, uSeqIndex);

	if (0 == uSeqCount)
		Quit("Input file '%s' has no sequences", g_pstrInFileName.get());
	if (1 == uSeqCount)
		{
		TextFile fileOut(g_pstrOutFileName.get(), true);
		v.ToFile(fileOut);
		return;
		}

	if (uSeqCount > 1)
		MHackStart(v);

// First iteration
	Tree GuideTree;
	if (0 != g_pstrUseTreeFileName.get())
		{
	// Discourage users...
		if (!g_bUseTreeNoWarn.get())
			fprintf(stderr, g_strUseTreeWarning);

	// Read tree from file
		TextFile TreeFile(g_pstrUseTreeFileName.get());
		GuideTree.FromFile(TreeFile);

	// Make sure tree is rooted
		if (!GuideTree.IsRooted())
			Quit("User tree must be rooted");

		if (GuideTree.GetLeafCount() != uSeqCount)
			Quit("User tree does not match input sequences");

		const unsigned uNodeCount = GuideTree.GetNodeCount();
		for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
			{
			if (!GuideTree.IsLeaf(uNodeIndex))
				continue;
			const char *LeafName = GuideTree.GetLeafName(uNodeIndex);
			unsigned uSeqIndex;
			bool SeqFound = v.FindName(LeafName, &uSeqIndex);
			if (!SeqFound)
				Quit("Label %s in tree does not match sequences", LeafName);
			unsigned uId = v.GetSeqIdFromName(LeafName);
			GuideTree.SetLeafId(uNodeIndex, uId);
			}
		}
	else
		TreeFromSeqVect(v, GuideTree, g_Cluster1.get(), g_Distance1.get(), g_Root1.get(),
		  g_pstrDistMxFileName1.get());

	const char *Tree1 = ValueOpt("Tree1");
	if (0 != Tree1)
		{
		TextFile f(Tree1, true);
		GuideTree.ToFile(f);
		if (g_bClusterOnly.get())
			return;
		}

	SetMuscleTree(GuideTree);
	ValidateMuscleIds(GuideTree);

	MSA msa;
	ProgNode *ProgNodes = 0;
	if (g_bLow.get())
		ProgNodes = ProgressiveAlignE(v, GuideTree, msa);
	else
		ProgressiveAlign(v, GuideTree, msa);
	SetCurrentAlignment(msa);

	if (0 != g_pstrComputeWeightsFileName.get())
		{
		extern void OutWeights(const char *FileName, const MSA &msa);
		SetMSAWeightsMuscle(msa);
		OutWeights(g_pstrComputeWeightsFileName.get(), msa);
		return;
		}

	ValidateMuscleIds(msa);

	if (1 == g_uMaxIters.get() || 2 == uSeqCount)
		{
		//TextFile fileOut(g_pstrOutFileName.get(), true);
		//MHackEnd(msa);
		//msa.ToFile(fileOut);
		MuscleOutput(msa);
		return;
		}

	if (0 == g_pstrUseTreeFileName.get())
		{
		g_bDiags.get() = g_bDiags2.get();
		SetIter(2);

		if (g_bLow.get())
			{
			if (0 != g_uMaxTreeRefineIters.get())
				RefineTreeE(msa, v, GuideTree, ProgNodes);
			}
		else
			RefineTree(msa, GuideTree);

		const char *Tree2 = ValueOpt("Tree2");
		if (0 != Tree2)
			{
			TextFile f(Tree2, true);
			GuideTree.ToFile(f);
			}
		}

	SetSeqWeightMethod(g_SeqWeight2.get());
	SetMuscleTree(GuideTree);

	if (g_bAnchors.get())
		RefineVert(msa, GuideTree, g_uMaxIters.get() - 2);
	else
		RefineHoriz(msa, GuideTree, g_uMaxIters.get() - 2, false, false);

#if	0
// Refining by subfamilies is disabled as it didn't give better
// results. I tried doing this before and after RefineHoriz.
// Should get back to this as it seems like this should work.
	RefineSubfams(msa, GuideTree, g_uMaxIters.get() - 2);
#endif

	ValidateMuscleIds(msa);
	ValidateMuscleIds(GuideTree);

	//TextFile fileOut(g_pstrOutFileName.get(), true);
	//MHackEnd(msa);
	//msa.ToFile(fileOut);
	MuscleOutput(msa);
	}

Пример #6

0

Показать файл

Файл: domuscle.cpp Проект: bigmuscle/bigmuscle

void DoMuscle(CompositeVect*CVLocation)
	{
	SetOutputFileName(g_pstrOutFileName);
	SetInputFileName(g_pstrInFileName);

	SetMaxIters(g_uMaxIters);
	SetSeqWeightMethod(g_SeqWeight1);

	TextFile fileIn(g_pstrInFileName);
	SeqVect v;
	v.FromFASTAFile(fileIn);
	const unsigned uSeqCount = v.Length();

	if (0 == uSeqCount)
		Quit("No sequences in input file");

	ALPHA Alpha = ALPHA_Undefined;
	switch (g_SeqType)
		{
	case SEQTYPE_Auto:
		Alpha = v.GuessAlpha();
		break;

	case SEQTYPE_Protein:
		Alpha = ALPHA_Amino;
		break;

	case SEQTYPE_DNA:
		Alpha = ALPHA_DNA;
		break;

	case SEQTYPE_RNA:
		Alpha = ALPHA_RNA;
		break;

	default:
		Quit("Invalid seq type");
		}
	SetAlpha(Alpha);
	v.FixAlpha();

	PTR_SCOREMATRIX UserMatrix = 0;
	if (0 != g_pstrMatrixFileName)
		{
		const char *FileName = g_pstrMatrixFileName;
		const char *Path = getenv("MUSCLE_MXPATH");
		if (Path != 0)
			{
			size_t n = strlen(Path) + 1 + strlen(FileName) + 1;
			char *NewFileName = new char[n];
			sprintf(NewFileName, "%s/%s", Path, FileName);
			FileName = NewFileName;
			}
		TextFile File(FileName);
		UserMatrix = ReadMx(File);
		g_Alpha = ALPHA_Amino;
		g_PPScore = PPSCORE_SP;
		}

	SetPPScore();

	if (0 != UserMatrix)
		g_ptrScoreMatrix = UserMatrix;

	unsigned uMaxL = 0;
	unsigned uTotL = 0;
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		unsigned L = v.GetSeq(uSeqIndex).Length();
		uTotL += L;
		if (L > uMaxL)
			uMaxL = L;
		}

	SetIter(1);
	g_bDiags = g_bDiags1;
	SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount);

	SetMuscleSeqVect(v);

	MSA::SetIdCount(uSeqCount);

// Initialize sequence ids.
// From this point on, ids must somehow propogate from here.
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		v.SetSeqId(uSeqIndex, uSeqIndex);

	if (0 == uSeqCount)
		Quit("Input file '%s' has no sequences", g_pstrInFileName);
	if (1 == uSeqCount)
		{
		TextFile fileOut(g_pstrOutFileName, true);
		v.ToFile(fileOut);
		return;
		}

	if (uSeqCount > 1)
		MHackStart(v);

// First iteration
	Tree GuideTree;
	if (0 != g_pstrUseTreeFileName)
	{
	// Discourage users...
		if (!g_bUseTreeNoWarn)
			fprintf(stderr, "%s", g_strUseTreeWarning);

	// Read tree from file
		TextFile TreeFile(g_pstrUseTreeFileName);
		GuideTree.FromFile(TreeFile);

	// Make sure tree is rooted
		if (!GuideTree.IsRooted())
			Quit("User tree must be rooted");

		if (GuideTree.GetLeafCount() != uSeqCount)
			Quit("User tree does not match input sequences");

		const unsigned uNodeCount = GuideTree.GetNodeCount();
		for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
			{
			if (!GuideTree.IsLeaf(uNodeIndex))
				continue;
			const char *LeafName = GuideTree.GetLeafName(uNodeIndex);
			unsigned uSeqIndex;
			bool SeqFound = v.FindName(LeafName, &uSeqIndex);
			if (!SeqFound)
				Quit("Label %s in tree does not match sequences", LeafName);
			unsigned uId = v.GetSeqIdFromName(LeafName);
			GuideTree.SetLeafId(uNodeIndex, uId);
			}
		}
	else
		TreeFromSeqVect(v, GuideTree, g_Cluster1, g_Distance1, g_Root1,
		  g_pstrDistMxFileName1);

	const char *Tree1 = ValueOpt("Tree1");
	if (0 != Tree1)
		{
		TextFile f(Tree1, true);
		GuideTree.ToFile(f);
		if (g_bClusterOnly)
			return;
		}

	SetMuscleTree(GuideTree);
	ValidateMuscleIds(GuideTree);

	MSA msa;
	msa.SetCompositeVector(CVLocation);
	ProgNode *ProgNodes = 0;
	if (g_bLow)
		ProgNodes = ProgressiveAlignE(v, GuideTree, msa);
	else
		ProgressiveAlign(v, GuideTree, msa);
	SetCurrentAlignment(msa);

	if (0 != g_pstrComputeWeightsFileName)
		{
		extern void OutWeights(const char *FileName, const MSA &msa);
		SetMSAWeightsMuscle(msa);
		OutWeights(g_pstrComputeWeightsFileName, msa);
		return;
		}

	ValidateMuscleIds(msa);

	if (1 == g_uMaxIters || 2 == uSeqCount)
		{
		//TextFile fileOut(g_pstrOutFileName, true);
		//MHackEnd(msa);
		//msa.ToFile(fileOut);
		MuscleOutput(msa);
		return;
		}

	if (0 == g_pstrUseTreeFileName)
		{
		g_bDiags = g_bDiags2;
		SetIter(2);

		if (g_bLow)
			{
			if (0 != g_uMaxTreeRefineIters)
				RefineTreeE(msa, v, GuideTree, ProgNodes);
			}
		else
			RefineTree(msa, GuideTree);

		const char *Tree2 = ValueOpt("Tree2");
		if (0 != Tree2)
			{
			TextFile f(Tree2, true);
			GuideTree.ToFile(f);
			}
		}

	SetSeqWeightMethod(g_SeqWeight2);
	SetMuscleTree(GuideTree);

	if (g_bAnchors)
		RefineVert(msa, GuideTree, g_uMaxIters - 2);
	else
		RefineHoriz(msa, GuideTree, g_uMaxIters - 2, false, false);

#if	0
// Refining by subfamilies is disabled as it didn't give better
// results. I tried doing this before and after RefineHoriz.
// Should get back to this as it seems like this should work.
	RefineSubfams(msa, GuideTree, g_uMaxIters - 2);
#endif

	ValidateMuscleIds(msa);
	ValidateMuscleIds(GuideTree);

	//TextFile fileOut(g_pstrOutFileName, true);
	//MHackEnd(msa);
	//msa.ToFile(fileOut);
	MuscleOutput(msa);
	}

Пример #7

0

Показать файл

Файл: fastdistmafft.cpp Проект: bigmuscle/bigmuscle

void DistKmer6_6(const SeqVect &v, DistFunc &DF)
	{
	const unsigned uSeqCount = v.Length();

	DF.SetCount(uSeqCount);
	if (0 == uSeqCount)
		return;

// Initialize distance matrix to zero
	for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
		{
		DF.SetDist(uSeq1, uSeq1, 0);
		for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
			DF.SetDist(uSeq1, uSeq2, 0);
		}

// Convert to letters
	unsigned **Letters = new unsigned *[uSeqCount];
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		Seq &s = *(v[uSeqIndex]);
		const unsigned uSeqLength = s.Length();
		unsigned *L = new unsigned[uSeqLength];
		Letters[uSeqIndex] = L;
		for (unsigned n = 0; n < uSeqLength; ++n)
			{
			char c = s[n];
			L[n] = CharToLetterEx(c);
			assert(L[n] < uResidueGroupCount);
			}
		}

	unsigned **uCommonTupleCount = new unsigned *[uSeqCount];
	for (unsigned n = 0; n < uSeqCount; ++n)
		{
		uCommonTupleCount[n] = new unsigned[uSeqCount];
		memset(uCommonTupleCount[n], 0, uSeqCount*sizeof(unsigned));
		}

	const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2;
	unsigned uCount = 0;
	for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
		{
		Seq &seq1 = *(v[uSeq1]);
		const unsigned uSeqLength1 = seq1.Length();
		if (uSeqLength1 < 5)
			continue;

		const unsigned uTupleCount = uSeqLength1 - 5;
		const unsigned *L = Letters[uSeq1];
		CountTuples(L, uTupleCount, Count1);
#if	TRACE
		{
		Log("Seq1=%d\n", uSeq1);
		Log("Groups:\n");
		for (unsigned n = 0; n < uSeqLength1; ++n)
			Log("%u", ResidueGroup[L[n]]);
		Log("\n");

		Log("Tuples:\n");
		ListCount(Count1);
		}
#endif

		SetProgressDesc("K-mer dist pass 1");
		for (unsigned uSeq2 = 0; uSeq2 <= uSeq1; ++uSeq2)
			{
			if (0 == uCount%500)
				Progress(uCount, uPairCount);
			++uCount;
			Seq &seq2 = *(v[uSeq2]);
			const unsigned uSeqLength2 = seq2.Length();
			if (uSeqLength2 < 5)
				{
				if (uSeq1 == uSeq2)
					DF.SetDist(uSeq1, uSeq2, 0);
				else
					DF.SetDist(uSeq1, uSeq2, 1);
				continue;
				}

		// First pass through seq 2 to count tuples
			const unsigned uTupleCount = uSeqLength2 - 5;
			const unsigned *L = Letters[uSeq2];
			CountTuples(L, uTupleCount, Count2);
#if	TRACE
			Log("Seq2=%d Counts=\n", uSeq2);
			ListCount(Count2);
#endif

		// Second pass to accumulate sum of shared tuples
		// MAFFT defines this as the sum over unique tuples
		// in seq2 of the minimum of the number of tuples found
		// in the two sequences.
			unsigned uSum = 0;
			for (unsigned n = 0; n < uTupleCount; ++n)
				{
				const unsigned uTuple = GetTuple(L, n);
				uSum += MIN(Count1[uTuple], Count2[uTuple]);

			// This is a hack to make sure each unique tuple counted only once.
				Count2[uTuple] = 0;
				}
#if	TRACE
			{
			Seq &s1 = *(v[uSeq1]);
			Seq &s2 = *(v[uSeq2]);
			const char *pName1 = s1.GetName();
			const char *pName2 = s2.GetName();
			Log("Common count %s(%d) - %s(%d) =%u\n",
			  pName1, uSeq1, pName2, uSeq2, uSum);
			}
#endif
			uCommonTupleCount[uSeq1][uSeq2] = uSum;
			uCommonTupleCount[uSeq2][uSeq1] = uSum;
			}
		}
	ProgressStepsDone();

	uCount = 0;
	SetProgressDesc("K-mer dist pass 2");
	for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
		{
		Seq &s1 = *(v[uSeq1]);
		const char *pName1 = s1.GetName();

		double dCommonTupleCount11 = uCommonTupleCount[uSeq1][uSeq1];
		if (0 == dCommonTupleCount11)
			dCommonTupleCount11 = 1;

		DF.SetDist(uSeq1, uSeq1, 0);
		for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
			{
			if (0 == uCount%500)
				Progress(uCount, uPairCount);
			++uCount;

			double dCommonTupleCount22 = uCommonTupleCount[uSeq2][uSeq2];
			if (0 == dCommonTupleCount22)
				dCommonTupleCount22 = 1;

			const double dDist1 = 3.0*(dCommonTupleCount11 - uCommonTupleCount[uSeq1][uSeq2])
			  /dCommonTupleCount11;
			const double dDist2 = 3.0*(dCommonTupleCount22 - uCommonTupleCount[uSeq1][uSeq2])
			  /dCommonTupleCount22;

		// dMinDist is the value used for tree-building in MAFFT
			const double dMinDist = MIN(dDist1, dDist2);
			DF.SetDist(uSeq1, uSeq2, (float) dMinDist);

			//const double dEstimatedPctId = TupleDistToEstimatedPctId(dMinDist);
			//g_dfPwId.SetDist(uSeq1, uSeq2, dEstimatedPctId);
		// **** TODO **** why does this make score slightly worse??
			//const double dKimuraDist = KimuraDist(dEstimatedPctId);
			//DF.SetDist(uSeq1, uSeq2, dKimuraDist);
			}
		}
	ProgressStepsDone();

	for (unsigned n = 0; n < uSeqCount; ++n)
		delete[] uCommonTupleCount[n];
	delete[] uCommonTupleCount;
	delete[] Letters;
	}

Пример #8

0

Показать файл

Файл: fastdistjones.cpp Проект: bigmuscle/bigmuscle

// WARNING: Sequences MUST be stripped of gaps and upper case!
void DistKmer20_3(const SeqVect &v, DistFunc &DF)
{
    const unsigned uSeqCount = v.Length();

    DF.SetCount(uSeqCount);
    if (0 == uSeqCount)
        return;
    for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
    {
        DF.SetDist(uSeq1, uSeq1, 0);
        for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
            DF.SetDist(uSeq1, uSeq2, 0);
    }

    const unsigned uTripleArrayBytes = TRIPLE_COUNT*sizeof(TripleCount);
    TripleCounts = (TripleCount *) malloc(uTripleArrayBytes);
    if (0 == TripleCounts)
        Quit("Not enough memory (TripleCounts)");
    memset(TripleCounts, 0, uTripleArrayBytes);

    for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)
    {
        TripleCount &tc = *(TripleCounts + uWord);
        const unsigned uBytes = uSeqCount*sizeof(short);
        tc.m_Counts = (unsigned short *) malloc(uBytes);
        memset(tc.m_Counts, 0, uBytes);
    }

    for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
    {
        Seq &s = *(v[uSeqIndex]);
        const unsigned uSeqLength = s.Length();
        for (unsigned uPos = 0; uPos < uSeqLength - 2; ++uPos)
        {
            const unsigned uLetter1 = CharToLetterEx(s[uPos]);
            if (uLetter1 >= 20)
                continue;
            const unsigned uLetter2 = CharToLetterEx(s[uPos+1]);
            if (uLetter2 >= 20)
                continue;
            const unsigned uLetter3 = CharToLetterEx(s[uPos+2]);
            if (uLetter3 >= 20)
                continue;

            const unsigned uWord = uLetter1 + uLetter2*20 + uLetter3*20*20;
            assert(uWord < TRIPLE_COUNT);

            TripleCount &tc = *(TripleCounts + uWord);
            const unsigned uOldCount = tc.m_Counts[uSeqIndex];
            if (0 == uOldCount)
                ++(tc.m_uSeqCount);

            ++(tc.m_Counts[uSeqIndex]);
        }
    }

#if TRACE
    {
        Log("TripleCounts\n");
        unsigned uGrandTotal = 0;
        for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)
        {
            const TripleCount &tc = *(TripleCounts + uWord);
            if (0 == tc.m_uSeqCount)
                continue;

            const unsigned uLetter3 = uWord/(20*20);
            const unsigned uLetter2 = (uWord - uLetter3*20*20)/20;
            const unsigned uLetter1 = uWord%20;
            Log("Word %6u %c%c%c   %6u",
                uWord,
                LetterToCharAmino(uLetter1),
                LetterToCharAmino(uLetter2),
                LetterToCharAmino(uLetter3),
                tc.m_uSeqCount);

            unsigned uSeqCountWithThisWord = 0;
            for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
            {
                const unsigned uCount = tc.m_Counts[uSeqIndex];
                if (uCount > 0)
                {
                    ++uSeqCountWithThisWord;
                    Log(" %u=%u", uSeqIndex, uCount);
                    uGrandTotal += uCount;
                }
            }
            if (uSeqCountWithThisWord != tc.m_uSeqCount)
                Log(" *** SQ ERROR *** %u %u", tc.m_uSeqCount, uSeqCountWithThisWord);
            Log("\n");
        }

        unsigned uTotalBySeqLength = 0;
        for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
        {
            Seq &s = *(v[uSeqIndex]);
            const unsigned uSeqLength = s.Length();
            uTotalBySeqLength += uSeqLength - 2;
        }
        if (uGrandTotal != uTotalBySeqLength)
            Log("*** TOTALS DISAGREE *** %u %u\n", uGrandTotal, uTotalBySeqLength);
    }
#endif

    const unsigned uSeqListBytes = uSeqCount*sizeof(unsigned);
    unsigned short *SeqList = (unsigned short *) malloc(uSeqListBytes);

    for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)
    {
        const TripleCount &tc = *(TripleCounts + uWord);
        if (0 == tc.m_uSeqCount)
            continue;

        unsigned uSeqCountFound = 0;
        memset(SeqList, 0, uSeqListBytes);

        for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
        {
            if (tc.m_Counts[uSeqIndex] > 0)
            {
                SeqList[uSeqCountFound] = uSeqIndex;
                ++uSeqCountFound;
                if (uSeqCountFound == tc.m_uSeqCount)
                    break;
            }
        }
        assert(uSeqCountFound == tc.m_uSeqCount);

        for (unsigned uSeq1 = 0; uSeq1 < uSeqCountFound; ++uSeq1)
        {
            const unsigned uSeqIndex1 = SeqList[uSeq1];
            const unsigned uCount1 = tc.m_Counts[uSeqIndex1];
            for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
            {
                const unsigned uSeqIndex2 = SeqList[uSeq2];
                const unsigned uCount2 = tc.m_Counts[uSeqIndex2];
                const unsigned uMinCount = uCount1 < uCount2 ? uCount1 : uCount2;
                const double d = DF.GetDist(uSeqIndex1, uSeqIndex2);
                DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (d + uMinCount));
            }
        }
    }
    delete[] SeqList;
    free(TripleCounts);

    unsigned uDone = 0;
    const unsigned uTotal = (uSeqCount*(uSeqCount - 1))/2;
    for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
    {
        DF.SetDist(uSeq1, uSeq1, 0.0);

        const Seq &s1 = *(v[uSeq1]);
        const unsigned uLength1 = s1.Length();

        for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
        {
            const Seq &s2 = *(v[uSeq2]);
            const unsigned uLength2 = s2.Length();
            unsigned uMinLength = uLength1 < uLength2 ? uLength1 : uLength2;
            if (uMinLength < 3)
            {
                DF.SetDist(uSeq1, uSeq2, 1.0);
                continue;
            }

            const double dTripleCount = DF.GetDist(uSeq1, uSeq2);
            if (dTripleCount == 0)
            {
                DF.SetDist(uSeq1, uSeq2, 1.0);
                continue;
            }
            double dNormalizedTripletScore = dTripleCount/(uMinLength - 2);
            //double dEstimatedPairwiseIdentity = exp(0.3912*log(dNormalizedTripletScore));
            //if (dEstimatedPairwiseIdentity > 1)
            //	dEstimatedPairwiseIdentity = 1;
//			DF.SetDist(uSeq1, uSeq2, (float) (1.0 - dEstimatedPairwiseIdentity));
            DF.SetDist(uSeq1, uSeq2, (float) dNormalizedTripletScore);

#if	TRACE
            {
                Log("%s - %s  Triplet count = %g  Lengths %u, %u Estimated pwid = %g\n",
                    s1.GetName(), s2.GetName(), dTripleCount, uLength1, uLength2,
                    dEstimatedPairwiseIdentity);
            }
#endif
            if (uDone%1000 == 0)
                Progress(uDone, uTotal);
        }
    }
    ProgressStepsDone();
}

Пример #9

0

Показать файл

Файл: progressivealign.cpp Проект: Wyss/mauve-py

void ProgressiveAlign(const SeqVect &v, const Tree &GuideTree, MSA &a)
	{
	assert(GuideTree.IsRooted());

#if	TRACE
	Log("GuideTree:\n");
	GuideTree.LogMe();
#endif

	const unsigned uSeqCount = v.Length();
	const unsigned uNodeCount = 2*uSeqCount - 1;

	ProgNode *ProgNodes = new ProgNode[uNodeCount];

	unsigned uJoin = 0;
	unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode();
	SetProgressDesc("Align node");
	do
		{
		if (GuideTree.IsLeaf(uTreeNodeIndex))
			{
			if (uTreeNodeIndex >= uNodeCount)
				Quit("TreeNodeIndex=%u NodeCount=%u\n", uTreeNodeIndex, uNodeCount);
			ProgNode &Node = ProgNodes[uTreeNodeIndex];
			unsigned uId = GuideTree.GetLeafId(uTreeNodeIndex);
			if (uId >= uSeqCount)
				Quit("Seq index out of range");
			const Seq &s = *(v[uId]);
			Node.m_MSA.FromSeq(s);
			Node.m_MSA.SetSeqId(0, uId);
			Node.m_uLength = Node.m_MSA.GetColCount();
			}
		else
			{
			Progress(uJoin, uSeqCount - 1);
			++uJoin;

			const unsigned uMergeNodeIndex = uTreeNodeIndex;
			ProgNode &Parent = ProgNodes[uMergeNodeIndex];

			const unsigned uLeft = GuideTree.GetLeft(uTreeNodeIndex);
			const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex);

			ProgNode &Node1 = ProgNodes[uLeft];
			ProgNode &Node2 = ProgNodes[uRight];

			PWPath Path;
			AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, Path);
			Parent.m_uLength = Parent.m_MSA.GetColCount();

			Node1.m_MSA.Clear();
			Node2.m_MSA.Clear();
			}
		uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex);
		}
	while (NULL_NEIGHBOR != uTreeNodeIndex);
	ProgressStepsDone();

	unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex();
	const ProgNode &RootProgNode = ProgNodes[uRootNodeIndex];
	a.Copy(RootProgNode.m_MSA);

	delete[] ProgNodes;
	ProgNodes = 0;
	}

Пример #10

0

Показать файл

Файл: subfam.cpp Проект: Wyss/mauve-py

void ProgAlignSubFams()
	{
	MSA msaOut;

	SetOutputFileName(g_pstrOutFileName.get());
	SetInputFileName(g_pstrInFileName.get());

	SetMaxIters(g_uMaxIters.get());
	SetSeqWeightMethod(g_SeqWeight1.get());

	TextFile fileIn(g_pstrInFileName.get());
	SeqVect v;
	v.FromFASTAFile(fileIn);
	const unsigned uSeqCount = v.Length();

	if (0 == uSeqCount)
		Quit("No sequences in input file");

	ALPHA Alpha = ALPHA_Undefined;
	switch (g_SeqType.get())
		{
	case SEQTYPE_Auto:
		Alpha = v.GuessAlpha();
		break;

	case SEQTYPE_Protein:
		Alpha = ALPHA_Amino;
		break;

	case SEQTYPE_DNA:
		Alpha = ALPHA_DNA;
		break;

	case SEQTYPE_RNA:
		Alpha = ALPHA_RNA;
		break;

	default:
		Quit("Invalid seq type");
		}
	SetAlpha(Alpha);
	v.FixAlpha();

	PTR_SCOREMATRIX UserMatrix = 0;
	if (0 != g_pstrMatrixFileName.get())
		{
		const char *FileName = g_pstrMatrixFileName.get();
		const char *Path = getenv("MUSCLE_MXPATH");
		if (Path != 0)
			{
			size_t n = strlen(Path) + 1 + strlen(FileName) + 1;
			char *NewFileName = new char[n];
			sprintf(NewFileName, "%s/%s", Path, FileName);
			FileName = NewFileName;
			}
		TextFile File(FileName);
		UserMatrix = ReadMx(File);
		g_Alpha = ALPHA_Amino;
		g_PPScore = PPSCORE_SP;
		}

	SetPPScore();

	if (0 != UserMatrix)
		g_ptrScoreMatrix = UserMatrix;

	if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha)
		{
		SetPPScore(PPSCORE_SPN);
		g_Distance1.get() = DISTANCE_Kmer4_6;
		}

	unsigned uMaxL = 0;
	unsigned uTotL = 0;
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		unsigned L = v.GetSeq(uSeqIndex).Length();
		uTotL += L;
		if (L > uMaxL)
			uMaxL = L;
		}

	SetIter(1);
	g_bDiags.get() = g_bDiags1.get();
	SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount);

	SetMuscleSeqVect(v);

	MSA::SetIdCount(uSeqCount);

// Initialize sequence ids.
// From this point on, ids must somehow propogate from here.
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		v.SetSeqId(uSeqIndex, uSeqIndex);

	if (uSeqCount > 1)
		MHackStart(v);

	if (0 == uSeqCount)
		{
		msaOut.Clear();
		return;
		}

	if (1 == uSeqCount && ALPHA_Amino == Alpha)
		{
		const Seq &s = v.GetSeq(0);
		msaOut.FromSeq(s);
		return;
		}

	Tree GuideTree;
	TreeFromSeqVect(v, GuideTree, g_Cluster1.get(), g_Distance1.get(), g_Root1.get());
	SetMuscleTree(GuideTree);

	MSA msa;
	if (g_bLow.get())
		{
		ProgNode *ProgNodes = 0;
		ProgNodes = ProgressiveAlignE(v, GuideTree, msa);
		delete[] ProgNodes;
		}
	else
		ProgressiveAlign(v, GuideTree, msa);
	SetCurrentAlignment(msa);
	TreeFromMSA(msa, GuideTree, g_Cluster2.get(), g_Distance2.get(), g_Root2.get());
	SetMuscleTree(GuideTree);

	unsigned *SubFams = new unsigned[uSeqCount];
	unsigned uSubFamCount;
	SubFam(GuideTree, g_uMaxSubFamCount.get(), SubFams, &uSubFamCount);

	SetProgressDesc("Align node");
	const unsigned uNodeCount = 2*uSeqCount - 1;

	ProgNode *ProgNodes = new ProgNode[uNodeCount];
	bool *NodeIsSubFam = new bool[uNodeCount];
	bool *NodeInSubFam = new bool[uNodeCount];

	for (unsigned i = 0; i < uNodeCount; ++i)
		{
		NodeIsSubFam[i] = false;
		NodeInSubFam[i] = false;
		}

	for (unsigned i = 0; i < uSubFamCount; ++i)
		{
		unsigned uNodeIndex = SubFams[i];
		assert(uNodeIndex < uNodeCount);
		NodeIsSubFam[uNodeIndex] = true;
		SetInFam(GuideTree, uNodeIndex, NodeInSubFam);
		}

	unsigned uJoin = 0;
	unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode();
	do
		{
		if (NodeIsSubFam[uTreeNodeIndex])
			{
#if	TRACE
			Log("Node %d: align subfam\n", uTreeNodeIndex);
#endif
			ProgNode &Node = ProgNodes[uTreeNodeIndex];
			AlignSubFam(v, GuideTree, uTreeNodeIndex, Node.m_MSA);
			Node.m_uLength = Node.m_MSA.GetColCount();
			}
		else if (!NodeInSubFam[uTreeNodeIndex])
			{
#if	TRACE
			Log("Node %d: align two subfams\n", uTreeNodeIndex);
#endif
			Progress(uJoin, uSubFamCount - 1);
			++uJoin;

			const unsigned uMergeNodeIndex = uTreeNodeIndex;
			ProgNode &Parent = ProgNodes[uMergeNodeIndex];

			const unsigned uLeft = GuideTree.GetLeft(uTreeNodeIndex);
			const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex);

			ProgNode &Node1 = ProgNodes[uLeft];
			ProgNode &Node2 = ProgNodes[uRight];

			PWPath Path;
			AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, Path);
			Parent.m_uLength = Parent.m_MSA.GetColCount();

			Node1.m_MSA.Clear();
			Node2.m_MSA.Clear();
			}
		else
			{
#if	TRACE
			Log("Node %d: in subfam\n", uTreeNodeIndex);
#endif
			;
			}
		uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex);
		}
	while (NULL_NEIGHBOR != uTreeNodeIndex);
	ProgressStepsDone();

	unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex();
	ProgNode &RootProgNode = ProgNodes[uRootNodeIndex];

	TextFile fOut(g_pstrOutFileName.get(), true);
	MHackEnd(RootProgNode.m_MSA);
	RootProgNode.m_MSA.ToFile(fOut);

	delete[] NodeInSubFam;
	delete[] NodeIsSubFam;
	delete[] ProgNodes;
	delete[] SubFams;

	ProgNodes = 0;
	NodeInSubFam = 0;
	NodeIsSubFam = 0;
	SubFams = 0;
	}