int GetWeightedPageRankMP1(const PNEANet Graph, TIntFltH& PRankH, const TStr& Attr, const double& C, const double& Eps, const int& MaxIter) { if (!Graph->IsFltAttrE(Attr)) return -1; TFltV Weights = Graph->GetFltAttrVecE(Attr); int mxid = Graph->GetMxNId(); TFltV OutWeights(mxid); Graph->GetWeightOutEdgesV(OutWeights, Weights); /*for (TNEANet::TNodeI NI = Graph->BegNI(); NI < Graph->EndNI(); NI++) { OutWeights[NI.GetId()] = Graph->GetWeightOutEdges(NI, Attr); }*/ /*TIntFltH Weights; for (TNEANet::TNodeI NI = Graph->BegNI(); NI < Graph->EndNI(); NI++) { Weights.AddDat(NI.GetId(), Graph->GetWeightOutEdges(NI, Attr)); }*/ const int NNodes = Graph->GetNodes(); TVec<TNEANet::TNodeI> NV; //const double OneOver = 1.0/double(NNodes); PRankH.Gen(NNodes); for (TNEANet::TNodeI NI = Graph->BegNI(); NI < Graph->EndNI(); NI++) { NV.Add(NI); PRankH.AddDat(NI.GetId(), 1.0/NNodes); //IAssert(NI.GetId() == PRankH.GetKey(PRankH.Len()-1)); } TFltV TmpV(NNodes); for (int iter = 0; iter < MaxIter; iter++) { #pragma omp parallel for schedule(dynamic,10000) for (int j = 0; j < NNodes; j++) { TNEANet::TNodeI NI = NV[j]; TmpV[j] = 0; for (int e = 0; e < NI.GetInDeg(); e++) { const int InNId = NI.GetInNId(e); const TFlt OutWeight = OutWeights[InNId]; int EId = Graph->GetEId(InNId, NI.GetId()); const TFlt Weight = Weights[Graph->GetFltKeyIdE(EId)]; if (OutWeight > 0) { TmpV[j] += PRankH.GetDat(InNId) * Weight / OutWeight; } } TmpV[j] = C*TmpV[j]; // Berkhin (the correct way of doing it) //TmpV[j] = C*TmpV[j] + (1.0-C)*OneOver; // iGraph } double diff=0, sum=0, NewVal; #pragma omp parallel for reduction(+:sum) schedule(dynamic,10000) for (int i = 0; i < TmpV.Len(); i++) { sum += TmpV[i]; } const double Leaked = (1.0-sum) / double(NNodes); #pragma omp parallel for reduction(+:diff) schedule(dynamic,10000) for (int i = 0; i < PRankH.Len(); i++) { // re-instert leaked PageRank NewVal = TmpV[i] + Leaked; // Berkhin //NewVal = TmpV[i] / sum; // iGraph diff += fabs(NewVal-PRankH[i]); PRankH[i] = NewVal; } if (diff < Eps) { break; } } return 0; }
void DoMuscle() { SetOutputFileName(g_pstrOutFileName.get()); SetInputFileName(g_pstrInFileName.get()); SetMaxIters(g_uMaxIters.get()); SetSeqWeightMethod(g_SeqWeight1.get()); TextFile fileIn(g_pstrInFileName.get()); SeqVect v; v.FromFASTAFile(fileIn); const unsigned uSeqCount = v.Length(); if (0 == uSeqCount) Quit("No sequences in input file"); ALPHA Alpha = ALPHA_Undefined; switch (g_SeqType.get()) { case SEQTYPE_Auto: Alpha = v.GuessAlpha(); break; case SEQTYPE_Protein: Alpha = ALPHA_Amino; break; case SEQTYPE_DNA: Alpha = ALPHA_DNA; break; case SEQTYPE_RNA: Alpha = ALPHA_RNA; break; default: Quit("Invalid seq type"); } SetAlpha(Alpha); v.FixAlpha(); // // AED 21/12/06: Moved matrix loading code inside the PP param function so it gets called for all alignment types // SetPPScore(); unsigned uMaxL = 0; unsigned uTotL = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned L = v.GetSeq(uSeqIndex).Length(); uTotL += L; if (L > uMaxL) uMaxL = L; } SetIter(1); g_bDiags.get() = g_bDiags1.get(); SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount); SetMuscleSeqVect(v); MSA::SetIdCount(uSeqCount); // Initialize sequence ids. // From this point on, ids must somehow propogate from here. for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) v.SetSeqId(uSeqIndex, uSeqIndex); if (0 == uSeqCount) Quit("Input file '%s' has no sequences", g_pstrInFileName.get()); if (1 == uSeqCount) { TextFile fileOut(g_pstrOutFileName.get(), true); v.ToFile(fileOut); return; } if (uSeqCount > 1) MHackStart(v); // First iteration Tree GuideTree; if (0 != g_pstrUseTreeFileName.get()) { // Discourage users... if (!g_bUseTreeNoWarn.get()) fprintf(stderr, g_strUseTreeWarning); // Read tree from file TextFile TreeFile(g_pstrUseTreeFileName.get()); GuideTree.FromFile(TreeFile); // Make sure tree is rooted if (!GuideTree.IsRooted()) Quit("User tree must be rooted"); if (GuideTree.GetLeafCount() != uSeqCount) Quit("User tree does not match input sequences"); const unsigned uNodeCount = GuideTree.GetNodeCount(); for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (!GuideTree.IsLeaf(uNodeIndex)) continue; const char *LeafName = GuideTree.GetLeafName(uNodeIndex); unsigned uSeqIndex; bool SeqFound = v.FindName(LeafName, &uSeqIndex); if (!SeqFound) Quit("Label %s in tree does not match sequences", LeafName); unsigned uId = v.GetSeqIdFromName(LeafName); GuideTree.SetLeafId(uNodeIndex, uId); } } else TreeFromSeqVect(v, GuideTree, g_Cluster1.get(), g_Distance1.get(), g_Root1.get(), g_pstrDistMxFileName1.get()); const char *Tree1 = ValueOpt("Tree1"); if (0 != Tree1) { TextFile f(Tree1, true); GuideTree.ToFile(f); if (g_bClusterOnly.get()) return; } SetMuscleTree(GuideTree); ValidateMuscleIds(GuideTree); MSA msa; ProgNode *ProgNodes = 0; if (g_bLow.get()) ProgNodes = ProgressiveAlignE(v, GuideTree, msa); else ProgressiveAlign(v, GuideTree, msa); SetCurrentAlignment(msa); if (0 != g_pstrComputeWeightsFileName.get()) { extern void OutWeights(const char *FileName, const MSA &msa); SetMSAWeightsMuscle(msa); OutWeights(g_pstrComputeWeightsFileName.get(), msa); return; } ValidateMuscleIds(msa); if (1 == g_uMaxIters.get() || 2 == uSeqCount) { //TextFile fileOut(g_pstrOutFileName.get(), true); //MHackEnd(msa); //msa.ToFile(fileOut); MuscleOutput(msa); return; } if (0 == g_pstrUseTreeFileName.get()) { g_bDiags.get() = g_bDiags2.get(); SetIter(2); if (g_bLow.get()) { if (0 != g_uMaxTreeRefineIters.get()) RefineTreeE(msa, v, GuideTree, ProgNodes); } else RefineTree(msa, GuideTree); const char *Tree2 = ValueOpt("Tree2"); if (0 != Tree2) { TextFile f(Tree2, true); GuideTree.ToFile(f); } } SetSeqWeightMethod(g_SeqWeight2.get()); SetMuscleTree(GuideTree); if (g_bAnchors.get()) RefineVert(msa, GuideTree, g_uMaxIters.get() - 2); else RefineHoriz(msa, GuideTree, g_uMaxIters.get() - 2, false, false); #if 0 // Refining by subfamilies is disabled as it didn't give better // results. I tried doing this before and after RefineHoriz. // Should get back to this as it seems like this should work. RefineSubfams(msa, GuideTree, g_uMaxIters.get() - 2); #endif ValidateMuscleIds(msa); ValidateMuscleIds(GuideTree); //TextFile fileOut(g_pstrOutFileName.get(), true); //MHackEnd(msa); //msa.ToFile(fileOut); MuscleOutput(msa); }
void DoMuscle(CompositeVect*CVLocation) { SetOutputFileName(g_pstrOutFileName); SetInputFileName(g_pstrInFileName); SetMaxIters(g_uMaxIters); SetSeqWeightMethod(g_SeqWeight1); TextFile fileIn(g_pstrInFileName); SeqVect v; v.FromFASTAFile(fileIn); const unsigned uSeqCount = v.Length(); if (0 == uSeqCount) Quit("No sequences in input file"); ALPHA Alpha = ALPHA_Undefined; switch (g_SeqType) { case SEQTYPE_Auto: Alpha = v.GuessAlpha(); break; case SEQTYPE_Protein: Alpha = ALPHA_Amino; break; case SEQTYPE_DNA: Alpha = ALPHA_DNA; break; case SEQTYPE_RNA: Alpha = ALPHA_RNA; break; default: Quit("Invalid seq type"); } SetAlpha(Alpha); v.FixAlpha(); PTR_SCOREMATRIX UserMatrix = 0; if (0 != g_pstrMatrixFileName) { const char *FileName = g_pstrMatrixFileName; const char *Path = getenv("MUSCLE_MXPATH"); if (Path != 0) { size_t n = strlen(Path) + 1 + strlen(FileName) + 1; char *NewFileName = new char[n]; sprintf(NewFileName, "%s/%s", Path, FileName); FileName = NewFileName; } TextFile File(FileName); UserMatrix = ReadMx(File); g_Alpha = ALPHA_Amino; g_PPScore = PPSCORE_SP; } SetPPScore(); if (0 != UserMatrix) g_ptrScoreMatrix = UserMatrix; unsigned uMaxL = 0; unsigned uTotL = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned L = v.GetSeq(uSeqIndex).Length(); uTotL += L; if (L > uMaxL) uMaxL = L; } SetIter(1); g_bDiags = g_bDiags1; SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount); SetMuscleSeqVect(v); MSA::SetIdCount(uSeqCount); // Initialize sequence ids. // From this point on, ids must somehow propogate from here. for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) v.SetSeqId(uSeqIndex, uSeqIndex); if (0 == uSeqCount) Quit("Input file '%s' has no sequences", g_pstrInFileName); if (1 == uSeqCount) { TextFile fileOut(g_pstrOutFileName, true); v.ToFile(fileOut); return; } if (uSeqCount > 1) MHackStart(v); // First iteration Tree GuideTree; if (0 != g_pstrUseTreeFileName) { // Discourage users... if (!g_bUseTreeNoWarn) fprintf(stderr, "%s", g_strUseTreeWarning); // Read tree from file TextFile TreeFile(g_pstrUseTreeFileName); GuideTree.FromFile(TreeFile); // Make sure tree is rooted if (!GuideTree.IsRooted()) Quit("User tree must be rooted"); if (GuideTree.GetLeafCount() != uSeqCount) Quit("User tree does not match input sequences"); const unsigned uNodeCount = GuideTree.GetNodeCount(); for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (!GuideTree.IsLeaf(uNodeIndex)) continue; const char *LeafName = GuideTree.GetLeafName(uNodeIndex); unsigned uSeqIndex; bool SeqFound = v.FindName(LeafName, &uSeqIndex); if (!SeqFound) Quit("Label %s in tree does not match sequences", LeafName); unsigned uId = v.GetSeqIdFromName(LeafName); GuideTree.SetLeafId(uNodeIndex, uId); } } else TreeFromSeqVect(v, GuideTree, g_Cluster1, g_Distance1, g_Root1, g_pstrDistMxFileName1); const char *Tree1 = ValueOpt("Tree1"); if (0 != Tree1) { TextFile f(Tree1, true); GuideTree.ToFile(f); if (g_bClusterOnly) return; } SetMuscleTree(GuideTree); ValidateMuscleIds(GuideTree); MSA msa; msa.SetCompositeVector(CVLocation); ProgNode *ProgNodes = 0; if (g_bLow) ProgNodes = ProgressiveAlignE(v, GuideTree, msa); else ProgressiveAlign(v, GuideTree, msa); SetCurrentAlignment(msa); if (0 != g_pstrComputeWeightsFileName) { extern void OutWeights(const char *FileName, const MSA &msa); SetMSAWeightsMuscle(msa); OutWeights(g_pstrComputeWeightsFileName, msa); return; } ValidateMuscleIds(msa); if (1 == g_uMaxIters || 2 == uSeqCount) { //TextFile fileOut(g_pstrOutFileName, true); //MHackEnd(msa); //msa.ToFile(fileOut); MuscleOutput(msa); return; } if (0 == g_pstrUseTreeFileName) { g_bDiags = g_bDiags2; SetIter(2); if (g_bLow) { if (0 != g_uMaxTreeRefineIters) RefineTreeE(msa, v, GuideTree, ProgNodes); } else RefineTree(msa, GuideTree); const char *Tree2 = ValueOpt("Tree2"); if (0 != Tree2) { TextFile f(Tree2, true); GuideTree.ToFile(f); } } SetSeqWeightMethod(g_SeqWeight2); SetMuscleTree(GuideTree); if (g_bAnchors) RefineVert(msa, GuideTree, g_uMaxIters - 2); else RefineHoriz(msa, GuideTree, g_uMaxIters - 2, false, false); #if 0 // Refining by subfamilies is disabled as it didn't give better // results. I tried doing this before and after RefineHoriz. // Should get back to this as it seems like this should work. RefineSubfams(msa, GuideTree, g_uMaxIters - 2); #endif ValidateMuscleIds(msa); ValidateMuscleIds(GuideTree); //TextFile fileOut(g_pstrOutFileName, true); //MHackEnd(msa); //msa.ToFile(fileOut); MuscleOutput(msa); }
int GetWeightedPageRankMP2(const PNEANet Graph, TIntFltH& PRankH, const TStr& Attr, const double& C, const double& Eps, const int& MaxIter) { if (!Graph->IsFltAttrE(Attr)) return -1; const int NNodes = Graph->GetNodes(); TVec<TNEANet::TNodeI> NV; //const double OneOver = 1.0/double(NNodes); PRankH.Gen(NNodes); int MxId; for (TNEANet::TNodeI NI = Graph->BegNI(); NI < Graph->EndNI(); NI++) { NV.Add(NI); PRankH.AddDat(NI.GetId(), 1.0/NNodes); int Id = NI.GetId(); if (Id > MxId) { MxId = Id; } } TFltV PRankV(MxId+1); TFltV OutWeights(MxId+1); TFltV Weights = Graph->GetFltAttrVecE(Attr); #pragma omp parallel for schedule(dynamic,10000) for (int j = 0; j < NNodes; j++) { TNEANet::TNodeI NI = NV[j]; int Id = NI.GetId(); OutWeights[Id] = Graph->GetWeightOutEdges(NI, Attr); PRankV[Id] = 1/NNodes; } TFltV TmpV(NNodes); for (int iter = 0; iter < MaxIter; iter++) { #pragma omp parallel for schedule(dynamic,10000) for (int j = 0; j < NNodes; j++) { TNEANet::TNodeI NI = NV[j]; TFlt Tmp = 0; for (int e = 0; e < NI.GetInDeg(); e++) { const int InNId = NI.GetInNId(e); const TFlt OutWeight = OutWeights[InNId]; int EId = Graph->GetEId(InNId, NI.GetId()); const TFlt Weight = Weights[Graph->GetFltKeyIdE(EId)]; if (OutWeight > 0) { Tmp += PRankH.GetDat(InNId) * Weight / OutWeight; } } TmpV[j] = C*Tmp; // Berkhin (the correct way of doing it) //TmpV[j] = C*TmpV[j] + (1.0-C)*OneOver; // iGraph } double sum = 0; #pragma omp parallel for reduction(+:sum) schedule(dynamic,10000) for (int i = 0; i < TmpV.Len(); i++) { sum += TmpV[i]; } const double Leaked = (1.0-sum) / double(NNodes); double diff = 0; #pragma omp parallel for reduction(+:diff) schedule(dynamic,10000) for (int i = 0; i < NNodes; i++) { TNEANet::TNodeI NI = NV[i]; double NewVal = TmpV[i] + Leaked; // Berkhin //NewVal = TmpV[i] / sum; // iGraph int Id = NI.GetId(); diff += fabs(NewVal-PRankV[Id]); PRankV[Id] = NewVal; } if (diff < Eps) { break; } } #pragma omp parallel for schedule(dynamic,10000) for (int i = 0; i < NNodes; i++) { TNEANet::TNodeI NI = NV[i]; PRankH[i] = PRankV[NI.GetId()]; } return 0; }