static void LogGap(unsigned uStart, unsigned uEnd, unsigned uGapLength, bool bNTerm, bool bCTerm) { Log("%16.16s ", ""); for (unsigned i = 0; i < uStart; ++i) Log(" "); unsigned uMyLength = 0; for (unsigned i = uStart; i <= uEnd; ++i) { bool bGap1 = g_ptrMSA1.get()->IsGap(g_uSeqIndex1.get(), i); bool bGap2 = g_ptrMSA2.get()->IsGap(g_uSeqIndex2.get(), i); if (!bGap1 && !bGap2) Quit("Error -- neither gapping"); if (bGap1 && bGap2) Log("."); else { ++uMyLength; Log("-"); } } SCORE s = GapPenalty(uGapLength, bNTerm || bCTerm); Log(" L=%d N%d C%d s=%.3g", uGapLength, bNTerm, bCTerm, s); Log("\n"); if (uMyLength != uGapLength) Quit("Lengths differ"); }
// Formats a score for trace output.
//
// Scores below -100000 are rendered as the literal " *"; everything else is
// formatted with "%6.1f" into a per-thread static 16-byte buffer.
//
// The returned pointer refers to that shared buffer, so the text is
// overwritten by the next call on the same thread: do not pass two results
// of this function to a single Log()/printf() call.
static const char *LocalScoreToStr(SCORE s)
{
    enum { SCORE_STR_BYTES = 16 };
    static TLS<char[SCORE_STR_BYTES]> str;

    if (s < -100000)
        return " *";

    // "%6.1f" is a minimum width only; a very large score needs far more
    // than 16 characters, so the write must be bounded (was sprintf).
    snprintf(str.get(), SCORE_STR_BYTES, "%6.1f", s);
    return str.get();
}
// Maps an unordered pair of row indexes onto the linear subscript used for
// the triangular distance array: lo + hi*(hi - 1)/2, where hi is the larger
// and lo the smaller of the two indexes. The result is symmetric in its
// arguments.
static inline unsigned TriangleSubscript(unsigned uIndex1, unsigned uIndex2)
{
#if DEBUG
    if (uIndex1 >= g_uLeafCount.get() || uIndex2 >= g_uLeafCount.get())
        Quit("TriangleSubscript(%u,%u) %u", uIndex1, uIndex2, g_uLeafCount.get());
#endif
    const bool bFirstIsHigh = (uIndex1 >= uIndex2);
    const unsigned uHigh = bFirstIsHigh ? uIndex1 : uIndex2;
    const unsigned uLow = bFirstIsHigh ? uIndex2 : uIndex1;

    const unsigned uSubscript = uLow + (uHigh*(uHigh - 1))/2;
    assert(uSubscript < (g_uLeafCount.get()*(g_uLeafCount.get() - 1))/2);
    return uSubscript;
}
void SaveCurrentAlignment() { extern TLS<MSA *>ptrBestMSA; static TLS<bool> bCalled(false); if (bCalled.get()) { fprintf(stderr, "\nRecursive call to SaveCurrentAlignment, giving up attempt to save.\n"); exit(EXIT_FatalError); } if (0 == ptrBestMSA.get()) { fprintf(stderr, "\nAlignment not completed, cannot save.\n"); Log("Alignment not completed, cannot save.\n"); exit(EXIT_FatalError); } if (0 == pstrOutputFileName.get()) { fprintf(stderr, "\nOutput file name not specified, cannot save.\n"); exit(EXIT_FatalError); } fprintf(stderr, "\nSaving current alignment ...\n"); TextFile fileOut(pstrOutputFileName.get(), true); ptrBestMSA.get()->ToFASTAFile(fileOut); fprintf(stderr, "Current alignment saved to \"%s\".\n", pstrOutputFileName.get()); Log("Current alignment saved to \"%s\".\n", pstrOutputFileName.get()); }
// Callback invoked by the CIMPLE Indication_Handler<> to deliver a single
// indication. client_data is the Adapter that registered the callback.
//
// Returns false on the final call (null instance), true otherwise so that
// further indications keep being delivered -- including when conversion to
// a CMPI instance fails, since nothing can be done about such failures here.
static bool _indication_proc(Instance* cimple_inst, void* client_data)
{
    TRACE;

    Adapter* const adapter = static_cast<Adapter*>(client_data);

    // A null instance marks the final call: nothing to deliver.
    if (!cimple_inst)
        return false;

    // Convert the CIMPLE instance into a CMPI instance.
    CMPIInstance* cmpi_inst = 0;
    const CMPIrc rc = make_cmpi_instance(
        adapter->broker,
        cimple_inst,
        _INDICATIONS_NAMESPACE,
        0,
        cmpi_inst);

    // Conversion failed: skip this indication but keep the stream alive.
    if (rc != CMPI_RC_OK)
        return true;

    // The CMPI context is kept in thread-specific data.
    const CMPIContext* context = (const CMPIContext*)_context_tls.get();

    CBDeliverIndication(
        adapter->broker, context, _INDICATIONS_NAMESPACE, cmpi_inst);

    return true;
}
SCORE NWDASimple(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, unsigned uLengthB, PWPath &Path) { assert(uLengthB > 0 && uLengthA > 0); const unsigned uPrefixCountA = uLengthA + 1; const unsigned uPrefixCountB = uLengthB + 1; // Allocate DP matrices const size_t LM = uPrefixCountA*uPrefixCountB; SCORE *DPL_ = new SCORE[LM]; SCORE *DPM_ = new SCORE[LM]; SCORE *DPD_ = new SCORE[LM]; SCORE *DPE_ = new SCORE[LM]; SCORE *DPI_ = new SCORE[LM]; SCORE *DPJ_ = new SCORE[LM]; char *TBM_ = new char[LM]; char *TBD_ = new char[LM]; char *TBE_ = new char[LM]; char *TBI_ = new char[LM]; char *TBJ_ = new char[LM]; memset(TBM_, '?', LM); memset(TBD_, '?', LM); memset(TBE_, '?', LM); memset(TBI_, '?', LM); memset(TBJ_, '?', LM); DPM(0, 0) = 0; DPD(0, 0) = MINUS_INFINITY; DPE(0, 0) = MINUS_INFINITY; DPI(0, 0) = MINUS_INFINITY; DPJ(0, 0) = MINUS_INFINITY; DPM(1, 0) = MINUS_INFINITY; DPD(1, 0) = PA[0].m_scoreGapOpen; DPE(1, 0) = PA[0].m_scoreGapOpen2; TBD(1, 0) = 'D'; TBE(1, 0) = 'E'; DPI(1, 0) = MINUS_INFINITY; DPJ(1, 0) = MINUS_INFINITY; DPM(0, 1) = MINUS_INFINITY; DPD(0, 1) = MINUS_INFINITY; DPE(0, 1) = MINUS_INFINITY; DPI(0, 1) = PB[0].m_scoreGapOpen; DPJ(0, 1) = PB[0].m_scoreGapOpen2; TBI(0, 1) = 'I'; TBJ(0, 1) = 'J'; // Empty prefix of B is special case for (unsigned uPrefixLengthA = 2; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { DPM(uPrefixLengthA, 0) = MINUS_INFINITY; DPD(uPrefixLengthA, 0) = DPD(uPrefixLengthA - 1, 0) + g_scoreGapExtend.get(); DPE(uPrefixLengthA, 0) = DPE(uPrefixLengthA - 1, 0) + g_scoreGapExtend2.get(); TBD(uPrefixLengthA, 0) = 'D'; TBE(uPrefixLengthA, 0) = 'E'; DPI(uPrefixLengthA, 0) = MINUS_INFINITY; DPJ(uPrefixLengthA, 0) = MINUS_INFINITY; } // Empty prefix of A is special case for (unsigned uPrefixLengthB = 2; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { DPM(0, uPrefixLengthB) = MINUS_INFINITY; DPD(0, uPrefixLengthB) = MINUS_INFINITY; DPE(0, uPrefixLengthB) = MINUS_INFINITY; DPI(0, uPrefixLengthB) = DPI(0, uPrefixLengthB - 1) + 
g_scoreGapExtend.get(); DPJ(0, uPrefixLengthB) = DPJ(0, uPrefixLengthB - 1) + g_scoreGapExtend2.get(); TBI(0, uPrefixLengthB) = 'I'; TBJ(0, uPrefixLengthB) = 'J'; } // Special case to agree with NWFast, no D-I transitions so... DPD(uLengthA, 0) = MINUS_INFINITY; DPE(uLengthA, 0) = MINUS_INFINITY; // DPI(0, uLengthB) = MINUS_INFINITY; // DPJ(0, uLengthB) = MINUS_INFINITY; // ============ // Main DP loop // ============ SCORE scoreGapCloseB = MINUS_INFINITY; SCORE scoreGapClose2B = MINUS_INFINITY; for (unsigned uPrefixLengthB = 1; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) { const ProfPos &PPB = PB[uPrefixLengthB - 1]; SCORE scoreGapCloseA = MINUS_INFINITY; SCORE scoreGapClose2A = MINUS_INFINITY; for (unsigned uPrefixLengthA = 1; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) { const ProfPos &PPA = PA[uPrefixLengthA - 1]; { // Match M=LetterA+LetterB SCORE scoreLL = ScoreProfPos2(PPA, PPB); DPL(uPrefixLengthA, uPrefixLengthB) = scoreLL; SCORE scoreMM = DPM(uPrefixLengthA-1, uPrefixLengthB-1); SCORE scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseA; SCORE scoreEM = DPE(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapClose2A; SCORE scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseB; SCORE scoreJM = DPJ(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapClose2B; SCORE scoreBest; if (scoreMM >= scoreDM && scoreMM >= scoreEM && scoreMM >= scoreIM && scoreMM >= scoreJM) { scoreBest = scoreMM; TBM(uPrefixLengthA, uPrefixLengthB) = 'M'; } else if (scoreDM >= scoreMM && scoreDM >= scoreEM && scoreDM >= scoreIM && scoreDM >= scoreJM) { scoreBest = scoreDM; TBM(uPrefixLengthA, uPrefixLengthB) = 'D'; } else if (scoreEM >= scoreMM && scoreEM >= scoreDM && scoreEM >= scoreIM && scoreEM >= scoreJM) { scoreBest = scoreEM; TBM(uPrefixLengthA, uPrefixLengthB) = 'E'; } else if (scoreIM >= scoreMM && scoreIM >= scoreDM && scoreIM >= scoreEM && scoreIM >= scoreJM) { scoreBest = scoreIM; TBM(uPrefixLengthA, uPrefixLengthB) = 'I'; } else { assert(scoreJM 
>= scoreMM && scoreJM >= scoreDM && scoreJM >= scoreEM && scoreJM >= scoreIM); scoreBest = scoreJM; TBM(uPrefixLengthA, uPrefixLengthB) = 'J'; } DPM(uPrefixLengthA, uPrefixLengthB) = scoreBest + scoreLL; } { // Delete D=LetterA+GapB SCORE scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) + PA[uPrefixLengthA-1].m_scoreGapOpen; SCORE scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB) + g_scoreGapExtend.get(); SCORE scoreBest; if (scoreMD >= scoreDD) { scoreBest = scoreMD; TBD(uPrefixLengthA, uPrefixLengthB) = 'M'; } else { assert(scoreDD >= scoreMD); scoreBest = scoreDD; TBD(uPrefixLengthA, uPrefixLengthB) = 'D'; } DPD(uPrefixLengthA, uPrefixLengthB) = scoreBest; } { // Delete E=LetterA+GapB SCORE scoreME = DPM(uPrefixLengthA-1, uPrefixLengthB) + PA[uPrefixLengthA-1].m_scoreGapOpen2; SCORE scoreEE = DPE(uPrefixLengthA-1, uPrefixLengthB) + g_scoreGapExtend2.get(); SCORE scoreBest; if (scoreME >= scoreEE) { scoreBest = scoreME; TBE(uPrefixLengthA, uPrefixLengthB) = 'M'; } else { assert(scoreEE >= scoreME); scoreBest = scoreEE; TBE(uPrefixLengthA, uPrefixLengthB) = 'E'; } DPE(uPrefixLengthA, uPrefixLengthB) = scoreBest; } // Insert I=GapA+LetterB { SCORE scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) + PB[uPrefixLengthB - 1].m_scoreGapOpen; SCORE scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1) + g_scoreGapExtend.get(); SCORE scoreBest; if (scoreMI >= scoreII) { scoreBest = scoreMI; TBI(uPrefixLengthA, uPrefixLengthB) = 'M'; } else { assert(scoreII > scoreMI); scoreBest = scoreII; TBI(uPrefixLengthA, uPrefixLengthB) = 'I'; } DPI(uPrefixLengthA, uPrefixLengthB) = scoreBest; } // Insert J=GapA+LetterB { SCORE scoreMJ = DPM(uPrefixLengthA, uPrefixLengthB-1) + PB[uPrefixLengthB - 1].m_scoreGapOpen2; SCORE scoreJJ = DPJ(uPrefixLengthA, uPrefixLengthB-1) + g_scoreGapExtend2.get(); SCORE scoreBest; if (scoreMJ >= scoreJJ) { scoreBest = scoreMJ; TBJ(uPrefixLengthA, uPrefixLengthB) = 'M'; } else { assert(scoreJJ > scoreMJ); scoreBest = scoreJJ; TBJ(uPrefixLengthA, uPrefixLengthB) = 
'J'; } DPJ(uPrefixLengthA, uPrefixLengthB) = scoreBest; } scoreGapCloseA = PPA.m_scoreGapClose; scoreGapClose2A = PPA.m_scoreGapClose2; } scoreGapCloseB = PPB.m_scoreGapClose; scoreGapClose2B = PPB.m_scoreGapClose2; } #if TRACE Log("\n"); Log("DA Simple DPL:\n"); ListDP(DPL_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple DPM:\n"); ListDP(DPM_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple DPD:\n"); ListDP(DPD_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple DPE:\n"); ListDP(DPE_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple DPI:\n"); ListDP(DPI_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple DPJ:\n"); ListDP(DPJ_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple TBM:\n"); ListTB(TBM_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple TBD:\n"); ListTB(TBD_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple TBE:\n"); ListTB(TBE_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple TBI:\n"); ListTB(TBI_, PA, PB, uPrefixCountA, uPrefixCountB); Log("\n"); Log("DA Simple TBJ:\n"); ListTB(TBJ_, PA, PB, uPrefixCountA, uPrefixCountB); #endif // Trace-back // ========== Path.Clear(); // Find last edge SCORE M = DPM(uLengthA, uLengthB); SCORE D = DPD(uLengthA, uLengthB) + PA[uLengthA-1].m_scoreGapClose; SCORE E = DPE(uLengthA, uLengthB) + PA[uLengthA-1].m_scoreGapClose2; SCORE I = DPI(uLengthA, uLengthB) + PB[uLengthB-1].m_scoreGapClose; SCORE J = DPJ(uLengthA, uLengthB) + PB[uLengthB-1].m_scoreGapClose2; char cEdgeType = '?'; SCORE BestScore = M; cEdgeType = 'M'; if (D > BestScore) { cEdgeType = 'D'; BestScore = D; } if (E > BestScore) { cEdgeType = 'E'; BestScore = E; } if (I > BestScore) { cEdgeType = 'I'; BestScore = I; } if (J > BestScore) { cEdgeType = 'J'; BestScore = J; } #if TRACE Log("DA Simple: MAB=%.4g DAB=%.4g EAB=%.4g IAB=%.4g JAB=%.4g best=%c\n", M, D, E, I, J, cEdgeType); #endif unsigned PLA = uLengthA; 
unsigned PLB = uLengthB; for (;;) { PWEdge Edge; Edge.cType = XlatEdgeType(cEdgeType); Edge.uPrefixLengthA = PLA; Edge.uPrefixLengthB = PLB; #if TRACE Log("Prepend %c%d.%d\n", Edge.cType, PLA, PLB); #endif Path.PrependEdge(Edge); switch (cEdgeType) { case 'M': assert(PLA > 0); assert(PLB > 0); cEdgeType = TBM(PLA, PLB); --PLA; --PLB; break; case 'D': assert(PLA > 0); cEdgeType = TBD(PLA, PLB); --PLA; break; case 'E': assert(PLA > 0); cEdgeType = TBE(PLA, PLB); --PLA; break; case 'I': assert(PLB > 0); cEdgeType = TBI(PLA, PLB); --PLB; break; case 'J': assert(PLB > 0); cEdgeType = TBJ(PLA, PLB); --PLB; break; default: Quit("Invalid edge %c", cEdgeType); } if (0 == PLA && 0 == PLB) break; } Path.Validate(); // SCORE Score = TraceBack(PA, uLengthA, PB, uLengthB, DPM_, DPD_, DPI_, Path); #if TRACE SCORE scorePath = FastScorePath2(PA, uLengthA, PB, uLengthB, Path); Path.LogMe(); Log("Score = %s Path = %s\n", LocalScoreToStr(BestScore), LocalScoreToStr(scorePath)); #endif if (g_bKeepSimpleDP.get()) { g_DPM.get() = DPM_; g_DPD.get() = DPD_; g_DPE.get() = DPE_; g_DPI.get() = DPI_; g_DPJ.get() = DPJ_; g_TBM.get() = TBM_; g_TBD.get() = TBD_; g_TBE.get() = TBE_; g_TBI.get() = TBI_; g_TBJ.get() = TBJ_; } else { delete[] DPM_; delete[] DPD_; delete[] DPE_; delete[] DPI_; delete[] DPJ_; delete[] TBM_; delete[] TBD_; delete[] TBE_; delete[] TBI_; delete[] TBJ_; } return BestScore; }
static void ListState() { Log("Dist matrix\n"); Log(" "); for (unsigned i = 0; i < g_uLeafCount.get(); ++i) { if (uInsane == g_uNodeIndex.get()[i]) continue; Log(" %5u", g_uNodeIndex.get()[i]); } Log("\n"); for (unsigned i = 0; i < g_uLeafCount.get(); ++i) { if (uInsane == g_uNodeIndex.get()[i]) continue; Log("%5u ", g_uNodeIndex.get()[i]); for (unsigned j = 0; j < g_uLeafCount.get(); ++j) { if (uInsane == g_uNodeIndex.get()[j]) continue; if (i == j) Log(" "); else { unsigned v = TriangleSubscript(i, j); Log("%5.2g ", g_Dist.get()[v]); } } Log("\n"); } Log("\n"); Log(" i Node NrNb Dist\n"); Log("----- ----- ----- --------\n"); for (unsigned i = 0; i < g_uLeafCount.get(); ++i) { if (uInsane == g_uNodeIndex.get()[i]) continue; Log("%5u %5u %5u %8.3f\n", i, g_uNodeIndex.get()[i], g_uNearestNeighbor.get()[i], g_MinDist.get()[i]); } Log("\n"); Log(" Node L R Height LLength RLength\n"); Log("----- ----- ----- ------ ------- -------\n"); for (unsigned i = 0; i <= g_uInternalNodeIndex.get(); ++i) Log("%5u %5u %5u %6.2g %6.2g %6.2g\n", i, g_uLeft.get()[i], g_uRight.get()[i], g_Height.get()[i], g_LeftLength.get()[i], g_RightLength.get()[i]); }
void UPGMA2(const DistCalc &DC, Tree &tree, LINKAGE Linkage) { g_uLeafCount.get() = DC.GetCount(); g_uTriangleSize.get() = (g_uLeafCount.get()*(g_uLeafCount.get() - 1))/2; g_uInternalNodeCount.get() = g_uLeafCount.get() - 1; g_Dist.get() = new dist_t[g_uTriangleSize.get()]; g_uNodeIndex.get() = new unsigned[g_uLeafCount.get()]; g_uNearestNeighbor.get() = new unsigned[g_uLeafCount.get()]; g_MinDist.get() = new dist_t[g_uLeafCount.get()]; unsigned *Ids = new unsigned [g_uLeafCount.get()]; char **Names = new char *[g_uLeafCount.get()]; g_uLeft.get() = new unsigned[g_uInternalNodeCount.get()]; g_uRight.get() = new unsigned[g_uInternalNodeCount.get()]; g_Height.get() = new dist_t[g_uInternalNodeCount.get()]; g_LeftLength.get() = new dist_t[g_uInternalNodeCount.get()]; g_RightLength.get() = new dist_t[g_uInternalNodeCount.get()]; for (unsigned i = 0; i < g_uLeafCount.get(); ++i) { g_MinDist.get()[i] = BIG_DIST; g_uNodeIndex.get()[i] = i; g_uNearestNeighbor.get()[i] = uInsane; Ids[i] = DC.GetId(i); Names[i] = strsave(DC.GetName(i)); } for (unsigned i = 0; i < g_uInternalNodeCount.get(); ++i) { g_uLeft.get()[i] = uInsane; g_uRight.get()[i] = uInsane; g_LeftLength.get()[i] = BIG_DIST; g_RightLength.get()[i] = BIG_DIST; g_Height.get()[i] = BIG_DIST; } // Compute initial NxN triangular distance matrix. // Store minimum distance for each full (not triangular) row. // Loop from 1, not 0, because "row" is 0, 1 ... i-1, // so nothing to do when i=0. 
for (unsigned i = 1; i < g_uLeafCount.get(); ++i) { dist_t *Row = g_Dist.get() + TriangleSubscript(i, 0); DC.CalcDistRange(i, Row); for (unsigned j = 0; j < i; ++j) { const dist_t d = Row[j]; if (d < g_MinDist.get()[i]) { g_MinDist.get()[i] = d; g_uNearestNeighbor.get()[i] = j; } if (d < g_MinDist.get()[j]) { g_MinDist.get()[j] = d; g_uNearestNeighbor.get()[j] = i; } } } #if TRACE Log("Initial state:\n"); ListState(); #endif for (g_uInternalNodeIndex.get() = 0; g_uInternalNodeIndex.get() < g_uLeafCount.get() - 1; ++g_uInternalNodeIndex.get()) { #if TRACE Log("\n"); Log("Internal node index %5u\n", g_uInternalNodeIndex.get()); Log("-------------------------\n"); #endif // Find nearest neighbors unsigned Lmin = uInsane; unsigned Rmin = uInsane; dist_t dtMinDist = BIG_DIST; for (unsigned j = 0; j < g_uLeafCount.get(); ++j) { if (uInsane == g_uNodeIndex.get()[j]) continue; dist_t d = g_MinDist.get()[j]; if (d < dtMinDist) { dtMinDist = d; Lmin = j; Rmin = g_uNearestNeighbor.get()[j]; assert(uInsane != Rmin); assert(uInsane != g_uNodeIndex.get()[Rmin]); } } assert(Lmin != uInsane); assert(Rmin != uInsane); assert(dtMinDist != BIG_DIST); #if TRACE Log("Nearest neighbors Lmin %u[=%u] Rmin %u[=%u] dist %.3g\n", Lmin, g_uNodeIndex.get()[Lmin], Rmin, g_uNodeIndex.get()[Rmin], dtMinDist); #endif // Compute distances to new node // New node overwrites row currently assigned to Lmin dist_t dtNewMinDist = BIG_DIST; unsigned uNewNearestNeighbor = uInsane; for (unsigned j = 0; j < g_uLeafCount.get(); ++j) { if (j == Lmin || j == Rmin) continue; if (uInsane == g_uNodeIndex.get()[j]) continue; const unsigned vL = TriangleSubscript(Lmin, j); const unsigned vR = TriangleSubscript(Rmin, j); const dist_t dL = g_Dist.get()[vL]; const dist_t dR = g_Dist.get()[vR]; dist_t dtNewDist; switch (Linkage) { case LINKAGE_Avg: dtNewDist = AVG(dL, dR); break; case LINKAGE_Min: dtNewDist = MIN(dL, dR); break; case LINKAGE_Max: dtNewDist = MAX(dL, dR); break; case LINKAGE_Biased: dtNewDist = 
g_dSUEFF.get()*AVG(dL, dR) + (1 - g_dSUEFF.get())*MIN(dL, dR); break; default: Quit("UPGMA2: Invalid LINKAGE_%u", Linkage); } // Nasty special case. // If nearest neighbor of j is Lmin or Rmin, then make the new // node (which overwrites the row currently occupied by Lmin) // the nearest neighbor. This situation can occur when there are // equal distances in the matrix. If we don't make this fix, // the nearest neighbor pointer for j would become invalid. // (We don't need to test for == Lmin, because in that case // the net change needed is zero due to the change in row // numbering). if (g_uNearestNeighbor.get()[j] == Rmin) g_uNearestNeighbor.get()[j] = Lmin; #if TRACE Log("New dist to %u = (%u/%.3g + %u/%.3g)/2 = %.3g\n", j, Lmin, dL, Rmin, dR, dtNewDist); #endif g_Dist.get()[vL] = dtNewDist; if (dtNewDist < dtNewMinDist) { dtNewMinDist = dtNewDist; uNewNearestNeighbor = j; } } assert(g_uInternalNodeIndex.get() < g_uLeafCount.get() - 1 || BIG_DIST != dtNewMinDist); assert(g_uInternalNodeIndex.get() < g_uLeafCount.get() - 1 || uInsane != uNewNearestNeighbor); const unsigned v = TriangleSubscript(Lmin, Rmin); const dist_t dLR = g_Dist.get()[v]; const dist_t dHeightNew = dLR/2; const unsigned uLeft = g_uNodeIndex.get()[Lmin]; const unsigned uRight = g_uNodeIndex.get()[Rmin]; const dist_t HeightLeft = uLeft < g_uLeafCount.get() ? 0 : g_Height.get()[uLeft - g_uLeafCount.get()]; const dist_t HeightRight = uRight < g_uLeafCount.get() ? 
0 : g_Height.get()[uRight - g_uLeafCount.get()]; g_uLeft.get()[g_uInternalNodeIndex.get()] = uLeft; g_uRight.get()[g_uInternalNodeIndex.get()] = uRight; g_LeftLength.get()[g_uInternalNodeIndex.get()] = dHeightNew - HeightLeft; g_RightLength.get()[g_uInternalNodeIndex.get()] = dHeightNew - HeightRight; g_Height.get()[g_uInternalNodeIndex.get()] = dHeightNew; // Row for left child overwritten by row for new node g_uNodeIndex.get()[Lmin] = g_uLeafCount.get() + g_uInternalNodeIndex.get(); g_uNearestNeighbor.get()[Lmin] = uNewNearestNeighbor; g_MinDist.get()[Lmin] = dtNewMinDist; // Delete row for right child g_uNodeIndex.get()[Rmin] = uInsane; #if TRACE Log("\nInternalNodeIndex=%u Lmin=%u Rmin=%u\n", g_uInternalNodeIndex.get(), Lmin, Rmin); ListState(); #endif } unsigned uRoot = g_uLeafCount.get() - 2; tree.Create(g_uLeafCount.get(), uRoot, g_uLeft.get(), g_uRight.get(), g_LeftLength.get(), g_RightLength.get(), Ids, Names); #if TRACE tree.LogMe(); #endif delete[] g_Dist.get(); delete[] g_uNodeIndex.get(); delete[] g_uNearestNeighbor.get(); delete[] g_MinDist.get(); delete[] g_Height.get(); delete[] g_uLeft.get(); delete[] g_uRight.get(); delete[] g_LeftLength.get(); delete[] g_RightLength.get(); for (unsigned i = 0; i < g_uLeafCount.get(); ++i) free(Names[i]); delete[] Names; delete[] Ids; }
// WARNING: Sequences MUST be stripped of gaps and upper case! void DistKmer20_3(const SeqVect &v, DistFunc &DF) { const unsigned uSeqCount = v.Length(); DF.SetCount(uSeqCount); if (0 == uSeqCount) return; for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) { DF.SetDist(uSeq1, uSeq1, 0); for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) DF.SetDist(uSeq1, uSeq2, 0); } const unsigned uTripleArrayBytes = TRIPLE_COUNT*sizeof(TripleCount); TripleCounts.get() = (TripleCount *) malloc(uTripleArrayBytes); if (0 == TripleCounts.get()) Quit("Not enough memory (TripleCounts)"); memset(TripleCounts.get(), 0, uTripleArrayBytes); for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord) { TripleCount &tc = *(TripleCounts.get() + uWord); const unsigned uBytes = uSeqCount*sizeof(short); tc.m_Counts = (unsigned short *) malloc(uBytes); memset(tc.m_Counts, 0, uBytes); } for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq &s = *(v[uSeqIndex]); const unsigned uSeqLength = s.Length(); for (unsigned uPos = 0; uPos < uSeqLength - 2; ++uPos) { const unsigned uLetter1 = CharToLetterEx(s[uPos]); if (uLetter1 >= 20) continue; const unsigned uLetter2 = CharToLetterEx(s[uPos+1]); if (uLetter2 >= 20) continue; const unsigned uLetter3 = CharToLetterEx(s[uPos+2]); if (uLetter3 >= 20) continue; const unsigned uWord = uLetter1 + uLetter2*20 + uLetter3*20*20; assert(uWord < TRIPLE_COUNT); TripleCount &tc = *(TripleCounts.get() + uWord); const unsigned uOldCount = tc.m_Counts[uSeqIndex]; if (0 == uOldCount) ++(tc.m_uSeqCount); ++(tc.m_Counts[uSeqIndex]); } } #if TRACE { Log("TripleCounts\n"); unsigned uGrandTotal = 0; for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord) { const TripleCount &tc = *(TripleCounts.get() + uWord); if (0 == tc.m_uSeqCount) continue; const unsigned uLetter3 = uWord/(20*20); const unsigned uLetter2 = (uWord - uLetter3*20*20)/20; const unsigned uLetter1 = uWord%20; Log("Word %6u %c%c%c %6u", uWord, LetterToCharAmino(uLetter1), LetterToCharAmino(uLetter2), 
LetterToCharAmino(uLetter3), tc.m_uSeqCount); unsigned uSeqCountWithThisWord = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const unsigned uCount = tc.m_Counts[uSeqIndex]; if (uCount > 0) { ++uSeqCountWithThisWord; Log(" %u=%u", uSeqIndex, uCount); uGrandTotal += uCount; } } if (uSeqCountWithThisWord != tc.m_uSeqCount) Log(" *** SQ ERROR *** %u %u", tc.m_uSeqCount, uSeqCountWithThisWord); Log("\n"); } unsigned uTotalBySeqLength = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq &s = *(v[uSeqIndex]); const unsigned uSeqLength = s.Length(); uTotalBySeqLength += uSeqLength - 2; } if (uGrandTotal != uTotalBySeqLength) Log("*** TOTALS DISAGREE *** %u %u\n", uGrandTotal, uTotalBySeqLength); } #endif const unsigned uSeqListBytes = uSeqCount*sizeof(unsigned); unsigned short *SeqList = (unsigned short *) malloc(uSeqListBytes); for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord) { const TripleCount &tc = *(TripleCounts.get() + uWord); if (0 == tc.m_uSeqCount) continue; unsigned uSeqCountFound = 0; memset(SeqList, 0, uSeqListBytes); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { if (tc.m_Counts[uSeqIndex] > 0) { SeqList[uSeqCountFound] = uSeqIndex; ++uSeqCountFound; if (uSeqCountFound == tc.m_uSeqCount) break; } } assert(uSeqCountFound == tc.m_uSeqCount); for (unsigned uSeq1 = 0; uSeq1 < uSeqCountFound; ++uSeq1) { const unsigned uSeqIndex1 = SeqList[uSeq1]; const unsigned uCount1 = tc.m_Counts[uSeqIndex1]; for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) { const unsigned uSeqIndex2 = SeqList[uSeq2]; const unsigned uCount2 = tc.m_Counts[uSeqIndex2]; const unsigned uMinCount = uCount1 < uCount2 ? 
uCount1 : uCount2; const double d = DF.GetDist(uSeqIndex1, uSeqIndex2); DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (d + uMinCount)); } } } delete[] SeqList; free(TripleCounts.get()); unsigned uDone = 0; const unsigned uTotal = (uSeqCount*(uSeqCount - 1))/2; for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) { DF.SetDist(uSeq1, uSeq1, 0.0); const Seq &s1 = *(v[uSeq1]); const unsigned uLength1 = s1.Length(); for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) { const Seq &s2 = *(v[uSeq2]); const unsigned uLength2 = s2.Length(); unsigned uMinLength = uLength1 < uLength2 ? uLength1 : uLength2; if (uMinLength < 3) { DF.SetDist(uSeq1, uSeq2, 1.0); continue; } const double dTripleCount = DF.GetDist(uSeq1, uSeq2); if (dTripleCount == 0) { DF.SetDist(uSeq1, uSeq2, 1.0); continue; } double dNormalizedTripletScore = dTripleCount/(uMinLength - 2); //double dEstimatedPairwiseIdentity = exp(0.3912*log(dNormalizedTripletScore)); //if (dEstimatedPairwiseIdentity > 1) // dEstimatedPairwiseIdentity = 1; // DF.SetDist(uSeq1, uSeq2, (float) (1.0 - dEstimatedPairwiseIdentity)); DF.SetDist(uSeq1, uSeq2, (float) dNormalizedTripletScore); #if TRACE { Log("%s - %s Triplet count = %g Lengths %u, %u Estimated pwid = %g\n", s1.GetName(), s2.GetName(), dTripleCount, uLength1, uLength2, dEstimatedPairwiseIdentity); } #endif if (uDone%1000 == 0) Progress(uDone, uTotal); } } ProgressStepsDone(); }
static SCORE ScoreSeqPair(const MSA &msa1, unsigned uSeqIndex1, const MSA &msa2, unsigned uSeqIndex2, SCORE *ptrLetters, SCORE *ptrGaps) { g_ptrMSA1.get() = &msa1; g_ptrMSA2.get() = &msa2; g_uSeqIndex1.get() = uSeqIndex1; g_uSeqIndex2.get() = uSeqIndex2; const unsigned uColCount = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); if (uColCount != uColCount2) Quit("ScoreSeqPair, different lengths"); #if TRACE Log("ScoreSeqPair\n"); Log("%16.16s ", msa1.GetSeqName(uSeqIndex1)); for (unsigned i = 0; i < uColCount; ++i) Log("%c", msa1.GetChar(uSeqIndex1, i)); Log("\n"); Log("%16.16s ", msa2.GetSeqName(uSeqIndex2)); for (unsigned i = 0; i < uColCount; ++i) Log("%c", msa1.GetChar(uSeqIndex2, i)); Log("\n"); #endif SCORE scoreTotal = 0; // Substitution scores unsigned uFirstLetter1 = uInsane; unsigned uFirstLetter2 = uInsane; unsigned uLastLetter1 = uInsane; unsigned uLastLetter2 = uInsane; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex); bool bWildcard1 = msa1.IsWildcard(uSeqIndex1, uColIndex); bool bWildcard2 = msa2.IsWildcard(uSeqIndex2, uColIndex); if (!bGap1) { if (uInsane == uFirstLetter1) uFirstLetter1 = uColIndex; uLastLetter1 = uColIndex; } if (!bGap2) { if (uInsane == uFirstLetter2) uFirstLetter2 = uColIndex; uLastLetter2 = uColIndex; } if (bGap1 || bGap2 || bWildcard1 || bWildcard2) continue; unsigned uLetter1 = msa1.GetLetter(uSeqIndex1, uColIndex); unsigned uLetter2 = msa2.GetLetter(uSeqIndex2, uColIndex); SCORE scoreMatch = (*g_ptrScoreMatrix.get())[uLetter1][uLetter2]; scoreTotal += scoreMatch; #if TRACE Log("%c <-> %c = %7.1f %10.1f\n", msa1.GetChar(uSeqIndex1, uColIndex), msa2.GetChar(uSeqIndex2, uColIndex), scoreMatch, scoreTotal); #endif } *ptrLetters = scoreTotal; // Gap penalties unsigned uGapLength = uInsane; unsigned uGapStartCol = uInsane; bool bGapping1 = false; bool bGapping2 = false; for (unsigned uColIndex = 0; 
uColIndex < uColCount; ++uColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex); if (bGap1 && bGap2) continue; if (bGapping1) { if (bGap1) ++uGapLength; else { bGapping1 = false; bool bNTerm = (uFirstLetter2 == uGapStartCol); bool bCTerm = (uLastLetter2 + 1 == uColIndex); SCORE scoreGap = GapPenalty(uGapLength, bNTerm || bCTerm); scoreTotal += scoreGap; #if TRACE LogGap(uGapStartCol, uColIndex - 1, uGapLength, bNTerm, bCTerm); Log("GAP %7.1f %10.1f\n", scoreGap, scoreTotal); #endif } continue; } else { if (bGap1) { uGapStartCol = uColIndex; bGapping1 = true; uGapLength = 1; continue; } } if (bGapping2) { if (bGap2) ++uGapLength; else { bGapping2 = false; bool bNTerm = (uFirstLetter1 == uGapStartCol); bool bCTerm = (uLastLetter1 + 1 == uColIndex); SCORE scoreGap = GapPenalty(uGapLength, bNTerm || bCTerm); scoreTotal += scoreGap; #if TRACE LogGap(uGapStartCol, uColIndex - 1, uGapLength, bNTerm, bCTerm); Log("GAP %7.1f %10.1f\n", scoreGap, scoreTotal); #endif } } else { if (bGap2) { uGapStartCol = uColIndex; bGapping2 = true; uGapLength = 1; } } } if (bGapping1 || bGapping2) { SCORE scoreGap = GapPenalty(uGapLength, true); scoreTotal += scoreGap; #if TRACE LogGap(uGapStartCol, uColCount - 1, uGapLength, false, true); Log("GAP %7.1f %10.1f\n", scoreGap, scoreTotal); #endif } *ptrGaps = scoreTotal - *ptrLetters; return scoreTotal; }
// Registers msa as the alignment that SaveCurrentAlignment() will write.
// Only the address is stored (per thread); no copy of the alignment is made.
void SetCurrentAlignment(MSA &msa)
{
    extern TLS<MSA *>ptrBestMSA;

    MSA *const ptrCurrent = &msa;
    ptrBestMSA.get() = ptrCurrent;
}
// Registers the file name that SaveCurrentAlignment() will write to.
// Only the pointer is stored (per thread); the string itself is not copied.
void SetOutputFileName(const char *out)
{
    const char *const pstrName = out;
    pstrOutputFileName.get() = pstrName;
}