static void ssFindBest(struct ffAli *ffList, bioSeq *qSeq, bioSeq *tSeq,
        enum ffStringency stringency, boolean isProt, struct trans3 *t3List,
        struct ffAli **retBestAli, int *retScore, struct ffAli **retLeftovers)
/* String together blocks in alignment into chains.
 *
 * This routine only chooses an implementation: a list holding fewer than
 * 10 blocks (as counted by ffAliCount) goes to ssFindBestSmall, anything
 * larger goes to ssFindBestBig.  Every argument, including the three
 * ret* output pointers, is forwarded unchanged. */
{
boolean isBig = (ffAliCount(ffList) >= 10);

if (!isBig)
    {
    ssFindBestSmall(ffList, qSeq, tSeq, stringency, isProt, t3List,
        retBestAli, retScore, retLeftovers);
    return;
    }
ssFindBestBig(ffList, qSeq, tSeq, stringency, isProt, t3List,
    retBestAli, retScore, retLeftovers);
}
static struct ffAli *trimFlakyEnds(struct dnaSeq *qSeq, struct dnaSeq *tSeq, struct ffAli *ffList) /* Get rid of small initial and terminal exons that seem to just * be chance alignments. Looks for splice sites and non-degenerate * sequence to keep things. */ { int orientation = ffIntronOrientation(ffList); struct ffAli *left, *right; char *iStart, *iEnd; int blockScore, gapPenalty; /* If one or less block then don't bother. */ if (ffAliCount(ffList) < 2) return ffList; /* Trim beginnings. */ left = ffList; right = ffList->right; while (right != NULL) { blockScore = ffScoreMatch(left->nStart, left->hStart, left->nEnd-left->nStart); blockScore -= aPenalty(left->nStart, left->nEnd - left->nStart); iStart = left->hEnd; iEnd = right->hStart; gapPenalty = trimGapPenalty(iEnd-iStart, right->nStart - left->nEnd, iStart, iEnd, orientation); if (gapPenalty >= blockScore) { freeMem(left); ffList = right; right->left = NULL; } else break; left = right; right = right->right; } right = ffRightmost(ffList); if (right == ffList) return ffList; left = right->left; while (left != NULL) { blockScore = ffScoreMatch(right->nStart, right->hStart, right->nEnd-right->nStart); blockScore -= aPenalty(right->nStart, right->nEnd - right->nStart); iStart = left->hEnd; iEnd = right->hStart; gapPenalty = trimGapPenalty(iEnd-iStart, right->nStart - left->nEnd, iStart, iEnd, orientation); if (gapPenalty >= blockScore) { freeMem(right); left->right = NULL; } else break; right = left; left = left->left; } return ffList; }
void oneAli(struct ffAli *left, struct dnaSeq *otherSeq,
        struct repeatTracker *rt, boolean isRc, enum ffStringency stringency, FILE *out)
/* Analyse one alignment and if it looks good enough write it out to file.
 *
 * left      - leftmost block of the alignment; the list is walked via ->right.
 * otherSeq  - the needle sequence (ff->nStart/nEnd point into otherSeq->dna).
 * rt        - supplies the haystack sequence (rt->seq) and a per-base byte
 *             array (rt->repBytes); a match at a position whose byte is
 *             non-zero is counted as repMatch instead of matchCount.
 * isRc      - if TRUE the needle coordinates are flipped relative to
 *             otherSeq->size before printing and the strand is '-'.
 * stringency - only compared against ffCdna, for calcMilliBad.
 * out       - destination; one tab-separated line is written when the
 *             alignment passes the filter.
 *
 * The pass/fail thresholds use veryTight, maxBad and minBases, which are
 * defined outside this function.  Aborts via errAbort if the stream reports
 * a write error. */
{
struct dnaSeq *genoSeq = rt->seq;
UBYTE *repBytes = rt->repBytes;
struct ffAli *ff, *nextFf;
struct ffAli *right = ffRightmost(left);
DNA *needle = otherSeq->dna;
DNA *hay = genoSeq->dna;
int nStart = left->nStart - needle;
int nEnd = right->nEnd - needle;
int hStart = left->hStart - hay;
int hEnd = right->hEnd - hay;
int nInsertBaseCount = 0;
int nInsertCount = 0;
int hInsertBaseCount = 0;
int hInsertCount = 0;
int matchCount = 0;
int mismatchCount = 0;
int repMatch = 0;
int countNs = 0;
DNA *np, *hp, n, h;
int blockSize;
int i;
int milliBad;
int passIt;

/* Count up matches, mismatches, inserts, etc. */
for (ff = left; ff != NULL; ff = nextFf)
    {
    int blockHStart;    /* Offset of this block within the haystack. */
    nextFf = ff->right;
    blockSize = ff->nEnd - ff->nStart;
    np = ff->nStart;
    hp = ff->hStart;
    blockHStart = hp - hay;
    for (i=0; i<blockSize; ++i)
        {
        n = np[i];
        h = hp[i];
        if (n == 'n' || h == 'n')
            ++countNs;
        else
            {
            if (n == h)
                {
                if (repBytes[i+blockHStart])
                    ++repMatch;
                else
                    ++matchCount;
                }
            else
                ++mismatchCount;
            }
        }
    if (nextFf != NULL)
        {
        /* Any space between consecutive blocks is an insert on that side. */
        if (ff->nEnd != nextFf->nStart)
            {
            ++nInsertCount;
            nInsertBaseCount += nextFf->nStart - ff->nEnd;
            }
        if (ff->hEnd != nextFf->hStart)
            {
            ++hInsertCount;
            hInsertBaseCount += nextFf->hStart - ff->hEnd;
            }
        }
    }

/* See if it looks good enough to output. */
milliBad = calcMilliBad(nEnd - nStart, hEnd - hStart, nInsertCount, hInsertCount,
        matchCount, repMatch, mismatchCount, stringency == ffCdna);
if (veryTight)
    {
    passIt = (milliBad < 60 &&
        (matchCount >= 25 ||
         (matchCount >= 15 && matchCount + repMatch >= 50) ||
         (matchCount >= 5 && repMatch >= 100 && milliBad < 50)));
    }
else
    {
    passIt = (milliBad < maxBad &&
        (matchCount >= minBases ||
         (matchCount >= minBases/2 && matchCount + repMatch >= 2*minBases) ||
         (repMatch >= 4*minBases && milliBad < (maxBad/2))));
    }
if (passIt)
    {
    if (isRc)
        {
        int temp;
        int oSize = otherSeq->size;
        temp = nStart;
        nStart = oSize - nEnd;
        nEnd = oSize - temp;
        }
    fprintf(out, "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t"
                 "%c\t"
                 "%s\t%d\t%d\t%d\t"
                 "%s\t%d\t%d\t%d\t%d\t",
        matchCount, mismatchCount, repMatch, countNs,
        nInsertCount, nInsertBaseCount, hInsertCount, hInsertBaseCount,
        (isRc ? '-' : '+'),
        otherSeq->name, otherSeq->size, nStart, nEnd,
        genoSeq->name, genoSeq->size, hStart, hEnd,
        ffAliCount(left));
    /* The values below are pointer differences (ptrdiff_t).  Passing them
     * straight to "%d" is a format/argument mismatch, so convert to long
     * and print with "%ld", the same way savePslx does. */
    for (ff = left; ff != NULL; ff = ff->right)
        fprintf(out, "%ld,", (long)(ff->nEnd - ff->nStart));
    fprintf(out, "\t");
    for (ff = left; ff != NULL; ff = ff->right)
        fprintf(out, "%ld,", (long)(ff->nStart - needle));
    fprintf(out, "\t");
    for (ff = left; ff != NULL; ff = ff->right)
        fprintf(out, "%ld,", (long)(ff->hStart - hay));
    fprintf(out, "\n");
    if (ferror(out))
        {
        perror("");
        errAbort("Write error to .psl");
        }
    }
}
static void savePslx(char *chromName, int chromSize, int chromOffset,
        struct ffAli *ali, struct dnaSeq *tSeq, struct dnaSeq *qSeq, boolean isRc,
        enum ffStringency stringency, int minMatch, FILE *f,
        struct hash *t3Hash, boolean reportTargetStrand, boolean targetIsRc,
        struct hash *maskHash, int minIdentity, boolean qIsProt, boolean tIsProt,
        boolean saveSeq)
/* Analyse one alignment and if it looks good enough write it out to file in
 * psl format (or pslX format - if saveSeq is TRUE).
 *
 * Derived from the psLayout output routine and extended to cope with
 * protein as well as DNA alignments.
 *
 * Target coordinates are obtained through trans3GenoPos and shifted by
 * chromOffset.  When maskHash is given, the bit array stored under
 * tSeq->name decides whether a matching base is tallied as repMatch or as
 * matchCount.  When t3Hash is given, the trans3 list stored under
 * tSeq->name is passed to trans3GenoPos.  Nothing is written unless the
 * identity figure computed below reaches minIdentity.
 *
 * minMatch, qIsProt and tIsProt are accepted but not consulted here (the
 * minMatch test was moved to a higher level). */
{
struct ffAli *block;
struct ffAli *lastBlock = ffRightmost(ali);
DNA *needle = qSeq->dna;
DNA *hay = tSeq->dna;
int nStart = ali->nStart - needle;
int nEnd = lastBlock->nEnd - needle;
int hStart, hEnd;
int matchCount = 0, mismatchCount = 0, repMatch = 0, countNs = 0;
int nInsertCount = 0, nInsertBaseCount = 0;
int hInsertCount = 0, hInsertBaseCount = 0;
struct trans3 *t3List = NULL;
Bits *maskBits = NULL;
int gaps, id;

if (maskHash != NULL)
    maskBits = hashMustFindVal(maskHash, tSeq->name);
if (t3Hash != NULL)
    t3List = hashMustFindVal(t3Hash, tSeq->name);
hStart = trans3GenoPos(ali->hStart, tSeq, t3List, FALSE) + chromOffset;
hEnd = trans3GenoPos(lastBlock->hEnd, tSeq, t3List, TRUE) + chromOffset;

/* Tally matches, mismatches, N's and inserts block by block. */
for (block = ali; block != NULL; block = block->right)
    {
    struct ffAli *next = block->right;
    DNA *np = block->nStart;
    DNA *hp = block->hStart;
    int blockSize = block->nEnd - block->nStart;
    int i;

    for (i = 0; i < blockSize; ++i)
        {
        DNA n = np[i];
        DNA h = hp[i];
        if (n == 'n' || h == 'n')
            ++countNs;
        else if (n != h)
            ++mismatchCount;
        else if (maskBits != NULL && bitReadOne(maskBits, (int)(hp + i - hay)))
            ++repMatch;
        else
            ++matchCount;
        }

    if (next != NULL)
        {
        /* Target gap is measured in trans3GenoPos coordinates. */
        int nextHStart = trans3GenoPos(next->hStart, tSeq, t3List, FALSE) + chromOffset;
        int thisHEnd = trans3GenoPos(block->hEnd, tSeq, t3List, TRUE) + chromOffset;
        int hGap = nextHStart - thisHEnd;
        int nGap = next->nStart - block->nEnd;
        if (nGap != 0)
            {
            ++nInsertCount;
            nInsertBaseCount += nGap;
            }
        if (hGap != 0)
            {
            ++hInsertCount;
            hInsertBaseCount += hGap;
            }
        }
    }

/* Identity check.  Target inserts are not counted as gaps when stringency
 * is ffCdna.  (The score >= minMatch test lives at a higher level.) */
gaps = nInsertCount;
if (stringency != ffCdna)
    gaps += hInsertCount;
id = roundingScale(1000, matchCount + repMatch - 2*gaps,
        matchCount + repMatch + mismatchCount);
if (id < minIdentity)
    return;

/* Flip coordinates for reverse-complemented query and/or target. */
if (isRc)
    {
    int oldStart = nStart;
    int qSize = qSeq->size;
    nStart = qSize - nEnd;
    nEnd = qSize - oldStart;
    }
if (targetIsRc)
    {
    int oldStart = hStart;
    hStart = chromSize - hEnd;
    hEnd = chromSize - oldStart;
    }

/* Counts and strand(s). */
fprintf(f, "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c",
    matchCount, mismatchCount, repMatch, countNs,
    nInsertCount, nInsertBaseCount, hInsertCount, hInsertBaseCount,
    (isRc ? '-' : '+'));
if (reportTargetStrand)
    fprintf(f, "%c", (targetIsRc ? '-' : '+') );

/* Names, sizes, ranges and block count. */
fprintf(f, "\t%s\t%d\t%d\t%d\t"
           "%s\t%d\t%d\t%d\t%d\t",
    qSeq->name, qSeq->size, nStart, nEnd,
    chromName, chromSize, hStart, hEnd,
    ffAliCount(ali));

/* Comma-terminated lists: block sizes, query starts, target starts. */
for (block = ali; block != NULL; block = block->right)
    fprintf(f, "%ld,", (long)(block->nEnd - block->nStart));
fprintf(f, "\t");
for (block = ali; block != NULL; block = block->right)
    fprintf(f, "%ld,", (long)(block->nStart - needle));
fprintf(f, "\t");
for (block = ali; block != NULL; block = block->right)
    fprintf(f, "%d,", trans3GenoPos(block->hStart, tSeq, t3List, FALSE) + chromOffset);

/* pslX extension: raw query and target bytes of each block. */
if (saveSeq)
    {
    fputc('\t', f);
    for (block = ali; block != NULL; block = block->right)
        {
        mustWrite(f, block->nStart, block->nEnd - block->nStart);
        fputc(',', f);
        }
    fputc('\t', f);
    for (block = ali; block != NULL; block = block->right)
        {
        mustWrite(f, block->hStart, block->hEnd - block->hStart);
        fputc(',', f);
        }
    }
fprintf(f, "\n");
if (ferror(f))
    {
    perror("");
    errAbort("Write error to .psl");
    }
}
static struct ssGraph *ssGraphMake(struct ffAli *ffList, bioSeq *qSeq,
        enum ffStringency stringency, boolean isProt, struct trans3 *t3List)
/* Make a graph corresponding to ffList.
 *
 * One node is created per block, stored at indexes 1..nodeCount of the
 * node array; index 0 is left as allocated and acts as the source that
 * every real node receives an edge from.  For each block 'mid', an
 * additional incoming edge is made from every earlier block in the list
 * that may precede it (tripleCanFollow when t3List is supplied, otherwise
 * strictly smaller start and end on both sequences).  An edge's score is
 * the score of arriving at 'mid' less ffCalcGapPenalty for the gap; when
 * the two blocks overlap on the needle, a crossover point from
 * findCrossover is recorded and the score is recomputed around it.
 *
 * The returned graph owns the node and edge arrays allocated here. */
{
int nodeCount = ffAliCount(ffList);
int maxEdgeCount = (nodeCount == 1) ? 1 : (nodeCount+1)*(nodeCount)/2;
int edgesUsed = 0;
struct ssEdge *edges, *edge;
struct ssNode *nodes;
struct ssGraph *graph;
struct ffAli *prev, *mid;
int prevIx, midIx;

AllocVar(graph);
graph->nodeCount = nodeCount;
graph->nodes = AllocArray(nodes, nodeCount+1);

/* Fill in nodes 1..nodeCount with their block and stand-alone score. */
prev = ffList;
for (prevIx = 1; prevIx <= nodeCount; ++prevIx)
    {
    nodes[prevIx].ff = prev;
    nodes[prevIx].nodeScore = bioScoreMatch(isProt, prev->nStart, prev->hStart,
        prev->hEnd - prev->hStart);
    prev = prev->right;
    }

graph->edges = AllocArray(edges, maxEdgeCount);
for (mid = ffList, midIx = 1; mid != NULL; mid = mid->right, ++midIx)
    {
    struct ssNode *midNode = &nodes[midIx];
    int midScore = midNode->nodeScore;

    /* Edge from the source node: arriving with nothing before it. */
    edge = &edges[edgesUsed++];
    assert(edgesUsed <= maxEdgeCount);
    edge->nodeIn = &nodes[0];
    edge->score = midScore;
    midNode->waysIn = edge;

    /* Consider every block listed before mid as a predecessor. */
    for (prev = ffList, prevIx = 1; prev != mid; prev = prev->right, ++prevIx)
        {
        int mhStart = 0, mhEnd = 0;
        boolean canFollow;
        struct ssNode *prevNode;
        int score, hGap, nGap, overlap;

        if (t3List)
            {
            canFollow = tripleCanFollow(prev, mid, qSeq, t3List);
            trans3Offsets(t3List, mid->hStart, mid->hEnd, &mhStart, &mhEnd);
            }
        else
            {
            canFollow = (prev->nStart < mid->nStart && prev->nEnd < mid->nEnd
                && prev->hStart < mid->hStart && prev->hEnd < mid->hEnd);
            }
        if (!canFollow)
            continue;

        prevNode = &nodes[prevIx];
        nGap = mid->nStart - prev->nEnd;
        if (t3List)
            {
            int fhStart, fhEnd;
            trans3Offsets(t3List, prev->hStart, prev->hEnd, &fhStart, &fhEnd);
            hGap = mhStart - fhEnd;
            }
        else
            {
            hGap = mid->hStart - prev->hEnd;
            }

        edge = &edges[edgesUsed++];
        assert(edgesUsed <= maxEdgeCount);
        edge->nodeIn = prevNode;
        overlap = -nGap;
        edge->overlap = overlap;
        if (overlap > 0)
            {
            /* Blocks overlap on the needle: choose a crossover and
             * rescore both pieces around it; the needle gap vanishes. */
            int midSize = mid->hEnd - mid->hStart;
            int prevSize = prev->hEnd - prev->hStart;
            int newMidScore, newPrevScore;
            int crossover = findCrossover(prev, mid, overlap, isProt);
            edge->crossover = crossover;
            newMidScore = bioScoreMatch(isProt, mid->nStart, mid->hStart,
                midSize-overlap+crossover);
            newPrevScore = bioScoreMatch(isProt, prev->nStart+crossover,
                prev->hStart+crossover, prevSize-crossover);
            score = newMidScore - prevNode->nodeScore + newPrevScore;
            nGap = 0;
            hGap -= overlap;
            }
        else
            {
            score = midScore;
            }
        score -= ffCalcGapPenalty(hGap, nGap, stringency);
        edge->score = score;
        slAddHead(&midNode->waysIn, edge);
        }
    /* Edges were pushed onto the head of the list; reverse it once done. */
    slReverse(&midNode->waysIn);
    }
return graph;
}
static boolean jiggleSmallExons(struct ffAli *ali, struct dnaSeq *nSeq, struct dnaSeq *hSeq) /* See if can jiggle small exons to match splice sites a little * better. */ { struct ffAli *left, *mid, *right; int orient; boolean creeped = FALSE; if (ffAliCount(ali) < 3) return FALSE; orient = ffIntronOrientation(ali); left = ali; mid = left->right; right = mid->right; while (right != NULL) { int midSizeN = mid->nEnd - mid->nStart; if (midSizeN < 10 && mid->hStart - left->hEnd > 1 && right->hStart - mid->hEnd > 1) { DNA *spLeft, *spRight; /* Splice sites on either side of exon. */ DNA exonX[10+2+2]; /* Storage for exon with splice sites. */ DNA *match; static int creeps[4][2] = { {2, 2}, {2, 1}, {1, 2}, {1, 1},}; int creepIx, creepL, creepR; DNA *hs = mid->hStart, *he = mid->hEnd; DNA *hMin = left->hEnd, *hMax = right->hStart; if (orient >= 0) { spLeft = "ag"; spRight = "gt"; } else { spLeft = "ac"; spRight = "ct"; } for (creepIx=0; creepIx<4; ++creepIx) { creepL = creeps[creepIx][0]; creepR = creeps[creepIx][1]; /* Check to see if we already match consensus, and if so just bail. */ if (hs[-1] == spLeft[1] && he[0] == spRight[0]) { if ((creepL == 1 || hs[-2] == spLeft[0]) && (creepR == 1 || he[1] == spRight[1])) { break; } } memcpy(exonX, spLeft + 2 - creepL, creepL); memcpy(exonX + creepL, mid->nStart, midSizeN); memcpy(exonX + creepL + midSizeN, spRight, creepR); match = memMatch(exonX, midSizeN + creepR + creepL, hMin, hMax - hMin); if (match != NULL) { mid->hStart = match + creepL; mid->hEnd = mid->hStart + (he - hs); creeped = TRUE; break; } } } left = mid; mid = right; right = right->right; } if (creeped) ffSlideIntrons(ali); return creeped; }