struct gapInfo *findLargeGaps(struct xaAli *xa, struct gapInfo *oldList) /* Find large gaps in alignment and classify them. */ { struct gdfGene *gdfList; struct gapInfo *gapList = NULL, *gap; int ceIx=0, cbIx=0, symIx=0; int ceStart=0, cbStart=0, symStart=0; int runSize = 0; char sym, lastSym = 0; int symCount = xa->symCount; /* Fetch C. elegans region. */ gdfList = wormGdfGenesInRange(xa->target, xa->tStart, xa->tEnd, &wormSangerGdfCache); /* Run a little state machine that does something at the end of each solid run * of a symbol. */ for (symIx = 0; symIx <= symCount; ++symIx) { sym = xa->hSym[symIx]; if (sym != lastSym) { if (runSize > 32) /* Introns need to be at least this long. */ { /* We're at end of a solid run. */ if (lastSym == 'Q' || lastSym == 'T') { int ceGapStart = xa->tStart + ceStart; int ceGapEnd = xa->tStart + ceIx; struct gdfGene *gdf; char hBefore = xa->hSym[symStart-1]; char hAfter = sym; char strand = '.'; AllocVar(gap); gap->query = cloneString(xa->query); gap->qStart = xa->qStart + cbStart; gap->qEnd = xa->qStart + cbIx; gap->target = cloneString(xa->target); gap->tStart = ceGapStart; gap->tEnd = ceGapEnd; gap->name = cloneString(xa->name); gap->size = runSize; gap->hSym = lastSym; if (uniqueGap(oldList, gap)) { slAddHead(&gapList, gap); classifyGap(gdfList, xa->target, ceGapStart, ceGapEnd, lastSym, &gap->type, &gdf); if (gdf != NULL) strand = gdf->strand; gap->hasIntronEnds = isIntron(xa, symStart, symIx, lastSym, strand, &gap->slideCount, &gap->isRc); if (gap->hasIntronEnds) slideGap(gap, xa, lastSym, symStart, symIx); if (isConserved(hBefore) && isConserved(hAfter)) gap->hasStrongHomology = TRUE; if (gap->hasStrongHomology) { if (lastSym == 'T') writeGap(gap, xa, symStart+gap->slideCount, symIx+gap->slideCount, strand, out); } } } } runSize = 0; ceStart = ceIx; cbStart = cbIx; symStart = symIx; lastSym = sym; } ++runSize; if (xa->qSym[symIx] != '-') ++cbIx; if (xa->tSym[symIx] != '-') ++ceIx; } gdfFreeGeneList(&gdfList); slReverse(&gapList); return gapList; }
void outputCheck(struct psl *psl, struct dnaSeq *qSeq, int qOffset, struct dnaSeq *tSeq, int tOffset, FILE *f) /* Output quality check info to file */ { int sizePolyA = 0; int qSize = psl->qSize; int i; int missSmallStart = 0; int missLargeStart = 0; int missSmallEnd = 0; int missLargeEnd = 0; int missSmallMiddle = 0; int missLargeMiddle = 0; int weirdSplice = 0; int doubleGap = 0; int jumpBack = 0; int diff; int totalProblems = 0; char strand = psl->strand[0]; if (strand == '+') { for (i=1; i<=qSize; ++i) { if (qSeq->dna[qSize - i - qOffset] == 'a') ++sizePolyA; else break; } } else { for (i=0; i<qSize; ++i) { if (qSeq->dna[i - qOffset] == 't') ++sizePolyA; else break; } } if (psl->qStart > tinySize) { if (psl->qStart <= smallSize) { missSmallStart = psl->qStart; ++totalProblems; } else { missLargeStart = psl->qStart; ++totalProblems; } } diff = psl->qSize - psl->qEnd - sizePolyA; if (diff > tinySize) { if (diff <= smallSize) { missSmallEnd = diff; ++totalProblems; } else { missLargeEnd = diff; ++totalProblems; } } for (i=0; i<psl->blockCount-1; ++i) { int nextT = psl->tStarts[i+1]; int nextQ = psl->qStarts[i+1]; int sz = psl->blockSizes[i]; int t = psl->tStarts[i] + sz; int q = psl->qStarts[i] + sz; int dq = nextQ - q; int dt = nextT - t; if (dq < 0 || dt < 0) { ++jumpBack; ++totalProblems; } else { if (dq > 0 && dt > 0) { ++doubleGap; ++totalProblems; } if (dq > tinySize) { if (dq > smallSize) { ++missLargeMiddle; ++totalProblems; } else { ++missSmallMiddle; ++totalProblems; } } if (dq == 0 && dt >=30) { char *dna = tSeq->dna - tOffset; if (!isIntron(strand, dna + t, dna + nextT)) { ++weirdSplice; ++totalProblems; } } } } fprintf(f, "%2d %9s %s ", totalProblems, psl->qName, psl->strand); fprintf(f, "%4dS ", missLargeStart); fprintf(f, "%2ds ", missSmallStart); fprintf(f, "%4dE ", missLargeEnd); fprintf(f, "%2de ", missSmallEnd); fprintf(f, "%2dM ", missLargeMiddle); fprintf(f, "%2dm ", missSmallMiddle); fprintf(f, "%2dW ", weirdSplice); fprintf(f, "%2dG ", doubleGap); fprintf(f, "%2dJ ", jumpBack); fprintf(f, "\n"); total_missSmallStart += boolify(missSmallStart); total_missLargeStart += boolify(missLargeStart); total_missSmallEnd += boolify(missSmallEnd); total_missLargeEnd += boolify(missLargeEnd); total_missSmallMiddle += boolify(missSmallMiddle); total_missLargeMiddle += boolify(missLargeMiddle); total_weirdSplice += boolify(weirdSplice); total_doubleGap += boolify(doubleGap); total_jumpBack += boolify(jumpBack); ++total_rnaCount; if (totalProblems == 0) ++total_rnaPerfect; }