void axtHiQualDiffs(char *axtFile, struct hash *qacHash, FILE *f) /* Write out high quality diffs in axtFile to f. */ { char *qName = cloneString(""); UBYTE *qQuals = NULL; UBYTE *quals = NULL; struct qac *qac = NULL; struct axt *axt = NULL; struct lineFile *lf = lineFileOpen(axtFile, TRUE); int qStart, qDir, qPos, qWinStart, qWinEnd, tPos; int qWinSize = optionInt("winSize", 11); int qQualMin = optionInt("diffQualMin", 30); int qWinQualMin = optionInt("winQualMin", 25); int qWinMaxDiff = optionInt("winMaxDiff", 2); boolean qIndelOk = optionExists("indelOk"); boolean qIgnore98 = optionExists("ignore98"); boolean chimpPos = optionExists("chimpPos"); int qHalfWinSize = qWinSize/2; while ((axt = axtRead(lf)) != NULL) { char *qSym = axt->qSym, *tSym = axt->tSym; int symIx, symCount = axt->symCount; char qc,tc; toUpperN(qSym, symCount); toUpperN(tSym, symCount); if (!sameString(axt->qName, qName)) { freez(&qName); qName = cloneString(axt->qName); qac = hashMustFindVal(qacHash, qName); freez(&qQuals); qQuals = needHugeMem(qac->uncSize); rleUncompress(qac->data, qac->compSize, qQuals, qac->uncSize); } if (axt->qStrand == '+') { qStart = axt->qStart; qDir = 1; } else { qStart = qac->uncSize - axt->qStart - 1; qDir = -1; } qPos = qStart; tPos = axt->tStart; for (symIx = 0; symIx < symCount; ++symIx) { qc = qSym[symIx]; tc = tSym[symIx]; if (qc == '-') tPos += 1; else if (tc == '-') qPos += qDir; else { if (qc != tc) { qWinStart = qPos - qHalfWinSize; qWinEnd = qWinStart + qWinSize; if (qWinStart >= 0 && qWinEnd < qac->uncSize) { if (qQuals[qPos] >= qQualMin) { int i; boolean ok = TRUE; for (i = qWinStart; i<qWinEnd; ++i) if (qQuals[i] < qWinQualMin) { ok = FALSE; break; } if (ok) { int diffCount = 0; int symWinStart = symIx - qHalfWinSize; int symWinEnd = symWinStart + qWinSize; for (i=symWinStart; i < symWinEnd; ++i) { qc = qSym[i]; tc = tSym[i]; if (qc == '-' || tc == '-') { ok = FALSE; break; } if (qc != tc) ++diffCount; } if (ok && diffCount <= qWinMaxDiff && (!qIgnore98 || qQuals[qPos] != 98) ) { if (chimpPos) fprintf(f, "%s\t%d\t%d\t%c\t%c\t%s\t%d\t%d\n", axt->tName, tPos, tPos+1, tSym[symIx], qSym[symIx], axt->qName, qPos, qPos+1); else fprintf(f, "%s\t%d\t%d\t%c\t%c\n", axt->tName, tPos, tPos+1, tSym[symIx], qSym[symIx]); } } } } } qPos += qDir; tPos += 1; } } axtFree(&axt); } lineFileClose(&lf); }
void ggcChrom(struct chromGenes *chrom, char *axtFile, struct ggcInfo *g, struct hash *restrictHash, FILE *fParts) /* Tabulate matches on chromosome. */ { struct lineFile *lf = lineFileOpen(axtFile, TRUE); bool *hits, *covers; int hitCount = 0, coverCount = 0; struct axt *axt; struct genePred *gp; int closeSize = g->closeSize; int closeHalf = closeSize/2; /* Build up array of booleans - one per base - which are * 1's where mouse/human align and bases match, zero * elsewhere. */ AllocArray(hits, chrom->size); AllocArray(covers, chrom->size); printf("%s (%d bases)\n", chrom->name, chrom->size); while ((axt = axtRead(lf)) != NULL) { int tPos = axt->tStart; int symCount = axt->symCount, i; char t, q, *tSym = axt->tSym, *qSym = axt->qSym; if (axt->tEnd > chrom->size) errAbort("tEnd %d, chrom size %d in %s", axt->tEnd, chrom->size, axtFile); if (axt->tStrand == '-') errAbort("Can't handle minus strand on target in %s", axtFile); for (i=0; i<symCount; ++i) { t = tSym[i]; if (t != '-') { q = qSym[i]; if (toupper(t) == toupper(q)) { hits[tPos] = TRUE; ++hitCount; } if (q == '-') covers[tPos] = 1; else covers[tPos] = 2; ++tPos; } } axtFree(&axt); } for (gp = chrom->geneList; gp != NULL; gp = gp->next) { int exonIx; int utr3Size = 0, utr5Size = 0, cdsAllSize = 0; int utr3Pos = 0, utr5Pos = 0, cdsAllPos = 0; bool *utr3Hits = NULL, *utr3Covers = NULL; bool *utr5Hits = NULL, *utr5Covers = NULL; bool *cdsAllHits = NULL, *cdsAllCovers = NULL; bool isRev = (gp->strand[0] == '-'); /* Filter out genes not in restrict hash if any. */ ++totalGenes; if (restrictHash != NULL) if (!hashLookup(restrictHash, gp->name)) continue; ++reviewedGenes; /* Filter out genes without meaningful UTRs */ if (gp->cdsStart - gp->txStart < g->closeSize/2 || gp->txEnd - gp->cdsEnd < g->closeSize/2) continue; ++genesUsed; /* Total up UTR and CDS sizes. */ for (exonIx=0; exonIx<gp->exonCount; ++exonIx) { int eStart = gp->exonStarts[exonIx]; int eEnd = gp->exonEnds[exonIx]; int eSize = eEnd - eStart; int oneUtr, oneCds; oneCds = rangeIntersection(gp->cdsStart, gp->cdsEnd, eStart, eEnd); if (oneCds > 0) { cdsAllSize += oneCds; } if (eStart < gp->cdsStart) { int utrStart = eStart; int utrEnd = min(gp->cdsStart, eEnd); int utrSize = utrEnd - utrStart; if (isRev) utr3Size += utrSize; else utr5Size += utrSize; } if (eEnd > gp->cdsEnd) { int utrStart = max(gp->cdsEnd, eStart); int utrEnd = eEnd; int utrSize = utrEnd - utrStart; if (isRev) utr5Size += utrSize; else utr3Size += utrSize; } } /* Condense hits from UTRs and CDSs */ if (utr5Size > 0) { AllocArray(utr5Hits, utr5Size); AllocArray(utr5Covers, utr5Size); } if (utr3Size > 0) { AllocArray(utr3Hits, utr3Size); AllocArray(utr3Covers, utr3Size); } if (cdsAllSize > 0) { AllocArray(cdsAllHits, cdsAllSize); AllocArray(cdsAllCovers, cdsAllSize); } for (exonIx=0; exonIx<gp->exonCount; ++exonIx) { int eStart = gp->exonStarts[exonIx]; int eEnd = gp->exonEnds[exonIx]; int eSize = eEnd - eStart; int oneUtr, oneCds; oneCds = rangeIntersection(gp->cdsStart, gp->cdsEnd, eStart, eEnd); if (oneCds > 0) { int cdsStart = eStart; int cdsEnd = gp->cdsEnd; if (cdsStart < gp->cdsStart) cdsStart = gp->cdsStart; memcpy(cdsAllHits + cdsAllPos, hits + cdsStart, oneCds * sizeof(*hits)); memcpy(cdsAllCovers + cdsAllPos, covers + cdsStart, oneCds * sizeof(*covers)); cdsAllPos += oneCds; } if (eStart < gp->cdsStart) { int utrStart = eStart; int utrEnd = min(gp->cdsStart, eEnd); int utrSize = utrEnd - utrStart; if (isRev) { memcpy(utr3Hits + utr3Pos, hits + utrStart, utrSize * sizeof(*hits)); memcpy(utr3Covers + utr3Pos, covers + utrStart, utrSize * sizeof(*covers)); utr3Pos += utrSize; } else { memcpy(utr5Hits + utr5Pos, hits + utrStart, utrSize * sizeof(*hits)); memcpy(utr5Covers + utr5Pos, covers + utrStart, utrSize * sizeof(*covers)); utr5Pos += utrSize; } } if (eEnd > gp->cdsEnd) { int utrStart = max(gp->cdsEnd, eStart); int utrEnd = eEnd; int utrSize = utrEnd - utrStart; if (isRev) { memcpy(utr5Hits + utr5Pos, hits + utrStart, utrSize * sizeof(*hits)); memcpy(utr5Covers + utr5Pos, covers + utrStart, utrSize * sizeof(*covers)); utr5Pos += utrSize; } else { memcpy(utr3Hits + utr3Pos, hits + utrStart, utrSize * sizeof(*hits)); memcpy(utr3Covers + utr3Pos, covers + utrStart, utrSize * sizeof(*covers)); utr3Pos += utrSize; } } } assert(utr3Pos == utr3Size); assert(utr5Pos == utr5Size); assert(cdsAllPos == cdsAllSize); tallyHits(&g->utr5, utr5Hits, utr5Covers, utr5Size, isRev); tallyHits(&g->utr3, utr3Hits, utr3Covers, utr3Size, isRev); tallyHits(&g->cdsAll, cdsAllHits, cdsAllCovers, cdsAllSize, isRev); /* Optionally write out file with gene by gene info. */ if (fParts != NULL) { /* Write header line first time through. */ static boolean firstTime = TRUE; if (firstTime) { firstTime = FALSE; fprintf(fParts, "#accession\tsize_5\tali_5\tmatch_5\tsize_c\tali_c\tmatch_c\tsize_3\tali_3\tmatch_3\n"); } fprintf(fParts, "%s\t", gp->name); fprintf(fParts, "%d\t%d\t%d\t", utr5Size, countBools(utr5Covers, utr5Size), countBools(utr5Hits, utr5Size)); fprintf(fParts, "%d\t%d\t%d\t", cdsAllSize, countBools(cdsAllCovers, cdsAllSize), countBools(cdsAllHits, cdsAllSize)); fprintf(fParts, "%d\t%d\t%d\n", utr3Size, countBools(utr3Covers, utr3Size), countBools(utr3Hits, utr3Size)); } /* Tally upstream/downstream hits. */ { int s1 = gp->txStart - closeHalf; int e1 = s1 + closeSize; int s2 = gp->txEnd - closeHalf; int e2 = s2 + closeSize; if (isRev) { tallyInRange(&g->down, hits, covers, chrom->size, gp->txStart - g->baseDown, gp->txStart, isRev); tallyInRange(&g->up, hits, covers, chrom->size, gp->txEnd, gp->txEnd + g->baseUp, isRev); tallyInRange(&g->txEnd, hits, covers, chrom->size, s1, e1, isRev); tallyInRange(&g->txStart, hits, covers, chrom->size, s2, e2, isRev); } else { tallyInRange(&g->up, hits, covers, chrom->size, gp->txStart - g->baseUp, gp->txStart, isRev); tallyInRange(&g->down, hits, covers, chrom->size, gp->txEnd, gp->txEnd + g->baseDown, isRev); tallyInRange(&g->txStart, hits, covers, chrom->size, s1, e1, isRev); tallyInRange(&g->txEnd, hits, covers, chrom->size, s2, e2, isRev); } } /* Tally hits in coding exons */ for (exonIx=0; exonIx < gp->exonCount; ++exonIx) { int eStart = gp->exonStarts[exonIx]; int eEnd = gp->exonEnds[exonIx]; /* Single coding exon. */ if (eStart <= gp->cdsStart && eEnd >= gp->cdsEnd) { eStart = gp->cdsStart; eEnd = gp->cdsEnd; tallyInRange(&g->cdsSingle, hits, covers, chrom->size, eStart, eEnd, isRev); } /* Initial coding exon */ else if (eStart < gp->cdsStart && eEnd > gp->cdsStart) { int cs = gp->cdsStart - closeHalf; int ce = cs + closeSize; eStart = gp->cdsStart; if (isRev) { tallyInRange(&g->tlEnd, hits, covers, chrom->size, cs, ce, isRev); tallyInRange(&g->cdsLast, hits, covers, chrom->size, eStart, eEnd, isRev); } else { tallyInRange(&g->tlStart, hits, covers, chrom->size, cs, ce, isRev); tallyInRange(&g->cdsFirst, hits, covers, chrom->size, eStart, eEnd, isRev); } } /* Final coding exon */ else if (eStart < gp->cdsEnd && eEnd > gp->cdsEnd) { int cs = gp->cdsEnd - closeHalf; int ce = cs + closeSize; eEnd = gp->cdsEnd; if (isRev) { tallyInRange(&g->tlStart, hits, covers, chrom->size, cs, ce, isRev); tallyInRange(&g->cdsFirst, hits, covers, chrom->size, eStart, eEnd, isRev); } else { tallyInRange(&g->tlEnd, hits, covers, chrom->size, cs, ce, isRev); tallyInRange(&g->cdsLast, hits, covers, chrom->size, eStart, eEnd, isRev); } } /* Middle (but not only) coding exon */ else if (eStart >= gp->cdsStart && eEnd <= gp->cdsEnd) { tallyInRange(&g->cdsMiddle, hits, covers, chrom->size, eStart, eEnd, isRev); } else { } } /* Tally hits in introns and splice sites. */ for (exonIx=1; exonIx<gp->exonCount; ++exonIx) { int iStart = gp->exonEnds[exonIx-1]; int iEnd = gp->exonStarts[exonIx]; int s1 = iStart - closeHalf; int e1 = s1 + closeSize; int s2 = iEnd - closeHalf; int e2 = s2 + closeSize; if (isRev) { tallyInRange(&g->splice3, hits, covers, chrom->size, s1, e1, isRev); tallyInRange(&g->splice5, hits, covers, chrom->size, s2, e2, isRev); } else { tallyInRange(&g->splice5, hits, covers, chrom->size, s1, e1, isRev); tallyInRange(&g->splice3, hits, covers, chrom->size, s2, e2, isRev); } tallyInRange(&g->intron, hits, covers, chrom->size, iStart, iEnd, isRev); } freez(&utr5Hits); freez(&utr3Hits); freez(&cdsAllHits); freez(&utr5Covers); freez(&utr3Covers); freez(&cdsAllCovers); } freez(&hits); freez(&covers); lineFileClose(&lf); }