Exemplo n.º 1
0
void axtHiQualDiffs(char *axtFile, struct hash *qacHash, FILE *f)
/* Write out high quality diffs in axtFile to f. */
{
char *qName = cloneString("");
UBYTE *qQuals = NULL;
UBYTE *quals = NULL;
struct qac *qac = NULL;
struct axt *axt = NULL;
struct lineFile *lf = lineFileOpen(axtFile, TRUE);
int qStart, qDir, qPos, qWinStart, qWinEnd, tPos;
int qWinSize     = optionInt("winSize",     11);
int qQualMin     = optionInt("diffQualMin", 30);
int qWinQualMin  = optionInt("winQualMin",  25);
int qWinMaxDiff  = optionInt("winMaxDiff",  2);
boolean qIndelOk = optionExists("indelOk");
boolean qIgnore98 = optionExists("ignore98");
boolean chimpPos = optionExists("chimpPos");
int qHalfWinSize = qWinSize/2;

while ((axt = axtRead(lf)) != NULL)
    {
    char *qSym = axt->qSym, *tSym = axt->tSym;
    int symIx, symCount = axt->symCount;
    char qc,tc;
    toUpperN(qSym, symCount);
    toUpperN(tSym, symCount);
    if (!sameString(axt->qName, qName))
        {
	freez(&qName);
	qName = cloneString(axt->qName);
	qac = hashMustFindVal(qacHash, qName);
	freez(&qQuals);
	qQuals = needHugeMem(qac->uncSize);
	rleUncompress(qac->data, qac->compSize, qQuals, qac->uncSize);
	}
    if (axt->qStrand == '+')
        {
	qStart = axt->qStart;
	qDir = 1;
	}
    else
        {
	qStart = qac->uncSize - axt->qStart - 1;
	qDir = -1;
	}
    qPos = qStart;
    tPos = axt->tStart;
    for (symIx = 0; symIx < symCount; ++symIx)
        {
	qc = qSym[symIx];
	tc = tSym[symIx];
	if (qc == '-')
	    tPos += 1;
	else if (tc == '-')
	    qPos += qDir;
	else 
	    {
	    if (qc != tc)
		{
		qWinStart = qPos - qHalfWinSize;
		qWinEnd = qWinStart + qWinSize;
		if (qWinStart >= 0 && qWinEnd < qac->uncSize)
		    {
		    if (qQuals[qPos] >= qQualMin)
		        {
			int i;
			boolean ok = TRUE;
			for (i = qWinStart; i<qWinEnd; ++i)
			    if (qQuals[i] < qWinQualMin)
			        {
				ok = FALSE;
				break;
				}
			if (ok)
			    {
			    int diffCount = 0;
			    int symWinStart = symIx - qHalfWinSize;
			    int symWinEnd = symWinStart + qWinSize;
			    for (i=symWinStart; i < symWinEnd; ++i)
			        {
				qc = qSym[i];
				tc = tSym[i];
				if (qc == '-' || tc == '-')
				    {
				    ok = FALSE;
				    break;
				    }
				if (qc != tc)
				    ++diffCount;
				}
			    if (ok && diffCount <= qWinMaxDiff && (!qIgnore98 || qQuals[qPos] != 98) )
				{
				if (chimpPos)
				    fprintf(f, "%s\t%d\t%d\t%c\t%c\t%s\t%d\t%d\n",
					    axt->tName, tPos, tPos+1, tSym[symIx], qSym[symIx], axt->qName, qPos, qPos+1);
				else
				    fprintf(f, "%s\t%d\t%d\t%c\t%c\n",
					    axt->tName, tPos, tPos+1, tSym[symIx], qSym[symIx]);
				}
			    }
			}
		    }
		}
	    qPos += qDir;
	    tPos += 1;
	    }
	}
    axtFree(&axt);
    }
lineFileClose(&lf);
}
void ggcChrom(struct chromGenes *chrom, char *axtFile, 
	struct ggcInfo *g, struct hash *restrictHash, 
	FILE *fParts)
/* Tabulate matches on chromosome. */
{
struct lineFile *lf = lineFileOpen(axtFile, TRUE);
bool *hits, *covers;
int hitCount = 0, coverCount = 0;
struct axt *axt;
struct genePred *gp;
int closeSize = g->closeSize;
int closeHalf = closeSize/2;

/* Build up array of booleans - one per base - which are
 * 1's where mouse/human align and bases match, zero 
 * elsewhere. */
AllocArray(hits, chrom->size);
AllocArray(covers, chrom->size);
printf("%s (%d bases)\n", chrom->name, chrom->size);
while ((axt = axtRead(lf)) != NULL)
    {
    int tPos = axt->tStart;
    int symCount = axt->symCount, i;
    char t, q, *tSym = axt->tSym, *qSym = axt->qSym;

    if (axt->tEnd > chrom->size)
        errAbort("tEnd %d, chrom size %d in %s", 
		axt->tEnd, chrom->size, axtFile);
    if (axt->tStrand == '-')
        errAbort("Can't handle minus strand on target in %s", axtFile);
    for (i=0; i<symCount; ++i)
        {
	t = tSym[i];
	if (t != '-')
	    {
	    q = qSym[i];
	    if (toupper(t) == toupper(q))
		{
	        hits[tPos] = TRUE;
		++hitCount;
		}
	    if (q == '-')
	       covers[tPos] = 1;
	    else
	       covers[tPos] = 2;
	    ++tPos;
	    }
	}
    axtFree(&axt);
    }

for (gp = chrom->geneList; gp != NULL; gp = gp->next)
    {
    int exonIx;
    int utr3Size = 0, utr5Size = 0, cdsAllSize = 0;
    int utr3Pos = 0, utr5Pos = 0, cdsAllPos = 0;
    bool *utr3Hits = NULL, *utr3Covers = NULL;
    bool *utr5Hits = NULL, *utr5Covers = NULL;
    bool *cdsAllHits = NULL, *cdsAllCovers = NULL;
    bool isRev = (gp->strand[0] == '-');


    /* Filter out genes not in restrict hash if any. */
    ++totalGenes;
    if (restrictHash != NULL)
        if (!hashLookup(restrictHash, gp->name))
	    continue;
    ++reviewedGenes;

    /* Filter out genes without meaningful UTRs */
    if (gp->cdsStart - gp->txStart < g->closeSize/2 || 
    	gp->txEnd - gp->cdsEnd < g->closeSize/2)
        continue;
    ++genesUsed;

    /* Total up UTR and CDS sizes. */
    for (exonIx=0; exonIx<gp->exonCount; ++exonIx)
	 {
	 int eStart = gp->exonStarts[exonIx];
	 int eEnd = gp->exonEnds[exonIx];
	 int eSize = eEnd - eStart;
	 int oneUtr, oneCds;
	 oneCds = rangeIntersection(gp->cdsStart, gp->cdsEnd, eStart, eEnd);
	 if (oneCds > 0)
	     {
	     cdsAllSize += oneCds;
	     }
	 if (eStart < gp->cdsStart)
	     {
	     int utrStart = eStart;
	     int utrEnd = min(gp->cdsStart, eEnd);
	     int utrSize = utrEnd - utrStart;
	     if (isRev)
		 utr3Size += utrSize;
	     else
		 utr5Size += utrSize;
	     }
	 if (eEnd > gp->cdsEnd)
	     {
	     int utrStart = max(gp->cdsEnd, eStart);
	     int utrEnd = eEnd;
	     int utrSize = utrEnd - utrStart;
	     if (isRev)
		 utr5Size += utrSize;
	     else
		 utr3Size += utrSize;
	     }
	 }

    /* Condense hits from UTRs and CDSs */
    if (utr5Size > 0)
	{
	AllocArray(utr5Hits, utr5Size);
	AllocArray(utr5Covers, utr5Size);
	}
    if (utr3Size > 0)
	{
	AllocArray(utr3Hits, utr3Size);
	AllocArray(utr3Covers, utr3Size);
	}
    if (cdsAllSize > 0)
	{
	AllocArray(cdsAllHits, cdsAllSize);
	AllocArray(cdsAllCovers, cdsAllSize);
	}
    for (exonIx=0; exonIx<gp->exonCount; ++exonIx)
	{
	int eStart = gp->exonStarts[exonIx];
	int eEnd = gp->exonEnds[exonIx];
	int eSize = eEnd - eStart;
	int oneUtr, oneCds;
	oneCds = rangeIntersection(gp->cdsStart, gp->cdsEnd, eStart, eEnd);
	if (oneCds > 0)
	    {
	    int cdsStart = eStart;
	    int cdsEnd = gp->cdsEnd;

	    if (cdsStart < gp->cdsStart)
		cdsStart = gp->cdsStart;
	    memcpy(cdsAllHits + cdsAllPos, hits + cdsStart, oneCds * sizeof(*hits));
	    memcpy(cdsAllCovers + cdsAllPos, covers + cdsStart, oneCds * sizeof(*covers));
	    cdsAllPos += oneCds;
	    }
	if (eStart < gp->cdsStart)
	    {
	    int utrStart = eStart;
	    int utrEnd = min(gp->cdsStart, eEnd);
	    int utrSize = utrEnd - utrStart;
	    if (isRev)
		{
		memcpy(utr3Hits + utr3Pos, hits + utrStart, utrSize * sizeof(*hits));
		memcpy(utr3Covers + utr3Pos, covers + utrStart, utrSize * sizeof(*covers));
		utr3Pos += utrSize;
		}
	    else
		{
		memcpy(utr5Hits + utr5Pos, hits + utrStart, utrSize * sizeof(*hits));
		memcpy(utr5Covers + utr5Pos, covers + utrStart, utrSize * sizeof(*covers));
		utr5Pos += utrSize;
		}
	    }
	if (eEnd > gp->cdsEnd)
	    {
	    int utrStart = max(gp->cdsEnd, eStart);
	    int utrEnd = eEnd;
	    int utrSize = utrEnd - utrStart;
	    if (isRev)
		{
		memcpy(utr5Hits + utr5Pos, hits + utrStart, utrSize * sizeof(*hits));
		memcpy(utr5Covers + utr5Pos, covers + utrStart, utrSize * sizeof(*covers));
		utr5Pos += utrSize;
		}
	    else
		{
		memcpy(utr3Hits + utr3Pos, hits + utrStart, utrSize * sizeof(*hits));
		memcpy(utr3Covers + utr3Pos, covers + utrStart, utrSize * sizeof(*covers));
		utr3Pos += utrSize;
		}
	    }
	}
    assert(utr3Pos == utr3Size);
    assert(utr5Pos == utr5Size);
    assert(cdsAllPos == cdsAllSize);

    tallyHits(&g->utr5, utr5Hits, utr5Covers, utr5Size, isRev);
    tallyHits(&g->utr3, utr3Hits, utr3Covers, utr3Size, isRev);
    tallyHits(&g->cdsAll, cdsAllHits, cdsAllCovers, cdsAllSize, isRev);

    /* Optionally write out file with gene by gene info. */
    if (fParts != NULL)
        {
	/* Write header line first time through. */
	static boolean firstTime = TRUE;
	if (firstTime)
	    {
	    firstTime = FALSE;
	    fprintf(fParts, "#accession\tsize_5\tali_5\tmatch_5\tsize_c\tali_c\tmatch_c\tsize_3\tali_3\tmatch_3\n");
	    }
	fprintf(fParts, "%s\t", gp->name);
	fprintf(fParts, "%d\t%d\t%d\t", utr5Size, 
		countBools(utr5Covers, utr5Size),
		countBools(utr5Hits, utr5Size));
	fprintf(fParts, "%d\t%d\t%d\t", cdsAllSize, 
		countBools(cdsAllCovers, cdsAllSize),
		countBools(cdsAllHits, cdsAllSize));
	fprintf(fParts, "%d\t%d\t%d\n", utr3Size, 
		countBools(utr3Covers, utr3Size),
		countBools(utr3Hits, utr3Size));
	}

    /* Tally upstream/downstream hits. */
	{
	int s1 = gp->txStart - closeHalf;
	int e1 = s1 + closeSize;
	int s2 = gp->txEnd - closeHalf;
	int e2 = s2 + closeSize;
	if (isRev)
	    {
	    tallyInRange(&g->down, hits, covers, chrom->size, gp->txStart - g->baseDown,
		gp->txStart, isRev);
	    tallyInRange(&g->up, hits, covers, chrom->size, gp->txEnd, 
		gp->txEnd + g->baseUp, isRev);
	    tallyInRange(&g->txEnd, hits, covers, chrom->size, s1, e1, isRev);
	    tallyInRange(&g->txStart, hits, covers, chrom->size, s2, e2, isRev);
	    }
	else
	    {
	    tallyInRange(&g->up, hits, covers, chrom->size, gp->txStart - g->baseUp,
		gp->txStart, isRev);
	    tallyInRange(&g->down, hits, covers, chrom->size, gp->txEnd, 
		gp->txEnd + g->baseDown, isRev);
	    tallyInRange(&g->txStart, hits, covers, chrom->size, s1, e1, isRev);
	    tallyInRange(&g->txEnd, hits, covers, chrom->size, s2, e2, isRev);
	    }
	}

    /* Tally hits in coding exons */
    for (exonIx=0; exonIx < gp->exonCount; ++exonIx)
        {
	int eStart = gp->exonStarts[exonIx];
	int eEnd = gp->exonEnds[exonIx];
	/* Single coding exon. */
	if (eStart <= gp->cdsStart && eEnd >= gp->cdsEnd)
	   {
	   eStart = gp->cdsStart;
	   eEnd = gp->cdsEnd;
	   tallyInRange(&g->cdsSingle, hits, covers, chrom->size,
	   		eStart, eEnd, isRev);
	   }
	/* Initial coding exon */
	else if (eStart < gp->cdsStart && eEnd > gp->cdsStart)
	    {
	    int cs = gp->cdsStart - closeHalf;
	    int ce = cs + closeSize;
	    eStart = gp->cdsStart;
	    if (isRev)
	        {
		tallyInRange(&g->tlEnd, hits, covers, chrom->size, cs, ce, isRev);
		tallyInRange(&g->cdsLast, hits, covers, chrom->size, 
			eStart, eEnd, isRev);
		}
	    else
	        {
		tallyInRange(&g->tlStart, hits, covers, chrom->size, cs, ce, isRev);
		tallyInRange(&g->cdsFirst, hits, covers, chrom->size, 
			eStart, eEnd, isRev);
		}
	    }
	/* Final coding exon */
	else if (eStart < gp->cdsEnd && eEnd > gp->cdsEnd)
	    {
	    int cs = gp->cdsEnd - closeHalf;
	    int ce = cs + closeSize;
	    eEnd = gp->cdsEnd;
	    if (isRev)
	        {
		tallyInRange(&g->tlStart, hits, covers, chrom->size, cs, ce, isRev);
		tallyInRange(&g->cdsFirst, hits, covers, chrom->size, 
			eStart, eEnd, isRev);
		}
	    else
	        {
		tallyInRange(&g->tlEnd, hits, covers, chrom->size, cs, ce, isRev);
		tallyInRange(&g->cdsLast, hits, covers, chrom->size, 
			eStart, eEnd, isRev);
		}
	    }
	/* Middle (but not only) coding exon */
	else if (eStart >= gp->cdsStart && eEnd <= gp->cdsEnd)
	    {
	    tallyInRange(&g->cdsMiddle, hits, covers, chrom->size, eStart, eEnd, isRev);
	    }
	else
	    {
	    }
	}
	

    /* Tally hits in introns and splice sites. */
    for (exonIx=1; exonIx<gp->exonCount; ++exonIx)
        {
	int iStart = gp->exonEnds[exonIx-1];
	int iEnd = gp->exonStarts[exonIx];
	int s1 = iStart - closeHalf;
	int e1 = s1 + closeSize;
	int s2 = iEnd - closeHalf;
	int e2 = s2 + closeSize;
	if (isRev)
	    {
	    tallyInRange(&g->splice3, hits, covers, chrom->size, 
		    s1, e1, isRev);
	    tallyInRange(&g->splice5, hits, covers, chrom->size, 
		    s2, e2, isRev);
	    }
	else
	    {
	    tallyInRange(&g->splice5, hits, covers, chrom->size, 
		    s1, e1, isRev);
	    tallyInRange(&g->splice3, hits, covers, chrom->size, 
		    s2, e2, isRev);
	    }
	tallyInRange(&g->intron, hits, covers, chrom->size, iStart, iEnd, isRev);
	}
    freez(&utr5Hits);
    freez(&utr3Hits);
    freez(&cdsAllHits);
    freez(&utr5Covers);
    freez(&utr3Covers);
    freez(&cdsAllCovers);
    }
freez(&hits);
freez(&covers);
lineFileClose(&lf);
}