Exemple #1
0
struct gapInfo *findLargeGaps(struct xaAli *xa, struct gapInfo *oldList)
/* Find large gaps in alignment and classify them. */
{
struct gdfGene *gdfList;
struct gapInfo *gapList = NULL, *gap;
int ceIx=0, cbIx=0, symIx=0;
int ceStart=0, cbStart=0, symStart=0;
int runSize = 0;
char sym, lastSym = 0;
int symCount = xa->symCount;

/* Fetch C. elegans region. */
gdfList = wormGdfGenesInRange(xa->target, xa->tStart, xa->tEnd, &wormSangerGdfCache);

/* Run a little state machine that does something at the end of each solid run 
 * of a symbol. */
for (symIx = 0; symIx <= symCount; ++symIx)
    {
    sym = xa->hSym[symIx];
    if (sym != lastSym)
        {
        if (runSize > 32)       /* Introns need to be at least this long. */
            {
            /* We're at end of a solid run. */
            if (lastSym == 'Q' || lastSym == 'T')
                {
                int ceGapStart = xa->tStart + ceStart;
                int ceGapEnd = xa->tStart + ceIx;
                struct gdfGene *gdf;
                char hBefore = xa->hSym[symStart-1];
                char hAfter = sym;
                char strand = '.';

                AllocVar(gap);
                gap->query = cloneString(xa->query);
                gap->qStart = xa->qStart + cbStart;
                gap->qEnd = xa->qStart + cbIx;
                gap->target = cloneString(xa->target);
                gap->tStart = ceGapStart;
                gap->tEnd = ceGapEnd;
                gap->name = cloneString(xa->name);
                gap->size = runSize;
                gap->hSym = lastSym;
                if (uniqueGap(oldList, gap))
                    {
                    slAddHead(&gapList, gap);

                    classifyGap(gdfList, xa->target, ceGapStart, ceGapEnd, lastSym, &gap->type, &gdf);
                    if (gdf != NULL)
                        strand = gdf->strand;
                    gap->hasIntronEnds = isIntron(xa, symStart, symIx, lastSym, strand, &gap->slideCount, &gap->isRc);
                    if (gap->hasIntronEnds)
                        slideGap(gap, xa, lastSym, symStart, symIx);
                    if (isConserved(hBefore) && isConserved(hAfter))
                        gap->hasStrongHomology = TRUE;
                    if (gap->hasStrongHomology)
                        {
                        if (lastSym == 'T')
                            writeGap(gap, xa, symStart+gap->slideCount, symIx+gap->slideCount, strand, out);
                        }
                    }
                }
            }
        runSize = 0;
        ceStart = ceIx;
        cbStart = cbIx;
        symStart = symIx;
        lastSym = sym;
        }
    ++runSize;
    if (xa->qSym[symIx] != '-')
        ++cbIx;
    if (xa->tSym[symIx] != '-')
        ++ceIx;
    }

gdfFreeGeneList(&gdfList);
slReverse(&gapList);
return gapList;
}
Exemple #2
0
void outputCheck(struct psl *psl, struct dnaSeq *qSeq, int qOffset,
	struct dnaSeq *tSeq, int tOffset, FILE *f)
/* Output quality check info to file.
 *
 * Examines one alignment for unaligned sequence at the start, end and
 * middle of the query, for gaps on both sides at once, for blocks that
 * jump backwards and for target-only gaps of 30 or more that isIntron()
 * rejects.  Writes a one-line summary to f and adds the findings to the
 * file-level total_* counters. */
{
char strand = psl->strand[0];
int qLen = psl->qSize;
int polyALen = 0;
int problemCount = 0;
int missSmallStart = 0, missLargeStart = 0;
int missSmallEnd = 0, missLargeEnd = 0;
int missSmallMiddle = 0, missLargeMiddle = 0;
int weirdSplice = 0, doubleGap = 0, jumpBack = 0;
int unalignedTail;
int k;

/* Measure the run of 'a' at the end of the query for a '+' strand
 * alignment, or the run of 't' at its start otherwise. */
if (strand == '+')
    {
    for (k = 1; k <= qLen && qSeq->dna[qLen - k - qOffset] == 'a'; ++k)
        ++polyALen;
    }
else
    {
    for (k = 0; k < qLen && qSeq->dna[k - qOffset] == 't'; ++k)
        ++polyALen;
    }

/* Unaligned query before the first block. */
if (psl->qStart > tinySize)
    {
    if (psl->qStart <= smallSize)
        missSmallStart = psl->qStart;
    else
        missLargeStart = psl->qStart;
    ++problemCount;
    }

/* Unaligned query after the last block, not counting the poly-A run. */
unalignedTail = psl->qSize - psl->qEnd - polyALen;
if (unalignedTail > tinySize)
    {
    if (unalignedTail <= smallSize)
        missSmallEnd = unalignedTail;
    else
        missLargeEnd = unalignedTail;
    ++problemCount;
    }

/* Look at the gap between each pair of adjacent blocks. */
for (k=0; k<psl->blockCount-1; ++k)
    {
    int sz = psl->blockSizes[k];
    int q = psl->qStarts[k] + sz;
    int t = psl->tStarts[k] + sz;
    int nextQ = psl->qStarts[k+1];
    int nextT = psl->tStarts[k+1];
    int dq = nextQ - q;
    int dt = nextT - t;

    if (dq < 0 || dt < 0)
        {
        /* Next block starts before this one ends; nothing else is checked. */
        ++jumpBack;
        ++problemCount;
        continue;
        }
    if (dq > 0 && dt > 0)
        {
        ++doubleGap;
        ++problemCount;
        }
    if (dq > tinySize)
        {
        if (dq > smallSize)
            ++missLargeMiddle;
        else
            ++missSmallMiddle;
        ++problemCount;
        }
    if (dq == 0 && dt >=30)
        {
        /* Rebase so that target coordinates index straight into the dna. */
        char *dna = tSeq->dna - tOffset;
        if (!isIntron(strand, dna + t, dna + nextT))
            {
            ++weirdSplice;
            ++problemCount;
            }
        }
    }

/* One report line per alignment. */
fprintf(f, "%2d %9s %s ", problemCount, psl->qName, psl->strand);
fprintf(f, "%4dS ", missLargeStart);
fprintf(f, "%2ds ", missSmallStart);
fprintf(f, "%4dE ", missLargeEnd);
fprintf(f, "%2de ", missSmallEnd);
fprintf(f, "%2dM ", missLargeMiddle);
fprintf(f, "%2dm ", missSmallMiddle);
fprintf(f, "%2dW ", weirdSplice);
fprintf(f, "%2dG ", doubleGap);
fprintf(f, "%2dJ ", jumpBack);
fprintf(f, "\n");

/* Fold this alignment's findings, each passed through boolify(), into the
 * running totals. */
total_missSmallStart += boolify(missSmallStart);
total_missLargeStart += boolify(missLargeStart);
total_missSmallEnd += boolify(missSmallEnd);
total_missLargeEnd += boolify(missLargeEnd);
total_missSmallMiddle += boolify(missSmallMiddle);
total_missLargeMiddle += boolify(missLargeMiddle);
total_weirdSplice += boolify(weirdSplice);
total_doubleGap += boolify(doubleGap);
total_jumpBack += boolify(jumpBack);
++total_rnaCount;
if (problemCount == 0)
    ++total_rnaPerfect;
}