예제 #1
0
static void ssFindBest(struct ffAli *ffList, bioSeq *qSeq, bioSeq *tSeq,
	enum ffStringency stringency, boolean isProt, struct trans3 *t3List,
	struct ffAli **retBestAli, int *retScore, struct ffAli **retLeftovers)
/* String together blocks in alignment into chains.  Lists with fewer
 * than ten blocks are handed to ssFindBestSmall; everything else goes
 * to ssFindBestBig.  All arguments are passed through unchanged. */
{
if (ffAliCount(ffList) < 10)
    {
    ssFindBestSmall(ffList, qSeq, tSeq, stringency, isProt, t3List,
        retBestAli, retScore, retLeftovers);
    return;
    }
ssFindBestBig(ffList, qSeq, tSeq, stringency, isProt, t3List,
    retBestAli, retScore, retLeftovers);
}
예제 #2
0
static struct ffAli *trimFlakyEnds(struct dnaSeq *qSeq, struct dnaSeq *tSeq,
	struct ffAli *ffList)
/* Drop small blocks at the start and end of the list whose match score
 * (less aPenalty) does not exceed the trimGapPenalty of the gap joining
 * them to their neighbor.  Dropped blocks are freed.  Returns the
 * (possibly new) head of the list. */
{
int orientation = ffIntronOrientation(ffList);
struct ffAli *lo, *hi;
int score, penalty;

/* Nothing to trim unless there are at least two blocks. */
if (ffAliCount(ffList) < 2)
    return ffList;

/* Walk in from the front, discarding the leading block for as long
 * as the gap after it costs at least as much as the block scores. */
lo = ffList;
for (hi = ffList->right; hi != NULL; hi = hi->right)
    {
    score = ffScoreMatch(lo->nStart, lo->hStart, lo->nEnd - lo->nStart);
    score -= aPenalty(lo->nStart, lo->nEnd - lo->nStart);
    penalty = trimGapPenalty(hi->hStart - lo->hEnd, hi->nStart - lo->nEnd,
        lo->hEnd, hi->hStart, orientation);
    if (penalty < score)
        break;
    freeMem(lo);
    ffList = hi;
    hi->left = NULL;
    lo = hi;
    }

/* Same thing walking in from the back. */
hi = ffRightmost(ffList);
if (hi == ffList)
    return ffList;
for (lo = hi->left; lo != NULL; lo = lo->left)
    {
    score = ffScoreMatch(hi->nStart, hi->hStart, hi->nEnd - hi->nStart);
    score -= aPenalty(hi->nStart, hi->nEnd - hi->nStart);
    penalty = trimGapPenalty(hi->hStart - lo->hEnd, hi->nStart - lo->nEnd,
        lo->hEnd, hi->hStart, orientation);
    if (penalty < score)
        break;
    freeMem(hi);
    lo->right = NULL;
    hi = lo;
    }
return ffList;
}
void oneAli(struct ffAli *left, struct dnaSeq *otherSeq, 
	struct repeatTracker *rt, boolean isRc, enum ffStringency stringency, FILE *out)
/* Analyse one alignment and if it looks good enough write it out to file.
 *
 * left      - leftmost block of the alignment; the list is followed
 *             through the ->right links.
 * otherSeq  - sequence the block nStart/nEnd pointers point into.
 * rt        - supplies the other sequence (rt->seq) that hStart/hEnd
 *             point into, and rt->repBytes, indexed by offset into that
 *             sequence; a non-zero byte makes a match count as repMatch.
 * isRc      - if TRUE the needle start/end are flipped against
 *             otherSeq->size before output and the strand is '-'.
 * stringency - only compared against ffCdna for calcMilliBad.
 * out       - file that receives one tab-separated line on success.
 *
 * The pass/fail decision uses calcMilliBad together with the file-level
 * settings veryTight, maxBad and minBases.  A write error on out is
 * reported through perror and errAbort. */
{
struct dnaSeq *genoSeq = rt->seq;
UBYTE *repBytes = rt->repBytes;
struct ffAli *ff, *nextFf;
struct ffAli *right = ffRightmost(left);
DNA *needle = otherSeq->dna;
DNA *hay = genoSeq->dna;
int nStart = left->nStart - needle;
int nEnd = right->nEnd - needle;
int hStart = left->hStart - hay;
int hEnd = right->hEnd - hay;
int nInsertBaseCount = 0;
int nInsertCount = 0;
int hInsertBaseCount = 0;
int hInsertCount = 0;
int matchCount = 0;
int mismatchCount = 0;
int repMatch = 0;
int countNs = 0;
DNA *np, *hp, n, h;
int blockSize;
int i;
int milliBad;
int passIt;

/* Count up matches, mismatches, inserts, etc. */
for (ff = left; ff != NULL; ff = nextFf)
    {
    int blockHStart;	/* Offset of this block in hay; indexes repBytes. */
    nextFf = ff->right;
    blockSize = ff->nEnd - ff->nStart;
    np = ff->nStart;
    hp = ff->hStart;
    blockHStart = hp - hay;
    for (i=0; i<blockSize; ++i)
	{
	n = np[i];
	h = hp[i];
	if (n == 'n' || h == 'n')
	    ++countNs;
	else
	    {
	    if (n == h)
		{
		if (repBytes[i+blockHStart])
		    ++repMatch;
		else
		    ++matchCount;
		}
	    else
		++mismatchCount;
	    }
	}
    /* A gap to the next block on either side counts as one insert. */
    if (nextFf != NULL)
	{
	if (ff->nEnd != nextFf->nStart)
	    {
	    ++nInsertCount;
	    nInsertBaseCount += nextFf->nStart - ff->nEnd;
	    }
	if (ff->hEnd != nextFf->hStart)
	    {
	    ++hInsertCount;
	    hInsertBaseCount += nextFf->hStart - ff->hEnd;
	    }
	}
    }

/* See if it looks good enough to output. */
milliBad = calcMilliBad(nEnd - nStart, hEnd - hStart, nInsertCount, hInsertCount, 
	matchCount, repMatch, mismatchCount, stringency == ffCdna);
if (veryTight)
    {
    passIt = (milliBad < 60 && 
	(matchCount >= 25 || 
	 (matchCount >= 15 && matchCount + repMatch >= 50) ||
	 (matchCount >= 5 && repMatch >= 100 && milliBad < 50)));
    }
else
    {
    passIt = (milliBad < maxBad && 
	(matchCount >= minBases || 
	 (matchCount >= minBases/2 && matchCount + repMatch >= 2*minBases) ||
	 (repMatch >= 4*minBases && milliBad < (maxBad/2))));
    }
if (passIt)
    {
    if (isRc)
	{
	int temp;
	int oSize = otherSeq->size;
	temp = nStart;
	nStart = oSize - nEnd;
	nEnd = oSize - temp;
	}
    fprintf(out, "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t"
                 "%c\t"
		 "%s\t%d\t%d\t%d\t"
		 "%s\t%d\t%d\t%d\t%d\t",
	matchCount, mismatchCount, repMatch, countNs, nInsertCount, nInsertBaseCount, hInsertCount, hInsertBaseCount,
	(isRc ? '-' : '+'),
	otherSeq->name, otherSeq->size, nStart, nEnd,
	genoSeq->name, genoSeq->size, hStart, hEnd,
	ffAliCount(left));
    /* Pointer differences have type ptrdiff_t, which does not match %d;
     * convert explicitly so the argument agrees with the format. */
    for (ff = left; ff != NULL; ff = ff->right)
	fprintf(out, "%ld,", (long)(ff->nEnd - ff->nStart));
    fprintf(out, "\t");
    for (ff = left; ff != NULL; ff = ff->right)
	fprintf(out, "%ld,", (long)(ff->nStart - needle));
    fprintf(out, "\t");
    for (ff = left; ff != NULL; ff = ff->right)
	fprintf(out, "%ld,", (long)(ff->hStart - hay));
    fprintf(out, "\n");
    if (ferror(out))
	{
	perror("");
	errAbort("Write error to .psl");
	}
    }
}
예제 #4
0
파일: gfOut.c 프로젝트: kenongit/sequencing
static void savePslx(char *chromName, int chromSize, int chromOffset,
                     struct ffAli *ali, struct dnaSeq *tSeq, struct dnaSeq *qSeq,
                     boolean isRc, enum ffStringency stringency, int minMatch, FILE *f,
                     struct hash *t3Hash, boolean reportTargetStrand, boolean targetIsRc,
                     struct hash *maskHash, int minIdentity,
                     boolean qIsProt, boolean tIsProt, boolean saveSeq)
/* Tally the statistics of one alignment and, if its identity score reaches
 * minIdentity, write it to f as one psl line.  When saveSeq is TRUE the
 * query and target bytes of every block are appended as two extra columns
 * (pslX).  Target coordinates go through trans3GenoPos and are shifted by
 * chromOffset.  A write error on f is reported via perror and errAbort. */
{
    struct ffAli *blk;
    struct ffAli *lastBlk = ffRightmost(ali);
    DNA *qBase = qSeq->dna;
    DNA *tBase = tSeq->dna;
    int qStart = ali->nStart - qBase;
    int qEnd = lastBlk->nEnd - qBase;
    int tStart, tEnd;
    int qGapBases = 0, qGapCount = 0;
    int tGapBases = 0, tGapCount = 0;
    int matches = 0, mismatches = 0, repMatches = 0, nCount = 0;
    int gaps, id;
    struct trans3 *t3List = NULL;
    Bits *maskBits = NULL;

    /* Optional per-target lookups, keyed by target sequence name. */
    if (maskHash != NULL)
        maskBits = hashMustFindVal(maskHash, tSeq->name);
    if (t3Hash != NULL)
        t3List = hashMustFindVal(t3Hash, tSeq->name);
    tStart = trans3GenoPos(ali->hStart, tSeq, t3List, FALSE) + chromOffset;
    tEnd = trans3GenoPos(lastBlk->hEnd, tSeq, t3List, TRUE) + chromOffset;

    /* Per-block tallies: bases inside the block, then the gap after it. */
    for (blk = ali; blk != NULL; blk = blk->right)
    {
        struct ffAli *after = blk->right;
        DNA *q = blk->nStart;
        DNA *t = blk->hStart;
        DNA *qStop = blk->nEnd;

        for ( ; q < qStop; ++q, ++t)
        {
            DNA qc = *q;
            DNA tc = *t;
            if (qc == 'n' || tc == 'n')
                ++nCount;
            else if (qc != tc)
                ++mismatches;
            else if (maskBits == NULL)
                ++matches;
            else
            {
                /* A match on a masked target position is a repeat match. */
                int seqOff = t - tBase;
                if (bitReadOne(maskBits, seqOff))
                    ++repMatches;
                else
                    ++matches;
            }
        }

        if (after != NULL)
        {
            int nextTStart = trans3GenoPos(after->hStart, tSeq, t3List, FALSE) + chromOffset;
            int thisTEnd = trans3GenoPos(blk->hEnd, tSeq, t3List, TRUE) + chromOffset;
            int tGap = nextTStart - thisTEnd;
            int qGap = after->nStart - blk->nEnd;

            if (qGap != 0)
            {
                ++qGapCount;
                qGapBases += qGap;
            }
            if (tGap != 0)
            {
                ++tGapCount;
                tGapBases += tGap;
            }
        }
    }

    /* Identity in parts per thousand; target gaps are not counted
     * against the score when stringency is ffCdna. */
    gaps = qGapCount + (stringency == ffCdna ? 0 : tGapCount);
    id = roundingScale(1000, matches + repMatches - 2*gaps, matches + repMatches + mismatches);
    if (id < minIdentity)
        return;

    /* Flip coordinates for reverse-complemented query and/or target. */
    if (isRc)
    {
        int qSize = qSeq->size;
        int flippedStart = qSize - qEnd;
        qEnd = qSize - qStart;
        qStart = flippedStart;
    }
    if (targetIsRc)
    {
        int flippedStart = chromSize - tEnd;
        tEnd = chromSize - tStart;
        tStart = flippedStart;
    }

    fprintf(f, "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c",
            matches, mismatches, repMatches, nCount,
            qGapCount, qGapBases, tGapCount, tGapBases,
            (isRc ? '-' : '+'));
    if (reportTargetStrand)
        fprintf(f, "%c", (targetIsRc ? '-' : '+') );
    fprintf(f, "\t%s\t%d\t%d\t%d\t"
            "%s\t%d\t%d\t%d\t%d\t",
            qSeq->name, qSeq->size, qStart, qEnd,
            chromName, chromSize, tStart, tEnd,
            ffAliCount(ali));

    /* Comma-separated block sizes, query starts, target starts. */
    for (blk = ali; blk != NULL; blk = blk->right)
        fprintf(f, "%ld,", (long)(blk->nEnd - blk->nStart));
    fprintf(f, "\t");
    for (blk = ali; blk != NULL; blk = blk->right)
        fprintf(f, "%ld,", (long)(blk->nStart - qBase));
    fprintf(f, "\t");
    for (blk = ali; blk != NULL; blk = blk->right)
        fprintf(f, "%d,", trans3GenoPos(blk->hStart, tSeq, t3List, FALSE) + chromOffset);

    if (saveSeq)
    {
        /* Query block bytes, then target block bytes. */
        fputc('\t', f);
        for (blk = ali; blk != NULL; blk = blk->right)
        {
            mustWrite(f, blk->nStart, blk->nEnd - blk->nStart);
            fputc(',', f);
        }
        fputc('\t', f);
        for (blk = ali; blk != NULL; blk = blk->right)
        {
            mustWrite(f, blk->hStart, blk->hEnd - blk->hStart);
            fputc(',', f);
        }
    }
    fprintf(f, "\n");
    if (ferror(f))
    {
        perror("");
        errAbort("Write error to .psl");
    }
}
예제 #5
0
static struct ssGraph *ssGraphMake(struct ffAli *ffList, bioSeq *qSeq,
	enum ffStringency stringency, boolean isProt, struct trans3 *t3List)
/* Make a graph corresponding to ffList.
 *
 * Each block of ffList becomes nodes[1..nodeCount], in list order;
 * nodes[0] carries no block and serves as the source of the first
 * ("start") edge of every node.  For each block 'mid', an edge is also
 * added from every earlier block in the list that can precede it.
 * Edge scores are the score of arriving at mid, less the gap penalty
 * from ffCalcGapPenalty, adjusted when the two blocks overlap.
 * When t3List is non-NULL, followability and haystack offsets are
 * obtained through tripleCanFollow/trans3Offsets instead of raw
 * pointer comparison.  Returns the newly allocated graph. */
{
int nodeCount = ffAliCount(ffList);
/* One start edge per node plus at most one edge per earlier/later
 * pair of nodes: n + n*(n-1)/2 = n*(n+1)/2. */
int maxEdgeCount = (nodeCount+1)*(nodeCount)/2;
int edgeCount = 0;
struct ssEdge *edges, *e;
struct ssNode *nodes;
struct ssGraph *graph;
struct ffAli *ff, *mid;
int i, midIx;
int overlap;
boolean canFollow;

if (nodeCount == 1)
    maxEdgeCount = 1;
    
AllocVar(graph);
graph->nodeCount = nodeCount;
/* Extra slot because index 0 is reserved for the start node. */
graph->nodes = AllocArray(nodes, nodeCount+1);
for (i=1, ff = ffList; i<=nodeCount; ++i, ff = ff->right)
    {
    nodes[i].ff = ff;
    nodes[i].nodeScore = bioScoreMatch(isProt, ff->nStart, ff->hStart, ff->hEnd - ff->hStart);
    }

graph->edges = AllocArray(edges, maxEdgeCount);
for (mid = ffList, midIx=1; mid != NULL; mid = mid->right, ++midIx)
    {
    int midScore;
    struct ssNode *midNode = &nodes[midIx];
    /* Start edge: reach mid directly from nodes[0] at mid's own score. */
    e = &edges[edgeCount++];
    assert(edgeCount <= maxEdgeCount);
    e->nodeIn = &nodes[0];
    e->score = midScore = midNode->nodeScore;
    midNode->waysIn = e;
    /* Consider every block that comes before mid in the list. */
    for (ff = ffList,i=1; ff != mid; ff = ff->right,++i)
	{
	int mhStart = 0, mhEnd = 0;
	if (t3List)
	    {
	    canFollow = tripleCanFollow(ff, mid, qSeq, t3List);
	    trans3Offsets(t3List, mid->hStart, mid->hEnd, &mhStart, &mhEnd);
	    }
	else 
	    {
	    /* ff must start and end strictly before mid on both sequences. */
	    canFollow = (ff->nStart < mid->nStart && ff->nEnd < mid->nEnd 
			&& ff->hStart < mid->hStart && ff->hEnd < mid->hEnd);
	    }
	if (canFollow)
	    {
	    struct ssNode *ffNode = &nodes[i];
	    int score;
	    int hGap;
	    int nGap;
	    int crossover;

	    nGap = mid->nStart - ff->nEnd;
	    if (t3List)
	        {
		int fhStart, fhEnd;
		trans3Offsets(t3List, ff->hStart, ff->hEnd, &fhStart, &fhEnd);
		hGap = mhStart - fhEnd;
		}
	    else
		{
		hGap = mid->hStart - ff->hEnd;
		}
	    e = &edges[edgeCount++];
	    assert(edgeCount <= maxEdgeCount);
	    e->nodeIn = ffNode;
	    /* A negative needle gap means ff runs past the start of mid. */
	    e->overlap = overlap = -nGap;
	    if (overlap > 0)
		{
		/* Blocks overlap: pick a crossover point with findCrossover,
		 * rescore both blocks around it, and close up the gaps by
		 * the overlap before the gap penalty is applied. */
		int midSize = mid->hEnd - mid->hStart;
		int ffSize = ff->hEnd - ff->hStart;
		int newMidScore, newFfScore;
		e->crossover = crossover = findCrossover(ff, mid, overlap, isProt);
		newMidScore = bioScoreMatch(isProt, mid->nStart, mid->hStart, midSize-overlap+crossover);
		newFfScore = bioScoreMatch(isProt, ff->nStart+crossover, ff->hStart+crossover,
				ffSize-crossover);
		score = newMidScore - ffNode->nodeScore + newFfScore;
		nGap = 0;
		hGap -= overlap;
		}
	    else
		{
		score = midScore;
		}
	    score -= ffCalcGapPenalty(hGap, nGap, stringency);
	    e->score = score;
	    slAddHead(&midNode->waysIn, e);
	    }
	}
    /* Edges were pushed onto the head of waysIn; reversing presumably
     * restores creation order with the start edge first -- this relies
     * on slAddHead/slReverse list semantics not visible here. */
    slReverse(&midNode->waysIn);
    }
return graph;
}
예제 #6
0
static boolean jiggleSmallExons(struct ffAli *ali, struct dnaSeq *nSeq, struct dnaSeq *hSeq)
/* For each interior block shorter than 10 bases with more than one base
 * of gap on both sides, look for a spot between its neighbors where the
 * block's needle bases sit next to splice-site letters, and move the
 * block's haystack position there.  Calls ffSlideIntrons and returns
 * TRUE if any block was moved. */
{
/* How many splice-site letters to require on the {left, right} side,
 * tried from most to least demanding. */
static int creeps[4][2] = { {2, 2}, {2, 1}, {1, 2}, {1, 1},};
struct ffAli *prev, *cur, *next;
int orient;
boolean moved = FALSE;

if (ffAliCount(ali) < 3)
    return FALSE;
orient = ffIntronOrientation(ali);

/* Slide a three-block window along the list; cur is the candidate. */
prev = ali;
cur = prev->right;
for (next = cur->right; next != NULL; prev = cur, cur = next, next = next->right)
    {
    int exonSize = cur->nEnd - cur->nStart;
    if (exonSize < 10 && cur->hStart - prev->hEnd > 1 && next->hStart - cur->hEnd > 1)
        {
        DNA padded[10+2+2];    /* Block bases plus up to 2 letters each side. */
        DNA *leftSite = (orient >= 0 ? "ag" : "ac");
        DNA *rightSite = (orient >= 0 ? "gt" : "ct");
        DNA *hs = cur->hStart, *he = cur->hEnd;
        DNA *searchStart = prev->hEnd, *searchEnd = next->hStart;
        int tryIx;

        for (tryIx = 0; tryIx < 4; ++tryIx)
            {
            int padL = creeps[tryIx][0];
            int padR = creeps[tryIx][1];
            DNA *hit;

            /* Current placement already has the letters this attempt
             * asks for: leave the block where it is. */
            if (hs[-1] == leftSite[1] && he[0] == rightSite[0]
                && (padL == 1 || hs[-2] == leftSite[0])
                && (padR == 1 || he[1] == rightSite[1]))
                break;

            /* Build <left letters><block bases><right letters> and look
             * for it between the neighboring blocks. */
            memcpy(padded, leftSite + 2 - padL, padL);
            memcpy(padded + padL, cur->nStart, exonSize);
            memcpy(padded + padL + exonSize, rightSite, padR);
            hit = memMatch(padded, exonSize + padR + padL, searchStart, searchEnd - searchStart);
            if (hit != NULL)
                {
                cur->hStart = hit + padL;
                cur->hEnd = cur->hStart + (he - hs);
                moved = TRUE;
                break;
                }
            }
        }
    }
if (moved)
    ffSlideIntrons(ali);
return moved;
}