Ejemplo n.º 1
0
boolean fastFind(DNA *needle, int needleSize, 
    struct patSpace *ps, struct ffAli **retAli, boolean *retRc, int *retScore)
/* Do fast alignment of needle against the sequences indexed by ps.
 * Both strands are tried: the needle is reverse-complemented in place 
 * for the second pass and put back afterwards.  Every clump reported 
 * by patSpaceFindOne is aligned with ffFind and scored with ffScoreCdna.
 * The candidates are then sorted with cmpAliList and the one that ends 
 * up first is handed back in *retAli/*retRc/*retScore; all other 
 * alignments are freed.
 * Returns TRUE if at least one alignment was found, FALSE otherwise 
 * (in which case the ret* outputs are not touched). */
{
struct aliList *hits = NULL, *hit, *best;
struct patClump *clumps, *cl;
DNA *needleEnd = needle + needleSize;
int strand;

for (strand = 0; strand < 2; ++strand)
    {
    /* Second pass works on the reverse complement. */
    if (strand)
        reverseComplement(needle, needleSize);

    clumps = patSpaceFindOne(ps, needle, needleSize);
    for (cl = clumps; cl != NULL; cl = cl->next)
        {
        DNA *regionStart = cl->seq->dna + cl->start;
        struct ffAli *found = ffFind(needle, needleEnd, 
            regionStart, regionStart + cl->size, ffCdna);
        if (found == NULL)
            continue;
        AllocVar(hit);
        hit->ali = found;
        hit->score = ffScoreCdna(found);
        hit->isRc = strand;
        slAddHead(&hits, hit);
        }
    slFreeList(&clumps);

    /* Restore the caller's needle to its original orientation. */
    if (strand)
        reverseComplement(needle, needleSize);
    }

if (hits == NULL)
    return FALSE;

slSort(&hits, cmpAliList);
best = hits;
*retAli = best->ali;
best->ali = NULL;       /* Ownership of the winner passes to the caller. */
*retRc = best->isRc;
*retScore = best->score;
for (hit = best->next; hit != NULL; hit = hit->next)
    ffFreeAli(&hit->ali);
slFreeList(&hits);
return TRUE;
}
Ejemplo n.º 2
0
void refineAlis(struct cdnaInfo *ci, struct dnaSeq *cdnaSeq)
/* Turn ci->roughAli into ci->fineAli.  Refine alignment. */
{
struct roughAli *ra;
struct fineAli *fa;
struct ffAli *ffAli = NULL;
DNA *unpacked;
int outerStart, outerEnd;
int score;
boolean isRc;
boolean leftCruddyCount;
boolean rightCruddyCount;
DNA *hayStart;
int hayLen;
DNA *hayEnd;
int gStart, gEnd;
boolean badFind;

sortRoughAlis(&ci->roughAli);
flagDupeRoughAlis(ci->roughAli);

for (ra = ci->roughAli; ra != NULL; ra = ra->next)
    {
    int bestScore = -0x7fffffff;
    int oldBestScore = -0x7fffffff;
    int oldScore;
    struct ffAli *bestAli = NULL;
    boolean bestIsRc = FALSE;

    if (ra->isDupe)
        continue;
    /* If score is less than 1/8 of cdna size, don't bother
     * with further processing. */
    if (ra->score < ci->baseCount/8)
        {
        continue;
        }

    gStart = ra->gStart - 16;
    gEnd = ra->gEnd + 16;

    /* Unpack dna, including extra at either end. */
    fetchUnpacked(ra->chromIx, ra->gStart, ra->gEnd, 15250, 
        &unpacked, &outerStart, &outerEnd);

    badFind = FALSE;    
    for (;;)
        {
        clipEnds(outerStart, &gStart, &gEnd, outerEnd);
        hayStart = unpacked + gStart-outerStart;
        hayLen = gEnd - gStart;
        hayEnd = hayStart + hayLen;
        if (!ffFindEitherStrandN(cdnaSeq->dna, cdnaSeq->size, hayStart, hayLen,
            ffCdna, &ffAli, &isRc))
            {
            if (ra->score > 20 && ra->score > oldBestScore + 5 && ra == ci->roughAli)
                {
                if (badFind)
                    {
                    warn("%s - still couldn't ffFind after expansion",
                        ci->name);
                    break;
                    }
                warn("%s Couldn't ffFind %s (%d bases %d score %d bestScore) in chromosome %s %d-%d", 
                    ci->name, ci->name, ci->baseCount, ra->score, oldBestScore, chromNames[ra->chromIx], gStart, gEnd);
                addRedoHash(ci, "ffFind");
                }
            else
                break;
            badFind = TRUE;
            gStart -= 500;
            gEnd += 500;
            continue;
            }
        if (isRc)
            reverseComplement(cdnaSeq->dna, cdnaSeq->size);
        score = scoreExonAli(ffAli);
        oldScore = ffScoreCdna(ffAli);
        leftCruddyCount = leftFlakySize(ffAli, cdnaSeq->dna, cdnaSeq->size);
        rightCruddyCount = rightFlakySize(ffAli, cdnaSeq->dna, cdnaSeq->size) -
            polyaSize(cdnaSeq->dna, cdnaSeq->size);
        if (isRc)
            reverseComplement(cdnaSeq->dna, cdnaSeq->size);
        if (score <= bestScore)
            {
            ffFreeAli(&ffAli);
            break;
            }
        bestScore = score;
        oldBestScore = oldScore;
        ffFreeAli(&bestAli);
        bestAli = ffAli;
        bestIsRc = isRc;
        if (leftCruddyCount <= 0 && rightCruddyCount <= 0)
            break;
        if (gStart == outerStart || gEnd == outerEnd)
            break;
        if (leftCruddyCount < 16)
            gStart -= 2*leftCruddyCount;
        else
            gStart -= 5000;
        if (rightCruddyCount > 0)
            {
            if (rightCruddyCount < 16)
                gEnd += 2*rightCruddyCount;
            else
                gEnd += 5000;
            }
        }
    if (bestAli != NULL)
        {
        AllocVar(fa);
        fa->chromIx = ra->chromIx;
        fa->isRc = bestIsRc;
        fa->score = bestScore;
        fa->blocks = bestAli;
        fa->virtNeedle = cdnaSeq->dna;
        fa->virtHaystack = unpacked - outerStart;
        findAliEnds(fa->blocks, fa->virtNeedle, fa->virtHaystack,
            &fa->nStart, &fa->nEnd, &fa->hStart, &fa->hEnd);
        findClosestGene(chromNames[fa->chromIx], fa->hStart, fa->hEnd,
            (fa->isRc ? '-' : '+'), &fa->geneName, &fa->geneStart, &fa->geneEnd);
        fa->isBackwards = correctIsBackwards(ci->isBackwards, fa->isRc, fa->blocks, cdnaSeq->name);
        fa->next = ci->fineAli;
        ci->fineAli = fa;
        }
    freez(&unpacked);
    }
slReverse(&ci->fineAli);
sortFineAlis(&ci->fineAli);
flagDupeFineAlis(ci->fineAli);
if (weAreWeb())
    hyperReportAlis(ci);
else
    printf("%d %s\n", ci->ix, ci->name);
slFreeList(&ci->roughAli);
}
void glueFindOne(struct patSpace *ps, DNA *cdna, int cdnaSize, 
    char strand, char dir, char *cdnaName, struct cdnaAliList **pList)
/* Find occurrences of DNA in patSpace and print to hitOut. 
 * Each clump returned by patSpaceFindOne is aligned with ffFind.  
 * Alignments scoring at least 22 are trimmed with solidMatch; if the 
 * score per base of the solid part exceeds 0.25 a line is written to 
 * hitOut and a new cdnaAliList entry is pushed onto *pList.
 * The ffSubmitted/ffAccepted/ffOkScore/ffSolidMatch counters are 
 * bumped as each stage is passed. */
{
struct patClump *clumps = patSpaceFindOne(ps, cdna, cdnaSize);
struct patClump *cl;
DNA *cdnaEnd = cdna + cdnaSize;

for (cl = clumps; cl != NULL; cl = cl->next)
    {
    struct dnaSeq *target = cl->seq;
    DNA *winStart = target->dna + cl->start;
    struct ffAli *aligned, *leftBlock, *rightBlock;
    int rawStart, rawEnd;       /* cDNA extent before solidMatch. */
    int trimStart, trimEnd;     /* cDNA extent after solidMatch. */
    int aliScore;

    ++ffSubmitted;
    aligned = ffFind(cdna, cdnaEnd, winStart, winStart + cl->size, ffCdna);
    if (aligned == NULL)
        continue;

    aliScore = ffScoreCdna(aligned);
    ++ffAccepted;
    if (aliScore >= 22)
        {
        ffFindEnds(aligned, &leftBlock, &rightBlock);
        rawStart = leftBlock->nStart - cdna;
        rawEnd = rightBlock->nEnd - cdna;
        trimStart = rawStart;
        trimEnd = rawEnd;
        ++ffOkScore;

        if (solidMatch(&leftBlock, &rightBlock, cdna, &trimStart, &trimEnd))
            {
            int solidLen = trimEnd - trimStart;
            int solidScore = scoreCdna(leftBlock, rightBlock);
            double perBase = (double)solidScore/solidLen;

            if (perBase > 0.25)
                {
                struct cdnaAliList *cal;
                int tStart = leftBlock->hStart - target->dna;
                int tEnd = rightBlock->hEnd - target->dna;

                ++ffSolidMatch;
                fprintf(hitOut, "%3.1f%% %c %s:%d-%d (old %d-%d) of %d at %s.%d:%d-%d\n", 
                    100.0 * perBase, strand, cdnaName, 
                    trimStart, trimEnd, rawStart, rawEnd, cdnaSize,
                    target->name, cl->seqIx, tStart, tEnd);

                /* Build the node first; only a plain variable is 
                 * handed to slAddHead. */
                cal = newCal(cl->bacIx, cl->seqIx, trimStart, trimEnd, 
                    cdnaSize, strand, dir, perBase);
                slAddHead(pList, cal);
                }
            }
        }
    ffFreeAli(&aligned);
    }
slFreeList(&clumps);
}
Ejemplo n.º 4
0
void writeClump(struct blockPos *first, struct blockPos *last,
    char *cdnaName, char strand, char dir, DNA *cdna, int cdnaSize, struct cdnaAliList **pList)
/* Write hitOut one clump. 
 * The clump spans from first->offset to the end of last within 
 * first->seq; it is padded by minMatch*patSize bases on both sides 
 * (clamped to the sequence) and the cDNA is aligned against that 
 * window with ffFind.  Alignments scoring at least 22 are trimmed 
 * with solidMatch; if the score per base of the solid part exceeds 
 * 0.25 a line is written to hitOut and a new cdnaAliList entry is 
 * pushed onto *pList.  When dumpMe is set, diagnostics go to dumpOut 
 * and the alignment is also written to bigHtmlFile/littleHtmlFile. */
{
struct dnaSeq *seq = first->seq;
char *bacName = seq->name;
int seqIx = first->seqIx;
int start = first->offset;
int end = last->offset+last->size;
struct ffAli *ff, *left, *right;
int extraAtEnds = minMatch*patSize;
struct cdnaAliList *cal;

/* Pad the window, keeping it inside the sequence. */
start -= extraAtEnds;
if (start < 0)
    start = 0;
end += extraAtEnds;
if (end >seq->size)
    end = seq->size;

++ffSubmitted;
if (dumpMe)
    fprintf(dumpOut, "%s %d %s %d-%d\n", cdnaName, cdnaSize, bacName, start, end);
ff = ffFind(cdna, cdna+cdnaSize, seq->dna+start, seq->dna+end, ffCdna);
if (dumpMe)
    {
    /* A pointer must be printed with %p and passed as void *. */
    fprintf(dumpOut, "ffFind = %p\n", (void *)ff);
    }
if (ff != NULL)
    {
    int ffScore = ffScoreCdna(ff);
    ++ffAccepted;
    if (dumpMe) fprintf(dumpOut, "ffScore = %d\n", ffScore);
    if (ffScore >= 22)
        {
        int hiStart, hiEnd;     /* cDNA extent, possibly trimmed by solidMatch. */
        int oldStart, oldEnd;   /* cDNA extent before trimming. */

        ffFindEnds(ff, &left, &right);
        hiStart = oldStart = left->nStart - cdna;
        hiEnd = oldEnd = right->nEnd - cdna;
        ++ffOkScore;

        if (solidMatch(&left, &right, cdna, &hiStart, &hiEnd))
            {
            int solidSize = hiEnd - hiStart;
            int solidScore;
            int seqStart, seqEnd;
            double cookedScore;

            solidScore = scoreCdna(left, right);
            cookedScore = (double)solidScore/solidSize;
            if (cookedScore > 0.25)
                {
                ++ffSolidMatch;

                seqStart = left->hStart - seq->dna;
                seqEnd = right->hEnd - seq->dna;
                fprintf(hitOut, "%3.1f%% %c %s:%d-%d (old %d-%d) of %d at %s.%d:%d-%d\n", 
                    100.0 * cookedScore, strand, cdnaName, 
                    hiStart, hiEnd, oldStart, oldEnd, cdnaSize,
                    bacName, seqIx, seqStart, seqEnd);

                if (dumpMe)
                    {
                    /* Full alignment under an anchor in the big file... */
                    fprintf(bigHtmlFile, "<A NAME=i%d>", htmlIx);
                    fprintf(bigHtmlFile, "<H2>%4.1f%% %4d %4d %c %s:%d-%d of %d at %s.%d:%d-%d</H2><BR>", 
                        100.0 * cookedScore, solidScore, ffScore, strand, cdnaName, 
                        hiStart, hiEnd, cdnaSize,
                        bacName, seqIx, seqStart, seqEnd);
                    fprintf(bigHtmlFile, "</A>");
                    ffShAli(bigHtmlFile, ff, cdnaName, cdna, cdnaSize, 0,
                        bacName, seq->dna+start, end-start, start, FALSE);
                    fprintf(bigHtmlFile, "<BR><BR>\n");

                    /* ...and a one-line link to that anchor in the little file. */
                    fprintf(littleHtmlFile, "<A HREF=\"patAli.html#i%d\">", htmlIx);
                    fprintf(littleHtmlFile, "%4.1f%% %4d %4d %c %s:%d-%d of %d at %s.%d:%d-%d\n", 
                        100.0 * cookedScore, solidScore, ffScore, strand, cdnaName, 
                        hiStart, hiEnd, cdnaSize,
                        bacName, seqIx, seqStart, seqEnd);
                    fprintf(littleHtmlFile, "</A><BR>");
                    ++htmlIx;
                    }

                cal = newCal(first->bacIx, seqIx, hiStart, hiEnd, cdnaSize, strand, dir, cookedScore);
                slAddHead(pList, cal);
                }
            }
        }
    ffFreeAli(&ff);
    }
}
Ejemplo n.º 5
0
/* Align every EST in a fasta file against a genomic target.
 * Usage: three arguments - EST fasta file, target fasta file, ooc file.
 * The target is indexed with makePatSpace; each EST is then looked up 
 * on both strands, every clump found is aligned with ffFind and the 
 * resulting blocks are printed to stdout.  Progress messages go to 
 * stderr.  Returns 0 on completion. */
int main(int argc, char *argv[])
{
    char *estName, *targetName, *oocName;
    FILE *estFile;
    struct dnaSeq *target;
    struct dnaSeq *est;
    struct patSpace *ps;
    struct patClump *clumpList, *clump;
    int estIx = 0;

    /* Check command line arguments and assign to local variables. */
    if (argc != 4)
        usage();
    estName = argv[1];
    estFile = mustOpen(estName, "rb");
    targetName = argv[2];
    oocName = argv[3];

    /* Read in target DNA from fasta files and check not too big. */
    fprintf(stderr, "Reading %s\n", targetName);
    target = faReadAllDna(targetName);
    if (totalSequenceSize(target) > 8000000)
    {
        errAbort("Can only handle 8000000 bases of genomic sequence at once, %s has %d.",
                 targetName, totalSequenceSize(target));
    }

    /* Make a pattern space index structure. */
    fprintf(stderr, "Making Pattern Space index\n");
    ps = makePatSpace(&target, 1, oocName, 4, 32000);

    /* Loop through each EST in query list. */
    printf("Searching for hits\n\n");
    while (faReadNext(estFile, NULL, TRUE, NULL, &est))
    {
        boolean isRc;   /* Reverse complemented? */

        if (++estIx % 5000 == 0)
            fprintf(stderr, "Processing EST %d\n", estIx);
        if (est->size > 20000)
        {
            warn("Very large EST sequence %s.\n"
                 "Maybe you mixed up the EST and genomic parameters?", est->name);
            usage();
        }

        for (isRc = 0; isRc <= 1; ++isRc)   /* Search both strands. */
        {
            /* The EST is flipped in place for the second pass; it is 
             * freed right after, so it is not flipped back. */
            if (isRc)
                reverseComplement(est->dna, est->size);
            clumpList = patSpaceFindOne(ps, est->dna, est->size);

            /* For each homology clump patSpace finds, do a fuzzyFinder
             * alignment of it and print the results. */
            for (clump = clumpList; clump != NULL; clump = clump->next)
            {
                struct ffAli *ali, *a;
                int score;
                struct dnaSeq *t = clump->seq;
                DNA *tStart = t->dna + clump->start;

                ali = ffFind(est->dna, est->dna+est->size, tStart, tStart + clump->size, ffCdna);
                if (ali != NULL)
                {
                    score = ffScoreCdna(ali);
                    /* Uses the strand loop's isRc; the reverse-complement 
                     * pass is reported as '-'. */
                    printf("%s hits %s strand %c score %d\n",
                           est->name, t->name, (isRc ? '-' : '+'), score);
                    for (a = ali; a != NULL; a = a->right)
                    {
                        /* Pointer differences are not int; cast to 
                         * match the %d conversions. */
                        printf("  Q %4d - %4d\t T %4d -%4d\n",
                               (int)(a->nStart - est->dna), (int)(a->nEnd - est->dna),
                               (int)(a->hStart - t->dna), (int)(a->hEnd - t->dna));
                    }
                    printf("\n");
                    ffFreeAli(&ali);
                }
                else
                {
                    printf("Couldn't align clump at %s %d-%d\n",
                           t->name, clump->start, clump->start + clump->size);
                }
            }
            slFreeList(&clumpList);
        }
        freeDnaSeq(&est);
    }
    /* Clean up time. */
    fclose(estFile);
    freePatSpace(&ps);
    freeSeqList(&target);
    return 0;
}