void ssFfItemFree(struct ssFfItem **pEl)
/* Release one ssFfItem along with the ffAli it holds in its ff field.
 * A NULL *pEl is silently ignored.  The item itself is released by
 * handing pEl to freez. */
{
struct ssFfItem *item = *pEl;

if (item == NULL)
    return;
ffFreeAli(&item->ff);
freez(pEl);
}
void freeCdnaAliList(struct cdnaAli **pList)
/* Dispose of a whole cdnaAli list.  The alignment (ali) and cDNA
 * sequence (cdna) hanging off each node are freed first, then the
 * nodes themselves are released with slFreeList. */
{
struct cdnaAli *node = *pList;

while (node != NULL)
    {
    ffFreeAli(&node->ali);
    freeDnaSeq(&node->cdna);
    node = node->next;
    }
slFreeList(pList);
}
boolean fastFind(DNA *needle, int needleSize, struct patSpace *ps, struct ffAli **retAli, boolean *retRc, int *retScore)
/* Do fast alignment of needle against everything indexed in ps.
 * Both strands are tried: the needle is reverse complemented in place
 * for the second pass and flipped back afterwards.  Every clump that
 * patSpaceFindOne reports is aligned with ffFind; successful alignments
 * are collected, sorted with cmpAliList, and the one that sorts first
 * is handed back through retAli/retRc/retScore (caller owns *retAli).
 * All other alignments are freed.
 * Returns TRUE if at least one alignment was found.  On FALSE the
 * ret* outputs are left untouched. */
{
struct aliList *candidates = NULL, *cand;
boolean strand;

for (strand = 0; strand <= 1; ++strand)
    {
    struct patClump *clumps, *cl;

    if (strand)
        reverseComplement(needle, needleSize);
    clumps = patSpaceFindOne(ps, needle, needleSize);
    if (clumps != NULL)
        {
        for (cl = clumps; cl != NULL; cl = cl->next)
            {
            DNA *hayStart = cl->seq->dna + cl->start;
            struct ffAli *hit = ffFind(needle, needle+needleSize,
                hayStart, hayStart + cl->size, ffCdna);
            if (hit == NULL)
                continue;
            AllocVar(cand);
            cand->ali = hit;
            cand->score = ffScoreCdna(hit);
            cand->isRc = strand;
            slAddHead(&candidates, cand);
            }
        slFreeList(&clumps);
        }
    /* Restore the caller's needle to its original orientation. */
    if (strand)
        reverseComplement(needle, needleSize);
    }

if (candidates == NULL)
    return FALSE;

slSort(&candidates, cmpAliList);

/* Hand the head's alignment to the caller and detach it from the
 * list node so only the remaining alignments need freeing below. */
*retAli = candidates->ali;
candidates->ali = NULL;
*retRc = candidates->isRc;
*retScore = candidates->score;
for (cand = candidates->next; cand != NULL; cand = cand->next)
    ffFreeAli(&cand->ali);
slFreeList(&candidates);
return TRUE;
}
static struct ffAli *forceMonotonic(struct ffAli *aliList, struct dnaSeq *qSeq, struct dnaSeq *tSeq, enum ffStringency stringency, boolean isProt, struct trans3 *t3List)
/* Remove any blocks that violate strictly increasing order in both
 * coordinates.  Protein alignments, and lists that isMonotonic already
 * accepts, are returned unchanged.  Otherwise the list is passed
 * through ssFindBestBig, the blocks it leaves over are freed, and the
 * list it produced is returned.
 * This is not optimal, but it turns out to be very rarely used. */
{
struct ffAli *leftovers = NULL;
int score;

if (isProt || isMonotonic(aliList))
    return aliList;

ssFindBestBig(aliList, qSeq, tSeq, stringency, isProt, t3List,
    &aliList, &score, &leftovers);
ffFreeAli(&leftovers);
return aliList;
}
void refineAlis(struct cdnaInfo *ci, struct dnaSeq *cdnaSeq) /* Turn ci->roughAli into ci->fineAli. Refine alignment. */ { struct roughAli *ra; struct fineAli *fa; struct ffAli *ffAli = NULL; DNA *unpacked; int outerStart, outerEnd; int score; boolean isRc; boolean leftCruddyCount; boolean rightCruddyCount; DNA *hayStart; int hayLen; DNA *hayEnd; int gStart, gEnd; boolean badFind; sortRoughAlis(&ci->roughAli); flagDupeRoughAlis(ci->roughAli); for (ra = ci->roughAli; ra != NULL; ra = ra->next) { int bestScore = -0x7fffffff; int oldBestScore = -0x7fffffff; int oldScore; struct ffAli *bestAli = NULL; boolean bestIsRc = FALSE; if (ra->isDupe) continue; /* If score is less than 1/8 of cdna size, don't bother * with further processing. */ if (ra->score < ci->baseCount/8) { continue; } gStart = ra->gStart - 16; gEnd = ra->gEnd + 16; /* Unpack dna, including extra at either end. */ fetchUnpacked(ra->chromIx, ra->gStart, ra->gEnd, 15250, &unpacked, &outerStart, &outerEnd); badFind = FALSE; for (;;) { clipEnds(outerStart, &gStart, &gEnd, outerEnd); hayStart = unpacked + gStart-outerStart; hayLen = gEnd - gStart; hayEnd = hayStart + hayLen; if (!ffFindEitherStrandN(cdnaSeq->dna, cdnaSeq->size, hayStart, hayLen, ffCdna, &ffAli, &isRc)) { if (ra->score > 20 && ra->score > oldBestScore + 5 && ra == ci->roughAli) { if (badFind) { warn("%s - still couldn't ffFind after expansion", ci->name); break; } warn("%s Couldn't ffFind %s (%d bases %d score %d bestScore) in chromosome %s %d-%d", ci->name, ci->name, ci->baseCount, ra->score, oldBestScore, chromNames[ra->chromIx], gStart, gEnd); addRedoHash(ci, "ffFind"); } else break; badFind = TRUE; gStart -= 500; gEnd += 500; continue; } if (isRc) reverseComplement(cdnaSeq->dna, cdnaSeq->size); score = scoreExonAli(ffAli); oldScore = ffScoreCdna(ffAli); leftCruddyCount = leftFlakySize(ffAli, cdnaSeq->dna, cdnaSeq->size); rightCruddyCount = rightFlakySize(ffAli, cdnaSeq->dna, cdnaSeq->size) - polyaSize(cdnaSeq->dna, cdnaSeq->size); if (isRc) 
reverseComplement(cdnaSeq->dna, cdnaSeq->size); if (score <= bestScore) { ffFreeAli(&ffAli); break; } bestScore = score; oldBestScore = oldScore; ffFreeAli(&bestAli); bestAli = ffAli; bestIsRc = isRc; if (leftCruddyCount <= 0 && rightCruddyCount <= 0) break; if (gStart == outerStart || gEnd == outerEnd) break; if (leftCruddyCount < 16) gStart -= 2*leftCruddyCount; else gStart -= 5000; if (rightCruddyCount > 0) { if (rightCruddyCount < 16) gEnd += 2*rightCruddyCount; else gEnd += 5000; } } if (bestAli != NULL) { AllocVar(fa); fa->chromIx = ra->chromIx; fa->isRc = bestIsRc; fa->score = bestScore; fa->blocks = bestAli; fa->virtNeedle = cdnaSeq->dna; fa->virtHaystack = unpacked - outerStart; findAliEnds(fa->blocks, fa->virtNeedle, fa->virtHaystack, &fa->nStart, &fa->nEnd, &fa->hStart, &fa->hEnd); findClosestGene(chromNames[fa->chromIx], fa->hStart, fa->hEnd, (fa->isRc ? '-' : '+'), &fa->geneName, &fa->geneStart, &fa->geneEnd); fa->isBackwards = correctIsBackwards(ci->isBackwards, fa->isRc, fa->blocks, cdnaSeq->name); fa->next = ci->fineAli; ci->fineAli = fa; } freez(&unpacked); } slReverse(&ci->fineAli); sortFineAlis(&ci->fineAli); flagDupeFineAlis(ci->fineAli); if (weAreWeb()) hyperReportAlis(ci); else printf("%d %s\n", ci->ix, ci->name); slFreeList(&ci->roughAli); }
void aliTrack(char *bacAcc, char *wholeName, char *partsName, struct memGfx *mg, int x, int y, FILE *mapFile, int trim, char *repeatMask) /* Write out one alignment track. */ { struct dnaSeq *whole, *partList, *part; bits16 contig; int maxBlockSize = 5000; int wholeSize; struct patSpace *ps; DNA *wholeDna; whole = faReadAllDna(wholeName); if (slCount(whole) > 1) warn("%d sequences in %s, only using first", slCount(whole), wholeName); wholeDna = whole->dna; wholeSize = whole->size; ps = makePatSpace(&whole, 1, oocFile, 5, 500); partList = faReadAllDna(partsName); printf("%d contigs in %s\n\n", slCount(partList), partsName); for (part = partList, contig = 0; part != NULL; part = part->next, ++contig) { DNA *dna = part->dna; int dnaSize = part->size; int start, size; int subIx = 0; char numText[12]; Color color = blockColors[contig%ArraySize(blockColors)]; sprintf(numText, "%d", contig+1); for (start = trim; start < dnaSize-trim; start += size) { struct ffAli *left, *right; boolean rc; int score; size = dnaSize - start-trim; if (size > maxBlockSize) size = maxBlockSize; if (!fastFind(dna+start, size, ps, &left, &rc, &score) ) { printf("Contig %d.%d:%d-%d of %d UNALIGNED\n", contig+1, subIx, start, start+size, dnaSize); } else { int x1, x2; int xo, w; double quality; int qStart, qSize, tStart,tSize; char qualityString[40]; right = left; while (right->right != NULL) right = right->right; qStart = left->nStart - dna; qSize = right->nEnd - left->nStart; if (rc) { int rcEnd = right->nEnd - (dna+start) - 1; qStart = reverseOffset(rcEnd, size) + start; } tStart = left->hStart - wholeDna; tSize = right->hEnd - left->hStart; quality = 100.0 * score / qSize; if (quality >= 25.0) sprintf(qualityString, "%4.1f%%", quality); else sprintf(qualityString, "<50%%"); printf("<A HREF=\"../cgi-bin/chkGlue.exe?bacAcc=%s&contig=%d&qStart=%d&qSize=%d&tStart=%d&tSize=%d&repeatMask=%s\">", bacAcc, contig, qStart, qSize, tStart, tSize, repeatMask); printf("Contig %d.%d:%d-%d %c of %d aligned 
%d-%d of %d aliSize %d quality %s</A>\n", contig+1, subIx, qStart, qStart+qSize, (rc ? '-' : '+'), dnaSize, tStart, tStart + tSize, wholeSize, qSize, qualityString); x1 = roundingScale(trackWidth, left->hStart - wholeDna, wholeSize); x2 = roundingScale(trackWidth, right->hEnd - wholeDna, wholeSize); xo = x1+x; w = x2-x1; mapWriteBox(mapFile, mtBlock, xo, y, w, trackHeight, bacAcc, contig, qStart, qSize, tStart, tSize); mgDrawBox(mg, xo, y, w, trackHeight, color); mgTextCentered(mg, xo, y, w, trackHeight, MG_WHITE, font, numText); ffFreeAli(&left); } ++subIx; } } freePatSpace(&ps); freeAllSeq(&whole); freeAllSeq(&partList); }
void writeClump(struct blockPos *first, struct blockPos *last, char *cdnaName, char strand, char dir, DNA *cdna, int cdnaSize, struct cdnaAliList **pList)
/* Write hitOut one clump.
 *
 * The target range covered by first..last is widened by
 * minMatch*patSize bases on each side (clipped to the sequence), and
 * cdna is aligned against it with ffFind.  An alignment scoring at
 * least 22 that also passes solidMatch with a per-base score above
 * 0.25 is written to hitOut and added to the head of *pList.  When
 * dumpMe is set, extra diagnostics go to dumpOut and the two HTML
 * files.  The ffSubmitted/ffAccepted/ffOkScore/ffSolidMatch counters
 * are bumped at the corresponding stages. */
{
struct dnaSeq *seq = first->seq;
char *bacName = seq->name;
int seqIx = first->seqIx;
int start = first->offset;
int end = last->offset+last->size;
struct ffAli *ff, *left, *right;
int extraAtEnds = minMatch*patSize;
struct cdnaAliList *cal;

start -= extraAtEnds;
if (start < 0)
    start = 0;
end += extraAtEnds;
if (end >seq->size)
    end = seq->size;

++ffSubmitted;
if (dumpMe)
    fprintf(dumpOut, "%s %d %s %d-%d\n", cdnaName, cdnaSize, bacName, start, end);
ff = ffFind(cdna, cdna+cdnaSize, seq->dna+start, seq->dna+end, ffCdna);
if (dumpMe)
    {
    /* %p with a void * argument is the only portable way to print a
     * pointer; %x expects an unsigned int. */
    fprintf(dumpOut, "ffFind = %p\n", (void *)ff);
    }
if (ff != NULL)
    {
    int ffScore = ffScoreCdna(ff);
    ++ffAccepted;
    if (dumpMe)
        fprintf(dumpOut, "ffScore = %d\n", ffScore);
    if (ffScore >= 22)
        {
        int hiStart, hiEnd;
        int oldStart, oldEnd;

        ffFindEnds(ff, &left, &right);
        hiStart = oldStart = left->nStart - cdna;
        hiEnd = oldEnd = right->nEnd - cdna;
        ++ffOkScore;

        /* solidMatch may move left/right and hiStart/hiEnd; the old
         * values are kept only for the report line. */
        if (solidMatch(&left, &right, cdna, &hiStart, &hiEnd))
            {
            int solidSize = hiEnd - hiStart;
            int solidScore;
            int seqStart, seqEnd;
            double cookedScore;

            solidScore = scoreCdna(left, right);
            cookedScore = (double)solidScore/solidSize;
            if (cookedScore > 0.25)
                {
                ++ffSolidMatch;

                seqStart = left->hStart - seq->dna;
                seqEnd = right->hEnd - seq->dna;
                fprintf(hitOut, "%3.1f%% %c %s:%d-%d (old %d-%d) of %d at %s.%d:%d-%d\n", 100.0 * cookedScore, strand, cdnaName, hiStart, hiEnd, oldStart, oldEnd, cdnaSize, bacName, seqIx, seqStart, seqEnd);
                if (dumpMe)
                    {
                    fprintf(bigHtmlFile, "<A NAME=i%d>", htmlIx);
                    fprintf(bigHtmlFile, "<H2>%4.1f%% %4d %4d %c %s:%d-%d of %d at %s.%d:%d-%d</H2><BR>", 100.0 * cookedScore, solidScore, ffScore, strand, cdnaName, hiStart, hiEnd, cdnaSize, bacName, seqIx, seqStart, seqEnd);
                    fprintf(bigHtmlFile, "</A>");
                    ffShAli(bigHtmlFile, ff, cdnaName, cdna, cdnaSize, 0, bacName, seq->dna+start, end-start, start, FALSE);
                    fprintf(bigHtmlFile, "<BR><BR>\n");

                    fprintf(littleHtmlFile, "<A HREF=\"patAli.html#i%d\">", htmlIx);
                    fprintf(littleHtmlFile, "%4.1f%% %4d %4d %c %s:%d-%d of %d at %s.%d:%d-%d\n", 100.0 * cookedScore, solidScore, ffScore, strand, cdnaName, hiStart, hiEnd, cdnaSize, bacName, seqIx, seqStart, seqEnd);
                    fprintf(littleHtmlFile, "</A><BR>");
                    ++htmlIx;
                    }
                cal = newCal(first->bacIx, seqIx, hiStart, hiEnd, cdnaSize, strand, dir, cookedScore);
                slAddHead(pList, cal);
                }
            }
        }
    ffFreeAli(&ff);
    }
}
void showClump(struct ernaClump *clump, FILE *f) /* Show detailed alignment for one clump. */ { int chromStart = clump->start - 1000; int chromEnd = clump->end + 1000; int chromSize; DNA *chromDna; struct wormFeature *cdnaNameList, *cdnaName; struct lineAli *laList = NULL, *la; struct ffAli *ali; struct dnaSeq *cdna; boolean rcCdna; int clumpSize = clump->end - clump->start + 1; int displaySize = lineSize; int displayStart = (clump->start+clump->end)/2 - displaySize/2; int displayEnd = displayStart + displaySize; int displayDnaOffset; DNA *displayDna; struct ernaHit *hit; /* Get genomic dna and list of all cDNAs in area around clump. */ wormClipRangeToChrom(clump->chrom, &chromStart, &chromEnd); chromSize = chromEnd - chromStart; chromDna = wormChromPart(clump->chrom, chromStart, chromSize); cdnaNameList = wormCdnasInRange(clump->chrom, chromStart, chromEnd); /* Figure out 60 bases to display alignment around clump. */ wormClipRangeToChrom(clump->chrom, &displayStart, &displayEnd); displaySize = displayEnd - displayStart; displayDnaOffset = displayStart - chromStart; displayDna = chromDna + displayDnaOffset; /* Make up detailed alignment on each cDNA */ for (cdnaName = cdnaNameList; cdnaName != NULL; cdnaName = cdnaName->next) { struct wormCdnaInfo info; if (!wormCdnaSeq(cdnaName->name, &cdna, &info)) { warn("Couldn't find %s", cdnaName->name); continue; } if (!ffFindEitherStrandN(cdna->dna, cdna->size, chromDna, chromSize, ffCdna, &ali, &rcCdna)) { warn("Couldn't align %s", cdnaName->name); continue; } if (rcCdna) reverseComplement(cdna->dna, cdna->size); la = makeLineAli(cdnaName->name, ali, chromDna, cdna->dna, displayDnaOffset); la->isEmbryo = info.isEmbryonic; slAddHead(&laList, la); freeDnaSeq(&cdna); ffFreeAli(&ali); } /* Display genomic with upper case at hot spots*/ displayDna[displaySize] = 0; for (hit = clump->hits; hit != NULL; hit = hit->next) { int doff = hit->pos - chromStart; chromDna[doff] = toupper(chromDna[doff]); } fprintf(f, "%s Genomic\n", 
displayDna); /* Display aligned list by sorted score. */ slSort(&laList, cmpLaScore); for (la = laList; la != NULL; la = la->next) { if (spaceCount(la->line) != lineSize) fprintf(f, "%s %s %s\n", la->line, la->name, (la->isEmbryo ? "emb" : " ")); } /* Clean up. */ slFreeList(&cdnaNameList); slFreeList(&laList); freeMem(chromDna); }
void glueFindOne(struct patSpace *ps, DNA *cdna, int cdnaSize, char strand, char dir, char *cdnaName, struct cdnaAliList **pList)
/* Find occurrences of DNA in patSpace and print to hitOut.
 * Each clump from patSpaceFindOne is aligned with ffFind.  Alignments
 * scoring at least 22 that pass solidMatch with a per-base score above
 * 0.25 are reported on hitOut and pushed onto the head of *pList.
 * The ffSubmitted, ffAccepted, ffOkScore and ffSolidMatch counters
 * record how many candidates reached each stage. */
{
struct patClump *allClumps = patSpaceFindOne(ps, cdna, cdnaSize);
struct patClump *cur;

for (cur = allClumps; cur != NULL; cur = cur->next)
    {
    struct dnaSeq *target = cur->seq;
    DNA *windowStart = target->dna + cur->start;
    char *contigName = target->name;
    int seqIx = cur->seqIx;
    int bacIx = cur->bacIx;
    struct ffAli *aligned;
    int rawScore;

    ++ffSubmitted;
    aligned = ffFind(cdna, cdna+cdnaSize, windowStart, windowStart + cur->size, ffCdna);
    if (aligned == NULL)
        continue;

    rawScore = ffScoreCdna(aligned);
    ++ffAccepted;
    if (rawScore >= 22)
        {
        struct ffAli *leftEnd, *rightEnd;
        int hiStart, hiEnd;
        int oldStart, oldEnd;

        ffFindEnds(aligned, &leftEnd, &rightEnd);
        oldStart = leftEnd->nStart - cdna;
        hiStart = oldStart;
        oldEnd = rightEnd->nEnd - cdna;
        hiEnd = oldEnd;
        ++ffOkScore;
        if (solidMatch(&leftEnd, &rightEnd, cdna, &hiStart, &hiEnd))
            {
            int solidSize = hiEnd - hiStart;
            int solidScore = scoreCdna(leftEnd, rightEnd);
            double cookedScore = (double)solidScore/solidSize;

            if (cookedScore > 0.25)
                {
                int seqStart = leftEnd->hStart - target->dna;
                int seqEnd = rightEnd->hEnd - target->dna;
                struct cdnaAliList *cal;

                ++ffSolidMatch;
                fprintf(hitOut, "%3.1f%% %c %s:%d-%d (old %d-%d) of %d at %s.%d:%d-%d\n", 100.0 * cookedScore, strand, cdnaName, hiStart, hiEnd, oldStart, oldEnd, cdnaSize, contigName, seqIx, seqStart, seqEnd);
                cal = newCal(bacIx, seqIx, hiStart, hiEnd, cdnaSize, strand, dir, cookedScore);
                slAddHead(pList, cal);
                }
            }
        }
    ffFreeAli(&aligned);
    }
slFreeList(&allClumps);
}
void ssStitch(struct ssBundle *bundle, enum ffStringency stringency, int minScore, int maxToReturn) /* Glue together mrnas in bundle as much as possible. Returns number of * alignments after stitching. Updates bundle->ffList with stitched * together version. */ { struct dnaSeq *qSeq = bundle->qSeq; struct dnaSeq *genoSeq = bundle->genoSeq; struct ffAli *ffList = NULL; struct ssFfItem *ffl; struct ffAli *bestPath; int score; boolean firstTime = TRUE; if (bundle->ffList == NULL) return; /* The score may improve when we stitch together more alignments, * so don't let minScore be too harsh at this stage. */ if (minScore > 20) minScore = 20; /* Create ffAlis for all in bundle and move to one big list. */ for (ffl = bundle->ffList; ffl != NULL; ffl = ffl->next) { ffCat(&ffList, &ffl->ff); } slFreeList(&bundle->ffList); ffAliSort(&ffList, ffCmpHitsNeedleFirst); ffList = ffMergeClose(ffList, qSeq->dna, genoSeq->dna); while (ffList != NULL) { ssFindBest(ffList, qSeq, genoSeq, stringency, bundle->isProt, bundle->t3List, &bestPath, &score, &ffList); bestPath = ffMergeNeedleAlis(bestPath, TRUE); bestPath = ffRemoveEmptyAlis(bestPath, TRUE); if (!bestPath) { ffFreeAli(&ffList); break; } bestPath = ffMergeHayOverlaps(bestPath); bestPath = ffRemoveEmptyAlis(bestPath, TRUE); bestPath = forceMonotonic(bestPath, qSeq, genoSeq, stringency, bundle->isProt, bundle->t3List); if (firstTime && stringency == ffCdna && bundle->avoidFuzzyFindKludge == FALSE) { /* Only look for middle exons the first time. Next times * this might regenerate most of the first alignment... 
*/ bestPath = smallMiddleExons(bestPath, bundle, stringency); } bestPath = ffMergeNeedleAlis(bestPath, TRUE); if (ffIntronMax != ffIntronMaxDefault) { bestPath = cutAtBigIntrons(bestPath, ffIntronMax, &score, stringency, bundle->isProt, genoSeq, bundle->t3List, &ffList); } if (!bundle->isProt) ffSlideIntrons(bestPath); bestPath = ffRemoveEmptyAlis(bestPath, TRUE); if (score >= minScore) { AllocVar(ffl); ffl->ff = bestPath; slAddHead(&bundle->ffList, ffl); } else { ffFreeAli(&bestPath); ffFreeAli(&ffList); break; } firstTime = FALSE; if (--maxToReturn <= 0) { ffFreeAli(&ffList); break; } } slReverse(&bundle->ffList); return; }
int main(int argc, char *argv[]) { char *estName, *targetName, *oocName; FILE *estFile; struct dnaSeq *target; struct dnaSeq *est; struct patSpace *ps; struct patClump *clumpList, *clump; int estIx = 0; /* Check command line arguments and assign to local variables. */ if (argc != 4) usage(); estName = argv[1]; estFile = mustOpen(estName, "rb"); targetName = argv[2]; oocName = argv[3]; /* Read in target DNA from fasta files and check not too big. */ fprintf(stderr, "Reading %s\n", targetName); target = faReadAllDna(targetName); if (totalSequenceSize(target) > 8000000) { errAbort("Can only handle 8000000 bases of genomic sequence at once, %s has %d.", targetName, totalSequenceSize(target)); } /* Make a pattern space index structure. */ fprintf(stderr, "Making Pattern Space index\n"); ps = makePatSpace(&target, 1, oocName, 4, 32000); /* Loop through each EST in query list. */ printf("Searching for hits\n\n"); while (faReadNext(estFile, NULL, TRUE, NULL, &est)) { boolean isRc; /* Reverse complemented? */ if (++estIx % 5000 == 0) fprintf(stderr, "Processing EST %d\n", estIx); if (est->size > 20000) { warn("Very large EST sequence %s.\n" "Maybe you mixed up the EST and genomic parameters?", est->name); usage(); } for (isRc = 0; isRc <= 1; ++isRc) /* Search both strands. */ { if (isRc) reverseComplement(est->dna, est->size); clumpList = patSpaceFindOne(ps, est->dna, est->size); /* For each homology clump patSpace finds, do a fuzzyFinder * alignment of it and print the results. */ for (clump = clumpList; clump != NULL; clump = clump->next) { struct ffAli *ali, *a; boolean isRc; int score; struct dnaSeq *t = clump->seq; DNA *tStart = t->dna + clump->start; ali = ffFind(est->dna, est->dna+est->size, tStart, tStart + clump->size, ffCdna); if (ali != NULL) { score = ffScoreCdna(ali); printf("%s hits %s strand %c score %d\n", est->name, t->name, (isRc ? 
'+' : '-'), score); for (a = ali; a != NULL; a = a->right) { printf(" Q %4d - %4d\t T %4d -%4d\n", a->nStart - est->dna, a->nEnd - est->dna, a->hStart - t->dna, a->hEnd - t->dna); } printf("\n"); ffFreeAli(&ali); } else { printf("Couldn't align clump at %s %d-%d\n", t->name, clump->start, clump->start + clump->size); } } slFreeList(&clumpList); } freeDnaSeq(&est); } /* Clean up time. */ freePatSpace(&ps); freeSeqList(&target); return 0; }