boolean fastFind(DNA *needle, int needleSize, struct patSpace *ps,
    struct ffAli **retAli, boolean *retRc, int *retScore)
/* Do fast alignment.  Looks needle up in the pattern space on both strands,
 * runs ffFind (ffCdna stringency) against every clump returned, and keeps
 * the alignment that ends up first after sorting with cmpAliList.
 * On success returns TRUE and fills in *retAli, *retRc (strand flag of the
 * winning pass) and *retScore (its ffScoreCdna score); the winning alignment
 * is handed to the caller rather than freed here.  Returns FALSE without
 * touching the outputs when nothing aligns.
 * needle is reverse-complemented in place for the second pass and flipped
 * back before the function returns. */
{
struct aliList *hits = NULL, *hit;
boolean isRc;

for (isRc = 0; isRc <= 1; ++isRc)
    {
    struct patClump *clumps, *cl;

    if (isRc)
        reverseComplement(needle, needleSize);
    clumps = patSpaceFindOne(ps, needle, needleSize);
    if (clumps != NULL)
        {
        for (cl = clumps; cl != NULL; cl = cl->next)
            {
            DNA *hayStart = cl->seq->dna + cl->start;
            struct ffAli *ff = ffFind(needle, needle + needleSize,
                hayStart, hayStart + cl->size, ffCdna);
            if (ff == NULL)
                continue;
            /* Remember this alignment together with its score and strand. */
            AllocVar(hit);
            hit->ali = ff;
            hit->score = ffScoreCdna(ff);
            hit->isRc = isRc;
            slAddHead(&hits, hit);
            }
        slFreeList(&clumps);
        }
    if (isRc)
        reverseComplement(needle, needleSize);  /* Undo the in-place flip. */
    }

if (hits == NULL)
    return FALSE;

slSort(&hits, cmpAliList);

/* The head of the sorted list is the winner; report it. */
*retAli = hits->ali;
*retRc = hits->isRc;
*retScore = hits->score;
hits->ali = NULL;

/* Everything after the head is a losing alignment and gets freed. */
for (hit = hits->next; hit != NULL; hit = hit->next)
    ffFreeAli(&hit->ali);
slFreeList(&hits);
return TRUE;
}
static boolean alignComponents(struct gfRange *combined, struct ssBundle *bun, enum ffStringency stringency) /* Align each piece of combined->components and put result in * bun->ffList. */ { struct gfRange *range; struct dnaSeq *qSeq = bun->qSeq, *tSeq = bun->genoSeq; struct ssFfItem *ffi; struct ffAli *ali; int qStart, qEnd, tStart, tEnd; int extra = 250; boolean gotAny = FALSE; for (range = combined->components; range != NULL; range = range->next) { /* Expand to include some extra sequence around range. */ qStart = range->qStart - extra; tStart = range->tStart - extra; qEnd = range->qEnd + extra; tEnd = range->tEnd + extra; if (range == combined->components) { qStart -= extra; tStart -= extra; } if (range->next == NULL) { qEnd += extra; tEnd += extra; } if (qStart < combined->qStart) qStart = combined->qStart; if (tStart < combined->tStart) tStart = combined->tStart; if (qEnd > combined->qEnd) qEnd = combined->qEnd; if (tEnd > combined->tEnd) tEnd = combined->tEnd; ali = ffFind(qSeq->dna + qStart, qSeq->dna + qEnd, tSeq->dna + tStart - combined->tStart, tSeq->dna + tEnd - combined->tStart, stringency); if (ali != NULL) { AllocVar(ffi); ffi->ff = ali; slAddHead(&bun->ffList, ffi); gotAny = TRUE; } } return gotAny; }
struct ffAli *smallMiddleExons(struct ffAli *aliList, struct ssBundle *bundle,
    enum ffStringency stringency)
/* Look for small exons in the middle.  Walks adjacent block pairs of aliList
 * and, wherever both the query (n) and target (h) gaps between them are at
 * least 3 bases, runs ffFind on just that gap.  Anything found is passed
 * through forceMonotonic and spliced into the list between the two blocks.
 * Returns the list head, which is never changed by the splicing.
 * The list is returned untouched when bundle->t3List is set, and an empty
 * (NULL) list is returned as NULL instead of being dereferenced. */
{
struct ffAli *left, *right;

if (aliList == NULL)
    return NULL;        /* Nothing to search between. */
if (bundle->t3List != NULL)
    return aliList;     /* Can't handle intense translated stuff. */

left = aliList;
for (right = aliList->right; right != NULL; right = right->right)
    {
    if (right->hStart - left->hEnd >= 3 && right->nStart - left->nEnd >= 3)
        {
        struct ffAli *newLeft = ffFind(left->nEnd, right->nStart,
            left->hEnd, right->hStart, stringency);
        if (newLeft != NULL)
            {
            struct ffAli *newRight;
            newLeft = forceMonotonic(newLeft, bundle->qSeq, bundle->genoSeq,
                stringency, bundle->isProt, bundle->t3List);
            newRight = ffRightmost(newLeft);
            /* Both neighbours are non-NULL inside this loop, so the new
             * blocks are always linked in on both sides. */
            left->right = newLeft;
            newLeft->left = left;
            right->left = newRight;
            newRight->right = right;
            }
        }
    left = right;
    }
return aliList;
}
void writeClump(struct blockPos *first, struct blockPos *last,
    char *cdnaName, char strand, char dir,
    DNA *cdna, int cdnaSize, struct cdnaAliList **pList)
/* Write hitOut one clump.  Aligns cdna with ffFind (ffCdna stringency)
 * against the target region spanned by first..last, widened by
 * minMatch*patSize bases on each side and clipped to the sequence.
 * An alignment is reported when its ffScoreCdna score is at least 22,
 * solidMatch accepts it, and scoreCdna divided by the solid size exceeds
 * 0.25.  A reported hit is printed to hitOut and a new cdnaAliList entry
 * is pushed onto *pList; when dumpMe is set, extra diagnostics go to
 * dumpOut and HTML goes to bigHtmlFile and littleHtmlFile.
 * The ffSubmitted, ffAccepted, ffOkScore and ffSolidMatch counters are
 * bumped as the candidate passes each stage. */
{
struct dnaSeq *seq = first->seq;
char *bacName = seq->name;
int seqIx = first->seqIx;
int start = first->offset;
int end = last->offset + last->size;
int extraAtEnds = minMatch*patSize;
struct ffAli *ff, *left, *right;
int ffScore;

/* Widen the search window, staying inside the target sequence. */
start -= extraAtEnds;
if (start < 0)
    start = 0;
end += extraAtEnds;
if (end > seq->size)
    end = seq->size;

++ffSubmitted;
if (dumpMe)
    fprintf(dumpOut, "%s %d %s %d-%d\n", cdnaName, cdnaSize, bacName, start, end);
ff = ffFind(cdna, cdna+cdnaSize, seq->dna+start, seq->dna+end, ffCdna);
if (dumpMe)
    {
    /* A pointer must be printed with %p; %x expects an unsigned int. */
    fprintf(dumpOut, "ffFind = %p\n", (void *)ff);
    }
if (ff == NULL)
    return;

ffScore = ffScoreCdna(ff);
++ffAccepted;
if (dumpMe)
    fprintf(dumpOut, "ffScore = %d\n", ffScore);
if (ffScore >= 22)
    {
    int hiStart, hiEnd;
    int oldStart, oldEnd;

    ffFindEnds(ff, &left, &right);
    /* Query offsets of the alignment before solidMatch adjusts them. */
    hiStart = oldStart = left->nStart - cdna;
    hiEnd = oldEnd = right->nEnd - cdna;
    ++ffOkScore;
    if (solidMatch(&left, &right, cdna, &hiStart, &hiEnd))
        {
        int solidSize = hiEnd - hiStart;
        int solidScore = scoreCdna(left, right);
        double cookedScore = (double)solidScore/solidSize;

        if (cookedScore > 0.25)
            {
            int seqStart = left->hStart - seq->dna;
            int seqEnd = right->hEnd - seq->dna;
            struct cdnaAliList *cal;

            ++ffSolidMatch;
            fprintf(hitOut, "%3.1f%% %c %s:%d-%d (old %d-%d) of %d at %s.%d:%d-%d\n",
                100.0 * cookedScore, strand, cdnaName,
                hiStart, hiEnd, oldStart, oldEnd, cdnaSize,
                bacName, seqIx, seqStart, seqEnd);
            if (dumpMe)
                {
                /* Full alignment goes to the big page, anchored by htmlIx. */
                fprintf(bigHtmlFile, "<A NAME=i%d>", htmlIx);
                fprintf(bigHtmlFile, "<H2>%4.1f%% %4d %4d %c %s:%d-%d of %d at %s.%d:%d-%d</H2><BR>",
                    100.0 * cookedScore, solidScore, ffScore, strand, cdnaName,
                    hiStart, hiEnd, cdnaSize,
                    bacName, seqIx, seqStart, seqEnd);
                fprintf(bigHtmlFile, "</A>");
                ffShAli(bigHtmlFile, ff, cdnaName, cdna, cdnaSize, 0,
                    bacName, seq->dna+start, end-start, start, FALSE);
                fprintf(bigHtmlFile, "<BR><BR>\n");

                /* The little page gets a one-line link to that anchor. */
                fprintf(littleHtmlFile, "<A HREF=\"patAli.html#i%d\">", htmlIx);
                fprintf(littleHtmlFile, "%4.1f%% %4d %4d %c %s:%d-%d of %d at %s.%d:%d-%d\n",
                    100.0 * cookedScore, solidScore, ffScore, strand, cdnaName,
                    hiStart, hiEnd, cdnaSize,
                    bacName, seqIx, seqStart, seqEnd);
                fprintf(littleHtmlFile, "</A><BR>");
                ++htmlIx;
                }
            cal = newCal(first->bacIx, seqIx, hiStart, hiEnd, cdnaSize,
                strand, dir, cookedScore);
            slAddHead(pList, cal);
            }
        }
    }
ffFreeAli(&ff);
}
void glueFindOne(struct patSpace *ps, DNA *cdna, int cdnaSize,
    char strand, char dir, char *cdnaName, struct cdnaAliList **pList)
/* Find occurrences of DNA in patSpace and print to hitOut.  Every clump
 * returned by patSpaceFindOne is aligned with ffFind (ffCdna stringency).
 * An alignment is reported when its ffScoreCdna score is at least 22,
 * solidMatch accepts it, and scoreCdna divided by the solid size exceeds
 * 0.25; a reported hit is printed and pushed onto *pList as a new
 * cdnaAliList entry.  The ffSubmitted, ffAccepted, ffOkScore and
 * ffSolidMatch counters track how far each candidate got. */
{
struct patClump *clumps = patSpaceFindOne(ps, cdna, cdnaSize);
struct patClump *cl;

for (cl = clumps; cl != NULL; cl = cl->next)
    {
    struct dnaSeq *target = cl->seq;
    DNA *hayStart = target->dna + cl->start;
    DNA *hayEnd = hayStart + cl->size;
    struct ffAli *ali;

    ++ffSubmitted;
    ali = ffFind(cdna, cdna + cdnaSize, hayStart, hayEnd, ffCdna);
    if (ali == NULL)
        continue;

    ++ffAccepted;
    if (ffScoreCdna(ali) >= 22)
        {
        struct ffAli *leftEnd, *rightEnd;
        int hiStart, hiEnd;
        int oldStart, oldEnd;

        ffFindEnds(ali, &leftEnd, &rightEnd);
        /* Keep the untrimmed query span so it can be printed next to the
         * span solidMatch settles on. */
        oldStart = leftEnd->nStart - cdna;
        oldEnd = rightEnd->nEnd - cdna;
        hiStart = oldStart;
        hiEnd = oldEnd;
        ++ffOkScore;
        if (solidMatch(&leftEnd, &rightEnd, cdna, &hiStart, &hiEnd))
            {
            int solidSize = hiEnd - hiStart;
            int solidScore = scoreCdna(leftEnd, rightEnd);
            double cookedScore = (double)solidScore/solidSize;

            if (cookedScore > 0.25)
                {
                int seqStart = leftEnd->hStart - target->dna;
                int seqEnd = rightEnd->hEnd - target->dna;
                struct cdnaAliList *hit;

                ++ffSolidMatch;
                fprintf(hitOut, "%3.1f%% %c %s:%d-%d (old %d-%d) of %d at %s.%d:%d-%d\n",
                    100.0 * cookedScore, strand, cdnaName,
                    hiStart, hiEnd, oldStart, oldEnd, cdnaSize,
                    target->name, cl->seqIx, seqStart, seqEnd);
                hit = newCal(cl->bacIx, cl->seqIx, hiStart, hiEnd, cdnaSize,
                    strand, dir, cookedScore);
                slAddHead(pList, hit);
                }
            }
        }
    ffFreeAli(&ali);
    }
slFreeList(&clumps);
}
struct ssBundle *ssFindBundles(struct patSpace *ps, struct dnaSeq *cSeq,
    char *cName, enum ffStringency stringency, boolean avoidSelfSelf)
/* Find patSpace alignments.  This routine is used by psLayout but not blat.
 * The query cSeq is searched in windows: a remainder of up to 700 bases is
 * taken whole, otherwise 500 bases are taken and the next window starts
 * 250 bases before the end of this one.  Each clump patSpaceFindOne returns
 * for a window is aligned with ffFind, and alignments are collected into one
 * ssBundle per target sequence.  When avoidSelfSelf is set, targets whose
 * name equals cSeq->name are skipped.  Every bundle is passed to ssStitch
 * before the list is returned; the result is NULL if nothing aligned.
 * cName is accepted but not used here. */
{
struct patClump *clumpList, *clump;
struct ssBundle *bundleList = NULL, *bun = NULL;
DNA *cdna = cSeq->dna;
DNA *endCdna = cdna + cSeq->size;
struct ssFfItem *ffl;
struct dnaSeq *lastSeq = NULL;
int maxSize = 700;
int preferredSize = 500;
int overlapSize = 250;

for (;;)
    {
    int cSize = (int)(endCdna - cdna);
    if (cSize > maxSize)
        cSize = preferredSize;
    clumpList = patSpaceFindOne(ps, cdna, cSize);
    for (clump = clumpList; clump != NULL; clump = clump->next)
        {
        struct ffAli *ff;
        struct dnaSeq *seq = clump->seq;
        DNA *tStart = seq->dna + clump->start;

        if (avoidSelfSelf && sameString(seq->name, cSeq->name))
            continue;
        ff = ffFind(cdna, cdna+cSize, tStart, tStart + clump->size, stringency);
        if (ff == NULL)
            continue;
        /* Only look the bundle up again when the target sequence changes;
         * consecutive clumps usually hit the same one. */
        if (lastSeq != seq)
            {
            lastSeq = seq;
            if ((bun = findBundle(bundleList, seq)) == NULL)
                {
                AllocVar(bun);
                bun->qSeq = cSeq;
                bun->genoSeq = seq;
                bun->genoIx = clump->bacIx;
                bun->genoContigIx = clump->seqIx;
                slAddHead(&bundleList, bun);
                }
            }
        AllocVar(ffl);
        ffl->ff = ff;
        slAddHead(&bun->ffList, ffl);
        }
    /* Free before the exit test so the final window's clumps are not
     * leaked when the loop breaks. */
    slFreeList(&clumpList);

    cdna += cSize;
    if (cdna >= endCdna)
        break;
    cdna -= overlapSize;
    }
slReverse(&bundleList);
for (bun = bundleList; bun != NULL; bun = bun->next)
    {
    ssStitch(bun, stringency, 20, 16);
    }
return bundleList;
}
int main(int argc, char *argv[]) { char *estName, *targetName, *oocName; FILE *estFile; struct dnaSeq *target; struct dnaSeq *est; struct patSpace *ps; struct patClump *clumpList, *clump; int estIx = 0; /* Check command line arguments and assign to local variables. */ if (argc != 4) usage(); estName = argv[1]; estFile = mustOpen(estName, "rb"); targetName = argv[2]; oocName = argv[3]; /* Read in target DNA from fasta files and check not too big. */ fprintf(stderr, "Reading %s\n", targetName); target = faReadAllDna(targetName); if (totalSequenceSize(target) > 8000000) { errAbort("Can only handle 8000000 bases of genomic sequence at once, %s has %d.", targetName, totalSequenceSize(target)); } /* Make a pattern space index structure. */ fprintf(stderr, "Making Pattern Space index\n"); ps = makePatSpace(&target, 1, oocName, 4, 32000); /* Loop through each EST in query list. */ printf("Searching for hits\n\n"); while (faReadNext(estFile, NULL, TRUE, NULL, &est)) { boolean isRc; /* Reverse complemented? */ if (++estIx % 5000 == 0) fprintf(stderr, "Processing EST %d\n", estIx); if (est->size > 20000) { warn("Very large EST sequence %s.\n" "Maybe you mixed up the EST and genomic parameters?", est->name); usage(); } for (isRc = 0; isRc <= 1; ++isRc) /* Search both strands. */ { if (isRc) reverseComplement(est->dna, est->size); clumpList = patSpaceFindOne(ps, est->dna, est->size); /* For each homology clump patSpace finds, do a fuzzyFinder * alignment of it and print the results. */ for (clump = clumpList; clump != NULL; clump = clump->next) { struct ffAli *ali, *a; boolean isRc; int score; struct dnaSeq *t = clump->seq; DNA *tStart = t->dna + clump->start; ali = ffFind(est->dna, est->dna+est->size, tStart, tStart + clump->size, ffCdna); if (ali != NULL) { score = ffScoreCdna(ali); printf("%s hits %s strand %c score %d\n", est->name, t->name, (isRc ? 
'+' : '-'), score); for (a = ali; a != NULL; a = a->right) { printf(" Q %4d - %4d\t T %4d -%4d\n", a->nStart - est->dna, a->nEnd - est->dna, a->hStart - t->dna, a->hEnd - t->dna); } printf("\n"); ffFreeAli(&ali); } else { printf("Couldn't align clump at %s %d-%d\n", t->name, clump->start, clump->start + clump->size); } } slFreeList(&clumpList); } freeDnaSeq(&est); } /* Clean up time. */ freePatSpace(&ps); freeSeqList(&target); return 0; }