struct ssBundle *ffSeedExtInMem(struct genoFind *gf, struct dnaSeq *qSeq, Bits *qMaskBits,
                                int qOffset, struct lm *lm, int minScore, boolean isRc)
/* Seed-and-extend alignment of qSeq against the index gf.
 *
 * Clumps found with gfFindClumpsWithQmask are converted to exact-match
 * ranges, sorted with gfRangeCmpTarget and bundled with ffIntronMax.
 * One ssBundle is built per bundled range, stitched with ffCdna and then
 * passed to refineBundle.  The query coordinates of each bundled range are
 * shifted by qOffset before its bundle is built.
 *
 * Bundles are pushed with slAddHead and the list is returned without being
 * reversed.  The range and clump lists are freed before returning; the
 * caller receives the bundle list.
 * Note: minScore is accepted but not referenced by this function. */
{
    int numHits;
    struct ssBundle *bundles = NULL;
    struct gfRange *ranges = NULL;
    struct gfRange *cur;
    struct gfClump *seeds = gfFindClumpsWithQmask(gf, qSeq, qMaskBits, qOffset, lm, &numHits);
    struct gfClump *seed = seeds;

    /* Turn every clump into exact-match ranges, accumulated on one list. */
    while (seed != NULL)
        {
        clumpToExactRange(seed, qSeq, gf->tileSize, 0, NULL, &ranges);
        seed = seed->next;
        }

    slSort(&ranges, gfRangeCmpTarget);
    ranges = gfRangesBundle(ranges, ffIntronMax);

    for (cur = ranges; cur != NULL; cur = cur->next)
        {
        struct dnaSeq *target = cur->tSeq;
        struct ssBundle *bundle;

        /* Shift the range's query coordinates by the caller's offset. */
        cur->qStart += qOffset;
        cur->qEnd += qOffset;

        AllocVar(bundle);
        bundle->qSeq = qSeq;
        bundle->genoSeq = target;
        bundle->ffList = gfRangesToFfItem(cur->components, qSeq);
        bundle->isProt = FALSE;
        bundle->avoidFuzzyFindKludge = TRUE;

        ssStitch(bundle, ffCdna, 16, 10);
        refineBundle(gf, qSeq, qMaskBits, qOffset, target, lm, bundle, isRc);
        slAddHead(&bundles, bundle);
        }

    gfRangeFreeList(&ranges);
    gfClumpFreeList(&seeds);
    return bundles;
}
void searchOneProt(aaSeq *seq, struct genoFind *gf, FILE *f)
/* Look up protein sequence seq in index gf and align the resulting clumps.
 *
 * Alignment is delegated to gfAlignAaClumps, which is given the file-level
 * minScore and gvo.  The clump list and the local memory pool created here
 * are released before returning.
 * Note: f is accepted but not referenced by this function. */
{
    int numHits;
    struct lm *localMem = lmInit(0);
    struct gfClump *found;

    found = gfFindClumps(gf, seq, localMem, &numHits);
    gfAlignAaClumps(gf, found, seq, FALSE, minScore, gvo);

    gfClumpFreeList(&found);
    lmCleanup(&localMem);
}
static struct ssBundle *gfTransTransFindBundles(struct genoFind *gfs[3], struct dnaSeq *qSeq,
                                                struct hash *t3Hash, boolean isRc,
                                                int minMatch, boolean isRna)
/* Find alignments between the three translations of qSeq and the three
 * translated target frames indexed by gfs, and return them as a list of
 * stitched bundles.
 *
 * Clumps for every (query frame, target frame) pair are converted to HSP
 * ranges, passed through untranslateRangeList, and gathered on one list.
 * That list is sorted with gfRangeCmpTarget and bundled with a limit of
 * 2000; each bundled range becomes one ssBundle stitched with ffCdna when
 * isRna is set and ffLoose otherwise.
 *
 * The bundle list is reversed before being returned, undoing the
 * slAddHead ordering.  The clumps, ranges, translated query and local
 * memory pool are freed here; the caller receives the bundle list.
 * Note: isRc is accepted but not referenced by this function. */
{
    struct trans3 *queryFrames = trans3New(qSeq);
    struct gfClump *clumps[3][3];
    struct gfRange *ranges = NULL, *range;
    struct ssBundle *bundles = NULL;
    int tileSize = gfs[0]->tileSize;
    int numHits;
    int qFrame, tFrame;
    struct lm *lm = lmInit(0);
    enum ffStringency stitchMode;

    if (isRna)
        stitchMode = ffCdna;
    else
        stitchMode = ffLoose;

    gfTransTransFindClumps(gfs, queryFrames->trans, clumps, lm, &numHits);

    /* Gather ranges from all nine frame combinations. */
    for (qFrame = 0; qFrame < 3; ++qFrame)
        {
        for (tFrame = 0; tFrame < 3; ++tFrame)
            {
            struct gfClump *clump = clumps[qFrame][tFrame];
            while (clump != NULL)
                {
                struct gfRange *fresh = NULL;
                clumpToHspRange(clump, queryFrames->trans[qFrame], tileSize, tFrame,
                                NULL, &fresh, TRUE, FALSE);
                untranslateRangeList(fresh, qFrame, tFrame, t3Hash, NULL, 0);
                /* New ranges go in front of what has been gathered so far. */
                ranges = slCat(fresh, ranges);
                clump = clump->next;
                }
            }
        }

    slSort(&ranges, gfRangeCmpTarget);
    ranges = gfRangesBundle(ranges, 2000);

    for (range = ranges; range != NULL; range = range->next)
        {
        bioSeq *target = range->tSeq;
        struct ssBundle *bundle;

        AllocVar(bundle);
        bundle->qSeq = qSeq;
        bundle->genoSeq = target;
        bundle->ffList = gfRangesToFfItem(range->components, qSeq);
        ssStitch(bundle, stitchMode, minMatch, ssAliCount);
        slAddHead(&bundles, bundle);
        }

    /* Release everything except the bundle list handed back to the caller. */
    for (qFrame = 0; qFrame < 3; ++qFrame)
        {
        for (tFrame = 0; tFrame < 3; ++tFrame)
            gfClumpFreeList(&clumps[qFrame][tFrame]);
        }
    gfRangeFreeList(&ranges);
    trans3Free(&queryFrames);
    lmCleanup(&lm);

    slReverse(&bundles);
    return bundles;
}
void gfFindAlignAaTrans(struct genoFind *gfs[3], aaSeq *qSeq, struct hash *t3Hash,
                        boolean tIsRc, int minMatch, struct gfOutput *out)
/* Align protein qSeq against the three translated target frames indexed by
 * gfs and pass each resulting bundle to saveAlignments with out.
 *
 * Clumps from all three frames are turned into HSP ranges on a single
 * list, which is reversed, sorted with gfRangeCmpTarget and bundled with
 * ffIntronMax/3.  For each bundled range the trans3 entry for the target
 * is looked up in t3Hash by target name (hashMustFindVal), a protein
 * bundle is built and stitched with ffCdna, saved, and then freed.
 *
 * Ranges, clumps and the local memory pool are released before return. */
{
    struct gfClump *clumps[3];
    struct gfRange *ranges = NULL, *range;
    int tileSize = gfs[0]->tileSize;
    int numHits;
    int frame;
    struct lm *lm = lmInit(0);

    gfTransFindClumps(gfs, qSeq, clumps, lm, &numHits);

    for (frame = 0; frame < 3; ++frame)
        {
        struct gfClump *clump = clumps[frame];
        while (clump != NULL)
            {
            clumpToHspRange(clump, qSeq, tileSize, frame, NULL, &ranges, TRUE, FALSE);
            clump = clump->next;
            }
        }

    slReverse(&ranges);
    slSort(&ranges, gfRangeCmpTarget);
    ranges = gfRangesBundle(ranges, ffIntronMax/3);

    for (range = ranges; range != NULL; range = range->next)
        {
        aaSeq *target = range->tSeq;
        struct trans3 *t3 = hashMustFindVal(t3Hash, target->name);
        struct ssBundle *bundle;

        AllocVar(bundle);
        bundle->qSeq = qSeq;
        bundle->genoSeq = target;
        bundle->ffList = gfRangesToFfItem(range->components, qSeq);
        bundle->isProt = TRUE;
        bundle->t3List = t3;

        ssStitch(bundle, ffCdna, minMatch, ssAliCount);
        saveAlignments(target->name, t3->seq->size, 0, bundle, t3Hash,
                       FALSE, tIsRc, ffCdna, minMatch, out);
        ssBundleFree(&bundle);
        }

    gfRangeFreeList(&ranges);
    for (frame = 0; frame < 3; ++frame)
        gfClumpFreeList(&clumps[frame]);
    lmCleanup(&lm);
}
void gfLongDnaInMem(struct dnaSeq *query, struct genoFind *gf, boolean isRc, int minScore,
                    Bits *qMaskBits, struct gfOutput *out, boolean fastMap, boolean band)
/* Align a possibly long query by cutting it into pieces, aligning each
 * piece, and stitching the per-piece results back together.
 *
 * Piece layout: a query of at most MAXSINGLEPIECESIZE bases is handled as
 * one piece.  Otherwise pieces are 4500 bases long and each new piece
 * starts 250 bases before the end of the previous one; the final piece is
 * whatever remains.
 *
 * Each piece is aligned by one of three paths:
 *   band                -> ffSeedExtInMem
 *   !band && fastMap    -> gfFindClumpsWithQmask + fastMapClumpsToBundles
 *   !band && !fastMap   -> gfFindClumpsWithQmask + gfClumpsToBundles
 * Per-piece bundles are merged into one list by addToBigBundleList.  Every
 * merged bundle is then stitched with ffCdna, optionally run through
 * refineSmallExonsInBundle (only when neither fastMap nor band is set),
 * and passed to saveAlignments with out.
 *
 * Side effect: while a piece is processed, the byte just past its end in
 * query->dna is overwritten with 0 and restored afterwards. */
{
    int hitCount;
    int maxSize = MAXSINGLEPIECESIZE;
    int preferredSize = 4500;
    int overlapSize = 250;
    struct dnaSeq piece = *query;   /* Copy of query whose dna/size are re-aimed at each piece. */
    struct lm *lm = lmInit(0);
    struct ssBundle *pieceBundles = NULL, *allBundles = NULL, *bundle;
    struct hash *bundleHash = newHash(8);
    int pieceStart = 0;

    while (pieceStart < query->size)
        {
        int pieceSize, nextStart;
        DNA *pieceEnd, savedBase;

        /* Decide how much of the query this piece covers and where the
         * next piece begins. */
        if (pieceStart == 0 && query->size <= maxSize)
            {
            pieceSize = query->size;
            nextStart = query->size;
            }
        else if (preferredSize + pieceStart >= query->size)
            {
            pieceSize = query->size - pieceStart;
            nextStart = query->size;
            }
        else
            {
            pieceSize = preferredSize;
            nextStart = pieceStart + preferredSize - overlapSize;
            }

        piece.dna = query->dna + pieceStart;
        piece.size = pieceSize;

        /* Temporarily terminate the piece in place; undone below. */
        pieceEnd = &piece.dna[pieceSize];
        savedBase = *pieceEnd;
        *pieceEnd = 0;

        if (band)
            {
            pieceBundles = ffSeedExtInMem(gf, &piece, qMaskBits, pieceStart, lm, minScore, isRc);
            }
        else
            {
            struct gfClump *clumpList =
                gfFindClumpsWithQmask(gf, &piece, qMaskBits, pieceStart, lm, &hitCount);

            if (fastMap)
                {
                pieceBundles = fastMapClumpsToBundles(gf, clumpList, &piece);
                }
            else
                {
                struct gfRange *rangeList = NULL;
                pieceBundles = gfClumpsToBundles(clumpList, isRc, &piece, minScore, &rangeList);
                gfRangeFreeList(&rangeList);
                }
            gfClumpFreeList(&clumpList);
            }

        addToBigBundleList(&pieceBundles, bundleHash, &allBundles, query);
        *pieceEnd = savedBase;

        pieceStart = nextStart;
        }

#ifdef DEBUG
    dumpBunList(allBundles);
#endif /* DEBUG */

    for (bundle = allBundles; bundle != NULL; bundle = bundle->next)
        {
        ssStitch(bundle, ffCdna, minScore, ssAliCount);
        if (!(fastMap || band))
            refineSmallExonsInBundle(bundle);
        saveAlignments(bundle->genoSeq->name, bundle->genoSeq->size, 0, bundle, NULL,
                       isRc, FALSE, ffCdna, minScore, out);
        }

    ssBundleFreeList(&allBundles);
    freeHash(&bundleHash);
    lmCleanup(&lm);
}
void gfAlignTrans(int *pConn, char *tSeqDir, aaSeq *seq, int minMatch,
                  struct hash *tFileCache, struct gfOutput *out)
/* Query a server holding a translated genome index with amino acid
 * sequence seq, load the matching parts of the genome locally, align in
 * detail, and pass each alignment to saveAlignments with out.
 *
 * The connection *pConn is used for a single gfQuerySeqTrans call, then
 * closed, and *pConn is set to -1.
 *
 * The clumps returned by the server are indexed [strand][frame].  For each
 * of the two strands:
 *   1. A coarse range list is built from the clumps, scaled by
 *      rangeCoorTimes3, sorted, bundled with ffIntronMax, and handed to
 *      loadHashT3Ranges to load sequence; it is then discarded.
 *   2. Each clump's sequence source is re-pointed at the loaded
 *      translated frame and the clump is extended into HSP ranges.
 *   3. The ranges are reversed, sorted, bundled with ffIntronMax/3, and
 *      each bundled range is stitched (ffCdna) and saved.
 *   4. Per-strand ranges, hash, trans3 objects and sequences are freed.
 * Clumps, sequence sources (including their fileName strings) and the
 * local memory pool are freed at the end. */
{
    struct gfClump *clumps[2][3];
    struct gfRange *ranges = NULL;
    struct dnaSeq *loadedSeqs = NULL;
    struct hash *t3Hash = NULL;
    struct slRef *t3Refs = NULL;
    struct gfSeqSource *sources = NULL, *source;
    char targetName[PATH_LEN];
    int tileSize;
    int strand, frame;
    struct lm *lm = lmInit(0);

    /* Get clumps from server, then drop the connection. */
    gfQuerySeqTrans(*pConn, seq, clumps, lm, &sources, &tileSize);
    close(*pConn);
    *pConn = -1;

    for (strand = 0; strand <= 1; ++strand)
        {
        struct gfRange *range;
        struct slRef *ref;

        /* Figure out which parts of sequence we need to load. */
        for (frame = 0; frame < 3; ++frame)
            ranges = slCat(ranges, seqClumpToRangeList(clumps[strand][frame], frame));

        /* Convert from amino acid to nucleotide coordinates. */
        rangeCoorTimes3(ranges);
        slSort(&ranges, gfRangeCmpTarget);
        ranges = gfRangesBundle(ranges, ffIntronMax);
        loadHashT3Ranges(ranges, tSeqDir, tFileCache, seq->size, strand,
                         &t3Hash, &loadedSeqs, &t3Refs);

        /* The coarse range list only served to get the DNA loaded. */
        gfRangeFreeList(&ranges);

        /* Point each clump's sequence source at the loaded, translated
         * sequence and build a new range list by extending clump hits. */
        for (frame = 0; frame < 3; ++frame)
            {
            struct gfClump *clump;
            for (clump = clumps[strand][frame]; clump != NULL; clump = clump->next)
                {
                struct gfSeqSource *clumpSource = clump->target;
                struct trans3 *loaded = trans3Find(t3Hash, clumpTargetName(clump),
                                                   clump->tStart*3, clump->tEnd*3);

                clumpSource->seq = loaded->trans[frame];
                clumpSource->start = loaded->start/3;
                clumpSource->end = loaded->end/3;
                clumpToHspRange(clump, seq, tileSize, frame, loaded, &ranges, TRUE, FALSE);
                }
            }
        slReverse(&ranges);
        slSort(&ranges, gfRangeCmpTarget);
        ranges = gfRangesBundle(ranges, ffIntronMax/3);

        /* Do detailed alignment of each of the clustered ranges. */
        for (range = ranges; range != NULL; range = range->next)
            {
            struct dnaSeq *target = range->tSeq;
            struct trans3 *rangeT3;
            struct ssBundle *bundle;

            AllocVar(bundle);
            bundle->qSeq = seq;
            bundle->genoSeq = target;
            bundle->ffList = gfRangesToFfItem(range->components, seq);
            bundle->isProt = TRUE;
            rangeT3 = hashMustFindVal(t3Hash, range->tName);
            bundle->t3List = rangeT3;

            ssStitch(bundle, ffCdna, minMatch, ssAliCount);
            getTargetName(range->tName, out->includeTargetFile, targetName);
            saveAlignments(targetName, rangeT3->nibSize, 0, bundle, t3Hash,
                           FALSE, strand, ffCdna, minMatch, out);
            ssBundleFree(&bundle);
            }

        /* Cleanup for this strand of database. */
        gfRangeFreeList(&ranges);
        freeHash(&t3Hash);
        for (ref = t3Refs; ref != NULL; ref = ref->next)
            {
            struct trans3 *doomed = ref->val;
            trans3Free(&doomed);
            }
        slFreeList(&t3Refs);
        freeDnaSeqList(&loadedSeqs);
        }

    /* Final cleanup. */
    for (strand = 0; strand <= 1; ++strand)
        {
        for (frame = 0; frame < 3; ++frame)
            gfClumpFreeList(&clumps[strand][frame]);
        }
    for (source = sources; source != NULL; source = source->next)
        freeMem(source->fileName);
    slFreeList(&sources);
    lmCleanup(&lm);
}