struct dnaSeq *gfiExpandAndLoadCached(struct gfRange *range,
        struct hash *tFileCache, char *tSeqDir, int querySize,
        int *retTotalSeqSize, boolean respectFrame, boolean isRc, int expansion)
/* Widen range by expansion bases on each side, then load and return the
 * covered part of the target sequence.  Both steps live in one function
 * because the total target size, which gfiExpandRange needs, is not known
 * until the target file has been looked up.
 *
 * The target file path is tSeqDir/range->tName.  If nibIsFile() accepts the
 * path it is read as a nib file; otherwise the path must have the form
 * file:seqName and is read as a .2bit file (aborts if there is no colon).
 * Opened files are kept in tFileCache, keyed by file path.
 *
 * When isRc is set, range->tStart/tEnd are passed through reverseIntRange
 * before expansion and again after loading, and the loaded DNA is
 * reverse-complemented.  *retTotalSeqSize receives the full size of the
 * target sequence. */
{
char path[PATH_LEN+256];
struct dnaSeq *seq = NULL;
struct nibInfo *nib = NULL;
struct twoBitFile *tbf = NULL;
char *seqName = NULL;
int totalSize = 0;
boolean fromNib;

safef(path, sizeof(path), "%s/%s", tSeqDir, range->tName);
fromNib = nibIsFile(path);

/* Phase 1: find the file in the cache (opening and caching it on a miss)
 * and learn the total size of the target sequence. */
if (fromNib)
    {
    nib = hashFindVal(tFileCache, path);
    if (nib == NULL)
        {
        nib = nibInfoNew(path);
        hashAdd(tFileCache, path, nib);
        }
    totalSize = nib->size;
    }
else
    {
    /* Split "file:seqName" in place so that path holds just the file part. */
    seqName = strchr(path, ':');
    if (seqName == NULL)
        errAbort("No colon in .2bit response from gfServer");
    *seqName++ = 0;
    tbf = hashFindVal(tFileCache, path);
    if (tbf == NULL)
        {
        tbf = twoBitOpen(path);
        hashAdd(tFileCache, path, tbf);
        }
    totalSize = twoBitSeqSize(tbf, seqName);
    }

/* Phase 2: expand the range, flipping coordinates first when isRc is set. */
if (isRc)
    reverseIntRange(&range->tStart, &range->tEnd, totalSize);
gfiExpandRange(range, querySize, totalSize, respectFrame, isRc, expansion);

/* Phase 3: load the expanded range from whichever file type we have. */
if (fromNib)
    seq = nibLdPart(path, nib->f, nib->size, range->tStart, range->tEnd - range->tStart);
else
    seq = twoBitReadSeqFragLower(tbf, seqName, range->tStart, range->tEnd);

/* Phase 4: for isRc, reverse-complement the DNA and flip the range back. */
if (isRc)
    {
    reverseComplement(seq->dna, seq->size);
    reverseIntRange(&range->tStart, &range->tEnd, totalSize);
    }

*retTotalSeqSize = totalSize;
return seq;
}
void annoAssemblyGetSeq(struct annoAssembly *aa, char *seqName, uint start, uint end,
                        char *buf, size_t bufSize)
/* Copy bases [start,end) of sequence seqName into buf.  bufSize must be at
 * least end-start+1 so that there is room for the copied bases plus a
 * terminator.  The most recently used sequence is kept in aa->curSeq and is
 * reloaded from aa->tbf only when seqName differs from its name.  Aborts if
 * the coordinates are out of order or beyond the size of the sequence. */
{
struct dnaSeq *cached = aa->curSeq;

/* Cache miss: drop whatever was held before and load the requested sequence
 * (a 0,0 range -- presumably meaning the entire sequence; the size of the
 * result is treated as the chromosome size below). */
if (cached == NULL || differentString(cached->name, seqName))
    {
    dnaSeqFree(&aa->curSeq);
    aa->curSeq = twoBitReadSeqFragLower(aa->tbf, seqName, 0, 0);
    cached = aa->curSeq;
    }

uint seqLen = cached->size;
/* Valid coordinates satisfy start <= end <= seqLen (and start <= seqLen). */
if (!(end <= seqLen && start <= seqLen && start <= end))
    errAbort("annoAssemblyGetSeq: bad coords [%u,%u) (sequence %s size %u)",
             start, end, seqName, seqLen);

uint baseCount = end - start;
safencpy(buf, bufSize, cached->dna + start, baseCount);
}
struct annoRow *annoGratorGpVarIntegrate(struct annoGrator *gSelf, struct annoStreamRows *primaryData, boolean *retRJFilterFailed, struct lm *callerLm)
// Integrate a variant (the first row of primaryData) with the genePred rows
// that overlap it, generating as many output rows as needed to capture all
// the changes.
//
// gSelf must really be a struct annoGratorGpVar; it is cast to one below.
// retRJFilterFailed may be NULL; it is checked before every dereference.
// When non-NULL it is set to TRUE if gpVarOverlapRule is agoMustOverlap and
// no output row is produced.
// callerLm is handed to aggvIntergenicRow/aggvGenRows, presumably as the
// allocator for the rows returned to the caller -- TODO confirm.
//
// Returns the list of output rows, or NULL when there is nothing to report.
//
// NOTE(review): primaryData->rowList is dereferenced without a NULL check;
// this looks like it relies on the caller always supplying a primary row --
// confirm.
{
struct annoGratorGpVar *self = (struct annoGratorGpVar *)gSelf;
// self->lm is scratch memory that lives for one call only: everything
// allocated from it during the previous call is released here.
lmCleanup(&(self->lm));
self->lm = lmInit(0);
// Temporarily tweak primaryRow's start and end to find upstream/downstream overlap:
struct annoRow *primaryRow = primaryData->rowList;
int pStart = primaryRow->start, pEnd = primaryRow->end;
// Widen by GPRANGE on both sides, clamping the start at 0 rather than
// letting the subtraction go below zero.
if (primaryRow->start <= GPRANGE)
    primaryRow->start = 0;
else
    primaryRow->start -= GPRANGE;
primaryRow->end += GPRANGE;
struct annoRow *rows = annoGratorIntegrate(gSelf, primaryData, retRJFilterFailed, self->lm);
// Restore the original coordinates before anything else looks at primaryRow.
primaryRow->start = pStart;
primaryRow->end = pEnd;

// The row-to-variant converter is chosen on first use.
if (self->variantFromRow == NULL)
    setVariantFromRow(self, primaryData);

// Keep one chromosome's sequence cached; reload only when the variant's
// chromosome differs from the cached one.
if (self->curChromSeq == NULL || differentString(self->curChromSeq->name, primaryRow->chrom))
    {
    dnaSeqFree(&self->curChromSeq);
    struct twoBitFile *tbf = self->grator.streamer.assembly->tbf;
    self->curChromSeq = twoBitReadSeqFragLower(tbf, primaryRow->chrom, 0, 0);
    }
// TODO Performance improvement: instead of creating the transcript sequence for each
// variant that intersects the transcript, cache transcript sequence; possibly
// an slPair with a concatenation of {chrom, txStart, txEnd, cdsStart, cdsEnd,
// exonStarts, exonEnds} as the name, and sequence as the val.  When something in
// the list is no longer in the list of rows from the internal annoGratorIntegrate call,
// drop it.
// BETTER YET: make a callback for gpFx to get CDS sequence only when it needs it.
// The reference allele is taken from the restored (un-widened) coordinates.
char *refAllele = getGenomicSequence(self->curChromSeq->dna, primaryRow->start, primaryRow->end, self->lm);
struct variant *variant = self->variantFromRow(self, primaryRow, refAllele);

if (rows == NULL)
    {
    // No genePreds means that the primary variant is intergenic.
    // Report it only if a functional filter exists and asks for intergenic
    // variants; otherwise flag a filter failure when overlap is mandatory.
    if (self->funcFilter != NULL && self->funcFilter->intergenic)
        return aggvIntergenicRow(self, variant, retRJFilterFailed, callerLm);
    else if (retRJFilterFailed && self->gpVarOverlapRule == agoMustOverlap)
        *retRJFilterFailed = TRUE;
    return NULL;
    }
// The flag may already have been set during the annoGratorIntegrate call
// above; if so, give up now.
if (retRJFilterFailed && *retRJFilterFailed)
    return NULL;

struct annoRow *outRows = NULL;

// An "exonFrames" column in the source's autoSql selects the extended
// genePred loader instead of the basic one.
int hasFrames = (asColumnFindIx(gSelf->mySource->asObj->columnList, "exonFrames") >= 0);

for(; rows; rows = rows->next)
    {
    char **inWords = rows->data;

    // work around genePredLoad's trashing its input: clone the exonStarts and
    // exonEnds words (columns 8 and 9) before loading, then point the row at
    // the intact clones afterwards.
    char *saveExonStarts = lmCloneString(self->lm, inWords[8]);
    char *saveExonEnds = lmCloneString(self->lm, inWords[9]);
    struct genePred *gp = hasFrames ? genePredExtLoad(inWords, GENEPREDX_NUM_COLS) : genePredLoad(inWords);
    inWords[8] = saveExonStarts;
    inWords[9] = saveExonEnds;

    struct annoRow *outRow = aggvGenRows(self, variant, gp, rows, callerLm);
    if (outRow != NULL)
        {
        // Each batch is reversed and prepended here; the slReverse after the
        // loop then reverses the whole accumulated list once more.
        slReverse(&outRow);
        outRows = slCat(outRow, outRows);
        }
    genePredFree(&gp);
    }
slReverse(&outRows);
// If all rows failed the filter, and we must overlap, set *retRJFilterFailed.
if (outRows == NULL && retRJFilterFailed && self->gpVarOverlapRule == agoMustOverlap)
    *retRJFilterFailed = TRUE;

return outRows;
}