示例#1
0
struct dnaSeq *gfiExpandAndLoadCached(struct gfRange *range, 
	struct hash *tFileCache, char *tSeqDir, int querySize, 
	int *retTotalSeqSize, boolean respectFrame, boolean isRc, int expansion)
/* Widen range by expansion bases on each side (via gfiExpandRange) and
 * load the target sequence covering the widened range.  The two steps
 * are combined because the full target size, needed for expanding, is
 * only known once the target file has been looked up.
 *
 * The target file is tSeqDir/range->tName.  If nibIsFile() accepts that
 * path it is handled as a nib file; otherwise the name must have the
 * form file:seqName and is handled as a .2bit file.  Opened files are
 * kept in tFileCache keyed by file path, so each is opened only once.
 *
 * When isRc is set, range coordinates are flipped before expanding and
 * loading, the loaded sequence is reverse-complemented, and the
 * coordinates are flipped back before returning.
 *
 * The full size of the target sequence is stored in *retTotalSeqSize. */
{
char path[PATH_LEN+256];
struct nibInfo *nibEntry = NULL;
struct twoBitFile *twoBit = NULL;
char *seqInFile = NULL;
struct dnaSeq *seq = NULL;
int totalSize = 0;
boolean isNib;

safef(path, sizeof(path), "%s/%s", tSeqDir, range->tName);
isNib = nibIsFile(path);

/* Step 1: locate the file handle (cached or freshly opened) and find out
 * how big the whole target sequence is. */
if (isNib)
    {
    nibEntry = hashFindVal(tFileCache, path);
    if (nibEntry == NULL)
	{
	nibEntry = nibInfoNew(path);
	hashAdd(tFileCache, path, nibEntry);
	}
    totalSize = nibEntry->size;
    }
else
    {
    /* Split "file:seqName" in place at the first colon.
     * NOTE(review): a colon inside tSeqDir itself would be picked up
     * here instead -- confirm callers never pass such a directory. */
    seqInFile = strchr(path, ':');
    if (seqInFile == NULL)
	errAbort("No colon in .2bit response from gfServer");
    *seqInFile = 0;
    seqInFile += 1;
    twoBit = hashFindVal(tFileCache, path);
    if (twoBit == NULL)
	{
	twoBit = twoBitOpen(path);
	hashAdd(tFileCache, path, twoBit);
	}
    totalSize = twoBitSeqSize(twoBit, seqInFile);
    }

/* Step 2: flip coordinates if needed, then expand the range. */
if (isRc)
    reverseIntRange(&range->tStart, &range->tEnd, totalSize);
gfiExpandRange(range, querySize, totalSize, respectFrame, isRc, expansion);

/* Step 3: load the expanded range.  Note the nib loader takes
 * (start, size) while the twoBit loader takes (start, end). */
if (isNib)
    seq = nibLdPart(path, nibEntry->f, nibEntry->size,
	range->tStart, range->tEnd - range->tStart);
else
    seq = twoBitReadSeqFragLower(twoBit, seqInFile, range->tStart, range->tEnd);

/* Step 4: reverse-complement the sequence and flip the range
 * coordinates back. */
if (isRc)
    {
    reverseComplement(seq->dna, seq->size);
    reverseIntRange(&range->tStart, &range->tEnd, totalSize);
    }

*retTotalSeqSize = totalSize;
return seq;
}
示例#2
0
void annoAssemblyGetSeq(struct annoAssembly *aa, char *seqName, uint start, uint end,
			char *buf, size_t bufSize)
/* Copy bases [start,end) of sequence seqName into buf.  bufSize must be at
 * least end-start+1 chars in length.  The sequence most recently requested
 * is kept in aa->curSeq, so it is reloaded from aa->tbf only when seqName
 * differs from the cached one.  errAborts if the coordinates are not
 * within the sequence or start is past end. */
{
int haveCached = (aa->curSeq != NULL && !differentString(aa->curSeq->name, seqName));
if (!haveCached)
    {
    /* Drop whatever was cached and load the newly requested sequence. */
    dnaSeqFree(&aa->curSeq);
    aa->curSeq = twoBitReadSeqFragLower(aa->tbf, seqName, 0, 0);
    }

uint seqLen = aa->curSeq->size;
int coordsOk = (end <= seqLen && start <= seqLen && start <= end);
if (!coordsOk)
    errAbort("annoAssemblyGetSeq: bad coords [%u,%u) (sequence %s size %u)",
	     start, end, seqName, seqLen);

uint copyLen = end - start;
safencpy(buf, bufSize, aa->curSeq->dna + start, copyLen);
}
示例#3
0
struct annoRow *annoGratorGpVarIntegrate(struct annoGrator *gSelf,
					 struct annoStreamRows *primaryData,
					 boolean *retRJFilterFailed, struct lm *callerLm)
// integrate a variant and a genePred, generate as many rows as
// needed to capture all the changes
//
// gSelf is treated as a struct annoGratorGpVar.  The variant is taken from
// the first row of primaryData->rowList.  retRJFilterFailed is NULL-checked
// before this function writes to it, but it is also passed on unchecked to
// annoGratorIntegrate and aggvIntergenicRow.  callerLm is handed to the
// functions that build the returned rows; intermediate data uses self->lm.
// Returns the list of output rows, or NULL if there are none.
{
struct annoGratorGpVar *self = (struct annoGratorGpVar *)gSelf;
// Throw away the scratch pool left over from the previous call and start a new one.
lmCleanup(&(self->lm));
self->lm = lmInit(0);
// Temporarily tweak primaryRow's start and end to find upstream/downstream overlap:
struct annoRow *primaryRow = primaryData->rowList;
int pStart = primaryRow->start, pEnd = primaryRow->end;
// Widen by GPRANGE on each side, clamping the start at 0 instead of subtracting past it.
if (primaryRow->start <= GPRANGE)
    primaryRow->start = 0;
else
    primaryRow->start -= GPRANGE;
primaryRow->end += GPRANGE;
struct annoRow *rows = annoGratorIntegrate(gSelf, primaryData, retRJFilterFailed, self->lm);
// Put the original coordinates back before anything else looks at primaryRow.
primaryRow->start = pStart;
primaryRow->end = pEnd;

// First call only: variantFromRow has not been chosen yet.
// (presumably setVariantFromRow installs it based on primaryData -- confirm)
if (self->variantFromRow == NULL)
    setVariantFromRow(self, primaryData);
// Keep one chromosome's sequence cached; reload only when the chromosome changes.
if (self->curChromSeq == NULL || differentString(self->curChromSeq->name, primaryRow->chrom))
    {
    dnaSeqFree(&self->curChromSeq);
    struct twoBitFile *tbf = self->grator.streamer.assembly->tbf;
    self->curChromSeq = twoBitReadSeqFragLower(tbf, primaryRow->chrom, 0, 0);
    }
// TODO Performance improvement: instead of creating the transcript sequence for each
// variant that intersects the transcript, cache transcript sequence; possibly
// an slPair with a concatenation of {chrom, txStart, txEnd, cdsStart, cdsEnd,
// exonStarts, exonEnds} as the name, and sequence as the val.  When something in
// the list is no longer in the list of rows from the internal annoGratorIntegrate call,
// drop it.
// BETTER YET: make a callback for gpFx to get CDS sequence only when it needs it.
// Reference allele comes from the restored (un-widened) coordinates.
char *refAllele = getGenomicSequence(self->curChromSeq->dna, primaryRow->start, primaryRow->end,
				     self->lm);
struct variant *variant = self->variantFromRow(self, primaryRow, refAllele);

if (rows == NULL)
    {
    // No genePreds means that the primary variant is intergenic.
    if (self->funcFilter != NULL && self->funcFilter->intergenic)
	return aggvIntergenicRow(self, variant, retRJFilterFailed, callerLm);
    else if (retRJFilterFailed && self->gpVarOverlapRule == agoMustOverlap)
	*retRJFilterFailed = TRUE;
    return NULL;
    }
// If the filter-failed flag is set at this point (it was passed to
// annoGratorIntegrate above), produce no output.
if (retRJFilterFailed && *retRJFilterFailed)
    return NULL;

struct annoRow *outRows = NULL;

// Rows with an exonFrames column are loaded with the extended genePred loader below.
int hasFrames = (asColumnFindIx(gSelf->mySource->asObj->columnList, "exonFrames") >= 0);

for(; rows; rows = rows->next)
    {
    char **inWords = rows->data;

    // work around genePredLoad's trashing its input
    // (words 8 and 9 are copied first and the copies put back after loading)
    char *saveExonStarts = lmCloneString(self->lm, inWords[8]);
    char *saveExonEnds = lmCloneString(self->lm, inWords[9]);
    struct genePred *gp = hasFrames ? genePredExtLoad(inWords, GENEPREDX_NUM_COLS) :
				      genePredLoad(inWords);
    inWords[8] = saveExonStarts;
    inWords[9] = saveExonEnds;

    struct annoRow *outRow = aggvGenRows(self, variant, gp, rows, callerLm);
    if (outRow != NULL)
	{
	// Reverse this batch and put it in front of what has been collected so far;
	// together with the single slReverse after the loop this leaves batches in
	// input-row order, each in its original internal order
	// (assuming slCat(a, b) appends b after a).
	slReverse(&outRow);
	outRows = slCat(outRow, outRows);
	}
    genePredFree(&gp);
    }
slReverse(&outRows);
// If all rows failed the filter, and we must overlap, set *retRJFilterFailed.
if (outRows == NULL && retRJFilterFailed && self->gpVarOverlapRule == agoMustOverlap)
    *retRJFilterFailed = TRUE;
return outRows;
}