void liftGenePredExt(char *destFile, struct hash *liftHash, int sourceCount, char *sources[])
/* Lift a genePred files. */
{
char *row[GENEPREDX_NUM_COLS];
struct lineFile* lf;
FILE* dest = mustOpen(destFile, "w");
int iSrc;
int colCount;

for (iSrc = 0; iSrc < sourceCount; iSrc++)
    {
    verbose(1, "Lifting %s\n", sources[iSrc]);
    lf = lineFileOpen(sources[iSrc], TRUE);
    while ((colCount = lineFileChopNextTab(lf, row, ArraySize(row))))
        {
        struct genePred* gp = genePredExtLoad(row, colCount);
        if (liftGenePredObj(liftHash, gp, lf))
            genePredTabOut(gp, dest);
        genePredFree(&gp);
        }
    lineFileClose(&lf);
    if (dots)
        verbose(1, "\n");
    }

carefulClose(&dest);
}
示例#2
0
static struct genePred *fileNext(struct genePredReader* gpr)
/* read the next record from a file */
{
char *row[GENEPREDX_NUM_COLS];
int numFields;

while ((numFields = lineFileChopNextTab(gpr->lf, row, GENEPREDX_NUM_COLS)) > 0)
    {
    lineFileExpectAtLeast(gpr->lf, GENEPRED_NUM_COLS, numFields);
    if ((gpr->chrom == NULL) || (sameString(row[1], gpr->chrom)))
        return genePredExtLoad(row, numFields);
    }
return NULL;
}
示例#3
0
static struct genePred *queryNext(struct genePredReader* gpr)
/* read the next record from a query */
{
int iFld, iCol;
char **row = sqlNextRow(gpr->sr);
char *reorderedRow[GENEPREDX_NUM_COLS];
if (row == NULL)
    return NULL;

/* fill in row defaults */
for (iFld = 0; iFld < GENEPREDX_NUM_COLS; iFld++)
    reorderedRow[iFld] = fieldTbl[iFld].defaultVal;

/* reorder row */
for (iCol = 0; iCol < gpr->queryCols; iCol++)
    {
    iFld = gpr->queryToFldMap[iCol];
    if (iFld >= 0)
        reorderedRow[iFld] = row[iCol];
    }
return genePredExtLoad(reorderedRow, gpr->numFields);
}
示例#4
0
struct annoRow *annoGratorGpVarIntegrate(struct annoGrator *gSelf,
					 struct annoStreamRows *primaryData,
					 boolean *retRJFilterFailed, struct lm *callerLm)
// integrate a variant and a genePred, generate as many rows as
// needed to capture all the changes
{
struct annoGratorGpVar *self = (struct annoGratorGpVar *)gSelf;
lmCleanup(&(self->lm));
self->lm = lmInit(0);
// Temporarily tweak primaryRow's start and end to find upstream/downstream overlap:
struct annoRow *primaryRow = primaryData->rowList;
int pStart = primaryRow->start, pEnd = primaryRow->end;
if (primaryRow->start <= GPRANGE)
    primaryRow->start = 0;
else
    primaryRow->start -= GPRANGE;
primaryRow->end += GPRANGE;
struct annoRow *rows = annoGratorIntegrate(gSelf, primaryData, retRJFilterFailed, self->lm);
primaryRow->start = pStart;
primaryRow->end = pEnd;

if (self->variantFromRow == NULL)
    setVariantFromRow(self, primaryData);
if (self->curChromSeq == NULL || differentString(self->curChromSeq->name, primaryRow->chrom))
    {
    dnaSeqFree(&self->curChromSeq);
    struct twoBitFile *tbf = self->grator.streamer.assembly->tbf;
    self->curChromSeq = twoBitReadSeqFragLower(tbf, primaryRow->chrom, 0, 0);
    }
// TODO Performance improvement: instead of creating the transcript sequence for each
// variant that intersects the transcript, cache transcript sequence; possibly
// an slPair with a concatenation of {chrom, txStart, txEnd, cdsStart, cdsEnd,
// exonStarts, exonEnds} as the name, and sequence as the val.  When something in
// the list is no longer in the list of rows from the internal annoGratorIntegrate call,
// drop it.
// BETTER YET: make a callback for gpFx to get CDS sequence only when it needs it.
char *refAllele = getGenomicSequence(self->curChromSeq->dna, primaryRow->start, primaryRow->end,
				     self->lm);
struct variant *variant = self->variantFromRow(self, primaryRow, refAllele);

if (rows == NULL)
    {
    // No genePreds means that the primary variant is intergenic.
    if (self->funcFilter != NULL && self->funcFilter->intergenic)
	return aggvIntergenicRow(self, variant, retRJFilterFailed, callerLm);
    else if (retRJFilterFailed && self->gpVarOverlapRule == agoMustOverlap)
	*retRJFilterFailed = TRUE;
    return NULL;
    }
if (retRJFilterFailed && *retRJFilterFailed)
    return NULL;

struct annoRow *outRows = NULL;

int hasFrames = (asColumnFindIx(gSelf->mySource->asObj->columnList, "exonFrames") >= 0);

for(; rows; rows = rows->next)
    {
    char **inWords = rows->data;

    // work around genePredLoad's trashing its input
    char *saveExonStarts = lmCloneString(self->lm, inWords[8]);
    char *saveExonEnds = lmCloneString(self->lm, inWords[9]);
    struct genePred *gp = hasFrames ? genePredExtLoad(inWords, GENEPREDX_NUM_COLS) :
				      genePredLoad(inWords);
    inWords[8] = saveExonStarts;
    inWords[9] = saveExonEnds;

    struct annoRow *outRow = aggvGenRows(self, variant, gp, rows, callerLm);
    if (outRow != NULL)
	{
	slReverse(&outRow);
	outRows = slCat(outRow, outRows);
	}
    genePredFree(&gp);
    }
slReverse(&outRows);
// If all rows failed the filter, and we must overlap, set *retRJFilterFailed.
if (outRows == NULL && retRJFilterFailed && self->gpVarOverlapRule == agoMustOverlap)
    *retRJFilterFailed = TRUE;
return outRows;
}