void liftGenePredExt(char *destFile, struct hash *liftHash, int sourceCount, char *sources[]) /* Lift a genePred files. */ { char *row[GENEPREDX_NUM_COLS]; struct lineFile* lf; FILE* dest = mustOpen(destFile, "w"); int iSrc; int colCount; for (iSrc = 0; iSrc < sourceCount; iSrc++) { verbose(1, "Lifting %s\n", sources[iSrc]); lf = lineFileOpen(sources[iSrc], TRUE); while ((colCount = lineFileChopNextTab(lf, row, ArraySize(row)))) { struct genePred* gp = genePredExtLoad(row, colCount); if (liftGenePredObj(liftHash, gp, lf)) genePredTabOut(gp, dest); genePredFree(&gp); } lineFileClose(&lf); if (dots) verbose(1, "\n"); } carefulClose(&dest); }
static struct genePred *fileNext(struct genePredReader* gpr)
/* Return the next genePred loaded from the reader's lineFile, or NULL when
 * lineFileChopNextTab() reports no more fields.  If gpr->chrom is set, rows
 * whose second column does not match it are skipped. */
{
char *words[GENEPREDX_NUM_COLS];

for (;;)
    {
    int wordCount = lineFileChopNextTab(gpr->lf, words, GENEPREDX_NUM_COLS);
    if (wordCount <= 0)
        return NULL;
    /* optional columns may be absent, but the base genePred columns may not */
    lineFileExpectAtLeast(gpr->lf, GENEPRED_NUM_COLS, wordCount);
    if (gpr->chrom != NULL && !sameString(words[1], gpr->chrom))
        continue;   /* row is not for the requested chrom */
    return genePredExtLoad(words, wordCount);
    }
}
static struct genePred *queryNext(struct genePredReader* gpr)
/* Return the next genePred built from the reader's SQL result, or NULL when
 * sqlNextRow() has no more rows.  The query columns are rearranged into
 * genePred field order before loading. */
{
char **sqlRow = sqlNextRow(gpr->sr);
char *ordered[GENEPREDX_NUM_COLS];
int fld, col;

if (sqlRow == NULL)
    return NULL;

/* begin with the default value of every field */
for (fld = 0; fld < GENEPREDX_NUM_COLS; ++fld)
    ordered[fld] = fieldTbl[fld].defaultVal;

/* drop each query column into its mapped field; negative map entries are ignored */
for (col = 0; col < gpr->queryCols; ++col)
    {
    int target = gpr->queryToFldMap[col];
    if (target < 0)
        continue;
    ordered[target] = sqlRow[col];
    }

return genePredExtLoad(ordered, gpr->numFields);
}
struct annoRow *annoGratorGpVarIntegrate(struct annoGrator *gSelf, struct annoStreamRows *primaryData,
                                         boolean *retRJFilterFailed, struct lm *callerLm)
// Integrate the primary variant row with the overlapping genePred rows and
// return as many output rows (built with callerLm) as are needed to describe
// all of the changes.  Returns NULL when nothing is produced; when
// retRJFilterFailed is non-NULL it may be set to TRUE as described below.
{
struct annoGratorGpVar *self = (struct annoGratorGpVar *)gSelf;

// Start each call with a fresh scratch pool.
lmCleanup(&(self->lm));
self->lm = lmInit(0);

// Widen primaryRow by GPRANGE on both sides (start clamped at 0) just for the
// overlap query so that upstream/downstream genePreds are found, then restore it.
struct annoRow *primaryRow = primaryData->rowList;
int origStart = primaryRow->start;
int origEnd = primaryRow->end;
primaryRow->start = (primaryRow->start <= GPRANGE) ? 0 : primaryRow->start - GPRANGE;
primaryRow->end += GPRANGE;
struct annoRow *gpRows = annoGratorIntegrate(gSelf, primaryData, retRJFilterFailed, self->lm);
primaryRow->start = origStart;
primaryRow->end = origEnd;

if (self->variantFromRow == NULL)
    setVariantFromRow(self, primaryData);

// Load the sequence for primaryRow's chrom unless the cached one already matches.
boolean haveChromSeq = (self->curChromSeq != NULL
                        && !differentString(self->curChromSeq->name, primaryRow->chrom));
if (!haveChromSeq)
    {
    dnaSeqFree(&self->curChromSeq);
    struct twoBitFile *tbf = self->grator.streamer.assembly->tbf;
    self->curChromSeq = twoBitReadSeqFragLower(tbf, primaryRow->chrom, 0, 0);
    }

// TODO Performance improvement: the transcript sequence is currently created for
// every variant that intersects the transcript.  Cache it instead -- possibly an
// slPair whose name is a concatenation of {chrom, txStart, txEnd, cdsStart, cdsEnd,
// exonStarts, exonEnds} and whose val is the sequence -- and drop an entry once it
// no longer appears in the rows returned by the internal annoGratorIntegrate call.
// BETTER YET: make a callback for gpFx to get CDS sequence only when it needs it.

// The reference allele is taken from the restored (un-widened) coordinates.
char *refAllele = getGenomicSequence(self->curChromSeq->dna, primaryRow->start,
                                     primaryRow->end, self->lm);
struct variant *variant = self->variantFromRow(self, primaryRow, refAllele);

if (gpRows == NULL)
    {
    // No genePreds means that the primary variant is intergenic.
    if (self->funcFilter != NULL && self->funcFilter->intergenic)
        return aggvIntergenicRow(self, variant, retRJFilterFailed, callerLm);
    if (retRJFilterFailed && self->gpVarOverlapRule == agoMustOverlap)
        *retRJFilterFailed = TRUE;
    return NULL;
    }

if (retRJFilterFailed && *retRJFilterFailed)
    return NULL;

struct annoRow *outRows = NULL;
int hasFrames = (asColumnFindIx(gSelf->mySource->asObj->columnList, "exonFrames") >= 0);
struct annoRow *gpRow = gpRows;
while (gpRow != NULL)
    {
    char **inWords = gpRow->data;
    // genePredLoad trashes its input, so copy the exon lists beforehand and
    // put the copies back into the row afterwards.
    char *exonStartsCopy = lmCloneString(self->lm, inWords[8]);
    char *exonEndsCopy = lmCloneString(self->lm, inWords[9]);
    struct genePred *gp;
    if (hasFrames)
        gp = genePredExtLoad(inWords, GENEPREDX_NUM_COLS);
    else
        gp = genePredLoad(inWords);
    inWords[8] = exonStartsCopy;
    inWords[9] = exonEndsCopy;

    struct annoRow *gpOutRows = aggvGenRows(self, variant, gp, gpRow, callerLm);
    if (gpOutRows != NULL)
        {
        // Prepend this genePred's rows reversed; the final slReverse below
        // puts the whole list back into forward order.
        slReverse(&gpOutRows);
        outRows = slCat(gpOutRows, outRows);
        }
    genePredFree(&gp);
    gpRow = gpRow->next;
    }
slReverse(&outRows);

// If all rows failed the filter, and we must overlap, set *retRJFilterFailed.
if (outRows == NULL && retRJFilterFailed && self->gpVarOverlapRule == agoMustOverlap)
    *retRJFilterFailed = TRUE;
return outRows;
}