static void chkGenePredRows(struct gbSelect* select,
                            struct sqlConnection* conn,
                            char* table, boolean isRefFlat,
                            struct metaDataTbls* metaDataTbls,
                            unsigned typeFlags)
/* Run chkGenePred on every row of a genePred or refFlat table.  The genePred
 * columns are found by skipping one column for refFlat tables and one more
 * when the table has a "bin" column.  For refFlat tables row[0] is passed
 * along as the gene name; otherwise the gene name is NULL. */
{
char sql[512];
char **cols;
char *refFlatName = NULL;   /* stays NULL for plain genePred tables */
unsigned rowNum;
int skip = 0;               /* columns that precede the genePred columns */

if (isRefFlat)
    skip++;
if (sqlFieldIndex(conn, table, "bin") >= 0)
    skip++;

sqlSafef(sql, sizeof(sql), "SELECT * FROM %s", table);
struct sqlResult *result = sqlGetResult(conn, sql);
for (rowNum = 0; (cols = sqlNextRow(result)) != NULL; rowNum++)
    {
    struct genePred *gp = genePredLoad(cols + skip);
    if (isRefFlat)
        refFlatName = cols[0];
    chkGenePred(gp, refFlatName, rowNum, select->release->genome->database,
                table, metaDataTbls, typeFlags);
    genePredFree(&gp);
    }
sqlFreeResult(&result);
}
void findGenePredOverlap(struct hash *chromHash, char **row, FILE *outFh) /* find and output overlaps with a genePred object */ { struct genePred *gene = genePredLoad(row); struct binKeeper *chromBins = getChromBins(chromHash, gene->chrom, gene->strand); struct geneLoc *geneLocList = NULL; struct geneLoc *geneLoc; int iExon; /* get any with overlaping exons */ for (iExon = 0; iExon < gene->exonCount; iExon++) { int exonStart = gene->exonStarts[iExon]; int exonEnd = gene->exonEnds[iExon]; if (gCdsOnly) { exonStart = max(exonStart, gene->cdsStart); exonEnd = min(exonEnd, gene->cdsEnd); } if (exonStart < exonEnd) findOverlapingExons(&geneLocList, chromBins, exonStart, exonEnd); } for (geneLoc = geneLocList; geneLoc != NULL; geneLoc = geneLoc->next) fprintf(outFh, "%s\t%s\t%s\t%d\t%d\t%s\t%d\t%d\t%d\n", geneLoc->chrom, geneLoc->strand, gene->name, gene->txStart, gene->txEnd, geneLoc->name, geneLoc->start, geneLoc->end, geneLoc->numOverlap); geneLocUnlink(&geneLocList); genePredFree(&gene); }
void liftGenePredExt(char *destFile, struct hash *liftHash, int sourceCount, char *sources[]) /* Lift a genePred files. */ { char *row[GENEPREDX_NUM_COLS]; struct lineFile* lf; FILE* dest = mustOpen(destFile, "w"); int iSrc; int colCount; for (iSrc = 0; iSrc < sourceCount; iSrc++) { verbose(1, "Lifting %s\n", sources[iSrc]); lf = lineFileOpen(sources[iSrc], TRUE); while ((colCount = lineFileChopNextTab(lf, row, ArraySize(row)))) { struct genePred* gp = genePredExtLoad(row, colCount); if (liftGenePredObj(liftHash, gp, lf)) genePredTabOut(gp, dest); genePredFree(&gp); } lineFileClose(&lf); if (dots) verbose(1, "\n"); } carefulClose(&dest); }
static void checkGenePred(char *fileTbl)
/* Check every record of a genePred file or table.  fileTbl is read as a file
 * when a file of that name exists; otherwise it is queried as a table of
 * database gDb, and it is an error if gDb is not set. */
{
struct sqlConnection *conn = NULL;
struct genePredReader *reader;
struct genePred *rec;
int recNum = 0;

if (fileExists(fileTbl))
    reader = genePredReaderFile(fileTbl, NULL);
else if (gDb == NULL)
    errAbort("file %s doesn't exist, must specify -db=db if this is a table", fileTbl);
else
    {
    conn = hAllocConn(gDb);
    reader = genePredReaderQuery(conn, fileTbl, NULL);
    }

for (rec = genePredReaderNext(reader); rec != NULL; rec = genePredReaderNext(reader))
    {
    recNum++;   /* record numbers passed to the checker start at 1 */
    checkAGenePred(fileTbl, recNum, rec);
    genePredFree(&rec);
    }
genePredReaderFree(&reader);
hFreeConn(&conn);
}
static void showMrnaFromGenePred(struct sqlConnection *conn, char *geneId, char *geneName) /* Get mRNA sequence for gene from gene prediction. */ { char *table = genomeSetting("knownGene"); struct sqlResult *sr; char **row; char query[256]; boolean hasBin = hIsBinned(sqlGetDatabase(conn), table); hPrintf("<TT><PRE>"); safef(query, sizeof(query), "select * from %s where name='%s'" " and chrom='%s' and txStart=%d and txEnd=%d", table, geneId, curGeneChrom, curGeneStart, curGeneEnd); sr = sqlGetResult(conn, query); if ((row = sqlNextRow(sr)) != NULL) { struct genePred *gene = genePredLoad(row+hasBin); struct bed *bed = bedFromGenePred(gene); struct dnaSeq *seq = hSeqForBed(sqlGetDatabase(conn), bed); hPrintf(">%s (%s predicted mRNA)\n", geneId, geneName); writeSeqWithBreaks(stdout, seq->dna, seq->size, 50); dnaSeqFree(&seq); bedFree(&bed); genePredFree(&gene); } else errAbort("Couldn't find %s at %s:%d-%d", geneId, curGeneChrom, curGeneStart, curGeneEnd); sqlFreeResult(&sr); hPrintf("</TT></PRE>"); }
static struct chromAnn* chromAnnGenePredReaderRead(struct chromAnnReader *car) /* Read the next genePred row and create a chromAnn object row read from a * GenePred file or table. If there is no CDS, and chromAnnCds is specified, * it will return a record with zero-length range.*/ { struct rowReader *rr = car->data; if (!rowReaderNext(rr)) return NULL; rowReaderExpectAtLeast(rr, GENEPRED_NUM_COLS); char **rawCols = (car->opts & chromAnnSaveLines) ? rowReaderCloneColumns(rr) : NULL; struct genePred *gp = genePredLoad(rr->row); struct chromAnn* ca = chromAnnNew(gp->chrom, gp->strand[0], gp->name, rawCols, strVectorWrite, strVectorFree); if (car->opts & chromAnnRange) { if (car->opts & chromAnnCds) { if (gp->cdsStart < gp->cdsEnd) chromAnnBlkNew(ca, gp->cdsStart, gp->cdsEnd); } else chromAnnBlkNew(ca, gp->txStart, gp->txEnd); } else addGenePredBlocks(ca, car->opts, gp); chromAnnFinish(ca); genePredFree(&gp); return ca; }
void convertPsl(struct psl *psl, struct genbankCds *cds, FILE *genePredFh)
/* Convert a psl and its cds to a genePred and write it to genePredFh.
 * Nothing is written when pslToGenePred returns NULL. */
{
struct genePred *gp = pslToGenePred(psl, cds);
if (gp == NULL)
    return;
genePredTabOut(gp, genePredFh);
genePredFree(&gp);
}
void geneFreeList(struct genePred **gList)
/* Free every genePred on the list and set *gList to NULL. */
{
struct genePred *cur = *gList;
while (cur != NULL)
    {
    struct genePred *rest = cur->next;   /* grab the link before cur is freed */
    genePredFree(&cur);
    cur = rest;
    }
*gList = NULL;
}
static void gbGeneTblWriteGeneFlat(struct gbGeneTbl *ggt, struct gbStatus* status,
                                   struct psl* psl, struct sqlConnection *conn)
/* Write one flat row: the gene name (empty when status has none), a tab,
 * then the genePred built from psl and status->cds. */
{
struct genePred *pred = genePredFromPsl3(psl, &status->cds, 0, genePredPslCdsMod3,
                                         genePredStdInsertMergeSize,
                                         genePredStdInsertMergeSize);
FILE *tabFh = gbGeneTblGetFlatTabFh(ggt, conn);
char *label = status->geneName;

if (label == NULL)
    label = "";
fprintf(tabFh, "%s\t", label);
genePredTabOut(pred, tabFh);
genePredFree(&pred);
}
void fillInGene(struct chain *chain, struct genePred *gene, struct genePred **pGene) /** Fill in syntenic gene structure with initial information for gene. */ { FILE *cdsErrorFp; struct genePred *synGene = NULL; int qs, qe; struct chain *subChain=NULL, *toFree=NULL; AllocVar(synGene); chainSubSetForRegion(chain, gene->txStart, gene->txEnd , &subChain, &toFree); if(subChain == NULL) { *pGene= NULL; return; } qChainRangePlusStrand(subChain, &qs, &qe); synGene->chrom = cloneString(subChain->qName); synGene->name = cloneString(gene->name); synGene->txStart = qs; synGene->txEnd = qe; AllocArray(synGene->exonStarts, gene->exonCount); AllocArray(synGene->exonEnds, gene->exonCount); if(chain->qStrand == '+') strncpy(synGene->strand, gene->strand, sizeof(synGene->strand)); else { if(gene->strand[0] == '+') strncpy(synGene->strand, "-", sizeof(synGene->strand)); else if(gene->strand[0] == '-') strncpy(synGene->strand, "+", sizeof(synGene->strand)); else errAbort("Don't recognize strand %s from gene %s", gene->strand, gene->name); } chainFree(&toFree); chainSubSetForRegion(chain, gene->cdsStart, gene->cdsEnd , &subChain, &toFree); if(subChain == NULL ) { if(optionExists("cdsErrorFile")) { cdsErrorFp = fopen( optionVal("cdsErrorFile",NULL), "a" ); fprintf( cdsErrorFp, "%s\t%s\t%u\t%u\t%u\t%u\t%s\t%d\n", gene->name, gene->chrom, gene->txStart, gene->txEnd, gene->cdsStart, gene->cdsEnd, gene->strand, gene->exonCount ); fclose(cdsErrorFp); } *pGene = NULL; genePredFree(&synGene); return; } qChainRangePlusStrand(subChain, &qs, &qe); synGene->cdsStart = qs; synGene->cdsEnd = qe; chainFree(&toFree); *pGene = synGene; }
/* Convert one whitespace-separated line read from a bed file to a genePred
 * and write it to gpFh.  Aborts when the line has fewer than 4 columns; at
 * most 12 columns are used. */
void cnvBedRec(char *line, FILE *gpFh)
{
char *fields[12];
int fieldCount = chopByWhite(line, fields, ArraySize(fields));

if (fieldCount < 4)
    errAbort("bed must have at least 4 columns");

struct bed *rec = bedLoadN(fields, fieldCount);
struct genePred *pred = bedToGenePred(rec);
genePredTabOut(pred, gpFh);
genePredFree(&pred);
bedFree(&rec);
}
static void gtfGroupToGenePred(struct gffFile *gtf, struct gffGroup *group, FILE *gpFh, FILE *infoFh) /* convert one gtf group to a genePred */ { unsigned optFields = (clGenePredExt ? genePredAllFlds : 0); struct errCatch *errCatch = errCatchNew(); if (errCatchStart(errCatch)) { struct genePred *gp = genePredFromGroupedGtf(gtf, group, group->name, optFields, clGxfOptions); if (gp == NULL) { if (!clIgnoreGroupsWithoutExons) { char *msg = "no exons defined for group %s, feature %s (perhaps try -ignoreGroupsWithoutExons)"; if (clAllErrors) { fprintf(stderr, msg, group->name, group->lineList->feature); fputc('\n', stderr); badGroupCount++; } else errAbort(msg, group->name, group->lineList->feature); } } else { genePredTabOut(gp, gpFh); genePredFree(&gp); } } errCatchEnd(errCatch); if (errCatch->gotError) { // drop trailing newline in caught message if (endsWith(errCatch->message->string, "\n")) dyStringResize(errCatch->message, dyStringLen(errCatch->message)-1); if (clAllErrors) { fprintf(stderr, "%s\n", errCatch->message->string); badGroupCount++; } else errAbort("%s", errCatch->message->string); } else { if (infoFh != NULL) writeInfo(infoFh, group); } errCatchFree(&errCatch); }
static void printCcdsHgGeneUrl(struct sqlConnection *conn, char *ccdsId, char* kgId) /* output a URL to hgGene for a ccds */ { char where[128]; struct genePredReader *gpr; struct genePred *ccdsGene = NULL, *kgGene = NULL; /* get ccds genePred to get location */ sqlSafefFrag(where, sizeof(where), "chrom = '%s' and name = '%s'", seqName, ccdsId); gpr = genePredReaderQuery(conn, "ccdsGene", where); ccdsGene = genePredReaderAll(gpr); genePredReaderFree(&gpr); if (ccdsGene == NULL) errAbort("%s not found in ccdsGene table for chrom %s", ccdsId, seqName); else if (ccdsGene->next != NULL) errAbort("multiple %s rows found in ccdsGene table for chrom %s", ccdsId, seqName); /* get KG genePred, as need exact location for link */ sqlSafefFrag(where, sizeof(where), "name = '%s' and strand = '%s'", kgId, ccdsGene->strand); gpr = genePredReaderRangeQuery(conn, "knownGene", seqName, ccdsGene->txStart, ccdsGene->txEnd, where); kgGene = genePredReaderAll(gpr); genePredReaderFree(&gpr); if (kgGene == NULL) errAbort("%s not found in knownGene table for chrom %s", kgId, seqName); else if (kgGene->next != NULL) errAbort("multiple %s rows found in knownGene table for chrom %s", kgId, seqName); printf("../cgi-bin/hgGene?%s&%s=%s&%s=%s&%s=%s&%s=%d&%s=%d", cartSidUrlString(cart), "db", database, "hgg_gene", kgId, "hgg_chrom", seqName, "hgg_start", kgGene->txStart, "hgg_end", kgGene->txEnd); genePredFree(&ccdsGene); genePredFree(&kgGene); }
void doGenePreds(struct sqlConnection *conn, char *db, char *orthoDb, char *chrom, char *netTable, char *geneFileName, char *geneTableName, char *outBedName, char *selectedFileName, int *foundCount, int *notFoundCount) /* Map over genePreds. */ { FILE *bedOut = NULL; FILE *selectedOut = NULL; FILE *cdsErrorFp = NULL; struct genePred *gene = NULL, *geneList = NULL; struct bed *bed = NULL; //init output files if(optionExists("cdsErrorFile")) { cdsErrorFp = fopen( optionVal("cdsErrorFile", NULL), "w" ); fprintf( cdsErrorFp, "#name\tchrom\ttxStart\ttxEnd\tcdsStart\tcdsEnd\tstrand\texonCount\n" ); fclose(cdsErrorFp); } warn("Loading Gene Predictions."); assert(outBedName); if(geneFileName) geneList=genePredLoadAll(geneFileName); else geneList=loadGeneFromTable(conn, geneTableName, chrom, 0, BIGNUM); /* Convert genePreds. */ warn("Converting genes."); bedOut = mustOpen(outBedName, "w"); if (selectedFileName != NULL) selectedOut = mustOpen(selectedFileName, "w"); for(gene = geneList; gene != NULL; gene = gene->next) { struct genePred *synGene = NULL; if(differentString(gene->chrom, chrom)) continue; synGene = orthoBedFromGene(conn, db, orthoDb, netTable, gene); occassionalDot(); if(synGene != NULL && synGene->exonCount > 0) { (*foundCount)++; genePredTabOut(synGene, bedOut); if (selectedOut != NULL) genePredTabOut(gene, selectedOut); } else (*notFoundCount)++; genePredFree(&synGene); } carefulClose(&selectedOut); carefulClose(&bedOut); }
static void getGeneAnns(struct sqlConnection *conn, struct hash *refSeqVerInfoTbl, char *outFile)
/* Read every genePred of the refGene table and hand each one to
 * processGenePred together with an output stream opened on outFile. */
{
struct genePredReader *reader = genePredReaderQuery(conn, "refGene", NULL);
FILE *out = mustOpen(outFile, "w");
struct genePred *rec;

for (rec = genePredReaderNext(reader); rec != NULL; rec = genePredReaderNext(reader))
    {
    processGenePred(out, refSeqVerInfoTbl, rec);
    genePredFree(&rec);
    }
carefulClose(&out);
genePredReaderFree(&reader);
}
static void genePredHisto(char *what, char *gpFile, char *outFile) /* get data for generating histograms from a genePred file. */ { struct genePredReader *gpr = genePredReaderFile(gpFile, NULL); histoFuncType histoFunc = getHistoFunc(what); struct genePred *gp; FILE *outFh = mustOpen(outFile, "w"); while ((gp = genePredReaderNext(gpr)) != NULL) { histoFunc(gp, outFh); genePredFree(&gp); } carefulClose(&outFh); genePredReaderFree(&gpr); }
static void gbGeneTblWriteGene(struct gbGeneTbl *ggt, struct gbStatus* status, struct psl* psl, struct sqlConnection *conn) /* write genePred row */ { struct genePred* gp = genePredFromPsl3(psl, &status->cds, (ggt->hasExtCols ? genePredAllFlds : 0), genePredPslCdsMod3, genePredStdInsertMergeSize, genePredStdInsertMergeSize); FILE *fh = gbGeneTblGetTabFh(ggt, conn); if (ggt->hasExtCols) { /* add gene name */ freeMem(gp->name2); gp->name2 = cloneString(status->geneName); } if (ggt->hasBin) fprintf(fh, "%u\t", hFindBin(gp->txStart, gp->txEnd)); genePredTabOut(gp, fh); genePredFree(&gp); }
static void capAliTextOnTrack(struct mafAli *maf,
        char *db, char *chrom, char *track, boolean onlyCds)
/* Capitalize exons in alignment.  Exons of the genePreds in track that
 * overlap the first component of maf are clipped to that component's range
 * (and to the CDS when onlyCds is set); wherever findAliRange maps the
 * clipped exon into the alignment text, that stretch is upper-cased in every
 * component that has text. */
{
int rowOffset;
struct sqlConnection *conn = sqlConnect(db);
struct mafComp *selfMc = maf->components;
struct mafComp *comp;
int start = selfMc->start;
int end = start + selfMc->size;
struct sqlResult *sr = hRangeQuery(conn, track, chrom, start, end, NULL, &rowOffset);
char **row;

while ((row = sqlNextRow(sr)) != NULL)
    {
    struct genePred *gp = genePredLoad(row+rowOffset);
    int exonIx;
    for (exonIx = 0; exonIx < gp->exonCount; ++exonIx)
        {
        int from = gp->exonStarts[exonIx];
        int to = gp->exonEnds[exonIx];
        if (onlyCds)
            {
            if (from < gp->cdsStart)
                from = gp->cdsStart;
            if (to > gp->cdsEnd)
                to = gp->cdsEnd;
            }
        if (from < start)
            from = start;
        if (to > end)
            to = end;
        /* from/to go in relative to start and are overwritten with the
         * range that findAliRange reports */
        if (!findAliRange(selfMc->text, maf->textSize, from-start, to-start, &from, &to))
            continue;
        for (comp = maf->components; comp != NULL; comp = comp->next)
            {
            if (comp->text)
                toUpperN(comp->text + from, to-from);
            }
        }
    genePredFree(&gp);
    }
sqlFreeResult(&sr);
sqlDisconnect(&conn);
}
void borfMatcher(char *bedIn, char *borfIn, char *bedOutFile, char *genePredOutFile) /* Top level function to open files and call other functions. */ { struct borf *borf = NULL, *borfList = NULL; struct bed *bed = NULL, *bedList = NULL; struct genePred *gp = NULL; float threshold = optionFloat("minScore", 50); FILE *bedOut = mustOpen(bedOutFile, "w"); FILE *genePredOut = mustOpen(genePredOutFile, "w"); boolean keepSmall = optionExists("keepSmall"); boolean keepNmd = optionExists("keepNmd"); borfList = borfLoadAll(borfIn); bedList = bedLoadAll(bedIn); dotForUserInit(slCount(bedList)/10); for(bed = bedList, borf = borfList; bed != NULL && borf != NULL; bed = bed->next, borf = borf->next) { dotForUser(); if(!stringIn(bed->name, borf->name)) errAbort("Trying to match up %s bed with %s borf - bad idea!", bed->name, borf->name); /* Have to adjust cds end. Borf puts stop codon outside of cds, we put it inside. */ borf->cdsEnd = min(borf->cdsEnd+3, borf->size); if((borf->score > threshold || (keepSmall && borf->cdsSize > 0)) && sameString(borf->strand, "+")) { setThickStartStop(bed, borf); if(keepNmd || !nmdTarget(bed)) { gp = bedToGenePred(bed); bedTabOutN(bed, 12, bedOut); genePredTabOut(gp, genePredOut); genePredFree(&gp); } } } warn("Done."); carefulClose(&bedOut); carefulClose(&genePredOut); }
void addGenePred(struct hash *chromHash, char **row) /* add a genePred's exons to the approriate binkeeper object in hash */ { struct genePred *gene = genePredLoad(row); int iExon; struct binKeeper *chromBins = getChromBins(chromHash, gene->chrom, gene->strand); struct geneLoc *geneLoc = geneLocNew(chromHash->lm, gene->name, gene->chrom, gene->strand, gene->txStart, gene->txEnd); for (iExon = 0; iExon < gene->exonCount; iExon++) { int exonStart = gene->exonStarts[iExon]; int exonEnd = gene->exonEnds[iExon]; if (gCdsOnly) { exonStart = max(exonStart, gene->cdsStart); exonEnd = min(exonEnd, gene->cdsEnd); } if (exonStart < exonEnd) binKeeperAdd(chromBins, exonStart, exonEnd, geneLoc); } genePredFree(&gene); }
void intronSizes(char *database, char *table) /* intronSizes - Output list of intron sizes.. */ { struct dyString *query = newDyString(1024); struct sqlConnection *conn; struct sqlResult *sr; char **row; struct genePred *gp; int rowOffset; struct bed *bedList = NULL, *bed = NULL; hSetDb(database); rowOffset = hOffsetPastBin(NULL, table); conn = hAllocConn(database); sqlDyStringPrintf(query, "select * from %s", table); if (chromName != NULL) dyStringPrintf(query, " where chrom = '%s'", chromName); if (cgiBoolean("withUtr")) { dyStringPrintf(query, " %s txStart != cdsStart", (chromName == NULL ? "where" : "and")); } sr = sqlGetResult(conn, query->string); while ((row = sqlNextRow(sr)) != NULL) { gp = genePredLoad(row+rowOffset); genePredIntrons(gp, &bedList); slReverse(&bedList); for (bed = bedList ; bed != NULL ; bed=bed->next) bedTabOutN(bed,6, stdout); bedFreeList(&bedList); genePredFree(&gp); } sqlFreeResult(&sr); hFreeConn(&conn); }
void intronEnds(char *database, char *table) /* intronEnds - Gather stats on intron ends.. */ { struct dyString *query = newDyString(1024); struct sqlConnection *conn; struct sqlResult *sr; char **row; struct genePred *gp; int total = 0; int gtag = 0; int gcag = 0; int atac = 0; int ctac = 0; DNA ends[4]; int exonIx, txStart; struct dnaSeq *seq; int rowOffset; char strand; rowOffset = hOffsetPastBin(database, NULL, table); conn = hAllocConn(database); sqlDyStringPrintf(query, "select * from %s", table); if (chromName != NULL) dyStringPrintf(query, " where chrom = '%s'", chromName); if (cgiBoolean("withUtr")) { dyStringPrintf(query, " %s txStart != cdsStart", (chromName == NULL ? "where" : "and")); } sr = sqlGetResult(conn, query->string); while ((row = sqlNextRow(sr)) != NULL) { gp = genePredLoad(row+rowOffset); strand = gp->strand[0]; txStart = gp->txStart; seq = hDnaFromSeq(database, gp->chrom, txStart, gp->txEnd, dnaLower); for (exonIx=1; exonIx < gp->exonCount; ++exonIx) { ++total; memcpy(ends, seq->dna + gp->exonEnds[exonIx-1] - txStart, 2); memcpy(ends+2, seq->dna + gp->exonStarts[exonIx] - txStart - 2, 2); if (strand == '-') reverseComplement(ends, 4); if (ends[0] == 'g' && ends[1] == 't' && ends[2] == 'a' && ends[3] == 'g') ++gtag; if (ends[0] == 'g' && ends[1] == 'c' && ends[2] == 'a' && ends[3] == 'g') ++gcag; if (ends[0] == 'a' && ends[1] == 't' && ends[2] == 'a' && ends[3] == 'c') ++atac; if (ends[0] == 'c' && ends[1] == 't' && ends[2] == 'a' && ends[3] == 'c') ++ctac; } freeDnaSeq(&seq); genePredFree(&gp); } sqlFreeResult(&sr); hFreeConn(&conn); printf("gt/ag %d (%4.2f)\n", gtag, 100.0*gtag/total); printf("gc/ag %d (%4.2f)\n", gcag, 100.0*gcag/total); printf("at/ac %d (%4.2f)\n", atac, 100.0*atac/total); printf("ct/ac %d (%4.2f)\n", ctac, 100.0*ctac/total); printf("Total %d\n", total); }
struct genePred *orthoBedFromGene(struct sqlConnection *conn, char *db, char *orthoDb, char *netTable, struct genePred *gene) /** Produce a genePred on the orthologous genome from the original gene. */ { struct genePred *synGene= NULL; int i; int *blockSizes; struct chain *chain = NULL; int diff = 0; AllocArray(blockSizes, gene->exonCount); for(i=0; i<gene->exonCount; i++) blockSizes[i] = gene->exonEnds[i] - gene->exonStarts[i]; chain = chainForBlocks(conn, db, netTable, gene->chrom, gene->txStart, gene->txEnd, (int *)gene->exonStarts, blockSizes, gene->exonCount); if(chain == NULL) return NULL; fillInGene(chain, gene, &synGene); if(synGene == NULL) return NULL; if(chain->qStrand == '+') { for(i=0; i<gene->exonCount; i++) { addExonToGene(chain, gene, synGene, i); } } else { for(i=gene->exonCount-1; i>=0; i--) { addExonToGene(chain, gene, synGene, i); } } if(synGene->exonCount > 0 && synGene->exonStarts[0] != 0) diff = synGene->exonStarts[0]; /* Make sure the txStart/End and cdsStart/End are at reasonable places. 
*/ if(synGene->exonCount > 0) { synGene->txStart = synGene->exonStarts[0]; synGene->txEnd = synGene->exonEnds[synGene->exonCount - 1]; } /* Adjust cdsStart to be in an exon */ for(i = 0; i < synGene->exonCount; i++) { if(synGene->cdsStart >= synGene->exonStarts[i] && synGene->cdsStart < synGene->exonEnds[i]) break; /* found in exon */ if(synGene->cdsStart < synGene->exonStarts[i]) { /* move to next exon */ synGene->cdsStart = synGene->exonStarts[i]; break; } } if(i == synGene->exonCount) synGene->cdsStart = synGene->txEnd; /* didn't find start */ /* Adjust cdsEnd to be in an exon */ for(i = synGene->exonCount-1; i >= 0; i--) { if(synGene->cdsEnd > synGene->exonStarts[i] && synGene->cdsEnd <= synGene->exonEnds[i]) break; /* found in exon */ if(synGene->cdsEnd >= synGene->exonEnds[i]) { /* move to previous exon */ synGene->cdsEnd = synGene->exonEnds[i]; break; } } if(i == -1) synGene->cdsEnd = synGene->txStart; /* didn't find start */ if (synGene->cdsStart >= synGene->cdsEnd) synGene->cdsStart = synGene->cdsEnd = synGene->txEnd; /* no CDS left */ if(synGene->exonCount == 0) genePredFree(&synGene); return synGene; }
struct annoRow *annoGratorGpVarIntegrate(struct annoGrator *gSelf, struct annoStreamRows *primaryData,
					 boolean *retRJFilterFailed, struct lm *callerLm)
// Integrate a variant (the first row of primaryData) with the genePred rows
// that overlap it, extended by GPRANGE on both sides, and return as many
// output rows as aggvGenRows generates for them.  When no genePred overlaps,
// an intergenic row is returned if the function filter asks for one;
// otherwise NULL is returned, and *retRJFilterFailed is set when overlap is
// mandatory.  Rows returned come from callerLm; scratch data lives in
// self->lm, which is recreated on every call.
{
struct annoGratorGpVar *self = (struct annoGratorGpVar *)gSelf;
lmCleanup(&(self->lm));
self->lm = lmInit(0);

// Widen the primary row just for the overlap search so that genePreds
// upstream/downstream of the variant are found too, then put it back:
struct annoRow *primaryRow = primaryData->rowList;
int savedStart = primaryRow->start;
int savedEnd = primaryRow->end;
primaryRow->start = (primaryRow->start <= GPRANGE) ? 0 : primaryRow->start - GPRANGE;
primaryRow->end += GPRANGE;
struct annoRow *gpRows = annoGratorIntegrate(gSelf, primaryData, retRJFilterFailed, self->lm);
primaryRow->start = savedStart;
primaryRow->end = savedEnd;

if (self->variantFromRow == NULL)
    setVariantFromRow(self, primaryData);

// (Re)load the chromosome sequence when the chromosome changes:
boolean needSeq = (self->curChromSeq == NULL
		   || differentString(self->curChromSeq->name, primaryRow->chrom));
if (needSeq)
    {
    dnaSeqFree(&self->curChromSeq);
    struct twoBitFile *tbf = self->grator.streamer.assembly->tbf;
    self->curChromSeq = twoBitReadSeqFragLower(tbf, primaryRow->chrom, 0, 0);
    }

// TODO Performance improvement: instead of creating the transcript sequence for each
// variant that intersects the transcript, cache transcript sequence; possibly
// an slPair with a concatenation of {chrom, txStart, txEnd, cdsStart, cdsEnd,
// exonStarts, exonEnds} as the name, and sequence as the val.  When something in
// the list is no longer in the list of rows from the internal annoGratorIntegrate call,
// drop it.
// BETTER YET: make a callback for gpFx to get CDS sequence only when it needs it.
char *refAllele = getGenomicSequence(self->curChromSeq->dna,
				     primaryRow->start, primaryRow->end, self->lm);
struct variant *variant = self->variantFromRow(self, primaryRow, refAllele);

if (gpRows == NULL)
    {
    // Without any genePred the primary variant is intergenic.
    if (self->funcFilter != NULL && self->funcFilter->intergenic)
	return aggvIntergenicRow(self, variant, retRJFilterFailed, callerLm);
    if (retRJFilterFailed && self->gpVarOverlapRule == agoMustOverlap)
	*retRJFilterFailed = TRUE;
    return NULL;
    }
if (retRJFilterFailed && *retRJFilterFailed)
    return NULL;

struct annoRow *outRows = NULL;
int hasFrames = (asColumnFindIx(gSelf->mySource->asObj->columnList, "exonFrames") >= 0);
struct annoRow *gpRow;
for (gpRow = gpRows;  gpRow;  gpRow = gpRow->next)
    {
    char **words = gpRow->data;
    // genePredLoad trashes its input, so keep copies of the exon columns
    // and put them back into the row afterwards:
    char *exonStartsCopy = lmCloneString(self->lm, words[8]);
    char *exonEndsCopy = lmCloneString(self->lm, words[9]);
    struct genePred *gp;
    if (hasFrames)
	gp = genePredExtLoad(words, GENEPREDX_NUM_COLS);
    else
	gp = genePredLoad(words);
    words[8] = exonStartsCopy;
    words[9] = exonEndsCopy;

    struct annoRow *newRows = aggvGenRows(self, variant, gp, gpRow, callerLm);
    if (newRows != NULL)
	{
	slReverse(&newRows);
	outRows = slCat(newRows, outRows);
	}
    genePredFree(&gp);
    }
slReverse(&outRows);

// If every row failed the filter and overlap is mandatory, report that.
if (outRows == NULL && retRJFilterFailed && self->gpVarOverlapRule == agoMustOverlap)
    *retRJFilterFailed = TRUE;
return outRows;
}