void liftAcross(char *liftAcross, char *srcFile, char *dstOut)
/* liftAcross - convert one coordinate system to another, no overlapping items.
 * Builds a lift hash from the file named by liftAcross, loads the genePreds
 * in srcFile, and writes the results as tab-separated genePred lines to
 * dstOut.  A genePred whose chrom has no entry in the lift hash is written
 * out unchanged; otherwise every item returned by liftGenePred is written. */
{
struct hash *lftHash = readLift(liftAcross);
struct genePred *gpList = genePredExtLoadAll(srcFile);
struct genePred *gp = NULL;
FILE *out = mustOpen(dstOut, "w");
int genePredItemCount = 0;

if (bedOut)
    bedRegionOutput(lftHash);

for (gp = gpList; gp != NULL; gp = gp->next)
    {
    struct liftSpec *lsFound = hashFindVal(lftHash, gp->chrom);
    if (lsFound)
	{
	/* One input item may produce a list of lifted items. */
	struct genePred *gpLifted = liftGenePred(gp, lsFound);
	struct genePred *gpl;
	for (gpl = gpLifted; gpl != NULL; gpl = gpl->next)
	    genePredTabOut(gpl, out);
	genePredFreeList(&gpLifted);
	}
    else
	{
	genePredTabOut(gp, out);
	}
    ++genePredItemCount;	/* counts input items, not output lines */
    }

/* Close the output explicitly instead of leaving the stream open until
 * process exit. */
carefulClose(&out);

/* lftHash and gpList are left allocated to disappear at exit */
verbose(2,"#\tgene pred item count: %d\n", genePredItemCount);
}
void doGenePreds(struct sqlConnection *conn, char *db, char *orthoDb, char *chrom, char *netTable, char *geneFileName, char *geneTableName, char *outBedName, char *selectedFileName, int *foundCount, int *notFoundCount) /* Map over genePreds. */ { FILE *bedOut = NULL; FILE *selectedOut = NULL; FILE *cdsErrorFp = NULL; struct genePred *gene = NULL, *geneList = NULL; struct bed *bed = NULL; //init output files if(optionExists("cdsErrorFile")) { cdsErrorFp = fopen( optionVal("cdsErrorFile", NULL), "w" ); fprintf( cdsErrorFp, "#name\tchrom\ttxStart\ttxEnd\tcdsStart\tcdsEnd\tstrand\texonCount\n" ); fclose(cdsErrorFp); } warn("Loading Gene Predictions."); assert(outBedName); if(geneFileName) geneList=genePredLoadAll(geneFileName); else geneList=loadGeneFromTable(conn, geneTableName, chrom, 0, BIGNUM); /* Convert genePreds. */ warn("Converting genes."); bedOut = mustOpen(outBedName, "w"); if (selectedFileName != NULL) selectedOut = mustOpen(selectedFileName, "w"); for(gene = geneList; gene != NULL; gene = gene->next) { struct genePred *synGene = NULL; if(differentString(gene->chrom, chrom)) continue; synGene = orthoBedFromGene(conn, db, orthoDb, netTable, gene); occassionalDot(); if(synGene != NULL && synGene->exonCount > 0) { (*foundCount)++; genePredTabOut(synGene, bedOut); if (selectedOut != NULL) genePredTabOut(gene, selectedOut); } else (*notFoundCount)++; genePredFree(&synGene); } carefulClose(&selectedOut); carefulClose(&bedOut); }
void liftGenePredExt(char *destFile, struct hash *liftHash, int sourceCount, char *sources[]) /* Lift a genePred files. */ { char *row[GENEPREDX_NUM_COLS]; struct lineFile* lf; FILE* dest = mustOpen(destFile, "w"); int iSrc; int colCount; for (iSrc = 0; iSrc < sourceCount; iSrc++) { verbose(1, "Lifting %s\n", sources[iSrc]); lf = lineFileOpen(sources[iSrc], TRUE); while ((colCount = lineFileChopNextTab(lf, row, ArraySize(row)))) { struct genePred* gp = genePredExtLoad(row, colCount); if (liftGenePredObj(liftHash, gp, lf)) genePredTabOut(gp, dest); genePredFree(&gp); } lineFileClose(&lf); if (dots) verbose(1, "\n"); } carefulClose(&dest); }
void gffIntoDatabase(char *database, char *fileName, char *table, int offset) /* Load a gff file into database. */ { struct gffFile *gff = gffFileNew(""); struct gffGroup *group; struct genePred *gpList = NULL, *gp; FILE *f; char *tabName = "genePred.tab"; /* Load fixed gff and convert it to genePred. */ gffFileAdd(gff, fileName, 0); gffGroupLines(gff); for (group = gff->groupList; group != NULL; group = group->next) { gp = genePredFromGroupedGff(gff, group, group->name, "exon", genePredCdsStatFld|genePredExonFramesFld, genePredGxfDefaults); if (gp != NULL) { slAddHead(&gpList, gp); genePredOffset(gp, offset); } } slSort(&gpList, genePredCmp); /* Create tab-delimited file. */ f = mustOpen(tabName, "w"); for (gp = gpList; gp != NULL; gp = gp->next) genePredTabOut(gp, f); carefulClose(&f); /* Load into database. */ loadIntoDatabase(database, createGenePred, "sanger22", tabName); }
void convertPsl(struct psl *psl, struct genbankCds *cds, FILE *genePredFh)
/* Convert a psl plus its cds to a genePred and write it to genePredFh.
 * Nothing is written when pslToGenePred() returns NULL. */
{
struct genePred *converted = pslToGenePred(psl, cds);

if (converted == NULL)
    return;
genePredTabOut(converted, genePredFh);
genePredFree(&converted);
}
/* Convert one line read from a bed file to a genePred and write it to gpFh.
 * The line is split on white space into at most 12 fields; fewer than 4
 * fields is a fatal error. */
void cnvBedRec(char *line, FILE *gpFh)
{
char *row[12];
struct bed *bedRec;
struct genePred *converted;
int fieldCount = chopByWhite(line, row, ArraySize(row));

if (fieldCount < 4)
    errAbort("bed must have at least 4 columns");

bedRec = bedLoadN(row, fieldCount);
converted = bedToGenePred(bedRec);
genePredTabOut(converted, gpFh);

genePredFree(&converted);
bedFree(&bedRec);
}
static void processGenePred(FILE *fh, struct hash *refSeqVerInfoTbl, struct genePred *gp)
/* Check whether a genePred has been selected (its name is a key in
 * refSeqVerInfoTbl); if so, write it to fh with the version included in the
 * name.  gp->name is put back to its original value before returning. */
{
char verBuf[GENBANK_ACC_BUFSZ];
char *savedName;
struct refSeqVerInfo *info = hashFindVal(refSeqVerInfoTbl, gp->name);

if (info == NULL)
    return;	/* not selected */

/* Swap in the versioned name only for the duration of the write. */
savedName = gp->name;
gp->name = addVer(savedName, info->ver, verBuf, sizeof(verBuf));
genePredTabOut(gp, fh);
gp->name = savedName;
}
static void gbGeneTblWriteGeneFlat(struct gbGeneTbl *ggt, struct gbStatus* status,
                                   struct psl* psl, struct sqlConnection *conn)
/* Write a genePred flat row: the gene name (empty string when
 * status->geneName is NULL) and a tab, followed by the genePred columns. */
{
struct genePred *pred = genePredFromPsl3(psl, &status->cds, 0, genePredPslCdsMod3,
                                         genePredStdInsertMergeSize,
                                         genePredStdInsertMergeSize);
FILE *tabFh = gbGeneTblGetFlatTabFh(ggt, conn);
char *geneName = status->geneName;

if (geneName == NULL)
    geneName = "";
fprintf(tabFh, "%s\t", geneName);
genePredTabOut(pred, tabFh);
genePredFree(&pred);
}
static void gtfGroupToGenePred(struct gffFile *gtf, struct gffGroup *group, FILE *gpFh, FILE *infoFh) /* convert one gtf group to a genePred */ { unsigned optFields = (clGenePredExt ? genePredAllFlds : 0); struct errCatch *errCatch = errCatchNew(); if (errCatchStart(errCatch)) { struct genePred *gp = genePredFromGroupedGtf(gtf, group, group->name, optFields, clGxfOptions); if (gp == NULL) { if (!clIgnoreGroupsWithoutExons) { char *msg = "no exons defined for group %s, feature %s (perhaps try -ignoreGroupsWithoutExons)"; if (clAllErrors) { fprintf(stderr, msg, group->name, group->lineList->feature); fputc('\n', stderr); badGroupCount++; } else errAbort(msg, group->name, group->lineList->feature); } } else { genePredTabOut(gp, gpFh); genePredFree(&gp); } } errCatchEnd(errCatch); if (errCatch->gotError) { // drop trailing newline in caught message if (endsWith(errCatch->message->string, "\n")) dyStringResize(errCatch->message, dyStringLen(errCatch->message)-1); if (clAllErrors) { fprintf(stderr, "%s\n", errCatch->message->string); badGroupCount++; } else errAbort("%s", errCatch->message->string); } else { if (infoFh != NULL) writeInfo(infoFh, group); } errCatchFree(&errCatch); }
static void createCcdsGene(struct sqlConnection *conn, char *ccdsGeneFile, struct genomeInfo *genome, struct hash* ignoreTbl, struct hash *gotCcds) /* create the ccdsGene tab file from the ccds database */ { struct ccdsLocationsJoin *locs = loadLocations(conn, genome, ignoreTbl, gotCcds); struct genePred *gp, *genes = buildCcdsGene(&locs); FILE *genesFh; genesFh = mustOpen(ccdsGeneFile, "w"); for (gp = genes; gp != NULL; gp = gp->next) { if (loadDb) fprintf(genesFh, "%d\t", binFromRange(gp->txStart, gp->txEnd)); genePredTabOut(gp, genesFh); } carefulClose(&genesFh); genePredFreeList(&genes); }
static void gbGeneTblWriteGene(struct gbGeneTbl *ggt, struct gbStatus* status, struct psl* psl, struct sqlConnection *conn) /* write genePred row */ { struct genePred* gp = genePredFromPsl3(psl, &status->cds, (ggt->hasExtCols ? genePredAllFlds : 0), genePredPslCdsMod3, genePredStdInsertMergeSize, genePredStdInsertMergeSize); FILE *fh = gbGeneTblGetTabFh(ggt, conn); if (ggt->hasExtCols) { /* add gene name */ freeMem(gp->name2); gp->name2 = cloneString(status->geneName); } if (ggt->hasBin) fprintf(fh, "%u\t", hFindBin(gp->txStart, gp->txEnd)); genePredTabOut(gp, fh); genePredFree(&gp); }
void copyGene(char *db, struct genePred *gene, FILE *tabFh) /* copy one gene to the tab file */ { unsigned holdOptFields = gene->optFields; unsigned optFields = (genePredScoreFld|genePredName2Fld|genePredCdsStatFld|genePredExonFramesFld); if (gGenePredExt && ((optFields & optFields) != optFields)) errAbort("genePred %s doesn't have fields required for -genePredExt", gene->name); if (gNoValidate || checkGene(db, gene)) { if (!gGenePredExt) gene->optFields = 0; /* omit optional fields */ if (gBin) fprintf(tabFh, "%u\t", hFindBin(gene->txStart, gene->txEnd)); genePredTabOut(gene, tabFh); gene->optFields = holdOptFields; /* restore optional fields */ } }
void borfMatcher(char *bedIn, char *borfIn, char *bedOutFile, char *genePredOutFile) /* Top level function to open files and call other functions. */ { struct borf *borf = NULL, *borfList = NULL; struct bed *bed = NULL, *bedList = NULL; struct genePred *gp = NULL; float threshold = optionFloat("minScore", 50); FILE *bedOut = mustOpen(bedOutFile, "w"); FILE *genePredOut = mustOpen(genePredOutFile, "w"); boolean keepSmall = optionExists("keepSmall"); boolean keepNmd = optionExists("keepNmd"); borfList = borfLoadAll(borfIn); bedList = bedLoadAll(bedIn); dotForUserInit(slCount(bedList)/10); for(bed = bedList, borf = borfList; bed != NULL && borf != NULL; bed = bed->next, borf = borf->next) { dotForUser(); if(!stringIn(bed->name, borf->name)) errAbort("Trying to match up %s bed with %s borf - bad idea!", bed->name, borf->name); /* Have to adjust cds end. Borf puts stop codon outside of cds, we put it inside. */ borf->cdsEnd = min(borf->cdsEnd+3, borf->size); if((borf->score > threshold || (keepSmall && borf->cdsSize > 0)) && sameString(borf->strand, "+")) { setThickStartStop(bed, borf); if(keepNmd || !nmdTarget(bed)) { gp = bedToGenePred(bed); bedTabOutN(bed, 12, bedOut); genePredTabOut(gp, genePredOut); genePredFree(&gp); } } } warn("Done."); carefulClose(&bedOut); carefulClose(&genePredOut); }