static struct hash* bbiChromSizes(struct bbiFile* bbi) /* return the hash of chrom sizes from the bigBed/bigWig */ { struct bbiChromInfo* cList = bbiChromList(bbi); struct bbiChromInfo* c; struct hash* cHash = newHash(10); for (c = cList; c != NULL; c = c->next) hashAddInt(cHash, c->name, (int)c->size); bbiChromInfoFreeList(&cList); return cHash; }
void bigBedToBed(char *inFile, char *outFile) /* bigBedToBed - Convert from bigBed to ascii bed format.. */ { struct bbiFile *bbi = bigBedFileOpen(inFile); FILE *f = mustOpen(outFile, "w"); struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi); int itemCount = 0; for (chrom = chromList; chrom != NULL; chrom = chrom->next) { if (clChrom != NULL && !sameString(clChrom, chrom->name)) continue; char *chromName = chrom->name; int start = 0, end = chrom->size; if (clStart > 0) start = clStart; if (clEnd > 0) end = clEnd; int itemsLeft = 0; // Zero actually means no limit.... if (maxItems != 0) { itemsLeft = maxItems - itemCount; if (itemsLeft <= 0) break; } struct lm *lm = lmInit(0); struct bigBedInterval *interval, *intervalList = bigBedIntervalQuery(bbi, chromName, start, end, itemsLeft, lm); for (interval = intervalList; interval != NULL; interval = interval->next) { fprintf(f, "%s\t%u\t%u", chromName, interval->start, interval->end); char *rest = interval->rest; if (rest != NULL) fprintf(f, "\t%s\n", rest); else fprintf(f, "\n"); } lmCleanup(&lm); } bbiChromInfoFreeList(&chromList); carefulClose(&f); bbiFileClose(&bbi); }
struct genomeRangeTree *edwGrtFromBigBed(char *fileName) /* Return genome range tree for simple (unblocked) bed */ { struct bbiFile *bbi = bigBedFileOpen(fileName); struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi); struct genomeRangeTree *grt = genomeRangeTreeNew(); for (chrom = chromList; chrom != NULL; chrom = chrom->next) { struct rbTree *tree = genomeRangeTreeFindOrAddRangeTree(grt, chrom->name); struct lm *lm = lmInit(0); struct bigBedInterval *iv, *ivList = NULL; ivList = bigBedIntervalQuery(bbi, chrom->name, 0, chrom->size, 0, lm); for (iv = ivList; iv != NULL; iv = iv->next) rangeTreeAdd(tree, iv->start, iv->end); lmCleanup(&lm); } bigBedFileClose(&bbi); bbiChromInfoFreeList(&chromList); return grt; }
/* --- .Call ENTRY POINT --- */ SEXP BWGFile_seqlengths(SEXP r_filename) { pushRHandlers(); struct bbiFile * file = bigWigFileOpen((char *)CHAR(asChar(r_filename))); struct bbiChromInfo *chromList = bbiChromList(file); struct bbiChromInfo *chrom = chromList; SEXP seqlengths, seqlengthNames; PROTECT(seqlengths = allocVector(INTSXP, slCount(chromList))); seqlengthNames = allocVector(STRSXP, length(seqlengths)); setAttrib(seqlengths, R_NamesSymbol, seqlengthNames); for(int i = 0; i < length(seqlengths); i++) { INTEGER(seqlengths)[i] = chrom->size; SET_STRING_ELT(seqlengthNames, i, mkChar(chrom->name)); chrom = chrom->next; } bbiChromInfoFreeList(&chromList); popRHandlers(); UNPROTECT(1); return seqlengths; }
void bigWigToWig(char *inFile, char *outFile) /* bigWigToWig - Convert bigWig to wig. This will keep more of the same structure of the * original wig than bigWigToBedGraph does, but still will break up large stepped sections into * smaller ones. */ { struct bbiFile *bwf = bigWigFileOpen(inFile); FILE *f = mustOpen(outFile, "w"); struct bbiChromInfo *chrom, *chromList = bbiChromList(bwf); for (chrom = chromList; chrom != NULL; chrom = chrom->next) { if (clChrom != NULL && !sameString(clChrom, chrom->name)) continue; char *chromName = chrom->name; int start = 0, end = chrom->size; if (clStart > 0) start = clStart; if (clEnd > 0) end = clEnd; bigWigIntervalDump(bwf, chromName, start, end, 0, f); } bbiChromInfoFreeList(&chromList); carefulClose(&f); bbiFileClose(&bwf); }
void doEnrichmentsFromBigWig(struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf, struct cdwAssembly *assembly, struct target *targetList) /* Figure out enrichments from a bigBed file. */ { /* Get path to bigBed, open it, and read all chromosomes. */ char *bigWigPath = cdwPathForFileId(conn, ef->id); struct bbiFile *bbi = bigWigFileOpen(bigWigPath); struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi); struct bigWigValsOnChrom *valsOnChrom = bigWigValsOnChromNew(); /* This takes a while, so let's figure out what parts take the time. */ long totalBigQueryTime = 0; long totalOverlapTime = 0; /* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases * for all targets. This is complicated by just wanting to keep one chromosome worth of * bigWig data in memory. Also just for performance we do a lookup of target range tree to * get chromosome specific one to use, which avoids a hash lookup in the inner loop. */ for (chrom = chromList; chrom != NULL; chrom = chrom->next) { long startBigQueryTime = clock1000(); boolean gotData = bigWigValsOnChromFetchData(valsOnChrom, chrom->name, bbi); long endBigQueryTime = clock1000(); totalBigQueryTime += endBigQueryTime - startBigQueryTime; if (gotData) { double *valBuf = valsOnChrom->valBuf; Bits *covBuf = valsOnChrom->covBuf; /* Loop through all targets adding overlaps from ivList */ long startOverlapTime = clock1000(); struct target *target; for (target = targetList; target != NULL; target = target->next) { if (target->skip) continue; struct genomeRangeTree *grt = target->grt; struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name); if (targetTree != NULL) { struct range *range, *rangeList = rangeTreeList(targetTree); for (range = rangeList; range != NULL; range = range->next) { int s = range->start, e = range->end, i; for (i=s; i<=e; ++i) { if (bitReadOne(covBuf, i)) { double x = valBuf[i]; target->uniqOverlapBases += 1; target->overlapBases += x; } } } } } long endOverlapTime = clock1000(); totalOverlapTime += endOverlapTime - startOverlapTime; } } verbose(1, "totalBig %0.3f, totalOverlap %0.3f\n", 0.001*totalBigQueryTime, 0.001*totalOverlapTime); /* Now loop through targets and save enrichment info to database */ struct target *target; for (target = targetList; target != NULL; target = target->next) { if (target->skip) continue; struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, target->overlapBases, target->uniqOverlapBases); cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128); cdwQaEnrichFree(&enrich); } bigWigValsOnChromFree(&valsOnChrom); bbiChromInfoFreeList(&chromList); bigWigFileClose(&bbi); freez(&bigWigPath); }
/* This old way is ~3 times as slow */ void doEnrichmentsFromBigWig(struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf, struct cdwAssembly *assembly, struct target *targetList) /* Figure out enrichments from a bigBed file. */ { /* Get path to bigBed, open it, and read all chromosomes. */ char *bigWigPath = cdwPathForFileId(conn, ef->id); struct bbiFile *bbi = bigWigFileOpen(bigWigPath); struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi); /* This takes a while, so let's figure out what parts take the time. */ long totalBigQueryTime = 0; long totalOverlapTime = 0; /* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases * for all targets. This is complicated by just wanting to keep one chromosome worth of * bigWig data in memory. Also just for performance we do a lookup of target range tree to * get chromosome specific one to use, which avoids a hash lookup in the inner loop. */ for (chrom = chromList; chrom != NULL; chrom = chrom->next) { /* Get list of intervals in bigWig for this chromosome, and feed it to a rangeTree. */ struct lm *lm = lmInit(0); long startBigQueryTime = clock1000(); struct bbiInterval *ivList = bigWigIntervalQuery(bbi, chrom->name, 0, chrom->size, lm); long endBigQueryTime = clock1000(); totalBigQueryTime += endBigQueryTime - startBigQueryTime; struct bbiInterval *iv; /* Loop through all targets adding overlaps from ivList */ long startOverlapTime = clock1000(); struct target *target; for (target = targetList; target != NULL; target = target->next) { struct genomeRangeTree *grt = target->grt; struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name); if (targetTree != NULL) { for (iv = ivList; iv != NULL; iv = iv->next) { int overlap = rangeTreeOverlapSize(targetTree, iv->start, iv->end); target->uniqOverlapBases += overlap; target->overlapBases += overlap * iv->val; } } } long endOverlapTime = clock1000(); totalOverlapTime += endOverlapTime - startOverlapTime; lmCleanup(&lm); } verbose(1, "totalBig %0.3f, totalOverlap %0.3f\n", 0.001*totalBigQueryTime, 0.001*totalOverlapTime); /* Now loop through targets and save enrichment info to database */ struct target *target; for (target = targetList; target != NULL; target = target->next) { struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, target->overlapBases, target->uniqOverlapBases); cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128); cdwQaEnrichFree(&enrich); } bbiChromInfoFreeList(&chromList); bigWigFileClose(&bbi); freez(&bigWigPath); }
void doEnrichmentsFromBigBed(struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf, struct cdwAssembly *assembly, struct target *targetList) /* Figure out enrichments from a bigBed file. */ { /* Get path to bigBed, open it, and read all chromosomes. */ char *bigBedPath = cdwPathForFileId(conn, ef->id); struct bbiFile *bbi = bigBedFileOpen(bigBedPath); struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi); /* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases * for all targets. This is complicated by just wanting to keep one chromosome worth of * bigBed data in memory. */ for (chrom = chromList; chrom != NULL; chrom = chrom->next) { /* Get list of intervals in bigBed for this chromosome, and feed it to a rangeTree. */ struct lm *lm = lmInit(0); struct bigBedInterval *ivList = bigBedIntervalQuery(bbi, chrom->name, 0, chrom->size, 0, lm); struct bigBedInterval *iv; struct rbTree *bbTree = rangeTreeNew(); for (iv = ivList; iv != NULL; iv = iv->next) rangeTreeAdd(bbTree, iv->start, iv->end); struct range *bbRange, *bbRangeList = rangeTreeList(bbTree); /* Loop through all targets adding overlaps from ivList and unique overlaps from bbRangeList */ struct target *target; for (target = targetList; target != NULL; target = target->next) { if (target->skip) continue; struct genomeRangeTree *grt = target->grt; struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name); if (targetTree != NULL) { struct bigBedInterval *iv; for (iv = ivList; iv != NULL; iv = iv->next) { int overlap = rangeTreeOverlapSize(targetTree, iv->start, iv->end); target->overlapBases += overlap; } for (bbRange = bbRangeList; bbRange != NULL; bbRange = bbRange->next) { int overlap = rangeTreeOverlapSize(targetTree, bbRange->start, bbRange->end); target->uniqOverlapBases += overlap; } } } rangeTreeFree(&bbTree); lmCleanup(&lm); } /* Now loop through targets and save enrichment info to database */ struct target *target; for (target = targetList; target != NULL; target = target->next) { if (target->skip) continue; struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, target->overlapBases, target->uniqOverlapBases); cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128); cdwQaEnrichFree(&enrich); } bbiChromInfoFreeList(&chromList); bigBedFileClose(&bbi); freez(&bigBedPath); }