void bigWigFileCreate(
    char *inName,               /* Ascii wiggle file to read. */
    char *chromSizes,           /* Two column tab-separated file: <chromosome> <size>. */
    int blockSize,              /* Items bundled per r-tree node.  1024 is good. */
    int itemsPerSlot,           /* Items in the lowest level of the tree.  512 is good. */
    boolean clipDontDie,        /* TRUE: clip items off end of chrom rather than dying. */
    boolean compress,           /* TRUE: compress data. */
    boolean keepAllChromosomes, /* TRUE: store all chromosomes in chromosomal b-tree. */
    boolean fixedSummaries,     /* TRUE: impose fixed summary levels. */
    char *outName)              /* Name handed to bwgCreate for the output. */
/* Turn an ascii wig file (fixedStep, variableStep or bedGraph format) into a
 * binary big wig file.  Aborts through errAbort when the parser returns no
 * sections for inName. */
{
    /* Original author's note (JK), condensed: this code has to stay in agreement
     * with bigBedFileCreate and bbiFileOpen.  Sharing code with bigBedFileCreate,
     * and feeding input one chromosome at a time with the zoom data sent only
     * after all chromosomes are done, was being considered and might cut the
     * memory footprint by a factor of 2 or 4. */
    struct hash *sizesByChrom;
    struct lm *localMem;
    struct bwgSection *sections;

    sizesByChrom = bbiChromSizesFromFile(chromSizes);
    localMem = lmInit(0);

    /* Parse the whole input up front; the local memory pool is handed to the parser. */
    sections = bwgParseWig(inName, clipDontDie, sizesByChrom, itemsPerSlot, localMem);
    if (!sections)
        errAbort("%s is empty of data", inName);

    bwgCreate(sections, sizesByChrom, blockSize, itemsPerSlot, compress,
              keepAllChromosomes, fixedSummaries, outName);

    /* The pool is released only after the output has been written. */
    lmCleanup(&localMem);
}
void bedClip(char *inFile, char *chromSizes, char *outFile) /* bedClip - Remove lines from bed file that refer to off-chromosome places.. */ { struct hash *chromSizesHash = bbiChromSizesFromFile(chromSizes); struct lineFile *lf = lineFileOpen(inFile, TRUE); FILE *f = mustOpen(outFile, "w"); char *line; while (lineFileNextReal(lf, &line)) { char *chrom = nextWord(&line); char *startString = nextWord(&line); char *endString = nextWord(&line); if (endString == NULL) errAbort("Need at least three fields line %d of %s", lf->lineIx, lf->fileName); if (startString[0] == '-') { verbose(2, "Clipping negative line %d of %s\n", lf->lineIx, lf->fileName); continue; // Clip off negatives } if (!isdigit(startString[0])) errAbort("Expecting number got %s line %d of %s", startString, lf->lineIx, lf->fileName); if (!isdigit(endString[0])) errAbort("Expecting number got %s line %d of %s", endString, lf->lineIx, lf->fileName); int start = sqlUnsigned(startString); int end = sqlUnsigned(endString); if (start >= end) { verbose(2, "Clipping end <= start line %d of %s\n", lf->lineIx, lf->fileName); continue; } struct hashEl *hel = hashLookup(chromSizesHash, chrom); if (hel == NULL) errAbort("Chromosome %s isn't in %s line %d of %s\n", chrom, chromSizes, lf->lineIx, lf->fileName); int chromSize = ptToInt(hel->val); if (end > chromSize) { verbose(2, "Clipping end > chromSize line %d of %s\n", lf->lineIx, lf->fileName); continue; } fprintf(f, "%s\t%s\t%s", chrom, startString, endString); line = skipLeadingSpaces(line); if (line == NULL || line[0] == 0) fputc('\n', f); else fprintf(f, "\t%s\n", line); } carefulClose(&f); }
void bbFileCreate(
    char *inName,                  /* Input file in a tabular bed format <chrom><start><end> + whatever. */
    char *chromSizes,              /* Two column tab-separated file: <chromosome> <size>.
                                    * Read with twoBitChromHash instead when sizesIs2Bit is set. */
    int blockSize,                 /* Number of items to bundle in r-tree.  1024 is good. */
    int itemsPerSlot,              /* Number of items in lowest level of tree.  64 is good. */
    char *asText,                  /* Field definitions in a string */
    struct asObject *as,           /* Field definitions parsed out */
    boolean doCompress,            /* If TRUE then compress data. */
    struct slName *extraIndexList, /* List of extra indexes to add */
    char *outName)                 /* BigBed output file name. */
/* Convert tab-separated bed file to binary indexed, zoomed bigBed version.
 *
 * The output is produced in one forward sweep followed by patch-ups: placeholder
 * header, zoom headers, total summary and extended header are written first so
 * their offsets are known; the data, indexes and zoom levels follow; then the
 * function seeks back and overwrites the placeholders with real values.
 *
 * Besides its parameters the function reads sizesIs2Bit, tabSep and bedN and
 * assigns maxBedNameSize; none of these are declared inside this function. */
{
    /* Set up timing measures. */
    verboseTimeInit();
    struct lineFile *lf = lineFileOpen(inName, TRUE);
    bits16 fieldCount = slCount(as->columnList);
    bits16 extraIndexCount = slCount(extraIndexList);

    /* The extra index maker only exists when extra indexes were requested;
     * later code tests eim for NULL to decide whether to build them. */
    struct bbExIndexMaker *eim = NULL;
    if (extraIndexList != NULL)
        eim = bbExIndexMakerNew(extraIndexList, as);

    /* Load in chromosome sizes. */
    struct hash *chromSizesHash = NULL;
    if (sizesIs2Bit)
        chromSizesHash = twoBitChromHash(chromSizes);
    else
        chromSizesHash = bbiChromSizesFromFile(chromSizes);
    verbose(2, "Read %d chromosomes and sizes from %s\n", chromSizesHash->elCount, chromSizes);

    /* Do first pass, mostly just scanning file and counting hits per chromosome. */
    int minDiff = 0;
    double aveSize = 0;
    bits64 bedCount = 0;
    bits32 uncompressBufSize = 0;
    struct bbiChromUsage *usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, eim, &minDiff, &aveSize, &bedCount, tabSep);
    verboseTime(1, "pass1 - making usageList (%d chroms)", slCount(usageList));
    verbose(2, "%d chroms in %s. Average span of beds %f\n", slCount(usageList), inName, aveSize);

    /* Open output file and write dummy header. */
    FILE *f = mustOpen(outName, "wb");
    bbiWriteDummyHeader(f);
    bbiWriteDummyZooms(f);

    /* Write out autoSql string */
    bits64 asOffset = ftell(f);
    mustWrite(f, asText, strlen(asText) + 1);   /* +1 so the terminating zero byte is written too */
    verbose(2, "as definition has %d columns\n", fieldCount);

    /* Write out dummy total summary.  The offset is remembered so the real
     * summary can overwrite it once totalSum has been filled in. */
    struct bbiSummaryElement totalSum;
    ZeroVar(&totalSum);
    bits64 totalSummaryOffset = ftell(f);
    bbiSummaryElementWrite(f, &totalSum);

    /* Write out dummy header extension */
    bits64 extHeaderOffset = ftell(f);
    bits16 extHeaderSize = 64;
    repeatCharOut(f, 0, extHeaderSize);

    /* Write out extra index stuff if need be.  Only zero-filled space is reserved
     * here; the start and end offsets are kept so the records can be filled in
     * (and the size checked) near the end of the function. */
    bits64 extraIndexListOffset = 0;
    bits64 extraIndexListEndOffset = 0;
    if (extraIndexList != NULL)
        {
        extraIndexListOffset = ftell(f);
        int extraIndexSize = 16 + 4*1;   // Fixed record size 16, plus 1 times field size of 4
        repeatCharOut(f, 0, extraIndexSize*extraIndexCount);
        extraIndexListEndOffset = ftell(f);
        }

    /* Write out chromosome/size database. */
    bits64 chromTreeOffset = ftell(f);
    bbiWriteChromInfo(usageList, blockSize, f);

    /* Set up to keep track of possible initial reduction levels. */
    int resScales[bbiMaxZoomLevels], resSizes[bbiMaxZoomLevels];
    int resTryCount = bbiCalcResScalesAndSizes(aveSize, resScales, resSizes);

    /* Write out primary full resolution data in sections, collect stats to use for reductions. */
    bits64 dataOffset = ftell(f);
    bits32 blockCount = 0;
    bits32 maxBlockSize = 0;
    struct bbiBoundsArray *boundsArray = NULL;
    writeOne(f, bedCount);
    /* With no records nothing is allocated or written: boundsArray stays NULL
     * and blockCount stays 0 for the index call below. */
    if (bedCount > 0)
        {
        blockCount = bbiCountSectionsNeeded(usageList, itemsPerSlot);
        AllocArray(boundsArray, blockCount);
        lineFileRewind(lf);   /* second pass starts again from the top of the input */
        if (eim)
            bbExIndexMakerAllocChunkArrays(eim, bedCount);
        writeBlocks(usageList, lf, as, itemsPerSlot, boundsArray, blockCount, doCompress, f, resTryCount, resScales, resSizes, eim, bedCount, fieldCount, &maxBlockSize);
        }
    verboseTime(1, "pass2 - checking and writing primary data (%lld records, %d fields)", (long long)bedCount, fieldCount);

    /* Write out primary data index. */
    bits64 indexOffset = ftell(f);
    cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), blockCount, blockSize, 1, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset, indexOffset, f);
    freez(&boundsArray);
    verboseTime(2, "index write");

    /* Declare arrays and vars that track the zoom levels we actually output. */
    bits32 zoomAmounts[bbiMaxZoomLevels];
    bits64 zoomDataOffsets[bbiMaxZoomLevels];
    bits64 zoomIndexOffsets[bbiMaxZoomLevels];

    /* Call monster zoom maker library function that bedGraphToBigWig also uses.
     * Skipped for empty input, which leaves zoomLevels at 0. */
    int zoomLevels = 0;
    if (bedCount > 0)
        {
        zoomLevels = bbiWriteZoomLevels(lf, f, blockSize, itemsPerSlot, bedWriteReducedOnceReturnReducedTwice, fieldCount, doCompress, indexOffset - dataOffset, usageList, resTryCount, resScales, resSizes, zoomAmounts, zoomDataOffsets, zoomIndexOffsets, &totalSum);
        }

    /* Write out extra indexes if need be.  For each index: record where it starts,
     * sort its chunk array by name, then bulk-write it to the file. */
    if (eim)
        {
        int i;
        for (i=0; i < eim->indexCount; ++i)
            {
            eim->fileOffsets[i] = ftell(f);
            /* maxBedNameSize is not local to this function; it is set before the
             * sort and also passed as the key size below.
             * NOTE(review): presumably the comparison/key callbacks read it - confirm. */
            maxBedNameSize = eim->maxFieldSize[i];
            qsort(eim->chunkArrayArray[i], bedCount, sizeof(struct bbNamedFileChunk), bbNamedFileChunkCmpByName);
            assert(sizeof(struct bbNamedFileChunk) == sizeof(eim->chunkArrayArray[i][0]));
            bptFileBulkIndexToOpenFile(eim->chunkArrayArray[i], sizeof(eim->chunkArrayArray[i][0]), bedCount, blockSize, bbNamedFileChunkKey, maxBedNameSize, bbNamedFileChunkVal, sizeof(bits64) + sizeof(bits64), f);
            verboseTime(1, "Sorting and writing extra index %d", i);
            }
        }

    /* Figure out buffer size needed for uncompression if need be.  It is the
     * larger of the biggest data block and one full slot of zoom summaries;
     * it stays 0 when compression is off. */
    if (doCompress)
        {
        int maxZoomUncompSize = itemsPerSlot * sizeof(struct bbiSummaryOnDisk);
        uncompressBufSize = max(maxBlockSize, maxZoomUncompSize);
        }

    /* Go back and rewrite header. */
    rewind(f);
    bits32 sig = bigBedSig;
    bits16 version = bbiCurrentVersion;
    bits16 summaryCount = zoomLevels;
    bits32 reserved32 = 0;
    bits64 reserved64 = 0;
    bits16 definedFieldCount = bedN;

    /* Write fixed header.  The assert afterwards checks that these fields
     * occupy exactly 64 bytes. */
    writeOne(f, sig);
    writeOne(f, version);
    writeOne(f, summaryCount);
    writeOne(f, chromTreeOffset);
    writeOne(f, dataOffset);
    writeOne(f, indexOffset);
    writeOne(f, fieldCount);
    writeOne(f, definedFieldCount);
    writeOne(f, asOffset);
    writeOne(f, totalSummaryOffset);
    writeOne(f, uncompressBufSize);
    writeOne(f, extHeaderOffset);
    assert(ftell(f) == 64);

    /* Write summary headers with data. */
    int i;
    verbose(2, "Writing %d levels of zoom\n", zoomLevels);
    for (i=0; i<zoomLevels; ++i)
        {
        verbose(3, "zoomAmounts[%d] = %d\n", i, (int)zoomAmounts[i]);
        writeOne(f, zoomAmounts[i]);
        writeOne(f, reserved32);
        writeOne(f, zoomDataOffsets[i]);
        writeOne(f, zoomIndexOffsets[i]);
        }

    /* Write rest of summary headers with no data, so all bbiMaxZoomLevels
     * header slots are always written. */
    for (i=zoomLevels; i<bbiMaxZoomLevels; ++i)
        {
        writeOne(f, reserved32);
        writeOne(f, reserved32);
        writeOne(f, reserved64);
        writeOne(f, reserved64);
        }

    /* Write total summary. */
    fseek(f, totalSummaryOffset, SEEK_SET);
    bbiSummaryElementWrite(f, &totalSum);

    /* Write extended header.  The assert checks that the three fields plus the
     * 52 zero bytes add up to extHeaderSize. */
    fseek(f, extHeaderOffset, SEEK_SET);
    writeOne(f, extHeaderSize);
    writeOne(f, extraIndexCount);
    writeOne(f, extraIndexListOffset);
    repeatCharOut(f, 0, 52);    // reserved
    assert(ftell(f) - extHeaderOffset == extHeaderSize);

    /* Write extra index offsets if need be.  This fills in the space reserved
     * earlier; the closing assert checks that exactly that space was used. */
    if (extraIndexCount != 0)
        {
        fseek(f, extraIndexListOffset, SEEK_SET);
        int i;
        for (i=0; i<extraIndexCount; ++i)
            {
            // Write out fixed part of index info
            bits16 type = 0;    // bPlusTree type
            bits16 indexFieldCount = 1;
            writeOne(f, type);
            writeOne(f, indexFieldCount);
            writeOne(f, eim->fileOffsets[i]);
            repeatCharOut(f, 0, 4);  // reserved

            // Write out field list - easy this time because for now always only one field.
            bits16 fieldId = eim->indexFields[i];
            writeOne(f, fieldId);
            repeatCharOut(f, 0, 2); // reserved
            }
        assert(ftell(f) == extraIndexListEndOffset);
        }

    /* Write end signature. */
    fseek(f, 0L, SEEK_END);
    writeOne(f, sig);

    /* Clean up.
     * NOTE(review): asObjectFreeList is handed the caller-supplied `as`;
     * presumably callers must not use it after this returns - confirm. */
    lineFileClose(&lf);
    carefulClose(&f);
    freeHash(&chromSizesHash);
    bbiChromUsageFreeList(&usageList);
    asObjectFreeList(&as);
}
void bedGraphToBigWig(char *inName, char *chromSizes, char *outName) /* bedGraphToBigWig - Convert a bedGraph program to bigWig.. */ { verboseTimeInit(); struct lineFile *lf = lineFileOpen(inName, TRUE); struct hash *chromSizesHash = bbiChromSizesFromFile(chromSizes); verbose(2, "%d chroms in %s\n", chromSizesHash->elCount, chromSizes); int minDiff = 0, i; double aveSize = 0; bits64 bedCount = 0; bits32 uncompressBufSize = 0; struct bbiChromUsage *usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, NULL, &minDiff, &aveSize, &bedCount); verboseTime(2, "pass1"); verbose(2, "%d chroms in %s, minDiff=%d, aveSize=%g, bedCount=%lld\n", slCount(usageList), inName, minDiff, aveSize, bedCount); /* Write out dummy header, zoom offsets. */ FILE *f = mustOpen(outName, "wb"); bbiWriteDummyHeader(f); bbiWriteDummyZooms(f); /* Write out dummy total summary. */ struct bbiSummaryElement totalSum; ZeroVar(&totalSum); bits64 totalSummaryOffset = ftell(f); bbiSummaryElementWrite(f, &totalSum); /* Write out chromosome/size database. */ bits64 chromTreeOffset = ftell(f); bbiWriteChromInfo(usageList, blockSize, f); /* Set up to keep track of possible initial reduction levels. */ int resScales[bbiMaxZoomLevels], resSizes[bbiMaxZoomLevels]; int resTryCount = bbiCalcResScalesAndSizes(aveSize, resScales, resSizes); /* Write out primary full resolution data in sections, collect stats to use for reductions. */ bits64 dataOffset = ftell(f); bits64 sectionCount = bbiCountSectionsNeeded(usageList, itemsPerSlot); writeOne(f, sectionCount); struct bbiBoundsArray *boundsArray; AllocArray(boundsArray, sectionCount); lineFileRewind(lf); bits32 maxSectionSize = 0; writeSections(usageList, lf, itemsPerSlot, boundsArray, sectionCount, f, resTryCount, resScales, resSizes, doCompress, &maxSectionSize); verboseTime(2, "pass2"); /* Write out primary data index. 
*/ bits64 indexOffset = ftell(f); cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), sectionCount, blockSize, 1, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset, indexOffset, f); verboseTime(2, "index write"); /* Declare arrays and vars that track the zoom levels we actually output. */ bits32 zoomAmounts[bbiMaxZoomLevels]; bits64 zoomDataOffsets[bbiMaxZoomLevels]; bits64 zoomIndexOffsets[bbiMaxZoomLevels]; /* Call monster zoom maker library function that bedToBigBed also uses. */ int zoomLevels = bbiWriteZoomLevels(lf, f, blockSize, itemsPerSlot, bedGraphWriteReducedOnceReturnReducedTwice, 4, doCompress, indexOffset - dataOffset, usageList, resTryCount, resScales, resSizes, zoomAmounts, zoomDataOffsets, zoomIndexOffsets, &totalSum); /* Figure out buffer size needed for uncompression if need be. */ if (doCompress) { int maxZoomUncompSize = itemsPerSlot * sizeof(struct bbiSummaryOnDisk); uncompressBufSize = max(maxSectionSize, maxZoomUncompSize); } /* Go back and rewrite header. */ rewind(f); bits32 sig = bigWigSig; bits16 version = bbiCurrentVersion; bits16 summaryCount = zoomLevels; bits16 reserved16 = 0; bits32 reserved32 = 0; bits64 reserved64 = 0; /* Write fixed header */ writeOne(f, sig); writeOne(f, version); writeOne(f, summaryCount); writeOne(f, chromTreeOffset); writeOne(f, dataOffset); writeOne(f, indexOffset); writeOne(f, reserved16); // fieldCount writeOne(f, reserved16); // definedFieldCount writeOne(f, reserved64); // autoSqlOffset writeOne(f, totalSummaryOffset); writeOne(f, uncompressBufSize); writeOne(f, reserved64); // nameIndexOffset assert(ftell(f) == 64); /* Write summary headers with data. */ verbose(2, "Writing %d levels of zoom\n", zoomLevels); for (i=0; i<zoomLevels; ++i) { verbose(3, "zoomAmounts[%d] = %d\n", i, (int)zoomAmounts[i]); writeOne(f, zoomAmounts[i]); writeOne(f, reserved32); writeOne(f, zoomDataOffsets[i]); writeOne(f, zoomIndexOffsets[i]); } /* Write rest of summary headers with no data. 
*/ for (i=zoomLevels; i<bbiMaxZoomLevels; ++i) { writeOne(f, reserved32); writeOne(f, reserved32); writeOne(f, reserved64); writeOne(f, reserved64); } /* Write total summary. */ fseek(f, totalSummaryOffset, SEEK_SET); bbiSummaryElementWrite(f, &totalSum); /* Write end signature. */ fseek(f, 0L, SEEK_END); writeOne(f, sig); lineFileClose(&lf); carefulClose(&f); }