static void bwgComputeDynamicSummaries(struct bwgSection *sectionList, struct bbiSummary ** reduceSummaries, bits16 * summaryCount, struct bbiChromInfo *chromInfoArray, int chromCount, bits32 * reductionAmounts, boolean doCompress)
/* Compute the stack of zoom-level summaries for a bigWig.
 * Picks an initial reduction level, then builds progressively coarser levels
 * (4x per step, up to 10 levels total given the i<9 bound below plus level 0).
 * Outputs: reduceSummaries[] receives one summary list per level (caller must
 * supply at least 10 slots), reductionAmounts[] the bin size of each level,
 * and *summaryCount the number of levels produced (always >= 1).
 *
 * Figure out initial summary level - starting with a summary 10 times the amount
 * of the smallest item.  See if summarized data is smaller than half input data, if
 * not bump up reduction by a factor of 2 until it is, or until further summarizing
 * yields no size reduction. */
{
int i;
int minRes = bwgAverageResolution(sectionList);
int initialReduction = minRes*10;                 /* start at 10x the finest resolution */
bits64 fullSize = bwgTotalSectionSize(sectionList);
bits64 lastSummarySize = 0, summarySize;
bits64 maxReducedSize = fullSize/2;               /* target: summary < half of primary data */
struct bbiSummary *summaryList = NULL;
for (;;)
    {
    summaryList = bwgReduceSectionList(sectionList, chromInfoArray, initialReduction);
    /* NOTE(review): this declaration shadows the outer summarySize on purpose-or-not;
     * the outer one is only used by the second loop below, so behavior is unaffected. */
    bits64 summarySize = bbiTotalSummarySize(summaryList);
    if (doCompress)
        {
        summarySize *= 2;       // Compensate for summary not compressing as well as primary data
        }
    if (summarySize >= maxReducedSize && summarySize != lastSummarySize)
        {
        /* Need to do more reduction.  First scale reduction by amount that it missed
         * being small enough last time, with an extra 10% for good measure.  Then
         * just to keep from spinning through loop too many times, make sure this is
         * at least 2x the previous reduction. */
        int nextReduction = 1.1 * initialReduction * summarySize / maxReducedSize;
        if (nextReduction < initialReduction*2)
            nextReduction = initialReduction*2;
        initialReduction = nextReduction;
        bbiSummaryFreeList(&summaryList);
        lastSummarySize = summarySize;
        }
    else
        break;                  /* small enough, or no further shrinkage possible */
    }
/* Level 0 is the summary list we settled on above. */
*summaryCount = 1;
reduceSummaries[0] = summaryList;
reductionAmounts[0] = initialReduction;

/* Now calculate up to 10 levels of further summary. */
bits64 reduction = initialReduction;
for (i=0; i<9; i++)
    {
    reduction *= 4;             /* each zoom level is 4x coarser than the last */
    if (reduction > 1000000000) /* cap bin size at 1Gb */
        break;
    summaryList = bbiReduceSummaryList(reduceSummaries[*summaryCount-1], chromInfoArray, reduction);
    summarySize = bbiTotalSummarySize(summaryList);
    /* Only record the level if it actually got smaller than the previous one. */
    if (summarySize != lastSummarySize)
        {
        reduceSummaries[*summaryCount] = summaryList;
        reductionAmounts[*summaryCount] = reduction;
        ++(*summaryCount);
        }
    int summaryItemCount = slCount(summaryList);
    if (summaryItemCount <= chromCount) /* one item per chromosome: can't reduce further */
        break;
    }
}
void bwgCreate(struct bwgSection *sectionList, struct hash *chromSizeHash, int blockSize, int itemsPerSlot, boolean doCompress, char *fileName)
/* Create a bigWig file out of a sorted sectionList.
 * Writes the 64-byte fixed header, zoom headers, chromosome B+ tree, data
 * sections, the R-tree index, and the zoom-level summaries, then seeks back
 * to patch the real offsets into the header and zoom headers.
 * Parameters: blockSize - items per B-tree/R-tree node; itemsPerSlot - items
 * per leaf slot; doCompress - zlib-compress data and summary blocks. */
{
bits64 sectionCount = slCount(sectionList);
FILE *f = mustOpen(fileName, "wb");
bits32 sig = bigWigSig;
bits16 version = bbiCurrentVersion;
bits16 summaryCount = 0;
bits16 reserved16 = 0;
bits32 reserved32 = 0;
bits64 reserved64 = 0;
/* Offsets we must come back and fill in, and where in the file they live. */
bits64 dataOffset = 0, dataOffsetPos;
bits64 indexOffset = 0, indexOffsetPos;
bits64 chromTreeOffset = 0, chromTreeOffsetPos;
bits64 totalSummaryOffset = 0, totalSummaryOffsetPos;
bits32 uncompressBufSize = 0;
bits64 uncompressBufSizePos;
struct bbiSummary *reduceSummaries[10];
bits32 reductionAmounts[10];
bits64 reductionDataOffsetPos[10];
bits64 reductionDataOffsets[10];
bits64 reductionIndexOffsets[10];
int i;

/* Figure out chromosome ID's. */
struct bbiChromInfo *chromInfoArray;
int chromCount, maxChromNameSize;
bwgMakeChromInfo(sectionList, chromSizeHash, &chromCount, &chromInfoArray, &maxChromNameSize);

/* Figure out initial summary level - starting with a summary 10 times the amount
 * of the smallest item.  See if summarized data is smaller than half input data, if
 * not bump up reduction by a factor of 2 until it is, or until further summarizing
 * yields no size reduction. */
int minRes = bwgAverageResolution(sectionList);
int initialReduction = minRes*10;
bits64 fullSize = bwgTotalSectionSize(sectionList);
bits64 maxReducedSize = fullSize/2;
struct bbiSummary *firstSummaryList = NULL, *summaryList = NULL;
bits64 lastSummarySize = 0, summarySize;
for (;;)
    {
    summaryList = bwgReduceSectionList(sectionList, chromInfoArray, initialReduction);
    /* Fixed: original redeclared summarySize here, shadowing the outer variable.
     * The outer one is reassigned before any later use, so assigning it is safe. */
    summarySize = bbiTotalSummarySize(summaryList);
    if (doCompress)
        {
        summarySize *= 2;       // Compensate for summary not compressing as well as primary data
        }
    if (summarySize >= maxReducedSize && summarySize != lastSummarySize)
        {
        /* Need to do more reduction.  First scale reduction by amount that it missed
         * being small enough last time, with an extra 10% for good measure.  Then
         * just to keep from spinning through loop too many times, make sure this is
         * at least 2x the previous reduction. */
        int nextReduction = 1.1 * initialReduction * summarySize / maxReducedSize;
        if (nextReduction < initialReduction*2)
            nextReduction = initialReduction*2;
        initialReduction = nextReduction;
        bbiSummaryFreeList(&summaryList);
        lastSummarySize = summarySize;
        }
    else
        break;
    }
summaryCount = 1;
reduceSummaries[0] = firstSummaryList = summaryList;
reductionAmounts[0] = initialReduction;

/* Now calculate up to 10 levels of further summary. */
bits64 reduction = initialReduction;
for (i=0; i<ArraySize(reduceSummaries)-1; i++)
    {
    reduction *= 4;             /* each zoom level 4x coarser than the last */
    if (reduction > 1000000000)
        break;
    summaryList = bbiReduceSummaryList(reduceSummaries[summaryCount-1], chromInfoArray, reduction);
    summarySize = bbiTotalSummarySize(summaryList);
    if (summarySize != lastSummarySize) /* only keep levels that actually shrank */
        {
        reduceSummaries[summaryCount] = summaryList;
        reductionAmounts[summaryCount] = reduction;
        ++summaryCount;
        }
    int summaryItemCount = slCount(summaryList);
    if (summaryItemCount <= chromCount)
        break;
    }

/* Write fixed header.  Offsets are written as zero placeholders and patched below. */
writeOne(f, sig);
writeOne(f, version);
writeOne(f, summaryCount);
chromTreeOffsetPos = ftell(f);
writeOne(f, chromTreeOffset);
dataOffsetPos = ftell(f);
writeOne(f, dataOffset);
indexOffsetPos = ftell(f);
writeOne(f, indexOffset);
writeOne(f, reserved16);        /* fieldCount */
writeOne(f, reserved16);        /* definedFieldCount */
writeOne(f, reserved64);        /* autoSqlOffset. */
totalSummaryOffsetPos = ftell(f);
writeOne(f, totalSummaryOffset);
uncompressBufSizePos = ftell(f);
writeOne(f, uncompressBufSize);
writeOne(f, reserved64);        /* nameIndexOffset */
assert(ftell(f) == 64);         /* fixed header must be exactly 64 bytes */

/* Write summary headers */
for (i=0; i<summaryCount; ++i)
    {
    writeOne(f, reductionAmounts[i]);
    writeOne(f, reserved32);
    reductionDataOffsetPos[i] = ftell(f);
    writeOne(f, reserved64);    // Fill in with data offset later
    writeOne(f, reserved64);    // Fill in with index offset later
    }

/* Write dummy summary - replaced with the real totals once they are known. */
struct bbiSummaryElement totalSum;
ZeroVar(&totalSum);
totalSummaryOffset = ftell(f);
bbiSummaryElementWrite(f, &totalSum);

/* Write chromosome bPlusTree */
chromTreeOffset = ftell(f);
int chromBlockSize = min(blockSize, chromCount);
bptFileBulkIndexToOpenFile(chromInfoArray, sizeof(chromInfoArray[0]), chromCount, chromBlockSize,
        bbiChromInfoKey, maxChromNameSize, bbiChromInfoVal,
        sizeof(chromInfoArray[0].id) + sizeof(chromInfoArray[0].size), f);

/* Write out data section count and sections themselves, tracking the largest
 * uncompressed section so readers can size their decompression buffer. */
dataOffset = ftell(f);
writeOne(f, sectionCount);
struct bwgSection *section;
for (section = sectionList; section != NULL; section = section->next)
    {
    bits32 uncSizeOne = bwgSectionWrite(section, doCompress, f);
    if (uncSizeOne > uncompressBufSize)
        uncompressBufSize = uncSizeOne;
    }

/* Write out index - creating a temporary array rather than list representation of
 * sections in the process. */
indexOffset = ftell(f);
struct bwgSection **sectionArray;
AllocArray(sectionArray, sectionCount);
for (section = sectionList, i=0; section != NULL; section = section->next, ++i)
    sectionArray[i] = section;
cirTreeFileBulkIndexToOpenFile(sectionArray, sizeof(sectionArray[0]), sectionCount,
        blockSize, 1, NULL, bwgSectionFetchKey, bwgSectionFetchOffset, indexOffset, f);
freez(&sectionArray);   /* fixed: was the mis-encoded token "freez(§ionArray);" */

/* Write out summary sections. */
verbose(2, "bwgCreate writing %d summaries\n", summaryCount);
for (i=0; i<summaryCount; ++i)
    {
    reductionDataOffsets[i] = ftell(f);
    reductionIndexOffsets[i] = bbiWriteSummaryAndIndex(reduceSummaries[i], blockSize, itemsPerSlot, doCompress, f);
    verbose(3, "wrote %d of data, %d of index on level %d\n", (int)(reductionIndexOffsets[i] - reductionDataOffsets[i]), (int)(ftell(f) - reductionIndexOffsets[i]), i);
    }

/* Calculate summary - accumulate totals over the finest-grained summary list. */
struct bbiSummary *sum = firstSummaryList;
if (sum != NULL)
    {
    totalSum.validCount = sum->validCount;
    totalSum.minVal = sum->minVal;
    totalSum.maxVal = sum->maxVal;
    totalSum.sumData = sum->sumData;
    totalSum.sumSquares = sum->sumSquares;
    for (sum = sum->next; sum != NULL; sum = sum->next)
        {
        totalSum.validCount += sum->validCount;
        if (sum->minVal < totalSum.minVal) totalSum.minVal = sum->minVal;
        if (sum->maxVal > totalSum.maxVal) totalSum.maxVal = sum->maxVal;
        totalSum.sumData += sum->sumData;
        totalSum.sumSquares += sum->sumSquares;
        }
    /* Write real summary over the dummy one written earlier. */
    fseek(f, totalSummaryOffset, SEEK_SET);
    bbiSummaryElementWrite(f, &totalSum);
    }
else
    totalSummaryOffset = 0;     /* Edge case, no summary. */

/* Go back and fill in offsets properly in header. */
fseek(f, dataOffsetPos, SEEK_SET);
writeOne(f, dataOffset);
fseek(f, indexOffsetPos, SEEK_SET);
writeOne(f, indexOffset);
fseek(f, chromTreeOffsetPos, SEEK_SET);
writeOne(f, chromTreeOffset);
fseek(f, totalSummaryOffsetPos, SEEK_SET);
writeOne(f, totalSummaryOffset);

if (doCompress)
    {
    /* Zoom blocks may be bigger uncompressed than any data block; account for them. */
    int maxZoomUncompSize = itemsPerSlot * sizeof(struct bbiSummaryOnDisk);
    if (maxZoomUncompSize > uncompressBufSize)
        uncompressBufSize = maxZoomUncompSize;
    fseek(f, uncompressBufSizePos, SEEK_SET);
    writeOne(f, uncompressBufSize);
    }

/* Also fill in offsets in zoom headers. */
for (i=0; i<summaryCount; ++i)
    {
    fseek(f, reductionDataOffsetPos[i], SEEK_SET);
    writeOne(f, reductionDataOffsets[i]);
    writeOne(f, reductionIndexOffsets[i]);
    }

/* Write end signature. */
fseek(f, 0L, SEEK_END);
writeOne(f, sig);

/* Clean up */
freez(&chromInfoArray);
carefulClose(&f);
}