struct bwgSection *bwgParseWig( char *fileName, /* Name of ascii wig file. */ boolean clipDontDie, /* Skip items outside chromosome rather than aborting. */ struct hash *chromSizeHash, /* If non-NULL items checked to be inside chromosome. */ int maxSectionSize, /* Biggest size of a section. 100 - 100,000 is usual range. */ struct lm *lm) /* Memory pool to allocate from. */ /* Parse out ascii wig file - allocating memory in lm. */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); char *line; struct bwgSection *sectionList = NULL; /* remove initial browser and track lines */ lineFileRemoveInitialCustomTrackLines(lf); while (lineFileNextReal(lf, &line)) { verbose(2, "processing %s\n", line); if (stringIn("chrom=", line)) parseSteppedSection(lf, clipDontDie, chromSizeHash, line, lm, maxSectionSize, §ionList); else { /* Check for bed... */ char *dupe = cloneString(line); char *words[5]; int wordCount = chopLine(dupe, words); if (wordCount != 4) errAbort("Unrecognized line %d of %s:\n%s\n", lf->lineIx, lf->fileName, line); /* Parse out a bed graph line just to check numerical format. */ char *chrom = words[0]; int start = lineFileNeedNum(lf, words, 1); int end = lineFileNeedNum(lf, words, 2); double val = lineFileNeedDouble(lf, words, 3); verbose(2, "bedGraph %s:%d-%d@%g\n", chrom, start, end, val); /* Push back line and call bed parser. */ lineFileReuse(lf); parseBedGraphSection(lf, clipDontDie, chromSizeHash, lm, maxSectionSize, §ionList); } } slSort(§ionList, bwgSectionCmp); /* Check for overlap at section level. */ struct bwgSection *section, *nextSection; for (section = sectionList; section != NULL; section = nextSection) { nextSection = section->next; if (nextSection != NULL) { if (sameString(section->chrom, nextSection->chrom)) { if (section->end > nextSection->start) { errAbort("There's more than one value for %s base %d (in coordinates that start with 1).\n", section->chrom, nextSection->start+1); } } } } return sectionList; }
struct bbiSummary *bedGraphWriteReducedOnceReturnReducedTwice(struct bbiChromUsage *usageList, int fieldCount, struct lineFile *lf, bits32 initialReduction, bits32 initialReductionCount, int zoomIncrement, int blockSize, int itemsPerSlot, boolean doCompress, struct lm *lm, FILE *f, bits64 *retDataStart, bits64 *retIndexStart, struct bbiSummaryElement *totalSum) /* Write out data reduced by factor of initialReduction. Also calculate and keep in memory * next reduction level. This is more work than some ways, but it keeps us from having to * keep the first reduction entirely in memory. */ { struct bbiSummary *twiceReducedList = NULL; bits32 doubleReductionSize = initialReduction * zoomIncrement; struct bbiChromUsage *usage = usageList; struct bbiSummary oneSummary, *sum = NULL; struct bbiBoundsArray *boundsArray, *boundsPt, *boundsEnd; boundsPt = AllocArray(boundsArray, initialReductionCount); boundsEnd = boundsPt + initialReductionCount; *retDataStart = ftell(f); writeOne(f, initialReductionCount); boolean firstRow = TRUE; struct bbiSumOutStream *stream = bbiSumOutStreamOpen(itemsPerSlot, f, doCompress); /* remove initial browser and track lines */ lineFileRemoveInitialCustomTrackLines(lf); for (;;) { /* Get next line of input if any. */ char *row[5]; int rowSize = lineFileChopNext(lf, row, ArraySize(row)); /* Output last section and break if at end of file. */ if (rowSize == 0 && sum != NULL) { bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize, &boundsPt, boundsEnd, lm, stream); break; } /* Parse out row. */ char *chrom = row[0]; bits32 start = sqlUnsigned(row[1]); bits32 end = sqlUnsigned(row[2]); float val = sqlFloat(row[3]); /* Update total summary stuff. */ bits32 size = end-start; if (firstRow) { totalSum->validCount = size; totalSum->minVal = totalSum->maxVal = val; totalSum->sumData = val*size; totalSum->sumSquares = val*val*size; firstRow = FALSE; } else { totalSum->validCount += size; if (val < totalSum->minVal) totalSum->minVal = val; if (val > totalSum->maxVal) totalSum->maxVal = val; totalSum->sumData += val*size; totalSum->sumSquares += val*val*size; } /* If new chromosome output existing block. */ if (differentString(chrom, usage->name)) { usage = usage->next; bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize, &boundsPt, boundsEnd, lm, stream); sum = NULL; } /* If start past existing block then output it. */ else if (sum != NULL && sum->end <= start) { bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize, &boundsPt, boundsEnd, lm, stream); sum = NULL; } /* If don't have a summary we're working on now, make one. */ if (sum == NULL) { oneSummary.chromId = usage->id; oneSummary.start = start; oneSummary.end = start + initialReduction; if (oneSummary.end > usage->size) oneSummary.end = usage->size; oneSummary.minVal = oneSummary.maxVal = val; oneSummary.sumData = oneSummary.sumSquares = 0.0; oneSummary.validCount = 0; sum = &oneSummary; } /* Deal with case where might have to split an item between multiple summaries. This * loop handles all but the final affected summary in that case. */ while (end > sum->end) { verbose(3, "Splitting start %d, end %d, sum->start %d, sum->end %d\n", start, end, sum->start, sum->end); /* Fold in bits that overlap with existing summary and output. */ bits32 overlap = rangeIntersection(start, end, sum->start, sum->end); sum->validCount += overlap; if (sum->minVal > val) sum->minVal = val; if (sum->maxVal < val) sum->maxVal = val; sum->sumData += val * overlap; sum->sumSquares += val*val * overlap; bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize, &boundsPt, boundsEnd, lm, stream); size -= overlap; /* Move summary to next part. */ sum->start = start = sum->end; sum->end = start + initialReduction; if (sum->end > usage->size) sum->end = usage->size; sum->minVal = sum->maxVal = val; sum->sumData = sum->sumSquares = 0.0; sum->validCount = 0; } /* Add to summary. */ sum->validCount += size; if (sum->minVal > val) sum->minVal = val; if (sum->maxVal < val) sum->maxVal = val; sum->sumData += val * size; sum->sumSquares += val*val * size; } bbiSumOutStreamClose(&stream); /* Write out 1st zoom index. */ int indexOffset = *retIndexStart = ftell(f); assert(boundsPt == boundsEnd); cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), initialReductionCount, blockSize, itemsPerSlot, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset, indexOffset, f); freez(&boundsArray); slReverse(&twiceReducedList); return twiceReducedList; }
struct bbiChromUsage *bbiChromUsageFromBedFile(struct lineFile *lf, struct hash *chromSizesHash, struct bbExIndexMaker *eim, int *retMinDiff, double *retAveSize, bits64 *retBedCount) /* Go through bed file and collect chromosomes and statistics. If eim parameter is non-NULL * collect max field sizes there too. */ { int maxRowSize = (eim == NULL ? 3 : bbExIndexMakerMaxIndexField(eim) + 1); char *row[maxRowSize]; struct hash *uniqHash = hashNew(0); struct bbiChromUsage *usage = NULL, *usageList = NULL; int lastStart = -1; bits32 id = 0; bits64 totalBases = 0, bedCount = 0; int minDiff = BIGNUM; lineFileRemoveInitialCustomTrackLines(lf); for (;;) { int rowSize = lineFileChopNext(lf, row, maxRowSize); if (rowSize == 0) break; lineFileExpectAtLeast(lf, maxRowSize, rowSize); char *chrom = row[0]; int start = lineFileNeedNum(lf, row, 1); int end = lineFileNeedNum(lf, row, 2); if (eim != NULL) bbExIndexMakerUpdateMaxFieldSize(eim, row); if (start > end) { errAbort("end (%d) before start (%d) line %d of %s", end, start, lf->lineIx, lf->fileName); } ++bedCount; totalBases += (end - start); if (usage == NULL || differentString(usage->name, chrom)) { if (hashLookup(uniqHash, chrom)) { errAbort("%s is not sorted at line %d. Please use \"sort -k1,1 -k2,2n\" or bedSort and try again.", lf->fileName, lf->lineIx); } hashAdd(uniqHash, chrom, NULL); struct hashEl *chromHashEl = hashLookup(chromSizesHash, chrom); if (chromHashEl == NULL) errAbort("%s is not found in chromosome sizes file", chrom); int chromSize = ptToInt(chromHashEl->val); AllocVar(usage); usage->name = cloneString(chrom); usage->id = id++; usage->size = chromSize; slAddHead(&usageList, usage); lastStart = -1; } if (end > usage->size) errAbort("End coordinate %d bigger than %s size of %d line %d of %s", end, usage->name, usage->size, lf->lineIx, lf->fileName); usage->itemCount += 1; if (lastStart >= 0) { int diff = start - lastStart; if (diff < minDiff) { if (diff < 0) errAbort("%s is not sorted at line %d. Please use \"sort -k1,1 -k2,2n\" or bedSort and try again.", lf->fileName, lf->lineIx); minDiff = diff; } } lastStart = start; } slReverse(&usageList); double aveSize = 0; if (bedCount > 0) aveSize = (double)totalBases/bedCount; *retMinDiff = minDiff; *retAveSize = aveSize; *retBedCount = bedCount; freeHash(&uniqHash); return usageList; }
void writeSections(struct bbiChromUsage *usageList, struct lineFile *lf, int itemsPerSlot, struct bbiBoundsArray *bounds, int sectionCount, FILE *f, int resTryCount, int resScales[], int resSizes[], boolean doCompress, bits32 *retMaxSectionSize) /* Read through lf, chunking it into sections that get written to f. Save info * about sections in bounds. */ { int maxSectionSize = 0; struct bbiChromUsage *usage = usageList; int itemIx = 0, sectionIx = 0; bits32 reserved32 = 0; UBYTE reserved8 = 0; struct sectionItem items[itemsPerSlot]; struct sectionItem *lastB = NULL; bits32 resEnds[resTryCount]; int resTry; for (resTry = 0; resTry < resTryCount; ++resTry) resEnds[resTry] = 0; struct dyString *stream = dyStringNew(0); /* remove initial browser and track lines */ lineFileRemoveInitialCustomTrackLines(lf); for (;;) { /* Get next line of input if any. */ char *row[5]; int rowSize = lineFileChopNext(lf, row, ArraySize(row)); /* Figure out whether need to output section. */ boolean sameChrom = FALSE; if (rowSize > 0) sameChrom = sameString(row[0], usage->name); if (itemIx >= itemsPerSlot || rowSize == 0 || !sameChrom) { /* Figure out section position. */ bits32 chromId = usage->id; bits32 sectionStart = items[0].start; bits32 sectionEnd = items[itemIx-1].end; /* Save section info for indexing. */ assert(sectionIx < sectionCount); struct bbiBoundsArray *section = &bounds[sectionIx++]; section->offset = ftell(f); section->range.chromIx = chromId; section->range.start = sectionStart; section->range.end = sectionEnd; /* Output section header to stream. */ dyStringClear(stream); UBYTE type = bwgTypeBedGraph; bits16 itemCount = itemIx; dyStringWriteOne(stream, chromId); // chromId dyStringWriteOne(stream, sectionStart); // start dyStringWriteOne(stream, sectionEnd); // end dyStringWriteOne(stream, reserved32); // itemStep dyStringWriteOne(stream, reserved32); // itemSpan dyStringWriteOne(stream, type); // type dyStringWriteOne(stream, reserved8); // reserved dyStringWriteOne(stream, itemCount); // itemCount /* Output each item in section to stream. */ int i; for (i=0; i<itemIx; ++i) { struct sectionItem *item = &items[i]; dyStringWriteOne(stream, item->start); dyStringWriteOne(stream, item->end); dyStringWriteOne(stream, item->val); } /* Save stream to file, compressing if need be. */ if (stream->stringSize > maxSectionSize) maxSectionSize = stream->stringSize; if (doCompress) { size_t maxCompSize = zCompBufSize(stream->stringSize); char compBuf[maxCompSize]; int compSize = zCompress(stream->string, stream->stringSize, compBuf, maxCompSize); mustWrite(f, compBuf, compSize); } else mustWrite(f, stream->string, stream->stringSize); /* If at end of input we are done. */ if (rowSize == 0) break; /* Set up for next section. */ itemIx = 0; if (!sameChrom) { usage = usage->next; assert(usage != NULL); if (!sameString(row[0], usage->name)) errAbort("read %s, expecting %s on line %d in file %s\n", row[0], usage->name, lf->lineIx, lf->fileName); assert(sameString(row[0], usage->name)); lastB = NULL; for (resTry = 0; resTry < resTryCount; ++resTry) resEnds[resTry] = 0; } } /* Parse out input. */ lineFileExpectWords(lf, 4, rowSize); bits32 start = lineFileNeedNum(lf, row, 1); bits32 end = lineFileNeedNum(lf, row, 2); float val = lineFileNeedDouble(lf, row, 3); /* Verify that inputs meets our assumption - that it is a sorted bedGraph file. */ if (start > end) errAbort("Start (%u) after end (%u) line %d of %s", start, end, lf->lineIx, lf->fileName); if (lastB != NULL) { if (lastB->start > start) errAbort("BedGraph not sorted on start line %d of %s", lf->lineIx, lf->fileName); if (lastB->end > start) errAbort("Overlapping regions in bedGraph line %d of %s", lf->lineIx, lf->fileName); } /* Do zoom counting. */ for (resTry = 0; resTry < resTryCount; ++resTry) { bits32 resEnd = resEnds[resTry]; if (start >= resEnd) { resSizes[resTry] += 1; resEnds[resTry] = resEnd = start + resScales[resTry]; } while (end > resEnd) { resSizes[resTry] += 1; resEnds[resTry] = resEnd = resEnd + resScales[resTry]; } } /* Save values in output array. */ struct sectionItem *b = &items[itemIx]; b->start = start; b->end = end; b->val = val; lastB = b; itemIx += 1; } assert(sectionIx == sectionCount); *retMaxSectionSize = maxSectionSize; }