Esempio n. 1
0
static struct bbiSummary *bedWriteReducedOnceReturnReducedTwice(struct bbiChromUsage *usageList, 
	int fieldCount, struct lineFile *lf, bits32 initialReduction, bits32 initialReductionCount, 
	int zoomIncrement, int blockSize, int itemsPerSlot, boolean doCompress,
	struct lm *lm, FILE *f, bits64 *retDataStart, bits64 *retIndexStart,
	struct bbiSummaryElement *totalSum)
/* Write out data reduced by factor of initialReduction.  Also calculate and keep in memory
 * next reduction level.  This is more work than some ways, but it keeps us from having to
 * keep the first reduction entirely in memory. */
{
struct bbiSummary *twiceReducedList = NULL;
bits32 doubleReductionSize = initialReduction * zoomIncrement;
struct bbiChromUsage *usage = usageList;
struct bbiBoundsArray *boundsArray, *boundsPt, *boundsEnd;
boundsPt = AllocArray(boundsArray, initialReductionCount);
boundsEnd = boundsPt + initialReductionCount;

*retDataStart = ftell(f);
writeOne(f, initialReductionCount);

/* This gets a little complicated I'm afraid.  The strategy is to:
 *   1) Build up a range tree that represents coverage depth on that chromosome
 *      This also has the nice side effect of getting rid of overlaps.
 *   2) Stream through the range tree, outputting the initial summary level and
 *      further reducing. 
 */
boolean firstTime = TRUE;
struct bbiSumOutStream *stream = bbiSumOutStreamOpen(itemsPerSlot, f, doCompress);
for (usage = usageList; usage != NULL; usage = usage->next)
    {
    struct bbiSummary oneSummary, *sum = NULL;
    struct rbTree *rangeTree = rangeTreeForBedChrom(lf, usage->name);
    struct range *range, *rangeList = rangeTreeList(rangeTree);
    for (range = rangeList; range != NULL; range = range->next)
        {
	/* Grab values we want from range. */
	double val = ptToInt(range->val);
	int start = range->start;
	int end = range->end;
	bits32 size = end - start;

	/* Add to total summary. */
	if (firstTime)
	    {
	    totalSum->validCount = size;
	    totalSum->minVal = totalSum->maxVal = val;
	    totalSum->sumData = val*size;
	    totalSum->sumSquares = val*val*size;
	    firstTime = FALSE;
	    }
	else
	    {
	    totalSum->validCount += size;
	    if (val < totalSum->minVal) totalSum->minVal = val;
	    if (val > totalSum->maxVal) totalSum->maxVal = val;
	    totalSum->sumData += val*size;
	    totalSum->sumSquares += val*val*size;
	    }

	/* If start past existing block then output it. */
	if (sum != NULL && sum->end <= start && sum->end < usage->size)
	    {
	    bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize, 
		&boundsPt, boundsEnd, lm, stream);
	    sum = NULL;
	    }
	/* If don't have a summary we're working on now, make one. */
	if (sum == NULL)
	    {
	    oneSummary.chromId = usage->id;
	    oneSummary.start = start;
	    oneSummary.end = start + initialReduction;
	    if (oneSummary.end > usage->size) oneSummary.end = usage->size;
	    oneSummary.minVal = oneSummary.maxVal = val;
	    oneSummary.sumData = oneSummary.sumSquares = 0.0;
	    oneSummary.validCount = 0;
	    sum = &oneSummary;
	    }
	/* Deal with case where might have to split an item between multiple summaries.  This
	 * loop handles all but the final affected summary in that case. */
	while (end > sum->end)
	    {
	    /* Fold in bits that overlap with existing summary and output. */
	    int overlap = rangeIntersection(start, end, sum->start, sum->end);
	    assert(overlap > 0);
	    verbose(3, "Splitting size %d at %d, overlap %d\n", end - start, sum->end, overlap);
	    sum->validCount += overlap;
	    if (sum->minVal > val) sum->minVal = val;
	    if (sum->maxVal < val) sum->maxVal = val;
	    sum->sumData += val * overlap;
	    sum->sumSquares += val*val * overlap;
	    bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize, 
		    &boundsPt, boundsEnd, lm, stream);
	    size -= overlap;

	    /* Move summary to next part. */
	    sum->start = start = sum->end;
	    sum->end = start + initialReduction;
	    if (sum->end > usage->size) sum->end = usage->size;
	    sum->minVal = sum->maxVal = val;
	    sum->sumData = sum->sumSquares = 0.0;
	    sum->validCount = 0;
	    }

	/* Add to summary. */
	sum->validCount += size;
	if (sum->minVal > val) sum->minVal = val;
	if (sum->maxVal < val) sum->maxVal = val;
	sum->sumData += val * size;
	sum->sumSquares += val*val * size;
	}
    if (sum != NULL)
	{
	bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize, 
	    &boundsPt, boundsEnd, lm, stream);
	}
    rangeTreeFree(&rangeTree);
    }
bbiSumOutStreamClose(&stream);

/* Write out 1st zoom index. */
int indexOffset = *retIndexStart = ftell(f);
assert(boundsPt == boundsEnd);
cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), initialReductionCount,
    blockSize, itemsPerSlot, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset, 
    indexOffset, f);

freez(&boundsArray);
slReverse(&twiceReducedList);
return twiceReducedList;
}
Esempio n. 2
0
struct bbiSummary *bedGraphWriteReducedOnceReturnReducedTwice(struct bbiChromUsage *usageList, 
	int fieldCount, struct lineFile *lf, bits32 initialReduction, bits32 initialReductionCount, 
	int zoomIncrement, int blockSize, int itemsPerSlot, boolean doCompress,
	struct lm *lm, FILE *f, bits64 *retDataStart, bits64 *retIndexStart,
	struct bbiSummaryElement *totalSum)
/* Write out data reduced by factor of initialReduction.  Also calculate and keep in memory
 * next reduction level.  This is more work than some ways, but it keeps us from having to
 * keep the first reduction entirely in memory. */
{
struct bbiSummary *twiceReducedList = NULL;
bits32 doubleReductionSize = initialReduction * zoomIncrement;
struct bbiChromUsage *usage = usageList;
struct bbiSummary oneSummary, *sum = NULL;
struct bbiBoundsArray *boundsArray, *boundsPt, *boundsEnd;
boundsPt = AllocArray(boundsArray, initialReductionCount);
boundsEnd = boundsPt + initialReductionCount;

*retDataStart = ftell(f);
writeOne(f, initialReductionCount);
boolean firstRow = TRUE;

struct bbiSumOutStream *stream = bbiSumOutStreamOpen(itemsPerSlot, f, doCompress);

/* remove initial browser and track lines */
lineFileRemoveInitialCustomTrackLines(lf);

for (;;)
    {
    /* Get next line of input if any. */
    char *row[5];
    int rowSize = lineFileChopNext(lf, row, ArraySize(row));

    /* Output last section and break if at end of file. */
    if (rowSize == 0 && sum != NULL)
	{
	bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize, 
		&boundsPt, boundsEnd, lm, stream);
	break;
	}

    /* Parse out row. */
    char *chrom = row[0];
    bits32 start = sqlUnsigned(row[1]);
    bits32 end = sqlUnsigned(row[2]);
    float val = sqlFloat(row[3]);

    /* Update total summary stuff. */
    bits32 size = end-start;
    if (firstRow)
	{
        totalSum->validCount = size;
	totalSum->minVal = totalSum->maxVal = val;
	totalSum->sumData = val*size;
	totalSum->sumSquares = val*val*size;
	firstRow = FALSE;
	}
    else
        {
	totalSum->validCount += size;
	if (val < totalSum->minVal) totalSum->minVal = val;
	if (val > totalSum->maxVal) totalSum->maxVal = val;
	totalSum->sumData += val*size;
	totalSum->sumSquares += val*val*size;
	}

    /* If new chromosome output existing block. */
    if (differentString(chrom, usage->name))
        {
	usage = usage->next;
	bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
		&boundsPt, boundsEnd, lm, stream);
	sum = NULL;
	}

    /* If start past existing block then output it. */
    else if (sum != NULL && sum->end <= start)
	{
	bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize, 
		&boundsPt, boundsEnd, lm, stream);
	sum = NULL;
	}

    /* If don't have a summary we're working on now, make one. */
    if (sum == NULL)
        {
	oneSummary.chromId = usage->id;
	oneSummary.start = start;
	oneSummary.end = start + initialReduction;
	if (oneSummary.end > usage->size) oneSummary.end = usage->size;
	oneSummary.minVal = oneSummary.maxVal = val;
	oneSummary.sumData = oneSummary.sumSquares = 0.0;
	oneSummary.validCount = 0;
	sum = &oneSummary;
	}
    
    /* Deal with case where might have to split an item between multiple summaries.  This
     * loop handles all but the final affected summary in that case. */
    while (end > sum->end)
        {
	verbose(3, "Splitting start %d, end %d, sum->start %d, sum->end %d\n", start, end, sum->start, sum->end);
	/* Fold in bits that overlap with existing summary and output. */
	bits32 overlap = rangeIntersection(start, end, sum->start, sum->end);
	sum->validCount += overlap;
	if (sum->minVal > val) sum->minVal = val;
	if (sum->maxVal < val) sum->maxVal = val;
	sum->sumData += val * overlap;
	sum->sumSquares += val*val * overlap;
	bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize, 
		&boundsPt, boundsEnd, lm, stream);
	size -= overlap;

	/* Move summary to next part. */
	sum->start = start = sum->end;
	sum->end = start + initialReduction;
	if (sum->end > usage->size) sum->end = usage->size;
	sum->minVal = sum->maxVal = val;
	sum->sumData = sum->sumSquares = 0.0;
	sum->validCount = 0;
	}

    /* Add to summary. */
    sum->validCount += size;
    if (sum->minVal > val) sum->minVal = val;
    if (sum->maxVal < val) sum->maxVal = val;
    sum->sumData += val * size;
    sum->sumSquares += val*val * size;
    }
bbiSumOutStreamClose(&stream);

/* Write out 1st zoom index. */
int indexOffset = *retIndexStart = ftell(f);
assert(boundsPt == boundsEnd);
cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), initialReductionCount,
    blockSize, itemsPerSlot, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset, 
    indexOffset, f);

freez(&boundsArray);
slReverse(&twiceReducedList);
return twiceReducedList;
}