예제 #1
0
void bigWigFileCreate(
	char *inName, 		/* Input file in ascii wiggle format. */
	char *chromSizes, 	/* Two column tab-separated file: <chromosome> <size>. */
	int blockSize,		/* Number of items to bundle in r-tree.  1024 is good. */
	int itemsPerSlot,	/* Number of items in lowest level of tree.  512 is good. */
	boolean clipDontDie,	/* If TRUE then clip items off end of chrom rather than dying. */
	boolean compress,	/* If TRUE then compress data. */
	boolean keepAllChromosomes,	/* If TRUE then store all chromosomes in chromosomal b-tree. */
	boolean fixedSummaries,	/* If TRUE then impose fixed summary levels. */
	char *outName)
/* Convert ascii format wig file (in fixedStep, variableStep or bedGraph format) 
 * to binary big wig format. */
{
/* This code needs to agree with code in two other places currently - bigBedFileCreate,
 * and bbiFileOpen.  I'm thinking of refactoring to share at least between
 * bigBedFileCreate and bigWigFileCreate.  It'd be great so it could be structured
 * so that it could send the input in one chromosome at a time, and send in the zoom
 * stuff only after all the chromosomes are done.  This'd potentially reduce the memory
 * footprint by a factor of 2 or 4.  Still, for now it works. -JK */
struct hash *chromSizeHash = bbiChromSizesFromFile(chromSizes);
struct lm *lm = lmInit(0);
struct bwgSection *sectionList = bwgParseWig(inName, clipDontDie, chromSizeHash, itemsPerSlot, lm);
if (sectionList == NULL)
    errAbort("%s is empty of data", inName);
bwgCreate(sectionList, chromSizeHash, blockSize, itemsPerSlot, compress, keepAllChromosomes, fixedSummaries, outName);
lmCleanup(&lm);
}
예제 #2
0
void bedClip(char *inFile, char *chromSizes, char *outFile)
/* bedClip - Remove lines from bed file that refer to off-chromosome places.. */
{
struct hash *chromSizesHash = bbiChromSizesFromFile(chromSizes);
struct lineFile *lf = lineFileOpen(inFile, TRUE);
FILE *f = mustOpen(outFile, "w");
char *line;
while (lineFileNextReal(lf, &line))
    {
    char *chrom = nextWord(&line);
    char *startString = nextWord(&line);
    char *endString = nextWord(&line);
    if (endString == NULL)
        errAbort("Need at least three fields line %d of %s", lf->lineIx, lf->fileName);
    if (startString[0] == '-')
	{
	verbose(2, "Clipping negative line %d of %s\n", lf->lineIx, lf->fileName);
        continue;		// Clip off negatives
	}
    if (!isdigit(startString[0]))
        errAbort("Expecting number got %s line %d of %s", startString, lf->lineIx, lf->fileName);
    if (!isdigit(endString[0]))
        errAbort("Expecting number got %s line %d of %s", endString, lf->lineIx, lf->fileName);
    int start = sqlUnsigned(startString);
    int end = sqlUnsigned(endString);
    if (start >= end)
	{
	verbose(2, "Clipping end <= start line %d of %s\n", lf->lineIx, lf->fileName);
	continue;
	}
    struct hashEl *hel = hashLookup(chromSizesHash, chrom);
    if (hel == NULL)
        errAbort("Chromosome %s isn't in %s line %d of %s\n", chrom, chromSizes, lf->lineIx, lf->fileName);
    int chromSize = ptToInt(hel->val);
    if (end > chromSize)
	{
	verbose(2, "Clipping end > chromSize line %d of %s\n", lf->lineIx, lf->fileName);
	continue;
	}
    fprintf(f, "%s\t%s\t%s", chrom, startString, endString);
    line = skipLeadingSpaces(line);
    if (line == NULL || line[0] == 0)
        fputc('\n', f);
    else
        fprintf(f, "\t%s\n", line);
    }
carefulClose(&f);
}
예제 #3
0
void bbFileCreate(
	char *inName, 	  /* Input file in a tabular bed format <chrom><start><end> + whatever. */
	char *chromSizes, /* Two column tab-separated file: <chromosome> <size>. */
	int blockSize,	  /* Number of items to bundle in r-tree.  1024 is good. */
	int itemsPerSlot, /* Number of items in lowest level of tree.  64 is good. */
	char *asText,	  /* Field definitions in a string */
	struct asObject *as,  /* Field definitions parsed out */
	boolean doCompress, /* If TRUE then compress data. */
	struct slName *extraIndexList,	/* List of extra indexes to add */
	char *outName)    /* BigBed output file name. */
/* Convert tab-separated bed file to binary indexed, zoomed bigBed version. */
{
/* Set up timing measures. */
verboseTimeInit();
struct lineFile *lf = lineFileOpen(inName, TRUE);

bits16 fieldCount = slCount(as->columnList);
bits16 extraIndexCount = slCount(extraIndexList);

struct bbExIndexMaker *eim = NULL;
if (extraIndexList != NULL)
    eim = bbExIndexMakerNew(extraIndexList, as);

/* Load in chromosome sizes. */
struct hash *chromSizesHash = NULL;

if (sizesIs2Bit)
    chromSizesHash = twoBitChromHash(chromSizes);
else
    chromSizesHash = bbiChromSizesFromFile(chromSizes);
verbose(2, "Read %d chromosomes and sizes from %s\n",  chromSizesHash->elCount, chromSizes);

/* Do first pass, mostly just scanning file and counting hits per chromosome. */
int minDiff = 0;
double aveSize = 0;
bits64 bedCount = 0;
bits32 uncompressBufSize = 0;
struct bbiChromUsage *usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, eim, 
    &minDiff, &aveSize, &bedCount, tabSep);
verboseTime(1, "pass1 - making usageList (%d chroms)", slCount(usageList));
verbose(2, "%d chroms in %s. Average span of beds %f\n", slCount(usageList), inName, aveSize);

/* Open output file and write dummy header. */
FILE *f = mustOpen(outName, "wb");
bbiWriteDummyHeader(f);
bbiWriteDummyZooms(f);

/* Write out autoSql string */
bits64 asOffset = ftell(f);
mustWrite(f, asText, strlen(asText) + 1);
verbose(2, "as definition has %d columns\n", fieldCount);

/* Write out dummy total summary. */
struct bbiSummaryElement totalSum;
ZeroVar(&totalSum);
bits64 totalSummaryOffset = ftell(f);
bbiSummaryElementWrite(f, &totalSum);

/* Write out dummy header extension */
bits64 extHeaderOffset = ftell(f);
bits16 extHeaderSize = 64;
repeatCharOut(f, 0, extHeaderSize);

/* Write out extra index stuff if need be. */
bits64 extraIndexListOffset = 0;
bits64 extraIndexListEndOffset = 0;
if (extraIndexList != NULL)
    {
    extraIndexListOffset = ftell(f);
    int extraIndexSize = 16 + 4*1;   // Fixed record size 16, plus 1 times field size of 4 
    repeatCharOut(f, 0, extraIndexSize*extraIndexCount);
    extraIndexListEndOffset = ftell(f);
    }

/* Write out chromosome/size database. */
bits64 chromTreeOffset = ftell(f);
bbiWriteChromInfo(usageList, blockSize, f);

/* Set up to keep track of possible initial reduction levels. */
int resScales[bbiMaxZoomLevels], resSizes[bbiMaxZoomLevels];
int resTryCount = bbiCalcResScalesAndSizes(aveSize, resScales, resSizes);

/* Write out primary full resolution data in sections, collect stats to use for reductions. */
bits64 dataOffset = ftell(f);
bits32 blockCount = 0;
bits32 maxBlockSize = 0;
struct bbiBoundsArray *boundsArray = NULL;
writeOne(f, bedCount);
if (bedCount > 0)
    {
    blockCount = bbiCountSectionsNeeded(usageList, itemsPerSlot);
    AllocArray(boundsArray, blockCount);
    lineFileRewind(lf);
    if (eim)
	bbExIndexMakerAllocChunkArrays(eim, bedCount);
    writeBlocks(usageList, lf, as, itemsPerSlot, boundsArray, blockCount, doCompress,
	    f, resTryCount, resScales, resSizes, eim, bedCount, fieldCount, &maxBlockSize);
    }
verboseTime(1, "pass2 - checking and writing primary data (%lld records, %d fields)", 
	(long long)bedCount, fieldCount);

/* Write out primary data index. */
bits64 indexOffset = ftell(f);
cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), blockCount,
    blockSize, 1, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset, 
    indexOffset, f);
freez(&boundsArray);
verboseTime(2, "index write");

/* Declare arrays and vars that track the zoom levels we actually output. */
bits32 zoomAmounts[bbiMaxZoomLevels];
bits64 zoomDataOffsets[bbiMaxZoomLevels];
bits64 zoomIndexOffsets[bbiMaxZoomLevels];

/* Call monster zoom maker library function that bedGraphToBigWig also uses. */
int zoomLevels = 0;
if (bedCount > 0)
    {
    zoomLevels = bbiWriteZoomLevels(lf, f, blockSize, itemsPerSlot,
	bedWriteReducedOnceReturnReducedTwice, fieldCount,
	doCompress, indexOffset - dataOffset, 
	usageList, resTryCount, resScales, resSizes, 
	zoomAmounts, zoomDataOffsets, zoomIndexOffsets, &totalSum);
    }

/* Write out extra indexes if need be. */
if (eim)
    {
    int i;
    for (i=0; i < eim->indexCount; ++i)
        {
	eim->fileOffsets[i] = ftell(f);
	maxBedNameSize = eim->maxFieldSize[i];
	qsort(eim->chunkArrayArray[i], bedCount, 
	    sizeof(struct bbNamedFileChunk), bbNamedFileChunkCmpByName);
	assert(sizeof(struct bbNamedFileChunk) == sizeof(eim->chunkArrayArray[i][0]));
	bptFileBulkIndexToOpenFile(eim->chunkArrayArray[i], sizeof(eim->chunkArrayArray[i][0]), 
	    bedCount, blockSize, bbNamedFileChunkKey, maxBedNameSize, bbNamedFileChunkVal, 
	    sizeof(bits64) + sizeof(bits64), f);
	verboseTime(1, "Sorting and writing extra index %d", i);
	}
    }

/* Figure out buffer size needed for uncompression if need be. */
if (doCompress)
    {
    int maxZoomUncompSize = itemsPerSlot * sizeof(struct bbiSummaryOnDisk);
    uncompressBufSize = max(maxBlockSize, maxZoomUncompSize);
    }

/* Go back and rewrite header. */
rewind(f);
bits32 sig = bigBedSig;
bits16 version = bbiCurrentVersion;
bits16 summaryCount = zoomLevels;
bits32 reserved32 = 0;
bits64 reserved64 = 0;

bits16 definedFieldCount = bedN;

/* Write fixed header */
writeOne(f, sig);
writeOne(f, version);
writeOne(f, summaryCount);
writeOne(f, chromTreeOffset);
writeOne(f, dataOffset);
writeOne(f, indexOffset);
writeOne(f, fieldCount);
writeOne(f, definedFieldCount);
writeOne(f, asOffset);
writeOne(f, totalSummaryOffset);
writeOne(f, uncompressBufSize);
writeOne(f, extHeaderOffset);
assert(ftell(f) == 64);

/* Write summary headers with data. */
int i;
verbose(2, "Writing %d levels of zoom\n", zoomLevels);
for (i=0; i<zoomLevels; ++i)
    {
    verbose(3, "zoomAmounts[%d] = %d\n", i, (int)zoomAmounts[i]);
    writeOne(f, zoomAmounts[i]);
    writeOne(f, reserved32);
    writeOne(f, zoomDataOffsets[i]);
    writeOne(f, zoomIndexOffsets[i]);
    }
/* Write rest of summary headers with no data. */
for (i=zoomLevels; i<bbiMaxZoomLevels; ++i)
    {
    writeOne(f, reserved32);
    writeOne(f, reserved32);
    writeOne(f, reserved64);
    writeOne(f, reserved64);
    }

/* Write total summary. */
fseek(f, totalSummaryOffset, SEEK_SET);
bbiSummaryElementWrite(f, &totalSum);

/* Write extended header */
fseek(f, extHeaderOffset, SEEK_SET);
writeOne(f, extHeaderSize);
writeOne(f, extraIndexCount);
writeOne(f, extraIndexListOffset);
repeatCharOut(f, 0, 52);    // reserved
assert(ftell(f) - extHeaderOffset == extHeaderSize);

/* Write extra index offsets if need be. */
if (extraIndexCount != 0)
    {
    fseek(f, extraIndexListOffset, SEEK_SET);
    int i;
    for (i=0; i<extraIndexCount; ++i)
        {
	// Write out fixed part of index info
	bits16 type = 0;    // bPlusTree type
	bits16 indexFieldCount = 1;
	writeOne(f, type);
	writeOne(f, indexFieldCount);
	writeOne(f, eim->fileOffsets[i]);
	repeatCharOut(f, 0, 4);  // reserved

	// Write out field list - easy this time because for now always only one field.
	bits16 fieldId = eim->indexFields[i];
	writeOne(f, fieldId);
	repeatCharOut(f, 0, 2); // reserved
	}
    assert(ftell(f) == extraIndexListEndOffset);
    }

/* Write end signature. */
fseek(f, 0L, SEEK_END);
writeOne(f, sig);


/* Clean up. */
lineFileClose(&lf);
carefulClose(&f);
freeHash(&chromSizesHash);
bbiChromUsageFreeList(&usageList);
asObjectFreeList(&as);
}
예제 #4
0
파일: from_kent.c 프로젝트: lidaof/iteres
void bedGraphToBigWig(char *inName, char *chromSizes, char *outName)
/* bedGraphToBigWig - Convert a bedGraph program to bigWig.. */
{
verboseTimeInit();
struct lineFile *lf = lineFileOpen(inName, TRUE);
struct hash *chromSizesHash = bbiChromSizesFromFile(chromSizes);
verbose(2, "%d chroms in %s\n", chromSizesHash->elCount, chromSizes);
int minDiff = 0, i;
double aveSize = 0;
bits64 bedCount = 0;
bits32 uncompressBufSize = 0;
struct bbiChromUsage *usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, NULL, 
    &minDiff, &aveSize, &bedCount);
verboseTime(2, "pass1");
verbose(2, "%d chroms in %s, minDiff=%d, aveSize=%g, bedCount=%lld\n", 
    slCount(usageList), inName, minDiff, aveSize, bedCount);

/* Write out dummy header, zoom offsets. */
FILE *f = mustOpen(outName, "wb");
bbiWriteDummyHeader(f);
bbiWriteDummyZooms(f);

/* Write out dummy total summary. */
struct bbiSummaryElement totalSum;
ZeroVar(&totalSum);
bits64 totalSummaryOffset = ftell(f);
bbiSummaryElementWrite(f, &totalSum);

/* Write out chromosome/size database. */
bits64 chromTreeOffset = ftell(f);
bbiWriteChromInfo(usageList, blockSize, f);

/* Set up to keep track of possible initial reduction levels. */
int resScales[bbiMaxZoomLevels], resSizes[bbiMaxZoomLevels];
int resTryCount = bbiCalcResScalesAndSizes(aveSize, resScales, resSizes);

/* Write out primary full resolution data in sections, collect stats to use for reductions. */
bits64 dataOffset = ftell(f);
bits64 sectionCount = bbiCountSectionsNeeded(usageList, itemsPerSlot);
writeOne(f, sectionCount);
struct bbiBoundsArray *boundsArray;
AllocArray(boundsArray, sectionCount);
lineFileRewind(lf);
bits32 maxSectionSize = 0;
writeSections(usageList, lf, itemsPerSlot, boundsArray, sectionCount, f,
	resTryCount, resScales, resSizes, doCompress, &maxSectionSize);
verboseTime(2, "pass2");

/* Write out primary data index. */
bits64 indexOffset = ftell(f);
cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), sectionCount,
    blockSize, 1, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset, 
    indexOffset, f);
verboseTime(2, "index write");

/* Declare arrays and vars that track the zoom levels we actually output. */
bits32 zoomAmounts[bbiMaxZoomLevels];
bits64 zoomDataOffsets[bbiMaxZoomLevels];
bits64 zoomIndexOffsets[bbiMaxZoomLevels];

/* Call monster zoom maker library function that bedToBigBed also uses. */
int zoomLevels = bbiWriteZoomLevels(lf, f, blockSize, itemsPerSlot,
    bedGraphWriteReducedOnceReturnReducedTwice, 4,
    doCompress, indexOffset - dataOffset, 
    usageList, resTryCount, resScales, resSizes, 
    zoomAmounts, zoomDataOffsets, zoomIndexOffsets, &totalSum);


/* Figure out buffer size needed for uncompression if need be. */
if (doCompress)
    {
    int maxZoomUncompSize = itemsPerSlot * sizeof(struct bbiSummaryOnDisk);
    uncompressBufSize = max(maxSectionSize, maxZoomUncompSize);
    }

/* Go back and rewrite header. */
rewind(f);
bits32 sig = bigWigSig;
bits16 version = bbiCurrentVersion;
bits16 summaryCount = zoomLevels;
bits16 reserved16 = 0;
bits32 reserved32 = 0;
bits64 reserved64 = 0;

/* Write fixed header */
writeOne(f, sig);
writeOne(f, version);
writeOne(f, summaryCount);
writeOne(f, chromTreeOffset);
writeOne(f, dataOffset);
writeOne(f, indexOffset);
writeOne(f, reserved16);	// fieldCount
writeOne(f, reserved16);	// definedFieldCount
writeOne(f, reserved64);	// autoSqlOffset
writeOne(f, totalSummaryOffset);
writeOne(f, uncompressBufSize);
writeOne(f, reserved64);	// nameIndexOffset
assert(ftell(f) == 64);

/* Write summary headers with data. */
verbose(2, "Writing %d levels of zoom\n", zoomLevels);
for (i=0; i<zoomLevels; ++i)
    {
    verbose(3, "zoomAmounts[%d] = %d\n", i, (int)zoomAmounts[i]);
    writeOne(f, zoomAmounts[i]);
    writeOne(f, reserved32);
    writeOne(f, zoomDataOffsets[i]);
    writeOne(f, zoomIndexOffsets[i]);
    }
/* Write rest of summary headers with no data. */
for (i=zoomLevels; i<bbiMaxZoomLevels; ++i)
    {
    writeOne(f, reserved32);
    writeOne(f, reserved32);
    writeOne(f, reserved64);
    writeOne(f, reserved64);
    }

/* Write total summary. */
fseek(f, totalSummaryOffset, SEEK_SET);
bbiSummaryElementWrite(f, &totalSum);

/* Write end signature. */
fseek(f, 0L, SEEK_END);
writeOne(f, sig);

lineFileClose(&lf);
carefulClose(&f);
}