void wigCorrelate(int inCount, char **inNames, char *outName) /* wigCorrelate - Produce a table that correlates all pairs of wigs.. */ { int i,j; FILE *f = mustOpen(outName, "w"); verboseTimeInit(); for (i=0; i<inCount-1; ++i) { char *iName = inNames[i]; struct metaWig *iMeta = metaWigOpen(iName); verboseTime(2, "parsed %s into %p", iName, iMeta); for (j=i+1; j<inCount; ++j) { char *jName = inNames[j]; struct metaWig *jMeta = metaWigOpen(jName); verboseTime(2, "parsed %s into %p", jName, jMeta); fprintf(f, "%s\t%s\t", iName, jName); fflush(f); double r = correlatePair(iMeta, jMeta); fprintf(f, "%g\n", r); fflush(f); verboseTime(2, "correlated %g from %s and %s", r, iName, jName); metaWigClose(&jMeta); } metaWigClose(&iMeta); } carefulClose(&f); }
void splatToEland(char *inName, char *outName) /* splatToEland - Convert from splat to eland format.. */ { verboseTimeInit(); struct splatAli *list = splatAliLoadAll(inName); verboseTime(1, "Loaded %d sequences from %s", slCount(list), inName); if (needSort(list)) { slSort(&list, splatAliCmpReadName); verboseTime(1, "Sorted by read name"); } FILE *f = mustOpen(outName, "w"); struct splatAli *el, *next; for (el = list; el != NULL; el = next) { int sameCount; findNextDifferent(el, &next, &sameCount); int bestScore, bestCount; splatAliLookForBest(el, next, &bestScore, &bestCount); int subCounts[3]; countSubsInList(el, next, subCounts); if (multi) elandMultiOutput(el, next, bestScore, bestCount, subCounts, f); else elandSingleOutput(el, next, bestScore, bestCount, subCounts, f); } }
void verboseTime(int verbosity, char *label, ...) /* Print label and how long it's been since last call. Start time can be * initialized with verboseTimeInit, otherwise the elapsed time will be * zero. */ { assert(label != NULL); // original version allowed this, but breaks some GCCs if (lastTime < 0) verboseTimeInit(); long time = clock1000(); va_list args; va_start(args, label); verboseVa(verbosity, label, args); verbose(verbosity, ": %ld millis\n", time - lastTime); lastTime = time; va_end(args); }
void itsaMake(int inCount, char *inputs[], char *output) /* itsaMake - Make a suffix array file out of input DNA sequences.. */ { verboseTimeInit(); bits64 maxGenomeSize = 1024LL*1024*1024*4; itsaBaseToValInit(); /* Load all DNA, make sure names are unique, and alphabetize by name. */ struct dnaSeq *seqList = NULL, *seq; struct hash *uniqSeqHash = hashNew(0); bits64 totalDnaSize = 1; /* FOr space between. */ int inputIx; for (inputIx=0; inputIx<inCount; ++inputIx) { char * input = inputs[inputIx]; struct dnaLoad *dl = dnaLoadOpen(input); while ((seq = dnaLoadNext(dl)) != NULL) { verbose(1, "read %s with %d bases\n", seq->name, seq->size); if (hashLookup(uniqSeqHash, seq->name)) errAbort("Input sequence name %s repeated, all must be unique.", seq->name); totalDnaSize += seq->size + 1; if (totalDnaSize > maxGenomeSize) errAbort("Too much DNA. Can only handle up to %lld bases", maxGenomeSize); slAddHead(&seqList, seq); } dnaLoadClose(&dl); } slSort(&seqList, dnaSeqCmpName); verboseTime(1, "Loaded %lld bases in %d sequences", totalDnaSize, slCount(seqList)); /* Allocate big buffer for all DNA. */ DNA *allDna = globalAllDna = needHugeMem(totalDnaSize); allDna[0] = 0; bits64 chromOffset = 1; /* Have zeroes between each chrom, and before and after. */ /* Copy DNA to a single big buffer, and create chromInfo on each sequence. */ struct chromInfo *chrom, *chromList = NULL; for (seq = seqList; seq != NULL; seq = seq->next) { AllocVar(chrom); chrom->name = cloneString(seq->name); chrom->size = seq->size; chrom->offset = chromOffset; slAddHead(&chromList, chrom); toUpperN(seq->dna, seq->size); memcpy(allDna + chromOffset, seq->dna, seq->size + 1); chromOffset += seq->size + 1; } slReverse(&chromList); /* Free up separate dna sequences because we're going to need a lot of RAM soon. */ /* Allocate index array, and offset and list arrays. */ dnaSeqFreeList(&seqList); bits32 *index13; AllocArray(index13, itsaSlotCount); bits32 *offsetArray = needHugeMem(totalDnaSize * sizeof(bits32)); bits32 *listArray = needHugeZeroedMem(totalDnaSize * sizeof(bits32)); verboseTime(1, "Allocated buffers %lld bytes total", (long long)(9LL*totalDnaSize + itsaSlotCount*sizeof(bits32))); /* Where normally we'd keep some sort of structure with a next element to form a list * of matching positions in each slot of our index, to conserve memory we'll do this * with two parallel arrays. Because we're such cheapskates in terms of memory we'll * (and still using 9*genomeSize bytes of RAM) we'll use these arrays for two different * purposes. * In the first phase they will together be used to form linked lists of * offsets, and the 13mer index will point to the first item in each list. In this * phase the offsetArray contains offsets into the allDna structure, and the listArray * contains the next pointers for the list. After the first phase we write out the * suffix array to disk. * In the second phase we read the suffix array back into the offsetArray, and * use the listArray for the traverseArray. We write out the traverse array to finish * things up. */ /* Load up all DNA buffer. */ for (chrom = chromList; chrom != NULL; chrom = chrom->next) { verbose(2, " About to do first pass index\n"); indexChromPass1(chrom, allDna, offsetArray, listArray, index13); verbose(2, " Done first pass index\n"); } verboseTime(1, "Done big bucket sort"); slReverse(&chromList); itsaWriteMerged(chromList, allDna, offsetArray, listArray, index13, output); }
void bbFileCreate( char *inName, /* Input file in a tabular bed format <chrom><start><end> + whatever. */ char *chromSizes, /* Two column tab-separated file: <chromosome> <size>. */ int blockSize, /* Number of items to bundle in r-tree. 1024 is good. */ int itemsPerSlot, /* Number of items in lowest level of tree. 64 is good. */ char *asText, /* Field definitions in a string */ struct asObject *as, /* Field definitions parsed out */ boolean doCompress, /* If TRUE then compress data. */ struct slName *extraIndexList, /* List of extra indexes to add */ char *outName) /* BigBed output file name. */ /* Convert tab-separated bed file to binary indexed, zoomed bigBed version. */ { /* Set up timing measures. */ verboseTimeInit(); struct lineFile *lf = lineFileOpen(inName, TRUE); bits16 fieldCount = slCount(as->columnList); bits16 extraIndexCount = slCount(extraIndexList); struct bbExIndexMaker *eim = NULL; if (extraIndexList != NULL) eim = bbExIndexMakerNew(extraIndexList, as); /* Load in chromosome sizes. */ struct hash *chromSizesHash = NULL; if (sizesIs2Bit) chromSizesHash = twoBitChromHash(chromSizes); else chromSizesHash = bbiChromSizesFromFile(chromSizes); verbose(2, "Read %d chromosomes and sizes from %s\n", chromSizesHash->elCount, chromSizes); /* Do first pass, mostly just scanning file and counting hits per chromosome. */ int minDiff = 0; double aveSize = 0; bits64 bedCount = 0; bits32 uncompressBufSize = 0; struct bbiChromUsage *usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, eim, &minDiff, &aveSize, &bedCount, tabSep); verboseTime(1, "pass1 - making usageList (%d chroms)", slCount(usageList)); verbose(2, "%d chroms in %s. Average span of beds %f\n", slCount(usageList), inName, aveSize); /* Open output file and write dummy header. */ FILE *f = mustOpen(outName, "wb"); bbiWriteDummyHeader(f); bbiWriteDummyZooms(f); /* Write out autoSql string */ bits64 asOffset = ftell(f); mustWrite(f, asText, strlen(asText) + 1); verbose(2, "as definition has %d columns\n", fieldCount); /* Write out dummy total summary. */ struct bbiSummaryElement totalSum; ZeroVar(&totalSum); bits64 totalSummaryOffset = ftell(f); bbiSummaryElementWrite(f, &totalSum); /* Write out dummy header extension */ bits64 extHeaderOffset = ftell(f); bits16 extHeaderSize = 64; repeatCharOut(f, 0, extHeaderSize); /* Write out extra index stuff if need be. */ bits64 extraIndexListOffset = 0; bits64 extraIndexListEndOffset = 0; if (extraIndexList != NULL) { extraIndexListOffset = ftell(f); int extraIndexSize = 16 + 4*1; // Fixed record size 16, plus 1 times field size of 4 repeatCharOut(f, 0, extraIndexSize*extraIndexCount); extraIndexListEndOffset = ftell(f); } /* Write out chromosome/size database. */ bits64 chromTreeOffset = ftell(f); bbiWriteChromInfo(usageList, blockSize, f); /* Set up to keep track of possible initial reduction levels. */ int resScales[bbiMaxZoomLevels], resSizes[bbiMaxZoomLevels]; int resTryCount = bbiCalcResScalesAndSizes(aveSize, resScales, resSizes); /* Write out primary full resolution data in sections, collect stats to use for reductions. */ bits64 dataOffset = ftell(f); bits32 blockCount = 0; bits32 maxBlockSize = 0; struct bbiBoundsArray *boundsArray = NULL; writeOne(f, bedCount); if (bedCount > 0) { blockCount = bbiCountSectionsNeeded(usageList, itemsPerSlot); AllocArray(boundsArray, blockCount); lineFileRewind(lf); if (eim) bbExIndexMakerAllocChunkArrays(eim, bedCount); writeBlocks(usageList, lf, as, itemsPerSlot, boundsArray, blockCount, doCompress, f, resTryCount, resScales, resSizes, eim, bedCount, fieldCount, &maxBlockSize); } verboseTime(1, "pass2 - checking and writing primary data (%lld records, %d fields)", (long long)bedCount, fieldCount); /* Write out primary data index. */ bits64 indexOffset = ftell(f); cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), blockCount, blockSize, 1, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset, indexOffset, f); freez(&boundsArray); verboseTime(2, "index write"); /* Declare arrays and vars that track the zoom levels we actually output. */ bits32 zoomAmounts[bbiMaxZoomLevels]; bits64 zoomDataOffsets[bbiMaxZoomLevels]; bits64 zoomIndexOffsets[bbiMaxZoomLevels]; /* Call monster zoom maker library function that bedGraphToBigWig also uses. */ int zoomLevels = 0; if (bedCount > 0) { zoomLevels = bbiWriteZoomLevels(lf, f, blockSize, itemsPerSlot, bedWriteReducedOnceReturnReducedTwice, fieldCount, doCompress, indexOffset - dataOffset, usageList, resTryCount, resScales, resSizes, zoomAmounts, zoomDataOffsets, zoomIndexOffsets, &totalSum); } /* Write out extra indexes if need be. */ if (eim) { int i; for (i=0; i < eim->indexCount; ++i) { eim->fileOffsets[i] = ftell(f); maxBedNameSize = eim->maxFieldSize[i]; qsort(eim->chunkArrayArray[i], bedCount, sizeof(struct bbNamedFileChunk), bbNamedFileChunkCmpByName); assert(sizeof(struct bbNamedFileChunk) == sizeof(eim->chunkArrayArray[i][0])); bptFileBulkIndexToOpenFile(eim->chunkArrayArray[i], sizeof(eim->chunkArrayArray[i][0]), bedCount, blockSize, bbNamedFileChunkKey, maxBedNameSize, bbNamedFileChunkVal, sizeof(bits64) + sizeof(bits64), f); verboseTime(1, "Sorting and writing extra index %d", i); } } /* Figure out buffer size needed for uncompression if need be. */ if (doCompress) { int maxZoomUncompSize = itemsPerSlot * sizeof(struct bbiSummaryOnDisk); uncompressBufSize = max(maxBlockSize, maxZoomUncompSize); } /* Go back and rewrite header. */ rewind(f); bits32 sig = bigBedSig; bits16 version = bbiCurrentVersion; bits16 summaryCount = zoomLevels; bits32 reserved32 = 0; bits64 reserved64 = 0; bits16 definedFieldCount = bedN; /* Write fixed header */ writeOne(f, sig); writeOne(f, version); writeOne(f, summaryCount); writeOne(f, chromTreeOffset); writeOne(f, dataOffset); writeOne(f, indexOffset); writeOne(f, fieldCount); writeOne(f, definedFieldCount); writeOne(f, asOffset); writeOne(f, totalSummaryOffset); writeOne(f, uncompressBufSize); writeOne(f, extHeaderOffset); assert(ftell(f) == 64); /* Write summary headers with data. */ int i; verbose(2, "Writing %d levels of zoom\n", zoomLevels); for (i=0; i<zoomLevels; ++i) { verbose(3, "zoomAmounts[%d] = %d\n", i, (int)zoomAmounts[i]); writeOne(f, zoomAmounts[i]); writeOne(f, reserved32); writeOne(f, zoomDataOffsets[i]); writeOne(f, zoomIndexOffsets[i]); } /* Write rest of summary headers with no data. */ for (i=zoomLevels; i<bbiMaxZoomLevels; ++i) { writeOne(f, reserved32); writeOne(f, reserved32); writeOne(f, reserved64); writeOne(f, reserved64); } /* Write total summary. */ fseek(f, totalSummaryOffset, SEEK_SET); bbiSummaryElementWrite(f, &totalSum); /* Write extended header */ fseek(f, extHeaderOffset, SEEK_SET); writeOne(f, extHeaderSize); writeOne(f, extraIndexCount); writeOne(f, extraIndexListOffset); repeatCharOut(f, 0, 52); // reserved assert(ftell(f) - extHeaderOffset == extHeaderSize); /* Write extra index offsets if need be. */ if (extraIndexCount != 0) { fseek(f, extraIndexListOffset, SEEK_SET); int i; for (i=0; i<extraIndexCount; ++i) { // Write out fixed part of index info bits16 type = 0; // bPlusTree type bits16 indexFieldCount = 1; writeOne(f, type); writeOne(f, indexFieldCount); writeOne(f, eim->fileOffsets[i]); repeatCharOut(f, 0, 4); // reserved // Write out field list - easy this time because for now always only one field. bits16 fieldId = eim->indexFields[i]; writeOne(f, fieldId); repeatCharOut(f, 0, 2); // reserved } assert(ftell(f) == extraIndexListEndOffset); } /* Write end signature. */ fseek(f, 0L, SEEK_END); writeOne(f, sig); /* Clean up. */ lineFileClose(&lf); carefulClose(&f); freeHash(&chromSizesHash); bbiChromUsageFreeList(&usageList); asObjectFreeList(&as); }
void bedGraphToBigWig(char *inName, char *chromSizes, char *outName) /* bedGraphToBigWig - Convert a bedGraph program to bigWig.. */ { verboseTimeInit(); struct lineFile *lf = lineFileOpen(inName, TRUE); struct hash *chromSizesHash = bbiChromSizesFromFile(chromSizes); verbose(2, "%d chroms in %s\n", chromSizesHash->elCount, chromSizes); int minDiff = 0, i; double aveSize = 0; bits64 bedCount = 0; bits32 uncompressBufSize = 0; struct bbiChromUsage *usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, NULL, &minDiff, &aveSize, &bedCount); verboseTime(2, "pass1"); verbose(2, "%d chroms in %s, minDiff=%d, aveSize=%g, bedCount=%lld\n", slCount(usageList), inName, minDiff, aveSize, bedCount); /* Write out dummy header, zoom offsets. */ FILE *f = mustOpen(outName, "wb"); bbiWriteDummyHeader(f); bbiWriteDummyZooms(f); /* Write out dummy total summary. */ struct bbiSummaryElement totalSum; ZeroVar(&totalSum); bits64 totalSummaryOffset = ftell(f); bbiSummaryElementWrite(f, &totalSum); /* Write out chromosome/size database. */ bits64 chromTreeOffset = ftell(f); bbiWriteChromInfo(usageList, blockSize, f); /* Set up to keep track of possible initial reduction levels. */ int resScales[bbiMaxZoomLevels], resSizes[bbiMaxZoomLevels]; int resTryCount = bbiCalcResScalesAndSizes(aveSize, resScales, resSizes); /* Write out primary full resolution data in sections, collect stats to use for reductions. */ bits64 dataOffset = ftell(f); bits64 sectionCount = bbiCountSectionsNeeded(usageList, itemsPerSlot); writeOne(f, sectionCount); struct bbiBoundsArray *boundsArray; AllocArray(boundsArray, sectionCount); lineFileRewind(lf); bits32 maxSectionSize = 0; writeSections(usageList, lf, itemsPerSlot, boundsArray, sectionCount, f, resTryCount, resScales, resSizes, doCompress, &maxSectionSize); verboseTime(2, "pass2"); /* Write out primary data index. */ bits64 indexOffset = ftell(f); cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), sectionCount, blockSize, 1, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset, indexOffset, f); verboseTime(2, "index write"); /* Declare arrays and vars that track the zoom levels we actually output. */ bits32 zoomAmounts[bbiMaxZoomLevels]; bits64 zoomDataOffsets[bbiMaxZoomLevels]; bits64 zoomIndexOffsets[bbiMaxZoomLevels]; /* Call monster zoom maker library function that bedToBigBed also uses. */ int zoomLevels = bbiWriteZoomLevels(lf, f, blockSize, itemsPerSlot, bedGraphWriteReducedOnceReturnReducedTwice, 4, doCompress, indexOffset - dataOffset, usageList, resTryCount, resScales, resSizes, zoomAmounts, zoomDataOffsets, zoomIndexOffsets, &totalSum); /* Figure out buffer size needed for uncompression if need be. */ if (doCompress) { int maxZoomUncompSize = itemsPerSlot * sizeof(struct bbiSummaryOnDisk); uncompressBufSize = max(maxSectionSize, maxZoomUncompSize); } /* Go back and rewrite header. */ rewind(f); bits32 sig = bigWigSig; bits16 version = bbiCurrentVersion; bits16 summaryCount = zoomLevels; bits16 reserved16 = 0; bits32 reserved32 = 0; bits64 reserved64 = 0; /* Write fixed header */ writeOne(f, sig); writeOne(f, version); writeOne(f, summaryCount); writeOne(f, chromTreeOffset); writeOne(f, dataOffset); writeOne(f, indexOffset); writeOne(f, reserved16); // fieldCount writeOne(f, reserved16); // definedFieldCount writeOne(f, reserved64); // autoSqlOffset writeOne(f, totalSummaryOffset); writeOne(f, uncompressBufSize); writeOne(f, reserved64); // nameIndexOffset assert(ftell(f) == 64); /* Write summary headers with data. */ verbose(2, "Writing %d levels of zoom\n", zoomLevels); for (i=0; i<zoomLevels; ++i) { verbose(3, "zoomAmounts[%d] = %d\n", i, (int)zoomAmounts[i]); writeOne(f, zoomAmounts[i]); writeOne(f, reserved32); writeOne(f, zoomDataOffsets[i]); writeOne(f, zoomIndexOffsets[i]); } /* Write rest of summary headers with no data. */ for (i=zoomLevels; i<bbiMaxZoomLevels; ++i) { writeOne(f, reserved32); writeOne(f, reserved32); writeOne(f, reserved64); writeOne(f, reserved64); } /* Write total summary. */ fseek(f, totalSummaryOffset, SEEK_SET); bbiSummaryElementWrite(f, &totalSum); /* Write end signature. */ fseek(f, 0L, SEEK_END); writeOne(f, sig); lineFileClose(&lf); carefulClose(&f); }
void encode2MakeEncode3(char *sourceDir, char *sourceManifest, char *destDir, char *outMake, char *outManifest, char *outRename) /* encode2MakeEncode3 - Copy files in encode2 manifest and in case of tar'd files rezip them * independently. */ { struct encode2Manifest *fileList = encode2ManifestShortLoadAll(sourceManifest); verbose(2, "Loaded information on %d files from %s\n", slCount(fileList), sourceManifest); verboseTimeInit(); FILE *f = mustOpen(outMake, "w"); FILE *manF = mustOpen(outManifest, "w"); FILE *renameF = mustOpen(outRename, "w"); struct encode2Manifest *mi; int destDirPrefixSize = strlen(destDir)+1; /* +1 for / */ struct hash *destDirHash = hashNew(0); makeDirOnlyOnce(destDir, destDirHash); /* Print first dependency in makefile - the one that causes all files to be made. */ fprintf(f, "startHere: all\n\techo all done\n\n"); /* Write out each file target, and save also list of all targets. */ struct slName *targetList = NULL; int itemNo = 0; for (mi = fileList; mi != NULL; mi = mi->next) { char *origFileName = cloneString(mi->fileName); /* Keep a list of what all it goes to. */ struct slName *oneTargetList = NULL; /* Make path to source file. */ char sourcePath[PATH_LEN]; safef(sourcePath, sizeof(sourcePath), "%s/%s", sourceDir, mi->fileName); /* Make destination dir */ char localDir[PATH_LEN]; splitPath(mi->fileName, localDir, NULL, NULL); char destSubDir[PATH_LEN]; safef(destSubDir, sizeof(destSubDir), "%s/%s", destDir, localDir); makeDirOnlyOnce(destSubDir, destDirHash); char destPath[PATH_LEN]; safef(destPath, sizeof(destPath), "%s/%s", destDir, mi->fileName); processManifestItem(++itemNo, mi, sourceDir, destDir, &oneTargetList, f, manF); /* Write out rename info. */ if (oneTargetList != NULL) { if (oneTargetList->next != NULL || !sameString(origFileName, oneTargetList->name + destDirPrefixSize)) { fprintf(renameF, "%s", origFileName); struct slName *target; for (target = oneTargetList; target != NULL; target = target->next) { fprintf(renameF, " %s", target->name + destDirPrefixSize); } fprintf(renameF, "\n"); } } /* Save local target list to big one. */ targetList = slCat(oneTargetList, targetList); } slReverse(&targetList); fprintf(f, "all:"); struct slName *target; for (target = targetList; target != NULL; target = target->next) fprintf(f, " %s", target->name); fprintf(f, "\n"); carefulClose(&renameF); carefulClose(&manF); carefulClose(&f); }