static void expandFaFastBuf(int bufPos, int minExp)
/* Make faFastBuf bigger.
 *   bufPos - number of bytes at the start of the current buffer to carry
 *            over into the new one.
 *   minExp - minimum size in bytes the buffer must have afterwards.
 * On first use the buffer starts at 64k and is doubled until it reaches
 * minExp.  Afterwards the current size is at least doubled.  Aborts via
 * errAbort if the doubled size no longer fits in an unsigned. */
{
/* Work with an unsigned copy of the request so that comparisons against the
 * unsigned sizes below are not distorted by a negative minExp. */
unsigned minSize = (minExp > 0) ? (unsigned)minExp : 0;

if (faFastBufSize == 0)
    {
    /* minSize is below 2^31, so doubling from 64k reaches it before the
     * shift can overflow. */
    unsigned firstSize = 64 * 1024;
    while (firstSize < minSize)
        firstSize <<= 1;
    faFastBuf = needHugeMem(firstSize);
    faFastBufSize = firstSize;
    }
else
    {
    DNA *newBuf;
    unsigned oldSize = faFastBufSize;
    unsigned newBufSize = oldSize + oldSize;
    /* Unsigned arithmetic wraps instead of going negative, so detect
     * overflow by checking that doubling really made the size larger. */
    if (newBufSize <= oldSize)
        errAbort("expandFaFastBuf: integer overflow when trying to "
                 "increase buffer size from %u to a min of %u.",
                 oldSize, minSize);
    while (newBufSize < minSize)
        {
        unsigned doubled = newBufSize << 1;
        if (doubled <= newBufSize)
            errAbort("expandFaFastBuf: integer overflow when trying to "
                     "increase buffer size from %u to a min of %u.",
                     oldSize, minSize);
        newBufSize = doubled;
        }
    newBuf = needHugeMem(newBufSize);
    memcpy(newBuf, faFastBuf, bufPos);
    freeMem(faFastBuf);
    faFastBuf = newBuf;
    faFastBufSize = newBufSize;
    }
}
void agpToFaOne(struct agpFrag **pAgpList, char *agpFile, char *agpSeq,
                char *seqDir, int lastPos, FILE *f)
/* Given one sequence's worth of AGP in pAgpList, process it into FASTA
 * and write to f.
 *   pAgpList - fragment list; reversed in place here and freed before return.
 *   agpFile  - name of the AGP file, used only in the error message.
 *   agpSeq   - name of the sequence being written.
 *   seqDir   - passed through to the fill-in routine.
 *   lastPos  - size in bases of the sequence to build.
 *   f        - output FASTA file.
 * The fill-in routine is chosen by the simpleMulti / simpleMultiMixed /
 * simple command line options. */
{
DNA *dna = NULL;

slReverse(pAgpList);
/* A non-positive size means nothing was collected for this sequence;
 * it must not reach the allocation/memset below. */
if (lastPos <= 0)
    errAbort("%s not found in %s\n", agpSeq, agpFile);

/* Start with a buffer of all 'n' so positions not filled in stay 'n'. */
dna = needHugeMem(lastPos+1);
memset(dna, 'n', lastPos);
dna[lastPos] = 0;

if (optionExists("simpleMulti"))
    {
    simpleMultiFillInSequence(0, seqDir, *pAgpList, dna, lastPos);
    }
else if (optionExists("simpleMultiMixed"))
    {
    simpleMultiFillInSequence(1, seqDir, *pAgpList, dna, lastPos);
    }
else if (optionExists("simple"))
    {
    simpleFillInSequence(seqDir, *pAgpList, dna, lastPos);
    }
else
    {
    gsFillInSequence(seqDir, *pAgpList, dna, lastPos);
    }

verbose(2,"Writing %s (%d bases)\n", agpSeq, lastPos);
faWriteNext(f, agpSeq, dna, lastPos);

/* The sequence buffer is only needed until it has been written. */
freeMem(dna);
agpFragFreeList(pAgpList);
}
struct hash *qacReadToHash(char *fileName) /* Read in a qac file into a hash of qacs keyed by name. */ { boolean isSwapped; FILE *f = qacOpenVerify(fileName, &isSwapped); bits32 compSize, uncSize; struct qac *qac; char *name; struct hash *hash = newHash(18); int count = 0; for (;;) { name = readString(f); if (name == NULL) break; mustReadOne(f, uncSize); if (isSwapped) uncSize = byteSwap32(uncSize); mustReadOne(f, compSize); if (isSwapped) compSize = byteSwap32(compSize); qac = needHugeMem(sizeof(*qac) + compSize - 1); qac->uncSize = uncSize; qac->compSize = compSize; mustRead(f, qac->data, compSize); hashAdd(hash, name, qac); ++count; } carefulClose(&f); printf("Read %d qacs from %s\n", count, fileName); return hash; }
void *needHugeZeroedMem(size_t size)
/* Allocate a large block with needHugeMem and return it with all size
 * bytes set to zero. */
{
void *block = needHugeMem(size);
/* memset returns its first argument, i.e. the freshly cleared block. */
return memset(block, 0, size);
}
static struct bed *randomTrial(struct chrGapList *bounding, struct bed *placed) /* placed bed list has already been sorted by size descending, return is the newly placed bed list */ { struct bed *bedList = NULL; struct bed *bedEl; int placedCount = slCount(placed); int gapCount = countGaps(bounding); int i; struct gap **sizedGaps = NULL; /* an array of pointers */ int maxGapCount = 0; /* We should never have more gaps than the initial set of gaps plus * the placed item count since each placed item only creates one * new gap. This array will be used repeatedly as lists of gaps of * specific sizes are created. The array will be an array of * pointers to the gaps greater than the specified size. * The + 1 on the maxGapCount is to keep the array one larger than * expected maximum so that a safety check can be performed that it * never reaches past the expected maximum. */ maxGapCount = placedCount + gapCount + 1; sizedGaps = needHugeMem((size_t)(sizeof(struct gap *) * maxGapCount)); i = 0; for (bedEl = placed; bedEl != NULL; bedEl = bedEl->next) { struct bed *newBed; int N; int R; int itemSize = bedEl->chromEnd - bedEl->chromStart; if (itemSize < 1) errAbort("ERROR: placing items less than 1 bp in length ? 
%s:%d-%d", bedEl->chrom, bedEl->chromEnd, bedEl->chromStart); N = gapsOfSize(bounding,itemSize, sizedGaps, maxGapCount); /* From those N gaps, randomly select one of them (drand48 = [0.0,1.0)*/ R = floor(N * drand48()); /* interval: [0,N) == [0,N-1] */ if ((R >= N) || (R >= maxGapCount)) errAbort("ERROR: did not expect random " "number %d to be >= %d (or %d)\n", R, N, maxGapCount); /* The newBed is the bedEl translated to a new random location */ newBed = randomInsert(bedEl,sizedGaps[R]); slAddHead(&bedList,newBed); } /* sizedGaps are just a bunch of pointers, the bed element inserts * actually went into the bounding gap list which is going to be * freed up, along with the specially added bed elements back in * the loop that is managing the copying of the bounding list. */ freeMem(sizedGaps); return(bedList); }
struct dnaSeq *cloneDnaSeq(struct dnaSeq *orig)
/* Return a copy of orig with its own name string, its own dna buffer
 * (size+1 bytes copied from the original) and, when the original has a
 * mask, its own clone of that mask. */
{
struct dnaSeq *copy = CloneVar(orig);

/* CloneVar copied the pointers; replace each with a private duplicate. */
copy->name = cloneString(copy->name);
copy->dna = needHugeMem(copy->size+1);
memcpy(copy->dna, orig->dna, copy->size+1);
copy->mask = (orig->mask != NULL) ? bitClone(orig->mask, copy->size) : NULL;
return copy;
}
struct dnaSeq *faReadSeq(char *fileName, boolean isDna)
/* Open fa file and read a single sequence from it.
 * The whole file is read into one zero-terminated buffer which is then
 * handed to faSeqFromMemText together with isDna.  Aborts if the file can
 * not be sized, opened or read. */
{
/* NOTE(review): the size is kept in an int as before; confirm whether
 * fileSize can report sizes that do not fit. */
int maxSize = fileSize(fileName);
int fd;
int total = 0;
DNA *s;

if (maxSize < 0)
    errAbort("can't open %s", fileName);
fd = open(fileName, O_RDONLY);
if (fd < 0)
    errnoAbort("can't open %s", fileName);
s = needHugeMem(maxSize+1);

/* read() may return fewer bytes than asked for, so keep going until the
 * expected amount has arrived or end of file is reached. */
while (total < maxSize)
    {
    ssize_t got = read(fd, s + total, maxSize - total);
    if (got < 0)
        errnoAbort("error reading %s", fileName);
    if (got == 0)
        break;      /* File is shorter than it was when sized. */
    total += got;
    }
close(fd);

/* Terminate after the bytes actually read. */
s[total] = 0;
return faSeqFromMemText(s, isDna);
}
boolean bigWigValsOnChromFetchData(struct bigWigValsOnChrom *chromVals, char *chrom, struct bbiFile *bigWig) /* Fetch data for chromosome from bigWig. Returns FALSE if not data on that chrom. */ { /* Fetch chromosome and size into self. */ freeMem(chromVals->chrom); chromVals->chrom = cloneString(chrom); long chromSize = chromVals->chromSize = bbiChromSize(bigWig, chrom); if (chromSize <= 0) return FALSE; /* Make sure buffers are big enough. */ if (chromSize > chromVals->bufSize) { freeMem(chromVals->valBuf); freeMem(chromVals->covBuf); chromVals->valBuf = needHugeMem((sizeof(double))*chromSize); chromVals->covBuf = bitAlloc(chromSize); chromVals->bufSize = chromSize; } /* Zero out buffers */ bitClear(chromVals->covBuf, chromSize); double *valBuf = chromVals->valBuf; int i; for (i=0; i<chromSize; ++i) valBuf[i] = 0.0; fetchIntoBuf(bigWig, chrom, 0, chromSize, chromVals); #ifdef OLD /* Fetch intervals for this chromosome and fold into buffers. */ struct lm *lm = lmInit(0); struct bbiInterval *iv, *ivList = bigWigIntervalQuery(bigWig, chrom, 0, chromSize, lm); for (iv = ivList; iv != NULL; iv = iv->next) { double val = iv->val; int end = iv->end; for (i=iv->start; i<end; ++i) valBuf[i] = val; bitSetRange(chromVals->covBuf, iv->start, iv->end - iv->start); } lmCleanup(&lm); #endif /* OLD */ return TRUE; }
static void measurePlaced(struct bed *placed)
/* Report, at verbose level 2, the first/last and middle item sizes of the
 * placed list, once in list order and once again after sorting the sizes.
 * Does nothing for an empty list. */
{
struct bed *bedEl;
int placedCount = slCount(placed);
int *sizeArray = NULL;
int i = 0;

/* With no items there is nothing to measure, and the array accesses
 * below would index element -1. */
if (placedCount < 1)
    return;

sizeArray = needHugeMem((size_t)(sizeof(int) * placedCount));

for (bedEl = placed; bedEl != NULL; bedEl = bedEl->next)
    {
    sizeArray[i++] = bedEl->chromEnd - bedEl->chromStart;
    }

/* Sizes in the order the list was given. */
verbose(2,"placed item size range: [%d : %d]\n", sizeArray[0], sizeArray[i-1]);
verbose(2,"placed item median: %d\n", sizeArray[i/2]);

/* The same figures after sorting. */
intSort(i,sizeArray);
verbose(2,"placed item size range: [%d : %d]\n", sizeArray[0], sizeArray[i-1]);
verbose(2,"placed item median: %d\n", sizeArray[i/2]);

freeMem(sizeArray);
}
struct floatPic *floatPicNew(int width, int height)
/* Allocate a floatPic of the given width and height.  The image holds
 * three floats per pixel in one contiguous buffer, and lines[y] points at
 * the first float of row y within that buffer. */
{
struct floatPic *pic = needMem(sizeof(struct floatPic));
long floatsPerLine = 3L * width;
long floatsInImage = floatsPerLine * height;
int y;

pic->width = width;
pic->height = height;
pic->image = needHugeMem(floatsInImage * sizeof(float));

/* Build the table of row start pointers. */
AllocArray(pic->lines, height);
for (y = 0; y < height; ++y)
    pic->lines[y] = pic->image + floatsPerLine * y;
return pic;
}
void itsaMake(int inCount, char *inputs[], char *output) /* itsaMake - Make a suffix array file out of input DNA sequences.. */ { verboseTimeInit(); bits64 maxGenomeSize = 1024LL*1024*1024*4; itsaBaseToValInit(); /* Load all DNA, make sure names are unique, and alphabetize by name. */ struct dnaSeq *seqList = NULL, *seq; struct hash *uniqSeqHash = hashNew(0); bits64 totalDnaSize = 1; /* FOr space between. */ int inputIx; for (inputIx=0; inputIx<inCount; ++inputIx) { char * input = inputs[inputIx]; struct dnaLoad *dl = dnaLoadOpen(input); while ((seq = dnaLoadNext(dl)) != NULL) { verbose(1, "read %s with %d bases\n", seq->name, seq->size); if (hashLookup(uniqSeqHash, seq->name)) errAbort("Input sequence name %s repeated, all must be unique.", seq->name); totalDnaSize += seq->size + 1; if (totalDnaSize > maxGenomeSize) errAbort("Too much DNA. Can only handle up to %lld bases", maxGenomeSize); slAddHead(&seqList, seq); } dnaLoadClose(&dl); } slSort(&seqList, dnaSeqCmpName); verboseTime(1, "Loaded %lld bases in %d sequences", totalDnaSize, slCount(seqList)); /* Allocate big buffer for all DNA. */ DNA *allDna = globalAllDna = needHugeMem(totalDnaSize); allDna[0] = 0; bits64 chromOffset = 1; /* Have zeroes between each chrom, and before and after. */ /* Copy DNA to a single big buffer, and create chromInfo on each sequence. */ struct chromInfo *chrom, *chromList = NULL; for (seq = seqList; seq != NULL; seq = seq->next) { AllocVar(chrom); chrom->name = cloneString(seq->name); chrom->size = seq->size; chrom->offset = chromOffset; slAddHead(&chromList, chrom); toUpperN(seq->dna, seq->size); memcpy(allDna + chromOffset, seq->dna, seq->size + 1); chromOffset += seq->size + 1; } slReverse(&chromList); /* Free up separate dna sequences because we're going to need a lot of RAM soon. */ /* Allocate index array, and offset and list arrays. 
*/ dnaSeqFreeList(&seqList); bits32 *index13; AllocArray(index13, itsaSlotCount); bits32 *offsetArray = needHugeMem(totalDnaSize * sizeof(bits32)); bits32 *listArray = needHugeZeroedMem(totalDnaSize * sizeof(bits32)); verboseTime(1, "Allocated buffers %lld bytes total", (long long)(9LL*totalDnaSize + itsaSlotCount*sizeof(bits32))); /* Where normally we'd keep some sort of structure with a next element to form a list * of matching positions in each slot of our index, to conserve memory we'll do this * with two parallel arrays. Because we're such cheapskates in terms of memory we'll * (and still using 9*genomeSize bytes of RAM) we'll use these arrays for two different * purposes. * In the first phase they will together be used to form linked lists of * offsets, and the 13mer index will point to the first item in each list. In this * phase the offsetArray contains offsets into the allDna structure, and the listArray * contains the next pointers for the list. After the first phase we write out the * suffix array to disk. * In the second phase we read the suffix array back into the offsetArray, and * use the listArray for the traverseArray. We write out the traverse array to finish * things up. */ /* Load up all DNA buffer. */ for (chrom = chromList; chrom != NULL; chrom = chrom->next) { verbose(2, " About to do first pass index\n"); indexChromPass1(chrom, allDna, offsetArray, listArray, index13); verbose(2, " Done first pass index\n"); } verboseTime(1, "Done big bucket sort"); slReverse(&chromList); itsaWriteMerged(chromList, allDna, offsetArray, listArray, index13, output); }
static void itsaWriteMerged(struct chromInfo *chromList, DNA *allDna,
	bits32 *offsetArray, bits32 *listArray, bits32 *index13, char *output)
/* Write out a file that contains a single splix that is the merger of
 * all of the individual splixes in list.   As a side effect will replace
 * offsetArray with suffix array and listArray with traverse array.
 * index13 is rewritten as well: each slot ends up holding one plus the
 * suffix array position of its first entry, or 0 for an empty slot.
 * The header is written twice: first as a placeholder, then again at the
 * end with the magic number, array size and total size filled in. */
{
FILE *f = mustOpen(output, "w+");

/** Allocate header and fill out easy constant fields. */
struct itsaFileHeader *header;
AllocVar(header);
header->majorVersion = ITSA_MAJOR_VERSION;
header->minorVersion = ITSA_MINOR_VERSION;

/* Figure out sizes of names and sequence for each chromosome. */
struct chromInfo *chrom;
bits32 chromNamesSize = 0;
bits64 dnaDiskSize = 1;	/* For initial zero. */
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
   {
   chromNamesSize += strlen(chrom->name) + 1;
   dnaDiskSize += chrom->size + 1;  /* Include separating zeroes. */
   }

/* Fill in most of rest of header fields */
header->chromCount = slCount(chromList);
header->chromNamesSize = roundUpTo4(chromNamesSize);
header->dnaDiskSize = roundUpTo4(dnaDiskSize);
bits32 chromSizesSize = header->chromCount*sizeof(bits32);

/* Write header. */
mustWrite(f, header, sizeof(*header));

/* Write chromosome names. */
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    mustWrite(f, chrom->name, strlen(chrom->name)+1);
zeroPad(f, header->chromNamesSize - chromNamesSize);

/* Write chromosome sizes. */
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    mustWrite(f, &chrom->size, sizeof(chrom->size));
/* Both terms are chromCount*sizeof(bits32), so this pad works out to zero. */
int chromSizesSizePad = chromSizesSize - header->chromCount * sizeof(bits32);
zeroPad(f, chromSizesSizePad);

/* Write out chromosome DNA and zeros before, between, and after. */
mustWrite(f, allDna, dnaDiskSize);
zeroPad(f, header->dnaDiskSize - dnaDiskSize);
verboseTime(1, "Wrote %lld bases of DNA including zero padding", header->dnaDiskSize);

/* Calculate and write suffix array. Convert index13 to index of array as opposed to index
 * of sequence. */
bits64 arraySize = 0;
off_t suffixArrayFileOffset = ftello(f);
int slotCount = itsaSlotCount;
int slotIx;
for (slotIx=0; slotIx < slotCount; ++slotIx)
    {
    int slotSize = finishAndWriteOneSlot(offsetArray, listArray, index13[slotIx], allDna, f);
    /* Convert index13 to hold the position in the suffix array where the first thing matching
     * the corresponding 13-base prefix is found. */
    if (slotSize != 0)
        index13[slotIx] = arraySize+1;	/* The +1 is so we can keep 0 for not found. */
    else
        index13[slotIx] = 0;
    arraySize += slotSize;
    if ((slotIx % 200000 == 0) && slotIx != 0)
	{
	verboseDot();
	if (slotIx % 10000000 == 0)
	    verbose(1, "fine sort bucket %d of %d\n", slotIx, slotCount);
	}
    }
verbose(1, "fine sort bucket %d of %d\n", slotCount, slotCount);
verboseTime(1, "Wrote %lld suffix array positions", arraySize);

/* Now we're done with the offsetArray and listArray buffers, so use them for the
 * next phase. */
bits32 *suffixArray = offsetArray;
offsetArray = NULL;	/* Help make some errors more obvious */
bits32 *traverseArray = listArray;
listArray = NULL;	/* Help make some errors more obvious */

/* Read the suffix array back from the file. */
fseeko(f, suffixArrayFileOffset, SEEK_SET);
mustRead(f, suffixArray, arraySize*sizeof(bits32));
verboseTime(1, "Read suffix array back in");

/* Calculate traverse array and cursor arrays */
memset(traverseArray, 0, arraySize*sizeof(bits32));
UBYTE *cursorArray = needHugeMem(arraySize);
itsaFillInTraverseArray(allDna, suffixArray, arraySize, traverseArray, cursorArray);
verboseTime(1, "Filled in traverseArray");

/* Write out traverse array. */
mustWrite(f, traverseArray, arraySize*sizeof(bits32));
verboseTime(1, "Wrote out traverseArray");

/* Write out 13-mer index. */
mustWrite(f, index13, itsaSlotCount*sizeof(bits32));
verboseTime(1, "Wrote out index13");

/* Write out bits of cursor array corresponding to index. */
for (slotIx=0; slotIx<itsaSlotCount; ++slotIx)
    {
    bits32 indexPos = index13[slotIx];
    if (indexPos == 0)
       fputc(0, f);
    else
       fputc(cursorArray[indexPos-1], f);
    }
verboseTime(1, "Wrote out cursors13");

/* The cursor array is not used past this point. */
freeMem(cursorArray);

/* Update a few fields in header, and go back and write it out again with
 * the correct magic number to indicate it's complete. */
header->magic = ITSA_MAGIC;
header->arraySize = arraySize;
header->size = sizeof(*header)			/* header */
	+ header->chromNamesSize		/* chromosome names */
	+ header->chromCount * sizeof(bits32)	/* chromosome sizes */
	+ header->dnaDiskSize			/* dna sequence */
	+ sizeof(bits32) * arraySize		/* suffix array */
	+ sizeof(bits32) * arraySize		/* traverse array */
	+ sizeof(bits32) * itsaSlotCount	/* index13 */
	+ sizeof(UBYTE) * itsaSlotCount;	/* cursors13 */

rewind(f);
mustWrite(f, header, sizeof(*header));
carefulClose(&f);
verbose(1, "Completed %s is %lld bytes\n", output, header->size);

/* The header was allocated here and is not handed to anyone else. */
freeMem(header);
}
struct sufa *sufaRead(char *fileName, boolean memoryMap)
/* Read in a sufa from a file.  Does this via memory mapping if you like,
 * which will be faster typically for about 100 reads, and slower for more
 * than that (_much_ slower for thousands of reads and more).
 * Aborts if the file can not be opened, read or mapped, if the magic
 * number is wrong, or if the major version is newer than this code.
 * The returned sufa points into the mapped or loaded file image. */
{
/* Open file (low level), read in header, and check it. */
int fd = open(fileName, O_RDONLY);
if (fd < 0)
    errnoAbort("Can't open %s", fileName);
struct sufaFileHeader h;
/* Keep the result signed: compared directly against sizeof, a -1 error
 * return would be converted to a huge unsigned value and slip through. */
ssize_t headerRead = read(fd, &h, sizeof(h));
if (headerRead < 0 || (size_t)headerRead < sizeof(h))
    errnoAbort("Couldn't read header of file %s", fileName);
if (h.magic != SUFA_MAGIC)
    errAbort("%s does not seem to be a sufa file.", fileName);
if (h.majorVersion > SUFA_MAJOR_VERSION)
    errAbort("%s is a newer, incompatible version of sufa format. "
             "This program works on version %d and below. "
	     "%s is version %d.",  fileName, SUFA_MAJOR_VERSION, fileName, h.majorVersion);

struct sufa *sufa;
verbose(2, "sufa file %s size %lld\n", fileName, h.size);

/* Get a pointer to data in memory, via memory map, or allocation and read. */
struct sufaFileHeader *header ;
if (memoryMap)
    {
#ifdef MACHTYPE_sparc
    header = (struct sufaFileHeader *)mmap(NULL, h.size, PROT_READ, MAP_SHARED, fd, 0);
#else
    header = mmap(NULL, h.size, PROT_READ, MAP_FILE|MAP_SHARED, fd, 0);
#endif
    if (header == MAP_FAILED)
	errnoAbort("Couldn't mmap %s, sorry", fileName);
    }
else
    {
    header = needHugeMem(h.size);
    if (lseek(fd, 0, SEEK_SET) < 0)
	errnoAbort("Couldn't seek back to start of sufa file %s.  "
		   "Splix files must be random access files, not pipes and the like"
		   , fileName);
    /* A single read() may deliver less than requested, so read in
     * bounded chunks until the whole file image has arrived. */
    const size_t maxChunk = (size_t)1 << 30;
    char *dest = (char *)header;
    bits64 total = 0;
    while (total < h.size)
	{
	bits64 remaining = h.size - total;
	size_t chunk = (remaining > maxChunk) ? maxChunk : (size_t)remaining;
	ssize_t got = read(fd, dest + total, chunk);
	if (got <= 0)
	    errnoAbort("Couldn't read all of sufa file %s.", fileName);
	total += got;
	}
    }
/* The descriptor is not stored anywhere; a mapping stays valid after its
 * descriptor is closed. */
close(fd);

/* Allocate wrapper structure and fill it in. */
AllocVar(sufa);
sufa->header = header;
sufa->isMapped = memoryMap;

/* Make an array for easy access to chromosome names. */
int chromCount = header->chromCount;
char **chromNames = AllocArray(sufa->chromNames, chromCount);
char *s = pointerOffset(header, sizeof(*header) );
int i;
for (i=0; i<chromCount; ++i)
    {
    chromNames[i] = s;
    s += strlen(s)+1;
    }

/* Keep track of where we are in memmap. */
bits64 mapOffset = sizeof(*header) + header->chromNamesSize;

/* Point into chromSizes array. */
bits32 *chromSizes = sufa->chromSizes
	= pointerOffset(header, mapOffset);
mapOffset += sizeof(bits32) * chromCount;

verbose(2, "total dna size %lld in %d chromosomes\n", (long long)header->dnaDiskSize, header->chromCount);
sufa->allDna = pointerOffset(header, mapOffset);
mapOffset += header->dnaDiskSize;

/* Calculate chromOffset array. */
bits32 offset = 0;
bits32 *chromOffsets = AllocArray(sufa->chromOffsets, chromCount);
for (i=0; i<chromCount; ++i)
    {
    chromOffsets[i] = offset;
    offset += chromSizes[i] + 1;
    verbose(2, "sufa contains %s,  %d bases, %d offset\n",
	sufa->chromNames[i], (int)sufa->chromSizes[i], (int)chromOffsets[i]);
    }

/* Finally point to the suffix array!. */
sufa->array = pointerOffset(header, mapOffset);
mapOffset += header->arraySize * sizeof(bits32);

assert(mapOffset == header->size);	/* Sanity check */
return sufa;
}
void axtHiQualDiffs(char *axtFile, struct hash *qacHash, FILE *f) /* Write out high quality diffs in axtFile to f. */ { char *qName = cloneString(""); UBYTE *qQuals = NULL; UBYTE *quals = NULL; struct qac *qac = NULL; struct axt *axt = NULL; struct lineFile *lf = lineFileOpen(axtFile, TRUE); int qStart, qDir, qPos, qWinStart, qWinEnd, tPos; int qWinSize = optionInt("winSize", 11); int qQualMin = optionInt("diffQualMin", 30); int qWinQualMin = optionInt("winQualMin", 25); int qWinMaxDiff = optionInt("winMaxDiff", 2); boolean qIndelOk = optionExists("indelOk"); boolean qIgnore98 = optionExists("ignore98"); boolean chimpPos = optionExists("chimpPos"); int qHalfWinSize = qWinSize/2; while ((axt = axtRead(lf)) != NULL) { char *qSym = axt->qSym, *tSym = axt->tSym; int symIx, symCount = axt->symCount; char qc,tc; toUpperN(qSym, symCount); toUpperN(tSym, symCount); if (!sameString(axt->qName, qName)) { freez(&qName); qName = cloneString(axt->qName); qac = hashMustFindVal(qacHash, qName); freez(&qQuals); qQuals = needHugeMem(qac->uncSize); rleUncompress(qac->data, qac->compSize, qQuals, qac->uncSize); } if (axt->qStrand == '+') { qStart = axt->qStart; qDir = 1; } else { qStart = qac->uncSize - axt->qStart - 1; qDir = -1; } qPos = qStart; tPos = axt->tStart; for (symIx = 0; symIx < symCount; ++symIx) { qc = qSym[symIx]; tc = tSym[symIx]; if (qc == '-') tPos += 1; else if (tc == '-') qPos += qDir; else { if (qc != tc) { qWinStart = qPos - qHalfWinSize; qWinEnd = qWinStart + qWinSize; if (qWinStart >= 0 && qWinEnd < qac->uncSize) { if (qQuals[qPos] >= qQualMin) { int i; boolean ok = TRUE; for (i = qWinStart; i<qWinEnd; ++i) if (qQuals[i] < qWinQualMin) { ok = FALSE; break; } if (ok) { int diffCount = 0; int symWinStart = symIx - qHalfWinSize; int symWinEnd = symWinStart + qWinSize; for (i=symWinStart; i < symWinEnd; ++i) { qc = qSym[i]; tc = tSym[i]; if (qc == '-' || tc == '-') { ok = FALSE; break; } if (qc != tc) ++diffCount; } if (ok && diffCount <= qWinMaxDiff && 
(!qIgnore98 || qQuals[qPos] != 98) ) { if (chimpPos) fprintf(f, "%s\t%d\t%d\t%c\t%c\t%s\t%d\t%d\n", axt->tName, tPos, tPos+1, tSym[symIx], qSym[symIx], axt->qName, qPos, qPos+1); else fprintf(f, "%s\t%d\t%d\t%c\t%c\n", axt->tName, tPos, tPos+1, tSym[symIx], qSym[symIx]); } } } } } qPos += qDir; tPos += 1; } } axtFree(&axt); } lineFileClose(&lf); }
void bigWigMerge(int inCount, char *inFiles[], char *outFile) /* bigWigMerge - Merge together multiple bigWigs into a single one.. */ { /* Make a list of open bigWig files. */ struct bbiFile *inFile, *inFileList = NULL; int i; for (i=0; i<inCount; ++i) { if (clInList) { addWigsInFile(inFiles[i], &inFileList); } else { inFile = bigWigFileOpen(inFiles[i]); slAddTail(&inFileList, inFile); } } FILE *f = mustOpen(outFile, "w"); struct bbiChromInfo *chrom, *chromList = getAllChroms(inFileList); verbose(1, "Got %d chromosomes from %d bigWigs\nProcessing", slCount(chromList), slCount(inFileList)); double *mergeBuf = NULL; int mergeBufSize = 0; for (chrom = chromList; chrom != NULL; chrom = chrom->next) { struct lm *lm = lmInit(0); /* Make sure merge buffer is big enough. */ int chromSize = chrom->size; verboseDot(); verbose(2, "Processing %s (%d bases)\n", chrom->name, chromSize); if (chromSize > mergeBufSize) { mergeBufSize = chromSize; freeMem(mergeBuf); mergeBuf = needHugeMem(mergeBufSize * sizeof(double)); } int i; for (i=0; i<chromSize; ++i) mergeBuf[i] = 0.0; /* Loop through each input file grabbing data and merging it in. */ for (inFile = inFileList; inFile != NULL; inFile = inFile->next) { struct bbiInterval *ivList = bigWigIntervalQuery(inFile, chrom->name, 0, chromSize, lm); verbose(3, "Got %d intervals in %s\n", slCount(ivList), inFile->fileName); struct bbiInterval *iv; for (iv = ivList; iv != NULL; iv = iv->next) { double val = iv->val; if (val > clClip) val = clClip; int end = iv->end; for (i=iv->start; i < end; ++i) mergeBuf[i] += val; } } /* Output each range of same values as a bedGraph item */ int sameCount; for (i=0; i<chromSize; i += sameCount) { sameCount = doublesTheSame(mergeBuf+i, chromSize-i); double val = mergeBuf[i] + clAdjust; if (val > clThreshold) fprintf(f, "%s\t%d\t%d\t%g\n", chrom->name, i, i + sameCount, val); } lmCleanup(&lm); } verbose(1, "\n"); carefulClose(&f); }
boolean faReadMixedNext(FILE *f, boolean preserveCase, char *defaultName,
    boolean mustStartWithComment, char **retCommentLine, struct dnaSeq **retSeq)
/* Read next sequence from .fa file. Return sequence in retSeq.
 * If retCommentLine is non-null return the '>' line in retCommentLine.
 * The whole thing returns FALSE at end of file.
 * Contains parameter to preserve mixed case.
 *   defaultName - name used when no '>' line is found (NULL means "").
 *   mustStartWithComment - when TRUE, lines before the first '>' line are
 *       skipped; when FALSE, a first line without '>' starts the sequence.
 * The file is scanned twice: once to count letters, then again from the
 * same offset to fill the buffer. */
{
char lineBuf[1024];
int lineSize;
char *words[1];
int c;
off_t offset = ftello(f);
size_t dnaSize = 0;
DNA *dna, *sequence;
char *name = defaultName;

if (name == NULL)
    name = "";
dnaUtilOpen();
if (retCommentLine != NULL)
    *retCommentLine = NULL;
*retSeq = NULL;

/* Skip first lines until it starts with '>' */
for (;;)
    {
    if(fgets(lineBuf, sizeof(lineBuf), f) == NULL)
        {
        if (ferror(f))
            errnoAbort("read of fasta file failed");
        return FALSE;
        }
    lineSize = strlen(lineBuf);
    if (lineBuf[0] == '>')
        {
	if (retCommentLine != NULL)
            *retCommentLine = cloneString(lineBuf);
	/* A '>' line longer than lineBuf arrives truncated.  Consume the
	 * rest of it here so the tail is not taken for sequence below. */
	if (lineSize > 0 && lineBuf[lineSize-1] != '\n')
	    {
	    while ((c = fgetc(f)) != EOF && c != '\n')
	        ;
	    }
        offset = ftello(f);
        chopByWhite(lineBuf, words, ArraySize(words));
        name = words[0]+1;
        break;
        }
    else if (!mustStartWithComment)
        {
        /* No '>' line: rewind so this line is read as sequence. */
        if (fseeko(f, offset, SEEK_SET) < 0)
            errnoAbort("fseek on fasta file failed");
        break;
        }
    else
        offset += lineSize;
    }

/* Count up DNA. */
for (;;)
    {
    c = fgetc(f);
    if (c == EOF || c == '>')
        break;
    if (isalpha(c))
        {
        ++dnaSize;
        }
    }

if (dnaSize == 0)
    {
    warn("Invalid fasta format: sequence size == 0 for element %s",name);
    }

/* Allocate DNA and fill it up from file. */
dna = sequence = needHugeMem(dnaSize+1);
if (fseeko(f, offset, SEEK_SET) < 0)
    errnoAbort("fseek on fasta file failed");
for (;;)
    {
    c = fgetc(f);
    if (c == EOF || c == '>')
        break;
    if (isalpha(c))
        {
        /* check for non-DNA char */
        if (ntChars[c] == 0)
            {
            *dna++ = preserveCase ? 'N' : 'n';
            }
        else
            {
            *dna++ = preserveCase ? c : ntChars[c];
            }
        }
    }
/* Leave the '>' of the next record in the stream for the next call. */
if (c == '>')
    ungetc(c, f);
*dna = 0;

*retSeq = newDnaSeq(sequence, dnaSize, name);
if (ferror(f))
    errnoAbort("read of fasta file failed");
return TRUE;
}
void bedItemOverlapCount(struct hash *chromHash, char *infile, char *outfile){ unsigned maxChromSize = 0; unitSize *counts = (unitSize *)NULL; FILE *f = mustOpen(outfile, "w"); struct hashCookie hc = hashFirst(chromHash); struct hashEl *hel; while( (hel = hashNext(&hc)) != NULL) { unsigned num = (unsigned) ptToInt(hel->val); maxChromSize = max(num, maxChromSize); } verbose(2,"#\tmaxChromSize: %u\n", maxChromSize); if (maxChromSize < 1) errAbort("maxChromSize is zero ?"); /* Allocate just once for the largest chrom and reuse this array */ counts = needHugeMem(sizeof(unitSize) * maxChromSize); /* Reset the array to be zero to be reused */ memset((void *)counts, 0, sizeof(unitSize)*(size_t)maxChromSize); unsigned chromSize = 0; char *prevChrom = (char *)NULL; boolean outputToDo = FALSE; struct hash *seenHash = newHash(5); struct lineFile *bf = lineFileOpen(infile , TRUE); struct bed *bed = (struct bed *)NULL; char *row[12]; int numFields = doBed12 ? 12 : 3; while (lineFileNextRow(bf,row, numFields)) { int i; bed = bedLoadN(row, numFields); verbose(3,"#\t%s\t%d\t%d\n",bed->chrom,bed->chromStart, bed->chromEnd); if (prevChrom && differentWord(bed->chrom,prevChrom)) // End a chr { verbose(2,"#\tchrom %s done, size %d\n", prevChrom, chromSize); if (outputToDo) outputCounts(counts, prevChrom, chromSize, f); outputToDo = FALSE; memset((void *)counts, 0, sizeof(unitSize)*(size_t)maxChromSize); /* zero counts */ freez(&prevChrom); // prevChrom is now NULL so it will be caught by next if! } if ((char *)NULL == prevChrom) // begin a chr { if (hashLookup(seenHash, bed->chrom)) errAbort("ERROR:input file not sorted. 
%s seen before on line %d\n", bed->chrom, bf->lineIx); hashAdd(seenHash, bed->chrom, NULL); prevChrom = cloneString(bed->chrom); chromSize = hashIntVal(chromHash, prevChrom); verbose(2,"#\tchrom %s starting, size %d\n", prevChrom,chromSize); } if (bed->chromEnd > chromSize) { // check for circular chrM if (doBed12 || bed->chromStart>=chromSize || differentWord(bed->chrom,"chrM")) { warn("ERROR: %s\t%d\t%d", bed->chrom, bed->chromStart, bed->chromEnd); errAbort("chromEnd > chromSize ? %d > %d", bed->chromEnd,chromSize); } for (i = bed->chromStart; i < chromSize; ++i) INCWOVERFLOW(counts,i); for (i = 0; i < (bed->chromEnd - chromSize); ++i) INCWOVERFLOW(counts,i); } else if (doBed12) { int *starts = bed->chromStarts; int *sizes = bed->blockSizes; int *endStarts = &bed->chromStarts[bed->blockCount]; for(; starts < endStarts; starts++, sizes++) { unsigned int end = *starts + *sizes + bed->chromStart; for (i = *starts + bed->chromStart; i < end; ++i) INCWOVERFLOW(counts,i); } } else { for (i = bed->chromStart; i < bed->chromEnd; ++i) INCWOVERFLOW(counts, i); } outputToDo = TRUE; bedFree(&bed); // plug the memory leak } lineFileClose(&bf); // Note, next file could be on same chr! if (outputToDo) outputCounts(counts, prevChrom, chromSize, f); if (doOutBounds) fprintf(stderr, "min %lu max %lu\n", (unsigned long)overMin, (unsigned long)overMax); verbose(2,"#\tchrom %s done, size %d\n", prevChrom, chromSize); carefulClose(&f); freeMem(counts); freez(&prevChrom); // hashFreeWithVals(&chromHash, freez); freeHash(&seenHash); }
static struct statistic *gapStats(struct chrGapList *cList, char *zeroBedFile, char *shoulderBedFile, char *distanceFile) { struct statistic *returnStats; struct chrGapList *cl; int chrCount = 0; int gapCountNonZeroSize = 0; int gapCountZeroSize = 0; int gapCountShoulderSize = 0; unsigned long totalGapSize = 0; int maxGap = 0; int minGap = BIGNUM; int *gapSizeArray = NULL; int i; int boundingElementCount = 0; int placedItemCount = 0; FILE *zeroFH = NULL; FILE *distFH = NULL; FILE *shoulderFH = NULL; AllocVar(returnStats); /* first count number of non-zero gaps */ for (cl=cList; cl != NULL; cl = cl->next) { struct gap *gl; int gapCount = 0; int zeroSized = 0; int shoulderSized = 0; boolean firstOne = TRUE; for (gl = cl->gList; gl != NULL; gl = gl->next) { int gapSize = gl->gapSize; if (firstOne) { firstOne = FALSE; if (gl->isUpstreamBound) ++boundingElementCount; else ++placedItemCount; } if (gl->isDownstreamBound) ++boundingElementCount; else ++placedItemCount; if (gapSize > 0) { totalGapSize += gapSize; if (gapSize > maxGap) maxGap = gapSize; if (gapSize < minGap) minGap = gapSize; if (gapSize <= shoulder) ++shoulderSized; ++gapCount; } else { ++zeroSized; } } gapCountNonZeroSize += gapCount; gapCountZeroSize += zeroSized; gapCountShoulderSize += shoulderSized; ++chrCount; verbose(4,"'%s': %d gaps + %d of size zero = %d\n", cl->chrom, gapCount, zeroSized, gapCount+zeroSized); } verbose(3,"counted %d chroms and %d gaps ( + %d size zero = %d total gaps)" "\n\ton the bounding list\n", chrCount, gapCountNonZeroSize, gapCountZeroSize, gapCountNonZeroSize+gapCountZeroSize); returnStats->chromCount = chrCount; returnStats->totalGaps = gapCountNonZeroSize + gapCountZeroSize; returnStats->sizeZeroGapCount = gapCountZeroSize; returnStats->boundingElementCount = boundingElementCount; /* now copy all the values to a integer array for more detailed * stats measurements */ if (0 == gapCountNonZeroSize) { returnStats->meanGap = 0.0; returnStats->maxGap = maxGap; 
returnStats->minGap = minGap; returnStats->medianGap = 0; } else { gapSizeArray = needHugeMem((size_t)(sizeof(int) * gapCountNonZeroSize)); i = 0; for (cl=cList; cl != NULL; cl = cl->next) { struct gap *gl; for (gl = cl->gList; gl != NULL; gl = gl->next) { int gapSize = gl->gapSize; if (gapSize > 0) { gapSizeArray[i] = gapSize; ++i; } } } verbose(3,"assigned %d values to int array\n", i); intSort(i,gapSizeArray); /* 0.5 rounds up to next integer */ returnStats->meanGap = 0.5 + ((double)totalGapSize/(double)gapCountNonZeroSize); returnStats->maxGap = gapSizeArray[i-1]; returnStats->minGap = gapSizeArray[0]; returnStats->medianGap = gapSizeArray[i/2]; } verbose(2,"average gap size: %d = %ul / %d (non-zero size gaps only)\n", returnStats->meanGap, totalGapSize, gapCountNonZeroSize); verbose(2,"maximum gap size: %d\n", returnStats->maxGap); verbose(2,"median gap size: %d\n", returnStats->medianGap); verbose(2,"minimum gap size: %d\n", returnStats->minGap); verbose(2,"minimum gap: %d, maximum gap: %d\n", minGap, maxGap); if (minGap != returnStats->minGap) errAbort("ERROR: didn't find the same minimum gap ?"); if (maxGap != returnStats->maxGap) errAbort("ERROR: didn't find the same maximum gap ?"); verbose(2,"bounding element count: %d, placed element count: %d\n", boundingElementCount, placedItemCount); /* if there are placed elements, measure their nearest neighbor statistics */ if (placedItemCount) { int i; int sumDistances = 0; int *placedDistances = needHugeMem((size_t)(sizeof(int) * placedItemCount)); int placedCount = 0; int boundingElements = 0; if (distanceFile) distFH=mustOpen(distanceFile, "w"); for (cl=cList; cl != NULL; cl = cl->next) { struct gap *gl; struct gap *next; struct bed *upstreamBound = NULL; struct bed *downstreamBound = NULL; int chrCount = 0; ++boundingElements; /* first one must be bounding */ /* after this, only need to count the downstream ones */ for (gl = cl->gList; gl != NULL; gl = next) { struct gap *gEl; int gapCount = 0; int 
shoulderCount = 0; ++chrCount; /* make note of upstream and downstream bounding elements */ if (NULL == upstreamBound || gl->isUpstreamBound) { if (gl->isUpstreamBound) { upstreamBound = gl->upstream; downstreamBound = nextDownstreamBound(gl); ++boundingElements; if (NULL == downstreamBound) errAbort("Can not find a downstream" " bounding element ?"); } else errAbort("Do not find a bounding element as" " first upstream item ?"); } /* measure all downstream elements as long as they * are not bounding elements */ for (gEl = gl; (gEl != NULL) && (! gEl->isDownstreamBound); gEl = gEl->next) { /* protect against negative results with the max(0,..) */ int upstreamDist = max(0, (gEl->downstream->chromStart - upstreamBound->chromEnd)); int downstreamDist = max(0, (downstreamBound->chromStart - gEl->downstream->chromEnd)); int minDistance; if (upstreamOnly) minDistance = upstreamDist; else if (downstreamOnly) minDistance = downstreamDist; else minDistance = min(upstreamDist, downstreamDist); if (distFH) fprintf (distFH, "%s\t%d\t%d\t%s_%d\t%d\n", cl->chrom, gEl->downstream->chromStart, gEl->downstream->chromEnd, cl->chrom, placedCount, minDistance); if (minDistance < 0) errAbort("minimum distance < 0 ?"); if (minDistance == 0) { ++gapCount; if (zeroBedFile) { if (! zeroFH) zeroFH=mustOpen(zeroBedFile, "w"); fprintf (zeroFH, "%s\t%d\t%d\t%s_%d.%d\n", cl->chrom, gEl->downstream->chromStart, gEl->downstream->chromEnd, cl->chrom, chrCount, gapCount); } } else if ((minDistance > 0) && (minDistance <= shoulder)) { ++shoulderCount; if (shoulderBedFile) { if (! 
shoulderFH) shoulderFH=mustOpen(shoulderBedFile, "w"); fprintf (shoulderFH, "%s\t%d\t%d\t%s_%d.%d\n", cl->chrom, gEl->downstream->chromStart, gEl->downstream->chromEnd, cl->chrom, chrCount, shoulderCount); } } placedDistances[placedCount++] = minDistance; sumDistances += minDistance; } if (gEl) next = gEl->next; else next = NULL; } } /* for (cl=cList; cl != NULL; cl = cl->next) */ returnStats->placedItemCount = placedCount; returnStats->meanNearestNeighbor = 0.5 + (double)sumDistances / placedCount; if (boundingElements != boundingElementCount) errAbort("ERROR: did not find the same number of bounding elements ? %d ? %d =! %d", boundingElements, boundingElementCount, returnStats->boundingElementCount); intSort(placedCount,placedDistances); returnStats->medianNearestNeighbor = placedDistances[placedCount/2]; returnStats->maximumNearestNeighbor = placedDistances[placedCount-1]; verbose(2,"measured %d placed items\n", placedCount); verbose(2,"mean distance: %d = %d / %d\n", returnStats->meanNearestNeighbor, sumDistances, placedCount); verbose(2,"median distance: %d\n", returnStats->medianNearestNeighbor); verbose(2,"maximum distance: %d\n", returnStats->maximumNearestNeighbor); for (i = 0; i < placedCount; ++i) if (placedDistances[i] > 0) break; /* this doesn't need the + 1 */ returnStats->zeroNeighbor = i + 1; returnStats->zeroNeighbor = i; for ( ; i < placedCount; ++i) if (placedDistances[i] > shoulder) break; /* for a while we were counting this without the zero distances */ returnStats->placedWithinShoulder = (i+1) - returnStats->zeroNeighbor; returnStats->placedWithinShoulder = i; verbose(2,"%d - number of items zero distance to nearest " "bounding element\n", returnStats->zeroNeighbor); verbose(2,"%d - number of items of non-zero distance within %d bp of " "nearest bounding element\n", returnStats->placedWithinShoulder, shoulder); if ((placedCount - returnStats->zeroNeighbor) > 0) verbose(2,"%% %.04f - percent of of items of non-zero distance " "within %d bp of 
nearest bounding element\n", 100.0 * returnStats->placedWithinShoulder / (placedCount-returnStats->zeroNeighbor), shoulder); else errAbort("something wrong with placed item count %d " "minus zeroDistance Count %d", placedCount, returnStats->zeroNeighbor); freeMem(placedDistances); } /* if (placedItemCount) */ carefulClose(&zeroFH); carefulClose(&distFH); carefulClose(&shoulderFH); freeMem(gapSizeArray); return (returnStats); }