Ejemplo n.º 1
0
static void expandFaFastBuf(int bufPos, int minExp)
/* Make faFastBuf bigger.
 *   bufPos - number of bytes at the start of the current buffer that are
 *            copied into the replacement buffer.
 *   minExp - smallest size the buffer is allowed to have afterwards.
 * When no buffer exists yet, one of 64k is allocated (doubled until it is
 * at least minExp).  Otherwise the size is doubled at least once and the
 * old contents are carried over.  In the grow case an overflow of the size
 * computation aborts the program. */
{
if (faFastBufSize == 0)
    {
    /* First allocation: nothing to preserve. */
    faFastBufSize = 64 * 1024;
    while (minExp > faFastBufSize)
        faFastBufSize <<= 1;
    faFastBuf = needHugeMem(faFastBufSize);
    }
else
    {
    DNA *newBuf;
    unsigned newBufSize = faFastBufSize + faFastBufSize;
    while (newBufSize < minExp)
        {
        newBufSize <<= 1;
        /* newBufSize is unsigned, so running out of bits shows up as the
         * value wrapping around to zero. */
        if (newBufSize == 0)
            errAbort("expandFaFastBuf: integer overflow when trying to "
                     "increase buffer size from %u to a min of %d.",
                     faFastBufSize, minExp);
        }
    newBuf = needHugeMem(newBufSize);
    memcpy(newBuf, faFastBuf, bufPos);
    freeMem(faFastBuf);
    faFastBuf = newBuf;
    faFastBufSize = newBufSize;
    }
}
Ejemplo n.º 2
0
void agpToFaOne(struct agpFrag **pAgpList, char *agpFile, char *agpSeq,
		char *seqDir, int lastPos, FILE *f)
/* Given one sequence's worth of AGP in pAgpList, process it into FASTA
 * and write to f.
 *   pAgpList - list of fragments; it is reversed in place and freed
 *              before returning.
 *   agpFile  - AGP file name, used only in the error message.
 *   agpSeq   - name of the sequence; written as the FASTA record name.
 *   seqDir   - passed through to the fill-in routine.
 *   lastPos  - number of bases in the assembled sequence; zero means the
 *              sequence was not found and aborts.
 * The fill-in routine is chosen by the command line options simpleMulti,
 * simpleMultiMixed and simple (checked in that order). */
{
DNA *dna = NULL;

slReverse(pAgpList);
if (lastPos == 0)
    errAbort("%s not found in %s\n", agpSeq, agpFile);

/* Start from a buffer of all 'n' so anything not filled in stays 'n'. */
dna = needHugeMem(lastPos+1);
memset(dna, 'n', lastPos);
dna[lastPos] = 0;

if (optionExists("simpleMulti"))
    {
    simpleMultiFillInSequence(0, seqDir, *pAgpList, dna, lastPos);
    }
else if (optionExists("simpleMultiMixed"))
    {
    simpleMultiFillInSequence(1, seqDir, *pAgpList, dna, lastPos);
    }
else if (optionExists("simple"))
    {
    simpleFillInSequence(seqDir, *pAgpList, dna, lastPos);
    }
else
    {
    gsFillInSequence(seqDir, *pAgpList, dna, lastPos);
    }
verbose(2,"Writing %s (%d bases)\n", agpSeq, lastPos);
faWriteNext(f, agpSeq, dna, lastPos);

/* The sequence has been written out; release the buffer so that
 * processing many sequences does not accumulate one buffer per call. */
freeMem(dna);
agpFragFreeList(pAgpList);
}
Ejemplo n.º 3
0
struct hash *qacReadToHash(char *fileName)
/* Load every record of a qac file into a hash keyed by record name and
 * return that hash.  Each record is a name string followed by the
 * uncompressed size, the compressed size and then the compressed bytes.
 * The two sizes are byte swapped when qacOpenVerify reports the file as
 * swapped.  Prints a one line summary to stdout when done. */
{
boolean isSwapped;
FILE *f = qacOpenVerify(fileName, &isSwapped);
struct hash *hash = newHash(18);
int count = 0;
char *name;

/* readString returning NULL marks the end of the records. */
while ((name = readString(f)) != NULL)
    {
    bits32 uncSize, compSize;
    struct qac *qac;

    mustReadOne(f, uncSize);
    if (isSwapped)
        uncSize = byteSwap32(uncSize);
    mustReadOne(f, compSize);
    if (isSwapped)
        compSize = byteSwap32(compSize);

    /* Allocate the structure and its compSize data bytes as one block. */
    qac = needHugeMem(sizeof(*qac) + compSize - 1);
    qac->uncSize = uncSize;
    qac->compSize = compSize;
    mustRead(f, qac->data, compSize);

    hashAdd(hash, name, qac);
    count += 1;
    }
carefulClose(&f);
printf("Read %d qacs from %s\n", count, fileName);
return hash;
}
Ejemplo n.º 4
0
void *needHugeZeroedMem(size_t size)
/* Get a large block of memory from needHugeMem and hand it back with all
 * size bytes set to zero. */
{
void *block = needHugeMem(size);
/* memset returns its first argument, i.e. the block itself. */
return memset(block, 0, size);
}
static struct bed *randomTrial(struct chrGapList *bounding, struct bed *placed)
/*	placed bed list has already been sorted by size descending,
 *	return is the newly placed bed list.
 *	For each placed item, the gaps in bounding that can hold it are
 *	collected by gapsOfSize, one of them is picked at random and the
 *	item is inserted there with randomInsert.  Aborts on an item shorter
 *	than 1 bp or when no usable gap index can be drawn (this includes
 *	gapsOfSize returning zero).
 */
{
struct bed *bedList = NULL;
struct bed *bedEl;
int placedCount = slCount(placed);
int gapCount = countGaps(bounding);
struct gap **sizedGaps = NULL;	/*	an array of pointers	*/
int maxGapCount = 0;

/*	We should never have more gaps than the initial set of gaps plus
 *	the placed item count since each placed item only creates one
 *	new gap.  This array will be used repeatedly as lists of gaps of
 *	specific sizes are created.  The array will be an array of
 *	pointers to the gaps greater than the specified size.
 *	The + 1 on the maxGapCount is to keep the array one larger than
 *	expected maximum so that a safety check can be performed that it
 *	never reaches past the expected maximum.
 */
maxGapCount = placedCount + gapCount + 1;
sizedGaps = needHugeMem((size_t)(sizeof(struct gap *) * maxGapCount));

for (bedEl = placed; bedEl != NULL; bedEl = bedEl->next)
    {
    struct bed *newBed;
    int N;
    int R;
    int itemSize = bedEl->chromEnd - bedEl->chromStart;
    if (itemSize < 1)
	errAbort("ERROR: placing items less than 1 bp in length ? %s:%d-%d",
	    bedEl->chrom, bedEl->chromStart, bedEl->chromEnd);
    N = gapsOfSize(bounding,itemSize, sizedGaps, maxGapCount);
    /*	From those N gaps, randomly select one of them	(drand48 = [0.0,1.0)*/
    R = floor(N * drand48());	/*	interval: [0,N) == [0,N-1]	*/
    /*	With N == 0 the draw is R == 0, which also lands here.	*/
    if ((R >= N) || (R >= maxGapCount))
	errAbort("ERROR: did not expect random "
	    "number %d to be >= %d (or %d)\n", R, N, maxGapCount);
    /*	The newBed is the bedEl translated to a new random location */
    newBed = randomInsert(bedEl,sizedGaps[R]);
    slAddHead(&bedList,newBed);
    }
/*	sizedGaps are just a bunch of pointers, the bed element inserts
 *	actually went into the bounding gap list which is going to be
 *	freed up, along with the specially added bed elements back in
 *	the loop that is managing the copying of the bounding list.
 */
freeMem(sizedGaps);
return(bedList);
}
Ejemplo n.º 6
0
struct dnaSeq *cloneDnaSeq(struct dnaSeq *orig)
/* Return a copy of orig that owns its own name string, its own dna
 * buffer (size+1 bytes are copied) and, when orig has a mask, its own
 * clone of that mask. */
{
struct dnaSeq *copy = CloneVar(orig);

/* CloneVar copied the pointers only; replace each with a private copy. */
copy->name = cloneString(orig->name);
copy->dna = needHugeMem(copy->size+1);
memcpy(copy->dna, orig->dna, copy->size+1);
if (orig->mask == NULL)
    copy->mask = NULL;
else
    copy->mask = bitClone(orig->mask, copy->size);
return copy;
}
Ejemplo n.º 7
0
struct dnaSeq *faReadSeq(char *fileName, boolean isDna)
/* Open fa file and read a single sequence from it.
 * The whole file is loaded into one buffer, zero terminated, and handed
 * to faSeqFromMemText together with isDna.  Aborts when the file size
 * cannot be determined, the file cannot be opened, or reading fails. */
{
int maxSize = fileSize(fileName);
int fd;
int bytesRead = 0;
DNA *s;

if (maxSize < 0)
    errAbort("can't open %s", fileName);
s = needHugeMem(maxSize+1);
fd = open(fileName, O_RDONLY);
if (fd < 0)
    errAbort("can't open %s", fileName);

/* read() is allowed to deliver fewer bytes than requested, so keep
 * asking until the expected amount is in or end of file shows up. */
while (bytesRead < maxSize)
    {
    ssize_t got = read(fd, s + bytesRead, maxSize - bytesRead);
    if (got < 0)
        errAbort("can't read %s", fileName);
    if (got == 0)
        break;
    bytesRead += got;
    }
close(fd);

/* Terminate after what was really read, so a file that shrank since it
 * was measured does not leave uninitialized bytes in the text. */
s[bytesRead] = 0;
return faSeqFromMemText(s, isDna);
}
Ejemplo n.º 8
0
boolean bigWigValsOnChromFetchData(struct bigWigValsOnChrom *chromVals, char *chrom, 
	struct bbiFile *bigWig)
/* Fetch data for chromosome from bigWig. Returns FALSE if not data on that chrom.
 *   chromVals - receives a copy of the chromosome name, the chromosome
 *               size, and the per-base value and coverage buffers.  The
 *               buffers are reallocated only when the chromosome is
 *               larger than the current bufSize.
 *   chrom     - chromosome to look up in bigWig.
 *   bigWig    - open file the size and values come from. */
{
/* Fetch chromosome and size into self. */
freeMem(chromVals->chrom);
chromVals->chrom = cloneString(chrom);
long chromSize = chromVals->chromSize = bbiChromSize(bigWig, chrom);

if (chromSize <= 0)
    return FALSE;

/* Make sure buffers are big enough. */
if (chromSize > chromVals->bufSize)
    {
    freeMem(chromVals->valBuf);
    freeMem(chromVals->covBuf);
    chromVals->valBuf = needHugeMem((sizeof(double))*chromSize);
    chromVals->covBuf = bitAlloc(chromSize);
    chromVals->bufSize = chromSize;
    }

/* Zero out buffers.  The index has the same type as chromSize so the
 * comparison cannot overflow the index for very large chromosomes. */
bitClear(chromVals->covBuf, chromSize);
double *valBuf = chromVals->valBuf;
long i;
for (i=0; i<chromSize; ++i)
    valBuf[i] = 0.0;

fetchIntoBuf(bigWig, chrom, 0, chromSize, chromVals);

#ifdef OLD
/* Fetch intervals for this chromosome and fold into buffers. */
struct lm *lm = lmInit(0);
struct bbiInterval *iv, *ivList = bigWigIntervalQuery(bigWig, chrom, 0, chromSize, lm);
for (iv = ivList; iv != NULL; iv = iv->next)
    {
    double val = iv->val;
    int end = iv->end;
    for (i=iv->start; i<end; ++i)
	valBuf[i] = val;
    bitSetRange(chromVals->covBuf, iv->start, iv->end - iv->start);
    }
lmCleanup(&lm);
#endif /* OLD */
return TRUE;
}
static void measurePlaced(struct bed *placed)
/* Report, at verbose level 2, the first size, last size and middle size
 * of the placed items: once in list order and once more after sorting
 * the sizes with intSort.  Does nothing for an empty list. */
{
struct bed *bedEl;
int placedCount = slCount(placed);
int *sizeArray = NULL;
int i = 0;

/* Without any items there is no first, last or middle element to show;
 * indexing the array below would read outside of it. */
if (placedCount < 1)
    return;

sizeArray = needHugeMem((size_t)(sizeof(int) * placedCount));
for (bedEl = placed; bedEl != NULL; bedEl = bedEl->next)
    {
    sizeArray[i++] = bedEl->chromEnd - bedEl->chromStart;
    }
/* As found on the list. */
verbose(2,"placed item size range: [%d : %d]\n", sizeArray[0], sizeArray[i-1]);
verbose(2,"placed item median: %d\n", sizeArray[i/2]);
/* And again once sorted. */
intSort(i,sizeArray);
verbose(2,"placed item size range: [%d : %d]\n", sizeArray[0], sizeArray[i-1]);
verbose(2,"placed item median: %d\n", sizeArray[i/2]);
freeMem(sizeArray);
}
Ejemplo n.º 10
0
struct floatPic *floatPicNew(int width, int height)
/* Build a floatPic of the given dimensions.  The image holds three
 * floats per pixel, and lines[y] points at the first float of row y
 * inside that image. */
{
long floatsPerLine = 3L * width;
long totalFloats = floatsPerLine * height;
struct floatPic *pic = needMem(sizeof(struct floatPic));
int y;

pic->width = width;
pic->height = height;
pic->image = needHugeMem(totalFloats * sizeof(float));

/* Set up the table of row start pointers. */
AllocArray(pic->lines, height);
for (y = 0; y < height; ++y)
    pic->lines[y] = pic->image + floatsPerLine * y;
return pic;
}
void itsaMake(int inCount, char *inputs[], char *output)
/* itsaMake - Make a suffix array file out of input DNA sequences.
 *   inCount - number of entries in inputs.
 *   inputs  - names of the DNA sources, each opened with dnaLoadOpen.
 *   output  - file name passed on to itsaWriteMerged.
 * Aborts when two input sequences share a name, or when the total DNA
 * (one extra byte per sequence plus one leading byte) exceeds 4 Gb. */
{
verboseTimeInit();
bits64 maxGenomeSize = 1024LL*1024*1024*4;

itsaBaseToValInit();

/* Load all DNA, make sure names are unique, and alphabetize by name. */
struct dnaSeq *seqList = NULL, *seq;
struct hash *uniqSeqHash = hashNew(0);
bits64 totalDnaSize = 1;	/* For space between. */
int inputIx;
for (inputIx=0; inputIx<inCount; ++inputIx)
    {
    char * input = inputs[inputIx];
    struct dnaLoad *dl = dnaLoadOpen(input);
    while ((seq = dnaLoadNext(dl)) != NULL)
	{
	verbose(1, "read %s with %d bases\n", seq->name, seq->size);
	if (hashLookup(uniqSeqHash, seq->name))
	    errAbort("Input sequence name %s repeated, all must be unique.", seq->name);
	/* Record the name; without this the lookup above never finds anything. */
	hashAdd(uniqSeqHash, seq->name, NULL);
	totalDnaSize +=  seq->size + 1;
	if (totalDnaSize > maxGenomeSize)
	    errAbort("Too much DNA. Can only handle up to %lld bases", maxGenomeSize);
	slAddHead(&seqList, seq);
	}
    dnaLoadClose(&dl);
    }
slSort(&seqList, dnaSeqCmpName);
verboseTime(1, "Loaded %lld bases in %d sequences", totalDnaSize, slCount(seqList));

/* Allocate big buffer for all DNA. */
DNA *allDna = globalAllDna = needHugeMem(totalDnaSize);
allDna[0] = 0;
bits64 chromOffset = 1;	/* Have zeroes between each chrom, and before and after. */

/* Copy DNA to a single big buffer, and create chromInfo on each sequence. */
struct chromInfo *chrom, *chromList = NULL;
for (seq = seqList; seq != NULL; seq = seq->next)
    {
    AllocVar(chrom);
    chrom->name = cloneString(seq->name);
    chrom->size = seq->size;
    chrom->offset = chromOffset;
    slAddHead(&chromList, chrom);
    toUpperN(seq->dna, seq->size);
    /* The + 1 brings along the byte after the sequence as separator. */
    memcpy(allDna + chromOffset, seq->dna, seq->size + 1);
    chromOffset += seq->size + 1;
    }
slReverse(&chromList);

/* Free up separate dna sequences because we're going to need a lot of RAM soon. */
dnaSeqFreeList(&seqList);

/* Allocate index array, and offset and list arrays. */
bits32 *index13;
AllocArray(index13, itsaSlotCount);
bits32 *offsetArray = needHugeMem(totalDnaSize * sizeof(bits32));
bits32 *listArray = needHugeZeroedMem(totalDnaSize * sizeof(bits32));
verboseTime(1, "Allocated buffers %lld bytes total", 
	(long long)(9LL*totalDnaSize + itsaSlotCount*sizeof(bits32)));

/* Where normally we'd keep some sort of structure with a next element to form a list
 * of matching positions in each slot of our index,  to conserve memory we'll do this
 * with two parallel arrays.  Because we're such cheapskates in terms of memory
 * (and still using 9*genomeSize bytes of RAM) we'll use these arrays for two different
 * purposes.   
 *     In the first phase they will together be used to form linked lists of
 * offsets, and the 13mer index will point to the first item in each list.  In this
 * phase the offsetArray contains offsets into the allDna structure, and the listArray
 * contains the next pointers for the list.  After the first phase we write out the
 * suffix array to disk.
 *     In the second phase we read the suffix array back into the offsetArray, and
 * use the listArray for the traverseArray.  We write out the traverse array to finish
 * things up. */


/* Run the first indexing pass over each chromosome. */
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    verbose(2, "  About to do first pass index\n");
    indexChromPass1(chrom, allDna, offsetArray, listArray, index13);
    verbose(2, "  Done first pass index\n");
    }
verboseTime(1, "Done big bucket sort");
/* NOTE(review): chromList was already put into allDna order above, so this
 * second reversal hands the chromosomes to itsaWriteMerged in the opposite
 * order to their layout in allDna.  Confirm that is what readers expect. */
slReverse(&chromList);
itsaWriteMerged(chromList, allDna, offsetArray, listArray, index13, output);
}
static void itsaWriteMerged(struct chromInfo *chromList, DNA *allDna,
	bits32 *offsetArray, bits32 *listArray, bits32 *index13, char *output)
/* Write the complete index file to output.  As a side effect will replace
 * offsetArray with suffix array and listArray with traverse array.
 *   chromList   - chromosomes whose names and sizes go into the file.
 *   allDna      - all sequence in one buffer; written as is.
 *   offsetArray - reused as the suffix array once the slots are written.
 *   listArray   - reused as the traverse array.
 *   index13     - per slot index; rewritten in place to hold array
 *                 position + 1, or 0 for an empty slot.
 *   output      - name of the file to create.
 * File layout, in write order: header, chromosome names (padded),
 * chromosome sizes, DNA (padded), suffix array, traverse array, index13,
 * one cursor byte per slot.  The header is written first without the
 * magic number and rewritten with it at the very end. */
{
FILE *f = mustOpen(output, "w+");

/** Allocate header and fill out easy constant fields. */
struct itsaFileHeader *header;
AllocVar(header);
header->majorVersion = ITSA_MAJOR_VERSION;
header->minorVersion = ITSA_MINOR_VERSION;

/* Figure out sizes of names and sequence for each chromosome. */
struct chromInfo *chrom;
bits32 chromNamesSize = 0;
bits64 dnaDiskSize = 1;	/* For initial zero. */
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
   {
   chromNamesSize += strlen(chrom->name) + 1;
   dnaDiskSize += chrom->size + 1;  /* Include separating zeroes. */
   }

/* Fill in  most of rest of header fields */
header->chromCount = slCount(chromList);
header->chromNamesSize = roundUpTo4(chromNamesSize);
header->dnaDiskSize = roundUpTo4(dnaDiskSize);
bits32 chromSizesSize = header->chromCount*sizeof(bits32);

/* Write header.  The magic field has not been set yet; it is filled in
 * and the header rewritten once everything else is on disk. */
mustWrite(f, header, sizeof(*header));

/* Write chromosome names, each with its terminating zero, then pad up to
 * the rounded size stored in the header. */
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    mustWrite(f, chrom->name, strlen(chrom->name)+1);
zeroPad(f, header->chromNamesSize - chromNamesSize);

/* Write chromosome sizes.  chromSizesSize was computed above as
 * chromCount*sizeof(bits32), so the pad below works out to zero. */
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    mustWrite(f, &chrom->size, sizeof(chrom->size));
int chromSizesSizePad = chromSizesSize - header->chromCount * sizeof(bits32);
zeroPad(f, chromSizesSizePad);

/* Write out chromosome DNA and zeros before, between, and after. */
mustWrite(f, allDna, dnaDiskSize);
zeroPad(f, header->dnaDiskSize - dnaDiskSize);
verboseTime(1, "Wrote %lld bases of DNA including zero padding", header->dnaDiskSize);

/* Calculate and write suffix array. Convert index13 to index of array as opposed to index
 * of sequence.  The file position is remembered so the array can be read
 * back from here further down. */
bits64 arraySize = 0;
off_t suffixArrayFileOffset = ftello(f);
int slotCount = itsaSlotCount;
int slotIx;
for (slotIx=0; slotIx < slotCount; ++slotIx)
    {
    int slotSize = finishAndWriteOneSlot(offsetArray, listArray, index13[slotIx], allDna, f);
    /* Convert index13 to hold the position in the suffix array where the first thing matching
     * the corresponding 13-base prefix is found. */
    if (slotSize != 0)
        index13[slotIx] = arraySize+1;	/* The +1 is so we can keep 0 for not found. */
    else
        index13[slotIx] = 0;
    arraySize += slotSize;
    /* Progress output: a dot every 200000 slots, a line every 10000000. */
    if ((slotIx % 200000 == 0) && slotIx != 0)
	{
	verboseDot();
	if (slotIx % 10000000 == 0)
	    verbose(1, "fine sort bucket %d of %d\n", slotIx, slotCount);
	}
    }
verbose(1, "fine sort bucket %d of %d\n", slotCount, slotCount);
verboseTime(1, "Wrote %lld suffix array positions", arraySize);

/* Now we're done with the offsetArray and listArray buffers, so use them for the
 * next phase. */
bits32 *suffixArray = offsetArray;
offsetArray = NULL;	/* Help make some errors more obvious */
bits32 *traverseArray = listArray;
listArray = NULL;	/* Help make some errors more obvious */

/* Read the suffix array back from the file. */
fseeko(f, suffixArrayFileOffset, SEEK_SET);
mustRead(f, suffixArray, arraySize*sizeof(bits32));
verboseTime(1, "Read suffix array back in");

/* Calculate traverse array and cursor arrays.  The cursor array gets one
 * byte per suffix array entry and is allocated here. */
memset(traverseArray, 0, arraySize*sizeof(bits32));
UBYTE *cursorArray = needHugeMem(arraySize);
itsaFillInTraverseArray(allDna, suffixArray, arraySize, traverseArray, cursorArray);
verboseTime(1, "Filled in traverseArray");

/* Write out traverse array. */
mustWrite(f, traverseArray, arraySize*sizeof(bits32));
verboseTime(1, "Wrote out traverseArray");

/* Write out 13-mer index. */
mustWrite(f, index13, itsaSlotCount*sizeof(bits32));
verboseTime(1, "Wrote out index13");

/* Write out bits of cursor array corresponding to index: one byte per
 * slot, zero for an empty slot.  index13 holds position + 1, hence the
 * - 1 when looking up the cursor. */
for (slotIx=0; slotIx<itsaSlotCount; ++slotIx)
    {
    bits32 indexPos = index13[slotIx];
    if (indexPos == 0)
       fputc(0, f);
    else
       fputc(cursorArray[indexPos-1], f);
    }
verboseTime(1, "Wrote out cursors13");

/* Update a few fields in header, and go back and write it out again with
 * the correct magic number to indicate it's complete.
 * NOTE(review): the '+' ending the chromosome names line is followed by a
 * '+' starting the next line; the second one parses as a unary plus, so
 * the sum is unaffected. */
header->magic = ITSA_MAGIC;
header->arraySize = arraySize;
header->size = sizeof(*header) 			// header
	+ header->chromNamesSize + 		// chromosome names
	+ header->chromCount * sizeof(bits32)	// chromosome sizes
	+ header->dnaDiskSize 			// dna sequence
	+ sizeof(bits32) * arraySize	 	// suffix array
	+ sizeof(bits32) * arraySize   		// traverse array
	+ sizeof(bits32) * itsaSlotCount 	// index13
	+ sizeof(UBYTE) * itsaSlotCount;	// cursors13

rewind(f);
mustWrite(f, header, sizeof(*header));
carefulClose(&f);
verbose(1, "Completed %s is %lld bytes\n", output, header->size);
}
Ejemplo n.º 13
0
struct sufa *sufaRead(char *fileName, boolean memoryMap)
/* Read in a sufa from a file.  Does this via memory mapping if you like,
 * which will be faster typically for about 100 reads, and slower for more
 * than that (_much_ slower for thousands of reads and more).
 *   fileName  - sufa file to load.
 *   memoryMap - TRUE to mmap the file, FALSE to allocate and read it.
 * Returns a freshly allocated sufa whose pointers lead into the loaded
 * image.  Aborts if the file cannot be opened or fully read, does not
 * carry the sufa magic number, or has a newer major version. */
{
/* Open file (low level), read in header, and check it. */
int fd = open(fileName, O_RDONLY);
if (fd < 0)
    errnoAbort("Can't open %s", fileName);
struct sufaFileHeader h;
/* Compare as signed: a -1 from read() must not be converted to a huge
 * unsigned number and slip past the check. */
if (read(fd, &h, sizeof(h)) != (ssize_t)sizeof(h))
    errnoAbort("Couldn't read header of file %s", fileName);
if (h.magic != SUFA_MAGIC)
    errAbort("%s does not seem to be a sufa file.", fileName);
if (h.majorVersion > SUFA_MAJOR_VERSION)
    errAbort("%s is a newer, incompatible version of sufa format. "
             "This program works on version %d and below. "
	     "%s is version %d.",  fileName, SUFA_MAJOR_VERSION, fileName, h.majorVersion);

struct sufa *sufa;
verbose(2, "sufa file %s size %lld\n", fileName, h.size);

/* Get a pointer to data in memory, via memory map, or allocation and read. */
struct sufaFileHeader *header ;
if (memoryMap)
    {
#ifdef MACHTYPE_sparc
    header = (struct sufaFileHeader *)mmap(NULL, h.size, PROT_READ, MAP_SHARED, fd, 0);
#else
    header = mmap(NULL, h.size, PROT_READ, MAP_FILE|MAP_SHARED, fd, 0);
#endif
    if (header == (void*)(-1))
	errnoAbort("Couldn't mmap %s, sorry", fileName);
    }
else
    {
    header = needHugeMem(h.size);
    if (lseek(fd, 0, SEEK_SET) < 0)
	errnoAbort("Couldn't seek back to start of sufa file %s.  "
		   "Splix files must be random access files, not pipes and the like"
		   , fileName);
    /* A single read() may hand back less than was asked for, so collect
     * the file piece by piece until all h.size bytes are in. */
    char *dest = (char *)header;
    bits64 remaining = h.size;
    while (remaining > 0)
        {
        ssize_t got = read(fd, dest, remaining);
        if (got <= 0)
            errnoAbort("Couldn't read all of sufa file %s.", fileName);
        dest += got;
        remaining -= got;
        }
    }
/* The descriptor is not stored anywhere; an established mapping stays
 * valid after the descriptor is closed. */
close(fd);

/* Allocate wrapper structure and fill it in. */
AllocVar(sufa);
sufa->header = header;
sufa->isMapped = memoryMap;

/* Make an array for easy access to chromosome names.  The names sit
 * right behind the header as consecutive zero terminated strings. */
int chromCount = header->chromCount;
char **chromNames = AllocArray(sufa->chromNames, chromCount);
char *s = pointerOffset(header, sizeof(*header) );
int i;
for (i=0; i<chromCount; ++i)
    {
    chromNames[i] = s;
    s += strlen(s)+1;
    }

/* Keep track of where we are in memmap. */
bits64 mapOffset = sizeof(*header) + header->chromNamesSize;

/* Point into chromSizes array. */
bits32 *chromSizes = sufa->chromSizes 
	= pointerOffset(header, mapOffset);
mapOffset += sizeof(bits32) * chromCount;

verbose(2, "total dna size %lld in %d chromosomes\n", (long long)header->dnaDiskSize, header->chromCount);
sufa->allDna = pointerOffset(header, mapOffset);
mapOffset += header->dnaDiskSize;

/* Calculate chromOffset array: running total of size + 1 per chromosome. */
bits32 offset = 0;
bits32 *chromOffsets = AllocArray(sufa->chromOffsets, chromCount);
for (i=0; i<chromCount; ++i)
    {
    chromOffsets[i] = offset;
    offset += chromSizes[i] + 1;
    verbose(2, "sufa contains %s,  %d bases, %d offset\n", 
    	sufa->chromNames[i], (int)sufa->chromSizes[i], (int)chromOffsets[i]);
    }

/* Finally point to the suffix array!. */
sufa->array = pointerOffset(header, mapOffset);
mapOffset += header->arraySize * sizeof(bits32);


assert(mapOffset == header->size);	/* Sanity check */
return sufa;
}
Ejemplo n.º 14
0
void axtHiQualDiffs(char *axtFile, struct hash *qacHash, FILE *f)
/* Write out high quality diffs in axtFile to f.
 *   axtFile - alignments to scan.
 *   qacHash - compressed quality scores keyed by query name; the entry
 *             for each query must exist.
 *   f       - output, one tab separated line per reported difference.
 * A mismatch column is reported when all of these hold:
 *   - the query window of winSize bases centered on it lies inside the
 *     query and the base itself has quality >= diffQualMin,
 *   - every base in that window has quality >= winQualMin,
 *   - the alignment window of winSize columns centered on it lies inside
 *     the alignment, has no gap on either side and has at most
 *     winMaxDiff mismatches,
 *   - with option ignore98, the base's quality is not 98.
 * With option chimpPos the query name and position are added to the line. */
{
char *qName = cloneString("");
UBYTE *qQuals = NULL;
struct qac *qac = NULL;
struct axt *axt = NULL;
struct lineFile *lf = lineFileOpen(axtFile, TRUE);
int qStart, qDir, qPos, qWinStart, qWinEnd, tPos;
int qWinSize     = optionInt("winSize",     11);
int qQualMin     = optionInt("diffQualMin", 30);
int qWinQualMin  = optionInt("winQualMin",  25);
int qWinMaxDiff  = optionInt("winMaxDiff",  2);
/* NOTE(review): indelOk is read here but nothing below looks at it;
 * a gap in the window always rejects the difference.  Confirm intent. */
boolean qIndelOk = optionExists("indelOk");
boolean qIgnore98 = optionExists("ignore98");
boolean chimpPos = optionExists("chimpPos");
int qHalfWinSize = qWinSize/2;

while ((axt = axtRead(lf)) != NULL)
    {
    char *qSym = axt->qSym, *tSym = axt->tSym;
    int symIx, symCount = axt->symCount;
    char qc,tc;
    toUpperN(qSym, symCount);
    toUpperN(tSym, symCount);

    /* Uncompress quality scores only when the query changes. */
    if (!sameString(axt->qName, qName))
        {
        freez(&qName);
        qName = cloneString(axt->qName);
        qac = hashMustFindVal(qacHash, qName);
        freez(&qQuals);
        qQuals = needHugeMem(qac->uncSize);
        rleUncompress(qac->data, qac->compSize, qQuals, qac->uncSize);
        }

    /* On the minus strand walk the quality array backwards. */
    if (axt->qStrand == '+')
        {
        qStart = axt->qStart;
        qDir = 1;
        }
    else
        {
        qStart = qac->uncSize - axt->qStart - 1;
        qDir = -1;
        }
    qPos = qStart;
    tPos = axt->tStart;
    for (symIx = 0; symIx < symCount; ++symIx)
        {
        qc = qSym[symIx];
        tc = tSym[symIx];
        if (qc == '-')
            tPos += 1;
        else if (tc == '-')
            qPos += qDir;
        else 
            {
            if (qc != tc)
                {
                qWinStart = qPos - qHalfWinSize;
                qWinEnd = qWinStart + qWinSize;
                if (qWinStart >= 0 && qWinEnd < qac->uncSize)
                    {
                    if (qQuals[qPos] >= qQualMin)
                        {
                        int i;
                        boolean ok = TRUE;
                        for (i = qWinStart; i<qWinEnd; ++i)
                            if (qQuals[i] < qWinQualMin)
                                {
                                ok = FALSE;
                                break;
                                }
                        if (ok)
                            {
                            int diffCount = 0;
                            int symWinStart = symIx - qHalfWinSize;
                            int symWinEnd = symWinStart + qWinSize;
                            /* Near either end of the alignment the window
                             * would reach outside qSym/tSym.  There is no
                             * alignment to judge there, so such a
                             * difference is not reported. */
                            if (symWinStart < 0 || symWinEnd > symCount)
                                ok = FALSE;
                            for (i=symWinStart; ok && i < symWinEnd; ++i)
                                {
                                qc = qSym[i];
                                tc = tSym[i];
                                if (qc == '-' || tc == '-')
                                    {
                                    ok = FALSE;
                                    break;
                                    }
                                if (qc != tc)
                                    ++diffCount;
                                }
                            if (ok && diffCount <= qWinMaxDiff && (!qIgnore98 || qQuals[qPos] != 98) )
                                {
                                if (chimpPos)
                                    fprintf(f, "%s\t%d\t%d\t%c\t%c\t%s\t%d\t%d\n",
                                            axt->tName, tPos, tPos+1, tSym[symIx], qSym[symIx], axt->qName, qPos, qPos+1);
                                else
                                    fprintf(f, "%s\t%d\t%d\t%c\t%c\n",
                                            axt->tName, tPos, tPos+1, tSym[symIx], qSym[symIx]);
                                }
                            }
                        }
                    }
                }
            qPos += qDir;
            tPos += 1;
            }
        }
    axtFree(&axt);
    }
lineFileClose(&lf);

/* Release the name and quality buffer kept for the last query. */
freez(&qName);
freez(&qQuals);
}
Ejemplo n.º 15
0
void bigWigMerge(int inCount, char *inFiles[], char *outFile)
/* bigWigMerge - Merge together multiple bigWigs into a single one.
 *   inCount - number of entries in inFiles.
 *   inFiles - bigWig files, or when clInList is set, files that are
 *             handed to addWigsInFile.
 *   outFile - receives the result as bedGraph lines.
 * For every chromosome the values of all inputs are added up base by
 * base, each value first being limited to clClip.  Runs of equal sums
 * are written as one line with clAdjust added, provided the adjusted
 * value is above clThreshold. */
{
/* Make a list of open bigWig files. */
struct bbiFile *inFile, *inFileList = NULL;
int i;
for (i=0; i<inCount; ++i)
    {
    if (clInList)
        {
        addWigsInFile(inFiles[i], &inFileList);
        }
    else
        {
        inFile = bigWigFileOpen(inFiles[i]);
        slAddTail(&inFileList, inFile);
        }
    }

FILE *f = mustOpen(outFile, "w");

struct bbiChromInfo *chrom, *chromList = getAllChroms(inFileList);
verbose(1, "Got %d chromosomes from %d bigWigs\nProcessing", 
        slCount(chromList), slCount(inFileList));
double *mergeBuf = NULL;
int mergeBufSize = 0;
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    struct lm *lm = lmInit(0);

    /* Make sure merge buffer is big enough.  It only ever grows, so one
     * buffer serves all chromosomes. */
    int chromSize = chrom->size;
    verboseDot();
    verbose(2, "Processing %s (%d bases)\n", chrom->name, chromSize);
    if (chromSize > mergeBufSize)
        {
        mergeBufSize = chromSize;
        freeMem(mergeBuf);
        mergeBuf = needHugeMem(mergeBufSize * sizeof(double));
        }
    for (i=0; i<chromSize; ++i)
        mergeBuf[i] = 0.0;

    /* Loop through each input file grabbing data and merging it in. */
    for (inFile = inFileList; inFile != NULL; inFile = inFile->next)
        {
        struct bbiInterval *ivList = bigWigIntervalQuery(inFile, chrom->name, 0, chromSize, lm);
        verbose(3, "Got %d intervals in %s\n", slCount(ivList), inFile->fileName);
        struct bbiInterval *iv;
        for (iv = ivList; iv != NULL; iv = iv->next)
            {
            double val = iv->val;
            if (val > clClip)
                val = clClip;
            int end = iv->end;
            for (i=iv->start; i < end; ++i)
                 mergeBuf[i] += val;
            }
        }


    /* Output each range of same values as a bedGraph item */
    int sameCount;
    for (i=0; i<chromSize; i += sameCount)
        {
        sameCount = doublesTheSame(mergeBuf+i, chromSize-i);
        double val = mergeBuf[i] + clAdjust;
        if (val > clThreshold)
            fprintf(f, "%s\t%d\t%d\t%g\n", chrom->name, i, i + sameCount, val);
        }

    lmCleanup(&lm);
    }
verbose(1, "\n");

carefulClose(&f);
/* Done with the shared merge buffer. */
freeMem(mergeBuf);
}
Ejemplo n.º 16
0
boolean faReadMixedNext(FILE *f, boolean preserveCase, char *defaultName, 
    boolean mustStartWithComment, char **retCommentLine, struct dnaSeq **retSeq)
/* Read next sequence from .fa file. Return sequence in retSeq.  
 * If retCommentLine is non-null return the '>' line in retCommentLine.
 * The whole thing returns FALSE at end of file. 
 * Contains parameter to preserve mixed case.
 *   f                    - file positioned at or before the next record.
 *   preserveCase         - TRUE keeps letters as they are in the file,
 *                          FALSE maps them through ntChars.  Letters that
 *                          ntChars does not know become 'N' or 'n'.
 *   defaultName          - name used when no '>' line is found; NULL
 *                          stands for the empty string.
 *   mustStartWithComment - TRUE skips lines until one starts with '>';
 *                          FALSE accepts sequence that starts right away.
 *   retCommentLine       - gets a copy of the '>' line, cut off at the
 *                          size of the internal line buffer.
 * The sequence is scanned twice: once to count letters so the buffer can
 * be sized, then again after seeking back to fill it. */
{
char lineBuf[1024];
int lineSize;
char *words[1];
int c;
off_t offset = ftello(f);
size_t dnaSize = 0;
DNA *dna, *sequence;
char *name = defaultName;

if (name == NULL)
    name = "";
dnaUtilOpen();
if (retCommentLine != NULL)
    *retCommentLine = NULL;
*retSeq = NULL;

/* Skip first lines until it starts with '>' */
for (;;)
    {
    if(fgets(lineBuf, sizeof(lineBuf), f) == NULL)
        {
        if (ferror(f))
            errnoAbort("read of fasta file failed");
        return FALSE;
        }
    lineSize = strlen(lineBuf);
    if (lineBuf[0] == '>')
        {
        if (retCommentLine != NULL)
            *retCommentLine = cloneString(lineBuf);
        /* fgets stops when lineBuf is full, so a long header arrives
         * without its newline.  Consume the remainder of the line here,
         * otherwise its letters would be taken for sequence below. */
        if (lineSize > 0 && lineBuf[lineSize-1] != '\n')
            {
            while ((c = fgetc(f)) != EOF && c != '\n')
                ;
            }
        /* Sequence data begins here; the fill pass seeks back to it. */
        offset = ftello(f);
        chopByWhite(lineBuf, words, ArraySize(words));
        name = words[0]+1;
        break;
        }
    else if (!mustStartWithComment)
        {
        /* No header: what was just read is sequence, so go back to it. */
        if (fseeko(f, offset, SEEK_SET) < 0)
            errnoAbort("fseek on fasta file failed");
        break;
        }
    else
        offset += lineSize;
    }
/* Count up DNA. */
for (;;)
    {
    c = fgetc(f);
    if (c == EOF || c == '>')
        break;
    if (isalpha(c))
        {
        ++dnaSize;
        }
    }

if (dnaSize == 0)
    {
    warn("Invalid fasta format: sequence size == 0 for element %s",name);
    }

/* Allocate DNA and fill it up from file. */
dna = sequence = needHugeMem(dnaSize+1);
if (fseeko(f, offset, SEEK_SET) < 0)
    errnoAbort("fseek on fasta file failed");
for (;;)
    {
    c = fgetc(f);
    if (c == EOF || c == '>')
        break;
    if (isalpha(c))
        {
        /* check for non-DNA char */
        if (ntChars[c] == 0)
            {
            *dna++ = preserveCase ? 'N' : 'n';
            }
        else
            {
            *dna++ = preserveCase ? c : ntChars[c];
            }
        }
    }
/* Leave the '>' of the following record for the next call. */
if (c == '>')
    ungetc(c, f);
*dna = 0;

*retSeq = newDnaSeq(sequence, dnaSize, name);
if (ferror(f))
    errnoAbort("read of fasta file failed");    
return TRUE;
}
Ejemplo n.º 17
0
void bedItemOverlapCount(struct hash *chromHash, char *infile, char *outfile)
/* Count, for every base, how many items of the bed file infile cover it
 * and pass the counts per chromosome to outputCounts, writing to outfile.
 *   chromHash - chromosome sizes keyed by name.
 *   infile    - bed file; all items of a chromosome must be adjacent,
 *               otherwise the function aborts.  Read as 12 fields when
 *               doBed12 is set (blocks are counted), else as 3 fields.
 *   outfile   - output file.
 * One counts array sized for the largest chromosome is reused throughout.
 * An item running past the end of chrM is wrapped around to the start
 * (3 field mode only); any other item past the chromosome end aborts. */
{
unsigned maxChromSize = 0;
unitSize *counts = (unitSize *)NULL;
FILE *f = mustOpen(outfile, "w");
struct hashCookie hc = hashFirst(chromHash);
struct hashEl *hel;
while( (hel = hashNext(&hc)) != NULL) {
    unsigned num = (unsigned) ptToInt(hel->val);
    maxChromSize = max(num, maxChromSize);
}
verbose(2,"#\tmaxChromSize: %u\n", maxChromSize);
if (maxChromSize < 1)
    errAbort("maxChromSize is zero ?");

/*	Allocate just once for the largest chrom and reuse this array */
counts = needHugeMem(sizeof(unitSize) * maxChromSize);

/*	Reset the array to be zero to be reused */
memset((void *)counts, 0, sizeof(unitSize)*(size_t)maxChromSize);

unsigned chromSize = 0;
char *prevChrom = (char *)NULL;
boolean outputToDo = FALSE;
struct hash *seenHash = newHash(5);

struct lineFile *bf = lineFileOpen(infile , TRUE);
struct bed *bed = (struct bed *)NULL;
char *row[12];
int numFields = doBed12 ? 12 : 3;

while (lineFileNextRow(bf,row, numFields))
    {
    int i;
    bed = bedLoadN(row, numFields);

    verbose(3,"#\t%s\t%d\t%d\n",bed->chrom,bed->chromStart, bed->chromEnd);

    if (prevChrom && differentWord(bed->chrom,prevChrom)) // End a chr
        {
        verbose(2,"#\tchrom %s done, size %d\n", prevChrom, chromSize);
        if (outputToDo)
            outputCounts(counts, prevChrom, chromSize, f);
        outputToDo = FALSE;
        memset((void *)counts, 0,
            sizeof(unitSize)*(size_t)maxChromSize); /* zero counts */
        freez(&prevChrom); 
        // prevChrom is now NULL so it will be caught by next if!
        }
    if ((char *)NULL == prevChrom)  // begin a chr
        {
        if (hashLookup(seenHash, bed->chrom))
            errAbort("ERROR:input file not sorted. %s seen before on line %d\n",
                bed->chrom, bf->lineIx);

        hashAdd(seenHash, bed->chrom, NULL);
        prevChrom = cloneString(bed->chrom);
        chromSize = hashIntVal(chromHash, prevChrom);
        verbose(2,"#\tchrom %s starting, size %d\n", prevChrom,chromSize);
        }
    if (bed->chromEnd > chromSize)
        {
        // check for circular chrM
        if (doBed12 || bed->chromStart>=chromSize 
            || differentWord(bed->chrom,"chrM")) 
            {
            warn("ERROR: %s\t%d\t%d", bed->chrom, bed->chromStart,
            bed->chromEnd);
            errAbort("chromEnd > chromSize ?  %d > %d", 
                bed->chromEnd,chromSize);
            }

        /* Count up to the end, then the overhang from position 0 on. */
        for (i = bed->chromStart; i < chromSize; ++i)
            INCWOVERFLOW(counts,i);
        for (i = 0; i < (bed->chromEnd - chromSize); ++i)
            INCWOVERFLOW(counts,i);
        }
    else if (doBed12)
        {
        int *starts = bed->chromStarts;
        int *sizes = bed->blockSizes;
        int *endStarts = &bed->chromStarts[bed->blockCount];

        for(; starts < endStarts; starts++, sizes++)
            {
            unsigned int end = *starts + *sizes + bed->chromStart;
            /* Blocks come straight from the input; one that reaches past
             * the chromosome would make the loop write outside counts. */
            if (end > chromSize)
                {
                warn("ERROR: %s\t%d\t%d", bed->chrom, bed->chromStart,
                    bed->chromEnd);
                errAbort("block end > chromSize ?  %u > %u", 
                    end, chromSize);
                }
            for (i = *starts + bed->chromStart; i < end; ++i)
                INCWOVERFLOW(counts,i);
            }
        }
    else
        {
        for (i = bed->chromStart; i < bed->chromEnd; ++i)
            INCWOVERFLOW(counts, i);
        }
    outputToDo = TRUE;
    bedFree(&bed); // plug the memory leak
    }

lineFileClose(&bf);
// Note, next file could be on same chr!

if (outputToDo)
    outputCounts(counts, prevChrom, chromSize, f);

if (doOutBounds)
    fprintf(stderr, "min %lu max %lu\n", (unsigned long)overMin, (unsigned long)overMax);

/* With an empty input no chromosome was ever started and prevChrom is
 * still NULL, which must not be handed to %s. */
if (prevChrom != NULL)
    verbose(2,"#\tchrom %s done, size %d\n", prevChrom, chromSize);
carefulClose(&f);
freeMem(counts);
freez(&prevChrom);
// hashFreeWithVals(&chromHash, freez);
freeHash(&seenHash);
}
static struct statistic *gapStats(struct chrGapList *cList, char *zeroBedFile,
	char *shoulderBedFile, char *distanceFile)
{
struct statistic *returnStats;
struct chrGapList *cl;
int chrCount = 0;
int gapCountNonZeroSize = 0;
int gapCountZeroSize = 0;
int gapCountShoulderSize = 0;
unsigned long totalGapSize = 0;
int maxGap = 0;
int minGap = BIGNUM;
int *gapSizeArray = NULL;
int i;
int boundingElementCount = 0;
int placedItemCount = 0;
FILE *zeroFH = NULL;
FILE *distFH = NULL;
FILE *shoulderFH = NULL;

AllocVar(returnStats);

/*	first count number of non-zero gaps	*/
for (cl=cList; cl != NULL; cl = cl->next)
    {
    struct gap *gl;
    int gapCount = 0;
    int zeroSized = 0;
    int shoulderSized = 0;
    boolean firstOne = TRUE;
    for (gl = cl->gList; gl != NULL; gl = gl->next)
	{
	int gapSize = gl->gapSize;
	if (firstOne)
	    {
	    firstOne = FALSE;
	    if (gl->isUpstreamBound)
		++boundingElementCount;
	    else
		++placedItemCount;
	    }
	if (gl->isDownstreamBound)
	    ++boundingElementCount;
	else
	    ++placedItemCount;
	if (gapSize > 0)
	    {
	    totalGapSize += gapSize;
	    if (gapSize > maxGap) maxGap = gapSize;
	    if (gapSize < minGap) minGap = gapSize;
	    if (gapSize <= shoulder) ++shoulderSized;
	    ++gapCount;
	    }
	else
	    {
	    ++zeroSized;
	    }
	}
    gapCountNonZeroSize += gapCount;
    gapCountZeroSize += zeroSized;
    gapCountShoulderSize += shoulderSized;
    ++chrCount;
verbose(4,"'%s': %d gaps + %d of size zero = %d\n", cl->chrom, gapCount,
	zeroSized, gapCount+zeroSized);
    }

verbose(3,"counted %d chroms and %d gaps ( + %d size zero = %d total gaps)"
    "\n\ton the bounding list\n", chrCount, gapCountNonZeroSize,
	gapCountZeroSize, gapCountNonZeroSize+gapCountZeroSize);

returnStats->chromCount = chrCount;
returnStats->totalGaps = gapCountNonZeroSize + gapCountZeroSize;
returnStats->sizeZeroGapCount = gapCountZeroSize;
returnStats->boundingElementCount = boundingElementCount;

/*	now copy all the values to a integer array for more detailed
 *	stats measurements
 */
if (0 == gapCountNonZeroSize)
    {
    returnStats->meanGap = 0.0;
    returnStats->maxGap = maxGap;
    returnStats->minGap = minGap;
    returnStats->medianGap = 0;
    }
else
    {
    gapSizeArray = needHugeMem((size_t)(sizeof(int) * gapCountNonZeroSize));
    i = 0;

    for (cl=cList; cl != NULL; cl = cl->next)
	{
	struct gap *gl;
	for (gl = cl->gList; gl != NULL; gl = gl->next)
	    {
	    int gapSize = gl->gapSize;
	    if (gapSize > 0)
		{
		gapSizeArray[i] = gapSize;
		++i;
		}
	    }
	}
    verbose(3,"assigned %d values to int array\n", i);

    intSort(i,gapSizeArray);

    /*	0.5 rounds up to next integer	*/
    returnStats->meanGap = 0.5 +
	((double)totalGapSize/(double)gapCountNonZeroSize);
    returnStats->maxGap = gapSizeArray[i-1];
    returnStats->minGap = gapSizeArray[0];
    returnStats->medianGap = gapSizeArray[i/2];
    }

verbose(2,"average gap size: %d = %ul / %d (non-zero size gaps only)\n",
    returnStats->meanGap, totalGapSize, gapCountNonZeroSize);
verbose(2,"maximum gap size: %d\n", returnStats->maxGap);
verbose(2,"median gap size: %d\n", returnStats->medianGap);
verbose(2,"minimum gap size: %d\n", returnStats->minGap);
verbose(2,"minimum gap: %d, maximum gap: %d\n", minGap, maxGap);
if (minGap != returnStats->minGap)
    errAbort("ERROR: didn't find the same minimum gap ?");
if (maxGap != returnStats->maxGap)
    errAbort("ERROR: didn't find the same maximum gap ?");
verbose(2,"bounding element count: %d, placed element count: %d\n",
	boundingElementCount, placedItemCount);


/* if there are placed elements, measure their nearest neighbor statistics */
if (placedItemCount)
    {
    int i;
    int sumDistances = 0;
    int *placedDistances =
	needHugeMem((size_t)(sizeof(int) * placedItemCount));
    int placedCount = 0;
    int boundingElements = 0;

    if (distanceFile)
	distFH=mustOpen(distanceFile, "w");

    for (cl=cList; cl != NULL; cl = cl->next)
	{
	struct gap *gl;
	struct gap *next;
	struct bed *upstreamBound = NULL;
	struct bed *downstreamBound = NULL;
	int chrCount = 0;

	++boundingElements; /*	first one must be bounding	*/
		/* after this, only need to count the downstream ones */

	for (gl = cl->gList; gl != NULL; gl = next)
	    {
	    struct gap *gEl;
	    int gapCount = 0;
	    int shoulderCount = 0;

	    ++chrCount;

	    /*	make note of upstream and downstream bounding elements */
	    if (NULL == upstreamBound || gl->isUpstreamBound)
		{
		if (gl->isUpstreamBound)
		    {
		    upstreamBound = gl->upstream;
		    downstreamBound = nextDownstreamBound(gl);
		    ++boundingElements;
		    if (NULL == downstreamBound)
			errAbort("Can not find a downstream"
				" bounding element ?");
		    }
		else
		    errAbort("Do not find a bounding element as"
			" first upstream item ?");
		}
	    /*	measure all downstream elements as long as they
	     *	are not bounding elements
	     */
	    for (gEl = gl; (gEl != NULL) && (! gEl->isDownstreamBound);
			gEl = gEl->next)
		{
		/* protect against negative results with the max(0,..) */
		int upstreamDist = max(0,
		    (gEl->downstream->chromStart - upstreamBound->chromEnd));
		int downstreamDist = max(0,
		    (downstreamBound->chromStart - gEl->downstream->chromEnd));
		int minDistance;

		if (upstreamOnly)
		    minDistance = upstreamDist;
		else if (downstreamOnly)
		    minDistance = downstreamDist;
		else
		    minDistance = min(upstreamDist, downstreamDist);

		if (distFH)
		    fprintf (distFH, "%s\t%d\t%d\t%s_%d\t%d\n",
			    cl->chrom, gEl->downstream->chromStart,
			    gEl->downstream->chromEnd, cl->chrom, placedCount,
				minDistance);

		if (minDistance < 0)
			errAbort("minimum distance < 0 ?");
		if (minDistance == 0)
		    {
		    ++gapCount;
		    if (zeroBedFile)
			{
			if (! zeroFH)
			    zeroFH=mustOpen(zeroBedFile, "w");
			fprintf (zeroFH, "%s\t%d\t%d\t%s_%d.%d\n",
			    cl->chrom, gEl->downstream->chromStart,
			    gEl->downstream->chromEnd, cl->chrom,
			    chrCount, gapCount);
			}
		    }
		else if ((minDistance > 0) && (minDistance <= shoulder))
		    {
		    ++shoulderCount;
		    if (shoulderBedFile)
			{
			if (! shoulderFH)
			    shoulderFH=mustOpen(shoulderBedFile, "w");
			fprintf (shoulderFH, "%s\t%d\t%d\t%s_%d.%d\n",
			    cl->chrom, gEl->downstream->chromStart,
			    gEl->downstream->chromEnd, cl->chrom,
			    chrCount, shoulderCount);
			}
		    }

		placedDistances[placedCount++] = minDistance;
		sumDistances += minDistance;
		}
	    if (gEl)
		next = gEl->next;
	    else
		next = NULL;
	    }
	}	/*	for (cl=cList; cl != NULL; cl = cl->next)	*/

    returnStats->placedItemCount = placedCount;
    returnStats->meanNearestNeighbor = 0.5 + (double)sumDistances / placedCount;

    if (boundingElements != boundingElementCount)
	errAbort("ERROR: did not find the same number of bounding elements ? %d ? %d =! %d", boundingElements, boundingElementCount, returnStats->boundingElementCount);


    intSort(placedCount,placedDistances);
    returnStats->medianNearestNeighbor = placedDistances[placedCount/2];
    returnStats->maximumNearestNeighbor = placedDistances[placedCount-1];

    verbose(2,"measured %d placed items\n", placedCount);
    verbose(2,"mean distance: %d = %d / %d\n",
	returnStats->meanNearestNeighbor, sumDistances, placedCount);
    verbose(2,"median distance: %d\n", returnStats->medianNearestNeighbor);
    verbose(2,"maximum distance: %d\n", returnStats->maximumNearestNeighbor);

    for (i = 0; i < placedCount; ++i)
	if (placedDistances[i] > 0) break;

    /*	this doesn't need the + 1	*/
    returnStats->zeroNeighbor = i + 1;
    returnStats->zeroNeighbor = i;

    for ( ; i < placedCount; ++i)
	if (placedDistances[i] > shoulder) break;

    /*	for a while we were counting this without the zero distances */
    returnStats->placedWithinShoulder = (i+1) - returnStats->zeroNeighbor;
    returnStats->placedWithinShoulder = i;

    verbose(2,"%d - number of items zero distance to nearest "
	"bounding element\n", returnStats->zeroNeighbor);
    verbose(2,"%d - number of items of non-zero distance within %d bp of "
	"nearest bounding element\n", returnStats->placedWithinShoulder,
		shoulder);
    if ((placedCount - returnStats->zeroNeighbor) > 0)
	verbose(2,"%% %.04f - percent of of items of non-zero distance "
	    "within %d bp of nearest bounding element\n",
		100.0 * returnStats->placedWithinShoulder /
			(placedCount-returnStats->zeroNeighbor),
		shoulder);
    else
       errAbort("something wrong with placed item count %d "
	    "minus zeroDistance Count %d",
		placedCount, returnStats->zeroNeighbor);

    freeMem(placedDistances);
    }	/*	if (placedItemCount)	*/
carefulClose(&zeroFH);
carefulClose(&distFH);
carefulClose(&shoulderFH);
freeMem(gapSizeArray);
return (returnStats);
}