Пример #1
0
void chimpSuperQuals(char *agpFile, char *qacInName, char *qacOutName)
/* chimpSuperQuals - Map chimp quality scores from contig to supercontig.. */
{
struct hash *qacHash = qacReadToHash(qacInName);
struct scaffold *scaffold, *scaffoldList = readScaffoldsFromAgp(agpFile);
FILE *f = mustOpen(qacOutName, "w");
struct qaSeq qa;
struct agpFrag *frag;
struct qac *qac;
int bufSize = 0;
UBYTE *buf = NULL;
int qaMaxSize = 0;
int fragSize;
int count = 0;

qacWriteHead(f);
ZeroVar(&qa);

for (scaffold = scaffoldList; scaffold != NULL; scaffold = scaffold->next)
    {
    /* Set up qa to hold uncompressed quals for whole scaffold. */
    qa.name = scaffold->list->chrom;
    qa.size = scaffold->size;
    if (qaMaxSize < qa.size)
	{
	freez(&qa.qa);
	qa.qa = needHugeZeroedMem(qa.size);
	qaMaxSize = qa.size;
	}

    /* Uncompress contig quality scores and copy into scaffold's quality buffer. */
    for (frag = scaffold->list; frag != NULL; frag = frag->next)
        {
	qac = hashMustFindVal(qacHash, frag->frag);
	if (bufSize < qac->uncSize)
	    {
	    freez(&buf);
	    bufSize = qac->uncSize;
	    buf = needMem(bufSize);
	    }
	rleUncompress(qac->data, qac->compSize, buf, qac->uncSize);
	fragSize = frag->fragEnd - frag->fragStart;
	memcpy(qa.qa + frag->chromStart, buf + frag->fragStart, fragSize);
	}

    /* Compress and write it out. */
    qacWriteNext(f, &qa);
    ++count;
    }
carefulClose(&f);
}
void itsaMake(int inCount, char *inputs[], char *output)
/* itsaMake - Make a suffix array file out of input DNA sequences.. */
{
verboseTimeInit();
bits64 maxGenomeSize = 1024LL*1024*1024*4;

itsaBaseToValInit();

/* Load all DNA, make sure names are unique, and alphabetize by name. */
struct dnaSeq *seqList = NULL, *seq;
struct hash *uniqSeqHash = hashNew(0);
bits64 totalDnaSize = 1;	/* FOr space between. */
int inputIx;
for (inputIx=0; inputIx<inCount; ++inputIx)
    {
    char * input = inputs[inputIx];
    struct dnaLoad *dl = dnaLoadOpen(input);
    while ((seq = dnaLoadNext(dl)) != NULL)
	{
	verbose(1, "read %s with %d bases\n", seq->name, seq->size);
	if (hashLookup(uniqSeqHash, seq->name))
	    errAbort("Input sequence name %s repeated, all must be unique.", seq->name);
	totalDnaSize +=  seq->size + 1;
	if (totalDnaSize > maxGenomeSize)
	    errAbort("Too much DNA. Can only handle up to %lld bases", maxGenomeSize);
	slAddHead(&seqList, seq);
	}
    dnaLoadClose(&dl);
    }
slSort(&seqList, dnaSeqCmpName);
verboseTime(1, "Loaded %lld bases in %d sequences", totalDnaSize, slCount(seqList));

/* Allocate big buffer for all DNA. */
DNA *allDna = globalAllDna = needHugeMem(totalDnaSize);
allDna[0] = 0;
bits64 chromOffset = 1;	/* Have zeroes between each chrom, and before and after. */

/* Copy DNA to a single big buffer, and create chromInfo on each sequence. */
struct chromInfo *chrom, *chromList = NULL;
for (seq = seqList; seq != NULL; seq = seq->next)
    {
    AllocVar(chrom);
    chrom->name = cloneString(seq->name);
    chrom->size = seq->size;
    chrom->offset = chromOffset;
    slAddHead(&chromList, chrom);
    toUpperN(seq->dna, seq->size);
    memcpy(allDna + chromOffset, seq->dna, seq->size + 1);
    chromOffset += seq->size + 1;
    }
slReverse(&chromList);

/* Free up separate dna sequences because we're going to need a lot of RAM soon. */


/* Allocate index array, and offset and list arrays. */
dnaSeqFreeList(&seqList);
bits32 *index13;
AllocArray(index13, itsaSlotCount);
bits32 *offsetArray = needHugeMem(totalDnaSize * sizeof(bits32));
bits32 *listArray = needHugeZeroedMem(totalDnaSize * sizeof(bits32));
verboseTime(1, "Allocated buffers %lld bytes total", 
	(long long)(9LL*totalDnaSize + itsaSlotCount*sizeof(bits32)));

/* Where normally we'd keep some sort of structure with a next element to form a list
 * of matching positions in each slot of our index,  to conserve memory we'll do this
 * with two parallel arrays.  Because we're such cheapskates in terms of memory we'll
 * (and still using 9*genomeSize bytes of RAM) we'll use these arrays for two different
 * purposes.   
 *     In the first phase they will together be used to form linked lists of
 * offsets, and the 13mer index will point to the first item in each list.  In this
 * phase the offsetArray contains offsets into the allDna structure, and the listArray
 * contains the next pointers for the list.  After the first phase we write out the
 * suffix array to disk.
 *     In the second phase we read the suffix array back into the offsetArray, and
 * use the listArray for the traverseArray.  We write out the traverse array to finish
 * things up. */


/* Load up all DNA buffer. */
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    verbose(2, "  About to do first pass index\n");
    indexChromPass1(chrom, allDna, offsetArray, listArray, index13);
    verbose(2, "  Done first pass index\n");
    }
verboseTime(1, "Done big bucket sort");
slReverse(&chromList);
itsaWriteMerged(chromList, allDna, offsetArray, listArray, index13, output);
}
void qacAgpLift(char *agpFile, char *qacInName, char *qacOutName)
/* qacAgpLift - Use AGP to combine per-scaffold qac into per-chrom qac. */
{
struct hash *qacHash = qacReadToHash(qacInName);
struct chrom *chrom, *chromList = readChromScaffoldsFromAgp(agpFile);
FILE *f = mustOpen(qacOutName, "w");
struct qaSeq qa;
struct agpFrag *frag;
struct qac *qac;
int bufSize = 0;
UBYTE *buf = NULL;
int qaMaxSize = 0;
int fragSize;
int count = 0;

qacWriteHead(f);
ZeroVar(&qa);

for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    /* Set up qa to hold uncompressed quals for whole chrom. */
    qa.name = chrom->list->chrom;
    verbose(1, "    %s size=%d\n", chrom->list->chrom, chrom->size);
    qa.size = chrom->size;
    if (qaMaxSize < qa.size)
	{
	qa.qa = needHugeZeroedMem(qa.size);
	qaMaxSize = qa.size;
	}
    else
        {
        zeroBytes(qa.qa, qa.size);
        }

    /* Uncompress contig quality scores and copy into chrom's quality buffer. */
    for (frag = chrom->list; frag != NULL; frag = frag->next)
        {
        struct hashEl *hel;
        fragSize = frag->fragEnd - frag->fragStart;
        if ((hel = hashLookup(qacHash, frag->frag)) != NULL)
            {
            qac = (struct qac *) hel->val;
            if (bufSize < qac->uncSize)
                {
                freez(&buf);
                bufSize = qac->uncSize;
                buf = needMem(bufSize);
                }
            rleUncompress(qac->data, qac->compSize, buf, qac->uncSize);
            if (frag->strand[0] == '-')
                reverseBytes((char*)buf, qac->uncSize);
            memcpy(qa.qa + frag->chromStart, buf + frag->fragStart, fragSize);
            }
        else
            {
            /* agp frag not found in qac hash -- missing data */
            if (mScore < 0)
                errAbort("missing data: no quality scores for %s", frag->frag);
            /* fill in missing data with specified score */
            memset(qa.qa + frag->chromStart, mScore, fragSize);
            }
	}

    /* Compress and write it out. */
    qacWriteNext(f, &qa);
    ++count;
    }
carefulClose(&f);
}