void chimpSuperQuals(char *agpFile, char *qacInName, char *qacOutName) /* chimpSuperQuals - Map chimp quality scores from contig to supercontig.. */ { struct hash *qacHash = qacReadToHash(qacInName); struct scaffold *scaffold, *scaffoldList = readScaffoldsFromAgp(agpFile); FILE *f = mustOpen(qacOutName, "w"); struct qaSeq qa; struct agpFrag *frag; struct qac *qac; int bufSize = 0; UBYTE *buf = NULL; int qaMaxSize = 0; int fragSize; int count = 0; qacWriteHead(f); ZeroVar(&qa); for (scaffold = scaffoldList; scaffold != NULL; scaffold = scaffold->next) { /* Set up qa to hold uncompressed quals for whole scaffold. */ qa.name = scaffold->list->chrom; qa.size = scaffold->size; if (qaMaxSize < qa.size) { freez(&qa.qa); qa.qa = needHugeZeroedMem(qa.size); qaMaxSize = qa.size; } /* Uncompress contig quality scores and copy into scaffold's quality buffer. */ for (frag = scaffold->list; frag != NULL; frag = frag->next) { qac = hashMustFindVal(qacHash, frag->frag); if (bufSize < qac->uncSize) { freez(&buf); bufSize = qac->uncSize; buf = needMem(bufSize); } rleUncompress(qac->data, qac->compSize, buf, qac->uncSize); fragSize = frag->fragEnd - frag->fragStart; memcpy(qa.qa + frag->chromStart, buf + frag->fragStart, fragSize); } /* Compress and write it out. */ qacWriteNext(f, &qa); ++count; } carefulClose(&f); }
void itsaMake(int inCount, char *inputs[], char *output) /* itsaMake - Make a suffix array file out of input DNA sequences.. */ { verboseTimeInit(); bits64 maxGenomeSize = 1024LL*1024*1024*4; itsaBaseToValInit(); /* Load all DNA, make sure names are unique, and alphabetize by name. */ struct dnaSeq *seqList = NULL, *seq; struct hash *uniqSeqHash = hashNew(0); bits64 totalDnaSize = 1; /* FOr space between. */ int inputIx; for (inputIx=0; inputIx<inCount; ++inputIx) { char * input = inputs[inputIx]; struct dnaLoad *dl = dnaLoadOpen(input); while ((seq = dnaLoadNext(dl)) != NULL) { verbose(1, "read %s with %d bases\n", seq->name, seq->size); if (hashLookup(uniqSeqHash, seq->name)) errAbort("Input sequence name %s repeated, all must be unique.", seq->name); totalDnaSize += seq->size + 1; if (totalDnaSize > maxGenomeSize) errAbort("Too much DNA. Can only handle up to %lld bases", maxGenomeSize); slAddHead(&seqList, seq); } dnaLoadClose(&dl); } slSort(&seqList, dnaSeqCmpName); verboseTime(1, "Loaded %lld bases in %d sequences", totalDnaSize, slCount(seqList)); /* Allocate big buffer for all DNA. */ DNA *allDna = globalAllDna = needHugeMem(totalDnaSize); allDna[0] = 0; bits64 chromOffset = 1; /* Have zeroes between each chrom, and before and after. */ /* Copy DNA to a single big buffer, and create chromInfo on each sequence. */ struct chromInfo *chrom, *chromList = NULL; for (seq = seqList; seq != NULL; seq = seq->next) { AllocVar(chrom); chrom->name = cloneString(seq->name); chrom->size = seq->size; chrom->offset = chromOffset; slAddHead(&chromList, chrom); toUpperN(seq->dna, seq->size); memcpy(allDna + chromOffset, seq->dna, seq->size + 1); chromOffset += seq->size + 1; } slReverse(&chromList); /* Free up separate dna sequences because we're going to need a lot of RAM soon. */ /* Allocate index array, and offset and list arrays. */ dnaSeqFreeList(&seqList); bits32 *index13; AllocArray(index13, itsaSlotCount); bits32 *offsetArray = needHugeMem(totalDnaSize * sizeof(bits32)); bits32 *listArray = needHugeZeroedMem(totalDnaSize * sizeof(bits32)); verboseTime(1, "Allocated buffers %lld bytes total", (long long)(9LL*totalDnaSize + itsaSlotCount*sizeof(bits32))); /* Where normally we'd keep some sort of structure with a next element to form a list * of matching positions in each slot of our index, to conserve memory we'll do this * with two parallel arrays. Because we're such cheapskates in terms of memory we'll * (and still using 9*genomeSize bytes of RAM) we'll use these arrays for two different * purposes. * In the first phase they will together be used to form linked lists of * offsets, and the 13mer index will point to the first item in each list. In this * phase the offsetArray contains offsets into the allDna structure, and the listArray * contains the next pointers for the list. After the first phase we write out the * suffix array to disk. * In the second phase we read the suffix array back into the offsetArray, and * use the listArray for the traverseArray. We write out the traverse array to finish * things up. */ /* Load up all DNA buffer. */ for (chrom = chromList; chrom != NULL; chrom = chrom->next) { verbose(2, " About to do first pass index\n"); indexChromPass1(chrom, allDna, offsetArray, listArray, index13); verbose(2, " Done first pass index\n"); } verboseTime(1, "Done big bucket sort"); slReverse(&chromList); itsaWriteMerged(chromList, allDna, offsetArray, listArray, index13, output); }
void qacAgpLift(char *agpFile, char *qacInName, char *qacOutName) /* qacAgpLift - Use AGP to combine per-scaffold qac into per-chrom qac. */ { struct hash *qacHash = qacReadToHash(qacInName); struct chrom *chrom, *chromList = readChromScaffoldsFromAgp(agpFile); FILE *f = mustOpen(qacOutName, "w"); struct qaSeq qa; struct agpFrag *frag; struct qac *qac; int bufSize = 0; UBYTE *buf = NULL; int qaMaxSize = 0; int fragSize; int count = 0; qacWriteHead(f); ZeroVar(&qa); for (chrom = chromList; chrom != NULL; chrom = chrom->next) { /* Set up qa to hold uncompressed quals for whole chrom. */ qa.name = chrom->list->chrom; verbose(1, " %s size=%d\n", chrom->list->chrom, chrom->size); qa.size = chrom->size; if (qaMaxSize < qa.size) { qa.qa = needHugeZeroedMem(qa.size); qaMaxSize = qa.size; } else { zeroBytes(qa.qa, qa.size); } /* Uncompress contig quality scores and copy into chrom's quality buffer. */ for (frag = chrom->list; frag != NULL; frag = frag->next) { struct hashEl *hel; fragSize = frag->fragEnd - frag->fragStart; if ((hel = hashLookup(qacHash, frag->frag)) != NULL) { qac = (struct qac *) hel->val; if (bufSize < qac->uncSize) { freez(&buf); bufSize = qac->uncSize; buf = needMem(bufSize); } rleUncompress(qac->data, qac->compSize, buf, qac->uncSize); if (frag->strand[0] == '-') reverseBytes((char*)buf, qac->uncSize); memcpy(qa.qa + frag->chromStart, buf + frag->fragStart, fragSize); } else { /* agp frag not found in qac hash -- missing data */ if (mScore < 0) errAbort("missing data: no quality scores for %s", frag->frag); /* fill in missing data with specified score */ memset(qa.qa + frag->chromStart, mScore, fragSize); } } /* Compress and write it out. */ qacWriteNext(f, &qa); ++count; } carefulClose(&f); }