Ejemplo n.º 1
0
int main(int argc, char *argv[])
/* Entry point.  Expects exactly three arguments: argv[1] is loaded as the
 * query sequences, argv[2] as the target sequences, and argv[3] is handed to
 * outputBed6 as the destination for the collected matches. */
{
struct dnaSeq *targetSeqs = NULL, *querySeqs = NULL;
struct dnaSeq *tSeq, *qSeq;
struct bed *allHits = NULL;

if (argc != 4)
    usage();

/* Targets are loaded before queries, same as the argument handling order
 * the program has always used. */
targetSeqs = dnaLoadAll(argv[2]);
querySeqs = dnaLoadAll(argv[1]);

/* Try every query against every target, appending each batch of hits to
 * the end of the running list. */
tSeq = targetSeqs;
while (tSeq != NULL)
    {
    for (qSeq = querySeqs; qSeq != NULL; qSeq = qSeq->next)
        {
        struct bed *pairHits = oligoMatch(tSeq, qSeq);
        allHits = slCat(allHits, pairHits);
        }
    tSeq = tSeq->next;
    }

outputBed6(allHits, argv[3]);

/* Release everything that was allocated above. */
bedFreeList(&allHits);
dnaSeqFreeList(&targetSeqs);
dnaSeqFreeList(&querySeqs);
return 0;
}
void itsaMake(int inCount, char *inputs[], char *output)
/* itsaMake - Make a suffix array file out of input DNA sequences.. */
{
verboseTimeInit();
bits64 maxGenomeSize = 1024LL*1024*1024*4;

itsaBaseToValInit();

/* Load all DNA, make sure names are unique, and alphabetize by name. */
struct dnaSeq *seqList = NULL, *seq;
struct hash *uniqSeqHash = hashNew(0);
bits64 totalDnaSize = 1;	/* FOr space between. */
int inputIx;
for (inputIx=0; inputIx<inCount; ++inputIx)
    {
    char * input = inputs[inputIx];
    struct dnaLoad *dl = dnaLoadOpen(input);
    while ((seq = dnaLoadNext(dl)) != NULL)
	{
	verbose(1, "read %s with %d bases\n", seq->name, seq->size);
	if (hashLookup(uniqSeqHash, seq->name))
	    errAbort("Input sequence name %s repeated, all must be unique.", seq->name);
	totalDnaSize +=  seq->size + 1;
	if (totalDnaSize > maxGenomeSize)
	    errAbort("Too much DNA. Can only handle up to %lld bases", maxGenomeSize);
	slAddHead(&seqList, seq);
	}
    dnaLoadClose(&dl);
    }
slSort(&seqList, dnaSeqCmpName);
verboseTime(1, "Loaded %lld bases in %d sequences", totalDnaSize, slCount(seqList));

/* Allocate big buffer for all DNA. */
DNA *allDna = globalAllDna = needHugeMem(totalDnaSize);
allDna[0] = 0;
bits64 chromOffset = 1;	/* Have zeroes between each chrom, and before and after. */

/* Copy DNA to a single big buffer, and create chromInfo on each sequence. */
struct chromInfo *chrom, *chromList = NULL;
for (seq = seqList; seq != NULL; seq = seq->next)
    {
    AllocVar(chrom);
    chrom->name = cloneString(seq->name);
    chrom->size = seq->size;
    chrom->offset = chromOffset;
    slAddHead(&chromList, chrom);
    toUpperN(seq->dna, seq->size);
    memcpy(allDna + chromOffset, seq->dna, seq->size + 1);
    chromOffset += seq->size + 1;
    }
slReverse(&chromList);

/* Free up separate dna sequences because we're going to need a lot of RAM soon. */


/* Allocate index array, and offset and list arrays. */
dnaSeqFreeList(&seqList);
bits32 *index13;
AllocArray(index13, itsaSlotCount);
bits32 *offsetArray = needHugeMem(totalDnaSize * sizeof(bits32));
bits32 *listArray = needHugeZeroedMem(totalDnaSize * sizeof(bits32));
verboseTime(1, "Allocated buffers %lld bytes total", 
	(long long)(9LL*totalDnaSize + itsaSlotCount*sizeof(bits32)));

/* Where normally we'd keep some sort of structure with a next element to form a list
 * of matching positions in each slot of our index,  to conserve memory we'll do this
 * with two parallel arrays.  Because we're such cheapskates in terms of memory we'll
 * (and still using 9*genomeSize bytes of RAM) we'll use these arrays for two different
 * purposes.   
 *     In the first phase they will together be used to form linked lists of
 * offsets, and the 13mer index will point to the first item in each list.  In this
 * phase the offsetArray contains offsets into the allDna structure, and the listArray
 * contains the next pointers for the list.  After the first phase we write out the
 * suffix array to disk.
 *     In the second phase we read the suffix array back into the offsetArray, and
 * use the listArray for the traverseArray.  We write out the traverse array to finish
 * things up. */


/* Load up all DNA buffer. */
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    verbose(2, "  About to do first pass index\n");
    indexChromPass1(chrom, allDna, offsetArray, listArray, index13);
    verbose(2, "  Done first pass index\n");
    }
verboseTime(1, "Done big bucket sort");
slReverse(&chromList);
itsaWriteMerged(chromList, allDna, offsetArray, listArray, index13, output);
}