static void findMotif(char *input)
/* findMotif - find specified motif in sequence file.
 * Opens input with dnaLoadOpen(), passes each sequence it yields to
 * scanSeq(), and closes the loader once the input is exhausted. */
{
struct dnaLoad *dl = dnaLoadOpen(input);
struct dnaSeq *seq; 

while ((seq = dnaLoadNext(dl)) != NULL)
    {
    verbose(2, "#\tprocessing: %s\n", seq->name);
    /* NOTE(review): seq is not freed here because scanSeq() is not visible
     * from this function and might keep a reference - confirm ownership
     * before adding a dnaSeqFree(&seq). */
    scanSeq(seq);
    }
/* Release the loader, as the other dnaLoadOpen() callers do. */
dnaLoadClose(&dl);
}
static void blatzClient(char *input, char *output)
/* Send every sequence found in input to the server as a query and
 * copy each server reply into output.  A separate connection to
 * host:port is opened and closed for each sequence. */
{
struct dnaLoad *loader = dnaLoadOpen(input);
struct dnaSeq *seq;
FILE *out = mustOpen(output, "w");
static struct optionSpec options[] = {
   BZP_CLIENT_OPTIONS
};
int optIx;

for (;;)
    {
    int sd;
    FILE *reply = NULL;

    seq = dnaLoadNext(loader);
    if (seq == NULL)
        break;

    /* Open the connection and announce a query. */
    sd = netMustConnect(host, port);
    netSendString(sd, "query");

    /* Forward each option named in the client option table. */
    for (optIx = 0; optIx < ArraySize(options); ++optIx)
        sendOption(sd, options[optIx].name);

    /* Upper-case the sequence when -rna or -unmask is given; otherwise
     * just warn about input that is entirely lower case. */
    if (!optionExists("rna") && !optionExists("unmask"))
        {
        if (seqIsLower(seq))
            warn("Sequence %s is all lower case, and thus ignored. Use -unmask "
                 "flag to unmask lower case sequence.", seq->name);
        }
    else
        {
        toUpperN(seq->dna, seq->size);
        }

    /* Transmit the sequence name and bases, then drop our copy. */
    netSendString(sd, "seq");
    netSendString(sd, seq->name);
    netSendHugeString(sd, seq->dna);
    verbose(1, "%s\n", seq->name);
    dnaSeqFree(&seq);

    /* Stream the server's answer into the output file. */
    reply = netFileFromSocket(sd);
    copyOpenFile(reply, out);
    carefulClose(&reply);

    /* Done with this connection. */
    close(sd);
    }
dnaLoadClose(&loader);
carefulClose(&out);
}
Example #3
0
struct dnaSeq *dnaLoadAll(char *fileName)
/* Read every sequence that fileName refers to and return them as one
 * list, in the order they were read.  The file may be a single fasta
 * file, a single .2bit file, a .nib file, or a text file listing
 * files of those kinds.  DNA is mixed case. */
{
struct dnaSeq *loaded = NULL;
struct dnaSeq *cur;
struct dnaLoad *loader = dnaLoadOpen(fileName);

/* Push each sequence on the front of the list; order is fixed up below. */
for (cur = dnaLoadNext(loader); cur != NULL; cur = dnaLoadNext(loader))
    {
    slAddHead(&loaded, cur);
    }
dnaLoadClose(&loader);

/* Head insertion built the list backwards, so flip it. */
slReverse(&loaded);
return loaded;
}
void itsaMake(int inCount, char *inputs[], char *output)
/* itsaMake - Make a suffix array file out of input DNA sequences.
 *   inCount - number of entries in inputs
 *   inputs  - names of the sequence files to load with dnaLoadOpen()
 *   output  - file name handed to itsaWriteMerged()
 * Aborts via errAbort() if a sequence name is repeated or if the combined
 * size of the sequences (plus separators) exceeds maxGenomeSize. */
{
verboseTimeInit();
bits64 maxGenomeSize = 1024LL*1024*1024*4;

itsaBaseToValInit();

/* Load all DNA, make sure names are unique, and alphabetize by name. */
struct dnaSeq *seqList = NULL, *seq;
struct hash *uniqSeqHash = hashNew(0);
bits64 totalDnaSize = 1;	/* For space between. */
int inputIx;
for (inputIx=0; inputIx<inCount; ++inputIx)
    {
    char * input = inputs[inputIx];
    struct dnaLoad *dl = dnaLoadOpen(input);
    while ((seq = dnaLoadNext(dl)) != NULL)
	{
	verbose(1, "read %s with %d bases\n", seq->name, seq->size);
	if (hashLookup(uniqSeqHash, seq->name))
	    errAbort("Input sequence name %s repeated, all must be unique.", seq->name);
	/* Record the name so a later repeat is actually detected; without
	 * this the lookup above can never succeed. */
	hashAdd(uniqSeqHash, seq->name, NULL);
	totalDnaSize +=  seq->size + 1;
	if (totalDnaSize > maxGenomeSize)
	    errAbort("Too much DNA. Can only handle up to %lld bases", 
	    	(long long)maxGenomeSize);
	slAddHead(&seqList, seq);
	}
    dnaLoadClose(&dl);
    }
slSort(&seqList, dnaSeqCmpName);
verboseTime(1, "Loaded %lld bases in %d sequences", 
	(long long)totalDnaSize, slCount(seqList));

/* Allocate big buffer for all DNA. */
DNA *allDna = globalAllDna = needHugeMem(totalDnaSize);
allDna[0] = 0;
bits64 chromOffset = 1;	/* Have zeroes between each chrom, and before and after. */

/* Copy DNA to a single big buffer, and create chromInfo on each sequence. */
struct chromInfo *chrom, *chromList = NULL;
for (seq = seqList; seq != NULL; seq = seq->next)
    {
    AllocVar(chrom);
    chrom->name = cloneString(seq->name);
    chrom->size = seq->size;
    chrom->offset = chromOffset;
    slAddHead(&chromList, chrom);
    toUpperN(seq->dna, seq->size);
    /* Copy size + 1 bytes so the byte following the bases comes along as
     * the separator between chromosomes. */
    memcpy(allDna + chromOffset, seq->dna, seq->size + 1);
    chromOffset += seq->size + 1;
    }
slReverse(&chromList);

/* Free up separate dna sequences because we're going to need a lot of RAM soon. */
dnaSeqFreeList(&seqList);

/* Allocate index array, and offset and list arrays. */
bits32 *index13;
AllocArray(index13, itsaSlotCount);
bits32 *offsetArray = needHugeMem(totalDnaSize * sizeof(bits32));
bits32 *listArray = needHugeZeroedMem(totalDnaSize * sizeof(bits32));
verboseTime(1, "Allocated buffers %lld bytes total", 
	(long long)(9LL*totalDnaSize + itsaSlotCount*sizeof(bits32)));

/* Where normally we'd keep some sort of structure with a next element to form a list
 * of matching positions in each slot of our index,  to conserve memory we'll do this
 * with two parallel arrays.  Because we're such cheapskates in terms of memory
 * (and still using 9*genomeSize bytes of RAM) we'll use these arrays for two different
 * purposes.   
 *     In the first phase they will together be used to form linked lists of
 * offsets, and the 13mer index will point to the first item in each list.  In this
 * phase the offsetArray contains offsets into the allDna structure, and the listArray
 * contains the next pointers for the list.  After the first phase we write out the
 * suffix array to disk.
 *     In the second phase we read the suffix array back into the offsetArray, and
 * use the listArray for the traverseArray.  We write out the traverse array to finish
 * things up. */


/* Run the first indexing pass over each chromosome. */
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    verbose(2, "  About to do first pass index\n");
    indexChromPass1(chrom, allDna, offsetArray, listArray, index13);
    verbose(2, "  Done first pass index\n");
    }
verboseTime(1, "Done big bucket sort");
/* NOTE(review): chromList was already put in sorted order above, so this
 * hands itsaWriteMerged() the chromosomes in reverse order - confirm that
 * is what it expects. */
slReverse(&chromList);
itsaWriteMerged(chromList, allDna, offsetArray, listArray, index13, output);
}
static void alignAll(struct bzp *bzp, struct blatzIndex *indexList, 
        struct dnaLoad *queryDl, char *outFile)
/* Align every sequence read from queryDl against indexList with
 * blatzAlign() and write the resulting chains to outFile.
 *   bzp       - alignment parameters; unmask, rna, dynaLimitQ and
 *               dynaBedFileQ are read here
 *   indexList - target index passed through to blatzAlign() and
 *               blatzWriteChains()
 *   queryDl   - open loader supplying the query sequences; each query is
 *               freed after it has been processed
 *   outFile   - name of the chain output file
 * When bzp->dynaBedFileQ is not the empty string, the query positions whose
 * hit count exceeded the query limit are also written to that file, one
 * "name start end" line per run, with end being the last masked position. */
{
FILE *f = mustOpen(outFile, "w");
struct dnaSeq *query;

// LX BEG
int b, printing; 
FILE *bedfp = NULL;
// See if bed file output of the mask was requested
if (differentString(bzp->dynaBedFileQ, "")) 
   bedfp = mustOpen(bzp->dynaBedFileQ, "w");
// Counts all the query-target hits encountered by the program inside the 
// loops of gapless.c
dynaHits = 0;
// Counts how many target and query positions reached the limit
dynaCountTarget = 0;
dynaCountQuery = 0;
// This is the limit used by the program, currently just bzp->dynaLimit(QT)
// but should be useful for scaling to sequence size
targetHitDLimit = VERY_LARGE_NUMBER; // perhaps unnecessary default
queryHitDLimit = VERY_LARGE_NUMBER; // perhaps unnecessary default
// LX END

while ((query = dnaLoadNext(queryDl)) != NULL)
    {
    double bestScore = 0;
    struct chain *chainList;
    // LX BEG
    if (bzp->dynaLimitQ<VERY_LARGE_NUMBER)
       {
       queryHitDLimit = bzp->dynaLimitQ;
       // allocate zeroed memory for hit counters
       AllocArray(dynaCountQ, query->size);
       }
    // LX END
    if (bzp->unmask || bzp->rna)
        toUpperN(query->dna, query->size);
    if (bzp->rna)
        maskTailPolyA(query->dna, query->size);
    chainList = blatzAlign(bzp, indexList, query);
    if (chainList != NULL) 
    	bestScore = chainList->score;
    else
        {
	if (seqIsLower(query))
	    warn("Sequence %s is all lower case, and thus ignored. Use -unmask "
	         "flag to unmask lower case sequence.", query->name);
	}
    verbose(1, "%s (%d bases) score %2.0f\n", 
            query->name, query->size, bestScore);
    blatzWriteChains(bzp, &chainList, query, 
    	dnaLoadCurStart(queryDl), dnaLoadCurEnd(queryDl),
	dnaLoadCurSize(queryDl), indexList, f);
    // LX BEG
    // This prints the contents of the mask into the .bed file opened above
    if (bedfp != NULL)
       {
       if (bzp->dynaLimitQ<VERY_LARGE_NUMBER)
          {
          printing = 0;
          for (b=0;b<query->size;b++)
              {
              if (dynaCountQ[b] > queryHitDLimit)
                 {
                 // Start of a run of over-limit positions
                 if (printing == 0)
                    {
                    printing = 1;
                    fprintf(bedfp,"%s %d ",query->name,b);
                    }
                 }
              else
                 {
                 // First position back under the limit ends the run
                 if (printing == 1)
                    {
                    printing = 0;
                    fprintf(bedfp,"%d\n",b-1);
                    }
                 }
              }
          // A run that reaches the end of the query still needs its end
          // coordinate, otherwise the line is left unterminated.
          if (printing == 1)
             fprintf(bedfp,"%d\n",query->size-1);
          }
       else
          {
          fprintf(bedfp,"#No dynamic masking data to print.\n");
          }
       }
    // Release this query's counters here; the next query allocates a fresh
    // array, so keeping the old one would leak it.
    if (bzp->dynaLimitQ<VERY_LARGE_NUMBER)
       {
       freeMem(dynaCountQ);
       dynaCountQ = NULL;
       }
    // LX END
    dnaSeqFree(&query);
    }
// LX BEG
// Statistics to print about how many hits were dropped (ignored)
dynaDrops = dynaCountTarget + dynaCountQuery;
// Avoid dividing by zero when no hits were counted at all
if (dynaHits > 0)
    dynaDropsPerc = (float)100*dynaDrops/dynaHits+0.5;
else
    dynaDropsPerc = 0;
verbose(2, "%d dynaDrops (%f%%) at T=%d Q=%d \n", 
	dynaDrops, (double)dynaDropsPerc, targetHitDLimit, queryHitDLimit);
// Free dynamic memory used for the sequence-length-dependent counter arrays
freeMem(dynaCountQ);
if (bedfp != NULL)
   carefulClose(&bedfp);
freeMem(dynaWordCount);
// LX END
carefulClose(&f);
}