static void findMotif(char *input)
/* findMotif - find specified motif in sequence file.
 * Opens input with dnaLoadOpen, hands every sequence it yields to scanSeq,
 * then releases the sequence and, at the end, the loader. */
{
struct dnaLoad *dl = dnaLoadOpen(input);
struct dnaSeq *seq;

while ((seq = dnaLoadNext(dl)) != NULL)
    {
    verbose(2, "#\tprocessing: %s\n", seq->name);
    scanSeq(seq);
    /* Free each sequence once it has been scanned so memory use does not
     * grow with the number of sequences in the input.
     * NOTE(review): assumes scanSeq keeps no pointer to seq - confirm. */
    dnaSeqFree(&seq);
    }
/* Release the loader, matching the other dnaLoadOpen users. */
dnaLoadClose(&dl);
}
static void blatzClient(char *input, char *output) /* Send query message and dna to server and print result. */ { struct dnaLoad *dl = dnaLoadOpen(input); struct dnaSeq *seq; FILE *f = mustOpen(output, "w"); static struct optionSpec options[] = { BZP_CLIENT_OPTIONS }; int i; while ((seq = dnaLoadNext(dl)) != NULL) { /* Connect */ int sd = netMustConnect(host, port); FILE *sf = NULL; /* Send query command. */ netSendString(sd, "query"); /* Send options. */ for (i=0; i<ArraySize(options); ++i) sendOption(sd, options[i].name); /* Send sequence. */ if (optionExists("rna") || optionExists("unmask")) toUpperN(seq->dna, seq->size); else { if (seqIsLower(seq)) warn("Sequence %s is all lower case, and thus ignored. Use -unmask " "flag to unmask lower case sequence.", seq->name); } netSendString(sd, "seq"); netSendString(sd, seq->name); netSendHugeString(sd, seq->dna); verbose(1, "%s\n", seq->name); dnaSeqFree(&seq); /* Get and save response. */ sf = netFileFromSocket(sd); copyOpenFile(sf, f); carefulClose(&sf); /* Close connection */ close(sd); } dnaLoadClose(&dl); carefulClose(&f); }
struct dnaSeq *dnaLoadAll(char *fileName)
/* Read every sequence that fileName refers to and return them as one list,
 * in the order they were read.  The file may be a single fasta file, a
 * single .2bit file, a .nib file, or a text file listing such files.
 * DNA is mixed case. */
{
struct dnaSeq *all = NULL;
struct dnaSeq *one;
struct dnaLoad *loader = dnaLoadOpen(fileName);

for (one = dnaLoadNext(loader); one != NULL; one = dnaLoadNext(loader))
    slAddHead(&all, one);
dnaLoadClose(&loader);

/* Head insertion built the list backwards; put it back in read order. */
slReverse(&all);
return all;
}
void itsaMake(int inCount, char *inputs[], char *output) /* itsaMake - Make a suffix array file out of input DNA sequences.. */ { verboseTimeInit(); bits64 maxGenomeSize = 1024LL*1024*1024*4; itsaBaseToValInit(); /* Load all DNA, make sure names are unique, and alphabetize by name. */ struct dnaSeq *seqList = NULL, *seq; struct hash *uniqSeqHash = hashNew(0); bits64 totalDnaSize = 1; /* FOr space between. */ int inputIx; for (inputIx=0; inputIx<inCount; ++inputIx) { char * input = inputs[inputIx]; struct dnaLoad *dl = dnaLoadOpen(input); while ((seq = dnaLoadNext(dl)) != NULL) { verbose(1, "read %s with %d bases\n", seq->name, seq->size); if (hashLookup(uniqSeqHash, seq->name)) errAbort("Input sequence name %s repeated, all must be unique.", seq->name); totalDnaSize += seq->size + 1; if (totalDnaSize > maxGenomeSize) errAbort("Too much DNA. Can only handle up to %lld bases", maxGenomeSize); slAddHead(&seqList, seq); } dnaLoadClose(&dl); } slSort(&seqList, dnaSeqCmpName); verboseTime(1, "Loaded %lld bases in %d sequences", totalDnaSize, slCount(seqList)); /* Allocate big buffer for all DNA. */ DNA *allDna = globalAllDna = needHugeMem(totalDnaSize); allDna[0] = 0; bits64 chromOffset = 1; /* Have zeroes between each chrom, and before and after. */ /* Copy DNA to a single big buffer, and create chromInfo on each sequence. */ struct chromInfo *chrom, *chromList = NULL; for (seq = seqList; seq != NULL; seq = seq->next) { AllocVar(chrom); chrom->name = cloneString(seq->name); chrom->size = seq->size; chrom->offset = chromOffset; slAddHead(&chromList, chrom); toUpperN(seq->dna, seq->size); memcpy(allDna + chromOffset, seq->dna, seq->size + 1); chromOffset += seq->size + 1; } slReverse(&chromList); /* Free up separate dna sequences because we're going to need a lot of RAM soon. */ /* Allocate index array, and offset and list arrays. 
*/ dnaSeqFreeList(&seqList); bits32 *index13; AllocArray(index13, itsaSlotCount); bits32 *offsetArray = needHugeMem(totalDnaSize * sizeof(bits32)); bits32 *listArray = needHugeZeroedMem(totalDnaSize * sizeof(bits32)); verboseTime(1, "Allocated buffers %lld bytes total", (long long)(9LL*totalDnaSize + itsaSlotCount*sizeof(bits32))); /* Where normally we'd keep some sort of structure with a next element to form a list * of matching positions in each slot of our index, to conserve memory we'll do this * with two parallel arrays. Because we're such cheapskates in terms of memory we'll * (and still using 9*genomeSize bytes of RAM) we'll use these arrays for two different * purposes. * In the first phase they will together be used to form linked lists of * offsets, and the 13mer index will point to the first item in each list. In this * phase the offsetArray contains offsets into the allDna structure, and the listArray * contains the next pointers for the list. After the first phase we write out the * suffix array to disk. * In the second phase we read the suffix array back into the offsetArray, and * use the listArray for the traverseArray. We write out the traverse array to finish * things up. */ /* Load up all DNA buffer. */ for (chrom = chromList; chrom != NULL; chrom = chrom->next) { verbose(2, " About to do first pass index\n"); indexChromPass1(chrom, allDna, offsetArray, listArray, index13); verbose(2, " Done first pass index\n"); } verboseTime(1, "Done big bucket sort"); slReverse(&chromList); itsaWriteMerged(chromList, allDna, offsetArray, listArray, index13, output); }
static void alignAll(struct bzp *bzp, struct blatzIndex *indexList, struct dnaLoad *queryDl, char *outFile) /* Make up neighorhood index for queryList, and use it to scan * targetList. Put output in outFile */ { FILE *f = mustOpen(outFile, "w"); struct dnaSeq *query; // LX BEG int b, bend, printing; FILE *bedfp = NULL; // See if bed file output of the mask was requested if (differentString(bzp->dynaBedFileQ, "")) bedfp = mustOpen(bzp->dynaBedFileQ, "w"); // Counts all the query-target hits encountered by the program inside the // loops of gapless.c dynaHits = 0; // Counts how many target and query positions reached the limit dynaCountTarget = 0; dynaCountQuery = 0; // This is the limit used by the program, currently just bzp->dynaLimit(QT) // but should be useful for scaling to sequence size targetHitDLimit = VERY_LARGE_NUMBER; // perhaps unnecessary default queryHitDLimit = VERY_LARGE_NUMBER; // perhaps unnecessary default // LX END while ((query = dnaLoadNext(queryDl)) != NULL) { double bestScore = 0; struct chain *chainList; // LX BEG if (bzp->dynaLimitQ<VERY_LARGE_NUMBER) { queryHitDLimit = bzp->dynaLimitQ; // allocate zeroed memory for hit counters AllocArray(dynaCountQ, query->size); } // LX END if (bzp->unmask || bzp->rna) toUpperN(query->dna, query->size); if (bzp->rna) maskTailPolyA(query->dna, query->size); chainList = blatzAlign(bzp, indexList, query); if (chainList != NULL) bestScore = chainList->score; else { if (seqIsLower(query)) warn("Sequence %s is all lower case, and thus ignored. 
Use -unmask " "flag to unmask lower case sequence.", query->name); } verbose(1, "%s (%d bases) score %2.0f\n", query->name, query->size, bestScore); blatzWriteChains(bzp, &chainList, query, dnaLoadCurStart(queryDl), dnaLoadCurEnd(queryDl), dnaLoadCurSize(queryDl), indexList, f); // LX BEG // This prints the contents of the mask into the .bed file opened above if (bedfp != NULL) { if (bzp->dynaLimitQ<VERY_LARGE_NUMBER) { printing = 0; for (b=0;b<query->size;b++) { if (dynaCountQ[b] > queryHitDLimit) { if (printing == 0) { printing = 1; fprintf(bedfp,"%s %d ",query->name,b); } } if (dynaCountQ[b] <= queryHitDLimit) { if (printing == 1) { printing = 0; bend = b-1; fprintf(bedfp,"%d\n",bend); } } } } else { fprintf(bedfp,"#No dynamic masking data to print.\n"); } } // LX END dnaSeqFree(&query); } // LX BEG // Statistics to print about how many hits were dropped (ignored) dynaDrops = dynaCountTarget + dynaCountQuery; dynaDropsPerc = (float)100*dynaDrops/dynaHits+0.5; verbose(2, "%d dynaDrops (%f%%) at T=%d Q=%d \n", dynaDrops, (double)dynaDropsPerc, targetHitDLimit, queryHitDLimit); // Free dynamic memory used for the sequence-length-dependent counter arrays freeMem(dynaCountQ); if (bedfp != NULL) carefulClose(&bedfp); freeMem(dynaWordCount); // LX END carefulClose(&f); }