Пример #1
0
/**
 * Drop edits that fall in the clipped-off high end of a sequence.
 *
 * Walks the edit list from its last element toward its first and counts
 * how many trailing edits must go: an edit is dropped when its position
 * is strictly greater than (len - amt), or when it sits exactly at
 * (len - amt) and is not a read gap.  The walk stops at the first edit
 * that satisfies neither condition, and the list is then shrunk to the
 * surviving prefix.
 *
 * NOTE(review): stopping at the first surviving edit presumably relies on
 * the list being ordered by ascending position - confirm against callers.
 *
 * @param ed  edit list, trimmed in place
 * @param len length of the sequence before clipping (every pos must be < len)
 * @param amt number of high-end positions being clipped (must be <= len)
 */
void Edit::clipHi(EList<Edit>& ed, size_t len, size_t amt) {
	assert_leq(amt, len);
	const size_t boundary = len - amt;
	// Number of leading edits that survive; shrinks as we scan from the back
	size_t keep = ed.size();
	while(keep > 0) {
		const size_t last = keep - 1;
		assert_lt(ed[last].pos, len);
		const bool pastBoundary = ed[last].pos > boundary;
		const bool onBoundaryNonGap =
			!pastBoundary && ed[last].pos == boundary && !ed[last].isReadGap();
		if(!pastBoundary && !onBoundaryNonGap) {
			// First edit (from the end) that is retained; everything before
			// it is retained as well
			break;
		}
		keep = last;
	}
	ed.resize(keep);
}
Пример #2
0
/**
 * Merman main driver function.  Does the following:
 *
 * 1. Parses command-line options
 */
int merman(int argc, char **argv) {
	reset();
	try {
		parseCommandLine(argc, argv);
		Timer tov(cerr, "Overall time: ", timing);
		EList<string> refstrs;
		ReferenceSet refs;
		EList<string> refnames;
		EList<size_t> reflens;
		string refstr = argv[optind++];
		tokenize(refstr, ",", refstrs);
		auto_ptr<MerIndex> ind(
			new MerIndex(ap, rp, readLen, seedWidth, nk.first, nk.second,
			             specificity, begin, naiveCheck, nthreads));
		{
			Timer t(cerr, "... ", timing);
			if(timing) cerr << "Reading reference sequences..." << endl;
			for(size_t i = 0; i < refstrs.size(); i++) {
				if(timing) {
					cerr << "  Sequence " << (i+1) << " of " << refstrs.size() << endl;
				}
				if(refIsStr) {
					refs.addOrigReferenceString(refstrs[i].c_str(), rp);
				} else {
					refs.addOrigReferenceFasta(refstrs[i].c_str(), rp);
				}
			}
			for(size_t i = 0; i < refs.numRefs(); i++) {
				refnames.push_back(string(refs[i].name.toZBuf()));
				reflens.push_back(refs[i].seq.length(color));
			}
			if(refs.numRefs() == 0) {
				cerr << "Warning: No references were found" << endl;
			}
			if(rp.genCrick) {
				if(timing) {
					cerr << "  Crickizing" << endl;
				}
				// Add the crick strand.  If there were bisulfite
				// transformations to the Watson strand, they are
				// removed from the Watson strand before the Crick copy
				// is made.  Transformations are then applied to the
				// new Crick strand.  This has the effect of correctly
				// producing either Watson / Crick in the non-bisulfite
				// case, or BS Watson / BS Crick in the bisulfite case.
				refs.addReferenceRevComps(rp, false, 1, 0);
			}
			if(rp.genRevcomps) {
				if(timing) {
					cerr << "  Adding reverse comps" << endl;
				}
				// Add reverse complements of all existing references
				// (after the transformations have already been
				// applied).
				refs.addReferenceRevComps(rp, true, -1, 1);
			}
			assert(refs.repOk());
		}

		pair<size_t, size_t> mers = make_pair(0, 0);
		EList<MerIndexThread> threads;
		{
			Timer t(cerr, "... ", timing);
			if(timing) cerr << "Preparing to extract sub-sequences..." << endl;
			// Instantiate and run index threads
			assert_gt(nthreads, 0);
			threads.resize(nthreads);
			for(int i = 0; i < nthreads; i++) {
				threads[i].runCount(&refs, ind.get(), i, nthreads, color);
			}
			for(int i = 0; i < nthreads; i++) {
				pair<size_t, size_t> mrs = threads[i].join();
				mers.first += mrs.first;
				mers.second += mrs.second;
			}
			ind->allocateMers();
		}
		if(timing || verbose || justBlowup) {
			cerr << "Expecting index footprint of ";
			printBytes(mers.first * sizeof(mer_ent), cerr);
			cerr << endl;
			if(mers.first > mers.second) {
				cerr.setf(ios::fixed);
				cerr << "  base footprint is ";
				printBytes(mers.second * sizeof(mer_ent), cerr);
				cerr << endl
				     << "  blowup factor: " << setprecision(2) << ((double)mers.first / (double)mers.second) << endl;
			}
			if(justBlowup) throw 0;
		}
		{
			Timer t(cerr, "... ", timing);
			if(timing) cerr << "Extracting index sub-sequences..." << endl;
			// Instantiate and run index threads
			for(int i = 0; i < nthreads; i++) {
				threads[i].runIndex(&refs, ind.get(), i, nthreads, color);
			}
			for(int i = 0; i < nthreads; i++) threads[i].join();
		}
		assert_eq(mers.first, ind->size());
		if(verbose) {
			cout << "  read " << refs.numRefs() << " reference strings" << endl;
		}
		if(refs.empty() && iformat != INPUT_CHAININ) {
			cerr << "Index is empty; not enough reference sequence supplied" << endl;
			throw 1;
		}
		if(refs.numRefs() == 0 && iformat != INPUT_CHAININ) {
			cerr << "No reference strings provided; aborting..." << endl;
			throw 1;
		}
		{
			Timer t(cerr, "Sorting reference mers: ", timing);
			ind->sort(nthreads); // sort mers
		}
		{
			Timer t(cerr, "... ", timing);
			if(timing) cerr << "Aligning reads..." << endl;
			string rstr = argv[optind++];
			// Instantiate reference map, which translates to new reference
			// coordinate system prior to alignment output
			auto_ptr<ReferenceMap> rmap(
				refmapFile == NULL ? NULL : new ReferenceMap(refmapFile, !refidx));
			// Instantiate annotation map, which encodes SNP locations & alleles
			auto_ptr<AnnotationMap> amap(
				annotFile == NULL ? NULL : new AnnotationMap(annotFile));
			// Instantiate the read-input object
			auto_ptr<Reads> rs(
				(iformat == INPUT_CMDLINE) ?
					(Reads*)new StringReads(rstr, begin) :
					((iformat == INPUT_FASTA) ?
						(Reads*)new FastaReads(rstr, begin, bufsz) :
						((iformat == INPUT_FASTA_CONT) ?
							(Reads*)new FastaContinuousReads(
								rstr, begin, fastaContLen,
								fastaContFreq, fcontBis, fcontRc,
								color) :
							((iformat == INPUT_FASTQ) ?
								(Reads*)new FastqReads(rstr, solexaScale, sixty4off, begin, bufsz) :
									((iformat == INPUT_CHAININ) ?
										(Reads*)new ChainReads(rstr, begin, bufsz) :
											((iformat == INPUT_CSFASTA) ?
												(Reads*)new CSFastaReads(rstr, begin, bufsz) :
													((iformat == INPUT_CSFASTA_AND_QV) ?
														(Reads*)new CSFastaAndQVReads(rstr, qualFile, begin, bufsz) :
														(Reads*)new CSFastqReads(rstr, solexaScale, sixty4off, begin, bufsz))))))));
			// Set output stream
			string of = "-";
			if(optind < argc) of = argv[optind++];
			// Instantiate the alignment-output object
			auto_ptr<AlignOutput> outs(
				(oformat == OUTPUT_SAM) ?
					(AlignOutput*)new SamOutput(of, fullref, refidx, rp.bisulfiteC || rp.bisulfiteCpG, !samNoCsCq) :
					(AlignOutput*)new BowtieOutput(of, fullref, printCost, refidx, rp.bisulfiteC || rp.bisulfiteCpG));
			outs->printHeader(refnames, reflens);
			// Run the progress thread, if requested
			ProgressThread proThread;
			if(progress) proThread.run();
			// Instantiate and run search threads
			EList<SearchThread> sthreads;
			sthreads.resize(nthreads);
			for(int i = 0; i < (int)sthreads.size(); i++) {
				sthreads[i].init(
					i, (int)sthreads.size(), ind.get(), rs.get(), &refs,
					outs.get(), rmap.get(), amap.get());
				sthreads[i].run();
			}
			// Wait until search sthreads are finished
			for(size_t i = 0; i < sthreads.size(); i++) {
				sthreads[i].join();
			}
			if(progress) {
				proThread.kill();
				proThread.join();
			}
			outs->flush();
		}
		if(!quiet) ProgressThread::reportStats();
	} catch(exception& e) {
		cerr << "Command: ";
		for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
		cerr << endl;
		return 1;
	} catch(int e) {
		if(e != 0) {
			cerr << "Command: ";
			for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
			cerr << endl;
		}
		return e;
	}
	return 0;
}