int main(int argc, char *argv[]) { high_resolution_timer timer; const int n = argc > 1 ? atoi(argv[1]) : 1000 * 1000; const char* filename = "fasta.txt"; fasta(n, filename); reverse_complement(filename); regex_dna(filename); high_resolution_timer::duration dur = timer.pulse(); std::cerr << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << std::endl; return 0; }
int CVecScreenApp::Run(void) { int status = BLAST_EXIT_SUCCESS; try { // Allow the fasta reader to complain on invalid sequence input SetDiagPostLevel(eDiag_Warning); const bool kIsProtein(false); /*** Process the command line arguments ***/ const CArgs& args = GetArgs(); const string kDbName(args[kArgDb].AsString()); CRef<CBlastOptionsHandle> opts_hndl(CBlastOptionsFactory::Create(eVecScreen)); /*** Initialize the scope ***/ SDataLoaderConfig dlconfig(kDbName, kIsProtein); dlconfig.OptimizeForWholeLargeSequenceRetrieval(); CBlastInputSourceConfig iconfig(dlconfig); iconfig.SetQueryLocalIdMode(); CRef<CScope> scope = CBlastScopeSource(dlconfig).NewScope(); /*** Initialize the input stream ***/ CBlastFastaInputSource fasta(args[kArgQuery].AsInputFile(), iconfig); CBlastInput input(&fasta, 1); /*** Get the formatting options ***/ const CVecscreenRun::CFormatter::TOutputFormat kFmt = args[kArgOutputFormat].AsInteger(); const bool kHtmlOutput = !args["text_output"].AsBoolean(); /*** Process the input ***/ while ( !input.End() ) { CRef<CBlastQueryVector> query_batch(input.GetNextSeqBatch(*scope)); _ASSERT(query_batch->Size() == 1); CRef<IQueryFactory> queries(new CObjMgr_QueryFactory(*query_batch)); CVecscreenRun vs(CRef<CSeq_loc>(const_cast<CSeq_loc*>(&*query_batch->GetQuerySeqLoc(0))), query_batch->GetScope(0), kDbName); CVecscreenRun::CFormatter vs_format(vs, *scope, kFmt, kHtmlOutput); vs_format.FormatResults(args[kArgOutput].AsOutputFile(), opts_hndl); } } CATCH_ALL(status) return status; }
void Consensus(int id, TrgBuf& trgBuf, CnsBuf& cnsBuf) { TargetData td; trgBuf.pop(&td); std::vector<CnsResult> seqs; el::Loggers::getLogger("Consensus"); while (td.alns.size() > 0) { if (td.alns.size() < popts.minCov) { trgBuf.pop(&td); continue; } boost::format msg("(%d) calling: %s Alignments: %d"); CLOG(INFO, "Consensus") << msg % id % td.alns[0].id % td.alns.size(); AlnGraphBoost ag(td.targSeq); AlnVec alns = td.alns; for (auto it = alns.begin(); it != alns.end(); ++it) { if (it->qstr.length() < popts.minLen) continue; dagcon::Alignment aln = normalizeGaps(*it); // XXX: Shouldn't be needed for dazcon, but causes some infinite // loops in the current consensus code. trimAln(aln, popts.trim); ag.addAln(aln); } CVLOG(3, "Consensus") << "Merging nodes"; ag.mergeNodes(); CVLOG(3, "Consensus") << "Generating consensus"; ag.consensus(seqs, popts.minCov, popts.minLen); for (auto it = seqs.begin(); it != seqs.end(); ++it) { CnsResult result = *it; boost::format fasta(">%s/%d_%d\n%s\n"); fasta % alns[0].id % result.range[0] % result.range[1]; fasta % result.seq; cnsBuf.push(fasta.str()); } trgBuf.pop(&td); } boost::format msg("(%d) ending ..."); CLOG(INFO, "Consensus") << msg % id; // write out a sentinal cnsBuf.push(""); }
void testMarkup() { TwoBitEncoding bin[1024]; std::string fasta(1024, '\0'); BaseLocationVectorType markup; markup = TwoBitSequence::compressSequence(n1, bin); BOOST_CHECK_EQUAL(markup.size(), 1ul); BOOST_CHECK_EQUAL(markup[0].first, 'N'); BOOST_CHECK_EQUAL(markup[0].second, 0ul); TwoBitSequence::uncompressSequence(bin, std::strlen(n1), fasta); TwoBitSequence::applyMarkup(fasta, markup); BOOST_CHECK_EQUAL(n1, fasta); markup = TwoBitSequence::compressSequence(n2, bin); BOOST_CHECK_EQUAL(markup.size(), 2ul); BOOST_CHECK_EQUAL(markup[0].first, 'N'); BOOST_CHECK_EQUAL(markup[0].second, 0ul); BOOST_CHECK_EQUAL(markup[1].first, 'N'); BOOST_CHECK_EQUAL(markup[1].second, 5ul); TwoBitSequence::uncompressSequence(bin, std::strlen(n2), fasta); TwoBitSequence::applyMarkup(fasta, markup); BOOST_CHECK_EQUAL(n2, fasta); markup = TwoBitSequence::compressSequence(n3, bin); BOOST_CHECK_EQUAL(markup.size(), 3ul); BOOST_CHECK_EQUAL(markup[0].first, 'N'); BOOST_CHECK_EQUAL(markup[0].second, 0ul); BOOST_CHECK_EQUAL(markup[1].first, 'N'); BOOST_CHECK_EQUAL(markup[1].second, 5ul); BOOST_CHECK_EQUAL(markup[2].first, 'N'); BOOST_CHECK_EQUAL(markup[2].second, 10ul); TwoBitSequence::uncompressSequence(bin, std::strlen(n3), fasta); TwoBitSequence::applyMarkup(fasta, markup); BOOST_CHECK_EQUAL(n3, fasta); markup = TwoBitSequence::compressSequence(n4, bin); BOOST_CHECK_EQUAL(markup.size(), 4ul); BOOST_CHECK_EQUAL(markup[0].first, 'N'); BOOST_CHECK_EQUAL(markup[0].second, 0ul); BOOST_CHECK_EQUAL(markup[1].first, 'N'); BOOST_CHECK_EQUAL(markup[1].second, 5ul); BOOST_CHECK_EQUAL(markup[2].first, 'N'); BOOST_CHECK_EQUAL(markup[2].second, 10ul); BOOST_CHECK_EQUAL(markup[3].first, 'N'); BOOST_CHECK_EQUAL(markup[3].second, 15ul); TwoBitSequence::uncompressSequence(bin, std::strlen(n4), fasta); TwoBitSequence::applyMarkup(fasta, markup); BOOST_CHECK_EQUAL(n4, fasta); markup = TwoBitSequence::compressSequence(n5, bin); BOOST_CHECK_EQUAL(markup.size(), 5ul); BOOST_CHECK_EQUAL(markup[0].first, 'N'); BOOST_CHECK_EQUAL(markup[0].second, 0ul); BOOST_CHECK_EQUAL(markup[1].first, 'N'); BOOST_CHECK_EQUAL(markup[1].second, 5ul); BOOST_CHECK_EQUAL(markup[2].first, 'N'); BOOST_CHECK_EQUAL(markup[2].second, 10ul); BOOST_CHECK_EQUAL(markup[3].first, 'N'); BOOST_CHECK_EQUAL(markup[3].second, 15ul); BOOST_CHECK_EQUAL(markup[4].first, 'N'); BOOST_CHECK_EQUAL(markup[4].second, 23ul); TwoBitSequence::uncompressSequence(bin, std::strlen(n5), fasta); TwoBitSequence::applyMarkup(fasta, markup); BOOST_CHECK_EQUAL(n5, fasta); }
/** * cactusMerge.cpp: merge two pairs of c2h and FASTA files into one pair. The * files must be star trees with the same root sequence. */ int main( int argc, char** argv ) { // Register ctrl+c handler. See // <http://www.yolinux.com/TUTORIALS/C++Signals.html> signal(SIGINT, stacktraceOnSignal); // Register segfaults with the stack trace handler signal(SIGSEGV, stacktraceOnSignal); // Parse options with boost::programOptions. See // <http://www.radmangames.com/programming/how-to-use-boost-program_options> std::string appDescription = std::string("Merge c2h/FASTA file pairs.\n" "Usage: cactusMerge <c2hOut> <fastaOut> --c2h <c2h files...> " "--fasta <fasta files...> --suffix <suffixes...>"); // Make an options description for our program's options. boost::program_options::options_description description("Options"); // Add all the options description.add_options() ("help", "Print help messages") ("c2h", boost::program_options::value<std::vector<std::string>>(), "List of c2h files to merge") ("fasta", boost::program_options::value<std::vector<std::string>>(), "List of FASTA files for the given c2h files") ("suffix", boost::program_options::value<std::vector<std::string>>(), "List of suffixes to add on to event names") ("mergeOn", boost::program_options::value<std::string>()->required(), "An event on which to merge the files") ("c2hOut", boost::program_options::value<std::string>()->required(), "File to save .c2h-format alignment in") ("fastaOut", boost::program_options::value<std::string>()->required(), "File in which to save FASTA records for building HAL from .c2h"); // And set up our positional arguments boost::program_options::positional_options_description positionals; positionals.add("mergeOn", 1); positionals.add("c2hOut", 1); positionals.add("fastaOut", 1); // Add a variables map to hold option variables. boost::program_options::variables_map options; try { // Parse options into the variable map, or throw an error if there's // something wring with them. boost::program_options::store( // Build the command line parser. boost::program_options::command_line_parser(argc, argv) .options(description) .positional(positionals) .run(), options); boost::program_options::notify(options); if(options.count("help")) { // The help option was given. Print program help. std::cout << appDescription << std::endl; std::cout << description << std::endl; // Don't do the actual program. return 0; } if(!options.count("mergeOn") || !options.count("c2h") || !options.count("fasta")) { // We need both of these throw boost::program_options::error("Missing important arguments!"); } if(options["c2h"].as<std::vector<std::string>>().size() != options["fasta"].as<std::vector<std::string>>().size()) { // Counts need to match up here, because these are pairs throw boost::program_options::error( "c2h/fasta counts don't match!"); } if(options.count("suffix") && options["c2h"].as<std::vector<std::string>>().size() != options["suffix"].as<std::vector<std::string>>().size()) { // If we have any suffixes we must have the right number throw boost::program_options::error( "c2h/suffix counts don't match!"); } } catch(boost::program_options::error& error) { // Something is bad about our options. Complain on stderr std::cerr << "Option parsing error: " << error.what() << std::endl; std::cerr << std::endl; // Talk about our app. std::cerr << appDescription << std::endl; // Show all the actually available options. std::cerr << description << std::endl; // Stop the program. return -1; } // If we get here, we have the right arguments. // Make a list of the c2h files to use std::vector<std::string> c2hFiles( options["c2h"].as<std::vector<std::string>>()); // This holds the suffix applied to all the top sequences and events in each // file. std::vector<std::string> suffixes( options["suffix"].as<std::vector<std::string>>()); // Make a list of the FASTA files to use std::vector<std::string> fastaFiles( options["fasta"].as<std::vector<std::string>>()); // This will hold all of the renames that have to happen for each file. // These are generated when we go through the file by renaming top and // bottom sequences with suffixes. std::vector<std::map<std::string, std::string>> renames; for(size_t i = 0; i < c2hFiles.size(); i++) { // Make sure it has an empty map of renames for each file. renames.push_back(std::map<std::string, std::string>()); } // This will hold the event names for the c2h files in order std::vector<std::string> eventNames; // And this will hold the sequence names std::vector<std::string> sequenceNames; // This will hold bottom (1) and top (0) flags for each sequence. std::vector<bool> isBottom; // And this will hold the sequence lengths std::vector<size_t> sequenceLengths; // This will hold the first sequence number for any event and sequence name std::map<std::pair<std::string, std::string>, size_t> firstSequenceNumber; // Holds Merge structs to be executed later. std::vector<C2hMerge> merges; // We're going to throw out all of the events that are old rootSeqs, and // just keep the actual leaves. This holds the list of renamed event names // we are keeping. std::set<std::string> eventsToKeep; for(size_t fileIndex = 0; fileIndex < c2hFiles.size(); fileIndex++) { // Scan through the c2h files to get the event, sequence, and length of // each thread, and to collect merges. Log::output() << "Reading alignment " << c2hFiles[fileIndex] << std::endl; // Open the file std::ifstream c2h(c2hFiles[fileIndex]); // This maps block name to (sequence number, start location) pairs for // this file. We use it to compose merges for our list in global // sequence number space. std::map<size_t, std::pair<size_t, size_t>> nameMap; for(std::string line; std::getline(c2h, line);) { // This is a new sequence. Split it up on \t. std::vector<std::string> parts; boost::split(parts, line, boost::is_any_of("\t")); if(parts.size() < 1) { // Skip lines that have nothing on them. continue; } // For each line if(parts[0] == "s") { // It's a sequence line. Start a new squence. if(parts.size() != 4) { // Not the right number of fields. throw std::runtime_error( std::string("Invalid field count in ") + line); } // Grab the parts std::string eventName = unquote(parts[1]); std::string sequenceName = unquote(parts[2]); bool bottomFlag = std::stoi(parts[3]); Log::info() << "Read sequence " << eventName << "." << sequenceName << (bottomFlag ? " (bottom)" : " (top)") << std::endl; if(eventName != options["mergeOn"].as<std::string>()) { // We aren't merging on this sequence, so we may have to // apply a suffix. // We need to rename this event (possibly to the same thing) renames[fileIndex][eventName] = eventName + suffixes[fileIndex]; if(bottomFlag) { // All the bottom events (that aren't being merged // on) need to be renamed apart manually since the names // may be reused. renames[fileIndex][eventName] += "-" + std::to_string(fileIndex); } eventName = renames[fileIndex][eventName]; if(!bottomFlag) { // Keep this event when we do our final output. eventsToKeep.insert(eventName); } // And the sequence renames[fileIndex][sequenceName] = sequenceName + suffixes[fileIndex]; if(bottomFlag) { // All the bottom sequences (that aren't being merged // on) need to be renamed apart manually since the names // may be reused. renames[fileIndex][sequenceName] += "-" + std::to_string(fileIndex); } sequenceName = renames[fileIndex][sequenceName]; Log::info() << "Canonical name: " << eventName << "." << sequenceName << std::endl; } else { // If we are going to merge on it, we keep its name the same // and then later we just make one thread for that name. We // do definitely need it in the output though. eventsToKeep.insert(eventName); } // Save the names eventNames.push_back(eventName); sequenceNames.push_back(sequenceName); // Save the bottomness flag. isBottom.push_back(bottomFlag); // Initialize the total length to 0 sequenceLengths.push_back(0); auto namePair = std::make_pair(eventName, sequenceName); if(!firstSequenceNumber.count(namePair)) { // This is the first time we have seen a sequence // for this event and sequence name. Everything should // merge against this one thread and not make more // threads. // If this is the mergeOn event, we'll only make this // once across all the files. // Later instances of this event and sequence name should // redirect here. firstSequenceNumber[namePair] = sequenceNames.size() - 1; Log::info() << "This is the first time we have seen " "this sequence." << std::endl; } } else if(parts[0] == "a") { // This is an alignment block if(sequenceNames.size() == 0) { throw std::runtime_error( "Found alignmet block before sequence"); } // Which sequence are we working on? size_t sequenceNumber = sequenceNames.size() - 1; if(isBottom[sequenceNumber]) { // Parse it as a bottom block: "a" name start length if(parts.size() != 4) { // Not the right number of fields. throw std::runtime_error( std::string("Invalid field count in ") + line); } size_t blockName = std::stoll(parts[1]); size_t blockStart = std::stoll(parts[2]); size_t blockLength = std::stoll(parts[3]); // Look up the sequence number we actually want to merge // against when we come get this block. auto namePair = std::make_pair(eventNames[sequenceNumber], sequenceNames[sequenceNumber]); size_t mergeSequenceNumber = firstSequenceNumber[namePair]; // We need to associate the block name with the thread // number for the sequence we want it to merge into, and the // start location it specifies, for merging later. nameMap[blockName] = std::make_pair(mergeSequenceNumber, blockStart); Log::debug() << "Bottom block " << blockName << " is " << blockStart << " on sequence " << mergeSequenceNumber << std::endl; // Also record the additional length on this sequence sequenceLengths[sequenceNumber] += blockLength; } else { // Parse it as a top block: // "a" start length [name orientation] if(parts.size() < 3) { // Not the right number of fields. throw std::runtime_error( std::string("Invalid field count in ") + line); } // Parse out the start and length size_t segmentStart = std::stoll(parts[1]); size_t segmentLength = std::stoll(parts[2]); // Add in the length sequenceLengths[sequenceNumber] += segmentLength; if(parts.size() == 5) { // If it has a name and orientation, remember a merge. size_t blockName = std::stoll(parts[3]); bool orientation = std::stoi(parts[4]); // Get the sequence number that canonically represents // all sequences with this event/sequence name // combination. auto namePair = std::make_pair( eventNames[sequenceNumber], sequenceNames[sequenceNumber]); size_t mergeSequenceNumber = firstSequenceNumber[ namePair]; // Make a merge and populate it with everything we can // get from this segment. C2hMerge merge; merge.sequence1 = mergeSequenceNumber; merge.start1 = segmentStart; merge.length = segmentLength; // TODO: error-check length merge.orientation = orientation; // Grab the info from the bottom segment we are talking // about earlier in this file. merge.sequence2 = nameMap[blockName].first; merge.start2 = nameMap[blockName].second; Log::debug() << "Going to merge " << segmentStart << " length " << segmentLength << " to " << blockName << " orientation " << orientation << std::endl; // Save the merge for doing later. merges.push_back(merge); } } } } } // Make a thread set with all those threads stPinchThreadSet* threadSet = stPinchThreadSet_construct(); // Make all the threads. Be 1-based internally since the serialization code // wants that. for(size_t i = 0; i < sequenceLengths.size(); i++) { auto namePair = std::make_pair(eventNames[i], sequenceNames[i]); if(firstSequenceNumber[namePair] != i) { // This sequence is not the first; it is getting merged into another // one that is the same length and structure and name (i.e. it // appears in two files). Don't make a thread for it. continue; } // Make threads for all the top sequences and the first bottom sequence // for every event and sequence name pair. stPinchThreadSet_addThread(threadSet, i, 1, sequenceLengths[i]); } for(auto merge : merges) { // Apply all the merges, converting merges to 1-based stPinchThread_pinch( stPinchThreadSet_getThread(threadSet, merge.sequence1), stPinchThreadSet_getThread(threadSet, merge.sequence2), merge.start1 + 1, merge.start2 + 1, merge.length, merge.orientation); Log::trace() << "Applied merge between threads " << merge.sequence1 << ":" << merge.start1 << "-" << merge.start1 + merge.length << " and " << merge.sequence2 << ":" << merge.start2 << "-" << merge.start2 + merge.length << " orientation " << merge.orientation << std::endl; } // Write out a new c2h file, with a new rootSeq. size_t newRootLength = writeAlignment(threadSet, sequenceNames, eventNames, options["c2hOut"].as<std::string>(), &eventsToKeep); // Clean up thread set. stPinchThreadSet_destruct(threadSet); // Merge the FASTAs, applying any renaming that needs to happen. // We'll do the FASTA output ourselves. Open the file. std::ofstream fastaOut(options["fastaOut"].as<std::string>()); // Write the newly synthesized rootSeq. TODO: unify with writeAlignmentFasta // by moving support for renames over there. fastaOut << ">rootSeq" << std::endl; for(size_t i = 0; i < newRootLength; i++) { // Write an n for every base fastaOut << "N"; } fastaOut << std::endl; // This holds the IDs of all the sequences we already wrote. Only write // sequences if they aren't duplicates after renaming (which is how we // deduplicate the shared root) std::unordered_set<std::string> alreadyWritten; for(size_t fileIndex = 0; fileIndex < fastaFiles.size(); fileIndex++) { // Open up the FASTA for reading Fasta fasta(fastaFiles[fileIndex]); Log::info() << "Copying over FASTA records from " << fastaFiles[fileIndex] << std::endl; while(fasta.hasNext()) { // Go through all the FASTA records. // TODO: assumes FASTA headers have nothing but IDs. std::pair<std::string, std::string> record = fasta.getNextRecord(); if(renames[fileIndex].count(record.first)) { // Rename them if necessary record.first = renames[fileIndex][record.first]; } if(!eventsToKeep.count(record.first)) { // This event wasn't on the list of events to actually output, // so don't output it. Log::info() << "Skipped event " << record.first << std::endl; continue; } if(!alreadyWritten.count(record.first)) { // Save the record to the output FASTA file. fastaOut << ">" << record.first << std::endl << record.second << std::endl; // Remember that we have written a record by this name. alreadyWritten.insert(record.first); } } } fastaOut.close(); // Now we're done! return 0; }