void indexInMemorySAIS()
{
    std::cout << "Building index for " << opt::readsFile << " in memory using SAIS\n";

    if(opt::bBuildForward || opt::bBuildReverse)
    {
        // Parse the initial read table
        ReadTable* pRT = new ReadTable(opt::readsFile);

        // Create and write the suffix array for the forward reads
        if(opt::bBuildForward)
        {
            buildIndexForTable(opt::prefix, pRT, false);
        }

        if(opt::bBuildReverse)
        {
            // Reverse all the reads
            pRT->reverseAll();

            // Build the reverse suffix array
            buildIndexForTable(opt::prefix, pRT, true);
        }

        delete pRT;
    }
}
// Extracts junctions from all the SAM hits (based on REF_SKIPs) in the hit file
// and resets the stream when finished.
void get_junctions_from_hits(HitStream& hit_stream,
                             ReadTable& it,
                             JunctionSet& junctions)
{
    HitsForRead curr_hit_group;
    hit_stream.next_read_hits(curr_hit_group);

    uint32_t curr_obs_order = it.observation_order(curr_hit_group.insert_id);

    while(curr_obs_order != 0xFFFFFFFF)
    {
        for (size_t i = 0; i < curr_hit_group.hits.size(); ++i)
        {
            BowtieHit& bh = curr_hit_group.hits[i];
            if (!bh.contiguous())
            {
                junctions_from_alignment(bh, junctions);
            }
        }

        // Advance to the next read's hit group only after every hit in the
        // current group has been examined; advancing inside the loop above
        // would skip hits in multi-alignment groups and spin forever on
        // empty ones.
        hit_stream.next_read_hits(curr_hit_group);
        curr_obs_order = it.observation_order(curr_hit_group.insert_id);
    }

    hit_stream.reset();
}
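//
// Illustrative note (not part of the original source): a hit is
// non-contiguous when its alignment contains a reference skip (the CIGAR
// 'N' operation), which is how spliced aligners encode introns. For
// example, an alignment with CIGAR 50M200N50M spans one junction: the
// left edge of the junction is the last aligned base before the skip and
// the right edge lies 200 reference bases downstream, where the second
// 50M block resumes.
//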
// Initialize a suffix array for the strings in RT
void SuffixArray::initialize(const ReadTable& rt)
{
    size_t n = rt.countSumLengths() + rt.getCount();
    initialize(n, rt.getCount());

    // Fill the data table with the linear ordering of the suffixes
    size_t count = 0;
    for(size_t i = 0; i < rt.getCount(); ++i)
    {
        // + 1 below is for the empty suffix (is it actually needed?)
        for(size_t j = 0; j < rt.getRead(i).seq.length() + 1; ++j)
        {
            m_data[count++] = SAElem(i, j);
        }
    }
}
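//
// Illustrative note (not part of the original source): for a table holding
// R0 = "AC" and R1 = "G", n = 3 + 2 = 5 and the loops above write
//
//   m_data = { (0,0), (0,1), (0,2), (1,0), (1,1) }
//
// where SAElem(i, j) denotes the suffix of read i starting at position j,
// and j == length(read i) is the empty suffix that terminates each read.
//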
void closure_driver(vector<FZPipe>& map1, vector<FZPipe>& map2,
                    ifstream& ref_stream, FILE* juncs_file, FILE* fusions_out)
{
    typedef RefSequenceTable::Sequence Reference;

    ReadTable it;
    RefSequenceTable rt(true);

    BowtieHitFactory hit_factory(it, rt);
    std::set<Fusion> fusions;

    fprintf (stderr, "Finding near-covered motifs...");
    CoverageMapVisitor cov_map_visitor(ref_stream, rt);
    uint32_t coverage_attempts = 0;

    assert(map1.size() == map2.size());
    for (size_t num = 0; num < map1.size(); ++num)
    {
        HitStream left_hs(map1[num].file, &hit_factory, false, true, false);
        HitStream right_hs(map2[num].file, &hit_factory, false, true, false);

        HitsForRead curr_left_hit_group;
        HitsForRead curr_right_hit_group;

        left_hs.next_read_hits(curr_left_hit_group);
        right_hs.next_read_hits(curr_right_hit_group);

        // Each observation order must be derived from its own hit group
        uint32_t curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
        uint32_t curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);

        while(curr_left_obs_order != VMAXINT32 && curr_right_obs_order != VMAXINT32)
        {
            // Advance the left stream while it lags behind the right one
            while (curr_left_obs_order < curr_right_obs_order &&
                   curr_left_obs_order != VMAXINT32 && curr_right_obs_order != VMAXINT32)
            {
                // Get hit group
                left_hs.next_read_hits(curr_left_hit_group);
                curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
            }

            // Advance the right stream while it lags behind the left one
            while (curr_left_obs_order > curr_right_obs_order &&
                   curr_left_obs_order != VMAXINT32 && curr_right_obs_order != VMAXINT32)
            {
                // Get hit group
                right_hs.next_read_hits(curr_right_hit_group);
                curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
            }

            // Process all groups where both mates of the pair are present
            while (curr_left_obs_order == curr_right_obs_order &&
                   curr_left_obs_order != VMAXINT32 && curr_right_obs_order != VMAXINT32)
            {
                if (num == 0)
                    find_fusion_closure(curr_left_hit_group, curr_right_hit_group, fusions);

                if (coverage_attempts++ % 10000 == 0)
                    fprintf (stderr, "Adding covered motifs from pair %u\n", coverage_attempts);

                visit_best_pairing(curr_left_hit_group, curr_right_hit_group, cov_map_visitor);

                left_hs.next_read_hits(curr_left_hit_group);
                curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);

                right_hs.next_read_hits(curr_right_hit_group);
                curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
            }
        }
    }

    cov_map_visitor.finalize();
    fprintf (stderr, "done\n");

    ClosureJunctionSet fwd_splices;
    ClosureJunctionSet rev_splices;

    JunctionMapVisitor junc_map_visitor(fwd_splices, rev_splices, cov_map_visitor.finders);
    fprintf (stderr, "Searching for closures...");

    uint32_t closure_attempts = 0;
    for (size_t num = 0; num < map1.size(); ++num)
    {
        map1[num].rewind();
        map2[num].rewind();

        HitStream left_hs = HitStream(map1[num].file, &hit_factory, false, true, false);
        HitStream right_hs = HitStream(map2[num].file, &hit_factory, false, true, false);

        HitsForRead curr_left_hit_group;
        HitsForRead curr_right_hit_group;

        left_hs.next_read_hits(curr_left_hit_group);
        right_hs.next_read_hits(curr_right_hit_group);

        uint32_t curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
        uint32_t curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);

        while(curr_left_obs_order != VMAXINT32 && curr_right_obs_order != VMAXINT32)
        {
            while (curr_left_obs_order < curr_right_obs_order &&
                   curr_left_obs_order != VMAXINT32 && curr_right_obs_order != VMAXINT32)
            {
                // Get hit group
                left_hs.next_read_hits(curr_left_hit_group);
                curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
            }

            while (curr_left_obs_order > curr_right_obs_order &&
                   curr_left_obs_order != VMAXINT32 && curr_right_obs_order != VMAXINT32)
            {
                // Get hit group
                right_hs.next_read_hits(curr_right_hit_group);
                curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
            }

            while (curr_left_obs_order == curr_right_obs_order &&
                   curr_left_obs_order != VMAXINT32 && curr_right_obs_order != VMAXINT32)
            {
                if (closure_attempts++ % 10000 == 0)
                    fprintf (stderr, "Trying to close pair %u\n", closure_attempts);

                visit_best_pairing(curr_left_hit_group, curr_right_hit_group, junc_map_visitor);

                left_hs.next_read_hits(curr_left_hit_group);
                curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);

                right_hs.next_read_hits(curr_right_hit_group);
                curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
            }
        }
    }

    for (size_t num = 0; num < map1.size(); ++num)
    {
        map1[num].close();
        map2[num].close();
    }

    fprintf(stderr, "%lu Forward strand splices\n", fwd_splices.size());
    fprintf(stderr, "%lu Reverse strand splices\n", rev_splices.size());

    fprintf (stderr, "done\n");
    uint32_t num_potential_splices = 0;

    fprintf (stderr, "Reporting possible junctions...");

    map<uint32_t, pair<JunctionMapVisitor::JunctionTable, JunctionMapVisitor::JunctionTable> >::iterator f_itr;
    f_itr = junc_map_visitor._finders.begin();

    ClosureJunctionSet::iterator j_itr;
    j_itr = fwd_splices.begin();
    while (j_itr != fwd_splices.end())
    {
        fprintf (juncs_file, "%s\t%u\t%u\t%c\n",
                 rt.get_name(j_itr->refid),
                 j_itr->left, j_itr->right, '+');
        ++num_potential_splices;
        ++j_itr;
    }

    j_itr = rev_splices.begin();
    while (j_itr != rev_splices.end())
    {
        fprintf (juncs_file, "%s\t%u\t%u\t%c\n",
                 rt.get_name(j_itr->refid),
                 j_itr->left, j_itr->right, '-');
        ++num_potential_splices;
        ++j_itr;
    }

    //accept_all_best_hits(best_status_for_inserts);
    fprintf(stderr, "done\n");
    fprintf(stderr, "Searched for closures between %d pairs\n", searched);
    fprintf(stderr, "Successfully closed %d pairs\n", closed);
    fprintf(stderr, "Found %u total possible splices\n", num_potential_splices);

    // daehwan
#if 0
    fprintf (stderr, "Reporting potential fusions...\n");
    if (fusions_out)
    {
        for (std::set<Fusion>::iterator itr = fusions.begin(); itr != fusions.end(); ++itr)
        {
            const char* ref_name1 = rt.get_name(itr->refid1);
            const char* ref_name2 = rt.get_name(itr->refid2);

            const char* dir = "";
            if (itr->dir == FUSION_FR)
                dir = "fr";
            else if (itr->dir == FUSION_RF)
                dir = "rf";
            else
                dir = "ff";

            fprintf(fusions_out, "%s\t%d\t%s\t%d\t%s\n",
                    ref_name1, itr->left, ref_name2, itr->right, dir);
        }
        fclose(fusions_out);
    }
    else
    {
        fprintf(stderr, "Failed to open fusions file for writing\n");
    }
#endif
}
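//
// Illustrative sketch (not part of the original source): the nested loops in
// closure_driver() are a two-pointer merge-join over two streams that are
// both sorted by observation order, with VMAXINT32 acting as the
// end-of-stream sentinel. The same pattern on plain sorted vectors, with a
// hypothetical process_pair() standing in for visit_best_pairing():
//
#include <cstdint>
#include <vector>

static const uint32_t STREAM_END = 0xFFFFFFFFu; // plays the role of VMAXINT32

void process_pair(uint32_t obs_order); // hypothetical per-pair visitor

void merge_join_sketch(const std::vector<uint32_t>& left,
                       const std::vector<uint32_t>& right)
{
    size_t l = 0, r = 0;
    uint32_t lv = l < left.size() ? left[l] : STREAM_END;
    uint32_t rv = r < right.size() ? right[r] : STREAM_END;

    while (lv != STREAM_END && rv != STREAM_END)
    {
        if (lv < rv)
            lv = (++l < left.size()) ? left[l] : STREAM_END;   // left lags: advance it
        else if (lv > rv)
            rv = (++r < right.size()) ? right[r] : STREAM_END; // right lags: advance it
        else
        {
            // Both streams are positioned at the same read: visit the pairing,
            // then advance both sides.
            process_pair(lv);
            lv = (++l < left.size()) ? left[l] : STREAM_END;
            rv = (++r < right.size()) ? right[r] : STREAM_END;
        }
    }
}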
//
// Main
//
int overlapLongMain(int argc, char** argv)
{
    parseOverlapLongOptions(argc, argv);

    // Open output file
    std::ostream* pASQGWriter = createWriter(opt::outFile);

    // Build and write the ASQG header
    ASQG::HeaderRecord headerRecord;
    headerRecord.setOverlapTag(opt::minOverlap);
    headerRecord.setErrorRateTag(opt::errorRate);
    headerRecord.setInputFileTag(opt::readsFile);
    headerRecord.setTransitiveTag(true);
    headerRecord.write(*pASQGWriter);

    // Determine which index files to use. If a target file was provided,
    // use the index of the target reads
    std::string indexPrefix;
    if(!opt::targetFile.empty())
        indexPrefix = stripFilename(opt::targetFile);
    else
        indexPrefix = stripFilename(opt::readsFile);

    BWT* pBWT = new BWT(indexPrefix + BWT_EXT, opt::sampleRate);
    SampledSuffixArray* pSSA = new SampledSuffixArray(indexPrefix + SAI_EXT, SSA_FT_SAI);

    Timer* pTimer = new Timer(PROGRAM_IDENT);
    pBWT->printInfo();

    // Read the sequence file and write vertex records for each.
    // Also store the read names in a vector of strings
    ReadTable reads;

    SeqReader* pReader = new SeqReader(opt::readsFile, SRF_NO_VALIDATION);
    SeqRecord record;
    while(pReader->get(record))
    {
        reads.addRead(record.toSeqItem());
        ASQG::VertexRecord vr(record.id, record.seq.toString());
        vr.write(*pASQGWriter);

        if(reads.getCount() % 100000 == 0)
            printf("Read %zu sequences\n", reads.getCount());
    }

    delete pReader;
    pReader = NULL;

    BWTIndexSet index;
    index.pBWT = pBWT;
    index.pSSA = pSSA;
    index.pReadTable = &reads;

    // Make a prefix for the temporary hits files
    size_t n_reads = reads.getCount();

    omp_set_num_threads(opt::numThreads);

#pragma omp parallel for
    for(size_t read_idx = 0; read_idx < n_reads; ++read_idx)
    {
        const SeqItem& curr_read = reads.getRead(read_idx);
        printf("read %s %zubp\n", curr_read.id.c_str(), curr_read.seq.length());
        SequenceOverlapPairVector sopv =
            KmerOverlaps::retrieveMatches(curr_read.seq.toString(),
                                          opt::seedLength,
                                          opt::minOverlap,
                                          1 - opt::errorRate,
                                          100,
                                          index);

        printf("Found %zu matches\n", sopv.size());
        for(size_t i = 0; i < sopv.size(); ++i)
        {
            std::string match_id = reads.getRead(sopv[i].match_idx).id;

            // We only want to output each edge once so skip this overlap
            // if the matched read has a lexicographically lower ID
            if(curr_read.id > match_id)
                continue;

            std::string ao = ascii_overlap(sopv[i].sequence[0], sopv[i].sequence[1],
                                           sopv[i].overlap, 50);
            printf("\t%s\t[%d %d] ID=%s OL=%d PI:%.2lf C=%s\n",
                   ao.c_str(),
                   sopv[i].overlap.match[0].start,
                   sopv[i].overlap.match[0].end,
                   match_id.c_str(),
                   sopv[i].overlap.getOverlapLength(),
                   sopv[i].overlap.getPercentIdentity(),
                   sopv[i].overlap.cigar.c_str());

            // Convert to ASQG
            SeqCoord sc1(sopv[i].overlap.match[0].start, sopv[i].overlap.match[0].end,
                         sopv[i].overlap.length[0]);
            SeqCoord sc2(sopv[i].overlap.match[1].start, sopv[i].overlap.match[1].end,
                         sopv[i].overlap.length[1]);

            // KmerOverlaps returns the coordinates of the overlap after flipping the reads
            // to ensure the strand matches. The ASQG file wants the coordinate of the original
            // sequencing strand. Flip here if necessary
            if(sopv[i].is_reversed)
                sc2.flip();

            // Convert the SequenceOverlap to ASQG's overlap format
            Overlap ovr(curr_read.id, sc1, match_id, sc2, sopv[i].is_reversed, -1);

            ASQG::EdgeRecord er(ovr);
            er.setCigarTag(sopv[i].overlap.cigar);
            er.setPercentIdentityTag(sopv[i].overlap.getPercentIdentity());

#pragma omp critical
            {
                er.write(*pASQGWriter);
            }
        }
    }

    // Cleanup
    delete pReader;
    delete pBWT;
    delete pSSA;
    delete pASQGWriter;
    delete pTimer;

    if(opt::numThreads > 1)
        pthread_exit(NULL);

    return 0;
}
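//
// Illustrative note (not part of the original source): because every overlap
// is discovered twice, once from each endpoint, the `curr_read.id > match_id`
// test above keeps exactly one copy of each edge. E.g. for overlapping reads
// "read_A" and "read_B" (hypothetical IDs), the edge record is written while
// processing "read_A", whose ID is lexicographically smaller, and skipped
// while processing "read_B".
//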
// Compute the initial BWTs for the input file split into blocks of records using the SAIS algorithm
MergeVector computeInitialSAIS(const BWTDiskParameters& parameters)
{
    SeqReader* pReader = new SeqReader(parameters.inFile);
    SeqRecord record;

    int groupID = 0;
    size_t numReadTotal = 0;

    MergeVector mergeVector;
    MergeItem mergeItem;
    mergeItem.start_index = 0;

    // Phase 1: Compute the initial BWTs
    ReadTable* pCurrRT = new ReadTable;
    bool done = false;
    while(!done)
    {
        done = !pReader->get(record);

        if(!done)
        {
            // the read is valid
            SeqItem item = record.toSeqItem();
            if(parameters.bBuildReverse)
                item.seq.reverse();
            pCurrRT->addRead(item);
            ++numReadTotal;
        }

        if(pCurrRT->getCount() >= parameters.numReadsPerBatch || (done && pCurrRT->getCount() > 0))
        {
            // Compute the SA and BWT for this group
            SuffixArray* pSA = new SuffixArray(pCurrRT, 1);

            // Write the BWT to disk
            std::string bwt_temp_filename = makeTempName(parameters.outPrefix, groupID, parameters.bwtExtension);
            pSA->writeBWT(bwt_temp_filename, pCurrRT);

            std::string sai_temp_filename = makeTempName(parameters.outPrefix, groupID, parameters.saiExtension);
            pSA->writeIndex(sai_temp_filename);

            // Push the merge info
            mergeItem.end_index = numReadTotal - 1; // inclusive
            mergeItem.reads_filename = parameters.inFile;
            mergeItem.bwt_filename = bwt_temp_filename;
            mergeItem.sai_filename = sai_temp_filename;
            mergeVector.push_back(mergeItem);

            // Cleanup
            delete pSA;

            // Start the new group
            mergeItem.start_index = numReadTotal;
            ++groupID;
            pCurrRT->clear();
        }
    }

    delete pCurrRT;
    delete pReader;
    return mergeVector;
}
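//
// Illustrative note (not part of the original source, using hypothetical
// numbers): with parameters.numReadsPerBatch = 2000000 and an input of 7.5M
// reads, the loop above emits four merge items covering the inclusive read
// ranges [0, 1999999], [2000000, 3999999], [4000000, 5999999] and
// [6000000, 7499999]; the final, smaller batch is flushed by the
// `done && getCount() > 0` half of the condition.
//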
void best_insert_mappings(uint64_t refid,
                          ReadTable& it,
                          /*const string& name,*/
                          HitList& hits1_in_ref,
                          HitList& hits2_in_ref,
                          BestInsertAlignmentTable& best_status_for_inserts,
                          bool prefer_shorter_pairs)
{
    long chucked_for_shorter_pair = 0;
    std::set<size_t> marked;
    HitList::iterator last_good = hits2_in_ref.begin();

    for (size_t i = 0; i < hits1_in_ref.size(); ++i)
    {
        BowtieHit& h1 = hits1_in_ref[i];

        pair<HitList::iterator, HitList::iterator> range_pair;
        range_pair = equal_range(last_good, hits2_in_ref.end(), h1, hit_insert_id_lt);

        bool found_hit = false;
        if (range_pair.first != range_pair.second)
            last_good = range_pair.first;

        uint32_t obs_order = it.observation_order(h1.insert_id());

        for (HitList::iterator f = range_pair.first; f != range_pair.second; ++f)
        {
            BowtieHit& h2 = *f;

            if (h1.insert_id() == h2.insert_id())
            {
                // max mate inner distance (genomic)
                int min_mate_inner_dist = inner_dist_mean - inner_dist_std_dev;
                if (max_mate_inner_dist == -1)
                {
                    max_mate_inner_dist = inner_dist_mean + inner_dist_std_dev;
                }

                InsertAlignmentGrade s(h1, h2, min_mate_inner_dist, max_mate_inner_dist);

                pair<InsertAlignmentGrade, vector<InsertAlignment> >& insert_best =
                    best_status_for_inserts[obs_order];
                InsertAlignmentGrade& current = insert_best.first;

                // Is the new status better than the current best one?
                if (current < s)
                {
                    insert_best.second.clear();
                    current = s;
                    insert_best.second.push_back(InsertAlignment(refid, &h1, &h2));
                }
                else if (!(s < current))
                {
                    if (prefer_shorter_pairs && current.num_mapped == 2)
                    {
                        pair<int, int> dc = pair_distances(*(insert_best.second[0].left_alignment),
                                                           *(insert_best.second[0].right_alignment));
                        pair<int, int> ds = pair_distances(h1, h2);
                        if (ds.second < dc.second)
                        {
                            chucked_for_shorter_pair += insert_best.second.size();
                            insert_best.second.clear();
                            current = s;
                            insert_best.second.push_back(InsertAlignment(refid, &h1, &h2));
                        }
                    }
                    else
                    {
                        insert_best.second.push_back(InsertAlignment(refid, &h1, &h2));
                    }
                }

                marked.insert(f - hits2_in_ref.begin());
                found_hit = true;
            }
        }

        if (!found_hit)
        {
            pair<InsertAlignmentGrade, vector<InsertAlignment> >& insert_best =
                best_status_for_inserts[obs_order];
            InsertAlignmentGrade& current = insert_best.first;

            InsertAlignmentGrade s(h1);
            if (current < s)
            {
                insert_best.second.clear();
                current = s;
                insert_best.second.push_back(InsertAlignment(refid, &h1, NULL));
            }
            else if (!(s < current))
            {
                insert_best.second.push_back(InsertAlignment(refid, &h1, NULL));
            }
        }
    }

    for (size_t i = 0; i < hits2_in_ref.size(); ++i)
    {
        BowtieHit& h2 = hits2_in_ref[i];
        uint32_t obs_order = it.observation_order(h2.insert_id());

        pair<InsertAlignmentGrade, vector<InsertAlignment> >& insert_best =
            best_status_for_inserts[obs_order];
        InsertAlignmentGrade& current = insert_best.first;

        InsertAlignmentGrade s(h2);

        // If h2 was never paired with an h1 in the loop above, it is a
        // singleton; grade it on its own.
        if (marked.find(i) == marked.end())
        {
            if (current < s)
            {
                insert_best.second.clear();
                current = s;
                insert_best.second.push_back(InsertAlignment(refid, NULL, &h2));
            }
            else if (!(s < current))
            {
                insert_best.second.push_back(InsertAlignment(refid, NULL, &h2));
            }
        }
    }

    fprintf(stderr, "Chucked %ld pairs for shorter pairing of same mates\n",
            chucked_for_shorter_pair);
}
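//
// Illustrative note (not part of the original source): the `last_good`
// iterator above assumes both hit lists are ordered by insert id, which
// equal_range() requires of its input range anyway. Because each search runs
// over the not-yet-visited tail [last_good, hits2_in_ref.end()) rather than
// the whole list, the pairing pass degenerates to a single forward sweep
// over hits2_in_ref instead of a full-width binary search for every h1.
//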
// The algorithm is as follows. We create M BWTs for subsets of
// the input reads. These are created independently and written
// to disk. They are then merged either sequentially or pairwise
// to create the final BWT
void buildBWTDisk(const std::string& in_filename, const std::string& out_prefix,
                  const std::string& bwt_extension, const std::string& sai_extension,
                  bool doReverse, int numThreads, int numReadsPerBatch, int storageLevel)
{
    size_t MAX_READS_PER_GROUP = numReadsPerBatch;

    SeqReader* pReader = new SeqReader(in_filename);
    SeqRecord record;

    int groupID = 0;
    size_t numReadTotal = 0;

    MergeVector mergeVector;
    MergeItem mergeItem;
    mergeItem.start_index = 0;

    // Phase 1: Compute the initial BWTs
    ReadTable* pCurrRT = new ReadTable;
    bool done = false;
    while(!done)
    {
        done = !pReader->get(record);

        if(!done)
        {
            // the read is valid
            SeqItem item = record.toSeqItem();
            if(doReverse)
                item.seq.reverse();
            pCurrRT->addRead(item);
            ++numReadTotal;
        }

        if(pCurrRT->getCount() >= MAX_READS_PER_GROUP || (done && pCurrRT->getCount() > 0))
        {
            // Compute the SA and BWT for this group
            SuffixArray* pSA = new SuffixArray(pCurrRT, numThreads);

            // Write the BWT to disk
            std::string bwt_temp_filename = makeTempName(out_prefix, groupID, bwt_extension);
            pSA->writeBWT(bwt_temp_filename, pCurrRT);

            std::string sai_temp_filename = makeTempName(out_prefix, groupID, sai_extension);
            pSA->writeIndex(sai_temp_filename);

            // Push the merge info
            mergeItem.end_index = numReadTotal - 1; // inclusive
            mergeItem.reads_filename = in_filename;
            mergeItem.bwt_filename = bwt_temp_filename;
            mergeItem.sai_filename = sai_temp_filename;
            mergeVector.push_back(mergeItem);

            // Cleanup
            delete pSA;

            // Start the new group
            mergeItem.start_index = numReadTotal;
            ++groupID;
            pCurrRT->clear();
        }
    }
    delete pCurrRT;
    delete pReader;

    // Phase 2: Pairwise merge the BWTs
    int round = 1;
    MergeVector nextMergeRound;
    while(mergeVector.size() > 1)
    {
        std::cout << "Starting round " << round << "\n";
        pReader = new SeqReader(in_filename);
        for(size_t i = 0; i < mergeVector.size(); i += 2)
        {
            if(i + 1 != mergeVector.size())
            {
                std::string bwt_merged_name = makeTempName(out_prefix, groupID, bwt_extension);
                std::string sai_merged_name = makeTempName(out_prefix, groupID, sai_extension);

                MergeItem item1 = mergeVector[i];
                MergeItem item2 = mergeVector[i+1];

                // Perform the actual merge
                int64_t curr_idx = merge(pReader, item1, item2,
                                         bwt_merged_name, sai_merged_name,
                                         doReverse, numThreads, storageLevel);

                // pReader now points to the end of item1's block of
                // reads. Skip item2's reads
                assert(curr_idx == item2.start_index);
                while(curr_idx <= item2.end_index)
                {
                    bool eof = !pReader->get(record);
                    assert(!eof);
                    (void)eof;
                    ++curr_idx;
                }

                // Create the merged mergeItem to use in the next round
                MergeItem merged;
                merged.start_index = item1.start_index;
                merged.end_index = item2.end_index;
                merged.bwt_filename = bwt_merged_name;
                merged.sai_filename = sai_merged_name;
                nextMergeRound.push_back(merged);

                // Done with the temp files, remove them
                unlink(item1.bwt_filename.c_str());
                unlink(item2.bwt_filename.c_str());
                unlink(item1.sai_filename.c_str());
                unlink(item2.sai_filename.c_str());

                ++groupID;
            }
            else
            {
                // Singleton, pass through to the next round
                nextMergeRound.push_back(mergeVector[i]);
            }
        }
        delete pReader;
        mergeVector.clear();
        mergeVector.swap(nextMergeRound);
        ++round;
    }
    assert(mergeVector.size() == 1);

    // Done, rename the files to their final name
    std::stringstream bwt_ss;
    bwt_ss << out_prefix << bwt_extension << (USE_GZ ? ".gz" : "");
    std::string bwt_final_filename = bwt_ss.str();
    rename(mergeVector.front().bwt_filename.c_str(), bwt_final_filename.c_str());

    std::stringstream sai_ss;
    sai_ss << out_prefix << sai_extension << (USE_GZ ? ".gz" : "");
    std::string sai_final_filename = sai_ss.str();
    rename(mergeVector.front().sai_filename.c_str(), sai_final_filename.c_str());
}