// Extracts junctions from all the SAM hits (based on REF_SKIPs) in the hit file // resets the stream when finished. void get_junctions_from_hits(HitStream& hit_stream, ReadTable& it, JunctionSet& junctions) { HitsForRead curr_hit_group; hit_stream.next_read_hits(curr_hit_group); uint32_t curr_obs_order = it.observation_order(curr_hit_group.insert_id); while(curr_obs_order != 0xFFFFFFFF) { for (size_t i = 0; i < curr_hit_group.hits.size(); ++i) { BowtieHit& bh = curr_hit_group.hits[i]; if (!bh.contiguous()) { junctions_from_alignment(bh, junctions); } hit_stream.next_read_hits(curr_hit_group); curr_obs_order = it.observation_order(curr_hit_group.insert_id); } } hit_stream.reset(); }
void closure_driver(vector<FZPipe>& map1, vector<FZPipe>& map2, ifstream& ref_stream, FILE* juncs_file, FILE* fusions_out) { typedef RefSequenceTable::Sequence Reference; ReadTable it; RefSequenceTable rt(true); BowtieHitFactory hit_factory(it, rt); std::set<Fusion> fusions; fprintf (stderr, "Finding near-covered motifs..."); CoverageMapVisitor cov_map_visitor(ref_stream, rt); uint32_t coverage_attempts = 0; assert(map1.size() == map2.size()); for (size_t num = 0; num < map1.size(); ++num) { HitStream left_hs(map1[num].file, &hit_factory, false, true, false); HitStream right_hs(map2[num].file, &hit_factory, false, true, false); HitsForRead curr_left_hit_group; HitsForRead curr_right_hit_group; left_hs.next_read_hits(curr_left_hit_group); right_hs.next_read_hits(curr_right_hit_group); uint32_t curr_right_obs_order = it.observation_order(curr_left_hit_group.insert_id); uint32_t curr_left_obs_order = it.observation_order(curr_right_hit_group.insert_id); while(curr_left_obs_order != VMAXINT32 && curr_right_obs_order != VMAXINT32) { while (curr_left_obs_order < curr_right_obs_order&& curr_left_obs_order != VMAXINT32 && curr_right_obs_order != VMAXINT32) { // Get hit group left_hs.next_read_hits(curr_left_hit_group); curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id); } while (curr_left_obs_order > curr_right_obs_order && curr_left_obs_order != VMAXINT32 && curr_right_obs_order != VMAXINT32) { // Get hit group right_hs.next_read_hits(curr_right_hit_group); curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id); } while (curr_left_obs_order == curr_right_obs_order && curr_left_obs_order != VMAXINT32 && curr_right_obs_order != VMAXINT32) { if (num == 0) find_fusion_closure(curr_left_hit_group, curr_right_hit_group, fusions); if (coverage_attempts++ % 10000 == 0) fprintf (stderr, "Adding covered motifs from pair %d\n", coverage_attempts); visit_best_pairing(curr_left_hit_group, curr_right_hit_group, cov_map_visitor); left_hs.next_read_hits(curr_left_hit_group); curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id); right_hs.next_read_hits(curr_right_hit_group); curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id); } } } cov_map_visitor.finalize(); fprintf (stderr, "done\n"); ClosureJunctionSet fwd_splices; ClosureJunctionSet rev_splices; JunctionMapVisitor junc_map_visitor(fwd_splices, rev_splices, cov_map_visitor.finders); fprintf (stderr, "Searching for closures..."); uint32_t closure_attempts = 0; for (size_t num = 0; num < map1.size(); ++num) { map1[num].rewind(); map2[num].rewind(); HitStream left_hs = HitStream(map1[num].file, &hit_factory, false, true, false); HitStream right_hs = HitStream(map2[num].file, &hit_factory, false, true, false); HitsForRead curr_left_hit_group; HitsForRead curr_right_hit_group; left_hs.next_read_hits(curr_left_hit_group); right_hs.next_read_hits(curr_right_hit_group); uint32_t curr_right_obs_order = it.observation_order(curr_left_hit_group.insert_id); uint32_t curr_left_obs_order = it.observation_order(curr_right_hit_group.insert_id); while(curr_left_obs_order != VMAXINT32 && curr_right_obs_order != VMAXINT32) { while (curr_left_obs_order < curr_right_obs_order && curr_left_obs_order != VMAXINT32 && curr_right_obs_order != VMAXINT32) { // Get hit group left_hs.next_read_hits(curr_left_hit_group); curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id); } while (curr_left_obs_order > curr_right_obs_order && curr_left_obs_order != VMAXINT32 && curr_right_obs_order != VMAXINT32) { // Get hit group right_hs.next_read_hits(curr_right_hit_group); curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id); } while (curr_left_obs_order == curr_right_obs_order && curr_left_obs_order != VMAXINT32 && curr_right_obs_order != VMAXINT32) { if (closure_attempts++ % 10000 == 0) fprintf (stderr, "Trying to close pair %d\n", closure_attempts); visit_best_pairing(curr_left_hit_group, curr_right_hit_group, junc_map_visitor); left_hs.next_read_hits(curr_left_hit_group); curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id); right_hs.next_read_hits(curr_right_hit_group); curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id); } } } for (size_t num = 0; num < map1.size(); ++num) { map1[num].close(); map2[num].close(); } fprintf(stderr, "%lu Forward strand splices\n", fwd_splices.size()); fprintf(stderr, "%lu Reverse strand splices\n", rev_splices.size()); fprintf (stderr, "done\n"); uint32_t num_potential_splices = 0; fprintf (stderr, "Reporting possible junctions..."); map<uint32_t, pair<JunctionMapVisitor::JunctionTable, JunctionMapVisitor::JunctionTable> >::iterator f_itr; f_itr = junc_map_visitor._finders.begin(); ClosureJunctionSet::iterator j_itr; j_itr = fwd_splices.begin(); while (j_itr != fwd_splices.end()) { fprintf (juncs_file,"%s\t%u\t%u\t%c\n", rt.get_name(j_itr->refid), j_itr->left,j_itr->right,'+'); ++num_potential_splices; ++j_itr; } j_itr = rev_splices.begin(); while (j_itr != rev_splices.end()) { fprintf (juncs_file,"%s\t%u\t%u\t%c\n", rt.get_name(j_itr->refid), j_itr->left,j_itr->right,'-'); ++num_potential_splices; ++j_itr; } //accept_all_best_hits(best_status_for_inserts); fprintf(stderr, "done\n"); fprintf(stderr, "Searched for closures between %d pairs\n", searched); fprintf(stderr, "Successfully closed %d pairs\n", closed); fprintf(stderr, "Found %d total possible splices\n", num_potential_splices); // daehwan #if 0 fprintf (stderr, "Reporting potential fusions...\n"); if(fusions_out){ for(std::set<Fusion>::iterator itr = fusions.begin(); itr != fusions.end(); ++itr){ const char* ref_name1 = rt.get_name(itr->refid1); const char* ref_name2 = rt.get_name(itr->refid2); const char* dir = ""; if (itr->dir == FUSION_FR) dir = "fr"; else if(itr->dir == FUSION_RF) dir = "rf"; else dir = "ff"; fprintf(fusions_out, "%s\t%d\t%s\t%d\t%s\n", ref_name1, itr->left, ref_name2, itr->right, dir); } fclose(fusions_out); }else{ fprintf(stderr, "Failed to open fusions file for writing\n"); } #endif }
void best_insert_mappings(uint64_t refid, ReadTable& it, /*const string& name,*/ HitList& hits1_in_ref, HitList& hits2_in_ref, BestInsertAlignmentTable& best_status_for_inserts, bool prefer_shorter_pairs) { long chucked_for_shorter_pair = 0; std::set<size_t> marked; HitList::iterator last_good = hits2_in_ref.begin(); for (size_t i = 0; i < hits1_in_ref.size(); ++i) { BowtieHit& h1 = hits1_in_ref[i]; pair<HitList::iterator, HitList::iterator> range_pair; range_pair = equal_range(last_good, hits2_in_ref.end(), h1, hit_insert_id_lt); bool found_hit = false; if (range_pair.first != range_pair.second) last_good = range_pair.first; uint32_t obs_order = it.observation_order(h1.insert_id()); for (HitList::iterator f = range_pair.first; f != range_pair.second; ++f) { BowtieHit& h2 = *f; if (h1.insert_id() == h2.insert_id()) { // max mate inner distance (genomic) int min_mate_inner_dist = inner_dist_mean - inner_dist_std_dev; if (max_mate_inner_dist == -1) { max_mate_inner_dist = inner_dist_mean + inner_dist_std_dev; } InsertAlignmentGrade s(h1, h2, min_mate_inner_dist, max_mate_inner_dist); pair<InsertAlignmentGrade, vector<InsertAlignment> >& insert_best = best_status_for_inserts[obs_order]; InsertAlignmentGrade& current = insert_best.first; // Is the new status better than the current best one? if (current < s) { insert_best.second.clear(); current = s; insert_best.second.push_back(InsertAlignment(refid, &h1, &h2)); } else if (!(s < current)) { if (prefer_shorter_pairs && current.num_mapped == 2) { pair<int, int> dc = pair_distances(*(insert_best.second[0].left_alignment), *(insert_best.second[0].right_alignment)); pair<int, int> ds = pair_distances(h1,h2); if (ds.second < dc.second) { chucked_for_shorter_pair += insert_best.second.size(); insert_best.second.clear(); current = s; insert_best.second.push_back(InsertAlignment(refid, &h1, &h2)); } } else { insert_best.second.push_back(InsertAlignment(refid, &h1, &h2)); } } marked.insert(f - hits2_in_ref.begin()); found_hit = true; } } if (!found_hit) { pair<InsertAlignmentGrade, vector<InsertAlignment> >& insert_best = best_status_for_inserts[obs_order]; InsertAlignmentGrade& current = insert_best.first; InsertAlignmentGrade s(h1); if (current < s) { insert_best.second.clear(); current = s; insert_best.second.push_back(InsertAlignment(refid, &h1, NULL)); } else if (! (s < current)) { insert_best.second.push_back(InsertAlignment(refid, &h1, NULL)); } } } for (size_t i = 0; i < hits2_in_ref.size(); ++i) { BowtieHit& h2 = hits2_in_ref[i]; uint32_t obs_order = it.observation_order(h2.insert_id()); pair<InsertAlignmentGrade, vector<InsertAlignment> >& insert_best = best_status_for_inserts[obs_order]; InsertAlignmentGrade& current = insert_best.first; InsertAlignmentGrade s(h2); // Did we include h2 as part of a pairing already, or is this first time // we've seen it? If so, it's a singleton. if (marked.find(i) == marked.end()) { if (current < s) { insert_best.second.clear(); current = s; insert_best.second.push_back(InsertAlignment(refid, NULL, &h2)); } else if (! (s < current)) { insert_best.second.push_back(InsertAlignment(refid, NULL, &h2)); } } } fprintf(stderr, "Chucked %ld pairs for shorter pairing of same mates\n", chucked_for_shorter_pair); }