void junctions_from_alignment(const BowtieHit& spliced_alignment, JunctionSet& junctions) { vector<pair<Junction, JunctionStats> > juncs; junctions_from_spliced_hit(spliced_alignment, juncs); for (size_t i = 0; i < juncs.size(); ++i) { pair<Junction, JunctionStats>& junc = juncs[i]; JunctionSet::iterator itr = junctions.find(junc.first); if (itr != junctions.end()) { JunctionStats& j = itr->second; j.left_extent = max(j.left_extent, junc.second.left_extent); j.right_extent = max(j.right_extent, junc.second.right_extent); j.min_splice_mms = min(j.min_splice_mms, junc.second.min_splice_mms); j.supporting_hits++; } else { assert(junc.first.refid != 0xFFFFFFFF); junctions[junc.first] = junc.second; } } }
/**
 * Flag every junction in the set as accepted.
 *
 * A notice is written to stderr before the set is walked.
 *
 * @param junctions  Junction set whose entries are all marked accepted.
 * @param refid      Not used by this implementation.
 */
void accept_all_junctions(JunctionSet& junctions, const uint32_t refid)
{
    fprintf(stderr, "Accepting all junctions\n");

    JunctionSet::iterator cursor = junctions.begin();
    while (cursor != junctions.end())
    {
        cursor->second.accepted = true;
        ++cursor;
    }
}
/**
 * Run the junction filters over the whole set.
 *
 * Every (junction, stats) entry is first handed to accept_if_valid(); once
 * all entries have been visited, knockout_shadow_junctions() is applied to
 * the set as a whole.
 *
 * @param junctions  Junction set filtered in place.
 */
void filter_junctions(JunctionSet& junctions)
{
    JunctionSet::iterator entry = junctions.begin();
    while (entry != junctions.end())
    {
        accept_if_valid(entry->first, entry->second);
        ++entry;
    }

    knockout_shadow_junctions(junctions);
}
/**
 * Count the junctions whose key reports `valid() == false` and print the
 * total to stderr.
 *
 * The set itself is not modified.
 *
 * @param junctions  Junction set to inspect.
 */
void validate_junctions(const JunctionSet& junctions)
{
    uint32_t invalid_juncs = 0;
    for (JunctionSet::const_iterator i = junctions.begin(); i != junctions.end(); ++i)
    {
        if (!i->first.valid())
            invalid_juncs++;
    }

    // The counter is unsigned, so it is printed with %u (the cast keeps the
    // argument type in step with the conversion specifier on every platform).
    fprintf(stderr, "Found %u invalid junctions\n",
            static_cast<unsigned>(invalid_juncs));
}
// Walks the hyperedge tree outward from this node, erasing from treeRoots
// the junction of every node reached through an edge (i.e. every node for
// which 'ignored' is non-NULL).  The node on which the traversal starts is
// called with ignored == NULL and therefore keeps its junction.
//
// Returns true if a node that was already marked as visited is reached,
// which means the hyperedge contains a cycle.
//
bool HyperedgeTreeNode::removeOtherJunctionsFrom(HyperedgeTreeEdge *ignored,
        JunctionSet& treeRoots)
{
    if (visited)
    {
        // Seen before: there is a cycle, so stop recursing here.
        return true;
    }

    if (junction && (ignored != NULL))
    {
        // Not the starting node, so its junction is not a tree root.
        treeRoots.erase(junction);
    }
    visited = true;

    bool cycleFound = false;
    std::list<HyperedgeTreeEdge *>::iterator edgeIt = edges.begin();
    while (edgeIt != edges.end())
    {
        HyperedgeTreeEdge *edge = *edgeIt;
        if (edge != ignored)
        {
            // Always recurse; only the accumulated flag depends on the result.
            if (edge->removeOtherJunctionsFrom(this, treeRoots))
            {
                cycleFound = true;
            }
        }
        ++edgeIt;
    }
    return cycleFound;
}
// This method traverses the hyperedge tree and removes from treeRoots any // junction nodes. // void HyperEdgeTreeEdge::removeOtherJunctionsFrom(HyperEdgeTreeNode *ignored, JunctionSet& treeRoots) { if (ends.first && (ends.first != ignored)) { ends.first->removeOtherJunctionsFrom(this, treeRoots); if (ends.first->junction) { treeRoots.erase(ends.first->junction); } } if (ends.second && (ends.second != ignored)) { ends.second->removeOtherJunctionsFrom(this, treeRoots); if (ends.second->junction) { treeRoots.erase(ends.second->junction); } } }
/**
 * Derive the alignment status of a BowtieHit from its CIGAR operations.
 *
 * The hit counts as aligned when it has at least one CIGAR operation.  While
 * walking the operations a running reference position, starting at
 * bh.left(), is advanced by MATCH, DEL and REF_SKIP lengths.  Any INS or DEL
 * clears the indel-free flag.  Each REF_SKIP is turned into a Junction
 * (left = current position, right = left + skip length) and looked up in
 * gtf_junctions; a junction that is not found clears the
 * unannotated-splice-free flag.
 *
 * @param bh             Hit whose CIGAR, edit distance and position are read.
 * @param gtf_junctions  Set of junctions against which ref-skips are checked.
 */
AlignStatus::AlignStatus(const BowtieHit& bh, const JunctionSet& gtf_junctions)
{
    const vector<CigarOp>& ops = bh.cigar();

    _aligned = !ops.empty();
    _indelFreeAlignment = true;
    _unannotatedSpliceFreeAlignment = true;
    _edit_dist = bh.edit_dist();

    int ref_pos = bh.left();
    for (vector<CigarOp>::const_iterator op = ops.begin(); op != ops.end(); ++op)
    {
        switch (op->opcode)
        {
        case REF_SKIP:
        {
            Junction junc;
            junc.refid = bh.ref_id();
            junc.left = ref_pos;
            junc.right = junc.left + op->length;
            junc.antisense = bh.antisense_splice();
            ref_pos += op->length;

            if (gtf_junctions.find(junc) == gtf_junctions.end())
                _unannotatedSpliceFreeAlignment = false;
            break;
        }
        case MATCH:
            ref_pos += op->length;
            break;
        case DEL:
            // A deletion consumes reference bases and counts as an indel.
            _indelFreeAlignment = false;
            ref_pos += op->length;
            break;
        case INS:
            // An insertion does not advance the reference position.
            _indelFreeAlignment = false;
            break;
        default:
            break;
        }
    }
}
void print_junctions(FILE* junctions_out, const JunctionSet& junctions, RefSequenceTable& ref_sequences) { uint64_t junc_id = 1; fprintf(junctions_out, "track name=junctions description=\"TopHat junctions\"\n"); for (JunctionSet::const_iterator i = junctions.begin(); i != junctions.end(); ++i) { const pair<Junction, JunctionStats>& j_itr = *i; const Junction& j = j_itr.first; const JunctionStats& s = j_itr.second; assert(ref_sequences.get_name(j.refid)); //fprintf(stdout,"%d\t%d\t%d\t%c\n", j.refid, j.left, j.right, j.antisense ? '-' : '+'); print_junction(junctions_out, ref_sequences.get_name(j.refid), j, s, junc_id++); } //fprintf(stderr, "Rejected %d / %d alignments, %d / %d spliced\n", rejected, total, rejected_spliced, total_spliced); }
/**
 * Load splice, deletion and insertion coordinate records, then scan the
 * reference FASTA stream three times, handing the records that belong to
 * each reference sequence to print_splice() / print_insertion() with cout
 * as the output stream.
 *
 * Input records are read line by line (at most 2047 characters per fgets
 * call) from every non-NULL FILE* and split on tabs with get_token().  A
 * record that is missing a required field prints an error to stderr and
 * terminates the process with exit(1).
 *
 * The reference stream is rewound (clear + seekg to the beginning) between
 * the passes: pass 1 handles junctions, pass 2 deletions, pass 3 insertions.
 *
 * read_length is not declared in this function; it is presumably a
 * file-level setting -- confirm.
 *
 * @param splice_coords_files     Files holding splice junction records.
 * @param insertion_coords_files  Files holding insertion records.
 * @param deletion_coords_files   Files holding deletion records.
 * @param ref_stream              Reference sequences, read with readMeta()/read()
 *                                using the Fasta() tag.
 */
void driver(const vector<FILE*>& splice_coords_files, const vector<FILE*>& insertion_coords_files, const vector<FILE*>& deletion_coords_files, ifstream& ref_stream)
{
    // One line buffer shared by all three record readers below.
    char splice_buf[2048];
    RefSequenceTable rt(true);
    JunctionSet junctions;
    for (size_t i = 0; i < splice_coords_files.size(); ++i)
    {
        FILE* splice_coords = splice_coords_files[i];
        if (!splice_coords) continue;
        while (fgets(splice_buf, 2048, splice_coords))
        {
            // Strip the trailing newline, if there is one.
            char* nl = strrchr(splice_buf, '\n');
            char* buf = splice_buf;
            if (nl) *nl = 0;
            /**
             Fields are:
             1) reference name
             2) left coord of splice (last char of the left exon)
             3) right coord of splice (first char of the right exon)
             4) orientation ('-' marks an antisense junction)
             */
            char* ref_name = get_token((char**)&buf, "\t");
            char* scan_left_coord = get_token((char**)&buf, "\t");
            char* scan_right_coord = get_token((char**)&buf, "\t");
            char* orientation = get_token((char**)&buf, "\t");
            if (!scan_left_coord || !scan_right_coord || !orientation)
            {
                fprintf(stderr,"Error: malformed splice coordinate record\n");
                exit(1);
            }
            uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
            uint32_t left_coord = atoi(scan_left_coord);
            uint32_t right_coord = atoi(scan_right_coord);
            bool antisense = *orientation == '-';
            // Each junction is stored with default-constructed statistics.
            junctions.insert(make_pair<Junction, JunctionStats>(Junction(ref_id, left_coord, right_coord, antisense), JunctionStats()));
        }
    }
    /*
     * Read in the deletion coordinates
     * and store in a set
     */
    std::set<Deletion> deletions;
    for(size_t i=0; i < deletion_coords_files.size(); ++i){
        FILE* deletion_coords = deletion_coords_files[i];
        if(!deletion_coords){
            continue;
        }
        while (fgets(splice_buf, 2048, deletion_coords))
        {
            char* nl = strrchr(splice_buf, '\n');
            char* buf = splice_buf;
            if (nl) *nl = 0;
            /**
             Fields are:
             1) reference name
             2) left coord of the deletion (position of the first deleted base)
             3) right coord of the deletion
             */
            char* ref_name = get_token((char**)&buf, "\t");
            char* scan_left_coord = get_token((char**)&buf, "\t");
            char* scan_right_coord = get_token((char**)&buf, "\t");
            if (!scan_left_coord || !scan_right_coord)
            {
                fprintf(stderr,"Error: malformed deletion coordinate record\n");
                exit(1);
            }
            /*
             * Note that when reading in a deletion, the left coordinate is the
             * position of the first deleted base. Since we are co-opting the
             * junction data structure, the location is shifted left by one.
             * NOTE(review): left_coord is unsigned, so a left coordinate of 0
             * would wrap around here -- confirm inputs are always >= 1.
             */
            uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
            uint32_t left_coord = atoi(scan_left_coord);
            uint32_t right_coord = atoi(scan_right_coord);
            deletions.insert(Deletion(ref_id, left_coord - 1, right_coord, false));
        }
    }
    /*
     * Read in the insertion coordinates
     * and store in a set
     */
    std::set<Insertion> insertions;
    for(size_t i=0; i < insertion_coords_files.size(); ++i){
        FILE* insertion_coords = insertion_coords_files[i];
        if(!insertion_coords){
            continue;
        }
        while(fgets(splice_buf, 2048, insertion_coords)){
            char* nl = strrchr(splice_buf, '\n');
            char* buf = splice_buf;
            if (nl) *nl = 0;
            // Fields: reference name, left coord, right coord, inserted sequence.
            // The right coord must be present but is not used afterwards.
            char* ref_name = get_token((char**)&buf, "\t");
            char* scan_left_coord = get_token((char**)&buf, "\t");
            char* scan_right_coord = get_token((char**)&buf, "\t");
            char* scan_sequence = get_token((char**)&buf, "\t");
            if (!scan_left_coord || !scan_sequence || !scan_right_coord)
            {
                fprintf(stderr,"Error: malformed insertion coordinate record\n");
                exit(1);
            }
            seqan::Dna5String sequence = seqan::Dna5String(scan_sequence);
            bool containsN = false;
            for(size_t index = 0; index < seqan::length(sequence); index += 1){
                /*
                 * Don't allow any ambiguities in the insertion
                 */
                if(sequence[index] == 'N'){
                    containsN = true;
                    break;
                }
            }
            // Records whose sequence contains an 'N' are skipped silently.
            if(containsN){
                continue;
            }
            seqan::CharString charSequence = sequence;
            uint32_t ref_id = rt.get_id(ref_name,NULL,0);
            uint32_t left_coord = atoi(scan_left_coord);
            insertions.insert(Insertion(ref_id, left_coord, seqan::toCString(charSequence)));
        }
    }
    typedef RefSequenceTable::Sequence Reference;
    // Pass 1: for every reference sequence, emit the junctions on it.
    while(ref_stream.good() && !ref_stream.eof())
    {
        Reference ref_str;
        string name;
        readMeta(ref_stream, name, Fasta());
        // Keep only the part of the header before the first space, tab or CR.
        string::size_type space_pos = name.find_first_of(" \t\r");
        if (space_pos != string::npos)
        {
            name.resize(space_pos);
        }
        read(ref_stream, ref_str, Fasta());
        uint32_t refid = rt.get_id(name, NULL, 0);
        // Sentinel keys used to bracket the entries for this refid; this
        // relies on the Junction ordering, which is defined elsewhere.
        Junction dummy_left(refid, 0, 0, true);
        Junction dummy_right(refid, VMAXINT32, VMAXINT32, true);
        pair<JunctionSet::iterator, JunctionSet::iterator> r;
        r.first = junctions.lower_bound(dummy_left);
        r.second = junctions.upper_bound(dummy_right);
        JunctionSet::iterator itr = r.first;
        while(itr != r.second && itr != junctions.end())
        {
            print_splice(itr->first, read_length, itr->first.antisense ? "GTAG|rev" : "GTAG|fwd", ref_str, name, cout);
            ++itr;
        }
    }
    // Rewind the reference stream for the next pass.
    ref_stream.clear();
    ref_stream.seekg(0, ios::beg);
    // Pass 2: same scan, this time emitting the deletions on each reference.
    while(ref_stream.good() && !ref_stream.eof())
    {
        Reference ref_str;
        string name;
        readMeta(ref_stream, name, Fasta());
        string::size_type space_pos = name.find_first_of(" \t\r");
        if (space_pos != string::npos)
        {
            name.resize(space_pos);
        }
        read(ref_stream, ref_str, Fasta());
        uint32_t refid = rt.get_id(name, NULL,0);
        Deletion dummy_left(refid, 0, 0, true);
        Deletion dummy_right(refid, VMAXINT32, VMAXINT32, true);
        pair<std::set<Deletion>::iterator, std::set<Deletion>::iterator> r;
        r.first = deletions.lower_bound(dummy_left);
        r.second = deletions.upper_bound(dummy_right);
        std::set<Deletion>::iterator itr = r.first;
        while(itr != r.second && itr != deletions.end())
        {
            // Deletions are passed to print_splice as junctions.
            print_splice((Junction)*itr, read_length, itr->antisense ? "del|rev" : "del|fwd", ref_str, name, cout);
            ++itr;
        }
    }
    // Rewind once more for the insertion pass.
    ref_stream.clear();
    ref_stream.seekg(0, ios::beg);
    // Pass 3: emit the insertions on each reference.
    while(ref_stream.good() && !ref_stream.eof())
    {
        Reference ref_str;
        string name;
        readMeta(ref_stream, name, Fasta());
        string::size_type space_pos = name.find_first_of(" \t\r");
        if (space_pos != string::npos)
        {
            name.resize(space_pos);
        }
        read(ref_stream, ref_str, Fasta());
        uint32_t refid = rt.get_id(name, NULL,0);
        Insertion dummy_left(refid, 0, "");
        Insertion dummy_right(refid, VMAXINT32, "");
        std::set<Insertion>::iterator itr = insertions.lower_bound(dummy_left);
        std::set<Insertion>::iterator upper = insertions.upper_bound(dummy_right);
        while(itr != upper && itr != insertions.end()){
            print_insertion(*itr, read_length, ref_str, name, cout);
            ++itr;
        }
    }
}
void driver(const vector<FILE*>& splice_coords_files, const vector<FILE*>& insertion_coords_files, const vector<FILE*>& deletion_coords_files, const vector<FILE*>& fusion_coords_files, ifstream& ref_stream) { char splice_buf[2048]; RefSequenceTable rt(sam_header, true); get_seqs(ref_stream, rt, true); JunctionSet junctions; for (size_t i = 0; i < splice_coords_files.size(); ++i) { FILE* splice_coords = splice_coords_files[i]; if (!splice_coords) continue; while (fgets(splice_buf, 2048, splice_coords)) { char* nl = strrchr(splice_buf, '\n'); char* buf = splice_buf; if (nl) *nl = 0; /** Fields are: 1) reference name 2) left coord of splice (last char of the left exon) 3) right coord of splice (first char of the right exon) */ char* ref_name = get_token((char**)&buf, "\t"); char* scan_left_coord = get_token((char**)&buf, "\t"); char* scan_right_coord = get_token((char**)&buf, "\t"); char* orientation = get_token((char**)&buf, "\t"); if (!scan_left_coord || !scan_right_coord || !orientation) { fprintf(stderr,"Error: malformed splice coordinate record\n"); exit(1); } uint32_t ref_id = rt.get_id(ref_name, NULL, 0); uint32_t left_coord = atoi(scan_left_coord); uint32_t right_coord = atoi(scan_right_coord); bool antisense = *orientation == '-'; junctions.insert(make_pair<Junction, JunctionStats>(Junction(ref_id, left_coord, right_coord, antisense), JunctionStats())); } } /* * Read in the deletion coordinates * and store in a set */ std::set<Deletion> deletions; for(size_t i=0; i < deletion_coords_files.size(); ++i){ FILE* deletion_coords = deletion_coords_files[i]; if(!deletion_coords){ continue; } while (fgets(splice_buf, 2048, deletion_coords)) { char* nl = strrchr(splice_buf, '\n'); char* buf = splice_buf; if (nl) *nl = 0; /** Fields are: 1) reference name 2) left coord of splice (last char of the left exon) 3) right coord of splice (first char of the right exon) */ char* ref_name = get_token((char**)&buf, "\t"); char* scan_left_coord = get_token((char**)&buf, "\t"); 
char* scan_right_coord = get_token((char**)&buf, "\t"); if (!scan_left_coord || !scan_right_coord) { fprintf(stderr,"Error: malformed deletion coordinate record\n"); exit(1); } /* * Note that when reading in a deletion, the left co-ord is the position of the * first deleted based. Since we are co-opting the junction data structure, need * to fix up this location */ uint32_t ref_id = rt.get_id(ref_name, NULL, 0); uint32_t left_coord = atoi(scan_left_coord); uint32_t right_coord = atoi(scan_right_coord); deletions.insert(Deletion(ref_id, left_coord - 1, right_coord, false)); } } /* * Read in the insertion coordinates * and store in a set */ std::set<Insertion> insertions; for(size_t i=0; i < insertion_coords_files.size(); ++i){ FILE* insertion_coords = insertion_coords_files[i]; if(!insertion_coords){ continue; } while(fgets(splice_buf, 2048, insertion_coords)){ char* nl = strrchr(splice_buf, '\n'); char* buf = splice_buf; if (nl) *nl = 0; char* ref_name = get_token((char**)&buf, "\t"); char* scan_left_coord = get_token((char**)&buf, "\t"); char* scan_right_coord = get_token((char**)&buf, "\t"); char* scan_sequence = get_token((char**)&buf, "\t"); if (!scan_left_coord || !scan_sequence || !scan_right_coord) { fprintf(stderr,"Error: malformed insertion coordinate record\n"); exit(1); } seqan::Dna5String sequence = seqan::Dna5String(scan_sequence); bool containsN = false; for(size_t index = 0; index < seqan::length(sequence); index += 1){ /* * Don't allow any ambiguities in the insertion */ if(sequence[index] == 'N'){ containsN = true; break; } } if(containsN){ continue; } seqan::CharString charSequence = sequence; uint32_t ref_id = rt.get_id(ref_name,NULL,0); uint32_t left_coord = atoi(scan_left_coord); insertions.insert(Insertion(ref_id, left_coord, seqan::toCString(charSequence))); } } std::set<Fusion> fusions; for(size_t i=0; i < fusion_coords_files.size(); ++i){ FILE* fusion_coords = fusion_coords_files[i]; if(!fusion_coords){ continue; } while(fgets(splice_buf, 
2048, fusion_coords)){ char* nl = strrchr(splice_buf, '\n'); char* buf = splice_buf; if (nl) *nl = 0; char* ref_name1 = strsep((char**)&buf, "\t"); char* scan_left_coord = strsep((char**)&buf, "\t"); char* ref_name2 = strsep((char**)&buf, "\t"); char* scan_right_coord = strsep((char**)&buf, "\t"); char* scan_dir = strsep((char**)&buf, "\t"); if (!ref_name1 || !scan_left_coord || !ref_name2 || !scan_right_coord || !scan_dir) { fprintf(stderr,"Error: malformed insertion coordinate record\n"); exit(1); } uint32_t ref_id1 = rt.get_id(ref_name1, NULL, 0); uint32_t ref_id2 = rt.get_id(ref_name2, NULL, 0); uint32_t left_coord = atoi(scan_left_coord); uint32_t right_coord = atoi(scan_right_coord); uint32_t dir = FUSION_FF; if (strcmp(scan_dir, "fr") == 0) dir = FUSION_FR; else if(strcmp(scan_dir, "rf") == 0) dir = FUSION_RF; else if(strcmp(scan_dir, "rr") == 0) dir = FUSION_RR; fusions.insert(Fusion(ref_id1, ref_id2, left_coord, right_coord, dir)); } } { JunctionSet::iterator itr = junctions.begin(); for (; itr != junctions.end(); ++itr) { RefSequenceTable::Sequence* ref_str = rt.get_seq(itr->first.refid); if (ref_str == NULL) continue; const char* name = rt.get_name(itr->first.refid); print_splice(itr->first, read_length, itr->first.antisense ? "GTAG|rev" : "GTAG|fwd", *ref_str, name, cout); } } { std::set<Deletion>::iterator itr = deletions.begin(); for (; itr != deletions.end(); ++itr) { RefSequenceTable::Sequence* ref_str = rt.get_seq(itr->refid); if (ref_str == NULL) continue; const char* name = rt.get_name(itr->refid); print_splice((Junction)*itr, read_length, itr->antisense ? 
"del|rev" : "del|fwd", *ref_str, name, cout); } } { std::set<Insertion>::iterator itr = insertions.begin(); for (; itr != insertions.end(); ++itr){ RefSequenceTable::Sequence* ref_str = rt.get_seq(itr->refid); if (ref_str == NULL) continue; const char* name = rt.get_name(itr->refid); print_insertion(*itr, read_length, *ref_str, name, cout); } } { std::set<Fusion>::iterator itr = fusions.begin(); for (; itr != fusions.end(); ++itr){ RefSequenceTable::Sequence* left_ref_str = rt.get_seq(itr->refid1); RefSequenceTable::Sequence* right_ref_str = rt.get_seq(itr->refid2); if (left_ref_str == NULL || right_ref_str == NULL) continue; const char* left_ref_name = rt.get_name(itr->refid1); const char* right_ref_name = rt.get_name(itr->refid2); print_fusion(*itr, read_length, *left_ref_str, *right_ref_str, left_ref_name, right_ref_name, cout); } } }
/**
 * Withdraw acceptance from junctions that are out-supported by a nearby
 * junction on the opposite strand.
 *
 * The distinct reference ids present in the set are collected first.  For
 * each of them, every accepted junction on that reference is compared with
 * the junctions lying between two "fuzzy" keys: copies of the junction with
 * the strand flipped and the left coordinate lowered / right coordinate
 * raised by min_anchor_len.  When such a neighbour is a different entry, is
 * on the opposite strand, passes the distance test below, and has more
 * supporting hits, the junction's `accepted` flag is cleared.
 *
 * min_anchor_len is not declared in this function; it is presumably a
 * file-level setting -- confirm.
 *
 * @param junctions  Junction set whose `accepted` flags are updated in place.
 */
void knockout_shadow_junctions(JunctionSet& junctions)
{
    // Gather the reference ids that occur in the set, without duplicates.
    vector<uint32_t> ref_ids;
    for (JunctionSet::iterator entry = junctions.begin(); entry != junctions.end(); ++entry)
    {
        ref_ids.push_back(entry->first.refid);
    }
    sort(ref_ids.begin(), ref_ids.end());
    ref_ids.erase(unique(ref_ids.begin(), ref_ids.end()), ref_ids.end());

    for (vector<uint32_t>::iterator id = ref_ids.begin(); id != ref_ids.end(); ++id)
    {
        uint32_t refid = *id;

        // Sentinel keys bracketing the junctions of this reference.
        Junction dummy_left(refid, 0, 0, true);
        Junction dummy_right(refid, 0xFFFFFFFF, 0xFFFFFFFF, true);

        JunctionSet::iterator cand = junctions.lower_bound(dummy_left);
        JunctionSet::iterator cand_end = junctions.upper_bound(dummy_right);

        for (; cand != cand_end && cand != junctions.end(); ++cand)
        {
            if (!cand->second.accepted)
                continue;

            // Search window: opposite strand, widened by min_anchor_len.
            Junction fuzzy_left = cand->first;
            Junction fuzzy_right = cand->first;
            fuzzy_left.left -= min_anchor_len;
            fuzzy_right.right += min_anchor_len;
            fuzzy_left.antisense = !cand->first.antisense;
            fuzzy_right.antisense = !cand->first.antisense;

            JunctionSet::iterator other = junctions.lower_bound(fuzzy_left);
            JunctionSet::iterator other_end = junctions.upper_bound(fuzzy_right);

            // Support of the candidate, captured before scanning neighbours.
            int junc_support = cand->second.supporting_hits;

            for (; other != other_end && other != junctions.end(); ++other)
            {
                // NOTE(review): these differences are signed and are not
                // passed through abs(), so any negative difference satisfies
                // the test below -- confirm this is intended.
                int left_diff = cand->first.left - other->first.left;
                int right_diff = cand->first.right - other->first.right;

                bool different_entry = (cand != other);
                bool opposite_strand = (cand->first.antisense != other->first.antisense);
                bool within_anchor = (left_diff < min_anchor_len || right_diff < min_anchor_len);

                if (different_entry && opposite_strand && within_anchor)
                {
                    if (junc_support < other->second.supporting_hits)
                        cand->second.accepted = false;
                }
            }
        }
    }
}