/**
 * Flags every junction in the set as accepted.
 *
 * A notice is written to stderr before the set is walked.
 *
 * @param junctions  Junction set whose per-junction stats are updated in place.
 * @param refid      Not used by this function.
 */
void accept_all_junctions(JunctionSet& junctions, const uint32_t refid)
{
    fprintf(stderr, "Accepting all junctions\n");

    JunctionSet::iterator cur = junctions.begin();
    const JunctionSet::iterator last = junctions.end();
    while (cur != last)
    {
        cur->second.accepted = true;
        ++cur;
    }
}
/**
 * Runs the two-stage junction filter over the whole set.
 *
 * First every (junction, stats) entry is handed to accept_if_valid(); once
 * all entries have been visited, knockout_shadow_junctions() is applied to
 * the set as a whole.
 *
 * @param junctions  Junction set to filter; its stats are updated in place.
 */
void filter_junctions(JunctionSet& junctions)
{
    JunctionSet::iterator cur = junctions.begin();
    while (cur != junctions.end())
    {
        accept_if_valid(cur->first, cur->second);
        ++cur;
    }

    knockout_shadow_junctions(junctions);
}
/**
 * Counts the junctions whose key reports itself as invalid (valid() returns
 * false) and writes the total to stderr. The set is not modified.
 *
 * @param junctions  Junction set to inspect.
 */
void validate_junctions(const JunctionSet& junctions)
{
    uint32_t invalid_juncs = 0;
    for (JunctionSet::const_iterator i = junctions.begin(); i != junctions.end(); ++i)
    {
        if (!i->first.valid())
            ++invalid_juncs;
    }

    // The counter is unsigned, so it is printed with %u (it was previously
    // passed to a signed %d conversion).
    fprintf(stderr, "Found %u invalid junctions\n",
            static_cast<unsigned int>(invalid_juncs));
}
void print_junctions(FILE* junctions_out, const JunctionSet& junctions, RefSequenceTable& ref_sequences) { uint64_t junc_id = 1; fprintf(junctions_out, "track name=junctions description=\"TopHat junctions\"\n"); for (JunctionSet::const_iterator i = junctions.begin(); i != junctions.end(); ++i) { const pair<Junction, JunctionStats>& j_itr = *i; const Junction& j = j_itr.first; const JunctionStats& s = j_itr.second; assert(ref_sequences.get_name(j.refid)); //fprintf(stdout,"%d\t%d\t%d\t%c\n", j.refid, j.left, j.right, j.antisense ? '-' : '+'); print_junction(junctions_out, ref_sequences.get_name(j.refid), j, s, junc_id++); } //fprintf(stderr, "Rejected %d / %d alignments, %d / %d spliced\n", rejected, total, rejected_spliced, total_spliced); }
void driver(const vector<FILE*>& splice_coords_files, const vector<FILE*>& insertion_coords_files, const vector<FILE*>& deletion_coords_files, const vector<FILE*>& fusion_coords_files, ifstream& ref_stream) { char splice_buf[2048]; RefSequenceTable rt(sam_header, true); get_seqs(ref_stream, rt, true); JunctionSet junctions; for (size_t i = 0; i < splice_coords_files.size(); ++i) { FILE* splice_coords = splice_coords_files[i]; if (!splice_coords) continue; while (fgets(splice_buf, 2048, splice_coords)) { char* nl = strrchr(splice_buf, '\n'); char* buf = splice_buf; if (nl) *nl = 0; /** Fields are: 1) reference name 2) left coord of splice (last char of the left exon) 3) right coord of splice (first char of the right exon) */ char* ref_name = get_token((char**)&buf, "\t"); char* scan_left_coord = get_token((char**)&buf, "\t"); char* scan_right_coord = get_token((char**)&buf, "\t"); char* orientation = get_token((char**)&buf, "\t"); if (!scan_left_coord || !scan_right_coord || !orientation) { fprintf(stderr,"Error: malformed splice coordinate record\n"); exit(1); } uint32_t ref_id = rt.get_id(ref_name, NULL, 0); uint32_t left_coord = atoi(scan_left_coord); uint32_t right_coord = atoi(scan_right_coord); bool antisense = *orientation == '-'; junctions.insert(make_pair<Junction, JunctionStats>(Junction(ref_id, left_coord, right_coord, antisense), JunctionStats())); } } /* * Read in the deletion coordinates * and store in a set */ std::set<Deletion> deletions; for(size_t i=0; i < deletion_coords_files.size(); ++i){ FILE* deletion_coords = deletion_coords_files[i]; if(!deletion_coords){ continue; } while (fgets(splice_buf, 2048, deletion_coords)) { char* nl = strrchr(splice_buf, '\n'); char* buf = splice_buf; if (nl) *nl = 0; /** Fields are: 1) reference name 2) left coord of splice (last char of the left exon) 3) right coord of splice (first char of the right exon) */ char* ref_name = get_token((char**)&buf, "\t"); char* scan_left_coord = get_token((char**)&buf, "\t"); 
char* scan_right_coord = get_token((char**)&buf, "\t"); if (!scan_left_coord || !scan_right_coord) { fprintf(stderr,"Error: malformed deletion coordinate record\n"); exit(1); } /* * Note that when reading in a deletion, the left co-ord is the position of the * first deleted based. Since we are co-opting the junction data structure, need * to fix up this location */ uint32_t ref_id = rt.get_id(ref_name, NULL, 0); uint32_t left_coord = atoi(scan_left_coord); uint32_t right_coord = atoi(scan_right_coord); deletions.insert(Deletion(ref_id, left_coord - 1, right_coord, false)); } } /* * Read in the insertion coordinates * and store in a set */ std::set<Insertion> insertions; for(size_t i=0; i < insertion_coords_files.size(); ++i){ FILE* insertion_coords = insertion_coords_files[i]; if(!insertion_coords){ continue; } while(fgets(splice_buf, 2048, insertion_coords)){ char* nl = strrchr(splice_buf, '\n'); char* buf = splice_buf; if (nl) *nl = 0; char* ref_name = get_token((char**)&buf, "\t"); char* scan_left_coord = get_token((char**)&buf, "\t"); char* scan_right_coord = get_token((char**)&buf, "\t"); char* scan_sequence = get_token((char**)&buf, "\t"); if (!scan_left_coord || !scan_sequence || !scan_right_coord) { fprintf(stderr,"Error: malformed insertion coordinate record\n"); exit(1); } seqan::Dna5String sequence = seqan::Dna5String(scan_sequence); bool containsN = false; for(size_t index = 0; index < seqan::length(sequence); index += 1){ /* * Don't allow any ambiguities in the insertion */ if(sequence[index] == 'N'){ containsN = true; break; } } if(containsN){ continue; } seqan::CharString charSequence = sequence; uint32_t ref_id = rt.get_id(ref_name,NULL,0); uint32_t left_coord = atoi(scan_left_coord); insertions.insert(Insertion(ref_id, left_coord, seqan::toCString(charSequence))); } } std::set<Fusion> fusions; for(size_t i=0; i < fusion_coords_files.size(); ++i){ FILE* fusion_coords = fusion_coords_files[i]; if(!fusion_coords){ continue; } while(fgets(splice_buf, 
2048, fusion_coords)){ char* nl = strrchr(splice_buf, '\n'); char* buf = splice_buf; if (nl) *nl = 0; char* ref_name1 = strsep((char**)&buf, "\t"); char* scan_left_coord = strsep((char**)&buf, "\t"); char* ref_name2 = strsep((char**)&buf, "\t"); char* scan_right_coord = strsep((char**)&buf, "\t"); char* scan_dir = strsep((char**)&buf, "\t"); if (!ref_name1 || !scan_left_coord || !ref_name2 || !scan_right_coord || !scan_dir) { fprintf(stderr,"Error: malformed insertion coordinate record\n"); exit(1); } uint32_t ref_id1 = rt.get_id(ref_name1, NULL, 0); uint32_t ref_id2 = rt.get_id(ref_name2, NULL, 0); uint32_t left_coord = atoi(scan_left_coord); uint32_t right_coord = atoi(scan_right_coord); uint32_t dir = FUSION_FF; if (strcmp(scan_dir, "fr") == 0) dir = FUSION_FR; else if(strcmp(scan_dir, "rf") == 0) dir = FUSION_RF; else if(strcmp(scan_dir, "rr") == 0) dir = FUSION_RR; fusions.insert(Fusion(ref_id1, ref_id2, left_coord, right_coord, dir)); } } { JunctionSet::iterator itr = junctions.begin(); for (; itr != junctions.end(); ++itr) { RefSequenceTable::Sequence* ref_str = rt.get_seq(itr->first.refid); if (ref_str == NULL) continue; const char* name = rt.get_name(itr->first.refid); print_splice(itr->first, read_length, itr->first.antisense ? "GTAG|rev" : "GTAG|fwd", *ref_str, name, cout); } } { std::set<Deletion>::iterator itr = deletions.begin(); for (; itr != deletions.end(); ++itr) { RefSequenceTable::Sequence* ref_str = rt.get_seq(itr->refid); if (ref_str == NULL) continue; const char* name = rt.get_name(itr->refid); print_splice((Junction)*itr, read_length, itr->antisense ? 
"del|rev" : "del|fwd", *ref_str, name, cout); } } { std::set<Insertion>::iterator itr = insertions.begin(); for (; itr != insertions.end(); ++itr){ RefSequenceTable::Sequence* ref_str = rt.get_seq(itr->refid); if (ref_str == NULL) continue; const char* name = rt.get_name(itr->refid); print_insertion(*itr, read_length, *ref_str, name, cout); } } { std::set<Fusion>::iterator itr = fusions.begin(); for (; itr != fusions.end(); ++itr){ RefSequenceTable::Sequence* left_ref_str = rt.get_seq(itr->refid1); RefSequenceTable::Sequence* right_ref_str = rt.get_seq(itr->refid2); if (left_ref_str == NULL || right_ref_str == NULL) continue; const char* left_ref_name = rt.get_name(itr->refid1); const char* right_ref_name = rt.get_name(itr->refid2); print_fusion(*itr, read_length, *left_ref_str, *right_ref_str, left_ref_name, right_ref_name, cout); } } }
/**
 * Un-accepts junctions that are overshadowed by a better supported junction
 * on the opposite strand.
 *
 * For every accepted junction, the set is searched for nearby junctions with
 * the opposite antisense flag. The search window runs from the junction's
 * left coordinate minus min_anchor_len to its right coordinate plus
 * min_anchor_len. If such a neighbour differs by less than min_anchor_len in
 * its left or right coordinate and has strictly more supporting hits, the
 * junction's accepted flag is cleared. Nothing is removed from the set.
 *
 * NOTE(review): the window lookups rely on the JunctionSet key ordering
 * (presumably reference id first, then coordinates); that comparison is
 * defined elsewhere -- confirm.
 *
 * @param junctions  Junction set whose accepted flags are updated in place.
 */
void knockout_shadow_junctions(JunctionSet& junctions)
{
    // Collect the distinct reference ids present in the set.
    vector<uint32_t> ref_ids;
    for (JunctionSet::iterator i = junctions.begin(); i != junctions.end(); ++i)
    {
        ref_ids.push_back(i->first.refid);
    }

    sort(ref_ids.begin(), ref_ids.end());
    vector<uint32_t>::iterator new_end = unique(ref_ids.begin(), ref_ids.end());
    ref_ids.erase(new_end, ref_ids.end());

    for (size_t i = 0; i < ref_ids.size(); ++i)
    {
        uint32_t refid = ref_ids[i];

        // Sentinel keys bracketing the junctions of this reference.
        Junction dummy_left(refid, 0, 0, true);
        Junction dummy_right(refid, 0xFFFFFFFF, 0xFFFFFFFF, true);

        pair<JunctionSet::iterator, JunctionSet::iterator> r;
        r.first = junctions.lower_bound(dummy_left);
        r.second = junctions.upper_bound(dummy_right);

        JunctionSet::iterator itr = r.first;
        while (itr != r.second && itr != junctions.end())
        {
            if (itr->second.accepted)
            {
                Junction fuzzy_left = itr->first;
                Junction fuzzy_right = itr->first;

                // Widen the window to the left, clamping at coordinate 0.
                // Coordinates appear to be unsigned (see the 0xFFFFFFFF
                // sentinel above), so for a junction closer than
                // min_anchor_len to the start of the reference the
                // subtraction would wrap to a huge value. The window would
                // then start past the junction and the scan below would run
                // on to junctions.end(), comparing against junctions of
                // other references.
                fuzzy_left.left -= min_anchor_len;
                if (fuzzy_left.left > itr->first.left)
                    fuzzy_left.left = 0;

                fuzzy_right.right += min_anchor_len;

                // Only junctions on the opposite strand are of interest.
                fuzzy_left.antisense = !itr->first.antisense;
                fuzzy_right.antisense = !itr->first.antisense;

                pair<JunctionSet::iterator, JunctionSet::iterator> s;
                s.first = junctions.lower_bound(fuzzy_left);
                s.second = junctions.upper_bound(fuzzy_right);

                JunctionSet::iterator itr2 = s.first;
                int junc_support = itr->second.supporting_hits;

                while (itr2 != s.second && itr2 != junctions.end())
                {
                    // Signed differences: a negative value also satisfies
                    // the "< min_anchor_len" test below.
                    int left_diff = itr->first.left - itr2->first.left;
                    int right_diff = itr->first.right - itr2->first.right;

                    if (itr != itr2 &&
                        itr->first.antisense != itr2->first.antisense &&
                        (left_diff < min_anchor_len || right_diff < min_anchor_len))
                    {
                        // The better supported junction of the pair wins.
                        if (junc_support < itr2->second.supporting_hits)
                            itr->second.accepted = false;
                    }
                    ++itr2;
                }
            }
            ++itr;
        }
    }
}