void junctions_from_alignment(const BowtieHit& spliced_alignment,
			      JunctionSet& junctions)
{
  vector<pair<Junction, JunctionStats> > juncs;
  junctions_from_spliced_hit(spliced_alignment, juncs);

  for (size_t i = 0; i < juncs.size(); ++i)
    {
      pair<Junction, JunctionStats>& junc = juncs[i];
      JunctionSet::iterator itr = junctions.find(junc.first);
      
      if (itr != junctions.end())
	{
	  JunctionStats& j = itr->second;
	  j.left_extent = max(j.left_extent, junc.second.left_extent);
	  j.right_extent = max(j.right_extent, junc.second.right_extent);
	  j.min_splice_mms = min(j.min_splice_mms, junc.second.min_splice_mms);
	  j.supporting_hits++;
	}
      else
	{
	  assert(junc.first.refid != 0xFFFFFFFF);
	  junctions[junc.first] = junc.second;
	}
    }
}
void accept_all_junctions(JunctionSet& junctions,
			  const uint32_t refid)
{
  fprintf(stderr, "Accepting all junctions\n");
  for (JunctionSet::iterator itr = junctions.begin(); itr != junctions.end(); ++itr)
    {
      itr->second.accepted = true;
    }
}
void filter_junctions(JunctionSet& junctions)
{
  for (JunctionSet::iterator i = junctions.begin(); i != junctions.end(); ++i)
    {
      accept_if_valid(i->first, i->second);
    }
	
  knockout_shadow_junctions(junctions);
}
void validate_junctions(const JunctionSet& junctions)
{
  uint32_t invalid_juncs = 0;
  for (JunctionSet::const_iterator i = junctions.begin();
       i != junctions.end();
       ++i)
    {
      if (!i->first.valid())
	invalid_juncs++;
    }
  fprintf(stderr, "Found %d invalid junctions\n", invalid_juncs);
}
Example #5
0
/**
 * Parse the cigar string of a BowtieHit in order to determine the alignment status.
 */
AlignStatus::AlignStatus(const BowtieHit& bh, const JunctionSet& gtf_junctions)
{
  const vector<CigarOp>& cigar = bh.cigar();
  _aligned = cigar.size() > 0;
  _indelFreeAlignment = true;
  _unannotatedSpliceFreeAlignment = true;
  _edit_dist = bh.edit_dist();
  int j = bh.left();
  for (size_t c = 0 ; c < cigar.size(); ++c)
    {
      Junction junc;
      switch(cigar[c].opcode)
	{
	case REF_SKIP:
	  junc.refid = bh.ref_id();
	  junc.left = j;
	  junc.right = junc.left + cigar[c].length;
	  junc.antisense = bh.antisense_splice();
	  j += cigar[c].length;

	  if (gtf_junctions.find(junc) == gtf_junctions.end())
	    _unannotatedSpliceFreeAlignment = false;
	  break;	  
	case MATCH:
	  j += cigar[c].length;
	  break;
	case DEL:
	  j += cigar[c].length;
	  _indelFreeAlignment = false;
	  break;
	case INS:
	  _indelFreeAlignment = false;
	  break;
	default:
	  break;
	}
    }
}
void print_junctions(FILE* junctions_out, 
		     const JunctionSet& junctions,
		     RefSequenceTable& ref_sequences)
{
  uint64_t junc_id = 1;
  fprintf(junctions_out, "track name=junctions description=\"TopHat junctions\"\n");
  for (JunctionSet::const_iterator i = junctions.begin();
       i != junctions.end();
       ++i)
    {
      const pair<Junction, JunctionStats>& j_itr = *i; 
      const Junction& j = j_itr.first;
      const JunctionStats& s = j_itr.second;			
      
      assert(ref_sequences.get_name(j.refid));
      //fprintf(stdout,"%d\t%d\t%d\t%c\n", j.refid, j.left, j.right, j.antisense ? '-' : '+');
      print_junction(junctions_out, 
		     ref_sequences.get_name(j.refid),
		     j,
		     s, 
		     junc_id++);
    }
  //fprintf(stderr, "Rejected %d / %d alignments, %d / %d spliced\n", rejected, total, rejected_spliced, total_spliced);
}
Example #7
0
void driver(const vector<FILE*>& splice_coords_files,
			const vector<FILE*>& insertion_coords_files,
			const vector<FILE*>& deletion_coords_files, 
			ifstream& ref_stream)
{	
	char splice_buf[2048];
	RefSequenceTable rt(true);
	JunctionSet junctions;
	for (size_t i = 0; i < splice_coords_files.size(); ++i)
	{
		FILE* splice_coords = splice_coords_files[i];
		if (!splice_coords)
			continue;
		while (fgets(splice_buf, 2048, splice_coords))
		{
			char* nl = strrchr(splice_buf, '\n');
			char* buf = splice_buf;
			if (nl) *nl = 0;
			
			/**
			 Fields are:
			 1) reference name
			 2) left coord of splice (last char of the left exon)
			 3) right coord of splice (first char of the right exon)
			 */
			
			char* ref_name                   = get_token((char**)&buf, "\t");
			char* scan_left_coord            = get_token((char**)&buf, "\t");
			char* scan_right_coord           = get_token((char**)&buf, "\t");
			char* orientation				 = get_token((char**)&buf, "\t");
			
			if (!scan_left_coord || !scan_right_coord || !orientation)
			{
				fprintf(stderr,"Error: malformed splice coordinate record\n");
				exit(1);
			}
			uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
			uint32_t left_coord = atoi(scan_left_coord);
			uint32_t right_coord = atoi(scan_right_coord);
			bool antisense = *orientation == '-';
			junctions.insert(make_pair<Junction, JunctionStats>(Junction(ref_id, left_coord, right_coord, antisense), JunctionStats()));
		}
	}


	/*
	 * Read in the deletion coordinates
	 * and store in a set
	 */	
	std::set<Deletion> deletions;
	for(size_t i=0; i < deletion_coords_files.size(); ++i){
		FILE* deletion_coords = deletion_coords_files[i];
		if(!deletion_coords){
			continue;
		} 
		while (fgets(splice_buf, 2048, deletion_coords))
		{
			char* nl = strrchr(splice_buf, '\n');
			char* buf = splice_buf;
			if (nl) *nl = 0;
			
			/**
			 Fields are:
			 1) reference name
			 2) left coord of splice (last char of the left exon)
			 3) right coord of splice (first char of the right exon)
			 */
			
			char* ref_name                   = get_token((char**)&buf, "\t");
			char* scan_left_coord            = get_token((char**)&buf, "\t");
			char* scan_right_coord           = get_token((char**)&buf, "\t");
			
			if (!scan_left_coord || !scan_right_coord)
			{
				fprintf(stderr,"Error: malformed deletion coordinate record\n");
				exit(1);
			}

			/*
			 * Note that when reading in a deletion, the left co-ord is the position of the 
			 * first deleted based. Since we are co-opting the junction data structure, need
			 * to fix up this location
			 */
			uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
			uint32_t left_coord = atoi(scan_left_coord);
			uint32_t right_coord = atoi(scan_right_coord);
			deletions.insert(Deletion(ref_id, left_coord - 1, right_coord, false));
		}
	}

	/*
	 * Read in the insertion coordinates
	 * and store in a set
	 */
	std::set<Insertion> insertions;
	for(size_t i=0; i < insertion_coords_files.size(); ++i){
		FILE* insertion_coords = insertion_coords_files[i];
		if(!insertion_coords){
			continue;
		} 
		while(fgets(splice_buf, 2048, insertion_coords)){
			char* nl = strrchr(splice_buf, '\n');
			char* buf = splice_buf;
			if (nl) *nl = 0;
			
			char* ref_name = get_token((char**)&buf, "\t");
			char* scan_left_coord = get_token((char**)&buf, "\t");
			char* scan_right_coord = get_token((char**)&buf, "\t");
			char* scan_sequence = get_token((char**)&buf, "\t");

			if (!scan_left_coord || !scan_sequence || !scan_right_coord)
			{
				fprintf(stderr,"Error: malformed insertion coordinate record\n");
				exit(1);
			}
			
			seqan::Dna5String sequence = seqan::Dna5String(scan_sequence);
			bool containsN = false;
			for(size_t index = 0; index < seqan::length(sequence); index += 1){
				/*
				 * Don't allow any ambiguities in the insertion
				 */
				if(sequence[index] == 'N'){
					containsN = true;
					break;	
				}
			}
			if(containsN){
				continue;
			}
			seqan::CharString charSequence = sequence;
			uint32_t ref_id = rt.get_id(ref_name,NULL,0);
			uint32_t left_coord = atoi(scan_left_coord);
			insertions.insert(Insertion(ref_id, left_coord, seqan::toCString(charSequence)));
		}
	}


	typedef RefSequenceTable::Sequence Reference;
	
	while(ref_stream.good() && 
		  !ref_stream.eof()) 
	{
		Reference ref_str;
		string name;

		readMeta(ref_stream, name, Fasta());
		string::size_type space_pos = name.find_first_of(" \t\r");
		if (space_pos != string::npos)
		{
			name.resize(space_pos);
		}
		
		read(ref_stream, ref_str, Fasta());
		
		uint32_t refid = rt.get_id(name, NULL, 0);
		Junction dummy_left(refid, 0, 0, true);
		Junction dummy_right(refid, VMAXINT32, VMAXINT32, true);
		
		pair<JunctionSet::iterator, JunctionSet::iterator> r;
		r.first = junctions.lower_bound(dummy_left);
		r.second = junctions.upper_bound(dummy_right);
		
		JunctionSet::iterator itr = r.first;
		
		while(itr != r.second && itr != junctions.end())
		{
			print_splice(itr->first, read_length, itr->first.antisense ? "GTAG|rev" : "GTAG|fwd", ref_str, name, cout);
			++itr;
		}
	}


	ref_stream.clear();
	ref_stream.seekg(0, ios::beg);


	while(ref_stream.good() && 
		  !ref_stream.eof()) 
	{
		Reference ref_str;
		string name;

		readMeta(ref_stream, name, Fasta());
		string::size_type space_pos = name.find_first_of(" \t\r");
		if (space_pos != string::npos)
		{
			name.resize(space_pos);
		}
		
		read(ref_stream, ref_str, Fasta());
		
		uint32_t refid = rt.get_id(name, NULL,0);
		Deletion dummy_left(refid, 0, 0, true);
		Deletion dummy_right(refid, VMAXINT32, VMAXINT32, true);
		
		pair<std::set<Deletion>::iterator, std::set<Deletion>::iterator> r;
		r.first = deletions.lower_bound(dummy_left);
		r.second = deletions.upper_bound(dummy_right);
		
		std::set<Deletion>::iterator itr = r.first;
		
		while(itr != r.second && itr != deletions.end())
		{
			print_splice((Junction)*itr, read_length, itr->antisense ? "del|rev" : "del|fwd", ref_str, name, cout);
			++itr;
		}
	}

	ref_stream.clear();
	ref_stream.seekg(0, ios::beg);



	while(ref_stream.good() && 
		  !ref_stream.eof()) 
	{
		Reference ref_str;
		string name;

		readMeta(ref_stream, name, Fasta());
		string::size_type space_pos = name.find_first_of(" \t\r");
		if (space_pos != string::npos)
		{
			name.resize(space_pos);
		}
		
		read(ref_stream, ref_str, Fasta());
		
		uint32_t refid = rt.get_id(name, NULL,0);
		Insertion dummy_left(refid, 0, "");
		Insertion dummy_right(refid, VMAXINT32, "");
	
		std::set<Insertion>::iterator itr = insertions.lower_bound(dummy_left);
		std::set<Insertion>::iterator upper   = insertions.upper_bound(dummy_right);

		while(itr != upper && itr != insertions.end()){
			print_insertion(*itr, read_length, ref_str, name, cout);	
			++itr;
		}	
	}

}
void driver(const vector<FILE*>& splice_coords_files,
	    const vector<FILE*>& insertion_coords_files,
	    const vector<FILE*>& deletion_coords_files,
	    const vector<FILE*>& fusion_coords_files, 
	    ifstream& ref_stream)
{	
	char splice_buf[2048];
	RefSequenceTable rt(sam_header, true);
	get_seqs(ref_stream, rt, true);

	JunctionSet junctions;
	for (size_t i = 0; i < splice_coords_files.size(); ++i)
	{
		FILE* splice_coords = splice_coords_files[i];
		if (!splice_coords)
			continue;
		while (fgets(splice_buf, 2048, splice_coords))
		{
			char* nl = strrchr(splice_buf, '\n');
			char* buf = splice_buf;
			if (nl) *nl = 0;
			
			/**
			 Fields are:
			 1) reference name
			 2) left coord of splice (last char of the left exon)
			 3) right coord of splice (first char of the right exon)
			 */
			
			char* ref_name                   = get_token((char**)&buf, "\t");
			char* scan_left_coord            = get_token((char**)&buf, "\t");
			char* scan_right_coord           = get_token((char**)&buf, "\t");
			char* orientation				 = get_token((char**)&buf, "\t");
			
			if (!scan_left_coord || !scan_right_coord || !orientation)
			{
				fprintf(stderr,"Error: malformed splice coordinate record\n");
				exit(1);
			}
			uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
			uint32_t left_coord = atoi(scan_left_coord);
			uint32_t right_coord = atoi(scan_right_coord);
			bool antisense = *orientation == '-';
			junctions.insert(make_pair<Junction, JunctionStats>(Junction(ref_id, left_coord, right_coord, antisense), JunctionStats()));
		}
	}


	/*
	 * Read in the deletion coordinates
	 * and store in a set
	 */	
	std::set<Deletion> deletions;
	for(size_t i=0; i < deletion_coords_files.size(); ++i){
		FILE* deletion_coords = deletion_coords_files[i];
		if(!deletion_coords){
			continue;
		} 
		while (fgets(splice_buf, 2048, deletion_coords))
		{
			char* nl = strrchr(splice_buf, '\n');
			char* buf = splice_buf;
			if (nl) *nl = 0;
			
			/**
			 Fields are:
			 1) reference name
			 2) left coord of splice (last char of the left exon)
			 3) right coord of splice (first char of the right exon)
			 */
			
			char* ref_name                   = get_token((char**)&buf, "\t");
			char* scan_left_coord            = get_token((char**)&buf, "\t");
			char* scan_right_coord           = get_token((char**)&buf, "\t");
			
			if (!scan_left_coord || !scan_right_coord)
			{
				fprintf(stderr,"Error: malformed deletion coordinate record\n");
				exit(1);
			}

			/*
			 * Note that when reading in a deletion, the left co-ord is the position of the 
			 * first deleted based. Since we are co-opting the junction data structure, need
			 * to fix up this location
			 */
			uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
			uint32_t left_coord = atoi(scan_left_coord);
			uint32_t right_coord = atoi(scan_right_coord);
			deletions.insert(Deletion(ref_id, left_coord - 1, right_coord, false));
		}
	}

	/*
	 * Read in the insertion coordinates
	 * and store in a set
	 */
	std::set<Insertion> insertions;
	for(size_t i=0; i < insertion_coords_files.size(); ++i){
		FILE* insertion_coords = insertion_coords_files[i];
		if(!insertion_coords){
			continue;
		} 
		while(fgets(splice_buf, 2048, insertion_coords)){
			char* nl = strrchr(splice_buf, '\n');
			char* buf = splice_buf;
			if (nl) *nl = 0;
			
			char* ref_name = get_token((char**)&buf, "\t");
			char* scan_left_coord = get_token((char**)&buf, "\t");
			char* scan_right_coord = get_token((char**)&buf, "\t");
			char* scan_sequence = get_token((char**)&buf, "\t");

			if (!scan_left_coord || !scan_sequence || !scan_right_coord)
			{
				fprintf(stderr,"Error: malformed insertion coordinate record\n");
				exit(1);
			}
			
			seqan::Dna5String sequence = seqan::Dna5String(scan_sequence);
			bool containsN = false;
			for(size_t index = 0; index < seqan::length(sequence); index += 1){
				/*
				 * Don't allow any ambiguities in the insertion
				 */
				if(sequence[index] == 'N'){
					containsN = true;
					break;	
				}
			}
			if(containsN){
				continue;
			}
			seqan::CharString charSequence = sequence;
			uint32_t ref_id = rt.get_id(ref_name,NULL,0);
			uint32_t left_coord = atoi(scan_left_coord);
			insertions.insert(Insertion(ref_id, left_coord, seqan::toCString(charSequence)));
		}
	}

	std::set<Fusion> fusions;
	for(size_t i=0; i < fusion_coords_files.size(); ++i){
		FILE* fusion_coords = fusion_coords_files[i];
		if(!fusion_coords){
			continue;
		} 
		while(fgets(splice_buf, 2048, fusion_coords)){
			char* nl = strrchr(splice_buf, '\n');
			char* buf = splice_buf;
			if (nl) *nl = 0;
			
			char* ref_name1 = strsep((char**)&buf, "\t");
			char* scan_left_coord = strsep((char**)&buf, "\t");
			char* ref_name2 = strsep((char**)&buf, "\t");
			char* scan_right_coord = strsep((char**)&buf, "\t");
			char* scan_dir = strsep((char**)&buf, "\t");

			if (!ref_name1 || !scan_left_coord || !ref_name2 || !scan_right_coord || !scan_dir)
			{
			  fprintf(stderr,"Error: malformed insertion coordinate record\n");
			  exit(1);
			}
			
			uint32_t ref_id1 = rt.get_id(ref_name1, NULL, 0);
			uint32_t ref_id2 = rt.get_id(ref_name2, NULL, 0);
			uint32_t left_coord = atoi(scan_left_coord);
			uint32_t right_coord = atoi(scan_right_coord);
			uint32_t dir = FUSION_FF;
			if (strcmp(scan_dir, "fr") == 0)
			  dir = FUSION_FR;
			else if(strcmp(scan_dir, "rf") == 0)
			  dir = FUSION_RF;
			else if(strcmp(scan_dir, "rr") == 0)
			  dir = FUSION_RR;
		  
			fusions.insert(Fusion(ref_id1, ref_id2, left_coord, right_coord, dir));
		}
	}

	{
	  JunctionSet::iterator itr = junctions.begin();
	  for (; itr != junctions.end(); ++itr)
	    {
	      RefSequenceTable::Sequence* ref_str = rt.get_seq(itr->first.refid);
	      if (ref_str == NULL) continue;
	      
	      const char* name = rt.get_name(itr->first.refid);
	      print_splice(itr->first, read_length, itr->first.antisense ? "GTAG|rev" : "GTAG|fwd", *ref_str, name, cout);
	    }
	}

	{
	  std::set<Deletion>::iterator itr = deletions.begin();
	  for (; itr != deletions.end(); ++itr)
	    {
	      RefSequenceTable::Sequence* ref_str = rt.get_seq(itr->refid);
	      if (ref_str == NULL) continue;
	      
	      const char* name = rt.get_name(itr->refid);
	      print_splice((Junction)*itr, read_length, itr->antisense ? "del|rev" : "del|fwd", *ref_str, name, cout);
	    }
	}

	{
	  std::set<Insertion>::iterator itr  = insertions.begin();
	  for (; itr != insertions.end(); ++itr){
	    RefSequenceTable::Sequence* ref_str = rt.get_seq(itr->refid);
	    if (ref_str == NULL) continue;
	    
	    const char* name = rt.get_name(itr->refid);
	    print_insertion(*itr, read_length, *ref_str, name, cout);	
	  }
	}

	{
	  std::set<Fusion>::iterator itr = fusions.begin();
	  for (; itr != fusions.end(); ++itr){
	    RefSequenceTable::Sequence* left_ref_str = rt.get_seq(itr->refid1);
	    RefSequenceTable::Sequence* right_ref_str = rt.get_seq(itr->refid2);

	    if (left_ref_str == NULL || right_ref_str == NULL) continue;
	    
	    const char* left_ref_name = rt.get_name(itr->refid1);
	    const char* right_ref_name = rt.get_name(itr->refid2);
	    print_fusion(*itr, read_length, *left_ref_str, *right_ref_str, left_ref_name, right_ref_name, cout);	
	  }
	}
}
void knockout_shadow_junctions(JunctionSet& junctions)
{
  vector<uint32_t> ref_ids;
  
  for (JunctionSet::iterator i = junctions.begin(); i != junctions.end(); ++i)
    {
      ref_ids.push_back(i->first.refid);
    }
  
  sort(ref_ids.begin(), ref_ids.end());
  vector<uint32_t>::iterator new_end = unique(ref_ids.begin(), ref_ids.end());
  ref_ids.erase(new_end, ref_ids.end());
  
  for(size_t i = 0; i < ref_ids.size(); ++i)
    {
      uint32_t refid = ref_ids[i];
		
      Junction dummy_left(refid, 0, 0, true);
      Junction dummy_right(refid, 0xFFFFFFFF, 0xFFFFFFFF, true);
      
      pair<JunctionSet::iterator, JunctionSet::iterator> r;
      r.first = junctions.lower_bound(dummy_left);
      r.second = junctions.upper_bound(dummy_right);
      
      JunctionSet::iterator itr = r.first;
      
      while(itr != r.second && itr != junctions.end())
	{
	  if (itr->second.accepted)
	    {
	      Junction fuzzy_left = itr->first;
	      Junction fuzzy_right = itr->first;
	      fuzzy_left.left -= min_anchor_len;
	      fuzzy_right.right += min_anchor_len;
	      fuzzy_left.antisense = !itr->first.antisense;
	      fuzzy_right.antisense = !itr->first.antisense;
	      
	      pair<JunctionSet::iterator, JunctionSet::iterator> s;
	      s.first = junctions.lower_bound(fuzzy_left);
	      s.second = junctions.upper_bound(fuzzy_right);
	      JunctionSet::iterator itr2 = s.first;
	      
	      int junc_support = itr->second.supporting_hits;
	      
	      while(itr2 != s.second && itr2 != junctions.end())
		{
		  int left_diff = itr->first.left - itr2->first.left;
		  int right_diff = itr->first.right - itr2->first.right;
		  if (itr != itr2 && 
		      itr->first.antisense != itr2->first.antisense && 
		      (left_diff < min_anchor_len || right_diff < min_anchor_len))
		    {
		      if (junc_support < itr2->second.supporting_hits)
			itr->second.accepted = false;
		    }
		  ++itr2;
		}
	    }
	  ++itr;
	}
    }
}