コード例 #1
0
ファイル: juncs_db.cpp プロジェクト: genome-vendor/tophat
void driver(const vector<FILE*>& splice_coords_files,
			const vector<FILE*>& insertion_coords_files,
			const vector<FILE*>& deletion_coords_files, 
			ifstream& ref_stream)
{	
	char splice_buf[2048];
	RefSequenceTable rt(true);
	JunctionSet junctions;
	for (size_t i = 0; i < splice_coords_files.size(); ++i)
	{
		FILE* splice_coords = splice_coords_files[i];
		if (!splice_coords)
			continue;
		while (fgets(splice_buf, 2048, splice_coords))
		{
			char* nl = strrchr(splice_buf, '\n');
			char* buf = splice_buf;
			if (nl) *nl = 0;
			
			/**
			 Fields are:
			 1) reference name
			 2) left coord of splice (last char of the left exon)
			 3) right coord of splice (first char of the right exon)
			 */
			
			char* ref_name                   = get_token((char**)&buf, "\t");
			char* scan_left_coord            = get_token((char**)&buf, "\t");
			char* scan_right_coord           = get_token((char**)&buf, "\t");
			char* orientation				 = get_token((char**)&buf, "\t");
			
			if (!scan_left_coord || !scan_right_coord || !orientation)
			{
				fprintf(stderr,"Error: malformed splice coordinate record\n");
				exit(1);
			}
			uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
			uint32_t left_coord = atoi(scan_left_coord);
			uint32_t right_coord = atoi(scan_right_coord);
			bool antisense = *orientation == '-';
			junctions.insert(make_pair<Junction, JunctionStats>(Junction(ref_id, left_coord, right_coord, antisense), JunctionStats()));
		}
	}


	/*
	 * Read in the deletion coordinates
	 * and store in a set
	 */	
	std::set<Deletion> deletions;
	for(size_t i=0; i < deletion_coords_files.size(); ++i){
		FILE* deletion_coords = deletion_coords_files[i];
		if(!deletion_coords){
			continue;
		} 
		while (fgets(splice_buf, 2048, deletion_coords))
		{
			char* nl = strrchr(splice_buf, '\n');
			char* buf = splice_buf;
			if (nl) *nl = 0;
			
			/**
			 Fields are:
			 1) reference name
			 2) left coord of splice (last char of the left exon)
			 3) right coord of splice (first char of the right exon)
			 */
			
			char* ref_name                   = get_token((char**)&buf, "\t");
			char* scan_left_coord            = get_token((char**)&buf, "\t");
			char* scan_right_coord           = get_token((char**)&buf, "\t");
			
			if (!scan_left_coord || !scan_right_coord)
			{
				fprintf(stderr,"Error: malformed deletion coordinate record\n");
				exit(1);
			}

			/*
			 * Note that when reading in a deletion, the left co-ord is the position of the 
			 * first deleted based. Since we are co-opting the junction data structure, need
			 * to fix up this location
			 */
			uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
			uint32_t left_coord = atoi(scan_left_coord);
			uint32_t right_coord = atoi(scan_right_coord);
			deletions.insert(Deletion(ref_id, left_coord - 1, right_coord, false));
		}
	}

	/*
	 * Read in the insertion coordinates
	 * and store in a set
	 */
	std::set<Insertion> insertions;
	for(size_t i=0; i < insertion_coords_files.size(); ++i){
		FILE* insertion_coords = insertion_coords_files[i];
		if(!insertion_coords){
			continue;
		} 
		while(fgets(splice_buf, 2048, insertion_coords)){
			char* nl = strrchr(splice_buf, '\n');
			char* buf = splice_buf;
			if (nl) *nl = 0;
			
			char* ref_name = get_token((char**)&buf, "\t");
			char* scan_left_coord = get_token((char**)&buf, "\t");
			char* scan_right_coord = get_token((char**)&buf, "\t");
			char* scan_sequence = get_token((char**)&buf, "\t");

			if (!scan_left_coord || !scan_sequence || !scan_right_coord)
			{
				fprintf(stderr,"Error: malformed insertion coordinate record\n");
				exit(1);
			}
			
			seqan::Dna5String sequence = seqan::Dna5String(scan_sequence);
			bool containsN = false;
			for(size_t index = 0; index < seqan::length(sequence); index += 1){
				/*
				 * Don't allow any ambiguities in the insertion
				 */
				if(sequence[index] == 'N'){
					containsN = true;
					break;	
				}
			}
			if(containsN){
				continue;
			}
			seqan::CharString charSequence = sequence;
			uint32_t ref_id = rt.get_id(ref_name,NULL,0);
			uint32_t left_coord = atoi(scan_left_coord);
			insertions.insert(Insertion(ref_id, left_coord, seqan::toCString(charSequence)));
		}
	}


	typedef RefSequenceTable::Sequence Reference;
	
	while(ref_stream.good() && 
		  !ref_stream.eof()) 
	{
		Reference ref_str;
		string name;

		readMeta(ref_stream, name, Fasta());
		string::size_type space_pos = name.find_first_of(" \t\r");
		if (space_pos != string::npos)
		{
			name.resize(space_pos);
		}
		
		read(ref_stream, ref_str, Fasta());
		
		uint32_t refid = rt.get_id(name, NULL, 0);
		Junction dummy_left(refid, 0, 0, true);
		Junction dummy_right(refid, VMAXINT32, VMAXINT32, true);
		
		pair<JunctionSet::iterator, JunctionSet::iterator> r;
		r.first = junctions.lower_bound(dummy_left);
		r.second = junctions.upper_bound(dummy_right);
		
		JunctionSet::iterator itr = r.first;
		
		while(itr != r.second && itr != junctions.end())
		{
			print_splice(itr->first, read_length, itr->first.antisense ? "GTAG|rev" : "GTAG|fwd", ref_str, name, cout);
			++itr;
		}
	}


	ref_stream.clear();
	ref_stream.seekg(0, ios::beg);


	while(ref_stream.good() && 
		  !ref_stream.eof()) 
	{
		Reference ref_str;
		string name;

		readMeta(ref_stream, name, Fasta());
		string::size_type space_pos = name.find_first_of(" \t\r");
		if (space_pos != string::npos)
		{
			name.resize(space_pos);
		}
		
		read(ref_stream, ref_str, Fasta());
		
		uint32_t refid = rt.get_id(name, NULL,0);
		Deletion dummy_left(refid, 0, 0, true);
		Deletion dummy_right(refid, VMAXINT32, VMAXINT32, true);
		
		pair<std::set<Deletion>::iterator, std::set<Deletion>::iterator> r;
		r.first = deletions.lower_bound(dummy_left);
		r.second = deletions.upper_bound(dummy_right);
		
		std::set<Deletion>::iterator itr = r.first;
		
		while(itr != r.second && itr != deletions.end())
		{
			print_splice((Junction)*itr, read_length, itr->antisense ? "del|rev" : "del|fwd", ref_str, name, cout);
			++itr;
		}
	}

	ref_stream.clear();
	ref_stream.seekg(0, ios::beg);



	while(ref_stream.good() && 
		  !ref_stream.eof()) 
	{
		Reference ref_str;
		string name;

		readMeta(ref_stream, name, Fasta());
		string::size_type space_pos = name.find_first_of(" \t\r");
		if (space_pos != string::npos)
		{
			name.resize(space_pos);
		}
		
		read(ref_stream, ref_str, Fasta());
		
		uint32_t refid = rt.get_id(name, NULL,0);
		Insertion dummy_left(refid, 0, "");
		Insertion dummy_right(refid, VMAXINT32, "");
	
		std::set<Insertion>::iterator itr = insertions.lower_bound(dummy_left);
		std::set<Insertion>::iterator upper   = insertions.upper_bound(dummy_right);

		while(itr != upper && itr != insertions.end()){
			print_insertion(*itr, read_length, ref_str, name, cout);	
			++itr;
		}	
	}

}
コード例 #2
0
void driver(const vector<FILE*>& splice_coords_files,
	    const vector<FILE*>& insertion_coords_files,
	    const vector<FILE*>& deletion_coords_files,
	    const vector<FILE*>& fusion_coords_files, 
	    ifstream& ref_stream)
{	
	char splice_buf[2048];
	RefSequenceTable rt(sam_header, true);
	get_seqs(ref_stream, rt, true);

	JunctionSet junctions;
	for (size_t i = 0; i < splice_coords_files.size(); ++i)
	{
		FILE* splice_coords = splice_coords_files[i];
		if (!splice_coords)
			continue;
		while (fgets(splice_buf, 2048, splice_coords))
		{
			char* nl = strrchr(splice_buf, '\n');
			char* buf = splice_buf;
			if (nl) *nl = 0;
			
			/**
			 Fields are:
			 1) reference name
			 2) left coord of splice (last char of the left exon)
			 3) right coord of splice (first char of the right exon)
			 */
			
			char* ref_name                   = get_token((char**)&buf, "\t");
			char* scan_left_coord            = get_token((char**)&buf, "\t");
			char* scan_right_coord           = get_token((char**)&buf, "\t");
			char* orientation				 = get_token((char**)&buf, "\t");
			
			if (!scan_left_coord || !scan_right_coord || !orientation)
			{
				fprintf(stderr,"Error: malformed splice coordinate record\n");
				exit(1);
			}
			uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
			uint32_t left_coord = atoi(scan_left_coord);
			uint32_t right_coord = atoi(scan_right_coord);
			bool antisense = *orientation == '-';
			junctions.insert(make_pair<Junction, JunctionStats>(Junction(ref_id, left_coord, right_coord, antisense), JunctionStats()));
		}
	}


	/*
	 * Read in the deletion coordinates
	 * and store in a set
	 */	
	std::set<Deletion> deletions;
	for(size_t i=0; i < deletion_coords_files.size(); ++i){
		FILE* deletion_coords = deletion_coords_files[i];
		if(!deletion_coords){
			continue;
		} 
		while (fgets(splice_buf, 2048, deletion_coords))
		{
			char* nl = strrchr(splice_buf, '\n');
			char* buf = splice_buf;
			if (nl) *nl = 0;
			
			/**
			 Fields are:
			 1) reference name
			 2) left coord of splice (last char of the left exon)
			 3) right coord of splice (first char of the right exon)
			 */
			
			char* ref_name                   = get_token((char**)&buf, "\t");
			char* scan_left_coord            = get_token((char**)&buf, "\t");
			char* scan_right_coord           = get_token((char**)&buf, "\t");
			
			if (!scan_left_coord || !scan_right_coord)
			{
				fprintf(stderr,"Error: malformed deletion coordinate record\n");
				exit(1);
			}

			/*
			 * Note that when reading in a deletion, the left co-ord is the position of the 
			 * first deleted based. Since we are co-opting the junction data structure, need
			 * to fix up this location
			 */
			uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
			uint32_t left_coord = atoi(scan_left_coord);
			uint32_t right_coord = atoi(scan_right_coord);
			deletions.insert(Deletion(ref_id, left_coord - 1, right_coord, false));
		}
	}

	/*
	 * Read in the insertion coordinates
	 * and store in a set
	 */
	std::set<Insertion> insertions;
	for(size_t i=0; i < insertion_coords_files.size(); ++i){
		FILE* insertion_coords = insertion_coords_files[i];
		if(!insertion_coords){
			continue;
		} 
		while(fgets(splice_buf, 2048, insertion_coords)){
			char* nl = strrchr(splice_buf, '\n');
			char* buf = splice_buf;
			if (nl) *nl = 0;
			
			char* ref_name = get_token((char**)&buf, "\t");
			char* scan_left_coord = get_token((char**)&buf, "\t");
			char* scan_right_coord = get_token((char**)&buf, "\t");
			char* scan_sequence = get_token((char**)&buf, "\t");

			if (!scan_left_coord || !scan_sequence || !scan_right_coord)
			{
				fprintf(stderr,"Error: malformed insertion coordinate record\n");
				exit(1);
			}
			
			seqan::Dna5String sequence = seqan::Dna5String(scan_sequence);
			bool containsN = false;
			for(size_t index = 0; index < seqan::length(sequence); index += 1){
				/*
				 * Don't allow any ambiguities in the insertion
				 */
				if(sequence[index] == 'N'){
					containsN = true;
					break;	
				}
			}
			if(containsN){
				continue;
			}
			seqan::CharString charSequence = sequence;
			uint32_t ref_id = rt.get_id(ref_name,NULL,0);
			uint32_t left_coord = atoi(scan_left_coord);
			insertions.insert(Insertion(ref_id, left_coord, seqan::toCString(charSequence)));
		}
	}

	std::set<Fusion> fusions;
	for(size_t i=0; i < fusion_coords_files.size(); ++i){
		FILE* fusion_coords = fusion_coords_files[i];
		if(!fusion_coords){
			continue;
		} 
		while(fgets(splice_buf, 2048, fusion_coords)){
			char* nl = strrchr(splice_buf, '\n');
			char* buf = splice_buf;
			if (nl) *nl = 0;
			
			char* ref_name1 = strsep((char**)&buf, "\t");
			char* scan_left_coord = strsep((char**)&buf, "\t");
			char* ref_name2 = strsep((char**)&buf, "\t");
			char* scan_right_coord = strsep((char**)&buf, "\t");
			char* scan_dir = strsep((char**)&buf, "\t");

			if (!ref_name1 || !scan_left_coord || !ref_name2 || !scan_right_coord || !scan_dir)
			{
			  fprintf(stderr,"Error: malformed insertion coordinate record\n");
			  exit(1);
			}
			
			uint32_t ref_id1 = rt.get_id(ref_name1, NULL, 0);
			uint32_t ref_id2 = rt.get_id(ref_name2, NULL, 0);
			uint32_t left_coord = atoi(scan_left_coord);
			uint32_t right_coord = atoi(scan_right_coord);
			uint32_t dir = FUSION_FF;
			if (strcmp(scan_dir, "fr") == 0)
			  dir = FUSION_FR;
			else if(strcmp(scan_dir, "rf") == 0)
			  dir = FUSION_RF;
			else if(strcmp(scan_dir, "rr") == 0)
			  dir = FUSION_RR;
		  
			fusions.insert(Fusion(ref_id1, ref_id2, left_coord, right_coord, dir));
		}
	}

	{
	  JunctionSet::iterator itr = junctions.begin();
	  for (; itr != junctions.end(); ++itr)
	    {
	      RefSequenceTable::Sequence* ref_str = rt.get_seq(itr->first.refid);
	      if (ref_str == NULL) continue;
	      
	      const char* name = rt.get_name(itr->first.refid);
	      print_splice(itr->first, read_length, itr->first.antisense ? "GTAG|rev" : "GTAG|fwd", *ref_str, name, cout);
	    }
	}

	{
	  std::set<Deletion>::iterator itr = deletions.begin();
	  for (; itr != deletions.end(); ++itr)
	    {
	      RefSequenceTable::Sequence* ref_str = rt.get_seq(itr->refid);
	      if (ref_str == NULL) continue;
	      
	      const char* name = rt.get_name(itr->refid);
	      print_splice((Junction)*itr, read_length, itr->antisense ? "del|rev" : "del|fwd", *ref_str, name, cout);
	    }
	}

	{
	  std::set<Insertion>::iterator itr  = insertions.begin();
	  for (; itr != insertions.end(); ++itr){
	    RefSequenceTable::Sequence* ref_str = rt.get_seq(itr->refid);
	    if (ref_str == NULL) continue;
	    
	    const char* name = rt.get_name(itr->refid);
	    print_insertion(*itr, read_length, *ref_str, name, cout);	
	  }
	}

	{
	  std::set<Fusion>::iterator itr = fusions.begin();
	  for (; itr != fusions.end(); ++itr){
	    RefSequenceTable::Sequence* left_ref_str = rt.get_seq(itr->refid1);
	    RefSequenceTable::Sequence* right_ref_str = rt.get_seq(itr->refid2);

	    if (left_ref_str == NULL || right_ref_str == NULL) continue;
	    
	    const char* left_ref_name = rt.get_name(itr->refid1);
	    const char* right_ref_name = rt.get_name(itr->refid2);
	    print_fusion(*itr, read_length, *left_ref_str, *right_ref_str, left_ref_name, right_ref_name, cout);	
	  }
	}
}