示例#1
0
void sieve()
{
    memset( prime, -1, sizeof( prime ) );
    unsigned int i;
    unsigned int sqrtN = ( unsigned int )sqrt( ( double )N ) + 1;
    for( i = 3; i < sqrtN; i += 2 )if(gP(i))
    {
        unsigned int i2 = i + i;
        for( unsigned int j = i * i; j < N; j += i2 ) rP( j );
    }
}
std::vector<oneReadPair> readSimulator::simulate_paired_reads_from_edgePath(std::vector<Edge*> edgePath, double expected_haploid_coverage, double starting_coordinates_diff_mean, double starting_coordinates_diff_sd, bool perfectly, bool is2nd)
{
	std::vector<oneReadPair> forReturn;

	Graph* g = edgePath.front()->From->g;
	std::string edgePath_string;
	std::vector<unsigned int> edgePath_string_originLevel;
	for(unsigned int eI = 0; eI < edgePath.size(); eI++)
	{
		std::string edgeEmission = g->CODE.deCode(edgePath.at(eI)->locus_id, edgePath.at(eI)->emission);
		assert(edgeEmission.length() == 1);
		if(edgeEmission != "_")
		{
			edgePath_string.append(edgeEmission);
			edgePath_string_originLevel.push_back(eI);
		}
	}
	assert(edgePath_string.size() == edgePath_string_originLevel.size());

	double poissonStartRate = expected_haploid_coverage / ( 2.0 * (double)(read_length)); // this is of reads and their pairs, thus / 2

	long long firstPosition = 0;
	long long lastPosition = edgePath_string.length() - read_length;

	if(!(lastPosition >= firstPosition))
	{
		throw std::runtime_error("readSimulator::simulate_paired_reads_from_edgePath(): Problem -- lastPosition < firstPosition -- the supplied first string is not long enough!\n");
	}

	double global_indel_events = 0;
	double global_generated_bases = 0;
	double global_generated_errors = 0;
	std::map<char, double> global_error_NUC;
	
	{
		boost::mt19937 rnd_gen;

		auto seed = boost::random::random_device()();
		rnd_gen.seed(seed);  
		
		boost::random::poisson_distribution<> rnd_starting_reads ( poissonStartRate );
		std::vector< boost::random::poisson_distribution<> > rnd_INs;
		std::vector< boost::random::poisson_distribution<> > rnd_DELs;
		boost::random::normal_distribution<> rnd_jumpSize (starting_coordinates_diff_mean, starting_coordinates_diff_sd);

		for(unsigned int i = 0; i < read_length; i++)
		{
			rnd_INs.push_back(boost::random::poisson_distribution<>( read_INDEL_freq.at(i) ));
			rnd_DELs.push_back(boost::random::poisson_distribution<>( read_INDEL_freq.at(i) ));
		}

		double thread_indel_events = 0;
		double thread_generated_bases = 0;
		double thread_generated_errors = 0;
		size_t thread_read_pairs = 0;

		auto sampleOneBase = [&](unsigned int position_in_read, char underlyingBase, char& returnedBase, char& returnedQuality) -> void {

			assert(position_in_read < this->read_quality_frequencies.size());
			assert(position_in_read < this->read_quality_correctness.size());

			if(is2nd)
			{
				returnedQuality = Utilities::choose_from_normalized_map(this->read_quality_frequencies_2nd.at(position_in_read), rnd_gen);
			}
			else
			{
				returnedQuality = Utilities::choose_from_normalized_map(this->read_quality_frequencies.at(position_in_read), rnd_gen);
			}


			assert(returnedQuality > 0);

			bool generateError;

			if(is2nd)
			{
				generateError = Utilities::oneBernoulliTrial( 1 - this->read_quality_correctness_2nd.at(position_in_read).at(returnedQuality), rnd_gen);
			}
			else
			{
				generateError = Utilities::oneBernoulliTrial( 1 - this->read_quality_correctness.at(position_in_read).at(returnedQuality), rnd_gen);
			}
						
			if(generateError && (! perfectly))
			{
				returnedBase =  Utilities::randomNucleotide(rnd_gen);
				if(global_error_NUC.count(returnedBase) == 0)
				{
					global_error_NUC[returnedBase] = 0;
				}
				global_error_NUC[returnedBase]++;
				thread_generated_errors++;
			}
			else
			{
				returnedBase = underlyingBase;
			}

			thread_generated_bases++;

			returnedQuality += 32;
		};
		
		auto sampleRead = [&](long long index_into_baseString, std::string& read, std::string& read_qualities, std::vector<int>& coordinates_string, bool& success) -> void {

			read.resize(this->read_length, 0);
			read_qualities.resize(this->read_length, 0);

			coordinates_string.clear();
			success = true;

			int INDEL_events = 0;

			for(unsigned int base = 0; base < this->read_length; base++)
			{
				int insertions = rnd_INs.at(base)(rnd_gen);
				int deletions = rnd_DELs.at(base)(rnd_gen);

				if(perfectly)
				{
					insertions = 0;
					deletions = 0;
				}
				
				// std::cout << "\tbase " << base << " " << insertions << " " << deletions << "\n" << std::flush;

				INDEL_events += (insertions + deletions);

				if(insertions > 0)
				{
					for(int insertionI = 0; insertionI < insertions; insertionI++)
					{
						char baseChar = Utilities::randomNucleotide(rnd_gen);

						char base_for_read; char quality_for_read;
						sampleOneBase(base, baseChar, base_for_read, quality_for_read);
						read.at(base) = base_for_read;
						read_qualities.at(base) = quality_for_read;
						base++;

						coordinates_string.push_back(-1);
						
						if(base >= this->read_length)
							break;
					}

					if(base >= this->read_length)
						break;
				}

				if(deletions > 0)
				{
					index_into_baseString += deletions;
				}

				if(index_into_baseString >= edgePath_string.length())
				{
					success = false;
					break;
				}

				char base_for_read; char quality_for_read;
				assert(index_into_baseString < edgePath_string.size());

				sampleOneBase(base, edgePath_string.at(index_into_baseString), base_for_read, quality_for_read);

				read.at(base) = base_for_read;
				read_qualities.at(base) = quality_for_read;
				coordinates_string.push_back(index_into_baseString);
				
				// std::cout << "index_into_baseString: " << index_into_baseString << "; base: " << base << "; coordinates_string.size(): " << coordinates_string.size() << "\n" << std::flush;
				index_into_baseString++;
			}
		

			thread_indel_events += INDEL_events;
			// std::cout << "INDEL events: " << INDEL_events << "\n";

			if(!((! success) || (coordinates_string.size() == read.size())))
			{
				std::cerr << "success: " << success << "\n";
				std::cerr << "coordinates_string.size(): " << coordinates_string.size() << "\n";
				std::cerr << "read.size(): " << read.size() << "\n" << std::flush;
			}
			
			assert((! success) || (coordinates_string.size() == read.size()));
			
			if(paranoid && success)
			{
				assert(std::find(read.begin(), read.end(), 0) == read.end());
				assert(std::find(read_qualities.begin(), read_qualities.end(), 0) == read_qualities.end());
			}
		};

		for(long long i = 0; i < lastPosition; i++)
		{
			int starting_reads = rnd_starting_reads(rnd_gen);

			for(int readI = 0; readI < starting_reads; readI++)
			{
				int jumpSize = floor(rnd_jumpSize(rnd_gen));


				std::string read1; std::string read1_qualities; std::vector<int> read1_coordinates_string; bool read1_success;
				std::string read2; std::string read2_qualities; std::vector<int> read2_coordinates_string; bool read2_success;

				sampleRead(i,            read1, read1_qualities, read1_coordinates_string, read1_success);
				sampleRead(i + this->read_length + jumpSize, read2, read2_qualities, read2_coordinates_string, read2_success);

				if(read1_success && read2_success)
				{
					thread_read_pairs++;

					read2 = Utilities::seq_reverse_complement(read2);
					std::reverse(read2_qualities.begin(), read2_qualities.end());
					
					std::string read1_name = "p1" + readName_field_separator + Utilities::ItoStr(i);
					std::string read2_name = "p2" + readName_field_separator + Utilities::ItoStr(i + jumpSize);

					oneRead r1(read1_name, read1, read1_qualities);
					oneRead r2(read2_name, read2, read2_qualities);

					std::vector<int> read1_coordinates_edgePath;
					std::vector<int> read2_coordinates_edgePath;
					for(unsigned int cI = 0; cI < read1_coordinates_string.size(); cI++)
					{
						int c = read1_coordinates_string.at(cI);
						if(c == -1)
						{
							read1_coordinates_edgePath.push_back(c);
						}
						else
						{
							read1_coordinates_edgePath.push_back(edgePath_string_originLevel.at(c));
						}
					}
					for(unsigned int cI = 0; cI < read2_coordinates_string.size(); cI++)
					{
						int c = read2_coordinates_string.at(cI);
						if(c == -1)
						{
							read2_coordinates_edgePath.push_back(c);
						}
						else
						{
							read2_coordinates_edgePath.push_back(edgePath_string_originLevel.at(c));
						}
					}

					r1.coordinates_string = read1_coordinates_string;
					r1.coordinates_edgePath = read1_coordinates_edgePath;
					
					std::reverse(read2_coordinates_string.begin(), read2_coordinates_string.end());
					std::reverse(read2_coordinates_edgePath.begin(), read2_coordinates_edgePath.end());

					r2.coordinates_string = read2_coordinates_string;
					r2.coordinates_edgePath = read2_coordinates_edgePath;

					assert(r1.coordinates_string.size() == r1.sequence.size());
					assert(r2.coordinates_string.size() == r2.sequence.size());
					assert(r1.coordinates_string.size() == r1.coordinates_edgePath.size());
					assert(r2.coordinates_string.size() == r2.coordinates_edgePath.size());
					  
					oneReadPair rP(r1, r2, jumpSize);

					if(Utilities::oneBernoulliTrial(0.5, rnd_gen))
					{
						rP.invert();
					}

					forReturn.push_back(rP);
				}
			}
		}
		  
		{
			global_generated_bases += thread_generated_bases;
			global_generated_errors += thread_generated_errors;
			global_indel_events += thread_indel_events;
		}
	}

	std::cout << "readSimulator::simulate_paired_reads_from_edgePath(..): Simulated " << forReturn.size() << " read pairs.\n";
	std::cout << "\t" << "global_generated_bases" << ": " << global_generated_bases << "\n";
	std::cout << "\t" << "global_generated_errors" << ": " << global_generated_errors << "\n";
	std::cout << "\t" << "global_indel_events" << ": " << global_indel_events << "\n\n";
	std::cout << "\t" << "error base counts: \n";
	for(std::map<char, double>::iterator bIt = global_error_NUC.begin(); bIt != global_error_NUC.end(); bIt++)
	{
		std::cout << "\t\t" << bIt->first << ": " << bIt->second << "\n";
	}
	std::cout << "\n" << std::flush;

	return forReturn;
}
// see header
size_t readSimulator::simulate_paired_reads_from_string(std::string readNamePrefix, std::string& s, double expected_haploid_coverage, std::vector<std::pair<std::ofstream*, std::ofstream*>>& output_FHs_perThread, double starting_coordinates_diff_mean, double starting_coordinates_diff_sd)
{
	std::vector<oneReadPair> forReturn;

	double poissonStartRate = expected_haploid_coverage / ( 2.0 * (double)(read_length)); // this is of reads and their pairs, thus / 2
//	std::cout << "Poisson start rate: " << poissonStartRate << "\n";

	long long firstPosition = 0;
	long long lastPosition = s.length() - read_length;

	if(!(lastPosition >= firstPosition))
	{
		throw std::runtime_error("readSimulator::simulate_reads_from_string(): Problem -- lastPosition < firstPosition -- the supplied first string is not long enough!\n");
	}

	omp_set_num_threads(threads);

	std::vector< std::vector<oneReadPair> > forReturn_perThread;
	forReturn_perThread.resize(threads);

	double global_indel_events = 0;
	double global_generated_bases = 0;
	double global_generated_errors = 0;

	auto print_one_readPair = [&output_FHs_perThread] (oneReadPair& rP, unsigned int threadI) -> void
	{
		*(output_FHs_perThread.at(threadI).first) << "@" << rP.reads.first.name << "\n";
		*(output_FHs_perThread.at(threadI).first) << rP.reads.first.sequence << "\n";
		*(output_FHs_perThread.at(threadI).first) << "+" << "\n";
		*(output_FHs_perThread.at(threadI).first) << rP.reads.first.quality << "\n";

		*(output_FHs_perThread.at(threadI).second) << "@" << rP.reads.second.name << "\n";
		*(output_FHs_perThread.at(threadI).second) << rP.reads.second.sequence << "\n";
		*(output_FHs_perThread.at(threadI).second) << "+" << "\n";
		*(output_FHs_perThread.at(threadI).second) << rP.reads.second.quality << "\n";
	};

	#pragma omp parallel
	{
		assert(omp_get_num_threads() == (int)threads);
		int thisThread = omp_get_thread_num();

		boost::mt19937 rnd_gen;

		auto seed = boost::random::random_device()();
		rnd_gen.seed(seed);

		boost::random::poisson_distribution<> rnd_starting_reads ( poissonStartRate );
		std::vector< boost::random::poisson_distribution<> > rnd_INs;
		std::vector< boost::random::poisson_distribution<> > rnd_DELs;
		boost::random::normal_distribution<> rnd_jumpSize (starting_coordinates_diff_mean, starting_coordinates_diff_sd);

		for(unsigned int i = 0; i < read_length; i++)
		{
			rnd_INs.push_back(boost::random::poisson_distribution<>( read_INDEL_freq.at(i) ));
			rnd_DELs.push_back(boost::random::poisson_distribution<>( read_INDEL_freq.at(i) ));
		}

		double thread_indel_events = 0;
		double thread_generated_bases = 0;
		double thread_generated_errors = 0;
		size_t thread_read_pairs = 0;

		auto sampleOneBase = [&](unsigned int position_in_read, char underlyingBase, char& returnedBase, char& returnedQuality) -> void {

			assert(position_in_read < this->read_quality_frequencies.size());
			assert(position_in_read < this->read_quality_correctness.size());

			returnedQuality = Utilities::choose_from_normalized_map(this->read_quality_frequencies.at(position_in_read), rnd_gen);

			assert(returnedQuality > 0);

			bool generateError = Utilities::oneBernoulliTrial( 1 - this->read_quality_correctness.at(position_in_read).at(returnedQuality), rnd_gen);
			if(generateError)
			{
				returnedBase =  Utilities::randomNucleotide(rnd_gen);
				thread_generated_errors++;
			}
			else
			{
				returnedBase = underlyingBase;
			}

			thread_generated_bases++;

			returnedQuality += 32;

		};

		auto sampleRead = [&](long long index_into_baseString, std::string& read, std::string& read_qualities, bool& success) -> void {

			read.resize(this->read_length, 0);
			read_qualities.resize(this->read_length, 0);

			success = true;

			int INDEL_events = 0;

			for(unsigned int base = 0; base < this->read_length; base++)
			{
				int insertions = rnd_INs.at(base)(rnd_gen);
				int deletions = rnd_DELs.at(base)(rnd_gen);

				// std::cout << "\tbase " << base << " " << insertions << " " << deletions << "\n" << std::flush;

				INDEL_events += (insertions + deletions);

				if(insertions > 0)
				{
					for(int insertionI = 0; insertionI < insertions; insertionI++)
					{
						char baseChar = Utilities::randomNucleotide(rnd_gen);

						char base_for_read; char quality_for_read;
						sampleOneBase(base, baseChar, base_for_read, quality_for_read);
						read.at(base) = base_for_read;
						read_qualities.at(base) = quality_for_read;
						base++;

						if(base >= this->read_length)
							break;
					}

					if(base >= this->read_length)
						break;
				}

				if(deletions > 0)
				{
					index_into_baseString += deletions;
				}

				if(index_into_baseString >= s.length())
				{
					success = false;
					break;
				}

				char base_for_read; char quality_for_read;
				assert(index_into_baseString < s.size());

				sampleOneBase(base, s.at(index_into_baseString), base_for_read, quality_for_read);
				read.at(base) = base_for_read;
				read_qualities.at(base) = quality_for_read;

				index_into_baseString++;
			}

			thread_indel_events += INDEL_events;
			// std::cout << "INDEL events: " << INDEL_events << "\n";

			if(paranoid && success)
			{
				assert(std::find(read.begin(), read.end(), 0) == read.end());
				assert(std::find(read_qualities.begin(), read_qualities.end(), 0) == read_qualities.end());
			}
		};

		#pragma omp for
		for(long long i = 0; i < lastPosition; i++)
		{

			assert(omp_get_num_threads() == (int)threads);
			assert(thisThread == omp_get_thread_num());

			if((i % 10000) == 0)
			{
				#pragma omp critical
				{
					if(omp_get_thread_num() == 0)
					{
						double approx_all = thread_generated_bases * omp_get_num_threads();
						std::cout << "\r" << "Generated bases this thread: " <<  thread_generated_bases << " x total: " << approx_all << std::flush;
					}
				}
			}
			int starting_reads = rnd_starting_reads(rnd_gen);

			for(int readI = 0; readI < starting_reads; readI++)
			{
				int jumpSize = floor(rnd_jumpSize(rnd_gen));
				

				std::string read1; std::string read1_qualities; bool read1_success;
				std::string read2; std::string read2_qualities; bool read2_success;

				sampleRead(i,            read1, read1_qualities, read1_success);
				sampleRead(i + jumpSize, read2, read2_qualities, read2_success);

				if(read1_success && read2_success)
				{
					thread_read_pairs++;

					read2 = Utilities::seq_reverse_complement(read2);
					
					
					std::string read1_name = readNamePrefix + readName_field_separator + "p1" + readName_field_separator + Utilities::ItoStr(i);
					std::string read2_name = readNamePrefix + readName_field_separator + "p2" + readName_field_separator + Utilities::ItoStr(i + jumpSize);

					oneRead r1(read1_name, read1, read1_qualities);
					oneRead r2(read2_name, read2, read2_qualities);

					oneReadPair rP(r1, r2, jumpSize);

					print_one_readPair(rP, thisThread);

					// forReturn_perThread.at(thisThread).push_back(rP);
				}
			}
		}

		#pragma omp critical
		{
//			std::cout << "Thread " << thisThread << "; read pairs " << thread_read_pairs << "; thread bases: " << thread_generated_bases << "; thread errors: " << thread_generated_errors << "; thread INDELs: " << thread_indel_events << "\n" << std::flush;

			global_generated_bases += thread_generated_bases;
			global_generated_errors += thread_generated_errors;
			global_indel_events += thread_indel_events;
		}
	}
	
	std::cout << "\n";


//	std::cout << "readSimulator::simulate_paired_reads_from_string() summary\n";
//	std::cout << "\tUnderlying string length: " << s.length() << "\n";
//	std::cout << "\tGenerated bases: " << global_generated_bases << "\n";
//	std::cout << "\tGenerated errors: " << global_generated_errors << "\n";
//	std::cout << "\tGenerated INDEL events: " << global_indel_events << "\n";
//	std::cout << "\tAchieved approximate coverage: " << global_generated_bases/double(s.length()) << "\n\n";

	return global_generated_bases;

//	for(unsigned int tI = 0; tI < threads; tI++)
//	{
//		forReturn.insert(forReturn.end(), forReturn_perThread.at(tI).begin(), forReturn_perThread.at(tI).end());
//	}
//
//	for(unsigned int rI = 0; rI < forReturn.size(); rI++)
//	{
//		oneReadPair& p = forReturn.at(rI);
//		std::cout << "Read pair #" << rI << "\n======================================\n";
//		std::cout << p.reads.first.name << "\n";
//		std::cout << p.reads.first.sequence << "\n";
//		std::cout << p.reads.first.quality << "\n";
//		std::cout << p.diff_starting_coordinates << "\n";
//		std::cout << p.reads.second.name << "\n";
//		std::cout << p.reads.second.sequence << "\n";
//		std::cout << p.reads.second.quality << "\n\n";
//	}

//	return forReturn;
}