Пример #1
0
int read_sequences(Auto_Unzip & input, int num_seq, Mask sequences[], Fasta::FASTQ_encoding format_type, bool gui_output) {
	if (&input == NULL)
		return 0;

	Fasta read;
	read.set_FASTQ_type(format_type);
	int n_seq = 0;
	{
		mutex::scoped_lock lock(read_mutex);
		istream & in = input.filtered();
		while (not input.eof() and n_seq < num_seq) {
			in >> read;
			if (read.length() > MAX_READ_LENGTH) {
				cerr << "Read " << read.get_id() << " too long. Max allowed read size is " << MAX_READ_LENGTH << endl;
				exit(5);
			}
			//Reset and take a ref
			Mask & r = sequences[n_seq] = Mask();
			r.set_id(read.get_id());
			r.set_sequence(read.get_sequence());
			r.set_quality(read.get_quality());
			n_seq++;
		}
		output_progress(input, gui_output);
	}
	return n_seq;
}
Пример #2
0
int read_sequences(Auto_Unzip & first, Auto_Unzip & second, int num_seq, Mask sequences[], Fasta::FASTQ_encoding format_type, bool gui_output) {
	if (&first == NULL or &second == NULL)
		return 0;

	Fasta read;
	read.set_FASTQ_type(format_type);
	int n_seq = 0;
	{
		mutex::scoped_lock lock(read_mutex);
		istream & first_in = first.filtered();
		istream & second_in = second.filtered();
		while (not first.eof() and not second.eof() and (n_seq + 1) < num_seq) {
			first_in >> read;
			if (read.length() > MAX_READ_LENGTH) {
				cerr << "Read " << read.get_id() << " too long. Max allowed read size is " << MAX_READ_LENGTH << endl;
				exit(5);
			}
			Mask & rf = sequences[n_seq];
			rf.set_id(read.get_id());
			rf.set_sequence(read.get_sequence());
			rf.set_quality(read.get_quality());
			n_seq++;
			second_in >> read;
			if (read.length() > MAX_READ_LENGTH) {
				cerr << "Read " << read.get_id() << " too long. Max allowed read size is " << MAX_READ_LENGTH << endl;
				exit(5);
			}
			Mask & rs = sequences[n_seq];
			rs.set_id(read.get_id());
			rs.set_sequence(read.get_sequence());
			rs.set_quality(read.get_quality());
			n_seq++;
			//CHECK!!
			if (rf.id.compare(0, rf.id.size() - 1, rs.id, 0, rs.id.size() - 1) != 0) {
				ERROR_CHANNEL << "wrong paired reads IDs: '" << rf.id << "' and '" << rs.id << '\'' << endl;
				exit(2);
			}
		}
		output_progress(first, gui_output);
	}
	return n_seq;
}
Пример #3
0
void Module_DCREATE::compute_master(const Options & options) {

	string prefix_temp = options.output_file + string("_temp");

	DEFAULT_CHANNEL << '[' << my_rank << "] reading input" << endl;

	// Read all the Fasta files and check for duplicate names
	vector<Fasta *> multi_fasta;
	set<string> names;
	pair<set<string>::iterator,bool> ret;
	bool all_ok = true;
	size_t sum = 0;
	for (vector<string>::const_iterator iter = options.input_files.begin(); iter != options.input_files.end(); iter++) {
		Auto_Unzip input(iter->c_str());
		while (not input.eof()) {
			Fasta * temp = new Fasta();
			input.filtered() >> *temp;
			sum += temp->length();
			multi_fasta.push_back(temp);
			ret = names.insert(temp->get_id());
			if (ret.second == false) {
				ERROR_CHANNEL << "Error: name \"" << temp->get_id() << "\" already exists!" << endl;
				all_ok = false;
			}
		}
	}
	if (not all_ok) {
		for (int node = 1; node < nprocs; node++)
		send_sequences_to_slave(node, 0, 0, string(), string());
		return;
	}

	DEFAULT_CHANNEL << '[' << my_rank << "] sorting" << endl;
	// sort by length
	if (options.balancing)
		sort(multi_fasta.begin(), multi_fasta.end(), sort_reverse_function);

	DEFAULT_CHANNEL << '[' << my_rank << "] preparing header" << endl;

	// prepare file for header
	stringstream header_name;
	header_name << options.output_file << "_h.dht";
	ofstream o(header_name.str().c_str());
	for (vector<Fasta *>::iterator iter = multi_fasta.begin(); iter != multi_fasta.end(); iter++)
		o << (*iter)->get_id() << '\t' << (*iter)->get_sequence().size() << endl;
	o.close();

	DEFAULT_CHANNEL << '[' << my_rank << "] preparing temporary files" << endl;

	// prepare sets and create temp files
	size_t bins = nprocs;
	size_t bin_length[bins];
	ofstream outputs[bins];
	for (size_t i = 0; i < bins; i++) {
		bin_length[i] = 0;
		stringstream filename;
		filename << prefix_temp << '_' << (i+1) << ".fasta";
		temp_files.push_back(filename.str());
		outputs[i].open(filename.str().c_str());
	}

	DEFAULT_CHANNEL << '[' << my_rank << "] writing to files" << endl;

			// write to files
	for (vector<Fasta *>::iterator iter = multi_fasta.begin(); iter != multi_fasta.end(); iter++) {
		size_t min_pos = 0;
		size_t t_min = sum;
		for (size_t i = 0; i < bins; i++)
			if (bin_length[i] < t_min) {
				min_pos = i;
				t_min = bin_length[min_pos];
			}
		bin_length[min_pos] += (*iter)->length();
		outputs[min_pos] << **iter;
	}

	for (size_t i = 0; i < bins; i++) {
		outputs[i].close();
	}

	DEFAULT_CHANNEL << '[' << my_rank << "] sending to slaves" << endl;

	// send to slaves
	for (size_t i = 1; i < bins; i++) {
		stringstream filename_input;
		stringstream filename_output;
		filename_input << prefix_temp << '_' << (i+1) << ".fasta";
		filename_output << options.output_file << '_' << (i+1) << ".eht";
		send_sequences_to_slave(i, options.k, options.blockLength, filename_input.str(), filename_output.str());
	}

	DEFAULT_CHANNEL << '[' << my_rank << "] clearing memory" << endl;
	// clear memory
	for (vector<Fasta *>::iterator iter = multi_fasta.begin(); iter != multi_fasta.end(); iter++)
		delete *iter;

	DEFAULT_CHANNEL << '[' << my_rank << "] computing" << endl;

	// compute by master process
	stringstream filename_input;
	stringstream filename_output;
	filename_input << prefix_temp << "_1.fasta";
	filename_output << options.output_file << "_1.eht";
	compute_hash(options.k,options.blockLength,filename_input.str().c_str(), filename_output.str().c_str(),false); //TODO handle methyl_hash

	DEFAULT_CHANNEL << '[' << my_rank << "] finishing" << endl;

	stringstream filename_numberfile;
	filename_numberfile << options.output_file << "_n.dht";
	ofstream nf(filename_numberfile.str().c_str());
	if (!nf) {
		ERROR_CHANNEL << "I cannot open file " << filename_numberfile.str() << " for writing!" << endl;
		exit(6);
	}
	nf << nprocs << endl;

}