/**
 * Load a batch of single-end reads into a caller-provided array of masks.
 *
 * Reads are pulled one at a time from input.filtered() until either the
 * input reports end-of-file or num_seq reads have been stored. The whole
 * batch, including the progress report, is done while holding read_mutex.
 *
 * @param input        source of the reads
 * @param num_seq      maximum number of reads to store in sequences
 * @param sequences    destination array; slots [0, return value) are overwritten
 * @param format_type  FASTQ encoding handed to the Fasta parser
 * @param gui_output   forwarded unchanged to output_progress()
 * @return number of reads stored in sequences
 *
 * Terminates the process with exit code 5 if a read is longer than
 * MAX_READ_LENGTH.
 */
int read_sequences(Auto_Unzip & input, int num_seq, Mask sequences[], Fasta::FASTQ_encoding format_type, bool gui_output) {
	// Defensive guard kept as-is; a valid reference never has a null address.
	if (&input == NULL)
		return 0;

	Fasta record;
	record.set_FASTQ_type(format_type);

	int loaded = 0;
	{
		mutex::scoped_lock guard(read_mutex);
		istream & stream = input.filtered();

		for (;;) {
			// Stop on end of input first, then on a full batch.
			if (input.eof() or loaded >= num_seq)
				break;

			stream >> record;
			if (record.length() > MAX_READ_LENGTH) {
				cerr << "Read " << record.get_id() << " too long. Max allowed read size is " << MAX_READ_LENGTH << endl;
				exit(5);
			}

			// Start from a pristine mask, then copy the parsed fields into it.
			sequences[loaded] = Mask();
			Mask & slot = sequences[loaded];
			slot.set_id(record.get_id());
			slot.set_sequence(record.get_sequence());
			slot.set_quality(record.get_quality());

			loaded++;
		}

		output_progress(input, gui_output);
	}
	return loaded;
}
int read_sequences(Auto_Unzip & first, Auto_Unzip & second, int num_seq, Mask sequences[], Fasta::FASTQ_encoding format_type, bool gui_output) { if (&first == NULL or &second == NULL) return 0; Fasta read; read.set_FASTQ_type(format_type); int n_seq = 0; { mutex::scoped_lock lock(read_mutex); istream & first_in = first.filtered(); istream & second_in = second.filtered(); while (not first.eof() and not second.eof() and (n_seq + 1) < num_seq) { first_in >> read; if (read.length() > MAX_READ_LENGTH) { cerr << "Read " << read.get_id() << " too long. Max allowed read size is " << MAX_READ_LENGTH << endl; exit(5); } Mask & rf = sequences[n_seq]; rf.set_id(read.get_id()); rf.set_sequence(read.get_sequence()); rf.set_quality(read.get_quality()); n_seq++; second_in >> read; if (read.length() > MAX_READ_LENGTH) { cerr << "Read " << read.get_id() << " too long. Max allowed read size is " << MAX_READ_LENGTH << endl; exit(5); } Mask & rs = sequences[n_seq]; rs.set_id(read.get_id()); rs.set_sequence(read.get_sequence()); rs.set_quality(read.get_quality()); n_seq++; //CHECK!! if (rf.id.compare(0, rf.id.size() - 1, rs.id, 0, rs.id.size() - 1) != 0) { ERROR_CHANNEL << "wrong paired reads IDs: '" << rf.id << "' and '" << rs.id << '\'' << endl; exit(2); } } output_progress(first, gui_output); } return n_seq; }
void Module_DCREATE::compute_master(const Options & options) { string prefix_temp = options.output_file + string("_temp"); DEFAULT_CHANNEL << '[' << my_rank << "] reading input" << endl; // Read all the Fasta files and check for duplicate names vector<Fasta *> multi_fasta; set<string> names; pair<set<string>::iterator,bool> ret; bool all_ok = true; size_t sum = 0; for (vector<string>::const_iterator iter = options.input_files.begin(); iter != options.input_files.end(); iter++) { Auto_Unzip input(iter->c_str()); while (not input.eof()) { Fasta * temp = new Fasta(); input.filtered() >> *temp; sum += temp->length(); multi_fasta.push_back(temp); ret = names.insert(temp->get_id()); if (ret.second == false) { ERROR_CHANNEL << "Error: name \"" << temp->get_id() << "\" already exists!" << endl; all_ok = false; } } } if (not all_ok) { for (int node = 1; node < nprocs; node++) send_sequences_to_slave(node, 0, 0, string(), string()); return; } DEFAULT_CHANNEL << '[' << my_rank << "] sorting" << endl; // sort by length if (options.balancing) sort(multi_fasta.begin(), multi_fasta.end(), sort_reverse_function); DEFAULT_CHANNEL << '[' << my_rank << "] preparing header" << endl; // prepare file for header stringstream header_name; header_name << options.output_file << "_h.dht"; ofstream o(header_name.str().c_str()); for (vector<Fasta *>::iterator iter = multi_fasta.begin(); iter != multi_fasta.end(); iter++) o << (*iter)->get_id() << '\t' << (*iter)->get_sequence().size() << endl; o.close(); DEFAULT_CHANNEL << '[' << my_rank << "] preparing temporary files" << endl; // prepare sets and create temp files size_t bins = nprocs; size_t bin_length[bins]; ofstream outputs[bins]; for (size_t i = 0; i < bins; i++) { bin_length[i] = 0; stringstream filename; filename << prefix_temp << '_' << (i+1) << ".fasta"; temp_files.push_back(filename.str()); outputs[i].open(filename.str().c_str()); } DEFAULT_CHANNEL << '[' << my_rank << "] writing to files" << endl; // write to files for 
(vector<Fasta *>::iterator iter = multi_fasta.begin(); iter != multi_fasta.end(); iter++) { size_t min_pos = 0; size_t t_min = sum; for (size_t i = 0; i < bins; i++) if (bin_length[i] < t_min) { min_pos = i; t_min = bin_length[min_pos]; } bin_length[min_pos] += (*iter)->length(); outputs[min_pos] << **iter; } for (size_t i = 0; i < bins; i++) { outputs[i].close(); } DEFAULT_CHANNEL << '[' << my_rank << "] sending to slaves" << endl; // send to slaves for (size_t i = 1; i < bins; i++) { stringstream filename_input; stringstream filename_output; filename_input << prefix_temp << '_' << (i+1) << ".fasta"; filename_output << options.output_file << '_' << (i+1) << ".eht"; send_sequences_to_slave(i, options.k, options.blockLength, filename_input.str(), filename_output.str()); } DEFAULT_CHANNEL << '[' << my_rank << "] clearing memory" << endl; // clear memory for (vector<Fasta *>::iterator iter = multi_fasta.begin(); iter != multi_fasta.end(); iter++) delete *iter; DEFAULT_CHANNEL << '[' << my_rank << "] computing" << endl; // compute by master process stringstream filename_input; stringstream filename_output; filename_input << prefix_temp << "_1.fasta"; filename_output << options.output_file << "_1.eht"; compute_hash(options.k,options.blockLength,filename_input.str().c_str(), filename_output.str().c_str(),false); //TODO handle methyl_hash DEFAULT_CHANNEL << '[' << my_rank << "] finishing" << endl; stringstream filename_numberfile; filename_numberfile << options.output_file << "_n.dht"; ofstream nf(filename_numberfile.str().c_str()); if (!nf) { ERROR_CHANNEL << "I cannot open file " << filename_numberfile.str() << " for writing!" << endl; exit(6); } nf << nprocs << endl; }