// Loads up to num_seq reads from a single input into sequences[].
//
//   input        source of reads; records are extracted from input.filtered()
//   num_seq      maximum number of reads to store in sequences[]
//   sequences    destination array; every slot that gets used is first
//                overwritten with a default-constructed Mask
//   format_type  FASTQ encoding handed to the Fasta parser
//   gui_output   forwarded unchanged to output_progress()
//
// Returns the number of slots that were filled. The whole batch is read
// while read_mutex is held. The process terminates with exit(5) as soon as a
// read longer than MAX_READ_LENGTH is encountered.
int read_sequences(Auto_Unzip & input, int num_seq, Mask sequences[], Fasta::FASTQ_encoding format_type, bool gui_output) {
	if (&input == NULL)
		return 0;

	Fasta record;
	record.set_FASTQ_type(format_type);

	int loaded = 0;
	{
		mutex::scoped_lock guard(read_mutex);
		istream & stream = input.filtered();

		// Stop at end of input or when the destination array is full
		for (; not (input.eof() or loaded >= num_seq); ++loaded) {
			stream >> record;
			if (record.length() > MAX_READ_LENGTH) {
				cerr << "Read " << record.get_id() << " too long. Max allowed read size is " << MAX_READ_LENGTH << endl;
				exit(5);
			}

			// Wipe the slot, then copy the fields of the freshly parsed read
			sequences[loaded] = Mask();
			Mask & slot = sequences[loaded];
			slot.set_id(record.get_id());
			slot.set_sequence(record.get_sequence());
			slot.set_quality(record.get_quality());
		}

		output_progress(input, gui_output);
	}

	return loaded;
}
int read_sequences(Auto_Unzip & first, Auto_Unzip & second, int num_seq, Mask sequences[], Fasta::FASTQ_encoding format_type, bool gui_output) { if (&first == NULL or &second == NULL) return 0; Fasta read; read.set_FASTQ_type(format_type); int n_seq = 0; { mutex::scoped_lock lock(read_mutex); istream & first_in = first.filtered(); istream & second_in = second.filtered(); while (not first.eof() and not second.eof() and (n_seq + 1) < num_seq) { first_in >> read; if (read.length() > MAX_READ_LENGTH) { cerr << "Read " << read.get_id() << " too long. Max allowed read size is " << MAX_READ_LENGTH << endl; exit(5); } Mask & rf = sequences[n_seq]; rf.set_id(read.get_id()); rf.set_sequence(read.get_sequence()); rf.set_quality(read.get_quality()); n_seq++; second_in >> read; if (read.length() > MAX_READ_LENGTH) { cerr << "Read " << read.get_id() << " too long. Max allowed read size is " << MAX_READ_LENGTH << endl; exit(5); } Mask & rs = sequences[n_seq]; rs.set_id(read.get_id()); rs.set_sequence(read.get_sequence()); rs.set_quality(read.get_quality()); n_seq++; //CHECK!! if (rf.id.compare(0, rf.id.size() - 1, rs.id, 0, rs.id.size() - 1) != 0) { ERROR_CHANNEL << "wrong paired reads IDs: '" << rf.id << "' and '" << rs.id << '\'' << endl; exit(2); } } output_progress(first, gui_output); } return n_seq; }
int main( int argc, char *argv[]) { // Options bool showHelp = false; string cutSeq = "AAGCTT"; string genomeFile; string bedFile = "stdout"; string faFile = "ends.fa"; CHRPOS readLen = 20; // Show help when has no options if(argc <= 1) { Help(); return 0; } // Parsing options for(int i = 1; i < argc; i++) { int parameterLength = (int)strlen(argv[i]); if((PARAMETER_CHECK("-h", 2, parameterLength)) || (PARAMETER_CHECK("--help", 5, parameterLength))) showHelp=true; else if((PARAMETER_CHECK("-g", 2, parameterLength)) || (PARAMETER_CHECK("--genome", 8, parameterLength))) { if ((++i) < argc) genomeFile = argv[i]; } else if((PARAMETER_CHECK("-c", 2, parameterLength)) || (PARAMETER_CHECK("--cut_seq", 9, parameterLength))) { if ((++i) < argc) cutSeq = argv[i]; } else if ((PARAMETER_CHECK("-b", 2, parameterLength)) || (PARAMETER_CHECK("--bed_output", 12, parameterLength))) { if ((++i) < argc) bedFile=argv[i]; } else if ((PARAMETER_CHECK("-f", 2, parameterLength)) || (PARAMETER_CHECK("--fa_output", 11, parameterLength))) { if ((++i) < argc) faFile=argv[i]; } else if ((PARAMETER_CHECK("-r", 2, parameterLength)) || (PARAMETER_CHECK("--read_len", 10, parameterLength))) { if ((++i) < argc) readLen = StringUtils::toValue<CHRPOS>(argv[i]); } else { cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; showHelp = true; } } // Show help if no proper auguments. if (showHelp) { Help(); return 0; } // Statistical variables map <string, int, less<string> > bedCount; map <string, CHRPOS, less<string> > bedSum; map <string, CHRPOS, less<string> > faSize; // Variables CHRPOS lindex,rindex; int siteLen=cutSeq.size(); bool flag; Fasta curFa; SeqReader fhfa(genomeFile); Writer bedOutput(bedFile); Writer faOutput(faFile); // open files fhfa.open(); bedOutput.open(); faOutput.open(); // Read the genome file. 
while (fhfa.getNext(curFa)) { // Statistics bedCount[curFa.id]=0; bedSum[curFa.id]=0; faSize[curFa.id]=curFa.length(); // Find next recognition site. lindex=rindex=0; flag=true; while (flag) { rindex=curFa.seq.find(cutSeq,lindex); if(rindex==CHRPOS(string::npos)) { rindex=curFa.seq.size(); flag=false; } (*(bedOutput.Printer())) << curFa.id << "\t" << lindex << "\t" << rindex << "\t" << (lindex+rindex)/2 << endl; if (rindex - lindex >= 2*readLen) { bedCount[curFa.id]++; bedSum[curFa.id]+=rindex-lindex; (*(faOutput.Printer())) << ">" << curFa.id << "_" << (lindex+rindex)/2 << "_L" << endl; (*(faOutput.Printer())) << curFa.seq.substr(lindex,readLen) << endl; (*(faOutput.Printer())) << ">" << curFa.id << "_" << (lindex+rindex)/2 << "_R" << endl; (*(faOutput.Printer())) << curFa.seq.substr(rindex-readLen,readLen) << endl; } lindex=rindex+siteLen; } } // close files fhfa.close(); bedOutput.close(); faOutput.close(); // print statistics into log file Writer log(cutSeq+".log"); log.open(); log.close(); return 0; }
void Module_DCREATE::compute_master(const Options & options) { string prefix_temp = options.output_file + string("_temp"); DEFAULT_CHANNEL << '[' << my_rank << "] reading input" << endl; // Read all the Fasta files and check for duplicate names vector<Fasta *> multi_fasta; set<string> names; pair<set<string>::iterator,bool> ret; bool all_ok = true; size_t sum = 0; for (vector<string>::const_iterator iter = options.input_files.begin(); iter != options.input_files.end(); iter++) { Auto_Unzip input(iter->c_str()); while (not input.eof()) { Fasta * temp = new Fasta(); input.filtered() >> *temp; sum += temp->length(); multi_fasta.push_back(temp); ret = names.insert(temp->get_id()); if (ret.second == false) { ERROR_CHANNEL << "Error: name \"" << temp->get_id() << "\" already exists!" << endl; all_ok = false; } } } if (not all_ok) { for (int node = 1; node < nprocs; node++) send_sequences_to_slave(node, 0, 0, string(), string()); return; } DEFAULT_CHANNEL << '[' << my_rank << "] sorting" << endl; // sort by length if (options.balancing) sort(multi_fasta.begin(), multi_fasta.end(), sort_reverse_function); DEFAULT_CHANNEL << '[' << my_rank << "] preparing header" << endl; // prepare file for header stringstream header_name; header_name << options.output_file << "_h.dht"; ofstream o(header_name.str().c_str()); for (vector<Fasta *>::iterator iter = multi_fasta.begin(); iter != multi_fasta.end(); iter++) o << (*iter)->get_id() << '\t' << (*iter)->get_sequence().size() << endl; o.close(); DEFAULT_CHANNEL << '[' << my_rank << "] preparing temporary files" << endl; // prepare sets and create temp files size_t bins = nprocs; size_t bin_length[bins]; ofstream outputs[bins]; for (size_t i = 0; i < bins; i++) { bin_length[i] = 0; stringstream filename; filename << prefix_temp << '_' << (i+1) << ".fasta"; temp_files.push_back(filename.str()); outputs[i].open(filename.str().c_str()); } DEFAULT_CHANNEL << '[' << my_rank << "] writing to files" << endl; // write to files for 
(vector<Fasta *>::iterator iter = multi_fasta.begin(); iter != multi_fasta.end(); iter++) { size_t min_pos = 0; size_t t_min = sum; for (size_t i = 0; i < bins; i++) if (bin_length[i] < t_min) { min_pos = i; t_min = bin_length[min_pos]; } bin_length[min_pos] += (*iter)->length(); outputs[min_pos] << **iter; } for (size_t i = 0; i < bins; i++) { outputs[i].close(); } DEFAULT_CHANNEL << '[' << my_rank << "] sending to slaves" << endl; // send to slaves for (size_t i = 1; i < bins; i++) { stringstream filename_input; stringstream filename_output; filename_input << prefix_temp << '_' << (i+1) << ".fasta"; filename_output << options.output_file << '_' << (i+1) << ".eht"; send_sequences_to_slave(i, options.k, options.blockLength, filename_input.str(), filename_output.str()); } DEFAULT_CHANNEL << '[' << my_rank << "] clearing memory" << endl; // clear memory for (vector<Fasta *>::iterator iter = multi_fasta.begin(); iter != multi_fasta.end(); iter++) delete *iter; DEFAULT_CHANNEL << '[' << my_rank << "] computing" << endl; // compute by master process stringstream filename_input; stringstream filename_output; filename_input << prefix_temp << "_1.fasta"; filename_output << options.output_file << "_1.eht"; compute_hash(options.k,options.blockLength,filename_input.str().c_str(), filename_output.str().c_str(),false); //TODO handle methyl_hash DEFAULT_CHANNEL << '[' << my_rank << "] finishing" << endl; stringstream filename_numberfile; filename_numberfile << options.output_file << "_n.dht"; ofstream nf(filename_numberfile.str().c_str()); if (!nf) { ERROR_CHANNEL << "I cannot open file " << filename_numberfile.str() << " for writing!" << endl; exit(6); } nf << nprocs << endl; }