Example #1
0
// Reads a batch of at most num_seq records from one input into sequences[].
//
// The whole batch is read while read_mutex is held. Every destination slot is
// first overwritten with a default-constructed Mask and then receives the id,
// sequence and quality of the record just read. A record longer than
// MAX_READ_LENGTH is reported on cerr and ends the program with exit status 5.
// output_progress() is called once after the loop, still under the lock.
//
// input        source of the records
// num_seq      upper bound on the number of records stored
// sequences    destination array
//              NOTE(review): assumed to hold at least num_seq elements -- confirm against callers
// format_type  FASTQ encoding handed to the Fasta reader
// gui_output   forwarded unchanged to output_progress()
//
// Returns the number of slots of sequences[] that were filled.
int read_sequences(Auto_Unzip & input, int num_seq, Mask sequences[], Fasta::FASTQ_encoding format_type, bool gui_output) {
	// NOTE(review): the address of a reference is not expected to be null in a
	// well-formed program, so compilers may drop this test -- kept as-is.
	if (NULL == &input) {
		return 0;
	}

	Fasta record;
	record.set_FASTQ_type(format_type);

	int stored = 0;
	{
		// The guard is released at the end of this scope, after the progress report.
		mutex::scoped_lock guard(read_mutex);
		istream & source = input.filtered();
		// NOTE(review): eof() is tested before the extraction, so whether a
		// trailing empty record can be stored depends on Auto_Unzip -- confirm.
		for (; !input.eof() && stored < num_seq; ++stored) {
			source >> record;
			if (record.length() > MAX_READ_LENGTH) {
				cerr << "Read " << record.get_id() << " too long. Max allowed read size is " << MAX_READ_LENGTH << endl;
				exit(5);
			}
			// Wipe whatever the slot held before, then fill it in.
			sequences[stored] = Mask();
			Mask & slot = sequences[stored];
			slot.set_id(record.get_id());
			slot.set_sequence(record.get_sequence());
			slot.set_quality(record.get_quality());
		}
		output_progress(input, gui_output);
	}
	return stored;
}
Example #2
0
// Reads paired records from two inputs into sequences[], interleaved: the
// record from `first` goes to an even slot and its mate from `second` to the
// following odd slot.
//
// The whole batch is read while read_mutex is held. A pair is only read when
// two more slots are available below num_seq. A record longer than
// MAX_READ_LENGTH is reported on cerr and ends the program with exit status 5.
// The ids of a pair must be equal once the last character of each is ignored;
// otherwise the mismatch is reported on ERROR_CHANNEL and the program ends
// with exit status 2. output_progress() is called once after the loop, for
// `first` only.
//
// first, second  sources of the two mates
// num_seq        upper bound on the number of slots used
// sequences      destination array
//                NOTE(review): assumed to hold at least num_seq elements -- confirm against callers
// format_type    FASTQ encoding handed to the Fasta reader
// gui_output     forwarded unchanged to output_progress()
//
// Returns the number of slots filled (always an even number).
int read_sequences(Auto_Unzip & first, Auto_Unzip & second, int num_seq, Mask sequences[], Fasta::FASTQ_encoding format_type, bool gui_output) {
	// NOTE(review): the address of a reference is not expected to be null in a
	// well-formed program, so compilers may drop this test -- kept as-is.
	if (NULL == &first || NULL == &second) {
		return 0;
	}

	Fasta record;
	record.set_FASTQ_type(format_type);

	int stored = 0;
	{
		// The guard is released at the end of this scope, after the progress report.
		mutex::scoped_lock guard(read_mutex);
		istream & first_source = first.filtered();
		istream & second_source = second.filtered();
		for (; !first.eof() && !second.eof() && (stored + 1) < num_seq; stored += 2) {
			// First mate.
			// NOTE(review): the slots are not reset here; only id, sequence and
			// quality are overwritten.
			first_source >> record;
			if (record.length() > MAX_READ_LENGTH) {
				cerr << "Read " << record.get_id() << " too long. Max allowed read size is " << MAX_READ_LENGTH << endl;
				exit(5);
			}
			Mask & first_mate = sequences[stored];
			first_mate.set_id(record.get_id());
			first_mate.set_sequence(record.get_sequence());
			first_mate.set_quality(record.get_quality());

			// Second mate, stored right after the first one.
			second_source >> record;
			if (record.length() > MAX_READ_LENGTH) {
				cerr << "Read " << record.get_id() << " too long. Max allowed read size is " << MAX_READ_LENGTH << endl;
				exit(5);
			}
			Mask & second_mate = sequences[stored + 1];
			second_mate.set_id(record.get_id());
			second_mate.set_sequence(record.get_sequence());
			second_mate.set_quality(record.get_quality());

			// Both ids must agree when their final character is left out.
			// NOTE(review): presumably that character is the mate number -- confirm.
			const bool same_pair = first_mate.id.compare(0, first_mate.id.size() - 1, second_mate.id, 0, second_mate.id.size() - 1) == 0;
			if (!same_pair) {
				ERROR_CHANNEL << "wrong paired reads IDs: '" << first_mate.id << "' and '" << second_mate.id << '\'' << endl;
				exit(2);
			}
		}
		output_progress(first, gui_output);
	}
	return stored;
}
int main( int argc, char *argv[])
{
	// Options
	bool     showHelp          = false;
	string   cutSeq   = "AAGCTT";
	string   genomeFile;
	string   bedFile           = "stdout";
	string   faFile            = "ends.fa";
	CHRPOS   readLen           = 20;

	// Show help when has no options
	if(argc <= 1)
	{
		Help();
		return 0;
	}

	// Parsing options
	for(int i = 1; i < argc; i++)
	{
		int parameterLength = (int)strlen(argv[i]);
		if((PARAMETER_CHECK("-h", 2, parameterLength)) || (PARAMETER_CHECK("--help", 5, parameterLength))) 
			showHelp=true;
		else if((PARAMETER_CHECK("-g", 2, parameterLength)) || (PARAMETER_CHECK("--genome", 8, parameterLength)))
		{
			if ((++i) < argc) 
				genomeFile = argv[i];
		}
		else if((PARAMETER_CHECK("-c", 2, parameterLength)) || (PARAMETER_CHECK("--cut_seq", 9, parameterLength)))
		{
			if ((++i) < argc)
				cutSeq = argv[i];
		}
		else if ((PARAMETER_CHECK("-b", 2, parameterLength)) || (PARAMETER_CHECK("--bed_output", 12, parameterLength)))
		{
			if ((++i) < argc)
				bedFile=argv[i];
		}
		else if ((PARAMETER_CHECK("-f", 2, parameterLength)) || (PARAMETER_CHECK("--fa_output", 11, parameterLength)))
		{
			if ((++i) < argc)
				faFile=argv[i];
		}
		else if ((PARAMETER_CHECK("-r", 2, parameterLength)) || (PARAMETER_CHECK("--read_len", 10, parameterLength)))
		{
			if ((++i) < argc)
				readLen = StringUtils::toValue<CHRPOS>(argv[i]);
		}
		else
		{
			cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
			showHelp = true;
		}
	}
	
	// Show help if no proper auguments.
	if (showHelp)
	{
		Help();
		return 0;
	}

	// Statistical variables
	map <string, int, less<string> > bedCount;
	map <string, CHRPOS, less<string> > bedSum;
	map <string, CHRPOS, less<string> > faSize;

    // Variables
	CHRPOS lindex,rindex;
	int siteLen=cutSeq.size();
	bool flag;
	Fasta curFa;
	SeqReader fhfa(genomeFile);
	Writer bedOutput(bedFile);
	Writer faOutput(faFile);

	
	// open files
	fhfa.open();
	bedOutput.open();
	faOutput.open();

	// Read the genome file.
	while (fhfa.getNext(curFa))	
	{
		// Statistics
		bedCount[curFa.id]=0;
		bedSum[curFa.id]=0;
		faSize[curFa.id]=curFa.length();
		
		// Find next recognition site.
		lindex=rindex=0;
		flag=true;
		while (flag)
		{
			rindex=curFa.seq.find(cutSeq,lindex);
			if(rindex==CHRPOS(string::npos))
			{
				rindex=curFa.seq.size();
				flag=false;
			}

			(*(bedOutput.Printer())) << curFa.id << "\t" << lindex << "\t" << rindex << "\t" << (lindex+rindex)/2 << endl;
			if (rindex - lindex >= 2*readLen)
			{
				bedCount[curFa.id]++;
				bedSum[curFa.id]+=rindex-lindex;
				(*(faOutput.Printer())) << ">" << curFa.id << "_" << (lindex+rindex)/2 << "_L" << endl;
				(*(faOutput.Printer())) << curFa.seq.substr(lindex,readLen) << endl;
				(*(faOutput.Printer())) << ">" << curFa.id << "_" << (lindex+rindex)/2 << "_R" << endl;
				(*(faOutput.Printer())) << curFa.seq.substr(rindex-readLen,readLen) << endl;
			}
			lindex=rindex+siteLen;
		}
	}
	
	// close files
	fhfa.close();
	bedOutput.close();
	faOutput.close();

	// print statistics into log file
	Writer log(cutSeq+".log");
	log.open();
	log.close();
	
	return 0;
}
Example #4
0
// Master-side part of the distributed hash creation.
//
// Steps, in order:
//  1. read every record of every file in options.input_files, looking for
//     duplicate ids;
//  2. if options.balancing is set, sort the records with sort_reverse_function;
//  3. write "<output_file>_h.dht" with one "id<TAB>sequence size" line per record;
//  4. distribute the records over nprocs bins, each record going to the bin
//     with the smallest accumulated length so far, and write bin i to
//     "<output_file>_temp_<i+1>.fasta" (the names are appended to temp_files);
//  5. hand bins 2..nprocs to the slave nodes through send_sequences_to_slave()
//     and process bin 1 locally with compute_hash();
//  6. write nprocs to "<output_file>_n.dht"; exit status 6 if that file cannot
//     be opened.
//
// If a duplicate id was found, every slave node receives an empty request
// (zero k, zero block length, empty file names) and the function returns
// without writing any file.
void Module_DCREATE::compute_master(const Options & options) {

	string prefix_temp = options.output_file + string("_temp");

	DEFAULT_CHANNEL << '[' << my_rank << "] reading input" << endl;

	// Read all the Fasta files and check for duplicate names
	vector<Fasta *> multi_fasta;
	set<string> names;
	bool all_ok = true;
	size_t sum = 0;
	for (vector<string>::const_iterator iter = options.input_files.begin(); iter != options.input_files.end(); iter++) {
		Auto_Unzip input(iter->c_str());
		while (not input.eof()) {
			Fasta * temp = new Fasta();
			input.filtered() >> *temp;
			sum += temp->length();
			multi_fasta.push_back(temp);
			if (not names.insert(temp->get_id()).second) {
				ERROR_CHANNEL << "Error: name \"" << temp->get_id() << "\" already exists!" << endl;
				all_ok = false;
			}
		}
	}
	if (not all_ok) {
		// NOTE(review): presumably the empty request tells each slave to stop -- confirm.
		for (int node = 1; node < nprocs; node++)
			send_sequences_to_slave(node, 0, 0, string(), string());
		// The records are owned by this function; release them on this path too.
		for (vector<Fasta *>::iterator iter = multi_fasta.begin(); iter != multi_fasta.end(); iter++)
			delete *iter;
		return;
	}

	DEFAULT_CHANNEL << '[' << my_rank << "] sorting" << endl;
	// sort by length
	if (options.balancing)
		sort(multi_fasta.begin(), multi_fasta.end(), sort_reverse_function);

	DEFAULT_CHANNEL << '[' << my_rank << "] preparing header" << endl;

	// prepare file for header
	stringstream header_name;
	header_name << options.output_file << "_h.dht";
	ofstream o(header_name.str().c_str());
	for (vector<Fasta *>::iterator iter = multi_fasta.begin(); iter != multi_fasta.end(); iter++)
		o << (*iter)->get_id() << '\t' << (*iter)->get_sequence().size() << endl;
	o.close();

	DEFAULT_CHANNEL << '[' << my_rank << "] preparing temporary files" << endl;

	// prepare sets and register temp files
	// Standard containers replace the former variable-length arrays, which are
	// a compiler extension.
	const size_t bins = nprocs;
	vector<size_t> bin_length(bins, 0);
	vector<vector<Fasta *> > bin_members(bins);
	vector<string> bin_files;
	for (size_t i = 0; i < bins; i++) {
		stringstream filename;
		filename << prefix_temp << '_' << (i+1) << ".fasta";
		temp_files.push_back(filename.str());
		bin_files.push_back(filename.str());
	}

	DEFAULT_CHANNEL << '[' << my_rank << "] writing to files" << endl;

	// assign every record to the bin that is currently the shortest
	for (vector<Fasta *>::iterator iter = multi_fasta.begin(); iter != multi_fasta.end(); iter++) {
		size_t min_pos = 0;
		size_t t_min = sum;
		for (size_t i = 0; i < bins; i++)
			if (bin_length[i] < t_min) {
				min_pos = i;
				t_min = bin_length[min_pos];
			}
		bin_length[min_pos] += (*iter)->length();
		bin_members[min_pos].push_back(*iter);
	}

	// write to files, one bin at a time; records keep their relative order
	for (size_t i = 0; i < bins; i++) {
		ofstream output(bin_files[i].c_str());
		const vector<Fasta *> & members = bin_members[i];
		for (vector<Fasta *>::const_iterator member = members.begin(); member != members.end(); member++)
			output << **member;
		output.close();
	}

	DEFAULT_CHANNEL << '[' << my_rank << "] sending to slaves" << endl;

	// send to slaves: node i works on bin i+1, bin 1 stays with the master
	for (size_t i = 1; i < bins; i++) {
		stringstream filename_input;
		stringstream filename_output;
		filename_input << prefix_temp << '_' << (i+1) << ".fasta";
		filename_output << options.output_file << '_' << (i+1) << ".eht";
		send_sequences_to_slave(i, options.k, options.blockLength, filename_input.str(), filename_output.str());
	}

	DEFAULT_CHANNEL << '[' << my_rank << "] clearing memory" << endl;
	// clear memory
	for (vector<Fasta *>::iterator iter = multi_fasta.begin(); iter != multi_fasta.end(); iter++)
		delete *iter;

	DEFAULT_CHANNEL << '[' << my_rank << "] computing" << endl;

	// compute by master process
	stringstream filename_input;
	stringstream filename_output;
	filename_input << prefix_temp << "_1.fasta";
	filename_output << options.output_file << "_1.eht";
	compute_hash(options.k,options.blockLength,filename_input.str().c_str(), filename_output.str().c_str(),false); //TODO handle methyl_hash

	DEFAULT_CHANNEL << '[' << my_rank << "] finishing" << endl;

	// record the number of processes (and therefore of bins) used
	stringstream filename_numberfile;
	filename_numberfile << options.output_file << "_n.dht";
	ofstream nf(filename_numberfile.str().c_str());
	if (!nf) {
		ERROR_CHANNEL << "I cannot open file " << filename_numberfile.str() << " for writing!" << endl;
		exit(6);
	}
	nf << nprocs << endl;

}