Esempio n. 1
0
/**
 * Evaluates the likelihood of a tree on an RNA alignment read from a FASTA
 * file, using a GTR model whose frequencies are set from the data and a
 * GammaDiscreteDistribution(8, 1, 1) rate distribution.
 *
 * @param tree          tree in parenthesis notation, parsed with
 *                      TreeTemplateTools::parenthesisToTree(tree, false, "ID")
 * @param aln_filename  path handed to the Fasta reader
 * @param optimize_bls  when true, the branch-length and rate-distribution
 *                      parameters are optimized before the value is read
 * @param tolerance     tolerance forwarded to optimizeNumericalParameters
 * @return pair of (parenthesis string of the likelihood object's tree,
 *         negated getValue() of the likelihood object)
 */
pair<string,scalar_type> tree_LL_nucl(string tree,string aln_filename,bool optimize_bls,scalar_type tolerance)
{
	//const Alphabet* alphabet = new ProteicAlphabet();
	const Alphabet* alphabet = new RNA();
	Fasta Reader;
	//NexusIOSequence Reader;
	//Phylip * Reader=new Phylip(true,true,100,true,"\r");
	OrderedSequenceContainer* alignment = Reader.read(aln_filename, alphabet);
	VectorSiteContainer* sites = new VectorSiteContainer(*alignment);
	SiteContainerTools::removeGapOnlySites(*sites);
	SiteContainerTools::changeGapsToUnknownCharacters(*sites);

	TreeTemplate<Node>* ttree1 = TreeTemplateTools::parenthesisToTree(tree,false,"ID");

	SubstitutionModel* model = new GTR(&AlphabetTools::RNA_ALPHABET);
	model->setFreqFromData(*sites);
	DiscreteDistribution* rDist = new GammaDiscreteDistribution(8, 1, 1);

	DiscreteRatesAcrossSitesTreeLikelihood* tl1 =
		new RHomogeneousTreeLikelihood(*ttree1, *sites, model, rDist, true, false, false);
	tl1->initialize();

	if (optimize_bls)
	{
		// Only branch lengths and rate-distribution parameters are optimized.
		ParameterList parameters;
		parameters.addParameters(tl1->getBranchLengthsParameters());
		parameters.addParameters(tl1->getRateDistributionParameters());
		OptimizationTools::optimizeNumericalParameters(
			tl1,
			//tl1->getParameters(),
			parameters,
			0,
			1,
			tolerance,
			1000,
			0,
			0,
			false,
			0,
			OptimizationTools::OPTIMIZATION_NEWTON,
			//OptimizationTools::OPTIMIZATION_BRENT);
			OptimizationTools::OPTIMIZATION_BFGS);
	}

	scalar_type LL = -tl1->getValue(); //Here's your log likelihood value !
	//tl1->getParameters().printParameters(cout);
	//cout << TreeTemplateTools::treeToParenthesis( tl1->getTree() ) <<endl;
	pair<string,scalar_type> return_pair;
	return_pair.first = TreeTemplateTools::treeToParenthesis(tl1->getTree());
	return_pair.second = LL;

	// Release everything allocated above. The raw alignment and the parsed
	// tree used to be leaked; they are freed last among their dependants
	// (sites before alignment, likelihood before tree) so the order is safe
	// whether or not those objects keep references to them.
	delete sites;
	delete alignment;
	delete alphabet;
	delete model;
	delete rDist;
	delete tl1;
	delete ttree1;
	return return_pair;
}
Esempio n. 2
0
int main(int argc, const char** argv)
{
  usage(argc, argv);

  string read = argv[1];
  string germline = argv[2];

  Fasta Vgenes(germline+"V.fa", 2, "|");
  Fasta Jgenes(germline+"J.fa", 2, "|");

  Fasta interestingV = extractInterestingGenes(Vgenes, argv[3]);
  Fasta interestingJ = extractInterestingGenes(Jgenes, argv[4]);

  if (interestingV.size() == 0) {
    cerr << "No interesting V found" << endl;
    exit(2);
  }
  if (interestingJ.size() == 0) {
    cerr << "No interesting J found" << endl;
    exit(2);
  }

  AlignBox box_V("5", V_COLOR);
  AlignBox box_J("3", J_COLOR);

  if (read == "-") {
    // Read on stdin
    read = read_sequence(cin);
  }
  
  align_against_collection(read, interestingV, -1, false, false, false, &box_V, VDJ);
  align_against_collection(read, interestingJ, -1, false, true, false, &box_J, VDJ);
  // This should be handled directly into align_against_collection
  box_J.start = box_J.end ;
  box_J.del_left = box_J.del_right;
  box_J.end = read.size() - 1;
  
  int align_V_length = min(GENE_ALIGN, box_V.end - box_V.start + 1);
  int align_J_length = min(GENE_ALIGN, (int)read.size() - box_J.start + 1);
  int start_V = box_V.end - align_V_length + 1;
  int end_J = box_J.start + align_J_length - 1;

  cout << "read        \t" << start_V << "\t" ;

  cout << V_COLOR << read.substr(start_V, align_V_length)
       << NO_COLOR
       << read.substr(box_V.end+1, (box_J.start - 1) - (box_V.end + 1) +1)
       << J_COLOR
       << read.substr(box_J.start, align_J_length)
       << NO_COLOR
       << "\t" << end_J << endl ;

  cout << box_V.refToString(start_V, end_J) << "\t" << box_V << endl ;
  cout << box_J.refToString(start_V, end_J) << "\t" << box_J << endl ;
      
  exit (0);
}
Esempio n. 3
0
/**
 * Reads a FASTA file with the protein alphabet and returns its sequences as
 * a shared VectorSiteContainer.
 *
 * @param filename  path handed to Fasta::readSequences
 * @return the site container built from the sequences that were read
 * @throws Exception when the resulting container holds no sequences
 */
shared_ptr<VectorSiteContainer> SiteContainerBuilder::read_fasta_protein_file(
        string filename) {
    Fasta reader;
    // Hold the intermediate container in a smart pointer so that it is freed
    // even if constructing the VectorSiteContainer throws.
    std::unique_ptr<SequenceContainer> alignment(
            reader.readSequences(filename, &AlphabetTools::PROTEIN_ALPHABET));
    shared_ptr<VectorSiteContainer> sequences(new VectorSiteContainer(*alignment));
    alignment.reset();
    if (sequences->getNumberOfSequences() == 0) {
        // `sequences` is released automatically during stack unwinding.
        throw Exception("The alignment is empty - did you specify the right file format?");
    }
    return sequences;
}
Esempio n. 4
0
/*
 * Builds a new Fasta containing every entry of `repertoire` whose label
 * contains `name` as a substring, in the order they appear.
 */
Fasta extractInterestingGenes(Fasta &repertoire, string name) {
  Fasta selection;

  const int total = repertoire.size();
  for (int idx = 0; idx < total; idx++) {
    // skip entries whose label does not mention the requested name
    if (repertoire.label(idx).find(name) == string::npos)
      continue;
    selection.add(repertoire.read(idx));
  }

  return selection;
}
Esempio n. 5
0
/*
 * Reads up to num_seq records from `input` into `sequences`, holding
 * read_mutex for the whole batch, then reports progress.
 *
 * Each destination Mask is reset before its id, sequence and quality are
 * filled in. The process exits with status 5 if a record is longer than
 * MAX_READ_LENGTH.
 *
 * Returns the number of slots of `sequences` that were filled.
 */
int read_sequences(Auto_Unzip & input, int num_seq, Mask sequences[], Fasta::FASTQ_encoding format_type, bool gui_output) {
	if (&input == NULL)
		return 0;

	Fasta record;
	record.set_FASTQ_type(format_type);
	int loaded = 0;
	{
		mutex::scoped_lock lock(read_mutex);
		istream & stream = input.filtered();
		for (; not input.eof() and loaded < num_seq; loaded++) {
			stream >> record;
			if (record.length() > MAX_READ_LENGTH) {
				cerr << "Read " << record.get_id() << " too long. Max allowed read size is " << MAX_READ_LENGTH << endl;
				exit(5);
			}
			// Start from a fresh Mask, then fill it from the record
			sequences[loaded] = Mask();
			Mask & slot = sequences[loaded];
			slot.set_id(record.get_id());
			slot.set_sequence(record.get_sequence());
			slot.set_quality(record.get_quality());
		}
		output_progress(input, gui_output);
	}
	return loaded;
}
Esempio n. 6
0
// Builds the contig for one variant: up to `flank` reference bases on each
// side of the variant (limited on the left by position 1 and on the right by
// the length of `seqname`), with the alt allele in between, plus the cigar
// describing the contig against the reference.
VariantContig::VariantContig(
        RawVariant const& var,
        Fasta& ref,
        int flank,
        std::string const& seqname
        )
{
    uint64_t const contig_seqlen = ref.seqlen(seqname);

    // left flank: shortened when the variant sits closer than `flank` to the start
    uint64_t const left_len = var.pos <= flank ? var.pos - 1 : flank;
    _start = std::max(1ul, var.pos - left_len);
    _stop = std::min(var.pos + var.ref.size() - 1 + flank, contig_seqlen);

    // right flank: starts right after the reference allele
    uint64_t const right_begin = var.pos + var.ref.size();
    uint64_t const right_len = _stop - right_begin + 1;
    bool const has_left = left_len != 0;
    bool const has_right = right_len != 0;

    // sequence = left flank + alt allele + right flank
    if (has_left)
        _sequence = ref.sequence(seqname, _start, left_len);
    _sequence += var.alt;
    if (right_begin <= contig_seqlen && has_right)
        _sequence += ref.sequence(seqname, right_begin, right_len);

    // cigar = left flank match, variant operations, right flank match
    if (has_left)
        _cigar.push_back(left_len, MATCH);

    if (var.ref.size() == var.alt.size()) {
        _cigar.push_back(var.alt.size(), MATCH);
    } else if (var.ref.size() < var.alt.size()) {
        _cigar.push_back(var.ref.size(), MATCH);
        _cigar.push_back(var.alt.size() - var.ref.size(), INS);
    } else {
        _cigar.push_back(var.alt.size(), MATCH);
        _cigar.push_back(var.ref.size() - var.alt.size(), DEL);
    }

    if (has_right)
        _cigar.push_back(right_len, MATCH);
}
Esempio n. 7
0
// Writes the sites held by `seqs` to `filename` through a Fasta writer.
void Alignment::_write_fasta(shared_ptr<VectorSiteContainer> seqs, string filename) {
    VectorSiteContainer& sites = *seqs;
    Fasta fasta_out;
    fasta_out.writeAlignment(filename, sites);
}
int write_node
(
	FILE *f,
	parus::Tree *node,
	int parent_node_number,
	int number_output_edge,
	map<int,Edge> &edges_list,
	Fasta &fasta,
	const char *path_to_muscle
)
{
	Edge edge;
	int flag=0;
	int num_input_edges=0;

	int left_edge_number=0;
	int right_edge_number=0;

	char str[100];
	int i;

	FILE *file_node_body=NULL;

	Fasta node_fasta;
	Sequence_record record;


	if(node==NULL) return 0;

	fprintf(f,"  <NODE_BEGIN>\n");
	fprintf(f,"   number %d\n",node->number);
	fprintf(f,"   type 0\n");
	fprintf(f,"   weight 1000\n");
	fprintf(f,"   layer %d\n",node->antilayer);
	
	if(node->left!=NULL)
	{
		num_input_edges++;
	}

	if(node->right!=NULL)
	{
		num_input_edges++;
	}
	fprintf(f,"   num_input_edges %d\n",num_input_edges);
	
	fprintf(f,"   edges ( ");
	if(node->left!=NULL)
	{
		edge_number++;
		left_edge_number=edge_number;
		fprintf(f," %d ",edge_number);
	}
	
	if(node->right!=NULL)
	{
		edge_number++;
		right_edge_number=edge_number;
		fprintf(f," %d ",edge_number);
	}
	fprintf(f,")\n");


	if(number_output_edge==0)
	{
		fprintf(f,"   num_output_edges 0\n");
		fprintf(f,"   edges ( )\n");
	}
	else
	{
		fprintf(f,"   num_output_edges 1\n");
		fprintf(f,"   edges ( %d )\n",number_output_edge);
		
		edge.to=parent_node_number;
		edge.from=node->number;
		edges_list[number_output_edge]=edge;
		
	}
	
	fprintf(f,"   head \"\"\n");
	if((node->left!=NULL)&&(node->right!=NULL))
	{
			fprintf(f,"   body \"process_pair.cpp\"\n");
	}
	if(node->num_names!=0)
	{
			fprintf(f,"   body \"generate_profile.cpp\"\n");
	}
	fprintf(f,"   tail \"\"\n");
	fprintf(f,"  <NODE_END>\n\n");
	
	flag=write_node
	(
		f,
		node->left,
		node->number,
		left_edge_number,
		edges_list,
		fasta,
		path_to_muscle
	);
	
	if(flag) return -1;

	flag=write_node
	(
		f,
		node->right,
		node->number,
		right_edge_number,
		edges_list,
		fasta,
		path_to_muscle
	);

	if(flag) return -1;

	if((node->left!=NULL)&&(node->right!=NULL))
	{
		
		file_node_body=fopen("root_graph_program.cpp","a");
		
		if(file_node_body==NULL)
		{
			printf("Can't open file 'root_graph_program.cpp'\n");
			return -1;
		}

		fprintf
		(
			file_node_body,
			"pairs[%d].set_values( %d, %d );\n",
			node->number-1,
			node->left->number,
			node->right->number
		);

		fclose(file_node_body);

		/*
		fprintf
		(
			file_node_body,
			"system(\"%s -profile -in1 data_node_%d.fasta -in2 data_node_%d.fasta -out data_node_%d.fasta \");\n",
			path_to_muscle,
			node->left->number,
			node->right->number,
			node->number
		);
		*/
		
		/*
		 * This code commented because this problem
		 * solved in make_align.sh script
		 */
		
		/*
		fprintf
		(
			file_node_body,
			"system(\"rm data_node_%d.fasta data_node_%d.fasta\");\n",
			node->left->number,
			node->right->number
		);
		*/
	}
    
	if(node->num_names!=0)
	{
	 /*
		fprintf
		(
			file_node_body,
			"system(\"%s -in fasta_node_%d.fasta -out data_node_%d.fasta \");\n",
			path_to_muscle,
			node->number,
			node->number			
		);
	*/	
		/* 
		 * This code commented because this problem
		 * solved in make_align.sh script
		 */
		
		/*
		fprintf
		(
			file_node_body,
			"system(\"rm fasta_node_%d.fasta \");\n",
			node->number			
		);
		*/
		
		for(i=0;i<node->num_names;i++)
		{
			fasta.get(node->names[i],record);
			node_fasta.add(record);			
		}

		sprintf(str,"fasta_node_%d.fasta",node->number);
		flag=node_fasta.write(str);
		if(flag)
		{
			printf("The node with number %d can't be written to file '%s'\n",node->number,str);
			return -1;
		}
			
	}

	//fclose(file_node_body);
	
	return 0;
	
}
Esempio n. 9
0
/*
 * Command-line entry point.
 *
 * Scans every sequence of the genome file for occurrences of the cut
 * sequence. For each fragment between two consecutive sites (or a sequence
 * boundary) one line "id <TAB> start <TAB> end <TAB> midpoint" is written to
 * the BED output. When the fragment is at least 2*readLen long, the first
 * and last readLen bases are also written to the FASTA output as
 * "<id>_<midpoint>_L" and "<id>_<midpoint>_R".
 *
 * Options: -h/--help, -g/--genome, -c/--cut_seq, -b/--bed_output,
 * -f/--fa_output, -r/--read_len.
 */
int main( int argc, char *argv[])
{
	// Options
	bool     showHelp          = false;
	string   cutSeq   = "AAGCTT";
	string   genomeFile;
	string   bedFile           = "stdout";
	string   faFile            = "ends.fa";
	CHRPOS   readLen           = 20;

	// Show help when has no options
	if(argc <= 1)
	{
		Help();
		return 0;
	}

	// Parsing options
	for(int i = 1; i < argc; i++)
	{
		int parameterLength = (int)strlen(argv[i]);
		// The length passed with "--help" was 5 although the option has 6
		// characters; every other option passes its exact character count.
		if((PARAMETER_CHECK("-h", 2, parameterLength)) || (PARAMETER_CHECK("--help", 6, parameterLength))) 
			showHelp=true;
		else if((PARAMETER_CHECK("-g", 2, parameterLength)) || (PARAMETER_CHECK("--genome", 8, parameterLength)))
		{
			if ((++i) < argc) 
				genomeFile = argv[i];
		}
		else if((PARAMETER_CHECK("-c", 2, parameterLength)) || (PARAMETER_CHECK("--cut_seq", 9, parameterLength)))
		{
			if ((++i) < argc)
				cutSeq = argv[i];
		}
		else if ((PARAMETER_CHECK("-b", 2, parameterLength)) || (PARAMETER_CHECK("--bed_output", 12, parameterLength)))
		{
			if ((++i) < argc)
				bedFile=argv[i];
		}
		else if ((PARAMETER_CHECK("-f", 2, parameterLength)) || (PARAMETER_CHECK("--fa_output", 11, parameterLength)))
		{
			if ((++i) < argc)
				faFile=argv[i];
		}
		else if ((PARAMETER_CHECK("-r", 2, parameterLength)) || (PARAMETER_CHECK("--read_len", 10, parameterLength)))
		{
			if ((++i) < argc)
				readLen = StringUtils::toValue<CHRPOS>(argv[i]);
		}
		else
		{
			cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
			showHelp = true;
		}
	}
	
	// Show help if no proper arguments.
	if (showHelp)
	{
		Help();
		return 0;
	}

	// Statistical variables (collected per sequence id; not written anywhere yet)
	map <string, int, less<string> > bedCount;
	map <string, CHRPOS, less<string> > bedSum;
	map <string, CHRPOS, less<string> > faSize;

	// Variables
	CHRPOS lindex,rindex;
	int siteLen=cutSeq.size();
	bool flag;
	Fasta curFa;
	SeqReader fhfa(genomeFile);
	Writer bedOutput(bedFile);
	Writer faOutput(faFile);

	
	// open files
	fhfa.open();
	bedOutput.open();
	faOutput.open();

	// Read the genome file.
	while (fhfa.getNext(curFa))	
	{
		// Statistics
		bedCount[curFa.id]=0;
		bedSum[curFa.id]=0;
		faSize[curFa.id]=curFa.length();
		
		// Find next recognition site.
		lindex=rindex=0;
		flag=true;
		while (flag)
		{
			rindex=curFa.seq.find(cutSeq,lindex);
			if(rindex==CHRPOS(string::npos))
			{
				// No further site: the last fragment runs to the end of the sequence.
				rindex=curFa.seq.size();
				flag=false;
			}

			(*(bedOutput.Printer())) << curFa.id << "\t" << lindex << "\t" << rindex << "\t" << (lindex+rindex)/2 << endl;
			// Only fragments long enough for two non-overlapping ends are emitted.
			if (rindex - lindex >= 2*readLen)
			{
				bedCount[curFa.id]++;
				bedSum[curFa.id]+=rindex-lindex;
				(*(faOutput.Printer())) << ">" << curFa.id << "_" << (lindex+rindex)/2 << "_L" << endl;
				(*(faOutput.Printer())) << curFa.seq.substr(lindex,readLen) << endl;
				(*(faOutput.Printer())) << ">" << curFa.id << "_" << (lindex+rindex)/2 << "_R" << endl;
				(*(faOutput.Printer())) << curFa.seq.substr(rindex-readLen,readLen) << endl;
			}
			// The next fragment starts right after the recognition site.
			lindex=rindex+siteLen;
		}
	}
	
	// close files
	fhfa.close();
	bedOutput.close();
	faOutput.close();

	// print statistics into log file
	// (the log file is only opened and closed; nothing is written to it)
	Writer log(cutSeq+".log");
	log.open();
	log.close();
	
	return 0;
}
Esempio n. 10
0
/**
 * Evaluates the likelihood of a tree on a protein alignment read from a
 * FASTA file, using an LG08 model whose frequencies are set from the data
 * and a GammaDiscreteDistribution(4, 1, 1) rate distribution.
 *
 * @param tree          tree in parenthesis notation, parsed with
 *                      TreeTemplateTools::parenthesisToTree(tree, false, "ID")
 * @param aln_filename  path handed to the Fasta reader
 * @param optimize_bls  when true, the branch-length and rate-distribution
 *                      parameters are optimized before the value is read
 * @param tolerance     tolerance forwarded to optimizeNumericalParameters
 * @return negated getValue() of the likelihood object
 */
scalar_type tree_LL(string tree,string aln_filename,bool optimize_bls,scalar_type tolerance)
{
	const Alphabet* alphabet = new ProteicAlphabet();
	Fasta Reader;
	//Phylip * Reader=new Phylip(true,true,100,true,"\r");
	OrderedSequenceContainer* alignment = Reader.read(aln_filename, alphabet);
	VectorSiteContainer* sites = new VectorSiteContainer(*alignment);
	SiteContainerTools::changeGapsToUnknownCharacters(*sites);

	TreeTemplate<Node>* ttree1 = TreeTemplateTools::parenthesisToTree(tree,false,"ID");

	//Newick newick1;
	//ttree1 = newick1.read(tree);

	SubstitutionModel* model = new LG08(&AlphabetTools::PROTEIN_ALPHABET, new FullProteinFrequenciesSet(&AlphabetTools::PROTEIN_ALPHABET), true);
	model->setFreqFromData(*sites);

	DiscreteDistribution* rDist = new GammaDiscreteDistribution(4, 1, 1);

	DiscreteRatesAcrossSitesTreeLikelihood* tl1 =
		new RHomogeneousTreeLikelihood(*ttree1, *sites, model, rDist, true, false, false);
	tl1->initialize();

	if (optimize_bls)
	{
		// Only branch lengths and rate-distribution parameters are optimized.
		ParameterList parameters;
		parameters.addParameters(tl1->getBranchLengthsParameters());
		parameters.addParameters(tl1->getRateDistributionParameters());
		OptimizationTools::optimizeNumericalParameters(
			tl1,
			//tl1->getParameters(),
			parameters,
			0,
			1,
			tolerance,
			1000,
			0,
			0,
			false,
			0,
			OptimizationTools::OPTIMIZATION_NEWTON,
			//OptimizationTools::OPTIMIZATION_BRENT);
			OptimizationTools::OPTIMIZATION_BFGS);
	}

	scalar_type LL = -tl1->getValue(); //Here's your log likelihood value !

	// Release everything allocated above. The raw alignment and the parsed
	// tree used to be leaked; they are freed last among their dependants
	// (sites before alignment, likelihood before tree) so the order is safe
	// whether or not those objects keep references to them.
	delete sites;
	delete alignment;
	delete alphabet;
	delete model;
	delete rDist;
	delete tl1;
	delete ttree1;
	return LL;
}
Esempio n. 11
0
/*
 * Paired variant: reads records alternately from `first` and `second` into
 * consecutive slots of `sequences`, holding read_mutex for the whole batch,
 * then reports progress on `first`.
 *
 * A pair is only read while two free slots remain below num_seq. The process
 * exits with status 5 if a record is longer than MAX_READ_LENGTH, and with
 * status 2 if the two ids of a pair differ once their last character is
 * ignored.
 *
 * Returns the number of slots of `sequences` that were filled.
 */
int read_sequences(Auto_Unzip & first, Auto_Unzip & second, int num_seq, Mask sequences[], Fasta::FASTQ_encoding format_type, bool gui_output) {
	if (&first == NULL or &second == NULL)
		return 0;

	Fasta record;
	record.set_FASTQ_type(format_type);
	int stored = 0;
	{
		mutex::scoped_lock lock(read_mutex);
		// mates[0] feeds the even slots, mates[1] the odd ones
		istream * mates[2];
		mates[0] = &first.filtered();
		mates[1] = &second.filtered();
		while (not first.eof() and not second.eof() and (stored + 1) < num_seq) {
			for (int mate = 0; mate < 2; mate++) {
				*mates[mate] >> record;
				if (record.length() > MAX_READ_LENGTH) {
					cerr << "Read " << record.get_id() << " too long. Max allowed read size is " << MAX_READ_LENGTH << endl;
					exit(5);
				}
				Mask & slot = sequences[stored + mate];
				slot.set_id(record.get_id());
				slot.set_sequence(record.get_sequence());
				slot.set_quality(record.get_quality());
			}
			Mask & fwd = sequences[stored];
			Mask & rev = sequences[stored + 1];
			stored += 2;
			//CHECK!!
			const bool same_pair = fwd.id.compare(0, fwd.id.size() - 1, rev.id, 0, rev.id.size() - 1) == 0;
			if (not same_pair) {
				ERROR_CHANNEL << "wrong paired reads IDs: '" << fwd.id << "' and '" << rev.id << '\'' << endl;
				exit(2);
			}
		}
		output_progress(first, gui_output);
	}
	return stored;
}
Esempio n. 12
0
/*
 * Master-side driver:
 *  1. reads every record of every input file, rejecting duplicate ids;
 *  2. optionally sorts the records (options.balancing);
 *  3. writes the header file "<output>_h.dht" (id <TAB> sequence size);
 *  4. distributes the records over nprocs temporary FASTA files, always
 *     appending to the currently shortest one;
 *  5. sends files 2..nprocs to the slaves and processes file 1 itself;
 *  6. writes nprocs to "<output>_n.dht".
 *
 * When a duplicate id is found, every slave is sent an empty job and the
 * function returns without producing output.
 */
void Module_DCREATE::compute_master(const Options & options) {

	string prefix_temp = options.output_file + string("_temp");

	DEFAULT_CHANNEL << '[' << my_rank << "] reading input" << endl;

	// Read all the Fasta files and check for duplicate names
	vector<Fasta *> multi_fasta;
	set<string> names;
	pair<set<string>::iterator,bool> ret;
	bool all_ok = true;
	size_t sum = 0;
	for (vector<string>::const_iterator iter = options.input_files.begin(); iter != options.input_files.end(); iter++) {
		Auto_Unzip input(iter->c_str());
		while (not input.eof()) {
			Fasta * temp = new Fasta();
			input.filtered() >> *temp;
			sum += temp->length();
			multi_fasta.push_back(temp);
			ret = names.insert(temp->get_id());
			if (ret.second == false) {
				ERROR_CHANNEL << "Error: name \"" << temp->get_id() << "\" already exists!" << endl;
				all_ok = false;
			}
		}
	}
	if (not all_ok) {
		// Tell every slave there is nothing to do
		for (int node = 1; node < nprocs; node++)
			send_sequences_to_slave(node, 0, 0, string(), string());
		// Free the records read so far (they used to be leaked on this path)
		for (vector<Fasta *>::iterator iter = multi_fasta.begin(); iter != multi_fasta.end(); iter++)
			delete *iter;
		return;
	}

	DEFAULT_CHANNEL << '[' << my_rank << "] sorting" << endl;
	// sort by length
	if (options.balancing)
		sort(multi_fasta.begin(), multi_fasta.end(), sort_reverse_function);

	DEFAULT_CHANNEL << '[' << my_rank << "] preparing header" << endl;

	// prepare file for header
	stringstream header_name;
	header_name << options.output_file << "_h.dht";
	ofstream o(header_name.str().c_str());
	for (vector<Fasta *>::iterator iter = multi_fasta.begin(); iter != multi_fasta.end(); iter++)
		o << (*iter)->get_id() << '\t' << (*iter)->get_sequence().size() << endl;
	o.close();

	DEFAULT_CHANNEL << '[' << my_rank << "] preparing temporary files" << endl;

	// prepare sets and create temp files
	// NOTE(review): the two arrays below are variable-length arrays, which is
	// a compiler extension rather than standard C++.
	size_t bins = nprocs;
	size_t bin_length[bins];
	ofstream outputs[bins];
	for (size_t i = 0; i < bins; i++) {
		bin_length[i] = 0;
		stringstream filename;
		filename << prefix_temp << '_' << (i+1) << ".fasta";
		temp_files.push_back(filename.str());
		outputs[i].open(filename.str().c_str());
	}

	DEFAULT_CHANNEL << '[' << my_rank << "] writing to files" << endl;

	// write to files: each record goes to the bin with the smallest total so far
	for (vector<Fasta *>::iterator iter = multi_fasta.begin(); iter != multi_fasta.end(); iter++) {
		size_t min_pos = 0;
		size_t t_min = sum;
		for (size_t i = 0; i < bins; i++)
			if (bin_length[i] < t_min) {
				min_pos = i;
				t_min = bin_length[min_pos];
			}
		bin_length[min_pos] += (*iter)->length();
		outputs[min_pos] << **iter;
	}

	for (size_t i = 0; i < bins; i++) {
		outputs[i].close();
	}

	DEFAULT_CHANNEL << '[' << my_rank << "] sending to slaves" << endl;

	// send to slaves (bin 0 is kept for the master itself)
	for (size_t i = 1; i < bins; i++) {
		stringstream filename_input;
		stringstream filename_output;
		filename_input << prefix_temp << '_' << (i+1) << ".fasta";
		filename_output << options.output_file << '_' << (i+1) << ".eht";
		send_sequences_to_slave(i, options.k, options.blockLength, filename_input.str(), filename_output.str());
	}

	DEFAULT_CHANNEL << '[' << my_rank << "] clearing memory" << endl;
	// clear memory
	for (vector<Fasta *>::iterator iter = multi_fasta.begin(); iter != multi_fasta.end(); iter++)
		delete *iter;

	DEFAULT_CHANNEL << '[' << my_rank << "] computing" << endl;

	// compute by master process
	stringstream filename_input;
	stringstream filename_output;
	filename_input << prefix_temp << "_1.fasta";
	filename_output << options.output_file << "_1.eht";
	compute_hash(options.k,options.blockLength,filename_input.str().c_str(), filename_output.str().c_str(),false); //TODO handle methyl_hash

	DEFAULT_CHANNEL << '[' << my_rank << "] finishing" << endl;

	stringstream filename_numberfile;
	filename_numberfile << options.output_file << "_n.dht";
	ofstream nf(filename_numberfile.str().c_str());
	if (!nf) {
		ERROR_CHANNEL << "I cannot open file " << filename_numberfile.str() << " for writing!" << endl;
		exit(6);
	}
	nf << nprocs << endl;

}
void GenericIndividualSnpCall::PyroHMMsnp(Fasta &fastaObj, BamReader &bamObj, int chrID, int leftPosition, int rightPosition, GenericProbabilisticAlignment &probAligner, list<Allele>& allelesInBlock, VariantCallSetting& snpCallSettings, vector<GenericVariant> &variantResults)
{
    VariantCallSetting settingForPyroHMMsnp = snpCallSettings;

    // allele pool
    vector<Allele> allelePool;
    for (list<Allele>::iterator allelesInBlockIter=allelesInBlock.begin(); allelesInBlockIter!=allelesInBlock.end(); allelesInBlockIter++)
    {
        allelePool.push_back(*allelesInBlockIter);
    }

    // add 10bp flanking segment at each side
    int windowLeftPosition  = leftPosition  - snpCallSettings.m_flankingSize;
    int windowRightPosition = rightPosition + snpCallSettings.m_flankingSize;

    // genome
    string genome;
    fastaObj.GetSequence(chrID, windowLeftPosition, windowRightPosition, genome);

    int    globalDepth;
    double globalMapQual;
    int    globalStrandPos;
    int    globalStrandNeg;

    vector<PyroHMMsnp_Sequence_t> readsInWindow;

    // rewind BAM reader
    bamObj.Rewind();
    // set BAM region
    bamObj.SetRegion(chrID, windowLeftPosition, chrID, windowRightPosition);
    // read alignment
    BamAlignment al;
    while (bamObj.GetNextAlignment(al))
    {
        // skip if it is not a good alignment
        if (!GenericBamAlignmentTools::goodAlignment(al))
        {
            continue;
        }

        // skip if it is not valid at length
        if (!GenericBamAlignmentTools::validReadLength(al, m_minReadLength))
        {
            continue;
        }

        // skip if it is not valid at map quality
        if (!GenericBamAlignmentTools::validMapQuality(al, m_minMapQuality))
        {
            continue;
        }

        // skip if it is not valid at alignment identity
        if (!GenericBamAlignmentTools::validReadIdentity(al, m_maxMismatchFrac))
        {
            continue;
        }

        // global info
        globalDepth   += 1;
        globalMapQual += al.MapQuality*al.MapQuality;
        if (al.IsReverseStrand())
            globalStrandNeg += 1;
        else
            globalStrandPos += 1;

        // get local alignment
        string t_localRead, t_localGenome;
        Cigar  t_cigar;
        BamMD  t_md;
        int    t_numMismatch, t_numInDel;
        GenericBamAlignmentTools::getLocalAlignment(al, windowLeftPosition, windowRightPosition-windowLeftPosition,
                                                    t_localRead, t_localGenome, t_cigar, t_md,
                                                    t_numMismatch, t_numInDel);

        if (t_localRead.empty() || t_localGenome.empty())
            continue;


        // save into set
        PyroHMMsnp_Sequence_t t_seq;
        t_seq.t_ID           = GenericBamAlignmentTools::getBamAlignmentID(al);
        t_seq.t_sequence     = t_localRead;
        t_seq.t_cigar        = t_cigar;
        t_seq.t_md           = t_md;
        t_seq.t_numMismatch  = t_numMismatch;
        t_seq.t_numInDel     = t_numInDel;
        t_seq.t_mapQualScore = al.MapQuality;


        if (al.Position>windowLeftPosition)
            t_seq.t_startPositionShift = al.Position-windowLeftPosition;
        else
            t_seq.t_startPositionShift = 0;

        if (al.GetEndPosition()<windowRightPosition)
            t_seq.t_endPositionShift = windowRightPosition-al.GetEndPosition();
        else
            t_seq.t_endPositionShift = 0;

        readsInWindow.push_back(t_seq);
    }

    int numData = readsInWindow.size();

    // construct the consensus sequence graph
    GenericDagGraph consensusGraph;
    vector<string>  consensusGraphReads;
    vector<Cigar>   consensusGraphReadCigars;
    vector<int>     consensusGraphReadStarts;

    // set of aligned reads to construct the graph
    for (int i=0; i<numData; ++i)
    {
        consensusGraphReads.push_back(readsInWindow[i].t_sequence);
        consensusGraphReadCigars.push_back(readsInWindow[i].t_cigar);
        consensusGraphReadStarts.push_back(readsInWindow[i].t_startPositionShift);
    }

    // build up the graph
    consensusGraph.buildDagGraph(genome, consensusGraphReads, consensusGraphReadCigars, consensusGraphReadStarts);
    consensusGraph.edgePruning(snpCallSettings.m_graphPruneLevel);

    // search topK paths, excluding reference
    vector<string>       topRankConsensusGraphPaths;
    vector<list<Vertex>> topRankConsensusGraphPathVertexs;
    vector<double>       topRankConsensusGraphPathWeights;
    consensusGraph.topRankPathsExcludeGenome(30, topRankConsensusGraphPaths, topRankConsensusGraphPathVertexs, topRankConsensusGraphPathWeights);

    // change vertex list to vertex set
    vector<set<Vertex>>  topRankConsensusGraphPathVertexSet;
    for (int i=0; i<topRankConsensusGraphPathVertexs.size(); i++)
    {
        list<Vertex>::iterator vertexIter = topRankConsensusGraphPathVertexs[i].begin();
        set<Vertex> vertexSet;
        for (; vertexIter!=topRankConsensusGraphPathVertexs[i].end(); vertexIter++)
        {
            vertexSet.insert(*vertexIter);
        }
        topRankConsensusGraphPathVertexSet.push_back(vertexSet);
    }

    // get variant vertices
    vector<int>    allelePositions;
    vector<string> alleleChars;
    for (list<Allele>::iterator alleleIter=allelesInBlock.begin(); alleleIter!=allelesInBlock.end(); alleleIter++)
    {
        Allele allele = *alleleIter;
        allelePositions.push_back(allele.m_chrPosition-windowLeftPosition);
        alleleChars.push_back(allele.m_allele);
    }
    // map allele to graph vertex
    set<Vertex> variantVertexs;
    map<int,Vertex> mapAlleleToVertex;
    map<Vertex,int> mapVertexToAllele;
    for (int v=0; v<consensusGraph.m_numVertexs; v++)
    {
        if (consensusGraph.m_skip[v])
            continue;

        if (!consensusGraph.m_isMismatch[v])
            continue;

        int gp = consensusGraph.m_genomePosition[v] - 1;


        for (int j=0; j<allelePool.size(); j++)
        {
            int ap = allelePositions[j];
            if (ap==gp)
            {
                if (alleleChars[j]==consensusGraph.m_labels[v])
                {
                    variantVertexs.insert(v);
                    mapAlleleToVertex[j] = v;
                    mapVertexToAllele[v] = j;
                }
            }
        }
    }


    // set up the haplotypes
    vector<string> haplotypes;
    vector<int>    haplotypeToPathIndex;
    vector<set<Vertex>> haplotypeVariantVertexs;

    haplotypes.push_back(genome);
    haplotypeToPathIndex.push_back(-1);
    haplotypeVariantVertexs.push_back(set<Vertex>());

    int kk = 0;
    for (int i=0; i<topRankConsensusGraphPaths.size(); i++)
    {
        if (kk>=snpCallSettings.m_topK)
            continue;

        bool hasVariantVertex = false;
        int  deltaLength = (topRankConsensusGraphPaths[i].length()-genome.length());
        deltaLength = abs(deltaLength);

        if (deltaLength>5)
            continue;

        set<Vertex> pathVertexs = topRankConsensusGraphPathVertexSet[i];
        set<Vertex> pathVariantVertexs;
        for (set<Vertex>::iterator variantIter=variantVertexs.begin(); variantIter!=variantVertexs.end(); variantIter++)
        {
            if (pathVertexs.find(*variantIter)!=pathVertexs.end())
            {
                hasVariantVertex = true;
                pathVariantVertexs.insert(*variantIter);
            }
        }

        int totalNumberVariantVertexInPath = 0;
        for (set<Vertex>::iterator vertexIter=pathVertexs.begin(); vertexIter!=pathVertexs.end(); vertexIter++)
        {
            int v = *vertexIter;
            if (consensusGraph.m_isMismatch[v])
            {
                totalNumberVariantVertexInPath += 1;
            }
        }

        if (hasVariantVertex && totalNumberVariantVertexInPath<=pathVariantVertexs.size())
        {
            haplotypes.push_back(topRankConsensusGraphPaths[i]);
            haplotypeToPathIndex.push_back(i);
            haplotypeVariantVertexs.push_back(pathVariantVertexs);

            kk++;
        }
    }

    int numHaplotypes = haplotypes.size();

    // skip if there is no variant haplotype
    if (numHaplotypes==1)
    {
        return;
    }

    // compute haplotype data likelihood
    vector<vector<long double>> haplotypeDataLikelihoods(numHaplotypes);
    PyroHMMsnpHaplotypeDataLikelihood(probAligner, snpCallSettings.m_band, numHaplotypes, haplotypes, readsInWindow, haplotypeDataLikelihoods);


    // genotype
    vector<vector<int>> genotypes;
    set<set<int>> genotypeDiscovered;
    for (int i=0; i<numHaplotypes; i++)
    {
        vector<int> precedeHaplotypes;
        PyroHMMsnpGenotypeSet(snpCallSettings.m_ploidy, i, numHaplotypes, precedeHaplotypes, genotypes, genotypeDiscovered);
    }

    int numGenotypes = genotypes.size();

    // genotype variant vertex
    vector<set<Vertex>> genotypeVariantVertexs;
    for (int i=0; i<numGenotypes; i++)
    {
        set<Vertex> variantVertexInGenotype;
        for (int j=0; j<settingForPyroHMMsnp.m_ploidy; j++)
        {
            int haplotype = genotypes[i][j];
            set<Vertex> variantVertexInHaplotype = haplotypeVariantVertexs[haplotype];
            variantVertexInGenotype.insert(variantVertexInHaplotype.begin(), variantVertexInHaplotype.end());
        }
        genotypeVariantVertexs.push_back(variantVertexInGenotype);
    }

    // genotype priors
    vector<long double> genotypePriors(numGenotypes);
    PyroHMMsnpGenotypePrior(numGenotypes, genotypes, settingForPyroHMMsnp, genotypePriors);

    // genotype likelihoods
    vector<long double> genotypeLikelihoods(numGenotypes);
    PyroHMMsnpGenotypeLikelihood(numGenotypes, genotypes, readsInWindow.size(), haplotypeDataLikelihoods, snpCallSettings, genotypeLikelihoods);

    // genotype posteriors
    vector<long double> genotypePosteriors(numGenotypes);
    PyroHMMsnpGenotypePosterior(numGenotypes, genotypePriors, genotypeLikelihoods, genotypePosteriors);

    // search maximal genotype posterior
    long double maxGenotypePosterior = 0;
    int inferGenotype;
    for (int i=1; i<numGenotypes; i++)
    {
        if (maxGenotypePosterior<genotypePosteriors[i])
        {
            maxGenotypePosterior = genotypePosteriors[i];
            inferGenotype = i;
        }
    }

    // all variant vertexs in the inferred genotype
    set<Vertex> inferGenotypeVariantVertexs = genotypeVariantVertexs[inferGenotype];

    // count haploid type of variant
    map<Vertex,vector<int>> inferGenotypeVariantHaploidType;
    set<Vertex>::iterator inferVariantIter = inferGenotypeVariantVertexs.begin();
    for (; inferVariantIter!=inferGenotypeVariantVertexs.end(); inferVariantIter++)
    {
        int v = *inferVariantIter;
        vector<int> variantHaploidType;
        for (int j=0; j<settingForPyroHMMsnp.m_ploidy; j++)
        {
            int haplotype = genotypes[inferGenotype][j];
            set<Vertex> variantVertexInHaplotype = haplotypeVariantVertexs[haplotype];
            if (variantVertexInHaplotype.find(v)==variantVertexInHaplotype.end())
            {
                variantHaploidType.push_back(0);
            }else
            {
                variantHaploidType.push_back(1);
            }
        }
        inferGenotypeVariantHaploidType[v] = variantHaploidType;
    }
    // variant score
    map<Vertex,long double> inferGenotypeVariantScore;
    inferVariantIter = inferGenotypeVariantVertexs.begin();
    for (; inferVariantIter!=inferGenotypeVariantVertexs.end(); inferVariantIter++)
    {
        int v = *inferVariantIter;
        long double variantScore = 0;
        for (int i=0; i<numGenotypes; i++)
        {
            set<Vertex> variantVertexInGenotype = genotypeVariantVertexs[i];
            if (variantVertexInGenotype.find(v)!=variantVertexInGenotype.end())
                variantScore += genotypePosteriors[i];
        }

        inferGenotypeVariantScore[v] = variantScore;
    }

    // save variant result
    inferVariantIter = inferGenotypeVariantVertexs.begin();
    for (; inferVariantIter!=inferGenotypeVariantVertexs.end(); inferVariantIter++)
    {
        GenericVariant result;

        int v = *inferVariantIter;
        int a = mapVertexToAllele[v];

        int variantChrID;
        int variantChrPos;

        vector<int> haploidType = inferGenotypeVariantHaploidType[v];
        for (int j=0; j<settingForPyroHMMsnp.m_ploidy; j++)
        {
            if (haploidType[j]==0)
            {
                int g = consensusGraph.m_genomePosition[v];

                Allele allele;
                allele.m_allele = consensusGraph.m_labels[g];
                result.m_alleles.push_back(allele);
            }else
            {
                Allele allele = allelePool[a];
                result.m_alleles.push_back(allele);

                variantChrID  = allele.m_chrID;
                variantChrPos = allele.m_chrPosition;
            }
        }

        result.m_chrID           = variantChrID;
        result.m_chrPosition     = variantChrPos;
        result.m_probScoreRef    = genotypePosteriors[0];
        result.m_probScoreVar    = genotypePosteriors[inferGenotype];
        result.m_variantType     = VARIANT_SNP;
        long double variantScore = inferGenotypeVariantScore[v];
        if (fabs(1-variantScore)<1e-300)
            result.m_quality     = 3000;
        else if (variantScore<1e-300)
            result.m_quality     = 0;
        else
            result.m_quality     = -10*log10(1-variantScore);

        char refBase;
        fastaObj.GetBase(result.m_chrID, result.m_chrPosition, refBase);
        result.m_reference       = refBase;

        for (int i=0; i<result.m_alleles.size(); i++)
        {
            if (result.m_alleles[i].m_allele==result.m_reference)
                result.m_haploidType.push_back(0);
            else
                result.m_haploidType.push_back(1);
        }


        // filter
        if (result.m_quality>=snpCallSettings.m_variantQualityFilter)
            variantResults.push_back(result);

    }

}
/**
 * Call SNPs over the region of interest, one genome block at a time.
 *
 * For every block produced by setupGenomeBlock() the reference sequence is
 * fetched, candidate SNP alleles are profiled with simpleSnpCall(), and the
 * candidates are merged into loci by mergeSnpSitesToBlocks().  Each locus is
 * then dispatched on its width (right - left):
 *   - width 1: a single site, handled by simpleBayesianSnpCall();
 *   - width 2: two adjacent sites, each handled separately by
 *              simpleBayesianSnpCall() with only the alleles at that position;
 *   - otherwise: handled as a multi-site locus by PyroHMMsnp().
 *
 * @param fastaObj        reference genome accessor
 * @param bamObj          alignment reader; also supplies the reference list
 * @param roi             region of interest used to build the genome blocks
 * @param probAligner     probabilistic aligner forwarded to PyroHMMsnp()
 * @param snpCallSettings calling settings forwarded to the callers
 * @param variantSet      output; called variants are appended by the callers
 * @return the number of entries in variantSet when the call finishes
 *         (this includes any entries that were already present on entry)
 */
int GenericIndividualSnpCall::call(Fasta &fastaObj, BamReader &bamObj, BamRegion &roi, GenericProbabilisticAlignment &probAligner, VariantCallSetting& snpCallSettings, vector<GenericVariant> &variantSet)
{
    RefVector chromosomes = bamObj.GetReferenceData();

    // set up genome blocks
    vector<int> BlockChrID, BlockLeftPos, BlockRightPos;
    int BlockNumber=setupGenomeBlock(chromosomes, roi, BlockChrID, BlockLeftPos, BlockRightPos);

    // size of variantSet after the previous block; only used for the per-block report
    size_t numSNP = 0;

    // iterate through blocks
    for (int i=0; i<BlockNumber; ++i)
    {
        if (m_verbosity>=1)
        {
            cout << "processing " << chromosomes[BlockChrID[i]].RefName << ":" << BlockLeftPos[i]+1 << "-" << BlockRightPos[i] << endl;
        }

        clock_t startTime = clock();

        // reference sequence of this block
        string BlockGenome;
        fastaObj.GetSequence(BlockChrID[i], BlockLeftPos[i], BlockRightPos[i], BlockGenome);

        // profile SNP sites by the simple method
        map<int,list<tuple<char,int,int,double>>> BlockBamData;
        AlleleSet BlockSnpAlleleCandidates;
        simpleSnpCall(BlockGenome, bamObj, BlockChrID[i], BlockLeftPos[i], BlockRightPos[i], BlockSnpAlleleCandidates, BlockBamData);

        // merge SNP sites to SNP loci
        vector<tuple<int,int,list<Allele>>> BlockSnpLoci;
        mergeSnpSitesToBlocks(BlockSnpAlleleCandidates, BlockSnpLoci);

        // iterate through SNP loci
        for (size_t j=0; j<BlockSnpLoci.size(); j++)
        {
            int BlockSnpLeftPos  = get<0>(BlockSnpLoci[j]);
            int BlockSnpRightPos = get<1>(BlockSnpLoci[j]);
            // refer to the locus' allele list in place instead of copying it
            list<Allele> &locusAlleles = get<2>(BlockSnpLoci[j]);

            if (BlockSnpRightPos==BlockSnpLeftPos+1)
            {
                // it is a single SNP site
                simpleBayesianSnpCall(fastaObj, bamObj, BlockChrID[i], BlockSnpLeftPos, BlockSnpRightPos, locusAlleles, BlockBamData[BlockSnpLeftPos], snpCallSettings, variantSet);
            }
            else if (BlockSnpRightPos==BlockSnpLeftPos+2)
            {
                // two adjacent sites: call each position with its own alleles only
                for (int pos=BlockSnpLeftPos; pos<BlockSnpRightPos; pos++)
                {
                    list<Allele> tAlleles;
                    for (const Allele &allele : locusAlleles)
                    {
                        if (allele.m_chrPosition==pos)
                            tAlleles.push_back(allele);
                    }

                    if (!tAlleles.empty())
                        simpleBayesianSnpCall(fastaObj, bamObj, BlockChrID[i], pos, pos+1, tAlleles, BlockBamData[pos], snpCallSettings, variantSet);
                }
            }
            else
            {
                // it is a MNP site
                PyroHMMsnp(fastaObj, bamObj, BlockChrID[i], BlockSnpLeftPos, BlockSnpRightPos, probAligner, locusAlleles, snpCallSettings, variantSet);
            }
        }

        clock_t endTime = clock();
        if (m_verbosity>=1)
        {
            cout << "time elapsed " << ((endTime-startTime)/(double)CLOCKS_PER_SEC/60.) << " minutes";
            cout << ", ";
            cout << "call " << variantSet.size()-numSNP << " SNPs" << endl;
        }

        numSNP = variantSet.size();
    }

    return static_cast<int>(variantSet.size());
}
Esempio n. 15
0
void buildBWT2 (const std::string& fileName, const std::string& prefixName) {
	/* read input fasta file */
	std::ifstream in {fileName};
	/* string to store the sense + reverse complementary of the genome seq */
	std::string seq, seqRC {};
	/* running accumulator recording the length of each chr */
	INTTYPE tempLen {0}, accumulatedLength {0};
	/* for concatenated seq */
	std::map <INTTYPE, INTTYPE> NPosLen { };

	/* file to store which regions has which chr*/
	std::ofstream chrStartPos {prefixName + "chrStart"};
	/* file to store the length of each chr */
	std::ofstream chrLen {prefixName + "chrLen"};
	/* read in each fasta and make two string */
	while (in.good ()) {
		Fasta<std::vector> fa {in};
		/* store start position of each chr */
		chrStartPos << fa.getName () << '\t' << accumulatedLength << '\n';
		/* get chr length */
		tempLen = fa.getLengthNoN ();
		/* store chr length */
		chrLen << fa.getName () << '\t' << tempLen << '\n';
		/* update accumulated length */
		accumulatedLength += tempLen;
		/* update NPosLen */
		fa.updateNpos (NPosLen);
		seq += fa.getSeqNoN ();
	}
	chrStartPos.close ();
	chrLen.close ();
	/* resize to enough space for the reverse complemetary sequence and a $ sign */
	seq.resize (seq.size () * 2 + 1); // TODO: resize does mallocating the extra space and also initialization, the later is not necessary
	auto iter = seq.begin ();
	std::advance (iter, (seq.size ()-1)/2);
	auto iter2 = iter;
	--iter2;
	do {
		switch (*iter2) {
		case 'A':
			*iter = 'T'; break;
		case 'T':
			*iter = 'A'; break;
		case 'G':
			*iter = 'C'; break;
		case 'C':
			*iter = 'G'; break;
		}
		++iter;
	} while (iter2-- != seq.begin ());
	*iter = '$';
	/* writing NPosLen to file */
	{
		boost::iostreams::filtering_ostream fos;
		fos.push (boost::iostreams::zlib_compressor());
		fos.push (boost::iostreams::file_sink (prefixName + "NposLen.z"));
		boost::archive::binary_oarchive oa (fos);
		oa << NPosLen;
	}
	{
		ABSequence<std::string> x ( seq );
		ABWT<ABSequence<std::string>> y (x, 512, 64, prefixName);
	}
}
Esempio n. 16
0
/**
 * Align `read` against every sequence of `rep` except index
 * `forbidden_rep_id`, and store the best-scoring alignment in `box`.
 *
 * Fields written in `box`: ref_nb, ref_label, marked_pos, ref, del_right,
 * del_left, start, score (the (score, index) pairs of every reference tried,
 * sorted with comp_pair) and end.
 *
 * @param read             sequence to align
 * @param rep              collection of reference sequences
 * @param forbidden_rep_id index in `rep` that must be skipped
 * @param reverse_ref      passed to revcomp() on the read; also mirrors the
 *                         reported end position on the read
 * @param reverse_both     forwarded twice to DynProg; selects how del_right
 *                         is derived from the best end position
 * @param local            use DynProg::Local instead of
 *                         DynProg::LocalEndWithSomeDeletions
 * @param box              output alignment box
 * @param segment_cost     cost model handed to DynProg
 */
void align_against_collection(string &read, Fasta &rep, int forbidden_rep_id,
                              bool reverse_ref, bool reverse_both, bool local,
                             AlignBox *box, Cost segment_cost)
{
  const int unset = (int) string::npos ;

  // Best alignment seen so far
  int top_score = MINUS_INF ;
  int top_end_i = unset ;
  int top_end_j = unset ;
  int top_begin_i = unset ;
  int top_begin_j = unset ;
  box->ref_nb = MINUS_INF ;

  vector<pair<int, int> > scores_by_ref;

  DynProg::DynProgMode dpMode = local ? DynProg::Local
                                      : DynProg::LocalEndWithSomeDeletions;
  bool bottom_triangle_only = !local ;

  // With reverse_ref, the read is reversed to prevent calling revcomp on each reference sequence
  string query = revcomp(read, reverse_ref);

  for (int r = 0 ; r < rep.size() ; r++)
    {
      if (r == forbidden_rep_id)
        continue;

      DynProg dp(query, rep.sequence(r),
                 dpMode, // DynProg::SemiGlobalTrans,
                 segment_cost, // DNA
                 reverse_both, reverse_both,
                 rep.read(r).marked_pos);

      int score = dp.compute(bottom_triangle_only, BOTTOM_TRIANGLE_SHIFT);

      // In local mode every alignment is backtracked, before the dp fields are read
      if (local)
        dp.backtrack();

      if (score > top_score)
        {
          top_score = score ;
          top_end_i = dp.best_i ;
          top_end_j = dp.best_j ;
          top_begin_i = dp.first_i ;
          top_begin_j = dp.first_j ;
          box->ref_nb = r ;
          box->ref_label = rep.label(r) ;

          // Otherwise only a new best alignment is backtracked, after the
          // positions above have been read and before marked_pos_i is taken
          if (!local)
            dp.backtrack();
          box->marked_pos = dp.marked_pos_i ;
        }

      scores_by_ref.push_back(pair<int, int>(score, r));

      // #define DEBUG_SEGMENT

#ifdef DEBUG_SEGMENT
      cout << rep.label(r) << " " << score << " " << dp.best_i << endl ;
#endif
    }

  sort(scores_by_ref.begin(), scores_by_ref.end(), comp_pair);

  // NOTE(review): if no reference was selected above, box->ref_nb is still
  // MINUS_INF when used as an index here - confirm callers never allow that.
  box->ref = rep.sequence(box->ref_nb);
  box->del_right = reverse_both ? top_end_j : box->ref.size() - top_end_j - 1;
  box->del_left = top_begin_j;
  box->start = top_begin_i;

  box->score = scores_by_ref;

#ifdef DEBUG_SEGMENT
  cout << "best: " << box->ref_label << " " << top_score ;
  cout << "del/del2/begin:" << (box->del_right) << "/" << (box->del_left) << "/" << (box->start) << endl;
  cout << endl;
#endif

  // The read was reversed for the alignment: map the end position back.
  // Why -1 here and +1 in dynprog.cpp /// best_i = m - best_i + 1 ;
  if (reverse_ref)
    top_end_i = read.length() - top_end_i - 1 ;

  box->end = top_end_i ;
}