Ejemplo n.º 1
0
void GLaguer::GetPhylipLaguer(const int categs, MDOUBLE alpha, Vdouble & points, Vdouble & weights)
{
  /* calculate rates and probabilities to approximate Gamma distribution
     of rates with "categs" categories and shape parameter "alpha" using
     rates and weights from Generalized Laguerre quadrature */

	points.resize(categs, 0.0);
	weights.resize(categs, 0.0);
	long i;
	raterootarray lgroot; /* roots of GLaguerre polynomials */
	double f, x, xi, y;

	alpha = alpha - 1.0;
	lgroot[1][1] = 1.0+alpha;
	for (i = 2; i <= categs; i++)
	{
		cerr<<lgroot[i][1]<<"\t";
		lgr(i, alpha, lgroot);                   /* get roots for L^(a)_n */
		cerr<<lgroot[i][1]<<endl;
	}
	/* here get weights */
	/* Gamma weights are (1+a)(1+a/2) ... (1+a/n)*x_i/((n+1)^2 [L_{n+1}^a(x_i)]^2)  */
	f = 1;
	for (i = 1; i <= categs; i++)
		f *= (1.0+alpha/i);
	for (i = 1; i <= categs; i++) {
		xi = lgroot[categs][i];
		y = glaguerre(categs+1, alpha, xi);
		x = f*xi/((categs+1)*(categs+1)*y*y);
		points[i-1] = xi/(1.0+alpha);
		weights[i-1] = x;
	}
}
void AbstractMixedSubstitutionModel::setVRates(const Vdouble& vd)
{
  if (vd.size()!=modelsContainer_.size())
    throw Exception("AbstractMixedSubstitutionModel::setVRates  bad size of Vdouble argument.");

  for (unsigned int i=0;i<vd.size();i++)
    vRates_[i]=vd[i];
 
  normalizeVRates();
}
inline Vdouble SubstitutionProcessCollectionMember::getClassProbabilities() const
{
  Vdouble vProb;

  for (size_t i = 0; i < getRateDistribution()->getNumberOfCategories(); i++)
  {
    vProb.push_back(getRateDistribution()->getProbability(i));
  }

  return vProb;
}
generalGammaDistributionFixedCategories::generalGammaDistributionFixedCategories(const Vdouble& fixedRates, const Vdouble& boundaries, MDOUBLE alpha, MDOUBLE beta) :
generalGammaDistribution()
{
	if ((fixedRates.size() + 1) !=  boundaries.size())
		errorMsg::reportError("error in generalGammaDistributionFixedCategories constructor");
	_alpha = alpha;
	_beta = beta;
	_rates = fixedRates;
	_bonderi = boundaries;
	computeRatesProbs();
}
Ejemplo n.º 5
0
Vdouble TreeTemplateTools::getBranchLengths(const Node& node) throw (NodePException)
{
  Vdouble brLen(1);
  brLen[0] = node.getDistanceToFather();
  for (size_t i = 0; i < node.getNumberOfSons(); i++)
  {
    Vdouble sonBrLen = getBranchLengths(*node.getSon(i));
    for (size_t j = 0; j < sonBrLen.size(); j++)
    {
      brLen.push_back(sonBrLen[j]);
    }
  }
  return brLen;
}
Ejemplo n.º 6
0
void YNGKP_M8::updateMatrices()
{
  AbstractBiblioSubstitutionModel::updateMatrices();

  // homogeneization of the synonymous substittion rates

  Vdouble vd;

  for (unsigned int i = 0; i < pmixmodel_->getNumberOfModels(); i++)
  {
    vd.push_back(1 / pmixmodel_->getNModel(i)->Qij(synfrom_, synto_));
  }

  pmixmodel_->setVRates(vd);
}
Ejemplo n.º 7
0
MDOUBLE siteSpecificRateGL::computeML_siteSpecificRate(Vdouble & ratesV,
						Vdouble & likelihoodsV,
						const Vint& spAttributesVec,
						const Vint& treeAttributesVec,
						const vector<tree> & etVec,
						const vector<const stochasticProcess *> & spVec,
						const sequenceContainer& sc,
						const MDOUBLE maxRate,
						const MDOUBLE tol){
	MDOUBLE Lsum = 0.0;
	ratesV.resize(sc.seqLen()); // the rates themselves
	likelihoodsV.resize(sc.seqLen()); // the log likelihood of each position
	
	for (int pos=0; pos < sc.seqLen(); ++pos) {
		LOG(5,<<".");
		MDOUBLE bestR=-1.0; // tree1
		//		MDOUBLE LmaxR1=0;
		
		// getting the right tree for the specific position:
		const tree*  treeForThisPosition=NULL;
		if ((etVec.size() >0 ) && (treeAttributesVec[pos]>0)) {
			treeForThisPosition = & etVec[ treeAttributesVec[pos] -1];
		} else {
			errorMsg::reportError("tree vector is empty, or treeAttribute is empty, or treeAttribute[pos] is zero (it should be one)");
		}

		// getting the right stochastic process for the specific position:

		const stochasticProcess* spForThisPosition=NULL;

		if ((spVec.size() >0 ) && (spAttributesVec[pos]>0)) {
			spForThisPosition = spVec[ spAttributesVec[pos] -1];
		} else {
			errorMsg::reportError("stochastic process vector is empty, or spAttributesVec is empty, or spAttribute[pos] is zero (it should be one)");
		}

		siteSpecificRateGL::computeML_siteSpecificRate(pos,sc,*spForThisPosition,*treeForThisPosition,bestR,likelihoodsV[pos],maxRate,tol);
		ratesV[pos] = bestR;
		assert(likelihoodsV[pos]>0.0);
		Lsum += log(likelihoodsV[pos]);
		LOG(5,<<" rate of pos: "<<pos<<" = "<<ratesV[pos]<<endl);
	}
	LOG(5,<<" number of sites: "<<sc.seqLen()<<endl);
	return Lsum;
}
Ejemplo n.º 8
0
///////////////////////////////////////////////////////////////////////////////////////////////////////////
//findBestParamManyStarts: Finds the best gammaMixture from many starting points.
//The function starts form few starting points. 
//For each point it tries to optimize the likellihood doing only a small number of iterations.
//It then picks the best points (highest likelihood) and continue the maximization for these points only.
//The best gammaMixture is stored in _sp and the best likelihood is returned.
//input Parameters:
//startPointsNum = the number of starting points.
//bestStartsNum  = the number of best points to continue with the full optimization.
//startIter      = the number of iterations to perform with all starting points.
//maxIterations  = the maximum number of iterations to continue with the best points
//epsilon        = for determining convergence in the maximization process. 
MDOUBLE optGammaMixtureEM::findBestParamManyStarts(const int startPointsNum, const int bestStartsNum, const int startIter, const int maxIterations, const MDOUBLE epsilon, const MDOUBLE epsilomQopt, ofstream* pOutF)
{
	vector<mixtureDistribution> distVec;
	Vdouble likelihoodVec(startPointsNum);
	mixtureDistribution * pMixture = static_cast<mixtureDistribution*>(_pSp->distr());
	//create starting distributions
	int i;
	for (i = 0; i < startPointsNum; ++i)
	{
		//the first distribution will be the current one
		if (i == 0)
			distVec.push_back(*pMixture); 
		else
			distVec.push_back(mixtureDistribution(pMixture->getComponentsNum(), pMixture->categoriesForOneComponent(), LAGUERRE, 15, 15)); 
	}

	//make a small number of iterations for all random starts 
	for (i = 0; i < distVec.size(); ++i)
	{
		likelihoodVec[i] = optimizeParam(&distVec[i], startIter, epsilon, epsilomQopt, pOutF);
	}

	//sort results and make full optimization only on the best starts
	Vdouble sortedL = likelihoodVec;
	sort(sortedL.begin(),sortedL.end());
	MDOUBLE threshold = sortedL[sortedL.size()- bestStartsNum];
	MDOUBLE bestL = sortedL[0];
	int bestDistNum = 0;
	for (i = 0; i < distVec.size(); ++i)
	{
		if (likelihoodVec[i] >= threshold) 
		{
			MDOUBLE newL = optimizeParam(&distVec[i], maxIterations, epsilon, epsilomQopt, pOutF);
			if (newL > bestL)
			{
				bestL = newL;
				bestDistNum = i;
			}
		}
	}
	_pSp->setDistribution(&distVec[bestDistNum]);
	distVec.clear();
	return bestL;
}
Ejemplo n.º 9
0
//Input: alf = the alpha parameter of the Laguerre polynomials
//		 pointsNum = the polynom order
//Output: the abscissas and weights are stored in the vecotrs x and w, respectively. 
//Discreption: given alf, the alpha parameter of the Laguerre polynomials, the function returns the abscissas and weights
//			   of the n-point Guass-Laguerre quadrature formula.
//			   The smallest abscissa is stored in x[0], the largest in x[pointsNum - 1].
void GLaguer::gaulag(Vdouble &x, Vdouble  &w, const MDOUBLE alf, const int pointsNum)
{
	x.resize(pointsNum, 0.0);
	w.resize(pointsNum, 0.0);
	const int MAXIT=10000;
	const MDOUBLE EPS=1.0e-6;
	int i,its,j;
	MDOUBLE ai,p1,p2,p3,pp,z=0.0,z1;

	int n= x.size();
	for (i=0;i<n;i++) {
		//loops over the desired roots
		if (i == 0) { //initial guess for the smallest root
			z=(1.0+alf)*(3.0+0.92*alf)/(1.0+2.4*n+1.8*alf);
		} else if (i == 1) {//initial guess for the second smallest root
			z += (15.0+6.25*alf)/(1.0+0.9*alf+2.5*n);
		} else { //initial guess for the other roots
			ai=i-1;
			z += ((1.0+2.55*ai)/(1.9*ai)+1.26*ai*alf/
				(1.0+3.5*ai))*(z-x[i-2])/(1.0+0.3*alf);
		}
		for (its=0;its<MAXIT;its++) { //refinement by Newton's method
			p1=1.0;
			p2=0.0;
			for (j=0;j<n;j++) { //Loop up the recurrence relation to get the Laguerre polynomial evaluated at z.
				p3=p2;
				p2=p1;
				p1=((2*j+1+alf-z)*p2-(j+alf)*p3)/(j+1);
			}
			//p1 is now the desired Laguerre polynomial. We next compute pp, its derivative,
			//by a standard relation involving also p2, the polynomial of one lower order.
			pp=(n*p1-(n+alf)*p2)/z;
			z1=z;
			z=z1-p1/pp; //Newton's formula
			if (fabs(z-z1) <= EPS) 
				break;
		}
		if (its >= MAXIT) 
			errorMsg::reportError("too many iterations in gaulag");
		x[i]=z;
		w[i] = -exp(gammln(alf+n)-gammln(MDOUBLE(n)))/(pp*n*p2);
	}
}
Ejemplo n.º 10
0
Vdouble TreeTools::getBranchLengths(const Tree& tree, int nodeId) throw (NodeNotFoundException, NodeException)
{
    if (!tree.hasNode(nodeId))
        throw NodeNotFoundException("TreeTools::getBranchLengths", nodeId);
    Vdouble brLen(1);
    if (tree.hasDistanceToFather(nodeId))
        brLen[0] = tree.getDistanceToFather(nodeId);
    else
        throw NodeException("TreeTools::getbranchLengths(). No branch length.", nodeId);
    vector<int> sons = tree.getSonsId(nodeId);
    for (size_t i = 0; i < sons.size(); i++)
    {
        Vdouble sonBrLen = getBranchLengths(tree, sons[i]);
        for (size_t j = 0; j < sonBrLen.size(); j++)
        {
            brLen.push_back(sonBrLen[j]);
        }
    }
    return brLen;
}
Ejemplo n.º 11
0
MDOUBLE siteSpecificRateGL::computeML_siteSpecificRate(Vdouble & ratesV,
								   Vdouble & likelihoodsV,
								   const sequenceContainer& sc,
								   const stochasticProcess& sp,
								   const tree& et,
								   const MDOUBLE maxRate,//20.0f
								   const MDOUBLE tol){//=0.0001f;

	ratesV.resize(sc.seqLen());
	likelihoodsV.resize(sc.seqLen());
	MDOUBLE Lsum = 0.0;

	for (int pos=0; pos < sc.seqLen(); ++pos) {
		siteSpecificRateGL::computeML_siteSpecificRate(pos,sc,sp,et,ratesV[pos],likelihoodsV[pos],maxRate,tol);
		assert(likelihoodsV[pos]>0.0);
		Lsum += log(likelihoodsV[pos]);
		LOG(5,<<" rate of pos: "<<pos<<" = "<<ratesV[pos]<<endl);
	}
	LOG(5,<<" number of sites: "<<sc.seqLen()<<endl);
	return Lsum;
}
void generalGammaDistributionFixedCategories::setFixedCategories(const Vdouble& fixedBoundaries){

	if (fixedBoundaries.size()<2)
		errorMsg::reportError("Error in generalGammaDistributionFixedCategories::setFixedCategories : at least two boundaries are required");
	if (fixedBoundaries[0] > 0.0)
		errorMsg::reportError("Error in generalGammaDistributionFixedCategories::setFixedCategories : first boundary should be zero");
	
	_bonderi = fixedBoundaries;
	if (_bonderi[_bonderi.size()] > VERYBIG/10000.0)
		 _bonderi[_bonderi.size()] = VERYBIG/10000.0; // to avoid overflow 

	setFixedCategories();
}
Ejemplo n.º 13
0
void YNGKP_M3::updateMatrices()
{
  for (unsigned int i = 0; i < lParPmodel_.size(); i++)
  {
    if (mapParNamesFromPmodel_.find(lParPmodel_[i].getName()) != mapParNamesFromPmodel_.end())
    {
      if (lParPmodel_[i].getName()[18] == 'V')
      {
        unsigned int ind = TextTools::toInt(lParPmodel_[i].getName().substr(19));
        double x = getParameterValue("omega0");
        for (unsigned j = 1; j < ind; j++)
        {
          x += getParameterValue("delta" + TextTools::toString(j));
        }
        lParPmodel_[i].setValue(x);
      }
      else
      {
        lParPmodel_[i].setValue(getParameter(getParameterNameWithoutNamespace(mapParNamesFromPmodel_[lParPmodel_[i].getName()])).getValue());
      }
    }
  }

  pmixmodel_->matchParametersValues(lParPmodel_);

  // homogeneization of the synonymous substitution rates


  Vdouble vd;

  for (unsigned int i = 0; i < pmixmodel_->getNumberOfModels(); i++)
  {
    vd.push_back(1 / pmixmodel_->getNModel(i)->Qij(synfrom_, synto_));
  }

  pmixmodel_->setVRates(vd);
}
Ejemplo n.º 14
0
// a file with color-coding from Ka/Ks values to color-bins
void kaks2Color(const Vdouble & kaksVec, const Vdouble &lowerBoundV,
				const sequence & refSeq, string fileName,codon *co) {
	vector<int> colors;
	int numOfSitesinAln = kaksVec.size();
	Vdouble negativesKaksVec,negativesSite;
	negativesKaksVec.clear();
	negativesSite.clear();
	int i,gapsInRefSeq=0;

	for (i=0;i<numOfSitesinAln;i++){
		if (codonUtility::aaOf(refSeq[i],*co) == -1) gapsInRefSeq++; 
	}

	// first dealing with positive selection
	colors.resize(numOfSitesinAln-gapsInRefSeq);
	int gap=0;
	for (i=0;i<numOfSitesinAln;i++){
		if (codonUtility::aaOf(refSeq[i],*co) == -1){
			gap++;
			continue;
		}
		if (lowerBoundV[i]>1) // color 1 (positive selection) : if confidence interval lower bound > 1
			colors[i-gap]=1;
		else if (kaksVec[i]>1) // color 2(positive selection) : "non-significant"
			colors[i-gap]=2;
		else  {
			negativesKaksVec.push_back(kaksVec[i]);  //add the value of kaks < 1
			negativesSite.push_back(i-gap);   //add the number of site of the kaks 
		}
	
	}

	// now dealing with purifying selection
	Vdouble orderVec = negativesKaksVec;
	if (orderVec.size()>0) // this is since once the whole protein was positive selection... (anomaly)
		sort(orderVec.begin(), orderVec.end());  //sort the kaks values to be divided to 5 groups
	MDOUBLE percentileNum = 5.0;
	int percentileNumInt = 5;
	Vdouble maxScoreForPercentile(percentileNumInt);
	if (orderVec.size()>0) {
		maxScoreForPercentile[0] = orderVec[0]; 
		for (int c = 1; c < percentileNumInt; ++c){
			int place = (int)((c / percentileNum) * negativesKaksVec.size());
			MDOUBLE maxScore = orderVec[place];
			maxScoreForPercentile[c] = maxScore;
		}
	}

	//loop over all the Ka/Ks < 1  
	for (int j=0; j < negativesKaksVec.size(); ++j){
			MDOUBLE r = negativesKaksVec[j]; //the kaks of the site.
			int s = (int)negativesSite[j];  //the  site.
			if (r > maxScoreForPercentile[4]) 
					colors[s] = 3;
			else if (r > maxScoreForPercentile[3]) 
					colors[s] = 4;
			else if (r> maxScoreForPercentile[2])
					colors[s] = 5;
			else if (r > maxScoreForPercentile[1])
					colors[s] = 6;
			else if (r >= maxScoreForPercentile[0])
					colors[s] = 7;
	}
	//print to file
	ofstream out(fileName.c_str());
	gap=0;
	amino aminoAcid;
	LOG(5,<<"Printing selection color bins to file"<<endl);
	for (i=0;i<refSeq.seqLen();i++){	 
		int aa = codonUtility::aaOf(refSeq[i], *co);
		if (aa==-1){
			gap++;
			continue;
		}
		string aaStr = aminoAcid.fromInt(aa);
		out<<i+1-gap <<"\t"<<aaStr<<"\t"<<colors[i-gap];
		out<<endl;
	}
	out.close();
}
Ejemplo n.º 15
0
int main(int args, char** argv)
{
  cout << "******************************************************************" << endl;
  cout << "*     Bio++ Computation of site likelihoods inside mixed models  *" << endl;
  cout << "*                        Version 2.2.0.                          *" << endl;
  cout << "* Author: L. Guéguen                       Last Modif.: 25/09/14 *" << endl;
  cout << "******************************************************************" << endl;
  cout << endl;

  if (args == 1)
  {
    help();
    return 0;
  }

  try
  {
    BppApplication bppmixedlikelihoods(args, argv, "BppMixedLikelihoods");
    bppmixedlikelihoods.startTimer();

    Alphabet* alphabet = SequenceApplicationTools::getAlphabet(bppmixedlikelihoods.getParams(), "", false);
    auto_ptr<GeneticCode> gCode;
    CodonAlphabet* codonAlphabet = dynamic_cast<CodonAlphabet*>(alphabet);
    if (codonAlphabet) {
      string codeDesc = ApplicationTools::getStringParameter("genetic_code", bppmixedlikelihoods.getParams(), "Standard", "", true, true);
      ApplicationTools::displayResult("Genetic Code", codeDesc);
      
      gCode.reset(SequenceApplicationTools::getGeneticCode(codonAlphabet->getNucleicAlphabet(), codeDesc));
    }

    // get the data

    VectorSiteContainer* allSites = SequenceApplicationTools::getSiteContainer(alphabet, bppmixedlikelihoods.getParams());

    VectorSiteContainer* sites = SequenceApplicationTools::getSitesToAnalyse(*allSites, bppmixedlikelihoods.getParams(), "", true, false);
    delete allSites;

    ApplicationTools::displayResult("Number of sequences", TextTools::toString(sites->getNumberOfSequences()));
    ApplicationTools::displayResult("Number of sites", TextTools::toString(sites->getNumberOfSites()));

    // Get the tree
    Tree* tree = PhylogeneticsApplicationTools::getTree(bppmixedlikelihoods.getParams());
    ApplicationTools::displayResult("Number of leaves", TextTools::toString(tree->getNumberOfLeaves()));


    AbstractDiscreteRatesAcrossSitesTreeLikelihood* tl;
    string nhOpt = ApplicationTools::getStringParameter("nonhomogeneous", bppmixedlikelihoods.getParams(), "no", "", true, false);
    ApplicationTools::displayResult("Heterogeneous model", nhOpt);

    MixedSubstitutionModel* model       = 0;
    MixedSubstitutionModelSet* modelSet = 0;
    DiscreteDistribution* rDist         = 0;

    if (nhOpt == "no")
    {
      model = dynamic_cast<MixedSubstitutionModel*>(PhylogeneticsApplicationTools::getSubstitutionModel(alphabet, gCode.get(), sites, bppmixedlikelihoods.getParams()));
      if (model == 0)
      {
        cout << "Model is not a Mixed model" << endl;
        exit(0);
      }

      SiteContainerTools::changeGapsToUnknownCharacters(*sites);
      if (model->getNumberOfStates() > model->getAlphabet()->getSize())
      {
        // Markov-modulated Markov model!
        rDist = new ConstantRateDistribution();
      }
      else
      {
        rDist = PhylogeneticsApplicationTools::getRateDistribution(bppmixedlikelihoods.getParams());
      }
      tl = new RHomogeneousMixedTreeLikelihood(*tree, *sites, model, rDist, true);
    }
    else if (nhOpt == "one_per_branch")
    {
      model = dynamic_cast<MixedSubstitutionModel*>(PhylogeneticsApplicationTools::getSubstitutionModel(alphabet, gCode.get(), sites, bppmixedlikelihoods.getParams()));
      if (model == 0)
      {
        cout << "Model is not a Mixed model" << endl;
        exit(0);
      }

      SiteContainerTools::changeGapsToUnknownCharacters(*sites);
      if (model->getNumberOfStates() > model->getAlphabet()->getSize())
      {
        // Markov-modulated Markov model!
        rDist = new ConstantRateDistribution();
      }
      else
      {
        rDist = PhylogeneticsApplicationTools::getRateDistribution(bppmixedlikelihoods.getParams());
      }
      vector<double> rateFreqs;
      if (model->getNumberOfStates() != alphabet->getSize())
      {
        // Markov-Modulated Markov Model...
        unsigned int n = (unsigned int)(model->getNumberOfStates() / alphabet->getSize());
        rateFreqs = vector<double>(n, 1. / (double)n); // Equal rates assumed for now, may be changed later (actually, in the most general case,
        // we should assume a rate distribution for the root also!!!
      }
      
      std::map<std::string, std::string> aliasFreqNames;

      FrequenciesSet* rootFreqs = PhylogeneticsApplicationTools::getRootFrequenciesSet(alphabet, gCode.get(), sites, bppmixedlikelihoods.getParams(), aliasFreqNames, rateFreqs);
      vector<string> globalParameters = ApplicationTools::getVectorParameter<string>("nonhomogeneous_one_per_branch.shared_parameters", bppmixedlikelihoods.getParams(), ',', "");
      modelSet = dynamic_cast<MixedSubstitutionModelSet*>(SubstitutionModelSetTools::createNonHomogeneousModelSet(model, rootFreqs, tree, aliasFreqNames, globalParameters));
      model = 0;
      tl = new RNonHomogeneousMixedTreeLikelihood(*tree, *sites, modelSet, rDist, true);
    }
    else if (nhOpt == "general")
    {
      modelSet = dynamic_cast<MixedSubstitutionModelSet*>(PhylogeneticsApplicationTools::getSubstitutionModelSet(alphabet, gCode.get(), sites, bppmixedlikelihoods.getParams()));
      if (modelSet == 0)
      {
        cout << "Missing a Mixed model" << endl;
        exit(0);
      }

      SiteContainerTools::changeGapsToUnknownCharacters(*sites);
      if (modelSet->getNumberOfStates() > modelSet->getAlphabet()->getSize())
      {
        // Markov-modulated Markov model!
        rDist = new ConstantDistribution(1.);
      }
      else
      {
        rDist = PhylogeneticsApplicationTools::getRateDistribution(bppmixedlikelihoods.getParams());
      }
      tl = new RNonHomogeneousMixedTreeLikelihood(*tree, *sites, modelSet, rDist, true);
    }
    else
      throw Exception("Unknown option for nonhomogeneous: " + nhOpt);

    tl->initialize();

    double logL = tl->getValue();
    if (isinf(logL))
    {
      // This may be due to null branch lengths, leading to null likelihood!
      ApplicationTools::displayWarning("!!! Warning!!! Likelihood is zero.");
      ApplicationTools::displayWarning("!!! This may be due to branch length == 0.");
      ApplicationTools::displayWarning("!!! All null branch lengths will be set to 0.000001.");
      ParameterList pl = tl->getBranchLengthsParameters();
      for (unsigned int i = 0; i < pl.size(); i++)
      {
        if (pl[i].getValue() < 0.000001)
          pl[i].setValue(0.000001);
      }
      tl->matchParametersValues(pl);
      logL = tl->getValue();
    }
    if (isinf(logL))
    {
      ApplicationTools::displayError("!!! Unexpected likelihood == 0.");
      ApplicationTools::displayError("!!! Looking at each site:");
      for (unsigned int i = 0; i < sites->getNumberOfSites(); i++)
      {
        (*ApplicationTools::error << "Site " << sites->getSite(i).getPosition() << "\tlog likelihood = " << tl->getLogLikelihoodForASite(i)).endLine();
      }
      ApplicationTools::displayError("!!! 0 values (inf in log) may be due to computer overflow, particularily if datasets are big (>~500 sequences).");
      exit(-1);
    }


    // Write parameters to screen:
    ApplicationTools::displayResult("Log likelihood", TextTools::toString(tl->getValue(), 15));
    ParameterList parameters = tl->getSubstitutionModelParameters();
    for (unsigned int i = 0; i < parameters.size(); i++)
    {
      ApplicationTools::displayResult(parameters[i].getName(), TextTools::toString(parameters[i].getValue()));
    }
    parameters = tl->getRateDistributionParameters();
    for (unsigned int i = 0; i < parameters.size(); i++)
    {
      ApplicationTools::displayResult(parameters[i].getName(), TextTools::toString(parameters[i].getValue()));
    }


    // /////////////////////////////////////////////
    // Getting likelihoods per submodel

    string outputFile;
    outputFile = ApplicationTools::getAFilePath("output.likelihoods.file", bppmixedlikelihoods.getParams(), true, false);
    ApplicationTools::displayResult("Output file for likelihoods", outputFile);
    ofstream out(outputFile.c_str(), ios::out);

    size_t nSites = sites->getNumberOfSites();

    size_t nummodel = ApplicationTools::getParameter<size_t>("likelihoods.model_number", bppmixedlikelihoods.getParams(), 1, "", true, true);

    string parname = ApplicationTools::getStringParameter("likelihoods.parameter_name", bppmixedlikelihoods.getParams(), "", "", true, false);

    if (modelSet && ((nummodel <= 0) || (nummodel > modelSet->getNumberOfModels())))
    {
      ApplicationTools::displayError("Bad number of model " + TextTools::toString(nummodel) + ".");
      exit(-1);
    }

    MixedSubstitutionModel* p0 = dynamic_cast<MixedSubstitutionModel*>(model ? model : modelSet->getModel(nummodel - 1));

    if (!p0)
    {
      ApplicationTools::displayError("Model " + TextTools::toString(nummodel) + " is not a Mixed Model.");
      exit(-1);
    }

    const AbstractBiblioMixedSubstitutionModel* ptmp = dynamic_cast<const AbstractBiblioMixedSubstitutionModel*>(p0);
    if (ptmp) {
      p0 = ptmp->getMixedModel().clone();

      if (nhOpt == "no")
        model = p0;
      else {
        modelSet->replaceModel(nummodel-1, p0);
        modelSet->isFullySetUpFor(*tree);
      }
    }
    
    //////////////////////////////////////////////////
    // Case of a MixtureOfSubstitutionModels

    MixtureOfSubstitutionModels* pMSM = dynamic_cast<MixtureOfSubstitutionModels*>(p0);

    if (pMSM)
    {
      vector<string> colNames;
      colNames.push_back("Sites");

      size_t nummod = pMSM->getNumberOfModels();
      for (unsigned int i = 0; i < nummod; i++)
      {
        colNames.push_back(pMSM->getNModel(i)->getName());
      }

      DataTable* rates = new DataTable(nSites, colNames.size());
      rates->setColumnNames(colNames);

      for (unsigned int i = 0; i < nSites; i++)
      {
        const Site* currentSite = &sites->getSite(i);
        int currentSitePosition = currentSite->getPosition();
        (*rates)(i, "Sites") = string("[" + TextTools::toString(currentSitePosition) + "]");
      }

      Vdouble vprob = pMSM->getProbabilities();
      for (unsigned int i = 0; i < nummod; i++)
      {
        string modname = pMSM->getNModel(i)->getName();

        for (unsigned int j = 0; j < nummod; j++)
        {
          pMSM->setNProbability(j, (j == i) ? 1 : 0);
        }

        if (tl)
          delete tl;

        if (nhOpt == "no")
          tl = new RHomogeneousMixedTreeLikelihood(*tree, *sites, model, rDist, true, false, true);
        else
          tl = new RNonHomogeneousMixedTreeLikelihood(*tree, *sites, modelSet, rDist, false, true);

        tl->initialize();
        logL = tl->getValue();
        Vdouble Vd = tl->getLogLikelihoodForEachSite();
        for (unsigned int j = 0; j < nSites; j++)
        {
          (*rates)(j, modname) = TextTools::toString(Vd[j]);
        }

        ApplicationTools::displayMessage("\n");
        ApplicationTools::displayMessage("Model " + modname + ":");
        ApplicationTools::displayResult("Log likelihood", TextTools::toString(tl->getValue(), 15));
        ApplicationTools::displayResult("Probability", TextTools::toString(vprob[i], 15));
      }

      DataTable::write(*rates, out, "\t");
    }

    //////////////////////////////////////////////////
    // Case of a MixtureOfASubstitutionModel

    else
    {
      MixtureOfASubstitutionModel* pMSM2 = dynamic_cast<MixtureOfASubstitutionModel*>(p0);
      if (pMSM2 != NULL)
      {
        size_t nummod = pMSM2->getNumberOfModels();
        if (parname == "")
        {
          ParameterList pl=pMSM2->getParameters();

          for (size_t i2 = 0; i2 < pl.size(); i2++)
          {
            string pl2n = pl[i2].getName();

            if (dynamic_cast<const ConstantDistribution*>(pMSM2->getDistribution(pl2n))==NULL)
            {
              parname=pl2n;

              while (parname.size()>0 && pMSM2->getDistribution(parname)==NULL)
                parname=pl2n.substr(0,pl2n.rfind("_"));

              if (parname.size()>0){
                ApplicationTools::displayResult("likelihoods.parameter_name", parname);
                break;
              }
            }
          }
        }

        if (parname == "")
        {
          ApplicationTools::displayError("Argument likelihoods.parameter_name is required.");
          exit(-1);
        }

        vector< Vint > vvnmod;
        size_t i2 = 0;
        while (i2 < nummod)
        {
          string par2 = parname + "_" + TextTools::toString(i2 + 1);
          Vint vnmod = pMSM2->getSubmodelNumbers(par2);
          if (vnmod.size() == 0)
            break;
          vvnmod.push_back(vnmod);
          i2++;
        }

        size_t nbcl = vvnmod.size();
        if (nbcl==0)
          throw Exception("Parameter " + parname + " is not mixed.");
        
        Vdouble vprob = pMSM2->getProbabilities();

        vector<vector<double> > vvprob;
        vector<double> vsprob;
        
        for (size_t i = 0; i < nbcl; i++)
        {
          vector<double> vprob2;
          for (size_t j = 0; j < vvnmod[i].size(); j++)
          {
            vprob2.push_back(vprob[static_cast<size_t>(vvnmod[i][j])]);
          }

          vvprob.push_back(vprob2);
          vsprob.push_back(VectorTools::sum(vvprob[i]));
        }

        vector<string> colNames;
        colNames.push_back("Sites");

        Vdouble dval;
        for (unsigned int i = 0; i < nbcl; i++)
        {
          SubstitutionModel* pSM = pMSM2->getNModel(static_cast<size_t>(vvnmod[i][0]));
          double valPar = pSM->getParameterValue(pSM->getParameterNameWithoutNamespace(parname));
          dval.push_back(valPar);
          colNames.push_back("Ll_" + parname + "=" + TextTools::toString(valPar));
        }
        for (unsigned int i = 0; i < nbcl; i++)
          colNames.push_back("Pr_" + parname + "=" + TextTools::toString(dval[i]));

        colNames.push_back("mean");

        DataTable* rates = new DataTable(nSites, colNames.size());
        rates->setColumnNames(colNames);

        for (unsigned int i = 0; i < nSites; i++)
        {
          const Site* currentSite = &sites->getSite(i);
          int currentSitePosition = currentSite->getPosition();
          (*rates)(i,"Sites")=TextTools::toString(currentSitePosition);
        }

        VVdouble vvd;

          
        vector<double> vRates = pMSM2->getVRates();

        for (size_t i = 0; i < nbcl; ++i)
        {
          string par2 = parname + "_" + TextTools::toString(i + 1);
          
          for (unsigned int j = 0; j < nummod; ++j)
            pMSM2->setNProbability(j, 0);

          for (size_t j = 0; j < vvprob[i].size(); ++j)
            pMSM2->setNProbability(static_cast<size_t>(vvnmod[i][j]), vvprob[i][j] / vsprob[i]);

          if (tl)
            delete tl;

          if (nhOpt == "no")
            tl = new RHomogeneousMixedTreeLikelihood(*tree, *sites, model, rDist, true, false, true);
          else
            tl = new RNonHomogeneousMixedTreeLikelihood(*tree, *sites, modelSet, rDist, false, true);

          tl->initialize();
          logL = tl->getValue();
          Vdouble vd = tl->getLogLikelihoodForEachSite();

          for (unsigned int j = 0; j < nSites; j++)
            (*rates)(j, i + 1) = TextTools::toString(vd[j]);

          vvd.push_back(vd);

          ApplicationTools::displayMessage("\n");
          ApplicationTools::displayMessage("Parameter " + par2 + "=" + TextTools::toString(dval[i]) + " with rate=" + TextTools::toString(vRates[i]));

          ApplicationTools::displayResult("Log likelihood", TextTools::toString(tl->getValue(), 15));
          ApplicationTools::displayResult("Probability", TextTools::toString(vsprob[i], 15));
        }

        for (unsigned int j = 0; j < nSites; j++)
        {
          Vdouble vd;
          for (unsigned int i = 0; i < nbcl; i++)
            vd.push_back(std::log(vsprob[i])+vvd[i][j]);
          
          VectorTools::logNorm(vd);
          for (unsigned int i = 0; i < nbcl; i++)
            (*rates)(j,nbcl + i + 1) = TextTools::toString(std::exp(vd[i]));
          (*rates)(j, 2 * nbcl + 1) = TextTools::toString(VectorTools::sumExp(vd, dval));
        }

        DataTable::write(*rates, out, "\t");
      }
    }

    delete alphabet;
    delete sites;
    if (model)
      delete model;
    if (modelSet)
      delete modelSet;
    delete rDist;
    delete tl;
    delete tree;
    ApplicationTools::displayMessage("\n");
    bppmixedlikelihoods.done();
  }

  catch (exception& e)
  {
    cout << e.what() << endl;
    return 1;
  }

  return 0;
}
Ejemplo n.º 16
0
	double simulate_with_dependence (string treeFile, double PI_1, double init_k, int total_positions, int num_pos_with_same_k, double k_increase, int is_gamma, double alpha, double beta, int num_cat)
	{

		//read Newick format tree
		tree treeIn(treeFile);
	
		//four states alphabet A C G T (will later be rplaced to 00,01,10,11)
		alphabet* alph = new nucleotide;

		sequenceContainer SC_all; //this will contain all positions
	
		//parameters:
		double PI_0 = 1-PI_1;
		double k = init_k; //will be increased with each iteration

		//parameters:
		int jump_size = total_positions / num_pos_with_same_k;

		for(int i=0; i<jump_size; i++)
		{
			Vdouble freqs; //stationary probabilities PI_00, PI_01, PI_10, PI_11
			double TOTAL = k*PI_1*PI_1 + 2*PI_0*PI_1 + k*PI_0*PI_0;
			freqs.push_back(k*PI_0*PI_0 / TOTAL); //PI_00 = k*PI_0*PI_0 / TOTAL
			freqs.push_back(PI_0*PI_1 / TOTAL); //PI_01 = PI_0*PI_1 / TOTAL
			freqs.push_back(PI_0*PI_1 / TOTAL); //PI_10 = PI_0*PI_1 / TOTAL
			freqs.push_back(k*PI_1*PI_1 / TOTAL); //PI_11 = k*PI_1*PI_1 / TOTAL
	
			//Q matrix (partial values - the rest are calculated by gtrModel using freqs and these values)
			MDOUBLE a2c = PI_1; // --> c2a = freqs[a]*a2c/freqs[c] --> c2a = ((k*PI_0*PI_0 / TOTAL)*PI_1)/(PI_0*PI_1 / TOTAL) = k*PI_0
			MDOUBLE a2g = PI_1;
			MDOUBLE a2t = 0;
			MDOUBLE c2g = 0;
			MDOUBLE c2t = k*PI_1;
			MDOUBLE g2t = k*PI_1;

			//starting the evolutionary model
			distribution *currDist = NULL;
			if(is_gamma == 1)
			{
				currDist = new generalGammaDistribution(alpha,beta,num_cat); // ---> in the future we might want to turn these into param
			}
			else
			{
				currDist = new uniDistribution; // no among site rate variation
			}
		
			replacementModel *probMod = NULL;
			pijAccelerator *pijAcc = NULL;

			probMod = new gtrModel(freqs,a2c,a2g,a2t,c2g,c2t,g2t);
			pijAcc = new trivialAccelerator(probMod);
			stochasticProcess* _sp = new stochasticProcess(currDist, pijAcc);

			//simulate:
			simulateTree st1(treeIn, *_sp, alph);
			st1.generate_seq(num_pos_with_same_k); //simulate num_pos_with_same_k positions with the current k

			if(i == 0)
			{
				SC_all = st1.toSeqDataWithoutInternalNodes(); //first time
			}
			else
			{
				sequenceContainer SC = st1.toSeqDataWithoutInternalNodes(); //concatenate new positions to the ones you have
				SC_all.concatenate(SC);
			}
		
			delete currDist;
			delete probMod;
			delete pijAcc;
			delete _sp;

			k = k + k_increase; //k = 1 , 1.05 , 1.1 , ... , 5.5
		}

		//prepare out file name:
		std::stringstream sstm;
		if(is_gamma == 1)
		{
			sstm << treeFile << ".gammaRateNoInv.PI_1=" << PI_1 << ".init_k=" << init_k << ".k_group_size=" << num_pos_with_same_k << ".k_increase=" << k_increase << ".fas";
		}
		else
		{
			sstm << treeFile << ".NoRate.PI_1=" << PI_1 << ".init_k=" << init_k << ".k_group_size=" << num_pos_with_same_k << ".k_increase=" << k_increase << ".fas";
		}
		std::string seqOutputFile = sstm.str();

		//write out:
		ofstream seq_sim(seqOutputFile.c_str());
		fastaFormat::write(seq_sim,SC_all);
		seq_sim.close();
	
	
		delete alph;
		
		return 0;
	
	}