/////////////////////////////////////////////////////////////////////////////////////////////////////////// //findBestParamManyStarts: Finds the best gammaMixture from many starting points. //The function starts form few starting points. //For each point it tries to optimize the likellihood doing only a small number of iterations. //It then picks the best points (highest likelihood) and continue the maximization for these points only. //The best gammaMixture is stored in _sp and the best likelihood is returned. //input Parameters: //startPointsNum = the number of starting points. //bestStartsNum = the number of best points to continue with the full optimization. //startIter = the number of iterations to perform with all starting points. //maxIterations = the maximum number of iterations to continue with the best points //epsilon = for determining convergence in the maximization process. MDOUBLE optGammaMixtureEM::findBestParamManyStarts(const int startPointsNum, const int bestStartsNum, const int startIter, const int maxIterations, const MDOUBLE epsilon, const MDOUBLE epsilomQopt, ofstream* pOutF) { vector<mixtureDistribution> distVec; Vdouble likelihoodVec(startPointsNum); mixtureDistribution * pMixture = static_cast<mixtureDistribution*>(_pSp->distr()); //create starting distributions int i; for (i = 0; i < startPointsNum; ++i) { //the first distribution will be the current one if (i == 0) distVec.push_back(*pMixture); else distVec.push_back(mixtureDistribution(pMixture->getComponentsNum(), pMixture->categoriesForOneComponent(), LAGUERRE, 15, 15)); } //make a small number of iterations for all random starts for (i = 0; i < distVec.size(); ++i) { likelihoodVec[i] = optimizeParam(&distVec[i], startIter, epsilon, epsilomQopt, pOutF); } //sort results and make full optimization only on the best starts Vdouble sortedL = likelihoodVec; sort(sortedL.begin(),sortedL.end()); MDOUBLE threshold = sortedL[sortedL.size()- bestStartsNum]; MDOUBLE bestL = sortedL[0]; int bestDistNum = 0; for (i = 0; i < distVec.size(); ++i) { if (likelihoodVec[i] >= threshold) { MDOUBLE newL = optimizeParam(&distVec[i], maxIterations, epsilon, epsilomQopt, pOutF); if (newL > bestL) { bestL = newL; bestDistNum = i; } } } _pSp->setDistribution(&distVec[bestDistNum]); distVec.clear(); return bestL; }
// a file with color-coding from Ka/Ks values to color-bins void kaks2Color(const Vdouble & kaksVec, const Vdouble &lowerBoundV, const sequence & refSeq, string fileName,codon *co) { vector<int> colors; int numOfSitesinAln = kaksVec.size(); Vdouble negativesKaksVec,negativesSite; negativesKaksVec.clear(); negativesSite.clear(); int i,gapsInRefSeq=0; for (i=0;i<numOfSitesinAln;i++){ if (codonUtility::aaOf(refSeq[i],*co) == -1) gapsInRefSeq++; } // first dealing with positive selection colors.resize(numOfSitesinAln-gapsInRefSeq); int gap=0; for (i=0;i<numOfSitesinAln;i++){ if (codonUtility::aaOf(refSeq[i],*co) == -1){ gap++; continue; } if (lowerBoundV[i]>1) // color 1 (positive selection) : if confidence interval lower bound > 1 colors[i-gap]=1; else if (kaksVec[i]>1) // color 2(positive selection) : "non-significant" colors[i-gap]=2; else { negativesKaksVec.push_back(kaksVec[i]); //add the value of kaks < 1 negativesSite.push_back(i-gap); //add the number of site of the kaks } } // now dealing with purifying selection Vdouble orderVec = negativesKaksVec; if (orderVec.size()>0) // this is since once the whole protein was positive selection... (anomaly) sort(orderVec.begin(), orderVec.end()); //sort the kaks values to be divided to 5 groups MDOUBLE percentileNum = 5.0; int percentileNumInt = 5; Vdouble maxScoreForPercentile(percentileNumInt); if (orderVec.size()>0) { maxScoreForPercentile[0] = orderVec[0]; for (int c = 1; c < percentileNumInt; ++c){ int place = (int)((c / percentileNum) * negativesKaksVec.size()); MDOUBLE maxScore = orderVec[place]; maxScoreForPercentile[c] = maxScore; } } //loop over all the Ka/Ks < 1 for (int j=0; j < negativesKaksVec.size(); ++j){ MDOUBLE r = negativesKaksVec[j]; //the kaks of the site. int s = (int)negativesSite[j]; //the site. if (r > maxScoreForPercentile[4]) colors[s] = 3; else if (r > maxScoreForPercentile[3]) colors[s] = 4; else if (r> maxScoreForPercentile[2]) colors[s] = 5; else if (r > maxScoreForPercentile[1]) colors[s] = 6; else if (r >= maxScoreForPercentile[0]) colors[s] = 7; } //print to file ofstream out(fileName.c_str()); gap=0; amino aminoAcid; LOG(5,<<"Printing selection color bins to file"<<endl); for (i=0;i<refSeq.seqLen();i++){ int aa = codonUtility::aaOf(refSeq[i], *co); if (aa==-1){ gap++; continue; } string aaStr = aminoAcid.fromInt(aa); out<<i+1-gap <<"\t"<<aaStr<<"\t"<<colors[i-gap]; out<<endl; } out.close(); }