void FidoInterface::getEstimated_and_Empirical_FDR( const std::vector<std::vector<string> >& proteinNames, const std::vector<double>& probabilities, std::vector<double>& empq, std::vector<double>& estq) { empq.clear(); estq.clear(); std::vector<std::pair<double, bool> > combined; std::vector<double> peps; for (unsigned int k = 0; k < proteinNames.size(); ++k) { double prob = probabilities[k]; unsigned tpChange = countTargets(proteinNames[k]); unsigned fpChange = proteinNames[k].size() - tpChange; bool isDecoy = (tpChange == 0); combined.push_back(make_pair(probabilities[k], !isDecoy)); peps.push_back(probabilities[k]); } if (usePi0_) { std::vector<double> pvals; PosteriorEstimator::getPValues(combined, pvals); pi0_ = PosteriorEstimator::estimatePi0(pvals); } PosteriorEstimator::setNegative(true); // also get q-values for decoys PosteriorEstimator::getQValuesFromPEP(peps, estq); PosteriorEstimator::getQValues(pi0_, combined, empq); }
void FidoInterface::getEstimated_and_Empirical_FDR( const std::vector<std::vector<string> >& proteinNames, const std::vector<double>& probabilities, std::vector<double>& empq, std::vector<double>& estq) { empq.clear(); estq.clear(); double targetDecoyRatio = 1.0; if (usePi0_) { targetDecoyRatio = static_cast<double>(numberTargetProteins_) / numberDecoyProteins_; std::vector<std::pair<double, bool> > combined; for (unsigned int k = 0; k < proteinNames.size(); ++k) { double prob = probabilities[k]; unsigned tpChange = countTargets(proteinNames[k]); unsigned fpChange = proteinNames[k].size() - tpChange; bool isDecoy = (tpChange == 0); combined.push_back(make_pair(probabilities[k], !isDecoy)); } std::vector<double> pvals; PosteriorEstimator::getPValues(combined, pvals); pi0_ = PosteriorEstimator::estimatePi0(pvals); } FDRCalculator fdrCalculator(usePi0_, targetDecoyRatio, pi0_ * absenceRatio_, countDecoyQvalue_); //NOTE no need to store more q values since they will not be taken into account while estimating MSE FDR divergence for (unsigned int k = 0; (k < proteinNames.size() && (fdrCalculator.getPreviousEstQ() <= mseThreshold_)); k++) { double prob = probabilities[k]; unsigned tpChange = countTargets(proteinNames[k]); unsigned fpChange = proteinNames[k].size() - tpChange; if (trivialGrouping_) { if (tpChange > 0) tpChange = 1; if (fpChange > 0) fpChange = 1; } fdrCalculator.calcFDRs(fpChange, tpChange, prob, empq, estq); } if (kUpdateRocN) rocN_ = fdrCalculator.getRocN(); }
/**
 * Recounts numberTargetProteins_ and numberDecoyProteins_ from the protein
 * groups currently reported by proteinGraph_.
 *
 * Both counters are reset to zero first. For every group the targets are
 * counted with countTargets(); the rest of the group is counted as decoys.
 * With trivialGrouping_ a group contributes at most one target and one decoy;
 * without it every protein of the group is counted, which matches how the
 * FDR and ROC loops of this class accumulate tpChange / fpChange.
 */
void FidoInterface::updateTargetDecoySizes() {
  std::vector<std::vector<std::string> > proteinNames;
  proteinGraph_->getProteinNames(proteinNames);

  numberTargetProteins_ = 0;
  numberDecoyProteins_ = 0;
  for (unsigned int k = 0; k < proteinNames.size(); ++k) {
    const unsigned tpChange = countTargets(proteinNames[k]);
    const unsigned fpChange = proteinNames[k].size() - tpChange;
    if (trivialGrouping_) {
      if (tpChange > 0) numberTargetProteins_ += 1;
      if (fpChange > 0) numberDecoyProteins_ += 1;
    } else {
      // Previously nothing was counted on this path, leaving both totals at 0
      // (and the target/decoy ratio derived from them at 0/0).
      numberTargetProteins_ += tpChange;
      numberDecoyProteins_ += fpChange;
    }
  }
}
void FidoInterface::getROC_AUC(const std::vector<std::vector<string> > &names, const std::vector<double> &probabilities, double &auc) { /* Estimate ROC auc1 area as : (So - no(no + 1) / 2) / (no*n1) * where no = number of target * where n1 = number of decoy * where So = SUM ri * where ri is the rank of i target in the ranked list of target and decoys */ /* Estimate ROC auc2 area as : sum trapezoid area of each segment (integral of absolute value) * A_segment(i) = abs(X1-Xo) * abs((y1 + y2 ) / 2) * Where yo = number TP at segment i * Where y1 = number TP at segment i + 1 * Where Xo = number FP at segment i * Where X1 = number FP at segment i + 1 * Total Area = Total Area / total_TP * total_FP */ /* Estimate ROC auc3 area as : sum trapezoid area with antiderivatives of each segment (absolute value of the integral) * A_segment(i) = ((yo - m*Xo)*X1 + m/2 * X1^2) - ((yo - m*Xo)*Xo - m/2 * X2^2)) * Where yo = number TP at segment i * Where y1 = number TP at segment i + 1 * Where Xo = number FP at segment i * Where X2 = number FP at segment i + 1 * Where m = (y1 - y0) / (X1 - X0) * Total Area = abs(Total Area / total_TP * total_FP) */ std::vector<bool> ranked_list; // true if is decoy std::vector<unsigned> tpArray,fpArray; unsigned prev_tp,prev_fp,tp,fp; prev_tp = prev_fp = tp = fp = 0; double prev_prob = -1; auc = 0.0; // assuming names and probabilities same size; rocN_ set by getEstimated_and_Empirical_FDR() for (unsigned k = 0; k < names.size() && fp <= rocN_; k++) { double prob = probabilities[k]; unsigned tpChange = countTargets(names[k]); unsigned fpChange = names[k].size() - tpChange; //if ties activated count groups as 1 protein if (trivialGrouping_) { if (tpChange > 0) tpChange = 1; if (fpChange > 0) fpChange = 1; } tp += tpChange; fp += fpChange; //should only do it when fp changes and either of them is != 0 if (prev_prob != -1 && fp != 0 && tp != 0 && fp != prev_fp) { double trapezoid = trapezoid_area(fp,prev_fp,tp,prev_tp); prev_fp = fp; prev_tp = tp; auc += 
trapezoid; } prev_prob = prob; } unsigned normalizer = (tp * fp); if (normalizer > 0) { auc /= normalizer; } else { auc = 0.0; } return; }
void FidoInterface::getEstimated_and_Empirical_FDR(const std::vector<std::vector<string> > &names, const std::vector<double> &probabilities, std::vector<double> &empq, std::vector<double> &estq) { empq.clear(); estq.clear(); double fpCount = 0.0, tpCount = 0.0; double totalFDR = 0.0, estFDR = 0.0, empFDR = 0.0; double TargetDecoyRatio = (double)numberTargetProteins / (double)numberDecoyProteins; double previousEmpQ = 0.0; double previousEstQ = 0.0; if(updateRocN) rocN = 50; //NOTE no need to store more q values since they will not be taken into account while estimating MSE FDR divergence for (unsigned int k=0; (k<names.size() && (estFDR <= threshold)); k++) { double prob = probabilities[k]; if(tiesAsOneProtein) { unsigned tpChange = countTargets(names[k]); unsigned fpChange = names[k].size() - tpChange; fpCount += (double)fpChange; tpCount += (double)tpChange; if(countDecoyQvalue) { totalFDR += (prob) * (double)(tpChange + fpChange); estFDR = totalFDR / (tpCount + fpCount); } else { totalFDR += (prob) * (double)(tpChange); estFDR = totalFDR / (tpCount); } if(tpCount) empFDR = (fpCount * pi0 * TargetDecoyRatio) / tpCount; if(empFDR > 1.0 || std::isnan(empFDR) || std::isinf(empFDR)) empFDR = 1.0; if(estFDR > 1.0 || std::isnan(estFDR) || std::isinf(estFDR)) estFDR = 1.0; if(estFDR < previousEstQ) estFDR = previousEstQ; else previousEstQ = estFDR; if(empFDR < previousEmpQ) empFDR = previousEmpQ; else previousEmpQ = empFDR; if(updateRocN) { rocN = (unsigned)std::max(rocN,(unsigned)std::max(50,std::min((int)fpCount,500))); } estq.push_back(estFDR); empq.push_back(empFDR); } else { for(unsigned i=0; i<names[k].size(); i++) { std::string protein = names[k][i]; bool isdecoy = isDecoy(protein); if(isdecoy) { fpCount++; } else { tpCount++; } if(countDecoyQvalue) { totalFDR += (prob); estFDR = totalFDR / (tpCount + fpCount); } else if(tpCount) { if(!((bool)isdecoy)) totalFDR += (prob); estFDR = totalFDR / (tpCount); } if(tpCount) empFDR = (fpCount * pi0 * TargetDecoyRatio) / 
tpCount; if(empFDR > 1.0 || std::isnan(empFDR) || std::isinf(empFDR)) empFDR = 1.0; if(estFDR > 1.0 || std::isnan(estFDR) || std::isinf(estFDR)) estFDR = 1.0; if(estFDR < previousEstQ) estFDR = previousEstQ; else previousEstQ = estFDR; if(empFDR < previousEmpQ) empFDR = previousEmpQ; else previousEmpQ = empFDR; if(updateRocN) { rocN = (unsigned)std::max(rocN,(unsigned)std::max(50,std::min((int)fpCount,500))); } estq.push_back(estFDR); empq.push_back(empFDR); } } } return; }