void PearsonCorrelation::glance(uint32_t user_u, uint32_t user_v) {
    std::map<uint32_t, MovieRate*>::iterator it = _userid_movierate_map.find(user_u);
    if (it != _userid_movierate_map.end()) {
        it->second->glance();
    }
    it = _userid_movierate_map.find(user_v);
    if (it != _userid_movierate_map.end()) {
        it->second->glance();
    }
    double score;
    if (getCorrelation(user_u, user_v, score))
        printf("score: %lf\n", score);
    else
        printf("no score\n");
}
/******************************************* getSF *******************************************/
/**
 * Adds up getCorrelation(i,j) over every ordered index pair with
 * 0 <= i < N and 0 <= j < N (both orderings and the i == j terms are
 * included; no j<i restriction is applied) and returns that sum divided by N.
 */
double Simulation::getSF()
{
  double total = 0;

  // Full N x N sweep, outer index first.
  for( int row=0; row<N; ++row )
    for( int col=0; col<N; ++col )
      total += getCorrelation(row,col);

  return total / N;
}
/**
 * Permutation-based cis scan that writes one result line per cis variant.
 *
 * Genotypes are (optionally) residualized for covariates and normalized once
 * up front. Then, for every phenotype p:
 *   1.1 the variants whose cis distance to phenotype_start[p] is within
 *       cis_window are collected (span-aware distance, see inline comments);
 *   1.2 the phenotype is copied, optionally residualized, and normalized;
 *   1.3 the correlation of every cis variant with the phenotype is computed
 *       and the strongest one (ties: smallest absolute distance) is tracked;
 *   1.4 the phenotype is shuffled repeatedly and, per permutation, the
 *       strongest correlation among the cis variants is recorded;
 *   1.5 degrees of freedom and Beta shape parameters are derived from the
 *       permutation correlations (learnDF / mleBeta);
 *   1.6 one line per cis variant is written:
 *         phenotype id, number of cis variants, beta_shape1, beta_shape2,
 *         true_df, variant id, cis distance, nominal p-value, slope,
 *         (nBetterCorrelation + 1) / (countPermutations + 1),
 *         pbeta(p-value at true_df, beta_shape1, beta_shape2, 1, 0)
 *       or a single NA line when the phenotype has no usable cis variant.
 *
 * @param fout           path passed to the ofile the results are written to
 * @param nPermutations  stopping rule for the permutation loop:
 *                         size 1: stop once countPermutations >= [0];
 *                         size 2: stop once nBetterCorrelation >= [0]
 *                                 or countPermutations >= [1];
 *                         size 3: stop once countPermutations >= [0] and
 *                                 (nBetterCorrelation >= [1]
 *                                  or countPermutations >= [2]).
 *                       NOTE(review): any other size never sets the stop flag,
 *                       so the loop would not terminate -- callers must pass
 *                       1, 2 or 3 values.
 */
void data::runPermutationExtended(string fout, vector < int > nPermutations) {

	//0. Prepare genotypes
	vector < double > genotype_sd = vector < double > (genotype_count, 0.0);
	vector < double > phenotype_sd = vector < double > (phenotype_count, 0.0);
	if (covariate_count > 0) {
		LOG.println("\nCorrecting genotypes for covariates");
		covariate_engine->residualize(genotype_orig);
	}
	// Per-variant standard deviations are taken before normalize(genotype_orig) is applied; getSlope() needs them in 1.6.
	for (int g = 0 ; g < genotype_count ; g ++) genotype_sd[g] = RunningStat(genotype_orig[g]).StandardDeviation();
	normalize(genotype_orig);

	//1. Loop over phenotypes
	ofile fdo (fout);
	for (int p = 0 ; p < phenotype_count ; p ++) {

		LOG.println("\nProcessing gene [" + phenotype_id[p] + "]");

		//1.1. Enumerate all genotype-phenotype pairs within cis-window
		vector < int > targetGenotypes, targetDistances;
		for (int g = 0 ; g < genotype_count ; g ++) {
			int cisdistance;
			int startdistance = genotype_pos[g] - phenotype_start[p];
			int enddistance = genotype_end[g] - phenotype_start[p];

			// for INVs ignore the span and define the cisdistance
			// as the distance from the closest breakpoint to the phenotype_start
			if (genotype_vartype[g].compare("INV") == 0) {
				if (abs(startdistance) <= abs(enddistance))
					cisdistance = startdistance;
				else
					cisdistance = enddistance;
			}

			// for the variants with span (DEL, DUP, MEI), cisdistance is zero
			// if the phenotype_start falls within the span, and the distance to
			// the closest edge otherwise
			// BNDs get processed here as well, but their END coordinate is the
			// same as the START coordinate.
			else {
				if (startdistance < 0 && enddistance > 0) { // phenotype start lies inside the span
					cisdistance = 0;
				}
				else if (startdistance >= 0)
					cisdistance = startdistance;
				else
					cisdistance = enddistance;
			}

			if (abs(cisdistance) <= cis_window) {
				targetGenotypes.push_back(g);
				targetDistances.push_back(cisdistance);
			}
		}
		LOG.println("  * Number of variants in cis = " + sutils::int2str(targetGenotypes.size()));

		//1.2. Copy original data
		vector < float > phenotype_curr = phenotype_orig[p];
		if (covariate_count > 0) covariate_engine->residualize(phenotype_curr);
		phenotype_sd[p] = RunningStat(phenotype_curr).StandardDeviation();
		normalize(phenotype_curr);

		//1.3. Nominal pass: scan cis-window & compute statistics
		double bestCorr = 0.0;
		vector < double > targetCorrelations;
		int bestDistance = ___LI___, bestIndex = -1;
		for (int g = 0 ; g < targetGenotypes.size() ; g ++) {
			double corr = getCorrelation(genotype_orig[targetGenotypes[g]], phenotype_curr);
			targetCorrelations.push_back(corr);
			// Distances are signed, so the tie-break must compare absolute values on
			// BOTH sides; comparing against the raw (possibly negative) bestDistance
			// would never let a closer variant replace an upstream best.
			const bool stronger = abs(corr) > abs(bestCorr);
			const bool tiedButCloser = abs(corr) == abs(bestCorr) && abs(targetDistances[g]) < abs(bestDistance);
			if (stronger || tiedButCloser) {
				bestCorr = corr;
				bestDistance = targetDistances[g];
				bestIndex = targetGenotypes[g];
			}
		}
		if (targetGenotypes.size() > 0) LOG.println("  * Best correlation = " + sutils::double2str(bestCorr, 4));

		//1.4. Permutation pass:
		// NOTE(review): this loop also runs when there are no cis variants; it is left
		// untouched because skipping it would change how many times random_shuffle
		// is invoked and therefore the permutations seen by later phenotypes.
		bool done = false;
		int countPermutations = 0, nBetterCorrelation = 0;
		vector < double > permCorr;
		do {
			double bestCperm = 0.0;
			phenotype_curr = phenotype_orig[p];
			random_shuffle(phenotype_curr.begin(), phenotype_curr.end());
			if (covariate_count > 0) covariate_engine->residualize(phenotype_curr);
			normalize(phenotype_curr);
			for (int g = 0 ; g < targetGenotypes.size() ; g ++) {
				double corr = getCorrelation(genotype_orig[targetGenotypes[g]], phenotype_curr);
				if (abs(corr) > abs(bestCperm)) bestCperm = corr;
			}
			if (abs(bestCperm) >= abs(bestCorr)) nBetterCorrelation++;
			permCorr.push_back(bestCperm);
			countPermutations++;

			// Stopping rules, selected by the number of values supplied (see header comment).
			if (nPermutations.size() == 1 && countPermutations >= nPermutations[0]) done = true;
			if (nPermutations.size() == 2 && (nBetterCorrelation >= nPermutations[0] || countPermutations >= nPermutations[1])) done = true;
			if (nPermutations.size() == 3 && (countPermutations >= nPermutations[0]) && (nBetterCorrelation >= nPermutations[1] || countPermutations >= nPermutations[2])) done = true;
		} while (!done);
		if (targetGenotypes.size() > 0) LOG.println("  * Number of permutations = " + sutils::int2str(nBetterCorrelation) + " / " + sutils::int2str(countPermutations));

		//1.5. Calculate effective DFs & Beta distribution parameters
		vector < double > permPvalues;
		// Nominal degrees of freedom; true_df starts from it and may be re-estimated by learnDF().
		const double nominal_df = sample_count - 2 - ((covariate_count>0)?covariate_engine->nCovariates():0);
		double true_df = nominal_df;
		double mean = 0.0, variance = 0.0, beta_shape1 = 1.0, beta_shape2 = 1.0;
		if (targetGenotypes.size() > 0) {
			//Estimate number of degrees of freedom
			if (putils::variance(permCorr, putils::mean(permCorr)) != 0.0) {
				learnDF(permCorr, true_df);
				//LOG.println("  * Effective degree of freedom = " + sutils::double2str(true_df, 4));
			}
			//Compute mean and variance of p-values
			for (int c = 0 ; c < permCorr.size() ; c ++) permPvalues.push_back(getPvalue(permCorr[c], true_df));
			for (int pv = 0 ; pv < permPvalues.size() ; pv++) mean += permPvalues[pv];
			mean /= permPvalues.size();
			for (int pv = 0 ; pv < permPvalues.size() ; pv++) variance += (permPvalues[pv] - mean) * (permPvalues[pv] - mean);
			// NOTE(review): with a single permutation this divides by zero and the
			// shape estimates below become non-finite -- confirm callers never ask for 1.
			variance /= (permPvalues.size() - 1);
			//Estimate shape1 & shape2 (moment-matching start values)
			if (targetGenotypes.size() > 1 && mean != 1.0) {
				beta_shape1 = mean * (mean * (1 - mean ) / variance - 1);
				beta_shape2 = beta_shape1 * (1 / mean - 1);
				if (targetGenotypes.size() > 10) mleBeta(permPvalues, beta_shape1, beta_shape2);	//ML estimate if more than 10 variant in cis
			}
			LOG.println("  * Beta distribution parameters = " + sutils::double2str(beta_shape1, 4) + " " + sutils::double2str(beta_shape2, 4));
		}

		//1.6. Writing results
		if (targetGenotypes.size() > 0 && bestIndex >= 0) {
			for (int g = 0 ; g < targetGenotypes.size() ; g ++) {
				fdo << phenotype_id[p] << " " << targetGenotypes.size();
				fdo << " " << beta_shape1 << " " << beta_shape2 << " " << true_df;
				double pval_fdo = getPvalue(targetCorrelations[g], true_df);
				double pval_nom = getPvalue(targetCorrelations[g], nominal_df);
				// Each line describes variant g, so its slope must be scaled by that
				// variant's own genotype SD, not by the SD of the best variant.
				double slope = getSlope(targetCorrelations[g], phenotype_sd[p], genotype_sd[targetGenotypes[g]]);
				fdo << " " << genotype_id[targetGenotypes[g]];
				fdo << " " << targetDistances[g];
				fdo << " " << pval_nom;
				fdo << " " << slope;
				fdo << " " << (nBetterCorrelation + 1) * 1.0 / (countPermutations + 1.0);
				fdo << " " << pbeta(pval_fdo, beta_shape1, beta_shape2, 1, 0);
				fdo << endl;
			}
		}
		// NOTE(review): this placeholder row has 10 fields while data rows have 11 -- confirm what downstream parsers expect.
		else fdo << phenotype_id[p] << " NA NA NA NA NA NA NA NA NA" << endl;

		LOG.println("  * Progress = " + sutils::double2str((p+1) * 100.0 / phenotype_count, 1) + "%");
	}
	fdo.close();
}
// Example #4
// 0
/**
 * Conditional cis scan: finds successive signals per phenotype with a
 * forward pass, then assigns candidate variants to signals in a backward pass.
 *
 * For every phenotype p with at least one variant within cis_window of
 * phenotype_start[p]:
 *   - Forward pass (at most MAX_ANALYSIS_DEPTH rounds): the phenotype is
 *     residualized on the covariate engine plus the best variants found so
 *     far (pushed as soft covariates), each tested variant is residualized
 *     and correlated with it, and the strongest variant of the round is kept
 *     if at least one tested variant has p-value <= phenotype_threshold[p].
 *     The first round without such a variant ends the pass.
 *   - Exactly one signal: every variant whose first-round p-value passes the
 *     threshold is written with rank 0.
 *   - Several signals: for each signal, p-values are recomputed conditioning
 *     on all OTHER signals; each variant keeps only its smallest conditional
 *     p-value, and a variant is written for a signal when it is that signal's
 *     best variant or its kept p-value passes the threshold.
 *
 * Output line: phenotype id, signal index, variant id, cis distance,
 * first-round p-value, conditional p-value (first-round value again in the
 * single-signal case), 1/0 flag telling whether the variant is the signal's
 * best variant.
 *
 * @param fout  path passed to the ofile the results are written to
 * @param full  when true every cis variant is tested in every round; when
 *              false, rounds after the first (and the backward pass) only
 *              test variants whose first-round p-value passed the threshold
 */
void data::runMapping(string fout, bool full) {
	ofile fdo (fout);
	for (int p = 0 ; p < phenotype_count ; p ++) {

		LOG.println("\nProcessing gene [" + phenotype_id[p] + "]");

		// Variants whose position is within cis_window of the phenotype start.
		vector < int > cisVariants, cisOffsets;
		for (int v = 0 ; v < genotype_count ; v ++) {
			const int offset = genotype_pos[v] - phenotype_start[p];
			if (abs(offset) <= cis_window) {
				cisVariants.push_back(v);
				cisOffsets.push_back(offset);
			}
		}
		LOG.println("  * Number of variants in cis = " + sutils::int2str(cisVariants.size()));

		if (!cisVariants.empty()) {

			const double threshold = phenotype_threshold[p];
			LOG.println("  * Nominal significance threshold = " + sutils::double2str(threshold));

			// 1. Forward pass: Learn number of independent signals and Map the best candidates
			vector < double > roundBestCorr (MAX_ANALYSIS_DEPTH, 0.0);
			vector < double > nominalP (cisVariants.size(), 2.0);	// 2.0 = "not tested"
			vector < int > signals;

			for (int depth = 0 ; depth < MAX_ANALYSIS_DEPTH ; depth ++) {
				int hits = 0;
				vector < float > pheno = phenotype_orig[p];
				// Restore pristine genotypes for the cis range before residualizing them in place.
				copy(genotype_orig.begin() + cisVariants[0], genotype_orig.begin() + cisVariants.back() + 1, genotype_curr.begin() + cisVariants[0]);
				vector < int > candidateSignals = signals;
				candidateSignals.push_back(-1);

				//Covariates + Best signals
				covariate_engine->clearSoft();
				for (int s = 0 ; s < signals.size() ; s ++) covariate_engine->pushSoft(genotype_orig[signals[s]]);
				covariate_engine->residualize(pheno);
				normalize(pheno);

				for (int k = 0 ; k < cisVariants.size() ; k ++) {
					const bool tested = (depth == 0) || full || (nominalP[k] <= threshold);
					if (!tested) continue;

					const int variant = cisVariants[k];
					covariate_engine->residualize(genotype_curr[variant]);
					normalize(genotype_curr[variant]);
					double corr = getCorrelation(genotype_curr[variant], pheno);
					double pvalue = getPvalue(corr, sample_count - 2 - covariate_engine->nCovariates());
					if (abs(corr) > abs(roundBestCorr[depth])) {
						roundBestCorr[depth] = corr;
						candidateSignals[depth] = variant;
					}
					if (depth == 0) nominalP[k] = pvalue;
					if (pvalue <= threshold) hits ++;
				}

				// No significant variant this round: discard the candidate and stop.
				if (hits == 0) break;
				signals = candidateSignals;
			}
			LOG.println("  * Number of independent signals found = " + sutils::int2str(signals.size()));

			if (signals.size() == 1) {
				int reported = 0;
				for (int k = 0 ; k < cisVariants.size() ; k ++) {
					if (!(nominalP[k] <= threshold)) continue;
					const bool isBest = (signals[0] == cisVariants[k]);
					fdo << phenotype_id[p] << " 0 " << genotype_id[cisVariants[k]];
					fdo << " " << cisOffsets[k];
					fdo << " " << nominalP[k] << " " << nominalP[k];
					fdo << " " << isBest << endl;
					reported ++;
				}
				LOG.println("  * Number of candidate QTLs reported for rank 1 = " + sutils::int2str(reported));
			} else if (signals.size() > 1) {
				//2. Backward pass: Determine candidate variants and classify them
				vector < vector < double > > conditionalP (signals.size(), vector < double > (cisVariants.size(), 2.0));
				for (int rank = signals.size() - 1 ; rank >= 0 ; rank --) {
					vector < float > pheno = phenotype_orig[p];
					copy(genotype_orig.begin() + cisVariants[0], genotype_orig.begin() + cisVariants.back() + 1, genotype_curr.begin() + cisVariants[0]);

					// Condition on every signal except the one being evaluated.
					vector < int > otherSignals = signals;
					otherSignals.erase(otherSignals.begin() + rank);

					covariate_engine->clearSoft();
					for (int s = 0 ; s < otherSignals.size() ; s ++) covariate_engine->pushSoft(genotype_orig[otherSignals[s]]);
					covariate_engine->residualize(pheno);
					normalize(pheno);

					for (int k = 0 ; k < cisVariants.size() ; k ++) {
						if (!(full || nominalP[k] <= threshold)) continue;
						const int variant = cisVariants[k];
						covariate_engine->residualize(genotype_curr[variant]);
						normalize(genotype_curr[variant]);
						double corr = getCorrelation(genotype_curr[variant], pheno);
						conditionalP[rank][k] = getPvalue(corr, sample_count - 2 - covariate_engine->nCovariates());
					}
				}

				// Per variant, keep only the smallest conditional p-value; all others are reset to 2.0.
				for (int k = 0 ; k < cisVariants.size() ; k ++) {
					double smallest = 1.1;
					for (int rank = 0; rank < signals.size() ; rank ++) {
						if (conditionalP[rank][k] < smallest) smallest = conditionalP[rank][k];
					}
					for (int rank = 0; rank < signals.size() ; rank ++) {
						if (conditionalP[rank][k] != smallest) conditionalP[rank][k] = 2.0;
					}
				}

				// Report, per signal, its best variant plus every variant assigned to it that passes the threshold.
				vector < int > reported (signals.size(), 0);
				for (int rank = 0; rank < signals.size() ; rank ++) {
					for (int k = 0 ; k < cisVariants.size() ; k ++) {
						const bool isBest = (signals[rank] == cisVariants[k]);
						if (!isBest && !(conditionalP[rank][k] <= threshold)) continue;
						fdo << phenotype_id[p] << " " << rank << " " << genotype_id[cisVariants[k]];
						fdo << " " << cisOffsets[k];
						fdo << " " << nominalP[k] << " " << conditionalP[rank][k];
						fdo << " " << isBest << endl;
						reported[rank] ++;
					}
				}

				// Per-signal summary in the log.
				for (int rank = 0; rank < signals.size() ; rank ++)
					LOG.println("  * Number of candidate QTLs reported for rank "+ sutils::int2str(rank) + " = " + sutils::int2str(reported[rank]));
			}
		}

		LOG.println("  * Progress = " + sutils::double2str((p+1) * 100.0 / phenotype_count, 1) + "%");
	}
	fdo.close();
}