示例#1
0
/*
* Impute missing values and standardize every SNP of a genotype matrix in place.
*
* For each SNP the mean and the standard deviation sqrt(E[x^2] - E[x]^2) are
* computed over the observed (non-NaN) individuals only. Observed values then
* have the mean subtracted and are rescaled; missing values (NaN) are replaced
* by 0.0, which is the mean of the centered SNP.
*
* Parameters: 
* SNPs [nIndividuals by nSNPs]:
*                       Matrix holding the input on entry and the result on exit.
*                       When ORDERF is defined it is indexed in column-major
*                       order (the nIndividuals values of one SNP are contiguous);
*                       otherwise it is indexed in row-major order (the nSNPs
*                       values of one individual are contiguous).
*                       NaNs will be set to 0.0 in the result.
* nIndividuals:         Number of individuals.
* nSNPs:                Number of SNPs.
* betaNotUnitVariance:  If true, each centered value is multiplied by
*                       BetaPdf(freq, betaA, betaB), where freq = mean/2 folded
*                       into [0, 0.5]; if false, each centered value is divided
*                       by the standard deviation of its SNP.
* betaA, betaB:         Passed unchanged to BetaPdf.
*
* Diagnostics:
*  - A SNP with no observed individual prints a message to stdout.
*  - With betaNotUnitVariance set, a SNP whose observed values sum to <= 0
*    prints a message to stderr and terminates the process with exit(1).
*  - A SNP whose standard deviation is NaN or <= 0 (an SNC, "Single Nucleotide
*    Constant") has ALL of its entries set to 0.0.
*  - Only the first frequency/std warning encountered is printed to stderr.
*/
void SUFFIX(ImputeAndZeroMeanSNPs)( 
	REAL *SNPs, 
	const size_t nIndividuals, 
	const size_t nSNPs, 
	const bool betaNotUnitVariance,
	const REAL betaA,
	const REAL betaB
	)
{
	bool seenSNC = false; //Keep track of this so that only one warning message is reported
#ifdef ORDERF

	for ( size_t iSnp = 0; iSnp < nSNPs; ++iSnp )
	{
		REAL *column = SNPs + iSnp * nIndividuals;   //the nIndividuals values of the current SNP

		REAL n_observed = 0.0;
		REAL sum_s  = 0.0;      //the sum of a SNP over all observed individuals
		REAL sum2_s = 0.0;      //the sum of the squares of the SNP over all observed individuals

		for( size_t ind = 0; ind < nIndividuals; ++ind )
		{
			const REAL value = column[ ind ];
			if ( value == value )   //false only for NaN
			{
				sum_s += value;
				sum2_s += value * value;
				++n_observed;
			}
		}

		if ( n_observed < 1.0 )
		{
			// The divisions below then yield NaN, which the SNC test further down catches.
			printf( "No individual observed for the SNP.\n");
		}

		const REAL mean_s  = sum_s  / n_observed;    //compute the mean over observed individuals for the current SNP
		const REAL mean2_s = sum2_s / n_observed;    //compute the mean of the squared SNP

		//When beta standardization is being done, check that data is 0,1,2
		if (betaNotUnitVariance && sum_s <= (REAL)0.0)
		{
			fprintf(stderr, "Observed SNP freq is %.2f. for a SNPs[:][%zu]\n", mean_s, iSnp );
			exit(1);
		}

		//The SNP frequency
		REAL freq = sum_s / (n_observed * (REAL)2.0);   //compute snp-freq as in the Visscher Height paper (Nat Gen, Yang et al 2010).

		if ((freq != freq) || (betaNotUnitVariance && ((freq >= (REAL)1.0) || (freq <= (REAL)0.0))))
		{
			if (!seenSNC)
			{
				seenSNC = true;
				fprintf(stderr, "Illegal SNP frequency: %.2f for SNPs[:][%zu]\n", freq, iSnp);
			}
		}

		const REAL variance = mean2_s - mean_s * mean_s;   //By the Cauchy Schwarz inequality this should never be negative (up to rounding)
		REAL std = sqrt( variance );                       //The standard deviation of the SNP

		bool isSNC = false;
		if ( (std != std) || (std <= (REAL)0.0) )
		{
			// a std == 0.0 means all SNPs have the same value (no variation or Single Nucleotide Constant (SNC))
			//   however ALL SNCs should have been removed in previous filtering steps
			//   This test now prevents a divide by zero error below
			if (!seenSNC)
			{
				seenSNC = true;
				// Report the offending value BEFORE it is replaced by the placeholder.
				fprintf(stderr, "std=%.2f has illegal value for SNPs[:][%zu]\n", std, iSnp );
			}
			std = 1.0;
			isSNC = true;
		}

		// The beta weight depends only on per-SNP quantities, so evaluate it once per
		// SNP instead of once per individual (assumes BetaPdf has no side effects).
		REAL betaScale = 1.0;
		if (betaNotUnitVariance && !isSNC)
		{
			if (freq > .5)
			{
				freq = 1.0 - freq;
			}
			betaScale = SUFFIX(BetaPdf)( freq, betaA, betaB );
		}

		for( size_t ind = 0; ind < nIndividuals; ++ind )
		{
			const REAL value = column[ ind ];
			if ( (value != value) || isSNC )
			{
				//missing value, or a SNP without variation
				column[ ind ] = 0.0;
			}
			else
			{
				const REAL centered = value - mean_s;     //subtract the mean from the data
				if (betaNotUnitVariance)
				{
					column[ ind ] = centered * betaScale;
				}
				else
				{
					column[ ind ] = centered / std;       //unit variance as well
				}
			}
		}
	}

#else //Order C

	// Make one pass through the data (by individual, because that is how it is laid out), collecting statistics
	std::vector<REAL> n_observed(nSNPs); //                                                C++ inits to 0's
	std::vector<REAL> sum_s(nSNPs);      //the sum of a SNP over all observed individuals. C++ inits to 0's
	std::vector<REAL> sum2_s(nSNPs);     //the sum of the squares of the SNP over all observed individuals.     C++ inits to 0's

	for( size_t ind = 0; ind < nIndividuals; ++ind)
	{
		const size_t rowStart = ind * nSNPs;
		for ( size_t iSnp = 0; iSnp < nSNPs; ++iSnp )
		{
			const REAL value = SNPs[rowStart+iSnp];
			if ( value == value )   //false only for NaN
			{
				sum_s[iSnp] += value;
				sum2_s[iSnp] += value * value;
				++n_observed[iSnp];
			}
		}
	}

	std::vector<REAL> mean_s(nSNPs);     //the mean over observed individuals of each SNP
	std::vector<REAL> std(nSNPs);        //the standard deviation of each SNP
	std::vector<REAL> betaScale(betaNotUnitVariance ? nSNPs : 0); //per-SNP beta weight (only used for beta standardization)
	std::vector<bool> isSNC(nSNPs);      // Is this a SNC (C++ inits to false)

	for ( size_t iSnp = 0; iSnp < nSNPs; ++iSnp )
	{
		if ( n_observed[iSnp] < 1.0 )
		{
			// The divisions below then yield NaN, which the SNC test further down catches.
			printf( "No individual observed for the SNP.\n");
		}

		mean_s[iSnp] = sum_s[iSnp] / n_observed[iSnp];            //compute the mean over observed individuals for the current SNP
		const REAL mean2_s = sum2_s[iSnp] / n_observed[iSnp];     //compute the mean of the squared SNP

		//When beta standardization is being done, check that data is 0,1,2
		if (betaNotUnitVariance && sum_s[iSnp] <= (REAL)0.0)
		{
			fprintf(stderr, "Observed SNP freq is %.2f. for a SNPs[:][%zu]\n", mean_s[iSnp], iSnp );
			exit(1);
		}

		REAL freq = sum_s[iSnp] / (n_observed[iSnp] * (REAL)2.0);   //compute snp-freq as in the Visscher Height paper (Nat Gen, Yang et al 2010).

		if ((freq != freq) || (betaNotUnitVariance && ((freq >= (REAL)1.0) || (freq <= (REAL)0.0))))
		{
			if (!seenSNC)
			{
				seenSNC = true;
				fprintf(stderr, "Illegal SNP frequency: %.2f for SNPs[:][%zu]\n", freq, iSnp);
			}
		}

		const REAL variance = mean2_s - mean_s[iSnp] * mean_s[iSnp];   //By the Cauchy Schwarz inequality this should never be negative (up to rounding)
		std[iSnp] = sqrt( variance );

		if ( (std[iSnp] != std[iSnp]) || (std[iSnp] <= (REAL)0.0) )
		{
			// a std == 0.0 means all SNPs have the same value (no variation or Single Nucleotide Constant (SNC))
			//   however ALL SNCs should have been removed in previous filtering steps
			//   This test now prevents a divide by zero error below
			if (!seenSNC)
			{
				seenSNC = true;
				// Report the offending value BEFORE it is replaced by the placeholder.
				fprintf(stderr, "std=%.2f has illegal value for SNPs[:][%zu]\n", std[iSnp], iSnp );
			}
			std[iSnp] = 1.0;
			isSNC[iSnp] = true;
		}

		// The beta weight depends only on per-SNP quantities, so evaluate it once per
		// SNP instead of once per matrix element (assumes BetaPdf has no side effects).
		if (betaNotUnitVariance && !isSNC[iSnp])
		{
			if (freq > .5)
			{
				freq = 1.0 - freq;
			}
			betaScale[iSnp] = SUFFIX(BetaPdf)( freq, betaA, betaB );
		}
	}

	// Second pass: center/scale the observed values and zero-fill the rest
	for( size_t ind = 0; ind < nIndividuals; ++ind)
	{
		const size_t rowStart = ind * nSNPs;
		for ( size_t iSnp = 0; iSnp < nSNPs; ++iSnp )
		{
			REAL value = SNPs[rowStart+iSnp];
			if ( (value != value) || isSNC[iSnp])
			{
				//missing value, or a SNP without variation
				value = 0.0;
			}
			else
			{
				value -= mean_s[iSnp];     //subtract the mean from the data
				if (betaNotUnitVariance )
				{
					value *= betaScale[iSnp];
				}
				else
				{
					value /= std[iSnp];        //unit variance as well
				}
			}
			SNPs[rowStart+iSnp] = value;
		}
	}
#endif
}
/*
* Compute (or reuse) per-SNP standardization statistics and, optionally, impute
* missing values and standardize every SNP of a genotype matrix in place.
*
* For each SNP the mean and the standard deviation sqrt(E[x^2] - E[x]^2) over
* the observed (non-NaN) individuals are either computed from SNPs and written
* to stats, or read back from stats. If requested, observed values then have
* the mean subtracted and are rescaled; missing values (NaN) are replaced by 0.0.
*
* Parameters: 
* SNPs [nIndividuals by nSNPs]:
*                       Matrix holding the input on entry and, when
*                       apply_in_place is true, the result on exit.
*                       When ORDERF is defined it is indexed in column-major
*                       order (the nIndividuals values of one SNP are contiguous);
*                       otherwise it is indexed in row-major order (the nSNPs
*                       values of one individual are contiguous).
*                       NaNs will be set to 0.0 in the result.
* nIndividuals:         Number of individuals.
* nSNPs:                Number of SNPs.
* betaNotUnitVariance:  If true, each centered value is multiplied by
*                       BetaPdf(freq, betaA, betaB), where freq = mean/2 folded
*                       into [0, 0.5]; if false, each centered value is divided
*                       by the standard deviation of its SNP.
* betaA, betaB:         Passed unchanged to BetaPdf.
* apply_in_place:       If false, SNPs is only read (and only when use_stats is
*                       false); no value of SNPs is modified.
* use_stats:            If true, mean and std are READ from stats and SNPs is not
*                       scanned for statistics. If false, they are computed from
*                       SNPs and WRITTEN to stats.
* stats [nSNPs by 2]:   Mean (column 0) and std (column 1) of every SNP, using
*                       the same ordering as SNPs: with ORDERF the mean of SNP i
*                       is stats[i] and its std is stats[i + nSNPs]; otherwise
*                       they are stats[2*i] and stats[2*i + 1].
*                       A std of +infinity marks a SNP without variation (an SNC,
*                       "Single Nucleotide Constant"); ALL entries of such a SNP
*                       are set to 0.0 when apply_in_place is true.
*
* Diagnostics (only when use_stats is false):
*  - A SNP with no observed individual prints a message to stdout.
*  - Only the first mean/std warning encountered is printed to stderr.
*/
void SUFFIX(ImputeAndZeroMeanSNPs)( 
	REAL *SNPs, 
	const size_t nIndividuals, 
	const size_t nSNPs, 
	const bool betaNotUnitVariance,
	const REAL betaA,
	const REAL betaB,
	const bool apply_in_place,
	const bool use_stats,
	REAL *stats
	)
{
	bool seenSNC = false; //Keep track of this so that only one warning message is reported
#ifdef ORDERF

	for ( size_t iSnp = 0; iSnp < nSNPs; ++iSnp )
	{
		REAL *column = SNPs + iSnp * nIndividuals;   //the nIndividuals values of the current SNP
		REAL mean_s;
		REAL std;
		bool isSNC;

		if (use_stats)
		{
			//Reuse previously computed statistics; an infinite std is the SNC marker
			mean_s = stats[iSnp];
			std = stats[iSnp + nSNPs];
			isSNC = isinf(std);
		}
		else
		{
			isSNC = false;
			REAL n_observed = 0.0;
			REAL sum_s = 0.0;       //the sum of a SNP over all observed individuals
			REAL sum2_s = 0.0;      //the sum of the squares of the SNP over all observed individuals

			for (size_t ind = 0; ind < nIndividuals; ++ind)
			{
				const REAL value = column[ind];
				if (value == value)   //false only for NaN
				{
					sum_s += value;
					sum2_s += value * value;
					++n_observed;
				}
			}

			if (n_observed < 1.0)
			{
				// The divisions below then yield NaN, which the SNC test further down catches.
				printf("No individual observed for the SNP.\n");
				//LATER make it work (in some form) for n of 0
			}

			mean_s = sum_s / n_observed;                 //compute the mean over observed individuals for the current SNP
			const REAL mean2_s = sum2_s / n_observed;    //compute the mean of the squared SNP

			if ((mean_s != mean_s) || (betaNotUnitVariance && ((mean_s > (REAL)2.0) || (mean_s < (REAL)0.0))))
			{
				if (!seenSNC)
				{
					seenSNC = true;
					fprintf(stderr, "Illegal SNP mean: %.2f for SNPs[:][%zu]\n", mean_s, iSnp);
				}
			}

			const REAL variance = mean2_s - mean_s * mean_s;   //By the Cauchy Schwarz inequality this should never be negative (up to rounding)
			std = sqrt(variance);

			if ((std != std) || (std <= (REAL)0.0))
			{
				// a std == 0.0 means all SNPs have the same value (no variation or Single Nucleotide Constant (SNC))
				//   however ALL SNCs should have been removed in previous filtering steps
				//   This test now prevents a divide by zero error below
				isSNC = true;
				if (!seenSNC)
				{
					seenSNC = true;
					// Report the offending value BEFORE it is replaced by the SNC marker.
					fprintf(stderr, "std=%.2f has illegal value for SNPs[:][%zu]\n", std, iSnp);
				}
				std = std::numeric_limits<REAL>::infinity();
			}

			stats[iSnp] = mean_s;
			stats[iSnp + nSNPs] = std;
		}

		if (!apply_in_place)
		{
			continue;   //statistics only; leave the data untouched
		}

		// The beta weight depends only on per-SNP quantities, so evaluate it once per
		// SNP instead of once per individual (assumes BetaPdf has no side effects).
		REAL betaScale = 1.0;
		if (betaNotUnitVariance && !isSNC)
		{
			//compute snp-freq as in the Visscher Height paper (Nat Gen, Yang et al 2010).
			REAL freq = mean_s / 2.0;
			if (freq > .5)
			{
				freq = 1.0 - freq;
			}
			betaScale = SUFFIX(BetaPdf)(freq, betaA, betaB);
		}

		for (size_t ind = 0; ind < nIndividuals; ++ind)
		{
			const REAL value = column[ind];
			if ((value != value) || isSNC)
			{
				//missing value, or a SNP without variation
				column[ind] = 0.0;
			}
			else
			{
				const REAL centered = value - mean_s;     //subtract the mean from the data
				if (betaNotUnitVariance)
				{
					column[ind] = centered * betaScale;
				}
				else
				{
					column[ind] = centered / std;         //unit variance as well
				}
			}
		}
	}

#else //Order C

	std::vector<REAL> mean_s(nSNPs);  //the mean over observed individuals of each SNP
	std::vector<REAL> std(nSNPs);     //the standard deviation of each SNP
	std::vector<bool> isSNC(nSNPs);   // Is this a SNC (C++ inits to false)
	if (use_stats)
	{
		//Reuse previously computed statistics; an infinite std is the SNC marker
		for (size_t iSnp = 0; iSnp < nSNPs; ++iSnp)
		{
			mean_s[iSnp] = stats[iSnp * 2];
			std[iSnp] = stats[iSnp * 2 + 1];
			isSNC[iSnp] = isinf(std[iSnp]);
		}
	}
	else
	{
		// Make one pass through the data (by individual, because that is how it is laid out), collecting statistics
		std::vector<REAL> n_observed(nSNPs); //                                                C++ inits to 0's
		std::vector<REAL> sum_s(nSNPs);      //the sum of a SNP over all observed individuals. C++ inits to 0's
		std::vector<REAL> sum2_s(nSNPs);     //the sum of the squares of the SNP over all observed individuals.     C++ inits to 0's

		for( size_t ind = 0; ind < nIndividuals; ++ind)
		{
			const size_t rowStart = ind * nSNPs;
			for ( size_t iSnp = 0; iSnp < nSNPs; ++iSnp )
			{
				const REAL value = SNPs[rowStart+iSnp];
				if ( value == value )   //false only for NaN
				{
					sum_s[iSnp] += value;
					sum2_s[iSnp] += value * value;
					++n_observed[iSnp];
				}
			}
		}

		for (size_t iSnp = 0; iSnp < nSNPs; ++iSnp)
		{
			if (n_observed[iSnp] < 1.0)
			{
				// The divisions below then yield NaN, which the SNC test further down catches.
				printf("No individual observed for the SNP.\n");
			}

			mean_s[iSnp] = sum_s[iSnp] / n_observed[iSnp];          //compute the mean over observed individuals for the current SNP
			const REAL mean2_s = sum2_s[iSnp] / n_observed[iSnp];   //compute the mean of the squared SNP

			if ((mean_s[iSnp] != mean_s[iSnp]) || (betaNotUnitVariance && ((mean_s[iSnp] > (REAL)2.0) || (mean_s[iSnp] < (REAL)0.0))))
			{
				if (!seenSNC)
				{
					seenSNC = true;
					fprintf(stderr, "Illegal SNP mean: %.2f for SNPs[:][%zu]\n", mean_s[iSnp], iSnp);
				}
			}

			const REAL variance = mean2_s - mean_s[iSnp] * mean_s[iSnp];   //By the Cauchy Schwarz inequality this should never be negative (up to rounding)
			std[iSnp] = sqrt(variance);

			if ((std[iSnp] != std[iSnp]) || (std[iSnp] <= (REAL)0.0))
			{
				// a std == 0.0 means all SNPs have the same value (no variation or Single Nucleotide Constant (SNC))
				//   however ALL SNCs should have been removed in previous filtering steps
				//   This test now prevents a divide by zero error below
				isSNC[iSnp] = true;
				if (!seenSNC)
				{
					seenSNC = true;
					// Report the offending value BEFORE it is replaced by the SNC marker.
					fprintf(stderr, "std=%.2f has illegal value for SNPs[:][%zu]\n", std[iSnp], iSnp);
				}
				std[iSnp] = std::numeric_limits<REAL>::infinity();
			}
			stats[iSnp * 2] = mean_s[iSnp];
			stats[iSnp * 2 + 1] = std[iSnp];
		}
	}

	if (apply_in_place)
	{
		// The beta weight depends only on per-SNP quantities, so evaluate it once per
		// SNP instead of once per matrix element (assumes BetaPdf has no side effects).
		std::vector<REAL> betaScale(betaNotUnitVariance ? nSNPs : 0);
		if (betaNotUnitVariance)
		{
			for (size_t iSnp = 0; iSnp < nSNPs; ++iSnp)
			{
				if (isSNC[iSnp])
				{
					continue;   //every entry of an SNC becomes 0.0; no weight needed
				}
				//compute snp-freq as in the Visscher Height paper (Nat Gen, Yang et al 2010).
				REAL freq = mean_s[iSnp] / 2.0;
				if (freq > .5)
				{
					freq = 1.0 - freq;
				}
				betaScale[iSnp] = SUFFIX(BetaPdf)(freq, betaA, betaB);
			}
		}

		for (size_t ind = 0; ind < nIndividuals; ++ind)
		{
			const size_t rowStart = ind * nSNPs;
			for (size_t iSnp = 0; iSnp < nSNPs; ++iSnp)
			{
				REAL value = SNPs[rowStart + iSnp];
				if ((value != value) || isSNC[iSnp])
				{
					//missing value, or a SNP without variation
					value = 0.0;
				}
				else
				{
					value -= mean_s[iSnp];     //subtract the mean from the data
					if (betaNotUnitVariance)
					{
						value *= betaScale[iSnp];
					}
					else
					{
						value /= std[iSnp];        //unit variance as well
					}
				}
				SNPs[rowStart + iSnp] = value;
			}
		}
	}
#endif
}