Example #1
0
File: data.cpp Project: fw1121/eems
void Data::bed2diffs_v1()
{
  std::string diffsfile = datapath + ".diffs";
  std::string countfile = datapath + ".count";
  std::string orderfile = datapath + ".order";
  std::ofstream outdiffs(diffsfile.c_str(), std::ios::out);
  std::ofstream outcount(countfile.c_str(), std::ios::out);
  std::ofstream outorder(orderfile.c_str(), std::ios::out);

  outdiffs.precision(12);
  outdiffs.setf(std::ios::fixed,std::ios::floatfield);
  
  size_t nPairs = nIndiv*(nIndiv-1)/2;
  size_t nSitesProcessed = 0;

  // Read the genotypes in serial, compute the differences in parallel
  snp_t *snps = (snp_t *) malloc( sizeof(snp_t)*nIndiv );
  size_t *a = (size_t *) malloc( sizeof(size_t)*nPairs );
  size_t *b = (size_t *) malloc( sizeof(size_t)*nPairs );
  double *diffs = (double *) malloc( sizeof(double)*nPairs );
  double *pairs = (double *) malloc( sizeof(double)*nPairs );

  for (size_t i = 0 ; i<(nIndiv-1) ; i++ ) {
    for (size_t j = i+1 ; j<nIndiv ; j++ ) {
      size_t ij = Index(i,j);
      diffs[ij] = 0.0;
      pairs[ij] = 0.0;
      a[ij] = i;
      b[ij] = j;
    }
  }

  while (pio_next_row( &plink_file, snps ) == PIO_OK) {

    nSitesProcessed++;
    
    #pragma omp parallel for
    for (size_t ij = 0 ; ij < nPairs ; ij++ ) {
      size_t zi = snps[a[ij]];
      size_t zj = snps[b[ij]];
      if ((zi!=PLINK_NA)&&(zj!=PLINK_NA)) {
	diffs[ij] += (zi - zj)*(zi - zj);
	pairs[ij] += 1.0;
      }
    }
  }

  std::cout << "Computed average pairwise differences across " << nSitesProcessed << " SNPs" << std::endl;

  if (!outdiffs.is_open() || !outorder.is_open())
    {
      std::cerr << "[Data::bed2diffs] Error writing output files" << std::endl;
      exit(1);
    }   

  for (size_t i = 0 ; i<nIndiv ; i++ ) {

    struct pio_sample_t *sample = pio_get_sample( &plink_file, i );
    outorder << sample->fid << " " << sample->iid << std::endl;

    for (size_t j = 0 ; j<nIndiv ; j++ ) {
      if (i==j) {
	outdiffs << " " << 0;
      } else {
	size_t ij = Index(i,j);
	outdiffs << " " << diffs[ij]/pairs[ij];
	outcount << " " << pairs[ij];
      }
    }
    outdiffs << std::endl;
    outcount << std::endl;
  }
  
  outdiffs.close( );
  outcount.close( );
  outorder.close( );
  free( a );
  free( b );
  free( snps );
  free( diffs );
  free( pairs );
}
Example #2
0
void Data::bed2diffs_v2()
{
  std::string diffsfile = datapath + ".diffs";
  std::string orderfile = datapath + ".order";
  std::ofstream outdiffs(diffsfile.c_str(), std::ios::out);
  std::ofstream outorder(orderfile.c_str(), std::ios::out);

  outdiffs.precision(12);
  outdiffs.setf(std::ios::fixed,std::ios::floatfield);
  
  size_t nPairs = nIndiv*(nIndiv-1)/2;
  size_t nSitesProcessed = 0;

  // Read the genotypes in serial, compute the differences in parallel
  snp_t *snps = (snp_t *) malloc( sizeof(snp_t)*nIndiv );
  size_t *a = (size_t *) malloc( sizeof(size_t)*nPairs );
  size_t *b = (size_t *) malloc( sizeof(size_t)*nPairs );
  double *diffs = (double *) malloc( sizeof(double)*nPairs );
  double *count = (double *) malloc( sizeof(double)*nPairs );

  for (size_t i = 0 ; i < (nIndiv-1) ; i++ ) {
    for (size_t j = i+1 ; j < nIndiv ; j++ ) {
      size_t ij = Index(i, j);
      diffs[ij] = 0.0;
      count[ij] = 0.0;
      a[ij] = i;
      b[ij] = j;
    }
  }

  while (pio_next_row( &plink_file, snps ) == PIO_OK) {

    nSitesProcessed++;

    // Compute the observed genotype mean
    int sumGeno = 0, nObsrvd = 0;
    for (size_t i = 0 ; i < nIndiv ; i++ ) {
      size_t zi = snps[i];
      if (zi != PLINK_NA) { 
	sumGeno += zi; nObsrvd += 1;
      }
    }
    double aveGeno = sumGeno / (double)nObsrvd;

    #pragma omp parallel for
    for (size_t ij = 0 ; ij < nPairs ; ij++ ) {
      // If a genotype missing, impute it with the average genotype (which is a double, not an int)
      double zi = (double)snps[a[ij]];
      double zj = (double)snps[b[ij]];
      if (zi == PLINK_NA) { zi = aveGeno; }
      if (zj == PLINK_NA) { zj = aveGeno; }
      diffs[ij] += (zi - zj) * (zi - zj);
      count[ij] += 1.0;
    }
  }

  std::cout << "Computed average pairwise differences across " << nSitesProcessed << " SNPs" << std::endl;

  if (!outdiffs.is_open() || !outorder.is_open())
    {
      std::cerr << "[Data::bed2diffs] Error writing output files" << std::endl;
      exit(1);
    }   

  for (size_t i = 0 ; i < nIndiv ; i++ ) {

    struct pio_sample_t *sample = pio_get_sample( &plink_file, i );
    outorder << sample->fid << " " << sample->iid << std::endl;

    for (size_t j = 0 ; j < nIndiv ; j++ ) {
      if (i == j) {
	outdiffs << " " << 0;
      } else {
	size_t ij = Index(i, j);
	outdiffs << " " << diffs[ij] / count[ij];
      }
    }
    outdiffs << std::endl;
  }
  
  outdiffs.close( );
  outorder.close( );
  free( a );
  free( b );
  free( snps );
  free( diffs );
  free( count );
}