コード例 #1
0
ファイル: PlinkOutputFile.cpp プロジェクト: ernfrid/polymutt
/**
 * Write a genotype matrix to the BED stream (this->fpBed), marker by marker.
 *
 * Genotypes are packed four per byte; the slot inside the byte is the person
 * index modulo 4. A byte is written as soon as its fourth slot is filled, and
 * a partially filled byte is written at the end of every marker so that each
 * marker starts on a fresh byte (the same byte layout that
 * PlinkOutputFile::extractBED() emits).
 *
 * @param mat      genotype matrix, read as (*mat)[marker][person]
 * @param nPeople  number of people to write for every marker
 * @param nMarker  number of markers to write
 */
void PlinkOutputFile::writeBED(SimpleMatrix* mat, int nPeople, int nMarker){
    for (int m = 0; m < nMarker; m++){
        // Start every marker with an empty byte so no bits leak across markers.
        unsigned char c = 0;
        for (int i = 0; i < nPeople; i++) {
            int offset = i & (4 - 1);
            int geno = static_cast<int>((*mat)[m][i]);
            switch(geno){
                case HOM_REF:
                    // Was HET in the previous version (copy-paste of the next case).
                    setGenotype(&c, offset, HOM_REF);
                    break;
                case HET:
                    setGenotype(&c, offset, HET);
                    break;
                case HOM_ALT:
                    setGenotype(&c, offset, HOM_ALT);
                    break;
                default:
                    // Anything that is not a recognised genotype code is stored as missing.
                    setGenotype(&c, offset, MISSING);
                    break;
            }
            // The flush must happen inside the per-person loop: once the fourth
            // slot is filled the byte is complete and has to be written before
            // the next person reuses slot 0.
            if (offset == 3) {
                fwrite(&c, sizeof(char), 1, this->fpBed);
                c = 0;
            }
        }
        // Write the trailing, partially filled byte of this marker (if any).
        if (nPeople > 0 && nPeople % 4 != 0) {
            fwrite(&c, sizeof(char), 1, this->fpBed);
        }
    }
}
コード例 #2
0
/**
 * Copy the 2-bit genotypes of the selected samples and SNPs from `pin` into
 * this object's BED stream (this->fpBed).
 *
 * Output is written SNP by SNP. Within a SNP, genotypes are packed four per
 * byte in the order given by `sampleIdx`; a byte is written whenever its
 * fourth slot is filled, and a partially filled byte is written at the end of
 * the SNP so the next SNP begins on a new byte.
 *
 * @param pin        source of genotypes, queried via get2BitGenotype(sample, snp)
 * @param sampleIdx  sample indices to extract, in output order
 * @param snpIdx     SNP indices to extract, in output order
 * @return always 0
 */
int PlinkOutputFile::extractBED(PlinkInputFile& pin,
                                const std::vector<int>& sampleIdx,
                                const std::vector<int>& snpIdx) {
  const int numSnps = snpIdx.size();
  const int numSamples = sampleIdx.size();
  // True when the last byte of every SNP is only partly filled.
  const bool hasPartialByte = (numSamples % 4) != 0;

  unsigned char packed = 0;
  for (int snp = 0; snp < numSnps; ++snp) {  // output is SNP-major
    int sample = 0;
    while (sample < numSamples) {
      int slot = sample % 4;  // position of this genotype inside the byte
      setGenotype(&packed, slot,
                  pin.get2BitGenotype(sampleIdx[sample], snpIdx[snp]));
      if (slot == 4 - 1) {  // fourth slot filled: the byte is complete
        fwrite(&packed, sizeof(packed), 1, this->fpBed);
        packed = 0;
      }
      ++sample;
    }
    if (hasPartialByte) {  // write the leftover genotypes of this SNP
      fwrite(&packed, sizeof(packed), 1, this->fpBed);
      packed = 0;
    }
  }
  return 0;
}
コード例 #3
0
ファイル: VerifyBamID.cpp プロジェクト: statgen/verifyBamID
/**
 * Build the marker table (and, unless siteOnly is set, the genotype matrix)
 * from a VCF file.
 *
 * For every marker the allele frequency (AF) and call rate are obtained as
 * follows:
 *   - siteOnly: from the INFO field only (AC/AN first, then AF);
 *   - more than one individual selected: from the genotypes of the selected
 *     individuals;
 *   - otherwise: from the INFO field, falling back to the genotypes of all
 *     individuals when INFO carries no usable AC/AN/AF.
 *
 * Markers are skipped when they are not on an autosome (according to
 * VcfHelper::isAutosome), when AF < minAF, when callRate < minCallRate, or
 * when they have more than one alternative allele.
 *
 * @param vcfFile      path of the VCF file to read
 * @param siteOnly     if true, genotypes are neither parsed nor stored and
 *                     subsetInds is ignored (with a warning if non-empty)
 * @param subsetInds   IDs of the individuals to keep; when empty, every
 *                     individual in the VCF is kept
 * @param minAF        minimum allele frequency a marker needs to be kept
 * @param minCallRate  minimum call rate a marker needs to be kept
 */
GenMatrixBinary::GenMatrixBinary(const char* vcfFile, bool siteOnly, std::vector<std::string>& subsetInds, double minAF, double minCallRate) {
  // open a VCF file
  VcfFile vcf;
  VcfMarker* pMarker;
  std::vector<int> indIndices;

  vcf.bSiteOnly = siteOnly;
  if ( !siteOnly ) {
    vcf.bParseGenotypes = true;
    vcf.bParseDosages = false;
    vcf.bParseValues = false;
  }
  vcf.openForRead(vcfFile);

  // match the individual IDs;
  if ( siteOnly ) {
    if ( subsetInds.size() > 0 ) {
      Logger::gLogger->warning("--siteOnly option is turned on with subset of individuals information provided. subset information will be ignored");
    }
  }
  else {
    if ( subsetInds.size() == 0 ) {
      // no subset requested: keep every individual of the VCF
      for(int i=0; i < (int)vcf.vpVcfInds.size(); ++i) {
        indids.push_back(vcf.vpVcfInds[i]->sIndID.c_str());
        indIndices.push_back(i);
      }
    }
    else {
      // keep the VCF individuals whose ID appears in subsetInds, in VCF order
      for(int i=0; i < (int)vcf.vpVcfInds.size(); ++i) {
        for(int j=0; j < (int)subsetInds.size(); ++j) {
          if ( vcf.vpVcfInds[i]->sIndID.Compare( subsetInds[j].c_str() ) == 0 ) {
            indids.push_back(vcf.vpVcfInds[i]->sIndID.c_str());
            indIndices.push_back(i);
            break;
          }
        }
      }
      Logger::gLogger->writeLog("Total of %d out of %d individuals successfully matched IDs",(int)indids.size(),(int)subsetInds.size());
    }
  }

  // set bytesPerMarker attribute: four individuals per byte, rounded up
  if ( siteOnly ) {
    bytesPerMarker = 0;
  }
  else {
    bytesPerMarker = (indids.size() + 3)/4;
  }

  // With more than one selected individual, AF and call rate are always taken
  // from their genotypes; INFO is not consulted. The selection never changes
  // while reading markers, so the decision is made once.
  const bool useSelectedGenotypes = !siteOnly && indIndices.size() > 1;

  int numNonAutoWarn = 0;
  int afWarn = 0;
  int callRateWarn = 0;
  // read each marker and stores genotype
  while( vcf.iterateMarker() ) {
    // set per-marker level information
    if ( vcf.nNumMarkers % 10000 == 0 ) {
      Logger::gLogger->writeLog("Reading %d markers from VCF file",vcf.nNumMarkers);
    }

    pMarker = vcf.getLastMarker();

    // Non-autosomal chromosomes must be discarded
    if ( !VcfHelper::isAutosome(pMarker->sChrom.c_str()) ) {
      // only the first few occurrences are reported
      if(++numNonAutoWarn <= 5)
      {
        Logger::gLogger->warning("Skipping no-autosomal marker %s:%d",pMarker->sChrom.c_str(),pMarker->nPos);
      }
      continue;
    }

    // AF < 0 means "not known yet"; callRate defaults to 1 when it cannot be
    // determined.
    double AF = -1;
    double callRate = 1.;

    if ( useSelectedGenotypes ) {
      std::pair<int,int> alleleCounts = pMarker->computeAlleleCounts(indIndices);
      // 1e-6 avoids a division by zero when no allele was called
      AF = (double)alleleCounts.first/(double)(alleleCounts.second+1e-6);
      // The counts cover exactly the individuals in indIndices, so that is the
      // denominator. Dividing by subsetInds.size() was wrong: it is zero when
      // no subset was requested, and too large when some requested IDs did
      // not match any VCF individual.
      callRate = (double)alleleCounts.second / 2. / indIndices.size();
    }
    else {
      // use AC and AN first to estimate AF
      const char* sAC = pMarker->getInfoValue("AC");
      const char* sAN = pMarker->getInfoValue("AN");
      int AC = (sAC == NULL) ? -1 : atoi(sAC);
      int AN = (sAN == NULL) ? -1 : atoi(sAN);

      if ( ( AC > 0 ) && ( AN > 0 ) ) {
        AF = (double)AC/(double)AN;
        if ( vcf.vpVcfInds.size() > 0 ) {
          callRate = AN / 2. / vcf.vpVcfInds.size();
        }
      }
      // otherwise try the AF entry of the INFO field
      if ( AF < 0 ) {
        const char* sAF = pMarker->getInfoValue("AF");
        AF = (sAF == NULL) ? -1. : atof(sAF);
      }
      // use all genotypes if the INFO field does not have AF, AC, AN;
      // not possible with siteOnly because no genotypes were parsed
      if ( !siteOnly && AF < 0 ) {
        std::pair<int,int> alleleCounts = pMarker->computeAlleleCounts();
        AF = (double)alleleCounts.first/(double)(alleleCounts.second+1e-6);
        // keep the default call rate rather than dividing by zero individuals
        if ( vcf.vpVcfInds.size() > 0 ) {
          callRate = (double)alleleCounts.second / 2. / vcf.vpVcfInds.size();
        }
      }
    }

    if ( AF < 0 ) {
      Logger::gLogger->warning("Cannot obtain allele frequency information at %s:%d",pMarker->sChrom.c_str(),pMarker->nPos);
    }

    // skip by AF or callRate; the counters keep counting after the warnings
    // stop because they are reported as totals at the end
    if ( AF < minAF )
    {
      if(++afWarn < 5)
      {
        Logger::gLogger->warning("Skipping marker where AF (%lf) is < minAF (%lf)", AF, minAF);
      }
      continue;
    }
    if ( callRate < minCallRate )
    {
      if(++callRateWarn < 5)
      {
        Logger::gLogger->warning("Skipping marker where callRate (%lf) is < minCallRate (%lf)", callRate, minCallRate);
      }
      continue;
    }

    // skip non-bi-allelic marker
    if ( pMarker->asAlts.Length() > 1 ) {
      Logger::gLogger->warning("Skipping marker %s:%d with multiple alternative alleles",pMarker->sChrom.c_str(),pMarker->nPos);
      continue;
    }

    // add marker information (only the first base of REF and ALT is passed on)
    addMarker(pMarker->sChrom.c_str(), pMarker->nPos, pMarker->sRef[0], pMarker->asAlts[0][0], AF);

    // set genotypes; with siteOnly there are no genotypes to store
    if ( !siteOnly ) {
      for(int i=0; i < (int)indIndices.size(); ++i) {
        setGenotype( pMarker->vnSampleGenotypes[indIndices[i]], i );
      }
    }
  }

  Logger::gLogger->writeLog("Finished reading %d markers from VCF file",vcf.nNumMarkers);
  Logger::gLogger->writeLog("Total of %d informative markers passed after AF >= %lf and callRate >= %lf threshold",(int)chroms.size(),minAF,minCallRate);
  if(afWarn > 0)
  {
    Logger::gLogger->warning("Skipped %d markers where AF was less than minAF (%lf)",afWarn,minAF);
  }
  if(callRateWarn > 0)
  {
    Logger::gLogger->warning("Skipped %d markers where callRate was less than minCallRate (%lf)",callRateWarn,minCallRate);
  }

  if ( chroms.size() == 0 ) {
    Logger::gLogger->error("No informative markers were found. Does the VCF have individual genotypes or either AF entry or AC & AN entries included in the INFO field?");
  }
}
コード例 #4
0
/**
 * Write one VCF record to the PLINK output: one BIM line followed by the
 * packed genotypes of all individuals in the BED stream (this->fpBed).
 *
 * Genotypes are packed four per byte; the slot inside the byte is the
 * individual's index modulo 4. Only alleles 0 and 1 are representable; any
 * other allele value makes the genotype MISSING.
 *
 * @param r  the record to write
 * @return 0 on success, -1 if the record was skipped because its ref or alt
 *         is reported as multi-allelic by isMultiAllelic()
 */
int PlinkOutputFile::writeRecord(VCFRecord* r) {
  // Records flagged by isMultiAllelic() are reported on stdout and skipped.
  if (isMultiAllelic(r->getRef()) || isMultiAllelic(r->getAlt())) {
    fprintf(stdout, "%s:%d Skip with ref = [ %s ] and alt= [ %s ]\n", __FILE__,
            __LINE__, r->getRef(), r->getAlt());
    return -1;
  }

  // BIM part
  this->writeBIM(r->getChrom(), r->getID(), 0, r->getPos(), r->getRef(),
                 r->getAlt());

  // BED part
  int gtIndex = r->getFormatIndex("GT");
  VCFPeople& samples = r->getPeople();
  unsigned char packed = 0;
  for (unsigned int idx = 0; idx < samples.size(); ++idx) {
    VCFIndividual* person = samples[idx];
    int slot = idx % 4;  // position of this genotype inside the byte

    if (person->justGet(gtIndex).isHaploid()) {
      // Haploid call: a single allele decides the stored code.
      const int allele = person->justGet(gtIndex).getAllele1();
      switch (allele) {
        case 0:
          setGenotype(&packed, slot, HOM_REF);
          break;
        case 1:
          setGenotype(&packed, slot, HET);
          break;
        default:
          setGenotype(&packed, slot, MISSING);
          break;
      }
    } else {
      const int first = person->justGet(gtIndex).getAllele1();
      const int second = person->justGet(gtIndex).getAllele2();
      const bool firstUsable = (first == 0 || first == 1);
      const bool secondUsable = (second == 0 || second == 1);
      if (!firstUsable || !secondUsable) {
        // NOTE: Plink does not support tri-allelic genotypes, so anything
        // other than alleles 0/1 is stored as missing.
        setGenotype(&packed, slot, MISSING);
      } else if (first != second) {
        setGenotype(&packed, slot, HET);  // 0/1 or 1/0
      } else if (first == 1) {
        setGenotype(&packed, slot, HOM_ALT);  // 1/1
      }
      // 0/0 (hom ref): nothing to set, the slot keeps its zero bits.
    }

    if (slot == 4 - 1) {  // fourth slot filled: the byte is complete
      fwrite(&packed, sizeof(packed), 1, this->fpBed);
      packed = 0;
    }
  }
  if (samples.size() % 4 != 0) {  // write the leftover, partly filled byte
    fwrite(&packed, sizeof(packed), 1, this->fpBed);
  }
  return 0;
}