/**
 * Append the genotypes held in `mat` to the BED stream (`this->fpBed`).
 *
 * Rows of `mat` are markers and columns are people, as implied by the
 * `(*mat)[m][i]` access below. Four genotypes are packed into each output
 * byte via setGenotype(); every marker starts on a fresh byte and a final
 * partially filled byte is written when `nPeople` is not a multiple of 4,
 * mirroring what extractBED() and writeRecord() in this class do.
 *
 * @param mat      genotype matrix, indexed as (*mat)[marker][person]
 * @param nPeople  number of people (columns) to write for each marker
 * @param nMarker  number of markers (rows) to write
 *
 * NOTE(review): matrix cells are compared directly against the HOM_REF / HET /
 * HOM_ALT constants; any other value is written as MISSING. Confirm that the
 * matrix producer uses the same encoding as those constants.
 */
void PlinkOutputFile::writeBED(SimpleMatrix* mat, int nPeople, int nMarker) {
  /* int nPeople = mat->cols; */
  /* int nMarker = mat->rows; */
  for (int m = 0; m < nMarker; m++) {
    // Each marker gets its own run of bytes, so start from a clean byte.
    unsigned char c = 0;
    for (int i = 0; i < nPeople; i++) {
      const int offset = i & (4 - 1);  // slot (0..3) inside the current byte
      const int geno = (int)((*mat)[m][i]);
      switch (geno) {
        case HOM_REF:
          // Previously stored HET here, which turned every hom-ref call
          // into a heterozygote.
          setGenotype(&c, offset, HOM_REF);
          break;
        case HET:
          setGenotype(&c, offset, HET);
          break;
        case HOM_ALT:
          setGenotype(&c, offset, HOM_ALT);
          break;
        default:
          setGenotype(&c, offset, MISSING);
          break;
      }
      // Flush inside the per-person loop: once four genotypes are packed the
      // byte is complete. Flushing only after the loop would merge all
      // people of a marker into one byte.
      if (offset == 3) {
        fwrite(&c, sizeof(char), 1, this->fpBed);
        c = 0;
      }
    }
    // Write the trailing, partially filled byte of this marker.
    if (nPeople % 4 != 0) {
      fwrite(&c, sizeof(char), 1, this->fpBed);
    }
  }
}
/**
 * Copy a selection of genotypes from `pin` into this object's BED stream.
 *
 * For every entry of `snpIdx` (outer loop) and every entry of `sampleIdx`
 * (inner loop) the 2-bit genotype returned by pin.get2BitGenotype() is packed
 * with setGenotype(), four genotypes per byte. A byte is written as soon as
 * it is full; a partially filled byte is written at the end of each SNP when
 * the sample count is not a multiple of 4.
 *
 * @param pin        source of the 2-bit genotypes
 * @param sampleIdx  sample indices to extract, in output order
 * @param snpIdx     SNP indices to extract, in output order
 * @return always 0
 */
int PlinkOutputFile::extractBED(PlinkInputFile& pin,
                                const std::vector<int>& sampleIdx,
                                const std::vector<int>& snpIdx) {
  const int snpCount = snpIdx.size();
  const int sampleCount = sampleIdx.size();
  // True when the last byte of each SNP holds fewer than four genotypes.
  const bool hasPartialByte = (sampleCount % 4 != 0);

  unsigned char packed = 0;
  for (int snp = 0; snp < snpCount; ++snp) {  // assume SNP-major
    int sample = 0;
    while (sample < sampleCount) {
      const int slot = sample % 4;  // position (0..3) inside the byte
      setGenotype(&packed, slot,
                  pin.get2BitGenotype(sampleIdx[sample], snpIdx[snp]));
      if (slot == 3) {
        // Four genotypes collected: emit the byte and start a new one.
        fwrite(&packed, sizeof(char), 1, this->fpBed);
        packed = 0;
      }
      ++sample;
    }
    if (hasPartialByte) {
      // Emit the leftover genotypes of this SNP.
      fwrite(&packed, sizeof(char), 1, this->fpBed);
      packed = 0;
    }
  }
  return 0;
}
/**
 * Build the marker/genotype matrix from a VCF file.
 *
 * Each marker is read in turn; non-autosomal markers, markers whose allele
 * frequency is below `minAF`, markers whose call rate is below `minCallRate`
 * and markers with more than one alternative allele are skipped. Surviving
 * markers are registered with addMarker() and, unless `siteOnly` is set, the
 * genotypes of the selected individuals are stored with setGenotype().
 *
 * Allele frequency (AF) and call rate are obtained as follows:
 *   - more than one selected individual (and not site-only): computed from
 *     the genotypes of the selected individuals;
 *   - otherwise: INFO AC/AN first, then INFO AF; when not site-only and the
 *     INFO field yields nothing, computed from the genotypes returned by
 *     computeAlleleCounts() with no argument.
 *
 * @param vcfFile      path of the VCF file to read
 * @param siteOnly     when true no genotypes are parsed or stored and
 *                     `subsetInds` is ignored (a warning is logged if it is
 *                     non-empty)
 * @param subsetInds   individual IDs to keep; empty means keep every
 *                     individual found in the VCF
 * @param minAF        markers with AF below this value are skipped
 * @param minCallRate  markers with call rate below this value are skipped
 */
GenMatrixBinary::GenMatrixBinary(const char* vcfFile, bool siteOnly, std::vector<std::string>& subsetInds, double minAF, double minCallRate) {
  // open a VCF file
  VcfFile vcf;
  VcfMarker* pMarker;
  std::vector<int> indIndices;  // VCF column index of every selected individual

  vcf.bSiteOnly = siteOnly;
  if ( !siteOnly ) {
    vcf.bParseGenotypes = true;
    vcf.bParseDosages = false;
    vcf.bParseValues = false;
  }
  vcf.openForRead(vcfFile);

  // match the individual IDs
  if ( siteOnly ) {
    if ( subsetInds.size() > 0 ) {
      Logger::gLogger->warning("--siteOnly option is turned on with subset of individuals information provided. subset information will be ignored");
    }
  }
  else {
    if ( subsetInds.size() == 0 ) {
      // no subset requested: keep every individual in VCF order
      for(int i=0; i < (int)vcf.vpVcfInds.size(); ++i) {
        indids.push_back(vcf.vpVcfInds[i]->sIndID.c_str());
        indIndices.push_back(i);
      }
    }
    else {
      // keep the VCF individuals whose ID appears in subsetInds (VCF order)
      for(int i=0; i < (int)vcf.vpVcfInds.size(); ++i) {
        for(int j=0; j < (int)subsetInds.size(); ++j) {
          if ( vcf.vpVcfInds[i]->sIndID.Compare( subsetInds[j].c_str() ) == 0 ) {
            indids.push_back(vcf.vpVcfInds[i]->sIndID.c_str());
            indIndices.push_back(i);
            break;
          }
        }
      }
      Logger::gLogger->writeLog("Total of %d out of %d individuals successfully matched IDs",(int)indids.size(),(int)subsetInds.size());
    }
  }

  // set bytesPerMarker attribute: four individuals per byte, rounded up
  if ( siteOnly ) {
    bytesPerMarker = 0;
  }
  else {
    bytesPerMarker = (indids.size() + 3)/4;
  }

  int numNonAutoWarn = 0;
  int afWarn = 0;
  int callRateWarn = 0;

  // read each marker and stores genotype
  while( vcf.iterateMarker() ) {
    // set per-marker level information
    if ( vcf.nNumMarkers % 10000 == 0 ) {
      Logger::gLogger->writeLog("Reading %d markers from VCF file",vcf.nNumMarkers);
    }

    pMarker = vcf.getLastMarker();

    // Non-autosomal chromosomes must be discarded
    if ( !VcfHelper::isAutosome(pMarker->sChrom.c_str()) ) {
      if(++numNonAutoWarn <= 5) {
        Logger::gLogger->warning("Skipping no-autosomal marker %s:%d",pMarker->sChrom.c_str(),pMarker->nPos);
      }
      continue;
    }

    double AF = -1;        // negative means "not yet known"
    double callRate = 1.;

    if ( !siteOnly && indIndices.size() > 1 ) {
      // Several individuals selected: ignore INFO and use their genotypes.
      std::pair<int,int> alleleCounts = pMarker->computeAlleleCounts(indIndices);
      AF = (double)alleleCounts.first/(double)(alleleCounts.second+1e-6);
      // Divide by the number of individuals that were actually counted.
      // The former divisor, subsetInds.size(), is 0 when no subset was
      // requested, which made callRate inf/NaN and disabled the filter.
      callRate = (double)alleleCounts.second / 2. / indIndices.size();
    }
    else {
      // Site-only, or at most one selected individual: use INFO first.
      // use AC and AN first to estimate AF
      const char* sAC = pMarker->getInfoValue("AC");
      const char* sAN = pMarker->getInfoValue("AN");
      int AC = (sAC == NULL) ? -1 : atoi(sAC);
      int AN = (sAN == NULL) ? -1 : atoi(sAN);
      if ( ( AC > 0 ) && ( AN > 0 ) ) {
        AF = (double)AC/(double)AN;
        if ( vcf.vpVcfInds.size() > 0 ) {
          callRate = AN / 2. / vcf.vpVcfInds.size();
        }
      }

      // fall back to the AF entry of the INFO field
      if ( AF < 0 ) {
        const char* sAF = pMarker->getInfoValue("AF");
        AF = (sAF == NULL) ? -1. : atof(sAF);
      }

      // use all genotype if INFO field does not have AF, AC, AN
      // (not possible in site-only mode, where genotypes are not parsed)
      if ( !siteOnly && AF < 0 ) {
        std::pair<int,int> alleleCounts = pMarker->computeAlleleCounts();
        AF = (double)alleleCounts.first/(double)(alleleCounts.second+1e-6);
        // Same guard as the AC/AN estimate above: avoid dividing by zero
        // when the VCF lists no individuals.
        if ( vcf.vpVcfInds.size() > 0 ) {
          callRate = (double)alleleCounts.second / 2. / vcf.vpVcfInds.size();
        }
      }
    }

    if ( AF < 0 ) {
      Logger::gLogger->warning("Cannot obtain allele frequency information at %s:%d",pMarker->sChrom.c_str(),pMarker->nPos);
    }

    // skip by AF or callRate
    if ( AF < minAF ) {
      if(++afWarn < 5) {
        Logger::gLogger->warning("Skipping marker where AF (%lf) is < minAF (%lf)", AF, minAF);
      }
      continue;
    }
    if ( callRate < minCallRate ) {
      if(++callRateWarn < 5) {
        Logger::gLogger->warning("Skipping marker where callRate (%lf) is < minCallRate (%lf)", callRate, minCallRate);
      }
      continue;
    }

    // skip non-bi-allelic marker
    if ( pMarker->asAlts.Length() > 1 ) {
      Logger::gLogger->warning("Skipping marker %s:%d with multiple alternative alleles",pMarker->sChrom.c_str(),pMarker->nPos);
      continue;
    }

    // add marker information (first character of REF and of the single ALT)
    addMarker(pMarker->sChrom.c_str(), pMarker->nPos, pMarker->sRef[0], pMarker->asAlts[0][0], AF);

    // set genotypes
    if ( siteOnly ) {
      // no genotypes can be stored, skip them
    }
    else {
      for(int i=0; i < (int)indIndices.size(); ++i) {
        setGenotype( pMarker->vnSampleGenotypes[indIndices[i]], i );
      }
    }
  }

  Logger::gLogger->writeLog("Finished reading %d markers from VCF file",vcf.nNumMarkers);
  Logger::gLogger->writeLog("Total of %d informative markers passed after AF >= %lf and callRate >= %lf threshold",(int)chroms.size(),minAF,minCallRate);

  if(afWarn > 0) {
    Logger::gLogger->warning("Skipped %d markers where AF was less than minAF (%lf)",afWarn,minAF);
  }
  if(callRateWarn > 0) {
    Logger::gLogger->warning("Skipped %d markers where callRate was less than minCallRate (%lf)",callRateWarn,minCallRate);
  }

  if ( chroms.size() == 0 ) {
    Logger::gLogger->error("No informative markers were found. Does the VCF have individual genotypes or either AF entry or AC & AN entries included in the INFO field?");
  }
}
/**
 * Write one VCF record to the PLINK output: a BIM entry followed by the
 * packed genotypes of every individual in the BED stream.
 *
 * Records whose REF or ALT is reported as multi-allelic by isMultiAllelic()
 * are not written; a message is printed to stdout instead.
 *
 * Genotype mapping (allele values come from the "GT" FORMAT field):
 *   haploid: 0 -> HOM_REF, 1 -> HET, anything else -> MISSING
 *   diploid: 0/0 -> byte bits left untouched, 0/1 or 1/0 -> HET,
 *            1/1 -> HOM_ALT, anything else -> MISSING
 *
 * @param r  record to write
 * @return 0 on success, -1 when the record was skipped as multi-allelic
 */
int PlinkOutputFile::writeRecord(VCFRecord* r) {
  // ---- BIM ----
  if (isMultiAllelic(r->getRef()) || isMultiAllelic(r->getAlt())) {
    fprintf(stdout, "%s:%d Skip with ref = [ %s ] and alt= [ %s ]\n", __FILE__, __LINE__, r->getRef(), r->getAlt());
    return -1;
  }
  this->writeBIM(r->getChrom(), r->getID(), 0, r->getPos(), r->getRef(), r->getAlt());

  // ---- BED ----
  const int gtIndex = r->getFormatIndex("GT");
  VCFPeople& samples = r->getPeople();
  unsigned char packed = 0;  // four genotypes are packed into each byte

  for (unsigned int idx = 0; idx < samples.size(); ++idx) {
    VCFIndividual* person = samples[idx];
    const int slot = idx % 4;  // position (0..3) inside the current byte

    if (person->justGet(gtIndex).isHaploid()) {
      const int allele = person->justGet(gtIndex).getAllele1();
      switch (allele) {
        case 0:
          setGenotype(&packed, slot, HOM_REF);
          break;
        case 1:
          setGenotype(&packed, slot, HET);
          break;
        default:
          setGenotype(&packed, slot, MISSING);
          break;
      }
    } else {
      const int first = person->justGet(gtIndex).getAllele1();
      const int second = person->justGet(gtIndex).getAllele2();

      if (first == 0 && second == 0) {
        // homozygous reference: nothing is set for this slot
      } else if ((first == 0 && second == 1) || (first == 1 && second == 0)) {
        setGenotype(&packed, slot, HET);
      } else if (first == 1 && second == 1) {
        setGenotype(&packed, slot, HOM_ALT);
      } else {
        // NOTE: Plink does not support tri-allelic sites, so any other
        // allele combination has to be stored as missing.
        setGenotype(&packed, slot, MISSING);
      }
    }

    if (slot == 3) {
      // the byte is full: write it out and start the next one
      fwrite(&packed, sizeof(char), 1, this->fpBed);
      packed = 0;
    }
  }

  // write the last, partially filled byte if there is one
  if (samples.size() % 4 != 0) {
    fwrite(&packed, sizeof(char), 1, this->fpBed);
  }
  return 0;
}