/**
 * Example driver: print the GT field of every selected sample for each record
 * read from "noindex.vcf.gz" after calling setRangeList("1:0").
 *
 * Output (stdout): one line per record, "CHROM:POS\t" followed by one
 * space-terminated GT string per sample. A record without a GT FORMAT field
 * is reported once on stderr and printed without genotypes.
 *
 * @return 0 always.
 */
int main() {
  VCFInputFile vin("noindex.vcf.gz");
  vin.setRangeList("1:0");

  int lineNo = 0;
  while (vin.readRecord()) {
    lineNo++;
    VCFRecord& r = vin.getVCFRecord();
    VCFPeople& people = r.getPeople();

    printf("%s:%d\t", r.getChrom(), r.getPos());

    // e.g.: get TAG from INFO field
    // fprintf(stderr, "%s\n", r.getInfoTag("ANNO"));

    // The FORMAT layout belongs to the record, not to a sample: resolve the
    // GT index once per record and use it for the lookup instead of
    // assuming GT sits in slot 0.
    const int GTidx = r.getFormatIndex("GT");
    if (GTidx < 0) {
      fprintf(stderr, "Cannot find GT field!\n");
    } else {
      // e.g.: Loop each (selected) people in the same order as in the VCF
      const int numPeople = (int)people.size();
      for (int i = 0; i < numPeople; i++) {
        VCFIndividual* indv = people[i];
        printf("%s ", indv->justGet(GTidx).toStr());
      }
    }
    printf("\n");
  }

  fprintf(stdout, "Total %d VCF records have converted successfully\n", lineNo);
  return 0;
}
int loadMarkerFromVCF(const std::string& fileName, const std::string& marker, std::vector<std::string>* rowLabel, Matrix* genotype) { if (!rowLabel || !genotype) { // invalid parameter return -1; } Matrix& m = *genotype; int col = 0; VCFInputFile vin(fileName); vin.setRangeList(marker); while (vin.readRecord()) { VCFRecord& r = vin.getVCFRecord(); VCFPeople& people = r.getPeople(); VCFIndividual* indv; m.Dimension(people.size(), col + 1); int GTidx = r.getFormatIndex("GT"); for (int i = 0; i < (int)people.size(); i++) { indv = people[i]; // get GT index. if you are sure the index will not change, // call this function only once! if (GTidx >= 0) { // printf("%s ", indv->justGet(0).toStr()); // [0] meaning the first // field of each individual m[i][col] = indv->justGet(GTidx).getGenotype(); } else { logger->error("Cannot find GT field!"); return -1; } } if (col == 0) { // set-up names rowLabel->resize(people.size()); for (size_t i = 0; i < people.size(); ++i) { (*rowLabel)[i] = people[i]->getName(); } } std::string colLabel = r.getChrom(); colLabel += ":"; colLabel += r.getPosStr(); m.SetColumnLabel(col, colLabel.c_str()); ++col; } return 0; }
/**
 * Read every record from @p vin and store a genotype class per sample and
 * site in slot @p idx of the concordance table.
 *
 * Entries are addressed as (*input)[sampleName]["CHROM:POS"][idx]. Records
 * without a GT FORMAT field are counted but contribute nothing. A genotype
 * with an allele other than 0, 1 or MISSING_GENOTYPE writes no entry at all.
 *
 * @param vin    opened VCF input to drain.
 * @param input  concordance table to fill; must not be null.
 * @param idx    slot of the innermost container that receives the value.
 * @return number of records read.
 */
int loadGenotype(VCFInputFile& vin, AllConcordanceType* input, int idx) {
  AllConcordanceType& calls = *input;
  std::string siteKey;
  int recordCount = 0;

  while (vin.readRecord()) {
    ++recordCount;

    VCFRecord& record = vin.getVCFRecord();
    siteKey = record.getChrom();
    siteKey += ":";
    siteKey += record.getPosStr();

    VCFPeople& samples = record.getPeople();
    const int gtIndex = record.getFormatIndex("GT");
    if (gtIndex < 0) {
      continue;
    }

    for (size_t s = 0; s < samples.size(); ++s) {
      VCFIndividual* person = samples[s];
      const VCFValue& gt = person->justGet(gtIndex);
      const int first = gt.getAllele1();
      const int second = gt.getAllele2();

      if (first == MISSING_GENOTYPE || second == MISSING_GENOTYPE) {
        calls[person->getName()][siteKey][idx] = Value::MISSING;
        continue;
      }

      // Only alleles 0 and 1 are classified; anything else is skipped
      // without touching the table.
      const bool firstOk = (first == 0 || first == 1);
      const bool secondOk = (second == 0 || second == 1);
      if (!firstOk || !secondOk) {
        continue;
      }

      // Both alleles are 0 or 1 here, so their sum is the ALT allele count.
      switch (first + second) {
        case 0:
          calls[person->getName()][siteKey][idx] = Value::HOMREF;
          break;
        case 1:
          calls[person->getName()][siteKey][idx] = Value::HET;
          break;
        case 2:
          calls[person->getName()][siteKey][idx] = Value::HOMALT;
          break;
      }
    }
  }

  fprintf(stderr, "Total %d VCF records have read successfully\n", recordCount);
  return recordCount;
}
/**
 * Append one VCF record to the PLINK output: one row via writeBIM() and the
 * packed genotypes of all samples to fpBed.
 *
 * Genotypes are packed four per byte (2 bits each, slot = sample index & 3);
 * a partially filled last byte is written as well.
 *
 * @param r  record to convert; must not be null.
 * @return 0 on success; -1 when the record is skipped because REF/ALT is
 *         multi-allelic or the record has no GT FORMAT field. Nothing is
 *         written for a skipped record.
 */
int PlinkOutputFile::writeRecord(VCFRecord* r) {
  if (isMultiAllelic(r->getRef()) || isMultiAllelic(r->getAlt())) {
    fprintf(stdout, "%s:%d Skip with ref = [ %s ] and alt= [ %s ]\n", __FILE__,
            __LINE__, r->getRef(), r->getAlt());
    return -1;
  }

  // Resolve GT *before* emitting the BIM row: bailing out after writeBIM()
  // would leave the BIM and BED outputs with a different number of variants,
  // and justGet() must never be called with a negative index.
  const int GTidx = r->getFormatIndex("GT");
  if (GTidx < 0) {
    fprintf(stderr, "%s:%d Skip %s:%d because GT field cannot be found\n",
            __FILE__, __LINE__, r->getChrom(), r->getPos());
    return -1;
  }

  // write BIM
  this->writeBIM(r->getChrom(), r->getID(), 0, r->getPos(), r->getRef(),
                 r->getAlt());

  // write BED
  VCFPeople& people = r->getPeople();
  unsigned char c = 0;  // byte under construction, holds up to 4 genotypes
  for (unsigned int i = 0; i < people.size(); i++) {
    VCFIndividual* indv = people[i];
    const int offset = i & (4 - 1);  // slot of this sample inside c

    if (indv->justGet(GTidx).isHaploid()) {
      // NOTE(review): a haploid ALT call is stored as HET here; kept as in
      // the original, confirm this is the intended PLINK representation.
      const int a1 = indv->justGet(GTidx).getAllele1();
      if (a1 == 0) {
        setGenotype(&c, offset, HOM_REF);
      } else if (a1 == 1) {
        setGenotype(&c, offset, HET);
      } else {
        setGenotype(&c, offset, MISSING);
      }
    } else {
      const int a1 = indv->justGet(GTidx).getAllele1();
      const int a2 = indv->justGet(GTidx).getAllele2();
      if (a1 == 0 && a2 == 0) {
        // hom ref: slot is left as the zero bits c was reset to
        // (the original code notes hom ref as 0b00)
      } else if ((a1 == 0 && a2 == 1) || (a1 == 1 && a2 == 0)) {
        setGenotype(&c, offset, HET);
      } else if (a1 == 1 && a2 == 1) {
        setGenotype(&c, offset, HOM_ALT);
      } else {
        // missing allele, or an allele index above 1:
        // Plink does not support tri-allelic, so the genotype is set missing
        setGenotype(&c, offset, MISSING);
      }
    }

    if (offset == 3) {  // 3: 4 - 1, so every 4 genotype we will flush
      fwrite(&c, sizeof(char), 1, this->fpBed);
      c = 0;
    }
  }

  if (people.size() % 4 != 0) {  // remaining some bits
    fwrite(&c, sizeof(char), 1, this->fpBed);
  }
  return 0;
}
/**
 * Read all remaining records from vin and return their genotypes (or dosages
 * when dosageTag is set) as a people-by-marker matrix.
 *
 * For every record that passes the freqMin/freqMax MAF cut-offs, one column
 * labelled "CHROM:POS" is produced, and one entry is appended to both
 * this->hemiRegion and this->counter, so index k of those vectors describes
 * column k of @p g.
 *
 * Values failing checkGD()/checkGQ() are set to MISSING_GENOTYPE. In a
 * hemizygous region (as reported by parRegion) male dosages are doubled, male
 * GT calls use getMaleNonParGenotype02(), female values are taken as is, and
 * samples of any other sex code are set to MISSING_GENOTYPE.
 *
 * @param g  receives the transposed (people by marker) matrix; must not be
 *           null.
 * @return SUCCEED on completion; -1 if a record lacks the requested FORMAT
 *         field.
 */
int GenotypeExtractor::extractMultipleGenotype(Matrix* g) {
  static Matrix m;  // make it static to reduce memory allocation
  int row = 0;
  std::vector<std::string> colNames;
  std::string name;

  // Both per-marker vectors are rebuilt from scratch so that their indices
  // line up with the columns produced by this call.
  this->hemiRegion.clear();
  this->counter.clear();

  const bool useDosage = (!this->dosageTag.empty());
  GenotypeCounter genoCounter;

  while (this->vin->readRecord()) {
    VCFRecord& r = this->vin->getVCFRecord();
    VCFPeople& people = r.getPeople();

    // row is only advanced for records that pass the filters, so a filtered
    // record's row is simply overwritten by the next one
    m.Dimension(row + 1, people.size());
    genoCounter.reset();

    const int genoIdx =
        r.getFormatIndex(useDosage ? dosageTag.c_str() : "GT");
    const int GDidx = r.getFormatIndex("GD");
    const int GQidx = r.getFormatIndex("GQ");

    assert(this->parRegion);
    const bool isHemi =
        this->parRegion->isHemiRegion(r.getChrom(), r.getPos());

    // e.g.: Loop each (selected) people in the same order as in the VCF
    const int numPeople = (int)people.size();
    for (int i = 0; i < numPeople; i++) {
      VCFIndividual* indv = people[i];

      if (genoIdx < 0) {
        logger->error("Cannot find %s field!",
                      this->dosageTag.empty() ? "GT" : dosageTag.c_str());
        return -1;
      }

      if (!isHemi) {
        if (useDosage) {
          m[row][i] = indv->justGet(genoIdx).toDouble();
        } else {
          m[row][i] = indv->justGet(genoIdx).getGenotype();
        }
      } else if ((*sex)[i] == PLINK_MALE) {
        if (useDosage) {
          // for male hemi region, imputated dosage is usually between 0 and 1
          // need to multiply by 2.0
          m[row][i] = indv->justGet(genoIdx).toDouble() * 2.0;
        } else {
          m[row][i] = indv->justGet(genoIdx).getMaleNonParGenotype02();
        }
      } else if ((*sex)[i] == PLINK_FEMALE) {
        // Dosage mode used to leave this cell unassigned, so whatever the
        // static matrix still held leaked into the result and the counter.
        if (useDosage) {
          m[row][i] = indv->justGet(genoIdx).toDouble();
        } else {
          m[row][i] = indv->justGet(genoIdx).getGenotype();
        }
      } else {
        // unknown sex in a hemizygous region: treated as missing in both
        // GT and dosage mode
        m[row][i] = MISSING_GENOTYPE;
      }

      if (!checkGD(indv, GDidx) || !checkGQ(indv, GQidx)) {
        m[row][i] = MISSING_GENOTYPE;
      }
      genoCounter.add(m[row][i]);
    }

    // check frequency cutoffs (a cut-off of 0 or less disables that bound)
    const double maf = genoCounter.getMAF();
    if (this->freqMin > 0. && this->freqMin > maf) continue;
    if (this->freqMax > 0. && this->freqMax < maf) continue;

    // store genotype results
    name = r.getChrom();
    name += ":";
    name += r.getPosStr();
    colNames.push_back(name);
    ++row;

    this->hemiRegion.push_back(isHemi);
    this->counter.push_back(genoCounter);
  }  // end while (this->vin->readRecord())

  // delete rows (ugly code here, as we may allocate extra row in previous
  // loop)
  m.Dimension(row, m.cols);

  // now transpose (marker by people -> people by marker)
  g->Transpose(m);
  for (int i = 0; i < row; ++i) {
    g->SetColumnLabel(i, colNames[i].c_str());
  }
  return SUCCEED;
}  // end GenotypeExtractor
int GenotypeExtractor::extractSingleGenotype(Matrix* g, Result* b) { Matrix& genotype = *g; Result& buf = *b; bool hasRead = this->vin->readRecord(); if (!hasRead) return FILE_END; VCFRecord& r = this->vin->getVCFRecord(); VCFPeople& people = r.getPeople(); VCFIndividual* indv; buf.updateValue("CHROM", r.getChrom()); buf.updateValue("POS", r.getPosStr()); buf.updateValue("REF", r.getRef()); buf.updateValue("ALT", r.getAlt()); genotype.Dimension(people.size(), 1); counter.resize(1); // get GT index. if you are sure the index will not change, call this // function only once! const bool useDosage = (!this->dosageTag.empty()); int genoIdx; if (useDosage) { genoIdx = r.getFormatIndex(dosageTag.c_str()); } else { genoIdx = r.getFormatIndex("GT"); } // int GTidx = r.getFormatIndex("GT"); int GDidx = r.getFormatIndex("GD"); int GQidx = r.getFormatIndex("GQ"); bool hemiRegion = this->parRegion->isHemiRegion(r.getChrom(), r.getPos()); // e.g.: Loop each (selected) people in the same order as in the VCF const int numPeople = (int)people.size(); for (int i = 0; i < numPeople; i++) { indv = people[i]; if (genoIdx >= 0) { // printf("%s ", indv->justGet(0).toStr()); // [0] meaning the first // field of each individual if (useDosage) { genotype[i][0] = indv->justGet(genoIdx).toDouble(); } else { if (!hemiRegion) { genotype[i][0] = indv->justGet(genoIdx).getGenotype(); } else { if ((*sex)[i] == PLINK_MALE) { genotype[i][0] = indv->justGet(genoIdx).getMaleNonParGenotype02(); } else if ((*sex)[i] == PLINK_FEMALE) { genotype[i][0] = indv->justGet(genoIdx).getGenotype(); } else { genotype[i][0] = MISSING_GENOTYPE; } } } if (!checkGD(indv, GDidx) || !checkGQ(indv, GQidx)) { genotype[i][0] = MISSING_GENOTYPE; } counter[0].add(genotype[i][0]); // logger->info("%d ", int(genotype[i][0])); } else { std::string s; indv->toStr(&s); logger->error( "Cannot find [ %s ] field when read individual information [ %s ]!", this->dosageTag.empty() ? 
"GT" : this->dosageTag.c_str(), s.c_str()); return ERROR; } } // check frequency cutoffs // double maf = 0.; // if (this->freqMin > 0.0 || this->freqMax > 0.) { // for (int i = 0; i < numPeople; ++i) { // maf += genotype[i][0]; // } // maf = maf / (2. * numPeople); // if (maf > .5) { // maf = 1.0 - maf; // } // } const double maf = counter[0].getMAF(); if (this->freqMin > 0. && this->freqMin > maf) return FAIL_FILTER; if (this->freqMax > 0. && this->freqMax < maf) return FAIL_FILTER; std::string label = r.getChrom(); label += ':'; label += r.getPosStr(); genotype.SetColumnLabel(0, label.c_str()); this->hemiRegion.resize(1); assert(this->parRegion); if (this->parRegion && this->parRegion->isHemiRegion(r.getChrom(), r.getPos())) { this->hemiRegion[0] = true; } else { this->hemiRegion[0] = false; } return SUCCEED; } // end extractSingleGenotype()
int main(int argc, char** argv) { time_t currentTime = time(0); fprintf(stderr, "Analysis started at: %s", ctime(¤tTime)); PARSE_PARAMETER(argc, argv); PARAMETER_STATUS(); if (FLAG_REMAIN_ARG.size() > 0) { fprintf(stderr, "Unparsed arguments: "); for (unsigned int i = 0; i < FLAG_REMAIN_ARG.size(); i++) { fprintf(stderr, " %s", FLAG_REMAIN_ARG[i].c_str()); } fprintf(stderr, "\n"); abort(); } REQUIRE_STRING_PARAMETER(FLAG_inVcf, "Please provide input file using: --inVcf"); const char* fn = FLAG_inVcf.c_str(); VCFInputFile vin(fn); // set range filters here // e.g. // vin.setRangeList("1:69500-69600"); vin.setRangeList(FLAG_rangeList.c_str()); vin.setRangeFile(FLAG_rangeFile.c_str()); // set people filters here if (FLAG_peopleIncludeID.size() || FLAG_peopleIncludeFile.size()) { vin.excludeAllPeople(); vin.includePeople(FLAG_peopleIncludeID.c_str()); vin.includePeopleFromFile(FLAG_peopleIncludeFile.c_str()); } vin.excludePeople(FLAG_peopleExcludeID.c_str()); vin.excludePeopleFromFile(FLAG_peopleExcludeFile.c_str()); // let's write it out. 
if (FLAG_updateId != "") { int ret = vin.updateId(FLAG_updateId.c_str()); fprintf(stdout, "%d samples have updated id.\n", ret); } // load gene ranges std::map<std::string, std::string> geneRange; if (FLAG_geneName.size()) { if (FLAG_geneFile.size() == 0) { fprintf(stderr, "Have to provide --geneFile to extract by gene.\n"); abort(); } LineReader lr(FLAG_geneFile); std::vector<std::string> fd; while (lr.readLineBySep(&fd, "\t ")) { if (FLAG_geneName != fd[0]) continue; fd[2] = chopChr(fd[2]); // chop "chr1" to "1" if (geneRange.find(fd[0]) == geneRange.end()) { geneRange[fd[0]] = fd[2] + ":" + fd[4] + "-" + fd[5]; } else { geneRange[fd[0]] += "," + fd[2] + ":" + fd[4] + "-" + fd[5]; } }; } std::string range; for (std::map<std::string, std::string>::iterator it = geneRange.begin(); it != geneRange.end(); it++) { if (range.size() > 0) { range += ","; } range += it->second; }; fprintf(stderr, "range = %s\n", range.c_str()); vin.setRangeList(range.c_str()); Regex regex; if (FLAG_annoType.size()) { regex.readPattern(FLAG_annoType); } // print header std::vector<std::string> names; vin.getVCFHeader()->getPeopleName(&names); printf("CHROM\tPOS"); for (unsigned int i = 0; i < names.size(); i++) { printf("\t%s", names[i].c_str()); } printf("\n"); // real working part int nonVariantSite = 0; while (vin.readRecord()) { VCFRecord& r = vin.getVCFRecord(); VCFPeople& people = r.getPeople(); VCFIndividual* indv; if (FLAG_variantOnly) { bool hasVariant = false; int geno; int GTidx = r.getFormatIndex("GT"); for (size_t i = 0; i < people.size(); i++) { indv = people[i]; geno = indv->justGet(GTidx).getGenotype(); if (geno != 0 && geno != MISSING_GENOTYPE) hasVariant = true; } if (!hasVariant) { nonVariantSite++; continue; } } if (FLAG_annoType.size()) { bool isMissing = false; const char* tag = r.getInfoTag("ANNO", &isMissing).toStr(); if (isMissing) continue; // fprintf(stdout, "ANNO = %s", tag); bool match = regex.match(tag); // fprintf(stdout, " %s \t", match ? 
"match": "noMatch"); // fprintf(stdout, " %s \n", exists ? "exists": "missing"); if (!match) { continue; } } fprintf(stdout, "%s\t%s", r.getChrom(), r.getPosStr()); for (size_t i = 0; i < people.size(); i++) { indv = people[i]; fprintf(stdout, "\t%d", indv->justGet(0).getGenotype()); } fprintf(stdout, "\n"); }; currentTime = time(0); fprintf(stderr, "Analysis ends at: %s", ctime(¤tTime)); return 0; };