int VcfCleaner::execute(int argc, char **argv) { String refFile = ""; String inputVcf = ""; String outputVcf = ""; bool uncompress = false; bool params = false; // Read in the parameters. ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inputVcf) LONG_STRINGPARAMETER("out", &outputVcf) LONG_PARAMETER_GROUP("Optional Parameters") LONG_PARAMETER("uncompress", &uncompress) LONG_PARAMETER("params", ¶ms) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // Check that all files were specified. if(inputVcf == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--in\", a required parameter.\n\n"; return(-1); } if(outputVcf == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--out\", a required parameter.\n\n"; return(-1); } if(params) { inputParameters.Status(); } VcfFileReader inFile; VcfFileWriter outFile; VcfHeader header; VcfRecord record; // Open the file. inFile.open(inputVcf, header); if(uncompress) { outFile.open(outputVcf, header, InputFile::DEFAULT); } else { outFile.open(outputVcf, header); } int numReadRecords = 0; int numWrittenRecords = 0; int returnVal = 0; // Set to only store/write the GT field. VcfRecordGenotype::addStoreField("GT"); while(inFile.readRecord(record)) { ++numReadRecords; // Check if any samples are missing GT or if any are not phased. if(!record.hasAllGenotypeAlleles() || !record.allPhased()) { // Missing a GT or not phased, so continue without writing. continue; } // Clear the INFO field. record.getInfo().clear(); // Write the record. if(!outFile.writeRecord(record)) { // Write error. std::cerr << "Failed writing a vcf record.\n"; returnVal = -1; } ++numWrittenRecords; } inFile.close(); outFile.close(); std::cerr << "NumReadRecords: " << numReadRecords << "; NumWrittenRecords: " << numWrittenRecords << "\n"; return(returnVal); }
int VcfSplit::execute(int argc, char **argv) { String refFile = ""; String inputVcf = ""; String outputVcfBase = ""; String refName = ""; bool uncompress = false; bool params = false; bool noeof = false; // Read in the parameters. ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inputVcf) LONG_STRINGPARAMETER("obase", &outputVcfBase) LONG_PARAMETER_GROUP("Optional Parameters") LONG_PARAMETER("uncompress", &uncompress) LONG_STRINGPARAMETER("refName", &refName) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // Check that all files were specified. if(inputVcf == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--in\", a required parameter.\n\n"; return(-1); } if(outputVcfBase == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--obase\", a required parameter.\n\n"; return(-1); } outputVcfBase += "."; if(params) { inputParameters.Status(); } // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } VcfFileReader inFile; std::map<std::string, VcfFileWriter*> outFiles; VcfHeader header; // Open the file. inFile.open(inputVcf, header); if(refName != "") { inFile.setReadSection(refName.c_str()); } VcfRecord record; int numRecords = 0; std::string prevChr = ""; std::string chr = ""; VcfFileWriter* outFilePtr = 0; std::string outName = ""; while(inFile.readRecord(record)) { ++numRecords; chr = record.getChromStr(); if((outFilePtr == 0) || (chr != prevChr)) { outFilePtr = outFiles[chr]; if(outFilePtr == 0) { outFilePtr = new VcfFileWriter(); outFiles[chr] = outFilePtr; outName = outputVcfBase.c_str(); if(chr.substr(0,3) != "chr") { outName += "chr"; } outName += chr + ".vcf"; // chr not in outFile list. if(uncompress) { outFilePtr->open(outName.c_str(), header, InputFile::DEFAULT); } else { outName += ".gz"; outFilePtr->open(outName.c_str(), header); } } } outFilePtr->writeRecord(record); } inFile.close(); for (std::map<std::string,VcfFileWriter*>::iterator it = outFiles.begin(); it != outFiles.end(); ++it) { if(it->second != 0) { it->second->close(); it->second = 0; } } std::cerr << "NumRecords: " << numRecords << "\n"; return(0); }
int VcfConvert::execute(int argc, char **argv) { String refFile = ""; String inputVcf = ""; String outputVcf = ""; String refName = ""; bool uncompress = false; bool params = false; bool noeof = false; // Read in the parameters. ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inputVcf) LONG_STRINGPARAMETER("out", &outputVcf) LONG_PARAMETER_GROUP("Optional Parameters") LONG_PARAMETER("uncompress", &uncompress) LONG_STRINGPARAMETER("refName", &refName) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // Check that all files were specified. if(inputVcf == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--in\", a required parameter.\n\n"; return(-1); } if(outputVcf == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--out\", a required parameter.\n\n"; return(-1); } if(params) { inputParameters.Status(); } // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } VcfFileReader inFile; VcfFileWriter outFile; VcfHeader header; // Open the file. inFile.open(inputVcf, header); if(refName != "") { inFile.setReadSection(refName.c_str()); } if(uncompress) { outFile.open(outputVcf, header, InputFile::DEFAULT); } else { outFile.open(outputVcf, header); } VcfRecord record; int numRecords = 0; while(inFile.readRecord(record)) { ++numRecords; outFile.writeRecord(record); } inFile.close(); std::cerr << "NumRecords: " << numRecords << "\n"; return(0); }
int VcfConsensus::execute(int argc, char ** argv) { String vcfName1; String vcfName2; String vcfName3; String outputFileName; bool uncompress = false; bool params = false; // Read in the parameters. ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in1", &vcfName1) LONG_STRINGPARAMETER("in2", &vcfName2) LONG_STRINGPARAMETER("in3", &vcfName3) LONG_STRINGPARAMETER("out", &outputFileName) LONG_PARAMETER_GROUP("Optional Parameters") LONG_PARAMETER("uncompress", &uncompress) LONG_PARAMETER("params", ¶ms) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); std::string gtField = "GT"; VcfFileReader vcf1; VcfFileReader vcf2; VcfFileReader vcf3; VcfHeader header1; VcfHeader header2; VcfHeader header3; VcfRecord record1; VcfRecord record2; VcfRecord record3; VcfRecordGenotype* genotypeInfoPtr1 = NULL; VcfRecordGenotype* genotypeInfoPtr2 = NULL; VcfRecordGenotype* genotypeInfoPtr3 = NULL; unsigned int numMissing2 = 0; unsigned int numMissing3 = 0; unsigned int numMismatchRefAlt = 0; unsigned int numMissingGT1 = 0; const unsigned int myMaxErrors = 4; // Check that the required parameters were set. if(vcfName1 == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--in1\", a required parameter.\n\n"; return(-1); } if(vcfName2 == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--in2\", a required parameter.\n\n"; return(-1); } if(vcfName3 == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--in3\", a required parameter.\n\n"; return(-1); } if(outputFileName == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--out\", a required parameter.\n\n"; return(-1); } if(params) { inputParameters.Status(); } // Open the files. vcf1.open(vcfName1, header1); vcf2.open(vcfName2, header2); vcf3.open(vcfName3, header3); // Setup the sample name maps. int numSamples = header1.getNumSamples(); std::vector<int> sample2Indices; std::vector<int> sample3Indices; std::vector<int> removeIndices; int numSamplesSkipped1 = 0; int numSamplesSkipped2 = 0; int numSamplesSkipped3 = 0; for(int i = 0; i < numSamples; i++) { int sm2Index = header2.getSampleIndex(header1.getSampleName(i)); int sm3Index = header3.getSampleIndex(header1.getSampleName(i)); // Look for this sample name in vcf2. if((sm2Index != -1) && (sm3Index != -1)) { sample2Indices.push_back(sm2Index); sample3Indices.push_back(sm3Index); } else { // Sample not found in all three vcfs. removeIndices.push_back(i); ++numSamplesSkipped1; } } // Remove samples not found in all 3 vcfs from header1. // Remove them in reverse order so they are removed from the end of the header first. VcfSubsetSamples subset1; subset1.init(header1, true); for(int i = (removeIndices.size() - 1); i >= 0; i--) { subset1.addExcludeSample(header1.getSampleName(removeIndices[i])); header1.removeSample(removeIndices[i]); } // Set numSamples to the new number of samples in header1. numSamples = header1.getNumSamples(); // Calculate the number of samples skipped for files 2 & 3. numSamplesSkipped2 = header2.getNumSamples() - sample2Indices.size(); numSamplesSkipped3 = header3.getNumSamples() - sample3Indices.size(); if(numSamplesSkipped1 > 0) { std::cerr << "Skipping " << numSamplesSkipped1 << " samples from --in1\n"; } if(numSamplesSkipped2 > 0) { std::cerr << "Skipping " << numSamplesSkipped2 << " samples from --in2\n"; } if(numSamplesSkipped3 > 0) { std::cerr << "Skipping " << numSamplesSkipped3 << " samples from --in3\n"; } VcfFileWriter outputVcf; // Open and write the header if(uncompress) { outputVcf.open(outputFileName, header1, InputFile::DEFAULT); } else { outputVcf.open(outputFileName, header1); } const char* chrom1 = NULL; int pos1 = UNSET_POS; // Read the first record from vcf2 & vcf3. vcf2.readRecord(record2); vcf3.readRecord(record3); bool newChrom = true; static std::string prevChrom = ""; uint64_t numAllMatch = 0; uint64_t num1Match2Only = 0; uint64_t num1Match3Only = 0; uint64_t num2Match3Only = 0; uint64_t numNoMatches = 0; uint64_t numAllMatch00 = 0; uint64_t num1Match2Only00 = 0; uint64_t num1Match3Only00 = 0; uint64_t num2Match3Only00 = 0; uint64_t numAllMatch01 = 0; uint64_t num1Match2Only01 = 0; uint64_t num1Match3Only01 = 0; uint64_t num2Match3Only01 = 0; uint64_t numAllMatch11 = 0; uint64_t num1Match2Only11 = 0; uint64_t num1Match3Only11 = 0; uint64_t num2Match3Only11 = 0; // Loop through vcf1. while(vcf1.readRecord(record1, &subset1)) { chrom1 = record1.getChromStr(); pos1 = record1.get1BasedPosition(); if(strcmp(chrom1, prevChrom.c_str()) == 0) { newChrom = false; } else { prevChrom = chrom1; newChrom = true; } bool found = true; if(!findPos(newChrom, chrom1, pos1, record2, vcf2)) { // Failed to find the position, continue to the next position if(++numMissing2 <= myMaxErrors) { std::cerr << "Failed to find " << chrom1 << ":" << pos1 << " in " << vcfName2 << ", so skipping this pos\n"; } found = false; } if(!findPos(newChrom, chrom1, pos1, record3, vcf3)) { // Failed to find the position, continue to the next position if(++numMissing3 <= myMaxErrors) { std::cerr << "Failed to find " << chrom1 << ":" << pos1 << " in " << vcfName3 << ", so skipping this pos\n"; } found = false; } if(found == false) { continue; } // Found the position in all files. // Validate that the reference & alternate alleles are the same. const char* ref1 = record1.getRefStr(); const char* alt1 = record1.getAltStr(); if((strcmp(ref1, record2.getRefStr()) != 0) || (strcmp(ref1, record3.getRefStr()) != 0) || (strcmp(alt1, record2.getAltStr()) != 0) || (strcmp(alt1, record3.getAltStr()) != 0)) { if(++numMismatchRefAlt <= myMaxErrors) { std::cerr << "Mismatching ref/alt found at " << chrom1 << ":" << pos1 << ", so skipping this pos\n"; } continue; } // Get the genotype information for each. genotypeInfoPtr1 = &(record1.getGenotypeInfo()); genotypeInfoPtr2 = &(record2.getGenotypeInfo()); genotypeInfoPtr3 = &(record3.getGenotypeInfo()); // Loop through all the samples in vcf1. // Get the Genotype Information. for(int i = 0; i < numSamples; i++) { const std::string* genotypeVal1 = genotypeInfoPtr1->getString(gtField, i); const std::string* genotypeVal2 = genotypeInfoPtr2->getString(gtField, sample2Indices[i]); const std::string* genotypeVal3 = genotypeInfoPtr3->getString(gtField, sample3Indices[i]); // Need to make sure the field was found. if(genotypeVal1 == NULL) { // GT not found in the first record, so just continue. if(++numMissingGT1 <= myMaxErrors) { std::cerr << "Missing GT for " << header1.getSampleName(i) << " in " << vcfName1 << "\n"; } continue; } if(isSame(genotypeVal1, genotypeVal2)) { // genotypeVal1 is majority, so make no change. if(isSame(genotypeVal1, genotypeVal3)) { ++numAllMatch; if(*genotypeVal1 == "0/0") { ++numAllMatch00; } else if((*genotypeVal1 == "0/1") || (*genotypeVal1 == "1/0")) { ++numAllMatch01; } if(*genotypeVal1 == "1/1") { ++numAllMatch11; } } else { ++num1Match2Only; if(*genotypeVal1 == "0/0") { ++num1Match2Only00; } else if((*genotypeVal1 == "0/1") || (*genotypeVal1 == "1/0")) { ++num1Match2Only01; } if(*genotypeVal1 == "1/1") { ++num1Match2Only11; } } } else if(isSame(genotypeVal1, genotypeVal3)) { // genotypeVal1 is majority, so make no change. ++num1Match3Only; if(*genotypeVal1 == "0/0") { ++num1Match3Only00; } else if((*genotypeVal1 == "0/1") || (*genotypeVal1 == "1/0")) { ++num1Match3Only01; } if(*genotypeVal1 == "1/1") { ++num1Match3Only11; } } else if(isSame(genotypeVal2, genotypeVal3)) { // genotypeVal2 is majority, so change genotypeVal1. genotypeInfoPtr1->setString(gtField, i, *genotypeVal2); ++num2Match3Only; if(*genotypeVal2 == "0/0") { ++num2Match3Only00; } else if((*genotypeVal2 == "0/1") || (*genotypeVal2 == "1/0")) { ++num2Match3Only01; } if(*genotypeVal2 == "1/1") { ++num2Match3Only11; } } else { // None are the same so set to "./." genotypeInfoPtr1->setString(gtField, i, "./."); ++numNoMatches; } } // loop back to vcf1 samples. // Write this record. outputVcf.writeRecord(record1); } // loop back to next vcf1 record. std::cerr << "\n"; if(numMissing2 > myMaxErrors) { std::cerr << "Suppressed " << numMissing2 - myMaxErrors << " errors about skipped positions because they were not in " << vcfName2 << "\n"; } if(numMissing3 > myMaxErrors) { std::cerr << "Suppressed " << numMissing3 - myMaxErrors << " errors about skipped positions because they were not in " << vcfName3 << "\n"; } if(numMismatchRefAlt > myMaxErrors) { std::cerr << "Suppressed " << numMismatchRefAlt - myMaxErrors << " errors about mismatched ref/alt positions\n"; } if(numMissingGT1 > myMaxErrors) { std::cerr << "Suppressed " << numMissingGT1 - myMaxErrors << " errors about missing GT for " << vcfName1 << "\n"; } std::cerr << "\n"; // Output the stats. std::cerr << "File1 = " << vcfName1 << std::endl; std::cerr << "File2 = " << vcfName2 << std::endl; std::cerr << "File3 = " << vcfName3 << std::endl; std::cerr << "\nType\tTotal\t0/0\t0/1|1/0\t1/1\n"; std::cerr << "AllMatched" << "\t" << numAllMatch << "\t" << numAllMatch00 << "\t" << numAllMatch01 << "\t" << numAllMatch11 << std::endl; std::cerr << "1matched2" << "\t" << num1Match2Only << "\t" << num1Match2Only00 << "\t" << num1Match2Only01 << "\t" << num1Match2Only11 << std::endl; std::cerr << "1matched3" << "\t" << num1Match3Only << "\t" << num1Match3Only00 << "\t" << num1Match3Only01 << "\t" << num1Match3Only11 << std::endl; std::cerr << "2matched3" << "\t" << num2Match3Only << "\t" << num2Match3Only00 << "\t" << num2Match3Only01 << "\t" << num2Match3Only11 << std::endl; std::cerr << "NoneMatched\t" << numNoMatches << std::endl; return(0); }