int VcfCleaner::execute(int argc, char **argv) { String refFile = ""; String inputVcf = ""; String outputVcf = ""; bool uncompress = false; bool params = false; // Read in the parameters. ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inputVcf) LONG_STRINGPARAMETER("out", &outputVcf) LONG_PARAMETER_GROUP("Optional Parameters") LONG_PARAMETER("uncompress", &uncompress) LONG_PARAMETER("params", ¶ms) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // Check that all files were specified. if(inputVcf == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--in\", a required parameter.\n\n"; return(-1); } if(outputVcf == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--out\", a required parameter.\n\n"; return(-1); } if(params) { inputParameters.Status(); } VcfFileReader inFile; VcfFileWriter outFile; VcfHeader header; VcfRecord record; // Open the file. inFile.open(inputVcf, header); if(uncompress) { outFile.open(outputVcf, header, InputFile::DEFAULT); } else { outFile.open(outputVcf, header); } int numReadRecords = 0; int numWrittenRecords = 0; int returnVal = 0; // Set to only store/write the GT field. VcfRecordGenotype::addStoreField("GT"); while(inFile.readRecord(record)) { ++numReadRecords; // Check if any samples are missing GT or if any are not phased. if(!record.hasAllGenotypeAlleles() || !record.allPhased()) { // Missing a GT or not phased, so continue without writing. continue; } // Clear the INFO field. record.getInfo().clear(); // Write the record. if(!outFile.writeRecord(record)) { // Write error. std::cerr << "Failed writing a vcf record.\n"; returnVal = -1; } ++numWrittenRecords; } inFile.close(); outFile.close(); std::cerr << "NumReadRecords: " << numReadRecords << "; NumWrittenRecords: " << numWrittenRecords << "\n"; return(returnVal); }
bool VcfFileReader::readRecord(VcfRecord& record, VcfSubsetSamples* subset) { myStatus = StatGenStatus::SUCCESS; // Subset the read if there are subsets specified. VcfSubsetSamples* subsetPtr = subset; if((subsetPtr == NULL) && myUseSubset) { subsetPtr = &mySampleSubset; } // Check to see if a new region has been set. If so, setup for that region. bool searchChrom = false; if(myNewSection) { if(myVcfIndex != NULL) { // Have an index file so use if(!processNewSection()) { // processNewSection sets the status appropriately on failure. return(false); } } else if(myTotalRead == 0) { // ReadSection without an index only works if no records // have been read. searchChrom = true; myNewSection = false; } else { myNewSection = false; myStatus.setStatus(StatGenStatus::FAIL_ORDER, "Cannot set read section with no index after reading records"); return(false); } } // Keep looping until a desired record is found. bool recordFound = false; while(!recordFound) { if(!record.read(myFilePtr, mySiteOnly, myRecordDiscardRules, subsetPtr)) { myStatus = record.getStatus(); myTotalRead += myRecordDiscardRules.getNumDiscarded(); myNumRecords += myRecordDiscardRules.getNumDiscarded(); myRecordDiscardRules.clearNumDiscarded(); return(false); } ++myTotalRead; myTotalRead += myRecordDiscardRules.getNumDiscarded(); // Check to see if the record is in the section. // First check the chromosome. if(!mySectionChrom.empty() && (mySectionChrom != record.getChromStr())) { if(searchChrom) { // Still searching for the chromosome, so continue // to the next record. continue; } // Record is not within the correct chromosome, so return failure. myStatus = StatGenStatus::NO_MORE_RECS; return(false); } searchChrom = false; // Check if the record is after the section end if applicable. if((mySection1BasedEndPos != -1) && (record.get1BasedPosition() >= mySection1BasedEndPos)) { myStatus = StatGenStatus::NO_MORE_RECS; return(false); } // Check if the record is prior to the section start if applicable. // Determinine the VCF record end position. // If we are not requiring overlap, then we only need to check // the start position, but if overlap is required, then it needs // to incrment the start by the length-1. int numIncBases = 0; if(mySectionOverlap) { // The VCF record end position is the start position + length of the // reference string - 1. numIncBases = record.getNumRefBases() - 1; } if((mySection1BasedStartPos != -1) && ((record.get1BasedPosition() + numIncBases) < mySection1BasedStartPos)) { // This record is prior to the section, so keep reading. continue; } ++myNumRecords; myNumRecords += myRecordDiscardRules.getNumDiscarded(); myRecordDiscardRules.clearNumDiscarded(); // Record successfully read, so check to see if it is discarded. if((myDiscardRules & DISCARD_NON_PHASED) && !record.allPhased()) { // Not all samples are phased, so discard this record. continue; } if((myDiscardRules & DISCARD_MISSING_GT) && !record.hasAllGenotypeAlleles()) { // discard missing GTs and this record had missing alleles, // so keep reading. continue; } if((myDiscardRules & DISCARD_FILTERED) && !(record.getFilter().passedAllFilters())) { // Record was filtered, so discard it. continue; } if((myDiscardRules & DISCARD_MULTIPLE_ALTS) && (record.getNumAlts() > 1)) { // Record had multiple alternates, so discard. continue; } // Check allele counts for discarding. if(myMinAltAlleleCount != UNSET_MIN_ALT_ALLELE_COUNT) { // Count the number of alternates. int32_t altCount = 0; for(int sampleNum = 0; sampleNum < record.getNumSamples(); sampleNum++) { if((myAltAlleleCountSubset != NULL) && !(myAltAlleleCountSubset->keep(sampleNum))) { // Skip this sample. continue; } for(int gtNum = 0; gtNum < record.getNumGTs(sampleNum); gtNum++) { if(record.getGT(sampleNum, gtNum) > 0) { // Alternate, so increment the count. ++altCount; } } } if(altCount < myMinAltAlleleCount) { // Not enough alternates so continue to the next sample. continue; } } // Check to see if the minimum alternate allele count is met. if(myMinMinorAlleleCount != UNSET_MIN_MINOR_ALLELE_COUNT) { // Get the number of possible alternates. unsigned int numAlts = record.getNumAlts(); // Verify that each allele has the min count. bool failMinorAlleleCount = false; for(unsigned int i = 0; i <= numAlts; i++) { if(record.getAlleleCount(i, myMinorAlleleCountSubset) < myMinMinorAlleleCount) { // Not enough of one gt, so not ok. failMinorAlleleCount = true; break; } } if(failMinorAlleleCount) { // not enough alleles, so continue to the next record. continue; } } // Record was not discarded. recordFound = true; } // Increment the number of kept records. ++myNumKeptRecords; return(true); }