// The VcfRecord passed in should already be set with a record. bool findPos(bool newChrom, const char* chrom1, int pos1, VcfRecord& record2, VcfFileReader& vcf2) { const char* chrom2 = record2.getChromStr(); int pos2 = record2.get1BasedPosition(); // Loop until the chrom/pos is found in vcf2. bool sameChrom = (strcmp(chrom2, chrom1) == 0); while(((pos2 < pos1) && sameChrom) || (newChrom && !sameChrom)) { if(vcf2.readRecord(record2)) { chrom2 = record2.getChromStr(); pos2 = record2.get1BasedPosition(); sameChrom = (strcmp(chrom2, chrom1) == 0); } else { // no more records. chrom2 = NULL; pos2 = UNSET_POS; return(false); } } // If we wind up here, chrom2 is either at the correct // position, or it is past the correct position. if((!sameChrom) || (pos2 != pos1)) { // Position not found. return(false); } return(true); }
int VcfSplit::execute(int argc, char **argv) { String refFile = ""; String inputVcf = ""; String outputVcfBase = ""; String refName = ""; bool uncompress = false; bool params = false; bool noeof = false; // Read in the parameters. ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inputVcf) LONG_STRINGPARAMETER("obase", &outputVcfBase) LONG_PARAMETER_GROUP("Optional Parameters") LONG_PARAMETER("uncompress", &uncompress) LONG_STRINGPARAMETER("refName", &refName) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // Check that all files were specified. if(inputVcf == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--in\", a required parameter.\n\n"; return(-1); } if(outputVcfBase == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--obase\", a required parameter.\n\n"; return(-1); } outputVcfBase += "."; if(params) { inputParameters.Status(); } // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } VcfFileReader inFile; std::map<std::string, VcfFileWriter*> outFiles; VcfHeader header; // Open the file. inFile.open(inputVcf, header); if(refName != "") { inFile.setReadSection(refName.c_str()); } VcfRecord record; int numRecords = 0; std::string prevChr = ""; std::string chr = ""; VcfFileWriter* outFilePtr = 0; std::string outName = ""; while(inFile.readRecord(record)) { ++numRecords; chr = record.getChromStr(); if((outFilePtr == 0) || (chr != prevChr)) { outFilePtr = outFiles[chr]; if(outFilePtr == 0) { outFilePtr = new VcfFileWriter(); outFiles[chr] = outFilePtr; outName = outputVcfBase.c_str(); if(chr.substr(0,3) != "chr") { outName += "chr"; } outName += chr + ".vcf"; // chr not in outFile list. if(uncompress) { outFilePtr->open(outName.c_str(), header, InputFile::DEFAULT); } else { outName += ".gz"; outFilePtr->open(outName.c_str(), header); } } } outFilePtr->writeRecord(record); } inFile.close(); for (std::map<std::string,VcfFileWriter*>::iterator it = outFiles.begin(); it != outFiles.end(); ++it) { if(it->second != 0) { it->second->close(); it->second = 0; } } std::cerr << "NumRecords: " << numRecords << "\n"; return(0); }
bool VcfFileReader::readRecord(VcfRecord& record, VcfSubsetSamples* subset) { myStatus = StatGenStatus::SUCCESS; // Subset the read if there are subsets specified. VcfSubsetSamples* subsetPtr = subset; if((subsetPtr == NULL) && myUseSubset) { subsetPtr = &mySampleSubset; } // Check to see if a new region has been set. If so, setup for that region. bool searchChrom = false; if(myNewSection) { if(myVcfIndex != NULL) { // Have an index file so use if(!processNewSection()) { // processNewSection sets the status appropriately on failure. return(false); } } else if(myTotalRead == 0) { // ReadSection without an index only works if no records // have been read. searchChrom = true; myNewSection = false; } else { myNewSection = false; myStatus.setStatus(StatGenStatus::FAIL_ORDER, "Cannot set read section with no index after reading records"); return(false); } } // Keep looping until a desired record is found. bool recordFound = false; while(!recordFound) { if(!record.read(myFilePtr, mySiteOnly, myRecordDiscardRules, subsetPtr)) { myStatus = record.getStatus(); myTotalRead += myRecordDiscardRules.getNumDiscarded(); myNumRecords += myRecordDiscardRules.getNumDiscarded(); myRecordDiscardRules.clearNumDiscarded(); return(false); } ++myTotalRead; myTotalRead += myRecordDiscardRules.getNumDiscarded(); // Check to see if the record is in the section. // First check the chromosome. if(!mySectionChrom.empty() && (mySectionChrom != record.getChromStr())) { if(searchChrom) { // Still searching for the chromosome, so continue // to the next record. continue; } // Record is not within the correct chromosome, so return failure. myStatus = StatGenStatus::NO_MORE_RECS; return(false); } searchChrom = false; // Check if the record is after the section end if applicable. if((mySection1BasedEndPos != -1) && (record.get1BasedPosition() >= mySection1BasedEndPos)) { myStatus = StatGenStatus::NO_MORE_RECS; return(false); } // Check if the record is prior to the section start if applicable. // Determinine the VCF record end position. // If we are not requiring overlap, then we only need to check // the start position, but if overlap is required, then it needs // to incrment the start by the length-1. int numIncBases = 0; if(mySectionOverlap) { // The VCF record end position is the start position + length of the // reference string - 1. numIncBases = record.getNumRefBases() - 1; } if((mySection1BasedStartPos != -1) && ((record.get1BasedPosition() + numIncBases) < mySection1BasedStartPos)) { // This record is prior to the section, so keep reading. continue; } ++myNumRecords; myNumRecords += myRecordDiscardRules.getNumDiscarded(); myRecordDiscardRules.clearNumDiscarded(); // Record successfully read, so check to see if it is discarded. if((myDiscardRules & DISCARD_NON_PHASED) && !record.allPhased()) { // Not all samples are phased, so discard this record. continue; } if((myDiscardRules & DISCARD_MISSING_GT) && !record.hasAllGenotypeAlleles()) { // discard missing GTs and this record had missing alleles, // so keep reading. continue; } if((myDiscardRules & DISCARD_FILTERED) && !(record.getFilter().passedAllFilters())) { // Record was filtered, so discard it. continue; } if((myDiscardRules & DISCARD_MULTIPLE_ALTS) && (record.getNumAlts() > 1)) { // Record had multiple alternates, so discard. continue; } // Check allele counts for discarding. if(myMinAltAlleleCount != UNSET_MIN_ALT_ALLELE_COUNT) { // Count the number of alternates. int32_t altCount = 0; for(int sampleNum = 0; sampleNum < record.getNumSamples(); sampleNum++) { if((myAltAlleleCountSubset != NULL) && !(myAltAlleleCountSubset->keep(sampleNum))) { // Skip this sample. continue; } for(int gtNum = 0; gtNum < record.getNumGTs(sampleNum); gtNum++) { if(record.getGT(sampleNum, gtNum) > 0) { // Alternate, so increment the count. ++altCount; } } } if(altCount < myMinAltAlleleCount) { // Not enough alternates so continue to the next sample. continue; } } // Check to see if the minimum alternate allele count is met. if(myMinMinorAlleleCount != UNSET_MIN_MINOR_ALLELE_COUNT) { // Get the number of possible alternates. unsigned int numAlts = record.getNumAlts(); // Verify that each allele has the min count. bool failMinorAlleleCount = false; for(unsigned int i = 0; i <= numAlts; i++) { if(record.getAlleleCount(i, myMinorAlleleCountSubset) < myMinMinorAlleleCount) { // Not enough of one gt, so not ok. failMinorAlleleCount = true; break; } } if(failMinorAlleleCount) { // not enough alleles, so continue to the next record. continue; } } // Record was not discarded. recordFound = true; } // Increment the number of kept records. ++myNumKeptRecords; return(true); }