// The VcfRecord passed in should already be set with a record. bool findPos(bool newChrom, const char* chrom1, int pos1, VcfRecord& record2, VcfFileReader& vcf2) { const char* chrom2 = record2.getChromStr(); int pos2 = record2.get1BasedPosition(); // Loop until the chrom/pos is found in vcf2. bool sameChrom = (strcmp(chrom2, chrom1) == 0); while(((pos2 < pos1) && sameChrom) || (newChrom && !sameChrom)) { if(vcf2.readRecord(record2)) { chrom2 = record2.getChromStr(); pos2 = record2.get1BasedPosition(); sameChrom = (strcmp(chrom2, chrom1) == 0); } else { // no more records. chrom2 = NULL; pos2 = UNSET_POS; return(false); } } // If we wind up here, chrom2 is either at the correct // position, or it is past the correct position. if((!sameChrom) || (pos2 != pos1)) { // Position not found. return(false); } return(true); }
// Write a single record to this writer's file.
//
// The record is written through myFilePtr, honoring mySiteOnly.  On success
// the written-record counter is incremented and true is returned.  On failure
// the record's status is copied into myStatus and false is returned.
bool VcfFileWriter::writeRecord(VcfRecord& record)
{
    if(record.write(myFilePtr, mySiteOnly))
    {
        ++myNumRecords;
        return(true);
    }

    // Surface the reason the record could not be written.
    myStatus = record.getStatus();
    return(false);
}
// Finalize one block record and write it to the output.
//
// is_in_region: when true the record is passed through make_record_haploid
//               before it is written.
// end:          block end; stored in the INFO "END" key only when it is
//               greater than the record's own position, otherwise any
//               existing "END" key is removed.
// vcfr:         record to update and write.
void process_block(const bool is_in_region, const unsigned end, VcfRecord& vcfr) const
{
    if(!(end>vcfr.GetPos()))
    {
        // The block does not extend past the record position: no END value.
        vcfr.DeleteInfoKeyVal("END");
    }
    else
    {
        vcfr.SetInfoVal("END",_intstr.get32(end));
    }

    if(is_in_region)
    {
        make_record_haploid(vcfr);
    }

    vcfr.WriteUnaltered(_opt.outfp);
}
// Convert a diploid sample genotype in vcfr to a haploid one.
//
// Records without a GT value, or whose parsed GT does not have exactly two
// alleles, are left untouched.  When the two alleles agree, GT is replaced by
// that single allele ("." if the allele index is negative) and any PL value
// is moved to the tag named by _shopt.orig_pl_tag.  When the two alleles
// differ, the filter named by _shopt.haploid_conflict_label is appended.
void make_record_haploid(VcfRecord& vcfr) const
{
    const char* gt_str(vcfr.GetSampleVal("GT"));
    if(NULL == gt_str) return;

    parse_gt(gt_str,_gti);

    // Only diploid records are candidates for conversion.
    if(_gti.size() != 2) return;

    if(_gti[0] != _gti[1])
    {
        // Conflicting alleles cannot be collapsed: flag the record instead.
        vcfr.AppendFilter(_shopt.haploid_conflict_label.c_str());
        return;
    }

    // Change GT to the single shared allele.
    static const char* unknown(".");
    const char* haploid_gt(unknown);
    if(_gti[0]>=0)
    {
        haploid_gt=_intstr.get32(_gti[0]);
    }
    vcfr.SetSampleVal("GT",haploid_gt);

    // Move the PL field to the 'backup' original-PL field.
    const char* pl_val(vcfr.GetSampleVal("PL"));
    if(NULL == pl_val) return;

    vcfr.SetSampleVal(_shopt.orig_pl_tag.c_str(),pl_val);
    vcfr.DeleteSampleKeyVal("PL");
}
void ResetVcfRecordFromBam( VcfRecord & vcf_rec, RefStats & rstats, vector<RefSeq*> & REF_SEQ, string & chr, int center, SamFile & samIn, SamFileHeader & samHeader ) { vector<int> raw_counts; // read-type count vector raw_counts.resize(18, 0); setReadCountInSection( raw_counts, chr, center, samIn, samHeader, REF_SEQ ); // set from bam // set gl & if 1/1 or 1/0, set breakpoint vector<MergeCell> new_vec; new_vec.resize(1); MergeCellPtr merge_ptr = new_vec.begin(); merge_ptr->dups = 1; merge_ptr->counts = raw_counts; raw_counts.clear(); merge_ptr->GL.resize(3,0); rstats.SetRecordGL( merge_ptr ); vcf_rec.SetChrName( chr ); vcf_rec.SetPosition( center ); vcf_rec.UpdateFromMergeCellPtr( merge_ptr ); // if need to refine break point && exist MEI, do refine if (REFINE_BREAK_POINT && vcf_rec.GetDosage() > 0) vcf_rec.SetBreakPointAndCIFromBam( samIn, samHeader ); }
int VcfCleaner::execute(int argc, char **argv) { String refFile = ""; String inputVcf = ""; String outputVcf = ""; bool uncompress = false; bool params = false; // Read in the parameters. ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inputVcf) LONG_STRINGPARAMETER("out", &outputVcf) LONG_PARAMETER_GROUP("Optional Parameters") LONG_PARAMETER("uncompress", &uncompress) LONG_PARAMETER("params", ¶ms) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // Check that all files were specified. if(inputVcf == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--in\", a required parameter.\n\n"; return(-1); } if(outputVcf == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--out\", a required parameter.\n\n"; return(-1); } if(params) { inputParameters.Status(); } VcfFileReader inFile; VcfFileWriter outFile; VcfHeader header; VcfRecord record; // Open the file. inFile.open(inputVcf, header); if(uncompress) { outFile.open(outputVcf, header, InputFile::DEFAULT); } else { outFile.open(outputVcf, header); } int numReadRecords = 0; int numWrittenRecords = 0; int returnVal = 0; // Set to only store/write the GT field. VcfRecordGenotype::addStoreField("GT"); while(inFile.readRecord(record)) { ++numReadRecords; // Check if any samples are missing GT or if any are not phased. if(!record.hasAllGenotypeAlleles() || !record.allPhased()) { // Missing a GT or not phased, so continue without writing. continue; } // Clear the INFO field. record.getInfo().clear(); // Write the record. if(!outFile.writeRecord(record)) { // Write error. 
std::cerr << "Failed writing a vcf record.\n"; returnVal = -1; } ++numWrittenRecords; } inFile.close(); outFile.close(); std::cerr << "NumReadRecords: " << numReadRecords << "; NumWrittenRecords: " << numWrittenRecords << "\n"; return(returnVal); }
int VcfSplit::execute(int argc, char **argv) { String refFile = ""; String inputVcf = ""; String outputVcfBase = ""; String refName = ""; bool uncompress = false; bool params = false; bool noeof = false; // Read in the parameters. ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inputVcf) LONG_STRINGPARAMETER("obase", &outputVcfBase) LONG_PARAMETER_GROUP("Optional Parameters") LONG_PARAMETER("uncompress", &uncompress) LONG_STRINGPARAMETER("refName", &refName) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // Check that all files were specified. if(inputVcf == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--in\", a required parameter.\n\n"; return(-1); } if(outputVcfBase == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--obase\", a required parameter.\n\n"; return(-1); } outputVcfBase += "."; if(params) { inputParameters.Status(); } // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } VcfFileReader inFile; std::map<std::string, VcfFileWriter*> outFiles; VcfHeader header; // Open the file. 
inFile.open(inputVcf, header); if(refName != "") { inFile.setReadSection(refName.c_str()); } VcfRecord record; int numRecords = 0; std::string prevChr = ""; std::string chr = ""; VcfFileWriter* outFilePtr = 0; std::string outName = ""; while(inFile.readRecord(record)) { ++numRecords; chr = record.getChromStr(); if((outFilePtr == 0) || (chr != prevChr)) { outFilePtr = outFiles[chr]; if(outFilePtr == 0) { outFilePtr = new VcfFileWriter(); outFiles[chr] = outFilePtr; outName = outputVcfBase.c_str(); if(chr.substr(0,3) != "chr") { outName += "chr"; } outName += chr + ".vcf"; // chr not in outFile list. if(uncompress) { outFilePtr->open(outName.c_str(), header, InputFile::DEFAULT); } else { outName += ".gz"; outFilePtr->open(outName.c_str(), header); } } } outFilePtr->writeRecord(record); } inFile.close(); for (std::map<std::string,VcfFileWriter*>::iterator it = outFiles.begin(); it != outFiles.end(); ++it) { if(it->second != 0) { it->second->close(); it->second = 0; } } std::cerr << "NumRecords: " << numRecords << "\n"; return(0); }
void process_block(const bool is_in_region, const unsigned end, VcfRecord& vcfr) const { if(! is_in_region) { if(end>vcfr.GetPos()) { vcfr.SetInfoVal("END",_intstr.get32(end)); } else { vcfr.DeleteInfoKeyVal("END"); } vcfr.WriteUnaltered(_opt.outfp); } else { vcfr.DeleteInfoKeyVal("END"); vcfr.WriteUnaltered(_opt.outfp); while(end>vcfr.GetPos()) { const int next_pos(vcfr.GetPos()+1); vcfr.SetPos(next_pos); vcfr.SetRef(_scp.get_char(vcfr.GetChrom().c_str(),next_pos)); vcfr.WriteUnaltered(_opt.outfp); } } }
bool VcfFileReader::readRecord(VcfRecord& record, VcfSubsetSamples* subset) { myStatus = StatGenStatus::SUCCESS; // Subset the read if there are subsets specified. VcfSubsetSamples* subsetPtr = subset; if((subsetPtr == NULL) && myUseSubset) { subsetPtr = &mySampleSubset; } // Check to see if a new region has been set. If so, setup for that region. bool searchChrom = false; if(myNewSection) { if(myVcfIndex != NULL) { // Have an index file so use if(!processNewSection()) { // processNewSection sets the status appropriately on failure. return(false); } } else if(myTotalRead == 0) { // ReadSection without an index only works if no records // have been read. searchChrom = true; myNewSection = false; } else { myNewSection = false; myStatus.setStatus(StatGenStatus::FAIL_ORDER, "Cannot set read section with no index after reading records"); return(false); } } // Keep looping until a desired record is found. bool recordFound = false; while(!recordFound) { if(!record.read(myFilePtr, mySiteOnly, myRecordDiscardRules, subsetPtr)) { myStatus = record.getStatus(); myTotalRead += myRecordDiscardRules.getNumDiscarded(); myNumRecords += myRecordDiscardRules.getNumDiscarded(); myRecordDiscardRules.clearNumDiscarded(); return(false); } ++myTotalRead; myTotalRead += myRecordDiscardRules.getNumDiscarded(); // Check to see if the record is in the section. // First check the chromosome. if(!mySectionChrom.empty() && (mySectionChrom != record.getChromStr())) { if(searchChrom) { // Still searching for the chromosome, so continue // to the next record. continue; } // Record is not within the correct chromosome, so return failure. myStatus = StatGenStatus::NO_MORE_RECS; return(false); } searchChrom = false; // Check if the record is after the section end if applicable. if((mySection1BasedEndPos != -1) && (record.get1BasedPosition() >= mySection1BasedEndPos)) { myStatus = StatGenStatus::NO_MORE_RECS; return(false); } // Check if the record is prior to the section start if applicable. 
// Determinine the VCF record end position. // If we are not requiring overlap, then we only need to check // the start position, but if overlap is required, then it needs // to incrment the start by the length-1. int numIncBases = 0; if(mySectionOverlap) { // The VCF record end position is the start position + length of the // reference string - 1. numIncBases = record.getNumRefBases() - 1; } if((mySection1BasedStartPos != -1) && ((record.get1BasedPosition() + numIncBases) < mySection1BasedStartPos)) { // This record is prior to the section, so keep reading. continue; } ++myNumRecords; myNumRecords += myRecordDiscardRules.getNumDiscarded(); myRecordDiscardRules.clearNumDiscarded(); // Record successfully read, so check to see if it is discarded. if((myDiscardRules & DISCARD_NON_PHASED) && !record.allPhased()) { // Not all samples are phased, so discard this record. continue; } if((myDiscardRules & DISCARD_MISSING_GT) && !record.hasAllGenotypeAlleles()) { // discard missing GTs and this record had missing alleles, // so keep reading. continue; } if((myDiscardRules & DISCARD_FILTERED) && !(record.getFilter().passedAllFilters())) { // Record was filtered, so discard it. continue; } if((myDiscardRules & DISCARD_MULTIPLE_ALTS) && (record.getNumAlts() > 1)) { // Record had multiple alternates, so discard. continue; } // Check allele counts for discarding. if(myMinAltAlleleCount != UNSET_MIN_ALT_ALLELE_COUNT) { // Count the number of alternates. int32_t altCount = 0; for(int sampleNum = 0; sampleNum < record.getNumSamples(); sampleNum++) { if((myAltAlleleCountSubset != NULL) && !(myAltAlleleCountSubset->keep(sampleNum))) { // Skip this sample. continue; } for(int gtNum = 0; gtNum < record.getNumGTs(sampleNum); gtNum++) { if(record.getGT(sampleNum, gtNum) > 0) { // Alternate, so increment the count. ++altCount; } } } if(altCount < myMinAltAlleleCount) { // Not enough alternates so continue to the next sample. 
continue; } } // Check to see if the minimum alternate allele count is met. if(myMinMinorAlleleCount != UNSET_MIN_MINOR_ALLELE_COUNT) { // Get the number of possible alternates. unsigned int numAlts = record.getNumAlts(); // Verify that each allele has the min count. bool failMinorAlleleCount = false; for(unsigned int i = 0; i <= numAlts; i++) { if(record.getAlleleCount(i, myMinorAlleleCountSubset) < myMinMinorAlleleCount) { // Not enough of one gt, so not ok. failMinorAlleleCount = true; break; } } if(failMinorAlleleCount) { // not enough alleles, so continue to the next record. continue; } } // Record was not discarded. recordFound = true; } // Increment the number of kept records. ++myNumKeptRecords; return(true); }
int VcfMac::execute(int argc, char **argv) { String inputVcf = ""; int minAC = -1; String sampleSubset = ""; String filterList = ""; bool params = false; IntervalTree<int> regions; std::vector<int> intersection; // Read in the parameters. ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inputVcf) LONG_PARAMETER_GROUP("Optional Parameters") LONG_STRINGPARAMETER("sampleSubset", &sampleSubset) LONG_INTPARAMETER("minAC", &minAC) LONG_STRINGPARAMETER("filterList", &filterList) LONG_PARAMETER("params", ¶ms) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // Check that all files were specified. if(inputVcf == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--in\", a required parameter.\n\n"; return(-1); } if(params) { inputParameters.Status(); } // Open the two input files. VcfFileReader inFile; VcfHeader header; VcfRecord record; // Open the file if(sampleSubset.IsEmpty()) { inFile.open(inputVcf, header); } else { inFile.open(inputVcf, header, sampleSubset, NULL, NULL); } // Add the discard rule for minor allele count. if(minAC >= 0) { inFile.addDiscardMinMinorAlleleCount(minAC, NULL); } if(!filterList.IsEmpty()) { // Open the filter list. IFILE regionFile = ifopen(filterList, "r"); String regionLine; StringArray regionColumn; int start; int end; int intervalVal = 1; if(regionFile == NULL) { std::cerr << "Failed to open " << filterList << ", so keeping all positions\n"; filterList.Clear(); } else { while( regionFile->isOpen() && !regionFile->ifeof()) { // Read the next interval regionLine.Clear(); regionLine.ReadLine(regionFile); if(regionLine.IsEmpty()) { // Nothing on this line, continue to the next. 
continue; } regionColumn.ReplaceColumns(regionLine, ' '); if(regionColumn.Length() != 2) { std::cerr << "Improperly formatted region line: " << regionLine << "; skipping to the next line.\n"; continue; } // Convert the columns to integers. if(!regionColumn[0].AsInteger(start)) { // The start position (1st column) is not an integer. std::cerr << "Improperly formatted region line, start position " << "(1st column) is not an integer: " << regionColumn[0] << "; Skipping to the next line.\n"; continue; } if(!regionColumn[1].AsInteger(end)) { // The start position (1st column) is not an integer. std::cerr << "Improperly formatted region line, end position " << "(2nd column) is not an integer: " << regionColumn[1] << "; Skipping to the next line.\n"; continue; } // Add 1-based inclusive intervals. regions.add(start,end, intervalVal); } } } int numReadRecords = 0; while( inFile.readRecord(record)) { if(!filterList.IsEmpty()) { // Check if the region should be kept. intersection.clear(); regions.get_intersecting_intervals(record.get1BasedPosition(), intersection); if(intersection.empty()) { // not in the interval, so continue to the next record. continue; } } ++numReadRecords; // Loop through the number of possible alternates. unsigned int numAlts = record.getNumAlts(); int minAlleleCount = -1; int curAlleleCount = 0; int totalAlleleCount = 0; for(unsigned int i = 0; i <= numAlts; i++) { curAlleleCount = record.getAlleleCount(i); if((minAlleleCount == -1) || (curAlleleCount < minAlleleCount)) { minAlleleCount = curAlleleCount; } totalAlleleCount += curAlleleCount; } if(totalAlleleCount != 0) { double maf = (double)minAlleleCount/totalAlleleCount; std::cout << record.getIDStr() << "\t" << minAlleleCount << "\t" << maf << "\n"; } } inFile.close(); // std::cerr << "\n\t# Records: " << numReadRecords << "\n"; // return success. return(0); }