// Dump the specified Bam Index file. int DumpIndex::execute(int argc, char **argv) { // Extract command line arguments. static const int UNSPECIFIED_INT = -1; String indexFile = ""; int refID = UNSPECIFIED_INT; bool summary = false; bool params = false; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_STRINGPARAMETER("bamIndex", &indexFile) LONG_INTPARAMETER("refID", &refID) LONG_PARAMETER("summary", &summary) LONG_PARAMETER("params", ¶ms) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // Check to see if the index file was specified, if not, report an error. if(indexFile == "") { usage(); inputParameters.Status(); // mandatory argument was not specified. std::cerr << "Missing mandatory argument: --bamIndex" << std::endl; return(-1); } if(params) { inputParameters.Status(); } // Read the index. BamIndex bamIndex; SamStatus status; status = bamIndex.readIndex(indexFile); if(status != SamStatus::SUCCESS) { // Failed to read the index, return. fprintf(stderr, "%s\n", status.getStatusMessage()); return(status.getStatus()); } // Print the index file. bamIndex.printIndex(refID, summary); return(status.getStatus()); }
int Convert::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String outFile = ""; String refFile = ""; bool lshift = false; bool noeof = false; bool params = false; bool useBases = false; bool useEquals = false; bool useOrigSeq = false; bool recover = false; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_STRINGPARAMETER("in", &inFile) LONG_STRINGPARAMETER("out", &outFile) LONG_STRINGPARAMETER("refFile", &refFile) LONG_PARAMETER("lshift", &lshift) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("recover", &recover) LONG_PARAMETER("params", ¶ms) LONG_PARAMETER_GROUP("SequenceConversion") EXCLUSIVE_PARAMETER("useBases", &useBases) EXCLUSIVE_PARAMETER("useEquals", &useEquals) EXCLUSIVE_PARAMETER("useOrigSeq", &useOrigSeq) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { printUsage(std::cerr); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } if(outFile == "") { printUsage(std::cerr); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--out is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } // Check to see if the ref file was specified. // Open the reference. GenomeSequence* refPtr = NULL; if(refFile != "") { refPtr = new GenomeSequence(refFile); } SamRecord::SequenceTranslation translation; if((useBases) && (refPtr != NULL)) { translation = SamRecord::BASES; } else if((useEquals) && (refPtr != NULL)) { translation = SamRecord::EQUAL; } else { useOrigSeq = true; translation = SamRecord::NONE; } if(params) { inputParameters.Status(); } // Open the input file for reading. SamFile samIn; if(recover) samIn.setAttemptRecovery(true); samIn.OpenForRead(inFile); // Open the output file for writing. SamFile samOut; samOut.OpenForWrite(outFile); samOut.SetWriteSequenceTranslation(translation); samOut.SetReference(refPtr); // Read the sam header. SamFileHeader samHeader; samIn.ReadHeader(samHeader); // Write the sam header. samOut.WriteHeader(samHeader); SamRecord samRecord; // Set returnStatus to success. It will be changed // to the failure reason if any of the writes fail. SamStatus::Status returnStatus = SamStatus::SUCCESS; while(1) { try { // Keep reading records until ReadRecord returns false. while(samIn.ReadRecord(samHeader, samRecord)) { // left shift if necessary. if(lshift) { samRecord.shiftIndelsLeft(); } // Successfully read a record from the file, so write it. if(!samOut.WriteRecord(samHeader, samRecord)) { // Failed to write a record. fprintf(stderr, "%s\n", samOut.GetStatusMessage()); returnStatus = samOut.GetStatus(); } } break; } catch (std::runtime_error e) { std::cerr << "Caught runtime error: " << e.what() << "\n"; if(!recover) { std::cerr << "Corrupted BAM file detected - consider using --recover option.\n"; break; } std::cerr << "Attempting to resync at next good BGZF block and BAM record.\n"; // XXX need to resync SamFile stream here bool rc = samIn.attemptRecoverySync(checkSignature, SIGNATURE_LENGTH); if(rc) { std::cerr << "Successful resync - some data lost.\n"; continue; // succeeded } std::cerr << "Failed to re-sync on data stream.\n"; break; // failed to resync } } std::cerr << std::endl << "Number of records read = " << samIn.GetCurrentRecordCount() << std::endl; std::cerr << "Number of records written = " << samOut.GetCurrentRecordCount() << std::endl; if(refPtr != NULL) { delete(refPtr); } // Since the reads were successful, return the status based // on the status of the writes. If any failed, return // their failure status. return(returnStatus); }
int Bam2FastQ::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; bool readName = false; String refFile = ""; String firstOut = ""; String secondOut = ""; String unpairedOut = ""; bool interleave = false; bool noeof = false; bool gzip = false; bool params = false; myOutBase = ""; myNumMateFailures = 0; myNumPairs = 0; myNumUnpaired = 0; mySplitRG = false; myQField = ""; myNumQualTagErrors = 0; myReverseComp = true; myRNPlus = false; myFirstRNExt = DEFAULT_FIRST_EXT; mySecondRNExt = DEFAULT_SECOND_EXT; myCompression = InputFile::DEFAULT; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inFile) LONG_PARAMETER_GROUP("Optional Parameters") LONG_PARAMETER("readName", &readName) LONG_PARAMETER("splitRG", &mySplitRG) LONG_STRINGPARAMETER("qualField", &myQField) LONG_PARAMETER("merge", &interleave) LONG_STRINGPARAMETER("refFile", &refFile) LONG_STRINGPARAMETER("firstRNExt", &myFirstRNExt) LONG_STRINGPARAMETER("secondRNExt", &mySecondRNExt) LONG_PARAMETER("rnPlus", &myRNPlus) LONG_PARAMETER("noReverseComp", &myReverseComp) LONG_PARAMETER("gzip", &gzip) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PARAMETER_GROUP("Optional OutputFile Names") LONG_STRINGPARAMETER("outBase", &myOutBase) LONG_STRINGPARAMETER("firstOut", &firstOut) LONG_STRINGPARAMETER("secondOut", &secondOut) LONG_STRINGPARAMETER("unpairedOut", &unpairedOut) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } if(gzip) { myCompression = InputFile::GZIP; } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } // Cannot specify both interleaved & secondOut since secondOut would be N/A. if(interleave && !secondOut.IsEmpty()) { usage(); inputParameters.Status(); std::cerr << "ERROR: Cannot specify --merge & --secondOut.\n"; return(-1); } // Cannot specify both interleaved & secondOut since secondOut would be N/A. if(interleave && !secondOut.IsEmpty()) { usage(); inputParameters.Status(); std::cerr << "ERROR: Cannot specify --merge & --secondOut.\n"; return(-1); } // Cannot specify both splitRG & firstOut/secondOut/unpairedOut // since it needs a different file for each RG. if(mySplitRG && (!firstOut.IsEmpty() || !secondOut.IsEmpty() || !unpairedOut.IsEmpty())) { usage(); inputParameters.Status(); std::cerr << "ERROR: Cannot specify --splitRG & --firstOut/--secondOut/--unpairedOut.\n"; std::cerr << "Use --outBase instead.\n"; return(-1); } // Cannot specify splitRG & output to stdout. if(mySplitRG && (myOutBase[0] == '-')) { usage(); inputParameters.Status(); std::cerr << "ERROR: Cannot specify --splitRG & write to stdout.\n"; return(-1); } // Check to see if the out file was specified, if not, generate it from // the input filename. if(myOutBase == "") { // Just remove the extension from the input filename. int extStart = inFile.FastFindLastChar('.'); if(extStart <= 0) { myOutBase = inFile; } else { myOutBase = inFile.Left(extStart); } } if(mySplitRG) { std::string fqList = myOutBase.c_str(); fqList += ".list"; myFqList = ifopen(fqList.c_str(), "w"); ifprintf(myFqList, "MERGE_NAME\tFASTQ1\tFASTQ2\tRG\n"); } // Check to see if the first/second/single-ended were specified and // if not, set them. myFirstFileNameExt = "_1.fastq"; mySecondFileNameExt = "_2.fastq"; myUnpairedFileNameExt = ".fastq"; if(interleave) { myFirstFileNameExt = "_interleaved.fastq"; myFirstFileNameExt = "_interleaved.fastq"; } getFileName(firstOut, myFirstFileNameExt); getFileName(secondOut, mySecondFileNameExt); getFileName(unpairedOut, myUnpairedFileNameExt); if(params) { inputParameters.Status(); } // Open the files for reading/writing. // Open prior to opening the output files, // so if there is an error, the outputs don't get created. SamFile samIn; samIn.OpenForRead(inFile, &mySamHeader); // Skip non-primary reads. samIn.SetReadFlags(0, 0x0100); // Open the output files if not splitting RG if(!mySplitRG) { myUnpairedFile = ifopen(unpairedOut, "w", myCompression); // Only open the first file if it is different than an already opened file. if(firstOut != unpairedOut) { myFirstFile = ifopen(firstOut, "w", myCompression); } else { myFirstFile = myUnpairedFile; } // If it is interleaved or the 2nd file is not a new name, set it appropriately. if(interleave || secondOut == firstOut) { mySecondFile = myFirstFile; } else if(secondOut == unpairedOut) { mySecondFile = myUnpairedFile; } else { mySecondFile = ifopen(secondOut, "w", myCompression); } if(myUnpairedFile == NULL) { std::cerr << "Failed to open " << unpairedOut << " so can't convert bam2FastQ.\n"; return(-1); } if(myFirstFile == NULL) { std::cerr << "Failed to open " << firstOut << " so can't convert bam2FastQ.\n"; return(-1); } if(mySecondFile == NULL) { std::cerr << "Failed to open " << secondOut << " so can't convert bam2FastQ.\n"; return(-1); } } if((readName) || (strcmp(mySamHeader.getSortOrder(), "queryname") == 0)) { readName = true; } else { // defaulting to coordinate sorted. samIn.setSortedValidation(SamFile::COORDINATE); } // Setup the '=' translation if the reference was specified. if(!refFile.IsEmpty()) { GenomeSequence* refPtr = new GenomeSequence(refFile); samIn.SetReadSequenceTranslation(SamRecord::BASES); samIn.SetReference(refPtr); } SamRecord* recordPtr; int16_t samFlag; SamStatus::Status returnStatus = SamStatus::SUCCESS; while(returnStatus == SamStatus::SUCCESS) { recordPtr = myPool.getRecord(); if(recordPtr == NULL) { // Failed to allocate a new record. throw(std::runtime_error("Failed to allocate a new SAM/BAM record")); } if(!samIn.ReadRecord(mySamHeader, *recordPtr)) { // Failed to read a record. returnStatus = samIn.GetStatus(); continue; } // Have a record. Check to see if it is a pair or unpaired read. samFlag = recordPtr->getFlag(); if(SamFlag::isPaired(samFlag)) { if(readName) { handlePairedRN(*recordPtr); } else { handlePairedCoord(*recordPtr); } } else { ++myNumUnpaired; writeFastQ(*recordPtr, myUnpairedFile, myUnpairedFileNameExt); } } // Flush All cleanUpMateMap(0, true); if(returnStatus == SamStatus::NO_MORE_RECS) { returnStatus = SamStatus::SUCCESS; } samIn.Close(); closeFiles(); // Output the results std::cerr << "\nFound " << myNumPairs << " read pairs.\n"; std::cerr << "Found " << myNumUnpaired << " unpaired reads.\n"; if(myNumMateFailures != 0) { std::cerr << "Failed to find mates for " << myNumMateFailures << " reads, so they were written as unpaired\n" << " (not included in either of the above counts).\n"; } if(myNumQualTagErrors != 0) { std::cerr << myNumQualTagErrors << " records did not have tag " << myQField.c_str() << " or it was invalid, so the quality field was used for those records.\n"; } return(returnStatus); }
int VcfMac::execute(int argc, char **argv) { String inputVcf = ""; int minAC = -1; String sampleSubset = ""; String filterList = ""; bool params = false; IntervalTree<int> regions; std::vector<int> intersection; // Read in the parameters. ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inputVcf) LONG_PARAMETER_GROUP("Optional Parameters") LONG_STRINGPARAMETER("sampleSubset", &sampleSubset) LONG_INTPARAMETER("minAC", &minAC) LONG_STRINGPARAMETER("filterList", &filterList) LONG_PARAMETER("params", ¶ms) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // Check that all files were specified. if(inputVcf == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--in\", a required parameter.\n\n"; return(-1); } if(params) { inputParameters.Status(); } // Open the two input files. VcfFileReader inFile; VcfHeader header; VcfRecord record; // Open the file if(sampleSubset.IsEmpty()) { inFile.open(inputVcf, header); } else { inFile.open(inputVcf, header, sampleSubset, NULL, NULL); } // Add the discard rule for minor allele count. if(minAC >= 0) { inFile.addDiscardMinMinorAlleleCount(minAC, NULL); } if(!filterList.IsEmpty()) { // Open the filter list. IFILE regionFile = ifopen(filterList, "r"); String regionLine; StringArray regionColumn; int start; int end; int intervalVal = 1; if(regionFile == NULL) { std::cerr << "Failed to open " << filterList << ", so keeping all positions\n"; filterList.Clear(); } else { while( regionFile->isOpen() && !regionFile->ifeof()) { // Read the next interval regionLine.Clear(); regionLine.ReadLine(regionFile); if(regionLine.IsEmpty()) { // Nothing on this line, continue to the next. continue; } regionColumn.ReplaceColumns(regionLine, ' '); if(regionColumn.Length() != 2) { std::cerr << "Improperly formatted region line: " << regionLine << "; skipping to the next line.\n"; continue; } // Convert the columns to integers. if(!regionColumn[0].AsInteger(start)) { // The start position (1st column) is not an integer. std::cerr << "Improperly formatted region line, start position " << "(1st column) is not an integer: " << regionColumn[0] << "; Skipping to the next line.\n"; continue; } if(!regionColumn[1].AsInteger(end)) { // The start position (1st column) is not an integer. std::cerr << "Improperly formatted region line, end position " << "(2nd column) is not an integer: " << regionColumn[1] << "; Skipping to the next line.\n"; continue; } // Add 1-based inclusive intervals. regions.add(start,end, intervalVal); } } } int numReadRecords = 0; while( inFile.readRecord(record)) { if(!filterList.IsEmpty()) { // Check if the region should be kept. intersection.clear(); regions.get_intersecting_intervals(record.get1BasedPosition(), intersection); if(intersection.empty()) { // not in the interval, so continue to the next record. continue; } } ++numReadRecords; // Loop through the number of possible alternates. unsigned int numAlts = record.getNumAlts(); int minAlleleCount = -1; int curAlleleCount = 0; int totalAlleleCount = 0; for(unsigned int i = 0; i <= numAlts; i++) { curAlleleCount = record.getAlleleCount(i); if((minAlleleCount == -1) || (curAlleleCount < minAlleleCount)) { minAlleleCount = curAlleleCount; } totalAlleleCount += curAlleleCount; } if(totalAlleleCount != 0) { double maf = (double)minAlleleCount/totalAlleleCount; std::cout << record.getIDStr() << "\t" << minAlleleCount << "\t" << maf << "\n"; } } inFile.close(); // std::cerr << "\n\t# Records: " << numReadRecords << "\n"; // return success. return(0); }
int ClipOverlap::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String outFile = ""; String storeOrig = ""; bool readName = false; bool noRNValidate = false; bool stats = false; int poolSize = DEFAULT_POOL_SIZE; bool unmapped = false; bool noeof = false; bool params = false; String excludeFlags = "0xF0C"; // TODO, cleanup legacy parameters ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inFile) LONG_STRINGPARAMETER("out", &outFile) LONG_PARAMETER_GROUP("Optional Parameters") LONG_STRINGPARAMETER("storeOrig", &storeOrig) LONG_PARAMETER("readName", &readName) LONG_PARAMETER ("noRNValidate", &noRNValidate) LONG_PARAMETER ("stats", &stats) LONG_PARAMETER ("overlapsOnly", &myOverlapsOnly) LONG_STRINGPARAMETER ("excludeFlags", &excludeFlags) LONG_PARAMETER("unmapped", &unmapped) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PARAMETER_GROUP("Coordinate Processing Optional Parameters") LONG_INTPARAMETER("poolSize", &poolSize) LONG_PARAMETER("poolSkipOverlap", &myPoolSkipOverlap) LONG_PHONEHOME(VERSION) BEGIN_LEGACY_PARAMETERS() LONG_PARAMETER ("clipsOnly", &myOverlapsOnly) LONG_PARAMETER("poolSkipClip", &myPoolSkipOverlap) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { printUsage(std::cerr); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } // Check to see if the out file was specified, if not, report an error. if(outFile == "") { printUsage(std::cerr); inputParameters.Status(); // Out file was not specified but it is mandatory. std::cerr << "--out is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } if((storeOrig.Length() != 0) && (storeOrig.Length() != 2)) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "--storeOrig tag name must be 2 characters.\n"; return(-1); } myOverlapHandler = new OverlapClipLowerBaseQual(); if(myOverlapHandler == NULL) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "Failed to allocate the overlap handler\n"; return(-1); } if(unmapped) { myOverlapHandler->markAsUnmapped(); } // Setup the overlap handler. myOverlapHandler->keepStats(stats); if(storeOrig.Length() != 0) { myOverlapHandler->storeOrigCigar(storeOrig); } myIntExcludeFlags = excludeFlags.AsInteger(); if(params) { inputParameters.Status(); } // For each step process the file. // Open the files & read/write the sam header. SamStatus::Status runStatus = SamStatus::SUCCESS; for(int i = 1; i <= myOverlapHandler->numSteps(); i++) { // Open the file for reading. mySamHeader.resetHeader(); SamFile samIn(inFile, SamFile::READ, &mySamHeader); SamFile* samOutPtr = NULL; // Check if writing, if so, open the output file. if(i == myOverlapHandler->numSteps()) { samOutPtr = new SamFile(outFile, SamFile::WRITE, &mySamHeader); } if(readName) { if(!noRNValidate) { samIn.setSortedValidation(SamFile::QUERY_NAME); } runStatus = handleSortedByReadName(samIn, samOutPtr); } else { // Coordinate sorted, so work with the pools. samIn.setSortedValidation(SamFile::COORDINATE); myPool.setMaxAllocatedRecs(poolSize); // Reset the number of failures myNumMateFailures = 0; myNumPoolFail = 0; myNumPoolFailNoHandle = 0; myNumPoolFailHandled = 0; myNumOutOfOrder = 0; // Run by coordinate if(samOutPtr != NULL) { // Setup the output buffer for writing. SamCoordOutput outputBuffer(myPool); outputBuffer.setOutputFile(samOutPtr, &mySamHeader); runStatus = handleSortedByCoord(samIn, &outputBuffer); // Cleanup the output buffer. if(!outputBuffer.flushAll()) { std::cerr << "ERROR: Failed to flush the output buffer\n"; runStatus = SamStatus::FAIL_IO; } } else { runStatus = handleSortedByCoord(samIn, NULL); } } if(runStatus != SamStatus::SUCCESS) { break; } // Close the input file, it will be reopened if there are // multiple steps. samIn.Close(); if(samOutPtr != NULL) { samOutPtr->Close(); delete samOutPtr; samOutPtr = NULL; } } // Done processing. // Print Stats myOverlapHandler->printStats(); if(myNumMateFailures != 0) { std::cerr << "WARNING: did not find expected overlapping mates for " << myNumMateFailures << " records." << std::endl; } if(myNumPoolFail != 0) { // Had to skip clipping some records due to running out of // memory and not being able to wait for the mate. std::cerr << "WARNING: " << myNumPoolFail << " record pool failures\n"; if(myNumPoolFailNoHandle != 0) { std::cerr << "Due to hitting the max record poolSize, skipped handling " << myNumPoolFailNoHandle << " records." << std::endl; } if(myNumPoolFailHandled != 0) { std::cerr << "Due to hitting the max record poolSize, default handled " << myNumPoolFailHandled << " records." << std::endl; } if(myNumOutOfOrder != 0) { std::cerr << "WARNING: Resulting File out of Order by " << myNumOutOfOrder << " records.\n"; } } if(runStatus == SamStatus::SUCCESS) { if(myNumPoolFail == 0) { std::cerr << "Completed ClipOverlap Successfully.\n"; } else { runStatus = SamStatus::NO_MORE_RECS; std::cerr << "Completed ClipOverlap with WARNINGS.\n"; } } else { std::cerr << "Failed to complete ClipOverlap.\n"; } return(runStatus); }
int ReadReference::execute(int argc, char **argv) { static const int UNSPECIFIED_INT = -1; String refFile = ""; String refName = ""; int start = UNSPECIFIED_INT; int numBases = UNSPECIFIED_INT; int end = UNSPECIFIED_INT; bool params = false; // Read in the parameters. ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_STRINGPARAMETER("refFile", &refFile) LONG_STRINGPARAMETER("refName", &refName) LONG_INTPARAMETER("start", &start) LONG_INTPARAMETER("end", &end) LONG_INTPARAMETER("numBases", &numBases) LONG_PARAMETER("params", ¶ms) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); if((refName == "") || (start == UNSPECIFIED_INT) || ((end == UNSPECIFIED_INT) && (numBases == UNSPECIFIED_INT))) { usage(); inputParameters.Status(); std::cerr << "Missing Required Parameter\n\n"; return(-1); } if((end != UNSPECIFIED_INT) && (numBases != UNSPECIFIED_INT)) { usage(); inputParameters.Status(); std::cerr << "Only --end or --numBases can be specified\n\n"; return(-1); } else if(numBases != UNSPECIFIED_INT) { end = start + numBases; } if(params) { inputParameters.Status(); } // Open the reference. GenomeSequence reference(refFile); uint32_t refStart = reference.getGenomePosition(refName.c_str()); if(refStart == INVALID_GENOME_INDEX) { std::cerr << "Reference Name: " << refName.c_str() << " not found in the reference file\n"; return(-1); } std::string refString; reference.getString(refString, refStart + start, end - start); std::cout << refString << std::endl; return(0); }
int main(int argc, char ** argv) { printf("glfMultiples -- SNP calls based on .glf or .glz files\n"); printf("(c) 2008-2011 Goncalo Abecasis, Sebastian Zoellner, Yun Li\n\n"); String pedfile; String positionfile; String callfile; String glfAliases; String glfPrefix; String glfSuffix; ParameterList pl; double posterior = 0.50; int mapQuality = 0; int minTotalDepth = 1; int maxTotalDepth = INT_MAX; bool verbose = false; bool mapQualityStrict = false; bool hardFilter = false; bool smartFilter = false; bool softFilter = true; bool robustPrior = true; bool uniformPrior = false; String xLabel("X"), yLabel("Y"), mitoLabel("MT"); int xStart = 2699520, xStop = 154931044; BEGIN_LONG_PARAMETERS(longParameters) LONG_PARAMETER_GROUP("Pedigree File") LONG_STRINGPARAMETER("ped", &pedfile) LONG_PARAMETER_GROUP("Map Quality Filter") LONG_INTPARAMETER("minMapQuality", &mapQuality) LONG_PARAMETER("strict", &mapQualityStrict) LONG_PARAMETER_GROUP("Total Depth Filter") LONG_INTPARAMETER("minDepth", &minTotalDepth) LONG_INTPARAMETER("maxDepth", &maxTotalDepth) LONG_PARAMETER_GROUP("Position Filter") LONG_STRINGPARAMETER("positionFile", &positionfile) LONG_PARAMETER_GROUP("Chromosome Labels") LONG_STRINGPARAMETER("xChr", &xLabel) LONG_STRINGPARAMETER("yChr", &yLabel) LONG_STRINGPARAMETER("mito", &mitoLabel) LONG_INTPARAMETER("xStart", &xStart) LONG_INTPARAMETER("xStop", &xStop) LONG_PARAMETER_GROUP("Filtering Options") EXCLUSIVE_PARAMETER("hardFilter", &hardFilter) EXCLUSIVE_PARAMETER("smartFilter", &smartFilter) EXCLUSIVE_PARAMETER("softFilter", &softFilter) LONG_PARAMETER_GROUP("Prior Options") EXCLUSIVE_PARAMETER("uniformPrior", &uniformPrior) EXCLUSIVE_PARAMETER("robustPrior", &robustPrior) LONG_PARAMETER_GROUP("Output") LONG_PARAMETER("verbose", &verbose) LONG_PARAMETER_GROUP("Sample Names") LONG_STRINGPARAMETER("glfAliases", &glfAliases) LONG_PARAMETER_GROUP("Prefixes and Suffixes") LONG_STRINGPARAMETER("glfPrefix",&glfPrefix) LONG_STRINGPARAMETER("glfSuffix",&glfSuffix) END_LONG_PARAMETERS(); pl.Add(new StringParameter('b', "Base Call File", callfile)); pl.Add(new DoubleParameter('p', "Posterior Threshold", posterior)); pl.Add(new LongParameters("Additional Options", longParameters)); int argstart = pl.ReadWithTrailer(argc, argv) + 1; pl.Status(); if (posterior < 0.50) error("Posterior threshold for genotype calls (-p option) must be > 0.50."); time_t t; time(&t); printf("Analysis started on %s\n", ctime(&t)); fflush(stdout); int n = argc - argstart; argv += argstart; Pedigree ped; if (!pedfile.IsEmpty()) { ped.pd.AddStringColumn("glfFile"); ped.Load(pedfile); n = ped.count; } else if (n == 0) error("No pedigree file present and no glf files listed at the end of command line\n"); // Prior for finding difference from the reference at a particular site //BgzfFileType::setRequireEofBlock(false); double prior = 0.0; for (int i = 1; i <= 2 * n; i++) prior += 1.0 / i; prior *= 0.001; glfHandler * glf = new glfHandler[n]; bool firstGlf = n; if (ped.count) { bool warn = false; for (int i = n - 1; i > 0; i++) { if (!glf[i].Open(ped[i].strings[0])) { printf("Failed to open genotype likelihood file [%s] for individual %s:%s\n", (const char *) ped[i].strings[0], (const char *) ped[i].famid, (const char *) ped[i].pid); glf[i].OpenStub(); firstGlf = i; } if (warn) printf("\n"); if (firstGlf == n) error("No genotype likelihood files could be opened"); } } else { for (int i = firstGlf = 0; i < n; i++) { String glfName = glfPrefix + String(argv[i]) + glfSuffix; if (!glf[i].Open(glfName)) error("Failed to open genotype likelihood file [%s]\n", glfName.c_str()); } } StringAlias aliases; aliases.ReadFromFile(glfAliases); printf("Calling genotypes for files ...\n"); for (int i = 0; i < n; i++) printf("%s\n", ped.count ? (const char *) ped[i].strings[0] : argv[i]); printf("\n"); baseCalls = fopen(callfile, "wt"); if (baseCalls != NULL) { fprintf(baseCalls, "##fileformat=VCFv4.0\n"); ReportDate(baseCalls); fprintf(baseCalls, "##source=glfMultiples\n"); fprintf(baseCalls, "##minDepth=%d\n", minTotalDepth); fprintf(baseCalls, "##maxDepth=%d\n", maxTotalDepth); fprintf(baseCalls, "##minMapQuality=%d\n", mapQuality); fprintf(baseCalls, "##minPosterior=%.4f\n", posterior); fprintf(baseCalls, "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">\n"); fprintf(baseCalls, "##INFO=<ID=MQ,Number=1,Type=Integer,Description=\"Root Mean Squared Mapping Quality\">\n"); fprintf(baseCalls, "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples with coverage\">\n"); fprintf(baseCalls, "##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles (with coverage)\">\n"); fprintf(baseCalls, "##INFO=<ID=AC,Number=.,Type=Integer,Description=\"Alternative allele count (with coverage)\">\n"); fprintf(baseCalls, "##INFO=<ID=AF,Number=.,Type=Float,Description=\"Alternate allele frequency\">\n"); fprintf(baseCalls, "##INFO=<ID=AB,Number=1,Type=Float,Description=\"Estimated allele balance between the alleles\">\n"); if ( mapQuality > 0 ) { fprintf(baseCalls, "##FILTER=<ID=mq%d,Description=\"Mapping Quality less than %d\">\n",mapQuality,mapQuality); } if ( minTotalDepth > 1 ) { fprintf(baseCalls, "##FILTER=<ID=dp%d,Description=\"Total Read Depth less than %d\">\n",minTotalDepth,minTotalDepth); } if ( minTotalDepth < INT_MAX ) { fprintf(baseCalls, "##FILTER=<ID=DP%d,Description=\"Total Read Depth greater than %d\">\n",maxTotalDepth,maxTotalDepth); } fprintf(baseCalls, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Most Likely Genotype\">\n"); fprintf(baseCalls, "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Call Quality\">\n"); fprintf(baseCalls, "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">\n"); fprintf(baseCalls, "##FORMAT=<ID=PL,Number=3,Type=Integer,Description=\"Genotype Likelihoods for Genotypes 0/0,0/1,1/1\">\n"); fprintf(baseCalls, "##FORMAT=<ID=PL3,Number=6,Type=Integer,Description=\"Genotype Likelihoods for Genotypes 0/0,0/1,1/1,0/2,1/2,2/2\">\n"); fprintf(baseCalls, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"); for (int i = 0; i < n; i++) fprintf(baseCalls, "\t%s", ped.count ? (const char *) (ped[i].famid + ":" + ped[i].pid) : (const char *) aliases.GetAlias(argv[i])); fprintf(baseCalls, "\n"); } StringArray buffer, tokens; StringHash positions; buffer.Read(positionfile); for (int i = 0; i < buffer.Length(); i++) { tokens.ReplaceTokens(buffer[i], " \t:"); if (tokens.Length() != 2) continue; positions.Add(tokens[0] + ":" + (int(tokens[1].AsInteger() - 1))); } int chromosomeType = 0; while (glf[firstGlf].NextSection()) { for (int i = firstGlf + 1; i < n; i++) { if (glf[i].isStub) continue; glf[i].NextSection(); if (glf[firstGlf].maxPosition != glf[i].maxPosition || glf[firstGlf].label != glf[i].label) { error("Genotype files '%s' and '%s' are not compatible ...\n" " File '%s' has section %s with %d entries ...\n" " File '%s' section %s with %d entries ...\n", ped.count ? (const char *) ped[firstGlf].strings[0] : argv[firstGlf], ped.count ? (const char *) ped[i].strings[0] : argv[i], ped.count ? (const char *) ped[firstGlf].strings[0] : argv[firstGlf], (const char *) glf[firstGlf].label, glf[firstGlf].maxPosition, ped.count ? (const char *) ped[i].strings[0] : argv[i], (const char *) glf[i].label, glf[i].maxPosition); } } chromosomeType = CT_AUTOSOME; if (ped.count) { if (glf[firstGlf].label == xLabel) chromosomeType = CT_CHRX; if (glf[firstGlf].label == yLabel) chromosomeType = CT_CHRY; if (glf[firstGlf].label == mitoLabel) chromosomeType = CT_MITO; } printf("Processing section %s with %d entries\n", (const char *) glf[firstGlf].label, glf[firstGlf].maxPosition); int refBase = 0; int position = 0; int mapQualityFilter = 0; int depthFilter = 0; int homozygousReference = 0; int transitions = 0; int transversions = 0; int otherPolymorphisms = 0; int sinkFilter = 0; int smartFilterHits = 0; int baseCounts[5] = {0, 0, 0, 0, 0}; String filter; while (true) { if (position > 0) { // Check whether we have reached the end of the current chromosome bool done = true; for (int i = 0; i < n; i++) if (glf[i].data.recordType != 0) done = false; if (done) break; } // Advance to the next position where needed for (int i = 0; i < n; i++) if (glf[i].position == position) glf[i].NextBaseEntry(); // Figure out the current analysis position refBase = glf[0].data.refBase; position = glf[0].position; for (int i = 1; i < n; i++) if (position > glf[i].position) { position = glf[i].position; refBase = glf[i].data.refBase; } // Avoid alignments that extend past the end of the chromosome if (position >= glf[firstGlf].maxPosition) break; baseCounts[(int)refBase]++; // These lines can be uncommented for debugging purposes // for (int i = 0; i < n; i++) // printf("GLF %d : position %d, refBase %d\n", i, position, refBase); // printf("Position: %d, refBase: %d\n", position, refBase); if (positions.Entries()) { filter = glf[firstGlf].label + ":" + position; if (positions.Find(filter) < 0) continue; } if (refBase == 0) continue; // Corrected calculation of root-mean-square Map Quality score // and check if we have at least one sample with good quality data int currentDepth = 0, totalDepth = 0, numCovered = 0; double currentQuality = 0.0, averageMapQuality = 0.0; bool passMapQualityFilter = false; for (int i = 0; i < n; i++) { currentDepth = glf[i].GetDepth(position); if (currentDepth != 0) { totalDepth += currentDepth; numCovered++; // not currently used -- will be "NS" currentQuality = glf[i].GetMapQuality(position); averageMapQuality += currentDepth * currentQuality * currentQuality; if (currentQuality >= mapQuality) passMapQualityFilter = true; } } averageMapQuality = sqrt(averageMapQuality / totalDepth); filter.Clear(); if (!passMapQualityFilter) { if (filter.Length() == 0) mapQualityFilter++; if (hardFilter) continue; filter.catprintf("%smq%d", filter.Length() ? ";" : "", mapQuality); } if (totalDepth < minTotalDepth) { if (filter.Length() == 0) depthFilter++; if (hardFilter) continue; filter.catprintf("%sdp%d", filter.Length() ? ";" : "", minTotalDepth); } if (totalDepth > maxTotalDepth) { if (filter.Length() == 0) depthFilter++; if (hardFilter) continue; filter.catprintf("%sDP%d", filter.Length() ? ";" : "", maxTotalDepth); } // Create convenient aliases for each base unsigned char transition = (((refBase - 1) ^ 2) + 1); unsigned char transvers1 = (((refBase - 1) ^ 3) + 1); unsigned char transvers2 = (((refBase - 1) ^ 1) + 1); int homRef = glf[0].GenotypeIndex(refBase, refBase); // Calculate likelihood assuming every is homozygous for the reference double lRef = log(1.0 - prior); for (int i = 0; i < n; i++) lRef += log(glf[i].GetLikelihoods(position)[homRef]); // Calculate maximum likelihood for a variant if (smartFilter) { double anyVariant = log(prior) + FilteringLikelihood(glf, n, position, refBase); if (exp(lRef - anyVariant) > (1.0 - posterior)/posterior) { smartFilterHits++; continue; } } //fprintf(stderr,"position = %d\n",position); double pTs = uniformPrior ? 1./3. : 2./3.; double pTv = uniformPrior ? 1./3. : 1./6.; // Calculate likelihoods for the most likelily SNP configurations double refTransition = log(prior * pTs) + PolymorphismLikelihood(glf, n, position, refBase, transition); double refTransvers1 = log(prior * pTv) + PolymorphismLikelihood(glf, n, position, refBase, transvers1); double refTransvers2 = log(prior * pTv) + PolymorphismLikelihood(glf, n, position, refBase, transvers2); // Calculate likelihoods for less likely SNP configurations double transitiontv1 = log(prior * 0.001) + PolymorphismLikelihood(glf, n, position, transition, transvers1); double transitiontv2 = log(prior * 0.001) + PolymorphismLikelihood(glf, n, position, transition, transvers2); double transvers1tv2 = log(prior * 0.001) + PolymorphismLikelihood(glf, n, position, transvers1, transvers2); // Calculate the likelihood for unusual configurations where everyone is heterozygous ... double sink = n > 10 ? log(prior * 1e-8) + SinkLikelihood(glf, n, position) : -1e100; double lmax = max( max(max(lRef, refTransition),max(refTransvers1, refTransvers2)), max(max(transitiontv1, transitiontv2), max(transvers1tv2, sink))); double sum = exp(lRef - lmax) + exp(refTransition -lmax) + exp(refTransvers1 - lmax) + exp(refTransvers2 - lmax) + exp(transitiontv1 - lmax) + exp(transitiontv2 - lmax) + exp(transvers1tv2 - lmax) + exp(sink - lmax); if (sum == 0.0) continue; if (exp(lRef - lmax)/sum > 1.0 - prior) { if (filter.Length() == 0) homozygousReference++; if (positions.Entries()) ReportSNP(glf, n, position, refBase, refBase, refBase, filter, totalDepth, averageMapQuality, lRef / sum); continue; } double quality = 1.0 - exp(lRef - lmax) / sum; if (verbose) { DumpDetails(glf, n, position, refBase); printf("%.3f %.3f %.3f %.3f %.3f %.3f %.3f\n", lRef, refTransition, refTransvers1, refTransvers2, transitiontv1, transitiontv2, transvers1tv2); } if (exp(refTransition - lmax)/sum > posterior) { ReportSNP(glf, n, position, refBase, refBase, transition, filter, totalDepth, averageMapQuality, quality /* refTransition/sum */); if (filter.Length() == 0) transitions++; } else if (exp(refTransvers1 - lmax)/sum > posterior) { ReportSNP(glf, n, position, refBase, refBase, transvers1, filter, totalDepth, averageMapQuality, quality /* refTransvers1/sum */); if (filter.Length() == 0) transversions++; } else if (exp(refTransvers2 - lmax)/sum > posterior) { ReportSNP(glf, n, position, refBase, refBase, transvers2, filter, totalDepth, averageMapQuality, quality /* refTransvers2/sum */); if (filter.Length() == 0) transversions++; } else if (exp(transitiontv1 - lmax)/sum > posterior) { ReportSNP(glf, n, position, refBase, transition, transvers1, filter, totalDepth, averageMapQuality, quality /* transitiontv1/sum */); if (filter.Length() == 0) otherPolymorphisms++; } else if (exp(transitiontv2 - lmax)/sum > posterior) { ReportSNP(glf, n, position, refBase, transition, transvers2, filter, totalDepth, averageMapQuality, quality /* transitiontv2/sum */); if (filter.Length() == 0) otherPolymorphisms++; } else if (exp(transvers1tv2 - lmax)/sum > posterior) { ReportSNP(glf, n, position, refBase, transvers1, transvers2, filter, totalDepth, averageMapQuality, quality /* transvers1tv2/sum */); if (filter.Length() == 0) otherPolymorphisms++; } else if (exp(sink - lmax)/sum > posterior) sinkFilter++; } int actualBases = glf[firstGlf].maxPosition - baseCounts[0]; printf(" Missing bases = %9d (%.3f%%)\n", baseCounts[0], baseCounts[0] * 100. / glf[firstGlf].maxPosition); printf(" Reference bases = %9d (%.3f%%)\n", glf[firstGlf].maxPosition - baseCounts[0], (glf[firstGlf].maxPosition - baseCounts[0]) * 100. / glf[firstGlf].maxPosition); printf(" A/T bases = %9d (%.3f%%, %d A, %d T)\n", baseCounts[1] + baseCounts[4], (baseCounts[1] + baseCounts[4]) * 100. / actualBases, baseCounts[1], baseCounts[4]); printf(" G/C bases = %9d (%.3f%%, %d G, %d C)\n", baseCounts[3] + baseCounts[2], (baseCounts[3] + baseCounts[2]) * 100. / actualBases, baseCounts[3], baseCounts[2]); printf(" Depth Filter = %9d bases (%.3f%%)\n", depthFilter, depthFilter * 100. / actualBases); printf(" Map Quality Filter = %9d bases (%.3f%%)\n", mapQualityFilter, mapQualityFilter * 100. / actualBases); printf(" Non-Polymorphic = %9d bases (%.3f%%)\n", homozygousReference, homozygousReference * 100. / actualBases); printf(" Transitions = %9d bases (%.3f%%)\n", transitions, transitions * 100. / actualBases); printf(" Transversions = %9d bases (%.3f%%)\n", transversions, transversions * 100. / actualBases); printf(" Other Polymorphisms = %9d bases (%.3f%%)\n", otherPolymorphisms, otherPolymorphisms * 100. / actualBases); if (n > 10) printf(" Homology Sink = %9d bases (%.3f%%)\n", sinkFilter, sinkFilter * 100. / actualBases); if (smartFilter) printf(" Smart Filter = %9d bases (%.3f%%)\n", smartFilterHits, smartFilterHits * 100. / actualBases); int noCalls = actualBases - homozygousReference - transitions - transversions - otherPolymorphisms - sinkFilter; printf(" No call = %9d bases (%.3f%%)\n", noCalls, noCalls * 100. / actualBases); fflush(stdout); } if (baseCalls != NULL) fclose(baseCalls); time(&t); printf("\nAnalysis completed on %s\n", ctime(&t)); fflush(stdout); }
// main function of verifyBamID int execute(int argc, char** argv) { printf("verifyBamID %s -- verify identity and purity of sequence data\n" "(c) 2010-2014 Hyun Min Kang, Goo Jun, and Goncalo Abecasis\n\n", VERSION); VerifyBamIDArgs args; ParameterList pl; BEGIN_LONG_PARAMETERS(longParameters) LONG_PARAMETER_GROUP("Input Files") LONG_STRINGPARAMETER("vcf",&args.sVcfFile) LONG_STRINGPARAMETER("bam",&args.sBamFile) LONG_STRINGPARAMETER("subset",&args.sSubsetInds) LONG_STRINGPARAMETER("smID",&args.sSMID) LONG_PARAMETER_GROUP("VCF analysis options") LONG_DOUBLEPARAMETER("genoError",&args.genoError) LONG_DOUBLEPARAMETER("minAF",&args.minAF) LONG_DOUBLEPARAMETER("minCallRate",&args.minCallRate) LONG_PARAMETER_GROUP("Individuals to compare with chip data") EXCLUSIVE_PARAMETER("site",&args.bSiteOnly) EXCLUSIVE_PARAMETER("self",&args.bSelfOnly) EXCLUSIVE_PARAMETER("best",&args.bFindBest) LONG_PARAMETER_GROUP("Chip-free optimization options") EXCLUSIVE_PARAMETER("free-none",&args.bFreeNone) EXCLUSIVE_PARAMETER("free-mix",&args.bFreeMixOnly) EXCLUSIVE_PARAMETER("free-refBias",&args.bFreeRefBiasOnly) EXCLUSIVE_PARAMETER("free-full",&args.bFreeFull) LONG_PARAMETER_GROUP("With-chip optimization options") EXCLUSIVE_PARAMETER("chip-none",&args.bChipNone) EXCLUSIVE_PARAMETER("chip-mix",&args.bChipMixOnly) EXCLUSIVE_PARAMETER("chip-refBias",&args.bChipRefBiasOnly) EXCLUSIVE_PARAMETER("chip-full",&args.bChipFull) LONG_PARAMETER_GROUP("BAM analysis options") LONG_PARAMETER("ignoreRG",&args.bIgnoreRG) LONG_PARAMETER("ignoreOverlapPair",&args.bIgnoreOverlapPair) LONG_PARAMETER("noEOF",&args.bNoEOF) LONG_PARAMETER("precise",&args.bPrecise) LONG_INTPARAMETER("minMapQ",&args.minMapQ) LONG_INTPARAMETER("maxDepth",&args.maxDepth) LONG_INTPARAMETER("minQ",&args.minQ) LONG_INTPARAMETER("maxQ",&args.maxQ) LONG_DOUBLEPARAMETER("grid",&args.grid) LONG_PARAMETER_GROUP("Modeling Reference Bias") LONG_DOUBLEPARAMETER("refRef",&args.pRefRef) LONG_DOUBLEPARAMETER("refHet",&args.pRefHet) LONG_DOUBLEPARAMETER("refAlt",&args.pRefAlt) LONG_PARAMETER_GROUP("Output options") LONG_STRINGPARAMETER("out",&args.sOutFile) LONG_PARAMETER("verbose",&args.bVerbose) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); pl.Add(new LongParameters("Available Options",longParameters)); pl.Read(argc, argv); pl.Status(); // check the validity of input files if ( args.sVcfFile.IsEmpty() ) { error("--vcf [vcf file] required"); } if ( args.sBamFile.IsEmpty() ) { error("--bam [bam file] is required"); } if ( args.sOutFile.IsEmpty() ) { error("--out [output prefix] is required"); } Logger::gLogger = new Logger((args.sOutFile + ".log").c_str(), args.bVerbose); if ( ! ( args.bSiteOnly || args.bSelfOnly || args.bFindBest ) ) { warning("--self option was autotomatically turned on by default. Specify --best option if you wanted to check across all possible samples in the VCF"); args.bSelfOnly = true; } if ( ( args.maxDepth > 20 ) && ( !args.bPrecise ) ) { warning("--precise option is not turned on at --maxDepth %d : may be prone to precision errors",args.maxDepth); } if ( ( args.bChipRefBiasOnly ) && ( !args.bSelfOnly ) ) { error("--self must be set for --chip-refBias to work. Skipping.."); } // check timestamp time_t t; time(&t); Logger::gLogger->writeLog("Analysis started on %s",ctime(&t)); // load arguments VerifyBamID vbid(&args); // load input VCF and BAM files Logger::gLogger->writeLog("Opening Input Files"); vbid.loadFiles(args.sBamFile.c_str(), args.sVcfFile.c_str()); // Check which genotype-free method is used if ( args.bFreeNone ) { // if no genotype-free mode is tested. skip it // do nothing for genotype-free estimation Logger::gLogger->writeLog("Skipping chip-free estimation of sample mixture"); } else if ( args.bFreeMixOnly ) { // only mixture is estimated. // genotype-free method Logger::gLogger->writeLog("Performing chip-free estimation of sample mixture at fixed reference bias parameters (%lf, %lf, %lf)",args.pRefRef,args.pRefHet,args.pRefAlt); // scan across multiple readgroups for(int rg=-1; rg < vbid.nRGs - (int)args.bIgnoreRG; ++rg) { VerifyBamID::mixLLK mix(&vbid); mix.OptimizeLLK(rg); Logger::gLogger->writeLog("Optimal per-sample fMix = %lf, LLK0 = %lf, LLK1 = %lf\n",mix.fMix,mix.llk0,mix.llk1); vbid.mixOut.llk0s[rg+1] = mix.llk0; vbid.mixOut.llk1s[rg+1] = mix.llk1; vbid.mixOut.fMixs[rg+1] = mix.fMix; } //vbid.mixRefHet = 0.5; //vbid.mixRefAlt = 0.00; } else if ( args.bFreeRefBiasOnly ) { Logger::gLogger->writeLog("Performing chip-free estimation of reference-bias without sample mixture"); for(int rg=-1; rg < vbid.nRGs - (int)args.bIgnoreRG; ++rg) { VerifyBamID::refBiasMixLLKFunc myFunc(&vbid, rg); AmoebaMinimizer myMinimizer; Vector startingPoint(2); startingPoint[0] = 0; // pRefHet = 0.5 startingPoint[1] = -4.595; // pRefAlt = 0.01 myMinimizer.func = &myFunc; myMinimizer.Reset(2); myMinimizer.point = startingPoint; myMinimizer.Minimize(1e-6); double pRefHet = VerifyBamID::invLogit(myMinimizer.point[0]); double pRefAlt = VerifyBamID::invLogit(myMinimizer.point[1]); Logger::gLogger->writeLog("Reference Bias Estimated as ( Pr[refBase|HET] = %lf, Pr[refBase|ALT] = %lf) with LLK = %lf at readGroup %d",pRefHet,pRefAlt,myMinimizer.fmin,rg); //vbid.setRefBiasParams(1.0, pRefHet, pRefAlt); vbid.mixOut.llk0s[rg+1] = myFunc.llk0; vbid.mixOut.llk1s[rg+1] = myFunc.llk1; vbid.mixOut.refHets[rg+1] = myFunc.pRefHet; vbid.mixOut.refAlts[rg+1] = myFunc.pRefAlt; } } else if ( args.bFreeFull ) { Logger::gLogger->writeLog("Performing chip-free estimation of reference-bias and sample mixture together"); for(int rg = -1; rg < vbid.nRGs - args.bIgnoreRG; ++rg) { VerifyBamID::fullMixLLKFunc myFunc(&vbid, rg); AmoebaMinimizer myMinimizer; Vector startingPoint(3); startingPoint[0] = -3.91; // start with fMix = 0.01 startingPoint[1] = 0; // pRefHet = 0.5 startingPoint[2] = -4.595; // pRefAlt = 0.01 myMinimizer.func = &myFunc; myMinimizer.Reset(3); myMinimizer.point = startingPoint; myMinimizer.Minimize(1e-6); double fMix = VerifyBamID::invLogit(myMinimizer.point[0]); if ( fMix > 0.5 ) fMix = 1.-fMix; double pRefHet = VerifyBamID::invLogit(myMinimizer.point[1]); double pRefAlt = VerifyBamID::invLogit(myMinimizer.point[2]); Logger::gLogger->writeLog("Optimal per-sample fMix = %lf\n",fMix); Logger::gLogger->writeLog("Reference Bias Estimated as ( Pr[refBase|HET] = %lf, Pr[refBase|ALT] = %lf) with LLK = %lf",pRefHet,pRefAlt,myMinimizer.fmin); //vbid.setRefBiasParams(1.0, pRefHet, pRefAlt); vbid.mixOut.llk0s[rg+1] = myFunc.llk0; vbid.mixOut.llk1s[rg+1] = myFunc.llk1; vbid.mixOut.fMixs[rg+1] = myFunc.fMix; vbid.mixOut.refHets[rg+1] = myFunc.pRefHet; vbid.mixOut.refAlts[rg+1] = myFunc.pRefAlt; } } Logger::gLogger->writeLog("calculating depth distribution"); vbid.calculateDepthDistribution(args.maxDepth, vbid.mixOut); Logger::gLogger->writeLog("finished calculating depth distribution"); std::vector<int> bestInds(vbid.nRGs+1,-1); std::vector<int> selfInds(vbid.nRGs+1,-1); if ( args.bChipNone ) { // do nothing Logger::gLogger->writeLog("Skipping with-chip estimation of sample mixture"); } else if ( args.bChipMixOnly ) { Logger::gLogger->writeLog("Performing with-chip estimation of sample mixture at fixed reference bias parameter (%lf, %lf, %lf)",args.pRefRef,args.pRefHet,args.pRefAlt); for(int rg=-1; rg < (vbid.nRGs - (int)args.bIgnoreRG); ++rg) { double maxIBD = -1; VerifyBamID::ibdLLK ibd(&vbid); for(int i=0; i < (int)vbid.pGenotypes->indids.size(); ++i) { double fIBD = ibd.OptimizeLLK(i, rg); Logger::gLogger->writeLog("Comparing with individual %s.. Optimal fIBD = %lf, LLK0 = %lf, LLK1 = %lf for readgroup %d",vbid.pGenotypes->indids[i].c_str(),fIBD, ibd.llk0, ibd.llk1, rg); if ( maxIBD < fIBD ) { bestInds[rg+1] = i; vbid.bestOut.llk0s[rg+1] = ibd.llk0; vbid.bestOut.llk1s[rg+1] = ibd.llk1; vbid.bestOut.fMixs[rg+1] = 1-ibd.fIBD; maxIBD = ibd.fIBD; } if ( ( (rg < 0) && (vbid.pPile->sBamSMID == vbid.pGenotypes->indids[i] ) ) || ( ( rg >= 0 ) && ( vbid.pPile->vsSMIDs[rg] == vbid.pGenotypes->indids[i]) ) ) { selfInds[rg+1] = i; vbid.selfOut.llk0s[rg+1] = ibd.llk0; vbid.selfOut.llk1s[rg+1] = ibd.llk1; vbid.selfOut.fMixs[rg+1] = 1-ibd.fIBD; } } if ( bestInds[rg+1] >= 0 ) { Logger::gLogger->writeLog("Best Matching Individual is %s with IBD = %lf",vbid.pGenotypes->indids[bestInds[rg+1]].c_str(),maxIBD); vbid.calculateDepthByGenotype(bestInds[rg+1],rg,vbid.bestOut); } if ( selfInds[rg+1] >= 0 ) { Logger::gLogger->writeLog("Self Individual is %s with IBD = %lf",vbid.pGenotypes->indids[selfInds[rg+1]].c_str(),vbid.selfOut.fMixs[rg+1]); vbid.calculateDepthByGenotype(selfInds[rg+1],rg,vbid.selfOut); } } } else if ( args.bChipRefBiasOnly ) { Logger::gLogger->writeLog("Performing with-chip estimation of reference-bias without sample mixture"); if ( args.bSelfOnly ) { for(int rg=-1; rg < (vbid.nRGs - (int)args.bIgnoreRG); ++rg) { VerifyBamID::refBiasIbdLLKFunc myFunc(&vbid, rg); AmoebaMinimizer myMinimizer; Vector startingPoint(2); startingPoint[0] = 0; // pRefHet = 0.5 startingPoint[1] = -4.595; // pRefAlt = 0.01 myMinimizer.func = &myFunc; myMinimizer.Reset(2); myMinimizer.point = startingPoint; myMinimizer.Minimize(1e-6); double pRefHet = VerifyBamID::invLogit(myMinimizer.point[0]); double pRefAlt = VerifyBamID::invLogit(myMinimizer.point[1]); Logger::gLogger->writeLog("Reference Bias Estimated as ( Pr[refBase|HET] = %lf, Pr[refBase|ALT] = %lf) with LLK = %lf",pRefHet,pRefAlt,myMinimizer.fmin); //vbid.setRefBiasParams(1.0, pRefHet, pRefAlt); vbid.selfOut.llk0s[rg+1] = myFunc.llk0; vbid.selfOut.llk1s[rg+1] = myFunc.llk1; vbid.selfOut.refHets[rg+1] = myFunc.pRefHet; vbid.selfOut.refAlts[rg+1] = myFunc.pRefAlt; vbid.calculateDepthByGenotype(0,rg,vbid.selfOut); } } else { Logger::gLogger->warning("--self must be set for --chip-refBias to work. Skipping.."); } } else if ( args.bChipFull ) { Logger::gLogger->writeLog("Performing with-chip estimation of reference-bias and sample mixture together"); for(int rg=-1; rg < (vbid.nRGs - (int)args.bIgnoreRG); ++rg) { double maxIBD = -1; for(int i=0; i < (int)vbid.pGenotypes->indids.size(); ++i) { VerifyBamID::fullIbdLLKFunc myFunc(&vbid,i,rg); AmoebaMinimizer myMinimizer; Vector startingPoint(3); startingPoint[0] = 3.91; // start with fIBD = 0.99 startingPoint[1] = 0; // pRefHet = 0.5 startingPoint[2] = -4.595; // pRefAlt = 0.01 myMinimizer.func = &myFunc; myFunc.indIdx = i; myMinimizer.Reset(3); myMinimizer.point = startingPoint; myMinimizer.Minimize(1e-6); double fIBD = VerifyBamID::invLogit(myMinimizer.point[0]); double pRefHet = VerifyBamID::invLogit(myMinimizer.point[1]); double pRefAlt = VerifyBamID::invLogit(myMinimizer.point[2]); Logger::gLogger->writeLog("Comparing with individual %s.. Optimal fIBD = %lf, LLK0 = %lf, LLK1 = %lf for readgroup %d",vbid.pGenotypes->indids[i].c_str(), fIBD, myFunc.llk0, myFunc.llk1, rg); //Logger::gLogger->writeLog("Optimal per-sample fIBD = %lf, ",fIBD); Logger::gLogger->writeLog("Reference Bias Estimated as ( Pr[refBase|HET] = %lf, Pr[refBase|ALT] = %lf ) with LLK = %lf",pRefHet,pRefAlt,myMinimizer.fmin); if ( maxIBD < fIBD ) { bestInds[rg+1] = i; maxIBD = fIBD; vbid.bestOut.llk0s[rg+1] = myFunc.llk0; vbid.bestOut.llk1s[rg+1] = myFunc.llk1; vbid.bestOut.fMixs[rg+1] = 1.-myFunc.fIBD; vbid.bestOut.refHets[rg+1] = myFunc.pRefHet; vbid.bestOut.refAlts[rg+1] = myFunc.pRefAlt; } if ( ( (rg < 0) && (vbid.pPile->sBamSMID == vbid.pGenotypes->indids[i] ) ) || ( ( rg >= 0 ) && ( vbid.pPile->vsSMIDs[rg] == vbid.pGenotypes->indids[i]) ) ) { selfInds[rg+1] = i; vbid.selfOut.llk0s[rg+1] = myFunc.llk0; vbid.selfOut.llk1s[rg+1] = myFunc.llk1; vbid.selfOut.fMixs[rg+1] = 1.-myFunc.fIBD; vbid.selfOut.refHets[rg+1] = myFunc.pRefHet; vbid.selfOut.refAlts[rg+1] = myFunc.pRefAlt; vbid.calculateDepthByGenotype(i, rg, vbid.selfOut); } } //vbid.setRefBiasParams(1.0, pRefHet, pRefAlt); if ( bestInds[rg+1] >= 0 ) { Logger::gLogger->writeLog("Best Matching Individual is %s with IBD = %lf",vbid.pGenotypes->indids[bestInds[rg+1]].c_str(),maxIBD); vbid.calculateDepthByGenotype(bestInds[rg+1], rg, vbid.bestOut); } if ( selfInds[rg+1] >= 0 ) { Logger::gLogger->writeLog("Self Individual is %s with IBD = %lf",vbid.pGenotypes->indids[selfInds[rg+1]].c_str(),vbid.selfOut.fMixs[rg+1]); vbid.calculateDepthByGenotype(selfInds[rg+1],rg,vbid.selfOut); } } } // PRINT OUTPUT FILE - ".selfSM" // [SEQ_ID] : SAMPLE ID in the sequence file // [CHIP_ID] : SAMPLE ID in the chip file (NA if not available) // [#SNPS] : Number of markers evaluated // [#READS] : Number of reads evaluated // [AVG_DP] : Mean depth // [FREEMIX] : Chip-free estimated alpha (% MIX in 0-1 scale), NA if unavailable // [FREELK1] : Chip-free log-likelihood at estimated alpha // [FREELK0] : Chip-free log-likelihood at 0% contamination // [CHIPIBD] : With-chip estimated alpha (% MIX in 0-1 scale) // [CHIPLK1] : With-chip log-likelihood at estimated alpha // [CHIPLK0] : With-chip log-likelihood at 0% contamination // [DPREF] : Depth at reference site in the chip // [RDPHET] : Relative depth at HET site in the chip // [RDPALT] : Relative depth at HOMALT site in the chip // [FREE_RF] : Pr(Ref|Ref) site estimated without chip data // [FREE_RH] : Pr(Ref|Het) site estimated without chip data // [FREE_RA] : Pr(Ref|Alt) site estimated without chip data // [CHIP_RF] : Pr(Ref|Ref) site estimated with chip data // [CHIP_RH] : Pr(Ref|Het) site estimated with chip data // [CHIP_RA] : Pr(Ref|Alt) site estimated with chip data // [DPREF] : Depth at reference alleles // [RDPHET] : Relative depth at heterozygous alleles // [RDPALT] : Relative depth at hom-alt alleles String selfSMFN = args.sOutFile + ".selfSM"; String bestSMFN = args.sOutFile + ".bestSM"; String selfRGFN = args.sOutFile + ".selfRG"; String bestRGFN = args.sOutFile + ".bestRG"; String dpSMFN = args.sOutFile + ".depthSM"; String dpRGFN = args.sOutFile + ".depthRG"; IFILE selfSMF = ifopen(selfSMFN,"wb"); IFILE bestSMF = (args.bFindBest ? ifopen(bestSMFN,"wb") : NULL); IFILE selfRGF = (args.bIgnoreRG ? NULL : ifopen(selfRGFN,"wb")); IFILE bestRGF = (args.bFindBest && !args.bIgnoreRG) ? ifopen(bestRGFN,"wb") : NULL; IFILE dpSMF = ifopen(dpSMFN,"wb"); IFILE dpRGF = (args.bIgnoreRG ? NULL : ifopen(dpRGFN,"wb")); if ( selfSMF == NULL ) { Logger::gLogger->error("Cannot write to %s",selfSMF); } if ( args.bFindBest && ( bestSMF == NULL ) ) { Logger::gLogger->error("Cannot write to %s",bestSMF); } if ( dpSMF == NULL ) { Logger::gLogger->error("Cannot write to %s",dpSMF); } ifprintf(dpSMF,"#RG\tDEPTH\t#SNPs\t%%SNPs\t%%CUMUL\n"); int nCumMarkers = 0; for(int i=args.maxDepth; i >= 0; --i) { nCumMarkers += vbid.mixOut.depths[i]; ifprintf(dpSMF,"ALL\t%d\t%d\t%.5lf\t%.5lf\n",i, vbid.mixOut.depths[i],(double) vbid.mixOut.depths[i]/(double)vbid.nMarkers,(double)nCumMarkers/(double)vbid.nMarkers); } ifclose(dpSMF); if ( dpRGF != NULL ) { ifprintf(dpRGF,"#RG\tDEPTH\t#SNPs\t%%SNPs\t%%CUMUL\n"); for(int rg=0; rg < (vbid.nRGs - (int)args.bIgnoreRG); ++rg) { const char* rgID = vbid.pPile->vsRGIDs[rg].c_str(); int nMarkers = 0; for(int i=args.maxDepth; i >= 0; --i) { nMarkers += vbid.mixOut.depths[(rg+1)*(args.maxDepth+1) + i]; } nCumMarkers = 0; for(int i=args.maxDepth; i >= 0; --i) { int d = vbid.mixOut.depths[(rg+1)*(args.maxDepth+1) + i]; nCumMarkers += d; ifprintf(dpRGF,"%s\t%d\t%d\t%.5lf\t%.5lf\n",rgID,i,d,(double)d/(double)vbid.nMarkers,(double)nCumMarkers/(double)nMarkers); } } ifclose(dpRGF); } const char* headers[] = {"#SEQ_ID","RG","CHIP_ID","#SNPS","#READS","AVG_DP","FREEMIX","FREELK1","FREELK0","FREE_RH","FREE_RA","CHIPMIX","CHIPLK1","CHIPLK0","CHIP_RH","CHIP_RA","DPREF","RDPHET","RDPALT"}; int nheaders = sizeof(headers)/sizeof(headers[0]); for(int i=0; i < nheaders; ++i) { ifprintf(selfSMF,"%s%s",i>0 ? "\t" : "",headers[i]); } ifprintf(selfSMF,"\n"); ifprintf(selfSMF,"%s\tALL",vbid.pPile->sBamSMID.c_str()); ifprintf(selfSMF,"\t%s",selfInds[0] >= 0 ? vbid.pGenotypes->indids[selfInds[0]].c_str() : "NA"); ifprintf(selfSMF,"\t%d\t%d\t%.2lf",vbid.nMarkers,vbid.mixOut.numReads[0],(double)vbid.mixOut.numReads[0]/(double)vbid.nMarkers); if ( args.bFreeNone ) { ifprintf(selfSMF,"\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bFreeMixOnly ) { ifprintf(selfSMF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA",vbid.mixOut.fMixs[0],vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0]); } else if ( args.bFreeRefBiasOnly ) { ifprintf(selfSMF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0],vbid.mixOut.refHets[0],vbid.mixOut.refAlts[0]); } else if ( args.bFreeFull ) { ifprintf(selfSMF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.fMixs[0],vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0],vbid.mixOut.refHets[0],vbid.mixOut.refAlts[0]); } else { error("Invalid option in handling bFree"); } if ( args.bChipNone || bestInds[0] < 0 ) { ifprintf(selfSMF,"\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bChipMixOnly ) { ifprintf(selfSMF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA\t%.3lf\t%.4lf\t%.4lf",vbid.selfOut.fMixs[0],vbid.selfOut.llk1s[0],vbid.selfOut.llk0s[0],(double)vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[1], (double)vbid.selfOut.numReads[2]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[2], (double)vbid.selfOut.numReads[3]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[3]); } else if ( args.bChipMixOnly ) { ifprintf(selfSMF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf",vbid.selfOut.llk1s[0], vbid.selfOut.llk0s[0], vbid.selfOut.refHets[0], vbid.selfOut.refAlts[0], (double)vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[1], (double)vbid.selfOut.numReads[2]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[2], (double)vbid.selfOut.numReads[3]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[3]); } else if ( args.bChipFull ) { ifprintf(selfSMF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf", vbid.selfOut.fMixs[0], vbid.selfOut.llk1s[0], vbid.selfOut.llk0s[0], vbid.selfOut.refHets[0], vbid.selfOut.refAlts[0], (double)vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[1], (double)vbid.selfOut.numReads[2]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[2], (double)vbid.selfOut.numReads[3]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[3]); } else { error("Invalid option in handling bChip"); } ifprintf(selfSMF,"\n"); ifclose(selfSMF); if ( bestSMF != NULL ) { for(int i=0; i < nheaders; ++i) { ifprintf(bestSMF,"%s%s",i>0 ? "\t" : "",headers[i]); } ifprintf(bestSMF,"\n"); ifprintf(bestSMF,"%s\tALL",vbid.pPile->sBamSMID.c_str()); ifprintf(bestSMF,"\t%s",bestInds[0] >= 0 ? vbid.pGenotypes->indids[bestInds[0]].c_str() : "NA"); ifprintf(bestSMF,"\t%d\t%d\t%.2lf",vbid.nMarkers,vbid.mixOut.numReads[0],(double)vbid.mixOut.numReads[0]/(double)vbid.nMarkers); if ( args.bFreeNone ) { ifprintf(bestSMF,"\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bFreeMixOnly ) { ifprintf(bestSMF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA",vbid.mixOut.fMixs[0],vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0]); } else if ( args.bFreeRefBiasOnly ) { ifprintf(bestSMF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0],vbid.mixOut.refHets[0],vbid.mixOut.refAlts[0]); } else if ( args.bFreeFull ) { ifprintf(bestSMF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.fMixs[0],vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0],vbid.mixOut.refHets[0],vbid.mixOut.refAlts[0]); } else { error("Invalid option in handling bFree"); } if ( args.bChipNone || bestInds[0] < 0 ) { ifprintf(bestSMF,"\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bChipMixOnly ) { ifprintf(bestSMF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA\t%.3lf\t%.4lf\t%.4lf",vbid.bestOut.fMixs[0],vbid.bestOut.llk1s[0],vbid.bestOut.llk0s[0],(double)vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[1], (double)vbid.bestOut.numReads[2]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[2], (double)vbid.bestOut.numReads[3]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[3]); } else if ( args.bChipMixOnly ) { ifprintf(bestSMF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf",vbid.bestOut.llk1s[0], vbid.bestOut.llk0s[0], vbid.bestOut.refHets[0], vbid.bestOut.refAlts[0], (double)vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[1], (double)vbid.bestOut.numReads[2]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[2], (double)vbid.bestOut.numReads[3]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[3]); } else if ( args.bChipFull ) { ifprintf(bestSMF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf", vbid.bestOut.fMixs[0], vbid.bestOut.llk1s[0], vbid.bestOut.llk0s[0], vbid.bestOut.refHets[0], vbid.bestOut.refAlts[0], (double)vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[1], (double)vbid.bestOut.numReads[2]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[2], (double)vbid.bestOut.numReads[3]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[3]); } else { error("Invalid option in handling bChip"); } ifprintf(bestSMF,"\n"); ifclose(bestSMF); } if ( selfRGF != NULL ) { for(int i=0; i < nheaders; ++i) { ifprintf(selfRGF,"%s%s",i>0 ? "\t" : "",headers[i]); } ifprintf(selfRGF,"\n"); for(int rg=0; rg < vbid.nRGs; ++rg) { ifprintf(selfRGF,"%s\t%s",vbid.pPile->sBamSMID.c_str(),vbid.pPile->vsRGIDs[rg].c_str()); ifprintf(selfRGF,"\t%s",bestInds[rg] >= 0 ? vbid.pGenotypes->indids[bestInds[rg]].c_str() : "NA"); ifprintf(selfRGF,"\t%d\t%d\t%.2lf",vbid.nMarkers,vbid.mixOut.numReads[(rg+1)*4],(double)vbid.mixOut.numReads[(rg+1)*4]/(double)vbid.mixOut.numGenos[(rg+1)*4]); if ( args.bFreeNone ) { ifprintf(selfRGF,"\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bFreeMixOnly ) { ifprintf(selfRGF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA",vbid.mixOut.fMixs[rg+1],vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1]); } else if ( args.bFreeRefBiasOnly ) { ifprintf(selfRGF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1],vbid.mixOut.refHets[rg+1],vbid.mixOut.refAlts[rg+1]); } else if ( args.bFreeFull ) { ifprintf(selfRGF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.fMixs[rg+1],vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1],vbid.mixOut.refHets[rg+1],vbid.mixOut.refAlts[rg+1]); } else { error("Invalid option in handling bFree"); } if ( args.bChipNone || bestInds[0] < 0 ) { ifprintf(selfRGF,"\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bChipMixOnly ) { ifprintf(selfRGF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA\t%.3lf\t%.4lf\t%.4lf",vbid.selfOut.fMixs[rg+1], vbid.selfOut.llk1s[rg+1], vbid.selfOut.llk0s[rg+1], (double)vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+1], (double)vbid.selfOut.numReads[(rg+1)*4+2]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+2], (double)vbid.selfOut.numReads[(rg+1)*4+3]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+3]); } else if ( args.bChipMixOnly ) { ifprintf(selfRGF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf",vbid.selfOut.llk1s[rg+1], vbid.selfOut.llk0s[rg+1], vbid.selfOut.refHets[rg+1], vbid.selfOut.refAlts[rg+1], (double)vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+1], (double)vbid.selfOut.numReads[(rg+1)*4+2]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4]/vbid.selfOut.numGenos[(rg+1)*4+2], (double)vbid.selfOut.numReads[(rg+1)*4+3]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+3]); } else if ( args.bChipFull ) { ifprintf(selfRGF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf", vbid.selfOut.fMixs[rg+1], vbid.selfOut.llk1s[rg+1], vbid.selfOut.llk0s[rg+1], vbid.selfOut.refHets[rg+1], vbid.selfOut.refAlts[rg+1], (double)vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+1], (double)vbid.selfOut.numReads[(rg+1)*4+2]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+2], (double)vbid.selfOut.numReads[(rg+1)*4+3]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+3]); } else { error("Invalid option in handling bChip"); } ifprintf(selfRGF,"\n"); } ifclose(selfRGF); } if ( bestRGF != NULL ) { for(int i=0; i < nheaders; ++i) { ifprintf(bestRGF,"%s%s",i>0 ? "\t" : "",headers[i]); } ifprintf(bestRGF,"\n"); for(int rg=0; rg < vbid.nRGs; ++rg) { ifprintf(bestRGF,"%s\t%s",vbid.pPile->sBamSMID.c_str(),vbid.pPile->vsRGIDs[rg].c_str()); ifprintf(bestRGF,"\t%s",bestInds[rg] >= 0 ? vbid.pGenotypes->indids[bestInds[rg]].c_str() : "NA"); ifprintf(bestRGF,"\t%d\t%d\t%.2lf",vbid.nMarkers,vbid.mixOut.numReads[(rg+1)*4],(double)vbid.mixOut.numReads[(rg+1)*4]/(double)vbid.mixOut.numGenos[(rg+1)*4]); if ( args.bFreeNone ) { ifprintf(bestRGF,"\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bFreeMixOnly ) { ifprintf(bestRGF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA",vbid.mixOut.fMixs[rg+1],vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1]); } else if ( args.bFreeRefBiasOnly ) { ifprintf(bestRGF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1],vbid.mixOut.refHets[rg+1],vbid.mixOut.refAlts[rg+1]); } else if ( args.bFreeFull ) { ifprintf(bestRGF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.fMixs[rg+1],vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1],vbid.mixOut.refHets[rg+1],vbid.mixOut.refAlts[rg+1]); } else { error("Invalid option in handling bFree"); } if ( args.bChipNone || bestInds[0] < 0 ) { ifprintf(bestRGF,"\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bChipMixOnly ) { ifprintf(bestRGF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA\t%.3lf\t%.4lf\t%.4lf",vbid.bestOut.fMixs[rg+1], vbid.bestOut.llk1s[rg+1], vbid.bestOut.llk0s[rg+1], (double)vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+1], (double)vbid.bestOut.numReads[(rg+1)*4+2]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+2], (double)vbid.bestOut.numReads[(rg+1)*4+3]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+3]); } else if ( args.bChipMixOnly ) { ifprintf(bestRGF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf",vbid.bestOut.llk1s[rg+1], vbid.bestOut.llk0s[rg+1], vbid.bestOut.refHets[rg+1], vbid.bestOut.refAlts[rg+1], (double)vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+1], (double)vbid.bestOut.numReads[(rg+1)*4+2]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4]/vbid.bestOut.numGenos[(rg+1)*4+2], (double)vbid.bestOut.numReads[(rg+1)*4+3]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+3]); } else if ( args.bChipFull ) { ifprintf(bestRGF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf", vbid.bestOut.fMixs[rg+1], vbid.bestOut.llk1s[rg+1], vbid.bestOut.llk0s[rg+1], vbid.bestOut.refHets[rg+1], vbid.bestOut.refAlts[rg+1], (double)vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+1], (double)vbid.bestOut.numReads[(rg+1)*4+2]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+2], (double)vbid.bestOut.numReads[(rg+1)*4+3]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+3]); } else { error("Invalid option in handling bChip"); } ifprintf(bestRGF,"\n"); } ifclose(bestRGF); } time(&t); Logger::gLogger->writeLog("Analysis finished on %s",ctime(&t)); return 0; }
int Validate::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; int maxErrors = -1; int printableErrors = 100; bool so_flag = false; bool so_coord = false; bool so_query = false; bool noeof = false; bool disableStatistics = false; bool verbose = false; bool params = false; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_STRINGPARAMETER("in", &inFile) LONG_PARAMETER("noeof", &noeof) LONG_INTPARAMETER("maxErrors", &maxErrors) LONG_PARAMETER("verbose", &verbose) LONG_INTPARAMETER("printableErrors", &printableErrors) LONG_PARAMETER("disableStatistics", &disableStatistics) LONG_PARAMETER("params", ¶ms) LONG_PARAMETER_GROUP("SortOrder") EXCLUSIVE_PARAMETER("so_flag", &so_flag) EXCLUSIVE_PARAMETER("so_coord", &so_coord) EXCLUSIVE_PARAMETER("so_query", &so_query) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // Determine the sort type for validation based on the parameters. SamFile::SortedType sortType = SamFile::UNSORTED; if(so_flag) { sortType = SamFile::FLAG; } else if(so_coord) { sortType = SamFile::COORDINATE; } else if(so_query) { sortType = SamFile::QUERY_NAME; } // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument for validate, " << "but was not specified" << std::endl; return(-1); } if(params) { inputParameters.Status(); } // Since we want to accumulate multiple errors, use RETURN rather // than throwing exceptions. SamFile samIn(ErrorHandler::RETURN); // Open the file for reading. if(!samIn.OpenForRead(inFile)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } // Set the sorting validation type. samIn.setSortedValidation(sortType); // Set that statistics should be generated. samIn.GenerateStatistics(!disableStatistics); // Read the sam header. SamFileHeader samHeader; if(!samIn.ReadHeader(samHeader)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } // Read the sam records. SamRecord samRecord(ErrorHandler::RETURN); // Track the status. SamStatus::Status status = SamStatus::SUCCESS; // Keep reading records until the end of the file is reached. int numValidRecords = 0; int numInvalidRecords = 0; int numErrorRecords = 0; int numRecords = 0; int numReportedErrors = 0; int totalErrorRecords = 0; std::map<SamStatus::Status, uint64_t> errorStats; std::map<SamValidationError::Type, uint64_t> invalidStats; SamValidationErrors invalidSamErrors; // Keep reading records from the file until SamFile::ReadRecord // indicates to stop (returns false). while( ( (maxErrors < 0) || (totalErrorRecords < maxErrors) ) && ( (samIn.ReadRecord(samHeader, samRecord)) || (SamStatus::isContinuableStatus(samIn.GetStatus())) ) ) { ++numRecords; if(samIn.GetStatus() == SamStatus::SUCCESS) { // Successfully set the record, so check to see if it is valid. // Clear any errors in the list. invalidSamErrors.clear(); if(!SamValidator::isValid(samHeader, samRecord, invalidSamErrors)) { // The record is not valid. ++numInvalidRecords; ++totalErrorRecords; if(verbose && (numReportedErrors < printableErrors)) { std::cerr << "Record " << numRecords << std::endl << invalidSamErrors << std::endl; ++numReportedErrors; } // Update the statistics for all validation errors found in this record. invalidSamErrors.resetErrorIter(); const SamValidationError* errorPtr = invalidSamErrors.getNextError(); while(errorPtr != NULL) { ++invalidStats[errorPtr->getType()]; errorPtr = invalidSamErrors.getNextError(); } // If the status is not yet set, set it. if(status == SamStatus::SUCCESS) { status = SamStatus::INVALID; } } else { // Valid record, so increment the counter. ++numValidRecords; } } else { // Error reading the record. ++numErrorRecords; ++totalErrorRecords; if(verbose && (numReportedErrors < printableErrors)) { // report error. std::cerr << "Record " << numRecords << std::endl << samIn.GetStatusMessage() << std::endl << std::endl; ++numReportedErrors; } // Increment the statistics ++errorStats[samIn.GetStatus()]; // If the status is not yet set, set it. if(status == SamStatus::SUCCESS) { status = samIn.GetStatus(); } } } if( (samIn.GetStatus() != SamStatus::NO_MORE_RECS) && (totalErrorRecords < maxErrors) ) { // The last read call had a failure, so report it. // If the number of errors is >= ,maxErrors we don't // want to print any more failures. ++numErrorRecords; ++totalErrorRecords; if(numReportedErrors < printableErrors) { std::cerr << "Record " << numRecords << ": "; std::cerr << std::endl << samIn.GetStatusMessage() << std::endl; } // Increment the statistics ++errorStats[samIn.GetStatus()]; if(status == SamStatus::SUCCESS) { status = samIn.GetStatus(); } } if(totalErrorRecords == maxErrors) { if(maxErrors == 0) { std::cerr << "WARNING file was not read at all due to maxErrors setting, but returning Success.\n"; } else { // Print a note that the entire file was not read. std::cerr << "File was not completely read due to the number of errors.\n"; std::cerr << "Statistics only reflect the part of the file that was read.\n"; } } fprintf(stderr, "\nNumber of records read = %d\n", numRecords); fprintf(stderr, "Number of valid records = %d\n", numValidRecords); std::cerr << std::endl; if(numRecords != numValidRecords) { std::cerr << "Error Counts:\n"; // Loop through the non-validation errors. std::map<SamStatus::Status, uint64_t>::iterator statusIter; for(statusIter = errorStats.begin(); statusIter != errorStats.end(); statusIter++) { std::cerr << "\t" << SamStatus::getStatusString(statusIter->first) << ": " << statusIter->second << std::endl; } std::map<SamValidationError::Type, uint64_t>::iterator invalidIter; for(invalidIter = invalidStats.begin(); invalidIter != invalidStats.end(); invalidIter++) { std::cerr << "\t" << SamValidationError::getTypeString(invalidIter->first) << ": " << invalidIter->second << std::endl; } std::cerr << std::endl; } samIn.PrintStatistics(); fprintf(stderr, "Returning: %d (%s)\n", status, SamStatus::getStatusString(status)); return(status); }
int main(int argc, char ** argv) { ParameterList inputParameters; String filename; int minReadLength = 10; int printableErrors = 20; int maxErrors = -1; String testParam; BaseAsciiMap::SPACE_TYPE myBaseType = BaseAsciiMap::UNKNOWN; // Read the parameters from the command line. bool baseSpace = false; bool colorSpace = false; bool autoDetect = false; bool ignoreErrors = false; bool baseComposition = false; bool avgQual = false; bool quiet = false; bool noeof = false; bool params = false; bool disableSeqIDCheck = false; bool interleaved = false; BEGIN_LONG_PARAMETERS(longParameterList) LONG_STRINGPARAMETER("file", &filename) LONG_PARAMETER("baseComposition", &baseComposition) LONG_PARAMETER("avgQual", &avgQual) LONG_PARAMETER("disableSeqIDCheck", &disableSeqIDCheck) LONG_PARAMETER("interleaved", &interleaved) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("quiet", &quiet) LONG_PARAMETER("params", ¶ms) LONG_INTPARAMETER("minReadLen", &minReadLength) LONG_INTPARAMETER("maxErrors", &maxErrors) LONG_PARAMETER_GROUP("Space Type") EXCLUSIVE_PARAMETER("baseSpace", &baseSpace) EXCLUSIVE_PARAMETER("colorSpace", &colorSpace) EXCLUSIVE_PARAMETER("auto", &autoDetect) LONG_PARAMETER_GROUP("Errors") EXCLUSIVE_PARAMETER("ignoreErrors", &ignoreErrors) LONG_SMARTINTPARAMETER("printableErrors", &printableErrors) BEGIN_LEGACY_PARAMETERS() LONG_PARAMETER("printBaseComp", &baseComposition) LONG_PARAMETER("disableAllMessages", &quiet) LONG_INTPARAMETER("quitAfterErrorNum", &maxErrors) LONG_PARAMETER_GROUP("Space Type") EXCLUSIVE_PARAMETER("baseSpace", &baseSpace) EXCLUSIVE_PARAMETER("colorSpace", &colorSpace) EXCLUSIVE_PARAMETER("autoDetect", &autoDetect) LONG_PARAMETER_GROUP("Errors") EXCLUSIVE_PARAMETER("ignoreAllErrors", &ignoreErrors) LONG_SMARTINTPARAMETER("maxReportedErrors", &printableErrors) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc, argv); if(ignoreErrors) { // Ignore all errors, so set printableErrors to 0. printableErrors = 0; } // Set the base type based on the passed in parameters. if(baseSpace) { // Base Space myBaseType = BaseAsciiMap::BASE_SPACE; } else if(colorSpace) { myBaseType = BaseAsciiMap::COLOR_SPACE; } else { myBaseType = BaseAsciiMap::UNKNOWN; // Set autoDetect autoDetect = true; } // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } // DO not print status if set to quiet. if((!quiet) && params) { inputParameters.Status(); } if(filename == "") { if(quiet) { return(-1); } // No filename was specified so print a usage description. std::cout << "ERROR: No filename specified. See below for usage help."; std::cout << std::endl << std::endl; std::cout << " Required Parameters:" << std::endl; std::cout << "\t--file : FastQ filename with path to be prorcessed.\n"; std::cout << std::endl; std::cout << " Optional Parameters:" << std::endl; std::cout << "\t--minReadLen : Minimum allowed read length (Defaults to 10).\n"; std::cout << "\t--maxErrors : Number of errors to allow before quitting\n"; std::cout << "\t reading/validating the file.\n"; std::cout << "\t -1 (default) indicates to not quit until\n"; std::cout << "\t the entire file is read.\n"; std::cout << "\t 0 indicates not to read/validate anything\n"; std::cout << "\t--printableErrors : Maximum number of errors to print before\n"; std::cout << "\t suppressing them (Defaults to 20).\n"; std::cout << "\t Different than maxErrors since \n"; std::cout << "\t printableErrors will continue reading and\n"; std::cout << "\t validating the file until the end, but\n"; std::cout << "\t just doesn't print the errors.\n"; std::cout << "\t--ignoreErrors : Ignore all errors (same as printableErrors = 0)\n"; std::cout << "\t overwrites the printableErrors option.\n"; std::cout << "\t--baseComposition : Print the Base Composition Statistics.\n"; std::cout << "\t--avgQual : Print the average phred quality per cycle & overall average quality.\n"; std::cout << "\t--disableSeqIDCheck : Disable the unique sequence identifier check.\n"; std::cout << "\t Use this option to save memory since the sequence id\n"; std::cout << "\t check uses a lot of memory.\n"; std::cout << "\t--noeof : Disable checking that the eof block is present in gzipped files\n."; std::cout << "\t--interleaved : Validate consequtive reads have the same sequence identifier\n"; std::cout << "\t (only allowed difference is 1/2, but not required) and validate\n"; std::cout << "\t that otherwise reads have unique sequence identifiers.\n"; std::cout << "\t Cannot be used if '--disableSeqIDCheck' is specified.\n"; std::cout << "\t--params : Print the parameter settings.\n"; std::cout << "\t--quiet : Suppresses the display of errors and summary statistics.\n"; std::cout << "\t Does not affect the printing of Base Composition Statistics.\n"; std::cout << "\n Optional Space Options for Raw Sequence (Last one specified is used):\n"; std::cout << "\t--auto : Determine baseSpace/colorSpace from the Raw Sequence in the file (Default).\n"; std::cout << "\t--baseSpace : ACTGN only\n"; std::cout << "\t--colorSpace : 0123. only\n"; std::cout << std::endl; std::cout << " Usage:" << std::endl; std::cout << "\t./fastQValidator --file <fileName> [--minReadLen <minReadLen>] [--maxErrors <numErrors>] [--printableErrors <printableErrors>|--ignoreErrors] [--baseComposition] [--disableSeqIDCheck] [--interleaved] [--quiet] [--baseSpace|--colorSpace|--auto] [--params]\n\n"; std::cout << " Examples:" << std::endl; std::cout << "\t../fastQValidator --file testFile.txt\n"; std::cout << "\t../fastQValidator --file testFile.txt --minReadLen 10 --baseSpace --printableErrors 100\n"; std::cout << "\t./fastQValidator --file test/testFile.txt --minReadLen 10 --colorSpace --ignoreErrors\n"; std::cout << std::endl; return (-1); } FastQFile validator(minReadLength, printableErrors); if(quiet) { validator.disableMessages(); } if(disableSeqIDCheck) { validator.disableSeqIDCheck(); } if(interleaved) { validator.interleaved(); } if(interleaved && disableSeqIDCheck) { if(!quiet) { std::cout << "ERROR: --interleaved and --disableSeqIDCheck cannot both be specified.\n"; } return(-1); } validator.setMaxErrors(maxErrors); FastQStatus::Status status = validator.validateFastQFile(filename, baseComposition, myBaseType, avgQual); if(!quiet) { std::cout << "Returning: " << status << " : " << FastQStatus::getStatusString(status) << std::endl; } return(status); }
int Stats::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String indexFile = ""; bool basic = false; bool noeof = false; bool params = false; bool qual = false; bool phred = false; int maxNumReads = -1; bool unmapped = false; String pBaseQC = ""; String cBaseQC = ""; String regionList = ""; int excludeFlags = 0; int requiredFlags = 0; bool withinRegion = false; int minMapQual = 0; String dbsnp = ""; PosList *dbsnpListPtr = NULL; bool baseSum = false; int bufferSize = PileupHelper::DEFAULT_WINDOW_SIZE; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inFile) LONG_PARAMETER_GROUP("Types of Statistics") LONG_PARAMETER("basic", &basic) LONG_PARAMETER("qual", &qual) LONG_PARAMETER("phred", &phred) LONG_STRINGPARAMETER("pBaseQC", &pBaseQC) LONG_STRINGPARAMETER("cBaseQC", &cBaseQC) LONG_PARAMETER_GROUP("Optional Parameters") LONG_INTPARAMETER("maxNumReads", &maxNumReads) LONG_PARAMETER("unmapped", &unmapped) LONG_STRINGPARAMETER("bamIndex", &indexFile) LONG_STRINGPARAMETER("regionList", ®ionList) LONG_INTPARAMETER("excludeFlags", &excludeFlags) LONG_INTPARAMETER("requiredFlags", &requiredFlags) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PARAMETER_GROUP("Optional phred/qual Only Parameters") LONG_PARAMETER("withinRegion", &withinRegion) LONG_PARAMETER_GROUP("Optional BaseQC Only Parameters") LONG_PARAMETER("baseSum", &baseSum) LONG_INTPARAMETER("bufferSize", &bufferSize) LONG_INTPARAMETER("minMapQual", &minMapQual) LONG_STRINGPARAMETER("dbsnp", &dbsnp) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument for stats, " << "but was not specified" << std::endl; return(-1); } // Use the index file if unmapped or regionList is not empty. bool useIndex = (unmapped|| (!regionList.IsEmpty())); // IndexFile is required, so check to see if it has been set. if(useIndex && (indexFile == "")) { // In file was not specified, so set it to the in file // + ".bai" indexFile = inFile + ".bai"; } //////////////////////////////////////// // Setup in case pileup is used. Pileup<PileupElementBaseQCStats> pileup(bufferSize); // Initialize start/end positions. myStartPos = 0; myEndPos = -1; // Open the output qc file if applicable. IFILE baseQCPtr = NULL; if(!pBaseQC.IsEmpty() && !cBaseQC.IsEmpty()) { usage(); inputParameters.Status(); // Cannot specify both types of baseQC. std::cerr << "Cannot specify both --pBaseQC & --cBaseQC." << std::endl; return(-1); } else if(!pBaseQC.IsEmpty()) { baseQCPtr = ifopen(pBaseQC, "w"); PileupElementBaseQCStats::setPercentStats(true); } else if(!cBaseQC.IsEmpty()) { baseQCPtr = ifopen(cBaseQC, "w"); PileupElementBaseQCStats::setPercentStats(false); } if(baseQCPtr != NULL) { PileupElementBaseQCStats::setOutputFile(baseQCPtr); PileupElementBaseQCStats::printHeader(); } if((baseQCPtr != NULL) || baseSum) { PileupElementBaseQCStats::setMapQualFilter(minMapQual); PileupElementBaseQCStats::setBaseSum(baseSum); } if(params) { inputParameters.Status(); } // Open the file for reading. SamFile samIn; if(!samIn.OpenForRead(inFile)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } samIn.SetReadFlags(requiredFlags, excludeFlags); // Set whether or not basic statistics should be generated. samIn.GenerateStatistics(basic); // Read the sam header. SamFileHeader samHeader; if(!samIn.ReadHeader(samHeader)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } // Open the bam index file for reading if we are // doing unmapped reads (also set the read section). if(useIndex) { samIn.ReadBamIndex(indexFile); if(unmapped) { samIn.SetReadSection(-1); } if(!regionList.IsEmpty()) { myRegionList = ifopen(regionList, "r"); } } ////////////////////////// // Read dbsnp if specified and doing baseQC if(((baseQCPtr != NULL) || baseSum) && (!dbsnp.IsEmpty())) { // Read the dbsnp file. IFILE fdbSnp; fdbSnp = ifopen(dbsnp,"r"); // Determine how many entries. const SamReferenceInfo& refInfo = samHeader.getReferenceInfo(); int maxRefLen = 0; for(int i = 0; i < refInfo.getNumEntries(); i++) { int refLen = refInfo.getReferenceLength(i); if(refLen >= maxRefLen) { maxRefLen = refLen + 1; } } dbsnpListPtr = new PosList(refInfo.getNumEntries(),maxRefLen); if(fdbSnp==NULL) { std::cerr << "Open dbSNP file " << dbsnp.c_str() << " failed!\n"; } else if(dbsnpListPtr == NULL) { std::cerr << "Failed to init the memory allocation for the dbsnpList.\n"; } else { // Read the dbsnp file. StringArray tokens; String buffer; int position = 0; int refID = 0; // Loop til the end of the file. while (!ifeof(fdbSnp)) { // Read the next line. buffer.ReadLine(fdbSnp); // If it does not have at least 2 columns, // continue to the next line. if (buffer.IsEmpty() || buffer[0] == '#') continue; tokens.AddTokens(buffer); if(tokens.Length() < 2) continue; if(!tokens[1].AsInteger(position)) { std::cerr << "Improperly formatted region line, start position " << "(2nd column) is not an integer: " << tokens[1] << "; Skipping to the next line.\n"; continue; } // Look up the reference name. refID = samHeader.getReferenceID(tokens[0]); if(refID != SamReferenceInfo::NO_REF_ID) { // Reference id was found, so add it to the dbsnp dbsnpListPtr->addPosition(refID, position); } tokens.Clear(); buffer.Clear(); } } ifclose(fdbSnp); } // Read the sam records. SamRecord samRecord; int numReads = 0; ////////////////////// // Setup in case doing a quality count. // Quality histogram. const int MAX_QUAL = 126; const int START_QUAL = 33; uint64_t qualCount[MAX_QUAL+1]; for(int i = 0; i <= MAX_QUAL; i++) { qualCount[i] = 0; } const int START_PHRED = 0; const int PHRED_DIFF = START_QUAL - START_PHRED; const int MAX_PHRED = MAX_QUAL - PHRED_DIFF; uint64_t phredCount[MAX_PHRED+1]; for(int i = 0; i <= MAX_PHRED; i++) { phredCount[i] = 0; } int refPos = 0; Cigar* cigarPtr = NULL; char cigarChar = '?'; // Exclude clips from the qual/phred counts if unmapped reads are excluded. bool qualExcludeClips = excludeFlags & SamFlag::UNMAPPED; ////////////////////////////////// // When not reading by sections, getNextSection returns true // the first time, then false the next time. while(getNextSection(samIn)) { // Keep reading records from the file until SamFile::ReadRecord // indicates to stop (returns false). while(((maxNumReads < 0) || (numReads < maxNumReads)) && samIn.ReadRecord(samHeader, samRecord)) { // Another record was read, so increment the number of reads. ++numReads; // See if the quality histogram should be genereated. if(qual || phred) { // Get the quality. const char* qual = samRecord.getQuality(); // Check for no quality ('*'). if((qual[0] == '*') && (qual[1] == 0)) { // This record does not have a quality string, so no // quality processing is necessary. } else { int index = 0; cigarPtr = samRecord.getCigarInfo(); cigarChar = '?'; refPos = samRecord.get0BasedPosition(); if(!qualExcludeClips && (cigarPtr != NULL)) { // Offset the reference position by any soft clips // by subtracting the queryIndex of this start position. // refPos is now the start position of the clips. refPos -= cigarPtr->getQueryIndex(0); } while(qual[index] != 0) { // Skip this quality if it is clipped and we are skipping clips. if(cigarPtr != NULL) { cigarChar = cigarPtr->getCigarCharOpFromQueryIndex(index); } if(qualExcludeClips && Cigar::isClip(cigarChar)) { // Skip a clipped quality. ++index; // Increment the position. continue; } if(withinRegion && (myEndPos != -1) && (refPos >= myEndPos)) { // We have hit the end of the region, stop processing this // quality string. break; } if(withinRegion && (refPos < myStartPos)) { // This position is not in the target. ++index; // Update the position if this is found in the reference or a clip. if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar)) { ++refPos; } continue; } // Check for valid quality. if((qual[index] < START_QUAL) || (qual[index] > MAX_QUAL)) { if(qual) { std::cerr << "Invalid Quality found: " << qual[index] << ". Must be between " << START_QUAL << " and " << MAX_QUAL << ".\n"; } if(phred) { std::cerr << "Invalid Phred Quality found: " << qual[index] - PHRED_DIFF << ". Must be between " << START_QUAL << " and " << MAX_QUAL << ".\n"; } // Skip an invalid quality. ++index; // Update the position if this is found in the reference or a clip. if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar)) { ++refPos; } continue; } // Increment the count for this quality. ++(qualCount[(int)(qual[index])]); ++(phredCount[(int)(qual[index]) - PHRED_DIFF]); // Update the position if this is found in the reference or a clip. if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar)) { ++refPos; } ++index; } } } // Check the next thing to do for the read. if((baseQCPtr != NULL) || baseSum) { // Pileup the bases for this read. pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr); } } // Done with a section, move on to the next one. // New section, so flush the pileup. pileup.flushPileup(); } // Flush the rest of the pileup. if((baseQCPtr != NULL) || baseSum) { // Pileup the bases. pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr); PileupElementBaseQCStats::printSummary(); ifclose(baseQCPtr); } std::cerr << "Number of records read = " << samIn.GetCurrentRecordCount() << std::endl; if(basic) { std::cerr << std::endl; samIn.PrintStatistics(); } // Print the quality stats. if(qual) { std::cerr << std::endl; std::cerr << "Quality\tCount\n"; for(int i = START_QUAL; i <= MAX_QUAL; i++) { std::cerr << i << "\t" << qualCount[i] << std::endl; } } // Print the phred quality stats. if(phred) { std::cerr << std::endl; std::cerr << "Phred\tCount\n"; for(int i = START_PHRED; i <= MAX_PHRED; i++) { std::cerr << i << "\t" << phredCount[i] << std::endl; } } SamStatus::Status status = samIn.GetStatus(); if(status == SamStatus::NO_MORE_RECS) { // A status of NO_MORE_RECS means that all reads were successful. status = SamStatus::SUCCESS; } return(status); }
int Dump::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; bool params = false; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inFile) LONG_PARAMETER_GROUP("Optional Other Parameters") LONG_PARAMETER("params", ¶ms) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); // mandatory argument was not specified. inputParameters.Status(); std::cerr << "Missing mandatory argument: --in" << std::endl; return(-1); } if(params) { inputParameters.Status(); } GlfFile glfIn; GlfHeader glfHeader; // Open the file for reading. glfIn.openForRead(inFile); // Read the glf header. glfIn.readHeader(glfHeader); // Output the glf header. std::string headerText = ""; glfHeader.getHeaderTextString(headerText); std::cout << "GlfHeader:\n"; std::cout << headerText << std::endl; int numSections = 0; // Set returnStatus to success. It will be changed // to the failure reason if any of the writes fail. GlfStatus::Status returnStatus = GlfStatus::SUCCESS; GlfRefSection refSection; while(glfIn.getNextRefSection(refSection)) { ++numSections; std::string refName; refSection.getName(refName); std::cout << "\tRefName = " << refName << "; RefLen = " << refSection.getRefLen() << "\n"; int64_t numSectionRecords = 0; GlfRecord record; int pos = 0; while(glfIn.getNextRecord(record)) { // Print the position. pos += record.getOffset(); std::cout << "position: " << pos << "\n\t"; record.print(); ++numSectionRecords; } } // // Keep reading records until they aren't anymore. // while(glfIn.ReadRecord(glfHeader, glfRecord)) // { // if(!readName.IsEmpty()) // { // // Check for readname. // if(strcmp(glfRecord.getReadName(), readName.c_str()) != 0) // { // // not a matching read name, so continue to the next record. // continue; // } // } // // Check to see if the read has already been processed. // if(myPrevEnd != UNSPECIFIED_INT) // { // // Because we already know that the bed was sorted, // // we know that the previous section started before // // this one, so if the previous end is greater than // // this record's end position we know that it // // was already written in the previous section. // // Note: can't be equal to the previous end since // // the end range was exclusive, while // // get0BasedAlignmentEnd is inclusive. // // myPrevEnd is reset by getNextSection when a new // // chromosome is hit. // if(glfRecord.get0BasedAlignmentEnd() < myPrevEnd) // { // // This record was already written. // continue; // } // } // // Shift left if applicable. // if(lshift) // { // glfRecord.shiftIndelsLeft(); // } // // Successfully read a record from the file, so write it. // glfOut.WriteRecord(glfHeader, glfRecord); // ++numSectionRecords; // } // myWroteReg = true; // } // if(myBedFile != NULL) // { // ifclose(myBedFile); // } // std::cerr << "Wrote " << outFile << " with " << numSectionRecords // << " records.\n"; return(returnStatus); }
int GapInfo::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String outFile = ""; String refFile = ""; bool detailed = false; bool checkFirst = false; bool checkStrand = false; bool noeof = false; bool params = false; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inFile) LONG_STRINGPARAMETER("out", &outFile) LONG_PARAMETER_GROUP("Optional Parameters") LONG_STRINGPARAMETER("refFile", &refFile) LONG_PARAMETER("detailed", &detailed) LONG_PARAMETER_GROUP("Optional Detailed Parameters") LONG_PARAMETER("checkFirst", &checkFirst) LONG_PARAMETER("checkStrand", &checkStrand) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } // Check to see if the out file was specified, if not, report an error. if(outFile == "") { usage(); inputParameters.Status(); // Out file was not specified but it is mandatory. std::cerr << "--out is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } if(params) { inputParameters.Status(); } return(processFile(inFile.c_str(), outFile.c_str(), refFile, detailed, checkFirst, checkStrand)); }
int WriteRegion::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String outFile = ""; String indexFile = ""; String readName = ""; String bed = ""; myStart = UNSPECIFIED_INT; myEnd = UNSPECIFIED_INT; myPrevStart = UNSPECIFIED_INT; myPrevEnd = UNSPECIFIED_INT; myRefID = UNSET_REF; myRefName.Clear(); myPrevRefName.Clear(); myBedRefID = SamReferenceInfo::NO_REF_ID; bool lshift = false; bool noeof = false; bool params = false; myWithinReg = false; myWroteReg = false; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inFile) LONG_STRINGPARAMETER("out", &outFile) LONG_PARAMETER_GROUP("Optional Region Parameters") LONG_STRINGPARAMETER("bamIndex", &indexFile) LONG_STRINGPARAMETER("refName", &myRefName) LONG_INTPARAMETER("refID", &myRefID) LONG_INTPARAMETER("start", &myStart) LONG_INTPARAMETER("end", &myEnd) LONG_STRINGPARAMETER("bed", &bed) LONG_PARAMETER("withinReg", &myWithinReg) LONG_STRINGPARAMETER("readName", &readName) LONG_PARAMETER_GROUP("Optional Other Parameters") LONG_PARAMETER("lshift", &lshift) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); // mandatory argument was not specified. inputParameters.Status(); std::cerr << "Missing mandatory argument: --in" << std::endl; return(-1); } if(outFile == "") { usage(); // mandatory argument was not specified. inputParameters.Status(); std::cerr << "Missing mandatory argument: --out" << std::endl; return(-1); } if(indexFile == "") { // In file was not specified, so set it to the in file // + ".bai" indexFile = inFile + ".bai"; } if(myRefID != UNSET_REF && myRefName.Length() != 0) { std::cerr << "Can't specify both refID and refName" << std::endl; inputParameters.Status(); return(-1); } if(myRefID != UNSET_REF && bed.Length() != 0) { std::cerr << "Can't specify both refID and bed" << std::endl; inputParameters.Status(); return(-1); } if(myRefName.Length() != 0 && bed.Length() != 0) { std::cerr << "Can't specify both refName and bed" << std::endl; inputParameters.Status(); return(-1); } if(!bed.IsEmpty()) { myBedFile = ifopen(bed, "r"); } if(params) { inputParameters.Status(); } // Open the file for reading. mySamIn.OpenForRead(inFile); // Open the output file for writing. SamFile samOut; samOut.OpenForWrite(outFile); // Open the bam index file for reading if a region was specified. if((myRefName.Length() != 0) || (myRefID != UNSET_REF) || (myBedFile != NULL)) { mySamIn.ReadBamIndex(indexFile); } // Read & write the sam header. mySamIn.ReadHeader(mySamHeader); samOut.WriteHeader(mySamHeader); // Read the sam records. SamRecord samRecord; // Track the status. int numSectionRecords = 0; // Set returnStatus to success. It will be changed // to the failure reason if any of the writes fail. SamStatus::Status returnStatus = SamStatus::SUCCESS; while(getNextSection()) { // Keep reading records until they aren't anymore. while(mySamIn.ReadRecord(mySamHeader, samRecord)) { if(!readName.IsEmpty()) { // Check for readname. if(strcmp(samRecord.getReadName(), readName.c_str()) != 0) { // not a matching read name, so continue to the next record. continue; } } // Check to see if the read has already been processed. if(myPrevEnd != UNSPECIFIED_INT) { // Because we already know that the bed was sorted, // we know that the previous section started before // this one, so if the previous end is greater than // this record's end position we know that it // was already written in the previous section. // Note: can't be equal to the previous end since // the end range was exclusive, while // get0BasedAlignmentEnd is inclusive. // myPrevEnd is reset by getNextSection when a new // chromosome is hit. if(samRecord.get0BasedAlignmentEnd() < myPrevEnd) { // This record was already written. continue; } } // Shift left if applicable. if(lshift) { samRecord.shiftIndelsLeft(); } // Successfully read a record from the file, so write it. samOut.WriteRecord(mySamHeader, samRecord); ++numSectionRecords; } myWroteReg = true; } if(myBedFile != NULL) { ifclose(myBedFile); } std::cerr << "Wrote " << outFile << " with " << numSectionRecords << " records.\n"; return(returnStatus); }
int VcfConvert::execute(int argc, char **argv) { String refFile = ""; String inputVcf = ""; String outputVcf = ""; String refName = ""; bool uncompress = false; bool params = false; bool noeof = false; // Read in the parameters. ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inputVcf) LONG_STRINGPARAMETER("out", &outputVcf) LONG_PARAMETER_GROUP("Optional Parameters") LONG_PARAMETER("uncompress", &uncompress) LONG_STRINGPARAMETER("refName", &refName) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // Check that all files were specified. if(inputVcf == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--in\", a required parameter.\n\n"; return(-1); } if(outputVcf == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--out\", a required parameter.\n\n"; return(-1); } if(params) { inputParameters.Status(); } // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } VcfFileReader inFile; VcfFileWriter outFile; VcfHeader header; // Open the file. inFile.open(inputVcf, header); if(refName != "") { inFile.setReadSection(refName.c_str()); } if(uncompress) { outFile.open(outputVcf, header, InputFile::DEFAULT); } else { outFile.open(outputVcf, header); } VcfRecord record; int numRecords = 0; while(inFile.readRecord(record)) { ++numRecords; outFile.writeRecord(record); } inFile.close(); std::cerr << "NumRecords: " << numRecords << "\n"; return(0); }
int main(int argc, char ** argv) { setbuf(stdout, NULL); time_t start = time(NULL); printf("MiniMac - Imputation into phased haplotypes\n" "(c) 2011 Goncalo Abecasis\n"); #ifdef __VERSION__ printf("VERSION 5.0\n"); #else printf("UNDOCUMENTED RELEASE\n"); #endif int rounds = 5, states = 200, cpus = 0; bool em = false, gzip = false, phased = false; String referenceHaplotypes, referenceSnps; String haplotypes, snps; String prefix("minimac"); String firstMarker, lastMarker; String recombinationRates, errorRates; BEGIN_LONG_PARAMETERS(longParameters) LONG_PARAMETER_GROUP("Reference Haplotypes") LONG_STRINGPARAMETER("refHaps", &referenceHaplotypes) LONG_STRINGPARAMETER("refSnps", &referenceSnps) LONG_PARAMETER_GROUP("Target Haplotypes") LONG_STRINGPARAMETER("haps", &haplotypes) LONG_STRINGPARAMETER("snps", &snps) LONG_PARAMETER_GROUP("Starting Parameters") LONG_STRINGPARAMETER("rec", &recombinationRates) LONG_STRINGPARAMETER("erate", &errorRates) LONG_PARAMETER_GROUP("Parameter Fitting") LONG_INTPARAMETER("rounds", &rounds) LONG_INTPARAMETER("states", &states) LONG_PARAMETER("em", &em) LONG_PARAMETER_GROUP("Output Files") LONG_STRINGPARAMETER("prefix", &prefix) LONG_PARAMETER("phased", &phased) LONG_PARAMETER("gzip", &gzip) // LONG_PARAMETER_GROUP("Clipping Window") // LONG_STRINGPARAMETER("start", &firstMarker) // LONG_STRINGPARAMETER("stop", &lastMarker) #ifdef _OPENMP LONG_PARAMETER_GROUP("Multi-Threading") LONG_INTPARAMETER("cpus", &cpus) #endif END_LONG_PARAMETERS(); ParameterList pl; pl.Add(new LongParameters("Command Line Options", longParameters)); pl.Read(argc, argv); pl.Status(); #ifdef _OPENMP if (cpus > 0) omp_set_num_threads(cpus); #endif // Read marker list printf("Reading Reference Marker List ...\n"); StringArray refMarkerList; refMarkerList.Read(referenceSnps); // Index markers StringIntHash referenceHash; for (int i = 0; i < refMarkerList.Length(); i++) referenceHash.Add(refMarkerList[i].Trim(), i); printf(" %d Markers in Reference Haplotypes...\n\n", refMarkerList.Length()); // Load reference haplotypes printf("Loading reference haplotypes ...\n"); HaplotypeSet reference; reference.markerCount = refMarkerList.Length(); reference.LoadHaplotypes(referenceHaplotypes); printf(" %d Reference Haplotypes Loaded ...\n\n", reference.count); // Read framework marker list printf("Reading Framework Marker List ...\n"); StringArray markerList; markerList.Read(snps); ClipReference(reference, refMarkerList, referenceHash, markerList, firstMarker, lastMarker); // Crossref Marker Names to Reference Panel Positions IntArray markerIndex; markerIndex.Dimension(markerList.Length()); int matches = 0; for (int i = 0; i < markerList.Length(); i++) { markerIndex[i] = referenceHash.Integer(markerList[i].Trim()); if (markerIndex[i] >= 0) matches++; } printf(" %d Markers in Framework Haplotypes Overlap Reference ...\n", matches); if (matches == 0) error("No markers overlap between target and reference\n" "Please check correct reference is being used and markers are named consistently"); printf(" %d Other Markers in Framework Haplotypes Discarded ...\n\n", markerList.Length() - matches); // Check for flips in reference vs. target haplotypes int flips = 0; int previous = -1; for (int i = 0; i < markerIndex.Length(); i++) if (markerIndex[i] >= 0) if (markerIndex[i] < previous) { if (flips++ < 10) printf(" -> Marker %s precedes %s in reference, but follows it in target\n", (const char *) refMarkerList[previous], (const char *) markerList[i]); previous = markerIndex[i]; } if (flips > 10) printf(" -> %d Additional Marker Order Changes Not Listed\n", flips - 10); if (flips) printf(" %d Marker Pairs Change Order in Target vs Framework Haplotypes\n", flips); // Load target haplotypes printf("Loading target haplotypes ...\n"); HaplotypeSet target; target.markerCount = markerList.Length(); target.LoadHaplotypes(haplotypes, true); reference.CalculateFrequencies(); target.CalculateFrequencies(); target.CompareFrequencies(reference, markerIndex, markerList); printf(" %d Target Haplotypes Loaded ...\n\n", target.count); int startIndex = firstMarker.IsEmpty() ? 0 : referenceHash.Integer(firstMarker); int stopIndex = lastMarker.IsEmpty() ? reference.markerCount - 1 : referenceHash.Integer(lastMarker); if (startIndex < 0 || stopIndex < 0) error("Clipping requested, but no position available for one of the endpoints"); printf("Setting up Markov Model...\n\n"); // Setup Markov Model MarkovParameters mp; mp.Allocate(reference.markerCount); if (rounds > 0) printf("Initializing Model Parameters (using %s and up to %d haplotypes)\n", em ? "E-M" : "MCMC", states); // Simple initial estimates of error and recombination rate for (int i = 0; i < reference.markerCount; i++) mp.E[i] = 0.01; for (int i = 0; i < reference.markerCount - 1; i++) mp.R[i] = 0.001; if (mp.ReadErrorRates(errorRates)) printf(" Updated error rates using data in %s ...\n", (const char *) errorRates); if (mp.ReadCrossoverRates(recombinationRates)) printf(" Updated recombination rates using %s ...\n", (const char *) recombinationRates); // Parameter estimation loop for (int round = 0; round < rounds; round++) { printf(" Round %d of Parameter Refinement ...\n", round + 1); int iterations = states < reference.count ? states : reference.count; MarkovModel original; original.CopyParameters(mp); #pragma omp parallel for for (int i = 0; i < iterations; i++) { MarkovModel mm; mm.Allocate(reference.markerCount, reference.count - 1); mm.CopyParameters(original); // Reference leave one out (loo) panel char ** reference_loo = new char * [reference.count - 1]; for (int in = 0, out = 0; in < reference.count; in++) if (in != i) reference_loo[out++] = reference.haplotypes[in]; mm.WalkLeft(reference.haplotypes[i], reference_loo, reference.freq); if (em) mm.CountExpected(reference.haplotypes[i], reference_loo, reference.freq); else { #pragma omp critical { mm.ProfileModel(reference.haplotypes[i], reference_loo, reference.freq); } } delete [] reference_loo; #pragma omp critical mp += mm; } if (round >= rounds / 2) { int iterations = states < target.count ? states : target.count; #pragma omp parallel for for (int i = 0; i < iterations; i++) { MarkovModel mm; mm.Allocate(reference.markerCount, reference.count); mm.CopyParameters(original); // Padded version of target haplotype, including missing sites char * padded = new char [reference.markerCount]; for (int k = 0; k < reference.markerCount; k++) padded[k] = 0; // Copy current haplotype into padded vector for (int j = 0; j < target.markerCount; j++) if (markerIndex[j] >= 0) padded[markerIndex[j]] = target.haplotypes[i][j]; mm.WalkLeft(padded, reference.haplotypes, reference.freq); if (em) mm.CountExpected(padded, reference.haplotypes, reference.freq); else { #pragma omp critical { mm.ProfileModel(padded, reference.haplotypes, reference.freq); } } delete [] padded; #pragma omp critical mp += mm; } } mp.UpdateModel(); double crossovers = 0; for (int i = 0; i < reference.markerCount - 1; i++) crossovers += mp.R[i]; double errors = 0; for (int i = 0; i < reference.markerCount; i++) { double heterozygosity = 1.0 - square(reference.freq[1][i]) - square(reference.freq[2][i]) - square(reference.freq[3][i]) - square(reference.freq[4][i]); errors += mp.E[i] * heterozygosity; } errors /= reference.markerCount + 1e-30; printf(" %.0f mosaic crossovers expected per haplotype\n", crossovers); printf(" %.1f%% of crossovers are due to reference flips\n", mp.empiricalFlipRate * 100.); printf(" %.3g errors in mosaic expected per marker\n", errors); } if (rounds > 0) { printf(" Saving estimated parameters for future use ...\n"); mp.WriteParameters(refMarkerList, prefix, gzip); } printf("\n"); // List the major allele at each location reference.ListMajorAlleles(); printf("Generating Draft .info File ...\n\n"); // Output some basic information IFILE info = ifopen(prefix + ".info.draft", "wt"); ifprintf(info, "SNP\tAl1\tAl2\tFreq1\tGenotyped\n"); for (int i = 0, j = 0; i <= stopIndex; i++) if (i >= startIndex) ifprintf(info, "%s\t%s\t%s\t%.4f\t%s\n", (const char *) refMarkerList[i], reference.MajorAlleleLabel(i), reference.MinorAlleleLabel(i), reference.freq[reference.major[i]][i], j < markerIndex.Length() && i == markerIndex[j] ? (j++, "Genotyped") : "-"); else if (j < markerIndex.Length() && i == markerIndex[j]) j++; ifclose(info); printf("Imputing Genotypes ...\n"); IFILE dosages = ifopen(prefix + ".dose" + (gzip ? ".gz" : ""), "wt"); IFILE hapdose, haps; if (phased) { hapdose = ifopen(prefix + ".hapDose" + (gzip ? ".gz" : ""), "wt"); haps = ifopen(prefix + ".haps" + (gzip ? ".gz" : ""), "wt"); } ImputationStatistics stats(reference.markerCount); // Impute each haplotype #pragma omp parallel for for (int i = 0; i < target.count; i++) { if (i != 0 && target.labels[i] == target.labels[i-1]) continue; MarkovModel mm; mm.Allocate(reference.markerCount, reference.count); mm.ClearImputedDose(); mm.CopyParameters(mp); // Padded version of target haplotype, including missing sites char * padded = new char [reference.markerCount]; for (int j = 0; j < reference.markerCount; j++) padded[j] = 0; int k = i; do { printf(" Processing Haplotype %d of %d ...\n", k + 1, target.count); // Copy current haplotype into padded vector for (int j = 0; j < target.markerCount; j++) if (markerIndex[j] >= 0) padded[markerIndex[j]] = target.haplotypes[k][j]; mm.WalkLeft(padded, reference.haplotypes, reference.freq); mm.Impute(reference.major, padded, reference.haplotypes, reference.freq); #pragma omp critical { stats.Update(mm.imputedHap, mm.leaveOneOut, padded, reference.major); } #pragma omp critical if (phased) { ifprintf(hapdose, "%s\tHAPLO%d", (const char *) target.labels[i], k - i + 1); ifprintf(haps, "%s\tHAPLO%d", (const char *) target.labels[i], k - i + 1); for (int j = startIndex; j <= stopIndex; j++) { ifprintf(hapdose, "\t%.3f", mm.imputedHap[j]); ifprintf(haps, "%s%c", j % 8 == 0 ? " " : "", mm.imputedAlleles[j]); } ifprintf(hapdose, "\n"); ifprintf(haps, "\n"); } k++; } while (k < target.count && target.labels[k] == target.labels[i]); printf(" Outputting Individual %s ...\n", (const char *) target.labels[i]); #pragma omp critical { ifprintf(dosages, "%s\tDOSE", (const char *) target.labels[i]); for (int j = startIndex; j <= stopIndex; j++) ifprintf(dosages, "\t%.3f", mm.imputedDose[j]); ifprintf(dosages, "\n"); } delete [] padded; } ifclose(dosages); if (phased) { ifclose(hapdose); ifclose(haps); } // Output some basic information info = ifopen(prefix + ".info" + (gzip ? ".gz" : ""), "wt"); ifprintf(info, "SNP\tAl1\tAl2\tFreq1\tMAF\tAvgCall\tRsq\tGenotyped\tLooRsq\tEmpR\tEmpRsq\tDose1\tDose2\n"); // Padded version of target haplotype, including missing sites char * padded = new char [reference.markerCount]; for (int k = 0; k < reference.markerCount; k++) padded[k] = 0; // Mark genotyped SNPs in padded vector for (int j = 0; j < target.markerCount; j++) if (markerIndex[j] >= 0) padded[markerIndex[j]] = 1; for (int i = startIndex; i <= stopIndex; i++) { ifprintf(info, "%s\t%s\t%s\t%.5f\t%.5f\t%.5f\t%.5f\t", (const char *) refMarkerList[i], reference.MajorAlleleLabel(i), reference.MinorAlleleLabel(i), stats.AlleleFrequency(i), stats.AlleleFrequency(i) > 0.5 ? 1.0 - stats.AlleleFrequency(i) : stats.AlleleFrequency(i), stats.AverageCallScore(i), stats.Rsq(i)); if (padded[i]) ifprintf(info, "Genotyped\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\n", stats.LooRsq(i), stats.EmpiricalR(i), stats.EmpiricalRsq(i), stats.LooMajorDose(i), stats.LooMinorDose(i)); else ifprintf(info, "-\t-\t-\t-\t-\t-\n"); } ifclose(info); delete [] padded; time_t stop = time(NULL); int seconds = stop - start; printf("\nRun completed in %d hours, %d mins, %d seconds on %s\n\n", seconds / 3600, (seconds % 3600) / 60, seconds % 60, ctime(&stop)); }
int main(int argc, char ** argv) { printf("glfMerge V1.0.2 -- Merge SNP calls based on .glf or .glz files\n"); printf("(c) 2009 Goncalo Abecasis, Sebastian Zoellner, Yun Li\n\n"); ParameterList pl; String qualities = "30,30"; String minDepths = "1,1"; String maxDepths = "200,200"; String outfile = "merged.glf"; bool verbose = false; IntArray qualityFilter; IntArray lowDepthFilter; IntArray highDepthFilter; BEGIN_LONG_PARAMETERS(longParameters) LONG_PARAMETER_GROUP("Map Quality Filter") LONG_STRINGPARAMETER("qualities", &qualities) LONG_PARAMETER_GROUP("Depth Filters") LONG_STRINGPARAMETER("minDepths", &minDepths) LONG_STRINGPARAMETER("maxDepths", &maxDepths) LONG_PARAMETER_GROUP("Output") LONG_STRINGPARAMETER("outfile", &outfile) LONG_PARAMETER("verbose", &verbose) END_LONG_PARAMETERS(); pl.Add(new LongParameters("Options", longParameters)); int argstart = pl.ReadWithTrailer(argc, argv) + 1; pl.Status(); time_t t; time(&t); printf("Analysis started on %s\n", ctime(&t)); fflush(stdout); int n = argc - argstart; argv += argstart; if (n == 0) error("No glf files listed at the end of command line\n"); StringToArray(qualities, qualityFilter, n); StringToArray(minDepths, lowDepthFilter, n); StringToArray(maxDepths, highDepthFilter, n); glfHandler * glf = new glfHandler[n]; for (int i = 0; i < n; i++) if (!glf[i].Open(argv[i])) error("Failed to open genotype likelihood file [%s]\n", argv[i]); printf("Calling genotypes for files ...\n"); for (int i = 0; i < n; i++) if (glf[i].isOpen()) printf(" %s\n", argv[i]); printf("\n"); glfHandler output; output.Create(outfile); long long depth = 0, originalDepth = 0; long long sites = 0, originalSites = 0; while (glf[0].NextSection()) { for (int i = 1; i < n; i++) { glf[i].NextSection(); if (glf[0].maxPosition != glf[i].maxPosition || glf[0].label != glf[i].label) { error("Genotype files '%s' and '%s' are not compatible ...\n" " File '%s' has section %s with %d entries ...\n" " File '%s' section %s with %d entries ...\n", argv[0], argv[i], argv[0], (const char *) glf[0].label, glf[0].maxPosition, argv[i], (const char *) glf[i].label, glf[i].maxPosition); } } printf("Processing section %s with %d entries\n", (const char *) glf[0].label, glf[0].maxPosition); output.BeginSection(glf[0].label, glf[0].maxPosition); for (int i = 0; i < n; i++) glf[i].NextBaseEntry(); int position = glf[0].position; char refBase = glf[0].data.refBase; for (int i = 1; i < n; i++) if (position > glf[i].position) { position = glf[i].position; refBase = glf[i].data.refBase; } while (position < glf[0].maxPosition) { output.data.recordType = 1; output.data.refBase = refBase; output.data.depth = 0; output.data.mapQuality = 0; output.data.minLLK = 0; for (int i = 0; i < 10; i++) output.data.lk[i] = 0; for (int i = 0; i < n; i++) if (glf[i].position == position && glf[i].data.mapQuality >= qualityFilter[i] && glf[i].data.depth >= (unsigned) lowDepthFilter[i] && (glf[i].data.depth <= (unsigned) highDepthFilter[i] || highDepthFilter[i] == 0)) { int deltaMinMap = output.data.lk[0] + glf[i].data.lk[0]; for (int j = 1; j < 10; j++) if (deltaMinMap > output.data.lk[j] + glf[i].data.lk[j]) deltaMinMap = output.data.lk[j] + glf[i].data.lk[j]; output.data.minLLK += deltaMinMap + glf[i].data.minLLK; for (int j = 0; j < 10; j++) if (output.data.lk[j] + glf[i].data.lk[j] - deltaMinMap < 255) output.data.lk[j] += glf[i].data.lk[j] - deltaMinMap; else output.data.lk[j] = 255; output.data.mapQuality = (char) sqrt( ( (sq(output.data.mapQuality) * output.data.depth) + (sq(glf[i].data.mapQuality) * glf[i].data.depth) ) / (output.data.depth + glf[i].data.depth + 1e-30) ); output.data.depth += glf[i].data.depth; if (verbose) { printf("lk[%d] : { %d", i, glf[i].data.lk[0]); for (int j = 1; j < 10; j++) printf(", %d", glf[i].data.lk[j]); printf("} [map: %d, depth: %d]\n", glf[i].data.mapQuality, glf[i].data.depth); } } for (int i = 0; i < n; i++) if (glf[i].position == position) originalDepth += glf[i].data.depth; originalSites++; if (output.data.depth) { if (verbose) { printf("output : { %d", output.data.lk[0]); for (int j = 1; j < 10; j++) printf(", %d", output.data.lk[j]); printf("} [map: %d, depth: %d]\n", output.data.mapQuality, output.data.depth); } output.WriteEntry(position); depth += output.data.depth; sites ++; } for (int i = 0; i < n; i++) if (glf[i].position == position) glf[i].NextBaseEntry(); position = glf[0].position; refBase = glf[0].data.refBase; for (int i = 1; i < n; i++) if (position > glf[i].position) { position = glf[i].position; refBase = glf[i].data.refBase; } } output.EndSection(); } printf("Combined file includes %ld bases covering %ld sites (%.1f coverage)\n", depth, sites, depth / (sites + 1e-30)); printf("Original files include %ld bases covering %ld sites (%.1f coverage)\n", originalDepth, originalSites, originalDepth / (originalSites + 1e-30)); for (int i = 0; i < n; i++) if (glf[i].isOpen()) glf[i].Close(); output.Close(); }
int VcfConsensus::execute(int argc, char ** argv) { String vcfName1; String vcfName2; String vcfName3; String outputFileName; bool uncompress = false; bool params = false; // Read in the parameters. ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in1", &vcfName1) LONG_STRINGPARAMETER("in2", &vcfName2) LONG_STRINGPARAMETER("in3", &vcfName3) LONG_STRINGPARAMETER("out", &outputFileName) LONG_PARAMETER_GROUP("Optional Parameters") LONG_PARAMETER("uncompress", &uncompress) LONG_PARAMETER("params", ¶ms) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); std::string gtField = "GT"; VcfFileReader vcf1; VcfFileReader vcf2; VcfFileReader vcf3; VcfHeader header1; VcfHeader header2; VcfHeader header3; VcfRecord record1; VcfRecord record2; VcfRecord record3; VcfRecordGenotype* genotypeInfoPtr1 = NULL; VcfRecordGenotype* genotypeInfoPtr2 = NULL; VcfRecordGenotype* genotypeInfoPtr3 = NULL; unsigned int numMissing2 = 0; unsigned int numMissing3 = 0; unsigned int numMismatchRefAlt = 0; unsigned int numMissingGT1 = 0; const unsigned int myMaxErrors = 4; // Check that the required parameters were set. if(vcfName1 == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--in1\", a required parameter.\n\n"; return(-1); } if(vcfName2 == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--in2\", a required parameter.\n\n"; return(-1); } if(vcfName3 == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--in3\", a required parameter.\n\n"; return(-1); } if(outputFileName == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--out\", a required parameter.\n\n"; return(-1); } if(params) { inputParameters.Status(); } // Open the files. vcf1.open(vcfName1, header1); vcf2.open(vcfName2, header2); vcf3.open(vcfName3, header3); // Setup the sample name maps. int numSamples = header1.getNumSamples(); std::vector<int> sample2Indices; std::vector<int> sample3Indices; std::vector<int> removeIndices; int numSamplesSkipped1 = 0; int numSamplesSkipped2 = 0; int numSamplesSkipped3 = 0; for(int i = 0; i < numSamples; i++) { int sm2Index = header2.getSampleIndex(header1.getSampleName(i)); int sm3Index = header3.getSampleIndex(header1.getSampleName(i)); // Look for this sample name in vcf2. if((sm2Index != -1) && (sm3Index != -1)) { sample2Indices.push_back(sm2Index); sample3Indices.push_back(sm3Index); } else { // Sample not found in all three vcfs. removeIndices.push_back(i); ++numSamplesSkipped1; } } // Remove samples not found in all 3 vcfs from header1. // Remove them in reverse order so they are removed from the end of the header first. VcfSubsetSamples subset1; subset1.init(header1, true); for(int i = (removeIndices.size() - 1); i >= 0; i--) { subset1.addExcludeSample(header1.getSampleName(removeIndices[i])); header1.removeSample(removeIndices[i]); } // Set numSamples to the new number of samples in header1. numSamples = header1.getNumSamples(); // Calculate the number of samples skipped for files 2 & 3. numSamplesSkipped2 = header2.getNumSamples() - sample2Indices.size(); numSamplesSkipped3 = header3.getNumSamples() - sample3Indices.size(); if(numSamplesSkipped1 > 0) { std::cerr << "Skipping " << numSamplesSkipped1 << " samples from --in1\n"; } if(numSamplesSkipped2 > 0) { std::cerr << "Skipping " << numSamplesSkipped2 << " samples from --in2\n"; } if(numSamplesSkipped3 > 0) { std::cerr << "Skipping " << numSamplesSkipped3 << " samples from --in3\n"; } VcfFileWriter outputVcf; // Open and write the header if(uncompress) { outputVcf.open(outputFileName, header1, InputFile::DEFAULT); } else { outputVcf.open(outputFileName, header1); } const char* chrom1 = NULL; int pos1 = UNSET_POS; // Read the first record from vcf2 & vcf3. vcf2.readRecord(record2); vcf3.readRecord(record3); bool newChrom = true; static std::string prevChrom = ""; uint64_t numAllMatch = 0; uint64_t num1Match2Only = 0; uint64_t num1Match3Only = 0; uint64_t num2Match3Only = 0; uint64_t numNoMatches = 0; uint64_t numAllMatch00 = 0; uint64_t num1Match2Only00 = 0; uint64_t num1Match3Only00 = 0; uint64_t num2Match3Only00 = 0; uint64_t numAllMatch01 = 0; uint64_t num1Match2Only01 = 0; uint64_t num1Match3Only01 = 0; uint64_t num2Match3Only01 = 0; uint64_t numAllMatch11 = 0; uint64_t num1Match2Only11 = 0; uint64_t num1Match3Only11 = 0; uint64_t num2Match3Only11 = 0; // Loop through vcf1. while(vcf1.readRecord(record1, &subset1)) { chrom1 = record1.getChromStr(); pos1 = record1.get1BasedPosition(); if(strcmp(chrom1, prevChrom.c_str()) == 0) { newChrom = false; } else { prevChrom = chrom1; newChrom = true; } bool found = true; if(!findPos(newChrom, chrom1, pos1, record2, vcf2)) { // Failed to find the position, continue to the next position if(++numMissing2 <= myMaxErrors) { std::cerr << "Failed to find " << chrom1 << ":" << pos1 << " in " << vcfName2 << ", so skipping this pos\n"; } found = false; } if(!findPos(newChrom, chrom1, pos1, record3, vcf3)) { // Failed to find the position, continue to the next position if(++numMissing3 <= myMaxErrors) { std::cerr << "Failed to find " << chrom1 << ":" << pos1 << " in " << vcfName3 << ", so skipping this pos\n"; } found = false; } if(found == false) { continue; } // Found the position in all files. // Validate that the reference & alternate alleles are the same. const char* ref1 = record1.getRefStr(); const char* alt1 = record1.getAltStr(); if((strcmp(ref1, record2.getRefStr()) != 0) || (strcmp(ref1, record3.getRefStr()) != 0) || (strcmp(alt1, record2.getAltStr()) != 0) || (strcmp(alt1, record3.getAltStr()) != 0)) { if(++numMismatchRefAlt <= myMaxErrors) { std::cerr << "Mismatching ref/alt found at " << chrom1 << ":" << pos1 << ", so skipping this pos\n"; } continue; } // Get the genotype information for each. genotypeInfoPtr1 = &(record1.getGenotypeInfo()); genotypeInfoPtr2 = &(record2.getGenotypeInfo()); genotypeInfoPtr3 = &(record3.getGenotypeInfo()); // Loop through all the samples in vcf1. // Get the Genotype Information. for(int i = 0; i < numSamples; i++) { const std::string* genotypeVal1 = genotypeInfoPtr1->getString(gtField, i); const std::string* genotypeVal2 = genotypeInfoPtr2->getString(gtField, sample2Indices[i]); const std::string* genotypeVal3 = genotypeInfoPtr3->getString(gtField, sample3Indices[i]); // Need to make sure the field was found. if(genotypeVal1 == NULL) { // GT not found in the first record, so just continue. if(++numMissingGT1 <= myMaxErrors) { std::cerr << "Missing GT for " << header1.getSampleName(i) << " in " << vcfName1 << "\n"; } continue; } if(isSame(genotypeVal1, genotypeVal2)) { // genotypeVal1 is majority, so make no change. if(isSame(genotypeVal1, genotypeVal3)) { ++numAllMatch; if(*genotypeVal1 == "0/0") { ++numAllMatch00; } else if((*genotypeVal1 == "0/1") || (*genotypeVal1 == "1/0")) { ++numAllMatch01; } if(*genotypeVal1 == "1/1") { ++numAllMatch11; } } else { ++num1Match2Only; if(*genotypeVal1 == "0/0") { ++num1Match2Only00; } else if((*genotypeVal1 == "0/1") || (*genotypeVal1 == "1/0")) { ++num1Match2Only01; } if(*genotypeVal1 == "1/1") { ++num1Match2Only11; } } } else if(isSame(genotypeVal1, genotypeVal3)) { // genotypeVal1 is majority, so make no change. ++num1Match3Only; if(*genotypeVal1 == "0/0") { ++num1Match3Only00; } else if((*genotypeVal1 == "0/1") || (*genotypeVal1 == "1/0")) { ++num1Match3Only01; } if(*genotypeVal1 == "1/1") { ++num1Match3Only11; } } else if(isSame(genotypeVal2, genotypeVal3)) { // genotypeVal2 is majority, so change genotypeVal1. genotypeInfoPtr1->setString(gtField, i, *genotypeVal2); ++num2Match3Only; if(*genotypeVal2 == "0/0") { ++num2Match3Only00; } else if((*genotypeVal2 == "0/1") || (*genotypeVal2 == "1/0")) { ++num2Match3Only01; } if(*genotypeVal2 == "1/1") { ++num2Match3Only11; } } else { // None are the same so set to "./." genotypeInfoPtr1->setString(gtField, i, "./."); ++numNoMatches; } } // loop back to vcf1 samples. // Write this record. outputVcf.writeRecord(record1); } // loop back to next vcf1 record. std::cerr << "\n"; if(numMissing2 > myMaxErrors) { std::cerr << "Suppressed " << numMissing2 - myMaxErrors << " errors about skipped positions because they were not in " << vcfName2 << "\n"; } if(numMissing3 > myMaxErrors) { std::cerr << "Suppressed " << numMissing3 - myMaxErrors << " errors about skipped positions because they were not in " << vcfName3 << "\n"; } if(numMismatchRefAlt > myMaxErrors) { std::cerr << "Suppressed " << numMismatchRefAlt - myMaxErrors << " errors about mismatched ref/alt positions\n"; } if(numMissingGT1 > myMaxErrors) { std::cerr << "Suppressed " << numMissingGT1 - myMaxErrors << " errors about missing GT for " << vcfName1 << "\n"; } std::cerr << "\n"; // Output the stats. std::cerr << "File1 = " << vcfName1 << std::endl; std::cerr << "File2 = " << vcfName2 << std::endl; std::cerr << "File3 = " << vcfName3 << std::endl; std::cerr << "\nType\tTotal\t0/0\t0/1|1/0\t1/1\n"; std::cerr << "AllMatched" << "\t" << numAllMatch << "\t" << numAllMatch00 << "\t" << numAllMatch01 << "\t" << numAllMatch11 << std::endl; std::cerr << "1matched2" << "\t" << num1Match2Only << "\t" << num1Match2Only00 << "\t" << num1Match2Only01 << "\t" << num1Match2Only11 << std::endl; std::cerr << "1matched3" << "\t" << num1Match3Only << "\t" << num1Match3Only00 << "\t" << num1Match3Only01 << "\t" << num1Match3Only11 << std::endl; std::cerr << "2matched3" << "\t" << num2Match3Only << "\t" << num2Match3Only00 << "\t" << num2Match3Only01 << "\t" << num2Match3Only11 << std::endl; std::cerr << "NoneMatched\t" << numNoMatches << std::endl; return(0); }
int Revert::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String outFile = ""; bool cigar = false; bool qual = false; bool noeof = false; bool params = false; bool rmBQ = false; String rmTags = ""; myKeepTags = false; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_STRINGPARAMETER("in", &inFile) LONG_STRINGPARAMETER("out", &outFile) LONG_PARAMETER("cigar", &cigar) LONG_PARAMETER("qual", &qual) LONG_PARAMETER("keepTags", &myKeepTags) LONG_PARAMETER("rmBQ", &rmBQ) LONG_STRINGPARAMETER("rmTags", &rmTags) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } if(outFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--out is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } if(params) { inputParameters.Status(); } // Open the input file for reading. SamFile samIn; samIn.OpenForRead(inFile); // Open the output file for writing. SamFile samOut; samOut.OpenForWrite(outFile); // Read the sam header. SamFileHeader samHeader; samIn.ReadHeader(samHeader); // Write the sam header. samOut.WriteHeader(samHeader); SamRecord samRecord; // Set returnStatus to success. It will be changed to the // failure reason if any of the writes or updates fail. SamStatus::Status returnStatus = SamStatus::SUCCESS; // Keep reading records until ReadRecord returns false. while(samIn.ReadRecord(samHeader, samRecord)) { // Update the cigar & position. if(cigar) { if(!updateCigar(samRecord)) { // Failed to update the cigar & position. fprintf(stderr, "%s\n", samIn.GetStatusMessage()); returnStatus = samIn.GetStatus(); } } if(qual) { if(!updateQual(samRecord)) { // Failed to update the quality. fprintf(stderr, "%s\n", samIn.GetStatusMessage()); returnStatus = samIn.GetStatus(); } } if(rmBQ) { if(!removeBQ(samRecord)) { // Failed to remove BQ. fprintf(stderr, "%s\n", samIn.GetStatusMessage()); returnStatus = samIn.GetStatus(); } } if(rmTags != "") { if(!samRecord.rmTags(rmTags.c_str())) { // Failed to remove the specified tags. fprintf(stderr, "%s\n", samIn.GetStatusMessage()); returnStatus = samIn.GetStatus(); } } // Successfully read a record from the file, so write it. if(!samOut.WriteRecord(samHeader, samRecord)) { // Failed to write a record. fprintf(stderr, "%s\n", samOut.GetStatusMessage()); returnStatus = samOut.GetStatus(); } } std::cerr << std::endl << "Number of records read = " << samIn.GetCurrentRecordCount() << std::endl; std::cerr << "Number of records written = " << samOut.GetCurrentRecordCount() << std::endl; // Since the reads were successful, return the status based // on the status of the writes. If any failed, return // their failure status. return(returnStatus); }
int VcfSplit::execute(int argc, char **argv) { String refFile = ""; String inputVcf = ""; String outputVcfBase = ""; String refName = ""; bool uncompress = false; bool params = false; bool noeof = false; // Read in the parameters. ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inputVcf) LONG_STRINGPARAMETER("obase", &outputVcfBase) LONG_PARAMETER_GROUP("Optional Parameters") LONG_PARAMETER("uncompress", &uncompress) LONG_STRINGPARAMETER("refName", &refName) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // Check that all files were specified. if(inputVcf == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--in\", a required parameter.\n\n"; return(-1); } if(outputVcfBase == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--obase\", a required parameter.\n\n"; return(-1); } outputVcfBase += "."; if(params) { inputParameters.Status(); } // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } VcfFileReader inFile; std::map<std::string, VcfFileWriter*> outFiles; VcfHeader header; // Open the file. inFile.open(inputVcf, header); if(refName != "") { inFile.setReadSection(refName.c_str()); } VcfRecord record; int numRecords = 0; std::string prevChr = ""; std::string chr = ""; VcfFileWriter* outFilePtr = 0; std::string outName = ""; while(inFile.readRecord(record)) { ++numRecords; chr = record.getChromStr(); if((outFilePtr == 0) || (chr != prevChr)) { outFilePtr = outFiles[chr]; if(outFilePtr == 0) { outFilePtr = new VcfFileWriter(); outFiles[chr] = outFilePtr; outName = outputVcfBase.c_str(); if(chr.substr(0,3) != "chr") { outName += "chr"; } outName += chr + ".vcf"; // chr not in outFile list. if(uncompress) { outFilePtr->open(outName.c_str(), header, InputFile::DEFAULT); } else { outName += ".gz"; outFilePtr->open(outName.c_str(), header); } } } outFilePtr->writeRecord(record); } inFile.close(); for (std::map<std::string,VcfFileWriter*>::iterator it = outFiles.begin(); it != outFiles.end(); ++it) { if(it->second != 0) { it->second->close(); it->second = 0; } } std::cerr << "NumRecords: " << numRecords << "\n"; return(0); }
int VcfCleaner::execute(int argc, char **argv) { String refFile = ""; String inputVcf = ""; String outputVcf = ""; bool uncompress = false; bool params = false; // Read in the parameters. ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inputVcf) LONG_STRINGPARAMETER("out", &outputVcf) LONG_PARAMETER_GROUP("Optional Parameters") LONG_PARAMETER("uncompress", &uncompress) LONG_PARAMETER("params", ¶ms) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // Check that all files were specified. if(inputVcf == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--in\", a required parameter.\n\n"; return(-1); } if(outputVcf == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--out\", a required parameter.\n\n"; return(-1); } if(params) { inputParameters.Status(); } VcfFileReader inFile; VcfFileWriter outFile; VcfHeader header; VcfRecord record; // Open the file. inFile.open(inputVcf, header); if(uncompress) { outFile.open(outputVcf, header, InputFile::DEFAULT); } else { outFile.open(outputVcf, header); } int numReadRecords = 0; int numWrittenRecords = 0; int returnVal = 0; // Set to only store/write the GT field. VcfRecordGenotype::addStoreField("GT"); while(inFile.readRecord(record)) { ++numReadRecords; // Check if any samples are missing GT or if any are not phased. if(!record.hasAllGenotypeAlleles() || !record.allPhased()) { // Missing a GT or not phased, so continue without writing. continue; } // Clear the INFO field. record.getInfo().clear(); // Write the record. if(!outFile.writeRecord(record)) { // Write error. std::cerr << "Failed writing a vcf record.\n"; returnVal = -1; } ++numWrittenRecords; } inFile.close(); outFile.close(); std::cerr << "NumReadRecords: " << numReadRecords << "; NumWrittenRecords: " << numWrittenRecords << "\n"; return(returnVal); }
// Dump the reference information from specified SAM/BAM file. int DumpRefInfo::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; bool noeof = false; bool printRecordRefs = false; bool params = false; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_STRINGPARAMETER("in", &inFile) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("printRecordRefs", &printRecordRefs) LONG_PARAMETER("params", ¶ms) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } if(params) { inputParameters.Status(); } // Open the input file for reading. SamFile samIn; samIn.OpenForRead(inFile); // Read the sam header. SamFileHeader samHeader; samIn.ReadHeader(samHeader); const SamReferenceInfo& refInfo = samHeader.getReferenceInfo(); int numReferences = refInfo.getNumEntries(); for(int i = 0; i < numReferences; i++) { std::cout << "Reference Index " << i; std::cout << "; Name: " << refInfo.getReferenceName(i) << std::endl; } if(numReferences == 0) { // There is no reference info. std::cerr << "The header contains no reference information.\n"; } // If we are to print the references as found in the records, loop // through reading the records. if(printRecordRefs) { SamRecord samRecord; // Track the prev name/id. std::string prevName = ""; int prevID = -2; int recCount = 0; // track the num records in a ref. // Keep reading records until ReadRecord returns false. while(samIn.ReadRecord(samHeader, samRecord)) { const char* name = samRecord.getReferenceName(); int id = samRecord.getReferenceID(); if((strcmp(name, prevName.c_str()) != 0) || (id != prevID)) { if(prevID != -2) { std::cout << "\tRef ID: " << prevID << "\tRef Name: " << prevName << "\tNumRecs: " << recCount << std::endl; } recCount = 0; prevID = id; prevName = name; } ++recCount; } // Print the last index. if(prevID != -2) { std::cout << "\tRef ID: " << prevID << "\tRef Name: " << prevName << "\tNumRecs: " << recCount << std::endl; } } return(SamStatus::SUCCESS); }