int ReadReference::execute(int argc, char **argv) { static const int UNSPECIFIED_INT = -1; String refFile = ""; String refName = ""; int start = UNSPECIFIED_INT; int numBases = UNSPECIFIED_INT; int end = UNSPECIFIED_INT; bool params = false; // Read in the parameters. ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_STRINGPARAMETER("refFile", &refFile) LONG_STRINGPARAMETER("refName", &refName) LONG_INTPARAMETER("start", &start) LONG_INTPARAMETER("end", &end) LONG_INTPARAMETER("numBases", &numBases) LONG_PARAMETER("params", ¶ms) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); if((refName == "") || (start == UNSPECIFIED_INT) || ((end == UNSPECIFIED_INT) && (numBases == UNSPECIFIED_INT))) { usage(); inputParameters.Status(); std::cerr << "Missing Required Parameter\n\n"; return(-1); } if((end != UNSPECIFIED_INT) && (numBases != UNSPECIFIED_INT)) { usage(); inputParameters.Status(); std::cerr << "Only --end or --numBases can be specified\n\n"; return(-1); } else if(numBases != UNSPECIFIED_INT) { end = start + numBases; } if(params) { inputParameters.Status(); } // Open the reference. GenomeSequence reference(refFile); uint32_t refStart = reference.getGenomePosition(refName.c_str()); if(refStart == INVALID_GENOME_INDEX) { std::cerr << "Reference Name: " << refName.c_str() << " not found in the reference file\n"; return(-1); } std::string refString; reference.getString(refString, refStart + start, end - start); std::cout << refString << std::endl; return(0); }
int Revert::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String outFile = ""; bool cigar = false; bool qual = false; bool noeof = false; bool params = false; bool rmBQ = false; String rmTags = ""; myKeepTags = false; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_STRINGPARAMETER("in", &inFile) LONG_STRINGPARAMETER("out", &outFile) LONG_PARAMETER("cigar", &cigar) LONG_PARAMETER("qual", &qual) LONG_PARAMETER("keepTags", &myKeepTags) LONG_PARAMETER("rmBQ", &rmBQ) LONG_STRINGPARAMETER("rmTags", &rmTags) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } if(outFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--out is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } if(params) { inputParameters.Status(); } // Open the input file for reading. SamFile samIn; samIn.OpenForRead(inFile); // Open the output file for writing. SamFile samOut; samOut.OpenForWrite(outFile); // Read the sam header. SamFileHeader samHeader; samIn.ReadHeader(samHeader); // Write the sam header. samOut.WriteHeader(samHeader); SamRecord samRecord; // Set returnStatus to success. It will be changed to the // failure reason if any of the writes or updates fail. SamStatus::Status returnStatus = SamStatus::SUCCESS; // Keep reading records until ReadRecord returns false. while(samIn.ReadRecord(samHeader, samRecord)) { // Update the cigar & position. if(cigar) { if(!updateCigar(samRecord)) { // Failed to update the cigar & position. fprintf(stderr, "%s\n", samIn.GetStatusMessage()); returnStatus = samIn.GetStatus(); } } if(qual) { if(!updateQual(samRecord)) { // Failed to update the quality. fprintf(stderr, "%s\n", samIn.GetStatusMessage()); returnStatus = samIn.GetStatus(); } } if(rmBQ) { if(!removeBQ(samRecord)) { // Failed to remove BQ. fprintf(stderr, "%s\n", samIn.GetStatusMessage()); returnStatus = samIn.GetStatus(); } } if(rmTags != "") { if(!samRecord.rmTags(rmTags.c_str())) { // Failed to remove the specified tags. fprintf(stderr, "%s\n", samIn.GetStatusMessage()); returnStatus = samIn.GetStatus(); } } // Successfully read a record from the file, so write it. if(!samOut.WriteRecord(samHeader, samRecord)) { // Failed to write a record. fprintf(stderr, "%s\n", samOut.GetStatusMessage()); returnStatus = samOut.GetStatus(); } } std::cerr << std::endl << "Number of records read = " << samIn.GetCurrentRecordCount() << std::endl; std::cerr << "Number of records written = " << samOut.GetCurrentRecordCount() << std::endl; // Since the reads were successful, return the status based // on the status of the writes. If any failed, return // their failure status. return(returnStatus); }
int Convert::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String outFile = ""; String refFile = ""; bool lshift = false; bool noeof = false; bool params = false; bool useBases = false; bool useEquals = false; bool useOrigSeq = false; bool recover = false; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_STRINGPARAMETER("in", &inFile) LONG_STRINGPARAMETER("out", &outFile) LONG_STRINGPARAMETER("refFile", &refFile) LONG_PARAMETER("lshift", &lshift) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("recover", &recover) LONG_PARAMETER("params", ¶ms) LONG_PARAMETER_GROUP("SequenceConversion") EXCLUSIVE_PARAMETER("useBases", &useBases) EXCLUSIVE_PARAMETER("useEquals", &useEquals) EXCLUSIVE_PARAMETER("useOrigSeq", &useOrigSeq) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { printUsage(std::cerr); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } if(outFile == "") { printUsage(std::cerr); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--out is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } // Check to see if the ref file was specified. // Open the reference. GenomeSequence* refPtr = NULL; if(refFile != "") { refPtr = new GenomeSequence(refFile); } SamRecord::SequenceTranslation translation; if((useBases) && (refPtr != NULL)) { translation = SamRecord::BASES; } else if((useEquals) && (refPtr != NULL)) { translation = SamRecord::EQUAL; } else { useOrigSeq = true; translation = SamRecord::NONE; } if(params) { inputParameters.Status(); } // Open the input file for reading. SamFile samIn; if(recover) samIn.setAttemptRecovery(true); samIn.OpenForRead(inFile); // Open the output file for writing. SamFile samOut; samOut.OpenForWrite(outFile); samOut.SetWriteSequenceTranslation(translation); samOut.SetReference(refPtr); // Read the sam header. SamFileHeader samHeader; samIn.ReadHeader(samHeader); // Write the sam header. samOut.WriteHeader(samHeader); SamRecord samRecord; // Set returnStatus to success. It will be changed // to the failure reason if any of the writes fail. SamStatus::Status returnStatus = SamStatus::SUCCESS; while(1) { try { // Keep reading records until ReadRecord returns false. while(samIn.ReadRecord(samHeader, samRecord)) { // left shift if necessary. if(lshift) { samRecord.shiftIndelsLeft(); } // Successfully read a record from the file, so write it. if(!samOut.WriteRecord(samHeader, samRecord)) { // Failed to write a record. fprintf(stderr, "%s\n", samOut.GetStatusMessage()); returnStatus = samOut.GetStatus(); } } break; } catch (std::runtime_error e) { std::cerr << "Caught runtime error: " << e.what() << "\n"; if(!recover) { std::cerr << "Corrupted BAM file detected - consider using --recover option.\n"; break; } std::cerr << "Attempting to resync at next good BGZF block and BAM record.\n"; // XXX need to resync SamFile stream here bool rc = samIn.attemptRecoverySync(checkSignature, SIGNATURE_LENGTH); if(rc) { std::cerr << "Successful resync - some data lost.\n"; continue; // succeeded } std::cerr << "Failed to re-sync on data stream.\n"; break; // failed to resync } } std::cerr << std::endl << "Number of records read = " << samIn.GetCurrentRecordCount() << std::endl; std::cerr << "Number of records written = " << samOut.GetCurrentRecordCount() << std::endl; if(refPtr != NULL) { delete(refPtr); } // Since the reads were successful, return the status based // on the status of the writes. If any failed, return // their failure status. return(returnStatus); }
int VcfConvert::execute(int argc, char **argv) { String refFile = ""; String inputVcf = ""; String outputVcf = ""; String refName = ""; bool uncompress = false; bool params = false; bool noeof = false; // Read in the parameters. ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inputVcf) LONG_STRINGPARAMETER("out", &outputVcf) LONG_PARAMETER_GROUP("Optional Parameters") LONG_PARAMETER("uncompress", &uncompress) LONG_STRINGPARAMETER("refName", &refName) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // Check that all files were specified. if(inputVcf == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--in\", a required parameter.\n\n"; return(-1); } if(outputVcf == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--out\", a required parameter.\n\n"; return(-1); } if(params) { inputParameters.Status(); } // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } VcfFileReader inFile; VcfFileWriter outFile; VcfHeader header; // Open the file. inFile.open(inputVcf, header); if(refName != "") { inFile.setReadSection(refName.c_str()); } if(uncompress) { outFile.open(outputVcf, header, InputFile::DEFAULT); } else { outFile.open(outputVcf, header); } VcfRecord record; int numRecords = 0; while(inFile.readRecord(record)) { ++numRecords; outFile.writeRecord(record); } inFile.close(); std::cerr << "NumRecords: " << numRecords << "\n"; return(0); }
int ClipOverlap::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String outFile = ""; String storeOrig = ""; bool readName = false; bool noRNValidate = false; bool stats = false; int poolSize = DEFAULT_POOL_SIZE; bool unmapped = false; bool noeof = false; bool params = false; String excludeFlags = "0xF0C"; // TODO, cleanup legacy parameters ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inFile) LONG_STRINGPARAMETER("out", &outFile) LONG_PARAMETER_GROUP("Optional Parameters") LONG_STRINGPARAMETER("storeOrig", &storeOrig) LONG_PARAMETER("readName", &readName) LONG_PARAMETER ("noRNValidate", &noRNValidate) LONG_PARAMETER ("stats", &stats) LONG_PARAMETER ("overlapsOnly", &myOverlapsOnly) LONG_STRINGPARAMETER ("excludeFlags", &excludeFlags) LONG_PARAMETER("unmapped", &unmapped) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PARAMETER_GROUP("Coordinate Processing Optional Parameters") LONG_INTPARAMETER("poolSize", &poolSize) LONG_PARAMETER("poolSkipOverlap", &myPoolSkipOverlap) LONG_PHONEHOME(VERSION) BEGIN_LEGACY_PARAMETERS() LONG_PARAMETER ("clipsOnly", &myOverlapsOnly) LONG_PARAMETER("poolSkipClip", &myPoolSkipOverlap) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { printUsage(std::cerr); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } // Check to see if the out file was specified, if not, report an error. if(outFile == "") { printUsage(std::cerr); inputParameters.Status(); // Out file was not specified but it is mandatory. std::cerr << "--out is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } if((storeOrig.Length() != 0) && (storeOrig.Length() != 2)) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "--storeOrig tag name must be 2 characters.\n"; return(-1); } myOverlapHandler = new OverlapClipLowerBaseQual(); if(myOverlapHandler == NULL) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "Failed to allocate the overlap handler\n"; return(-1); } if(unmapped) { myOverlapHandler->markAsUnmapped(); } // Setup the overlap handler. myOverlapHandler->keepStats(stats); if(storeOrig.Length() != 0) { myOverlapHandler->storeOrigCigar(storeOrig); } myIntExcludeFlags = excludeFlags.AsInteger(); if(params) { inputParameters.Status(); } // For each step process the file. // Open the files & read/write the sam header. SamStatus::Status runStatus = SamStatus::SUCCESS; for(int i = 1; i <= myOverlapHandler->numSteps(); i++) { // Open the file for reading. mySamHeader.resetHeader(); SamFile samIn(inFile, SamFile::READ, &mySamHeader); SamFile* samOutPtr = NULL; // Check if writing, if so, open the output file. if(i == myOverlapHandler->numSteps()) { samOutPtr = new SamFile(outFile, SamFile::WRITE, &mySamHeader); } if(readName) { if(!noRNValidate) { samIn.setSortedValidation(SamFile::QUERY_NAME); } runStatus = handleSortedByReadName(samIn, samOutPtr); } else { // Coordinate sorted, so work with the pools. samIn.setSortedValidation(SamFile::COORDINATE); myPool.setMaxAllocatedRecs(poolSize); // Reset the number of failures myNumMateFailures = 0; myNumPoolFail = 0; myNumPoolFailNoHandle = 0; myNumPoolFailHandled = 0; myNumOutOfOrder = 0; // Run by coordinate if(samOutPtr != NULL) { // Setup the output buffer for writing. SamCoordOutput outputBuffer(myPool); outputBuffer.setOutputFile(samOutPtr, &mySamHeader); runStatus = handleSortedByCoord(samIn, &outputBuffer); // Cleanup the output buffer. if(!outputBuffer.flushAll()) { std::cerr << "ERROR: Failed to flush the output buffer\n"; runStatus = SamStatus::FAIL_IO; } } else { runStatus = handleSortedByCoord(samIn, NULL); } } if(runStatus != SamStatus::SUCCESS) { break; } // Close the input file, it will be reopened if there are // multiple steps. samIn.Close(); if(samOutPtr != NULL) { samOutPtr->Close(); delete samOutPtr; samOutPtr = NULL; } } // Done processing. // Print Stats myOverlapHandler->printStats(); if(myNumMateFailures != 0) { std::cerr << "WARNING: did not find expected overlapping mates for " << myNumMateFailures << " records." << std::endl; } if(myNumPoolFail != 0) { // Had to skip clipping some records due to running out of // memory and not being able to wait for the mate. std::cerr << "WARNING: " << myNumPoolFail << " record pool failures\n"; if(myNumPoolFailNoHandle != 0) { std::cerr << "Due to hitting the max record poolSize, skipped handling " << myNumPoolFailNoHandle << " records." << std::endl; } if(myNumPoolFailHandled != 0) { std::cerr << "Due to hitting the max record poolSize, default handled " << myNumPoolFailHandled << " records." << std::endl; } if(myNumOutOfOrder != 0) { std::cerr << "WARNING: Resulting File out of Order by " << myNumOutOfOrder << " records.\n"; } } if(runStatus == SamStatus::SUCCESS) { if(myNumPoolFail == 0) { std::cerr << "Completed ClipOverlap Successfully.\n"; } else { runStatus = SamStatus::NO_MORE_RECS; std::cerr << "Completed ClipOverlap with WARNINGS.\n"; } } else { std::cerr << "Failed to complete ClipOverlap.\n"; } return(runStatus); }
int Bam2FastQ::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; bool readName = false; String refFile = ""; String firstOut = ""; String secondOut = ""; String unpairedOut = ""; bool interleave = false; bool noeof = false; bool gzip = false; bool params = false; myOutBase = ""; myNumMateFailures = 0; myNumPairs = 0; myNumUnpaired = 0; mySplitRG = false; myQField = ""; myNumQualTagErrors = 0; myReverseComp = true; myRNPlus = false; myFirstRNExt = DEFAULT_FIRST_EXT; mySecondRNExt = DEFAULT_SECOND_EXT; myCompression = InputFile::DEFAULT; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inFile) LONG_PARAMETER_GROUP("Optional Parameters") LONG_PARAMETER("readName", &readName) LONG_PARAMETER("splitRG", &mySplitRG) LONG_STRINGPARAMETER("qualField", &myQField) LONG_PARAMETER("merge", &interleave) LONG_STRINGPARAMETER("refFile", &refFile) LONG_STRINGPARAMETER("firstRNExt", &myFirstRNExt) LONG_STRINGPARAMETER("secondRNExt", &mySecondRNExt) LONG_PARAMETER("rnPlus", &myRNPlus) LONG_PARAMETER("noReverseComp", &myReverseComp) LONG_PARAMETER("gzip", &gzip) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PARAMETER_GROUP("Optional OutputFile Names") LONG_STRINGPARAMETER("outBase", &myOutBase) LONG_STRINGPARAMETER("firstOut", &firstOut) LONG_STRINGPARAMETER("secondOut", &secondOut) LONG_STRINGPARAMETER("unpairedOut", &unpairedOut) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } if(gzip) { myCompression = InputFile::GZIP; } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } // Cannot specify both interleaved & secondOut since secondOut would be N/A. if(interleave && !secondOut.IsEmpty()) { usage(); inputParameters.Status(); std::cerr << "ERROR: Cannot specify --merge & --secondOut.\n"; return(-1); } // Cannot specify both interleaved & secondOut since secondOut would be N/A. if(interleave && !secondOut.IsEmpty()) { usage(); inputParameters.Status(); std::cerr << "ERROR: Cannot specify --merge & --secondOut.\n"; return(-1); } // Cannot specify both splitRG & firstOut/secondOut/unpairedOut // since it needs a different file for each RG. if(mySplitRG && (!firstOut.IsEmpty() || !secondOut.IsEmpty() || !unpairedOut.IsEmpty())) { usage(); inputParameters.Status(); std::cerr << "ERROR: Cannot specify --splitRG & --firstOut/--secondOut/--unpairedOut.\n"; std::cerr << "Use --outBase instead.\n"; return(-1); } // Cannot specify splitRG & output to stdout. if(mySplitRG && (myOutBase[0] == '-')) { usage(); inputParameters.Status(); std::cerr << "ERROR: Cannot specify --splitRG & write to stdout.\n"; return(-1); } // Check to see if the out file was specified, if not, generate it from // the input filename. if(myOutBase == "") { // Just remove the extension from the input filename. int extStart = inFile.FastFindLastChar('.'); if(extStart <= 0) { myOutBase = inFile; } else { myOutBase = inFile.Left(extStart); } } if(mySplitRG) { std::string fqList = myOutBase.c_str(); fqList += ".list"; myFqList = ifopen(fqList.c_str(), "w"); ifprintf(myFqList, "MERGE_NAME\tFASTQ1\tFASTQ2\tRG\n"); } // Check to see if the first/second/single-ended were specified and // if not, set them. myFirstFileNameExt = "_1.fastq"; mySecondFileNameExt = "_2.fastq"; myUnpairedFileNameExt = ".fastq"; if(interleave) { myFirstFileNameExt = "_interleaved.fastq"; myFirstFileNameExt = "_interleaved.fastq"; } getFileName(firstOut, myFirstFileNameExt); getFileName(secondOut, mySecondFileNameExt); getFileName(unpairedOut, myUnpairedFileNameExt); if(params) { inputParameters.Status(); } // Open the files for reading/writing. // Open prior to opening the output files, // so if there is an error, the outputs don't get created. SamFile samIn; samIn.OpenForRead(inFile, &mySamHeader); // Skip non-primary reads. samIn.SetReadFlags(0, 0x0100); // Open the output files if not splitting RG if(!mySplitRG) { myUnpairedFile = ifopen(unpairedOut, "w", myCompression); // Only open the first file if it is different than an already opened file. if(firstOut != unpairedOut) { myFirstFile = ifopen(firstOut, "w", myCompression); } else { myFirstFile = myUnpairedFile; } // If it is interleaved or the 2nd file is not a new name, set it appropriately. if(interleave || secondOut == firstOut) { mySecondFile = myFirstFile; } else if(secondOut == unpairedOut) { mySecondFile = myUnpairedFile; } else { mySecondFile = ifopen(secondOut, "w", myCompression); } if(myUnpairedFile == NULL) { std::cerr << "Failed to open " << unpairedOut << " so can't convert bam2FastQ.\n"; return(-1); } if(myFirstFile == NULL) { std::cerr << "Failed to open " << firstOut << " so can't convert bam2FastQ.\n"; return(-1); } if(mySecondFile == NULL) { std::cerr << "Failed to open " << secondOut << " so can't convert bam2FastQ.\n"; return(-1); } } if((readName) || (strcmp(mySamHeader.getSortOrder(), "queryname") == 0)) { readName = true; } else { // defaulting to coordinate sorted. samIn.setSortedValidation(SamFile::COORDINATE); } // Setup the '=' translation if the reference was specified. if(!refFile.IsEmpty()) { GenomeSequence* refPtr = new GenomeSequence(refFile); samIn.SetReadSequenceTranslation(SamRecord::BASES); samIn.SetReference(refPtr); } SamRecord* recordPtr; int16_t samFlag; SamStatus::Status returnStatus = SamStatus::SUCCESS; while(returnStatus == SamStatus::SUCCESS) { recordPtr = myPool.getRecord(); if(recordPtr == NULL) { // Failed to allocate a new record. throw(std::runtime_error("Failed to allocate a new SAM/BAM record")); } if(!samIn.ReadRecord(mySamHeader, *recordPtr)) { // Failed to read a record. returnStatus = samIn.GetStatus(); continue; } // Have a record. Check to see if it is a pair or unpaired read. samFlag = recordPtr->getFlag(); if(SamFlag::isPaired(samFlag)) { if(readName) { handlePairedRN(*recordPtr); } else { handlePairedCoord(*recordPtr); } } else { ++myNumUnpaired; writeFastQ(*recordPtr, myUnpairedFile, myUnpairedFileNameExt); } } // Flush All cleanUpMateMap(0, true); if(returnStatus == SamStatus::NO_MORE_RECS) { returnStatus = SamStatus::SUCCESS; } samIn.Close(); closeFiles(); // Output the results std::cerr << "\nFound " << myNumPairs << " read pairs.\n"; std::cerr << "Found " << myNumUnpaired << " unpaired reads.\n"; if(myNumMateFailures != 0) { std::cerr << "Failed to find mates for " << myNumMateFailures << " reads, so they were written as unpaired\n" << " (not included in either of the above counts).\n"; } if(myNumQualTagErrors != 0) { std::cerr << myNumQualTagErrors << " records did not have tag " << myQField.c_str() << " or it was invalid, so the quality field was used for those records.\n"; } return(returnStatus); }
int VcfSplit::execute(int argc, char **argv) { String refFile = ""; String inputVcf = ""; String outputVcfBase = ""; String refName = ""; bool uncompress = false; bool params = false; bool noeof = false; // Read in the parameters. ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inputVcf) LONG_STRINGPARAMETER("obase", &outputVcfBase) LONG_PARAMETER_GROUP("Optional Parameters") LONG_PARAMETER("uncompress", &uncompress) LONG_STRINGPARAMETER("refName", &refName) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // Check that all files were specified. if(inputVcf == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--in\", a required parameter.\n\n"; return(-1); } if(outputVcfBase == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--obase\", a required parameter.\n\n"; return(-1); } outputVcfBase += "."; if(params) { inputParameters.Status(); } // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } VcfFileReader inFile; std::map<std::string, VcfFileWriter*> outFiles; VcfHeader header; // Open the file. inFile.open(inputVcf, header); if(refName != "") { inFile.setReadSection(refName.c_str()); } VcfRecord record; int numRecords = 0; std::string prevChr = ""; std::string chr = ""; VcfFileWriter* outFilePtr = 0; std::string outName = ""; while(inFile.readRecord(record)) { ++numRecords; chr = record.getChromStr(); if((outFilePtr == 0) || (chr != prevChr)) { outFilePtr = outFiles[chr]; if(outFilePtr == 0) { outFilePtr = new VcfFileWriter(); outFiles[chr] = outFilePtr; outName = outputVcfBase.c_str(); if(chr.substr(0,3) != "chr") { outName += "chr"; } outName += chr + ".vcf"; // chr not in outFile list. if(uncompress) { outFilePtr->open(outName.c_str(), header, InputFile::DEFAULT); } else { outName += ".gz"; outFilePtr->open(outName.c_str(), header); } } } outFilePtr->writeRecord(record); } inFile.close(); for (std::map<std::string,VcfFileWriter*>::iterator it = outFiles.begin(); it != outFiles.end(); ++it) { if(it->second != 0) { it->second->close(); it->second = 0; } } std::cerr << "NumRecords: " << numRecords << "\n"; return(0); }
int VcfConsensus::execute(int argc, char ** argv) { String vcfName1; String vcfName2; String vcfName3; String outputFileName; bool uncompress = false; bool params = false; // Read in the parameters. ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in1", &vcfName1) LONG_STRINGPARAMETER("in2", &vcfName2) LONG_STRINGPARAMETER("in3", &vcfName3) LONG_STRINGPARAMETER("out", &outputFileName) LONG_PARAMETER_GROUP("Optional Parameters") LONG_PARAMETER("uncompress", &uncompress) LONG_PARAMETER("params", ¶ms) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); std::string gtField = "GT"; VcfFileReader vcf1; VcfFileReader vcf2; VcfFileReader vcf3; VcfHeader header1; VcfHeader header2; VcfHeader header3; VcfRecord record1; VcfRecord record2; VcfRecord record3; VcfRecordGenotype* genotypeInfoPtr1 = NULL; VcfRecordGenotype* genotypeInfoPtr2 = NULL; VcfRecordGenotype* genotypeInfoPtr3 = NULL; unsigned int numMissing2 = 0; unsigned int numMissing3 = 0; unsigned int numMismatchRefAlt = 0; unsigned int numMissingGT1 = 0; const unsigned int myMaxErrors = 4; // Check that the required parameters were set. if(vcfName1 == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--in1\", a required parameter.\n\n"; return(-1); } if(vcfName2 == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--in2\", a required parameter.\n\n"; return(-1); } if(vcfName3 == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--in3\", a required parameter.\n\n"; return(-1); } if(outputFileName == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--out\", a required parameter.\n\n"; return(-1); } if(params) { inputParameters.Status(); } // Open the files. vcf1.open(vcfName1, header1); vcf2.open(vcfName2, header2); vcf3.open(vcfName3, header3); // Setup the sample name maps. int numSamples = header1.getNumSamples(); std::vector<int> sample2Indices; std::vector<int> sample3Indices; std::vector<int> removeIndices; int numSamplesSkipped1 = 0; int numSamplesSkipped2 = 0; int numSamplesSkipped3 = 0; for(int i = 0; i < numSamples; i++) { int sm2Index = header2.getSampleIndex(header1.getSampleName(i)); int sm3Index = header3.getSampleIndex(header1.getSampleName(i)); // Look for this sample name in vcf2. if((sm2Index != -1) && (sm3Index != -1)) { sample2Indices.push_back(sm2Index); sample3Indices.push_back(sm3Index); } else { // Sample not found in all three vcfs. removeIndices.push_back(i); ++numSamplesSkipped1; } } // Remove samples not found in all 3 vcfs from header1. // Remove them in reverse order so they are removed from the end of the header first. VcfSubsetSamples subset1; subset1.init(header1, true); for(int i = (removeIndices.size() - 1); i >= 0; i--) { subset1.addExcludeSample(header1.getSampleName(removeIndices[i])); header1.removeSample(removeIndices[i]); } // Set numSamples to the new number of samples in header1. numSamples = header1.getNumSamples(); // Calculate the number of samples skipped for files 2 & 3. numSamplesSkipped2 = header2.getNumSamples() - sample2Indices.size(); numSamplesSkipped3 = header3.getNumSamples() - sample3Indices.size(); if(numSamplesSkipped1 > 0) { std::cerr << "Skipping " << numSamplesSkipped1 << " samples from --in1\n"; } if(numSamplesSkipped2 > 0) { std::cerr << "Skipping " << numSamplesSkipped2 << " samples from --in2\n"; } if(numSamplesSkipped3 > 0) { std::cerr << "Skipping " << numSamplesSkipped3 << " samples from --in3\n"; } VcfFileWriter outputVcf; // Open and write the header if(uncompress) { outputVcf.open(outputFileName, header1, InputFile::DEFAULT); } else { outputVcf.open(outputFileName, header1); } const char* chrom1 = NULL; int pos1 = UNSET_POS; // Read the first record from vcf2 & vcf3. vcf2.readRecord(record2); vcf3.readRecord(record3); bool newChrom = true; static std::string prevChrom = ""; uint64_t numAllMatch = 0; uint64_t num1Match2Only = 0; uint64_t num1Match3Only = 0; uint64_t num2Match3Only = 0; uint64_t numNoMatches = 0; uint64_t numAllMatch00 = 0; uint64_t num1Match2Only00 = 0; uint64_t num1Match3Only00 = 0; uint64_t num2Match3Only00 = 0; uint64_t numAllMatch01 = 0; uint64_t num1Match2Only01 = 0; uint64_t num1Match3Only01 = 0; uint64_t num2Match3Only01 = 0; uint64_t numAllMatch11 = 0; uint64_t num1Match2Only11 = 0; uint64_t num1Match3Only11 = 0; uint64_t num2Match3Only11 = 0; // Loop through vcf1. while(vcf1.readRecord(record1, &subset1)) { chrom1 = record1.getChromStr(); pos1 = record1.get1BasedPosition(); if(strcmp(chrom1, prevChrom.c_str()) == 0) { newChrom = false; } else { prevChrom = chrom1; newChrom = true; } bool found = true; if(!findPos(newChrom, chrom1, pos1, record2, vcf2)) { // Failed to find the position, continue to the next position if(++numMissing2 <= myMaxErrors) { std::cerr << "Failed to find " << chrom1 << ":" << pos1 << " in " << vcfName2 << ", so skipping this pos\n"; } found = false; } if(!findPos(newChrom, chrom1, pos1, record3, vcf3)) { // Failed to find the position, continue to the next position if(++numMissing3 <= myMaxErrors) { std::cerr << "Failed to find " << chrom1 << ":" << pos1 << " in " << vcfName3 << ", so skipping this pos\n"; } found = false; } if(found == false) { continue; } // Found the position in all files. // Validate that the reference & alternate alleles are the same. const char* ref1 = record1.getRefStr(); const char* alt1 = record1.getAltStr(); if((strcmp(ref1, record2.getRefStr()) != 0) || (strcmp(ref1, record3.getRefStr()) != 0) || (strcmp(alt1, record2.getAltStr()) != 0) || (strcmp(alt1, record3.getAltStr()) != 0)) { if(++numMismatchRefAlt <= myMaxErrors) { std::cerr << "Mismatching ref/alt found at " << chrom1 << ":" << pos1 << ", so skipping this pos\n"; } continue; } // Get the genotype information for each. genotypeInfoPtr1 = &(record1.getGenotypeInfo()); genotypeInfoPtr2 = &(record2.getGenotypeInfo()); genotypeInfoPtr3 = &(record3.getGenotypeInfo()); // Loop through all the samples in vcf1. // Get the Genotype Information. for(int i = 0; i < numSamples; i++) { const std::string* genotypeVal1 = genotypeInfoPtr1->getString(gtField, i); const std::string* genotypeVal2 = genotypeInfoPtr2->getString(gtField, sample2Indices[i]); const std::string* genotypeVal3 = genotypeInfoPtr3->getString(gtField, sample3Indices[i]); // Need to make sure the field was found. if(genotypeVal1 == NULL) { // GT not found in the first record, so just continue. if(++numMissingGT1 <= myMaxErrors) { std::cerr << "Missing GT for " << header1.getSampleName(i) << " in " << vcfName1 << "\n"; } continue; } if(isSame(genotypeVal1, genotypeVal2)) { // genotypeVal1 is majority, so make no change. if(isSame(genotypeVal1, genotypeVal3)) { ++numAllMatch; if(*genotypeVal1 == "0/0") { ++numAllMatch00; } else if((*genotypeVal1 == "0/1") || (*genotypeVal1 == "1/0")) { ++numAllMatch01; } if(*genotypeVal1 == "1/1") { ++numAllMatch11; } } else { ++num1Match2Only; if(*genotypeVal1 == "0/0") { ++num1Match2Only00; } else if((*genotypeVal1 == "0/1") || (*genotypeVal1 == "1/0")) { ++num1Match2Only01; } if(*genotypeVal1 == "1/1") { ++num1Match2Only11; } } } else if(isSame(genotypeVal1, genotypeVal3)) { // genotypeVal1 is majority, so make no change. ++num1Match3Only; if(*genotypeVal1 == "0/0") { ++num1Match3Only00; } else if((*genotypeVal1 == "0/1") || (*genotypeVal1 == "1/0")) { ++num1Match3Only01; } if(*genotypeVal1 == "1/1") { ++num1Match3Only11; } } else if(isSame(genotypeVal2, genotypeVal3)) { // genotypeVal2 is majority, so change genotypeVal1. genotypeInfoPtr1->setString(gtField, i, *genotypeVal2); ++num2Match3Only; if(*genotypeVal2 == "0/0") { ++num2Match3Only00; } else if((*genotypeVal2 == "0/1") || (*genotypeVal2 == "1/0")) { ++num2Match3Only01; } if(*genotypeVal2 == "1/1") { ++num2Match3Only11; } } else { // None are the same so set to "./." genotypeInfoPtr1->setString(gtField, i, "./."); ++numNoMatches; } } // loop back to vcf1 samples. // Write this record. outputVcf.writeRecord(record1); } // loop back to next vcf1 record. std::cerr << "\n"; if(numMissing2 > myMaxErrors) { std::cerr << "Suppressed " << numMissing2 - myMaxErrors << " errors about skipped positions because they were not in " << vcfName2 << "\n"; } if(numMissing3 > myMaxErrors) { std::cerr << "Suppressed " << numMissing3 - myMaxErrors << " errors about skipped positions because they were not in " << vcfName3 << "\n"; } if(numMismatchRefAlt > myMaxErrors) { std::cerr << "Suppressed " << numMismatchRefAlt - myMaxErrors << " errors about mismatched ref/alt positions\n"; } if(numMissingGT1 > myMaxErrors) { std::cerr << "Suppressed " << numMissingGT1 - myMaxErrors << " errors about missing GT for " << vcfName1 << "\n"; } std::cerr << "\n"; // Output the stats. std::cerr << "File1 = " << vcfName1 << std::endl; std::cerr << "File2 = " << vcfName2 << std::endl; std::cerr << "File3 = " << vcfName3 << std::endl; std::cerr << "\nType\tTotal\t0/0\t0/1|1/0\t1/1\n"; std::cerr << "AllMatched" << "\t" << numAllMatch << "\t" << numAllMatch00 << "\t" << numAllMatch01 << "\t" << numAllMatch11 << std::endl; std::cerr << "1matched2" << "\t" << num1Match2Only << "\t" << num1Match2Only00 << "\t" << num1Match2Only01 << "\t" << num1Match2Only11 << std::endl; std::cerr << "1matched3" << "\t" << num1Match3Only << "\t" << num1Match3Only00 << "\t" << num1Match3Only01 << "\t" << num1Match3Only11 << std::endl; std::cerr << "2matched3" << "\t" << num2Match3Only << "\t" << num2Match3Only00 << "\t" << num2Match3Only01 << "\t" << num2Match3Only11 << std::endl; std::cerr << "NoneMatched\t" << numNoMatches << std::endl; return(0); }
// main function of verifyBamID int execute(int argc, char** argv) { printf("verifyBamID %s -- verify identity and purity of sequence data\n" "(c) 2010-2014 Hyun Min Kang, Goo Jun, and Goncalo Abecasis\n\n", VERSION); VerifyBamIDArgs args; ParameterList pl; BEGIN_LONG_PARAMETERS(longParameters) LONG_PARAMETER_GROUP("Input Files") LONG_STRINGPARAMETER("vcf",&args.sVcfFile) LONG_STRINGPARAMETER("bam",&args.sBamFile) LONG_STRINGPARAMETER("subset",&args.sSubsetInds) LONG_STRINGPARAMETER("smID",&args.sSMID) LONG_PARAMETER_GROUP("VCF analysis options") LONG_DOUBLEPARAMETER("genoError",&args.genoError) LONG_DOUBLEPARAMETER("minAF",&args.minAF) LONG_DOUBLEPARAMETER("minCallRate",&args.minCallRate) LONG_PARAMETER_GROUP("Individuals to compare with chip data") EXCLUSIVE_PARAMETER("site",&args.bSiteOnly) EXCLUSIVE_PARAMETER("self",&args.bSelfOnly) EXCLUSIVE_PARAMETER("best",&args.bFindBest) LONG_PARAMETER_GROUP("Chip-free optimization options") EXCLUSIVE_PARAMETER("free-none",&args.bFreeNone) EXCLUSIVE_PARAMETER("free-mix",&args.bFreeMixOnly) EXCLUSIVE_PARAMETER("free-refBias",&args.bFreeRefBiasOnly) EXCLUSIVE_PARAMETER("free-full",&args.bFreeFull) LONG_PARAMETER_GROUP("With-chip optimization options") EXCLUSIVE_PARAMETER("chip-none",&args.bChipNone) EXCLUSIVE_PARAMETER("chip-mix",&args.bChipMixOnly) EXCLUSIVE_PARAMETER("chip-refBias",&args.bChipRefBiasOnly) EXCLUSIVE_PARAMETER("chip-full",&args.bChipFull) LONG_PARAMETER_GROUP("BAM analysis options") LONG_PARAMETER("ignoreRG",&args.bIgnoreRG) LONG_PARAMETER("ignoreOverlapPair",&args.bIgnoreOverlapPair) LONG_PARAMETER("noEOF",&args.bNoEOF) LONG_PARAMETER("precise",&args.bPrecise) LONG_INTPARAMETER("minMapQ",&args.minMapQ) LONG_INTPARAMETER("maxDepth",&args.maxDepth) LONG_INTPARAMETER("minQ",&args.minQ) LONG_INTPARAMETER("maxQ",&args.maxQ) LONG_DOUBLEPARAMETER("grid",&args.grid) LONG_PARAMETER_GROUP("Modeling Reference Bias") LONG_DOUBLEPARAMETER("refRef",&args.pRefRef) LONG_DOUBLEPARAMETER("refHet",&args.pRefHet) LONG_DOUBLEPARAMETER("refAlt",&args.pRefAlt) LONG_PARAMETER_GROUP("Output options") LONG_STRINGPARAMETER("out",&args.sOutFile) LONG_PARAMETER("verbose",&args.bVerbose) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); pl.Add(new LongParameters("Available Options",longParameters)); pl.Read(argc, argv); pl.Status(); // check the validity of input files if ( args.sVcfFile.IsEmpty() ) { error("--vcf [vcf file] required"); } if ( args.sBamFile.IsEmpty() ) { error("--bam [bam file] is required"); } if ( args.sOutFile.IsEmpty() ) { error("--out [output prefix] is required"); } Logger::gLogger = new Logger((args.sOutFile + ".log").c_str(), args.bVerbose); if ( ! ( args.bSiteOnly || args.bSelfOnly || args.bFindBest ) ) { warning("--self option was autotomatically turned on by default. Specify --best option if you wanted to check across all possible samples in the VCF"); args.bSelfOnly = true; } if ( ( args.maxDepth > 20 ) && ( !args.bPrecise ) ) { warning("--precise option is not turned on at --maxDepth %d : may be prone to precision errors",args.maxDepth); } if ( ( args.bChipRefBiasOnly ) && ( !args.bSelfOnly ) ) { error("--self must be set for --chip-refBias to work. Skipping.."); } // check timestamp time_t t; time(&t); Logger::gLogger->writeLog("Analysis started on %s",ctime(&t)); // load arguments VerifyBamID vbid(&args); // load input VCF and BAM files Logger::gLogger->writeLog("Opening Input Files"); vbid.loadFiles(args.sBamFile.c_str(), args.sVcfFile.c_str()); // Check which genotype-free method is used if ( args.bFreeNone ) { // if no genotype-free mode is tested. skip it // do nothing for genotype-free estimation Logger::gLogger->writeLog("Skipping chip-free estimation of sample mixture"); } else if ( args.bFreeMixOnly ) { // only mixture is estimated. // genotype-free method Logger::gLogger->writeLog("Performing chip-free estimation of sample mixture at fixed reference bias parameters (%lf, %lf, %lf)",args.pRefRef,args.pRefHet,args.pRefAlt); // scan across multiple readgroups for(int rg=-1; rg < vbid.nRGs - (int)args.bIgnoreRG; ++rg) { VerifyBamID::mixLLK mix(&vbid); mix.OptimizeLLK(rg); Logger::gLogger->writeLog("Optimal per-sample fMix = %lf, LLK0 = %lf, LLK1 = %lf\n",mix.fMix,mix.llk0,mix.llk1); vbid.mixOut.llk0s[rg+1] = mix.llk0; vbid.mixOut.llk1s[rg+1] = mix.llk1; vbid.mixOut.fMixs[rg+1] = mix.fMix; } //vbid.mixRefHet = 0.5; //vbid.mixRefAlt = 0.00; } else if ( args.bFreeRefBiasOnly ) { Logger::gLogger->writeLog("Performing chip-free estimation of reference-bias without sample mixture"); for(int rg=-1; rg < vbid.nRGs - (int)args.bIgnoreRG; ++rg) { VerifyBamID::refBiasMixLLKFunc myFunc(&vbid, rg); AmoebaMinimizer myMinimizer; Vector startingPoint(2); startingPoint[0] = 0; // pRefHet = 0.5 startingPoint[1] = -4.595; // pRefAlt = 0.01 myMinimizer.func = &myFunc; myMinimizer.Reset(2); myMinimizer.point = startingPoint; myMinimizer.Minimize(1e-6); double pRefHet = VerifyBamID::invLogit(myMinimizer.point[0]); double pRefAlt = VerifyBamID::invLogit(myMinimizer.point[1]); Logger::gLogger->writeLog("Reference Bias Estimated as ( Pr[refBase|HET] = %lf, Pr[refBase|ALT] = %lf) with LLK = %lf at readGroup %d",pRefHet,pRefAlt,myMinimizer.fmin,rg); //vbid.setRefBiasParams(1.0, pRefHet, pRefAlt); vbid.mixOut.llk0s[rg+1] = myFunc.llk0; vbid.mixOut.llk1s[rg+1] = myFunc.llk1; vbid.mixOut.refHets[rg+1] = myFunc.pRefHet; vbid.mixOut.refAlts[rg+1] = myFunc.pRefAlt; } } else if ( args.bFreeFull ) { Logger::gLogger->writeLog("Performing chip-free estimation of reference-bias and sample mixture together"); for(int rg = -1; rg < vbid.nRGs - args.bIgnoreRG; ++rg) { VerifyBamID::fullMixLLKFunc myFunc(&vbid, rg); AmoebaMinimizer myMinimizer; Vector startingPoint(3); startingPoint[0] = -3.91; // start with fMix = 0.01 startingPoint[1] = 0; // pRefHet = 0.5 startingPoint[2] = -4.595; // pRefAlt = 0.01 myMinimizer.func = &myFunc; myMinimizer.Reset(3); myMinimizer.point = startingPoint; myMinimizer.Minimize(1e-6); double fMix = VerifyBamID::invLogit(myMinimizer.point[0]); if ( fMix > 0.5 ) fMix = 1.-fMix; double pRefHet = VerifyBamID::invLogit(myMinimizer.point[1]); double pRefAlt = VerifyBamID::invLogit(myMinimizer.point[2]); Logger::gLogger->writeLog("Optimal per-sample fMix = %lf\n",fMix); Logger::gLogger->writeLog("Reference Bias Estimated as ( Pr[refBase|HET] = %lf, Pr[refBase|ALT] = %lf) with LLK = %lf",pRefHet,pRefAlt,myMinimizer.fmin); //vbid.setRefBiasParams(1.0, pRefHet, pRefAlt); vbid.mixOut.llk0s[rg+1] = myFunc.llk0; vbid.mixOut.llk1s[rg+1] = myFunc.llk1; vbid.mixOut.fMixs[rg+1] = myFunc.fMix; vbid.mixOut.refHets[rg+1] = myFunc.pRefHet; vbid.mixOut.refAlts[rg+1] = myFunc.pRefAlt; } } Logger::gLogger->writeLog("calculating depth distribution"); vbid.calculateDepthDistribution(args.maxDepth, vbid.mixOut); Logger::gLogger->writeLog("finished calculating depth distribution"); std::vector<int> bestInds(vbid.nRGs+1,-1); std::vector<int> selfInds(vbid.nRGs+1,-1); if ( args.bChipNone ) { // do nothing Logger::gLogger->writeLog("Skipping with-chip estimation of sample mixture"); } else if ( args.bChipMixOnly ) { Logger::gLogger->writeLog("Performing with-chip estimation of sample mixture at fixed reference bias parameter (%lf, %lf, %lf)",args.pRefRef,args.pRefHet,args.pRefAlt); for(int rg=-1; rg < (vbid.nRGs - (int)args.bIgnoreRG); ++rg) { double maxIBD = -1; VerifyBamID::ibdLLK ibd(&vbid); for(int i=0; i < (int)vbid.pGenotypes->indids.size(); ++i) { double fIBD = ibd.OptimizeLLK(i, rg); Logger::gLogger->writeLog("Comparing with individual %s.. Optimal fIBD = %lf, LLK0 = %lf, LLK1 = %lf for readgroup %d",vbid.pGenotypes->indids[i].c_str(),fIBD, ibd.llk0, ibd.llk1, rg); if ( maxIBD < fIBD ) { bestInds[rg+1] = i; vbid.bestOut.llk0s[rg+1] = ibd.llk0; vbid.bestOut.llk1s[rg+1] = ibd.llk1; vbid.bestOut.fMixs[rg+1] = 1-ibd.fIBD; maxIBD = ibd.fIBD; } if ( ( (rg < 0) && (vbid.pPile->sBamSMID == vbid.pGenotypes->indids[i] ) ) || ( ( rg >= 0 ) && ( vbid.pPile->vsSMIDs[rg] == vbid.pGenotypes->indids[i]) ) ) { selfInds[rg+1] = i; vbid.selfOut.llk0s[rg+1] = ibd.llk0; vbid.selfOut.llk1s[rg+1] = ibd.llk1; vbid.selfOut.fMixs[rg+1] = 1-ibd.fIBD; } } if ( bestInds[rg+1] >= 0 ) { Logger::gLogger->writeLog("Best Matching Individual is %s with IBD = %lf",vbid.pGenotypes->indids[bestInds[rg+1]].c_str(),maxIBD); vbid.calculateDepthByGenotype(bestInds[rg+1],rg,vbid.bestOut); } if ( selfInds[rg+1] >= 0 ) { Logger::gLogger->writeLog("Self Individual is %s with IBD = %lf",vbid.pGenotypes->indids[selfInds[rg+1]].c_str(),vbid.selfOut.fMixs[rg+1]); vbid.calculateDepthByGenotype(selfInds[rg+1],rg,vbid.selfOut); } } } else if ( args.bChipRefBiasOnly ) { Logger::gLogger->writeLog("Performing with-chip estimation of reference-bias without sample mixture"); if ( args.bSelfOnly ) { for(int rg=-1; rg < (vbid.nRGs - (int)args.bIgnoreRG); ++rg) { VerifyBamID::refBiasIbdLLKFunc myFunc(&vbid, rg); AmoebaMinimizer myMinimizer; Vector startingPoint(2); startingPoint[0] = 0; // pRefHet = 0.5 startingPoint[1] = -4.595; // pRefAlt = 0.01 myMinimizer.func = &myFunc; myMinimizer.Reset(2); myMinimizer.point = startingPoint; myMinimizer.Minimize(1e-6); double pRefHet = VerifyBamID::invLogit(myMinimizer.point[0]); double pRefAlt = VerifyBamID::invLogit(myMinimizer.point[1]); Logger::gLogger->writeLog("Reference Bias Estimated as ( Pr[refBase|HET] = %lf, Pr[refBase|ALT] = %lf) with LLK = %lf",pRefHet,pRefAlt,myMinimizer.fmin); //vbid.setRefBiasParams(1.0, pRefHet, pRefAlt); vbid.selfOut.llk0s[rg+1] = myFunc.llk0; vbid.selfOut.llk1s[rg+1] = myFunc.llk1; vbid.selfOut.refHets[rg+1] = myFunc.pRefHet; vbid.selfOut.refAlts[rg+1] = myFunc.pRefAlt; vbid.calculateDepthByGenotype(0,rg,vbid.selfOut); } } else { Logger::gLogger->warning("--self must be set for --chip-refBias to work. Skipping.."); } } else if ( args.bChipFull ) { Logger::gLogger->writeLog("Performing with-chip estimation of reference-bias and sample mixture together"); for(int rg=-1; rg < (vbid.nRGs - (int)args.bIgnoreRG); ++rg) { double maxIBD = -1; for(int i=0; i < (int)vbid.pGenotypes->indids.size(); ++i) { VerifyBamID::fullIbdLLKFunc myFunc(&vbid,i,rg); AmoebaMinimizer myMinimizer; Vector startingPoint(3); startingPoint[0] = 3.91; // start with fIBD = 0.99 startingPoint[1] = 0; // pRefHet = 0.5 startingPoint[2] = -4.595; // pRefAlt = 0.01 myMinimizer.func = &myFunc; myFunc.indIdx = i; myMinimizer.Reset(3); myMinimizer.point = startingPoint; myMinimizer.Minimize(1e-6); double fIBD = VerifyBamID::invLogit(myMinimizer.point[0]); double pRefHet = VerifyBamID::invLogit(myMinimizer.point[1]); double pRefAlt = VerifyBamID::invLogit(myMinimizer.point[2]); Logger::gLogger->writeLog("Comparing with individual %s.. Optimal fIBD = %lf, LLK0 = %lf, LLK1 = %lf for readgroup %d",vbid.pGenotypes->indids[i].c_str(), fIBD, myFunc.llk0, myFunc.llk1, rg); //Logger::gLogger->writeLog("Optimal per-sample fIBD = %lf, ",fIBD); Logger::gLogger->writeLog("Reference Bias Estimated as ( Pr[refBase|HET] = %lf, Pr[refBase|ALT] = %lf ) with LLK = %lf",pRefHet,pRefAlt,myMinimizer.fmin); if ( maxIBD < fIBD ) { bestInds[rg+1] = i; maxIBD = fIBD; vbid.bestOut.llk0s[rg+1] = myFunc.llk0; vbid.bestOut.llk1s[rg+1] = myFunc.llk1; vbid.bestOut.fMixs[rg+1] = 1.-myFunc.fIBD; vbid.bestOut.refHets[rg+1] = myFunc.pRefHet; vbid.bestOut.refAlts[rg+1] = myFunc.pRefAlt; } if ( ( (rg < 0) && (vbid.pPile->sBamSMID == vbid.pGenotypes->indids[i] ) ) || ( ( rg >= 0 ) && ( vbid.pPile->vsSMIDs[rg] == vbid.pGenotypes->indids[i]) ) ) { selfInds[rg+1] = i; vbid.selfOut.llk0s[rg+1] = myFunc.llk0; vbid.selfOut.llk1s[rg+1] = myFunc.llk1; vbid.selfOut.fMixs[rg+1] = 1.-myFunc.fIBD; vbid.selfOut.refHets[rg+1] = myFunc.pRefHet; vbid.selfOut.refAlts[rg+1] = myFunc.pRefAlt; vbid.calculateDepthByGenotype(i, rg, vbid.selfOut); } } //vbid.setRefBiasParams(1.0, pRefHet, pRefAlt); if ( bestInds[rg+1] >= 0 ) { Logger::gLogger->writeLog("Best Matching Individual is %s with IBD = %lf",vbid.pGenotypes->indids[bestInds[rg+1]].c_str(),maxIBD); vbid.calculateDepthByGenotype(bestInds[rg+1], rg, vbid.bestOut); } if ( selfInds[rg+1] >= 0 ) { Logger::gLogger->writeLog("Self Individual is %s with IBD = %lf",vbid.pGenotypes->indids[selfInds[rg+1]].c_str(),vbid.selfOut.fMixs[rg+1]); vbid.calculateDepthByGenotype(selfInds[rg+1],rg,vbid.selfOut); } } } // PRINT OUTPUT FILE - ".selfSM" // [SEQ_ID] : SAMPLE ID in the sequence file // [CHIP_ID] : SAMPLE ID in the chip file (NA if not available) // [#SNPS] : Number of markers evaluated // [#READS] : Number of reads evaluated // [AVG_DP] : Mean depth // [FREEMIX] : Chip-free estimated alpha (% MIX in 0-1 scale), NA if unavailable // [FREELK1] : Chip-free log-likelihood at estimated alpha // [FREELK0] : Chip-free log-likelihood at 0% contamination // [CHIPIBD] : With-chip estimated alpha (% MIX in 0-1 scale) // [CHIPLK1] : With-chip log-likelihood at estimated alpha // [CHIPLK0] : With-chip log-likelihood at 0% contamination // [DPREF] : Depth at reference site in the chip // [RDPHET] : Relative depth at HET site in the chip // [RDPALT] : Relative depth at HOMALT site in the chip // [FREE_RF] : Pr(Ref|Ref) site estimated without chip data // [FREE_RH] : Pr(Ref|Het) site estimated without chip data // [FREE_RA] : Pr(Ref|Alt) site estimated without chip data // [CHIP_RF] : Pr(Ref|Ref) site estimated with chip data // [CHIP_RH] : Pr(Ref|Het) site estimated with chip data // [CHIP_RA] : Pr(Ref|Alt) site estimated with chip data // [DPREF] : Depth at reference alleles // [RDPHET] : Relative depth at heterozygous alleles // [RDPALT] : Relative depth at hom-alt alleles String selfSMFN = args.sOutFile + ".selfSM"; String bestSMFN = args.sOutFile + ".bestSM"; String selfRGFN = args.sOutFile + ".selfRG"; String bestRGFN = args.sOutFile + ".bestRG"; String dpSMFN = args.sOutFile + ".depthSM"; String dpRGFN = args.sOutFile + ".depthRG"; IFILE selfSMF = ifopen(selfSMFN,"wb"); IFILE bestSMF = (args.bFindBest ? ifopen(bestSMFN,"wb") : NULL); IFILE selfRGF = (args.bIgnoreRG ? NULL : ifopen(selfRGFN,"wb")); IFILE bestRGF = (args.bFindBest && !args.bIgnoreRG) ? ifopen(bestRGFN,"wb") : NULL; IFILE dpSMF = ifopen(dpSMFN,"wb"); IFILE dpRGF = (args.bIgnoreRG ? NULL : ifopen(dpRGFN,"wb")); if ( selfSMF == NULL ) { Logger::gLogger->error("Cannot write to %s",selfSMF); } if ( args.bFindBest && ( bestSMF == NULL ) ) { Logger::gLogger->error("Cannot write to %s",bestSMF); } if ( dpSMF == NULL ) { Logger::gLogger->error("Cannot write to %s",dpSMF); } ifprintf(dpSMF,"#RG\tDEPTH\t#SNPs\t%%SNPs\t%%CUMUL\n"); int nCumMarkers = 0; for(int i=args.maxDepth; i >= 0; --i) { nCumMarkers += vbid.mixOut.depths[i]; ifprintf(dpSMF,"ALL\t%d\t%d\t%.5lf\t%.5lf\n",i, vbid.mixOut.depths[i],(double) vbid.mixOut.depths[i]/(double)vbid.nMarkers,(double)nCumMarkers/(double)vbid.nMarkers); } ifclose(dpSMF); if ( dpRGF != NULL ) { ifprintf(dpRGF,"#RG\tDEPTH\t#SNPs\t%%SNPs\t%%CUMUL\n"); for(int rg=0; rg < (vbid.nRGs - (int)args.bIgnoreRG); ++rg) { const char* rgID = vbid.pPile->vsRGIDs[rg].c_str(); int nMarkers = 0; for(int i=args.maxDepth; i >= 0; --i) { nMarkers += vbid.mixOut.depths[(rg+1)*(args.maxDepth+1) + i]; } nCumMarkers = 0; for(int i=args.maxDepth; i >= 0; --i) { int d = vbid.mixOut.depths[(rg+1)*(args.maxDepth+1) + i]; nCumMarkers += d; ifprintf(dpRGF,"%s\t%d\t%d\t%.5lf\t%.5lf\n",rgID,i,d,(double)d/(double)vbid.nMarkers,(double)nCumMarkers/(double)nMarkers); } } ifclose(dpRGF); } const char* headers[] = {"#SEQ_ID","RG","CHIP_ID","#SNPS","#READS","AVG_DP","FREEMIX","FREELK1","FREELK0","FREE_RH","FREE_RA","CHIPMIX","CHIPLK1","CHIPLK0","CHIP_RH","CHIP_RA","DPREF","RDPHET","RDPALT"}; int nheaders = sizeof(headers)/sizeof(headers[0]); for(int i=0; i < nheaders; ++i) { ifprintf(selfSMF,"%s%s",i>0 ? "\t" : "",headers[i]); } ifprintf(selfSMF,"\n"); ifprintf(selfSMF,"%s\tALL",vbid.pPile->sBamSMID.c_str()); ifprintf(selfSMF,"\t%s",selfInds[0] >= 0 ? vbid.pGenotypes->indids[selfInds[0]].c_str() : "NA"); ifprintf(selfSMF,"\t%d\t%d\t%.2lf",vbid.nMarkers,vbid.mixOut.numReads[0],(double)vbid.mixOut.numReads[0]/(double)vbid.nMarkers); if ( args.bFreeNone ) { ifprintf(selfSMF,"\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bFreeMixOnly ) { ifprintf(selfSMF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA",vbid.mixOut.fMixs[0],vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0]); } else if ( args.bFreeRefBiasOnly ) { ifprintf(selfSMF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0],vbid.mixOut.refHets[0],vbid.mixOut.refAlts[0]); } else if ( args.bFreeFull ) { ifprintf(selfSMF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.fMixs[0],vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0],vbid.mixOut.refHets[0],vbid.mixOut.refAlts[0]); } else { error("Invalid option in handling bFree"); } if ( args.bChipNone || bestInds[0] < 0 ) { ifprintf(selfSMF,"\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bChipMixOnly ) { ifprintf(selfSMF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA\t%.3lf\t%.4lf\t%.4lf",vbid.selfOut.fMixs[0],vbid.selfOut.llk1s[0],vbid.selfOut.llk0s[0],(double)vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[1], (double)vbid.selfOut.numReads[2]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[2], (double)vbid.selfOut.numReads[3]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[3]); } else if ( args.bChipMixOnly ) { ifprintf(selfSMF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf",vbid.selfOut.llk1s[0], vbid.selfOut.llk0s[0], vbid.selfOut.refHets[0], vbid.selfOut.refAlts[0], (double)vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[1], (double)vbid.selfOut.numReads[2]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[2], (double)vbid.selfOut.numReads[3]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[3]); } else if ( args.bChipFull ) { ifprintf(selfSMF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf", vbid.selfOut.fMixs[0], vbid.selfOut.llk1s[0], vbid.selfOut.llk0s[0], vbid.selfOut.refHets[0], vbid.selfOut.refAlts[0], (double)vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[1], (double)vbid.selfOut.numReads[2]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[2], (double)vbid.selfOut.numReads[3]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[3]); } else { error("Invalid option in handling bChip"); } ifprintf(selfSMF,"\n"); ifclose(selfSMF); if ( bestSMF != NULL ) { for(int i=0; i < nheaders; ++i) { ifprintf(bestSMF,"%s%s",i>0 ? "\t" : "",headers[i]); } ifprintf(bestSMF,"\n"); ifprintf(bestSMF,"%s\tALL",vbid.pPile->sBamSMID.c_str()); ifprintf(bestSMF,"\t%s",bestInds[0] >= 0 ? vbid.pGenotypes->indids[bestInds[0]].c_str() : "NA"); ifprintf(bestSMF,"\t%d\t%d\t%.2lf",vbid.nMarkers,vbid.mixOut.numReads[0],(double)vbid.mixOut.numReads[0]/(double)vbid.nMarkers); if ( args.bFreeNone ) { ifprintf(bestSMF,"\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bFreeMixOnly ) { ifprintf(bestSMF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA",vbid.mixOut.fMixs[0],vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0]); } else if ( args.bFreeRefBiasOnly ) { ifprintf(bestSMF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0],vbid.mixOut.refHets[0],vbid.mixOut.refAlts[0]); } else if ( args.bFreeFull ) { ifprintf(bestSMF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.fMixs[0],vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0],vbid.mixOut.refHets[0],vbid.mixOut.refAlts[0]); } else { error("Invalid option in handling bFree"); } if ( args.bChipNone || bestInds[0] < 0 ) { ifprintf(bestSMF,"\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bChipMixOnly ) { ifprintf(bestSMF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA\t%.3lf\t%.4lf\t%.4lf",vbid.bestOut.fMixs[0],vbid.bestOut.llk1s[0],vbid.bestOut.llk0s[0],(double)vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[1], (double)vbid.bestOut.numReads[2]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[2], (double)vbid.bestOut.numReads[3]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[3]); } else if ( args.bChipMixOnly ) { ifprintf(bestSMF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf",vbid.bestOut.llk1s[0], vbid.bestOut.llk0s[0], vbid.bestOut.refHets[0], vbid.bestOut.refAlts[0], (double)vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[1], (double)vbid.bestOut.numReads[2]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[2], (double)vbid.bestOut.numReads[3]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[3]); } else if ( args.bChipFull ) { ifprintf(bestSMF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf", vbid.bestOut.fMixs[0], vbid.bestOut.llk1s[0], vbid.bestOut.llk0s[0], vbid.bestOut.refHets[0], vbid.bestOut.refAlts[0], (double)vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[1], (double)vbid.bestOut.numReads[2]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[2], (double)vbid.bestOut.numReads[3]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[3]); } else { error("Invalid option in handling bChip"); } ifprintf(bestSMF,"\n"); ifclose(bestSMF); } if ( selfRGF != NULL ) { for(int i=0; i < nheaders; ++i) { ifprintf(selfRGF,"%s%s",i>0 ? "\t" : "",headers[i]); } ifprintf(selfRGF,"\n"); for(int rg=0; rg < vbid.nRGs; ++rg) { ifprintf(selfRGF,"%s\t%s",vbid.pPile->sBamSMID.c_str(),vbid.pPile->vsRGIDs[rg].c_str()); ifprintf(selfRGF,"\t%s",bestInds[rg] >= 0 ? vbid.pGenotypes->indids[bestInds[rg]].c_str() : "NA"); ifprintf(selfRGF,"\t%d\t%d\t%.2lf",vbid.nMarkers,vbid.mixOut.numReads[(rg+1)*4],(double)vbid.mixOut.numReads[(rg+1)*4]/(double)vbid.mixOut.numGenos[(rg+1)*4]); if ( args.bFreeNone ) { ifprintf(selfRGF,"\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bFreeMixOnly ) { ifprintf(selfRGF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA",vbid.mixOut.fMixs[rg+1],vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1]); } else if ( args.bFreeRefBiasOnly ) { ifprintf(selfRGF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1],vbid.mixOut.refHets[rg+1],vbid.mixOut.refAlts[rg+1]); } else if ( args.bFreeFull ) { ifprintf(selfRGF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.fMixs[rg+1],vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1],vbid.mixOut.refHets[rg+1],vbid.mixOut.refAlts[rg+1]); } else { error("Invalid option in handling bFree"); } if ( args.bChipNone || bestInds[0] < 0 ) { ifprintf(selfRGF,"\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bChipMixOnly ) { ifprintf(selfRGF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA\t%.3lf\t%.4lf\t%.4lf",vbid.selfOut.fMixs[rg+1], vbid.selfOut.llk1s[rg+1], vbid.selfOut.llk0s[rg+1], (double)vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+1], (double)vbid.selfOut.numReads[(rg+1)*4+2]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+2], (double)vbid.selfOut.numReads[(rg+1)*4+3]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+3]); } else if ( args.bChipMixOnly ) { ifprintf(selfRGF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf",vbid.selfOut.llk1s[rg+1], vbid.selfOut.llk0s[rg+1], vbid.selfOut.refHets[rg+1], vbid.selfOut.refAlts[rg+1], (double)vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+1], (double)vbid.selfOut.numReads[(rg+1)*4+2]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4]/vbid.selfOut.numGenos[(rg+1)*4+2], (double)vbid.selfOut.numReads[(rg+1)*4+3]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+3]); } else if ( args.bChipFull ) { ifprintf(selfRGF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf", vbid.selfOut.fMixs[rg+1], vbid.selfOut.llk1s[rg+1], vbid.selfOut.llk0s[rg+1], vbid.selfOut.refHets[rg+1], vbid.selfOut.refAlts[rg+1], (double)vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+1], (double)vbid.selfOut.numReads[(rg+1)*4+2]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+2], (double)vbid.selfOut.numReads[(rg+1)*4+3]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+3]); } else { error("Invalid option in handling bChip"); } ifprintf(selfRGF,"\n"); } ifclose(selfRGF); } if ( bestRGF != NULL ) { for(int i=0; i < nheaders; ++i) { ifprintf(bestRGF,"%s%s",i>0 ? "\t" : "",headers[i]); } ifprintf(bestRGF,"\n"); for(int rg=0; rg < vbid.nRGs; ++rg) { ifprintf(bestRGF,"%s\t%s",vbid.pPile->sBamSMID.c_str(),vbid.pPile->vsRGIDs[rg].c_str()); ifprintf(bestRGF,"\t%s",bestInds[rg] >= 0 ? vbid.pGenotypes->indids[bestInds[rg]].c_str() : "NA"); ifprintf(bestRGF,"\t%d\t%d\t%.2lf",vbid.nMarkers,vbid.mixOut.numReads[(rg+1)*4],(double)vbid.mixOut.numReads[(rg+1)*4]/(double)vbid.mixOut.numGenos[(rg+1)*4]); if ( args.bFreeNone ) { ifprintf(bestRGF,"\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bFreeMixOnly ) { ifprintf(bestRGF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA",vbid.mixOut.fMixs[rg+1],vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1]); } else if ( args.bFreeRefBiasOnly ) { ifprintf(bestRGF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1],vbid.mixOut.refHets[rg+1],vbid.mixOut.refAlts[rg+1]); } else if ( args.bFreeFull ) { ifprintf(bestRGF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.fMixs[rg+1],vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1],vbid.mixOut.refHets[rg+1],vbid.mixOut.refAlts[rg+1]); } else { error("Invalid option in handling bFree"); } if ( args.bChipNone || bestInds[0] < 0 ) { ifprintf(bestRGF,"\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"); } else if ( args.bChipMixOnly ) { ifprintf(bestRGF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA\t%.3lf\t%.4lf\t%.4lf",vbid.bestOut.fMixs[rg+1], vbid.bestOut.llk1s[rg+1], vbid.bestOut.llk0s[rg+1], (double)vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+1], (double)vbid.bestOut.numReads[(rg+1)*4+2]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+2], (double)vbid.bestOut.numReads[(rg+1)*4+3]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+3]); } else if ( args.bChipMixOnly ) { ifprintf(bestRGF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf",vbid.bestOut.llk1s[rg+1], vbid.bestOut.llk0s[rg+1], vbid.bestOut.refHets[rg+1], vbid.bestOut.refAlts[rg+1], (double)vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+1], (double)vbid.bestOut.numReads[(rg+1)*4+2]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4]/vbid.bestOut.numGenos[(rg+1)*4+2], (double)vbid.bestOut.numReads[(rg+1)*4+3]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+3]); } else if ( args.bChipFull ) { ifprintf(bestRGF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf", vbid.bestOut.fMixs[rg+1], vbid.bestOut.llk1s[rg+1], vbid.bestOut.llk0s[rg+1], vbid.bestOut.refHets[rg+1], vbid.bestOut.refAlts[rg+1], (double)vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+1], (double)vbid.bestOut.numReads[(rg+1)*4+2]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+2], (double)vbid.bestOut.numReads[(rg+1)*4+3]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+3]); } else { error("Invalid option in handling bChip"); } ifprintf(bestRGF,"\n"); } ifclose(bestRGF); } time(&t); Logger::gLogger->writeLog("Analysis finished on %s",ctime(&t)); return 0; }
int VcfMac::execute(int argc, char **argv) { String inputVcf = ""; int minAC = -1; String sampleSubset = ""; String filterList = ""; bool params = false; IntervalTree<int> regions; std::vector<int> intersection; // Read in the parameters. ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inputVcf) LONG_PARAMETER_GROUP("Optional Parameters") LONG_STRINGPARAMETER("sampleSubset", &sampleSubset) LONG_INTPARAMETER("minAC", &minAC) LONG_STRINGPARAMETER("filterList", &filterList) LONG_PARAMETER("params", ¶ms) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // Check that all files were specified. if(inputVcf == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--in\", a required parameter.\n\n"; return(-1); } if(params) { inputParameters.Status(); } // Open the two input files. VcfFileReader inFile; VcfHeader header; VcfRecord record; // Open the file if(sampleSubset.IsEmpty()) { inFile.open(inputVcf, header); } else { inFile.open(inputVcf, header, sampleSubset, NULL, NULL); } // Add the discard rule for minor allele count. if(minAC >= 0) { inFile.addDiscardMinMinorAlleleCount(minAC, NULL); } if(!filterList.IsEmpty()) { // Open the filter list. IFILE regionFile = ifopen(filterList, "r"); String regionLine; StringArray regionColumn; int start; int end; int intervalVal = 1; if(regionFile == NULL) { std::cerr << "Failed to open " << filterList << ", so keeping all positions\n"; filterList.Clear(); } else { while( regionFile->isOpen() && !regionFile->ifeof()) { // Read the next interval regionLine.Clear(); regionLine.ReadLine(regionFile); if(regionLine.IsEmpty()) { // Nothing on this line, continue to the next. continue; } regionColumn.ReplaceColumns(regionLine, ' '); if(regionColumn.Length() != 2) { std::cerr << "Improperly formatted region line: " << regionLine << "; skipping to the next line.\n"; continue; } // Convert the columns to integers. if(!regionColumn[0].AsInteger(start)) { // The start position (1st column) is not an integer. std::cerr << "Improperly formatted region line, start position " << "(1st column) is not an integer: " << regionColumn[0] << "; Skipping to the next line.\n"; continue; } if(!regionColumn[1].AsInteger(end)) { // The start position (1st column) is not an integer. std::cerr << "Improperly formatted region line, end position " << "(2nd column) is not an integer: " << regionColumn[1] << "; Skipping to the next line.\n"; continue; } // Add 1-based inclusive intervals. regions.add(start,end, intervalVal); } } } int numReadRecords = 0; while( inFile.readRecord(record)) { if(!filterList.IsEmpty()) { // Check if the region should be kept. intersection.clear(); regions.get_intersecting_intervals(record.get1BasedPosition(), intersection); if(intersection.empty()) { // not in the interval, so continue to the next record. continue; } } ++numReadRecords; // Loop through the number of possible alternates. unsigned int numAlts = record.getNumAlts(); int minAlleleCount = -1; int curAlleleCount = 0; int totalAlleleCount = 0; for(unsigned int i = 0; i <= numAlts; i++) { curAlleleCount = record.getAlleleCount(i); if((minAlleleCount == -1) || (curAlleleCount < minAlleleCount)) { minAlleleCount = curAlleleCount; } totalAlleleCount += curAlleleCount; } if(totalAlleleCount != 0) { double maf = (double)minAlleleCount/totalAlleleCount; std::cout << record.getIDStr() << "\t" << minAlleleCount << "\t" << maf << "\n"; } } inFile.close(); // std::cerr << "\n\t# Records: " << numReadRecords << "\n"; // return success. return(0); }
// Dump the reference information from specified SAM/BAM file. int DumpRefInfo::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; bool noeof = false; bool printRecordRefs = false; bool params = false; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_STRINGPARAMETER("in", &inFile) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("printRecordRefs", &printRecordRefs) LONG_PARAMETER("params", ¶ms) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } if(params) { inputParameters.Status(); } // Open the input file for reading. SamFile samIn; samIn.OpenForRead(inFile); // Read the sam header. SamFileHeader samHeader; samIn.ReadHeader(samHeader); const SamReferenceInfo& refInfo = samHeader.getReferenceInfo(); int numReferences = refInfo.getNumEntries(); for(int i = 0; i < numReferences; i++) { std::cout << "Reference Index " << i; std::cout << "; Name: " << refInfo.getReferenceName(i) << std::endl; } if(numReferences == 0) { // There is no reference info. std::cerr << "The header contains no reference information.\n"; } // If we are to print the references as found in the records, loop // through reading the records. if(printRecordRefs) { SamRecord samRecord; // Track the prev name/id. std::string prevName = ""; int prevID = -2; int recCount = 0; // track the num records in a ref. // Keep reading records until ReadRecord returns false. while(samIn.ReadRecord(samHeader, samRecord)) { const char* name = samRecord.getReferenceName(); int id = samRecord.getReferenceID(); if((strcmp(name, prevName.c_str()) != 0) || (id != prevID)) { if(prevID != -2) { std::cout << "\tRef ID: " << prevID << "\tRef Name: " << prevName << "\tNumRecs: " << recCount << std::endl; } recCount = 0; prevID = id; prevName = name; } ++recCount; } // Print the last index. if(prevID != -2) { std::cout << "\tRef ID: " << prevID << "\tRef Name: " << prevName << "\tNumRecs: " << recCount << std::endl; } } return(SamStatus::SUCCESS); }