void testReadBam() { SamFile inSam; assert(inSam.OpenForRead("testFiles/testBam.bam")); // Call generic test which since the sam and bam are identical, should // contain the same results. testRead(inSam); inSam.Close(); testFlagRead("testFiles/testBam.bam"); }
bool BamProcessor::finalize (bool success) { if (outfile_.IsOpen ()) { trclog << "Closing output file" << std::endl; outfile_.Close (); } if (logfile_.is_open ()) { time_t t = time (NULL); print_stats (logfile_); logfile_ << "\nFinished " << (success ? "successfully" : "due to error") << " at " << asctime (localtime (&t)) << "\n"; trclog << "Closing log file" << std::endl; logfile_.close (); } if (info.enabled ()) print_stats (info.o_); return true; }
int Dedup_LowMem::execute(int argc, char** argv) { /* -------------------------------- * process the arguments * -------------------------------*/ String inFile, outFile, logFile; myDoRecab = false; bool removeFlag = false; bool verboseFlag = false; myForceFlag = false; myNumMissingMate = 0; myMinQual = DEFAULT_MIN_QUAL; String excludeFlags = "0xB04"; uint16_t intExcludeFlags = 0; bool noeof = false; bool params = false; LongParamContainer parameters; parameters.addGroup("Required Parameters"); parameters.addString("in", &inFile); parameters.addString("out", &outFile); parameters.addGroup("Optional Parameters"); parameters.addInt("minQual", & myMinQual); parameters.addString("log", &logFile); parameters.addBool("oneChrom", &myOneChrom); parameters.addBool("recab", &myDoRecab); parameters.addBool("rmDups", &removeFlag); parameters.addBool("force", &myForceFlag); parameters.addString("excludeFlags", &excludeFlags); parameters.addBool("verbose", &verboseFlag); parameters.addBool("noeof", &noeof); parameters.addBool("params", ¶ms); parameters.addPhoneHome(VERSION); myRecab.addRecabSpecificParameters(parameters); ParameterList inputParameters; inputParameters.Add(new LongParameters ("Input Parameters", parameters.getLongParameterList())); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } if(inFile.IsEmpty()) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "Specify an input file" << std::endl; return EXIT_FAILURE; } if(outFile.IsEmpty()) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "Specify an output file" << std::endl; return EXIT_FAILURE; } intExcludeFlags = excludeFlags.AsInteger(); if(myForceFlag && SamFlag::isDuplicate(intExcludeFlags)) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "Cannot specify --force and Duplicate in the excludeFlags. Since --force indicates to override" << " previous duplicate setting and the excludeFlags says to skip those, you can't do both.\n"; return EXIT_FAILURE; } if(!SamFlag::isSecondary(intExcludeFlags)) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "ERROR: Secondary reads must be excluded, edit --excludeFlags to include 0x0100\n"; return EXIT_FAILURE; } if(!(intExcludeFlags & SamFlag::SUPPLEMENTARY_ALIGNMENT)) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "ERROR: Supplementary reads must be excluded, edit --excludeFlags to include 0x0800\n"; return EXIT_FAILURE; } if(logFile.IsEmpty()) { logFile = outFile + ".log"; } if(myDoRecab) { int status = myRecab.processRecabParam(); if(status != 0) { inputParameters.Status(); return(status); } } if(params) { inputParameters.Status(); } Logger::gLogger = new Logger(logFile.c_str(), verboseFlag); /* ------------------------------------------------------------------- * The arguments are processed. Prepare the input BAM file, * instantiate dedup_LowMem, and construct the read group library map * ------------------------------------------------------------------*/ SamFile samIn; samIn.OpenForRead(inFile.c_str()); // If the file isn't sorted it will throw an exception. samIn.setSortedValidation(SamFile::COORDINATE); SamFileHeader header; samIn.ReadHeader(header); buildReadGroupLibraryMap(header); lastReference = -1; lastCoordinate = -1; // for keeping some basic statistics uint32_t recordCount = 0; uint32_t pairedCount = 0; uint32_t properPairCount = 0; uint32_t unmappedCount = 0; uint32_t reverseCount = 0; uint32_t qualCheckFailCount = 0; uint32_t secondaryCount = 0; uint32_t supplementaryCount = 0; uint32_t excludedCount = 0; // Now we start reading records SamRecord* recordPtr; SamStatus::Status returnStatus = SamStatus::SUCCESS; while(returnStatus == SamStatus::SUCCESS) { recordPtr = mySamPool.getRecord(); if(recordPtr == NULL) { std::cerr << "Failed to allocate enough records\n"; return(-1); } if(!samIn.ReadRecord(header, *recordPtr)) { returnStatus = samIn.GetStatus(); continue; } // Take note of properties of this record int flag = recordPtr->getFlag(); if(SamFlag::isPaired(flag)) ++pairedCount; if(SamFlag::isProperPair(flag)) ++properPairCount; if(SamFlag::isReverse(flag)) ++reverseCount; if(SamFlag::isQCFailure(flag)) ++qualCheckFailCount; if(SamFlag::isSecondary(flag)) ++secondaryCount; if(flag & SamFlag::SUPPLEMENTARY_ALIGNMENT) ++supplementaryCount; if(!SamFlag::isMapped(flag)) ++unmappedCount; // put the record in the appropriate maps: // single reads go in myFragmentMap // paired reads go in myPairedMap recordCount = samIn.GetCurrentRecordCount(); // if we have moved to a new position, look back at previous reads for duplicates if (hasPositionChanged(*recordPtr)) { cleanupPriorReads(recordPtr); } // Determine if this read should be checked for duplicates. if((!SamFlag::isMapped(flag)) || ((flag & intExcludeFlags) != 0)) { ++excludedCount; // No deduping done on this record, but still build the recab table. if(myDoRecab) { myRecab.processReadBuildTable(*recordPtr); } // Nothing more to do with this record, so // release the pointer. mySamPool.releaseRecord(recordPtr); } else { if(SamFlag::isDuplicate(flag) && !myForceFlag) { // Error: Marked duplicates, and duplicates aren't excluded. Logger::gLogger->error("There are records already duplicate marked."); Logger::gLogger->error("Use -f to clear the duplicate flag and start the dedup_LowMem procedure over"); } checkDups(*recordPtr, recordCount); mySamPool.releaseRecord(recordPtr); } // let the user know we're not napping if (verboseFlag && (recordCount % 100000 == 0)) { Logger::gLogger->writeLog("recordCount=%u singleKeyMap=%u pairedKeyMap=%u, dictSize=%u", recordCount, myFragmentMap.size(), myPairedMap.size(), myMateMap.size()); } } // we're finished reading record so clean up the duplicate search and // close the input file cleanupPriorReads(NULL); samIn.Close(); // print some statistics Logger::gLogger->writeLog("--------------------------------------------------------------------------"); Logger::gLogger->writeLog("SUMMARY STATISTICS OF THE READS"); Logger::gLogger->writeLog("Total number of reads: %u",recordCount); Logger::gLogger->writeLog("Total number of paired-end reads: %u", pairedCount); Logger::gLogger->writeLog("Total number of properly paired reads: %u", properPairCount); Logger::gLogger->writeLog("Total number of unmapped reads: %u", unmappedCount); Logger::gLogger->writeLog("Total number of reverse strand mapped reads: %u", reverseCount); Logger::gLogger->writeLog("Total number of QC-failed reads: %u", qualCheckFailCount); Logger::gLogger->writeLog("Total number of secondary reads: %u", secondaryCount); Logger::gLogger->writeLog("Total number of supplementary reads: %u", supplementaryCount); Logger::gLogger->writeLog("Size of singleKeyMap (must be zero): %u", myFragmentMap.size()); Logger::gLogger->writeLog("Size of pairedKeyMap (must be zero): %u", myPairedMap.size()); Logger::gLogger->writeLog("Total number of missing mates: %u", myNumMissingMate); Logger::gLogger->writeLog("Total number of reads excluded from duplicate checking: %u", excludedCount); Logger::gLogger->writeLog("--------------------------------------------------------------------------"); Logger::gLogger->writeLog("Sorting the indices of %d duplicated records", myDupList.size()); // sort the indices of duplicate records std::sort(myDupList.begin(), myDupList.end(), std::less<uint32_t> ()); // get ready to write the output file by making a second pass // through the input file samIn.OpenForRead(inFile.c_str()); samIn.ReadHeader(header); SamFile samOut; samOut.OpenForWrite(outFile.c_str()); samOut.WriteHeader(header); // If we are recalibrating, output the model information. if(myDoRecab) { myRecab.modelFitPrediction(outFile); } // an iterator to run through the duplicate indices int currentDupIndex = 0; bool moreDups = !myDupList.empty(); // let the user know what we're doing Logger::gLogger->writeLog("\nWriting %s", outFile.c_str()); // count the duplicate records as a check uint32_t singleDuplicates(0), pairedDuplicates(0); // start reading records and writing them out SamRecord record; while(samIn.ReadRecord(header, record)) { uint32_t currentIndex = samIn.GetCurrentRecordCount(); bool foundDup = moreDups && (currentIndex == myDupList[currentDupIndex]); // modify the duplicate flag and write out the record, // if it's appropriate int flag = record.getFlag(); if (foundDup) { // this record is a duplicate, so mark it. record.setFlag( flag | 0x400 ); currentDupIndex++; // increment duplicate counters to verify we found them all if ( ( ( flag & 0x0001 ) == 0 ) || ( flag & 0x0008 ) ) { // unpaired or mate unmapped singleDuplicates++; } else { pairedDuplicates++; } // recalibrate if necessary. if(myDoRecab) { myRecab.processReadApplyTable(record); } // write the record if we are not removing duplicates if (!removeFlag ) samOut.WriteRecord(header, record); } else { if(myForceFlag) { // this is not a duplicate we've identified but we want to // remove any duplicate marking record.setFlag( flag & 0xfffffbff ); // unmark duplicate } // Not a duplicate, so recalibrate if necessary. if(myDoRecab) { myRecab.processReadApplyTable(record); } samOut.WriteRecord(header, record); } // Let the user know we're still here if (verboseFlag && (currentIndex % 100000 == 0)) { Logger::gLogger->writeLog("recordCount=%u", currentIndex); } } // We're done. Close the files and print triumphant messages. samIn.Close(); samOut.Close(); Logger::gLogger->writeLog("Successfully %s %u unpaired and %u paired duplicate reads", removeFlag ? "removed" : "marked" , singleDuplicates, pairedDuplicates/2); Logger::gLogger->writeLog("\nDedup_LowMem complete!"); return 0; }
int Bam2FastQ::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; bool readName = false; String refFile = ""; String firstOut = ""; String secondOut = ""; String unpairedOut = ""; bool interleave = false; bool noeof = false; bool gzip = false; bool params = false; myOutBase = ""; myNumMateFailures = 0; myNumPairs = 0; myNumUnpaired = 0; mySplitRG = false; myQField = ""; myNumQualTagErrors = 0; myReverseComp = true; myRNPlus = false; myFirstRNExt = DEFAULT_FIRST_EXT; mySecondRNExt = DEFAULT_SECOND_EXT; myCompression = InputFile::DEFAULT; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inFile) LONG_PARAMETER_GROUP("Optional Parameters") LONG_PARAMETER("readName", &readName) LONG_PARAMETER("splitRG", &mySplitRG) LONG_STRINGPARAMETER("qualField", &myQField) LONG_PARAMETER("merge", &interleave) LONG_STRINGPARAMETER("refFile", &refFile) LONG_STRINGPARAMETER("firstRNExt", &myFirstRNExt) LONG_STRINGPARAMETER("secondRNExt", &mySecondRNExt) LONG_PARAMETER("rnPlus", &myRNPlus) LONG_PARAMETER("noReverseComp", &myReverseComp) LONG_PARAMETER("gzip", &gzip) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PARAMETER_GROUP("Optional OutputFile Names") LONG_STRINGPARAMETER("outBase", &myOutBase) LONG_STRINGPARAMETER("firstOut", &firstOut) LONG_STRINGPARAMETER("secondOut", &secondOut) LONG_STRINGPARAMETER("unpairedOut", &unpairedOut) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } if(gzip) { myCompression = InputFile::GZIP; } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } // Cannot specify both interleaved & secondOut since secondOut would be N/A. if(interleave && !secondOut.IsEmpty()) { usage(); inputParameters.Status(); std::cerr << "ERROR: Cannot specify --merge & --secondOut.\n"; return(-1); } // Cannot specify both interleaved & secondOut since secondOut would be N/A. if(interleave && !secondOut.IsEmpty()) { usage(); inputParameters.Status(); std::cerr << "ERROR: Cannot specify --merge & --secondOut.\n"; return(-1); } // Cannot specify both splitRG & firstOut/secondOut/unpairedOut // since it needs a different file for each RG. if(mySplitRG && (!firstOut.IsEmpty() || !secondOut.IsEmpty() || !unpairedOut.IsEmpty())) { usage(); inputParameters.Status(); std::cerr << "ERROR: Cannot specify --splitRG & --firstOut/--secondOut/--unpairedOut.\n"; std::cerr << "Use --outBase instead.\n"; return(-1); } // Cannot specify splitRG & output to stdout. if(mySplitRG && (myOutBase[0] == '-')) { usage(); inputParameters.Status(); std::cerr << "ERROR: Cannot specify --splitRG & write to stdout.\n"; return(-1); } // Check to see if the out file was specified, if not, generate it from // the input filename. if(myOutBase == "") { // Just remove the extension from the input filename. int extStart = inFile.FastFindLastChar('.'); if(extStart <= 0) { myOutBase = inFile; } else { myOutBase = inFile.Left(extStart); } } if(mySplitRG) { std::string fqList = myOutBase.c_str(); fqList += ".list"; myFqList = ifopen(fqList.c_str(), "w"); ifprintf(myFqList, "MERGE_NAME\tFASTQ1\tFASTQ2\tRG\n"); } // Check to see if the first/second/single-ended were specified and // if not, set them. myFirstFileNameExt = "_1.fastq"; mySecondFileNameExt = "_2.fastq"; myUnpairedFileNameExt = ".fastq"; if(interleave) { myFirstFileNameExt = "_interleaved.fastq"; myFirstFileNameExt = "_interleaved.fastq"; } getFileName(firstOut, myFirstFileNameExt); getFileName(secondOut, mySecondFileNameExt); getFileName(unpairedOut, myUnpairedFileNameExt); if(params) { inputParameters.Status(); } // Open the files for reading/writing. // Open prior to opening the output files, // so if there is an error, the outputs don't get created. SamFile samIn; samIn.OpenForRead(inFile, &mySamHeader); // Skip non-primary reads. samIn.SetReadFlags(0, 0x0100); // Open the output files if not splitting RG if(!mySplitRG) { myUnpairedFile = ifopen(unpairedOut, "w", myCompression); // Only open the first file if it is different than an already opened file. if(firstOut != unpairedOut) { myFirstFile = ifopen(firstOut, "w", myCompression); } else { myFirstFile = myUnpairedFile; } // If it is interleaved or the 2nd file is not a new name, set it appropriately. if(interleave || secondOut == firstOut) { mySecondFile = myFirstFile; } else if(secondOut == unpairedOut) { mySecondFile = myUnpairedFile; } else { mySecondFile = ifopen(secondOut, "w", myCompression); } if(myUnpairedFile == NULL) { std::cerr << "Failed to open " << unpairedOut << " so can't convert bam2FastQ.\n"; return(-1); } if(myFirstFile == NULL) { std::cerr << "Failed to open " << firstOut << " so can't convert bam2FastQ.\n"; return(-1); } if(mySecondFile == NULL) { std::cerr << "Failed to open " << secondOut << " so can't convert bam2FastQ.\n"; return(-1); } } if((readName) || (strcmp(mySamHeader.getSortOrder(), "queryname") == 0)) { readName = true; } else { // defaulting to coordinate sorted. samIn.setSortedValidation(SamFile::COORDINATE); } // Setup the '=' translation if the reference was specified. if(!refFile.IsEmpty()) { GenomeSequence* refPtr = new GenomeSequence(refFile); samIn.SetReadSequenceTranslation(SamRecord::BASES); samIn.SetReference(refPtr); } SamRecord* recordPtr; int16_t samFlag; SamStatus::Status returnStatus = SamStatus::SUCCESS; while(returnStatus == SamStatus::SUCCESS) { recordPtr = myPool.getRecord(); if(recordPtr == NULL) { // Failed to allocate a new record. throw(std::runtime_error("Failed to allocate a new SAM/BAM record")); } if(!samIn.ReadRecord(mySamHeader, *recordPtr)) { // Failed to read a record. returnStatus = samIn.GetStatus(); continue; } // Have a record. Check to see if it is a pair or unpaired read. samFlag = recordPtr->getFlag(); if(SamFlag::isPaired(samFlag)) { if(readName) { handlePairedRN(*recordPtr); } else { handlePairedCoord(*recordPtr); } } else { ++myNumUnpaired; writeFastQ(*recordPtr, myUnpairedFile, myUnpairedFileNameExt); } } // Flush All cleanUpMateMap(0, true); if(returnStatus == SamStatus::NO_MORE_RECS) { returnStatus = SamStatus::SUCCESS; } samIn.Close(); closeFiles(); // Output the results std::cerr << "\nFound " << myNumPairs << " read pairs.\n"; std::cerr << "Found " << myNumUnpaired << " unpaired reads.\n"; if(myNumMateFailures != 0) { std::cerr << "Failed to find mates for " << myNumMateFailures << " reads, so they were written as unpaired\n" << " (not included in either of the above counts).\n"; } if(myNumQualTagErrors != 0) { std::cerr << myNumQualTagErrors << " records did not have tag " << myQField.c_str() << " or it was invalid, so the quality field was used for those records.\n"; } return(returnStatus); }
int main(int argc, char ** argv) { gpLogger = new Logger; static struct option getopt_long_options[] = { // Input options { "fasta", required_argument, NULL, 'f'}, { "in", required_argument, NULL, 'i'}, { "out", required_argument, NULL, 'o'}, { "verbose", no_argument, NULL, 'v'}, { "log", required_argument, NULL, 'l'}, { "clear", no_argument, NULL, 0}, { "AS", required_argument, NULL, 0}, { "UR", required_argument, NULL, 0}, { "SP", required_argument, NULL, 0}, { "HD", required_argument, NULL, 0}, { "RG", required_argument, NULL, 0}, { "PG", required_argument, NULL, 0}, { "checkSQ", no_argument, NULL, 0}, { NULL, 0, NULL, 0 }, }; int n_option_index = 0, c; std::string sAS, sUR, sSP, sFasta, sInFile, sOutFile, sLogFile; bool bClear, bCheckSQ, bVerbose; std::vector<std::string> vsHDHeaders, vsRGHeaders, vsPGHeaders; bCheckSQ = bVerbose = false; bClear = true; while ( (c = getopt_long(argc, argv, "vf:i:o:l:", getopt_long_options, &n_option_index)) != -1 ) { // std::cout << getopt_long_options[n_option_index].name << "\t" << optarg << std::endl; if ( c == 'f' ) { sFasta = optarg; } else if ( c == 'i' ) { sInFile = optarg; } else if ( c == 'o' ) { sOutFile = optarg; } else if ( c == 'v' ) { bVerbose = true; } else if ( c == 'l' ) { sLogFile = optarg; } else if ( strcmp(getopt_long_options[n_option_index].name,"AS") == 0 ) { sAS = optarg; } else if ( strcmp(getopt_long_options[n_option_index].name,"UR") == 0 ) { sUR = optarg; } else if ( strcmp(getopt_long_options[n_option_index].name,"SP") == 0 ) { sSP = optarg; } else if ( strcmp(getopt_long_options[n_option_index].name,"HD") == 0 ) { vsHDHeaders.push_back(optarg); } else if ( strcmp(getopt_long_options[n_option_index].name,"RG") == 0 ) { vsRGHeaders.push_back(optarg); } else if ( strcmp(getopt_long_options[n_option_index].name,"PG") == 0 ) { vsPGHeaders.push_back(optarg); } else if ( strcmp(getopt_long_options[n_option_index].name,"checkSQ") == 0 ) { bCheckSQ = true; } else { std::cerr << "Error: Unrecognized option " << getopt_long_options[n_option_index].name << std::endl; abort(); } } if ( optind < argc ) { printUsage(std::cerr); gpLogger->error("non-option argument %s exist ",argv[optind]); } if ( sInFile.empty() || sOutFile.empty() ) { printUsage(std::cerr); gpLogger->error("Input and output files are required"); } if ( sLogFile.compare("__NONE__") == 0 ) { sLogFile = (sOutFile + ".log"); } gpLogger->open(sLogFile.c_str(), bVerbose); if ( ( bCheckSQ ) && ( sFasta.empty() ) ) { printUsage(std::cerr); gpLogger->error("--checkSQ option must be used with --fasta option"); } // check whether each header line starts with a correct tag checkHeaderStarts(vsHDHeaders, "@HD\t"); checkHeaderStarts(vsRGHeaders, "@RG\t"); checkHeaderStarts(vsPGHeaders, "@PG\t"); gpLogger->write_log("Arguments in effect:"); gpLogger->write_log("\t--in [%s]",sInFile.c_str()); gpLogger->write_log("\t--out [%s]",sOutFile.c_str()); gpLogger->write_log("\t--log [%s]",sLogFile.c_str()); gpLogger->write_log("\t--fasta [%s]",sFasta.c_str()); gpLogger->write_log("\t--AS [%s]",sAS.c_str()); gpLogger->write_log("\t--UR [%s]",sUR.c_str()); gpLogger->write_log("\t--SP [%s]",sSP.c_str()); gpLogger->write_log("\t--checkSQ [%s]",bClear ? "ON" : "OFF" ); if ( vsHDHeaders.empty() ) { gpLogger->write_log("\t--HD []"); } else { gpLogger->write_log("\t--HD [%s]",vsHDHeaders[0].c_str()); } if ( vsRGHeaders.empty() ) { gpLogger->write_log("\t--RG []"); } else { gpLogger->write_log("\t--RG [%s]",vsRGHeaders[0].c_str()); } if ( vsPGHeaders.empty() ) { gpLogger->write_log("\t--PG []"); } else { for(uint32_t i=0; i < vsPGHeaders.size(); ++i) { gpLogger->write_log("\t--PG [%s]",vsPGHeaders[i].c_str()); } } if ( (vsHDHeaders.empty() ) && ( vsRGHeaders.empty() ) && ( vsPGHeaders.empty() ) && ( !bClear ) && ( sFasta.empty() ) ) { gpLogger->warning("No option is in effect for modifying BAM files. The input and output files will be identical"); } if ( ( vsHDHeaders.size() > 1 ) || ( vsRGHeaders.size() > 1 ) ) { gpLogger->error("HD and RG headers cannot be multiple"); } FastaFile fastaFile; if ( ! sFasta.empty() ) { if ( fastaFile.open(sFasta.c_str()) ) { gpLogger->write_log("Reading the reference file %s",sFasta.c_str()); fastaFile.readThru(); fastaFile.close(); gpLogger->write_log("Finished reading the reference file %s",sFasta.c_str()); } else { gpLogger->error("Failed to open reference file %s",sFasta.c_str()); } } SamFile samIn; SamFile samOut; if ( ! samIn.OpenForRead(sInFile.c_str()) ) { gpLogger->error("Cannot open BAM file %s for reading - %s",sInFile.c_str(), SamStatus::getStatusString(samIn.GetStatus()) ); } if ( ! samOut.OpenForWrite(sOutFile.c_str()) ) { gpLogger->error("Cannot open BAM file %s for writing - %s",sOutFile.c_str(), SamStatus::getStatusString(samOut.GetStatus()) ); } SamFileHeader samHeader; SamHeaderRecord* pSamHeaderRecord; samIn.ReadHeader(samHeader); // check the sanity of SQ file // make sure the SN and LN matches, with the same order if ( bCheckSQ ) { unsigned int numSQ = 0; while( (pSamHeaderRecord = samHeader.getNextHeaderRecord()) != NULL ) { if ( pSamHeaderRecord->getType() == SamHeaderRecord::SQ ) { ++numSQ; } } if ( numSQ != fastaFile.vsSequenceNames.size() ) { gpLogger->error("# of @SQ tags are different from the original BAM and the reference file"); } // iterator over all @SQ objects for(unsigned int i=0; i < numSQ; ++i) { pSamHeaderRecord = samHeader.getSQ(fastaFile.vsSequenceNames[i].c_str()); if ( fastaFile.vsSequenceNames[i].compare(pSamHeaderRecord->getTagValue("SN")) != 0 ) { gpLogger->error("SequenceName is not identical between fasta and input BAM file"); } else if ( static_cast<int>(fastaFile.vnSequenceLengths[i]) != atoi(pSamHeaderRecord->getTagValue("LN")) ) { gpLogger->error("SequenceLength is not identical between fasta and input BAM file"); } else { if ( !sAS.empty() ) samHeader.setSQTag("AS",sAS.c_str(),fastaFile.vsSequenceNames[i].c_str()); samHeader.setSQTag("M5",fastaFile.vsMD5sums[i].c_str(),fastaFile.vsSequenceNames[i].c_str()); if ( !sUR.empty() ) samHeader.setSQTag("UR",sUR.c_str(),fastaFile.vsSequenceNames[i].c_str()); if ( !sSP.empty() ) samHeader.setSQTag("SP",sSP.c_str(),fastaFile.vsSequenceNames[i].c_str()); } } gpLogger->write_log("Finished checking the consistency of SQ tags"); } else { gpLogger->write_log("Skipped checking the consistency of SQ tags"); } // go over the headers again, // assuming order of HD, SQ, RG, PG, and put proper tags at the end of the original tags gpLogger->write_log("Creating the header of new output file"); //SamFileHeader outHeader; samHeader.resetHeaderRecordIter(); for(unsigned int i=0; i < vsHDHeaders.size(); ++i) { samHeader.addHeaderLine(vsHDHeaders[i].c_str()); } /* for(int i=0; i < fastaFile.vsSequenceNames.size(); ++i) { std::string s("@SQ\tSN:"); char buf[1024]; s += fastaFile.vsSequenceNames[i]; sprintf(buf,"\tLN:%d",fastaFile.vnSequenceLengths[i]); s += buf; if ( !sAS.empty() ) { sprintf(buf,"\tAS:%s",sAS.c_str()); s += buf; } if ( !sUR.empty() ) { sprintf(buf,"\tUR:%s",sUR.c_str()); s += buf; } sprintf(buf,"\tM5:%s",fastaFile.vsMD5sums[i].c_str()); s += buf; if ( !sSP.empty() ) { sprintf(buf,"\tSP:%s",sSP.c_str()); s += buf; } outHeader.addHeaderLine(s.c_str()); }*/ for(unsigned int i=0; i < vsRGHeaders.size(); ++i) { samHeader.addHeaderLine(vsRGHeaders[i].c_str()); } for(unsigned int i=0; i < vsPGHeaders.size(); ++i) { samHeader.addHeaderLine(vsPGHeaders[i].c_str()); } samOut.WriteHeader(samHeader); gpLogger->write_log("Adding %d HD, %d RG, and %d PG headers",vsHDHeaders.size(), vsRGHeaders.size(), vsPGHeaders.size()); gpLogger->write_log("Finished writing output headers"); // parse RG tag and get RG ID to append std::string sRGID; if ( ! vsRGHeaders.empty() ) { std::vector<std::string> tokens; FastaFile::tokenizeString( vsRGHeaders[0].c_str(), tokens ); for(unsigned int i=0; i < tokens.size(); ++i) { if ( tokens[i].find("ID:") == 0 ) { sRGID = tokens[i].substr(3); } } } gpLogger->write_log("Writing output BAM file"); SamRecord samRecord; while (samIn.ReadRecord(samHeader, samRecord) == true) { if ( !sRGID.empty() ) { if ( samRecord.addTag("RG",'Z',sRGID.c_str()) == false ) { gpLogger->error("Failed to add a RG tag %s",sRGID.c_str()); } // temporary code added if ( strncmp(samRecord.getReadName(),"seqcore_",8) == 0 ) { char buf[1024]; sprintf(buf,"UM%s",samRecord.getReadName()+8); samRecord.setReadName(buf); } } samOut.WriteRecord(samHeader, samRecord); //if ( samIn.GetCurrentRecordCount() == 1000 ) break; } samOut.Close(); gpLogger->write_log("Successfully written %d records",samIn.GetCurrentRecordCount()); delete gpLogger; return 0; }
int ClipOverlap::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String outFile = ""; String storeOrig = ""; bool readName = false; bool noRNValidate = false; bool stats = false; int poolSize = DEFAULT_POOL_SIZE; bool unmapped = false; bool noeof = false; bool params = false; String excludeFlags = "0xF0C"; // TODO, cleanup legacy parameters ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inFile) LONG_STRINGPARAMETER("out", &outFile) LONG_PARAMETER_GROUP("Optional Parameters") LONG_STRINGPARAMETER("storeOrig", &storeOrig) LONG_PARAMETER("readName", &readName) LONG_PARAMETER ("noRNValidate", &noRNValidate) LONG_PARAMETER ("stats", &stats) LONG_PARAMETER ("overlapsOnly", &myOverlapsOnly) LONG_STRINGPARAMETER ("excludeFlags", &excludeFlags) LONG_PARAMETER("unmapped", &unmapped) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PARAMETER_GROUP("Coordinate Processing Optional Parameters") LONG_INTPARAMETER("poolSize", &poolSize) LONG_PARAMETER("poolSkipOverlap", &myPoolSkipOverlap) LONG_PHONEHOME(VERSION) BEGIN_LEGACY_PARAMETERS() LONG_PARAMETER ("clipsOnly", &myOverlapsOnly) LONG_PARAMETER("poolSkipClip", &myPoolSkipOverlap) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { printUsage(std::cerr); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } // Check to see if the out file was specified, if not, report an error. if(outFile == "") { printUsage(std::cerr); inputParameters.Status(); // Out file was not specified but it is mandatory. std::cerr << "--out is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } if((storeOrig.Length() != 0) && (storeOrig.Length() != 2)) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "--storeOrig tag name must be 2 characters.\n"; return(-1); } myOverlapHandler = new OverlapClipLowerBaseQual(); if(myOverlapHandler == NULL) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "Failed to allocate the overlap handler\n"; return(-1); } if(unmapped) { myOverlapHandler->markAsUnmapped(); } // Setup the overlap handler. myOverlapHandler->keepStats(stats); if(storeOrig.Length() != 0) { myOverlapHandler->storeOrigCigar(storeOrig); } myIntExcludeFlags = excludeFlags.AsInteger(); if(params) { inputParameters.Status(); } // For each step process the file. // Open the files & read/write the sam header. SamStatus::Status runStatus = SamStatus::SUCCESS; for(int i = 1; i <= myOverlapHandler->numSteps(); i++) { // Open the file for reading. mySamHeader.resetHeader(); SamFile samIn(inFile, SamFile::READ, &mySamHeader); SamFile* samOutPtr = NULL; // Check if writing, if so, open the output file. if(i == myOverlapHandler->numSteps()) { samOutPtr = new SamFile(outFile, SamFile::WRITE, &mySamHeader); } if(readName) { if(!noRNValidate) { samIn.setSortedValidation(SamFile::QUERY_NAME); } runStatus = handleSortedByReadName(samIn, samOutPtr); } else { // Coordinate sorted, so work with the pools. samIn.setSortedValidation(SamFile::COORDINATE); myPool.setMaxAllocatedRecs(poolSize); // Reset the number of failures myNumMateFailures = 0; myNumPoolFail = 0; myNumPoolFailNoHandle = 0; myNumPoolFailHandled = 0; myNumOutOfOrder = 0; // Run by coordinate if(samOutPtr != NULL) { // Setup the output buffer for writing. SamCoordOutput outputBuffer(myPool); outputBuffer.setOutputFile(samOutPtr, &mySamHeader); runStatus = handleSortedByCoord(samIn, &outputBuffer); // Cleanup the output buffer. if(!outputBuffer.flushAll()) { std::cerr << "ERROR: Failed to flush the output buffer\n"; runStatus = SamStatus::FAIL_IO; } } else { runStatus = handleSortedByCoord(samIn, NULL); } } if(runStatus != SamStatus::SUCCESS) { break; } // Close the input file, it will be reopened if there are // multiple steps. samIn.Close(); if(samOutPtr != NULL) { samOutPtr->Close(); delete samOutPtr; samOutPtr = NULL; } } // Done processing. // Print Stats myOverlapHandler->printStats(); if(myNumMateFailures != 0) { std::cerr << "WARNING: did not find expected overlapping mates for " << myNumMateFailures << " records." << std::endl; } if(myNumPoolFail != 0) { // Had to skip clipping some records due to running out of // memory and not being able to wait for the mate. std::cerr << "WARNING: " << myNumPoolFail << " record pool failures\n"; if(myNumPoolFailNoHandle != 0) { std::cerr << "Due to hitting the max record poolSize, skipped handling " << myNumPoolFailNoHandle << " records." << std::endl; } if(myNumPoolFailHandled != 0) { std::cerr << "Due to hitting the max record poolSize, default handled " << myNumPoolFailHandled << " records." << std::endl; } if(myNumOutOfOrder != 0) { std::cerr << "WARNING: Resulting File out of Order by " << myNumOutOfOrder << " records.\n"; } } if(runStatus == SamStatus::SUCCESS) { if(myNumPoolFail == 0) { std::cerr << "Completed ClipOverlap Successfully.\n"; } else { runStatus = SamStatus::NO_MORE_RECS; std::cerr << "Completed ClipOverlap with WARNINGS.\n"; } } else { std::cerr << "Failed to complete ClipOverlap.\n"; } return(runStatus); }
void testFlagRead(const char* fileName) { SamFile inSam; SamFileHeader samHeader; SamRecord samRecord; //////////////////////////////////////////////////////////// // Required flag 0x48 (only flag 73 matches) // Exclude nothing assert(inSam.OpenForRead(fileName)); assert(inSam.ReadHeader(samHeader)); validateHeader(samHeader); inSam.SetReadFlags(0x48, 0x0); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead1(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == false); inSam.Close(); //////////////////////////////////////////////////////////// // No required flags. // Exclude 0x48. This leaves just the one read with flag 133. assert(inSam.OpenForRead(fileName)); assert(inSam.ReadHeader(samHeader)); validateHeader(samHeader); inSam.SetReadFlags(0x0, 0x48); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead2(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == false); inSam.Close(); //////////////////////////////////////////////////////////// // Required flag 0x40 // Exclude 0x48. // This will not find any records since the exclude and required conflict. assert(inSam.OpenForRead(fileName)); assert(inSam.ReadHeader(samHeader)); validateHeader(samHeader); inSam.SetReadFlags(0x40, 0x48); assert(inSam.ReadRecord(samHeader, samRecord) == false); inSam.Close(); //////////////////////////////////////////////////////////// // Required flag 0x4 // Exclude 0x8. // Only finds flag 133. assert(inSam.OpenForRead(fileName)); assert(inSam.ReadHeader(samHeader)); validateHeader(samHeader); inSam.SetReadFlags(0x4, 0x8); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead2(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == false); inSam.Close(); //////////////////////////////////////////////////////////// // Required flag 0x4 // Exclude nothing // Finds flags 133 & 141. assert(inSam.OpenForRead(fileName)); assert(inSam.ReadHeader(samHeader)); validateHeader(samHeader); inSam.SetReadFlags(0x4, 0x0); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead2(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead8(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead10(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == false); inSam.Close(); }
// main function int TrimBam::execute(int argc, char ** argv) { SamFile samIn; SamFile samOut; int numTrimBaseL = 0; int numTrimBaseR = 0; bool noeof = false; bool ignoreStrand = false; bool noPhoneHome = false; std::string inName = ""; std::string outName = ""; if ( argc < 5 ) { usage(); std::cerr << "ERROR: Incorrect number of parameters specified\n"; return(-1); } inName = argv[2]; outName = argv[3]; static struct option getopt_long_options[] = { // Input options { "left", required_argument, NULL, 'L'}, { "right", required_argument, NULL, 'R'}, { "ignoreStrand", no_argument, NULL, 'i'}, { "noeof", no_argument, NULL, 'n'}, { "noPhoneHome", no_argument, NULL, 'p'}, { "nophonehome", no_argument, NULL, 'P'}, { "phoneHomeThinning", required_argument, NULL, 't'}, { "phonehomethinning", required_argument, NULL, 'T'}, { NULL, 0, NULL, 0 }, }; int argIndex = 4; if(argv[argIndex][0] != '-') { // This is the number of bases to trim off both sides // so convert to a number. numTrimBaseL = atoi(argv[argIndex]); numTrimBaseR = numTrimBaseL; ++argIndex; } int c = 0; int n_option_index = 0; // Process any additional parameters while ( ( c = getopt_long(argc, argv, "L:R:in", getopt_long_options, &n_option_index) ) != -1 ) { switch(c) { case 'L': numTrimBaseL = atoi(optarg); break; case 'R': numTrimBaseR = atoi(optarg); break; case 'i': ignoreStrand = true; break; case 'n': noeof = true; break; case 'p': case 'P': noPhoneHome = true; break; case 't': case 'T': PhoneHome::allThinning = atoi(optarg); break; default: fprintf(stderr,"ERROR: Unrecognized option %s\n", getopt_long_options[n_option_index].name); return(-1); } } if(!noPhoneHome) { PhoneHome::checkVersion(getProgramName(), VERSION); } if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } if ( ! samIn.OpenForRead(inName.c_str()) ) { fprintf(stderr, "***Problem opening %s\n",inName.c_str()); return(-1); } if(!samOut.OpenForWrite(outName.c_str())) { fprintf(stderr, "%s\n", samOut.GetStatusMessage()); return(samOut.GetStatus()); } fprintf(stderr,"Arguments in effect: \n"); fprintf(stderr,"\tInput file : %s\n",inName.c_str()); fprintf(stderr,"\tOutput file : %s\n",outName.c_str()); if(numTrimBaseL == numTrimBaseR) { fprintf(stderr,"\t#Bases to trim from each side : %d\n", numTrimBaseL); } else { fprintf(stderr,"\t#Bases to trim from the left of forward strands : %d\n", numTrimBaseL); fprintf(stderr,"\t#Bases to trim from the right of forward strands: %d\n", numTrimBaseR); if(!ignoreStrand) { // By default, reverse strands are treated the opposite. fprintf(stderr,"\t#Bases to trim from the left of reverse strands : %d\n", numTrimBaseR); fprintf(stderr,"\t#Bases to trim from the right of reverse strands : %d\n", numTrimBaseL); } else { // ignore strand, treating forward & reverse strands the same fprintf(stderr,"\t#Bases to trim from the left of reverse strands : %d\n", numTrimBaseL); fprintf(stderr,"\t#Bases to trim from the right of reverse strands : %d\n", numTrimBaseR); } } // Read the sam header. SamFileHeader samHeader; if(!samIn.ReadHeader(samHeader)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } // Write the sam header. if(!samOut.WriteHeader(samHeader)) { fprintf(stderr, "%s\n", samOut.GetStatusMessage()); return(samOut.GetStatus()); } SamRecord samRecord; char seq[65536]; char qual[65536]; int i, len; // Keep reading records until ReadRecord returns false. while(samIn.ReadRecord(samHeader, samRecord)) { // Successfully read a record from the file, so write it. strcpy(seq,samRecord.getSequence()); strcpy(qual,samRecord.getQuality()); // Number of bases to trim from the left/right, // set based on ignoreStrand flag and strand info. int trimLeft = numTrimBaseL; int trimRight = numTrimBaseR; if(!ignoreStrand) { if(SamFlag::isReverse(samRecord.getFlag())) { // We are reversing the reverse reads, // so swap the left & right trim counts. trimRight = numTrimBaseL; trimLeft = numTrimBaseR; } } len = strlen(seq); // Do not trim if sequence is '*' if ( strcmp(seq, "*") != 0 ) { bool qualValue = true; if(strcmp(qual, "*") == 0) { qualValue = false; } int qualLen = strlen(qual); if ( (qualLen != len) && qualValue ) { fprintf(stderr,"ERROR: Sequence and Quality have different length\n"); return(-1); } if ( len < (trimLeft + trimRight) ) { // Read Length is less than the total number of bases to trim, // so trim the entire read. for(i=0; i < len; ++i) { seq[i] = 'N'; if ( qualValue ) { qual[i] = '!'; } } } else { // Read Length is larger than the total number of bases to trim, // so trim from the left, then from the right. for(i=0; i < trimLeft; ++i) { // Trim the bases from the left. seq[i] = 'N'; if ( qualValue ) { qual[i] = '!'; } } for(i = 0; i < trimRight; i++) { seq[len-i-1] = 'N'; if(qualValue) { qual[len-i-1] = '!'; } } } samRecord.setSequence(seq); samRecord.setQuality(qual); } if(!samOut.WriteRecord(samHeader, samRecord)) { // Failed to write a record. fprintf(stderr, "Failure in writing record %s\n", samOut.GetStatusMessage()); return(-1); } } if(samIn.GetStatus() != SamStatus::NO_MORE_RECS) { // Failed to read a record. fprintf(stderr, "%s\n", samIn.GetStatusMessage()); } std::cerr << std::endl << "Number of records read = " << samIn.GetCurrentRecordCount() << std::endl; std::cerr << "Number of records written = " << samOut.GetCurrentRecordCount() << std::endl; if(samIn.GetStatus() != SamStatus::NO_MORE_RECS) { // Failed reading a record. return(samIn.GetStatus()); } // Since the reads were successful, return the status based samIn.Close(); samOut.Close(); return 0; }
/* if a discordant read is mapped to MEI (overlap with MEI coord) add centor of ( anchor_end + 3*avr_ins_var ) skip unmap & clip check 3 types at the same time */ void Sites::AddDiscoverSiteFromSingleBam( SingleSampleInfo & si ) { /*for(int i=0;i<NMEI; i++) { std::cout << "m=" << i << ": "; for(map<string, map<int, bool> >::iterator t=meiCoord[m].begin(); t!=meiCoord[m].end(); t++) std::cout << t->first << "->" << t->second.size() << " "; std::cout << std::endl; }*/ avr_read_length = si.avr_read_length; avr_ins_size = si.avr_ins_size; min_read_length = avr_read_length / 3; current_depth = si.depth; // total_depth += current_depth; resetDepthAddFlag(); SamFile bam; SamFileHeader bam_header; OpenBamAndBai( bam,bam_header, si.bam_name ); for( int i=0; i<pchr_list->size(); i++ ) { string chr = (*pchr_list)[i]; // if ( !single_chr.empty() && chr.compare(single_chr)!=0 ) // continue; if ( siteList.find(chr) == siteList.end() ) siteList[chr].clear(); // map<string, map<int, SingleSite> >::iterator pchr = siteList[m].find(chr); // map<string, map<int, bool> >::iterator coord_chr_ptr = meiCoord[m].find(chr); // if (coord_chr_ptr == meiCoord[m].end()) // continue; bool section_status; if (range_start<0) { // no range section_status = bam.SetReadSection( chr.c_str() ); if (!section_status) { string str = "Cannot set read section at chr " + chr; morphWarning( str ); } } else { // set range section_status = bam.SetReadSection( chr.c_str(), range_start, range_end ); if (!section_status) { string str = "Cannot set read section at chr " + chr + " " + std::to_string(range_start) + "-" + std::to_string(range_end); morphWarning( str ); } } // DO ADDING // if (siteList[chr].empty()) // p_reach_last = 1; // else { // p_reach_last = 0; pnearest = siteList[chr].begin(); // } SingleSite new_site; // temporary cluster. will be added to map later. new_site.depth = current_depth; bool start_new = 1; // check if need to add new_site to map and start new new_site SamRecord rec; int between = 0; // count #reads after new_site.end. If end changed, add it to rcount and reset to zero while( bam.ReadRecord( bam_header, rec ) ) { if (!start_new) { if (rec.get1BasedPosition() >= new_site.end) between++; else new_site.rcount++; } if (rec.getFlag() & 0x2) continue; if ( OneEndUnmap( rec.getFlag() ) ) continue; if ( IsSupplementary(rec.getFlag()) ) continue; if ( rec.getReadLength() < min_read_length ) continue; if ( rec.getMapQuality() < MIN_QUALITY ) continue; if (chr.compare(rec.getMateReferenceName())==0 && rec.getInsertSize() < abs(avr_ins_size*2)) continue; bool is_mei = 0; vector<bool> is_in_coord; is_in_coord.resize(3, 0); for(int m=0; m<NMEI; m++) { map<string, map<int, bool> >::iterator coord_chr_ptr = meiCoord[m].find(rec.getMateReferenceName()); if (coord_chr_ptr == meiCoord[m].end()) is_in_coord[m] = 0; else is_in_coord[m] = isWithinCoord( rec.get1BasedMatePosition(), coord_chr_ptr->second ); // within MEI coord if (is_in_coord[m]) is_mei = 1; } if (!is_mei) continue; if (start_new) { setNewCluster( is_in_coord, new_site,rec); start_new = 0; between = 0; } else { // add to existing cluster if ( rec.get1BasedPosition() > new_site.end + avr_ins_size ) { // start new coord addClusterToMap(new_site, siteList[chr]); setNewCluster( is_in_coord, new_site, rec); start_new = 0; between = 0; } else { addToCurrentCluster( is_in_coord, new_site, rec); new_site.rcount += between; between = 0; } } } // add last one if (!start_new) addClusterToMap(new_site, siteList[chr]); } bam.Close(); }