int ClipOverlap::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String outFile = ""; String storeOrig = ""; bool readName = false; bool noRNValidate = false; bool stats = false; int poolSize = DEFAULT_POOL_SIZE; bool unmapped = false; bool noeof = false; bool params = false; String excludeFlags = "0xF0C"; // TODO, cleanup legacy parameters ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inFile) LONG_STRINGPARAMETER("out", &outFile) LONG_PARAMETER_GROUP("Optional Parameters") LONG_STRINGPARAMETER("storeOrig", &storeOrig) LONG_PARAMETER("readName", &readName) LONG_PARAMETER ("noRNValidate", &noRNValidate) LONG_PARAMETER ("stats", &stats) LONG_PARAMETER ("overlapsOnly", &myOverlapsOnly) LONG_STRINGPARAMETER ("excludeFlags", &excludeFlags) LONG_PARAMETER("unmapped", &unmapped) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PARAMETER_GROUP("Coordinate Processing Optional Parameters") LONG_INTPARAMETER("poolSize", &poolSize) LONG_PARAMETER("poolSkipOverlap", &myPoolSkipOverlap) LONG_PHONEHOME(VERSION) BEGIN_LEGACY_PARAMETERS() LONG_PARAMETER ("clipsOnly", &myOverlapsOnly) LONG_PARAMETER("poolSkipClip", &myPoolSkipOverlap) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { printUsage(std::cerr); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } // Check to see if the out file was specified, if not, report an error. if(outFile == "") { printUsage(std::cerr); inputParameters.Status(); // Out file was not specified but it is mandatory. std::cerr << "--out is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } if((storeOrig.Length() != 0) && (storeOrig.Length() != 2)) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "--storeOrig tag name must be 2 characters.\n"; return(-1); } myOverlapHandler = new OverlapClipLowerBaseQual(); if(myOverlapHandler == NULL) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "Failed to allocate the overlap handler\n"; return(-1); } if(unmapped) { myOverlapHandler->markAsUnmapped(); } // Setup the overlap handler. myOverlapHandler->keepStats(stats); if(storeOrig.Length() != 0) { myOverlapHandler->storeOrigCigar(storeOrig); } myIntExcludeFlags = excludeFlags.AsInteger(); if(params) { inputParameters.Status(); } // For each step process the file. // Open the files & read/write the sam header. SamStatus::Status runStatus = SamStatus::SUCCESS; for(int i = 1; i <= myOverlapHandler->numSteps(); i++) { // Open the file for reading. mySamHeader.resetHeader(); SamFile samIn(inFile, SamFile::READ, &mySamHeader); SamFile* samOutPtr = NULL; // Check if writing, if so, open the output file. if(i == myOverlapHandler->numSteps()) { samOutPtr = new SamFile(outFile, SamFile::WRITE, &mySamHeader); } if(readName) { if(!noRNValidate) { samIn.setSortedValidation(SamFile::QUERY_NAME); } runStatus = handleSortedByReadName(samIn, samOutPtr); } else { // Coordinate sorted, so work with the pools. samIn.setSortedValidation(SamFile::COORDINATE); myPool.setMaxAllocatedRecs(poolSize); // Reset the number of failures myNumMateFailures = 0; myNumPoolFail = 0; myNumPoolFailNoHandle = 0; myNumPoolFailHandled = 0; myNumOutOfOrder = 0; // Run by coordinate if(samOutPtr != NULL) { // Setup the output buffer for writing. SamCoordOutput outputBuffer(myPool); outputBuffer.setOutputFile(samOutPtr, &mySamHeader); runStatus = handleSortedByCoord(samIn, &outputBuffer); // Cleanup the output buffer. if(!outputBuffer.flushAll()) { std::cerr << "ERROR: Failed to flush the output buffer\n"; runStatus = SamStatus::FAIL_IO; } } else { runStatus = handleSortedByCoord(samIn, NULL); } } if(runStatus != SamStatus::SUCCESS) { break; } // Close the input file, it will be reopened if there are // multiple steps. samIn.Close(); if(samOutPtr != NULL) { samOutPtr->Close(); delete samOutPtr; samOutPtr = NULL; } } // Done processing. // Print Stats myOverlapHandler->printStats(); if(myNumMateFailures != 0) { std::cerr << "WARNING: did not find expected overlapping mates for " << myNumMateFailures << " records." << std::endl; } if(myNumPoolFail != 0) { // Had to skip clipping some records due to running out of // memory and not being able to wait for the mate. std::cerr << "WARNING: " << myNumPoolFail << " record pool failures\n"; if(myNumPoolFailNoHandle != 0) { std::cerr << "Due to hitting the max record poolSize, skipped handling " << myNumPoolFailNoHandle << " records." << std::endl; } if(myNumPoolFailHandled != 0) { std::cerr << "Due to hitting the max record poolSize, default handled " << myNumPoolFailHandled << " records." << std::endl; } if(myNumOutOfOrder != 0) { std::cerr << "WARNING: Resulting File out of Order by " << myNumOutOfOrder << " records.\n"; } } if(runStatus == SamStatus::SUCCESS) { if(myNumPoolFail == 0) { std::cerr << "Completed ClipOverlap Successfully.\n"; } else { runStatus = SamStatus::NO_MORE_RECS; std::cerr << "Completed ClipOverlap with WARNINGS.\n"; } } else { std::cerr << "Failed to complete ClipOverlap.\n"; } return(runStatus); }
int Dedup_LowMem::execute(int argc, char** argv) { /* -------------------------------- * process the arguments * -------------------------------*/ String inFile, outFile, logFile; myDoRecab = false; bool removeFlag = false; bool verboseFlag = false; myForceFlag = false; myNumMissingMate = 0; myMinQual = DEFAULT_MIN_QUAL; String excludeFlags = "0xB04"; uint16_t intExcludeFlags = 0; bool noeof = false; bool params = false; LongParamContainer parameters; parameters.addGroup("Required Parameters"); parameters.addString("in", &inFile); parameters.addString("out", &outFile); parameters.addGroup("Optional Parameters"); parameters.addInt("minQual", & myMinQual); parameters.addString("log", &logFile); parameters.addBool("oneChrom", &myOneChrom); parameters.addBool("recab", &myDoRecab); parameters.addBool("rmDups", &removeFlag); parameters.addBool("force", &myForceFlag); parameters.addString("excludeFlags", &excludeFlags); parameters.addBool("verbose", &verboseFlag); parameters.addBool("noeof", &noeof); parameters.addBool("params", ¶ms); parameters.addPhoneHome(VERSION); myRecab.addRecabSpecificParameters(parameters); ParameterList inputParameters; inputParameters.Add(new LongParameters ("Input Parameters", parameters.getLongParameterList())); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } if(inFile.IsEmpty()) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "Specify an input file" << std::endl; return EXIT_FAILURE; } if(outFile.IsEmpty()) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "Specify an output file" << std::endl; return EXIT_FAILURE; } intExcludeFlags = excludeFlags.AsInteger(); if(myForceFlag && SamFlag::isDuplicate(intExcludeFlags)) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "Cannot specify --force and Duplicate in the excludeFlags. Since --force indicates to override" << " previous duplicate setting and the excludeFlags says to skip those, you can't do both.\n"; return EXIT_FAILURE; } if(!SamFlag::isSecondary(intExcludeFlags)) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "ERROR: Secondary reads must be excluded, edit --excludeFlags to include 0x0100\n"; return EXIT_FAILURE; } if(!(intExcludeFlags & SamFlag::SUPPLEMENTARY_ALIGNMENT)) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "ERROR: Supplementary reads must be excluded, edit --excludeFlags to include 0x0800\n"; return EXIT_FAILURE; } if(logFile.IsEmpty()) { logFile = outFile + ".log"; } if(myDoRecab) { int status = myRecab.processRecabParam(); if(status != 0) { inputParameters.Status(); return(status); } } if(params) { inputParameters.Status(); } Logger::gLogger = new Logger(logFile.c_str(), verboseFlag); /* ------------------------------------------------------------------- * The arguments are processed. Prepare the input BAM file, * instantiate dedup_LowMem, and construct the read group library map * ------------------------------------------------------------------*/ SamFile samIn; samIn.OpenForRead(inFile.c_str()); // If the file isn't sorted it will throw an exception. samIn.setSortedValidation(SamFile::COORDINATE); SamFileHeader header; samIn.ReadHeader(header); buildReadGroupLibraryMap(header); lastReference = -1; lastCoordinate = -1; // for keeping some basic statistics uint32_t recordCount = 0; uint32_t pairedCount = 0; uint32_t properPairCount = 0; uint32_t unmappedCount = 0; uint32_t reverseCount = 0; uint32_t qualCheckFailCount = 0; uint32_t secondaryCount = 0; uint32_t supplementaryCount = 0; uint32_t excludedCount = 0; // Now we start reading records SamRecord* recordPtr; SamStatus::Status returnStatus = SamStatus::SUCCESS; while(returnStatus == SamStatus::SUCCESS) { recordPtr = mySamPool.getRecord(); if(recordPtr == NULL) { std::cerr << "Failed to allocate enough records\n"; return(-1); } if(!samIn.ReadRecord(header, *recordPtr)) { returnStatus = samIn.GetStatus(); continue; } // Take note of properties of this record int flag = recordPtr->getFlag(); if(SamFlag::isPaired(flag)) ++pairedCount; if(SamFlag::isProperPair(flag)) ++properPairCount; if(SamFlag::isReverse(flag)) ++reverseCount; if(SamFlag::isQCFailure(flag)) ++qualCheckFailCount; if(SamFlag::isSecondary(flag)) ++secondaryCount; if(flag & SamFlag::SUPPLEMENTARY_ALIGNMENT) ++supplementaryCount; if(!SamFlag::isMapped(flag)) ++unmappedCount; // put the record in the appropriate maps: // single reads go in myFragmentMap // paired reads go in myPairedMap recordCount = samIn.GetCurrentRecordCount(); // if we have moved to a new position, look back at previous reads for duplicates if (hasPositionChanged(*recordPtr)) { cleanupPriorReads(recordPtr); } // Determine if this read should be checked for duplicates. if((!SamFlag::isMapped(flag)) || ((flag & intExcludeFlags) != 0)) { ++excludedCount; // No deduping done on this record, but still build the recab table. if(myDoRecab) { myRecab.processReadBuildTable(*recordPtr); } // Nothing more to do with this record, so // release the pointer. mySamPool.releaseRecord(recordPtr); } else { if(SamFlag::isDuplicate(flag) && !myForceFlag) { // Error: Marked duplicates, and duplicates aren't excluded. Logger::gLogger->error("There are records already duplicate marked."); Logger::gLogger->error("Use -f to clear the duplicate flag and start the dedup_LowMem procedure over"); } checkDups(*recordPtr, recordCount); mySamPool.releaseRecord(recordPtr); } // let the user know we're not napping if (verboseFlag && (recordCount % 100000 == 0)) { Logger::gLogger->writeLog("recordCount=%u singleKeyMap=%u pairedKeyMap=%u, dictSize=%u", recordCount, myFragmentMap.size(), myPairedMap.size(), myMateMap.size()); } } // we're finished reading record so clean up the duplicate search and // close the input file cleanupPriorReads(NULL); samIn.Close(); // print some statistics Logger::gLogger->writeLog("--------------------------------------------------------------------------"); Logger::gLogger->writeLog("SUMMARY STATISTICS OF THE READS"); Logger::gLogger->writeLog("Total number of reads: %u",recordCount); Logger::gLogger->writeLog("Total number of paired-end reads: %u", pairedCount); Logger::gLogger->writeLog("Total number of properly paired reads: %u", properPairCount); Logger::gLogger->writeLog("Total number of unmapped reads: %u", unmappedCount); Logger::gLogger->writeLog("Total number of reverse strand mapped reads: %u", reverseCount); Logger::gLogger->writeLog("Total number of QC-failed reads: %u", qualCheckFailCount); Logger::gLogger->writeLog("Total number of secondary reads: %u", secondaryCount); Logger::gLogger->writeLog("Total number of supplementary reads: %u", supplementaryCount); Logger::gLogger->writeLog("Size of singleKeyMap (must be zero): %u", myFragmentMap.size()); Logger::gLogger->writeLog("Size of pairedKeyMap (must be zero): %u", myPairedMap.size()); Logger::gLogger->writeLog("Total number of missing mates: %u", myNumMissingMate); Logger::gLogger->writeLog("Total number of reads excluded from duplicate checking: %u", excludedCount); Logger::gLogger->writeLog("--------------------------------------------------------------------------"); Logger::gLogger->writeLog("Sorting the indices of %d duplicated records", myDupList.size()); // sort the indices of duplicate records std::sort(myDupList.begin(), myDupList.end(), std::less<uint32_t> ()); // get ready to write the output file by making a second pass // through the input file samIn.OpenForRead(inFile.c_str()); samIn.ReadHeader(header); SamFile samOut; samOut.OpenForWrite(outFile.c_str()); samOut.WriteHeader(header); // If we are recalibrating, output the model information. if(myDoRecab) { myRecab.modelFitPrediction(outFile); } // an iterator to run through the duplicate indices int currentDupIndex = 0; bool moreDups = !myDupList.empty(); // let the user know what we're doing Logger::gLogger->writeLog("\nWriting %s", outFile.c_str()); // count the duplicate records as a check uint32_t singleDuplicates(0), pairedDuplicates(0); // start reading records and writing them out SamRecord record; while(samIn.ReadRecord(header, record)) { uint32_t currentIndex = samIn.GetCurrentRecordCount(); bool foundDup = moreDups && (currentIndex == myDupList[currentDupIndex]); // modify the duplicate flag and write out the record, // if it's appropriate int flag = record.getFlag(); if (foundDup) { // this record is a duplicate, so mark it. record.setFlag( flag | 0x400 ); currentDupIndex++; // increment duplicate counters to verify we found them all if ( ( ( flag & 0x0001 ) == 0 ) || ( flag & 0x0008 ) ) { // unpaired or mate unmapped singleDuplicates++; } else { pairedDuplicates++; } // recalibrate if necessary. if(myDoRecab) { myRecab.processReadApplyTable(record); } // write the record if we are not removing duplicates if (!removeFlag ) samOut.WriteRecord(header, record); } else { if(myForceFlag) { // this is not a duplicate we've identified but we want to // remove any duplicate marking record.setFlag( flag & 0xfffffbff ); // unmark duplicate } // Not a duplicate, so recalibrate if necessary. if(myDoRecab) { myRecab.processReadApplyTable(record); } samOut.WriteRecord(header, record); } // Let the user know we're still here if (verboseFlag && (currentIndex % 100000 == 0)) { Logger::gLogger->writeLog("recordCount=%u", currentIndex); } } // We're done. Close the files and print triumphant messages. samIn.Close(); samOut.Close(); Logger::gLogger->writeLog("Successfully %s %u unpaired and %u paired duplicate reads", removeFlag ? "removed" : "marked" , singleDuplicates, pairedDuplicates/2); Logger::gLogger->writeLog("\nDedup_LowMem complete!"); return 0; }
void testAsInteger() { // Test AsInteger with ints & negative ints. String intString = "123"; String negIntString = "-123"; assert(intString.AsInteger() == 123); assert(negIntString.AsInteger() == -123); // Run the same tests with AsInteger that returns a bool and takes // in a long to set. long retValue; assert(intString.AsInteger(retValue)); assert(retValue == 123); assert(negIntString.AsInteger(retValue)); assert(retValue == -123); // Strings that are not integers // For AsInteger, it returns just the starting integer portion. // For AsInteger that returns a bool and a long set, it returns false // and sets the long to the starting int. String nonIntString = "abd"; assert(nonIntString.AsInteger() == 0); assert(!nonIntString.AsInteger(retValue)); nonIntString = "12ab33"; assert(nonIntString.AsInteger() == 12); assert(!nonIntString.AsInteger(retValue)); assert(retValue == 12); nonIntString = "as12ab3a4sd"; assert(nonIntString.AsInteger() == 0); assert(!nonIntString.AsInteger(retValue)); assert(retValue == 0); // Negatives are only recognized as the first characer. nonIntString = "-12ab3a4sd"; assert(nonIntString.AsInteger() == -12); assert(!nonIntString.AsInteger(retValue)); assert(retValue == -12); nonIntString = "-as12ab3a4sd"; assert(nonIntString.AsInteger() == 0); assert(!nonIntString.AsInteger(retValue)); assert(retValue == 0); nonIntString = "as-12ab3a4sd"; assert(nonIntString.AsInteger() == 0); assert(!nonIntString.AsInteger(retValue)); assert(retValue == 0); nonIntString = "as12-ab3a4sd"; assert(nonIntString.AsInteger() == 0); assert(!nonIntString.AsInteger(retValue)); assert(retValue == 0); }