void modify::modifyTags() { assert(samIn.OpenForRead(myFilename.c_str())); // Read the sam header. assert(samIn.ReadHeader(samHeader)); SamFile samOut; SamFile bamOut; std::string inputType = myFilename.substr(myFilename.find_last_of('.')); std::string outFileBase = "results/updateTagFrom"; if(inputType == ".bam") { outFileBase += "Bam"; } else { outFileBase += "Sam"; } std::string outFile = outFileBase + ".sam"; assert(samOut.OpenForWrite(outFile.c_str())); outFile = outFileBase + ".bam"; assert(bamOut.OpenForWrite(outFile.c_str())); assert(samOut.WriteHeader(samHeader)); assert(bamOut.WriteHeader(samHeader)); int count = 0; // Read the records. while(samIn.ReadRecord(samHeader, samRecord)) { if(count == 0) { assert(samRecord.rmTag("MD", 'Z')); } else if(count == 2) { assert(samRecord.rmTags("XT:A;MD:Z;AB:c;NM:i")); } else if(count == 4) { assert(samRecord.rmTags("MD:Z,AB:c,NM:i")); } assert(bamOut.WriteRecord(samHeader, samRecord)); assert(samOut.WriteRecord(samHeader, samRecord)); ++count; } }
int Dedup_LowMem::execute(int argc, char** argv) { /* -------------------------------- * process the arguments * -------------------------------*/ String inFile, outFile, logFile; myDoRecab = false; bool removeFlag = false; bool verboseFlag = false; myForceFlag = false; myNumMissingMate = 0; myMinQual = DEFAULT_MIN_QUAL; String excludeFlags = "0xB04"; uint16_t intExcludeFlags = 0; bool noeof = false; bool params = false; LongParamContainer parameters; parameters.addGroup("Required Parameters"); parameters.addString("in", &inFile); parameters.addString("out", &outFile); parameters.addGroup("Optional Parameters"); parameters.addInt("minQual", & myMinQual); parameters.addString("log", &logFile); parameters.addBool("oneChrom", &myOneChrom); parameters.addBool("recab", &myDoRecab); parameters.addBool("rmDups", &removeFlag); parameters.addBool("force", &myForceFlag); parameters.addString("excludeFlags", &excludeFlags); parameters.addBool("verbose", &verboseFlag); parameters.addBool("noeof", &noeof); parameters.addBool("params", ¶ms); parameters.addPhoneHome(VERSION); myRecab.addRecabSpecificParameters(parameters); ParameterList inputParameters; inputParameters.Add(new LongParameters ("Input Parameters", parameters.getLongParameterList())); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } if(inFile.IsEmpty()) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "Specify an input file" << std::endl; return EXIT_FAILURE; } if(outFile.IsEmpty()) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "Specify an output file" << std::endl; return EXIT_FAILURE; } intExcludeFlags = excludeFlags.AsInteger(); if(myForceFlag && SamFlag::isDuplicate(intExcludeFlags)) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "Cannot specify --force and Duplicate in the excludeFlags. Since --force indicates to override" << " previous duplicate setting and the excludeFlags says to skip those, you can't do both.\n"; return EXIT_FAILURE; } if(!SamFlag::isSecondary(intExcludeFlags)) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "ERROR: Secondary reads must be excluded, edit --excludeFlags to include 0x0100\n"; return EXIT_FAILURE; } if(!(intExcludeFlags & SamFlag::SUPPLEMENTARY_ALIGNMENT)) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "ERROR: Supplementary reads must be excluded, edit --excludeFlags to include 0x0800\n"; return EXIT_FAILURE; } if(logFile.IsEmpty()) { logFile = outFile + ".log"; } if(myDoRecab) { int status = myRecab.processRecabParam(); if(status != 0) { inputParameters.Status(); return(status); } } if(params) { inputParameters.Status(); } Logger::gLogger = new Logger(logFile.c_str(), verboseFlag); /* ------------------------------------------------------------------- * The arguments are processed. Prepare the input BAM file, * instantiate dedup_LowMem, and construct the read group library map * ------------------------------------------------------------------*/ SamFile samIn; samIn.OpenForRead(inFile.c_str()); // If the file isn't sorted it will throw an exception. samIn.setSortedValidation(SamFile::COORDINATE); SamFileHeader header; samIn.ReadHeader(header); buildReadGroupLibraryMap(header); lastReference = -1; lastCoordinate = -1; // for keeping some basic statistics uint32_t recordCount = 0; uint32_t pairedCount = 0; uint32_t properPairCount = 0; uint32_t unmappedCount = 0; uint32_t reverseCount = 0; uint32_t qualCheckFailCount = 0; uint32_t secondaryCount = 0; uint32_t supplementaryCount = 0; uint32_t excludedCount = 0; // Now we start reading records SamRecord* recordPtr; SamStatus::Status returnStatus = SamStatus::SUCCESS; while(returnStatus == SamStatus::SUCCESS) { recordPtr = mySamPool.getRecord(); if(recordPtr == NULL) { std::cerr << "Failed to allocate enough records\n"; return(-1); } if(!samIn.ReadRecord(header, *recordPtr)) { returnStatus = samIn.GetStatus(); continue; } // Take note of properties of this record int flag = recordPtr->getFlag(); if(SamFlag::isPaired(flag)) ++pairedCount; if(SamFlag::isProperPair(flag)) ++properPairCount; if(SamFlag::isReverse(flag)) ++reverseCount; if(SamFlag::isQCFailure(flag)) ++qualCheckFailCount; if(SamFlag::isSecondary(flag)) ++secondaryCount; if(flag & SamFlag::SUPPLEMENTARY_ALIGNMENT) ++supplementaryCount; if(!SamFlag::isMapped(flag)) ++unmappedCount; // put the record in the appropriate maps: // single reads go in myFragmentMap // paired reads go in myPairedMap recordCount = samIn.GetCurrentRecordCount(); // if we have moved to a new position, look back at previous reads for duplicates if (hasPositionChanged(*recordPtr)) { cleanupPriorReads(recordPtr); } // Determine if this read should be checked for duplicates. if((!SamFlag::isMapped(flag)) || ((flag & intExcludeFlags) != 0)) { ++excludedCount; // No deduping done on this record, but still build the recab table. if(myDoRecab) { myRecab.processReadBuildTable(*recordPtr); } // Nothing more to do with this record, so // release the pointer. mySamPool.releaseRecord(recordPtr); } else { if(SamFlag::isDuplicate(flag) && !myForceFlag) { // Error: Marked duplicates, and duplicates aren't excluded. Logger::gLogger->error("There are records already duplicate marked."); Logger::gLogger->error("Use -f to clear the duplicate flag and start the dedup_LowMem procedure over"); } checkDups(*recordPtr, recordCount); mySamPool.releaseRecord(recordPtr); } // let the user know we're not napping if (verboseFlag && (recordCount % 100000 == 0)) { Logger::gLogger->writeLog("recordCount=%u singleKeyMap=%u pairedKeyMap=%u, dictSize=%u", recordCount, myFragmentMap.size(), myPairedMap.size(), myMateMap.size()); } } // we're finished reading record so clean up the duplicate search and // close the input file cleanupPriorReads(NULL); samIn.Close(); // print some statistics Logger::gLogger->writeLog("--------------------------------------------------------------------------"); Logger::gLogger->writeLog("SUMMARY STATISTICS OF THE READS"); Logger::gLogger->writeLog("Total number of reads: %u",recordCount); Logger::gLogger->writeLog("Total number of paired-end reads: %u", pairedCount); Logger::gLogger->writeLog("Total number of properly paired reads: %u", properPairCount); Logger::gLogger->writeLog("Total number of unmapped reads: %u", unmappedCount); Logger::gLogger->writeLog("Total number of reverse strand mapped reads: %u", reverseCount); Logger::gLogger->writeLog("Total number of QC-failed reads: %u", qualCheckFailCount); Logger::gLogger->writeLog("Total number of secondary reads: %u", secondaryCount); Logger::gLogger->writeLog("Total number of supplementary reads: %u", supplementaryCount); Logger::gLogger->writeLog("Size of singleKeyMap (must be zero): %u", myFragmentMap.size()); Logger::gLogger->writeLog("Size of pairedKeyMap (must be zero): %u", myPairedMap.size()); Logger::gLogger->writeLog("Total number of missing mates: %u", myNumMissingMate); Logger::gLogger->writeLog("Total number of reads excluded from duplicate checking: %u", excludedCount); Logger::gLogger->writeLog("--------------------------------------------------------------------------"); Logger::gLogger->writeLog("Sorting the indices of %d duplicated records", myDupList.size()); // sort the indices of duplicate records std::sort(myDupList.begin(), myDupList.end(), std::less<uint32_t> ()); // get ready to write the output file by making a second pass // through the input file samIn.OpenForRead(inFile.c_str()); samIn.ReadHeader(header); SamFile samOut; samOut.OpenForWrite(outFile.c_str()); samOut.WriteHeader(header); // If we are recalibrating, output the model information. if(myDoRecab) { myRecab.modelFitPrediction(outFile); } // an iterator to run through the duplicate indices int currentDupIndex = 0; bool moreDups = !myDupList.empty(); // let the user know what we're doing Logger::gLogger->writeLog("\nWriting %s", outFile.c_str()); // count the duplicate records as a check uint32_t singleDuplicates(0), pairedDuplicates(0); // start reading records and writing them out SamRecord record; while(samIn.ReadRecord(header, record)) { uint32_t currentIndex = samIn.GetCurrentRecordCount(); bool foundDup = moreDups && (currentIndex == myDupList[currentDupIndex]); // modify the duplicate flag and write out the record, // if it's appropriate int flag = record.getFlag(); if (foundDup) { // this record is a duplicate, so mark it. record.setFlag( flag | 0x400 ); currentDupIndex++; // increment duplicate counters to verify we found them all if ( ( ( flag & 0x0001 ) == 0 ) || ( flag & 0x0008 ) ) { // unpaired or mate unmapped singleDuplicates++; } else { pairedDuplicates++; } // recalibrate if necessary. if(myDoRecab) { myRecab.processReadApplyTable(record); } // write the record if we are not removing duplicates if (!removeFlag ) samOut.WriteRecord(header, record); } else { if(myForceFlag) { // this is not a duplicate we've identified but we want to // remove any duplicate marking record.setFlag( flag & 0xfffffbff ); // unmark duplicate } // Not a duplicate, so recalibrate if necessary. if(myDoRecab) { myRecab.processReadApplyTable(record); } samOut.WriteRecord(header, record); } // Let the user know we're still here if (verboseFlag && (currentIndex % 100000 == 0)) { Logger::gLogger->writeLog("recordCount=%u", currentIndex); } } // We're done. Close the files and print triumphant messages. samIn.Close(); samOut.Close(); Logger::gLogger->writeLog("Successfully %s %u unpaired and %u paired duplicate reads", removeFlag ? "removed" : "marked" , singleDuplicates, pairedDuplicates/2); Logger::gLogger->writeLog("\nDedup_LowMem complete!"); return 0; }
int Revert::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String outFile = ""; bool cigar = false; bool qual = false; bool noeof = false; bool params = false; bool rmBQ = false; String rmTags = ""; myKeepTags = false; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_STRINGPARAMETER("in", &inFile) LONG_STRINGPARAMETER("out", &outFile) LONG_PARAMETER("cigar", &cigar) LONG_PARAMETER("qual", &qual) LONG_PARAMETER("keepTags", &myKeepTags) LONG_PARAMETER("rmBQ", &rmBQ) LONG_STRINGPARAMETER("rmTags", &rmTags) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } if(outFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--out is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } if(params) { inputParameters.Status(); } // Open the input file for reading. SamFile samIn; samIn.OpenForRead(inFile); // Open the output file for writing. SamFile samOut; samOut.OpenForWrite(outFile); // Read the sam header. SamFileHeader samHeader; samIn.ReadHeader(samHeader); // Write the sam header. samOut.WriteHeader(samHeader); SamRecord samRecord; // Set returnStatus to success. It will be changed to the // failure reason if any of the writes or updates fail. SamStatus::Status returnStatus = SamStatus::SUCCESS; // Keep reading records until ReadRecord returns false. while(samIn.ReadRecord(samHeader, samRecord)) { // Update the cigar & position. if(cigar) { if(!updateCigar(samRecord)) { // Failed to update the cigar & position. fprintf(stderr, "%s\n", samIn.GetStatusMessage()); returnStatus = samIn.GetStatus(); } } if(qual) { if(!updateQual(samRecord)) { // Failed to update the quality. fprintf(stderr, "%s\n", samIn.GetStatusMessage()); returnStatus = samIn.GetStatus(); } } if(rmBQ) { if(!removeBQ(samRecord)) { // Failed to remove BQ. fprintf(stderr, "%s\n", samIn.GetStatusMessage()); returnStatus = samIn.GetStatus(); } } if(rmTags != "") { if(!samRecord.rmTags(rmTags.c_str())) { // Failed to remove the specified tags. fprintf(stderr, "%s\n", samIn.GetStatusMessage()); returnStatus = samIn.GetStatus(); } } // Successfully read a record from the file, so write it. if(!samOut.WriteRecord(samHeader, samRecord)) { // Failed to write a record. fprintf(stderr, "%s\n", samOut.GetStatusMessage()); returnStatus = samOut.GetStatus(); } } std::cerr << std::endl << "Number of records read = " << samIn.GetCurrentRecordCount() << std::endl; std::cerr << "Number of records written = " << samOut.GetCurrentRecordCount() << std::endl; // Since the reads were successful, return the status based // on the status of the writes. If any failed, return // their failure status. return(returnStatus); }
int Convert::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String outFile = ""; String refFile = ""; bool lshift = false; bool noeof = false; bool params = false; bool useBases = false; bool useEquals = false; bool useOrigSeq = false; bool recover = false; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_STRINGPARAMETER("in", &inFile) LONG_STRINGPARAMETER("out", &outFile) LONG_STRINGPARAMETER("refFile", &refFile) LONG_PARAMETER("lshift", &lshift) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("recover", &recover) LONG_PARAMETER("params", ¶ms) LONG_PARAMETER_GROUP("SequenceConversion") EXCLUSIVE_PARAMETER("useBases", &useBases) EXCLUSIVE_PARAMETER("useEquals", &useEquals) EXCLUSIVE_PARAMETER("useOrigSeq", &useOrigSeq) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { printUsage(std::cerr); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } if(outFile == "") { printUsage(std::cerr); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--out is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } // Check to see if the ref file was specified. // Open the reference. GenomeSequence* refPtr = NULL; if(refFile != "") { refPtr = new GenomeSequence(refFile); } SamRecord::SequenceTranslation translation; if((useBases) && (refPtr != NULL)) { translation = SamRecord::BASES; } else if((useEquals) && (refPtr != NULL)) { translation = SamRecord::EQUAL; } else { useOrigSeq = true; translation = SamRecord::NONE; } if(params) { inputParameters.Status(); } // Open the input file for reading. SamFile samIn; if(recover) samIn.setAttemptRecovery(true); samIn.OpenForRead(inFile); // Open the output file for writing. SamFile samOut; samOut.OpenForWrite(outFile); samOut.SetWriteSequenceTranslation(translation); samOut.SetReference(refPtr); // Read the sam header. SamFileHeader samHeader; samIn.ReadHeader(samHeader); // Write the sam header. samOut.WriteHeader(samHeader); SamRecord samRecord; // Set returnStatus to success. It will be changed // to the failure reason if any of the writes fail. SamStatus::Status returnStatus = SamStatus::SUCCESS; while(1) { try { // Keep reading records until ReadRecord returns false. while(samIn.ReadRecord(samHeader, samRecord)) { // left shift if necessary. if(lshift) { samRecord.shiftIndelsLeft(); } // Successfully read a record from the file, so write it. if(!samOut.WriteRecord(samHeader, samRecord)) { // Failed to write a record. fprintf(stderr, "%s\n", samOut.GetStatusMessage()); returnStatus = samOut.GetStatus(); } } break; } catch (std::runtime_error e) { std::cerr << "Caught runtime error: " << e.what() << "\n"; if(!recover) { std::cerr << "Corrupted BAM file detected - consider using --recover option.\n"; break; } std::cerr << "Attempting to resync at next good BGZF block and BAM record.\n"; // XXX need to resync SamFile stream here bool rc = samIn.attemptRecoverySync(checkSignature, SIGNATURE_LENGTH); if(rc) { std::cerr << "Successful resync - some data lost.\n"; continue; // succeeded } std::cerr << "Failed to re-sync on data stream.\n"; break; // failed to resync } } std::cerr << std::endl << "Number of records read = " << samIn.GetCurrentRecordCount() << std::endl; std::cerr << "Number of records written = " << samOut.GetCurrentRecordCount() << std::endl; if(refPtr != NULL) { delete(refPtr); } // Since the reads were successful, return the status based // on the status of the writes. If any failed, return // their failure status. return(returnStatus); }
int main(int argc, char ** argv) { gpLogger = new Logger; static struct option getopt_long_options[] = { // Input options { "fasta", required_argument, NULL, 'f'}, { "in", required_argument, NULL, 'i'}, { "out", required_argument, NULL, 'o'}, { "verbose", no_argument, NULL, 'v'}, { "log", required_argument, NULL, 'l'}, { "clear", no_argument, NULL, 0}, { "AS", required_argument, NULL, 0}, { "UR", required_argument, NULL, 0}, { "SP", required_argument, NULL, 0}, { "HD", required_argument, NULL, 0}, { "RG", required_argument, NULL, 0}, { "PG", required_argument, NULL, 0}, { "checkSQ", no_argument, NULL, 0}, { NULL, 0, NULL, 0 }, }; int n_option_index = 0, c; std::string sAS, sUR, sSP, sFasta, sInFile, sOutFile, sLogFile; bool bClear, bCheckSQ, bVerbose; std::vector<std::string> vsHDHeaders, vsRGHeaders, vsPGHeaders; bCheckSQ = bVerbose = false; bClear = true; while ( (c = getopt_long(argc, argv, "vf:i:o:l:", getopt_long_options, &n_option_index)) != -1 ) { // std::cout << getopt_long_options[n_option_index].name << "\t" << optarg << std::endl; if ( c == 'f' ) { sFasta = optarg; } else if ( c == 'i' ) { sInFile = optarg; } else if ( c == 'o' ) { sOutFile = optarg; } else if ( c == 'v' ) { bVerbose = true; } else if ( c == 'l' ) { sLogFile = optarg; } else if ( strcmp(getopt_long_options[n_option_index].name,"AS") == 0 ) { sAS = optarg; } else if ( strcmp(getopt_long_options[n_option_index].name,"UR") == 0 ) { sUR = optarg; } else if ( strcmp(getopt_long_options[n_option_index].name,"SP") == 0 ) { sSP = optarg; } else if ( strcmp(getopt_long_options[n_option_index].name,"HD") == 0 ) { vsHDHeaders.push_back(optarg); } else if ( strcmp(getopt_long_options[n_option_index].name,"RG") == 0 ) { vsRGHeaders.push_back(optarg); } else if ( strcmp(getopt_long_options[n_option_index].name,"PG") == 0 ) { vsPGHeaders.push_back(optarg); } else if ( strcmp(getopt_long_options[n_option_index].name,"checkSQ") == 0 ) { bCheckSQ = true; } else { std::cerr << "Error: Unrecognized option " << getopt_long_options[n_option_index].name << std::endl; abort(); } } if ( optind < argc ) { printUsage(std::cerr); gpLogger->error("non-option argument %s exist ",argv[optind]); } if ( sInFile.empty() || sOutFile.empty() ) { printUsage(std::cerr); gpLogger->error("Input and output files are required"); } if ( sLogFile.compare("__NONE__") == 0 ) { sLogFile = (sOutFile + ".log"); } gpLogger->open(sLogFile.c_str(), bVerbose); if ( ( bCheckSQ ) && ( sFasta.empty() ) ) { printUsage(std::cerr); gpLogger->error("--checkSQ option must be used with --fasta option"); } // check whether each header line starts with a correct tag checkHeaderStarts(vsHDHeaders, "@HD\t"); checkHeaderStarts(vsRGHeaders, "@RG\t"); checkHeaderStarts(vsPGHeaders, "@PG\t"); gpLogger->write_log("Arguments in effect:"); gpLogger->write_log("\t--in [%s]",sInFile.c_str()); gpLogger->write_log("\t--out [%s]",sOutFile.c_str()); gpLogger->write_log("\t--log [%s]",sLogFile.c_str()); gpLogger->write_log("\t--fasta [%s]",sFasta.c_str()); gpLogger->write_log("\t--AS [%s]",sAS.c_str()); gpLogger->write_log("\t--UR [%s]",sUR.c_str()); gpLogger->write_log("\t--SP [%s]",sSP.c_str()); gpLogger->write_log("\t--checkSQ [%s]",bClear ? "ON" : "OFF" ); if ( vsHDHeaders.empty() ) { gpLogger->write_log("\t--HD []"); } else { gpLogger->write_log("\t--HD [%s]",vsHDHeaders[0].c_str()); } if ( vsRGHeaders.empty() ) { gpLogger->write_log("\t--RG []"); } else { gpLogger->write_log("\t--RG [%s]",vsRGHeaders[0].c_str()); } if ( vsPGHeaders.empty() ) { gpLogger->write_log("\t--PG []"); } else { for(uint32_t i=0; i < vsPGHeaders.size(); ++i) { gpLogger->write_log("\t--PG [%s]",vsPGHeaders[i].c_str()); } } if ( (vsHDHeaders.empty() ) && ( vsRGHeaders.empty() ) && ( vsPGHeaders.empty() ) && ( !bClear ) && ( sFasta.empty() ) ) { gpLogger->warning("No option is in effect for modifying BAM files. The input and output files will be identical"); } if ( ( vsHDHeaders.size() > 1 ) || ( vsRGHeaders.size() > 1 ) ) { gpLogger->error("HD and RG headers cannot be multiple"); } FastaFile fastaFile; if ( ! sFasta.empty() ) { if ( fastaFile.open(sFasta.c_str()) ) { gpLogger->write_log("Reading the reference file %s",sFasta.c_str()); fastaFile.readThru(); fastaFile.close(); gpLogger->write_log("Finished reading the reference file %s",sFasta.c_str()); } else { gpLogger->error("Failed to open reference file %s",sFasta.c_str()); } } SamFile samIn; SamFile samOut; if ( ! samIn.OpenForRead(sInFile.c_str()) ) { gpLogger->error("Cannot open BAM file %s for reading - %s",sInFile.c_str(), SamStatus::getStatusString(samIn.GetStatus()) ); } if ( ! samOut.OpenForWrite(sOutFile.c_str()) ) { gpLogger->error("Cannot open BAM file %s for writing - %s",sOutFile.c_str(), SamStatus::getStatusString(samOut.GetStatus()) ); } SamFileHeader samHeader; SamHeaderRecord* pSamHeaderRecord; samIn.ReadHeader(samHeader); // check the sanity of SQ file // make sure the SN and LN matches, with the same order if ( bCheckSQ ) { unsigned int numSQ = 0; while( (pSamHeaderRecord = samHeader.getNextHeaderRecord()) != NULL ) { if ( pSamHeaderRecord->getType() == SamHeaderRecord::SQ ) { ++numSQ; } } if ( numSQ != fastaFile.vsSequenceNames.size() ) { gpLogger->error("# of @SQ tags are different from the original BAM and the reference file"); } // iterator over all @SQ objects for(unsigned int i=0; i < numSQ; ++i) { pSamHeaderRecord = samHeader.getSQ(fastaFile.vsSequenceNames[i].c_str()); if ( fastaFile.vsSequenceNames[i].compare(pSamHeaderRecord->getTagValue("SN")) != 0 ) { gpLogger->error("SequenceName is not identical between fasta and input BAM file"); } else if ( static_cast<int>(fastaFile.vnSequenceLengths[i]) != atoi(pSamHeaderRecord->getTagValue("LN")) ) { gpLogger->error("SequenceLength is not identical between fasta and input BAM file"); } else { if ( !sAS.empty() ) samHeader.setSQTag("AS",sAS.c_str(),fastaFile.vsSequenceNames[i].c_str()); samHeader.setSQTag("M5",fastaFile.vsMD5sums[i].c_str(),fastaFile.vsSequenceNames[i].c_str()); if ( !sUR.empty() ) samHeader.setSQTag("UR",sUR.c_str(),fastaFile.vsSequenceNames[i].c_str()); if ( !sSP.empty() ) samHeader.setSQTag("SP",sSP.c_str(),fastaFile.vsSequenceNames[i].c_str()); } } gpLogger->write_log("Finished checking the consistency of SQ tags"); } else { gpLogger->write_log("Skipped checking the consistency of SQ tags"); } // go over the headers again, // assuming order of HD, SQ, RG, PG, and put proper tags at the end of the original tags gpLogger->write_log("Creating the header of new output file"); //SamFileHeader outHeader; samHeader.resetHeaderRecordIter(); for(unsigned int i=0; i < vsHDHeaders.size(); ++i) { samHeader.addHeaderLine(vsHDHeaders[i].c_str()); } /* for(int i=0; i < fastaFile.vsSequenceNames.size(); ++i) { std::string s("@SQ\tSN:"); char buf[1024]; s += fastaFile.vsSequenceNames[i]; sprintf(buf,"\tLN:%d",fastaFile.vnSequenceLengths[i]); s += buf; if ( !sAS.empty() ) { sprintf(buf,"\tAS:%s",sAS.c_str()); s += buf; } if ( !sUR.empty() ) { sprintf(buf,"\tUR:%s",sUR.c_str()); s += buf; } sprintf(buf,"\tM5:%s",fastaFile.vsMD5sums[i].c_str()); s += buf; if ( !sSP.empty() ) { sprintf(buf,"\tSP:%s",sSP.c_str()); s += buf; } outHeader.addHeaderLine(s.c_str()); }*/ for(unsigned int i=0; i < vsRGHeaders.size(); ++i) { samHeader.addHeaderLine(vsRGHeaders[i].c_str()); } for(unsigned int i=0; i < vsPGHeaders.size(); ++i) { samHeader.addHeaderLine(vsPGHeaders[i].c_str()); } samOut.WriteHeader(samHeader); gpLogger->write_log("Adding %d HD, %d RG, and %d PG headers",vsHDHeaders.size(), vsRGHeaders.size(), vsPGHeaders.size()); gpLogger->write_log("Finished writing output headers"); // parse RG tag and get RG ID to append std::string sRGID; if ( ! vsRGHeaders.empty() ) { std::vector<std::string> tokens; FastaFile::tokenizeString( vsRGHeaders[0].c_str(), tokens ); for(unsigned int i=0; i < tokens.size(); ++i) { if ( tokens[i].find("ID:") == 0 ) { sRGID = tokens[i].substr(3); } } } gpLogger->write_log("Writing output BAM file"); SamRecord samRecord; while (samIn.ReadRecord(samHeader, samRecord) == true) { if ( !sRGID.empty() ) { if ( samRecord.addTag("RG",'Z',sRGID.c_str()) == false ) { gpLogger->error("Failed to add a RG tag %s",sRGID.c_str()); } // temporary code added if ( strncmp(samRecord.getReadName(),"seqcore_",8) == 0 ) { char buf[1024]; sprintf(buf,"UM%s",samRecord.getReadName()+8); samRecord.setReadName(buf); } } samOut.WriteRecord(samHeader, samRecord); //if ( samIn.GetCurrentRecordCount() == 1000 ) break; } samOut.Close(); gpLogger->write_log("Successfully written %d records",samIn.GetCurrentRecordCount()); delete gpLogger; return 0; }
bool BamProcessor::init (const ContalignParams& p) { read_cnt_ = proc_cnt_ = toolongs_ = unaligned_cnt_ = fail_cnt_ = nomd_cnt_ = realigned_cnt_ = modified_cnt_ = pos_adjusted_cnt_ = 0; log_diff_ = log_matr_ = log_base_ = false; p_ = &p; if (!*p.inbam ()) ers << "Input file name not specified" << Throw; limit_ = p.limit (); skip_ = p.skip (); infile_.OpenForRead (p.inbam ()); if (!infile_.IsOpen ()) ers << p.inbam () << ThrowEx (FileNotFoundRerror); bool index_ok = false; if (*p.bamidx ()) { index_ok = infile_.ReadBamIndex (p.bamidx ()); if (!index_ok) warn << "Unable to open specified BAM index: " << p.bamidx () << ". Default index will be attempted" << std::endl; } if (!index_ok) { try { index_ok = infile_.ReadBamIndex (); } catch (std::exception& e) { // for some reason not converted into return status by libStatGen } if (!index_ok) warn << "Unable to open default BAM index for " << p.inbam () << std::endl; } if (*p.refname () || p.refno () != -1) { if (!index_ok) ers << "Reference section specified, but the BAM index could not be open." << Throw; if (*p.refname ()) { if (p.endpos () != 0) { infile_.SetReadSection (p.refname (), p.begpos (), p.endpos ()); info << "Read section set : " << p.refname () << ": " << p.begpos () << "-" << p.endpos () << std::endl; } else { infile_.SetReadSection (p.refname ()); info << "Read section set : " << p.refname () << std::endl; } } else { if (p.endpos () != 0) { info << "Read section set : ref# " << p.refno () << ": " << p.begpos () << "-" << p.endpos () << std::endl; infile_.SetReadSection (p.refno (), p.begpos (), p.endpos ()); } else { info << "Read section set : ref# " << p.refno () << std::endl; infile_.SetReadSection (p.refno ()); } } } if (*p.outbam ()) { if (!p.overwrite () && file_exists (p.outbam ())) ers << "Output file " << p.outbam () << " exists. Use --ov key to allow overwriting" << Throw; outfile_.OpenForWrite (p.outbam ()); if (!outfile_.IsOpen ()) ers << "Unable to open output file " << p.outbam () << std::endl; } if (*p.logfname ()) { if (!p.overwrite () && file_exists (p.logfname ())) ers << "Log file " << p.logfname () << " exists. Use --ov key to allow overwriting" << Throw; logfile_.open (p.logfname (), std::fstream::out); if (!logfile_.is_open ()) ers << "Unable to open log file " << p.logfname () << std::endl; time_t t = time (NULL); logfile_ << "Context-aware realigner log\nStarted at " << asctime (localtime (&t)) << "\nParameters:\n"; logfile_ << *(p.parameters_); logfile_ << std::endl; log_base_ = p.logging ("base"); log_diff_ = p.logging ("diff"); log_matr_ = p.logging ("matr"); } band_width_ = p.bwid (); switch (p.algo ()) { case ContalignParams::TEMPL: { matrix_.configure (genstr::nucleotides.symbols (), genstr::nucleotides.size (), genstr::NegUnitaryMatrix <int, 4>().values ()); gap_cost_.configure (p.gip (), p.gep ()); taligner_.configure (&matrix_, &gap_cost_, &gap_cost_, &genstr::nn2num, &genstr::nn2num); } break; case ContalignParams::PLAIN: { batches_.reset (max_batch_no_); aligner_.init (MAX_SEQ_LEN, MAX_SEQ_LEN*MAX_BAND_WIDTH, p.gip (), p.gep (), p.mat (), -p.mis ()); if (log_matr_) aligner_.set_log (logfile_); if (p.debug () > 5) aligner_.set_trace (true); } break; case ContalignParams::POLY: { batches_.reset (max_batch_no_); contalign_.init (MAX_SEQ_LEN, MAX_RSEQ_LEN, MAX_SEQ_LEN*MAX_BAND_WIDTH, p.gip (), p.gep (), p.mat (), -p.mis ()); if (log_matr_) contalign_.set_log (logfile_); if (p.debug () > 5) contalign_.set_trace (true); } break; default: { ers << "Alignment algorithm " << p.algostr () << " not yet supported" << Throw; } } timer_.reset (DEFAULT_REPORT_IVAL, 1); return true; }
// main function int TrimBam::execute(int argc, char ** argv) { SamFile samIn; SamFile samOut; int numTrimBaseL = 0; int numTrimBaseR = 0; bool noeof = false; bool ignoreStrand = false; bool noPhoneHome = false; std::string inName = ""; std::string outName = ""; if ( argc < 5 ) { usage(); std::cerr << "ERROR: Incorrect number of parameters specified\n"; return(-1); } inName = argv[2]; outName = argv[3]; static struct option getopt_long_options[] = { // Input options { "left", required_argument, NULL, 'L'}, { "right", required_argument, NULL, 'R'}, { "ignoreStrand", no_argument, NULL, 'i'}, { "noeof", no_argument, NULL, 'n'}, { "noPhoneHome", no_argument, NULL, 'p'}, { "nophonehome", no_argument, NULL, 'P'}, { "phoneHomeThinning", required_argument, NULL, 't'}, { "phonehomethinning", required_argument, NULL, 'T'}, { NULL, 0, NULL, 0 }, }; int argIndex = 4; if(argv[argIndex][0] != '-') { // This is the number of bases to trim off both sides // so convert to a number. numTrimBaseL = atoi(argv[argIndex]); numTrimBaseR = numTrimBaseL; ++argIndex; } int c = 0; int n_option_index = 0; // Process any additional parameters while ( ( c = getopt_long(argc, argv, "L:R:in", getopt_long_options, &n_option_index) ) != -1 ) { switch(c) { case 'L': numTrimBaseL = atoi(optarg); break; case 'R': numTrimBaseR = atoi(optarg); break; case 'i': ignoreStrand = true; break; case 'n': noeof = true; break; case 'p': case 'P': noPhoneHome = true; break; case 't': case 'T': PhoneHome::allThinning = atoi(optarg); break; default: fprintf(stderr,"ERROR: Unrecognized option %s\n", getopt_long_options[n_option_index].name); return(-1); } } if(!noPhoneHome) { PhoneHome::checkVersion(getProgramName(), VERSION); } if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } if ( ! samIn.OpenForRead(inName.c_str()) ) { fprintf(stderr, "***Problem opening %s\n",inName.c_str()); return(-1); } if(!samOut.OpenForWrite(outName.c_str())) { fprintf(stderr, "%s\n", samOut.GetStatusMessage()); return(samOut.GetStatus()); } fprintf(stderr,"Arguments in effect: \n"); fprintf(stderr,"\tInput file : %s\n",inName.c_str()); fprintf(stderr,"\tOutput file : %s\n",outName.c_str()); if(numTrimBaseL == numTrimBaseR) { fprintf(stderr,"\t#Bases to trim from each side : %d\n", numTrimBaseL); } else { fprintf(stderr,"\t#Bases to trim from the left of forward strands : %d\n", numTrimBaseL); fprintf(stderr,"\t#Bases to trim from the right of forward strands: %d\n", numTrimBaseR); if(!ignoreStrand) { // By default, reverse strands are treated the opposite. fprintf(stderr,"\t#Bases to trim from the left of reverse strands : %d\n", numTrimBaseR); fprintf(stderr,"\t#Bases to trim from the right of reverse strands : %d\n", numTrimBaseL); } else { // ignore strand, treating forward & reverse strands the same fprintf(stderr,"\t#Bases to trim from the left of reverse strands : %d\n", numTrimBaseL); fprintf(stderr,"\t#Bases to trim from the right of reverse strands : %d\n", numTrimBaseR); } } // Read the sam header. SamFileHeader samHeader; if(!samIn.ReadHeader(samHeader)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } // Write the sam header. if(!samOut.WriteHeader(samHeader)) { fprintf(stderr, "%s\n", samOut.GetStatusMessage()); return(samOut.GetStatus()); } SamRecord samRecord; char seq[65536]; char qual[65536]; int i, len; // Keep reading records until ReadRecord returns false. while(samIn.ReadRecord(samHeader, samRecord)) { // Successfully read a record from the file, so write it. strcpy(seq,samRecord.getSequence()); strcpy(qual,samRecord.getQuality()); // Number of bases to trim from the left/right, // set based on ignoreStrand flag and strand info. int trimLeft = numTrimBaseL; int trimRight = numTrimBaseR; if(!ignoreStrand) { if(SamFlag::isReverse(samRecord.getFlag())) { // We are reversing the reverse reads, // so swap the left & right trim counts. trimRight = numTrimBaseL; trimLeft = numTrimBaseR; } } len = strlen(seq); // Do not trim if sequence is '*' if ( strcmp(seq, "*") != 0 ) { bool qualValue = true; if(strcmp(qual, "*") == 0) { qualValue = false; } int qualLen = strlen(qual); if ( (qualLen != len) && qualValue ) { fprintf(stderr,"ERROR: Sequence and Quality have different length\n"); return(-1); } if ( len < (trimLeft + trimRight) ) { // Read Length is less than the total number of bases to trim, // so trim the entire read. for(i=0; i < len; ++i) { seq[i] = 'N'; if ( qualValue ) { qual[i] = '!'; } } } else { // Read Length is larger than the total number of bases to trim, // so trim from the left, then from the right. for(i=0; i < trimLeft; ++i) { // Trim the bases from the left. seq[i] = 'N'; if ( qualValue ) { qual[i] = '!'; } } for(i = 0; i < trimRight; i++) { seq[len-i-1] = 'N'; if(qualValue) { qual[len-i-1] = '!'; } } } samRecord.setSequence(seq); samRecord.setQuality(qual); } if(!samOut.WriteRecord(samHeader, samRecord)) { // Failed to write a record. fprintf(stderr, "Failure in writing record %s\n", samOut.GetStatusMessage()); return(-1); } } if(samIn.GetStatus() != SamStatus::NO_MORE_RECS) { // Failed to read a record. fprintf(stderr, "%s\n", samIn.GetStatusMessage()); } std::cerr << std::endl << "Number of records read = " << samIn.GetCurrentRecordCount() << std::endl; std::cerr << "Number of records written = " << samOut.GetCurrentRecordCount() << std::endl; if(samIn.GetStatus() != SamStatus::NO_MORE_RECS) { // Failed reading a record. return(samIn.GetStatus()); } // Since the reads were successful, return the status based samIn.Close(); samOut.Close(); return 0; }
int WriteRegion::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String outFile = ""; String indexFile = ""; String readName = ""; String bed = ""; myStart = UNSPECIFIED_INT; myEnd = UNSPECIFIED_INT; myPrevStart = UNSPECIFIED_INT; myPrevEnd = UNSPECIFIED_INT; myRefID = UNSET_REF; myRefName.Clear(); myPrevRefName.Clear(); myBedRefID = SamReferenceInfo::NO_REF_ID; bool lshift = false; bool noeof = false; bool params = false; myWithinReg = false; myWroteReg = false; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inFile) LONG_STRINGPARAMETER("out", &outFile) LONG_PARAMETER_GROUP("Optional Region Parameters") LONG_STRINGPARAMETER("bamIndex", &indexFile) LONG_STRINGPARAMETER("refName", &myRefName) LONG_INTPARAMETER("refID", &myRefID) LONG_INTPARAMETER("start", &myStart) LONG_INTPARAMETER("end", &myEnd) LONG_STRINGPARAMETER("bed", &bed) LONG_PARAMETER("withinReg", &myWithinReg) LONG_STRINGPARAMETER("readName", &readName) LONG_PARAMETER_GROUP("Optional Other Parameters") LONG_PARAMETER("lshift", &lshift) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); // mandatory argument was not specified. inputParameters.Status(); std::cerr << "Missing mandatory argument: --in" << std::endl; return(-1); } if(outFile == "") { usage(); // mandatory argument was not specified. inputParameters.Status(); std::cerr << "Missing mandatory argument: --out" << std::endl; return(-1); } if(indexFile == "") { // In file was not specified, so set it to the in file // + ".bai" indexFile = inFile + ".bai"; } if(myRefID != UNSET_REF && myRefName.Length() != 0) { std::cerr << "Can't specify both refID and refName" << std::endl; inputParameters.Status(); return(-1); } if(myRefID != UNSET_REF && bed.Length() != 0) { std::cerr << "Can't specify both refID and bed" << std::endl; inputParameters.Status(); return(-1); } if(myRefName.Length() != 0 && bed.Length() != 0) { std::cerr << "Can't specify both refName and bed" << std::endl; inputParameters.Status(); return(-1); } if(!bed.IsEmpty()) { myBedFile = ifopen(bed, "r"); } if(params) { inputParameters.Status(); } // Open the file for reading. mySamIn.OpenForRead(inFile); // Open the output file for writing. SamFile samOut; samOut.OpenForWrite(outFile); // Open the bam index file for reading if a region was specified. if((myRefName.Length() != 0) || (myRefID != UNSET_REF) || (myBedFile != NULL)) { mySamIn.ReadBamIndex(indexFile); } // Read & write the sam header. mySamIn.ReadHeader(mySamHeader); samOut.WriteHeader(mySamHeader); // Read the sam records. SamRecord samRecord; // Track the status. int numSectionRecords = 0; // Set returnStatus to success. It will be changed // to the failure reason if any of the writes fail. SamStatus::Status returnStatus = SamStatus::SUCCESS; while(getNextSection()) { // Keep reading records until they aren't anymore. while(mySamIn.ReadRecord(mySamHeader, samRecord)) { if(!readName.IsEmpty()) { // Check for readname. if(strcmp(samRecord.getReadName(), readName.c_str()) != 0) { // not a matching read name, so continue to the next record. continue; } } // Check to see if the read has already been processed. if(myPrevEnd != UNSPECIFIED_INT) { // Because we already know that the bed was sorted, // we know that the previous section started before // this one, so if the previous end is greater than // this record's end position we know that it // was already written in the previous section. // Note: can't be equal to the previous end since // the end range was exclusive, while // get0BasedAlignmentEnd is inclusive. // myPrevEnd is reset by getNextSection when a new // chromosome is hit. if(samRecord.get0BasedAlignmentEnd() < myPrevEnd) { // This record was already written. continue; } } // Shift left if applicable. if(lshift) { samRecord.shiftIndelsLeft(); } // Successfully read a record from the file, so write it. samOut.WriteRecord(mySamHeader, samRecord); ++numSectionRecords; } myWroteReg = true; } if(myBedFile != NULL) { ifclose(myBedFile); } std::cerr << "Wrote " << outFile << " with " << numSectionRecords << " records.\n"; return(returnStatus); }