// Read the next record from samIn into *recordPtr, obtaining a record from
// myPool first if the caller passed in a NULL record pointer.
//
// samIn           - file to read the record from.
// recordPtr       - in/out: points at the record pointer to fill.  If it
//                   points at NULL, a record is taken from myPool and stored.
// mateMap         - passed to forceRecordFlush() if the pool has no record.
// outputBufferPtr - passed to forceRecordFlush() if the pool has no record.
//
// Returns SamStatus::SUCCESS if a record was read, SamStatus::FAIL_IO if the
// forced flush failed, SamStatus::FAIL_MEM if no record could be obtained
// even after flushing, otherwise the status reported by samIn.
SamStatus::Status ClipOverlap::readCoordRecord(SamFile& samIn,
                                               SamRecord** recordPtr,
                                               MateMapByCoord& mateMap,
                                               SamCoordOutput* outputBufferPtr)
{
    // Alias for the caller's record pointer so updates are visible to them.
    SamRecord*& target = *recordPtr;

    // First attempt: only needed when the caller did not supply a record.
    if(target == NULL)
    {
        target = myPool.getRecord();
    }

    // Still nothing: the pool had no record to give.  Flush records held in
    // the mate map and try the pool once more.
    if(target == NULL)
    {
        const bool flushed = forceRecordFlush(mateMap, outputBufferPtr);
        if(!flushed)
        {
            std::cerr << "Failed to flush the output buffer.\n";
            return(SamStatus::FAIL_IO);
        }

        target = myPool.getRecord();
        if(target == NULL)
        {
            std::cerr << "Failed to allocate any records.\n";
            return(SamStatus::FAIL_MEM);
        }
    }

    // A record is available, so read into it.
    if(samIn.ReadRecord(mySamHeader, *target))
    {
        return(SamStatus::SUCCESS);
    }

    // Nothing was read; hand the file's status back to the caller.
    return(samIn.GetStatus());
}
int Dedup_LowMem::execute(int argc, char** argv) { /* -------------------------------- * process the arguments * -------------------------------*/ String inFile, outFile, logFile; myDoRecab = false; bool removeFlag = false; bool verboseFlag = false; myForceFlag = false; myNumMissingMate = 0; myMinQual = DEFAULT_MIN_QUAL; String excludeFlags = "0xB04"; uint16_t intExcludeFlags = 0; bool noeof = false; bool params = false; LongParamContainer parameters; parameters.addGroup("Required Parameters"); parameters.addString("in", &inFile); parameters.addString("out", &outFile); parameters.addGroup("Optional Parameters"); parameters.addInt("minQual", & myMinQual); parameters.addString("log", &logFile); parameters.addBool("oneChrom", &myOneChrom); parameters.addBool("recab", &myDoRecab); parameters.addBool("rmDups", &removeFlag); parameters.addBool("force", &myForceFlag); parameters.addString("excludeFlags", &excludeFlags); parameters.addBool("verbose", &verboseFlag); parameters.addBool("noeof", &noeof); parameters.addBool("params", ¶ms); parameters.addPhoneHome(VERSION); myRecab.addRecabSpecificParameters(parameters); ParameterList inputParameters; inputParameters.Add(new LongParameters ("Input Parameters", parameters.getLongParameterList())); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. 
BgzfFileType::setRequireEofBlock(false); } if(inFile.IsEmpty()) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "Specify an input file" << std::endl; return EXIT_FAILURE; } if(outFile.IsEmpty()) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "Specify an output file" << std::endl; return EXIT_FAILURE; } intExcludeFlags = excludeFlags.AsInteger(); if(myForceFlag && SamFlag::isDuplicate(intExcludeFlags)) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "Cannot specify --force and Duplicate in the excludeFlags. Since --force indicates to override" << " previous duplicate setting and the excludeFlags says to skip those, you can't do both.\n"; return EXIT_FAILURE; } if(!SamFlag::isSecondary(intExcludeFlags)) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "ERROR: Secondary reads must be excluded, edit --excludeFlags to include 0x0100\n"; return EXIT_FAILURE; } if(!(intExcludeFlags & SamFlag::SUPPLEMENTARY_ALIGNMENT)) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "ERROR: Supplementary reads must be excluded, edit --excludeFlags to include 0x0800\n"; return EXIT_FAILURE; } if(logFile.IsEmpty()) { logFile = outFile + ".log"; } if(myDoRecab) { int status = myRecab.processRecabParam(); if(status != 0) { inputParameters.Status(); return(status); } } if(params) { inputParameters.Status(); } Logger::gLogger = new Logger(logFile.c_str(), verboseFlag); /* ------------------------------------------------------------------- * The arguments are processed. Prepare the input BAM file, * instantiate dedup_LowMem, and construct the read group library map * ------------------------------------------------------------------*/ SamFile samIn; samIn.OpenForRead(inFile.c_str()); // If the file isn't sorted it will throw an exception. 
samIn.setSortedValidation(SamFile::COORDINATE); SamFileHeader header; samIn.ReadHeader(header); buildReadGroupLibraryMap(header); lastReference = -1; lastCoordinate = -1; // for keeping some basic statistics uint32_t recordCount = 0; uint32_t pairedCount = 0; uint32_t properPairCount = 0; uint32_t unmappedCount = 0; uint32_t reverseCount = 0; uint32_t qualCheckFailCount = 0; uint32_t secondaryCount = 0; uint32_t supplementaryCount = 0; uint32_t excludedCount = 0; // Now we start reading records SamRecord* recordPtr; SamStatus::Status returnStatus = SamStatus::SUCCESS; while(returnStatus == SamStatus::SUCCESS) { recordPtr = mySamPool.getRecord(); if(recordPtr == NULL) { std::cerr << "Failed to allocate enough records\n"; return(-1); } if(!samIn.ReadRecord(header, *recordPtr)) { returnStatus = samIn.GetStatus(); continue; } // Take note of properties of this record int flag = recordPtr->getFlag(); if(SamFlag::isPaired(flag)) ++pairedCount; if(SamFlag::isProperPair(flag)) ++properPairCount; if(SamFlag::isReverse(flag)) ++reverseCount; if(SamFlag::isQCFailure(flag)) ++qualCheckFailCount; if(SamFlag::isSecondary(flag)) ++secondaryCount; if(flag & SamFlag::SUPPLEMENTARY_ALIGNMENT) ++supplementaryCount; if(!SamFlag::isMapped(flag)) ++unmappedCount; // put the record in the appropriate maps: // single reads go in myFragmentMap // paired reads go in myPairedMap recordCount = samIn.GetCurrentRecordCount(); // if we have moved to a new position, look back at previous reads for duplicates if (hasPositionChanged(*recordPtr)) { cleanupPriorReads(recordPtr); } // Determine if this read should be checked for duplicates. if((!SamFlag::isMapped(flag)) || ((flag & intExcludeFlags) != 0)) { ++excludedCount; // No deduping done on this record, but still build the recab table. if(myDoRecab) { myRecab.processReadBuildTable(*recordPtr); } // Nothing more to do with this record, so // release the pointer. 
mySamPool.releaseRecord(recordPtr); } else { if(SamFlag::isDuplicate(flag) && !myForceFlag) { // Error: Marked duplicates, and duplicates aren't excluded. Logger::gLogger->error("There are records already duplicate marked."); Logger::gLogger->error("Use -f to clear the duplicate flag and start the dedup_LowMem procedure over"); } checkDups(*recordPtr, recordCount); mySamPool.releaseRecord(recordPtr); } // let the user know we're not napping if (verboseFlag && (recordCount % 100000 == 0)) { Logger::gLogger->writeLog("recordCount=%u singleKeyMap=%u pairedKeyMap=%u, dictSize=%u", recordCount, myFragmentMap.size(), myPairedMap.size(), myMateMap.size()); } } // we're finished reading record so clean up the duplicate search and // close the input file cleanupPriorReads(NULL); samIn.Close(); // print some statistics Logger::gLogger->writeLog("--------------------------------------------------------------------------"); Logger::gLogger->writeLog("SUMMARY STATISTICS OF THE READS"); Logger::gLogger->writeLog("Total number of reads: %u",recordCount); Logger::gLogger->writeLog("Total number of paired-end reads: %u", pairedCount); Logger::gLogger->writeLog("Total number of properly paired reads: %u", properPairCount); Logger::gLogger->writeLog("Total number of unmapped reads: %u", unmappedCount); Logger::gLogger->writeLog("Total number of reverse strand mapped reads: %u", reverseCount); Logger::gLogger->writeLog("Total number of QC-failed reads: %u", qualCheckFailCount); Logger::gLogger->writeLog("Total number of secondary reads: %u", secondaryCount); Logger::gLogger->writeLog("Total number of supplementary reads: %u", supplementaryCount); Logger::gLogger->writeLog("Size of singleKeyMap (must be zero): %u", myFragmentMap.size()); Logger::gLogger->writeLog("Size of pairedKeyMap (must be zero): %u", myPairedMap.size()); Logger::gLogger->writeLog("Total number of missing mates: %u", myNumMissingMate); Logger::gLogger->writeLog("Total number of reads excluded from duplicate 
checking: %u", excludedCount); Logger::gLogger->writeLog("--------------------------------------------------------------------------"); Logger::gLogger->writeLog("Sorting the indices of %d duplicated records", myDupList.size()); // sort the indices of duplicate records std::sort(myDupList.begin(), myDupList.end(), std::less<uint32_t> ()); // get ready to write the output file by making a second pass // through the input file samIn.OpenForRead(inFile.c_str()); samIn.ReadHeader(header); SamFile samOut; samOut.OpenForWrite(outFile.c_str()); samOut.WriteHeader(header); // If we are recalibrating, output the model information. if(myDoRecab) { myRecab.modelFitPrediction(outFile); } // an iterator to run through the duplicate indices int currentDupIndex = 0; bool moreDups = !myDupList.empty(); // let the user know what we're doing Logger::gLogger->writeLog("\nWriting %s", outFile.c_str()); // count the duplicate records as a check uint32_t singleDuplicates(0), pairedDuplicates(0); // start reading records and writing them out SamRecord record; while(samIn.ReadRecord(header, record)) { uint32_t currentIndex = samIn.GetCurrentRecordCount(); bool foundDup = moreDups && (currentIndex == myDupList[currentDupIndex]); // modify the duplicate flag and write out the record, // if it's appropriate int flag = record.getFlag(); if (foundDup) { // this record is a duplicate, so mark it. record.setFlag( flag | 0x400 ); currentDupIndex++; // increment duplicate counters to verify we found them all if ( ( ( flag & 0x0001 ) == 0 ) || ( flag & 0x0008 ) ) { // unpaired or mate unmapped singleDuplicates++; } else { pairedDuplicates++; } // recalibrate if necessary. 
if(myDoRecab) { myRecab.processReadApplyTable(record); } // write the record if we are not removing duplicates if (!removeFlag ) samOut.WriteRecord(header, record); } else { if(myForceFlag) { // this is not a duplicate we've identified but we want to // remove any duplicate marking record.setFlag( flag & 0xfffffbff ); // unmark duplicate } // Not a duplicate, so recalibrate if necessary. if(myDoRecab) { myRecab.processReadApplyTable(record); } samOut.WriteRecord(header, record); } // Let the user know we're still here if (verboseFlag && (currentIndex % 100000 == 0)) { Logger::gLogger->writeLog("recordCount=%u", currentIndex); } } // We're done. Close the files and print triumphant messages. samIn.Close(); samOut.Close(); Logger::gLogger->writeLog("Successfully %s %u unpaired and %u paired duplicate reads", removeFlag ? "removed" : "marked" , singleDuplicates, pairedDuplicates/2); Logger::gLogger->writeLog("\nDedup_LowMem complete!"); return 0; }
int Revert::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String outFile = ""; bool cigar = false; bool qual = false; bool noeof = false; bool params = false; bool rmBQ = false; String rmTags = ""; myKeepTags = false; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_STRINGPARAMETER("in", &inFile) LONG_STRINGPARAMETER("out", &outFile) LONG_PARAMETER("cigar", &cigar) LONG_PARAMETER("qual", &qual) LONG_PARAMETER("keepTags", &myKeepTags) LONG_PARAMETER("rmBQ", &rmBQ) LONG_STRINGPARAMETER("rmTags", &rmTags) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } if(outFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--out is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } if(params) { inputParameters.Status(); } // Open the input file for reading. SamFile samIn; samIn.OpenForRead(inFile); // Open the output file for writing. SamFile samOut; samOut.OpenForWrite(outFile); // Read the sam header. SamFileHeader samHeader; samIn.ReadHeader(samHeader); // Write the sam header. samOut.WriteHeader(samHeader); SamRecord samRecord; // Set returnStatus to success. It will be changed to the // failure reason if any of the writes or updates fail. 
SamStatus::Status returnStatus = SamStatus::SUCCESS; // Keep reading records until ReadRecord returns false. while(samIn.ReadRecord(samHeader, samRecord)) { // Update the cigar & position. if(cigar) { if(!updateCigar(samRecord)) { // Failed to update the cigar & position. fprintf(stderr, "%s\n", samIn.GetStatusMessage()); returnStatus = samIn.GetStatus(); } } if(qual) { if(!updateQual(samRecord)) { // Failed to update the quality. fprintf(stderr, "%s\n", samIn.GetStatusMessage()); returnStatus = samIn.GetStatus(); } } if(rmBQ) { if(!removeBQ(samRecord)) { // Failed to remove BQ. fprintf(stderr, "%s\n", samIn.GetStatusMessage()); returnStatus = samIn.GetStatus(); } } if(rmTags != "") { if(!samRecord.rmTags(rmTags.c_str())) { // Failed to remove the specified tags. fprintf(stderr, "%s\n", samIn.GetStatusMessage()); returnStatus = samIn.GetStatus(); } } // Successfully read a record from the file, so write it. if(!samOut.WriteRecord(samHeader, samRecord)) { // Failed to write a record. fprintf(stderr, "%s\n", samOut.GetStatusMessage()); returnStatus = samOut.GetStatus(); } } std::cerr << std::endl << "Number of records read = " << samIn.GetCurrentRecordCount() << std::endl; std::cerr << "Number of records written = " << samOut.GetCurrentRecordCount() << std::endl; // Since the reads were successful, return the status based // on the status of the writes. If any failed, return // their failure status. return(returnStatus); }
int Convert::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String outFile = ""; String refFile = ""; bool lshift = false; bool noeof = false; bool params = false; bool useBases = false; bool useEquals = false; bool useOrigSeq = false; bool recover = false; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_STRINGPARAMETER("in", &inFile) LONG_STRINGPARAMETER("out", &outFile) LONG_STRINGPARAMETER("refFile", &refFile) LONG_PARAMETER("lshift", &lshift) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("recover", &recover) LONG_PARAMETER("params", ¶ms) LONG_PARAMETER_GROUP("SequenceConversion") EXCLUSIVE_PARAMETER("useBases", &useBases) EXCLUSIVE_PARAMETER("useEquals", &useEquals) EXCLUSIVE_PARAMETER("useOrigSeq", &useOrigSeq) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { printUsage(std::cerr); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } if(outFile == "") { printUsage(std::cerr); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--out is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } // Check to see if the ref file was specified. // Open the reference. 
GenomeSequence* refPtr = NULL; if(refFile != "") { refPtr = new GenomeSequence(refFile); } SamRecord::SequenceTranslation translation; if((useBases) && (refPtr != NULL)) { translation = SamRecord::BASES; } else if((useEquals) && (refPtr != NULL)) { translation = SamRecord::EQUAL; } else { useOrigSeq = true; translation = SamRecord::NONE; } if(params) { inputParameters.Status(); } // Open the input file for reading. SamFile samIn; if(recover) samIn.setAttemptRecovery(true); samIn.OpenForRead(inFile); // Open the output file for writing. SamFile samOut; samOut.OpenForWrite(outFile); samOut.SetWriteSequenceTranslation(translation); samOut.SetReference(refPtr); // Read the sam header. SamFileHeader samHeader; samIn.ReadHeader(samHeader); // Write the sam header. samOut.WriteHeader(samHeader); SamRecord samRecord; // Set returnStatus to success. It will be changed // to the failure reason if any of the writes fail. SamStatus::Status returnStatus = SamStatus::SUCCESS; while(1) { try { // Keep reading records until ReadRecord returns false. while(samIn.ReadRecord(samHeader, samRecord)) { // left shift if necessary. if(lshift) { samRecord.shiftIndelsLeft(); } // Successfully read a record from the file, so write it. if(!samOut.WriteRecord(samHeader, samRecord)) { // Failed to write a record. 
fprintf(stderr, "%s\n", samOut.GetStatusMessage()); returnStatus = samOut.GetStatus(); } } break; } catch (std::runtime_error e) { std::cerr << "Caught runtime error: " << e.what() << "\n"; if(!recover) { std::cerr << "Corrupted BAM file detected - consider using --recover option.\n"; break; } std::cerr << "Attempting to resync at next good BGZF block and BAM record.\n"; // XXX need to resync SamFile stream here bool rc = samIn.attemptRecoverySync(checkSignature, SIGNATURE_LENGTH); if(rc) { std::cerr << "Successful resync - some data lost.\n"; continue; // succeeded } std::cerr << "Failed to re-sync on data stream.\n"; break; // failed to resync } } std::cerr << std::endl << "Number of records read = " << samIn.GetCurrentRecordCount() << std::endl; std::cerr << "Number of records written = " << samOut.GetCurrentRecordCount() << std::endl; if(refPtr != NULL) { delete(refPtr); } // Since the reads were successful, return the status based // on the status of the writes. If any failed, return // their failure status. return(returnStatus); }
int Bam2FastQ::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; bool readName = false; String refFile = ""; String firstOut = ""; String secondOut = ""; String unpairedOut = ""; bool interleave = false; bool noeof = false; bool gzip = false; bool params = false; myOutBase = ""; myNumMateFailures = 0; myNumPairs = 0; myNumUnpaired = 0; mySplitRG = false; myQField = ""; myNumQualTagErrors = 0; myReverseComp = true; myRNPlus = false; myFirstRNExt = DEFAULT_FIRST_EXT; mySecondRNExt = DEFAULT_SECOND_EXT; myCompression = InputFile::DEFAULT; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inFile) LONG_PARAMETER_GROUP("Optional Parameters") LONG_PARAMETER("readName", &readName) LONG_PARAMETER("splitRG", &mySplitRG) LONG_STRINGPARAMETER("qualField", &myQField) LONG_PARAMETER("merge", &interleave) LONG_STRINGPARAMETER("refFile", &refFile) LONG_STRINGPARAMETER("firstRNExt", &myFirstRNExt) LONG_STRINGPARAMETER("secondRNExt", &mySecondRNExt) LONG_PARAMETER("rnPlus", &myRNPlus) LONG_PARAMETER("noReverseComp", &myReverseComp) LONG_PARAMETER("gzip", &gzip) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PARAMETER_GROUP("Optional OutputFile Names") LONG_STRINGPARAMETER("outBase", &myOutBase) LONG_STRINGPARAMETER("firstOut", &firstOut) LONG_STRINGPARAMETER("secondOut", &secondOut) LONG_STRINGPARAMETER("unpairedOut", &unpairedOut) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. 
BgzfFileType::setRequireEofBlock(false); } if(gzip) { myCompression = InputFile::GZIP; } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } // Cannot specify both interleaved & secondOut since secondOut would be N/A. if(interleave && !secondOut.IsEmpty()) { usage(); inputParameters.Status(); std::cerr << "ERROR: Cannot specify --merge & --secondOut.\n"; return(-1); } // Cannot specify both interleaved & secondOut since secondOut would be N/A. if(interleave && !secondOut.IsEmpty()) { usage(); inputParameters.Status(); std::cerr << "ERROR: Cannot specify --merge & --secondOut.\n"; return(-1); } // Cannot specify both splitRG & firstOut/secondOut/unpairedOut // since it needs a different file for each RG. if(mySplitRG && (!firstOut.IsEmpty() || !secondOut.IsEmpty() || !unpairedOut.IsEmpty())) { usage(); inputParameters.Status(); std::cerr << "ERROR: Cannot specify --splitRG & --firstOut/--secondOut/--unpairedOut.\n"; std::cerr << "Use --outBase instead.\n"; return(-1); } // Cannot specify splitRG & output to stdout. if(mySplitRG && (myOutBase[0] == '-')) { usage(); inputParameters.Status(); std::cerr << "ERROR: Cannot specify --splitRG & write to stdout.\n"; return(-1); } // Check to see if the out file was specified, if not, generate it from // the input filename. if(myOutBase == "") { // Just remove the extension from the input filename. int extStart = inFile.FastFindLastChar('.'); if(extStart <= 0) { myOutBase = inFile; } else { myOutBase = inFile.Left(extStart); } } if(mySplitRG) { std::string fqList = myOutBase.c_str(); fqList += ".list"; myFqList = ifopen(fqList.c_str(), "w"); ifprintf(myFqList, "MERGE_NAME\tFASTQ1\tFASTQ2\tRG\n"); } // Check to see if the first/second/single-ended were specified and // if not, set them. 
myFirstFileNameExt = "_1.fastq"; mySecondFileNameExt = "_2.fastq"; myUnpairedFileNameExt = ".fastq"; if(interleave) { myFirstFileNameExt = "_interleaved.fastq"; myFirstFileNameExt = "_interleaved.fastq"; } getFileName(firstOut, myFirstFileNameExt); getFileName(secondOut, mySecondFileNameExt); getFileName(unpairedOut, myUnpairedFileNameExt); if(params) { inputParameters.Status(); } // Open the files for reading/writing. // Open prior to opening the output files, // so if there is an error, the outputs don't get created. SamFile samIn; samIn.OpenForRead(inFile, &mySamHeader); // Skip non-primary reads. samIn.SetReadFlags(0, 0x0100); // Open the output files if not splitting RG if(!mySplitRG) { myUnpairedFile = ifopen(unpairedOut, "w", myCompression); // Only open the first file if it is different than an already opened file. if(firstOut != unpairedOut) { myFirstFile = ifopen(firstOut, "w", myCompression); } else { myFirstFile = myUnpairedFile; } // If it is interleaved or the 2nd file is not a new name, set it appropriately. if(interleave || secondOut == firstOut) { mySecondFile = myFirstFile; } else if(secondOut == unpairedOut) { mySecondFile = myUnpairedFile; } else { mySecondFile = ifopen(secondOut, "w", myCompression); } if(myUnpairedFile == NULL) { std::cerr << "Failed to open " << unpairedOut << " so can't convert bam2FastQ.\n"; return(-1); } if(myFirstFile == NULL) { std::cerr << "Failed to open " << firstOut << " so can't convert bam2FastQ.\n"; return(-1); } if(mySecondFile == NULL) { std::cerr << "Failed to open " << secondOut << " so can't convert bam2FastQ.\n"; return(-1); } } if((readName) || (strcmp(mySamHeader.getSortOrder(), "queryname") == 0)) { readName = true; } else { // defaulting to coordinate sorted. samIn.setSortedValidation(SamFile::COORDINATE); } // Setup the '=' translation if the reference was specified. 
if(!refFile.IsEmpty()) { GenomeSequence* refPtr = new GenomeSequence(refFile); samIn.SetReadSequenceTranslation(SamRecord::BASES); samIn.SetReference(refPtr); } SamRecord* recordPtr; int16_t samFlag; SamStatus::Status returnStatus = SamStatus::SUCCESS; while(returnStatus == SamStatus::SUCCESS) { recordPtr = myPool.getRecord(); if(recordPtr == NULL) { // Failed to allocate a new record. throw(std::runtime_error("Failed to allocate a new SAM/BAM record")); } if(!samIn.ReadRecord(mySamHeader, *recordPtr)) { // Failed to read a record. returnStatus = samIn.GetStatus(); continue; } // Have a record. Check to see if it is a pair or unpaired read. samFlag = recordPtr->getFlag(); if(SamFlag::isPaired(samFlag)) { if(readName) { handlePairedRN(*recordPtr); } else { handlePairedCoord(*recordPtr); } } else { ++myNumUnpaired; writeFastQ(*recordPtr, myUnpairedFile, myUnpairedFileNameExt); } } // Flush All cleanUpMateMap(0, true); if(returnStatus == SamStatus::NO_MORE_RECS) { returnStatus = SamStatus::SUCCESS; } samIn.Close(); closeFiles(); // Output the results std::cerr << "\nFound " << myNumPairs << " read pairs.\n"; std::cerr << "Found " << myNumUnpaired << " unpaired reads.\n"; if(myNumMateFailures != 0) { std::cerr << "Failed to find mates for " << myNumMateFailures << " reads, so they were written as unpaired\n" << " (not included in either of the above counts).\n"; } if(myNumQualTagErrors != 0) { std::cerr << myNumQualTagErrors << " records did not have tag " << myQField.c_str() << " or it was invalid, so the quality field was used for those records.\n"; } return(returnStatus); }
int main(int argc, char ** argv) { gpLogger = new Logger; static struct option getopt_long_options[] = { // Input options { "fasta", required_argument, NULL, 'f'}, { "in", required_argument, NULL, 'i'}, { "out", required_argument, NULL, 'o'}, { "verbose", no_argument, NULL, 'v'}, { "log", required_argument, NULL, 'l'}, { "clear", no_argument, NULL, 0}, { "AS", required_argument, NULL, 0}, { "UR", required_argument, NULL, 0}, { "SP", required_argument, NULL, 0}, { "HD", required_argument, NULL, 0}, { "RG", required_argument, NULL, 0}, { "PG", required_argument, NULL, 0}, { "checkSQ", no_argument, NULL, 0}, { NULL, 0, NULL, 0 }, }; int n_option_index = 0, c; std::string sAS, sUR, sSP, sFasta, sInFile, sOutFile, sLogFile; bool bClear, bCheckSQ, bVerbose; std::vector<std::string> vsHDHeaders, vsRGHeaders, vsPGHeaders; bCheckSQ = bVerbose = false; bClear = true; while ( (c = getopt_long(argc, argv, "vf:i:o:l:", getopt_long_options, &n_option_index)) != -1 ) { // std::cout << getopt_long_options[n_option_index].name << "\t" << optarg << std::endl; if ( c == 'f' ) { sFasta = optarg; } else if ( c == 'i' ) { sInFile = optarg; } else if ( c == 'o' ) { sOutFile = optarg; } else if ( c == 'v' ) { bVerbose = true; } else if ( c == 'l' ) { sLogFile = optarg; } else if ( strcmp(getopt_long_options[n_option_index].name,"AS") == 0 ) { sAS = optarg; } else if ( strcmp(getopt_long_options[n_option_index].name,"UR") == 0 ) { sUR = optarg; } else if ( strcmp(getopt_long_options[n_option_index].name,"SP") == 0 ) { sSP = optarg; } else if ( strcmp(getopt_long_options[n_option_index].name,"HD") == 0 ) { vsHDHeaders.push_back(optarg); } else if ( strcmp(getopt_long_options[n_option_index].name,"RG") == 0 ) { vsRGHeaders.push_back(optarg); } else if ( strcmp(getopt_long_options[n_option_index].name,"PG") == 0 ) { vsPGHeaders.push_back(optarg); } else if ( strcmp(getopt_long_options[n_option_index].name,"checkSQ") == 0 ) { bCheckSQ = true; } else { std::cerr << "Error: Unrecognized option " 
<< getopt_long_options[n_option_index].name << std::endl; abort(); } } if ( optind < argc ) { printUsage(std::cerr); gpLogger->error("non-option argument %s exist ",argv[optind]); } if ( sInFile.empty() || sOutFile.empty() ) { printUsage(std::cerr); gpLogger->error("Input and output files are required"); } if ( sLogFile.compare("__NONE__") == 0 ) { sLogFile = (sOutFile + ".log"); } gpLogger->open(sLogFile.c_str(), bVerbose); if ( ( bCheckSQ ) && ( sFasta.empty() ) ) { printUsage(std::cerr); gpLogger->error("--checkSQ option must be used with --fasta option"); } // check whether each header line starts with a correct tag checkHeaderStarts(vsHDHeaders, "@HD\t"); checkHeaderStarts(vsRGHeaders, "@RG\t"); checkHeaderStarts(vsPGHeaders, "@PG\t"); gpLogger->write_log("Arguments in effect:"); gpLogger->write_log("\t--in [%s]",sInFile.c_str()); gpLogger->write_log("\t--out [%s]",sOutFile.c_str()); gpLogger->write_log("\t--log [%s]",sLogFile.c_str()); gpLogger->write_log("\t--fasta [%s]",sFasta.c_str()); gpLogger->write_log("\t--AS [%s]",sAS.c_str()); gpLogger->write_log("\t--UR [%s]",sUR.c_str()); gpLogger->write_log("\t--SP [%s]",sSP.c_str()); gpLogger->write_log("\t--checkSQ [%s]",bClear ? "ON" : "OFF" ); if ( vsHDHeaders.empty() ) { gpLogger->write_log("\t--HD []"); } else { gpLogger->write_log("\t--HD [%s]",vsHDHeaders[0].c_str()); } if ( vsRGHeaders.empty() ) { gpLogger->write_log("\t--RG []"); } else { gpLogger->write_log("\t--RG [%s]",vsRGHeaders[0].c_str()); } if ( vsPGHeaders.empty() ) { gpLogger->write_log("\t--PG []"); } else { for(uint32_t i=0; i < vsPGHeaders.size(); ++i) { gpLogger->write_log("\t--PG [%s]",vsPGHeaders[i].c_str()); } } if ( (vsHDHeaders.empty() ) && ( vsRGHeaders.empty() ) && ( vsPGHeaders.empty() ) && ( !bClear ) && ( sFasta.empty() ) ) { gpLogger->warning("No option is in effect for modifying BAM files. 
The input and output files will be identical"); } if ( ( vsHDHeaders.size() > 1 ) || ( vsRGHeaders.size() > 1 ) ) { gpLogger->error("HD and RG headers cannot be multiple"); } FastaFile fastaFile; if ( ! sFasta.empty() ) { if ( fastaFile.open(sFasta.c_str()) ) { gpLogger->write_log("Reading the reference file %s",sFasta.c_str()); fastaFile.readThru(); fastaFile.close(); gpLogger->write_log("Finished reading the reference file %s",sFasta.c_str()); } else { gpLogger->error("Failed to open reference file %s",sFasta.c_str()); } } SamFile samIn; SamFile samOut; if ( ! samIn.OpenForRead(sInFile.c_str()) ) { gpLogger->error("Cannot open BAM file %s for reading - %s",sInFile.c_str(), SamStatus::getStatusString(samIn.GetStatus()) ); } if ( ! samOut.OpenForWrite(sOutFile.c_str()) ) { gpLogger->error("Cannot open BAM file %s for writing - %s",sOutFile.c_str(), SamStatus::getStatusString(samOut.GetStatus()) ); } SamFileHeader samHeader; SamHeaderRecord* pSamHeaderRecord; samIn.ReadHeader(samHeader); // check the sanity of SQ file // make sure the SN and LN matches, with the same order if ( bCheckSQ ) { unsigned int numSQ = 0; while( (pSamHeaderRecord = samHeader.getNextHeaderRecord()) != NULL ) { if ( pSamHeaderRecord->getType() == SamHeaderRecord::SQ ) { ++numSQ; } } if ( numSQ != fastaFile.vsSequenceNames.size() ) { gpLogger->error("# of @SQ tags are different from the original BAM and the reference file"); } // iterator over all @SQ objects for(unsigned int i=0; i < numSQ; ++i) { pSamHeaderRecord = samHeader.getSQ(fastaFile.vsSequenceNames[i].c_str()); if ( fastaFile.vsSequenceNames[i].compare(pSamHeaderRecord->getTagValue("SN")) != 0 ) { gpLogger->error("SequenceName is not identical between fasta and input BAM file"); } else if ( static_cast<int>(fastaFile.vnSequenceLengths[i]) != atoi(pSamHeaderRecord->getTagValue("LN")) ) { gpLogger->error("SequenceLength is not identical between fasta and input BAM file"); } else { if ( !sAS.empty() ) 
samHeader.setSQTag("AS",sAS.c_str(),fastaFile.vsSequenceNames[i].c_str()); samHeader.setSQTag("M5",fastaFile.vsMD5sums[i].c_str(),fastaFile.vsSequenceNames[i].c_str()); if ( !sUR.empty() ) samHeader.setSQTag("UR",sUR.c_str(),fastaFile.vsSequenceNames[i].c_str()); if ( !sSP.empty() ) samHeader.setSQTag("SP",sSP.c_str(),fastaFile.vsSequenceNames[i].c_str()); } } gpLogger->write_log("Finished checking the consistency of SQ tags"); } else { gpLogger->write_log("Skipped checking the consistency of SQ tags"); } // go over the headers again, // assuming order of HD, SQ, RG, PG, and put proper tags at the end of the original tags gpLogger->write_log("Creating the header of new output file"); //SamFileHeader outHeader; samHeader.resetHeaderRecordIter(); for(unsigned int i=0; i < vsHDHeaders.size(); ++i) { samHeader.addHeaderLine(vsHDHeaders[i].c_str()); } /* for(int i=0; i < fastaFile.vsSequenceNames.size(); ++i) { std::string s("@SQ\tSN:"); char buf[1024]; s += fastaFile.vsSequenceNames[i]; sprintf(buf,"\tLN:%d",fastaFile.vnSequenceLengths[i]); s += buf; if ( !sAS.empty() ) { sprintf(buf,"\tAS:%s",sAS.c_str()); s += buf; } if ( !sUR.empty() ) { sprintf(buf,"\tUR:%s",sUR.c_str()); s += buf; } sprintf(buf,"\tM5:%s",fastaFile.vsMD5sums[i].c_str()); s += buf; if ( !sSP.empty() ) { sprintf(buf,"\tSP:%s",sSP.c_str()); s += buf; } outHeader.addHeaderLine(s.c_str()); }*/ for(unsigned int i=0; i < vsRGHeaders.size(); ++i) { samHeader.addHeaderLine(vsRGHeaders[i].c_str()); } for(unsigned int i=0; i < vsPGHeaders.size(); ++i) { samHeader.addHeaderLine(vsPGHeaders[i].c_str()); } samOut.WriteHeader(samHeader); gpLogger->write_log("Adding %d HD, %d RG, and %d PG headers",vsHDHeaders.size(), vsRGHeaders.size(), vsPGHeaders.size()); gpLogger->write_log("Finished writing output headers"); // parse RG tag and get RG ID to append std::string sRGID; if ( ! 
vsRGHeaders.empty() ) { std::vector<std::string> tokens; FastaFile::tokenizeString( vsRGHeaders[0].c_str(), tokens ); for(unsigned int i=0; i < tokens.size(); ++i) { if ( tokens[i].find("ID:") == 0 ) { sRGID = tokens[i].substr(3); } } } gpLogger->write_log("Writing output BAM file"); SamRecord samRecord; while (samIn.ReadRecord(samHeader, samRecord) == true) { if ( !sRGID.empty() ) { if ( samRecord.addTag("RG",'Z',sRGID.c_str()) == false ) { gpLogger->error("Failed to add a RG tag %s",sRGID.c_str()); } // temporary code added if ( strncmp(samRecord.getReadName(),"seqcore_",8) == 0 ) { char buf[1024]; sprintf(buf,"UM%s",samRecord.getReadName()+8); samRecord.setReadName(buf); } } samOut.WriteRecord(samHeader, samRecord); //if ( samIn.GetCurrentRecordCount() == 1000 ) break; } samOut.Close(); gpLogger->write_log("Successfully written %d records",samIn.GetCurrentRecordCount()); delete gpLogger; return 0; }
/// Clip overlapping read pairs from an input that is grouped by read name.
///
/// Consecutive records with the same read name are treated as a pair: their
/// overlap information is obtained from myOverlapHandler and the pair is
/// clipped accordingly.  Records whose flag matches any bit in
/// myIntExcludeFlags are never checked for overlaps and are passed straight
/// through to the output.
///
/// @param samIn      input file, already opened and with its header read
///                   into mySamHeader.
/// @param samOutPtr  output file, or NULL if nothing should be written.
/// @return the read status if reading stopped for any reason other than
///         NO_MORE_RECS; otherwise the status of the last failed write, or
///         SUCCESS if every write succeeded.
SamStatus::Status ClipOverlap::handleSortedByReadName(SamFile& samIn,
                                                      SamFile* samOutPtr)
{
    // Set returnStatus to success.  It will be changed
    // to the failure reason if any of the writes fail.
    SamStatus::Status returnStatus = SamStatus::SUCCESS;

    // Only two records are ever live at once: the one being read and the
    // (optional) previous one waiting for its mate.  Keeping both as locals
    // means they are released on every exit path, including when no output
    // file is given or only overlapping pairs are written.
    SamRecord recordA;
    SamRecord recordB;

    // Record currently being read into.
    SamRecord* samRecord = &recordA;
    // Record held back waiting for a possible mate; NULL when there is none.
    // Whenever it is non-NULL it points at the buffer samRecord does not.
    SamRecord* prevSamRecord = NULL;

    // Keep reading records until ReadRecord returns false.
    while(samIn.ReadRecord(mySamHeader, *samRecord))
    {
        const uint16_t flag = samRecord->getFlag();
        if((flag & myIntExcludeFlags) != 0)
        {
            // This read should not be checked for overlaps.

            // If a previous record is pending and has a different read name,
            // it can no longer find a mate, so write it out and release it.
            // If it has the same read name, leave it in case there is
            // another read with that name.
            if((prevSamRecord != NULL) &&
               (strcmp(samRecord->getReadName(),
                       prevSamRecord->getReadName()) != 0))
            {
                if((samOutPtr != NULL) && !myOverlapsOnly)
                {
                    if(!samOutPtr->WriteRecord(mySamHeader, *prevSamRecord))
                    {
                        // Failed to write a record.
                        fprintf(stderr, "%s\n", samOutPtr->GetStatusMessage());
                        returnStatus = samOutPtr->GetStatus();
                    }
                }
                // Clear the previous record info; its buffer becomes spare.
                prevSamRecord = NULL;
            }

            // This record is not being checked for overlaps,
            // so just write it and continue.
            if((samOutPtr != NULL) && !myOverlapsOnly)
            {
                if(!samOutPtr->WriteRecord(mySamHeader, *samRecord))
                {
                    // Failed to write a record.
                    fprintf(stderr, "%s\n", samOutPtr->GetStatusMessage());
                    returnStatus = samOutPtr->GetStatus();
                }
            }
            continue;
        }

        if(prevSamRecord == NULL)
        {
            // Nothing to compare this record to, so hold on to it and
            // read the next record into the other buffer.
            prevSamRecord = samRecord;
            samRecord = (samRecord == &recordA) ? &recordB : &recordA;
            continue;
        }

        // Check if the read name matches the previous read name.
        if(strcmp(samRecord->getReadName(),
                  prevSamRecord->getReadName()) == 0)
        {
            bool overlap = false;
            // Same Read Name, so check clipping.
            OverlapHandler::OverlapInfo prevClipInfo =
                myOverlapHandler->getOverlapInfo(*prevSamRecord);
            OverlapHandler::OverlapInfo curClipInfo =
                myOverlapHandler->getOverlapInfo(*samRecord);

            // If either indicate a complete clipping, clip both.
            if((prevClipInfo == OverlapHandler::NO_OVERLAP_WRONG_ORIENT) ||
               (curClipInfo == OverlapHandler::NO_OVERLAP_WRONG_ORIENT))
            {
                overlap = true;
                myOverlapHandler->handleNoOverlapWrongOrientation(*prevSamRecord);
                // Don't update stats since this is the 2nd in the pair
                myOverlapHandler->handleNoOverlapWrongOrientation(*samRecord,
                                                                  false);
            }
            else if((prevClipInfo == OverlapHandler::OVERLAP) ||
                    (prevClipInfo == OverlapHandler::SAME_START))
            {
                // The previous read starts at or before the current one.
                overlap = true;
                myOverlapHandler->handleOverlapPair(*prevSamRecord,
                                                    *samRecord);
            }
            else if(curClipInfo == OverlapHandler::OVERLAP)
            {
                // The current read starts before the previous one.
                overlap = true;
                myOverlapHandler->handleOverlapPair(*samRecord,
                                                    *prevSamRecord);
            }

            // Found a read pair, so write both records if:
            //   1) output file is specified
            //          AND
            //   2a) all records should be written
            //          OR
            //   2b) the pair overlaps
            if((samOutPtr != NULL) && (!myOverlapsOnly || overlap))
            {
                if(!samOutPtr->WriteRecord(mySamHeader, *prevSamRecord))
                {
                    // Failed to write a record.
                    fprintf(stderr, "%s\n", samOutPtr->GetStatusMessage());
                    returnStatus = samOutPtr->GetStatus();
                }
                if(!samOutPtr->WriteRecord(mySamHeader, *samRecord))
                {
                    // Failed to write a record.
                    fprintf(stderr, "%s\n", samOutPtr->GetStatusMessage());
                    returnStatus = samOutPtr->GetStatus();
                }
            }
            // Setup for the next read with no previous.
            prevSamRecord = NULL;
        }
        else
        {
            // Read name does not match, so write the previous record
            // if we are writing all records.
            if((samOutPtr != NULL) && !myOverlapsOnly)
            {
                if(!samOutPtr->WriteRecord(mySamHeader, *prevSamRecord))
                {
                    // Failed to write a record.
                    fprintf(stderr, "%s\n", samOutPtr->GetStatusMessage());
                    returnStatus = samOutPtr->GetStatus();
                }
            }
            // Store this record as the previous and reuse the old
            // previous record's buffer for the next read.
            SamRecord* recycled = prevSamRecord;
            prevSamRecord = samRecord;
            samRecord = recycled;
        }
    }

    // Write the previous record if there is one.
    if((samOutPtr != NULL) && (prevSamRecord != NULL) && !myOverlapsOnly)
    {
        if(!samOutPtr->WriteRecord(mySamHeader, *prevSamRecord))
        {
            // Failed to write a record.
            fprintf(stderr, "%s\n", samOutPtr->GetStatusMessage());
            returnStatus = samOutPtr->GetStatus();
        }
    }

    // A read failure takes precedence over any write failure.
    if(samIn.GetStatus() != SamStatus::NO_MORE_RECS)
    {
        return(samIn.GetStatus());
    }

    return(returnStatus);
}
int Stats::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String indexFile = ""; bool basic = false; bool noeof = false; bool params = false; bool qual = false; bool phred = false; int maxNumReads = -1; bool unmapped = false; String pBaseQC = ""; String cBaseQC = ""; String regionList = ""; int excludeFlags = 0; int requiredFlags = 0; bool withinRegion = false; int minMapQual = 0; String dbsnp = ""; PosList *dbsnpListPtr = NULL; bool baseSum = false; int bufferSize = PileupHelper::DEFAULT_WINDOW_SIZE; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inFile) LONG_PARAMETER_GROUP("Types of Statistics") LONG_PARAMETER("basic", &basic) LONG_PARAMETER("qual", &qual) LONG_PARAMETER("phred", &phred) LONG_STRINGPARAMETER("pBaseQC", &pBaseQC) LONG_STRINGPARAMETER("cBaseQC", &cBaseQC) LONG_PARAMETER_GROUP("Optional Parameters") LONG_INTPARAMETER("maxNumReads", &maxNumReads) LONG_PARAMETER("unmapped", &unmapped) LONG_STRINGPARAMETER("bamIndex", &indexFile) LONG_STRINGPARAMETER("regionList", ®ionList) LONG_INTPARAMETER("excludeFlags", &excludeFlags) LONG_INTPARAMETER("requiredFlags", &requiredFlags) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PARAMETER_GROUP("Optional phred/qual Only Parameters") LONG_PARAMETER("withinRegion", &withinRegion) LONG_PARAMETER_GROUP("Optional BaseQC Only Parameters") LONG_PARAMETER("baseSum", &baseSum) LONG_INTPARAMETER("bufferSize", &bufferSize) LONG_INTPARAMETER("minMapQual", &minMapQual) LONG_STRINGPARAMETER("dbsnp", &dbsnp) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. 
BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument for stats, " << "but was not specified" << std::endl; return(-1); } // Use the index file if unmapped or regionList is not empty. bool useIndex = (unmapped|| (!regionList.IsEmpty())); // IndexFile is required, so check to see if it has been set. if(useIndex && (indexFile == "")) { // In file was not specified, so set it to the in file // + ".bai" indexFile = inFile + ".bai"; } //////////////////////////////////////// // Setup in case pileup is used. Pileup<PileupElementBaseQCStats> pileup(bufferSize); // Initialize start/end positions. myStartPos = 0; myEndPos = -1; // Open the output qc file if applicable. IFILE baseQCPtr = NULL; if(!pBaseQC.IsEmpty() && !cBaseQC.IsEmpty()) { usage(); inputParameters.Status(); // Cannot specify both types of baseQC. std::cerr << "Cannot specify both --pBaseQC & --cBaseQC." << std::endl; return(-1); } else if(!pBaseQC.IsEmpty()) { baseQCPtr = ifopen(pBaseQC, "w"); PileupElementBaseQCStats::setPercentStats(true); } else if(!cBaseQC.IsEmpty()) { baseQCPtr = ifopen(cBaseQC, "w"); PileupElementBaseQCStats::setPercentStats(false); } if(baseQCPtr != NULL) { PileupElementBaseQCStats::setOutputFile(baseQCPtr); PileupElementBaseQCStats::printHeader(); } if((baseQCPtr != NULL) || baseSum) { PileupElementBaseQCStats::setMapQualFilter(minMapQual); PileupElementBaseQCStats::setBaseSum(baseSum); } if(params) { inputParameters.Status(); } // Open the file for reading. SamFile samIn; if(!samIn.OpenForRead(inFile)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } samIn.SetReadFlags(requiredFlags, excludeFlags); // Set whether or not basic statistics should be generated. samIn.GenerateStatistics(basic); // Read the sam header. 
SamFileHeader samHeader; if(!samIn.ReadHeader(samHeader)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } // Open the bam index file for reading if we are // doing unmapped reads (also set the read section). if(useIndex) { samIn.ReadBamIndex(indexFile); if(unmapped) { samIn.SetReadSection(-1); } if(!regionList.IsEmpty()) { myRegionList = ifopen(regionList, "r"); } } ////////////////////////// // Read dbsnp if specified and doing baseQC if(((baseQCPtr != NULL) || baseSum) && (!dbsnp.IsEmpty())) { // Read the dbsnp file. IFILE fdbSnp; fdbSnp = ifopen(dbsnp,"r"); // Determine how many entries. const SamReferenceInfo& refInfo = samHeader.getReferenceInfo(); int maxRefLen = 0; for(int i = 0; i < refInfo.getNumEntries(); i++) { int refLen = refInfo.getReferenceLength(i); if(refLen >= maxRefLen) { maxRefLen = refLen + 1; } } dbsnpListPtr = new PosList(refInfo.getNumEntries(),maxRefLen); if(fdbSnp==NULL) { std::cerr << "Open dbSNP file " << dbsnp.c_str() << " failed!\n"; } else if(dbsnpListPtr == NULL) { std::cerr << "Failed to init the memory allocation for the dbsnpList.\n"; } else { // Read the dbsnp file. StringArray tokens; String buffer; int position = 0; int refID = 0; // Loop til the end of the file. while (!ifeof(fdbSnp)) { // Read the next line. buffer.ReadLine(fdbSnp); // If it does not have at least 2 columns, // continue to the next line. if (buffer.IsEmpty() || buffer[0] == '#') continue; tokens.AddTokens(buffer); if(tokens.Length() < 2) continue; if(!tokens[1].AsInteger(position)) { std::cerr << "Improperly formatted region line, start position " << "(2nd column) is not an integer: " << tokens[1] << "; Skipping to the next line.\n"; continue; } // Look up the reference name. 
refID = samHeader.getReferenceID(tokens[0]); if(refID != SamReferenceInfo::NO_REF_ID) { // Reference id was found, so add it to the dbsnp dbsnpListPtr->addPosition(refID, position); } tokens.Clear(); buffer.Clear(); } } ifclose(fdbSnp); } // Read the sam records. SamRecord samRecord; int numReads = 0; ////////////////////// // Setup in case doing a quality count. // Quality histogram. const int MAX_QUAL = 126; const int START_QUAL = 33; uint64_t qualCount[MAX_QUAL+1]; for(int i = 0; i <= MAX_QUAL; i++) { qualCount[i] = 0; } const int START_PHRED = 0; const int PHRED_DIFF = START_QUAL - START_PHRED; const int MAX_PHRED = MAX_QUAL - PHRED_DIFF; uint64_t phredCount[MAX_PHRED+1]; for(int i = 0; i <= MAX_PHRED; i++) { phredCount[i] = 0; } int refPos = 0; Cigar* cigarPtr = NULL; char cigarChar = '?'; // Exclude clips from the qual/phred counts if unmapped reads are excluded. bool qualExcludeClips = excludeFlags & SamFlag::UNMAPPED; ////////////////////////////////// // When not reading by sections, getNextSection returns true // the first time, then false the next time. while(getNextSection(samIn)) { // Keep reading records from the file until SamFile::ReadRecord // indicates to stop (returns false). while(((maxNumReads < 0) || (numReads < maxNumReads)) && samIn.ReadRecord(samHeader, samRecord)) { // Another record was read, so increment the number of reads. ++numReads; // See if the quality histogram should be genereated. if(qual || phred) { // Get the quality. const char* qual = samRecord.getQuality(); // Check for no quality ('*'). if((qual[0] == '*') && (qual[1] == 0)) { // This record does not have a quality string, so no // quality processing is necessary. } else { int index = 0; cigarPtr = samRecord.getCigarInfo(); cigarChar = '?'; refPos = samRecord.get0BasedPosition(); if(!qualExcludeClips && (cigarPtr != NULL)) { // Offset the reference position by any soft clips // by subtracting the queryIndex of this start position. 
// refPos is now the start position of the clips. refPos -= cigarPtr->getQueryIndex(0); } while(qual[index] != 0) { // Skip this quality if it is clipped and we are skipping clips. if(cigarPtr != NULL) { cigarChar = cigarPtr->getCigarCharOpFromQueryIndex(index); } if(qualExcludeClips && Cigar::isClip(cigarChar)) { // Skip a clipped quality. ++index; // Increment the position. continue; } if(withinRegion && (myEndPos != -1) && (refPos >= myEndPos)) { // We have hit the end of the region, stop processing this // quality string. break; } if(withinRegion && (refPos < myStartPos)) { // This position is not in the target. ++index; // Update the position if this is found in the reference or a clip. if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar)) { ++refPos; } continue; } // Check for valid quality. if((qual[index] < START_QUAL) || (qual[index] > MAX_QUAL)) { if(qual) { std::cerr << "Invalid Quality found: " << qual[index] << ". Must be between " << START_QUAL << " and " << MAX_QUAL << ".\n"; } if(phred) { std::cerr << "Invalid Phred Quality found: " << qual[index] - PHRED_DIFF << ". Must be between " << START_QUAL << " and " << MAX_QUAL << ".\n"; } // Skip an invalid quality. ++index; // Update the position if this is found in the reference or a clip. if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar)) { ++refPos; } continue; } // Increment the count for this quality. ++(qualCount[(int)(qual[index])]); ++(phredCount[(int)(qual[index]) - PHRED_DIFF]); // Update the position if this is found in the reference or a clip. if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar)) { ++refPos; } ++index; } } } // Check the next thing to do for the read. if((baseQCPtr != NULL) || baseSum) { // Pileup the bases for this read. pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr); } } // Done with a section, move on to the next one. // New section, so flush the pileup. 
pileup.flushPileup(); } // Flush the rest of the pileup. if((baseQCPtr != NULL) || baseSum) { // Pileup the bases. pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr); PileupElementBaseQCStats::printSummary(); ifclose(baseQCPtr); } std::cerr << "Number of records read = " << samIn.GetCurrentRecordCount() << std::endl; if(basic) { std::cerr << std::endl; samIn.PrintStatistics(); } // Print the quality stats. if(qual) { std::cerr << std::endl; std::cerr << "Quality\tCount\n"; for(int i = START_QUAL; i <= MAX_QUAL; i++) { std::cerr << i << "\t" << qualCount[i] << std::endl; } } // Print the phred quality stats. if(phred) { std::cerr << std::endl; std::cerr << "Phred\tCount\n"; for(int i = START_PHRED; i <= MAX_PHRED; i++) { std::cerr << i << "\t" << phredCount[i] << std::endl; } } SamStatus::Status status = samIn.GetStatus(); if(status == SamStatus::NO_MORE_RECS) { // A status of NO_MORE_RECS means that all reads were successful. status = SamStatus::SUCCESS; } return(status); }
// main function int TrimBam::execute(int argc, char ** argv) { SamFile samIn; SamFile samOut; int numTrimBaseL = 0; int numTrimBaseR = 0; bool noeof = false; bool ignoreStrand = false; bool noPhoneHome = false; std::string inName = ""; std::string outName = ""; if ( argc < 5 ) { usage(); std::cerr << "ERROR: Incorrect number of parameters specified\n"; return(-1); } inName = argv[2]; outName = argv[3]; static struct option getopt_long_options[] = { // Input options { "left", required_argument, NULL, 'L'}, { "right", required_argument, NULL, 'R'}, { "ignoreStrand", no_argument, NULL, 'i'}, { "noeof", no_argument, NULL, 'n'}, { "noPhoneHome", no_argument, NULL, 'p'}, { "nophonehome", no_argument, NULL, 'P'}, { "phoneHomeThinning", required_argument, NULL, 't'}, { "phonehomethinning", required_argument, NULL, 'T'}, { NULL, 0, NULL, 0 }, }; int argIndex = 4; if(argv[argIndex][0] != '-') { // This is the number of bases to trim off both sides // so convert to a number. numTrimBaseL = atoi(argv[argIndex]); numTrimBaseR = numTrimBaseL; ++argIndex; } int c = 0; int n_option_index = 0; // Process any additional parameters while ( ( c = getopt_long(argc, argv, "L:R:in", getopt_long_options, &n_option_index) ) != -1 ) { switch(c) { case 'L': numTrimBaseL = atoi(optarg); break; case 'R': numTrimBaseR = atoi(optarg); break; case 'i': ignoreStrand = true; break; case 'n': noeof = true; break; case 'p': case 'P': noPhoneHome = true; break; case 't': case 'T': PhoneHome::allThinning = atoi(optarg); break; default: fprintf(stderr,"ERROR: Unrecognized option %s\n", getopt_long_options[n_option_index].name); return(-1); } } if(!noPhoneHome) { PhoneHome::checkVersion(getProgramName(), VERSION); } if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } if ( ! 
samIn.OpenForRead(inName.c_str()) ) { fprintf(stderr, "***Problem opening %s\n",inName.c_str()); return(-1); } if(!samOut.OpenForWrite(outName.c_str())) { fprintf(stderr, "%s\n", samOut.GetStatusMessage()); return(samOut.GetStatus()); } fprintf(stderr,"Arguments in effect: \n"); fprintf(stderr,"\tInput file : %s\n",inName.c_str()); fprintf(stderr,"\tOutput file : %s\n",outName.c_str()); if(numTrimBaseL == numTrimBaseR) { fprintf(stderr,"\t#Bases to trim from each side : %d\n", numTrimBaseL); } else { fprintf(stderr,"\t#Bases to trim from the left of forward strands : %d\n", numTrimBaseL); fprintf(stderr,"\t#Bases to trim from the right of forward strands: %d\n", numTrimBaseR); if(!ignoreStrand) { // By default, reverse strands are treated the opposite. fprintf(stderr,"\t#Bases to trim from the left of reverse strands : %d\n", numTrimBaseR); fprintf(stderr,"\t#Bases to trim from the right of reverse strands : %d\n", numTrimBaseL); } else { // ignore strand, treating forward & reverse strands the same fprintf(stderr,"\t#Bases to trim from the left of reverse strands : %d\n", numTrimBaseL); fprintf(stderr,"\t#Bases to trim from the right of reverse strands : %d\n", numTrimBaseR); } } // Read the sam header. SamFileHeader samHeader; if(!samIn.ReadHeader(samHeader)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } // Write the sam header. if(!samOut.WriteHeader(samHeader)) { fprintf(stderr, "%s\n", samOut.GetStatusMessage()); return(samOut.GetStatus()); } SamRecord samRecord; char seq[65536]; char qual[65536]; int i, len; // Keep reading records until ReadRecord returns false. while(samIn.ReadRecord(samHeader, samRecord)) { // Successfully read a record from the file, so write it. strcpy(seq,samRecord.getSequence()); strcpy(qual,samRecord.getQuality()); // Number of bases to trim from the left/right, // set based on ignoreStrand flag and strand info. 
int trimLeft = numTrimBaseL; int trimRight = numTrimBaseR; if(!ignoreStrand) { if(SamFlag::isReverse(samRecord.getFlag())) { // We are reversing the reverse reads, // so swap the left & right trim counts. trimRight = numTrimBaseL; trimLeft = numTrimBaseR; } } len = strlen(seq); // Do not trim if sequence is '*' if ( strcmp(seq, "*") != 0 ) { bool qualValue = true; if(strcmp(qual, "*") == 0) { qualValue = false; } int qualLen = strlen(qual); if ( (qualLen != len) && qualValue ) { fprintf(stderr,"ERROR: Sequence and Quality have different length\n"); return(-1); } if ( len < (trimLeft + trimRight) ) { // Read Length is less than the total number of bases to trim, // so trim the entire read. for(i=0; i < len; ++i) { seq[i] = 'N'; if ( qualValue ) { qual[i] = '!'; } } } else { // Read Length is larger than the total number of bases to trim, // so trim from the left, then from the right. for(i=0; i < trimLeft; ++i) { // Trim the bases from the left. seq[i] = 'N'; if ( qualValue ) { qual[i] = '!'; } } for(i = 0; i < trimRight; i++) { seq[len-i-1] = 'N'; if(qualValue) { qual[len-i-1] = '!'; } } } samRecord.setSequence(seq); samRecord.setQuality(qual); } if(!samOut.WriteRecord(samHeader, samRecord)) { // Failed to write a record. fprintf(stderr, "Failure in writing record %s\n", samOut.GetStatusMessage()); return(-1); } } if(samIn.GetStatus() != SamStatus::NO_MORE_RECS) { // Failed to read a record. fprintf(stderr, "%s\n", samIn.GetStatusMessage()); } std::cerr << std::endl << "Number of records read = " << samIn.GetCurrentRecordCount() << std::endl; std::cerr << "Number of records written = " << samOut.GetCurrentRecordCount() << std::endl; if(samIn.GetStatus() != SamStatus::NO_MORE_RECS) { // Failed reading a record. return(samIn.GetStatus()); } // Since the reads were successful, return the status based samIn.Close(); samOut.Close(); return 0; }
int GapInfo::processFile(const char* inputFileName, const char* outputFileName, const char* refFile, bool detailed, bool checkFirst, bool checkStrand) { // Open the file for reading. SamFile samIn; samIn.OpenForRead(inputFileName); // Read the sam header. SamFileHeader samHeader; samIn.ReadHeader(samHeader); SamRecord samRecord; GenomeSequence* refPtr = NULL; if(strcmp(refFile, "") != 0) { refPtr = new GenomeSequence(refFile); } IFILE outFile = ifopen(outputFileName, "w"); // Map for summary. std::map<int, int> gapInfoMap; // Keep reading records until ReadRecord returns false. while(samIn.ReadRecord(samHeader, samRecord)) { uint16_t samFlags = samRecord.getFlag(); if((!SamFlag::isMapped(samFlags)) || (!SamFlag::isMateMapped(samFlags)) || (!SamFlag::isPaired(samFlags)) || (samFlags & SamFlag::SECONDARY_ALIGNMENT) || (SamFlag::isDuplicate(samFlags)) || (SamFlag::isQCFailure(samFlags))) { // unmapped, mate unmapped, not paired, // not the primary alignment, // duplicate, fails vendor quality check continue; } // No gap info if the chromosome names are different or // are unknown. int32_t refID = samRecord.getReferenceID(); if((refID != samRecord.getMateReferenceID()) || (refID == -1)) { continue; } int32_t readStart = samRecord.get0BasedPosition(); int32_t mateStart = samRecord.get0BasedMatePosition(); // If the mate starts first, then the pair was processed by // the mate. if(mateStart < readStart) { continue; } if((mateStart == readStart) && (SamFlag::isReverse(samFlags))) { // read and mate start at the same position, so // only process the forward strand. continue; } // Process this read pair. int32_t readEnd = samRecord.get0BasedAlignmentEnd(); int32_t gapSize = mateStart - readEnd - 1; if(detailed) { // Output the gap info. ifprintf(outFile, "%s\t%d\t%d", samRecord.getReferenceName(), readEnd+1, gapSize); // Check if it is not the first or if it is not the forward strand. 
if(checkFirst && !SamFlag::isFirstFragment(samFlags)) { ifprintf(outFile, "\tNotFirst"); } if(checkStrand && SamFlag::isReverse(samFlags)) { ifprintf(outFile, "\tReverse"); } ifprintf(outFile, "\n"); } else { // Summary. // Skip reads that are not the forward strand. if(SamFlag::isReverse(samFlags)) { // continue continue; } // Forward. // Check the reference for 'N's. if(refPtr != NULL) { genomeIndex_t chromStartIndex = refPtr->getGenomePosition(samRecord.getReferenceName()); if(chromStartIndex == INVALID_GENOME_INDEX) { // Invalid position, so continue to the next one. continue; } bool skipRead = false; for(int i = readEnd + 1; i < mateStart; i++) { if((*refPtr)[i] == 'N') { // 'N' in the reference, so continue to the next read. skipRead = true; break; } } if(skipRead) { continue; } } // Update the gapInfo. gapInfoMap[gapSize]++; } } if(!detailed) { // Output the summary. ifprintf(outFile, "GapSize\tNumPairs\n"); for(std::map<int,int>::iterator iter = gapInfoMap.begin(); iter != gapInfoMap.end(); iter++) { ifprintf(outFile, "%d\t%d\n", (*iter).first, (*iter).second); } } SamStatus::Status returnStatus = samIn.GetStatus(); if(returnStatus == SamStatus::NO_MORE_RECS) { return(SamStatus::SUCCESS); } return(returnStatus); }