void testAddHeaderAndTagToFile(const char* inputName, const char* outputName) { SamFile inSam, outSam; assert(inSam.OpenForRead(inputName)); assert(outSam.OpenForWrite(outputName)); // Read the SAM Header. SamFileHeader samHeader; assert(inSam.ReadHeader(samHeader)); // Add a header line. assert(samHeader.addHeaderLine("@RG\tID:myID\tSM:mySM") == false); assert(samHeader.addHeaderLine("@RG\tID:myID3\tSM:mySM") == true); // Write Header assert(outSam.WriteHeader(samHeader)); SamRecord samRecord; assert(inSam.ReadRecord(samHeader, samRecord)); // validateRead1(samRecord); // Add two tags. assert(samRecord.addIntTag("XA", 123)); assert(samRecord.addIntTag("XA", 456)); assert(samRecord.addTag("RR", 'Z', "myID1")); assert(samRecord.addTag("RR", 'Z', "myID2")); // Write as Sam. assert(outSam.WriteRecord(samHeader, samRecord)); // TODO, add test to verify it was written correctly. // Read a couple of records to make sure it properly can read them even // if they are bigger than the original. assert(inSam.ReadRecord(samHeader, samRecord)); assert(inSam.ReadRecord(samHeader, samRecord)); // Check the MD tag, which requires the reference. GenomeSequence reference("testFiles/chr1_partial.fa"); assert(SamTags::isMDTagCorrect(samRecord, reference) == false); String newMDTag; SamTags::createMDTag(newMDTag, samRecord, reference); assert(newMDTag == "2T1N0"); assert(SamTags::updateMDTag(samRecord, reference)); // Write as Sam. assert(outSam.WriteRecord(samHeader, samRecord)); }
int main(int argc, char ** argv) { gpLogger = new Logger; static struct option getopt_long_options[] = { // Input options { "fasta", required_argument, NULL, 'f'}, { "in", required_argument, NULL, 'i'}, { "out", required_argument, NULL, 'o'}, { "verbose", no_argument, NULL, 'v'}, { "log", required_argument, NULL, 'l'}, { "clear", no_argument, NULL, 0}, { "AS", required_argument, NULL, 0}, { "UR", required_argument, NULL, 0}, { "SP", required_argument, NULL, 0}, { "HD", required_argument, NULL, 0}, { "RG", required_argument, NULL, 0}, { "PG", required_argument, NULL, 0}, { "checkSQ", no_argument, NULL, 0}, { NULL, 0, NULL, 0 }, }; int n_option_index = 0, c; std::string sAS, sUR, sSP, sFasta, sInFile, sOutFile, sLogFile; bool bClear, bCheckSQ, bVerbose; std::vector<std::string> vsHDHeaders, vsRGHeaders, vsPGHeaders; bCheckSQ = bVerbose = false; bClear = true; while ( (c = getopt_long(argc, argv, "vf:i:o:l:", getopt_long_options, &n_option_index)) != -1 ) { // std::cout << getopt_long_options[n_option_index].name << "\t" << optarg << std::endl; if ( c == 'f' ) { sFasta = optarg; } else if ( c == 'i' ) { sInFile = optarg; } else if ( c == 'o' ) { sOutFile = optarg; } else if ( c == 'v' ) { bVerbose = true; } else if ( c == 'l' ) { sLogFile = optarg; } else if ( strcmp(getopt_long_options[n_option_index].name,"AS") == 0 ) { sAS = optarg; } else if ( strcmp(getopt_long_options[n_option_index].name,"UR") == 0 ) { sUR = optarg; } else if ( strcmp(getopt_long_options[n_option_index].name,"SP") == 0 ) { sSP = optarg; } else if ( strcmp(getopt_long_options[n_option_index].name,"HD") == 0 ) { vsHDHeaders.push_back(optarg); } else if ( strcmp(getopt_long_options[n_option_index].name,"RG") == 0 ) { vsRGHeaders.push_back(optarg); } else if ( strcmp(getopt_long_options[n_option_index].name,"PG") == 0 ) { vsPGHeaders.push_back(optarg); } else if ( strcmp(getopt_long_options[n_option_index].name,"checkSQ") == 0 ) { bCheckSQ = true; } else { std::cerr << "Error: Unrecognized option " << getopt_long_options[n_option_index].name << std::endl; abort(); } } if ( optind < argc ) { printUsage(std::cerr); gpLogger->error("non-option argument %s exist ",argv[optind]); } if ( sInFile.empty() || sOutFile.empty() ) { printUsage(std::cerr); gpLogger->error("Input and output files are required"); } if ( sLogFile.compare("__NONE__") == 0 ) { sLogFile = (sOutFile + ".log"); } gpLogger->open(sLogFile.c_str(), bVerbose); if ( ( bCheckSQ ) && ( sFasta.empty() ) ) { printUsage(std::cerr); gpLogger->error("--checkSQ option must be used with --fasta option"); } // check whether each header line starts with a correct tag checkHeaderStarts(vsHDHeaders, "@HD\t"); checkHeaderStarts(vsRGHeaders, "@RG\t"); checkHeaderStarts(vsPGHeaders, "@PG\t"); gpLogger->write_log("Arguments in effect:"); gpLogger->write_log("\t--in [%s]",sInFile.c_str()); gpLogger->write_log("\t--out [%s]",sOutFile.c_str()); gpLogger->write_log("\t--log [%s]",sLogFile.c_str()); gpLogger->write_log("\t--fasta [%s]",sFasta.c_str()); gpLogger->write_log("\t--AS [%s]",sAS.c_str()); gpLogger->write_log("\t--UR [%s]",sUR.c_str()); gpLogger->write_log("\t--SP [%s]",sSP.c_str()); gpLogger->write_log("\t--checkSQ [%s]",bClear ? "ON" : "OFF" ); if ( vsHDHeaders.empty() ) { gpLogger->write_log("\t--HD []"); } else { gpLogger->write_log("\t--HD [%s]",vsHDHeaders[0].c_str()); } if ( vsRGHeaders.empty() ) { gpLogger->write_log("\t--RG []"); } else { gpLogger->write_log("\t--RG [%s]",vsRGHeaders[0].c_str()); } if ( vsPGHeaders.empty() ) { gpLogger->write_log("\t--PG []"); } else { for(uint32_t i=0; i < vsPGHeaders.size(); ++i) { gpLogger->write_log("\t--PG [%s]",vsPGHeaders[i].c_str()); } } if ( (vsHDHeaders.empty() ) && ( vsRGHeaders.empty() ) && ( vsPGHeaders.empty() ) && ( !bClear ) && ( sFasta.empty() ) ) { gpLogger->warning("No option is in effect for modifying BAM files. The input and output files will be identical"); } if ( ( vsHDHeaders.size() > 1 ) || ( vsRGHeaders.size() > 1 ) ) { gpLogger->error("HD and RG headers cannot be multiple"); } FastaFile fastaFile; if ( ! sFasta.empty() ) { if ( fastaFile.open(sFasta.c_str()) ) { gpLogger->write_log("Reading the reference file %s",sFasta.c_str()); fastaFile.readThru(); fastaFile.close(); gpLogger->write_log("Finished reading the reference file %s",sFasta.c_str()); } else { gpLogger->error("Failed to open reference file %s",sFasta.c_str()); } } SamFile samIn; SamFile samOut; if ( ! samIn.OpenForRead(sInFile.c_str()) ) { gpLogger->error("Cannot open BAM file %s for reading - %s",sInFile.c_str(), SamStatus::getStatusString(samIn.GetStatus()) ); } if ( ! samOut.OpenForWrite(sOutFile.c_str()) ) { gpLogger->error("Cannot open BAM file %s for writing - %s",sOutFile.c_str(), SamStatus::getStatusString(samOut.GetStatus()) ); } SamFileHeader samHeader; SamHeaderRecord* pSamHeaderRecord; samIn.ReadHeader(samHeader); // check the sanity of SQ file // make sure the SN and LN matches, with the same order if ( bCheckSQ ) { unsigned int numSQ = 0; while( (pSamHeaderRecord = samHeader.getNextHeaderRecord()) != NULL ) { if ( pSamHeaderRecord->getType() == SamHeaderRecord::SQ ) { ++numSQ; } } if ( numSQ != fastaFile.vsSequenceNames.size() ) { gpLogger->error("# of @SQ tags are different from the original BAM and the reference file"); } // iterator over all @SQ objects for(unsigned int i=0; i < numSQ; ++i) { pSamHeaderRecord = samHeader.getSQ(fastaFile.vsSequenceNames[i].c_str()); if ( fastaFile.vsSequenceNames[i].compare(pSamHeaderRecord->getTagValue("SN")) != 0 ) { gpLogger->error("SequenceName is not identical between fasta and input BAM file"); } else if ( static_cast<int>(fastaFile.vnSequenceLengths[i]) != atoi(pSamHeaderRecord->getTagValue("LN")) ) { gpLogger->error("SequenceLength is not identical between fasta and input BAM file"); } else { if ( !sAS.empty() ) samHeader.setSQTag("AS",sAS.c_str(),fastaFile.vsSequenceNames[i].c_str()); samHeader.setSQTag("M5",fastaFile.vsMD5sums[i].c_str(),fastaFile.vsSequenceNames[i].c_str()); if ( !sUR.empty() ) samHeader.setSQTag("UR",sUR.c_str(),fastaFile.vsSequenceNames[i].c_str()); if ( !sSP.empty() ) samHeader.setSQTag("SP",sSP.c_str(),fastaFile.vsSequenceNames[i].c_str()); } } gpLogger->write_log("Finished checking the consistency of SQ tags"); } else { gpLogger->write_log("Skipped checking the consistency of SQ tags"); } // go over the headers again, // assuming order of HD, SQ, RG, PG, and put proper tags at the end of the original tags gpLogger->write_log("Creating the header of new output file"); //SamFileHeader outHeader; samHeader.resetHeaderRecordIter(); for(unsigned int i=0; i < vsHDHeaders.size(); ++i) { samHeader.addHeaderLine(vsHDHeaders[i].c_str()); } /* for(int i=0; i < fastaFile.vsSequenceNames.size(); ++i) { std::string s("@SQ\tSN:"); char buf[1024]; s += fastaFile.vsSequenceNames[i]; sprintf(buf,"\tLN:%d",fastaFile.vnSequenceLengths[i]); s += buf; if ( !sAS.empty() ) { sprintf(buf,"\tAS:%s",sAS.c_str()); s += buf; } if ( !sUR.empty() ) { sprintf(buf,"\tUR:%s",sUR.c_str()); s += buf; } sprintf(buf,"\tM5:%s",fastaFile.vsMD5sums[i].c_str()); s += buf; if ( !sSP.empty() ) { sprintf(buf,"\tSP:%s",sSP.c_str()); s += buf; } outHeader.addHeaderLine(s.c_str()); }*/ for(unsigned int i=0; i < vsRGHeaders.size(); ++i) { samHeader.addHeaderLine(vsRGHeaders[i].c_str()); } for(unsigned int i=0; i < vsPGHeaders.size(); ++i) { samHeader.addHeaderLine(vsPGHeaders[i].c_str()); } samOut.WriteHeader(samHeader); gpLogger->write_log("Adding %d HD, %d RG, and %d PG headers",vsHDHeaders.size(), vsRGHeaders.size(), vsPGHeaders.size()); gpLogger->write_log("Finished writing output headers"); // parse RG tag and get RG ID to append std::string sRGID; if ( ! vsRGHeaders.empty() ) { std::vector<std::string> tokens; FastaFile::tokenizeString( vsRGHeaders[0].c_str(), tokens ); for(unsigned int i=0; i < tokens.size(); ++i) { if ( tokens[i].find("ID:") == 0 ) { sRGID = tokens[i].substr(3); } } } gpLogger->write_log("Writing output BAM file"); SamRecord samRecord; while (samIn.ReadRecord(samHeader, samRecord) == true) { if ( !sRGID.empty() ) { if ( samRecord.addTag("RG",'Z',sRGID.c_str()) == false ) { gpLogger->error("Failed to add a RG tag %s",sRGID.c_str()); } // temporary code added if ( strncmp(samRecord.getReadName(),"seqcore_",8) == 0 ) { char buf[1024]; sprintf(buf,"UM%s",samRecord.getReadName()+8); samRecord.setReadName(buf); } } samOut.WriteRecord(samHeader, samRecord); //if ( samIn.GetCurrentRecordCount() == 1000 ) break; } samOut.Close(); gpLogger->write_log("Successfully written %d records",samIn.GetCurrentRecordCount()); delete gpLogger; return 0; }
// add readgroup header line to the SamFileHeader void addReadGroupToHeader(SamFileHeader& header, ReadGroup& rg) { if ( !header.addHeaderLine(rg.s_header_line.c_str()) ) { Logger::gLogger->error("Failed to add ID = %s, header line %s",rg.s_id.c_str(),rg.s_header_line.c_str()); } }