void testReadBam() { SamFile inSam; assert(inSam.OpenForRead("testFiles/testBam.bam")); // Call generic test which since the sam and bam are identical, should // contain the same results. testRead(inSam); inSam.Close(); testFlagRead("testFiles/testBam.bam"); }
// Reads every record of bamFile and accumulates per-base depth (capped at
// MAXDP) for reads that are NOT inside the configured regions.
//
// bamFile        path of the SAM/BAM file to read.
// minMapQuality  records with a mapping quality below this value are skipped.
//
// Counters updated: nReads (every record), nUnMapped (records with the
// UNMAPPED flag), nMappedOutTargets (records that reached the depth update).
void GenomeRegionSeqStats::CalcClusters(String &bamFile, int minMapQuality)
{
    SamFile sam;
    SamRecord samRecord;
    SamFileHeader samHeader;

    if(!sam.OpenForRead(bamFile.c_str()))
        error("Open BAM file %s failed!\n", bamFile.c_str());

    if(!sam.ReadHeader(samHeader))
    {
        error("Read BAM file header %s failed!\n", bamFile.c_str());
    }

    // Lazily size the depth track to the whole reference on first use.
    if(depth.size()==0)
        depth.resize(referencegenome.sequenceLength());

    String contigLabel;
    uint32_t start;
    uint32_t gstart;

    Reset();
    while(sam.ReadRecord(samHeader, samRecord))
    {
        nReads++;
        if(samRecord.getFlag() & SamFlag::UNMAPPED)
            nUnMapped++;

        if(samRecord.getMapQuality() < minMapQuality)
            continue;

        CigarRoller cigar(samRecord.getCigar());

        // Length of a leading soft clip, if the CIGAR starts with one.
        int nonClipSequence = 0;
        if(cigar.size()!=0 && cigar[0].operation==Cigar::softClip)
            nonClipSequence = cigar[0].count;

        contigLabel = samRecord.getReferenceName();
        start = nonClipSequence + samRecord.get0BasedPosition();  // start is 0-based

        gstart = referencegenome.getGenomePosition(contigLabel.c_str(), start);

        if(IsInRegions(contigLabel, start, start+samRecord.getReadLength()))
            continue;

        // Never index past the end of the depth track: a read that runs off
        // the end of the reference would otherwise write out of bounds.
        const uint32_t gend = gstart + samRecord.getReadLength();
        for(uint32_t i=gstart; i<gend && i<depth.size(); i++)
        {
            if(depth[i]<MAXDP)
                depth[i]++;
        }
        nMappedOutTargets++;
    }
}
// Closes the output file and the log file if they are open, writing the run
// statistics and a "Finished ..." line to the log first, and echoes the
// statistics to the info stream when it is enabled.
//
// success  selects the wording of the "Finished" line in the log.
// Returns true unconditionally.
bool BamProcessor::finalize (bool success)
{
    const bool output_open = outfile_.IsOpen ();
    if (output_open)
    {
        trclog << "Closing output file" << std::endl;
        outfile_.Close ();
    }

    const bool log_open = logfile_.is_open ();
    if (log_open)
    {
        time_t finish_time = time (NULL);
        const char* outcome = success ? "successfully" : "due to error";

        print_stats (logfile_);
        // The timestamp is formatted only after print_stats has run, exactly
        // as in the single-statement form of this write.
        logfile_ << "\nFinished " << outcome;
        logfile_ << " at " << asctime (localtime (&finish_time)) << "\n";

        trclog << "Closing log file" << std::endl;
        logfile_.close ();
    }

    if (info.enabled ())
    {
        print_stats (info.o_);
    }

    return true;
}
void modify::modifyTags() { assert(samIn.OpenForRead(myFilename.c_str())); // Read the sam header. assert(samIn.ReadHeader(samHeader)); SamFile samOut; SamFile bamOut; std::string inputType = myFilename.substr(myFilename.find_last_of('.')); std::string outFileBase = "results/updateTagFrom"; if(inputType == ".bam") { outFileBase += "Bam"; } else { outFileBase += "Sam"; } std::string outFile = outFileBase + ".sam"; assert(samOut.OpenForWrite(outFile.c_str())); outFile = outFileBase + ".bam"; assert(bamOut.OpenForWrite(outFile.c_str())); assert(samOut.WriteHeader(samHeader)); assert(bamOut.WriteHeader(samHeader)); int count = 0; // Read the records. while(samIn.ReadRecord(samHeader, samRecord)) { if(count == 0) { assert(samRecord.rmTag("MD", 'Z')); } else if(count == 2) { assert(samRecord.rmTags("XT:A;MD:Z;AB:c;NM:i")); } else if(count == 4) { assert(samRecord.rmTags("MD:Z,AB:c,NM:i")); } assert(bamOut.WriteRecord(samHeader, samRecord)); assert(samOut.WriteRecord(samHeader, samRecord)); ++count; } }
// Streams every record of bamFile through UpdateRegionStats(), then computes
// the per-region and per-group GC-bin read counts and prints the total number
// of reads seen.
//
// bamFile  path of the SAM/BAM file to read.
//
// Counters updated: nReads (every record), nUnMapped (records with the
// UNMAPPED flag), nMapped2Targets (records for which UpdateRegionStats
// returned true).
void GenomeRegionSeqStats::CalcRegionStats(String &bamFile)
{
    SamFile sam;
    SamRecord samRecord;
    SamFileHeader samHeader;

    const bool opened = sam.OpenForRead(bamFile.c_str());
    if(!opened)
        error("Open BAM file %s failed!\n", bamFile.c_str());

    const bool headerRead = sam.ReadHeader(samHeader);
    if(!headerRead)
        error("Read BAM file header %s failed!\n", bamFile.c_str());

    String contigLabel;
    int start;
    int end;

    Reset();
    while(sam.ReadRecord(samHeader, samRecord))
    {
        ++nReads;
        if(samRecord.getFlag() & SamFlag::UNMAPPED)
            ++nUnMapped;

        // Only update region statistics while some contig is still
        // unfinished; the record is counted either way.
        if(contigFinishedCnt < contigs.size())
        {
            CigarRoller cigar(samRecord.getCigar());

            // Length of a leading soft clip, if the CIGAR starts with one.
            int nonClipSequence = 0;
            const bool startsWithSoftClip =
                (cigar.size() != 0) && (cigar[0].operation == Cigar::softClip);
            if(startsWithSoftClip)
                nonClipSequence = cigar[0].count;

            contigLabel = samRecord.getReferenceName();
            start = nonClipSequence + samRecord.get0BasedPosition();  // 0-based
            end = start + samRecord.getReadLength() - 1;              // inclusive

            if(UpdateRegionStats(contigLabel, start, end))
                ++nMapped2Targets;
        }
    }

    CalcRegionReadCountInGCBins();
    CalcGroupReadCountInGCBins();
    std::cout << "Total reads : " << nReads << std::endl;
}
// Reads the next record from samIn into *recordPtr, obtaining a record from
// myPool first when *recordPtr is NULL.
//
// samIn            file to read from.
// recordPtr        in/out: record to fill; set to a pool record when NULL.
// mateMap          passed to forceRecordFlush() when the pool is exhausted.
// outputBufferPtr  passed to forceRecordFlush() when the pool is exhausted.
//
// Returns SamStatus::SUCCESS when a record was read, SamStatus::FAIL_IO when
// the forced flush failed, SamStatus::FAIL_MEM when no record could be
// obtained even after flushing, and samIn's status when the read failed.
SamStatus::Status ClipOverlap::readCoordRecord(SamFile& samIn,
                                               SamRecord** recordPtr,
                                               MateMapByCoord& mateMap,
                                               SamCoordOutput* outputBufferPtr)
{
    SamRecord*& record = *recordPtr;

    // No record supplied, so take one from the pool.
    if(record == NULL)
    {
        record = myPool.getRecord();
    }

    // Still nothing: the pool could not hand out a record. Flush records
    // held by the mate map and try once more.
    if(record == NULL)
    {
        const bool flushed = forceRecordFlush(mateMap, outputBufferPtr);
        if(!flushed)
        {
            std::cerr << "Failed to flush the output buffer.\n";
            return(SamStatus::FAIL_IO);
        }

        record = myPool.getRecord();
        if(record == NULL)
        {
            std::cerr << "Failed to allocate any records.\n";
            return(SamStatus::FAIL_MEM);
        }
    }

    // A record is available; fill it from the file.
    if(samIn.ReadRecord(mySamHeader, *record))
    {
        return(SamStatus::SUCCESS);
    }

    // Nothing to process, so report why.
    return(samIn.GetStatus());
}
bool BamProcessor::process () { if (!infile_.ReadHeader (sam_header_)) ers << "Unable to read SAM header" << Throw; else info << "Header read" << std::endl; if (outfile_.IsOpen ()) { if (!outfile_.WriteHeader (sam_header_)) ers << "Unable to write header data" << Throw; else info << "Header written" << std::endl; } // set up signal handlers sighandler_t sighandler_int, sighandler_term, sighandler_hup; // set INT handler to int_handler if interrupting is not disabled allready if ((sighandler_int = signal (SIGINT, int_handler)) == SIG_IGN) signal (SIGINT, SIG_IGN), sighandler_int = NULL; // set HUP handler to nothing sighandler_hup = signal (SIGHUP, SIG_IGN); // set TERM handler to int_handler if terminating is not disabled allready if ((sighandler_term = signal (SIGTERM, int_handler)) == SIG_IGN) signal (SIGTERM, SIG_IGN), sighandler_term = NULL; begtime_ = time (NULL); while (!infile_.IsEOF () && !interrupted) { if (limit_ && proc_cnt_ >= limit_) { info << limit_ << " records processed. Limit reached." 
<< std::endl; break; } if (read_cnt_ == skip_) timer_.mark (); infile_.ReadRecord (sam_header_, rec_); ++ read_cnt_; if (read_cnt_-1 >= skip_) { if (!processRecord ()) ++ fail_cnt_; ++ proc_cnt_; if (outfile_.IsOpen ()) outfile_.WriteRecord (sam_header_, rec_); } if (timer_ ()) { info << "\r" << read_cnt_; if (proc_cnt_ != read_cnt_) info << " rd " << proc_cnt_; info << " pr "; if (realigned_cnt_ != proc_cnt_) info << realigned_cnt_ << " al (" << (double (realigned_cnt_) * 100 / proc_cnt_) << "%) "; info << modified_cnt_ << " mod (" << (double (modified_cnt_) * 100 / proc_cnt_) << "%) "; if (pos_adjusted_cnt_) info << pos_adjusted_cnt_ << " sh (" << (double (pos_adjusted_cnt_) * 100 / modified_cnt_) << "% mod) "; info << "in " << timer_.tot_elapsed () << " sec (" << std::setprecision (3) << std::fixed << timer_.speed () << " r/s)" << std::flush; } } if (interrupted) { errlog << "\nProcessing interrupted by "; switch (signal_received) { case SIGTERM: errlog << "TERM signal"; break; case SIGINT: errlog << "user's request"; break; default: errlog << "receipt of signal " << signal_received; } errlog << std::endl; } // restore signal handlers if (sighandler_term) signal (SIGTERM, sighandler_term); if (sighandler_int) signal (SIGINT, sighandler_int); if (sighandler_hup) signal (SIGHUP, sighandler_hup); return 0; }
/*
  If a discordant read has its mate mapped inside an MEI coordinate range,
  add the read to a candidate site cluster for its chromosome.
  Properly paired reads, pairs with one end unmapped, supplementary
  alignments, short reads and low-quality reads are skipped.
  All MEI types are checked at the same time.

  si  per-sample information: average read length, average insert size,
      depth and BAM file name are read from it.
*/
void Sites::AddDiscoverSiteFromSingleBam( SingleSampleInfo & si )
{
    // Cache the per-sample statistics used by the filters below.
    avr_read_length = si.avr_read_length;
    avr_ins_size = si.avr_ins_size;
    min_read_length = avr_read_length / 3;
    current_depth = si.depth;
    resetDepthAddFlag();

    SamFile bam;
    SamFileHeader bam_header;
    OpenBamAndBai( bam, bam_header, si.bam_name );

    for( int chr_idx = 0; chr_idx < pchr_list->size(); chr_idx++ ) {
        string chr = (*pchr_list)[chr_idx];

        // Make sure an (empty) site map exists for this chromosome.
        if ( siteList.find(chr) == siteList.end() )
            siteList[chr].clear();

        // Restrict reading to this chromosome, or to the configured range
        // within it when range_start is non-negative.
        const bool whole_chr = ( range_start < 0 );
        bool section_status;
        if ( whole_chr )
            section_status = bam.SetReadSection( chr.c_str() );
        else
            section_status = bam.SetReadSection( chr.c_str(), range_start, range_end );
        if ( !section_status ) {
            string str = "Cannot set read section at chr " + chr;
            if ( !whole_chr )
                str += " " + std::to_string(range_start) + "-" + std::to_string(range_end);
            morphWarning( str );
        }

        // DO ADDING
        pnearest = siteList[chr].begin();

        SingleSite new_site;        // temporary cluster. will be added to map later.
        new_site.depth = current_depth;
        bool awaiting_first = true; // true until the first read of a cluster is seen
        int pending_reads = 0;      // #reads seen at/after new_site.end; folded into
                                    // rcount when the cluster is extended
        SamRecord rec;
        while( bam.ReadRecord( bam_header, rec ) ) {
            // Every read seen while a cluster is open is counted, either as
            // inside the cluster or as pending beyond its current end.
            if ( !awaiting_first ) {
                if ( rec.get1BasedPosition() >= new_site.end )
                    pending_reads++;
                else
                    new_site.rcount++;
            }

            // Filters: proper pair (flag 0x2), one end unmapped, supplementary.
            if ( (rec.getFlag() & 0x2) || OneEndUnmap( rec.getFlag() ) || IsSupplementary( rec.getFlag() ) )
                continue;
            // Filters: read too short, mapping quality too low.
            if ( rec.getReadLength() < min_read_length || rec.getMapQuality() < MIN_QUALITY )
                continue;
            // Filter: mate on the same chromosome with a small insert size.
            if ( chr.compare(rec.getMateReferenceName()) == 0 && rec.getInsertSize() < abs(avr_ins_size*2) )
                continue;

            // Check, per MEI type, whether the mate lies within an MEI coordinate range.
            bool is_mei = false;
            vector<bool> is_in_coord( 3, false );
            for( int m = 0; m < NMEI; m++ ) {
                map<string, map<int, bool> >::iterator coord_chr_ptr = meiCoord[m].find( rec.getMateReferenceName() );
                bool within = false;
                if ( coord_chr_ptr != meiCoord[m].end() )
                    within = isWithinCoord( rec.get1BasedMatePosition(), coord_chr_ptr->second );
                is_in_coord[m] = within;
                if ( within )
                    is_mei = true;
            }
            if ( !is_mei )
                continue;

            if ( awaiting_first ) {
                // first supporting read: open a cluster
                setNewCluster( is_in_coord, new_site, rec );
                awaiting_first = false;
            }
            else if ( rec.get1BasedPosition() > new_site.end + avr_ins_size ) {
                // too far from the open cluster: store it and open a new one
                addClusterToMap( new_site, siteList[chr] );
                setNewCluster( is_in_coord, new_site, rec );
            }
            else {
                // extend the open cluster and absorb the pending reads
                addToCurrentCluster( is_in_coord, new_site, rec );
                new_site.rcount += pending_reads;
            }
            pending_reads = 0;
        }

        // add last one
        if ( !awaiting_first )
            addClusterToMap( new_site, siteList[chr] );
    }
    bam.Close();
}
void setReadCountInSection( vector<int> & raw_counts, string & chr, int center, SamFile & samIn, SamFileHeader & samHeader, vector<RefSeq*> & REF_SEQ ) { int st = center - WIN/2; int ed = center + WIN/2; bool section_status = samIn.SetReadSection( chr.c_str(), st, ed ); if (!section_status) { std::cerr << "Warning: Unable to set read section: " << chr << ": " << st << "-" << ed << ". Set section depth = 0!" << std::endl; return; } // proper reads map<string, vector< DiscPair > > disc_recs; // record where the disc come from ProperDeck pDeck( REF_SEQ ); SamRecord sam_rec; while( samIn.ReadRecord(samHeader, sam_rec) ) { bool pass_qc = PassQC( sam_rec ); if ( !pass_qc ) continue; if ( sam_rec.getFlag() & 0x2 ) { if ( sam_rec.getInsertSize() > 0 ) pDeck.Add( sam_rec ); else { RetrievedIndex rv = pDeck.RetrieveIndex( sam_rec ); int index = getRetrievedIndexToRawCounts( rv ); if (index >= 0) { // for read partially in window, only add clip if ( sam_rec.get1BasedPosition() < st || sam_rec.get1BasedAlignmentEnd() > ed ) { if ( index >= 2 ) raw_counts[ index ]++; } else raw_counts[ index ]++; } } } else { // disc: rec info and wait to reset section later if ( sam_rec.getReadLength() < 30 ) continue; string mate_chr = sam_rec.getMateReferenceName(); // check if this one is valid as anchor DiscPair new_pair( 1, 0, sam_rec, REF_SEQ ); disc_recs[mate_chr].push_back( new_pair ); } } // disc reads for( map<string, vector< DiscPair > >::iterator chr_it = disc_recs.begin(); chr_it != disc_recs.end(); chr_it++ ) { for( vector< DiscPair >::iterator dp_it = chr_it->second.begin(); dp_it != chr_it->second.end(); dp_it++ ) { bool section_status = samIn.SetReadSection( chr_it->first.c_str(), dp_it->GetSecondAlignPosition(), dp_it->GetSecondAlignPosition() + WIN ); if (!section_status) { std::cerr << "ERROR: Unable to set read section: " << chr << ": " << st << "-" << ed << std::endl; exit(1); } SamRecord sam_rec; while( samIn.ReadRecord(samHeader, sam_rec) ) { bool pass_qc = PassQC( 
sam_rec ); if ( !pass_qc ) continue; if ( !DiscSamPass(sam_rec) ) continue; int position = sam_rec.get1BasedPosition(); if ( position > dp_it->GetFirstAlignPosition() ) break; if ( sam_rec.getFlag() & 0x2 ) continue; bool same_pair = dp_it->IsSamePair( sam_rec ); if ( !same_pair ) continue; // now add ro raw stats: always use first as anchor dp_it->AddSecondToPair( sam_rec ); Loci loci = dp_it->GetFirstLoci(); if ( dp_it->FirstValid() ) { int index = getLociToRawCounts( loci ); raw_counts[ index ]++; // clear & break break; } } } } }
int Convert::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String outFile = ""; String refFile = ""; bool lshift = false; bool noeof = false; bool params = false; bool useBases = false; bool useEquals = false; bool useOrigSeq = false; bool recover = false; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_STRINGPARAMETER("in", &inFile) LONG_STRINGPARAMETER("out", &outFile) LONG_STRINGPARAMETER("refFile", &refFile) LONG_PARAMETER("lshift", &lshift) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("recover", &recover) LONG_PARAMETER("params", ¶ms) LONG_PARAMETER_GROUP("SequenceConversion") EXCLUSIVE_PARAMETER("useBases", &useBases) EXCLUSIVE_PARAMETER("useEquals", &useEquals) EXCLUSIVE_PARAMETER("useOrigSeq", &useOrigSeq) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { printUsage(std::cerr); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } if(outFile == "") { printUsage(std::cerr); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--out is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } // Check to see if the ref file was specified. // Open the reference. 
GenomeSequence* refPtr = NULL; if(refFile != "") { refPtr = new GenomeSequence(refFile); } SamRecord::SequenceTranslation translation; if((useBases) && (refPtr != NULL)) { translation = SamRecord::BASES; } else if((useEquals) && (refPtr != NULL)) { translation = SamRecord::EQUAL; } else { useOrigSeq = true; translation = SamRecord::NONE; } if(params) { inputParameters.Status(); } // Open the input file for reading. SamFile samIn; if(recover) samIn.setAttemptRecovery(true); samIn.OpenForRead(inFile); // Open the output file for writing. SamFile samOut; samOut.OpenForWrite(outFile); samOut.SetWriteSequenceTranslation(translation); samOut.SetReference(refPtr); // Read the sam header. SamFileHeader samHeader; samIn.ReadHeader(samHeader); // Write the sam header. samOut.WriteHeader(samHeader); SamRecord samRecord; // Set returnStatus to success. It will be changed // to the failure reason if any of the writes fail. SamStatus::Status returnStatus = SamStatus::SUCCESS; while(1) { try { // Keep reading records until ReadRecord returns false. while(samIn.ReadRecord(samHeader, samRecord)) { // left shift if necessary. if(lshift) { samRecord.shiftIndelsLeft(); } // Successfully read a record from the file, so write it. if(!samOut.WriteRecord(samHeader, samRecord)) { // Failed to write a record. 
fprintf(stderr, "%s\n", samOut.GetStatusMessage()); returnStatus = samOut.GetStatus(); } } break; } catch (std::runtime_error e) { std::cerr << "Caught runtime error: " << e.what() << "\n"; if(!recover) { std::cerr << "Corrupted BAM file detected - consider using --recover option.\n"; break; } std::cerr << "Attempting to resync at next good BGZF block and BAM record.\n"; // XXX need to resync SamFile stream here bool rc = samIn.attemptRecoverySync(checkSignature, SIGNATURE_LENGTH); if(rc) { std::cerr << "Successful resync - some data lost.\n"; continue; // succeeded } std::cerr << "Failed to re-sync on data stream.\n"; break; // failed to resync } } std::cerr << std::endl << "Number of records read = " << samIn.GetCurrentRecordCount() << std::endl; std::cerr << "Number of records written = " << samOut.GetCurrentRecordCount() << std::endl; if(refPtr != NULL) { delete(refPtr); } // Since the reads were successful, return the status based // on the status of the writes. If any failed, return // their failure status. return(returnStatus); }
int main(int argc, char ** argv) { gpLogger = new Logger; static struct option getopt_long_options[] = { // Input options { "fasta", required_argument, NULL, 'f'}, { "in", required_argument, NULL, 'i'}, { "out", required_argument, NULL, 'o'}, { "verbose", no_argument, NULL, 'v'}, { "log", required_argument, NULL, 'l'}, { "clear", no_argument, NULL, 0}, { "AS", required_argument, NULL, 0}, { "UR", required_argument, NULL, 0}, { "SP", required_argument, NULL, 0}, { "HD", required_argument, NULL, 0}, { "RG", required_argument, NULL, 0}, { "PG", required_argument, NULL, 0}, { "checkSQ", no_argument, NULL, 0}, { NULL, 0, NULL, 0 }, }; int n_option_index = 0, c; std::string sAS, sUR, sSP, sFasta, sInFile, sOutFile, sLogFile; bool bClear, bCheckSQ, bVerbose; std::vector<std::string> vsHDHeaders, vsRGHeaders, vsPGHeaders; bCheckSQ = bVerbose = false; bClear = true; while ( (c = getopt_long(argc, argv, "vf:i:o:l:", getopt_long_options, &n_option_index)) != -1 ) { // std::cout << getopt_long_options[n_option_index].name << "\t" << optarg << std::endl; if ( c == 'f' ) { sFasta = optarg; } else if ( c == 'i' ) { sInFile = optarg; } else if ( c == 'o' ) { sOutFile = optarg; } else if ( c == 'v' ) { bVerbose = true; } else if ( c == 'l' ) { sLogFile = optarg; } else if ( strcmp(getopt_long_options[n_option_index].name,"AS") == 0 ) { sAS = optarg; } else if ( strcmp(getopt_long_options[n_option_index].name,"UR") == 0 ) { sUR = optarg; } else if ( strcmp(getopt_long_options[n_option_index].name,"SP") == 0 ) { sSP = optarg; } else if ( strcmp(getopt_long_options[n_option_index].name,"HD") == 0 ) { vsHDHeaders.push_back(optarg); } else if ( strcmp(getopt_long_options[n_option_index].name,"RG") == 0 ) { vsRGHeaders.push_back(optarg); } else if ( strcmp(getopt_long_options[n_option_index].name,"PG") == 0 ) { vsPGHeaders.push_back(optarg); } else if ( strcmp(getopt_long_options[n_option_index].name,"checkSQ") == 0 ) { bCheckSQ = true; } else { std::cerr << "Error: Unrecognized option " 
<< getopt_long_options[n_option_index].name << std::endl; abort(); } } if ( optind < argc ) { printUsage(std::cerr); gpLogger->error("non-option argument %s exist ",argv[optind]); } if ( sInFile.empty() || sOutFile.empty() ) { printUsage(std::cerr); gpLogger->error("Input and output files are required"); } if ( sLogFile.compare("__NONE__") == 0 ) { sLogFile = (sOutFile + ".log"); } gpLogger->open(sLogFile.c_str(), bVerbose); if ( ( bCheckSQ ) && ( sFasta.empty() ) ) { printUsage(std::cerr); gpLogger->error("--checkSQ option must be used with --fasta option"); } // check whether each header line starts with a correct tag checkHeaderStarts(vsHDHeaders, "@HD\t"); checkHeaderStarts(vsRGHeaders, "@RG\t"); checkHeaderStarts(vsPGHeaders, "@PG\t"); gpLogger->write_log("Arguments in effect:"); gpLogger->write_log("\t--in [%s]",sInFile.c_str()); gpLogger->write_log("\t--out [%s]",sOutFile.c_str()); gpLogger->write_log("\t--log [%s]",sLogFile.c_str()); gpLogger->write_log("\t--fasta [%s]",sFasta.c_str()); gpLogger->write_log("\t--AS [%s]",sAS.c_str()); gpLogger->write_log("\t--UR [%s]",sUR.c_str()); gpLogger->write_log("\t--SP [%s]",sSP.c_str()); gpLogger->write_log("\t--checkSQ [%s]",bClear ? "ON" : "OFF" ); if ( vsHDHeaders.empty() ) { gpLogger->write_log("\t--HD []"); } else { gpLogger->write_log("\t--HD [%s]",vsHDHeaders[0].c_str()); } if ( vsRGHeaders.empty() ) { gpLogger->write_log("\t--RG []"); } else { gpLogger->write_log("\t--RG [%s]",vsRGHeaders[0].c_str()); } if ( vsPGHeaders.empty() ) { gpLogger->write_log("\t--PG []"); } else { for(uint32_t i=0; i < vsPGHeaders.size(); ++i) { gpLogger->write_log("\t--PG [%s]",vsPGHeaders[i].c_str()); } } if ( (vsHDHeaders.empty() ) && ( vsRGHeaders.empty() ) && ( vsPGHeaders.empty() ) && ( !bClear ) && ( sFasta.empty() ) ) { gpLogger->warning("No option is in effect for modifying BAM files. 
The input and output files will be identical"); } if ( ( vsHDHeaders.size() > 1 ) || ( vsRGHeaders.size() > 1 ) ) { gpLogger->error("HD and RG headers cannot be multiple"); } FastaFile fastaFile; if ( ! sFasta.empty() ) { if ( fastaFile.open(sFasta.c_str()) ) { gpLogger->write_log("Reading the reference file %s",sFasta.c_str()); fastaFile.readThru(); fastaFile.close(); gpLogger->write_log("Finished reading the reference file %s",sFasta.c_str()); } else { gpLogger->error("Failed to open reference file %s",sFasta.c_str()); } } SamFile samIn; SamFile samOut; if ( ! samIn.OpenForRead(sInFile.c_str()) ) { gpLogger->error("Cannot open BAM file %s for reading - %s",sInFile.c_str(), SamStatus::getStatusString(samIn.GetStatus()) ); } if ( ! samOut.OpenForWrite(sOutFile.c_str()) ) { gpLogger->error("Cannot open BAM file %s for writing - %s",sOutFile.c_str(), SamStatus::getStatusString(samOut.GetStatus()) ); } SamFileHeader samHeader; SamHeaderRecord* pSamHeaderRecord; samIn.ReadHeader(samHeader); // check the sanity of SQ file // make sure the SN and LN matches, with the same order if ( bCheckSQ ) { unsigned int numSQ = 0; while( (pSamHeaderRecord = samHeader.getNextHeaderRecord()) != NULL ) { if ( pSamHeaderRecord->getType() == SamHeaderRecord::SQ ) { ++numSQ; } } if ( numSQ != fastaFile.vsSequenceNames.size() ) { gpLogger->error("# of @SQ tags are different from the original BAM and the reference file"); } // iterator over all @SQ objects for(unsigned int i=0; i < numSQ; ++i) { pSamHeaderRecord = samHeader.getSQ(fastaFile.vsSequenceNames[i].c_str()); if ( fastaFile.vsSequenceNames[i].compare(pSamHeaderRecord->getTagValue("SN")) != 0 ) { gpLogger->error("SequenceName is not identical between fasta and input BAM file"); } else if ( static_cast<int>(fastaFile.vnSequenceLengths[i]) != atoi(pSamHeaderRecord->getTagValue("LN")) ) { gpLogger->error("SequenceLength is not identical between fasta and input BAM file"); } else { if ( !sAS.empty() ) 
samHeader.setSQTag("AS",sAS.c_str(),fastaFile.vsSequenceNames[i].c_str()); samHeader.setSQTag("M5",fastaFile.vsMD5sums[i].c_str(),fastaFile.vsSequenceNames[i].c_str()); if ( !sUR.empty() ) samHeader.setSQTag("UR",sUR.c_str(),fastaFile.vsSequenceNames[i].c_str()); if ( !sSP.empty() ) samHeader.setSQTag("SP",sSP.c_str(),fastaFile.vsSequenceNames[i].c_str()); } } gpLogger->write_log("Finished checking the consistency of SQ tags"); } else { gpLogger->write_log("Skipped checking the consistency of SQ tags"); } // go over the headers again, // assuming order of HD, SQ, RG, PG, and put proper tags at the end of the original tags gpLogger->write_log("Creating the header of new output file"); //SamFileHeader outHeader; samHeader.resetHeaderRecordIter(); for(unsigned int i=0; i < vsHDHeaders.size(); ++i) { samHeader.addHeaderLine(vsHDHeaders[i].c_str()); } /* for(int i=0; i < fastaFile.vsSequenceNames.size(); ++i) { std::string s("@SQ\tSN:"); char buf[1024]; s += fastaFile.vsSequenceNames[i]; sprintf(buf,"\tLN:%d",fastaFile.vnSequenceLengths[i]); s += buf; if ( !sAS.empty() ) { sprintf(buf,"\tAS:%s",sAS.c_str()); s += buf; } if ( !sUR.empty() ) { sprintf(buf,"\tUR:%s",sUR.c_str()); s += buf; } sprintf(buf,"\tM5:%s",fastaFile.vsMD5sums[i].c_str()); s += buf; if ( !sSP.empty() ) { sprintf(buf,"\tSP:%s",sSP.c_str()); s += buf; } outHeader.addHeaderLine(s.c_str()); }*/ for(unsigned int i=0; i < vsRGHeaders.size(); ++i) { samHeader.addHeaderLine(vsRGHeaders[i].c_str()); } for(unsigned int i=0; i < vsPGHeaders.size(); ++i) { samHeader.addHeaderLine(vsPGHeaders[i].c_str()); } samOut.WriteHeader(samHeader); gpLogger->write_log("Adding %d HD, %d RG, and %d PG headers",vsHDHeaders.size(), vsRGHeaders.size(), vsPGHeaders.size()); gpLogger->write_log("Finished writing output headers"); // parse RG tag and get RG ID to append std::string sRGID; if ( ! 
vsRGHeaders.empty() ) { std::vector<std::string> tokens; FastaFile::tokenizeString( vsRGHeaders[0].c_str(), tokens ); for(unsigned int i=0; i < tokens.size(); ++i) { if ( tokens[i].find("ID:") == 0 ) { sRGID = tokens[i].substr(3); } } } gpLogger->write_log("Writing output BAM file"); SamRecord samRecord; while (samIn.ReadRecord(samHeader, samRecord) == true) { if ( !sRGID.empty() ) { if ( samRecord.addTag("RG",'Z',sRGID.c_str()) == false ) { gpLogger->error("Failed to add a RG tag %s",sRGID.c_str()); } // temporary code added if ( strncmp(samRecord.getReadName(),"seqcore_",8) == 0 ) { char buf[1024]; sprintf(buf,"UM%s",samRecord.getReadName()+8); samRecord.setReadName(buf); } } samOut.WriteRecord(samHeader, samRecord); //if ( samIn.GetCurrentRecordCount() == 1000 ) break; } samOut.Close(); gpLogger->write_log("Successfully written %d records",samIn.GetCurrentRecordCount()); delete gpLogger; return 0; }
// main function int TrimBam::execute(int argc, char ** argv) { SamFile samIn; SamFile samOut; int numTrimBaseL = 0; int numTrimBaseR = 0; bool noeof = false; bool ignoreStrand = false; bool noPhoneHome = false; std::string inName = ""; std::string outName = ""; if ( argc < 5 ) { usage(); std::cerr << "ERROR: Incorrect number of parameters specified\n"; return(-1); } inName = argv[2]; outName = argv[3]; static struct option getopt_long_options[] = { // Input options { "left", required_argument, NULL, 'L'}, { "right", required_argument, NULL, 'R'}, { "ignoreStrand", no_argument, NULL, 'i'}, { "noeof", no_argument, NULL, 'n'}, { "noPhoneHome", no_argument, NULL, 'p'}, { "nophonehome", no_argument, NULL, 'P'}, { "phoneHomeThinning", required_argument, NULL, 't'}, { "phonehomethinning", required_argument, NULL, 'T'}, { NULL, 0, NULL, 0 }, }; int argIndex = 4; if(argv[argIndex][0] != '-') { // This is the number of bases to trim off both sides // so convert to a number. numTrimBaseL = atoi(argv[argIndex]); numTrimBaseR = numTrimBaseL; ++argIndex; } int c = 0; int n_option_index = 0; // Process any additional parameters while ( ( c = getopt_long(argc, argv, "L:R:in", getopt_long_options, &n_option_index) ) != -1 ) { switch(c) { case 'L': numTrimBaseL = atoi(optarg); break; case 'R': numTrimBaseR = atoi(optarg); break; case 'i': ignoreStrand = true; break; case 'n': noeof = true; break; case 'p': case 'P': noPhoneHome = true; break; case 't': case 'T': PhoneHome::allThinning = atoi(optarg); break; default: fprintf(stderr,"ERROR: Unrecognized option %s\n", getopt_long_options[n_option_index].name); return(-1); } } if(!noPhoneHome) { PhoneHome::checkVersion(getProgramName(), VERSION); } if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } if ( ! 
samIn.OpenForRead(inName.c_str()) ) { fprintf(stderr, "***Problem opening %s\n",inName.c_str()); return(-1); } if(!samOut.OpenForWrite(outName.c_str())) { fprintf(stderr, "%s\n", samOut.GetStatusMessage()); return(samOut.GetStatus()); } fprintf(stderr,"Arguments in effect: \n"); fprintf(stderr,"\tInput file : %s\n",inName.c_str()); fprintf(stderr,"\tOutput file : %s\n",outName.c_str()); if(numTrimBaseL == numTrimBaseR) { fprintf(stderr,"\t#Bases to trim from each side : %d\n", numTrimBaseL); } else { fprintf(stderr,"\t#Bases to trim from the left of forward strands : %d\n", numTrimBaseL); fprintf(stderr,"\t#Bases to trim from the right of forward strands: %d\n", numTrimBaseR); if(!ignoreStrand) { // By default, reverse strands are treated the opposite. fprintf(stderr,"\t#Bases to trim from the left of reverse strands : %d\n", numTrimBaseR); fprintf(stderr,"\t#Bases to trim from the right of reverse strands : %d\n", numTrimBaseL); } else { // ignore strand, treating forward & reverse strands the same fprintf(stderr,"\t#Bases to trim from the left of reverse strands : %d\n", numTrimBaseL); fprintf(stderr,"\t#Bases to trim from the right of reverse strands : %d\n", numTrimBaseR); } } // Read the sam header. SamFileHeader samHeader; if(!samIn.ReadHeader(samHeader)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } // Write the sam header. if(!samOut.WriteHeader(samHeader)) { fprintf(stderr, "%s\n", samOut.GetStatusMessage()); return(samOut.GetStatus()); } SamRecord samRecord; char seq[65536]; char qual[65536]; int i, len; // Keep reading records until ReadRecord returns false. while(samIn.ReadRecord(samHeader, samRecord)) { // Successfully read a record from the file, so write it. strcpy(seq,samRecord.getSequence()); strcpy(qual,samRecord.getQuality()); // Number of bases to trim from the left/right, // set based on ignoreStrand flag and strand info. 
int trimLeft = numTrimBaseL; int trimRight = numTrimBaseR; if(!ignoreStrand) { if(SamFlag::isReverse(samRecord.getFlag())) { // We are reversing the reverse reads, // so swap the left & right trim counts. trimRight = numTrimBaseL; trimLeft = numTrimBaseR; } } len = strlen(seq); // Do not trim if sequence is '*' if ( strcmp(seq, "*") != 0 ) { bool qualValue = true; if(strcmp(qual, "*") == 0) { qualValue = false; } int qualLen = strlen(qual); if ( (qualLen != len) && qualValue ) { fprintf(stderr,"ERROR: Sequence and Quality have different length\n"); return(-1); } if ( len < (trimLeft + trimRight) ) { // Read Length is less than the total number of bases to trim, // so trim the entire read. for(i=0; i < len; ++i) { seq[i] = 'N'; if ( qualValue ) { qual[i] = '!'; } } } else { // Read Length is larger than the total number of bases to trim, // so trim from the left, then from the right. for(i=0; i < trimLeft; ++i) { // Trim the bases from the left. seq[i] = 'N'; if ( qualValue ) { qual[i] = '!'; } } for(i = 0; i < trimRight; i++) { seq[len-i-1] = 'N'; if(qualValue) { qual[len-i-1] = '!'; } } } samRecord.setSequence(seq); samRecord.setQuality(qual); } if(!samOut.WriteRecord(samHeader, samRecord)) { // Failed to write a record. fprintf(stderr, "Failure in writing record %s\n", samOut.GetStatusMessage()); return(-1); } } if(samIn.GetStatus() != SamStatus::NO_MORE_RECS) { // Failed to read a record. fprintf(stderr, "%s\n", samIn.GetStatusMessage()); } std::cerr << std::endl << "Number of records read = " << samIn.GetCurrentRecordCount() << std::endl; std::cerr << "Number of records written = " << samOut.GetCurrentRecordCount() << std::endl; if(samIn.GetStatus() != SamStatus::NO_MORE_RECS) { // Failed reading a record. return(samIn.GetStatus()); } // Since the reads were successful, return the status based samIn.Close(); samOut.Close(); return 0; }
int GapInfo::processFile(const char* inputFileName, const char* outputFileName, const char* refFile, bool detailed, bool checkFirst, bool checkStrand) { // Open the file for reading. SamFile samIn; samIn.OpenForRead(inputFileName); // Read the sam header. SamFileHeader samHeader; samIn.ReadHeader(samHeader); SamRecord samRecord; GenomeSequence* refPtr = NULL; if(strcmp(refFile, "") != 0) { refPtr = new GenomeSequence(refFile); } IFILE outFile = ifopen(outputFileName, "w"); // Map for summary. std::map<int, int> gapInfoMap; // Keep reading records until ReadRecord returns false. while(samIn.ReadRecord(samHeader, samRecord)) { uint16_t samFlags = samRecord.getFlag(); if((!SamFlag::isMapped(samFlags)) || (!SamFlag::isMateMapped(samFlags)) || (!SamFlag::isPaired(samFlags)) || (samFlags & SamFlag::SECONDARY_ALIGNMENT) || (SamFlag::isDuplicate(samFlags)) || (SamFlag::isQCFailure(samFlags))) { // unmapped, mate unmapped, not paired, // not the primary alignment, // duplicate, fails vendor quality check continue; } // No gap info if the chromosome names are different or // are unknown. int32_t refID = samRecord.getReferenceID(); if((refID != samRecord.getMateReferenceID()) || (refID == -1)) { continue; } int32_t readStart = samRecord.get0BasedPosition(); int32_t mateStart = samRecord.get0BasedMatePosition(); // If the mate starts first, then the pair was processed by // the mate. if(mateStart < readStart) { continue; } if((mateStart == readStart) && (SamFlag::isReverse(samFlags))) { // read and mate start at the same position, so // only process the forward strand. continue; } // Process this read pair. int32_t readEnd = samRecord.get0BasedAlignmentEnd(); int32_t gapSize = mateStart - readEnd - 1; if(detailed) { // Output the gap info. ifprintf(outFile, "%s\t%d\t%d", samRecord.getReferenceName(), readEnd+1, gapSize); // Check if it is not the first or if it is not the forward strand. 
if(checkFirst && !SamFlag::isFirstFragment(samFlags)) { ifprintf(outFile, "\tNotFirst"); } if(checkStrand && SamFlag::isReverse(samFlags)) { ifprintf(outFile, "\tReverse"); } ifprintf(outFile, "\n"); } else { // Summary. // Skip reads that are not the forward strand. if(SamFlag::isReverse(samFlags)) { // continue continue; } // Forward. // Check the reference for 'N's. if(refPtr != NULL) { genomeIndex_t chromStartIndex = refPtr->getGenomePosition(samRecord.getReferenceName()); if(chromStartIndex == INVALID_GENOME_INDEX) { // Invalid position, so continue to the next one. continue; } bool skipRead = false; for(int i = readEnd + 1; i < mateStart; i++) { if((*refPtr)[i] == 'N') { // 'N' in the reference, so continue to the next read. skipRead = true; break; } } if(skipRead) { continue; } } // Update the gapInfo. gapInfoMap[gapSize]++; } } if(!detailed) { // Output the summary. ifprintf(outFile, "GapSize\tNumPairs\n"); for(std::map<int,int>::iterator iter = gapInfoMap.begin(); iter != gapInfoMap.end(); iter++) { ifprintf(outFile, "%d\t%d\n", (*iter).first, (*iter).second); } } SamStatus::Status returnStatus = samIn.GetStatus(); if(returnStatus == SamStatus::NO_MORE_RECS) { return(SamStatus::SUCCESS); } return(returnStatus); }
int WriteRegion::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String outFile = ""; String indexFile = ""; String readName = ""; String bed = ""; myStart = UNSPECIFIED_INT; myEnd = UNSPECIFIED_INT; myPrevStart = UNSPECIFIED_INT; myPrevEnd = UNSPECIFIED_INT; myRefID = UNSET_REF; myRefName.Clear(); myPrevRefName.Clear(); myBedRefID = SamReferenceInfo::NO_REF_ID; bool lshift = false; bool noeof = false; bool params = false; myWithinReg = false; myWroteReg = false; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inFile) LONG_STRINGPARAMETER("out", &outFile) LONG_PARAMETER_GROUP("Optional Region Parameters") LONG_STRINGPARAMETER("bamIndex", &indexFile) LONG_STRINGPARAMETER("refName", &myRefName) LONG_INTPARAMETER("refID", &myRefID) LONG_INTPARAMETER("start", &myStart) LONG_INTPARAMETER("end", &myEnd) LONG_STRINGPARAMETER("bed", &bed) LONG_PARAMETER("withinReg", &myWithinReg) LONG_STRINGPARAMETER("readName", &readName) LONG_PARAMETER_GROUP("Optional Other Parameters") LONG_PARAMETER("lshift", &lshift) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); // mandatory argument was not specified. inputParameters.Status(); std::cerr << "Missing mandatory argument: --in" << std::endl; return(-1); } if(outFile == "") { usage(); // mandatory argument was not specified. 
inputParameters.Status(); std::cerr << "Missing mandatory argument: --out" << std::endl; return(-1); } if(indexFile == "") { // In file was not specified, so set it to the in file // + ".bai" indexFile = inFile + ".bai"; } if(myRefID != UNSET_REF && myRefName.Length() != 0) { std::cerr << "Can't specify both refID and refName" << std::endl; inputParameters.Status(); return(-1); } if(myRefID != UNSET_REF && bed.Length() != 0) { std::cerr << "Can't specify both refID and bed" << std::endl; inputParameters.Status(); return(-1); } if(myRefName.Length() != 0 && bed.Length() != 0) { std::cerr << "Can't specify both refName and bed" << std::endl; inputParameters.Status(); return(-1); } if(!bed.IsEmpty()) { myBedFile = ifopen(bed, "r"); } if(params) { inputParameters.Status(); } // Open the file for reading. mySamIn.OpenForRead(inFile); // Open the output file for writing. SamFile samOut; samOut.OpenForWrite(outFile); // Open the bam index file for reading if a region was specified. if((myRefName.Length() != 0) || (myRefID != UNSET_REF) || (myBedFile != NULL)) { mySamIn.ReadBamIndex(indexFile); } // Read & write the sam header. mySamIn.ReadHeader(mySamHeader); samOut.WriteHeader(mySamHeader); // Read the sam records. SamRecord samRecord; // Track the status. int numSectionRecords = 0; // Set returnStatus to success. It will be changed // to the failure reason if any of the writes fail. SamStatus::Status returnStatus = SamStatus::SUCCESS; while(getNextSection()) { // Keep reading records until they aren't anymore. while(mySamIn.ReadRecord(mySamHeader, samRecord)) { if(!readName.IsEmpty()) { // Check for readname. if(strcmp(samRecord.getReadName(), readName.c_str()) != 0) { // not a matching read name, so continue to the next record. continue; } } // Check to see if the read has already been processed. 
if(myPrevEnd != UNSPECIFIED_INT) { // Because we already know that the bed was sorted, // we know that the previous section started before // this one, so if the previous end is greater than // this record's end position we know that it // was already written in the previous section. // Note: can't be equal to the previous end since // the end range was exclusive, while // get0BasedAlignmentEnd is inclusive. // myPrevEnd is reset by getNextSection when a new // chromosome is hit. if(samRecord.get0BasedAlignmentEnd() < myPrevEnd) { // This record was already written. continue; } } // Shift left if applicable. if(lshift) { samRecord.shiftIndelsLeft(); } // Successfully read a record from the file, so write it. samOut.WriteRecord(mySamHeader, samRecord); ++numSectionRecords; } myWroteReg = true; } if(myBedFile != NULL) { ifclose(myBedFile); } std::cerr << "Wrote " << outFile << " with " << numSectionRecords << " records.\n"; return(returnStatus); }
void testIndex(BamIndex& bamIndex) { assert(bamIndex.getNumMappedReads(1) == 2); assert(bamIndex.getNumUnMappedReads(1) == 0); assert(bamIndex.getNumMappedReads(0) == 4); assert(bamIndex.getNumUnMappedReads(0) == 1); assert(bamIndex.getNumMappedReads(23) == -1); assert(bamIndex.getNumUnMappedReads(23) == -1); assert(bamIndex.getNumMappedReads(-1) == 0); assert(bamIndex.getNumUnMappedReads(-1) == 2); assert(bamIndex.getNumMappedReads(-2) == -1); assert(bamIndex.getNumUnMappedReads(-2) == -1); assert(bamIndex.getNumMappedReads(22) == 0); assert(bamIndex.getNumUnMappedReads(22) == 0); // Get the chunks for reference id 1. Chunk testChunk; SortedChunkList chunkList; assert(bamIndex.getChunksForRegion(1, -1, -1, chunkList) == true); assert(!chunkList.empty()); testChunk = chunkList.pop(); assert(chunkList.empty()); assert(testChunk.chunk_beg == 0x4e7); assert(testChunk.chunk_end == 0x599); // Get the chunks for reference id 0. assert(bamIndex.getChunksForRegion(0, -1, -1, chunkList) == true); assert(!chunkList.empty()); testChunk = chunkList.pop(); assert(chunkList.empty()); assert(testChunk.chunk_beg == 0x360); assert(testChunk.chunk_end == 0x4e7); // Get the chunks for reference id 2. assert(bamIndex.getChunksForRegion(2, -1, -1, chunkList) == true); assert(!chunkList.empty()); testChunk = chunkList.pop(); assert(chunkList.empty()); assert(testChunk.chunk_beg == 0x599); assert(testChunk.chunk_end == 0x5ea); // Get the chunks for reference id 3. // There isn't one for this ref id, but still successfully read the file, // so it should return true, but the list should be empty. assert(bamIndex.getChunksForRegion(3, -1, -1, chunkList) == true); assert(chunkList.empty()); // Test reading an indexed bam file. 
SamFile inFile; assert(inFile.OpenForRead("testFiles/sortedBam.bam")); inFile.setSortedValidation(SamFile::COORDINATE); assert(inFile.ReadBamIndex("testFiles/sortedBam.bam.bai")); SamFileHeader samHeader; assert(inFile.ReadHeader(samHeader)); SamRecord samRecord; // Test getting num mapped/unmapped reads. assert(inFile.getNumMappedReadsFromIndex(1) == 2); assert(inFile.getNumUnMappedReadsFromIndex(1) == 0); assert(inFile.getNumMappedReadsFromIndex(0) == 4); assert(inFile.getNumUnMappedReadsFromIndex(0) == 1); assert(inFile.getNumMappedReadsFromIndex(23) == -1); assert(inFile.getNumUnMappedReadsFromIndex(23) == -1); assert(inFile.getNumMappedReadsFromIndex(-1) == 0); assert(inFile.getNumUnMappedReadsFromIndex(-1) == 2); assert(inFile.getNumMappedReadsFromIndex(-2) == -1); assert(inFile.getNumUnMappedReadsFromIndex(-2) == -1); assert(inFile.getNumMappedReadsFromIndex(22) == 0); assert(inFile.getNumUnMappedReadsFromIndex(22) == 0); assert(inFile.getNumMappedReadsFromIndex("2", samHeader) == 2); assert(inFile.getNumUnMappedReadsFromIndex("2", samHeader) == 0); assert(inFile.getNumMappedReadsFromIndex("1", samHeader) == 4); assert(inFile.getNumUnMappedReadsFromIndex("1", samHeader) == 1); assert(inFile.getNumMappedReadsFromIndex("22", samHeader) == 0); assert(inFile.getNumUnMappedReadsFromIndex("22", samHeader) == 0); assert(inFile.getNumMappedReadsFromIndex("", samHeader) == 0); assert(inFile.getNumUnMappedReadsFromIndex("*", samHeader) == 2); assert(inFile.getNumMappedReadsFromIndex("unknown", samHeader) == -1); assert(inFile.getNumUnMappedReadsFromIndex("unknown", samHeader) == -1); assert(inFile.getNumMappedReadsFromIndex("X", samHeader) == 0); assert(inFile.getNumUnMappedReadsFromIndex("X", samHeader) == 0); // Section -1 = Ref *: 2 records (8 & 10 from testSam.sam that is reflected // in the validation. 
assert(inFile.SetReadSection(-1)); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead8(samRecord); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead10(samRecord); assert(inFile.ReadRecord(samHeader, samRecord) == false); // Section 2 = Ref 3: 1 records (9 from testSam.sam that is reflected // in the validation. assert(inFile.SetReadSection(2)); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead9(samRecord); assert(inFile.ReadRecord(samHeader, samRecord) == false); // Section 0 = Ref 1: 5 records (3, 4, 1, 2, & 6 from testSam.sam that is // reflected in the validation. assert(inFile.SetReadSection(0)); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead3(samRecord); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead4(samRecord); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead1(samRecord); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead2(samRecord); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead6(samRecord); assert(inFile.ReadRecord(samHeader, samRecord) == false); // Section 1 = Ref 2: 2 records (5 & 7 from testSam.sam that is reflected // in the validation. assert(inFile.SetReadSection(1)); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead5(samRecord); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead7(samRecord); assert(inFile.ReadRecord(samHeader, samRecord) == false); // Section 3 to 22 (ref 4 - 23): 0 records. for(int i = 3; i < 23; i++) { assert(inFile.SetReadSection(i)); assert(inFile.ReadRecord(samHeader, samRecord) == false); } // Set the read section. 
assert(inFile.SetReadSection("1", 1010, 1012)); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead1(samRecord); assert(inFile.GetNumOverlaps(samRecord) == 2); assert(samRecord.getNumOverlaps(1010, 1012) == 2); assert(samRecord.getNumOverlaps(1010, 1020) == 5); assert(samRecord.getNumOverlaps(1010, 1011) == 1); assert(samRecord.getNumOverlaps(1011, 1012) == 1); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead2(samRecord); assert(inFile.GetNumOverlaps(samRecord) == 0); assert(samRecord.getNumOverlaps(1010, 1012) == 0); assert(samRecord.getNumOverlaps(1010, 1020) == 0); assert(samRecord.getNumOverlaps(1010, 1011) == 0); assert(samRecord.getNumOverlaps(1011, 1012) == 0); assert(inFile.ReadRecord(samHeader, samRecord) == false); assert(inFile.SetReadSection("1", 1010, 1020)); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead1(samRecord); assert(inFile.GetNumOverlaps(samRecord) == 5); assert(samRecord.getNumOverlaps(1010, 1012) == 2); assert(samRecord.getNumOverlaps(1010, 1020) == 5); assert(samRecord.getNumOverlaps(1010, 1011) == 1); assert(samRecord.getNumOverlaps(1011, 1012) == 1); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead2(samRecord); assert(inFile.GetNumOverlaps(samRecord) == 0); assert(samRecord.getNumOverlaps(1010, 1012) == 0); assert(samRecord.getNumOverlaps(1010, 1020) == 0); assert(samRecord.getNumOverlaps(1010, 1011) == 0); assert(samRecord.getNumOverlaps(1011, 1012) == 0); assert(inFile.ReadRecord(samHeader, samRecord) == false); assert(inFile.SetReadSection("1", 1010, 1011)); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead1(samRecord); assert(inFile.GetNumOverlaps(samRecord) == 1); assert(samRecord.getNumOverlaps(1010, 1012) == 2); assert(samRecord.getNumOverlaps(1010, 1020) == 5); assert(samRecord.getNumOverlaps(1010, 1011) == 1); assert(samRecord.getNumOverlaps(1011, 1012) == 1); assert(inFile.ReadRecord(samHeader, samRecord) == false); assert(inFile.SetReadSection("1", 1011, 1012)); 
assert(inFile.ReadRecord(samHeader, samRecord)); validateRead1(samRecord); assert(inFile.GetNumOverlaps(samRecord) == 1); assert(samRecord.getNumOverlaps(1010, 1012) == 2); assert(samRecord.getNumOverlaps(1010, 1020) == 5); assert(samRecord.getNumOverlaps(1010, 1011) == 1); assert(samRecord.getNumOverlaps(1011, 1012) == 1); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead2(samRecord); assert(inFile.GetNumOverlaps(samRecord) == 0); assert(samRecord.getNumOverlaps(1010, 1012) == 0); assert(samRecord.getNumOverlaps(1010, 1020) == 0); assert(samRecord.getNumOverlaps(1010, 1011) == 0); assert(samRecord.getNumOverlaps(1011, 1012) == 0); assert(inFile.ReadRecord(samHeader, samRecord) == false); }
// Generic read test shared by the SAM and BAM variants: validates the
// header, then reads the ten expected records in file order and validates
// each one.
void testRead(SamFile &inSam)
{
    // Header first; it is also used for the header copy/modify tests.
    SamFileHeader samHeader;
    assert(inSam.ReadHeader(samHeader));
    validateHeader(samHeader);
    testCopyHeader(samHeader);
    testModHeader(samHeader);

    SamRecord samRecord;

    // Record 1 is additionally checked after its quality is replaced.
    assert(inSam.ReadRecord(samHeader, samRecord));
    validateRead1(samRecord);
    samRecord.setQuality("ABCDE");
    validateRead1ModQuality(samRecord);

    // Records 2 through 10, in order.
    assert(inSam.ReadRecord(samHeader, samRecord));
    validateRead2(samRecord);

    assert(inSam.ReadRecord(samHeader, samRecord));
    validateRead3(samRecord);

    assert(inSam.ReadRecord(samHeader, samRecord));
    validateRead4(samRecord);

    assert(inSam.ReadRecord(samHeader, samRecord));
    validateRead5(samRecord);

    assert(inSam.ReadRecord(samHeader, samRecord));
    validateRead6(samRecord);

    assert(inSam.ReadRecord(samHeader, samRecord));
    validateRead7(samRecord);

    assert(inSam.ReadRecord(samHeader, samRecord));
    validateRead8(samRecord);

    assert(inSam.ReadRecord(samHeader, samRecord));
    validateRead9(samRecord);

    assert(inSam.ReadRecord(samHeader, samRecord));
    validateRead10(samRecord);
}
SamStatus::Status ClipOverlap::handleSortedByReadName(SamFile& samIn, SamFile* samOutPtr) { // Set returnStatus to success. It will be changed // to the failure reason if any of the writes fail. SamStatus::Status returnStatus = SamStatus::SUCCESS; // Read the sam records. SamRecord* prevSamRecord = NULL; SamRecord* samRecord = new SamRecord; SamRecord* tmpRecord = new SamRecord; if((samRecord == NULL) || (tmpRecord == NULL)) { std::cerr << "Failed to allocate a SamRecord, so exit.\n"; return(SamStatus::FAIL_MEM); } // Keep reading records until ReadRecord returns false. while(samIn.ReadRecord(mySamHeader, *samRecord)) { int16_t flag = samRecord->getFlag(); if((flag & myIntExcludeFlags) != 0) { // This read should not be checked for overlaps. // Check if there is a previous SamRecord. if(prevSamRecord != NULL) { // There is a previous record. // If it has a different read name, write it. if(strcmp(samRecord->getReadName(), prevSamRecord->getReadName()) != 0) { // Different read name, so write the previous record. if((samOutPtr != NULL) && !myOverlapsOnly) { if(!samOutPtr->WriteRecord(mySamHeader, *prevSamRecord)) { // Failed to write a record. fprintf(stderr, "%s\n", samOutPtr->GetStatusMessage()); returnStatus = samOutPtr->GetStatus(); } } // Clear the previous record info. tmpRecord = prevSamRecord; prevSamRecord = NULL; } // If it has the same read name, leave it in case there is another read with that name } // This record is not being checked for overlaps, so just write it and continue if((samOutPtr != NULL) && !myOverlapsOnly) { if(!samOutPtr->WriteRecord(mySamHeader, *samRecord)) { // Failed to write a record. fprintf(stderr, "%s\n", samOutPtr->GetStatusMessage()); returnStatus = samOutPtr->GetStatus(); } } continue; } if(prevSamRecord == NULL) { // Nothing to compare this record to, so set this // record to the previous, and the next record. 
prevSamRecord = samRecord; samRecord = tmpRecord; tmpRecord = NULL; continue; } // Check if the read name matches the previous read name. if(strcmp(samRecord->getReadName(), prevSamRecord->getReadName()) == 0) { bool overlap = false; // Same Read Name, so check clipping. OverlapHandler::OverlapInfo prevClipInfo = myOverlapHandler->getOverlapInfo(*prevSamRecord); OverlapHandler::OverlapInfo curClipInfo = myOverlapHandler->getOverlapInfo(*samRecord); // If either indicate a complete clipping, clip both. if((prevClipInfo == OverlapHandler::NO_OVERLAP_WRONG_ORIENT) || (curClipInfo == OverlapHandler::NO_OVERLAP_WRONG_ORIENT)) { overlap = true; myOverlapHandler->handleNoOverlapWrongOrientation(*prevSamRecord); // Don't update stats since this is the 2nd in the pair myOverlapHandler->handleNoOverlapWrongOrientation(*samRecord, false); } else if((prevClipInfo == OverlapHandler::OVERLAP) || (prevClipInfo == OverlapHandler::SAME_START)) { // The previous read starts at or before the current one. overlap = true; myOverlapHandler->handleOverlapPair(*prevSamRecord, *samRecord); } else if(curClipInfo == OverlapHandler::OVERLAP) { // The current read starts before the previous one. overlap = true; myOverlapHandler->handleOverlapPair(*samRecord, *prevSamRecord); } // Found a read pair, so write both records if: // 1) output file is specified // AND // 2a) all records should be written // OR // 2b) the pair overlaps if((samOutPtr != NULL) && (!myOverlapsOnly || overlap)) { if(!samOutPtr->WriteRecord(mySamHeader, *prevSamRecord)) { // Failed to write a record. fprintf(stderr, "%s\n", samOutPtr->GetStatusMessage()); returnStatus = samOutPtr->GetStatus(); } if(!samOutPtr->WriteRecord(mySamHeader, *samRecord)) { // Failed to write a record. fprintf(stderr, "%s\n", samOutPtr->GetStatusMessage()); returnStatus = samOutPtr->GetStatus(); } } // Setup for the next read with no previous. 
tmpRecord = prevSamRecord; prevSamRecord = NULL; } else { // Read name does not match, so write the previous record // if we are writing all records. if((samOutPtr != NULL) && !myOverlapsOnly) { if(!samOutPtr->WriteRecord(mySamHeader, *prevSamRecord)) { // Failed to write a record. fprintf(stderr, "%s\n", samOutPtr->GetStatusMessage()); returnStatus = samOutPtr->GetStatus(); } } // Store this record as the previous. tmpRecord = prevSamRecord; prevSamRecord = samRecord; samRecord = tmpRecord; tmpRecord = NULL; } } // Write the previous record if there is one. if((samOutPtr != NULL) && (prevSamRecord != NULL) && !myOverlapsOnly) { if(!samOutPtr->WriteRecord(mySamHeader, *prevSamRecord)) { // Failed to write a record. fprintf(stderr, "%s\n", samOutPtr->GetStatusMessage()); returnStatus = samOutPtr->GetStatus(); } delete prevSamRecord; } if(samRecord != NULL) { delete samRecord; } if(tmpRecord != NULL) { delete tmpRecord; } if(samIn.GetStatus() != SamStatus::NO_MORE_RECS) { return(samIn.GetStatus()); } return(returnStatus); }
void testFlagRead(const char* fileName) { SamFile inSam; SamFileHeader samHeader; SamRecord samRecord; //////////////////////////////////////////////////////////// // Required flag 0x48 (only flag 73 matches) // Exclude nothing assert(inSam.OpenForRead(fileName)); assert(inSam.ReadHeader(samHeader)); validateHeader(samHeader); inSam.SetReadFlags(0x48, 0x0); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead1(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == false); inSam.Close(); //////////////////////////////////////////////////////////// // No required flags. // Exclude 0x48. This leaves just the one read with flag 133. assert(inSam.OpenForRead(fileName)); assert(inSam.ReadHeader(samHeader)); validateHeader(samHeader); inSam.SetReadFlags(0x0, 0x48); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead2(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == false); inSam.Close(); //////////////////////////////////////////////////////////// // Required flag 0x40 // Exclude 0x48. // This will not find any records since the exclude and required conflict. assert(inSam.OpenForRead(fileName)); assert(inSam.ReadHeader(samHeader)); validateHeader(samHeader); inSam.SetReadFlags(0x40, 0x48); assert(inSam.ReadRecord(samHeader, samRecord) == false); inSam.Close(); //////////////////////////////////////////////////////////// // Required flag 0x4 // Exclude 0x8. // Only finds flag 133. assert(inSam.OpenForRead(fileName)); assert(inSam.ReadHeader(samHeader)); validateHeader(samHeader); inSam.SetReadFlags(0x4, 0x8); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead2(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == false); inSam.Close(); //////////////////////////////////////////////////////////// // Required flag 0x4 // Exclude nothing // Finds flags 133 & 141. 
assert(inSam.OpenForRead(fileName)); assert(inSam.ReadHeader(samHeader)); validateHeader(samHeader); inSam.SetReadFlags(0x4, 0x0); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead2(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead8(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == true); validateRead10(samRecord); assert(inSam.ReadRecord(samHeader, samRecord) == false); inSam.Close(); }
int ClipOverlap::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String outFile = ""; String storeOrig = ""; bool readName = false; bool noRNValidate = false; bool stats = false; int poolSize = DEFAULT_POOL_SIZE; bool unmapped = false; bool noeof = false; bool params = false; String excludeFlags = "0xF0C"; // TODO, cleanup legacy parameters ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inFile) LONG_STRINGPARAMETER("out", &outFile) LONG_PARAMETER_GROUP("Optional Parameters") LONG_STRINGPARAMETER("storeOrig", &storeOrig) LONG_PARAMETER("readName", &readName) LONG_PARAMETER ("noRNValidate", &noRNValidate) LONG_PARAMETER ("stats", &stats) LONG_PARAMETER ("overlapsOnly", &myOverlapsOnly) LONG_STRINGPARAMETER ("excludeFlags", &excludeFlags) LONG_PARAMETER("unmapped", &unmapped) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PARAMETER_GROUP("Coordinate Processing Optional Parameters") LONG_INTPARAMETER("poolSize", &poolSize) LONG_PARAMETER("poolSkipOverlap", &myPoolSkipOverlap) LONG_PHONEHOME(VERSION) BEGIN_LEGACY_PARAMETERS() LONG_PARAMETER ("clipsOnly", &myOverlapsOnly) LONG_PARAMETER("poolSkipClip", &myPoolSkipOverlap) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { printUsage(std::cerr); inputParameters.Status(); // In file was not specified but it is mandatory. 
std::cerr << "--in is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } // Check to see if the out file was specified, if not, report an error. if(outFile == "") { printUsage(std::cerr); inputParameters.Status(); // Out file was not specified but it is mandatory. std::cerr << "--out is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } if((storeOrig.Length() != 0) && (storeOrig.Length() != 2)) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "--storeOrig tag name must be 2 characters.\n"; return(-1); } myOverlapHandler = new OverlapClipLowerBaseQual(); if(myOverlapHandler == NULL) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "Failed to allocate the overlap handler\n"; return(-1); } if(unmapped) { myOverlapHandler->markAsUnmapped(); } // Setup the overlap handler. myOverlapHandler->keepStats(stats); if(storeOrig.Length() != 0) { myOverlapHandler->storeOrigCigar(storeOrig); } myIntExcludeFlags = excludeFlags.AsInteger(); if(params) { inputParameters.Status(); } // For each step process the file. // Open the files & read/write the sam header. SamStatus::Status runStatus = SamStatus::SUCCESS; for(int i = 1; i <= myOverlapHandler->numSteps(); i++) { // Open the file for reading. mySamHeader.resetHeader(); SamFile samIn(inFile, SamFile::READ, &mySamHeader); SamFile* samOutPtr = NULL; // Check if writing, if so, open the output file. if(i == myOverlapHandler->numSteps()) { samOutPtr = new SamFile(outFile, SamFile::WRITE, &mySamHeader); } if(readName) { if(!noRNValidate) { samIn.setSortedValidation(SamFile::QUERY_NAME); } runStatus = handleSortedByReadName(samIn, samOutPtr); } else { // Coordinate sorted, so work with the pools. 
samIn.setSortedValidation(SamFile::COORDINATE); myPool.setMaxAllocatedRecs(poolSize); // Reset the number of failures myNumMateFailures = 0; myNumPoolFail = 0; myNumPoolFailNoHandle = 0; myNumPoolFailHandled = 0; myNumOutOfOrder = 0; // Run by coordinate if(samOutPtr != NULL) { // Setup the output buffer for writing. SamCoordOutput outputBuffer(myPool); outputBuffer.setOutputFile(samOutPtr, &mySamHeader); runStatus = handleSortedByCoord(samIn, &outputBuffer); // Cleanup the output buffer. if(!outputBuffer.flushAll()) { std::cerr << "ERROR: Failed to flush the output buffer\n"; runStatus = SamStatus::FAIL_IO; } } else { runStatus = handleSortedByCoord(samIn, NULL); } } if(runStatus != SamStatus::SUCCESS) { break; } // Close the input file, it will be reopened if there are // multiple steps. samIn.Close(); if(samOutPtr != NULL) { samOutPtr->Close(); delete samOutPtr; samOutPtr = NULL; } } // Done processing. // Print Stats myOverlapHandler->printStats(); if(myNumMateFailures != 0) { std::cerr << "WARNING: did not find expected overlapping mates for " << myNumMateFailures << " records." << std::endl; } if(myNumPoolFail != 0) { // Had to skip clipping some records due to running out of // memory and not being able to wait for the mate. std::cerr << "WARNING: " << myNumPoolFail << " record pool failures\n"; if(myNumPoolFailNoHandle != 0) { std::cerr << "Due to hitting the max record poolSize, skipped handling " << myNumPoolFailNoHandle << " records." << std::endl; } if(myNumPoolFailHandled != 0) { std::cerr << "Due to hitting the max record poolSize, default handled " << myNumPoolFailHandled << " records." 
<< std::endl; } if(myNumOutOfOrder != 0) { std::cerr << "WARNING: Resulting File out of Order by " << myNumOutOfOrder << " records.\n"; } } if(runStatus == SamStatus::SUCCESS) { if(myNumPoolFail == 0) { std::cerr << "Completed ClipOverlap Successfully.\n"; } else { runStatus = SamStatus::NO_MORE_RECS; std::cerr << "Completed ClipOverlap with WARNINGS.\n"; } } else { std::cerr << "Failed to complete ClipOverlap.\n"; } return(runStatus); }
// Select the next section of samIn to read.
//
// Without a region list the whole file is the only section: the first call
// returns true and every later call returns false.
// With a region list, lines are read until one parses as a valid
// tab-separated "name <tab> start <tab> end" region; samIn is positioned on
// that region and true is returned.  Malformed lines are reported on
// stderr and skipped.  Returns false once the list is exhausted.
//
// NOTE(review): the "already read" marker is a function-level static, so
// it is shared by every call in the process rather than kept per Stats
// object - confirm that is intended.
bool Stats::getNextSection(SamFile &samIn)
{
    static bool alreadyRead = false;

    if(myRegionList == NULL)
    {
        // No region list: report "more to read" exactly once.
        const bool firstCall = !alreadyRead;
        alreadyRead = true;
        return(firstCall);
    }

    // There is a region list, so look for the next usable line in it.
    myStartPos = 0;
    myEndPos = 0;

    while(!ifeof(myRegionList))
    {
        myRegBuffer.Clear();
        myRegBuffer.ReadLine(myRegionList);
        if(myRegBuffer.IsEmpty())
        {
            // Nothing read, so try the next line.
            continue;
        }

        // A line was read, so split it into columns.
        myRegColumn.ReplaceColumns(myRegBuffer, '\t');
        if(myRegColumn.Length() < 3)
        {
            // Not enough columns.
            std::cerr << "Improperly formatted reg line: "
                      << myRegBuffer
                      << "; Skipping to the next line.\n";
            continue;
        }

        if(!myRegColumn[1].AsInteger(myStartPos))
        {
            // The start position (2nd column) is not an integer.
            std::cerr << "Improperly formatted region line, start position "
                      << "(2nd column) is not an integer: "
                      << myRegColumn[1]
                      << "; Skipping to the next line.\n";
            continue;
        }

        if(!myRegColumn[2].AsInteger(myEndPos))
        {
            // The end position (3rd column) is not an integer.
            std::cerr << "Improperly formatted region line, end position "
                      << "(3rd column) is not an integer: "
                      << myRegColumn[2]
                      << "; Skipping to the next line.\n";
            continue;
        }

        if((myStartPos >= myEndPos) && (myEndPos != -1))
        {
            // The start must come before the end, unless the end is -1.
            std::cerr << "Improperly formatted region line, the start position "
                      << "is >= end position: "
                      << myRegColumn[1]
                      << " >= "
                      << myRegColumn[2]
                      << "; Skipping to the next line.\n";
            continue;
        }

        // Valid region: restrict the reads to it and report success.
        samIn.SetReadSection(myRegColumn[0].c_str(), myStartPos, myEndPos);
        return(true);
    }

    // Ran out of lines without finding a usable region.
    return(false);
}
int Bam2FastQ::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; bool readName = false; String refFile = ""; String firstOut = ""; String secondOut = ""; String unpairedOut = ""; bool interleave = false; bool noeof = false; bool gzip = false; bool params = false; myOutBase = ""; myNumMateFailures = 0; myNumPairs = 0; myNumUnpaired = 0; mySplitRG = false; myQField = ""; myNumQualTagErrors = 0; myReverseComp = true; myRNPlus = false; myFirstRNExt = DEFAULT_FIRST_EXT; mySecondRNExt = DEFAULT_SECOND_EXT; myCompression = InputFile::DEFAULT; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inFile) LONG_PARAMETER_GROUP("Optional Parameters") LONG_PARAMETER("readName", &readName) LONG_PARAMETER("splitRG", &mySplitRG) LONG_STRINGPARAMETER("qualField", &myQField) LONG_PARAMETER("merge", &interleave) LONG_STRINGPARAMETER("refFile", &refFile) LONG_STRINGPARAMETER("firstRNExt", &myFirstRNExt) LONG_STRINGPARAMETER("secondRNExt", &mySecondRNExt) LONG_PARAMETER("rnPlus", &myRNPlus) LONG_PARAMETER("noReverseComp", &myReverseComp) LONG_PARAMETER("gzip", &gzip) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PARAMETER_GROUP("Optional OutputFile Names") LONG_STRINGPARAMETER("outBase", &myOutBase) LONG_STRINGPARAMETER("firstOut", &firstOut) LONG_STRINGPARAMETER("secondOut", &secondOut) LONG_STRINGPARAMETER("unpairedOut", &unpairedOut) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. 
BgzfFileType::setRequireEofBlock(false); } if(gzip) { myCompression = InputFile::GZIP; } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } // Cannot specify both interleaved & secondOut since secondOut would be N/A. if(interleave && !secondOut.IsEmpty()) { usage(); inputParameters.Status(); std::cerr << "ERROR: Cannot specify --merge & --secondOut.\n"; return(-1); } // Cannot specify both interleaved & secondOut since secondOut would be N/A. if(interleave && !secondOut.IsEmpty()) { usage(); inputParameters.Status(); std::cerr << "ERROR: Cannot specify --merge & --secondOut.\n"; return(-1); } // Cannot specify both splitRG & firstOut/secondOut/unpairedOut // since it needs a different file for each RG. if(mySplitRG && (!firstOut.IsEmpty() || !secondOut.IsEmpty() || !unpairedOut.IsEmpty())) { usage(); inputParameters.Status(); std::cerr << "ERROR: Cannot specify --splitRG & --firstOut/--secondOut/--unpairedOut.\n"; std::cerr << "Use --outBase instead.\n"; return(-1); } // Cannot specify splitRG & output to stdout. if(mySplitRG && (myOutBase[0] == '-')) { usage(); inputParameters.Status(); std::cerr << "ERROR: Cannot specify --splitRG & write to stdout.\n"; return(-1); } // Check to see if the out file was specified, if not, generate it from // the input filename. if(myOutBase == "") { // Just remove the extension from the input filename. int extStart = inFile.FastFindLastChar('.'); if(extStart <= 0) { myOutBase = inFile; } else { myOutBase = inFile.Left(extStart); } } if(mySplitRG) { std::string fqList = myOutBase.c_str(); fqList += ".list"; myFqList = ifopen(fqList.c_str(), "w"); ifprintf(myFqList, "MERGE_NAME\tFASTQ1\tFASTQ2\tRG\n"); } // Check to see if the first/second/single-ended were specified and // if not, set them. 
myFirstFileNameExt = "_1.fastq"; mySecondFileNameExt = "_2.fastq"; myUnpairedFileNameExt = ".fastq"; if(interleave) { myFirstFileNameExt = "_interleaved.fastq"; myFirstFileNameExt = "_interleaved.fastq"; } getFileName(firstOut, myFirstFileNameExt); getFileName(secondOut, mySecondFileNameExt); getFileName(unpairedOut, myUnpairedFileNameExt); if(params) { inputParameters.Status(); } // Open the files for reading/writing. // Open prior to opening the output files, // so if there is an error, the outputs don't get created. SamFile samIn; samIn.OpenForRead(inFile, &mySamHeader); // Skip non-primary reads. samIn.SetReadFlags(0, 0x0100); // Open the output files if not splitting RG if(!mySplitRG) { myUnpairedFile = ifopen(unpairedOut, "w", myCompression); // Only open the first file if it is different than an already opened file. if(firstOut != unpairedOut) { myFirstFile = ifopen(firstOut, "w", myCompression); } else { myFirstFile = myUnpairedFile; } // If it is interleaved or the 2nd file is not a new name, set it appropriately. if(interleave || secondOut == firstOut) { mySecondFile = myFirstFile; } else if(secondOut == unpairedOut) { mySecondFile = myUnpairedFile; } else { mySecondFile = ifopen(secondOut, "w", myCompression); } if(myUnpairedFile == NULL) { std::cerr << "Failed to open " << unpairedOut << " so can't convert bam2FastQ.\n"; return(-1); } if(myFirstFile == NULL) { std::cerr << "Failed to open " << firstOut << " so can't convert bam2FastQ.\n"; return(-1); } if(mySecondFile == NULL) { std::cerr << "Failed to open " << secondOut << " so can't convert bam2FastQ.\n"; return(-1); } } if((readName) || (strcmp(mySamHeader.getSortOrder(), "queryname") == 0)) { readName = true; } else { // defaulting to coordinate sorted. samIn.setSortedValidation(SamFile::COORDINATE); } // Setup the '=' translation if the reference was specified. 
if(!refFile.IsEmpty()) { GenomeSequence* refPtr = new GenomeSequence(refFile); samIn.SetReadSequenceTranslation(SamRecord::BASES); samIn.SetReference(refPtr); } SamRecord* recordPtr; int16_t samFlag; SamStatus::Status returnStatus = SamStatus::SUCCESS; while(returnStatus == SamStatus::SUCCESS) { recordPtr = myPool.getRecord(); if(recordPtr == NULL) { // Failed to allocate a new record. throw(std::runtime_error("Failed to allocate a new SAM/BAM record")); } if(!samIn.ReadRecord(mySamHeader, *recordPtr)) { // Failed to read a record. returnStatus = samIn.GetStatus(); continue; } // Have a record. Check to see if it is a pair or unpaired read. samFlag = recordPtr->getFlag(); if(SamFlag::isPaired(samFlag)) { if(readName) { handlePairedRN(*recordPtr); } else { handlePairedCoord(*recordPtr); } } else { ++myNumUnpaired; writeFastQ(*recordPtr, myUnpairedFile, myUnpairedFileNameExt); } } // Flush All cleanUpMateMap(0, true); if(returnStatus == SamStatus::NO_MORE_RECS) { returnStatus = SamStatus::SUCCESS; } samIn.Close(); closeFiles(); // Output the results std::cerr << "\nFound " << myNumPairs << " read pairs.\n"; std::cerr << "Found " << myNumUnpaired << " unpaired reads.\n"; if(myNumMateFailures != 0) { std::cerr << "Failed to find mates for " << myNumMateFailures << " reads, so they were written as unpaired\n" << " (not included in either of the above counts).\n"; } if(myNumQualTagErrors != 0) { std::cerr << myNumQualTagErrors << " records did not have tag " << myQField.c_str() << " or it was invalid, so the quality field was used for those records.\n"; } return(returnStatus); }
int Stats::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String indexFile = ""; bool basic = false; bool noeof = false; bool params = false; bool qual = false; bool phred = false; int maxNumReads = -1; bool unmapped = false; String pBaseQC = ""; String cBaseQC = ""; String regionList = ""; int excludeFlags = 0; int requiredFlags = 0; bool withinRegion = false; int minMapQual = 0; String dbsnp = ""; PosList *dbsnpListPtr = NULL; bool baseSum = false; int bufferSize = PileupHelper::DEFAULT_WINDOW_SIZE; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inFile) LONG_PARAMETER_GROUP("Types of Statistics") LONG_PARAMETER("basic", &basic) LONG_PARAMETER("qual", &qual) LONG_PARAMETER("phred", &phred) LONG_STRINGPARAMETER("pBaseQC", &pBaseQC) LONG_STRINGPARAMETER("cBaseQC", &cBaseQC) LONG_PARAMETER_GROUP("Optional Parameters") LONG_INTPARAMETER("maxNumReads", &maxNumReads) LONG_PARAMETER("unmapped", &unmapped) LONG_STRINGPARAMETER("bamIndex", &indexFile) LONG_STRINGPARAMETER("regionList", ®ionList) LONG_INTPARAMETER("excludeFlags", &excludeFlags) LONG_INTPARAMETER("requiredFlags", &requiredFlags) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PARAMETER_GROUP("Optional phred/qual Only Parameters") LONG_PARAMETER("withinRegion", &withinRegion) LONG_PARAMETER_GROUP("Optional BaseQC Only Parameters") LONG_PARAMETER("baseSum", &baseSum) LONG_INTPARAMETER("bufferSize", &bufferSize) LONG_INTPARAMETER("minMapQual", &minMapQual) LONG_STRINGPARAMETER("dbsnp", &dbsnp) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. 
BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument for stats, " << "but was not specified" << std::endl; return(-1); } // Use the index file if unmapped or regionList is not empty. bool useIndex = (unmapped|| (!regionList.IsEmpty())); // IndexFile is required, so check to see if it has been set. if(useIndex && (indexFile == "")) { // In file was not specified, so set it to the in file // + ".bai" indexFile = inFile + ".bai"; } //////////////////////////////////////// // Setup in case pileup is used. Pileup<PileupElementBaseQCStats> pileup(bufferSize); // Initialize start/end positions. myStartPos = 0; myEndPos = -1; // Open the output qc file if applicable. IFILE baseQCPtr = NULL; if(!pBaseQC.IsEmpty() && !cBaseQC.IsEmpty()) { usage(); inputParameters.Status(); // Cannot specify both types of baseQC. std::cerr << "Cannot specify both --pBaseQC & --cBaseQC." << std::endl; return(-1); } else if(!pBaseQC.IsEmpty()) { baseQCPtr = ifopen(pBaseQC, "w"); PileupElementBaseQCStats::setPercentStats(true); } else if(!cBaseQC.IsEmpty()) { baseQCPtr = ifopen(cBaseQC, "w"); PileupElementBaseQCStats::setPercentStats(false); } if(baseQCPtr != NULL) { PileupElementBaseQCStats::setOutputFile(baseQCPtr); PileupElementBaseQCStats::printHeader(); } if((baseQCPtr != NULL) || baseSum) { PileupElementBaseQCStats::setMapQualFilter(minMapQual); PileupElementBaseQCStats::setBaseSum(baseSum); } if(params) { inputParameters.Status(); } // Open the file for reading. SamFile samIn; if(!samIn.OpenForRead(inFile)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } samIn.SetReadFlags(requiredFlags, excludeFlags); // Set whether or not basic statistics should be generated. samIn.GenerateStatistics(basic); // Read the sam header. 
SamFileHeader samHeader; if(!samIn.ReadHeader(samHeader)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } // Open the bam index file for reading if we are // doing unmapped reads (also set the read section). if(useIndex) { samIn.ReadBamIndex(indexFile); if(unmapped) { samIn.SetReadSection(-1); } if(!regionList.IsEmpty()) { myRegionList = ifopen(regionList, "r"); } } ////////////////////////// // Read dbsnp if specified and doing baseQC if(((baseQCPtr != NULL) || baseSum) && (!dbsnp.IsEmpty())) { // Read the dbsnp file. IFILE fdbSnp; fdbSnp = ifopen(dbsnp,"r"); // Determine how many entries. const SamReferenceInfo& refInfo = samHeader.getReferenceInfo(); int maxRefLen = 0; for(int i = 0; i < refInfo.getNumEntries(); i++) { int refLen = refInfo.getReferenceLength(i); if(refLen >= maxRefLen) { maxRefLen = refLen + 1; } } dbsnpListPtr = new PosList(refInfo.getNumEntries(),maxRefLen); if(fdbSnp==NULL) { std::cerr << "Open dbSNP file " << dbsnp.c_str() << " failed!\n"; } else if(dbsnpListPtr == NULL) { std::cerr << "Failed to init the memory allocation for the dbsnpList.\n"; } else { // Read the dbsnp file. StringArray tokens; String buffer; int position = 0; int refID = 0; // Loop til the end of the file. while (!ifeof(fdbSnp)) { // Read the next line. buffer.ReadLine(fdbSnp); // If it does not have at least 2 columns, // continue to the next line. if (buffer.IsEmpty() || buffer[0] == '#') continue; tokens.AddTokens(buffer); if(tokens.Length() < 2) continue; if(!tokens[1].AsInteger(position)) { std::cerr << "Improperly formatted region line, start position " << "(2nd column) is not an integer: " << tokens[1] << "; Skipping to the next line.\n"; continue; } // Look up the reference name. 
refID = samHeader.getReferenceID(tokens[0]); if(refID != SamReferenceInfo::NO_REF_ID) { // Reference id was found, so add it to the dbsnp dbsnpListPtr->addPosition(refID, position); } tokens.Clear(); buffer.Clear(); } } ifclose(fdbSnp); } // Read the sam records. SamRecord samRecord; int numReads = 0; ////////////////////// // Setup in case doing a quality count. // Quality histogram. const int MAX_QUAL = 126; const int START_QUAL = 33; uint64_t qualCount[MAX_QUAL+1]; for(int i = 0; i <= MAX_QUAL; i++) { qualCount[i] = 0; } const int START_PHRED = 0; const int PHRED_DIFF = START_QUAL - START_PHRED; const int MAX_PHRED = MAX_QUAL - PHRED_DIFF; uint64_t phredCount[MAX_PHRED+1]; for(int i = 0; i <= MAX_PHRED; i++) { phredCount[i] = 0; } int refPos = 0; Cigar* cigarPtr = NULL; char cigarChar = '?'; // Exclude clips from the qual/phred counts if unmapped reads are excluded. bool qualExcludeClips = excludeFlags & SamFlag::UNMAPPED; ////////////////////////////////// // When not reading by sections, getNextSection returns true // the first time, then false the next time. while(getNextSection(samIn)) { // Keep reading records from the file until SamFile::ReadRecord // indicates to stop (returns false). while(((maxNumReads < 0) || (numReads < maxNumReads)) && samIn.ReadRecord(samHeader, samRecord)) { // Another record was read, so increment the number of reads. ++numReads; // See if the quality histogram should be genereated. if(qual || phred) { // Get the quality. const char* qual = samRecord.getQuality(); // Check for no quality ('*'). if((qual[0] == '*') && (qual[1] == 0)) { // This record does not have a quality string, so no // quality processing is necessary. } else { int index = 0; cigarPtr = samRecord.getCigarInfo(); cigarChar = '?'; refPos = samRecord.get0BasedPosition(); if(!qualExcludeClips && (cigarPtr != NULL)) { // Offset the reference position by any soft clips // by subtracting the queryIndex of this start position. 
// refPos is now the start position of the clips. refPos -= cigarPtr->getQueryIndex(0); } while(qual[index] != 0) { // Skip this quality if it is clipped and we are skipping clips. if(cigarPtr != NULL) { cigarChar = cigarPtr->getCigarCharOpFromQueryIndex(index); } if(qualExcludeClips && Cigar::isClip(cigarChar)) { // Skip a clipped quality. ++index; // Increment the position. continue; } if(withinRegion && (myEndPos != -1) && (refPos >= myEndPos)) { // We have hit the end of the region, stop processing this // quality string. break; } if(withinRegion && (refPos < myStartPos)) { // This position is not in the target. ++index; // Update the position if this is found in the reference or a clip. if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar)) { ++refPos; } continue; } // Check for valid quality. if((qual[index] < START_QUAL) || (qual[index] > MAX_QUAL)) { if(qual) { std::cerr << "Invalid Quality found: " << qual[index] << ". Must be between " << START_QUAL << " and " << MAX_QUAL << ".\n"; } if(phred) { std::cerr << "Invalid Phred Quality found: " << qual[index] - PHRED_DIFF << ". Must be between " << START_QUAL << " and " << MAX_QUAL << ".\n"; } // Skip an invalid quality. ++index; // Update the position if this is found in the reference or a clip. if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar)) { ++refPos; } continue; } // Increment the count for this quality. ++(qualCount[(int)(qual[index])]); ++(phredCount[(int)(qual[index]) - PHRED_DIFF]); // Update the position if this is found in the reference or a clip. if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar)) { ++refPos; } ++index; } } } // Check the next thing to do for the read. if((baseQCPtr != NULL) || baseSum) { // Pileup the bases for this read. pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr); } } // Done with a section, move on to the next one. // New section, so flush the pileup. 
pileup.flushPileup(); } // Flush the rest of the pileup. if((baseQCPtr != NULL) || baseSum) { // Pileup the bases. pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr); PileupElementBaseQCStats::printSummary(); ifclose(baseQCPtr); } std::cerr << "Number of records read = " << samIn.GetCurrentRecordCount() << std::endl; if(basic) { std::cerr << std::endl; samIn.PrintStatistics(); } // Print the quality stats. if(qual) { std::cerr << std::endl; std::cerr << "Quality\tCount\n"; for(int i = START_QUAL; i <= MAX_QUAL; i++) { std::cerr << i << "\t" << qualCount[i] << std::endl; } } // Print the phred quality stats. if(phred) { std::cerr << std::endl; std::cerr << "Phred\tCount\n"; for(int i = START_PHRED; i <= MAX_PHRED; i++) { std::cerr << i << "\t" << phredCount[i] << std::endl; } } SamStatus::Status status = samIn.GetStatus(); if(status == SamStatus::NO_MORE_RECS) { // A status of NO_MORE_RECS means that all reads were successful. status = SamStatus::SUCCESS; } return(status); }
int Revert::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String outFile = ""; bool cigar = false; bool qual = false; bool noeof = false; bool params = false; bool rmBQ = false; String rmTags = ""; myKeepTags = false; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_STRINGPARAMETER("in", &inFile) LONG_STRINGPARAMETER("out", &outFile) LONG_PARAMETER("cigar", &cigar) LONG_PARAMETER("qual", &qual) LONG_PARAMETER("keepTags", &myKeepTags) LONG_PARAMETER("rmBQ", &rmBQ) LONG_STRINGPARAMETER("rmTags", &rmTags) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } if(outFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--out is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } if(params) { inputParameters.Status(); } // Open the input file for reading. SamFile samIn; samIn.OpenForRead(inFile); // Open the output file for writing. SamFile samOut; samOut.OpenForWrite(outFile); // Read the sam header. SamFileHeader samHeader; samIn.ReadHeader(samHeader); // Write the sam header. samOut.WriteHeader(samHeader); SamRecord samRecord; // Set returnStatus to success. It will be changed to the // failure reason if any of the writes or updates fail. 
SamStatus::Status returnStatus = SamStatus::SUCCESS; // Keep reading records until ReadRecord returns false. while(samIn.ReadRecord(samHeader, samRecord)) { // Update the cigar & position. if(cigar) { if(!updateCigar(samRecord)) { // Failed to update the cigar & position. fprintf(stderr, "%s\n", samIn.GetStatusMessage()); returnStatus = samIn.GetStatus(); } } if(qual) { if(!updateQual(samRecord)) { // Failed to update the quality. fprintf(stderr, "%s\n", samIn.GetStatusMessage()); returnStatus = samIn.GetStatus(); } } if(rmBQ) { if(!removeBQ(samRecord)) { // Failed to remove BQ. fprintf(stderr, "%s\n", samIn.GetStatusMessage()); returnStatus = samIn.GetStatus(); } } if(rmTags != "") { if(!samRecord.rmTags(rmTags.c_str())) { // Failed to remove the specified tags. fprintf(stderr, "%s\n", samIn.GetStatusMessage()); returnStatus = samIn.GetStatus(); } } // Successfully read a record from the file, so write it. if(!samOut.WriteRecord(samHeader, samRecord)) { // Failed to write a record. fprintf(stderr, "%s\n", samOut.GetStatusMessage()); returnStatus = samOut.GetStatus(); } } std::cerr << std::endl << "Number of records read = " << samIn.GetCurrentRecordCount() << std::endl; std::cerr << "Number of records written = " << samOut.GetCurrentRecordCount() << std::endl; // Since the reads were successful, return the status based // on the status of the writes. If any failed, return // their failure status. return(returnStatus); }
bool BamProcessor::init (const ContalignParams& p) { read_cnt_ = proc_cnt_ = toolongs_ = unaligned_cnt_ = fail_cnt_ = nomd_cnt_ = realigned_cnt_ = modified_cnt_ = pos_adjusted_cnt_ = 0; log_diff_ = log_matr_ = log_base_ = false; p_ = &p; if (!*p.inbam ()) ers << "Input file name not specified" << Throw; limit_ = p.limit (); skip_ = p.skip (); infile_.OpenForRead (p.inbam ()); if (!infile_.IsOpen ()) ers << p.inbam () << ThrowEx (FileNotFoundRerror); bool index_ok = false; if (*p.bamidx ()) { index_ok = infile_.ReadBamIndex (p.bamidx ()); if (!index_ok) warn << "Unable to open specified BAM index: " << p.bamidx () << ". Default index will be attempted" << std::endl; } if (!index_ok) { try { index_ok = infile_.ReadBamIndex (); } catch (std::exception& e) { // for some reason not converted into return status by libStatGen } if (!index_ok) warn << "Unable to open default BAM index for " << p.inbam () << std::endl; } if (*p.refname () || p.refno () != -1) { if (!index_ok) ers << "Reference section specified, but the BAM index could not be open." << Throw; if (*p.refname ()) { if (p.endpos () != 0) { infile_.SetReadSection (p.refname (), p.begpos (), p.endpos ()); info << "Read section set : " << p.refname () << ": " << p.begpos () << "-" << p.endpos () << std::endl; } else { infile_.SetReadSection (p.refname ()); info << "Read section set : " << p.refname () << std::endl; } } else { if (p.endpos () != 0) { info << "Read section set : ref# " << p.refno () << ": " << p.begpos () << "-" << p.endpos () << std::endl; infile_.SetReadSection (p.refno (), p.begpos (), p.endpos ()); } else { info << "Read section set : ref# " << p.refno () << std::endl; infile_.SetReadSection (p.refno ()); } } } if (*p.outbam ()) { if (!p.overwrite () && file_exists (p.outbam ())) ers << "Output file " << p.outbam () << " exists. 
Use --ov key to allow overwriting" << Throw; outfile_.OpenForWrite (p.outbam ()); if (!outfile_.IsOpen ()) ers << "Unable to open output file " << p.outbam () << std::endl; } if (*p.logfname ()) { if (!p.overwrite () && file_exists (p.logfname ())) ers << "Log file " << p.logfname () << " exists. Use --ov key to allow overwriting" << Throw; logfile_.open (p.logfname (), std::fstream::out); if (!logfile_.is_open ()) ers << "Unable to open log file " << p.logfname () << std::endl; time_t t = time (NULL); logfile_ << "Context-aware realigner log\nStarted at " << asctime (localtime (&t)) << "\nParameters:\n"; logfile_ << *(p.parameters_); logfile_ << std::endl; log_base_ = p.logging ("base"); log_diff_ = p.logging ("diff"); log_matr_ = p.logging ("matr"); } band_width_ = p.bwid (); switch (p.algo ()) { case ContalignParams::TEMPL: { matrix_.configure (genstr::nucleotides.symbols (), genstr::nucleotides.size (), genstr::NegUnitaryMatrix <int, 4>().values ()); gap_cost_.configure (p.gip (), p.gep ()); taligner_.configure (&matrix_, &gap_cost_, &gap_cost_, &genstr::nn2num, &genstr::nn2num); } break; case ContalignParams::PLAIN: { batches_.reset (max_batch_no_); aligner_.init (MAX_SEQ_LEN, MAX_SEQ_LEN*MAX_BAND_WIDTH, p.gip (), p.gep (), p.mat (), -p.mis ()); if (log_matr_) aligner_.set_log (logfile_); if (p.debug () > 5) aligner_.set_trace (true); } break; case ContalignParams::POLY: { batches_.reset (max_batch_no_); contalign_.init (MAX_SEQ_LEN, MAX_RSEQ_LEN, MAX_SEQ_LEN*MAX_BAND_WIDTH, p.gip (), p.gep (), p.mat (), -p.mis ()); if (log_matr_) contalign_.set_log (logfile_); if (p.debug () > 5) contalign_.set_trace (true); } break; default: { ers << "Alignment algorithm " << p.algostr () << " not yet supported" << Throw; } } timer_.reset (DEFAULT_REPORT_IVAL, 1); return true; }
int Dedup_LowMem::execute(int argc, char** argv) { /* -------------------------------- * process the arguments * -------------------------------*/ String inFile, outFile, logFile; myDoRecab = false; bool removeFlag = false; bool verboseFlag = false; myForceFlag = false; myNumMissingMate = 0; myMinQual = DEFAULT_MIN_QUAL; String excludeFlags = "0xB04"; uint16_t intExcludeFlags = 0; bool noeof = false; bool params = false; LongParamContainer parameters; parameters.addGroup("Required Parameters"); parameters.addString("in", &inFile); parameters.addString("out", &outFile); parameters.addGroup("Optional Parameters"); parameters.addInt("minQual", & myMinQual); parameters.addString("log", &logFile); parameters.addBool("oneChrom", &myOneChrom); parameters.addBool("recab", &myDoRecab); parameters.addBool("rmDups", &removeFlag); parameters.addBool("force", &myForceFlag); parameters.addString("excludeFlags", &excludeFlags); parameters.addBool("verbose", &verboseFlag); parameters.addBool("noeof", &noeof); parameters.addBool("params", ¶ms); parameters.addPhoneHome(VERSION); myRecab.addRecabSpecificParameters(parameters); ParameterList inputParameters; inputParameters.Add(new LongParameters ("Input Parameters", parameters.getLongParameterList())); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. 
BgzfFileType::setRequireEofBlock(false); } if(inFile.IsEmpty()) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "Specify an input file" << std::endl; return EXIT_FAILURE; } if(outFile.IsEmpty()) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "Specify an output file" << std::endl; return EXIT_FAILURE; } intExcludeFlags = excludeFlags.AsInteger(); if(myForceFlag && SamFlag::isDuplicate(intExcludeFlags)) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "Cannot specify --force and Duplicate in the excludeFlags. Since --force indicates to override" << " previous duplicate setting and the excludeFlags says to skip those, you can't do both.\n"; return EXIT_FAILURE; } if(!SamFlag::isSecondary(intExcludeFlags)) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "ERROR: Secondary reads must be excluded, edit --excludeFlags to include 0x0100\n"; return EXIT_FAILURE; } if(!(intExcludeFlags & SamFlag::SUPPLEMENTARY_ALIGNMENT)) { printUsage(std::cerr); inputParameters.Status(); std::cerr << "ERROR: Supplementary reads must be excluded, edit --excludeFlags to include 0x0800\n"; return EXIT_FAILURE; } if(logFile.IsEmpty()) { logFile = outFile + ".log"; } if(myDoRecab) { int status = myRecab.processRecabParam(); if(status != 0) { inputParameters.Status(); return(status); } } if(params) { inputParameters.Status(); } Logger::gLogger = new Logger(logFile.c_str(), verboseFlag); /* ------------------------------------------------------------------- * The arguments are processed. Prepare the input BAM file, * instantiate dedup_LowMem, and construct the read group library map * ------------------------------------------------------------------*/ SamFile samIn; samIn.OpenForRead(inFile.c_str()); // If the file isn't sorted it will throw an exception. 
samIn.setSortedValidation(SamFile::COORDINATE); SamFileHeader header; samIn.ReadHeader(header); buildReadGroupLibraryMap(header); lastReference = -1; lastCoordinate = -1; // for keeping some basic statistics uint32_t recordCount = 0; uint32_t pairedCount = 0; uint32_t properPairCount = 0; uint32_t unmappedCount = 0; uint32_t reverseCount = 0; uint32_t qualCheckFailCount = 0; uint32_t secondaryCount = 0; uint32_t supplementaryCount = 0; uint32_t excludedCount = 0; // Now we start reading records SamRecord* recordPtr; SamStatus::Status returnStatus = SamStatus::SUCCESS; while(returnStatus == SamStatus::SUCCESS) { recordPtr = mySamPool.getRecord(); if(recordPtr == NULL) { std::cerr << "Failed to allocate enough records\n"; return(-1); } if(!samIn.ReadRecord(header, *recordPtr)) { returnStatus = samIn.GetStatus(); continue; } // Take note of properties of this record int flag = recordPtr->getFlag(); if(SamFlag::isPaired(flag)) ++pairedCount; if(SamFlag::isProperPair(flag)) ++properPairCount; if(SamFlag::isReverse(flag)) ++reverseCount; if(SamFlag::isQCFailure(flag)) ++qualCheckFailCount; if(SamFlag::isSecondary(flag)) ++secondaryCount; if(flag & SamFlag::SUPPLEMENTARY_ALIGNMENT) ++supplementaryCount; if(!SamFlag::isMapped(flag)) ++unmappedCount; // put the record in the appropriate maps: // single reads go in myFragmentMap // paired reads go in myPairedMap recordCount = samIn.GetCurrentRecordCount(); // if we have moved to a new position, look back at previous reads for duplicates if (hasPositionChanged(*recordPtr)) { cleanupPriorReads(recordPtr); } // Determine if this read should be checked for duplicates. if((!SamFlag::isMapped(flag)) || ((flag & intExcludeFlags) != 0)) { ++excludedCount; // No deduping done on this record, but still build the recab table. if(myDoRecab) { myRecab.processReadBuildTable(*recordPtr); } // Nothing more to do with this record, so // release the pointer. 
mySamPool.releaseRecord(recordPtr); } else { if(SamFlag::isDuplicate(flag) && !myForceFlag) { // Error: Marked duplicates, and duplicates aren't excluded. Logger::gLogger->error("There are records already duplicate marked."); Logger::gLogger->error("Use -f to clear the duplicate flag and start the dedup_LowMem procedure over"); } checkDups(*recordPtr, recordCount); mySamPool.releaseRecord(recordPtr); } // let the user know we're not napping if (verboseFlag && (recordCount % 100000 == 0)) { Logger::gLogger->writeLog("recordCount=%u singleKeyMap=%u pairedKeyMap=%u, dictSize=%u", recordCount, myFragmentMap.size(), myPairedMap.size(), myMateMap.size()); } } // we're finished reading record so clean up the duplicate search and // close the input file cleanupPriorReads(NULL); samIn.Close(); // print some statistics Logger::gLogger->writeLog("--------------------------------------------------------------------------"); Logger::gLogger->writeLog("SUMMARY STATISTICS OF THE READS"); Logger::gLogger->writeLog("Total number of reads: %u",recordCount); Logger::gLogger->writeLog("Total number of paired-end reads: %u", pairedCount); Logger::gLogger->writeLog("Total number of properly paired reads: %u", properPairCount); Logger::gLogger->writeLog("Total number of unmapped reads: %u", unmappedCount); Logger::gLogger->writeLog("Total number of reverse strand mapped reads: %u", reverseCount); Logger::gLogger->writeLog("Total number of QC-failed reads: %u", qualCheckFailCount); Logger::gLogger->writeLog("Total number of secondary reads: %u", secondaryCount); Logger::gLogger->writeLog("Total number of supplementary reads: %u", supplementaryCount); Logger::gLogger->writeLog("Size of singleKeyMap (must be zero): %u", myFragmentMap.size()); Logger::gLogger->writeLog("Size of pairedKeyMap (must be zero): %u", myPairedMap.size()); Logger::gLogger->writeLog("Total number of missing mates: %u", myNumMissingMate); Logger::gLogger->writeLog("Total number of reads excluded from duplicate 
checking: %u", excludedCount); Logger::gLogger->writeLog("--------------------------------------------------------------------------"); Logger::gLogger->writeLog("Sorting the indices of %d duplicated records", myDupList.size()); // sort the indices of duplicate records std::sort(myDupList.begin(), myDupList.end(), std::less<uint32_t> ()); // get ready to write the output file by making a second pass // through the input file samIn.OpenForRead(inFile.c_str()); samIn.ReadHeader(header); SamFile samOut; samOut.OpenForWrite(outFile.c_str()); samOut.WriteHeader(header); // If we are recalibrating, output the model information. if(myDoRecab) { myRecab.modelFitPrediction(outFile); } // an iterator to run through the duplicate indices int currentDupIndex = 0; bool moreDups = !myDupList.empty(); // let the user know what we're doing Logger::gLogger->writeLog("\nWriting %s", outFile.c_str()); // count the duplicate records as a check uint32_t singleDuplicates(0), pairedDuplicates(0); // start reading records and writing them out SamRecord record; while(samIn.ReadRecord(header, record)) { uint32_t currentIndex = samIn.GetCurrentRecordCount(); bool foundDup = moreDups && (currentIndex == myDupList[currentDupIndex]); // modify the duplicate flag and write out the record, // if it's appropriate int flag = record.getFlag(); if (foundDup) { // this record is a duplicate, so mark it. record.setFlag( flag | 0x400 ); currentDupIndex++; // increment duplicate counters to verify we found them all if ( ( ( flag & 0x0001 ) == 0 ) || ( flag & 0x0008 ) ) { // unpaired or mate unmapped singleDuplicates++; } else { pairedDuplicates++; } // recalibrate if necessary. 
if(myDoRecab) { myRecab.processReadApplyTable(record); } // write the record if we are not removing duplicates if (!removeFlag ) samOut.WriteRecord(header, record); } else { if(myForceFlag) { // this is not a duplicate we've identified but we want to // remove any duplicate marking record.setFlag( flag & 0xfffffbff ); // unmark duplicate } // Not a duplicate, so recalibrate if necessary. if(myDoRecab) { myRecab.processReadApplyTable(record); } samOut.WriteRecord(header, record); } // Let the user know we're still here if (verboseFlag && (currentIndex % 100000 == 0)) { Logger::gLogger->writeLog("recordCount=%u", currentIndex); } } // We're done. Close the files and print triumphant messages. samIn.Close(); samOut.Close(); Logger::gLogger->writeLog("Successfully %s %u unpaired and %u paired duplicate reads", removeFlag ? "removed" : "marked" , singleDuplicates, pairedDuplicates/2); Logger::gLogger->writeLog("\nDedup_LowMem complete!"); return 0; }
// Dump the reference information from specified SAM/BAM file. int DumpRefInfo::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; bool noeof = false; bool printRecordRefs = false; bool params = false; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_STRINGPARAMETER("in", &inFile) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("printRecordRefs", &printRecordRefs) LONG_PARAMETER("params", ¶ms) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); // parameters start at index 2 rather than 1. inputParameters.Read(argc, argv, 2); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument, " << "but was not specified" << std::endl; return(-1); } if(params) { inputParameters.Status(); } // Open the input file for reading. SamFile samIn; samIn.OpenForRead(inFile); // Read the sam header. SamFileHeader samHeader; samIn.ReadHeader(samHeader); const SamReferenceInfo& refInfo = samHeader.getReferenceInfo(); int numReferences = refInfo.getNumEntries(); for(int i = 0; i < numReferences; i++) { std::cout << "Reference Index " << i; std::cout << "; Name: " << refInfo.getReferenceName(i) << std::endl; } if(numReferences == 0) { // There is no reference info. std::cerr << "The header contains no reference information.\n"; } // If we are to print the references as found in the records, loop // through reading the records. if(printRecordRefs) { SamRecord samRecord; // Track the prev name/id. std::string prevName = ""; int prevID = -2; int recCount = 0; // track the num records in a ref. 
// Keep reading records until ReadRecord returns false. while(samIn.ReadRecord(samHeader, samRecord)) { const char* name = samRecord.getReferenceName(); int id = samRecord.getReferenceID(); if((strcmp(name, prevName.c_str()) != 0) || (id != prevID)) { if(prevID != -2) { std::cout << "\tRef ID: " << prevID << "\tRef Name: " << prevName << "\tNumRecs: " << recCount << std::endl; } recCount = 0; prevID = id; prevName = name; } ++recCount; } // Print the last index. if(prevID != -2) { std::cout << "\tRef ID: " << prevID << "\tRef Name: " << prevName << "\tNumRecs: " << recCount << std::endl; } } return(SamStatus::SUCCESS); }