int main(int argc, char* argv[]) { vector<string> bamFile; for(int i=1; i<argc; ++i) bamFile.push_back(argv[i]); string outFile = "combined.bam"; BamMultiReader reader; reader.Open(bamFile); SamHeader header = reader.GetHeader(); RefVector refs = reader.GetReferenceData(); BamWriter writer; writer.SetNumThreads(8); writer.Open(outFile, header, refs); assert(writer.IsOpen()); BamAlignment al; size_t numReads = 0; while(reader.GetNextAlignment(al)){ writer.SaveAlignment(al); if(++numReads % 10000 == 0) cerr << setw(12) << numReads << '\r'; } cerr << setw(12) << numReads << endl; cerr << "done!" << endl; }
void Demux::match(){ BamReader bamReader(bamFilePath); SamRecord samRecord; while ( bamReader.getNextRecord(samRecord)) { string recordName(samRecord.getReadName()); recordName = cHandler.decrypt(recordName); //clean record name int len=recordName.find("$"); recordName=recordName.substr(0,len); //clean ended if (!isDecoy(recordName)){ string outputFile = generateFileName(recordName); printf("%s\n",outputFile.c_str()); if (writers.find(outputFile) == writers.end()) //if the BamWriter is not initialized writers[outputFile] = new BamWriter(outputFile, bamReader.getHeader()); BamWriter *writer = writers[outputFile]; writer->writeRecord(samRecord); } } for(auto it=writers.begin();it!=writers.end();it++){ BamWriter *writer = it->second; writer->close(); delete writer; } }
// Intersect paired-end alignments from a queryname-grouped BAM against the
// "B" BED file, optionally emitting results as BAM on stdout.
// Exits with status 1 if the BAM is not grouped by query name.
void BedIntersectPE::IntersectBamPE(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedFileIntoMap();

    // open the BAM file
    BamReader reader;
    BamWriter writer;
    reader.Open(bamFile);

    // get header & reference information
    string bamHeader = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // set compression mode
        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
        if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
        writer.SetCompressionMode(compressionMode);
        // open our BAM writer
        writer.Open("stdout", bamHeader, refs);
    }

    _bedA->bedType = 10;  // it's a full BEDPE given it's BAM

    // rip through the BAM file and convert each mapped entry to BEDPE
    BamAlignment bam1, bam2;
    while (reader.GetNextAlignment(bam1)) {
        // the alignment must be paired
        if (bam1.IsPaired() == true) {
            // grab the second alignment for the pair; BUGFIX: previously the
            // return value was ignored, so a truncated/odd-count file would
            // silently reuse the stale contents of bam2
            if (!reader.GetNextAlignment(bam2)) {
                cerr << "*****ERROR: -bedpe requires BAM to be sorted or grouped by query name. " << endl;
                exit(1);
            }
            // require that the alignments are from the same query
            if (bam1.Name == bam2.Name) {
                ProcessBamBlock(bam1, bam2, refs, writer);
            }
            else {
                cerr << "*****ERROR: -bedpe requires BAM to be sorted or grouped by query name. " << endl;
                exit(1);
            }
        }
    }
    // close up
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
bool RevertTool::RevertToolPrivate::Run(void) { // opens the BAM file without checking for indexes BamReader reader; if ( !reader.Open(m_settings->InputFilename) ) { cerr << "Could not open input BAM file... quitting." << endl; return false; } // get BAM file metadata const string& headerText = reader.GetHeaderText(); const RefVector& references = reader.GetReferenceData(); // open writer BamWriter writer; bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression ); if ( !writer.Open(m_settings->OutputFilename, headerText, references, writeUncompressed) ) { cerr << "Could not open " << m_settings->OutputFilename << " for writing." << endl; return false; } // plow through file, reverting alignments BamAlignment al; while ( reader.GetNextAlignment(al) ) { RevertAlignment(al); writer.SaveAlignment(al); } // clean and exit reader.Close(); writer.Close(); return true; }
// For every combination of one alignment from range_a and one from range_b,
// emit the pair (both mates suffixed with the same running pair index) to
// ValidOut when the pair passes isValidPair(); valid_count tracks the index.
void processGroupRange (GroupRange& range_a, GroupRange& range_b, int& valid_count) {
    for (Group::iterator first = range_a.first; first != range_a.second; ++first) {
        for (Group::iterator second = range_b.first; second != range_b.second; ++second) {
            // work on copies: the orientation swap and renaming below must not
            // touch the alignments stored in the groups
            Alignment a = first->second;
            Alignment b = second->second;

            // ensure 'a' is the most 5' alignment
            if (b.position() < a.position()) {
                std::swap(a, b);
            }

            if (!isValidPair(a, b)) {
                continue;
            }

            // tag both mates with the same pair counter so they stay associated
            std::stringstream suffix;
            suffix << valid_count;

            Alignment x, y;
            makePair(a, b, x, y);
            x.Name += "-" + suffix.str();
            y.Name += "-" + suffix.str();

            ValidOut.SaveAlignment(x);
            ValidOut.SaveAlignment(y);
            valid_count++;
        }
    }
}
//{{{ void process_intra_chrom_pair(const BamAlignment &curr, void SV_Pair:: process_intra_chrom_pair(const BamAlignment &curr, const RefVector refs, BamWriter &inter_chrom_reads, map<string, BamAlignment> &mapped_pairs, UCSCBins<SV_BreakPoint*> &r_bin, int weight, int ev_id, SV_PairReader *reader) { if (curr.RefID == curr.MateRefID) { process_pair(curr, refs, mapped_pairs, r_bin, weight, ev_id, reader); } else if (curr.IsMapped() && curr.IsMateMapped() && (curr.RefID >= 0) && (curr.MateRefID >= 0) ) { BamAlignment al = curr; string x = reader->get_source_file_name(); al.AddTag("LS","Z",x); inter_chrom_reads.SaveAlignment(al); } }
int main (int argc, char *argv[]) { if( (argc== 1) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cout<<"Usage:setAsUnpaired [in bam] [outbam]"<<endl<<"this program takes flags all paired sequences as singles"<<endl; return 1; } string bamfiletopen = string(argv[1]); string bamFileOUT = string(argv[2]); BamReader reader; BamWriter writer; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM files." << endl; return 1; } const SamHeader header = reader.GetHeader(); const RefVector references = reader.GetReferenceData(); if ( !writer.Open(bamFileOUT,header,references) ) { cerr << "Could not open output BAM file "<<bamFileOUT << endl; return 1; } BamAlignment al; while ( reader.GetNextAlignment(al) ) { if(al.IsMapped()){ cerr << "Cannot yet handle mapped reads " << endl; return 1; } al.SetIsPaired (false); writer.SaveAlignment(al); } //while al reader.Close(); writer.Close(); return 0; }
// merges sorted temp BAM files into single sorted output BAM file bool SortTool::SortToolPrivate::MergeSortedRuns(void) { // open up multi reader for all of our temp files // this might get broken up if we do a multi-pass system later ?? BamMultiReader multiReader; if ( !multiReader.Open(m_tempFilenames) ) { cerr << "bamtools sort ERROR: could not open BamMultiReader for merging temp files... Aborting." << endl; return false; } // set sort order for merge if ( m_settings->IsSortingByName ) multiReader.SetSortOrder(BamMultiReader::SortedByReadName); else multiReader.SetSortOrder(BamMultiReader::SortedByPosition); // open writer for our completely sorted output BAM file BamWriter mergedWriter; if ( !mergedWriter.Open(m_settings->OutputBamFilename, m_headerText, m_references) ) { cerr << "bamtools sort ERROR: could not open " << m_settings->OutputBamFilename << " for writing... Aborting." << endl; multiReader.Close(); return false; } // while data available in temp files BamAlignment al; while ( multiReader.GetNextAlignmentCore(al) ) mergedWriter.SaveAlignment(al); // close readers multiReader.Close(); mergedWriter.Close(); // delete all temp files vector<string>::const_iterator tempIter = m_tempFilenames.begin(); vector<string>::const_iterator tempEnd = m_tempFilenames.end(); for ( ; tempIter != tempEnd; ++tempIter ) { const string& tempFilename = (*tempIter); remove(tempFilename.c_str()); } return true; }
bool SortTool::SortToolPrivate::WriteTempFile(const vector<BamAlignment>& buffer, const string& tempFilename) { // open temp file for writing BamWriter tempWriter; if ( !tempWriter.Open(tempFilename, m_headerText, m_references) ) { cerr << "bamtools sort ERROR: could not open " << tempFilename << " for writing." << endl; return false; } // write data vector<BamAlignment>::const_iterator buffIter = buffer.begin(); vector<BamAlignment>::const_iterator buffEnd = buffer.end(); for ( ; buffIter != buffEnd; ++buffIter ) { const BamAlignment& al = (*buffIter); tempWriter.SaveAlignment(al); } // close temp file & return success tempWriter.Close(); return true; }
bool CompressTool::CompressToolPrivate::Run(void) { // ------------------------------------ // initialize conversion input/output // set to default input if none provided if ( !m_settings->HasInput ) m_settings->InputFilename = Options::StandardIn(); if ( !m_settings->HasOutput ) m_settings->InputFilename = Options::StandardOut(); SamReader reader; reader.Open(m_settings->InputFilename); BamWriter writer; writer.Open( m_settings->OutputFilename, reader.GetHeader(), reader.GetReferenceData()); int alignment_ct = 0; while(true) { BamAlignment alignment; if(!reader.GetNextAlignment(alignment)) break; writer.SaveAlignment(alignment); alignment_ct++; //progress indicator //if(alignment_ct % 500000 == 0) // cerr << "."; } reader.Close(); writer.Close(); return true; }
// Serialize a batch of CCS results as unmapped BAM records into ccsBam,
// optionally feeding each record's file offset to the PBI index builder.
// `counts` accumulates the batch totals; `results` is consumed by iteration.
// NOTE(review): tag two-letter codes follow the PacBio BAM spec conventions
// (zm/np/rq/sn/...) — confirm against the spec version this tool targets.
void WriteBamRecords(BamWriter& ccsBam, unique_ptr<PbiBuilder>& ccsPbi, Results& counts, Results&& results)
{
    // fold this batch's tallies into the running totals
    counts += results;
    for (const auto& ccs : results) {
        BamRecordImpl record;
        TagCollection tags;
        stringstream name;

        // some defaults values — the record is explicitly marked unmapped,
        // with sentinel (-1) positions/reference ids throughout
        record.Bin(0)
            .InsertSize(0)
            .MapQuality(255)
            .MatePosition(-1)
            .MateReferenceId(-1)
            .Position(-1)
            .ReferenceId(-1)
            .Flag(0)
            .SetMapped(false);

        // read name convention: <movie>/<hole-number>/ccs
        name << *(ccs.Id.MovieName) << '/' << ccs.Id.HoleNumber << "/ccs";

        // per-channel signal-to-noise, in fixed A,C,G,T order
        vector<float> snr = {
            static_cast<float>(ccs.SignalToNoise.A),
            static_cast<float>(ccs.SignalToNoise.C),
            static_cast<float>(ccs.SignalToNoise.G),
            static_cast<float>(ccs.SignalToNoise.T) };

        tags["RG"] = MakeReadGroupId(*(ccs.Id.MovieName), "CCS");
        tags["zm"] = static_cast<int32_t>(ccs.Id.HoleNumber);
        tags["np"] = static_cast<int32_t>(ccs.NumPasses);
        tags["rq"] = static_cast<float>(ccs.PredictedAccuracy);
        tags["sn"] = snr;

        // TODO(lhepler) maybe remove one day
        tags["za"] = static_cast<float>(ccs.AvgZScore);
        // Z-scores are narrowed double -> float before being stored
        vector<float> zScores;
        for (const double z : ccs.ZScores)
            zScores.emplace_back(static_cast<float>(z));
        tags["zs"] = zScores;
        tags["rs"] = ccs.StatusCounts;

#if DIAGNOSTICS
        // extra diagnostic tags, only compiled in for diagnostic builds
        if (ccs.Barcodes) {
            vector<uint16_t> bcs{ccs.Barcodes->first, ccs.Barcodes->second};
            tags["bc"] = bcs;
        }
        tags["ms"] = ccs.ElapsedMilliseconds;
        tags["mt"] = static_cast<int32_t>(ccs.MutationsTested);
        tags["ma"] = static_cast<int32_t>(ccs.MutationsApplied);
#endif

        record.Name(name.str()).SetSequenceAndQualities(ccs.Sequence, ccs.Qualities).Tags(tags);

        // write the record and capture its virtual offset for the PBI index
        int64_t offset;
        ccsBam.Write(record, &offset);
        if (ccsPbi) ccsPbi->AddRecord(record, offset);
    }
    // push the batch out to disk before returning
    ccsBam.TryFlush();
}
// bamrealignment: re-align every mapped read of a BAM file with a banded
// Smith-Waterman pass (reference reconstructed from query bases + MD tag),
// updating CIGAR/MD/position in place and writing all reads to the output.
// Phase 1 — option parsing, optional log file, reader/writer setup (with the
// requested thread count and compression mode), and the verbose banner.
int main (int argc, const char *argv[]) { printf ("------------- bamrealignment --------------\n"); OptArgs opts; opts.ParseCmdLine(argc, argv); vector<int> score_vals(4); string input_bam = opts.GetFirstString ('i', "input", ""); string output_bam = opts.GetFirstString ('o', "output", ""); opts.GetOption(score_vals, "4,-6,-5,-2", 's', "scores"); int clipping = opts.GetFirstInt ('c', "clipping", 2); bool anchors = opts.GetFirstBoolean ('a', "anchors", true); int bandwidth = opts.GetFirstInt ('b', "bandwidth", 10); bool verbose = opts.GetFirstBoolean ('v', "verbose", false); bool debug = opts.GetFirstBoolean ('d', "debug", false); int format = opts.GetFirstInt ('f', "format", 1); int num_threads = opts.GetFirstInt ('t', "threads", 8); string log_fname = opts.GetFirstString ('l', "log", ""); if (input_bam.empty() or output_bam.empty()) return PrintHelp(); opts.CheckNoLeftovers(); std::ofstream logf; if (log_fname.size ()) { logf.open (log_fname.c_str ()); if (!logf.is_open ()) { fprintf (stderr, "bamrealignment: Failed to open log file %s\n", log_fname.c_str()); return 1; } } BamReader reader; if (!reader.Open(input_bam)) { fprintf(stderr, "bamrealignment: Failed to open input file %s\n", input_bam.c_str()); return 1; } SamHeader header = reader.GetHeader(); RefVector refs = reader.GetReferenceData(); BamWriter writer; writer.SetNumThreads(num_threads); if (format == 1) writer.SetCompressionMode(BamWriter::Uncompressed); else writer.SetCompressionMode(BamWriter::Compressed); if (!writer.Open(output_bam, header, refs)) { fprintf(stderr, "bamrealignment: Failed to open output file %s\n", output_bam.c_str()); return 1; } // The meat starts here ------------------------------------ if (verbose) cout << "Verbose option is activated, each alignment will print to screen." 
// Phase 2 — rest of the interactive-verbose banner, then the per-run counters,
// Realigner configuration (scores, bandwidth), and the start of the main read
// loop: unmapped reads fall straight through; mapped reads fetch their MD tag.
<< endl << " After a read hit RETURN to continue to the next one," << endl << " or press q RETURN to quit the program," << endl << " or press s Return to silence verbose," << endl << " or press c RETURN to continue printing without further prompt." << endl << endl; unsigned int readcounter = 0; unsigned int mapped_readcounter = 0; unsigned int realigned_readcounter = 0; unsigned int modified_alignment_readcounter = 0; unsigned int pos_update_readcounter = 0; unsigned int failed_clip_realigned_readcount = 0; unsigned int already_perfect_readcount = 0; unsigned int bad_md_tag_readcount = 0; unsigned int error_recreate_ref_readcount = 0; unsigned int error_clip_anchor_readcount = 0; unsigned int error_sw_readcount = 0; unsigned int error_unclip_readcount = 0; unsigned int start_position_shift; int orig_position; int new_position; string md_tag, new_md_tag, input = "x"; vector<CigarOp> new_cigar_data; vector<MDelement> new_md_data; bool position_shift = false; time_t start_time = time(NULL); Realigner aligner; aligner.verbose_ = verbose; aligner.debug_ = debug; if (!aligner.SetScores(score_vals)) cout << "bamrealignment: Four scores need to be provided: match, mismatch, gap open, gap extend score!" << endl; aligner.SetAlignmentBandwidth(bandwidth); BamAlignment alignment; while(reader.GetNextAlignment(alignment)){ readcounter ++; position_shift = false; if ( (readcounter % 100000) == 0 ) cout << "Processed " << readcounter << " reads. Elapsed time: " << (time(NULL) - start_time) << endl; if (alignment.IsMapped()) { orig_position = alignment.Position; mapped_readcounter++; aligner.SetClipping(clipping, !alignment.IsReverseStrand()); if (aligner.verbose_) { cout << endl; if (alignment.IsReverseStrand()) cout << "The read is from the reverse strand." << endl; else cout << "The read is from the forward strand." << endl; } if (!alignment.GetTag("MD", md_tag)) { if (aligner.verbose_) cout << "Warning: Skipping read " << alignment.Name << ". It is mapped but missing MD tag." 
// Phase 3 — missing-MD path (logged as MISSMD, read left unchanged), or
// reference reconstruction + banded SW alignment; SW failures (SWERR) and the
// clipped-anchor failures are counted and the read is written through as-is.
<< endl; if (logf.is_open ()) logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "MISSMD" << '\n'; bad_md_tag_readcount++; } else if (aligner.CreateRefFromQueryBases(alignment.QueryBases, alignment.CigarData, md_tag, anchors)) { bool clipfail = false; if (Realigner::CR_ERR_CLIP_ANCHOR == aligner.GetCreateRefError ()) { clipfail = true; failed_clip_realigned_readcount ++; } if (!aligner.computeSWalignment(new_cigar_data, new_md_data, start_position_shift)) { if (aligner.verbose_) cout << "Error in the alignment! Not updating read information." << endl; if (logf.is_open ()) logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "SWERR" << '\n'; error_sw_readcount++; writer.SaveAlignment(alignment); // Write alignment unchanged continue; } if (!aligner.addClippedBasesToTags(new_cigar_data, new_md_data, alignment.QueryBases.size())) { if (aligner.verbose_) cout << "Error when adding clipped anchors back to tags! Not updating read information." 
// Phase 4 — unclip failure path (UNCLIPERR, read written unchanged), then the
// successful-realignment path: new MD string, optional start-position shift,
// MOD/UNMOD logging, the interactive verbose prompt (q=quit, s=silence,
// c=continue), and finally the in-place CIGAR/MD update on the alignment.
<< endl; if (logf.is_open ()) logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "UNCLIPERR" << '\n'; writer.SaveAlignment(alignment); // Write alignment unchanged error_unclip_readcount ++; continue; } new_md_tag = aligner.GetMDstring(new_md_data); realigned_readcounter++; // adjust start position of read if (!aligner.LeftAnchorClipped() and start_position_shift != 0) { new_position = aligner.updateReadPosition(alignment.CigarData, (int)start_position_shift, alignment.Position); if (new_position != alignment.Position) { pos_update_readcounter++; position_shift = true; alignment.Position = new_position; } } if (position_shift || alignment.CigarData.size () != new_cigar_data.size () || md_tag != new_md_tag) { if (logf.is_open ()) { logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "MOD"; if (position_shift) logf << "-SHIFT"; if (clipfail) logf << " NOCLIP"; logf << '\n'; } modified_alignment_readcounter++; } else { if (logf.is_open ()) { logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "UNMOD"; if (clipfail) logf << " NOCLIP"; logf << '\n'; } } if (aligner.verbose_){ cout << alignment.Name << endl; cout << "------------------------------------------" << endl; // Wait for input to continue or quit program if (input.size() == 0) input = 'x'; else if (input[0] != 'c' and input[0] != 'C') getline(cin, input); if (input.size()>0){ if (input[0] == 'q' or input[0] == 'Q') return 1; else if (input[0] == 's' or input[0] == 'S') aligner.verbose_ = false; } } // Finally update alignment information alignment.CigarData = new_cigar_data; alignment.EditTag("MD", "Z" , new_md_tag); } // end of CreateRef else if else { switch (aligner.GetCreateRefError ()) { case 
// Phase 5 — classification of reference-recreation failures (RECRERR /
// CLIPERR / already-perfect), another interactive prompt, the optional debug
// re-run for invalid CIGAR/MD pairs, and the unconditional write of the read.
Realigner::CR_ERR_RECREATE_REF: if (logf.is_open ()) logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "RECRERR" << '\n'; error_recreate_ref_readcount++; break; case Realigner::CR_ERR_CLIP_ANCHOR: if (logf.is_open ()) logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "CLIPERR" << '\n'; error_clip_anchor_readcount++; break; default: // On a good run this writes way too many reads to the log file - don't want to create a too large txt file // if (logf.is_open ()) //logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "PERFECT" << '\n'; already_perfect_readcount++; break; } if (aligner.verbose_) { cout << alignment.Name << endl; cout << "------------------------------------------" << endl; // Wait for input to continue or quit program if (input.size() == 0) input = 'x'; else if (input[0] != 'c' and input[0] != 'C') getline(cin, input); if (input.size()>0){ if (input[0] == 'q' or input[0] == 'Q') return 1; else if (input[0] == 's' or input[0] == 'S') aligner.verbose_ = false; } } } // --- Debug output for Rajesh --- if (debug && aligner.invalid_cigar_in_input) { aligner.verbose_ = true; cout << "Invalid cigar string / md tag pair in read " << alignment.Name << endl; // Rerun reference generation to display error aligner.CreateRefFromQueryBases(alignment.QueryBases, alignment.CigarData, md_tag, anchors); aligner.verbose_ = verbose; aligner.invalid_cigar_in_input = false; } // --- --- --- } // end of if isMapped writer.SaveAlignment(alignment); } // end while loop over reads if (aligner.invalid_cigar_in_input) cerr << "WARNING bamrealignment: There were invalid cigar string / md tag pairs in the input bam file." 
// Phase 6 — end-of-run summary: per-category counts, elapsed time, and a note
// that the output BAM may need re-sorting (positions can shift).
<< endl; // ---------------------------------------------------------------- // program end -- output summary information cout << " File: " << input_bam << endl << " Total reads: " << readcounter << endl << " Mapped reads: " << mapped_readcounter << endl; if (bad_md_tag_readcount) cout << " Skipped: bad MD tags: " << bad_md_tag_readcount << endl; if (error_recreate_ref_readcount) cout << " Skipped: unable to recreate ref: " << error_recreate_ref_readcount << endl; if (error_clip_anchor_readcount) cout << " Skipped: error clipping anchor: " << error_clip_anchor_readcount << endl; cout << " Skipped: already perfect: " << already_perfect_readcount << endl << " Total reads realigned: " << mapped_readcounter - already_perfect_readcount - bad_md_tag_readcount - error_recreate_ref_readcount - error_clip_anchor_readcount << endl; if (failed_clip_realigned_readcount) cout << " (including " << failed_clip_realigned_readcount << " that failed to clip)" << endl; if (error_sw_readcount) cout << " Failed to complete SW alignment: " << error_sw_readcount << endl; if (error_unclip_readcount) cout << " Failed to unclip anchor: " << error_unclip_readcount << endl; cout << " Succesfully realigned: " << realigned_readcounter << endl << " Modified alignments: " << modified_alignment_readcounter << endl << " Shifted position: " << pos_update_readcounter << endl; cout << "Processing time: " << (time(NULL)-start_time) << " seconds." << endl; cout << "INFO: The output BAM file may be unsorted." << endl; cout << "------------------------------------------" << endl; return 0; }
int main (int argc, char *argv[]) { bool produceUnCompressedBAM=false; bool verbose=false; bool ancientDNA=false; bool keepOrig=false; string adapter_F=options_adapter_F_BAM; string adapter_S=options_adapter_S_BAM; string adapter_chimera=options_adapter_chimera_BAM; string key=""; bool allowMissing=false; int trimCutoff=1; bool allowAligned=false; bool printLog=false; string logFileName; BamReader reader; BamWriter writer; string bamFile; string bamFileOUT=""; string key1; string key2; bool useDist=false; double location=-1.0; double scale =-1.0; bool fastqFormat=false; string fastqfile1 = ""; string fastqfile2 = ""; string fastqoutfile = ""; bool singleEndModeFQ=true; const string usage=string(string(argv[0])+ " [options] BAMfile"+"\n"+ "\nThis program takes an unaligned BAM where mates are consecutive\nor fastq files and trims and merges reads\n"+ "\n\tYou can specify a unaligned bam file or one or two fastq :\n"+ "\t\t"+"-fq1" +"\t\t"+"First fastq"+"\n"+ "\t\t"+"-fq2" +"\t\t"+"Second fastq file (for paired-end)"+"\n"+ "\t\t"+"-fqo" +"\t\t"+"Output fastq prefix"+"\n\n"+ //"\t"+"-p , --PIPE"+"\n\t\t"+"Read BAM from and write it to PIPE"+"\n"+ "\t"+"-o , --outfile" +"\t\t"+"Output (BAM format)."+"\n"+ "\t"+"-u " +"\t\t"+"Produce uncompressed bam (good for pipe)"+"\n"+ // "\t"+" , --outprefix" +"\n\t\t"+"Prefix for output files (default '"+outprefix+"')."+"\n"+ //"\t"+" , --SAM" +"\n\t\t"+"Output SAM not BAM."+"\n"+ "\t"+"--aligned" +"\t\t"+"Allow reads to be aligned (default "+boolStringify(allowAligned)+")"+"\n"+ "\t"+"-v , --verbose" +"\t\t"+"Turn all messages on (default "+boolStringify(verbose)+")"+"\n"+ "\t"+"--log [log file]" +"\t"+"Print a tally of merged reads to this log file (default only to stderr)"+"\n"+ "\n\t"+"Paired End merging/Single Read trimming options"+"\n"+ "\t\t"+"You can specify either:"+"\n"+ "\t\t\t"+"--ancientdna"+"\t\t\t"+"ancient DNA (default "+boolStringify(ancientDNA)+")"+"\n"+ "\t\t"+" "+"\t\t\t\t"+"this allows for partial 
overlap"+"\n"+ "\n\t\t"+"or if you know your size length distribution:"+"\n"+ "\t\t\t"+"--loc"+"\t\t\t\t"+"Location for lognormal dist. (default none)"+"\n"+ "\t\t\t"+"--scale"+"\t\t\t\t"+"Scale for lognormal dist. (default none)"+"\n"+ // "\t\t\t\t\t\t\tGood for merging ancient DNA reads into a single sequence\n\n" "\n\t\t"+"--keepOrig"+"\t\t\t\t"+"Write original reads if they are trimmed or merged (default "+boolStringify(keepOrig)+")"+"\n"+ "\t\t\t\t\t\t\tSuch reads will be marked as PCR duplicates\n\n" "\t\t"+"-f , --adapterFirstRead" +"\t\t\t"+"Adapter that is observed after the forward read (def. Multiplex: "+options_adapter_F_BAM.substr(0,30)+")"+"\n"+ "\t\t"+"-s , --adapterSecondRead" +"\t\t"+"Adapter that is observed after the reverse read (def. Multiplex: "+options_adapter_S_BAM.substr(0,30)+")"+"\n"+ "\t\t"+"-c , --FirstReadChimeraFilter" +"\t\t"+"If the forward read looks like this sequence, the cluster is filtered out.\n\t\t\t\t\t\t\tProvide several sequences separated by comma (def. Multiplex: "+options_adapter_chimera_BAM.substr(0,30)+")"+"\n"+ "\t\t"+"-k , --key"+"\t\t\t\t"+"Key sequence with which each sequence starts. Comma separate for forward and reverse reads. (default '"+key+"')"+"\n"+ "\t\t"+"-i , --allowMissing"+"\t\t\t"+"Allow one base in one key to be missing or wrong. 
(default "+boolStringify(allowMissing)+")"+"\n"+ "\t\t"+"-t , --trimCutoff"+"\t\t\t"+"Lowest number of adapter bases to be observed for single Read trimming (default "+stringify(trimCutoff)+")"); if( (argc== 1) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cout<<"Usage:"<<endl; cout<<""<<endl; cout<<usage<<endl; return 1; } for(int i=1;i<(argc-1);i++){ //all but the last arg if(strcmp(argv[i],"-fq1") == 0 ){ fastqfile1=string(argv[i+1]); fastqFormat=true; i++; continue; } if(strcmp(argv[i],"-fq2") == 0 ){ fastqfile2=string(argv[i+1]); fastqFormat=true; singleEndModeFQ=false; i++; continue; } if(strcmp(argv[i],"-fqo") == 0 ){ fastqoutfile=string(argv[i+1]); fastqFormat=true; i++; continue; } if(strcmp(argv[i],"--log") == 0 ){ logFileName =string(argv[i+1]); printLog=true; i++; continue; } if(strcmp(argv[i],"-p") == 0 || strcmp(argv[i],"--PIPE") == 0 ){ cerr<<"This version no longer works with pipe, exiting"<<endl; return 1; } if(strcmp(argv[i],"-u") == 0 ){ produceUnCompressedBAM=true; continue; } if(strcmp(argv[i],"--aligned") == 0 ){ allowAligned=true; continue; } if(strcmp(argv[i],"-o") == 0 || strcmp(argv[i],"--outfile") == 0 ){ bamFileOUT =string(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-v") == 0 || strcmp(argv[i],"--verbose") == 0 ){ verbose=true; continue; } if(strcmp(argv[i],"--ancientdna") == 0 ){ ancientDNA=true; continue; } if(strcmp(argv[i],"--keepOrig") == 0 ){ keepOrig=true; continue; } if(strcmp(argv[i],"--loc") == 0 ){ location =destringify<double>(argv[i+1]); i++; continue; } if(strcmp(argv[i],"--scale") == 0 ){ scale =destringify<double>(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-f") == 0 || strcmp(argv[i],"--adapterFirstRead") == 0 ){ adapter_F =string(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-s") == 0 || strcmp(argv[i],"--adapterSecondRead") == 0 ){ adapter_S =string(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-c") == 0 || 
strcmp(argv[i],"--FirstReadChimeraFilter") == 0 ){ adapter_chimera =string(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-k") == 0 || strcmp(argv[i],"--keys") == 0 ){ key =string(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-i") == 0 || strcmp(argv[i],"--allowMissing") == 0 ){ allowMissing=true; continue; } if(strcmp(argv[i],"-t") == 0 || strcmp(argv[i],"--trimCutoff") == 0 ){ trimCutoff=atoi(argv[i+1]); i++; continue; } cerr<<"Unknown option "<<argv[i] <<" exiting"<<endl; return 1; } bamFile=argv[argc-1]; if( (location != -1.0 && scale == -1.0) || (location == -1.0 && scale != -1.0) ){ cerr<<"Cannot specify --location without specifying --scale"<<endl; return 1; } if( (location != -1.0 && scale != -1.0) ){ useDist=true; if(ancientDNA){ cerr<<"Cannot specify --location/--scale and --ancientDNA"<<endl; return 1; } } MergeTrimReads mtr (adapter_F,adapter_S,adapter_chimera, key1,key2, trimCutoff,allowMissing,ancientDNA,location,scale,useDist); fqwriters onereadgroup; if(fastqFormat){ if( bamFileOUT != "" || produceUnCompressedBAM || allowAligned){ cerr<<"ERROR : Cannot specify options like -o, -u or --allowAligned for fastq"<<endl; return 1; } if(fastqfile1 == ""){ cerr<<"ERROR : Must specify as least the first file for fastq"<<endl; return 1; } FastQParser * fqp1; FastQParser * fqp2; if(singleEndModeFQ){ fqp1 = new FastQParser (fastqfile1); string outdirs = fastqoutfile+".fq.gz"; string outdirsf = fastqoutfile+".fail.fq.gz"; onereadgroup.single.open(outdirs.c_str(), ios::out); onereadgroup.singlef.open(outdirsf.c_str(), ios::out); if(!onereadgroup.single.good()){ cerr<<"Cannot write to file "<<outdirs<<endl; return 1; } if(!onereadgroup.singlef.good()){ cerr<<"Cannot write to file "<<outdirsf<<endl; return 1; } }else{ fqp1 = new FastQParser (fastqfile1); fqp2 = new FastQParser (fastqfile2); string outdirs = fastqoutfile+".fq.gz"; string outdir1 = fastqoutfile+"_r1.fq.gz"; string outdir2 = fastqoutfile+"_r2.fq.gz"; string outdirsf = fastqoutfile+".fail.fq.gz"; 
string outdir1f = fastqoutfile+"_r1.fail.fq.gz"; string outdir2f = fastqoutfile+"_r2.fail.fq.gz"; onereadgroup.single.open(outdirs.c_str(), ios::out); onereadgroup.pairr1.open(outdir1.c_str(), ios::out); onereadgroup.pairr2.open(outdir2.c_str(), ios::out); onereadgroup.singlef.open(outdirsf.c_str(), ios::out); onereadgroup.pairr1f.open(outdir1f.c_str(), ios::out); onereadgroup.pairr2f.open(outdir2f.c_str(), ios::out); if(!onereadgroup.single.good()){ cerr<<"Cannot write to file "<<outdirs<<endl; return 1; } if(!onereadgroup.pairr1.good()){ cerr<<"Cannot write to file "<<outdir1<<endl; return 1; } if(!onereadgroup.pairr2.good()){ cerr<<"Cannot write to file "<<outdir2<<endl; return 1; } if(!onereadgroup.singlef.good()){ cerr<<"Cannot write to file "<<outdirsf<<endl; return 1; } if(!onereadgroup.pairr1f.good()){ cerr<<"Cannot write to file "<<outdir1f<<endl; return 1; } if(!onereadgroup.pairr2f.good()){ cerr<<"Cannot write to file "<<outdir2f<<endl; return 1; } } unsigned int totalSeqs=0; while(fqp1->hasData()){ FastQObj * fo1=fqp1->getData(); vector<string> def1=allTokens( *(fo1->getID()), ' ' ); string def1s=def1[0]; FastQObj * fo2; string def2s; string ext2s; if(!singleEndModeFQ){ if(!fqp2->hasData()){ cerr << "ERROR: Discrepency between fastq files at record " << *(fo1->getID()) <<endl; return 1; } fo2=fqp2->getData(); vector<string> def2=allTokens( *(fo2->getID()), ' ' ); def2s=def2[0]; if(strEndsWith(def1s,"/1")){ def1s=def1s.substr(0,def1s.size()-2); } if(strEndsWith(def2s,"/2")){ def2s=def2s.substr(0,def2s.size()-2); } if(strBeginsWith(def1s,"@")){ def1s=def1s.substr(1,def1s.size()-1); } if(strBeginsWith(def2s,"@")){ def2s=def2s.substr(1,def2s.size()-1); } if(def1s != def2s){ cerr << "ERROR: Discrepency between fastq files, different names " << *(fo1->getID()) <<" and "<< *(fo2->getID()) <<endl; return 1; } merged result= mtr.process_PE(*(fo1->getSeq()),*(fo1->getQual()), *(fo2->getSeq()),*(fo2->getQual())); mtr.incrementCountall(); if(result.code != ' '){ 
//keys or chimeras if(result.code == 'K'){ mtr.incrementCountfkey(); }else{ if(result.code == 'D'){ mtr.incrementCountchimera(); }else{ cerr << "leehom: Wrong return code =\""<<result.code<<"\""<<endl; exit(1); } } onereadgroup.pairr2f<<"@"<<def2s<<"/2" <<endl <<*(fo2->getSeq())<<endl<<"+"<<endl <<*(fo2->getQual())<<endl; onereadgroup.pairr1f<<"@"<<def1s<<"/1" <<endl <<*(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl; continue; }else{ if(result.sequence != ""){ //new sequence onereadgroup.single<<"@"<<def1s<<"" <<endl << result.sequence<<endl<<"+"<<endl <<result.quality<<endl; if( result.sequence.length() > max(fo1->getSeq()->length(),fo2->getSeq()->length()) ){ mtr.incrementCountmergedoverlap(); }else{ mtr.incrementCountmerged(); } }else{ //keep as is mtr.incrementCountnothing(); onereadgroup.pairr2<<"@"<<def2s<<"/2" <<endl <<*(fo2->getSeq())<<endl<<"+"<<endl <<*(fo2->getQual())<<endl; onereadgroup.pairr1<<"@"<<def1s<<"/1" <<endl <<*(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl; } } }else{ merged result=mtr.process_SR(*(fo1->getSeq()),*(fo1->getQual())); mtr.incrementCountall(); if(result.code != ' '){ //either chimera or missing key if(result.code == 'K'){ mtr.incrementCountfkey(); }else{ if(result.code == 'D'){ mtr.incrementCountchimera(); }else{ cerr << "leehom: Wrong return code =\""<<result.code<<"\""<<endl; exit(1); } } onereadgroup.singlef<<""<<*(fo1->getID())<<"" <<endl << *(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl; continue; } if(result.sequence != ""){ //new sequence mtr.incrementCounttrimmed(); onereadgroup.single<<""<<*(fo1->getID())<<"" <<endl << result.sequence<<endl<<"+"<<endl <<result.quality<<endl; }else{ mtr.incrementCountnothing(); onereadgroup.single<<""<<*(fo1->getID())<<"" <<endl << *(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl; } } totalSeqs++; } delete fqp1; if(!singleEndModeFQ){ delete fqp2; } if(singleEndModeFQ){ onereadgroup.single.close(); onereadgroup.singlef.close(); }else{ 
onereadgroup.single.close(); onereadgroup.pairr1.close(); onereadgroup.pairr2.close(); onereadgroup.singlef.close(); onereadgroup.pairr1f.close(); onereadgroup.pairr2f.close(); } //fastq }else{ //else BAM // initMerge(); // set_adapter_sequences(adapter_F, // adapter_S, // adapter_chimera); // set_options(trimCutoff,allowMissing,mergeoverlap); if(key != ""){ size_t found=key.find(","); if (found == string::npos){ //single end reads key1=key; key2=""; } else{ //paired-end key1=key.substr(0,found); key2=key.substr(found+1,key.length()-found+1); } } if( bamFileOUT == "" ){ cerr<<"The output must be a be specified, exiting"<<endl; return 1; } if ( !reader.Open(bamFile) ) { cerr << "Could not open input BAM file "<<bamFile << endl; return 1; } SamHeader header = reader.GetHeader(); string pID = "mergeTrimReadsBAM"; string pName = "mergeTrimReadsBAM"; string pCommandLine = ""; for(int i=0;i<(argc);i++){ pCommandLine += (string(argv[i])+" "); } putProgramInHeader(&header,pID,pName,pCommandLine,returnGitHubVersion(string(argv[0]),"..")); const RefVector references = reader.GetReferenceData(); //we will not call bgzip with full compression, good for piping into another program to //lessen the load on the CPU if(produceUnCompressedBAM) writer.SetCompressionMode(BamWriter::Uncompressed); if ( !writer.Open(bamFileOUT,header,references) ) { cerr << "Could not open output BAM file "<<bamFileOUT << endl; return 1; } SamHeader sh=reader.GetHeader(); //Up to the user to be sure that a sequence is followed by his mate // if(!sh.HasSortOrder() || // sh.SortOrder != "queryname"){ // cerr << "Bamfile must be sorted by queryname" << endl; // return 1; // } BamAlignment al; BamAlignment al2; bool al2Null=true; while ( reader.GetNextAlignment(al) ) { if(al.IsMapped() || al.HasTag("NM") || al.HasTag("MD") ){ if(!allowAligned){ cerr << "Reads should not be aligned" << endl; return 1; }else{ //should we remove tags ? 
} } if(al.IsPaired() && al2Null ){ al2=al; al2Null=false; continue; }else{ if(al.IsPaired() && !al2Null){ bool result = mtr.processPair(al,al2); if( result ){//was merged BamAlignment orig; BamAlignment orig2; if(keepOrig){ orig2 = al2; orig = al; } writer.SaveAlignment(al); if(keepOrig){ orig.SetIsDuplicate(true); orig2.SetIsDuplicate(true); writer.SaveAlignment(orig2); writer.SaveAlignment(orig); } //the second record is empty }else{ //keep the sequences as pairs writer.SaveAlignment(al2); writer.SaveAlignment(al); } // // SINGLE END // }else{ BamAlignment orig; if(keepOrig){ orig =al; } mtr.processSingle(al); if(keepOrig){ //write duplicate if(orig.QueryBases.length() != al.QueryBases.length()){ orig.SetIsDuplicate(true); writer.SaveAlignment(orig); } } writer.SaveAlignment(al); } //end single end al2Null=true; }//second pair } //while al reader.Close(); writer.Close(); } //else BAM cerr <<mtr.reportSingleLine()<<endl; if(printLog){ ofstream fileLog; fileLog.open(logFileName.c_str()); if (fileLog.is_open()){ fileLog <<mtr.reportMultipleLines() <<endl; }else{ cerr << "Unable to print to file "<<logFileName<<endl; } fileLog.close(); } return 0; }
/** * Goes through all the records in a file and generates a set of ReadEnds objects that * hold the necessary information (reference sequence, 5' read coordinate) to do * duplication, caching to disk as necssary to sort them. */ void MarkDuplicates::buildSortedReadEndLists() { ReadEndsMap tmp; long index = 0; SamHeader header = source->getHeader(); BamWriter writer; writer.SetCompressionMode(BamWriter::Uncompressed); writer.Open(getBufferFileName(), header, source->getReferences()); while (true) { BamAlignment * prec = getInputAlignment(); if(!prec) break; BamAlignment & rec = *prec; if (!rec.IsMapped() || rec.RefID == -1) { // When we hit the unmapped reads or reads with no coordinate, just write them. } else if (rec.IsPrimaryAlignment()){ ReadEnds * fragmentEnd = buildReadEnds(header, index, rec); fragSort.push_back(fragmentEnd); if (rec.IsPaired() && rec.IsMateMapped()) { string read_group; rec.GetTag("RG", read_group); string key = read_group + ":" + rec.Name; ReadEnds * pairedEnds = tmp.remove(rec.RefID, key); // See if we've already seen the first end or not if (pairedEnds == NULL) { pairedEnds = buildReadEnds(header, index, rec); tmp.put(pairedEnds->read1Sequence, key, pairedEnds); } else { int sequence = fragmentEnd->read1Sequence; int coordinate = fragmentEnd->read1Coordinate; // If the second read is actually later, just add the second read data, else flip the reads if (sequence > pairedEnds->read1Sequence || (sequence == pairedEnds->read1Sequence && coordinate >= pairedEnds->read1Coordinate)) { pairedEnds->read2Sequence = sequence; pairedEnds->read2Coordinate = coordinate; pairedEnds->read2IndexInFile = index; pairedEnds->orientation = getOrientationByte(pairedEnds->orientation == RE_R, rec.IsReverseStrand()); } else { pairedEnds->read2Sequence = pairedEnds->read1Sequence; pairedEnds->read2Coordinate = pairedEnds->read1Coordinate; pairedEnds->read2IndexInFile = pairedEnds->read1IndexInFile; pairedEnds->read1Sequence = sequence; 
pairedEnds->read1Coordinate = coordinate; pairedEnds->read1IndexInFile = index; pairedEnds->orientation = getOrientationByte(rec.IsReverseStrand(), pairedEnds->orientation == RE_R); } pairedEnds->score += getScore(rec); pairSort.push_back(pairedEnds); } } } // Print out some stats every 1m reads if (verbose && ++index % 100000 == 0) { cerr << "\rRead " << index << " records. Tracking " << tmp.size() << " as yet unmatched pairs. Last sequence index: " << rec.Position << std::flush; } writer.SaveAlignment(rec); delete prec; } writer.Close(); if(verbose) cerr << "Read " << index << " records. " << tmp.size() << " pairs never matched." << endl << "Sorting pairs..." << flush; if(nothreads) sort(pairSort.begin(), pairSort.end(), compareReadEnds()); else ogeSortMt(pairSort.begin(), pairSort.end(), compareReadEnds()); if(verbose) cerr << "fragments..." << flush; if(nothreads) sort(fragSort.begin(), fragSort.end(), compareReadEnds()); else ogeSortMt(fragSort.begin(), fragSort.end(), compareReadEnds()); cerr << "done." << endl; vector<ReadEnds *>contents = tmp.allReadEnds(); // delete unmatched read ends for(vector<ReadEnds *>::const_iterator i = contents.begin(); i != contents.end(); i++) delete *i; }
int main (int argc, char *argv[]) { int minBaseQuality = 0; string usage=string(""+string(argv[0])+" [in BAM file] [in VCF file] [chr name] [deam out BAM] [not deam out BAM]"+ "\nThis program divides aligned single end reads into potentially deaminated\n"+ "\nreads and the puts the rest into another bam file if the deaminated positions are not called as the alternative base in the VCF.\n"+ "\nTip: if you do not need one of them, use /dev/null as your output\n"+ "arguments:\n"+ "\t"+"--bq [base qual] : Minimum base quality to flag a deaminated site (Default: "+stringify(minBaseQuality)+")\n"+ "\n"); if(argc == 1 || argc < 4 || (argc == 2 && (string(argv[0]) == "-h" || string(argv[0]) == "--help") ) ){ cerr << "Usage "<<usage<<endl; return 1; } for(int i=1;i<(argc-2);i++){ if(string(argv[i]) == "--bq"){ minBaseQuality=destringify<int>(argv[i+1]); i++; continue; } } string bamfiletopen = string( argv[ argc-5 ] ); string vcffiletopen = string( argv[ argc-4 ] ); string chrname = string( argv[ argc-3 ] ); string deambam = string( argv[ argc-2 ] ); string nondeambam = string( argv[ argc-1 ] ); //dummy reader, will need to reposition anyway VCFreader vcfr (vcffiletopen, vcffiletopen+".tbi", chrname, 1, 1, 0); BamReader reader; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM file"<< bamfiletopen << endl; return 1; } // if ( !reader.LocateIndex() ) { // cerr << "The index for the BAM file cannot be located" << endl; // return 1; // } // if ( !reader.HasIndex() ) { // cerr << "The BAM file has not been indexed." 
<< endl; // return 1; // } //positioning the bam file int refid=reader.GetReferenceID(chrname); if(refid < 0){ cerr << "Cannot retrieve the reference ID for "<< chrname << endl; return 1; } //cout<<"redif "<<refid<<endl; //setting the BAM reader at that position reader.SetRegion(refid, 0, refid, -1); vector<RefData> testRefData=reader.GetReferenceData(); const SamHeader header = reader.GetHeader(); const RefVector references = reader.GetReferenceData(); BamWriter writerDeam; if ( !writerDeam.Open(deambam, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } BamWriter writerNoDeam; if ( !writerNoDeam.Open(nondeambam, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } unsigned int totalReads =0; unsigned int deaminatedReads =0; unsigned int ndeaminatedReads =0; unsigned int skipped =0; //iterating over the alignments for these regions BamAlignment al; int i; while ( reader.GetNextAlignment(al) ) { // cerr<<al.Name<<endl; //skip unmapped if(!al.IsMapped()){ skipped++; continue; } //skip paired end ! 
if(al.IsPaired() ){ continue; // cerr<<"Paired end not yet coded"<<endl; // return 1; } string reconstructedReference = reconstructRef(&al); char refeBase; char readBase; bool isDeaminated; if(al.Qualities.size() != reconstructedReference.size()){ cerr<<"Quality line is not the same size as the reconstructed reference"<<endl; return 1; } isDeaminated=false; if(al.IsReverseStrand()){ //first base next to 3' i = 0 ; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); vcfr.repositionIterator(chrname,al.Position+1,al.Position+1); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); // cout<<*toprint<<endl; //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<numberOfDeletions(&al)<<endl; cerr<<"Problem1 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } //if the VCF has a at least one G but no A if( toprint->hasAtLeastOneG() && !toprint->hasAtLeastOneA() ){ isDeaminated=true; } } } //second base next to 3' i = 1; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'G' && if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); vcfr.repositionIterator(chrname,al.Position+2,al.Position+2); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); // cout<<*toprint<<endl; //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<numberOfDeletions(&al)<<endl; 
cerr<<"Problem2 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } //if the VCF has at least one G but no A // if(toprint->hasAtLeastOneG() && // toprint->getAlt().find("A") == string::npos){ if( toprint->hasAtLeastOneG() && !toprint->hasAtLeastOneA() ){ isDeaminated=true; } } } //last base next to 5' i = (al.QueryBases.length()-1) ; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'G' && if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,0); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); vcfr.repositionIterator(chrname,positionJump,positionJump); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<lengthMatches<<endl; cerr<<numberOfDeletions(&al)<<endl; cerr<<positionJump<<endl; cerr<<"Problem3 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } //if the VCF has at least one G but no A if( toprint->hasAtLeastOneG() && !toprint->hasAtLeastOneA() ){ isDeaminated=true; } } } }else{ //first base next to 5' i = 0; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'C' if( readBase == 'T' && int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); vcfr.repositionIterator(chrname,al.Position+1,al.Position+1); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); //cout<<*toprint<<endl; //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; 
if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<numberOfDeletions(&al)<<endl; cerr<<"Problem4 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } //if the VCF has at least one C but no T if( toprint->hasAtLeastOneC() && !toprint->hasAtLeastOneT() ){ isDeaminated=true; } } //cout<<al.Position+ } //second last base next to 3' i = (al.QueryBases.length()-2); refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'C' && if( readBase == 'T' && int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,1); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); vcfr.repositionIterator(chrname,positionJump,positionJump); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<lengthMatches<<endl; cerr<<numberOfDeletions(&al)<<endl; cerr<<positionJump<<endl; cerr<<"Problem5 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } if( toprint->hasAtLeastOneC() && !toprint->hasAtLeastOneT() ){ isDeaminated=true; } } } //last base next to 3' i = (al.QueryBases.length()-1); refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //&& refeBase == 'C' if( readBase == 'T' && int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,0); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); 
vcfr.repositionIterator(chrname,positionJump,positionJump); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<lengthMatches<<endl; cerr<<numberOfDeletions(&al)<<endl; cerr<<positionJump<<endl; cerr<<"Problem6 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } if( toprint->hasAtLeastOneC() && !toprint->hasAtLeastOneT() ){ isDeaminated=true; } } } } totalReads++; if(isDeaminated){ deaminatedReads++; writerDeam.SaveAlignment(al); }else{ ndeaminatedReads++; writerNoDeam.SaveAlignment(al); } }//end for each read reader.Close(); writerDeam.Close(); writerNoDeam.Close(); cerr<<"Program finished sucessfully, out of "<<totalReads<<" mapped reads (skipped: "<<skipped<<" reads) we flagged "<<deaminatedReads<<" as deaminated and "<<ndeaminatedReads<<" as not deaminated"<<endl; return 0; }
int main (int argc, char** argv) { // Print Commandline string ss(argv[0]); // convert Char to String string commandline = "##Print Command line " + ss; int c; FastaReference* reference = NULL; int minbaseQ = 10; //default int windowlen = 40; //by default string regionstr; string RegionFile; string bamfile; bool STdin = false; bool has_region = false; bool has_regionFile = false; bool has_bamfile = false; bool has_ref = false; int ploidy = 2; bool SetLowComplexityRegionSWGapExt = false; bool SetLowComplexityRegion = false; if (argc < 2) { printSummary(argv); exit(1); } while (true) { static struct option long_options[] = { {"help", no_argument, 0, 'h'}, {"ploidy", required_argument, 0, 'p'}, {"window-size", required_argument, 0, 'w'}, {"reference", required_argument, 0, 'f'}, {"min-base-quality", required_argument, 0,'q'}, {"Region", required_argument, 0, 'R'}, {"STdin", no_argument, 0, 's'}, {"bam", required_argument, 0, 'b'}, {"Repeat-Extgap", no_argument, 0, 'E'}, {"LowCompex", no_argument, 0, 'l'}, {0, 0, 0, 0} }; int option_index = 0; c = getopt_long (argc, argv, "hslEf:q:w:s:r:R:p:b:", long_options, &option_index); /* Detect the end of the options. 
*/ if (c == -1) break; switch (c) { case 'f': reference = new FastaReference(optarg); // will exit on open failure commandline = commandline + " -f " + optarg; has_ref = true; break; case 'b': has_bamfile = true; bamfile = optarg; commandline = commandline + " -b " + optarg; break; case 'r': regionstr = optarg; has_region = true; commandline = commandline + " -r " + optarg; break; case 'R': RegionFile = optarg; has_regionFile = true; commandline = commandline + " -R " + optarg; break; case 's': STdin = true; commandline = commandline + " -s "; break; case 'q': minbaseQ = atoi(optarg); commandline = commandline + " -q " + optarg; break; case 'w': windowlen = atoi(optarg); commandline = commandline + " -w " + optarg; break; case 'p': ploidy = atoi(optarg); commandline = commandline + " -p " + optarg; break; case 'E': SetLowComplexityRegionSWGapExt = true; commandline = commandline + " -E "; break; case 'l': SetLowComplexityRegion = true; commandline = commandline + " -l "; break; case 'h': printSummary(argv); commandline = commandline + " -h "; exit(0); break; case '?': printSummary(argv); exit(1); break; default: abort(); break; } } //// Open Error log files ofstream cerrlog("bonsaiReport.txt"); streambuf *cerrsave = std::cerr.rdbuf(); // Redirect stream buffers if (cerrlog.is_open()) cerr.rdbuf(cerrlog.rdbuf()); cerr << commandline << endl; //Check for Reference Fasta sequence if (!has_ref) { cerr << "no FASTA reference provided, cannot realign" << endl; exit(1); } ////Check for reader BamReader reader; if (STdin == true) { if (!reader.Open("stdin")) { cerr << "could not open stdin bam for reading" << endl; cerr << reader.GetErrorString() << endl; reader.Close(); printSummary(argv); } } else { if (has_bamfile == true) { if (!reader.Open(bamfile)) { cerr << "ERROR: could not open bam files from stdin ... 
Aborting" << endl; cerr << reader.GetErrorString() << endl; reader.Close(); printSummary(argv); } if ( !reader.LocateIndex() ) reader.CreateIndex(); } else { cerr << "--bam flag is set but no bamfile is provided... Aborting" << endl; reader.Close(); printSummary(argv); } } //// Check Region Tags if ( (has_regionFile == true) && (has_region == true) ) { cerr << "ERROR: You provide both region and has provide a Set Region List... Aborting" << endl; exit(1); } //// store the names of all the reference sequences in the BAM file vector<RefData> referencedata = reader.GetReferenceData(); //// Store Region LIST vector<BamRegion> regionlist; if (has_region == true) { BamRegion region; ParseRegionString(regionstr, reader, region); regionlist.push_back(region); } else if (has_regionFile == true) { ifstream RG(RegionFile.c_str(), ios_base::in); string line; while(getline(RG,line)) { BamRegion region; ParseRegionString(line, reader, region); regionlist.push_back(region); } RG.close(); } else if ( (has_regionFile == false) && (has_region == false) ) { for (int i= 0; i < (int)referencedata.size(); i++) { string regionstr = referencedata.at(i).RefName; BamRegion region; ParseRegionString(regionstr, reader, region); if (!reader.SetRegion(region)) // Bam region will get [0,101) = 0 to 100 => [closed, half-opened) { cerr << "ERROR: set region " << regionstr << " failed. Check that REGION describes a valid range... 
Aborting" << endl; reader.Close(); exit(1); } else regionlist.push_back(region); } } //// BamWriter writer; if (!writer.Open("stdout", reader.GetHeaderText(), reader.GetReferenceData())) { cerr << "could not open stdout for writing" << endl; exit(1); } //// Smallest start position and Largest end position for Req Seq vector<RefData>::iterator refdataIter = referencedata.begin(); vector<BamRegion>::iterator regionListIter = regionlist.begin(); // CLASS RealignFunctionsClass RealignFunction; map<int, string> RefIDRedName; vector<SalRealignInfo> AlGroups; multimap<int, BamAlignment> SortRealignedAlignmentsMultimap; int refid = 0; BamAlignment alignment; bool IsNextAlignment = reader.GetNextAlignment(alignment); //cerr << " " << alignment.Name << " Chr " << alignment.RefID << " Startpos: " << alignment.Position << " Endpos: " << alignment.GetEndPosition() << " Length: " << alignment.Length << endl; int windowrealigned = 0; int TotalWindowDetected = 0; int TotalReadsAligned = 0; int TotalWindow = 0; int TotalReads = 0; while (refdataIter != referencedata.end() ) { string refname = refdataIter->RefName; RefIDRedName[refid] = refname; int reflength = refdataIter->RefLength; int winstartpos, winendpos; int AllowableBasesInWindow = 1; bool nextChrName = false; cerr << "##HeaderINFO: RefID = " << refdataIter->RefName << "\t" << "RefLen = " << reflength << endl; while (nextChrName == false ) { vector<int> minmaxRefSeqPos; bool IsPassDetectorNoRealignment = false; minmaxRefSeqPos.push_back(-1); minmaxRefSeqPos.push_back(0); //cerr << " region: " << (*regionListIter).LeftRefID << " : " << (*regionListIter).LeftPosition << " .. 
" << (*regionListIter).RightPosition << endl; if ((refid == (int)referencedata.size() - 1) && ((*regionListIter).LeftRefID == refid) && ((has_region==true) || (has_regionFile==true)) ) { //// if ( (has_region == true) || (has_regionFile == true) ) { winstartpos = (*regionListIter).LeftPosition; winendpos = winstartpos + windowlen - 1; reflength = (*regionListIter).RightPosition; if (reflength < winendpos) reflength = winendpos; // Get Next Alignment First if ( (refid == alignment.RefID) && (winstartpos == (*regionListIter).LeftPosition) && (IsNextAlignment == false) ) IsNextAlignment = reader.GetNextAlignment(alignment); } else if (has_region == false) { winstartpos = 0; winendpos = winstartpos + windowlen - 1; // Get Next Alignment First if ( (refid == alignment.RefID) && (winstartpos == 0) && (IsNextAlignment == false) ) IsNextAlignment = reader.GetNextAlignment(alignment); } //cerr << " winstart: " << winstartpos << " ; winend: " << winendpos; //cerr << " " << alignment.Name << " Chr " << alignment.RefID << " Startpos: " << alignment.Position << " Endpos: " << alignment.GetEndPosition() << " Length: " << alignment.Length << endl; //// while ((winstartpos < reflength)) { //// Check window end position if (winendpos > reflength) winendpos = reflength; // Reinitialized unsigned int NewReadMappedcount = 0; //// Save and Erase alignments that are outside of window (Deque?) 
if (!AlGroups.empty()) { minmaxRefSeqPos.at(0) = -1; minmaxRefSeqPos.at(1) = 0; //cerr << "#Start: Keep alignments with start position exceed the right end of the window/Region " << endl; vector<SalRealignInfo>::iterator Iter = AlGroups.begin(); while (Iter != AlGroups.end()) { // Erase alignment s if ((*Iter).al.GetEndPosition() < winstartpos) { //cerr << " ToWrite: " << (*Iter).second.size() << " ; " << (*Iter).al.Name << " ; " << (*Iter).al.Position << " < " << winstartpos << " : " << (*Iter).al.GetEndPosition() << endl; SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > ((*Iter).al.Position, (*Iter).al)); AlGroups.erase(Iter); //cerr << " ToWrite: DONE " << endl; } else { string referenceSequence = reference->getSubSequence(RefIDRedName[(*Iter).al.RefID], (*Iter).al.Position, 2*(*Iter).al.Length); if ((*Iter).HasRealign == true ) { (*Iter).currentReadPosition = 0; (*Iter).currentGenomeSeqPosition = 0; (*Iter).currentAlPosition = (*Iter).al.Position; (*Iter).cigarindex = 0; } (*Iter).CigarSoftclippingLength = 0; SalRealignInfo talr = (*Iter); //cerr << " ToKEEP: " << (*Iter).al.Name << " ; " << (*Iter).al.Position << " < " << winstartpos << " : " << (*Iter).al.GetEndPosition() << endl; RealignFunction.ParseAlignmentsAndExtractVariantsByBaseQualv(AlGroups, talr, Iter, (*Iter).al, referenceSequence, minmaxRefSeqPos, winstartpos, winendpos, (float) minbaseQ, false); ++Iter; //Increment iterator } } } // Write Sorted Alignments that are outside of window //cerr << "SortRealignedAlignmentsMultimap: " << SortRealignedAlignmentsMultimap.size() << " minmaxRefSeqPos.at(0)= " << minmaxRefSeqPos.at(0) << endl; if (!SortRealignedAlignmentsMultimap.empty()) // && (winWrite < winstartpos ) ) { //cerr << "#Start: Write alignments and delete alignments with start position exceed the right end of the window/Region " << endl; multimap<int, BamAlignment>::iterator sraIter = SortRealignedAlignmentsMultimap.begin(); while (sraIter != 
SortRealignedAlignmentsMultimap.end()) { //cerr << " (*sraIter).first= " << (*sraIter).first << " minmaxRefSeqPos.at(0)= " << minmaxRefSeqPos.at(0) << " winstartpos - ((windowlen - 1)*0.9)= " << winstartpos - ((windowlen - 1)*0.9) << endl; if (((float) (*sraIter).first < floor((float) (winstartpos - ((windowlen - 1)*0.9)))) && ((minmaxRefSeqPos.at(0) > 0) && ((*sraIter).first < minmaxRefSeqPos.at(0)))) { //writer.SaveAlignment((*sraIter).second); // Why sometimes, it doesn't work ????? if (!writer.SaveAlignment((*sraIter).second)) cerr << writer.GetErrorString() << endl; SortRealignedAlignmentsMultimap.erase(sraIter++); } else { ++sraIter; } } //cerr << "#Done: Write alignments and delete alignments with start position exceed the right end of the window/Region " << endl; } //cerr << " winstart: " << winstartpos << " ; winend: " << winendpos; //cerr << " " << alignment.Name << " Chr " << alignment.RefID << " Startpos: " << alignment.Position << " Endpos: " << alignment.GetEndPosition() << " Length: " << alignment.Length << endl; //cerr << ": " << alignment.RefID << " :" << RefIDRedName[alignment.RefID] << " : " << RefIDRedName[alignment.RefID] << endl; //cerr << "Start: Gather aligmenets that lie (fully or partially) within the window frame and group INDELs if there are ... 
" << endl; // Gather Reads within a window frame while ((IsNextAlignment) && (refid == alignment.RefID)) // Neeed more conditions { if (SetLowComplexityRegion == true) { string sequenceInWindow = reference->getSubSequence(RefIDRedName[alignment.RefID], winstartpos, (winendpos-winstartpos+1) ); if (IsWindowInRepeatRegion(sequenceInWindow) == true) { if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) == 0) { TotalReads++; if (alignment.IsMapped()) { string referenceSequence = reference->getSubSequence(RefIDRedName[alignment.RefID], alignment.Position, 2*alignment.Length); vector<SalRealignInfo>::iterator tIter; SalRealignInfo alr; alr.al = alignment; alr.currentReadPosition = 0; alr.currentGenomeSeqPosition = 0; alr.currentAlPosition = alignment.Position; alr.cigarindex = 0; alr.HasRealign = false; alr.CigarSoftclippingLength = 0; string str = "ZZZZZZZZZZZZZZZZZ"; if (alignment.Name.find(str) != string::npos) { stringstream cigar; for (vector<CigarOp>::const_iterator cigarIter = alignment.CigarData.begin(); cigarIter != alignment.CigarData.end(); ++cigarIter) cigar << cigarIter->Length << cigarIter->Type; string cigarstr = cigar.str(); cerr << " TRACKING: " << alignment.RefID << " " << alignment.Name << " pos: " << alignment.Position << " cigar: " << cigarstr << endl; } RealignFunction.ParseAlignmentsAndExtractVariantsByBaseQualv(AlGroups, alr, tIter, alignment, referenceSequence, minmaxRefSeqPos, winstartpos, winendpos, (float) minbaseQ, true); NewReadMappedcount++; } else { SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment)); cerr << "UNmapped : " << alignment.Name << endl; } } else if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) == 1) { SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment)); } else break; } else { if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) < 2) 
SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment)); else break; } } else // (SetLowComplexityRegion == false) { if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) == 0) { TotalReads++; if (alignment.IsMapped()) { string referenceSequence = reference->getSubSequence(RefIDRedName[alignment.RefID], alignment.Position, 2 * alignment.Length); vector<SalRealignInfo>::iterator tIter; SalRealignInfo alr; alr.al = alignment; alr.currentReadPosition = 0; alr.currentGenomeSeqPosition = 0; alr.currentAlPosition = alignment.Position; alr.cigarindex = 0; alr.HasRealign = false; alr.CigarSoftclippingLength = 0; string str = "ZZZZZZZZZZZZZZZZZ"; if (alignment.Name.find(str) != string::npos) { stringstream cigar; for (vector<CigarOp>::const_iterator cigarIter = alignment.CigarData.begin(); cigarIter != alignment.CigarData.end(); ++cigarIter) cigar << cigarIter->Length << cigarIter->Type; string cigarstr = cigar.str(); cerr << " TRACKING: " << alignment.RefID << " " << alignment.Name << " pos: " << alignment.Position << " cigar: " << cigarstr << endl; } RealignFunction.ParseAlignmentsAndExtractVariantsByBaseQualv(AlGroups, alr, tIter, alignment, referenceSequence, minmaxRefSeqPos, winstartpos, winendpos, (float) minbaseQ, true); //cerr << " winstart: " << winstartpos << " ; winend: " << winendpos; //cerr << " INDEL: " << alignment.RefID << " " << alignment.Name << " pos: " << alignment.Position << " Length: " << alignment.Length << " CIGARstr: " << cigarstr << endl; NewReadMappedcount++; } else { SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment)); cerr << "UNmapped : " << alignment.Name << endl; } } else if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) == 1) { SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment)); } else break; } ////Get next alignment IsNextAlignment = 
reader.GetNextAlignment(alignment); } //cerr << "Done: Gather aligmenets that lie (fully or partially) within the window frame and group INDELs if there are ... " << endl; //// Detector Corner bool ToRealign = MeetIndelDetectorThresholdv(AlGroups); cerr << "MeetIndelDetectorThresholdv(AlGroups).size()= " << AlGroups.size() << endl; // ************** if (ToRealign) { //cerr << " ToRealign: " << refdataIter->RefName << "\t" << reflength << "\t" << winstartpos << "\t" << winendpos << "\t" << AlGroups.size() << endl; //cerr << " minmaxRefSeqPos.at(1)= " << minmaxRefSeqPos.at(1) << " minmaxRefSeqPos.at(0)= " << minmaxRefSeqPos.at(0) << endl; ////// Perform Realign routines int TotalAlR = 0; // Total number of alignments to be realigned int NumAlR = 0; // Now many alignments are aligned TotalWindowDetected++; cerr << "#Start: Meet Threshold, Realigning ... " << endl; if (minmaxRefSeqPos.at(1) < winendpos) minmaxRefSeqPos.at(1) = winendpos; if (minmaxRefSeqPos.at(0) > winstartpos) minmaxRefSeqPos.at(0) = winstartpos; bool IsToRealign = RealignFunction.PruningByNaiveSelectionProcedureAndConstructHaplotypes2(winstartpos, winendpos, refid, refname, minmaxRefSeqPos, reference); if (IsToRealign == true) { RealignFunction.SelectHaplotypeCandidates_SmithWatermanBSv(AlGroups, minmaxRefSeqPos, SetLowComplexityRegionSWGapExt); minmaxRefSeqPos.at(0) = -1; minmaxRefSeqPos.at(1) = 0; int nextwinstartpos = winendpos + 1; int nextwinendpos = winstartpos + windowlen - 1; if (nextwinendpos > reflength) nextwinendpos = reflength; //cerr << " Before Realign : " << SortRealignedAlignmentsMultimap.size() << endl; RealignFunction.AdjustCigarsWRTChosenMultipleHaplotypesAndPrepareAlignmentsTobeWrittenOut(AlGroups, SortRealignedAlignmentsMultimap, reference, RefIDRedName, minmaxRefSeqPos, nextwinstartpos, nextwinendpos, minbaseQ, TotalAlR, NumAlR, ploidy); IsPassDetectorNoRealignment = false; // Set flag to false to deactivate write functions //cerr << " After Realign : " << 
SortRealignedAlignmentsMultimap.size() << endl; TotalReadsAligned += NumAlR; if (NumAlR > 0) // Realignment done windowrealigned++; } else cerr << "#Done: Meet Threshold, Realigning ... " << endl; } if (NewReadMappedcount > 0) TotalWindow++; RealignFunction.Clear(); //// Move the window frame winstartpos = winendpos + 1; winendpos = winstartpos + windowlen - 1; } //// Save and Erase remaining alignments that are outside of window (Deque?) if ((!AlGroups.empty())) { cerr << "#Start: Write Remaining alignments and delete all alignments" << endl; for (vector<SalRealignInfo>::iterator Iter = AlGroups.begin(); Iter != AlGroups.end(); ++Iter) { //cerr << " Remain alignment start: " << (*Iter).al.Name << " " << Iter->al.Position << " < " << winstartpos << " " << winendpos << endl; SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > ((*Iter).al.Position, (*Iter).al)); } cerr << "#Done: Write Remaining alignments and delete all alignments" << endl; } AlGroups.clear(); // Write Sorted remaining Alignments that are outside of window if (!SortRealignedAlignmentsMultimap.empty()) { for (multimap<int, BamAlignment>::iterator sraIter = SortRealignedAlignmentsMultimap.begin(); sraIter != SortRealignedAlignmentsMultimap.end(); ++sraIter) { //writer.SaveAlignment((*sraIter).second); if (!writer.SaveAlignment((*sraIter).second)) cerr << writer.GetErrorString() << endl; } SortRealignedAlignmentsMultimap.clear(); } } ++regionListIter; if ((*regionListIter).LeftRefID > refid) nextChrName = true; } //// If End of the chromosome position //// increament iterator ++refdataIter; ++refid; } reader.Close(); writer.Close(); cerr << "##-Completed- " << endl; cerr << " Total Reads processed = " << TotalReads << endl; cerr << " Total Reads Aligned = " << TotalReadsAligned << endl; cerr << " Total Window processed = " << TotalWindow << endl; cerr << " Total Window Detected = " << TotalWindowDetected << endl; cerr << " Total Windows Aligned = " << windowrealigned << endl; // Restore 
cerr's stream buffer before terminating if (cerrlog.is_open()) cerr.rdbuf(cerrsave); commandline.clear(); return 0; }
int main (int argc, char *argv[]) { bool mapped =false; bool unmapped=false; const string usage=string(string(argv[0])+" [options] input.bam out.bam"+"\n\n"+ "This program takes a BAM file as input and produces\n"+ "another where the putative deaminated bases have\n"+ "have been cut\n"+ "\n"+ "Options:\n"); // "\t"+"-u , --unmapped" +"\n\t\t"+"For an unmapped bam file"+"\n"+ // "\t"+"-m , --mapped" +"\n\t\t"+"For an mapped bam file"+"\n"); if( (argc== 1) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cout<<"Usage:"<<endl; cout<<usage<<endl; cout<<""<<endl; return 1; } // for(int i=1;i<(argc-1);i++){ //all but the last arg // if(strcmp(argv[i],"-m") == 0 || strcmp(argv[i],"--mapped") == 0 ){ // mapped=true; // continue; // } // if(strcmp(argv[i],"-u") == 0 || strcmp(argv[i],"--unmapped") == 0 ){ // unmapped=true; // continue; // } // cerr<<"Unknown option "<<argv[i] <<" exiting"<<endl; // return 1; // } if(argc != 3){ cerr<<"Error: Must specify the input and output BAM files"; return 1; } string inbamFile =argv[argc-2]; string outbamFile=argv[argc-1]; // if(!mapped && !unmapped){ // cerr << "Please specify whether you reads are mapped or unmapped" << endl; // return 1; // } // if(mapped && unmapped){ // cerr << "Please specify either mapped or unmapped but not both" << endl; // return 1; // } BamReader reader; if ( !reader.Open(inbamFile) ) { cerr << "Could not open input BAM files." 
<< endl; return 1; } vector<RefData> testRefData=reader.GetReferenceData(); const SamHeader header = reader.GetHeader(); const RefVector references = reader.GetReferenceData(); BamWriter writer; if ( !writer.Open(outbamFile, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } BamAlignment al; // BamAlignment al2; // bool al2Null=true; while ( reader.GetNextAlignment(al) ) { if(al.IsPaired() ){ if(al.IsFirstMate() ){ //5' end, need to check first base only if(al.IsReverseStrand()){ // if(!al.IsMapped()){ cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl; //return 1; } int indexToCheck; //last indexToCheck=al.QueryBases.length()-1; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); } }else{ int indexToCheck; //first base indexToCheck=0; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); } } }else{ //3' end, need to check last two bases only if( al.IsSecondMate() ){ if(al.IsReverseStrand()){ // if(!al.IsMapped()){ cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl; //return 1; } int indexToCheck; //second to last indexToCheck=al.QueryBases.length()-2; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); }else{ //last indexToCheck=al.QueryBases.length()-1; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); } } }else{ int indexToCheck; //second 
base indexToCheck=1; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); }else{ //first base indexToCheck=0; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); } } } }else{ cerr << "Wrong state" << endl; return 1; } } }//end of paired end else{//we consider single reads to have been sequenced from 5' to 3' if(al.IsReverseStrand()){ //need to consider if(!al.IsMapped()){ cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl; //return 1; } int indexToCheck; //second base indexToCheck=1; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"51 "<<al.QueryBases<<endl; // cout<<"51 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); // cout<<"52 "<<al.QueryBases<<endl; // cout<<"52 "<<al.Qualities<<endl; }else{ //first base indexToCheck=0; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"61 "<<al.QueryBases<<endl; // cout<<"61 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); // cout<<"62 "<<al.QueryBases<<endl; // cout<<"62 "<<al.Qualities<<endl; } } //last indexToCheck=al.QueryBases.length()-1; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"21 "<<al.QueryBases<<endl; // cout<<"21 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); // cout<<"22 "<<al.QueryBases<<endl; // cout<<"22 
"<<al.Qualities<<endl; } }else{ int indexToCheck; //first base indexToCheck=0; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"11 "<<al.QueryBases<<endl; // cout<<"11 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); // cout<<"12 "<<al.QueryBases<<endl; // cout<<"12 "<<al.Qualities<<endl; } //second to last indexToCheck=al.QueryBases.length()-2; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"31 "<<al.QueryBases<<endl; // cout<<"31 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); // cout<<"32 "<<al.QueryBases<<endl; // cout<<"32 "<<al.Qualities<<endl; }else{ //last indexToCheck=al.QueryBases.length()-1; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"41 "<<al.QueryBases<<endl; // cout<<"41 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); // cout<<"42 "<<al.QueryBases<<endl; // cout<<"42 "<<al.Qualities<<endl; } } } }//end of single end writer.SaveAlignment(al); }// while ( reader.GetNextAlignment(al) ) { reader.Close(); writer.Close(); return 0; }
int main (int argc, char *argv[]) { int minBaseQuality = 0; string usage=string(""+string(argv[0])+" [in BAM file] [in VCF file] [chr name] [deam out BAM] [not deam out BAM]"+ "\nThis program divides aligned single end reads into potentially deaminated\n"+ "\nreads and the puts the rest into another bam file if the deaminated positions are not called as the alternative base in the VCF.\n"+ "\nThis is like filterDeaminatedVCF but it loads the VCF before then labels the reads instead of doing it on the fly\n"+ "\nwhich is good if you have many reads in the bam file.\n"+ "\nTip: if you do not need one of them, use /dev/null as your output\n"+ "\narguments:\n"+ "\t"+"--bq [base qual] : Minimum base quality to flag a deaminated site (Default: "+stringify(minBaseQuality)+")\n"+ "\t"+"--1000g [vcf file] : VCF file from 1000g to get the putative A and T positions in modern humans (Default: "+vcf1000g+")\n"+ "\n"); if(argc == 1 || argc < 4 || (argc == 2 && (string(argv[0]) == "-h" || string(argv[0]) == "--help") ) ){ cerr << "Usage "<<usage<<endl; return 1; } for(int i=1;i<(argc-2);i++){ if(string(argv[i]) == "--bq"){ minBaseQuality=destringify<int>(argv[i+1]); i++; continue; } if(string(argv[i]) == "--1000g"){ vcf1000g=string(argv[i+1]); i++; continue; } } unsigned int maxSizeChromosome=250000000;//larger than chr1 hg19 bool * hasCnoT; bool * hasGnoA; bool * thousandGenomesHasA; bool * thousandGenomesHasT; cerr<<"Trying to allocating memory"<<endl; try{ hasCnoT = new bool[ maxSizeChromosome ]; hasGnoA = new bool[ maxSizeChromosome ]; thousandGenomesHasA = new bool[ maxSizeChromosome ]; thousandGenomesHasT = new bool[ maxSizeChromosome ]; }catch(bad_alloc& exc){ cerr<<"ERROR: allocating memory failed"<<endl; return 1; } cerr<<"Success in allocating memory"<<endl; for(unsigned int i = 0;i<maxSizeChromosome;i++){ hasCnoT[i]=false; hasGnoA[i]=false; thousandGenomesHasA[i]=false; thousandGenomesHasT[i]=false; } string bamfiletopen = string( argv[ argc-5 ] ); string vcffiletopen 
= string( argv[ argc-4 ] ); string chrname = string( argv[ argc-3 ] ); string deambam = string( argv[ argc-2 ] ); string nondeambam = string( argv[ argc-1 ] ); cerr<<"Reading consensus VCF "<<vcffiletopen<<" ... "<<endl; VCFreader vcfr (vcffiletopen, // vcffiletopen+".tbi", // chrname, // 1, // maxSizeChromosome, 0); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); if(toprint->getRef().length() != 1 ) continue; //if the VCF has a at least one G but no A if( toprint->hasAtLeastOneG() && !toprint->hasAtLeastOneA() ){ hasGnoA[ toprint->getPosition() ] =true; } if( toprint->hasAtLeastOneC() && !toprint->hasAtLeastOneT() ){ hasCnoT[ toprint->getPosition() ] =true; } } cerr<<"done reading VCF"<<endl; cerr<<"Reading 1000g VCF :"<<vcf1000g<<" ..."<<endl; string line1000g; ifstream myFile1000g; myFile1000g.open(vcf1000g.c_str(), ios::in); if (myFile1000g.is_open()){ while ( getline (myFile1000g,line1000g)){ vector<string> fields=allTokens(line1000g,'\t'); //0 chr //1 pos //2 id //3 ref //4 alt //check if same chr if(fields[0] != chrname){ cerr <<"Error, wrong chromosome in 1000g file for line= "<<line1000g<<endl; return 1; } //skip indels if(fields[3].size() != 1 || fields[4].size() != 1 ) continue; char ref=toupper(fields[3][0]); char alt=toupper(fields[4][0]); unsigned int pos=destringify<unsigned int>( fields[1] ); thousandGenomesHasA[ pos ] = ( (ref=='A') || (alt=='A') ); thousandGenomesHasT[ pos ] = ( (ref=='T') || (alt=='T') ); } myFile1000g.close(); }else{ cerr <<"Unable to open file "<<vcf1000g<<endl; return 1; } cerr<<"done reading 1000g VCF"<<endl; BamReader reader; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM file"<< bamfiletopen << endl; return 1; } //positioning the bam file int refid=reader.GetReferenceID(chrname); if(refid < 0){ cerr << "Cannot retrieve the reference ID for "<< chrname << endl; return 1; } //cout<<"redif "<<refid<<endl; //setting the BAM reader at that position reader.SetRegion(refid, 0, refid, -1); 
vector<RefData> testRefData=reader.GetReferenceData(); const SamHeader header = reader.GetHeader(); const RefVector references = reader.GetReferenceData(); BamWriter writerDeam; if ( !writerDeam.Open(deambam, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } BamWriter writerNoDeam; if ( !writerNoDeam.Open(nondeambam, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } unsigned int totalReads =0; unsigned int deaminatedReads =0; unsigned int ndeaminatedReads =0; unsigned int skipped =0; //iterating over the alignments for these regions BamAlignment al; int i; while ( reader.GetNextAlignment(al) ) { // cerr<<al.Name<<endl; //skip unmapped if(!al.IsMapped()){ skipped++; continue; } //skip paired end ! if(al.IsPaired() ){ continue; // cerr<<"Paired end not yet coded"<<endl; // return 1; } string reconstructedReference = reconstructRef(&al); char refeBase; char readBase; bool isDeaminated; if(al.Qualities.size() != reconstructedReference.size()){ cerr<<"Quality line is not the same size as the reconstructed reference"<<endl; return 1; } isDeaminated=false; if(al.IsReverseStrand()){ //first base next to 3' i = 0 ; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } if( hasGnoA[al.Position+1] && !thousandGenomesHasA[al.Position+1] ) isDeaminated=true; // transformRef(&refeBase,&readBase); // vcfr.repositionIterator(chrname,al.Position+1,al.Position+1); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // // cout<<*toprint<<endl; // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<"Problem1 position "<<*toprint<<" does 
not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // //if the VCF has a at least one G but no A // if( toprint->hasAtLeastOneG() && // !toprint->hasAtLeastOneA() ){ // isDeaminated=true; // } // } } //second base next to 3' i = 1; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'G' && if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } if( hasGnoA[al.Position+2] && !thousandGenomesHasA[al.Position+2] ) isDeaminated=true; // transformRef(&refeBase,&readBase); // vcfr.repositionIterator(chrname,al.Position+2,al.Position+2); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // // cout<<*toprint<<endl; // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<"Problem2 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // //if the VCF has at least one G but no A // // if(toprint->hasAtLeastOneG() && // // toprint->getAlt().find("A") == string::npos){ // if( toprint->hasAtLeastOneG() && // !toprint->hasAtLeastOneA() ){ // isDeaminated=true; // } // } } //last base next to 5' i = (al.QueryBases.length()-1) ; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'G' && if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,0); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); if(hasGnoA[positionJump] && !thousandGenomesHasA[positionJump] ) isDeaminated=true; // 
vcfr.repositionIterator(chrname,positionJump,positionJump); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<lengthMatches<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<positionJump<<endl; // cerr<<"Problem3 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // //if the VCF has at least one G but no A // if( toprint->hasAtLeastOneG() && // !toprint->hasAtLeastOneA() ){ // isDeaminated=true; // } // } } }else{ //first base next to 5' i = 0; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'C' if( readBase == 'T' && int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } // transformRef(&refeBase,&readBase); if(hasCnoT[al.Position+1] && !thousandGenomesHasT[al.Position+1] ) isDeaminated=true; // vcfr.repositionIterator(chrname,al.Position+1,al.Position+1); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // //cout<<*toprint<<endl; // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<"Problem4 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // //if the VCF has at least one C but no T // if( toprint->hasAtLeastOneC() && // !toprint->hasAtLeastOneT() ){ // isDeaminated=true; // } // } //cout<<al.Position+ } //second last base next to 3' i = (al.QueryBases.length()-2); refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'C' && if( readBase == 'T' && 
int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } //transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,1); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); if(hasCnoT[positionJump] && !thousandGenomesHasT[positionJump] ) isDeaminated=true; // vcfr.repositionIterator(chrname,positionJump,positionJump); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<lengthMatches<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<positionJump<<endl; // cerr<<"Problem5 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // if( toprint->hasAtLeastOneC() && // !toprint->hasAtLeastOneT() ){ // isDeaminated=true; // } // } } //last base next to 3' i = (al.QueryBases.length()-1); refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //&& refeBase == 'C' if( readBase == 'T' && int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,0); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); if(hasCnoT[positionJump] && !thousandGenomesHasT[positionJump] ) isDeaminated=true; // vcfr.repositionIterator(chrname,positionJump,positionJump); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<lengthMatches<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<positionJump<<endl; // 
cerr<<"Problem6 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // if( toprint->hasAtLeastOneC() && // !toprint->hasAtLeastOneT() ){ // isDeaminated=true; // } // } } } totalReads++; if(isDeaminated){ deaminatedReads++; writerDeam.SaveAlignment(al); }else{ ndeaminatedReads++; writerNoDeam.SaveAlignment(al); } }//end for each read reader.Close(); writerDeam.Close(); writerNoDeam.Close(); delete(hasCnoT); delete(hasGnoA); cerr<<"Program finished sucessfully, out of "<<totalReads<<" mapped reads (skipped: "<<skipped<<" reads) we flagged "<<deaminatedReads<<" as deaminated and "<<ndeaminatedReads<<" as not deaminated"<<endl; return 0; }
// Intersect query-name-grouped BAM pairs against the "B" BED file.
// Reads the BAM two records at a time; pairs whose mates are adjacent are
// processed as BEDPE, stray "paired" records are warned about and skipped.
void BedIntersectPE::IntersectBamPE(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedFileIntoMap();

    // open the BAM file
    BamReader reader;
    BamWriter writer;
    reader.Open(bamFile);

    // get header & reference information
    string bamHeader = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // set compression mode
        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
        if ( _isUncompressedBam )
            compressionMode = BamWriter::Uncompressed;
        writer.SetCompressionMode(compressionMode);
        // open our BAM writer
        writer.Open("stdout", bamHeader, refs);
    }

    _bedA->bedType = 10;    // it's a full BEDPE given it's BAM

    // rip through the BAM file and convert each mapped entry to BEDPE
    BamAlignment bam1, bam2;
    while (reader.GetNextAlignment(bam1)) {

        // grab the supposed mate of bam1
        reader.GetNextAlignment(bam2);

        if (bam1.Name != bam2.Name) {
            // resynchronize: slide forward one record at a time until two
            // adjacent records share a query name (at EOF bam2 stops
            // changing, bam1 catches up, and the loop terminates)
            while (bam1.Name != bam2.Name) {
                if (bam1.IsPaired()) {
                    cerr << "*****WARNING: Query " << bam1.Name
                         << " is marked as paired, but it's mate does not occur"
                         << " next to it in your BAM file. Skipping. " << endl;
                }
                bam1 = bam2;
                reader.GetNextAlignment(bam2);
            }
        }
        // BUGFIX: the second operand tested bam1.IsPaired() twice;
        // both mates must be flagged as paired.
        else if (bam1.IsPaired() && bam2.IsPaired()) {
            ProcessBamBlock(bam1, bam2, refs, writer);
        }
    }
    // close up
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
int main(const int argc, char* const argv[]) { int c, min_mapQ=0, seed=chrono::system_clock::now().time_since_epoch().count(); unsigned int flag_on=0, flag_off=0; string fn_tgt, fn_in, fn_out="", out_format="b"; while ((c = getopt(argc, argv, "SbBcCt:h1Ho:q:f:F:ul:r:?T:R:L:s:@:m:x:U:")) >= 0) { switch (c) { case 's': seed = atoi(optarg); break; case 'm': break; case 'c': break; case 'S': break; case 'b': break; case 'C': break; case 'h': break; case 'H': break; case 'o': fn_out = optarg; break; case 'U': break; case 'f': flag_on |= strtol(optarg, 0, 0); break; case 'F': flag_off |= strtol(optarg, 0, 0); break; case 'q': min_mapQ = atoi(optarg); break; case 'u': out_format = "u"; break; case '1': break; case 'l': break; case 'r': break; case 't': fn_tgt = optarg; break; case 'R': break; case '?': return usage(); case 'T': break; case 'B': break; case '@': break; case 'x': break; default: return usage(); } } if (fn_tgt.compare("") == 0) return usage(); if (argc == optind) return usage(); fn_in = argv[optind]; BamReader reader; if (!reader.Open(fn_in)) { cerr << "ERROR: cannot open [" << fn_in << "] for reading\n"; return 1; } if (!reader.LocateIndex()) { cerr << "ERROR: cannot find BAM index for [" << fn_in << "]\n"; return 1; } const SamHeader header = reader.GetHeader(); if (header.SortOrder.compare("coordinate") != 0) { cerr << "ERROR: [" << fn_in << "] not sorted by coordinate\n"; return 1; } const RefVector refseq = reader.GetReferenceData(); vector<BamRegion> regions; vector<unsigned int> src_depths, tgt_depths; if (read_region_depth(fn_tgt.c_str(), reader, regions, src_depths, tgt_depths) != 0) return 1; BamWriter writer; if (!writer.Open(fn_out, header, refseq)) { cerr << "ERROR: cannot open [" << fn_out << "] for writing\n"; return 1; } BamAlignment aln; vector<BamAlignment> reads; vector<string> paired, unpaired; unordered_map<int, int> kept; unordered_map<string, unsigned int> seen, sampled; unordered_map<string, vector<int> > pool; for (size_t i=0; 
i<regions.size(); ++i) { reads.clear(); paired.clear(); unpaired.clear(); kept.clear(); pool.clear(); char region_string[256]; sprintf(region_string, "%s:%d-%d", refseq[regions[i].LeftRefID].RefName.c_str(), regions[i].LeftPosition, regions[i].RightPosition); if (!reader.SetRegion(regions[i])) { cerr << "WARNING: failed to locate [" << region_string << "]\n"; //cerr << "WARNING: failed to locate [" << refseq[regions[i].LeftRefID].RefName << ':' << regions[i].LeftPosition << '-' << regions[i].RightPosition << "]\n"; continue; } while (reader.GetNextAlignment(aln)) { if ((aln.AlignmentFlag & flag_on) == flag_on && !(aln.AlignmentFlag & flag_off) && aln.MapQuality >= min_mapQ) reads.push_back(aln); } if (reads.size() == 0) continue; unsigned int depth = 0; for (size_t k=0; k<reads.size(); ++k) { aln = reads[k]; string rn = aln.Name; if (seen.find(rn) != seen.end()) { // if seen in previous regions if (sampled.find(rn) != sampled.end()) { // if self or mate sampled before, sample it if (sampled[rn] != aln.AlignmentFlag) kept[k] = 1; // if mate sampled before, keep it depth += get_overlap(aln, regions[i]); } if (seen[rn] != aln.AlignmentFlag) seen[rn] = aln.AlignmentFlag; } else { // if not seen in previous regions pool[rn].push_back(k); } if (depth > tgt_depths[i]) break; } if (depth < tgt_depths[i]) { for (auto it=pool.begin(); it!=pool.end(); ++it) { if (it->second.size()>1) paired.push_back(it->first); else unpaired.push_back(it->first); } shuffle(paired.begin(), paired.end(), default_random_engine(seed)); shuffle(unpaired.begin(), unpaired.end(), default_random_engine(seed)); int n1=paired.size(), n2=unpaired.size(), k1, k2, k3; while (depth < tgt_depths[i] && n1+n2 > 0) { if (n1>0) { k1 = pool[paired[--n1]][0]; k2 = pool[paired[n1]][1]; depth += get_overlap(reads[k1], regions[i]); depth += get_overlap(reads[k2], regions[i]); kept[k1] = 1; kept[k2] = 1; continue; } if (n2>0) { k3 = pool[unpaired[--n2]][0]; depth += get_overlap(reads[k3], regions[i]); kept[k3] = 1; 
continue; } } } for (auto it=pool.begin(); it!=pool.end(); ++it) { string rn = it->first; seen[rn] = reads[pool[rn].back()].AlignmentFlag; } for (auto it=kept.begin(); it!=kept.end(); ++it) { int k = it->first; string rn = reads[k].Name; sampled[rn] = reads[k].AlignmentFlag; writer.SaveAlignment(reads[k]); } cerr << "INFO: target=[" << tgt_depths[i] << "], actual=[" << depth << "], N(reads)=[" << reads.size() << "], N(kept)=[" << kept.size() << "] at [" << region_string << "]\n"; } reader.Close(); return 0; }
void BedIntersectPE::ProcessBamBlock (const BamAlignment &bam1, const BamAlignment &bam2, const RefVector &refs, BamWriter &writer) { vector<BED> hits, hits1, hits2; // vector of potential hits hits.reserve(1000); // reserve some space hits1.reserve(1000); hits2.reserve(1000); bool overlapsFound; // flag to indicate if overlaps were found if ( (_searchType == "either") || (_searchType == "xor") || (_searchType == "both") || (_searchType == "notboth") || (_searchType == "neither") ) { // create a new BEDPE feature from the BAM alignments. BEDPE a; ConvertBamToBedPE(bam1, bam2, refs, a); if (_bamOutput == true) { // BAM output // write to BAM if correct hits found overlapsFound = FindOneOrMoreOverlaps(a, _searchType); if (overlapsFound == true) { writer.SaveAlignment(bam1); writer.SaveAlignment(bam2); } } else { // BEDPE output FindOverlaps(a, hits1, hits2, _searchType); hits1.clear(); hits2.clear(); } } else if ( (_searchType == "ispan") || (_searchType == "ospan") ) { // only look for ispan and ospan when both ends are mapped. if (bam1.IsMapped() && bam2.IsMapped()) { // only do an inspan or outspan check if the alignment is intrachromosomal if (bam1.RefID == bam2.RefID) { // create a new BEDPE feature from the BAM alignments. BEDPE a; ConvertBamToBedPE(bam1, bam2, refs, a); if (_bamOutput == true) { // BAM output // look for overlaps, and write to BAM if >=1 were found overlapsFound = FindOneOrMoreSpanningOverlaps(a, _searchType); if (overlapsFound == true) { writer.SaveAlignment(bam1); writer.SaveAlignment(bam2); } } else { // BEDPE output FindSpanningOverlaps(a, hits, _searchType); hits.clear(); } } } } else if ( (_searchType == "notispan") || (_searchType == "notospan") ) { // only look for notispan and notospan when both ends are mapped. if (bam1.IsMapped() && bam2.IsMapped()) { // only do an inspan or outspan check if the alignment is intrachromosomal if (bam1.RefID == bam2.RefID) { // create a new BEDPE feature from the BAM alignments. 
BEDPE a; ConvertBamToBedPE(bam1, bam2, refs, a); if (_bamOutput == true) { // BAM output // write to BAM if there were no overlaps overlapsFound = FindOneOrMoreSpanningOverlaps(a, _searchType); if (overlapsFound == false) { writer.SaveAlignment(bam1); writer.SaveAlignment(bam2); } } else { // BEDPE output FindSpanningOverlaps(a, hits, _searchType); hits.clear(); } } // if inter-chromosomal or orphaned, we know it's not ispan and not ospan else if (_bamOutput == true) { writer.SaveAlignment(bam1); writer.SaveAlignment(bam2); } } // if both ends aren't mapped, we know that it's notispan and not ospan else if (_bamOutput == true) { writer.SaveAlignment(bam1); writer.SaveAlignment(bam2); } } }
int main(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc, argv); bool help, combineSffs; string sffFile; string bamFile; vector<string> infiles; opts.GetOption(help,"false", 'h', "help"); opts.GetOption(combineSffs,"false", 'c', "combine-sffs"); opts.GetOption(bamFile,"",'o',"out-filename"); opts.GetLeftoverArguments(infiles); if(help || infiles.empty()) { usage(); } if((!combineSffs) && infiles.size() > 1) { cerr << "sff2bam ERROR: if you want to combine all sff files into a single bam file, please use option -c true." << endl; usage(); } sffFile= infiles.front(); if(bamFile.length() < 1) { bamFile = sffFile.substr(0, sffFile.length() - 3); bamFile += "bam"; } sff_file_t* sff_file = sff_fopen(sffFile.c_str(), "r", NULL, NULL); if(!sff_file) { cerr << "sff2bam ERROR: fail to open " << sffFile << endl; exit(1); } // All sff files must have the same flow and key if(combineSffs && infiles.size() > 1) { for(size_t n = 1; n < infiles.size(); ++n) { sff_file_t* sff_file2 = sff_fopen(infiles[n].c_str(), "r", NULL, NULL); if(!sff_file2) { sff_fclose(sff_file); cerr << "sff2bam ERROR: fail to open " << infiles[n] << endl; exit(1); } if(strcmp(sff_file2->header->flow->s, sff_file->header->flow->s) != 0 || strcmp(sff_file2->header->key->s, sff_file->header->key->s) != 0) { sff_fclose(sff_file); sff_fclose(sff_file2); cerr << "sff2bam ERROR: " << sffFile << " and " << infiles[n] << " have different flows or keys." 
<< endl; exit(1); } sff_fclose(sff_file2); } } sff_t* sff = NULL; // Open 1st read for read group name sff = sff_read(sff_file); if(!sff) { sff_fclose(sff_file); cerr << "sff2bam ERROR: fail to read " << sffFile << endl; exit(1); } // Set up BAM header SamHeader sam_header; sam_header.Version = "1.4"; sam_header.SortOrder = "unsorted"; SamProgram sam_program("sff2bam"); sam_program.Name = "sff2bam"; sam_program.Version = SFF2BAM_VERSION; sam_program.CommandLine = "sff2bam"; sam_header.Programs.Add(sam_program); string rgname = sff->rheader->name->s; int index = rgname.find(":"); rgname = rgname.substr(0, index); SamReadGroup read_group(rgname); read_group.FlowOrder = sff->gheader->flow->s; read_group.KeySequence = sff->gheader->key->s; sam_header.ReadGroups.Add(read_group); RefVector refvec; BamWriter bamWriter; bamWriter.SetCompressionMode(BamWriter::Compressed); if(!bamWriter.Open(bamFile, sam_header, refvec)) { sff_fclose(sff_file); cerr << "sff2bam ERROR: failed to open " << bamFile << endl; exit(1); } // Save 1st read BamAlignment bam_alignment0; bam_alignment0.SetIsMapped(false); bam_alignment0.Name = sff->rheader->name->s; size_t nBases = sff->rheader->n_bases + 1 - sff->rheader->clip_qual_left; if(sff->rheader->clip_qual_right > 0) { nBases = sff->rheader->clip_qual_right - sff->rheader->clip_qual_left; } if(nBases > 0) { bam_alignment0.QueryBases.reserve(nBases); bam_alignment0.Qualities.reserve(nBases); for (int base = sff->rheader->clip_qual_left - 1; base < sff->rheader->clip_qual_right - 1; ++base) { bam_alignment0.QueryBases.push_back(sff->read->bases->s[base]); bam_alignment0.Qualities.push_back(sff->read->quality->s[base] + 33); } } int clip_flow = 0; for (unsigned int base = 0; base < sff->rheader->clip_qual_left && base < sff->rheader->n_bases; ++base) { clip_flow += sff->read->flow_index[base]; } if (clip_flow > 0) { clip_flow--; } bam_alignment0.AddTag("RG","Z", rgname); bam_alignment0.AddTag("PG","Z", string("sff2bam")); 
bam_alignment0.AddTag("ZF","i", clip_flow); // TODO: trim flow vector<uint16_t> flowgram0(sff->gheader->flow_length); copy(sff->read->flowgram, sff->read->flowgram + sff->gheader->flow_length, flowgram0.begin()); bam_alignment0.AddTag("FZ", flowgram0); sff_destroy(sff); sff = NULL; bamWriter.SaveAlignment(bam_alignment0); // Save rest reads while(NULL != (sff = sff_read(sff_file))) { BamAlignment bam_alignment; bam_alignment.SetIsMapped(false); bam_alignment.Name = sff->rheader->name->s; nBases = sff->rheader->n_bases + 1 - sff->rheader->clip_qual_left; if(sff->rheader->clip_qual_right > 0) { nBases = sff->rheader->clip_qual_right - sff->rheader->clip_qual_left; } if(nBases > 0) { bam_alignment.QueryBases.reserve(nBases); bam_alignment.Qualities.reserve(nBases); for (int base = sff->rheader->clip_qual_left - 1; base < sff->rheader->clip_qual_right - 1; ++base) { bam_alignment.QueryBases.push_back(sff->read->bases->s[base]); bam_alignment.Qualities.push_back(sff->read->quality->s[base] + 33); } } clip_flow = 0; for (unsigned int base = 0; base <= sff->rheader->clip_qual_left && base < sff->rheader->n_bases; ++base) { clip_flow += sff->read->flow_index[base]; } if (clip_flow > 0) { clip_flow--; } bam_alignment.AddTag("RG","Z", rgname); bam_alignment.AddTag("PG","Z", string("sff2bam")); bam_alignment.AddTag("ZF","i", clip_flow); // TODO: trim flow vector<uint16_t> flowgram(sff->gheader->flow_length); copy(sff->read->flowgram, sff->read->flowgram + sff->gheader->flow_length, flowgram.begin()); bam_alignment.AddTag("FZ", flowgram); sff_destroy(sff); sff = NULL; bamWriter.SaveAlignment(bam_alignment); } sff_fclose(sff_file); if(combineSffs && infiles.size() > 1) { for(size_t n = 1; n < infiles.size(); ++n) { sff_file_t* sff_file2 = sff_fopen(infiles[n].c_str(), "r", NULL, NULL); while(NULL != (sff = sff_read(sff_file2))) { BamAlignment bam_alignment; bam_alignment.SetIsMapped(false); bam_alignment.Name = sff->rheader->name->s; nBases = sff->rheader->n_bases + 1 - 
sff->rheader->clip_qual_left; if(sff->rheader->clip_qual_right > 0) { nBases = sff->rheader->clip_qual_right - sff->rheader->clip_qual_left; } if(nBases > 0) { bam_alignment.QueryBases.reserve(nBases); bam_alignment.Qualities.reserve(nBases); for (int base = sff->rheader->clip_qual_left - 1; base < sff->rheader->clip_qual_right - 1; ++base) { bam_alignment.QueryBases.push_back(sff->read->bases->s[base]); bam_alignment.Qualities.push_back(sff->read->quality->s[base] + 33); } } clip_flow = 0; for (unsigned int base = 0; base <= sff->rheader->clip_qual_left && base < sff->rheader->n_bases; ++base) { clip_flow += sff->read->flow_index[base]; } if (clip_flow > 0) { clip_flow--; } bam_alignment.AddTag("RG","Z", rgname); bam_alignment.AddTag("PG","Z", string("sff2bam")); bam_alignment.AddTag("ZF","i", clip_flow); // TODO: trim flow vector<uint16_t> flowgram(sff->gheader->flow_length); copy(sff->read->flowgram, sff->read->flowgram + sff->gheader->flow_length, flowgram.begin()); bam_alignment.AddTag("FZ", flowgram); sff_destroy(sff); sff = NULL; bamWriter.SaveAlignment(bam_alignment); } sff_fclose(sff_file2); } } bamWriter.Close(); return 0; }
int main ( int argc, char *argv[] ) { struct parameters *param = 0; param = interface(param, argc, argv); //bam input and generate index if not yet //-------------------------------------------------------------------------------------------------------+ // BAM input (file or filenames?) | //-------------------------------------------------------------------------------------------------------+ char *fof = param->mapping_f; FILE *IN=NULL; char linefof[5000]; int filecount=0; vector <string> fnames; if (strchr(fof,' ')!=NULL) { char *ptr; ptr=strtok(fof," "); while (ptr!=NULL) { fnames.push_back(ptr); filecount++; ptr=strtok(NULL," "); } } else { IN=fopen(fof,"rt"); if (IN!=NULL) { long linecount=0; while (fgets(linefof,5000-1,IN)!=NULL) { linecount++; if (linefof[0]!='#' && linefof[0]!='\n') { char *ptr=strchr(linefof,'\n'); if (ptr!=NULL && ptr[0]=='\n') { ptr[0]='\0'; } FILE *dummy=NULL; dummy=fopen(linefof,"rt"); if (dummy!=NULL) { // seems to be a file of filenames... fclose(dummy); fnames.push_back(linefof); filecount++; } else if (filecount==0 || linecount>=1000-1) { // seems to be a single file fnames.push_back(fof); filecount++; break; } } } fclose(IN); } } //file or file name decided and stored in vector "fnames" cerr << "the input mapping files are:" << endl; vector <string>::iterator fit = fnames.begin(); for(; fit != fnames.end(); fit++) { cerr << *fit << endl; } //-------------------------------------------------------------------------------------------------------+ // end of file or filenames | //-------------------------------------------------------------------------------------------------------+ // open the BAM file(s) BamMultiReader reader; reader.Open(fnames); // get header & reference information string header = reader.GetHeaderText(); RefVector refs = reader.GetReferenceData(); // attempt to open BamWriter BamWriter writer; string outputBam = param->writer; if ( outputBam != "" ) { if ( !writer.Open(param->writer, header, refs) ) { cerr << 
"Could not open output BAM file" << endl; exit(0); } } BamAlignment bam; while (reader.GetNextAlignment(bam)) { //change RG string rg = "RG"; string rgType = "Z"; string rgValue = "1"; bam.EditTag(rg,rgType,rgValue); writer.SaveAlignment(bam); } // read a bam return 0; } //main
bool MergeTool::MergeToolPrivate::Run(void) { // set to default input if none provided if ( !m_settings->HasInputBamFilename ) m_settings->InputFiles.push_back(Options::StandardIn()); // opens the BAM files (by default without checking for indexes) BamMultiReader reader; if ( !reader.Open(m_settings->InputFiles) ) { cerr << "bamtools merge ERROR: could not open input BAM file(s)... Aborting." << endl; return false; } // retrieve header & reference dictionary info std::string mergedHeader = reader.GetHeaderText(); RefVector references = reader.GetReferenceData(); // determine compression mode for BamWriter bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression ); BamWriter::CompressionMode compressionMode = BamWriter::Compressed; if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed; // open BamWriter BamWriter writer; writer.SetCompressionMode(compressionMode); if ( !writer.Open(m_settings->OutputFilename, mergedHeader, references) ) { cerr << "bamtools merge ERROR: could not open " << m_settings->OutputFilename << " for writing." << endl; reader.Close(); return false; } // if no region specified, store entire contents of file(s) if ( !m_settings->HasRegion ) { BamAlignment al; while ( reader.GetNextAlignmentCore(al) ) writer.SaveAlignment(al); } // otherwise attempt to use region as constraint else { // if region string parses OK BamRegion region; if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) { // attempt to find index files reader.LocateIndexes(); // if index data available for all BAM files, we can use SetRegion if ( reader.HasIndexes() ) { // attempt to use SetRegion(), if failed report error if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) { cerr << "bamtools merge ERROR: set region failed. 
Check that REGION describes a valid range" << endl; reader.Close(); return false; } // everything checks out, just iterate through specified region, storing alignments BamAlignment al; while ( reader.GetNextAlignmentCore(al) ) writer.SaveAlignment(al); } // no index data available, we have to iterate through until we // find overlapping alignments else { BamAlignment al; while ( reader.GetNextAlignmentCore(al) ) { if ( (al.RefID >= region.LeftRefID) && ( (al.Position + al.Length) >= region.LeftPosition ) && (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) ) { writer.SaveAlignment(al); } } } } // error parsing REGION string else { cerr << "bamtools merge ERROR: could not parse REGION - " << m_settings->Region << endl; cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid" << endl; reader.Close(); writer.Close(); return false; } } // clean & exit reader.Close(); writer.Close(); return true; }
//{{{ void process_intra_chrom_split(const BamAlignment &curr, void SV_SplitRead:: process_intra_chrom_split(const BamAlignment &curr, const RefVector refs, BamWriter &inter_chrom_reads, map<string, BamAlignment> &mapped_splits, UCSCBins<SV_BreakPoint*> &r_bin, int weight, int id, int sample_id, SV_SplitReadReader *_reader) { if (mapped_splits.find(curr.Name) == mapped_splits.end()) { uint32_t clipped = count_clipped(curr.CigarData); if ( curr.HasTag("YP") == true) { uint32_t t; curr.GetTag("YP", t); if (t == 2) mapped_splits[curr.Name] = curr; } else if (clipped >= _reader->min_clip) mapped_splits[curr.Name] = curr; } else { if ( mapped_splits[curr.Name].RefID == curr.RefID ) { try { SV_SplitRead *new_split_read = new SV_SplitRead(mapped_splits[curr.Name], curr, refs, weight, id, sample_id, _reader); SV_BreakPoint *new_bp = NULL; if (new_split_read->is_sane()) { new_bp = new_split_read->get_bp(); if (new_bp != NULL) { new_bp->cluster(r_bin); } else { cerr << "Alignment name:" << curr.Name << endl; free(new_split_read); } } else free(new_split_read); } catch (int) { cerr << "Error creating split read: " << endl; } } else { BamAlignment al1 = curr; BamAlignment al2 = mapped_splits[curr.Name]; al1.MateRefID = al2.RefID; al2.MateRefID = al1.RefID; al1.MatePosition = al2.Position; al2.MatePosition = al1.Position; string x = _reader->get_source_file_name(); al1.AddTag("LS","Z",x); al2.AddTag("LS","Z",x); inter_chrom_reads.SaveAlignment(al1); inter_chrom_reads.SaveAlignment(al2); } mapped_splits.erase(curr.Name); } }
bool FilterTool::FilterToolPrivate::Run(void) { // set to default input if none provided if ( !m_settings->HasInput && !m_settings->HasInputFilelist ) m_settings->InputFiles.push_back(Options::StandardIn()); // add files in the filelist to the input file list if ( m_settings->HasInputFilelist ) { ifstream filelist(m_settings->InputFilelist.c_str(), ios::in); if ( !filelist.is_open() ) { cerr << "bamtools filter ERROR: could not open input BAM file list... Aborting." << endl; return false; } string line; while ( getline(filelist, line) ) m_settings->InputFiles.push_back(line); } // initialize defined properties & user-specified filters // quit if failed if ( !SetupFilters() ) return false; // open reader without index BamMultiReader reader; if ( !reader.Open(m_settings->InputFiles) ) { cerr << "bamtools filter ERROR: could not open input files for reading." << endl; return false; } // retrieve reader header & reference data const string headerText = reader.GetHeaderText(); filterToolReferences = reader.GetReferenceData(); // determine compression mode for BamWriter bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression ); BamWriter::CompressionMode compressionMode = BamWriter::Compressed; if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed; // open BamWriter BamWriter writer; writer.SetCompressionMode(compressionMode); if ( !writer.Open(m_settings->OutputFilename, headerText, filterToolReferences) ) { cerr << "bamtools filter ERROR: could not open " << m_settings->OutputFilename << " for writing." 
<< endl; reader.Close(); return false; } // if no region specified, filter entire file BamAlignment al; if ( !m_settings->HasRegion ) { while ( reader.GetNextAlignment(al) ) { if ( CheckAlignment(al) ) writer.SaveAlignment(al); } } // otherwise attempt to use region as constraint else { // if region string parses OK BamRegion region; if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) { // attempt to find index files reader.LocateIndexes(); // if index data available for all BAM files, we can use SetRegion if ( reader.HasIndexes() ) { // attempt to use SetRegion(), if failed report error if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) { cerr << "bamtools filter ERROR: set region failed. Check that REGION describes a valid range" << endl; reader.Close(); return false; } // everything checks out, just iterate through specified region, filtering alignments while ( reader.GetNextAlignment(al) ) if ( CheckAlignment(al) ) writer.SaveAlignment(al); } // no index data available, we have to iterate through until we // find overlapping alignments else { while ( reader.GetNextAlignment(al) ) { if ( (al.RefID >= region.LeftRefID) && ((al.Position + al.Length) >= region.LeftPosition) && (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) ) { if ( CheckAlignment(al) ) writer.SaveAlignment(al); } } } } // error parsing REGION string else { cerr << "bamtools filter ERROR: could not parse REGION: " << m_settings->Region << endl; cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid" << endl; reader.Close(); return false; } } // clean up & exit reader.Close(); writer.Close(); return true; }
int main (int argc, char * argv[]) { vector<string> inputFilenames; string combinedOutFilename, alignmentsOutFilename; try { TCLAP::CmdLine cmd("Program description", ' ', VERSION); TCLAP::ValueArg<string> combinedOutputArg("o", "out", "Combined output filename (BAM format)", true, "", "combined.bam", cmd); TCLAP::ValueArg<int> minInsertArg("n", "min-insert", "Minimum insert size", false, DEFAULT_MIN_GAP, "min insert size", cmd); TCLAP::ValueArg<int> maxInsertArg("x", "max-insert", "Maximum insert size", false, DEFAULT_MAX_GAP, "max insert size", cmd); TCLAP::MultiArg<string> inputArgs("b", "bam", "Input BAM file", true, "input.bam", cmd); cmd.parse(argc, argv); combinedOutFilename = combinedOutputArg.getValue(); MIN_GAP = minInsertArg.getValue(); MAX_GAP = maxInsertArg.getValue(); inputFilenames = inputArgs.getValue(); } catch (TCLAP::ArgException &e) { cerr << "Error: " << e.error() << " " << e.argId() << endl; } // TODO require that alignments are sorted by name BamMultiReader reader; reader.Open(inputFilenames); if (!ValidOut.Open(combinedOutFilename, reader.GetHeader(), reader.GetReferenceData())) { cerr << ValidOut.GetErrorString() << endl; return 1; } string current, prev; char mateID; Group group; set<string> references; Alignment a; while (reader.GetNextAlignment(a)) { parseID(a.Name, current, mateID); if (current.compare(prev) && prev.size() > 0) { processGroup(group, references); group.clear(); references.clear(); } references.insert(a.RefName); GroupKey key; key.refID = a.RefName; key.mateID = mateID; key.rev = a.IsReverseStrand(); group.insert( std::make_pair( key, a ) ); prev = current; } processGroup(group, references); }
void TagBam::Tag() { // open the annotations files for processing; OpenAnnoFiles(); // open the BAM file BamReader reader; BamWriter writer; if (!reader.Open(_bamFile)) { cerr << "Failed to open BAM file " << _bamFile << endl; exit(1); } // get header & reference information string bamHeader = reader.GetHeaderText(); RefVector refs = reader.GetReferenceData(); // set compression mode BamWriter::CompressionMode compressionMode = BamWriter::Compressed; // if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed; writer.SetCompressionMode(compressionMode); // open our BAM writer writer.Open("stdout", bamHeader, refs); // rip through the BAM file and test for overlaps with each annotation file. BamAlignment al; vector<BED> hits; while (reader.GetNextAlignment(al)) { if (al.IsMapped() == true) { BED a; a.chrom = refs.at(al.RefID).RefName; a.start = al.Position; a.end = al.GetEndPosition(false, false); a.strand = "+"; if (al.IsReverseStrand()) a.strand = "-"; ostringstream annotations; // annotate the BAM file based on overlaps with the annotation files. for (size_t i = 0; i < _annoFiles.size(); ++i) { // grab the current annotation file. 
BedFile *anno = _annoFiles[i]; if (!_useNames && !_useScores && !_useIntervals) { // add the label for this annotation file to tag if there is overlap if (anno->anyHits(a.chrom, a.start, a.end, a.strand, _sameStrand, _diffStrand, _overlapFraction, false)) { annotations << _annoLabels[i] << ";"; } } // use the score field else if (!_useNames && _useScores && !_useIntervals) { anno->allHits(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand, 0.0, false); for (size_t i = 0; i < hits.size(); ++i) { annotations << hits[i].score; if (i < hits.size() - 1) annotations << ","; } if (hits.size() > 0) annotations << ";"; hits.clear(); } // use the name field from the annotation files to populate tag else if (_useNames && !_useScores && !_useIntervals) { anno->allHits(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand, 0.0, false); for (size_t j = 0; j < hits.size(); ++j) { annotations << hits[j].name; if (j < hits.size() - 1) annotations << ","; } if (hits.size() > 0) annotations << ";"; hits.clear(); } // use the full interval information annotation files to populate tag else if (!_useNames && !_useScores && _useIntervals) { anno->allHits(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand, 0.0, false); for (size_t j = 0; j < hits.size(); ++j) { annotations << _annoLabels[i] << ":" << hits[j].chrom << ":" << hits[j].start << "-" << hits[j].end << "," << hits[j].name << "," << hits[j].score << "," << hits[j].strand; if (j < hits.size() - 1) annotations << ","; } if (hits.size() > 0) annotations << ";"; hits.clear(); } } // were there any overlaps with which to make a tag? if (annotations.str().size() > 0) { al.AddTag(_tag, "Z", annotations.str().substr(0, annotations.str().size() - 1)); // get rid of the last ";" } } writer.SaveAlignment(al); } reader.Close(); writer.Close(); // close the annotations files; CloseAnnoFiles(); }
bool RandomTool::RandomToolPrivate::Run(void) { // set to default stdin if no input files provided if ( !m_settings->HasInput && !m_settings->HasInputFilelist ) m_settings->InputFiles.push_back(Options::StandardIn()); // add files in the filelist to the input file list if ( m_settings->HasInputFilelist ) { ifstream filelist(m_settings->InputFilelist.c_str(), ios::in); if ( !filelist.is_open() ) { cerr << "bamtools random ERROR: could not open input BAM file list... Aborting." << endl; return false; } string line; while ( getline(filelist, line) ) m_settings->InputFiles.push_back(line); } // open our reader BamMultiReader reader; if ( !reader.Open(m_settings->InputFiles) ) { cerr << "bamtools random ERROR: could not open input BAM file(s)... Aborting." << endl; return false; } // look up index files for all BAM files reader.LocateIndexes(); // make sure index data is available if ( !reader.HasIndexes() ) { cerr << "bamtools random ERROR: could not load index data for all input BAM file(s)... Aborting." << endl; reader.Close(); return false; } // get BamReader metadata const string headerText = reader.GetHeaderText(); const RefVector references = reader.GetReferenceData(); if ( references.empty() ) { cerr << "bamtools random ERROR: no reference data available... Aborting." << endl; reader.Close(); return false; } // determine compression mode for BamWriter bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression ); BamWriter::CompressionMode compressionMode = BamWriter::Compressed; if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed; // open BamWriter BamWriter writer; writer.SetCompressionMode(compressionMode); if ( !writer.Open(m_settings->OutputFilename, headerText, references) ) { cerr << "bamtools random ERROR: could not open " << m_settings->OutputFilename << " for writing... Aborting." 
<< endl; reader.Close(); return false; } // if user specified a REGION constraint, attempt to parse REGION string BamRegion region; if ( m_settings->HasRegion && !Utilities::ParseRegionString(m_settings->Region, reader, region) ) { cerr << "bamtools random ERROR: could not parse REGION: " << m_settings->Region << endl; cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid" << endl; reader.Close(); writer.Close(); return false; } // seed our random number generator srand( time(NULL) ); // grab random alignments BamAlignment al; unsigned int i = 0; while ( i < m_settings->AlignmentCount ) { int randomRefId = 0; int randomPosition = 0; // use REGION constraints to select random refId & position if ( m_settings->HasRegion ) { // select a random refId randomRefId = getRandomInt(region.LeftRefID, region.RightRefID); // select a random position based on randomRefId const int lowerBoundPosition = ( (randomRefId == region.LeftRefID) ? region.LeftPosition : 0 ); const int upperBoundPosition = ( (randomRefId == region.RightRefID) ? region.RightPosition : (references.at(randomRefId).RefLength - 1) ); randomPosition = getRandomInt(lowerBoundPosition, upperBoundPosition); } // otherwise select from all possible random refId & position else { // select random refId randomRefId = getRandomInt(0, (int)references.size() - 1); // select random position based on randomRefId const int lowerBoundPosition = 0; const int upperBoundPosition = references.at(randomRefId).RefLength - 1; randomPosition = getRandomInt(lowerBoundPosition, upperBoundPosition); } // if jump & read successful, save first alignment that overlaps random refId & position if ( reader.Jump(randomRefId, randomPosition) ) { while ( reader.GetNextAlignmentCore(al) ) { if ( al.RefID == randomRefId && al.Position >= randomPosition ) { writer.SaveAlignment(al); ++i; break; } } } } // cleanup & exit reader.Close(); writer.Close(); return true; }
// Intersect each alignment in bamFile against the "B" BED file. In BAM-output
// mode, alignments are written to stdout when they meet the overlap
// requirements (inverted by -v / _noHit); otherwise overlaps are reported in
// BED form via FindOverlaps / FindBlockedOverlaps.
void BedIntersect::IntersectBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB = new BedFile(_bedBFile);
    _bedB->loadBedFileIntoMap();

    // create a dummy BED A file for printing purposes if not
    // using BAM output.
    if (_bamOutput == false) {
        _bedA = new BedFile(_bedAFile);
        _bedA->bedType = 12;
    }

    // open the BAM file
    BamReader reader;
    BamWriter writer;
    if (!reader.Open(bamFile)) {
        // BUGFIX: open failure was previously ignored silently
        cerr << "Failed to open BAM file " << bamFile << endl;
        exit(1);
    }

    // get header & reference information
    string bamHeader = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // set compression mode
        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
        if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
        writer.SetCompressionMode(compressionMode);
        // open our BAM writer
        writer.Open("stdout", bamHeader, refs);
    }

    vector<BED> hits;
    // reserve some space
    hits.reserve(100);

    BamAlignment bam;
    // get each set of alignments for each pair.
    while (reader.GetNextAlignment(bam)) {

        // save an unaligned read if -v
        if (!bam.IsMapped()) {
            if (_noHit == true)
                writer.SaveAlignment(bam);
            continue;
        }

        // break alignment into discrete blocks,
        bedVector bed_blocks;
        string chrom = refs.at(bam.RefID).RefName;
        GetBamBlocks(bam, chrom, bed_blocks, false, true);

        // create a basic BED entry from the BAM alignment
        BED bed;
        MakeBedFromBam(bam, chrom, bed_blocks, bed);

        bool overlapsFound = false;
        if ((_bamOutput == true) && (_obeySplits == false)) {
            overlapsFound = _bedB->anyHits(bed.chrom, bed.start, bed.end, bed.strand,
                                           _sameStrand, _diffStrand,
                                           _overlapFraction, _reciprocal);
        }
        else if (_obeySplits == true) {
            // simplified: the original tested
            // (_bamOutput && _obeySplits) || (!_bamOutput && _obeySplits),
            // which reduces to _obeySplits.
            // find the hits that overlap with the full span of the blocked BED
            _bedB->allHits(bed.chrom, bed.start, bed.end, bed.strand, hits,
                           _sameStrand, _diffStrand,
                           _overlapFraction, _reciprocal);
            // find the overlaps between the block in A and B
            overlapsFound = FindBlockedOverlaps(bed, bed_blocks, hits, _bamOutput);
        }
        else if ((_bamOutput == false) && (_obeySplits == false)) {
            FindOverlaps(bed, hits);
        }

        // save the BAM alignment if overlap reqs. were met
        if (_bamOutput == true) {
            if ((overlapsFound == true) && (_noHit == false))
                writer.SaveAlignment(bam);
            else if ((overlapsFound == false) && (_noHit == true))
                writer.SaveAlignment(bam);
        }
        hits.clear();
    }

    // close the relevant BAM files.
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}