pos_t VariantProcessor::processAlignment(const BamAlignment& alignment) {
  /*
     Process a single alignment: rebuild the reference sequence it was
     aligned against (from its MD tag, via createReferenceSequence), then
     walk the CIGAR string and hand every match/mismatch run, insertion
     and deletion to the corresponding process* helper together with the
     current offsets into the reference and the read. Finally the
     alignment is appended to block_alignments.

     An alignment that does not carry both the NM and MD tags is reported
     on stderr and skipped: without MD the reference cannot be rebuilt,
     so the alignment is neither scanned nor stored.

     An unrecognised CIGAR operation is fatal (exit(1)).

     Currently always returns 0; the intended return value is the start
     position of this alignment (TODO correct?)
  */

  if (!alignment.HasTag("NM") || !alignment.HasTag("MD")) {
    std::cerr << "error: BamAlignment '" << alignment.Name
              << "' does not have either NM or MD tags" << std::endl;
    // Do not fall through: createReferenceSequence() would be fed an
    // empty MD string and the CIGAR walk below would index past the
    // end of the (empty) reconstructed reference.
    return 0; // TODO(vsbuffalo)
  }

  // Reconstruct reference sequence using MD tags
  string refseq = createReferenceSequence(alignment);

  // With reconstructed reference sequence and query sequence, look
  // for variants. It's a bit roundabout to reconstruct reference from
  // MD, then use it to find variants (already in MD) but keeping
  // state between CIGAR and MD is tricky. This also is a good
  // validation; variants found must match the number of variants in
  // CIGAR/MD.
  // TODO: that validation against NM is not implemented yet.
  vector<VariantPtr> read_variants;
  const vector<CigarOp>& cigar = alignment.CigarData;
  int refpos = 0, readpos = 0;

  for (vector<CigarOp>::const_iterator op = cigar.begin(); op != cigar.end(); ++op) {
    switch (op->Type) {
      case 'S':
        // soft clip: bases are present in the read only
        readpos += op->Length;
        break;
      case 'M':
        // match or SNP
        processMatchOrMismatch(alignment, read_variants, op->Length, refseq, refpos, readpos);
        readpos += op->Length;
        refpos += op->Length;
        break;
      case 'I':
        processInsertion(alignment, read_variants, op->Length, refseq, refpos, readpos);
        readpos += op->Length;
        break;
      case 'D':
        processDeletion(alignment, read_variants, op->Length, refseq, refpos, readpos);
        refpos += op->Length; // deletion w.r.t reference; skip ref length
        break;
      default:
        cerr << "error: unidentified CIGAR type: " << op->Type << endl;
        exit(1);
    }
  }

  // Add to alignments list
  block_alignments.push_back(alignment);
  return 0; // TODO(vsbuffalo)
}
string createReferenceSequence(const BamAlignment& alignment) {
  // Recreate the reference sequence underlying a particular alignment,
  // using the read bases, the CIGAR string and the MD tag. This means
  // skipping insertions and soft clipped regions of the read, adding
  // deleted reference bases back in, replacing mismatched read bases
  // with the reference allele, and keeping read matches.
  //
  // Returns the reconstructed reference bases (aligned bases plus
  // deletions). Consistency between CIGAR and MD is checked with assert.
  const vector<CigarOp>& cigar = alignment.CigarData;
  const string& querybases = alignment.QueryBases;

  string md_tag;
  alignment.GetTag("MD", md_tag);

  vector<MDToken> tokens;
  string refseq, alignedseq; // final ref bases; aligned portion of read bases
  int md_len = TokenizeMD(md_tag, tokens);

  // Create reference-aligned sequence of read; doesn't contain soft
  // clips or insertions. Then, deletions and reference alleles are
  // added onto this.
  int pos = 0; // offset into querybases
  for (vector<CigarOp>::const_iterator op = cigar.begin(); op != cigar.end(); ++op) {
    switch (op->Type) {
      case 'M':
      case '=':
      case 'X':
        // aligned bases: present in both read and reference
        alignedseq.append(querybases.substr(pos, op->Length));
        pos += op->Length;
        break;
      case 'I':
      case 'S':
        // present in the read only; increment read position past skipped bases
        pos += op->Length;
        break;
      default:
        // D, N, H and P do not consume any stored read bases. Copying
        // or skipping read bases for them (as was done before) shifts
        // every later base and can run past the end of the read.
        break;
    }
  }

  // the size of the aligned sequence MUST equal what is returned from
  // TokenizeMD: the number of aligned bases. Note the real reference
  // sequence is this length + deletions, which we add in below.
  assert(alignedseq.size() == static_cast<string::size_type>(md_len));

  pos = 0; // now an offset into alignedseq
  for (vector<MDToken>::const_iterator it = tokens.begin(); it != tokens.end(); ++it) {
    if (it->type == MDType::isMatch) {
      refseq.append(alignedseq.substr(pos, it->length));
      pos += it->length;
    } else if (it->type == MDType::isSNP) {
      // reference allele replaces the mismatching read base(s)
      assert(it->length == it->seq.size());
      refseq.append(it->seq);
      pos += it->length;
    } else if (it->type == MDType::isDel) {
      // does not increment position in alignedseq
      assert(it->length == it->seq.size());
      refseq.append(it->seq);
    } else {
      assert(false);
    }
  }
  return refseq;
}
// reverts a BAM alignment // default behavior (for now) is : replace Qualities with OQ, clear IsDuplicate flag // can override default behavior using command line options void RevertTool::RevertToolPrivate::RevertAlignment(BamAlignment& al) { // replace Qualities with OQ, if requested if ( !m_settings->IsKeepQualities ) { string originalQualities; if ( al.GetTag(m_OQ, originalQualities) ) { al.Qualities = originalQualities; al.RemoveTag(m_OQ); } } // clear duplicate flag, if requested if ( !m_settings->IsKeepDuplicateFlag ) al.SetIsDuplicate(false); }
int main (int argc, char *argv[]) { if( (argc== 1) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cout<<"Usage:editDist [in bam]"<<endl<<"this program returns the NM field of all aligned reads"<<endl; return 1; } string bamfiletopen = string(argv[1]); // cout<<bamfiletopen<<endl; BamReader reader; // cout<<"ok"<<endl; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM files." << endl; return 1; } BamAlignment al; // cout<<"ok"<<endl; while ( reader.GetNextAlignment(al) ) { // cout<<al.Name<<endl; if(!al.IsMapped()) continue; if(al.HasTag("NM") ){ int editDist; if(al.GetTag("NM",editDist) ){ cout<<editDist<<endl; }else{ cerr<<"Cannot retrieve NM field for "<<al.Name<<endl; return 1; } }else{ cerr<<"Warning: read "<<al.Name<<" is aligned but has no NM field"<<endl; } } //while al reader.Close(); return 0; }
/**
 * Determines the library a record belongs to by looking up the record's
 * RG tag in the header's read group dictionary.
 *
 * @param header header whose ReadGroups dictionary is consulted
 * @param rec    record whose RG tag is read
 * @return the Library of the matching read group, or the constant
 *         "Unknown Library" when the record has no (or an empty) RG tag,
 *         the read group is not in the header, or it has no library set
 */
string MarkDuplicates::getLibraryName(SamHeader & header, const BamAlignment & rec) {
    static const string RG("RG");
    static const string unknown_library("Unknown Library");

    // stays empty when the tag is absent
    string rg_id;
    rec.GetTag(RG, rg_id);

    if (rg_id.empty() || !header.ReadGroups.Contains(rg_id)) {
        return unknown_library;
    }

    SamReadGroupDictionary & groups = header.ReadGroups;
    const SamReadGroup & group = groups[rg_id];
    if (!group.HasLibrary()) {
        return unknown_library;
    }
    return group.Library;
}
//bool SampleManager::IdentifySample(Alignment& ra) const bool SampleManager::IdentifySample(const BamAlignment& alignment, int& sample_index, bool& primary_sample) const { string read_group; if (!alignment.GetTag("RG", read_group)) { cerr << "ERROR: Couldn't find read group id (@RG tag) for BAM Alignment " << alignment.Name << endl; exit(1); } map<string,int>::const_iterator I = read_group_to_sample_idx_.find(read_group); if (I == read_group_to_sample_idx_.end()) return false; sample_index =I->second; primary_sample = (sample_index == primary_sample_); return true; }
int IonstatsTestFragments(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc, argv); string input_bam_filename = opts.GetFirstString('i', "input", ""); string fasta_filename = opts.GetFirstString('r', "ref", ""); string output_json_filename = opts.GetFirstString('o', "output", "ionstats_tf.json"); int histogram_length = opts.GetFirstInt ('h', "histogram-length", 400); if(argc < 2 or input_bam_filename.empty() or fasta_filename.empty()) { IonstatsTestFragmentsHelp(); return 1; } // // Prepare for metric calculation // map<string,string> tf_sequences; PopulateReferenceSequences(tf_sequences, fasta_filename); BamReader input_bam; if (!input_bam.Open(input_bam_filename)) { fprintf(stderr, "[ionstats] ERROR: cannot open %s\n", input_bam_filename.c_str()); return 1; } int num_tfs = input_bam.GetReferenceCount(); SamHeader sam_header = input_bam.GetHeader(); if(!sam_header.HasReadGroups()) { fprintf(stderr, "[ionstats] ERROR: no read groups in %s\n", input_bam_filename.c_str()); return 1; } string flow_order; string key; for (SamReadGroupIterator rg = sam_header.ReadGroups.Begin(); rg != sam_header.ReadGroups.End(); ++rg) { if(rg->HasFlowOrder()) flow_order = rg->FlowOrder; if(rg->HasKeySequence()) key = rg->KeySequence; } // Need these metrics stratified by TF. 
vector<ReadLengthHistogram> called_histogram(num_tfs); vector<ReadLengthHistogram> aligned_histogram(num_tfs); vector<ReadLengthHistogram> AQ10_histogram(num_tfs); vector<ReadLengthHistogram> AQ17_histogram(num_tfs); vector<SimpleHistogram> error_by_position(num_tfs); vector<MetricGeneratorSNR> system_snr(num_tfs); vector<MetricGeneratorHPAccuracy> hp_accuracy(num_tfs); for (int tf = 0; tf < num_tfs; ++tf) { called_histogram[tf].Initialize(histogram_length); aligned_histogram[tf].Initialize(histogram_length); AQ10_histogram[tf].Initialize(histogram_length); AQ17_histogram[tf].Initialize(histogram_length); error_by_position[tf].Initialize(histogram_length); } vector<uint16_t> flow_signal_fz(flow_order.length()); vector<int16_t> flow_signal_zm(flow_order.length()); const RefVector& refs = input_bam.GetReferenceData(); // Missing: // - hp accuracy - tough, copy verbatim from TFMapper? BamAlignment alignment; vector<char> MD_op; vector<int> MD_len; MD_op.reserve(1024); MD_len.reserve(1024); string MD_tag; // // Main loop over mapped reads in the input BAM // while(input_bam.GetNextAlignment(alignment)) { if (!alignment.IsMapped() or !alignment.GetTag("MD",MD_tag)) continue; // The check below eliminates unexpected alignments if (alignment.IsReverseStrand() or alignment.Position > 5) continue; int current_tf = alignment.RefID; // // Step 1. Parse MD tag // MD_op.clear(); MD_len.clear(); for (const char *MD_ptr = MD_tag.c_str(); *MD_ptr;) { int item_length = 0; if (*MD_ptr >= '0' and *MD_ptr <= '9') { // Its a match MD_op.push_back('M'); for (; *MD_ptr and *MD_ptr >= '0' and *MD_ptr <= '9'; ++MD_ptr) item_length = 10*item_length + *MD_ptr - '0'; } else { if (*MD_ptr == '^') { // Its a deletion MD_ptr++; MD_op.push_back('D'); } else // Its a substitution MD_op.push_back('X'); for (; *MD_ptr and *MD_ptr >= 'A' and *MD_ptr <= 'Z'; ++MD_ptr) item_length++; } MD_len.push_back(item_length); } // // Step 2. 
Synchronously scan through Cigar and MD, doing error accounting // int MD_idx = alignment.IsReverseStrand() ? MD_op.size()-1 : 0; int cigar_idx = alignment.IsReverseStrand() ? alignment.CigarData.size()-1 : 0; int increment = alignment.IsReverseStrand() ? -1 : 1; int AQ10_bases = 0; int AQ17_bases = 0; int num_bases = 0; int num_errors = 0; while (cigar_idx < (int)alignment.CigarData.size() and MD_idx < (int) MD_op.size() and cigar_idx >= 0 and MD_idx >= 0) { if (alignment.CigarData[cigar_idx].Length == 0) { // Try advancing cigar cigar_idx += increment; continue; } if (MD_len[MD_idx] == 0) { // Try advancing MD MD_idx += increment; continue; } // Match if (alignment.CigarData[cigar_idx].Type == 'M' and MD_op[MD_idx] == 'M') { int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]); num_bases += advance; alignment.CigarData[cigar_idx].Length -= advance; MD_len[MD_idx] -= advance; // Insertion (read has a base, reference doesn't) } else if (alignment.CigarData[cigar_idx].Type == 'I') { int advance = alignment.CigarData[cigar_idx].Length; for (int cnt = 0; cnt < advance; ++cnt) { error_by_position[current_tf].Add(num_bases); num_bases++; num_errors++; } alignment.CigarData[cigar_idx].Length -= advance; // Deletion (reference has a base, read doesn't) } else if (alignment.CigarData[cigar_idx].Type == 'D' and MD_op[MD_idx] == 'D') { int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]); for (int cnt = 0; cnt < advance; ++cnt) { error_by_position[current_tf].Add(num_bases); num_errors++; } alignment.CigarData[cigar_idx].Length -= advance; MD_len[MD_idx] -= advance; // Substitution } else if (MD_op[MD_idx] == 'X') { int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]); for (int cnt = 0; cnt < advance; ++cnt) { error_by_position[current_tf].Add(num_bases); num_bases++; num_errors++; } alignment.CigarData[cigar_idx].Length -= advance; MD_len[MD_idx] -= advance; } else { printf("ionstats tf: Unexpected 
OP combination: %s Cigar=%c, MD=%c !\n", alignment.Name.c_str(), alignment.CigarData[cigar_idx].Type, MD_op[MD_idx]); break; } if (num_errors*10 <= num_bases) AQ10_bases = num_bases; if (num_errors*50 <= num_bases) AQ17_bases = num_bases; } // // Step 3. Profit // called_histogram[current_tf].Add(alignment.Length); aligned_histogram[current_tf].Add(num_bases); AQ10_histogram[current_tf].Add(AQ10_bases); AQ17_histogram[current_tf].Add(AQ17_bases); if(alignment.GetTag("ZM", flow_signal_zm)) system_snr[current_tf].Add(flow_signal_zm, key.c_str(), flow_order); else if(alignment.GetTag("FZ", flow_signal_fz)) system_snr[current_tf].Add(flow_signal_fz, key.c_str(), flow_order); // HP accuracy - keeping it simple if (!alignment.IsReverseStrand()) { string genome = key + tf_sequences[refs[current_tf].RefName]; string calls = key + alignment.QueryBases; const char *genome_ptr = genome.c_str(); const char *calls_ptr = calls.c_str(); for (int flow = 0; flow < (int)flow_order.length() and *genome_ptr and *calls_ptr; ++flow) { int genome_hp = 0; int calls_hp = 0; while (*genome_ptr == flow_order[flow]) { genome_hp++; genome_ptr++; } while (*calls_ptr == flow_order[flow]) { calls_hp++; calls_ptr++; } hp_accuracy[current_tf].Add(genome_hp, calls_hp); } } } // // Processing complete, generate ionstats_tf.json // Json::Value output_json(Json::objectValue); output_json["meta"]["creation_date"] = get_time_iso_string(time(NULL)); output_json["meta"]["format_name"] = "ionstats_tf"; output_json["meta"]["format_version"] = "1.0"; output_json["results_by_tf"] = Json::objectValue; for (int tf = 0; tf < num_tfs; ++tf) { if (aligned_histogram[tf].num_reads() < 1000) continue; called_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["full"]); aligned_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["aligned"]); AQ10_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["AQ10"]); 
AQ17_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["AQ17"]); error_by_position[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["error_by_position"]); system_snr[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]); hp_accuracy[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]); output_json["results_by_tf"][refs[tf].RefName]["sequence"] = tf_sequences[refs[tf].RefName]; } input_bam.Close(); ofstream out(output_json_filename.c_str(), ios::out); if (out.good()) { out << output_json.toStyledString(); return 0; } else { fprintf(stderr, "ERROR: unable to write to '%s'\n", output_json_filename.c_str()); return 1; } }
//{{{ void process_intra_chrom_split(const BamAlignment &curr, void SV_SplitRead:: process_intra_chrom_split(const BamAlignment &curr, const RefVector refs, BamWriter &inter_chrom_reads, map<string, BamAlignment> &mapped_splits, UCSCBins<SV_BreakPoint*> &r_bin, int weight, int id, int sample_id, SV_SplitReadReader *_reader) { if (mapped_splits.find(curr.Name) == mapped_splits.end()) { uint32_t clipped = count_clipped(curr.CigarData); if ( curr.HasTag("YP") == true) { uint32_t t; curr.GetTag("YP", t); if (t == 2) mapped_splits[curr.Name] = curr; } else if (clipped >= _reader->min_clip) mapped_splits[curr.Name] = curr; } else { if ( mapped_splits[curr.Name].RefID == curr.RefID ) { try { SV_SplitRead *new_split_read = new SV_SplitRead(mapped_splits[curr.Name], curr, refs, weight, id, sample_id, _reader); SV_BreakPoint *new_bp = NULL; if (new_split_read->is_sane()) { new_bp = new_split_read->get_bp(); if (new_bp != NULL) { new_bp->cluster(r_bin); } else { cerr << "Alignment name:" << curr.Name << endl; free(new_split_read); } } else free(new_split_read); } catch (int) { cerr << "Error creating split read: " << endl; } } else { BamAlignment al1 = curr; BamAlignment al2 = mapped_splits[curr.Name]; al1.MateRefID = al2.RefID; al2.MateRefID = al1.RefID; al1.MatePosition = al2.Position; al2.MatePosition = al1.Position; string x = _reader->get_source_file_name(); al1.AddTag("LS","Z",x); al2.AddTag("LS","Z",x); inter_chrom_reads.SaveAlignment(al1); inter_chrom_reads.SaveAlignment(al2); } mapped_splits.erase(curr.Name); } }
int main (int argc, char *argv[]) { if( (argc!= 3) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cerr<<"Usage:splitByRG [in bam] [out prefix]"<<endl<<"this program creates one bam file per RG in the with the outprefix\nFor example splitByRG in.bam out will create\nout.rg1.bam\nout.rg2.bam\n"<<endl; return 1; } string bamfiletopen = string(argv[1]); // if(!strEndsWith(bamfiletopen,".bam")){ // } string bamDirOutPrefix = string(argv[2]); map<string,BamWriter *> rg2BamWriter; // if(!isDirectory(bamDirOut)){ // cerr<<"ERROR: the out directory does not exist"<<endl; // return 1; // } BamReader reader; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM files." << endl; return 1; } SamHeader header = reader.GetHeader(); const RefVector references = reader.GetReferenceData(); vector<RefData> refData=reader.GetReferenceData(); string pID = "splitByRG"; string pName = "splitByRG"; string pCommandLine = ""; for(int i=0;i<(argc);i++){ pCommandLine += (string(argv[i])+" "); } putProgramInHeader(&header,pID,pName,pCommandLine,returnGitHubVersion(string(argv[0]),"..")); SamReadGroupDictionary srgd=header.ReadGroups; for(SamReadGroupConstIterator srgci=srgd.ConstBegin(); srgci<srgd.ConstEnd(); srgci++){ //cout<<*srgci<<endl; const SamReadGroup rg = (*srgci); //cout<<rg.ID<<endl; rg2BamWriter[rg.ID] = new BamWriter(); rg2BamWriter[rg.ID]->Open(bamDirOutPrefix+"."+rg.ID+".bam",header,references); } BamAlignment al; unsigned int total=0; while ( reader.GetNextAlignment(al) ) { // al.SetIsFailedQC(false); // writer.SaveAlignment(al); // if(al.IsMapped () ){ // if(rg2BamWriter.find(refData[al.RefID].RefName) == rg2BamWriter.end()){ //new // rg2BamWriter[refData[al.RefID].RefName] = new BamWriter(); // if ( !rg2BamWriter[refData[al.RefID].RefName]->Open(bamDirOutPrefix+"."+refData[al.RefID].RefName+".bam",header,references) ) { // cerr << "Could not open output BAM file "<< 
bamDirOutPrefix<<"."<<refData[al.RefID].RefName<<".bam" << endl; // return 1; // } // }else{ // rg2BamWriter[refData[al.RefID].RefName]->SaveAlignment(al); // } // }else{ // unmapped.SaveAlignment(al); // } if(al.HasTag("RG")){ string rgTag; al.GetTag("RG",rgTag); //cout<<rgTag<<endl; if(rg2BamWriter.find(rgTag) == rg2BamWriter.end()){ //new cerr<<"Found new RG "<<rgTag<<endl; rg2BamWriter[rgTag] = new BamWriter(); if ( !rg2BamWriter[rgTag]->Open(bamDirOutPrefix+"."+rgTag+".bam",header,references) ) { cerr << "Could not open output BAM file "<< bamDirOutPrefix<<"."<<rgTag<<".bam" << endl; return 1; } rg2BamWriter[rgTag]->SaveAlignment(al); }else{ rg2BamWriter[rgTag]->SaveAlignment(al); } }else{ string rgTag="unknown"; //cout<<rgTag<<endl; if(rg2BamWriter.find(rgTag) == rg2BamWriter.end()){ //new cerr<<"Found new RG "<<rgTag<<endl; rg2BamWriter[rgTag] = new BamWriter(); if ( !rg2BamWriter[rgTag]->Open(bamDirOutPrefix+"."+rgTag+".bam",header,references) ) { cerr << "Could not open output BAM file "<< bamDirOutPrefix<<"."<<rgTag<<".bam" << endl; return 1; } rg2BamWriter[rgTag]->SaveAlignment(al); }else{ rg2BamWriter[rgTag]->SaveAlignment(al); } // cerr << "Cannot get RG tag for " << al.Name<<endl; // return 1; } total++; } //while al reader.Close(); // writer.Close(); // unmapped.Close(); map<string,BamWriter *>::iterator rg2BamWriterIt; for (rg2BamWriterIt =rg2BamWriter.begin(); rg2BamWriterIt!=rg2BamWriter.end(); rg2BamWriterIt++){ rg2BamWriterIt->second->Close(); } cerr<<"Wrote succesfully "<<total<<" reads"<<endl; return 0; }
int main (int argc, const char *argv[]) { printf ("------------- bamrealignment --------------\n"); OptArgs opts; opts.ParseCmdLine(argc, argv); vector<int> score_vals(4); string input_bam = opts.GetFirstString ('i', "input", ""); string output_bam = opts.GetFirstString ('o', "output", ""); opts.GetOption(score_vals, "4,-6,-5,-2", 's', "scores"); int clipping = opts.GetFirstInt ('c', "clipping", 2); bool anchors = opts.GetFirstBoolean ('a', "anchors", true); int bandwidth = opts.GetFirstInt ('b', "bandwidth", 10); bool verbose = opts.GetFirstBoolean ('v', "verbose", false); bool debug = opts.GetFirstBoolean ('d', "debug", false); int format = opts.GetFirstInt ('f', "format", 1); int num_threads = opts.GetFirstInt ('t', "threads", 8); string log_fname = opts.GetFirstString ('l', "log", ""); if (input_bam.empty() or output_bam.empty()) return PrintHelp(); opts.CheckNoLeftovers(); std::ofstream logf; if (log_fname.size ()) { logf.open (log_fname.c_str ()); if (!logf.is_open ()) { fprintf (stderr, "bamrealignment: Failed to open log file %s\n", log_fname.c_str()); return 1; } } BamReader reader; if (!reader.Open(input_bam)) { fprintf(stderr, "bamrealignment: Failed to open input file %s\n", input_bam.c_str()); return 1; } SamHeader header = reader.GetHeader(); RefVector refs = reader.GetReferenceData(); BamWriter writer; writer.SetNumThreads(num_threads); if (format == 1) writer.SetCompressionMode(BamWriter::Uncompressed); else writer.SetCompressionMode(BamWriter::Compressed); if (!writer.Open(output_bam, header, refs)) { fprintf(stderr, "bamrealignment: Failed to open output file %s\n", output_bam.c_str()); return 1; } // The meat starts here ------------------------------------ if (verbose) cout << "Verbose option is activated, each alignment will print to screen." 
<< endl << " After a read hit RETURN to continue to the next one," << endl << " or press q RETURN to quit the program," << endl << " or press s Return to silence verbose," << endl << " or press c RETURN to continue printing without further prompt." << endl << endl; unsigned int readcounter = 0; unsigned int mapped_readcounter = 0; unsigned int realigned_readcounter = 0; unsigned int modified_alignment_readcounter = 0; unsigned int pos_update_readcounter = 0; unsigned int failed_clip_realigned_readcount = 0; unsigned int already_perfect_readcount = 0; unsigned int bad_md_tag_readcount = 0; unsigned int error_recreate_ref_readcount = 0; unsigned int error_clip_anchor_readcount = 0; unsigned int error_sw_readcount = 0; unsigned int error_unclip_readcount = 0; unsigned int start_position_shift; int orig_position; int new_position; string md_tag, new_md_tag, input = "x"; vector<CigarOp> new_cigar_data; vector<MDelement> new_md_data; bool position_shift = false; time_t start_time = time(NULL); Realigner aligner; aligner.verbose_ = verbose; aligner.debug_ = debug; if (!aligner.SetScores(score_vals)) cout << "bamrealignment: Four scores need to be provided: match, mismatch, gap open, gap extend score!" << endl; aligner.SetAlignmentBandwidth(bandwidth); BamAlignment alignment; while(reader.GetNextAlignment(alignment)){ readcounter ++; position_shift = false; if ( (readcounter % 100000) == 0 ) cout << "Processed " << readcounter << " reads. Elapsed time: " << (time(NULL) - start_time) << endl; if (alignment.IsMapped()) { orig_position = alignment.Position; mapped_readcounter++; aligner.SetClipping(clipping, !alignment.IsReverseStrand()); if (aligner.verbose_) { cout << endl; if (alignment.IsReverseStrand()) cout << "The read is from the reverse strand." << endl; else cout << "The read is from the forward strand." << endl; } if (!alignment.GetTag("MD", md_tag)) { if (aligner.verbose_) cout << "Warning: Skipping read " << alignment.Name << ". It is mapped but missing MD tag." 
<< endl; if (logf.is_open ()) logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "MISSMD" << '\n'; bad_md_tag_readcount++; } else if (aligner.CreateRefFromQueryBases(alignment.QueryBases, alignment.CigarData, md_tag, anchors)) { bool clipfail = false; if (Realigner::CR_ERR_CLIP_ANCHOR == aligner.GetCreateRefError ()) { clipfail = true; failed_clip_realigned_readcount ++; } if (!aligner.computeSWalignment(new_cigar_data, new_md_data, start_position_shift)) { if (aligner.verbose_) cout << "Error in the alignment! Not updating read information." << endl; if (logf.is_open ()) logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "SWERR" << '\n'; error_sw_readcount++; writer.SaveAlignment(alignment); // Write alignment unchanged continue; } if (!aligner.addClippedBasesToTags(new_cigar_data, new_md_data, alignment.QueryBases.size())) { if (aligner.verbose_) cout << "Error when adding clipped anchors back to tags! Not updating read information." 
<< endl; if (logf.is_open ()) logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "UNCLIPERR" << '\n'; writer.SaveAlignment(alignment); // Write alignment unchanged error_unclip_readcount ++; continue; } new_md_tag = aligner.GetMDstring(new_md_data); realigned_readcounter++; // adjust start position of read if (!aligner.LeftAnchorClipped() and start_position_shift != 0) { new_position = aligner.updateReadPosition(alignment.CigarData, (int)start_position_shift, alignment.Position); if (new_position != alignment.Position) { pos_update_readcounter++; position_shift = true; alignment.Position = new_position; } } if (position_shift || alignment.CigarData.size () != new_cigar_data.size () || md_tag != new_md_tag) { if (logf.is_open ()) { logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "MOD"; if (position_shift) logf << "-SHIFT"; if (clipfail) logf << " NOCLIP"; logf << '\n'; } modified_alignment_readcounter++; } else { if (logf.is_open ()) { logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "UNMOD"; if (clipfail) logf << " NOCLIP"; logf << '\n'; } } if (aligner.verbose_){ cout << alignment.Name << endl; cout << "------------------------------------------" << endl; // Wait for input to continue or quit program if (input.size() == 0) input = 'x'; else if (input[0] != 'c' and input[0] != 'C') getline(cin, input); if (input.size()>0){ if (input[0] == 'q' or input[0] == 'Q') return 1; else if (input[0] == 's' or input[0] == 'S') aligner.verbose_ = false; } } // Finally update alignment information alignment.CigarData = new_cigar_data; alignment.EditTag("MD", "Z" , new_md_tag); } // end of CreateRef else if else { switch (aligner.GetCreateRefError ()) { case 
Realigner::CR_ERR_RECREATE_REF: if (logf.is_open ()) logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "RECRERR" << '\n'; error_recreate_ref_readcount++; break; case Realigner::CR_ERR_CLIP_ANCHOR: if (logf.is_open ()) logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "CLIPERR" << '\n'; error_clip_anchor_readcount++; break; default: // On a good run this writes way too many reads to the log file - don't want to create a too large txt file // if (logf.is_open ()) //logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "PERFECT" << '\n'; already_perfect_readcount++; break; } if (aligner.verbose_) { cout << alignment.Name << endl; cout << "------------------------------------------" << endl; // Wait for input to continue or quit program if (input.size() == 0) input = 'x'; else if (input[0] != 'c' and input[0] != 'C') getline(cin, input); if (input.size()>0){ if (input[0] == 'q' or input[0] == 'Q') return 1; else if (input[0] == 's' or input[0] == 'S') aligner.verbose_ = false; } } } // --- Debug output for Rajesh --- if (debug && aligner.invalid_cigar_in_input) { aligner.verbose_ = true; cout << "Invalid cigar string / md tag pair in read " << alignment.Name << endl; // Rerun reference generation to display error aligner.CreateRefFromQueryBases(alignment.QueryBases, alignment.CigarData, md_tag, anchors); aligner.verbose_ = verbose; aligner.invalid_cigar_in_input = false; } // --- --- --- } // end of if isMapped writer.SaveAlignment(alignment); } // end while loop over reads if (aligner.invalid_cigar_in_input) cerr << "WARNING bamrealignment: There were invalid cigar string / md tag pairs in the input bam file." 
<< endl; // ---------------------------------------------------------------- // program end -- output summary information cout << " File: " << input_bam << endl << " Total reads: " << readcounter << endl << " Mapped reads: " << mapped_readcounter << endl; if (bad_md_tag_readcount) cout << " Skipped: bad MD tags: " << bad_md_tag_readcount << endl; if (error_recreate_ref_readcount) cout << " Skipped: unable to recreate ref: " << error_recreate_ref_readcount << endl; if (error_clip_anchor_readcount) cout << " Skipped: error clipping anchor: " << error_clip_anchor_readcount << endl; cout << " Skipped: already perfect: " << already_perfect_readcount << endl << " Total reads realigned: " << mapped_readcounter - already_perfect_readcount - bad_md_tag_readcount - error_recreate_ref_readcount - error_clip_anchor_readcount << endl; if (failed_clip_realigned_readcount) cout << " (including " << failed_clip_realigned_readcount << " that failed to clip)" << endl; if (error_sw_readcount) cout << " Failed to complete SW alignment: " << error_sw_readcount << endl; if (error_unclip_readcount) cout << " Failed to unclip anchor: " << error_unclip_readcount << endl; cout << " Succesfully realigned: " << realigned_readcounter << endl << " Modified alignments: " << modified_alignment_readcounter << endl << " Shifted position: " << pos_update_readcounter << endl; cout << "Processing time: " << (time(NULL)-start_time) << " seconds." << endl; cout << "INFO: The output BAM file may be unsorted." << endl; cout << "------------------------------------------" << endl; return 0; }
bool checkAlignmentTag(const PropertyFilterValue& valueFilter, const BamAlignment& al) { // ensure filter contains string data Variant entireTagFilter = valueFilter.Value; if ( !entireTagFilter.is_type<string>() ) return false; // localize string from variant const string& entireTagFilterString = entireTagFilter.get<string>(); // ensure we have at least "XX:x" if ( entireTagFilterString.length() < 4 ) return false; // get tagName & lookup in alignment // if found, set tagType to tag type character // if not found, return false const string& tagName = entireTagFilterString.substr(0,2); char tagType = '\0'; if ( !al.GetTagType(tagName, tagType) ) return false; // remove tagName & ":" from beginning tagFilter string tagFilterString = entireTagFilterString.substr(3); // switch on tag type to set tag query value & parse filter token int32_t intFilterValue, intQueryValue; uint32_t uintFilterValue, uintQueryValue; float realFilterValue, realQueryValue; string stringFilterValue, stringQueryValue; PropertyFilterValue tagFilter; PropertyFilterValue::ValueCompareType compareType; bool keepAlignment = false; switch (tagType) { // signed int tag type case 'c' : case 's' : case 'i' : if ( al.GetTag(tagName, intQueryValue) ) { if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, intFilterValue, compareType) ) { tagFilter.Value = intFilterValue; tagFilter.Type = compareType; keepAlignment = tagFilter.check(intQueryValue); } } break; // unsigned int tag type case 'C' : case 'S' : case 'I' : if ( al.GetTag(tagName, uintQueryValue) ) { if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, uintFilterValue, compareType) ) { tagFilter.Value = uintFilterValue; tagFilter.Type = compareType; keepAlignment = tagFilter.check(uintQueryValue); } } break; // 'real' tag type case 'f' : if ( al.GetTag(tagName, realQueryValue) ) { if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, realFilterValue, compareType) ) { tagFilter.Value = realFilterValue; 
tagFilter.Type = compareType; keepAlignment = tagFilter.check(realQueryValue); } } break; // string tag type case 'A': case 'Z': case 'H': if ( al.GetTag(tagName, stringQueryValue) ) { if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, stringFilterValue, compareType) ) { tagFilter.Value = stringFilterValue; tagFilter.Type = compareType; keepAlignment = tagFilter.check(stringQueryValue); } } break; // unknown tag type default : keepAlignment = false; } return keepAlignment; }
int main (int argc, char *argv[]) { if( (argc== 1) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cout<<"Usage:"<<endl; cout<<""<<endl; cout<<"plotQualScore input.bam"<<endl; return 1; } string bamfiletopen = string(argv[1]); BamReader reader; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM files." << endl; return 1; } // if ( !reader.LocateIndex() ){ // cerr << "warning: cannot locate index for file " << bamfiletopen<<endl; // //return 1; // } BamAlignment al; BamAlignment al2; bool unsurePEorSE=true; bool pe=true; int strLength=-1; int vecLengthToUse=-1; map<short,unsigned long> ** counterA = 0; map<short,unsigned long> ** counterC = 0; map<short,unsigned long> ** counterG = 0; map<short,unsigned long> ** counterT = 0; int lengthIndex1=0; int lengthIndex2=0; string seqInd1; string seqInd2; string qualInd1; string qualInd2; int offsetInd2; while ( reader.GetNextAlignment(al) ) { if(unsurePEorSE){ strLength=al.QueryBases.length(); if(al.IsPaired()){ pe=true; vecLengthToUse=2*strLength; }else{ pe=false; vecLengthToUse=strLength; } string index1; string index2; if(al.HasTag("XI")){ al.GetTag("XI",index1); vecLengthToUse+=index1.length(); lengthIndex1=index1.length(); } if(al.HasTag("XJ")){ al.GetTag("XJ",index2); vecLengthToUse+=index2.length(); lengthIndex2=index2.length(); } counterA = new map<short,unsigned long> * [vecLengthToUse]; counterC = new map<short,unsigned long> * [vecLengthToUse]; counterG = new map<short,unsigned long> * [vecLengthToUse]; counterT = new map<short,unsigned long> * [vecLengthToUse]; for(int i=0;i<vecLengthToUse;i++){ counterA[i]=new map<short,unsigned long> (); counterC[i]=new map<short,unsigned long> (); counterG[i]=new map<short,unsigned long> (); counterT[i]=new map<short,unsigned long> (); for(short k=minQualScore;k<=maxQualScore;k++){ (*counterA[i])[k]=0; (*counterC[i])[k]=0; (*counterG[i])[k]=0; (*counterT[i])[k]=0; } } 
unsurePEorSE=false; }else{ if(pe && !al.IsPaired()){ cerr << "Cannot have unpaired reads in PE mode" << endl; return 1; } if(!pe && al.IsPaired()){ cerr << "Cannot have unpaired reads in SE mode" << endl; return 1; } } if(al.QueryBases.length() != al.Qualities.length()){ cerr << "Cannot have different lengths for sequence and quality" << endl; return 1; } if(int(al.QueryBases.length()) != strLength){ cerr << "Cannot have different lengths for sequence and quality" << endl; return 1; } if(pe){ if(al.IsFirstMate()){ reader.GetNextAlignment(al2); if(al2.QueryBases.length() != al2.Qualities.length()){ cerr << "Cannot have different lengths for sequence and quality" << endl; return 1; } }else{ cerr << "First read should be the first mate" << endl; return 1; } } //cycle for(unsigned int i=0;i<al.QueryBases.length();i++){ short x=(short(al.Qualities[i])-qualOffset); if(al.QueryBases[i] == 'A'){ (*counterA[i])[x]++; } if(al.QueryBases[i] == 'C'){ (*counterC[i])[x]++; } if(al.QueryBases[i] == 'G'){ (*counterG[i])[x]++; } if(al.QueryBases[i] == 'T'){ (*counterT[i])[x]++; } } //The indices for al and al2 should hopefully be the same if(lengthIndex1>0){ al.GetTag("XI",seqInd1); al.GetTag("YI",qualInd1); int j; for(int i=0;i<lengthIndex1;i++){ j=i+al.QueryBases.length(); short x=(short(qualInd1[i])-qualOffset); if(seqInd1[i] == 'A'){ (*counterA[j])[x]++; } if(seqInd1[i] == 'C'){ (*counterC[j])[x]++; } if(seqInd1[i] == 'G'){ (*counterG[j])[x]++; } if(seqInd1[i] == 'T'){ (*counterT[j])[x]++; } } } if(pe){ offsetInd2=al.QueryBases.length()+lengthIndex1+al2.QueryBases.length(); int j; for(unsigned int i=0;i<al2.QueryBases.length();i++){ j=i+al.QueryBases.length()+lengthIndex1; short x=(short(al2.Qualities[i])-qualOffset); if(al2.QueryBases[i] == 'A'){ (*counterA[j])[x]++; } if(al2.QueryBases[i] == 'C'){ (*counterC[j])[x]++; } if(al2.QueryBases[i] == 'G'){ (*counterG[j])[x]++; } if(al2.QueryBases[i] == 'T'){ (*counterT[j])[x]++; } } }else{ 
offsetInd2=al.QueryBases.length()+lengthIndex1; } //The indices for al and al2 should hopefully be the same if(lengthIndex2>0){ al.GetTag("XJ",seqInd2); al.GetTag("YJ",qualInd2); int j; for(int i=0;i<lengthIndex2;i++){ j=offsetInd2+i; short x=(short(qualInd2[i])-qualOffset); if(seqInd2[i] == 'A'){ (*counterA[j])[x]++; } if(seqInd2[i] == 'C'){ (*counterC[j])[x]++; } if(seqInd2[i] == 'G'){ (*counterG[j])[x]++; } if(seqInd2[i] == 'T'){ (*counterT[j])[x]++; } } } } reader.Close(); cout<<"cycle\t"<<"nuc\t"; for(short k=minQualScore;k<maxQualScore;k++){ cout<<k<<"\t"; } cout<<maxQualScore<<endl; for(int i=0;i<vecLengthToUse;i++){ cout<<(i+1)<<"\t"; cout<<"A\t"; for(short k=minQualScore;k<maxQualScore;k++){ cout<<(*counterA[i])[k]<<"\t"; } cout<<(*counterA[i])[maxQualScore]<<endl; cout<<(i+1)<<"\t"; cout<<"C\t"; for(short k=minQualScore;k<maxQualScore;k++){ cout<<(*counterC[i])[k]<<"\t"; } cout<<(*counterC[i])[maxQualScore]<<endl; cout<<(i+1)<<"\t"; cout<<"G\t"; for(short k=minQualScore;k<maxQualScore;k++){ cout<<(*counterG[i])[k]<<"\t"; } cout<<(*counterG[i])[maxQualScore]<<endl; cout<<(i+1)<<"\t"; cout<<"T\t"; for(short k=minQualScore;k<maxQualScore;k++){ cout<<(*counterT[i])[k]<<"\t"; } cout<<(*counterT[i])[maxQualScore]<<endl; } return 0; }
int IonstatsAlignment(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc, argv); string input_bam_filename = opts.GetFirstString('i', "input", ""); string output_json_filename = opts.GetFirstString('o', "output", "ionstats_alignment.json"); int histogram_length = opts.GetFirstInt ('h', "histogram-length", 400); if(argc < 2 or input_bam_filename.empty()) { IonstatsAlignmentHelp(); return 1; } // // Prepare for metric calculation // BamReader input_bam; if (!input_bam.Open(input_bam_filename)) { fprintf(stderr, "[ionstats] ERROR: cannot open %s\n", input_bam_filename.c_str()); return 1; } ReadLengthHistogram called_histogram; ReadLengthHistogram aligned_histogram; ReadLengthHistogram AQ7_histogram; ReadLengthHistogram AQ10_histogram; ReadLengthHistogram AQ17_histogram; ReadLengthHistogram AQ20_histogram; ReadLengthHistogram AQ47_histogram; SimpleHistogram error_by_position; called_histogram.Initialize(histogram_length); aligned_histogram.Initialize(histogram_length); AQ7_histogram.Initialize(histogram_length); AQ10_histogram.Initialize(histogram_length); AQ17_histogram.Initialize(histogram_length); AQ20_histogram.Initialize(histogram_length); AQ47_histogram.Initialize(histogram_length); error_by_position.Initialize(histogram_length); BamAlignment alignment; vector<char> MD_op; vector<int> MD_len; MD_op.reserve(1024); MD_len.reserve(1024); string MD_tag; // // Main loop over mapped reads in the input BAM // while(input_bam.GetNextAlignment(alignment)) { // Record read length called_histogram.Add(alignment.Length); if (!alignment.IsMapped() or !alignment.GetTag("MD",MD_tag)) continue; // // Step 1. 
Parse MD tag // MD_op.clear(); MD_len.clear(); for (const char *MD_ptr = MD_tag.c_str(); *MD_ptr;) { int item_length = 0; if (*MD_ptr >= '0' and *MD_ptr <= '9') { // Its a match MD_op.push_back('M'); for (; *MD_ptr and *MD_ptr >= '0' and *MD_ptr <= '9'; ++MD_ptr) item_length = 10*item_length + *MD_ptr - '0'; } else { if (*MD_ptr == '^') { // Its a deletion MD_ptr++; MD_op.push_back('D'); } else // Its a substitution MD_op.push_back('X'); for (; *MD_ptr and *MD_ptr >= 'A' and *MD_ptr <= 'Z'; ++MD_ptr) item_length++; } MD_len.push_back(item_length); } // // Step 2. Synchronously scan through Cigar and MD, doing error accounting // int MD_idx = alignment.IsReverseStrand() ? MD_op.size()-1 : 0; int cigar_idx = alignment.IsReverseStrand() ? alignment.CigarData.size()-1 : 0; int increment = alignment.IsReverseStrand() ? -1 : 1; int AQ7_bases = 0; int AQ10_bases = 0; int AQ17_bases = 0; int AQ20_bases = 0; int AQ47_bases = 0; int num_bases = 0; int num_errors = 0; while (cigar_idx < (int)alignment.CigarData.size() and MD_idx < (int) MD_op.size() and cigar_idx >= 0 and MD_idx >= 0) { if (alignment.CigarData[cigar_idx].Length == 0) { // Try advancing cigar cigar_idx += increment; continue; } if (MD_len[MD_idx] == 0) { // Try advancing MD MD_idx += increment; continue; } // Match if (alignment.CigarData[cigar_idx].Type == 'M' and MD_op[MD_idx] == 'M') { int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]); num_bases += advance; alignment.CigarData[cigar_idx].Length -= advance; MD_len[MD_idx] -= advance; // Insertion (read has a base, reference doesn't) } else if (alignment.CigarData[cigar_idx].Type == 'I') { int advance = alignment.CigarData[cigar_idx].Length; for (int cnt = 0; cnt < advance; ++cnt) { error_by_position.Add(num_bases); num_bases++; num_errors++; } alignment.CigarData[cigar_idx].Length -= advance; // Deletion (reference has a base, read doesn't) } else if (alignment.CigarData[cigar_idx].Type == 'D' and MD_op[MD_idx] == 'D') { int 
advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]); for (int cnt = 0; cnt < advance; ++cnt) { error_by_position.Add(num_bases); num_errors++; } alignment.CigarData[cigar_idx].Length -= advance; MD_len[MD_idx] -= advance; // Substitution } else if (MD_op[MD_idx] == 'X') { int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]); for (int cnt = 0; cnt < advance; ++cnt) { error_by_position.Add(num_bases); num_bases++; num_errors++; } alignment.CigarData[cigar_idx].Length -= advance; MD_len[MD_idx] -= advance; } else { printf("ionstats alignment: Unexpected OP combination: %s Cigar=%c, MD=%c !\n", alignment.Name.c_str(), alignment.CigarData[cigar_idx].Type, MD_op[MD_idx]); break; } if (num_errors*5 <= num_bases) AQ7_bases = num_bases; if (num_errors*10 <= num_bases) AQ10_bases = num_bases; if (num_errors*50 <= num_bases) AQ17_bases = num_bases; if (num_errors*100 <= num_bases) AQ20_bases = num_bases; if (num_errors == 0) AQ47_bases = num_bases; } // // Step 3. 
Profit // if (num_bases >= 20) aligned_histogram.Add(num_bases); if (AQ7_bases >= 20) AQ7_histogram.Add(AQ7_bases); if (AQ10_bases >= 20) AQ10_histogram.Add(AQ10_bases); if (AQ17_bases >= 20) AQ17_histogram.Add(AQ17_bases); if (AQ20_bases >= 20) AQ20_histogram.Add(AQ20_bases); if (AQ47_bases >= 20) AQ47_histogram.Add(AQ47_bases); } input_bam.Close(); // // Processing complete, generate ionstats_alignment.json // Json::Value output_json(Json::objectValue); output_json["meta"]["creation_date"] = get_time_iso_string(time(NULL)); output_json["meta"]["format_name"] = "ionstats_alignment"; output_json["meta"]["format_version"] = "1.0"; called_histogram.SaveToJson(output_json["full"]); aligned_histogram.SaveToJson(output_json["aligned"]); AQ7_histogram.SaveToJson(output_json["AQ7"]); AQ10_histogram.SaveToJson(output_json["AQ10"]); AQ17_histogram.SaveToJson(output_json["AQ17"]); AQ20_histogram.SaveToJson(output_json["AQ20"]); AQ47_histogram.SaveToJson(output_json["AQ47"]); error_by_position.SaveToJson(output_json["error_by_position"]); ofstream out(output_json_filename.c_str(), ios::out); if (out.good()) { out << output_json.toStyledString(); return 0; } else { fprintf(stderr, "ERROR: unable to write to '%s'\n", output_json_filename.c_str()); return 1; } return 0; }
int IonstatsBasecaller(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc, argv); string input_bam_filename = opts.GetFirstString('i', "input", ""); string output_json_filename = opts.GetFirstString('o', "output", "ionstats_basecaller.json"); int histogram_length = opts.GetFirstInt ('h', "histogram-length", 400); if(argc < 2 or input_bam_filename.empty()) { IonstatsBasecallerHelp(); return 1; } BamReader input_bam; if (!input_bam.Open(input_bam_filename)) { fprintf(stderr, "[ionstats] ERROR: cannot open %s\n", input_bam_filename.c_str()); return 1; } SamHeader sam_header = input_bam.GetHeader(); if(!sam_header.HasReadGroups()) { fprintf(stderr, "[ionstats] ERROR: no read groups in %s\n", input_bam_filename.c_str()); return 1; } ReadLengthHistogram total_full_histo; ReadLengthHistogram total_insert_histo; ReadLengthHistogram total_Q17_histo; ReadLengthHistogram total_Q20_histo; total_full_histo.Initialize(histogram_length); total_insert_histo.Initialize(histogram_length); total_Q17_histo.Initialize(histogram_length); total_Q20_histo.Initialize(histogram_length); MetricGeneratorSNR system_snr; BaseQVHistogram qv_histogram; string flow_order; string key; for (SamReadGroupIterator rg = sam_header.ReadGroups.Begin(); rg != sam_header.ReadGroups.End(); ++rg) { if(rg->HasFlowOrder()) flow_order = rg->FlowOrder; if(rg->HasKeySequence()) key = rg->KeySequence; } double qv_to_error_rate[256]; for (int qv = 0; qv < 256; qv++) qv_to_error_rate[qv] = pow(10.0,-0.1*(double)qv); BamAlignment alignment; string read_group; vector<uint16_t> flow_signal_fz(flow_order.length()); vector<int16_t> flow_signal_zm(flow_order.length()); while(input_bam.GetNextAlignment(alignment)) { // Record read length unsigned int full_length = alignment.Length; total_full_histo.Add(full_length); // Record insert length int insert_length = 0; if (alignment.GetTag("ZA",insert_length)) total_insert_histo.Add(insert_length); // Compute and record Q17 and Q20 int Q17_length = 0; int 
Q20_length = 0; double num_accumulated_errors = 0.0; for(int pos = 0; pos < alignment.Length; ++pos) { num_accumulated_errors += qv_to_error_rate[(int)alignment.Qualities[pos] - 33]; if (num_accumulated_errors / (pos + 1) <= 0.02) Q17_length = pos + 1; if (num_accumulated_errors / (pos + 1) <= 0.01) Q20_length = pos + 1; } total_Q17_histo.Add(Q17_length); total_Q20_histo.Add(Q20_length); // Record data for system snr if(alignment.GetTag("ZM", flow_signal_zm)) system_snr.Add(flow_signal_zm, key.c_str(), flow_order); else if(alignment.GetTag("FZ", flow_signal_fz)) system_snr.Add(flow_signal_fz, key.c_str(), flow_order); // Record qv histogram qv_histogram.Add(alignment.Qualities); } input_bam.Close(); Json::Value output_json(Json::objectValue); output_json["meta"]["creation_date"] = get_time_iso_string(time(NULL)); output_json["meta"]["format_name"] = "ionstats_basecaller"; output_json["meta"]["format_version"] = "1.0"; system_snr.SaveToJson(output_json); qv_histogram.SaveToJson(output_json); total_full_histo.SaveToJson(output_json["full"]); total_insert_histo.SaveToJson(output_json["insert"]); total_Q17_histo.SaveToJson(output_json["Q17"]); total_Q20_histo.SaveToJson(output_json["Q20"]); ofstream out(output_json_filename.c_str(), ios::out); if (out.good()) { out << output_json.toStyledString(); return 0; } else { fprintf(stderr, "ERROR: unable to write to '%s'\n", output_json_filename.c_str()); return 1; } }
int main (int argc, char *argv[]) { if( (argc!= 4 && argc !=5 && argc !=6) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cerr<<"Usage:splitByRG [in bam] [rg Tally] [out prefix] (optional target)"<<endl<<"this program will subsample a BAM file per read group for a certain target\nFor example splitByRG in.bam tally.txt out will create\nout.rg1.bam\nout.rg2.bam\n"<<endl; return 1; } string bamfiletopen = string(argv[1]); string rgTally = string(argv[2]); string bamDirOutPrefix = string(argv[3]); int target = 200000; int maxTarget = 1000000; if(argc==5){ target = destringify<int> ( string(argv[4]) ); } if(argc==6){ target = destringify<int> ( string(argv[4]) ); maxTarget = destringify<int> ( string(argv[5]) ); } cerr<<"minimum fragments:\t"<<target<<endl; cerr<<"target fragments:\t"<<maxTarget<<endl; string line; ifstream myFileTally; map<string,double> rg2Fraction; myFileTally.open(rgTally.c_str(), ios::in); cerr<<"Retained groups:\n"<<endl; cerr<<"RG\t#mapped\tfraction retained"<<endl; cerr<<"-----------------------------------"<<endl; if (myFileTally.is_open()){ while ( getline (myFileTally,line)){ vector<string> tokens = allTokens(line,'\t'); if(tokens.size() > 6) if( tokens[1] == "pass" && (tokens[0] != "\"\"" && tokens[0] != "control" && tokens[0] != "TOTAL") ){ //cout<<tokens[0]<<"\t"<<tokens[5]<<endl; int count = destringify<int>(tokens[5]); if(count>target){ if(count>=maxTarget){ rg2Fraction[ tokens[0] ] = double(maxTarget)/double(count); cout<<tokens[0]<<"\t"<<count<<"\t"<<double(maxTarget)/double(count)<<endl; }else{ cout<<tokens[0]<<"\t"<<count<<"\t"<<1.0<<endl; rg2Fraction[ tokens[0] ] = 1.0; } } } } myFileTally.close(); }else{ cerr << "Unable to open file "<<rgTally<<endl; return 1; } map<string,BamWriter *> rg2BamWriter; // if(!isDirectory(bamDirOut)){ // cerr<<"ERROR: the out directory does not exist"<<endl; // return 1; // } BamReader reader; if ( 
!reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM files." << endl; return 1; } const SamHeader header = reader.GetHeader(); const RefVector references = reader.GetReferenceData(); vector<RefData> refData=reader.GetReferenceData(); SamReadGroupDictionary srgd=header.ReadGroups; for(SamReadGroupConstIterator srgci=srgd.ConstBegin(); srgci<srgd.ConstEnd(); srgci++){ //cout<<*srgci<<endl; const SamReadGroup rg = (*srgci); //cout<<rg.ID<<endl; if( rg2Fraction.find(rg.ID) != rg2Fraction.end() ){ rg2BamWriter[rg.ID] = new BamWriter(); rg2BamWriter[rg.ID]->Open(bamDirOutPrefix+"."+rg.ID+".bam",header,references); } //cout<<bamDirOutPrefix+"."+rg.ID+".bam"<<endl; } // return 1; // BamWriter unmapped; // cout<<header.ToString()<<endl; // return 1; // if ( !unmapped.Open(bamDirOutPrefix+".unmapped.bam",header,references) ) { // cerr << "Could not open output BAM file "<< bamDirOutPrefix+".unmapped.bam" << endl; // return 1; // } // cout<<"reading"<<endl; BamAlignment al; unsigned int total=0; while ( reader.GetNextAlignment(al) ) { if(al.HasTag("RG") && al.IsMapped() ){ string rgTag; al.GetTag("RG",rgTag); //cout<<rgTag<<endl; if(rg2BamWriter.find(rgTag) == rg2BamWriter.end()){ //new: ignore completely }else{ if( randomProb() <= rg2Fraction[ rgTag ] ){ rg2BamWriter[rgTag]->SaveAlignment(al); //cout<<"wrote "<<rgTag<<endl; } else{ //cout<<"skipped "<<rgTag<<endl; } } }// else{ // string rgTag="unknown"; // //cout<<rgTag<<endl; // if(rg2BamWriter.find(rgTag) == rg2BamWriter.end()){ //new // cerr<<"Found new RG "<<rgTag<<endl; // rg2BamWriter[rgTag] = new BamWriter(); // if ( !rg2BamWriter[rgTag]->Open(bamDirOutPrefix+"."+rgTag+".bam",header,references) ) { // cerr << "Could not open output BAM file "<< bamDirOutPrefix<<"."<<rgTag<<".bam" << endl; // return 1; // } // rg2BamWriter[rgTag]->SaveAlignment(al); // }else{ // rg2BamWriter[rgTag]->SaveAlignment(al); // } // // cerr << "Cannot get RG tag for " << al.Name<<endl; // // return 1; // } total++; } //while al 
reader.Close(); // writer.Close(); // unmapped.Close(); map<string,BamWriter *>::iterator rg2BamWriterIt; for (rg2BamWriterIt =rg2BamWriter.begin(); rg2BamWriterIt!=rg2BamWriter.end(); rg2BamWriterIt++){ rg2BamWriterIt->second->Close(); } cerr<<"Wrote succesfully "<<total<<" reads"<<endl; return 0; }
int main ( int argc, char *argv[] ) { struct parameters *param = 0; param = interface(param, argc, argv); //region file input (the region file should be sorted as the same way as the bam file) ifstream region_f; region_f.open(param->region_f, ios_base::in); // the region file is opened //bam input and generate index if not yet //-------------------------------------------------------------------------------------------------------+ // BAM input (file or filenames?) | //-------------------------------------------------------------------------------------------------------+ char *fof = param->mapping_f; FILE *IN=NULL; char linefof[5000]; int filecount=0; vector <string> fnames; if (strchr(fof,' ')!=NULL) { char *ptr; ptr=strtok(fof," "); while (ptr!=NULL) { fnames.push_back(ptr); filecount++; ptr=strtok(NULL," "); } } else { IN=fopen(fof,"rt"); if (IN!=NULL) { long linecount=0; while (fgets(linefof,5000-1,IN)!=NULL) { linecount++; if (linefof[0]!='#' && linefof[0]!='\n') { char *ptr=strchr(linefof,'\n'); if (ptr!=NULL && ptr[0]=='\n') { ptr[0]='\0'; } FILE *dummy=NULL; dummy=fopen(linefof,"rt"); if (dummy!=NULL) { // seems to be a file of filenames... 
fclose(dummy); fnames.push_back(linefof); filecount++; } else if (filecount==0 || linecount>=1000-1) { // seems to be a single file fnames.push_back(fof); filecount++; break; } } } fclose(IN); } } //file or file name decided and stored in vector "fnames" cerr << "the input mapping files are:" << endl; vector <string>::iterator fit = fnames.begin(); for(; fit != fnames.end(); fit++) { cerr << *fit << endl; } //-------------------------------------------------------------------------------------------------------+ // end of file or filenames | //-------------------------------------------------------------------------------------------------------+ // open the BAM file(s) BamMultiReader reader; reader.Open(fnames); // get header & reference information string header = reader.GetHeaderText(); RefVector refs = reader.GetReferenceData(); if ( ! reader.LocateIndexes() ) // opens any existing index files that match our BAM files reader.CreateIndexes(); // creates index files for BAM files that still lack one // locus bias struct lb empty_profile = {0,0,0,0}; vector <struct lb> locus_b(1000, empty_profile); // output locus bias file string locus_bias_set = param->lbias; ofstream locus_bias; if ( locus_bias_set != "" ) { locus_bias.open(param->lbias); if ( !locus_bias ) { cerr << "can not open locus_bias file.\n"; exit(0); } } //should decide which chromosome string line; string old_chr = "SRP"; string type = param->type; //whether do some position-level pile-up stuff bool posc = false; ofstream posc_f; ofstream chrmap_f; string poscset = param->posc; if ( poscset != "" ) { posc = true; posc_f.open(param->posc); chrmap_f.open(param->chrmap); } bool noChr; if ( param->nochr == 1 ){ noChr = true; } else { noChr = false; } //regions for the input of region file deque <struct region> regions; getline(region_f, line); //get the first line eatline(line,regions,noChr); deque <struct region>::iterator it = regions.begin(); while ( it->chr != old_chr ) { old_chr = it->chr; // set 
the current chr as old chr int chr_id = reader.GetReferenceID(it->chr); if ( chr_id == -1 ) { //reference not found for (; it != regions.end() && it->chr == old_chr; ) { gene_processing(*it,locus_b); // print the old region info it = regions.erase(it); // erase the current region } while ( regions.empty() ) { getline(region_f, line); if ( region_f.eof() ){ cerr << "finished: end of region file, zone 0" << endl; break; } eatline(line, regions,noChr); it = regions.begin(); if (it->chr == old_chr){ gene_processing(*it,locus_b); regions.clear(); continue; } } continue; } int chr_len = refs.at(chr_id).RefLength; if ( !reader.SetRegion(chr_id, 1, chr_id, chr_len) ) // here set region { cerr << "bamtools count ERROR: Jump region failed " << it->chr << endl; reader.Close(); exit(1); } //pile-up pos stats set <string> fragment; map <string, unsigned int> pileup; bool isposPileup = false; unsigned int old_start = 0; unsigned int total_tags = 0; unsigned int total_pos = 0; unsigned int pileup_pos = 0; BamAlignment bam; while (reader.GetNextAlignment(bam)) { if ( bam.IsMapped() == false ) continue; // skip unaligned reads unsigned int unique; bam.GetTag("NH", unique); if (param->unique == 1) { if (unique != 1) { // skipe uniquelly mapped reads continue; } } if (read_length == 0){ read_length = bam.Length; } //cout << bam.Name << endl; string chrom = refs.at(bam.RefID).RefName; string strand = "+"; if (bam.IsReverseStrand()) strand = "-"; unsigned int alignmentStart = bam.Position+1; unsigned int mateStart; if (type == "p") mateStart = bam.MatePosition+1; unsigned int alignmentEnd = bam.GetEndPosition(); unsigned int cigarEnd; vector <int> blockLengths; vector <int> blockStarts; blockStarts.push_back(0); ParseCigar(bam.CigarData, blockStarts, blockLengths, cigarEnd); // position check for unique mapped reads (because is paired-end reads, shoule base on fragment level for paired end reads) if (posc == true && unique == 1) { if (type == "p" && fragment.count(bam.Name) > 0) 
fragment.erase(bam.Name); else { total_tags++; if (type == "p"){ fragment.insert(bam.Name); } string alignSum; if (type == "p") { alignSum = int2str(alignmentStart) + "\t" + int2str(mateStart) + "\t.\t" + strand; } else { alignSum = int2str(alignmentStart) + "\t" + int2str(alignmentEnd) + "\t.\t" + strand; } if ( alignmentStart != old_start ) { isposPileup = false; map <string, unsigned int>::iterator pit = pileup.begin(); for (; pit != pileup.end(); pit++) { posc_f << chrom << "\truping\tpileup\t" << pit->first << "\t.\t" << "Pileup=" << pit->second << endl; //print pileup } pileup.clear(); //clear pileup set pileup.insert( pair <string, unsigned int> (alignSum, 1) ); //insert the new read total_pos++; } else if ( alignmentStart == old_start ) { // same starts if ( pileup.count(alignSum) > 0 ) { // pileup if ( pileup[alignSum] == 1 && isposPileup == false ) { pileup_pos++; isposPileup = true; } pileup[alignSum]++; } else { pileup.insert( pair <string, unsigned int> (alignSum, 1) ); } } //same starts } //new fragment old_start = alignmentStart; } // do pos check float incre = 1.; if (blockStarts.size() > 1) incre = 0.5; // incre half for junction reads incre /= static_cast < float >(unique); // for multi aligned reads deque <struct region>::iterator iter = regions.begin(); if ( iter->start > alignmentEnd ) continue; // skip reads not overlapping with the first region while ( iter->chr == old_chr && iter->start <= alignmentEnd && iter != regions.end() ) { if (iter->end < alignmentStart) { // the region end is beyond the alignmentStart gene_processing(*iter,locus_b); // processing iter = regions.erase(iter); // this region should be removed if ( regions.empty() ) { getline(region_f, line); // get a line of region file if ( ! 
region_f.eof() ) { eatline(line, regions, noChr); // eat a line and put it into the duque iter = regions.begin(); } else { // it's reaching the end of the region file cerr << "finished: end of region file, zone 3" << endl; break; } } continue; } if (iter->end >= alignmentStart && iter->start <= alignmentEnd) { //overlapping, should take action vector <int>::iterator cigit = blockStarts.begin(); for (; cigit != blockStarts.end(); cigit++) { unsigned int current_start = *cigit + alignmentStart; int current_pos = current_start - (iter->start); //cout << iter->chr << "\t" << iter->start << "\t" << iter->end << "\t" << current_start << endl; if ( (iter->tags).count(current_pos) > 0 ) { (iter->tags)[current_pos] += incre; } else (iter->tags).insert( pair<int, float>(current_pos, incre) ); } } // overlapping take action! if ( (iter+1) != regions.end() ) iter++; // if this region is not the last element in the deque else { // the last element getline(region_f, line); // get a line of region file if ( ! 
region_f.eof() ){ eatline(line, regions, noChr); // eat a line and put it into the duque iter = regions.end(); iter--; } else { //it's reaching the end of the region file cerr << "finished: end of region file, zone 4" << endl; break; } } } //while } // read a bam // print chr map if (posc == true) { chrmap_f << old_chr << "\t" << total_tags << "\t" << total_pos << "\t" << pileup_pos << endl; } //somehow to loop back it = regions.begin(); //reset to begin for (; it != regions.end() && it->chr == old_chr; ) { gene_processing(*it,locus_b); // print the old region info it = regions.erase(it); // erase the current region } while ( regions.empty() ) { getline(region_f, line); if ( region_f.eof() ){ cerr << "finished: end of region file, zone 5" << endl; //print locus bias for (unsigned int l = 0; l < 1000; l++){ locus_bias << l << "\t" << locus_b[l].ps << "\t" << locus_b[l].hs << "\t" << locus_b[l].pe << "\t" << locus_b[l].he << endl; } exit(0); } eatline(line, regions, noChr); it = regions.begin(); if (it->chr == old_chr){ gene_processing(*it, locus_b); regions.clear(); continue; } } } // region chr != old chr regions.clear(); reader.Close(); region_f.close(); return 0; } //main