void setMateInfo( BamAlignment & rec1, BamAlignment & rec2, SamHeader & header) { const int NO_ALIGNMENT_REFERENCE_INDEX = -1; const int NO_ALIGNMENT_START = -1; // If neither read is unmapped just set their mate info if (rec1.IsMapped() && rec2.IsMapped()) { rec1.MateRefID = rec2.MateRefID; rec1.MatePosition = rec2.Position; rec1.SetIsReverseStrand(rec2.IsReverseStrand()); rec1.SetIsMapped(true); rec1.AddTag("MQ", "i", rec2.MapQuality); rec2.MateRefID = rec1.RefID; rec2.MatePosition = rec1.Position; rec2.SetIsReverseStrand( rec1.IsReverseStrand() ); rec2.SetIsMapped(true); rec2.AddTag("MQ", "i", rec1.MapQuality); } // Else if they're both unmapped set that straight else if (!rec1.IsMapped() && !rec2.IsMapped()) { rec1.RefID = NO_ALIGNMENT_REFERENCE_INDEX; rec1.Position = NO_ALIGNMENT_START; rec1.MateRefID = NO_ALIGNMENT_REFERENCE_INDEX; rec1.MatePosition = NO_ALIGNMENT_START; rec1.SetIsReverseStrand(rec2.IsReverseStrand()); rec1.SetIsMapped(false); rec2.RemoveTag("MQ"); rec1.Length = 0; rec2.RefID = NO_ALIGNMENT_REFERENCE_INDEX; rec2.Position = NO_ALIGNMENT_START; rec2.MateRefID = NO_ALIGNMENT_REFERENCE_INDEX; rec2.MatePosition = NO_ALIGNMENT_START; rec2.SetIsReverseStrand(rec1.IsReverseStrand()); rec2.SetIsMapped(false); rec2.RemoveTag("MQ"); rec2.Length = 0; } // And if only one is mapped copy it's coordinate information to the mate else { BamAlignment & mapped = rec1.IsMapped() ? rec1 : rec2; BamAlignment & unmapped = rec1.IsMapped() ? 
rec2 : rec1; unmapped.RefID = mapped.RefID; unmapped.Position = mapped.Position; mapped.MateRefID = unmapped.RefID; mapped.MatePosition = unmapped.Position; mapped.SetIsMateReverseStrand(unmapped.IsReverseStrand()); mapped.SetIsMateMapped(false); mapped.Length = 0; unmapped.MateRefID = mapped.RefID; unmapped.MatePosition = mapped.Position; unmapped.SetIsMateReverseStrand(mapped.IsReverseStrand()); unmapped.SetIsMateMapped(true); unmapped.Length = 0; } const int insertSize = computeInsertSize(rec1, rec2); rec1.Length = insertSize; rec2.Length = -insertSize; }
// Stream BAM records from "stdin", try to realign each one against a variant
// graph (DAG) built from the reference FASTA plus the VCF named in params, and
// write the resulting records to "stdout".
//
// params : run configuration; the fields read here are fasta_reference,
//          vcf_file, dag_window_size, match, mism, flat_input_vcf,
//          flatten_flank, dry_run, debug, display_dag, unsorted_output and
//          only_realigned (params is also forwarded to the helper functions).
//
// Outline of the loop over input records:
//   * rebuild the DAG when the read is on a new reference sequence or has
//     moved past 3/4 of the current window;
//   * if shouldRealign() says so, align the read to the graph with gswalign();
//   * keep the new alignment only if it passes the acceptance test below,
//     otherwise restore the original record;
//   * push the record into a position-keyed queue and write out everything
//     that is more than one window behind the current read, so that output
//     stays sorted even though realignment can shift positions.
//
// In dry-run mode nothing is written to the BAM writer; the read, the graph
// mapping and the flattened CIGAR are printed to cout instead.
// The process exits with status 1 if stdin, stdout or the VCF cannot be opened,
// or if no VCF file was given.
void realign_bam(Parameters& params) {

    FastaReference reference;
    reference.open(params.fasta_reference);

    int dag_window_size = params.dag_window_size;

    // open BAM file
    BamReader reader;
    if (!reader.Open("stdin")) {
        cerr << "could not open stdin for reading" << endl;
        exit(1);
    }

    BamWriter writer;
    if (!params.dry_run && !writer.Open("stdout", reader.GetHeaderText(), reader.GetReferenceData())) {
        cerr << "could not open stdout for writing" << endl;
        exit(1);
    }

    // store the names of all the reference sequences in the BAM file
    map<int, string> referenceIDToName;
    vector<RefData> referenceSequences = reader.GetReferenceData();
    int i = 0;
    for (RefVector::iterator r = referenceSequences.begin(); r != referenceSequences.end(); ++r) {
        referenceIDToName[i] = r->RefName;
        ++i;
    }

    vcf::VariantCallFile vcffile;
    if (!params.vcf_file.empty()) {
        if (!vcffile.open(params.vcf_file)) {
            cerr << "could not open VCF file " << params.vcf_file << endl;
            exit(1);
        }
    } else {
        cerr << "realignment requires VCF file" << endl;
        exit(1);
    }

    vcf::Variant var(vcffile);

    BamAlignment alignment;
    // records waiting to be written, keyed by (possibly changed) start position
    map<long int, vector<BamAlignment> > alignmentSortQueue;

    // get alignment
    // assemble DAG in region around alignment
    // loop for each alignment in BAM:
    //     update DAG when current alignment gets close to edge of assembled DAG
    //     attempt to realign if read has a certain number of mismatches + gaps or softclips, weighted by basequal
    //     if alignment to DAG has fewer mismatches and gaps than original alignment, use it
    //         flatten read into reference space (for now just output alleles from VCF un-spanned insertions)
    //     write read to queue for streaming re-sorting (some positional change will occur)

    long int dag_start_position = 0;
    string currentSeqname;
    string ref;
    ReferenceMappings ref_map;
    gssw_graph* graph = gssw_graph_create(0);
    int8_t* nt_table = gssw_create_nt_table();
    int8_t* mat = gssw_create_score_matrix(params.match, params.mism);

    int total_reads = 0;
    int total_realigned = 0;
    int total_improved = 0;

    bool emptyDAG = false; // if the dag is constructed over empty sequence
                           // such as when realigning reads mapped to all-N sequence

    if (params.debug) {
        cerr << "about to start processing alignments" << endl;
    }

    while (reader.GetNextAlignment(alignment)) {

        string& seqname = referenceIDToName[alignment.RefID];

        if (params.debug) {
            cerr << "--------------------------------------------" << endl
                 << "processing alignment " << alignment.Name << " at "
                 << seqname << ":" << alignment.Position << endl;
        }

        ++total_reads;

        // untouched copy, restored whenever the realignment is rejected
        BamAlignment originalAlignment = alignment;
        // kept signed so that the window arithmetic below needs no unsigned wrap-around
        long int initialAlignmentPosition = alignment.Position;

        // should we construct a new DAG?  do so when 3/4 of the way through the current one
        // center on current position + 1/2 dag window
        // TODO check this scheme using some scribbles on paper
        if ((seqname != currentSeqname
             || ((alignment.Position + (alignment.QueryBases.size()/2)
                  > (3*dag_window_size/4) + dag_start_position)))
            && alignment.Position < reference.sequenceLength(seqname)) {

            if (seqname != currentSeqname) {
                if (params.debug) {
                    cerr << "switched ref seqs" << endl;
                }
                dag_start_position = max((long int) 0, (long int) (alignment.GetEndPosition() - dag_window_size/2));
            // recenter DAG
            } else if (!ref_map.empty()) {
                dag_start_position = dag_start_position + dag_window_size/2;
                dag_start_position = max(dag_start_position, (long int) (alignment.GetEndPosition() - dag_window_size/2));
            } else {
                dag_start_position = alignment.Position - dag_window_size/2;
            }
            dag_start_position = max((long int)0, dag_start_position);

            // TODO get sequence length and use to bound noted window size (edge case)

            // get variants for new DAG
            vector<vcf::Variant> variants;
            if (!vcffile.setRegion(seqname,
                                   dag_start_position + 1,
                                   dag_start_position + dag_window_size)) {
                // this is not necessarily an error; there should be a better way to check for VCF file validity
            } else {

                // check first variant: move the window start back one base at a
                // time while the first variant sits at or before the window start
                if (vcffile.getNextVariant(var)) {
                    while (var.position <= dag_start_position + 1) {
                        dag_start_position -= 1;
                        vcffile.setRegion(seqname,
                                          dag_start_position + 1,
                                          dag_start_position + dag_window_size);
                        if (!vcffile.getNextVariant(var)) { break; }
                    }
                }

                // rewind to the (possibly shifted) window and collect the
                // variants that lie completely inside it
                vcffile.setRegion(seqname,
                                  dag_start_position + 1,
                                  dag_start_position + dag_window_size);
                while (vcffile.getNextVariant(var)) {
                    if (params.debug) cerr << "getting variant at " << var.sequenceName << ":" << var.position << endl;
                    if (var.position + var.ref.length() <= dag_start_position + dag_window_size
                        && var.position >= dag_start_position) {
                        variants.push_back(var);
                    }
                }
            }

            ref = reference.getSubSequence(seqname,
                                           max((long int) 0, dag_start_position),
                                           dag_window_size); // 0/1 conversion

            // clear graph and metadata
            ref_map.clear();
            gssw_graph_destroy(graph);

            if (params.debug) { cerr << "constructing DAG" << endl; }

            // and build the DAG
            graph = gssw_graph_create(0);
            constructDAGProgressive(graph,
                                    ref_map,
                                    ref,
                                    seqname,
                                    variants,
                                    dag_start_position,
                                    nt_table,
                                    mat,
                                    params.flat_input_vcf);

            if (params.debug) {
                cerr << "graph has " << graph->size << " nodes" << endl;
                cerr << "DAG generated from input variants over "
                     << seqname << ":" << dag_start_position << "-" << dag_start_position + dag_window_size
                     << endl;
            }
            if (params.display_dag) {
                gssw_graph_print(graph);
            }

            if ((graph->size == 1 && allN(ref)) || graph->size == 0) {
                if (params.debug) {
                    cerr << "DAG is empty (1 node, all N). Alignment is irrelevant." << endl;
                }
                emptyDAG = true;
            } else {
                emptyDAG = false;
            }
        }

        AlignmentStats stats_before;
        bool was_mapped = alignment.IsMapped();
        bool has_realigned = false;
        if (was_mapped) {
            // the read runs past the window: fetch enough reference to cover it
            if (dag_start_position + dag_window_size < alignment.GetEndPosition()) {
                ref = reference.getSubSequence(seqname,
                                               max((long int) 0, dag_start_position),
                                               alignment.GetEndPosition() - dag_start_position); // 0/1 conversion
            }
        }

        if (params.debug) {
            if (emptyDAG) {
                cerr << "cannot realign against empty (all-N single node) graph" << endl;
            }
        }

        if (!emptyDAG && shouldRealign(alignment, ref, dag_start_position, params, stats_before)) {

            ++total_realigned;

            if (params.debug) {
                cerr << "realigning: " << alignment.Name
                     << " " << alignment.QueryBases << endl
                     << " aligned @ " << alignment.Position
                     << " to variant graph over "
                     << seqname
                     << ":" << dag_start_position
                     << "-" << dag_start_position + dag_window_size << endl;
            }

            try {

                Cigar flat_cigar;
                string read = alignment.QueryBases;
                string qualities = alignment.Qualities;
                int score;
                long int position;
                string strand;
                gssw_graph_mapping* gm = gswalign(graph,
                                                  ref_map,
                                                  read,
                                                  qualities,
                                                  params,
                                                  position,
                                                  score,
                                                  flat_cigar,
                                                  strand,
                                                  nt_table,
                                                  mat);

                if (params.dry_run) {

                    if (strand == "-" && !alignment.IsMapped()) {
                        read = reverseComplement(read);
                    }
                    cout << read << endl;
                    cout << graph_mapping_to_string(gm) << endl;
                    cout << score << " " << strand << " " << position << " " << flat_cigar << endl;

                    // last use of the mapping: release it so it is not leaked per read
                    if (gm) {
                        gssw_graph_mapping_destroy(gm);
                    }

                } else {

                    // the mapping itself is not needed on this path (only the
                    // flattened cigar/position are), so release it right away
                    if (gm) {
                        gssw_graph_mapping_destroy(gm);
                    }

                    // TODO the qualities are not on the right side of the read
                    if (strand == "-" && alignment.IsMapped()) { // if we're realigning, this is always true unless we swapped strands
                        alignment.SetIsReverseStrand(true);
                    }
                    alignment.QueryBases = read;
                    alignment.Qualities = qualities;

                    alignment.Position = position;
                    alignment.SetIsMapped(true);
                    if (!alignment.MapQuality) {
                        alignment.MapQuality = 20; // horrible hack...  at least approximate with alignment mismatches against graph
                    }

                    // check if somehow we've ended up with an indel at the ends
                    // if so, grab the reference sequence right beyond it and add
                    // a single match to the cigar, allowing variant detection methods
                    // to run on the results without internal modification
                    Cigar& cigar = flat_cigar;
                    int flankSize = params.flatten_flank;

                    // size checks keep front()/at(1) from touching elements that
                    // do not exist (empty or single-element cigar)
                    const bool leadingIndel =
                        cigar.size() > 0
                        && (cigar.front().isIndel()
                            || (cigar.size() > 1
                                && cigar.front().isSoftclip()
                                && cigar.at(1).isIndel()));
                    if (leadingIndel) {
                        alignment.Position -= flankSize;
                        string refBase = reference.getSubSequence(seqname, alignment.Position, flankSize);
                        if (cigar.front().isSoftclip()) {
                            // the soft clip is dropped together with the bases it covered
                            alignment.QueryBases.erase(alignment.QueryBases.begin(),
                                                       alignment.QueryBases.begin()+cigar.front().length);
                            alignment.Qualities.erase(alignment.Qualities.begin(),
                                                      alignment.Qualities.begin()+cigar.front().length);
                            cigar.erase(cigar.begin());
                        }
                        alignment.QueryBases.insert(0, refBase);
                        alignment.Qualities.insert(0, string(flankSize, shortInt2QualityChar(30)));
                        Cigar newCigar;
                        newCigar.push_back(CigarElement(flankSize, 'M'));
                        newCigar.append(flat_cigar);
                        flat_cigar = newCigar;
                    }

                    // evaluated after the leading fix-up because that may have
                    // changed the cigar
                    const bool trailingIndel =
                        cigar.size() > 0
                        && (cigar.back().isIndel()
                            || (cigar.size() > 1
                                && cigar.back().isSoftclip()
                                && cigar.at(cigar.size()-2).isIndel()));
                    if (trailingIndel) {
                        string refBase = reference.getSubSequence(seqname,
                                                                  alignment.Position
                                                                  + flat_cigar.refLen(),
                                                                  flankSize);
                        if (cigar.back().isSoftclip()) {
                            alignment.QueryBases.erase(alignment.QueryBases.end()-cigar.back().length,
                                                       alignment.QueryBases.end());
                            alignment.Qualities.erase(alignment.Qualities.end()-cigar.back().length,
                                                      alignment.Qualities.end());
                            cigar.pop_back();
                        }
                        Cigar newCigar;
                        newCigar.push_back(CigarElement(flankSize, 'M'));
                        flat_cigar.append(newCigar);
                        alignment.QueryBases.append(refBase);
                        alignment.Qualities.append(string(flankSize, shortInt2QualityChar(30)));
                    }

                    flat_cigar.toCigarData(alignment.CigarData);

                    if (dag_start_position + dag_window_size < alignment.GetEndPosition()) {
                        ref = reference.getSubSequence(seqname,
                                                       max((long int) 0, dag_start_position),
                                                       alignment.GetEndPosition() - dag_start_position); // 0/1 conversion
                    }

                    AlignmentStats stats_after;
                    countMismatchesAndGaps(alignment, flat_cigar, ref, dag_start_position, stats_after, params.debug);

                    // the new alignment is a candidate if...
                    const bool candidate =
                        !was_mapped  // it wasn't mapped previously
                        // or if we have removed soft clips or mismatches (per quality) from the alignment
                        || ((stats_before.softclip_qsum + stats_before.mismatch_qsum
                             >= stats_after.softclip_qsum + stats_after.mismatch_qsum)
                            // and if we have added gaps, we have added them to remove mismatches or softclips
                            && (stats_before.gaps >= stats_after.gaps // accept any time we reduce gaps while not increasing softclips/mismatches
                                || (stats_before.gaps < stats_after.gaps // and allow gap increases when they improve the alignment
                                    && (stats_before.softclip_qsum + stats_before.mismatch_qsum
                                        > stats_after.softclip_qsum + stats_after.mismatch_qsum))));

                    // ...and, for every candidate (previously unmapped reads
                    // included), the alignment must not have more than the
                    // acceptable number of gaps, softclips, or mismatches as
                    // provided in input parameters.  An empty cigar is never kept.
                    if (flat_cigar.size() > 0
                        && candidate
                        && acceptRealignment(alignment, ref, dag_start_position, params, stats_after)) {

                        // keep the alignment
                        // TODO require threshold of softclips to keep alignment (or count of gaps, mismatches,...)
                        if (params.debug) {
                            cerr << "realigned " << alignment.Name << " to graph, which it maps to with "
                                 << stats_after.mismatch_qsum << "q in mismatches and "
                                 << stats_after.softclip_qsum << "q in soft clips" << endl;
                        }
                        ++total_improved;
                        has_realigned = true;
                    } else {
                        // reset to old version of alignment
                        if (params.debug) {
                            cerr << "failed realignment of " << alignment.Name << " to graph, which it maps to with: "
                                 << stats_after.mismatch_qsum << "q in mismatches " << "(vs " << stats_before.mismatch_qsum << "q before), and "
                                 << stats_after.softclip_qsum << "q in soft clips " << "(vs " << stats_before.softclip_qsum << "q before) " << endl;
                        }
                        has_realigned = false;
                        alignment = originalAlignment;
                    }
                }
            } catch (...) {
                cerr << "exception when realigning " << alignment.Name
                     << " at position " << referenceIDToName[alignment.RefID]
                     << ":" << alignment.Position
                     << " " << alignment.QueryBases << endl;
                // reset to original alignment
                has_realigned = false;
                alignment = originalAlignment;
            }
        }

        // ensure correct order if alignments move
        long int maxOutputPos = initialAlignmentPosition - dag_window_size;
        // if we switched sequences we need to flush out all the reads from the previous one
        if (seqname != currentSeqname) {
            // so the max output position is set past the end of the last chromosome
            if (!currentSeqname.empty()) {
                maxOutputPos = reference.sequenceLength(currentSeqname) + dag_window_size;
            }
            currentSeqname = seqname;
        }

        if (!params.dry_run) {
            map<long int, vector<BamAlignment> >::iterator p = alignmentSortQueue.begin();
            for ( ; p != alignmentSortQueue.end(); ++p) {
                // except if we are running in unsorted mode, stop when we are at the window size
                if (!params.unsorted_output && p->first > maxOutputPos) {
                    break; // no more to do
                } else {
                    for (vector<BamAlignment>::iterator a = p->second.begin(); a != p->second.end(); ++a) {
                        writer.SaveAlignment(*a);
                    }
                }
            }
            // everything before p has been written
            if (p != alignmentSortQueue.begin()) {
                alignmentSortQueue.erase(alignmentSortQueue.begin(), p);
            }
            if (!params.only_realigned || has_realigned) {
                alignmentSortQueue[alignment.Position].push_back(alignment);
            }
        }
    } // end GetNextAlignment loop

    // write whatever is still queued
    if (!params.dry_run) {
        map<long int, vector<BamAlignment> >::iterator p = alignmentSortQueue.begin();
        for ( ; p != alignmentSortQueue.end(); ++p) {
            for (vector<BamAlignment>::iterator a = p->second.begin(); a != p->second.end(); ++a) {
                writer.SaveAlignment(*a);
            }
        }
    }

    gssw_graph_destroy(graph);
    free(nt_table);
    free(mat);

    reader.Close();
    writer.Close();

    if (params.debug) {
        cerr << "total reads:\t" << total_reads << endl;
        cerr << "realigned:\t" << total_realigned << endl;
        cerr << "improved:\t" << total_improved << endl;
    }
}