// fill out al and info for a given mate // this does *not* fill in mapq, as that requires knowledge of which mate is the anchor as well as second-best scores void DebugOutput::process_one_mate(DbgAlignment& al, DbgInfo& info, const AlignmentData& alignment, const AlignmentData& mate, const uint32 mapq) { // output read_id as CRC of read name al.read_id = crcCalc(alignment.read_name, strlen(alignment.read_name)); al.read_len = alignment.read_len; if (alignment.best->is_aligned()) { // setup alignment information const io::BNTAnn* ann = std::upper_bound( bnt.data.anns, bnt.data.anns + bnt.info.n_seqs, alignment.cigar_pos, SeqFinder() ) - 1u; al.alignment_pos = alignment.cigar_pos - int32(ann->offset) + 1u; info.flag = (alignment.best->mate() ? DbgInfo::READ_2 : DbgInfo::READ_1) | (alignment.best->is_rc() ? DbgInfo::REVERSE : 0u); if (alignment_type == PAIRED_END) { if (alignment.best->is_paired()) // FIXME: this should be other_mate.is_concordant() { info.flag |= DbgInfo::PROPER_PAIR; } if (mate.best->is_aligned() == false) { info.flag |= DbgInfo::MATE_UNMAPPED; } } const uint32 ref_cigar_len = reference_cigar_length(alignment.cigar, alignment.cigar_len); if (alignment.cigar_pos + ref_cigar_len > ann->offset + ann->len) { // flag UNMAPPED as this alignment bridges two adjacent reference sequences info.flag |= DbgInfo::UNMAPPED; } uint32 n_mm; uint32 n_gapo; uint32 n_gape; analyze_md_string(alignment.mds_vec, n_mm, n_gapo, n_gape); info.ref_id = uint32(ann - bnt.data.anns); info.mate = alignment.best->mate(); info.score = alignment.best->score(); info.mapQ = mapq; info.ed = alignment.best->ed(); info.subs = count_symbols(Cigar::SUBSTITUTION, alignment.cigar, alignment.cigar_len); info.ins = count_symbols(Cigar::INSERTION, alignment.cigar, alignment.cigar_len); info.dels = count_symbols(Cigar::DELETION, alignment.cigar, alignment.cigar_len); info.mms = n_mm; info.gapo = n_gapo; info.gape = n_gape; info.sec_score = alignment.second_best->score(); if (info.sec_score == 
alignment.second_best->is_aligned()) { info.sec_score = alignment.second_best->score(); info.has_second = 1; } else { info.sec_score = Field_traits<int16>::min(); info.has_second = 0; } info.pad = 0x69; } else { // unmapped alignment al.alignment_pos = 0; } }
// Build a BAM record for `alignment` and write it out via output_alignment().
//
// \param alignment  the alignment to emit
// \param mate       the opposite mate's alignment (only consulted when
//                   alignment_type == PAIRED_END)
//
// \return 0 when the read is emitted as unmapped (not aligned, mapq below
//         mapq_filter, or the alignment bridges two reference sequences);
//         otherwise the alignment's mapq. Note that when the generated cigar
//         length disagrees with the read length an error is logged and mapq is
//         returned *without* writing any record.
uint32 BamOutput::process_one_alignment(AlignmentData& alignment,
                                        AlignmentData& mate)
{
    // BAM alignment header
    struct BAM_alignment alnh;
    // data block with actual alignment info
    struct BAM_alignment_data_block alnd;

    uint8 mapq;

    // xxxnsubtil: this is probably not needed
    memset(&alnh, 0, sizeof(alnh));
    memset(&alnd, 0, sizeof(alnd));

    const uint32 ref_cigar_len = reference_cigar_length(alignment.cigar, alignment.cigar_len);

    // index of the reference sequence containing cigar_pos: upper_bound yields
    // the first sequence starting past cigar_pos, hence the -1
    const uint32 seq_index = uint32(std::upper_bound(
        bnt.sequence_index,
        bnt.sequence_index + bnt.n_seqs,
        alignment.cigar_pos ) - bnt.sequence_index) - 1u;

    // fill out read name and length (the low byte of bin_mq_nl holds the name
    // length including the terminating NUL)
    alnd.name = alignment.read_name;
    alnh.bin_mq_nl = (uint8)(strlen(alnd.name) + 1);

    // fill out read data, two bases per byte with the first base in the high nibble
    // (PackedStream is not used here to avoid doing a read-modify-write on every BP)
    {
        for (uint32 i = 0; i < alignment.read_len; i += 2)
        {
            uint8 out_bp;
            uint8 s;

            if (alignment.aln->m_rc)
            {
                // reverse-complemented alignment: walk read_data in storage
                // order and complement each base
                // NOTE(review): presumably read_data is stored reversed, which
                // is why the non-rc branch below walks it backwards — confirm
                nvbio::complement_functor<4> complement;

                s = complement(alignment.read_data[i]);
                s = encode_bp(s);
                out_bp = s << 4;

                if (i + 1 < alignment.read_len)
                {
                    s = complement(alignment.read_data[(i + 1)]);
                    s = encode_bp(s);
                    out_bp |= s;
                }
            }
            else
            {
                // forward alignment: walk read_data from the end, no complement
                s = alignment.read_data[alignment.read_len - i - 1];
                s = encode_bp(s);
                out_bp = s << 4;

                if (i + 1 < alignment.read_len)
                {
                    s = alignment.read_data[alignment.read_len - (i + 1) - 1];
                    s = encode_bp(s);
                    out_bp |= s;
                }
            }

            alnd.seq[i / 2] = out_bp;
        }

        alnh.l_seq = alignment.read_len;
    }

    // fill out quality data, using the same orientation rule as the bases
    for (uint32 i = 0; i < alignment.read_len; i++)
    {
        char q;

        if (alignment.aln->m_rc)
            q = alignment.qual[i];
        else
            q = alignment.qual[alignment.read_len - i - 1];

        alnd.qual[i] = q;
    }

    // mapping quality is precomputed by the caller
    mapq = alignment.mapq;

    // check if we're mapped (reads below the mapq filter are emitted as unmapped)
    if (alignment.aln->is_aligned() == false || mapq < mapq_filter)
    {
        alnh.refID      = -1;
        alnh.pos        = -1;
        alnh.flag_nc    = BAM_FLAGS_UNMAPPED;
        alnh.next_refID = -1;
        alnh.next_pos   = -1;
        // mark the md string as empty
        alnd.md_string[0] = '\0';

        // unaligned reads don't need anything else; output and return
        output_alignment(alnh, alnd);
        return 0;
    }

    // compute alignment flags
    alnh.flag_nc = (alignment.aln->mate() ? BAM_FLAGS_READ_2 : BAM_FLAGS_READ_1);
    if (alignment.aln->m_rc)
        alnh.flag_nc |= BAM_FLAGS_REVERSE;

    if (alignment_type == PAIRED_END)
    {
        alnh.flag_nc |= BAM_FLAGS_PAIRED;

        if (mate.aln->is_concordant())
            alnh.flag_nc |= BAM_FLAGS_PROPER_PAIR;

        if (!mate.aln->is_aligned())
            alnh.flag_nc |= BAM_FLAGS_MATE_UNMAPPED;

        if (mate.aln->is_rc())
            alnh.flag_nc |= BAM_FLAGS_MATE_REVERSE;
    }

    if (alignment.cigar_pos + ref_cigar_len > bnt.sequence_index[ seq_index+1 ])
    {
        // flag UNMAP as this alignment bridges two adjacent reference sequences
        // xxxnsubtil: we still output the rest of the alignment data, does that make sense?
        alnh.flag_nc |= BAM_FLAGS_UNMAPPED;

        // make this look like a real unmapped alignment
        alnh.refID      = -1;
        alnh.pos        = -1;
        alnh.next_refID = -1;
        alnh.next_pos   = -1;
        alnd.md_string[0] = '\0';

        output_alignment(alnh, alnd);
        return 0;
    }

    // fill out alignment reference ID and position (relative to the start of
    // the containing reference sequence)
    alnh.refID = seq_index;
    alnh.pos   = uint32(alignment.cigar_pos - bnt.sequence_index[ seq_index ]);

    // write out mapq (stored above the name-length byte)
    alnh.bin_mq_nl |= (mapq << 8);
    // BAM alignment bin is always 0
    // xxxnsubtil: is the bin useful?

    // fill out the cigar string...
    uint32 computed_cigar_len = generate_cigar(alnh, alnd, alignment);
    // ... and make sure it makes (some) sense
    if (computed_cigar_len != alignment.read_len)
    {
        log_error(stderr, "BAM output : cigar length doesn't match read %u (%u != %u)\n",
                  alignment.read_id /* xxxnsubtil: global_read_id */,
                  computed_cigar_len, alignment.read_len);
        return mapq;
    }

    if (alignment_type == PAIRED_END)
    {
        if (mate.aln->is_aligned())
        {
            const uint32 o_ref_cigar_len = reference_cigar_length(mate.cigar, mate.cigar_len);

            // setup alignment information for the opposite mate
            const uint32 o_seq_index = uint32(std::upper_bound(
                bnt.sequence_index,
                bnt.sequence_index + bnt.n_seqs,
                mate.cigar_pos ) - bnt.sequence_index) - 1u;

            // BAM stores the mate's absolute reference index here (there is no
            // '=' shorthand as in SAM), so this must be o_seq_index itself and
            // not its difference from seq_index
            alnh.next_refID = o_seq_index;
            // next_pos here is equivalent to SAM's PNEXT,
            // but it's zero-based in BAM and one-based in SAM
            alnh.next_pos = int32( mate.cigar_pos - bnt.sequence_index[ o_seq_index ] );

            if (o_seq_index != seq_index)
                alnh.tlen = 0;
            else
            {
                // span from the leftmost start to the rightmost end of the two
                // mates, negated when the mate lies to the left of this read
                alnh.tlen = nvbio::max(mate.cigar_pos + o_ref_cigar_len,
                                       alignment.cigar_pos + ref_cigar_len) -
                            nvbio::min(mate.cigar_pos, alignment.cigar_pos);

                if (mate.cigar_pos < alignment.cigar_pos)
                    alnh.tlen = -alnh.tlen;
            }
        }
        else
        {
            // other mate is unmapped
            // xxxnsubtil: this follows the same convention that was documented in the old code for SAM,
            // except that BAM does not have an encoding for '=' here
            // it's somewhat unclear whether this is correct
            alnh.next_refID = alnh.refID;
            alnh.next_pos = int32( alignment.cigar_pos - bnt.sequence_index[ seq_index ] );
            // xxx: check whether this is really correct
            alnh.tlen = 0;
        }
    }
    else
    {
        alnh.next_refID = -1;
        alnh.next_pos   = -1;
        alnh.tlen       = 0;
    }

    // fill out tag data
    alnd.ed = alignment.aln->ed();
    alnd.score = alignment.aln->score();

    alnd.second_score_valid = false; // TODO!

    generate_md_string(alnh, alnd, alignment);

    // write out the alignment
    output_alignment(alnh, alnd);

    return mapq;
}