Ejemplo n.º 1
0
/*
 * Expand the aligned part of read `b` into `s`, writing one byte per
 * reference column covered by the CIGAR.
 *
 * Per CIGAR operation:
 *   M, =, X  copy the bam_seqi() value of each query base into s
 *   S        skip the clipped query bases (nothing is written)
 *   H        ignored
 *   D        write a 0 byte per deleted column
 *   N        treated like D, with a one-off warning on stderr
 *   other    reported on stderr and treated as an error
 *
 * s->l is reset to 0 before writing; on success it equals the length
 * reported by bam_cigar2rlen() for the read's CIGAR.
 *
 * Returns 0 on success, -1 on an error (unexpected CIGAR op, failure to
 * size the buffer, or an output length that disagrees with the CIGAR).
 */
static int unpad_seq(bam1_t *b, kstring_t *s)
{
    int k, j, i;
    int length;
    int cigar_n_warning = 0; /* Make this a global and limit to one CIGAR N warning? */
    uint32_t *cigar = bam_get_cigar(b);
    uint8_t *seq = bam_get_seq(b);

    // b->core.l_qseq gives length of the SEQ entry (including soft clips, S)
    // We need the padded length after alignment from the CIGAR (excluding
    // soft clips S, but including pads from CIGAR D operations)
    length = bam_cigar2rlen(b->core.n_cigar, cigar);

    // The loop below writes into s->s without bounds checks, so the buffer
    // must really have been sized before we start.
    if (length < 0 || ks_resize(s, length) < 0) {
        fprintf(stderr, "[depad] ERROR: Could not size sequence buffer for read %s\n", bam_get_qname(b));
        return -1;
    }

    // j indexes the query (SEQ) bases; s->l indexes the output columns
    for (k = 0, s->l = 0, j = 0; k < b->core.n_cigar; ++k) {
        int op, ol;
        op = bam_cigar_op(cigar[k]);
        ol = bam_cigar_oplen(cigar[k]);
        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
            for (i = 0; i < ol; ++i, ++j) s->s[s->l++] = bam_seqi(seq, j);
        } else if (op == BAM_CSOFT_CLIP) {
            j += ol;
        } else if (op == BAM_CHARD_CLIP) {
            /* do nothing */
        } else if (op == BAM_CDEL) {
            for (i = 0; i < ol; ++i) s->s[s->l++] = 0;
        } else if (op == BAM_CREF_SKIP) {
            /* Treat CIGAR N as D (not ideal, but better than ignoring it) */
            for (i = 0; i < ol; ++i) s->s[s->l++] = 0;
            if (0 == cigar_n_warning) {
                cigar_n_warning = -1;
                fprintf(stderr, "[depad] WARNING: CIGAR op N treated as op D in read %s\n", bam_get_qname(b));
            }
        } else {
            fprintf(stderr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam_get_qname(b));
            return -1;
        }
    }

    // Honour the documented contract: a length mismatch is an error (-1),
    // not a positive "true" value.
    return ((size_t)length == s->l) ? 0 : -1;
}
Ejemplo n.º 2
0
/**
 * Read the next primary alignment from the SAM/BAM stream and locate the
 * reference region lying between the aligned 5' flank and the 3' flank.
 *
 * The 3' flank is located by searching a window of the reference for the
 * kmer produced by bubble_get_end_kmer(): first as an exact substring, then,
 * if that fails, with a Needleman-Wunsch alignment.
 *
 * NOTE(review): this function uses names that are not declared in this
 * block (samfh, bam_header, bamentry, genome, kmer_size, min_mapq,
 * nw_scoring_flank, nw_aligner, aln and the num_* counters) -- presumably
 * file-scope state; confirm against the rest of the file.
 *
 * @param centry        Call entry; its line 0 must start with '>' and the
 *                      name that follows must match the start of the
 *                      alignment's query name, otherwise die() is called.
 * @param flank5p       5' flank sequence (passed to bubble_get_end_kmer()).
 * @param flank5p_len   Length of flank5p.
 * @param flank3p       3' flank sequence (passed to bubble_get_end_kmer()).
 * @param flank3p_len   Length of flank3p.
 * @param cpy_flnk_5p   Out: set to bam_get_end_padding() of the CIGAR.
 * @param cpy_flnk_3p   Out: set to 0; on an approximate match, increased by
 *                      the number of non-gap kmer characters that precede
 *                      the first matching column (forward strand) or follow
 *                      the last matching column (reverse strand).
 * @param chrom_ptr     Out: chromosome returned by seq_fetch_chrom() for the
 *                      alignment's target name.
 * @param start         Out: start offset on the chromosome (written only
 *                      when true is returned).
 * @param end           Out: end offset on the chromosome (written only when
 *                      true is returned).
 * @param fw_strand_ptr Out: true if the alignment is not reverse-strand.
 *
 * @return true if the 3' flank kmer was placed (exactly or approximately);
 *         false if the alignment is unmapped, has mapping quality below
 *         min_mapq, the kmer occurs more than once in the search window, or
 *         the approximate alignment has fewer than kmer_size/2 matching
 *         columns. In the first two cases no output parameter is written.
 *         One of the num_* counters is incremented on every return path.
 *         die() is called if no SAM entries remain.
 */
static bool sam_fetch_coords(const CallFileEntry *centry,
                             const char *flank5p, size_t flank5p_len,
                             const char *flank3p, size_t flank3p_len,
                             size_t *cpy_flnk_5p, size_t *cpy_flnk_3p,
                             const read_t **chrom_ptr,
                             size_t *start, size_t *end,
                             bool *fw_strand_ptr)
{
  // Get the next primary alignment (skip secondary and supplementary records)
  do {
    if(sam_read1(samfh, bam_header, bamentry) < 0)
      die("We've run out of SAM entries!");
  } while(bamentry->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY));

  // Reject unmapped / low mapping-quality flanks before touching any output
  if(bamentry->core.flag & BAM_FUNMAP) { num_flank5p_unmapped++; return false; }
  if(bamentry->core.qual < min_mapq)   { num_flank5p_lowqual++;  return false; }

  bool fw_strand = !bam_is_rev(bamentry);
  *fw_strand_ptr = fw_strand;

  // Look up the chromosome the flank aligned to
  // NOTE(review): chrom is dereferenced below without a NULL check -- confirm
  // seq_fetch_chrom() cannot return NULL here
  const char *chrom_name = bam_header->target_name[bamentry->core.tid];
  const read_t *chrom = seq_fetch_chrom(genome, chrom_name);
  *chrom_ptr = chrom;

  // Number of reference bases spanned by the alignment
  const uint32_t *cigar = bam_get_cigar(bamentry);
  int cigar2rlen = bam_cigar2rlen(bamentry->core.n_cigar, cigar);

  // cpy_flnk_5p is soft clipped at right end of flank
  // Eat up hard masked (H), soft masked (S) and inserted bases (relative to ref) (I)
  *cpy_flnk_5p = bam_get_end_padding(bamentry->core.n_cigar, cigar);
  *cpy_flnk_3p = 0; // set this later

  // Get bam query name
  char *bname = bam_get_qname(bamentry);

  // Check entry/flank names match
  const char *hdrline = call_file_get_line(centry, 0);
  if(hdrline[0] != '>') die("Unexpected line: %s", hdrline);
  hdrline++; // step over the '>'
  const char *hdrline_end = str_fasta_name_end(hdrline);
  int hdrline_len = hdrline_end - hdrline;

  // Only the first hdrline_len characters are compared, so a query name that
  // merely starts with the header name is accepted
  if(strncmp(hdrline, bname, hdrline_len) != 0)
    die("SAM/BAM and call entries mismatch '%s' vs '%s'", hdrline, bname);

  // Find 3p flank position using search for first kmer
  char endkmer[200];
  ctx_assert(kmer_size+1 <= sizeof(endkmer));
  ctx_assert(flank3p_len >= kmer_size || call_file_min_allele_len(centry) == 0);
  bubble_get_end_kmer(flank5p, flank5p_len, flank3p, flank3p_len, kmer_size, endkmer);
  // Reference is searched as-is, so reverse complement the kmer for
  // reverse-strand alignments
  if(!fw_strand) dna_revcomp_str(endkmer, endkmer, kmer_size);

  // Determine search space
  // Choose a region of the ref to search for the end flank
  // end is index after last char
  long search_start, search_end;
  size_t longest_allele = call_file_max_allele_len(centry);

  if(fw_strand) {
    // Window runs from 2 kmers before the end of the aligned flank to
    // longest allele + 2 kmers + 10 bases after it
    search_start = (long)bamentry->core.pos + cigar2rlen - kmer_size*2;
    search_end = (long)bamentry->core.pos + cigar2rlen + longest_allele + kmer_size*2 + 10;
  } else {
    // Mirror image: window lies before the alignment start
    search_start = (long)bamentry->core.pos - (longest_allele + kmer_size*2 + 10);
    search_end = (long)bamentry->core.pos + kmer_size*2;
  }

  // Clamp the window to the chromosome
  search_start = MAX2(search_start, 0);
  search_end   = MIN2(search_end,   (long)chrom->seq.end);

  // NOTE(review): if search_start ever exceeded the clamped search_end, the
  // cast below would produce a very large length -- confirm this cannot happen
  const char *search_region = chrom->seq.b + search_start;
  size_t search_len = (size_t)(search_end - search_start);

  // Now do search with kmer
  // Attempt to find perfect match for kmer within search region

  // Search, if there is more than one match -> abandon
  const char *kmer_match = ctx_strnstr(search_region, endkmer, search_len);

  if(kmer_match != NULL)
  {
    // Check for multiple hits: search again starting one base after the
    // first hit
    size_t rem_search_len = search_region+search_len-kmer_match;
    if(ctx_strnstr(kmer_match+1, endkmer, rem_search_len-1) != NULL) {
      num_flank3p_multihits++;
      return false;
    }

    if(fw_strand) {
      // Region is [end of aligned 5' flank, start of kmer hit)
      *start = bamentry->core.pos + cigar2rlen;
      *end   = kmer_match - chrom->seq.b;
    } else {
      // Region is [end of kmer hit, start of aligned 5' flank)
      *start = kmer_match + kmer_size - chrom->seq.b;
      *end   = bamentry->core.pos;
    }
    num_flank3p_exact_match++;
    return true;
  }
  else
  {
    // Look for approximate match
    needleman_wunsch_align2(search_region, endkmer, search_len, kmer_size,
                            &nw_scoring_flank, nw_aligner, aln);
    num_nw_flank++;
    // ref = aligned search region, alt = aligned kmer; '-' marks a gap
    const char *ref = aln->result_a, *alt = aln->result_b;
    // --aa--dd-cge
    // bb--ccd-ecge

    // Find positions of first and last match
    int i, l, r, matches = 0;
    int ref_offset_left = 0, ref_offset_rght = 0;
    int alt_offset_left = 0, alt_offset_rght = 0;

    // Scan from the left for the first matching column, counting the non-gap
    // characters of each sequence that are skipped
    for(l = 0; l < (int)aln->length && ref[l] != alt[l]; l++) {
      ref_offset_left += (ref[l] != '-');
      alt_offset_left += (alt[l] != '-');
    }
    // Same from the right; this scan stops at column 0 without testing it
    for(r = aln->length-1; r > 0 && ref[r] != alt[r]; r--) {
      ref_offset_rght += (ref[r] != '-');
      alt_offset_rght += (alt[r] != '-');
    }

    // Count matches
    for(i = l; i <= r; i++) matches += (ref[i] == alt[i]);

    // Require at least half a kmer's worth of matching columns
    if(matches < (int)kmer_size / 2)
    {
      // flank doesn't map well
      num_flank3p_not_found++;
      return false;
    }

    num_flank3p_approx_match++;

    // Kmer bases lying outside the matched stretch, on the side facing the
    // 5' flank
    *cpy_flnk_3p += fw_strand ? alt_offset_left : alt_offset_rght;

    if(fw_strand) {
      // Region ends at the reference position of the first matching column
      *start = bamentry->core.pos + cigar2rlen;
      *end   = search_region + ref_offset_left - chrom->seq.b;
    } else {
      // Region starts after the reference position of the last matching column
      *start = (search_region + search_len - ref_offset_rght) - chrom->seq.b;
      *end   = bamentry->core.pos;
    }

    return true;
  }
}
Ejemplo n.º 3
0
bool BamRecordBuilder::BuildInPlace(BamRecord& record) const
{
    // initialize with basic 'core data'
    PBBAM_SHARED_PTR<bam1_t> recordRawData = internal::BamRecordMemory::GetRawData(record); /*   record.impl_.RawData().get();*/
    if (!recordRawData || !recordRawData->data)
        throw std::runtime_error("BamRecord memory in invalid state");    
    recordRawData->core = core_;

    // setup variable length data
    const std::vector<uint8_t> encodedTags = BamTagCodec::Encode(tags_);

    const size_t nameLength  = name_.size() + 1;
    const size_t numCigarOps = cigar_.size();
    const size_t cigarLength = numCigarOps * sizeof(uint32_t);
    const size_t seqLength   = sequence_.size();
    const size_t qualLength  = seqLength;
    const size_t tagLength   = encodedTags.size();
    const size_t dataLength  = nameLength + cigarLength + seqLength + qualLength + tagLength;

    // realloc if necessary
    uint8_t* varLengthDataBlock = recordRawData->data;
    if (!varLengthDataBlock)
        throw std::runtime_error("BamRecord memory in invalid state");
    size_t allocatedDataLength = recordRawData->m_data;
    if (allocatedDataLength < dataLength) {
        allocatedDataLength = dataLength;
        kroundup32(allocatedDataLength);
        varLengthDataBlock = (uint8_t*)realloc(varLengthDataBlock, allocatedDataLength);
    }
    recordRawData->data = varLengthDataBlock;
    recordRawData->l_data = dataLength;
    recordRawData->m_data = allocatedDataLength;

    size_t index = 0;

    // name
    memcpy(&varLengthDataBlock[index], name_.c_str(), nameLength);
    index += nameLength;

    // cigar
    if (cigarLength > 0) {
        std::vector<uint32_t> encodedCigar(numCigarOps);
        for (size_t i = 0; i < numCigarOps; ++i) {
            const CigarOperation& op = cigar_.at(i);
            encodedCigar[i] = op.Length() << BAM_CIGAR_SHIFT;
            const uint8_t type = static_cast<uint8_t>(op.Type());
            if (type >= 8)
                throw std::runtime_error("invalid CIGAR op type: " + std::to_string(type));
            encodedCigar[i] |= type;
        }
        memcpy(&varLengthDataBlock[index], &encodedCigar[0], cigarLength);
        index += cigarLength;

        // update bin after we've calculated cigar info
        const int32_t endPosition = bam_cigar2rlen(recordRawData->core.n_cigar, &encodedCigar[0]);
        recordRawData->core.bin = hts_reg2bin(core_.pos, endPosition, 14, 5);
    }

    // seq & qual
    if (seqLength > 0) {

        uint8_t* s = &varLengthDataBlock[index];
        for (size_t i = 0; i < seqLength; ++i)
            s[i>>1] |= ( seq_nt16_table[static_cast<int>(sequence_.at(i))] << ((~i&1)<<2) );
        index += seqLength;

        uint8_t* q = &varLengthDataBlock[index];
        if (!qualities_.empty())
            memset(q, 0xFF, seqLength);
        else {
            for (size_t i = 0; i < seqLength; ++i)
                q[i] = qualities_.at(i) - 33;
        }
        index += seqLength;
    }