コード例 #1
0
ファイル: ctx_calls2vcf.c プロジェクト: ambarrio/mccortex
/**
 * Align a single allele of a call against the reference and write out the
 * variants found.
 *
 * The alt sequence is `line`, optionally extended with bases borrowed from the
 * two flanks, and reverse complemented when the call is not on the forward
 * strand. It is aligned against chr[ref_start, ref_end) and the aligned pair
 * is handed to align_biallelic().
 *
 * @param line         allele sequence (length linelen)
 * @param flank5p      5' flank sequence (length flank5p_len)
 * @param flank3p      3' flank sequence; flank3p_len is unused here
 * @param cpy_flnk_5p  number of characters taken from the end of the 5' flank
 *                     and placed before the allele
 * @param cpy_flnk_3p  number of characters taken from the start of the 3' flank
 *                     and placed after the allele
 * @param chr          reference sequence the call was placed on
 * @param ref_start    start of the reference interval (must be <= ref_end)
 * @param ref_end      end of the reference interval (exclusive)
 * @param fw_strand    false if the allele must be reverse complemented
 * @param info         passed through to align_biallelic()
 * @param genotypes    passed through to align_biallelic()
 * @param tmpbuf       scratch buffer, overwritten when the allele is rebuilt
 * @param fout         output stream passed through to align_biallelic()
 */
static void align_entry_allele(const char *line, size_t linelen,
                               const char *flank5p, size_t flank5p_len,
                               const char *flank3p, size_t flank3p_len,
                               size_t cpy_flnk_5p, size_t cpy_flnk_3p,
                               const read_t *chr,
                               size_t ref_start, size_t ref_end,
                               bool fw_strand,
                               const char *info, const char **genotypes,
                               StrBuf *tmpbuf, FILE *fout)
{
  (void)flank3p_len;
  ctx_assert(ref_start <= ref_end);

  // Reference side of the alignment
  const char *ref_seq = chr->seq.b + ref_start;
  const size_t ref_seq_len = ref_end - ref_start;

  // Alt side: by default the allele text is used untouched
  const char *alt_seq = line;
  size_t alt_seq_len = linelen;

  // A rebuilt copy is needed if flank bases are borrowed or the strand flips
  const bool must_rebuild = !fw_strand || (cpy_flnk_5p + cpy_flnk_3p) != 0;

  if(must_rebuild)
  {
    const char *borrowed_5p = flank5p + flank5p_len - cpy_flnk_5p;

    strbuf_reset(tmpbuf);
    strbuf_append_strn(tmpbuf, borrowed_5p, cpy_flnk_5p);
    strbuf_append_strn(tmpbuf, line, linelen);
    strbuf_append_strn(tmpbuf, flank3p, cpy_flnk_3p);

    // Reverse complement in place
    if(!fw_strand) {
      dna_revcomp_str(tmpbuf->b, tmpbuf->b, tmpbuf->end);
    }

    alt_seq = tmpbuf->b;
    alt_seq_len = tmpbuf->end;
  }

  // Global alignment of reference interval vs. alt allele
  needleman_wunsch_align2(ref_seq, alt_seq, ref_seq_len, alt_seq_len,
                          &nw_scoring_allele, nw_aligner, aln);
  num_nw_allele++;

  // Split the alignment into variants and print them as VCF
  align_biallelic(aln->result_a, aln->result_b,
                  chr, ref_start,
                  info, genotypes, fout);
}
コード例 #2
0
ファイル: aligned_call.c プロジェクト: rec3141/mccortex
/**
 * Decompose an aligned call into variants by aligning each of its lines
 * against the reference interval [call->start, call->end).
 *
 * Counters in dc->stats are updated as calls and lines are accepted or
 * skipped. Calls without a chromosome, and calls whose reference interval is
 * longer than max_line_len, are dropped. Lines longer than max_line_len or
 * equal to the reference (case-insensitively) are counted but not aligned.
 *
 * @param dc              decomposer state: stats, scoring, aligner, alignment
 * @param call            the call to decompose
 * @param max_line_len    longest reference interval / line that is aligned
 * @param max_allele_len  passed through to align_biallelic()
 */
void acall_decompose(CallDecomp *dc, const AlignedCall *call,
                     size_t max_line_len, size_t max_allele_len)
{
  dc->stats.ncalls += 1;
  if(!call->chrom) return; // not placed on the reference
  dc->stats.ncalls_mapped += 1;

  ctx_assert2(call->start <= call->end, "%u .. %u", call->start, call->end);

  const read_t *ref_chrom = call->chrom;
  const char *ref_seq = ref_chrom->seq.b + call->start;
  const size_t ref_seq_len = call->end - call->start;

  // Reference interval too long to align
  if(ref_seq_len > max_line_len) {
    dc->stats.ncalls_ref_allele_too_long += 1;
    return;
  }

  dc->stats.nlines += call->n_lines;

  size_t idx;
  for(idx = 0; idx < call->n_lines; idx++)
  {
    const StrBuf *alt = &call->lines[idx];
    ctx_assert(strlen(alt->b) == alt->end);

    // Line too long to align
    if(alt->end > max_line_len) {
      dc->stats.nlines_too_long += 1;
      continue;
    }

    // Line is identical to the reference, nothing to report
    if(alt->end == ref_seq_len &&
       strncasecmp(ref_seq, alt->b, ref_seq_len) == 0)
    {
      dc->stats.nlines_match_ref += 1;
      continue;
    }

    needleman_wunsch_align2(ref_seq, alt->b, ref_seq_len, alt->end,
                            dc->scoring, dc->nw_aligner, dc->aln);

    // Genotypes for this line start at gts[idx * n_samples]
    align_biallelic(dc->aln->result_a, dc->aln->result_b, ref_chrom,
                    call->gts+idx*call->n_samples, call->n_samples,
                    dc, call, max_allele_len);
    dc->stats.nlines_mapped += 1;
  }
}
コード例 #3
0
ファイル: ctx_calls2vcf.c プロジェクト: ambarrio/mccortex
/**
 * Read the next primary alignment from the SAM/BAM stream and work out the
 * reference interval that lies between the mapped 5' flank and the 3' flank.
 *
 * Uses state that is not declared in this function (samfh, bam_header,
 * bamentry, genome, kmer_size, min_mapq, nw_aligner, aln, nw_scoring_flank and
 * the num_* counters).
 *
 * The 3' flank is located by searching a window of the reference for the
 * "end kmer" produced by bubble_get_end_kmer(): first with an exact substring
 * search, then, if that finds nothing, with a Needleman-Wunsch alignment.
 *
 * @param centry         call entry; its first line must start with '>' and its
 *                       name must match the start of the BAM query name
 * @param flank5p        5' flank sequence (length flank5p_len)
 * @param flank3p        3' flank sequence (length flank3p_len)
 * @param cpy_flnk_5p    out: set to bam_get_end_padding() of the alignment
 * @param cpy_flnk_3p    out: set to 0, then in the approximate-match branch
 *                       increased by the number of kmer characters that lie
 *                       outside the matched part of the alignment
 * @param chrom_ptr      out: chromosome returned by seq_fetch_chrom()
 * @param start          out: start of the reference interval (set only when
 *                       true is returned)
 * @param end            out: end of the reference interval (set only when
 *                       true is returned)
 * @param fw_strand_ptr  out: true if the alignment is not reverse-strand
 * @return true if coordinates were found; false if the read is unmapped, its
 *         mapping quality is below min_mapq, the end kmer has more than one
 *         exact hit, or the approximate alignment has fewer than kmer_size/2
 *         matching columns. No output is written for the first two cases.
 */
static bool sam_fetch_coords(const CallFileEntry *centry,
                             const char *flank5p, size_t flank5p_len,
                             const char *flank3p, size_t flank3p_len,
                             size_t *cpy_flnk_5p, size_t *cpy_flnk_3p,
                             const read_t **chrom_ptr,
                             size_t *start, size_t *end,
                             bool *fw_strand_ptr)
{
  // Get the next primary alignment
  // (skip secondary and supplementary records; running out of records is fatal)
  do {
    if(sam_read1(samfh, bam_header, bamentry) < 0)
      die("We've run out of SAM entries!");
  } while(bamentry->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY));

  // Reject unmapped or low mapping quality flanks before touching any output
  if(bamentry->core.flag & BAM_FUNMAP) { num_flank5p_unmapped++; return false; }
  if(bamentry->core.qual < min_mapq)   { num_flank5p_lowqual++;  return false; }

  bool fw_strand = !bam_is_rev(bamentry);
  *fw_strand_ptr = fw_strand;

  // Look up the chromosome by the target name of the alignment
  // NOTE(review): the result of seq_fetch_chrom() is used without a NULL
  // check -- confirm it cannot fail for names present in the BAM header
  const char *chrom_name = bam_header->target_name[bamentry->core.tid];
  const read_t *chrom = seq_fetch_chrom(genome, chrom_name);
  *chrom_ptr = chrom;

  // Number of reference bases covered by the alignment
  const uint32_t *cigar = bam_get_cigar(bamentry);
  int cigar2rlen = bam_cigar2rlen(bamentry->core.n_cigar, cigar);

  // cpy_flnk_5p is soft clipped at right end of flank
  // Eat up hard masked (H), soft masked (S) and inserted bases (relative to ref) (I)
  *cpy_flnk_5p = bam_get_end_padding(bamentry->core.n_cigar, cigar);
  *cpy_flnk_3p = 0; // set this later

  // Get bam query name
  char *bname = bam_get_qname(bamentry);

  // Check entry/flank names match
  // Only the first hdrline_len characters are compared, so bname may be longer
  // NOTE(review): confirm that a prefix match is the intended check
  const char *hdrline = call_file_get_line(centry, 0);
  if(hdrline[0] != '>') die("Unexpected line: %s", hdrline);
  hdrline++;
  const char *hdrline_end = str_fasta_name_end(hdrline);
  int hdrline_len = hdrline_end - hdrline;

  if(strncmp(hdrline, bname, hdrline_len) != 0)
    die("SAM/BAM and call entries mismatch '%s' vs '%s'", hdrline, bname);

  // Find 3p flank position using search for first kmer
  // endkmer is a fixed 200 byte buffer; kmer_size+1 bytes must fit in it
  char endkmer[200];
  ctx_assert(kmer_size+1 <= sizeof(endkmer));
  ctx_assert(flank3p_len >= kmer_size || call_file_min_allele_len(centry) == 0);
  bubble_get_end_kmer(flank5p, flank5p_len, flank3p, flank3p_len, kmer_size, endkmer);
  // Reference is searched on the forward strand, so flip the kmer if needed
  if(!fw_strand) dna_revcomp_str(endkmer, endkmer, kmer_size);

  // Determine search space
  // Choose a region of the ref to search for the end flank
  // end is index after last char
  long search_start, search_end;
  size_t longest_allele = call_file_max_allele_len(centry);

  // Forward strand: window extends right from the end of the mapped flank.
  // Reverse strand: window extends left from the alignment position.
  if(fw_strand) {
    search_start = (long)bamentry->core.pos + cigar2rlen - kmer_size*2;
    search_end = (long)bamentry->core.pos + cigar2rlen + longest_allele + kmer_size*2 + 10;
  } else {
    search_start = (long)bamentry->core.pos - (longest_allele + kmer_size*2 + 10);
    search_end = (long)bamentry->core.pos + kmer_size*2;
  }

  // Clamp the window to [0, chrom->seq.end]
  search_start = MAX2(search_start, 0);
  search_end   = MIN2(search_end,   (long)chrom->seq.end);

  // NOTE(review): if search_end were below search_start after clamping, the
  // cast to size_t below would wrap to a huge length -- confirm this cannot
  // happen for valid alignments
  const char *search_region = chrom->seq.b + search_start;
  size_t search_len = (size_t)(search_end - search_start);

  // Now do search with kmer
  // Attempt to find perfect match for kmer within search region

  // Search, if there is more than one match -> abandon
  const char *kmer_match = ctx_strnstr(search_region, endkmer, search_len);

  if(kmer_match != NULL)
  {
    // Check for multiple hits
    // rem_search_len = characters from the first hit to the end of the window;
    // the second search starts one character after the first hit
    size_t rem_search_len = search_region+search_len-kmer_match;
    if(ctx_strnstr(kmer_match+1, endkmer, rem_search_len-1) != NULL) {
      num_flank3p_multihits++;
      return false;
    }

    // Interval runs from the end of the mapped flank to the start of the kmer
    // hit (forward), or from the end of the kmer hit to the alignment
    // position (reverse)
    if(fw_strand) {
      *start = bamentry->core.pos + cigar2rlen;
      *end   = kmer_match - chrom->seq.b;
    } else {
      *start = kmer_match + kmer_size - chrom->seq.b;
      *end   = bamentry->core.pos;
    }
    num_flank3p_exact_match++;
    return true;
  }
  else
  {
    // Look for approximate match
    // (align the whole search window against the end kmer)
    needleman_wunsch_align2(search_region, endkmer, search_len, kmer_size,
                            &nw_scoring_flank, nw_aligner, aln);
    num_nw_flank++;
    const char *ref = aln->result_a, *alt = aln->result_b;
    // Example of an aligned pair (ref above, alt below):
    // --aa--dd-cge
    // bb--ccd-ecge

    // Find positions of first and last match
    // l / r end up at the first / last alignment column where ref and alt are
    // equal; the *_offset_* counters hold the number of non-gap characters
    // skipped on each side, separately for ref and alt
    int i, l, r, matches = 0;
    int ref_offset_left = 0, ref_offset_rght = 0;
    int alt_offset_left = 0, alt_offset_rght = 0;

    for(l = 0; l < (int)aln->length && ref[l] != alt[l]; l++) {
      ref_offset_left += (ref[l] != '-');
      alt_offset_left += (alt[l] != '-');
    }
    for(r = aln->length-1; r > 0 && ref[r] != alt[r]; r--) {
      ref_offset_rght += (ref[r] != '-');
      alt_offset_rght += (alt[r] != '-');
    }

    // Count matches
    // (identical columns between the first and last match, inclusive)
    for(i = l; i <= r; i++) matches += (ref[i] == alt[i]);

    if(matches < (int)kmer_size / 2)
    {
      // flank doesn't map well
      num_flank3p_not_found++;
      return false;
    }

    num_flank3p_approx_match++;

    // Kmer characters lying before the first match (forward) or after the
    // last match (reverse) are added to the 3' copy count
    *cpy_flnk_3p += fw_strand ? alt_offset_left : alt_offset_rght;

    // Interval boundary on the kmer side is the reference position of the
    // first matching column (forward) or just past the last one (reverse)
    if(fw_strand) {
      *start = bamentry->core.pos + cigar2rlen;
      *end   = search_region + ref_offset_left - chrom->seq.b;
    } else {
      *start = (search_region + search_len - ref_offset_rght) - chrom->seq.b;
      *end   = bamentry->core.pos;
    }

    return true;
  }
}
コード例 #4
0
/**
 * Convenience wrapper around needleman_wunsch_align2() for NUL-terminated
 * strings: the lengths of both sequences are measured with strlen().
 *
 * @param a        first sequence (NUL-terminated)
 * @param b        second sequence (NUL-terminated)
 * @param scoring  scoring scheme, passed through unchanged
 * @param nw       aligner state, passed through unchanged
 * @param result   alignment output, passed through unchanged
 */
void needleman_wunsch_align(const char *a, const char *b,
                            const scoring_t *scoring,
                            nw_aligner_t *nw, alignment_t *result)
{
  const size_t len_a = strlen(a);
  const size_t len_b = strlen(b);

  needleman_wunsch_align2(a, b, len_a, len_b, scoring, nw, result);
}