/**
 * Build the alt allele for one call line, align it against the reference
 * span [ref_start, ref_end) of `chr`, then pass the pairwise alignment to
 * align_biallelic() to be broken into variants and printed.
 *
 * The alt allele is `line` itself when nothing has to be borrowed from the
 * flanks and the call is on the forward strand. Otherwise it is assembled in
 * `tmpbuf` as <tail of 5' flank><line><head of 3' flank> and, for reverse
 * strand calls, reverse complemented in place.
 *
 * @param line         allele sequence
 * @param linelen      length of `line`
 * @param flank5p      5' flank sequence
 * @param flank5p_len  length of `flank5p`
 * @param flank3p      3' flank sequence
 * @param flank3p_len  length of `flank3p` (currently unused)
 * @param cpy_flnk_5p  how many characters to copy from the end of the 5'
 *                     flank onto the start of the allele
 * @param cpy_flnk_3p  how many characters to copy from the start of the 3'
 *                     flank onto the end of the allele
 * @param chr          chromosome holding the reference sequence
 * @param ref_start    start of the ref allele within `chr` (inclusive)
 * @param ref_end      end of the ref allele within `chr` (exclusive);
 *                     must be >= ref_start
 * @param fw_strand    false if the allele must be reverse complemented
 * @param info         forwarded unchanged to align_biallelic()
 * @param genotypes    forwarded unchanged to align_biallelic()
 * @param tmpbuf       scratch buffer, overwritten when the allele is rebuilt
 * @param fout         output stream forwarded to align_biallelic()
 */
static void align_entry_allele(const char *line, size_t linelen,
                               const char *flank5p, size_t flank5p_len,
                               const char *flank3p, size_t flank3p_len,
                               size_t cpy_flnk_5p, size_t cpy_flnk_3p,
                               const read_t *chr,
                               size_t ref_start, size_t ref_end,
                               bool fw_strand,
                               const char *info, const char **genotypes,
                               StrBuf *tmpbuf, FILE *fout)
{
  (void)flank3p_len;
  ctx_assert(ref_start <= ref_end);

  // Reference allele is a window straight into the chromosome sequence
  const size_t ref_len = ref_end - ref_start;
  const char *ref_allele = chr->seq.b + ref_start;

  // By default the alt allele is the call line, used as-is
  const char *alt_allele = line;
  size_t alt_len = linelen;

  const bool use_line_directly = fw_strand && (cpy_flnk_5p + cpy_flnk_3p == 0);

  if(!use_line_directly)
  {
    // Need flank padding and/or reverse complement: rebuild in the scratch buffer
    strbuf_reset(tmpbuf);
    strbuf_append_strn(tmpbuf, flank5p + flank5p_len - cpy_flnk_5p, cpy_flnk_5p);
    strbuf_append_strn(tmpbuf, line, linelen);
    strbuf_append_strn(tmpbuf, flank3p, cpy_flnk_3p);

    if(!fw_strand) {
      dna_revcomp_str(tmpbuf->b, tmpbuf->b, tmpbuf->end);
    }

    alt_allele = tmpbuf->b;
    alt_len = tmpbuf->end;
  }

  // Pairwise align ref against alt
  needleman_wunsch_align2(ref_allele, alt_allele, ref_len, alt_len,
                          &nw_scoring_allele, nw_aligner, aln);
  num_nw_allele++;

  // Split the alignment into variants and print them as VCF
  align_biallelic(aln->result_a, aln->result_b, chr, ref_start,
                  info, genotypes, fout);
}
/**
 * Decompose one aligned call into variants.
 *
 * Each alt line of the call is compared with the reference span
 * [call->start, call->end) on call->chrom. Lines that are too long or that
 * equal the reference (case-insensitively) are only counted; every other
 * line is aligned to the reference and handed to align_biallelic().
 * Counters in dc->stats are updated along the way.
 *
 * @param dc              decomposer state: scoring, aligner, alignment
 *                        buffer and statistics
 * @param call            call to decompose; skipped if call->chrom is NULL
 * @param max_line_len    calls whose ref allele is longer than this are
 *                        skipped entirely; alt lines longer than this are
 *                        skipped individually
 * @param max_allele_len  forwarded unchanged to align_biallelic()
 */
void acall_decompose(CallDecomp *dc, const AlignedCall *call,
                     size_t max_line_len, size_t max_allele_len)
{
  dc->stats.ncalls++;

  // Nothing to do for calls that were not placed on a chromosome
  if(call->chrom == NULL) {
    return;
  }
  dc->stats.ncalls_mapped++;

  const read_t *chrom = call->chrom;
  const char *ref_allele = chrom->seq.b + call->start;
  size_t ref_len = call->end - call->start;

  ctx_assert2(call->start <= call->end, "%u .. %u", call->start, call->end);

  // Reference allele too long to align
  if(ref_len > max_line_len) {
    dc->stats.ncalls_ref_allele_too_long++;
    return;
  }

  dc->stats.nlines += call->n_lines;

  for(size_t line_idx = 0; line_idx < call->n_lines; line_idx++)
  {
    const StrBuf *alt = &call->lines[line_idx];
    ctx_assert(strlen(alt->b) == alt->end);

    // Cheap rejections first: alt too long to align ...
    if(alt->end > max_line_len) {
      dc->stats.nlines_too_long++;
      continue;
    }

    // ... or alt identical to the reference, ignoring case
    if(ref_len == alt->end && strncasecmp(ref_allele, alt->b, ref_len) == 0) {
      dc->stats.nlines_match_ref++;
      continue;
    }

    // Align ref against alt, then split the alignment into variants.
    // Genotypes for this line start at gts + line_idx * n_samples.
    needleman_wunsch_align2(ref_allele, alt->b, ref_len, alt->end,
                            dc->scoring, dc->nw_aligner, dc->aln);

    align_biallelic(dc->aln->result_a, dc->aln->result_b, chrom,
                    call->gts + line_idx*call->n_samples, call->n_samples,
                    dc, call, max_allele_len);

    dc->stats.nlines_mapped++;
  }
}
static bool sam_fetch_coords(const CallFileEntry *centry, const char *flank5p, size_t flank5p_len, const char *flank3p, size_t flank3p_len, size_t *cpy_flnk_5p, size_t *cpy_flnk_3p, const read_t **chrom_ptr, size_t *start, size_t *end, bool *fw_strand_ptr) { // Get the next primary alignment do { if(sam_read1(samfh, bam_header, bamentry) < 0) die("We've run out of SAM entries!"); } while(bamentry->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY)); if(bamentry->core.flag & BAM_FUNMAP) { num_flank5p_unmapped++; return false; } if(bamentry->core.qual < min_mapq) { num_flank5p_lowqual++; return false; } bool fw_strand = !bam_is_rev(bamentry); *fw_strand_ptr = fw_strand; const char *chrom_name = bam_header->target_name[bamentry->core.tid]; const read_t *chrom = seq_fetch_chrom(genome, chrom_name); *chrom_ptr = chrom; const uint32_t *cigar = bam_get_cigar(bamentry); int cigar2rlen = bam_cigar2rlen(bamentry->core.n_cigar, cigar); // cpy_flnk_5p is soft clipped at right end of flank // Eat up hard masked (H), soft masked (S) and inserted bases (relative to ref) (I) *cpy_flnk_5p = bam_get_end_padding(bamentry->core.n_cigar, cigar); *cpy_flnk_3p = 0; // set this later // Get bam query name char *bname = bam_get_qname(bamentry); // Check entry/flank names match const char *hdrline = call_file_get_line(centry, 0); if(hdrline[0] != '>') die("Unexpected line: %s", hdrline); hdrline++; const char *hdrline_end = str_fasta_name_end(hdrline); int hdrline_len = hdrline_end - hdrline; if(strncmp(hdrline, bname, hdrline_len) != 0) die("SAM/BAM and call entries mismatch '%s' vs '%s'", hdrline, bname); // Find 3p flank position using search for first kmer char endkmer[200]; ctx_assert(kmer_size+1 <= sizeof(endkmer)); ctx_assert(flank3p_len >= kmer_size || call_file_min_allele_len(centry) == 0); bubble_get_end_kmer(flank5p, flank5p_len, flank3p, flank3p_len, kmer_size, endkmer); if(!fw_strand) dna_revcomp_str(endkmer, endkmer, kmer_size); // Determine search space // Choose a region of 
the ref to search for the end flank // end is index after last char long search_start, search_end; size_t longest_allele = call_file_max_allele_len(centry); if(fw_strand) { search_start = (long)bamentry->core.pos + cigar2rlen - kmer_size*2; search_end = (long)bamentry->core.pos + cigar2rlen + longest_allele + kmer_size*2 + 10; } else { search_start = (long)bamentry->core.pos - (longest_allele + kmer_size*2 + 10); search_end = (long)bamentry->core.pos + kmer_size*2; } search_start = MAX2(search_start, 0); search_end = MIN2(search_end, (long)chrom->seq.end); const char *search_region = chrom->seq.b + search_start; size_t search_len = (size_t)(search_end - search_start); // Now do search with kmer // Attempt to find perfect match for kmer within search region // Search, if there is more than one match -> abandon const char *kmer_match = ctx_strnstr(search_region, endkmer, search_len); if(kmer_match != NULL) { // Check for multiple hits size_t rem_search_len = search_region+search_len-kmer_match; if(ctx_strnstr(kmer_match+1, endkmer, rem_search_len-1) != NULL) { num_flank3p_multihits++; return false; } if(fw_strand) { *start = bamentry->core.pos + cigar2rlen; *end = kmer_match - chrom->seq.b; } else { *start = kmer_match + kmer_size - chrom->seq.b; *end = bamentry->core.pos; } num_flank3p_exact_match++; return true; } else { // Look for approximate match needleman_wunsch_align2(search_region, endkmer, search_len, kmer_size, &nw_scoring_flank, nw_aligner, aln); num_nw_flank++; const char *ref = aln->result_a, *alt = aln->result_b; // --aa--dd-cge // bb--ccd-ecge // Find positions of first and last match int i, l, r, matches = 0; int ref_offset_left = 0, ref_offset_rght = 0; int alt_offset_left = 0, alt_offset_rght = 0; for(l = 0; l < (int)aln->length && ref[l] != alt[l]; l++) { ref_offset_left += (ref[l] != '-'); alt_offset_left += (alt[l] != '-'); } for(r = aln->length-1; r > 0 && ref[r] != alt[r]; r--) { ref_offset_rght += (ref[r] != '-'); alt_offset_rght += (alt[r] 
!= '-'); } // Count matches for(i = l; i <= r; i++) matches += (ref[i] == alt[i]); if(matches < (int)kmer_size / 2) { // flank doesn't map well num_flank3p_not_found++; return false; } num_flank3p_approx_match++; *cpy_flnk_3p += fw_strand ? alt_offset_left : alt_offset_rght; if(fw_strand) { *start = bamentry->core.pos + cigar2rlen; *end = search_region + ref_offset_left - chrom->seq.b; } else { *start = (search_region + search_len - ref_offset_rght) - chrom->seq.b; *end = bamentry->core.pos; } return true; } }
/**
 * Align two NUL-terminated strings.
 *
 * Convenience wrapper around needleman_wunsch_align2() that measures both
 * inputs with strlen() instead of taking explicit lengths.
 *
 * @param a        first sequence (NUL-terminated)
 * @param b        second sequence (NUL-terminated)
 * @param scoring  scoring scheme, forwarded unchanged
 * @param nw       aligner state, forwarded unchanged
 * @param result   alignment output, forwarded unchanged
 */
void needleman_wunsch_align(const char *a, const char *b,
                            const scoring_t *scoring,
                            nw_aligner_t *nw, alignment_t *result)
{
  const size_t len_a = strlen(a);
  const size_t len_b = strlen(b);

  needleman_wunsch_align2(a, b, len_a, len_b, scoring, nw, result);
}