Esempio n. 1
0
// Function to compare reads and determine which one is < the other
static inline int bam1_lt(const bam1_p a, const bam1_p b)
{
    if (g_is_by_qname) {
        int t = strnum_cmp(bam_get_qname(a), bam_get_qname(b));
        return (t < 0 || (t == 0 && (a->core.flag&0xc0) < (b->core.flag&0xc0)));
    } else return (((uint64_t)a->core.tid<<32|(a->core.pos+1)<<1|bam_is_rev(a)) < ((uint64_t)b->core.tid<<32|(b->core.pos+1)<<1|bam_is_rev(b)));
}
Esempio n. 2
0
static inline void pileup_seq(FILE *fp, const bam_pileup1_t *p, int pos, int ref_len, const char *ref)
{
    int j;
    if (p->is_head) {
        putc('^', fp);
        putc(p->b->core.qual > 93? 126 : p->b->core.qual + 33, fp);
    }
    if (!p->is_del) {
        int c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos)];
        if (ref) {
            int rb = pos < ref_len? ref[pos] : 'N';
            if (c == '=' || seq_nt16_table[c] == seq_nt16_table[rb]) c = bam_is_rev(p->b)? ',' : '.';
            else c = bam_is_rev(p->b)? tolower(c) : toupper(c);
        } else {
            if (c == '=') c = bam_is_rev(p->b)? ',' : '.';
            else c = bam_is_rev(p->b)? tolower(c) : toupper(c);
        }
        putc(c, fp);
    } else putc(p->is_refskip? (bam_is_rev(p->b)? '<' : '>') : '*', fp);
    if (p->indel > 0) {
        putc('+', fp); printw(p->indel, fp);
        for (j = 1; j <= p->indel; ++j) {
            int c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos + j)];
            putc(bam_is_rev(p->b)? tolower(c) : toupper(c), fp);
        }
    } else if (p->indel < 0) {
        printw(p->indel, fp);
        for (j = 1; j <= -p->indel; ++j) {
            int c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N';
            putc(bam_is_rev(p->b)? tolower(c) : toupper(c), fp);
        }
    }
    if (p->is_tail) putc('$', fp);
}
Esempio n. 3
0
static void make_single_key(key_data_t *key, bam1_t *bam) {
    int32_t this_ref, this_coord;
    int32_t orientation;

    this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash

    if (bam_is_rev(bam)) {
        this_coord = unclipped_end(bam);
        orientation = O_RR;
    } else {
        this_coord = unclipped_start(bam);
        orientation = O_FF;
    }

    key->single        = 1;
    key->this_ref      = this_ref;
    key->this_coord    = this_coord;
    key->orientation   = orientation;
}
Esempio n. 4
0
static bool sam_fetch_coords(const CallFileEntry *centry,
                             const char *flank5p, size_t flank5p_len,
                             const char *flank3p, size_t flank3p_len,
                             size_t *cpy_flnk_5p, size_t *cpy_flnk_3p,
                             const read_t **chrom_ptr,
                             size_t *start, size_t *end,
                             bool *fw_strand_ptr)
{
  // Get the next primary alignment
  do {
    if(sam_read1(samfh, bam_header, bamentry) < 0)
      die("We've run out of SAM entries!");
  } while(bamentry->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY));

  if(bamentry->core.flag & BAM_FUNMAP) { num_flank5p_unmapped++; return false; }
  if(bamentry->core.qual < min_mapq)   { num_flank5p_lowqual++;  return false; }

  bool fw_strand = !bam_is_rev(bamentry);
  *fw_strand_ptr = fw_strand;

  const char *chrom_name = bam_header->target_name[bamentry->core.tid];
  const read_t *chrom = seq_fetch_chrom(genome, chrom_name);
  *chrom_ptr = chrom;

  const uint32_t *cigar = bam_get_cigar(bamentry);
  int cigar2rlen = bam_cigar2rlen(bamentry->core.n_cigar, cigar);

  // cpy_flnk_5p is soft clipped at right end of flank
  // Eat up hard masked (H), soft masked (S) and inserted bases (relative to ref) (I)
  *cpy_flnk_5p = bam_get_end_padding(bamentry->core.n_cigar, cigar);
  *cpy_flnk_3p = 0; // set this later

  // Get bam query name
  char *bname = bam_get_qname(bamentry);

  // Check entry/flank names match
  const char *hdrline = call_file_get_line(centry, 0);
  if(hdrline[0] != '>') die("Unexpected line: %s", hdrline);
  hdrline++;
  const char *hdrline_end = str_fasta_name_end(hdrline);
  int hdrline_len = hdrline_end - hdrline;

  if(strncmp(hdrline, bname, hdrline_len) != 0)
    die("SAM/BAM and call entries mismatch '%s' vs '%s'", hdrline, bname);

  // Find 3p flank position using search for first kmer
  char endkmer[200];
  ctx_assert(kmer_size+1 <= sizeof(endkmer));
  ctx_assert(flank3p_len >= kmer_size || call_file_min_allele_len(centry) == 0);
  bubble_get_end_kmer(flank5p, flank5p_len, flank3p, flank3p_len, kmer_size, endkmer);
  if(!fw_strand) dna_revcomp_str(endkmer, endkmer, kmer_size);

  // Determine search space
  // Choose a region of the ref to search for the end flank
  // end is index after last char
  long search_start, search_end;
  size_t longest_allele = call_file_max_allele_len(centry);

  if(fw_strand) {
    search_start = (long)bamentry->core.pos + cigar2rlen - kmer_size*2;
    search_end = (long)bamentry->core.pos + cigar2rlen + longest_allele + kmer_size*2 + 10;
  } else {
    search_start = (long)bamentry->core.pos - (longest_allele + kmer_size*2 + 10);
    search_end = (long)bamentry->core.pos + kmer_size*2;
  }

  search_start = MAX2(search_start, 0);
  search_end   = MIN2(search_end,   (long)chrom->seq.end);

  const char *search_region = chrom->seq.b + search_start;
  size_t search_len = (size_t)(search_end - search_start);

  // Now do search with kmer
  // Attempt to find perfect match for kmer within search region

  // Search, if there is more than one match -> abandon
  const char *kmer_match = ctx_strnstr(search_region, endkmer, search_len);

  if(kmer_match != NULL)
  {
    // Check for multiple hits
    size_t rem_search_len = search_region+search_len-kmer_match;
    if(ctx_strnstr(kmer_match+1, endkmer, rem_search_len-1) != NULL) {
      num_flank3p_multihits++;
      return false;
    }

    if(fw_strand) {
      *start = bamentry->core.pos + cigar2rlen;
      *end   = kmer_match - chrom->seq.b;
    } else {
      *start = kmer_match + kmer_size - chrom->seq.b;
      *end   = bamentry->core.pos;
    }
    num_flank3p_exact_match++;
    return true;
  }
  else
  {
    // Look for approximate match
    needleman_wunsch_align2(search_region, endkmer, search_len, kmer_size,
                            &nw_scoring_flank, nw_aligner, aln);
    num_nw_flank++;
    const char *ref = aln->result_a, *alt = aln->result_b;
    // --aa--dd-cge
    // bb--ccd-ecge

    // Find positions of first and last match
    int i, l, r, matches = 0;
    int ref_offset_left = 0, ref_offset_rght = 0;
    int alt_offset_left = 0, alt_offset_rght = 0;

    for(l = 0; l < (int)aln->length && ref[l] != alt[l]; l++) {
      ref_offset_left += (ref[l] != '-');
      alt_offset_left += (alt[l] != '-');
    }
    for(r = aln->length-1; r > 0 && ref[r] != alt[r]; r--) {
      ref_offset_rght += (ref[r] != '-');
      alt_offset_rght += (alt[r] != '-');
    }

    // Count matches
    for(i = l; i <= r; i++) matches += (ref[i] == alt[i]);

    if(matches < (int)kmer_size / 2)
    {
      // flank doesn't map well
      num_flank3p_not_found++;
      return false;
    }

    num_flank3p_approx_match++;

    *cpy_flnk_3p += fw_strand ? alt_offset_left : alt_offset_rght;

    if(fw_strand) {
      *start = bamentry->core.pos + cigar2rlen;
      *end   = search_region + ref_offset_left - chrom->seq.b;
    } else {
      *start = (search_region + search_len - ref_offset_rght) - chrom->seq.b;
      *end   = bamentry->core.pos;
    }

    return true;
  }
}
Esempio n. 5
0
static int make_pair_key(key_data_t *key, bam1_t *bam) {
    int32_t this_ref, this_coord, this_end;
    int32_t other_ref, other_coord, other_end;
    int32_t orientation, leftmost;
    uint8_t *data;
    char *cig;

    this_ref    = bam->core.tid + 1; // avoid a 0 being put into the hash
    other_ref   = bam->core.mtid + 1;

    this_coord = unclipped_start(bam);
    this_end   = unclipped_end(bam);

    if ((data = bam_aux_get(bam, "MC"))) {
        cig = bam_aux2Z(data);
        other_end   = unclipped_other_end(bam->core.mpos, cig);
        other_coord = unclipped_other_start(bam->core.mpos, cig);
    } else {
        fprintf(stderr, "[markdup] error: no MC tag.\n");
        return 1;
    }

    // work out orientations
    if (this_ref != other_ref) {
        leftmost = this_ref < other_ref;
    } else {
        if (bam_is_rev(bam) == bam_is_mrev(bam)) {
            if (!bam_is_rev(bam)) {
                leftmost = this_coord <= other_coord;
            } else {
                leftmost = this_end <= other_end;
            }
        } else {
            if (bam_is_rev(bam)) {
                leftmost = this_end <= other_coord;
            } else {
                leftmost = this_coord <= other_end;
            }
        }
    }

    // pair orientation
    if (leftmost) {
        if (bam_is_rev(bam) == bam_is_mrev(bam)) {
            other_coord = other_end;

            if (!bam_is_rev(bam)) {
                if (bam->core.flag & BAM_FREAD1) {
                    orientation = O_FF;
                } else {
                    orientation = O_RR;
                }
            } else {
                if (bam->core.flag & BAM_FREAD1) {
                    orientation = O_RR;
                } else {
                    orientation = O_FF;
                }
            }
        } else {
            if (!bam_is_rev(bam)) {
                orientation = O_FR;
                other_coord = other_end;
            } else {
                orientation = O_RF;
                this_coord = this_end;
            }
        }
    } else {
        if (bam_is_rev(bam) == bam_is_mrev(bam)) {
            this_coord = this_end;

            if (!bam_is_rev(bam)) {
                if (bam->core.flag & BAM_FREAD1) {
                    orientation = O_RR;
                } else {
                    orientation = O_FF;
                }
            } else {
                if (bam->core.flag & BAM_FREAD1) {
                    orientation = O_FF;
                } else {
                    orientation = O_RR;
                }
            }
        } else {
            if (!bam_is_rev(bam)) {
                orientation = O_RF;
                other_coord = other_end;
            } else {
                orientation = O_FR;
                this_coord = this_end;
            }
        }
    }

    if (!leftmost)
        leftmost = 13;
    else
        leftmost = 11;

    key->single        = 0;
    key->this_ref      = this_ref;
    key->this_coord    = this_coord;
    key->other_ref     = other_ref;
    key->other_coord   = other_coord;
    key->leftmost      = leftmost;
    key->orientation   = orientation;

    return 0;
}
Esempio n. 6
0
static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual)
{
	bwa_seq_t *seqs, *p;
	int n_seqs, l, i;
	long n_trimmed = 0, n_tot = 0;
	bam1_t *b;
	int res;

	b = bam_init1();
	n_seqs = 0;
	seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
#ifdef USE_HTSLIB
	while ((res = sam_read1(bs->fp, bs->h, b)) >= 0) {
#else
	while ((res = bam_read1(bs->fp, b)) >= 0) {
#endif
		uint8_t *s, *q;
		int go = 0;
		if ((bs->which & 1) && (b->core.flag & BAM_FREAD1)) go = 1;
		if ((bs->which & 2) && (b->core.flag & BAM_FREAD2)) go = 1;
		if ((bs->which & 4) && !(b->core.flag& BAM_FREAD1) && !(b->core.flag& BAM_FREAD2))go = 1;
		if (go == 0) continue;
		l = b->core.l_qseq;
		p = &seqs[n_seqs++];
		p->tid = -1; // no assigned to a thread
		p->qual = 0;
		p->full_len = p->clip_len = p->len = l;
		n_tot += p->full_len;
#ifdef USE_HTSLIB
		s = bam_get_seq(b); q = bam_get_qual(b);
#else
		s = bam1_seq(b); q = bam1_qual(b);
#endif
		p->seq = (ubyte_t*)calloc(p->len + 1, 1);
		p->qual = (ubyte_t*)calloc(p->len + 1, 1);
		for (i = 0; i != p->full_len; ++i) {
#ifdef USE_HTSLIB
			p->seq[i] = bam_nt16_nt4_table[(int)bam_seqi(s, i)];
#else
			p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)];
#endif
			p->qual[i] = q[i] + 33 < 126? q[i] + 33 : 126;
		}
#ifdef USE_HTSLIB
		if (bam_is_rev(b)) { // then reverse 
#else
		if (bam1_strand(b)) { // then reverse 
#endif
			seq_reverse(p->len, p->seq, 1);
			seq_reverse(p->len, p->qual, 0);
		}
		if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
		p->rseq = (ubyte_t*)calloc(p->full_len, 1);
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);
#ifdef USE_HTSLIB
		p->name = strdup((const char*)bam_get_qname(b));
#else
		p->name = strdup((const char*)bam1_qname(b));
#endif
		if (n_seqs == n_needed) break;
	}
	if (res < 0 && res != -1) err_fatal_simple("Error reading bam file");
	*n = n_seqs;
	if (n_seqs && trim_qual >= 1)
		fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
	if (n_seqs == 0) {
		free(seqs);
		bam_destroy1(b);
		return 0;
	}
	bam_destroy1(b);
	return seqs;
}

#define BARCODE_LOW_QUAL 13

bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int trim_qual)
{
	bwa_seq_t *seqs, *p;
	kseq_t *seq = bs->ks;
	int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24;
	long n_trimmed = 0, n_tot = 0;

	if (l_bc > BWA_MAX_BCLEN) {
		fprintf(stderr, "[%s] the maximum barcode length is %d.\n", __func__, BWA_MAX_BCLEN);
		return 0;
	}
	if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input
	n_seqs = 0;
	seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
	while ((l = kseq_read(seq)) >= 0) {
		if ((mode & BWA_MODE_CFY) && (seq->comment.l != 0)) {
			// skip reads that are marked to be filtered by Casava
			char *s = index(seq->comment.s, ':');
			if (s && *(++s) == 'Y') {
				continue;
			}
		}
		if (is_64 && seq->qual.l)
			for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31;
		if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length
		p = &seqs[n_seqs++];
		if (l_bc) { // then trim barcode
			for (i = 0; i < l_bc; ++i)
				p->bc[i] = (seq->qual.l && seq->qual.s[i]-33 < BARCODE_LOW_QUAL)? tolower(seq->seq.s[i]) : toupper(seq->seq.s[i]);
			p->bc[i] = 0;
			for (; i < seq->seq.l; ++i)
				seq->seq.s[i - l_bc] = seq->seq.s[i];
			seq->seq.l -= l_bc; seq->seq.s[seq->seq.l] = 0;
			if (seq->qual.l) {
				for (i = l_bc; i < seq->qual.l; ++i)
					seq->qual.s[i - l_bc] = seq->qual.s[i];
				seq->qual.l -= l_bc; seq->qual.s[seq->qual.l] = 0;
			}
			l = seq->seq.l;
		} else p->bc[0] = 0;
		p->tid = -1; // no assigned to a thread
		p->qual = 0;
		p->full_len = p->clip_len = p->len = l;
		n_tot += p->full_len;
		p->seq = (ubyte_t*)calloc(p->full_len, 1);
		for (i = 0; i != p->full_len; ++i)
			p->seq[i] = nst_nt4_table[(int)seq->seq.s[i]];
		if (seq->qual.l) { // copy quality
			p->qual = (ubyte_t*)strdup((char*)seq->qual.s);
			if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
		}
		p->rseq = (ubyte_t*)calloc(p->full_len, 1);
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);
		p->name = strdup((const char*)seq->name.s);
		{ // trim /[12]$
			int t = strlen(p->name);
			if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0';
		}
		if (n_seqs == n_needed) break;
	}
	*n = n_seqs;
	if (n_seqs && trim_qual >= 1)
		fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
	if (n_seqs == 0) {
		free(seqs);
		return 0;
	}
	return seqs;
}

void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs)
{
	int i, j;
	for (i = 0; i != n_seqs; ++i) {
		bwa_seq_t *p = seqs + i;
		for (j = 0; j < p->n_multi; ++j)
			if (p->multi[j].cigar) free(p->multi[j].cigar);
		free(p->name);
		free(p->seq); free(p->rseq); free(p->qual); free(p->aln); free(p->md); free(p->multi);
		free(p->cigar);
	}
	free(seqs);
}
Esempio n. 7
0
int main_bam2fq(int argc, char *argv[])
{
    BGZF *fp, *fpse = 0;
    bam1_t *b;
    uint8_t *buf;
    int max_buf, c, has12 = 0;
    kstring_t str;
    int64_t n_singletons = 0, n_reads = 0;
    char last[512], *fnse = 0;

    while ((c = getopt(argc, argv, "as:")) > 0)
        if (c == 'a') has12 = 1;
        else if (c == 's') fnse = optarg;
    if (argc == optind) {
        fprintf(stderr, "\nUsage:   bam2fq [-a] [-s outSE] <in.bam>\n\n");
        fprintf(stderr, "Options: -a        append /1 and /2 to the read name\n");
        fprintf(stderr, "         -s FILE   write singleton reads to FILE [assume single-end]\n");
        fprintf(stderr, "\n");
        return 1;
    }
    fp = strcmp(argv[optind], "-")? bgzf_open(argv[optind], "r") : bgzf_dopen(fileno(stdin), "r");
    assert(fp);
    bam_hdr_destroy(bam_hdr_read(fp));
    buf = 0;
    max_buf = 0;
    str.l = str.m = 0;
    str.s = 0;
    last[0] = 0;
    if (fnse) fpse = bgzf_open(fnse, "w1");

    b = bam_init1();
    while (bam_read1(fp, b) >= 0) {
        int i, qlen = b->core.l_qseq, is_print = 0;
        uint8_t *qual, *seq;
        if (b->flag&BAM_FSECONDARY) continue; // skip secondary alignments
        ++n_reads;
        if (fpse) {
            if (str.l && strcmp(last, bam_get_qname(b))) {
                bgzf_write(fpse, str.s, str.l);
                str.l = 0;
                ++n_singletons;
            }
            if (str.l) is_print = 1;
            strcpy(last, bam_get_qname(b));
        } else is_print = 1;
        qual = bam_get_qual(b);
        kputc(qual[0] == 0xff? '>' : '@', &str);
        kputsn(bam_get_qname(b), b->core.l_qname - 1, &str);
        if (has12) {
            kputc('/', &str);
            kputw(b->core.flag>>6&3, &str);
        }
        kputc('\n', &str);
        if (max_buf < qlen + 1) {
            max_buf = qlen + 1;
            kroundup32(max_buf);
            buf = (uint8_t*)realloc(buf, max_buf);
        }
        buf[qlen] = 0;
        seq = bam_get_seq(b);
        for (i = 0; i < qlen; ++i) buf[i] = bam_seqi(seq, i); // copy the sequence
        if (bam_is_rev(b)) { // reverse complement
            for (i = 0; i < qlen>>1; ++i) {
                int8_t t = seq_comp_table[buf[qlen - 1 - i]];
                buf[qlen - 1 - i] = seq_comp_table[buf[i]];
                buf[i] = t;
            }
            if (qlen&1) buf[i] = seq_comp_table[buf[i]];
        }
        for (i = 0; i < qlen; ++i) buf[i] = seq_nt16_str[buf[i]];
        kputsn((char*)buf, qlen, &str);
        kputc('\n', &str);
        if (qual[0] != 0xff) {
            kputsn("+\n", 2, &str);
            for (i = 0; i < qlen; ++i) buf[i] = 33 + qual[i];
            if (bam_is_rev(b)) { // reverse
                for (i = 0; i < qlen>>1; ++i) {
                    uint8_t t = buf[qlen - 1 - i];
                    buf[qlen - 1 - i] = buf[i];
                    buf[i] = t;
                }
            }
        }
        kputsn((char*)buf, qlen, &str);
        kputc('\n', &str);
        if (is_print) {
            fwrite(str.s, 1, str.l, stdout);
            str.l = 0;
        }
    }
    if (fpse) {
        if (str.l) {
            bgzf_write(fpse, str.s, str.l);
            ++n_singletons;
        }
        fprintf(stderr, "[M::%s] discarded %lld singletons\n", __func__, (long long)n_singletons);
        bgzf_close(fpse);
    }
    fprintf(stderr, "[M::%s] processed %lld reads\n", __func__, (long long)n_reads);
    free(buf);
    free(str.s);
    bam_destroy1(b);
    bgzf_close(fp);
    return 0;
}
Esempio n. 8
0
/*!
  @abstract    Merge multiple sorted BAM.
  @param  is_by_qname whether to sort by query name
  @param  out         output BAM file name
  @param  mode        sam_open() mode to be used to create the final output file
                      (overrides level settings from UNCOMP and LEVEL1 flags)
  @param  headers     name of SAM file from which to copy '@' header lines,
                      or NULL to copy them from the first file to be merged
  @param  n           number of files to be merged
  @param  fn          names of files to be merged
  @param  flag        flags that control how the merge is undertaken
  @param  reg         region to merge
  @param  n_threads   number of threads to use (passed to htslib)
  @discussion Padding information may NOT correctly maintained. This
  function is NOT thread safe.
 */
int bam_merge_core2(int by_qname, const char *out, const char *mode, const char *headers, int n, char * const *fn, int flag, const char *reg, int n_threads)
{
    samFile *fpout, **fp;
    heap1_t *heap;
    bam_hdr_t *hout = NULL;
    int i, j, *RG_len = NULL;
    uint64_t idx = 0;
    char **RG = NULL;
    hts_itr_t **iter = NULL;
    bam_hdr_t **hdr = NULL;
    trans_tbl_t *translation_tbl = NULL;

    // Is there a specified pre-prepared header to use for output?
    if (headers) {
        samFile* fpheaders = sam_open(headers, "r");
        if (fpheaders == NULL) {
            const char *message = strerror(errno);
            fprintf(pysamerr, "[bam_merge_core] cannot open '%s': %s\n", headers, message);
            return -1;
        }
        hout = sam_hdr_read(fpheaders);
        sam_close(fpheaders);
    }

    g_is_by_qname = by_qname;
    fp = (samFile**)calloc(n, sizeof(samFile*));
    heap = (heap1_t*)calloc(n, sizeof(heap1_t));
    iter = (hts_itr_t**)calloc(n, sizeof(hts_itr_t*));
    hdr = (bam_hdr_t**)calloc(n, sizeof(bam_hdr_t*));
    translation_tbl = (trans_tbl_t*)calloc(n, sizeof(trans_tbl_t));
    // prepare RG tag from file names
    if (flag & MERGE_RG) {
        RG = (char**)calloc(n, sizeof(char*));
        RG_len = (int*)calloc(n, sizeof(int));
        for (i = 0; i != n; ++i) {
            int l = strlen(fn[i]);
            const char *s = fn[i];
            if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4;
            for (j = l - 1; j >= 0; --j) if (s[j] == '/') break;
            ++j; l -= j;
            RG[i] = (char*)calloc(l + 1, 1);
            RG_len[i] = l;
            strncpy(RG[i], s + j, l);
        }
    }
    // open and read the header from each file
    for (i = 0; i < n; ++i) {
        bam_hdr_t *hin;
        fp[i] = sam_open(fn[i], "r");
        if (fp[i] == NULL) {
            int j;
            fprintf(pysamerr, "[bam_merge_core] fail to open file %s\n", fn[i]);
            for (j = 0; j < i; ++j) sam_close(fp[j]);
            free(fp); free(heap);
            // FIXME: possible memory leak
            return -1;
        }
        hin = sam_hdr_read(fp[i]);
        if (hout)
            trans_tbl_init(hout, hin, translation_tbl+i, flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG);
        else {
            // As yet, no headers to merge into...
            hout = bam_hdr_dup(hin);
            // ...so no need to translate header into itself
            trans_tbl_init(hout, hin, translation_tbl+i, true, true);
        }

        // TODO sam_itr_next() doesn't yet work for SAM files,
        // so for those keep the headers around for use with sam_read1()
        if (hts_get_format(fp[i])->format == sam) hdr[i] = hin;
        else { bam_hdr_destroy(hin); hdr[i] = NULL; }

        if ((translation_tbl+i)->lost_coord_sort && !by_qname) {
            fprintf(pysamerr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]);
        }
    }

    // Transform the header into standard form
    pretty_header(&hout->text,hout->l_text);

    // If we're only merging a specified region move our iters to start at that point
    if (reg) {
        int* rtrans = rtrans_build(n, hout->n_targets, translation_tbl);

        int tid, beg, end;
        const char *name_lim = hts_parse_reg(reg, &beg, &end);
        char *name = malloc(name_lim - reg + 1);
        memcpy(name, reg, name_lim - reg);
        name[name_lim - reg] = '\0';
        tid = bam_name2id(hout, name);
        free(name);
        if (tid < 0) {
            fprintf(pysamerr, "[%s] Malformated region string or undefined reference name\n", __func__);
            return -1;
        }
        for (i = 0; i < n; ++i) {
            hts_idx_t *idx = sam_index_load(fp[i], fn[i]);
            // (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space
            int mapped_tid = rtrans[i*hout->n_targets+tid];
            if (mapped_tid != INT32_MIN) {
                iter[i] = sam_itr_queryi(idx, mapped_tid, beg, end);
            } else {
                iter[i] = sam_itr_queryi(idx, HTS_IDX_NONE, 0, 0);
            }
            hts_idx_destroy(idx);
            if (iter[i] == NULL) break;
        }
        free(rtrans);
    } else {
        for (i = 0; i < n; ++i) {
            if (hdr[i] == NULL) {
                iter[i] = sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0);
                if (iter[i] == NULL) break;
            }
            else iter[i] = NULL;
        }
    }

    if (i < n) {
        fprintf(pysamerr, "[%s] Memory allocation failed\n", __func__);
        return -1;
    }

    // Load the first read from each file into the heap
    for (i = 0; i < n; ++i) {
        heap1_t *h = heap + i;
        h->i = i;
        h->b = bam_init1();
        if ((iter[i]? sam_itr_next(fp[i], iter[i], h->b) : sam_read1(fp[i], hdr[i], h->b)) >= 0) {
            bam_translate(h->b, translation_tbl + i);
            h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam_is_rev(h->b);
            h->idx = idx++;
        }
        else {
            h->pos = HEAP_EMPTY;
            bam_destroy1(h->b);
            h->b = NULL;
        }
    }

    // Open output file and write header
    if ((fpout = sam_open(out, mode)) == 0) {
        fprintf(pysamerr, "[%s] fail to create the output file.\n", __func__);
        return -1;
    }
    sam_hdr_write(fpout, hout);
    if (!(flag & MERGE_UNCOMP)) hts_set_threads(fpout, n_threads);

    // Begin the actual merge
    ks_heapmake(heap, n, heap);
    while (heap->pos != HEAP_EMPTY) {
        bam1_t *b = heap->b;
        if (flag & MERGE_RG) {
            uint8_t *rg = bam_aux_get(b, "RG");
            if (rg) bam_aux_del(b, rg);
            bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]);
        }
        sam_write1(fpout, hout, b);
        if ((j = (iter[heap->i]? sam_itr_next(fp[heap->i], iter[heap->i], b) : sam_read1(fp[heap->i], hdr[heap->i], b))) >= 0) {
            bam_translate(b, translation_tbl + heap->i);
            heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam_is_rev(b);
            heap->idx = idx++;
        } else if (j == -1) {
            heap->pos = HEAP_EMPTY;
            bam_destroy1(heap->b);
            heap->b = NULL;
        } else fprintf(pysamerr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]);
        ks_heapadjust(heap, 0, n, heap);
    }

    // Clean up and close
    if (flag & MERGE_RG) {
        for (i = 0; i != n; ++i) free(RG[i]);
        free(RG); free(RG_len);
    }
    for (i = 0; i < n; ++i) {
        trans_tbl_destroy(translation_tbl + i);
        hts_itr_destroy(iter[i]);
        bam_hdr_destroy(hdr[i]);
        sam_close(fp[i]);
    }
    bam_hdr_destroy(hout);
    sam_close(fpout);
    free(translation_tbl); free(fp); free(heap); free(iter); free(hdr);
    return 0;
}
Esempio n. 9
0
int main(int argc, char **argv)
{
    if (argc < 4)
        errx(1,
             "usage\t:%s <bam> <split out> <discord out> (optional #threads)",
             argv[0]);

    char *bam_file_name = argv[1];
    char *split_file_name = argv[2];
    char *disc_file_name = argv[3];
    int threads = 2;
    if (argc == 5) {
        threads = atoi(argv[4]);
    }

    samFile *disc = sam_open(disc_file_name, "wb");

    samFile *split = sam_open(split_file_name, "wb");

    samFile *in = sam_open(bam_file_name, "rb");
    if(in == NULL)
        errx(1, "Unable to open BAM/SAM file.");

    // TODO: handle cram.
    if (threads > 1) {
        bgzf_mt(in->fp.bgzf, threads, 256);
    }

    hts_idx_t *idx = sam_index_load(in, bam_file_name);
    if(idx == NULL)
        errx(1,"Unable to open BAM/SAM index.");

    bam_hdr_t *hdr = sam_hdr_read(in);

    int r = sam_hdr_write(disc, hdr);
    r = sam_hdr_write(split, hdr);

    bam1_t *aln = bam_init1();
    int ret;

    while(ret = sam_read1(in, hdr, aln) >= 0) {
        if (((aln->core.flag) & 1294) == 0)
            r = sam_write1(disc, hdr, aln);

        uint8_t *sa = bam_aux_get(aln, "SA");

        if (sa != 0) {
            char *sa_tag = strdup(bam_aux2Z(sa));
            if ( count_tags(sa_tag) == 1) {
                char *chrm, strand, *cigar;
                uint32_t pos;
                split_sa_tag(sa_tag,
                             &chrm,
                             &pos,
                             &strand,
                             &cigar);

                struct line sa, al;

                calcOffsets(cigar,
                            pos,
                            strand,
                            &sa);
                sa.chrm = chrm;
                sa.strand = strand;


                calcAlnOffsets(bam_get_cigar(aln),
                               aln->core.n_cigar,
                               aln->core.pos,
                               bam_is_rev(aln) ? '-' : '+',
                               &al);
                al.chrm = hdr->target_name[aln->core.tid];
                al.strand = bam_is_rev(aln) ? '-' : '+';

                struct line *left = &al, *right = &sa;

                if (left->SQO > right->SQO) {
                    left = &sa;
                    right = &al;
                }

                int overlap = MAX(1 + MIN(left->EQO, right->EQO) - 
                        MAX(left->SQO, right->SQO), 0);
                int alen1 = 1 + left->EQO - left->SQO;
                int alen2 = 1 + right->EQO - right->SQO;
                int mno = MIN(alen1-overlap, alen2-overlap);
                if (mno < MIN_NON_OVERLAP) 
                    continue;

                if ( (strcmp(left->chrm, right->chrm) == 0) &&
                     (left->strand == right->strand) ) {

                    int leftDiag, rightDiag, insSize;
                    if (left->strand == '-') {
                        leftDiag = left->rapos - left->sclip;
                        rightDiag = (right->rapos + right->raLen) - 
                                (right->sclip + right->qaLen);
                        insSize = rightDiag - leftDiag;
                    } else {
                        leftDiag = (left->rapos + left->raLen) - 
                                (left->sclip + left->qaLen);
                        rightDiag = right->rapos - right->sclip;
                        insSize = leftDiag - rightDiag;
                    }
                    int desert = right->SQO - left->EQO - 1;
                    if ((abs(insSize) < MIN_INDEL_SIZE) || 
                        ((desert > 0) && (
                            (desert - (int)MAX(0, insSize)) >
                            MAX_UNMAPPED_BASES)))
                        continue;
                }

                char *qname =  bam_get_qname(aln);
                if ((aln->core.flag & 64) == 64)
                    qname[0] = 'A'; 
                else
                    qname[0] = 'B'; 

                r = sam_write1(split, hdr, aln);
            }
            free(sa_tag);
        }
    }

    bam_destroy1(aln);
    hts_idx_destroy(idx);
    bam_hdr_destroy(hdr);
    sam_close(in);
    sam_close(disc);
    sam_close(split);
    if(ret < -1) {
        errx(1, "lumpy_filter: error reading bam: %s\n", bam_file_name);
    }
}