// Function to compare reads and determine which one is < the other static inline int bam1_lt(const bam1_p a, const bam1_p b) { if (g_is_by_qname) { int t = strnum_cmp(bam_get_qname(a), bam_get_qname(b)); return (t < 0 || (t == 0 && (a->core.flag&0xc0) < (b->core.flag&0xc0))); } else return (((uint64_t)a->core.tid<<32|(a->core.pos+1)<<1|bam_is_rev(a)) < ((uint64_t)b->core.tid<<32|(b->core.pos+1)<<1|bam_is_rev(b))); }
static inline void pileup_seq(FILE *fp, const bam_pileup1_t *p, int pos, int ref_len, const char *ref) { int j; if (p->is_head) { putc('^', fp); putc(p->b->core.qual > 93? 126 : p->b->core.qual + 33, fp); } if (!p->is_del) { int c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos)]; if (ref) { int rb = pos < ref_len? ref[pos] : 'N'; if (c == '=' || seq_nt16_table[c] == seq_nt16_table[rb]) c = bam_is_rev(p->b)? ',' : '.'; else c = bam_is_rev(p->b)? tolower(c) : toupper(c); } else { if (c == '=') c = bam_is_rev(p->b)? ',' : '.'; else c = bam_is_rev(p->b)? tolower(c) : toupper(c); } putc(c, fp); } else putc(p->is_refskip? (bam_is_rev(p->b)? '<' : '>') : '*', fp); if (p->indel > 0) { putc('+', fp); printw(p->indel, fp); for (j = 1; j <= p->indel; ++j) { int c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos + j)]; putc(bam_is_rev(p->b)? tolower(c) : toupper(c), fp); } } else if (p->indel < 0) { printw(p->indel, fp); for (j = 1; j <= -p->indel; ++j) { int c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N'; putc(bam_is_rev(p->b)? tolower(c) : toupper(c), fp); } } if (p->is_tail) putc('$', fp); }
static void make_single_key(key_data_t *key, bam1_t *bam) { int32_t this_ref, this_coord; int32_t orientation; this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash if (bam_is_rev(bam)) { this_coord = unclipped_end(bam); orientation = O_RR; } else { this_coord = unclipped_start(bam); orientation = O_FF; } key->single = 1; key->this_ref = this_ref; key->this_coord = this_coord; key->orientation = orientation; }
static bool sam_fetch_coords(const CallFileEntry *centry, const char *flank5p, size_t flank5p_len, const char *flank3p, size_t flank3p_len, size_t *cpy_flnk_5p, size_t *cpy_flnk_3p, const read_t **chrom_ptr, size_t *start, size_t *end, bool *fw_strand_ptr) { // Get the next primary alignment do { if(sam_read1(samfh, bam_header, bamentry) < 0) die("We've run out of SAM entries!"); } while(bamentry->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY)); if(bamentry->core.flag & BAM_FUNMAP) { num_flank5p_unmapped++; return false; } if(bamentry->core.qual < min_mapq) { num_flank5p_lowqual++; return false; } bool fw_strand = !bam_is_rev(bamentry); *fw_strand_ptr = fw_strand; const char *chrom_name = bam_header->target_name[bamentry->core.tid]; const read_t *chrom = seq_fetch_chrom(genome, chrom_name); *chrom_ptr = chrom; const uint32_t *cigar = bam_get_cigar(bamentry); int cigar2rlen = bam_cigar2rlen(bamentry->core.n_cigar, cigar); // cpy_flnk_5p is soft clipped at right end of flank // Eat up hard masked (H), soft masked (S) and inserted bases (relative to ref) (I) *cpy_flnk_5p = bam_get_end_padding(bamentry->core.n_cigar, cigar); *cpy_flnk_3p = 0; // set this later // Get bam query name char *bname = bam_get_qname(bamentry); // Check entry/flank names match const char *hdrline = call_file_get_line(centry, 0); if(hdrline[0] != '>') die("Unexpected line: %s", hdrline); hdrline++; const char *hdrline_end = str_fasta_name_end(hdrline); int hdrline_len = hdrline_end - hdrline; if(strncmp(hdrline, bname, hdrline_len) != 0) die("SAM/BAM and call entries mismatch '%s' vs '%s'", hdrline, bname); // Find 3p flank position using search for first kmer char endkmer[200]; ctx_assert(kmer_size+1 <= sizeof(endkmer)); ctx_assert(flank3p_len >= kmer_size || call_file_min_allele_len(centry) == 0); bubble_get_end_kmer(flank5p, flank5p_len, flank3p, flank3p_len, kmer_size, endkmer); if(!fw_strand) dna_revcomp_str(endkmer, endkmer, kmer_size); // Determine search space // Choose a region of the ref to search for the end flank // end is index after last char long search_start, search_end; size_t longest_allele = call_file_max_allele_len(centry); if(fw_strand) { search_start = (long)bamentry->core.pos + cigar2rlen - kmer_size*2; search_end = (long)bamentry->core.pos + cigar2rlen + longest_allele + kmer_size*2 + 10; } else { search_start = (long)bamentry->core.pos - (longest_allele + kmer_size*2 + 10); search_end = (long)bamentry->core.pos + kmer_size*2; } search_start = MAX2(search_start, 0); search_end = MIN2(search_end, (long)chrom->seq.end); const char *search_region = chrom->seq.b + search_start; size_t search_len = (size_t)(search_end - search_start); // Now do search with kmer // Attempt to find perfect match for kmer within search region // Search, if there is more than one match -> abandon const char *kmer_match = ctx_strnstr(search_region, endkmer, search_len); if(kmer_match != NULL) { // Check for multiple hits size_t rem_search_len = search_region+search_len-kmer_match; if(ctx_strnstr(kmer_match+1, endkmer, rem_search_len-1) != NULL) { num_flank3p_multihits++; return false; } if(fw_strand) { *start = bamentry->core.pos + cigar2rlen; *end = kmer_match - chrom->seq.b; } else { *start = kmer_match + kmer_size - chrom->seq.b; *end = bamentry->core.pos; } num_flank3p_exact_match++; return true; } else { // Look for approximate match needleman_wunsch_align2(search_region, endkmer, search_len, kmer_size, &nw_scoring_flank, nw_aligner, aln); num_nw_flank++; const char *ref = aln->result_a, *alt = aln->result_b; // --aa--dd-cge // bb--ccd-ecge // Find positions of first and last match int i, l, r, matches = 0; int ref_offset_left = 0, ref_offset_rght = 0; int alt_offset_left = 0, alt_offset_rght = 0; for(l = 0; l < (int)aln->length && ref[l] != alt[l]; l++) { ref_offset_left += (ref[l] != '-'); alt_offset_left += (alt[l] != '-'); } for(r = aln->length-1; r > 0 && ref[r] != alt[r]; r--) { ref_offset_rght += (ref[r] != '-'); alt_offset_rght += (alt[r] != '-'); } // Count matches for(i = l; i <= r; i++) matches += (ref[i] == alt[i]); if(matches < (int)kmer_size / 2) { // flank doesn't map well num_flank3p_not_found++; return false; } num_flank3p_approx_match++; *cpy_flnk_3p += fw_strand ? alt_offset_left : alt_offset_rght; if(fw_strand) { *start = bamentry->core.pos + cigar2rlen; *end = search_region + ref_offset_left - chrom->seq.b; } else { *start = (search_region + search_len - ref_offset_rght) - chrom->seq.b; *end = bamentry->core.pos; } return true; } }
static int make_pair_key(key_data_t *key, bam1_t *bam) { int32_t this_ref, this_coord, this_end; int32_t other_ref, other_coord, other_end; int32_t orientation, leftmost; uint8_t *data; char *cig; this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash other_ref = bam->core.mtid + 1; this_coord = unclipped_start(bam); this_end = unclipped_end(bam); if ((data = bam_aux_get(bam, "MC"))) { cig = bam_aux2Z(data); other_end = unclipped_other_end(bam->core.mpos, cig); other_coord = unclipped_other_start(bam->core.mpos, cig); } else { fprintf(stderr, "[markdup] error: no MC tag.\n"); return 1; } // work out orientations if (this_ref != other_ref) { leftmost = this_ref < other_ref; } else { if (bam_is_rev(bam) == bam_is_mrev(bam)) { if (!bam_is_rev(bam)) { leftmost = this_coord <= other_coord; } else { leftmost = this_end <= other_end; } } else { if (bam_is_rev(bam)) { leftmost = this_end <= other_coord; } else { leftmost = this_coord <= other_end; } } } // pair orientation if (leftmost) { if (bam_is_rev(bam) == bam_is_mrev(bam)) { other_coord = other_end; if (!bam_is_rev(bam)) { if (bam->core.flag & BAM_FREAD1) { orientation = O_FF; } else { orientation = O_RR; } } else { if (bam->core.flag & BAM_FREAD1) { orientation = O_RR; } else { orientation = O_FF; } } } else { if (!bam_is_rev(bam)) { orientation = O_FR; other_coord = other_end; } else { orientation = O_RF; this_coord = this_end; } } } else { if (bam_is_rev(bam) == bam_is_mrev(bam)) { this_coord = this_end; if (!bam_is_rev(bam)) { if (bam->core.flag & BAM_FREAD1) { orientation = O_RR; } else { orientation = O_FF; } } else { if (bam->core.flag & BAM_FREAD1) { orientation = O_FF; } else { orientation = O_RR; } } } else { if (!bam_is_rev(bam)) { orientation = O_RF; other_coord = other_end; } else { orientation = O_FR; this_coord = this_end; } } } if (!leftmost) leftmost = 13; else leftmost = 11; key->single = 0; key->this_ref = this_ref; key->this_coord = this_coord; key->other_ref = other_ref; key->other_coord = other_coord; key->leftmost = leftmost; key->orientation = orientation; return 0; }
static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual) { bwa_seq_t *seqs, *p; int n_seqs, l, i; long n_trimmed = 0, n_tot = 0; bam1_t *b; int res; b = bam_init1(); n_seqs = 0; seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); #ifdef USE_HTSLIB while ((res = sam_read1(bs->fp, bs->h, b)) >= 0) { #else while ((res = bam_read1(bs->fp, b)) >= 0) { #endif uint8_t *s, *q; int go = 0; if ((bs->which & 1) && (b->core.flag & BAM_FREAD1)) go = 1; if ((bs->which & 2) && (b->core.flag & BAM_FREAD2)) go = 1; if ((bs->which & 4) && !(b->core.flag& BAM_FREAD1) && !(b->core.flag& BAM_FREAD2))go = 1; if (go == 0) continue; l = b->core.l_qseq; p = &seqs[n_seqs++]; p->tid = -1; // no assigned to a thread p->qual = 0; p->full_len = p->clip_len = p->len = l; n_tot += p->full_len; #ifdef USE_HTSLIB s = bam_get_seq(b); q = bam_get_qual(b); #else s = bam1_seq(b); q = bam1_qual(b); #endif p->seq = (ubyte_t*)calloc(p->len + 1, 1); p->qual = (ubyte_t*)calloc(p->len + 1, 1); for (i = 0; i != p->full_len; ++i) { #ifdef USE_HTSLIB p->seq[i] = bam_nt16_nt4_table[(int)bam_seqi(s, i)]; #else p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)]; #endif p->qual[i] = q[i] + 33 < 126? q[i] + 33 : 126; } #ifdef USE_HTSLIB if (bam_is_rev(b)) { // then reverse #else if (bam1_strand(b)) { // then reverse #endif seq_reverse(p->len, p->seq, 1); seq_reverse(p->len, p->qual, 0); } if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p); p->rseq = (ubyte_t*)calloc(p->full_len, 1); memcpy(p->rseq, p->seq, p->len); seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped() seq_reverse(p->len, p->rseq, is_comp); #ifdef USE_HTSLIB p->name = strdup((const char*)bam_get_qname(b)); #else p->name = strdup((const char*)bam1_qname(b)); #endif if (n_seqs == n_needed) break; } if (res < 0 && res != -1) err_fatal_simple("Error reading bam file"); *n = n_seqs; if (n_seqs && trim_qual >= 1) fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot); if (n_seqs == 0) { free(seqs); bam_destroy1(b); return 0; } bam_destroy1(b); return seqs; } #define BARCODE_LOW_QUAL 13 bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int trim_qual) { bwa_seq_t *seqs, *p; kseq_t *seq = bs->ks; int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24; long n_trimmed = 0, n_tot = 0; if (l_bc > BWA_MAX_BCLEN) { fprintf(stderr, "[%s] the maximum barcode length is %d.\n", __func__, BWA_MAX_BCLEN); return 0; } if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input n_seqs = 0; seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); while ((l = kseq_read(seq)) >= 0) { if ((mode & BWA_MODE_CFY) && (seq->comment.l != 0)) { // skip reads that are marked to be filtered by Casava char *s = index(seq->comment.s, ':'); if (s && *(++s) == 'Y') { continue; } } if (is_64 && seq->qual.l) for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31; if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length p = &seqs[n_seqs++]; if (l_bc) { // then trim barcode for (i = 0; i < l_bc; ++i) p->bc[i] = (seq->qual.l && seq->qual.s[i]-33 < BARCODE_LOW_QUAL)? tolower(seq->seq.s[i]) : toupper(seq->seq.s[i]); p->bc[i] = 0; for (; i < seq->seq.l; ++i) seq->seq.s[i - l_bc] = seq->seq.s[i]; seq->seq.l -= l_bc; seq->seq.s[seq->seq.l] = 0; if (seq->qual.l) { for (i = l_bc; i < seq->qual.l; ++i) seq->qual.s[i - l_bc] = seq->qual.s[i]; seq->qual.l -= l_bc; seq->qual.s[seq->qual.l] = 0; } l = seq->seq.l; } else p->bc[0] = 0; p->tid = -1; // no assigned to a thread p->qual = 0; p->full_len = p->clip_len = p->len = l; n_tot += p->full_len; p->seq = (ubyte_t*)calloc(p->full_len, 1); for (i = 0; i != p->full_len; ++i) p->seq[i] = nst_nt4_table[(int)seq->seq.s[i]]; if (seq->qual.l) { // copy quality p->qual = (ubyte_t*)strdup((char*)seq->qual.s); if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p); } p->rseq = (ubyte_t*)calloc(p->full_len, 1); memcpy(p->rseq, p->seq, p->len); seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped() seq_reverse(p->len, p->rseq, is_comp); p->name = strdup((const char*)seq->name.s); { // trim /[12]$ int t = strlen(p->name); if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0'; } if (n_seqs == n_needed) break; } *n = n_seqs; if (n_seqs && trim_qual >= 1) fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot); if (n_seqs == 0) { free(seqs); return 0; } return seqs; } void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs) { int i, j; for (i = 0; i != n_seqs; ++i) { bwa_seq_t *p = seqs + i; for (j = 0; j < p->n_multi; ++j) if (p->multi[j].cigar) free(p->multi[j].cigar); free(p->name); free(p->seq); free(p->rseq); free(p->qual); free(p->aln); free(p->md); free(p->multi); free(p->cigar); } free(seqs); }
int main_bam2fq(int argc, char *argv[]) { BGZF *fp, *fpse = 0; bam1_t *b; uint8_t *buf; int max_buf, c, has12 = 0; kstring_t str; int64_t n_singletons = 0, n_reads = 0; char last[512], *fnse = 0; while ((c = getopt(argc, argv, "as:")) > 0) if (c == 'a') has12 = 1; else if (c == 's') fnse = optarg; if (argc == optind) { fprintf(stderr, "\nUsage: bam2fq [-a] [-s outSE] <in.bam>\n\n"); fprintf(stderr, "Options: -a append /1 and /2 to the read name\n"); fprintf(stderr, " -s FILE write singleton reads to FILE [assume single-end]\n"); fprintf(stderr, "\n"); return 1; } fp = strcmp(argv[optind], "-")? bgzf_open(argv[optind], "r") : bgzf_dopen(fileno(stdin), "r"); assert(fp); bam_hdr_destroy(bam_hdr_read(fp)); buf = 0; max_buf = 0; str.l = str.m = 0; str.s = 0; last[0] = 0; if (fnse) fpse = bgzf_open(fnse, "w1"); b = bam_init1(); while (bam_read1(fp, b) >= 0) { int i, qlen = b->core.l_qseq, is_print = 0; uint8_t *qual, *seq; if (b->flag&BAM_FSECONDARY) continue; // skip secondary alignments ++n_reads; if (fpse) { if (str.l && strcmp(last, bam_get_qname(b))) { bgzf_write(fpse, str.s, str.l); str.l = 0; ++n_singletons; } if (str.l) is_print = 1; strcpy(last, bam_get_qname(b)); } else is_print = 1; qual = bam_get_qual(b); kputc(qual[0] == 0xff? '>' : '@', &str); kputsn(bam_get_qname(b), b->core.l_qname - 1, &str); if (has12) { kputc('/', &str); kputw(b->core.flag>>6&3, &str); } kputc('\n', &str); if (max_buf < qlen + 1) { max_buf = qlen + 1; kroundup32(max_buf); buf = (uint8_t*)realloc(buf, max_buf); } buf[qlen] = 0; seq = bam_get_seq(b); for (i = 0; i < qlen; ++i) buf[i] = bam_seqi(seq, i); // copy the sequence if (bam_is_rev(b)) { // reverse complement for (i = 0; i < qlen>>1; ++i) { int8_t t = seq_comp_table[buf[qlen - 1 - i]]; buf[qlen - 1 - i] = seq_comp_table[buf[i]]; buf[i] = t; } if (qlen&1) buf[i] = seq_comp_table[buf[i]]; } for (i = 0; i < qlen; ++i) buf[i] = seq_nt16_str[buf[i]]; kputsn((char*)buf, qlen, &str); kputc('\n', &str); if (qual[0] != 0xff) { kputsn("+\n", 2, &str); for (i = 0; i < qlen; ++i) buf[i] = 33 + qual[i]; if (bam_is_rev(b)) { // reverse for (i = 0; i < qlen>>1; ++i) { uint8_t t = buf[qlen - 1 - i]; buf[qlen - 1 - i] = buf[i]; buf[i] = t; } } } kputsn((char*)buf, qlen, &str); kputc('\n', &str); if (is_print) { fwrite(str.s, 1, str.l, stdout); str.l = 0; } } if (fpse) { if (str.l) { bgzf_write(fpse, str.s, str.l); ++n_singletons; } fprintf(stderr, "[M::%s] discarded %lld singletons\n", __func__, (long long)n_singletons); bgzf_close(fpse); } fprintf(stderr, "[M::%s] processed %lld reads\n", __func__, (long long)n_reads); free(buf); free(str.s); bam_destroy1(b); bgzf_close(fp); return 0; }
/*! @abstract Merge multiple sorted BAM. @param is_by_qname whether to sort by query name @param out output BAM file name @param mode sam_open() mode to be used to create the final output file (overrides level settings from UNCOMP and LEVEL1 flags) @param headers name of SAM file from which to copy '@' header lines, or NULL to copy them from the first file to be merged @param n number of files to be merged @param fn names of files to be merged @param flag flags that control how the merge is undertaken @param reg region to merge @param n_threads number of threads to use (passed to htslib) @discussion Padding information may NOT correctly maintained. This function is NOT thread safe. */ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char *headers, int n, char * const *fn, int flag, const char *reg, int n_threads) { samFile *fpout, **fp; heap1_t *heap; bam_hdr_t *hout = NULL; int i, j, *RG_len = NULL; uint64_t idx = 0; char **RG = NULL; hts_itr_t **iter = NULL; bam_hdr_t **hdr = NULL; trans_tbl_t *translation_tbl = NULL; // Is there a specified pre-prepared header to use for output? if (headers) { samFile* fpheaders = sam_open(headers, "r"); if (fpheaders == NULL) { const char *message = strerror(errno); fprintf(pysamerr, "[bam_merge_core] cannot open '%s': %s\n", headers, message); return -1; } hout = sam_hdr_read(fpheaders); sam_close(fpheaders); } g_is_by_qname = by_qname; fp = (samFile**)calloc(n, sizeof(samFile*)); heap = (heap1_t*)calloc(n, sizeof(heap1_t)); iter = (hts_itr_t**)calloc(n, sizeof(hts_itr_t*)); hdr = (bam_hdr_t**)calloc(n, sizeof(bam_hdr_t*)); translation_tbl = (trans_tbl_t*)calloc(n, sizeof(trans_tbl_t)); // prepare RG tag from file names if (flag & MERGE_RG) { RG = (char**)calloc(n, sizeof(char*)); RG_len = (int*)calloc(n, sizeof(int)); for (i = 0; i != n; ++i) { int l = strlen(fn[i]); const char *s = fn[i]; if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4; for (j = l - 1; j >= 0; --j) if (s[j] == '/') break; ++j; l -= j; RG[i] = (char*)calloc(l + 1, 1); RG_len[i] = l; strncpy(RG[i], s + j, l); } } // open and read the header from each file for (i = 0; i < n; ++i) { bam_hdr_t *hin; fp[i] = sam_open(fn[i], "r"); if (fp[i] == NULL) { int j; fprintf(pysamerr, "[bam_merge_core] fail to open file %s\n", fn[i]); for (j = 0; j < i; ++j) sam_close(fp[j]); free(fp); free(heap); // FIXME: possible memory leak return -1; } hin = sam_hdr_read(fp[i]); if (hout) trans_tbl_init(hout, hin, translation_tbl+i, flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG); else { // As yet, no headers to merge into... hout = bam_hdr_dup(hin); // ...so no need to translate header into itself trans_tbl_init(hout, hin, translation_tbl+i, true, true); } // TODO sam_itr_next() doesn't yet work for SAM files, // so for those keep the headers around for use with sam_read1() if (hts_get_format(fp[i])->format == sam) hdr[i] = hin; else { bam_hdr_destroy(hin); hdr[i] = NULL; } if ((translation_tbl+i)->lost_coord_sort && !by_qname) { fprintf(pysamerr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]); } } // Transform the header into standard form pretty_header(&hout->text,hout->l_text); // If we're only merging a specified region move our iters to start at that point if (reg) { int* rtrans = rtrans_build(n, hout->n_targets, translation_tbl); int tid, beg, end; const char *name_lim = hts_parse_reg(reg, &beg, &end); char *name = malloc(name_lim - reg + 1); memcpy(name, reg, name_lim - reg); name[name_lim - reg] = '\0'; tid = bam_name2id(hout, name); free(name); if (tid < 0) { fprintf(pysamerr, "[%s] Malformated region string or undefined reference name\n", __func__); return -1; } for (i = 0; i < n; ++i) { hts_idx_t *idx = sam_index_load(fp[i], fn[i]); // (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space int mapped_tid = rtrans[i*hout->n_targets+tid]; if (mapped_tid != INT32_MIN) { iter[i] = sam_itr_queryi(idx, mapped_tid, beg, end); } else { iter[i] = sam_itr_queryi(idx, HTS_IDX_NONE, 0, 0); } hts_idx_destroy(idx); if (iter[i] == NULL) break; } free(rtrans); } else { for (i = 0; i < n; ++i) { if (hdr[i] == NULL) { iter[i] = sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0); if (iter[i] == NULL) break; } else iter[i] = NULL; } } if (i < n) { fprintf(pysamerr, "[%s] Memory allocation failed\n", __func__); return -1; } // Load the first read from each file into the heap for (i = 0; i < n; ++i) { heap1_t *h = heap + i; h->i = i; h->b = bam_init1(); if ((iter[i]? sam_itr_next(fp[i], iter[i], h->b) : sam_read1(fp[i], hdr[i], h->b)) >= 0) { bam_translate(h->b, translation_tbl + i); h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam_is_rev(h->b); h->idx = idx++; } else { h->pos = HEAP_EMPTY; bam_destroy1(h->b); h->b = NULL; } } // Open output file and write header if ((fpout = sam_open(out, mode)) == 0) { fprintf(pysamerr, "[%s] fail to create the output file.\n", __func__); return -1; } sam_hdr_write(fpout, hout); if (!(flag & MERGE_UNCOMP)) hts_set_threads(fpout, n_threads); // Begin the actual merge ks_heapmake(heap, n, heap); while (heap->pos != HEAP_EMPTY) { bam1_t *b = heap->b; if (flag & MERGE_RG) { uint8_t *rg = bam_aux_get(b, "RG"); if (rg) bam_aux_del(b, rg); bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]); } sam_write1(fpout, hout, b); if ((j = (iter[heap->i]? sam_itr_next(fp[heap->i], iter[heap->i], b) : sam_read1(fp[heap->i], hdr[heap->i], b))) >= 0) { bam_translate(b, translation_tbl + heap->i); heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam_is_rev(b); heap->idx = idx++; } else if (j == -1) { heap->pos = HEAP_EMPTY; bam_destroy1(heap->b); heap->b = NULL; } else fprintf(pysamerr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]); ks_heapadjust(heap, 0, n, heap); } // Clean up and close if (flag & MERGE_RG) { for (i = 0; i != n; ++i) free(RG[i]); free(RG); free(RG_len); } for (i = 0; i < n; ++i) { trans_tbl_destroy(translation_tbl + i); hts_itr_destroy(iter[i]); bam_hdr_destroy(hdr[i]); sam_close(fp[i]); } bam_hdr_destroy(hout); sam_close(fpout); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr); return 0; }
int main(int argc, char **argv) { if (argc < 4) errx(1, "usage\t:%s <bam> <split out> <discord out> (optional #threads)", argv[0]); char *bam_file_name = argv[1]; char *split_file_name = argv[2]; char *disc_file_name = argv[3]; int threads = 2; if (argc == 5) { threads = atoi(argv[4]); } samFile *disc = sam_open(disc_file_name, "wb"); samFile *split = sam_open(split_file_name, "wb"); samFile *in = sam_open(bam_file_name, "rb"); if(in == NULL) errx(1, "Unable to open BAM/SAM file."); // TODO: handle cram. if (threads > 1) { bgzf_mt(in->fp.bgzf, threads, 256); } hts_idx_t *idx = sam_index_load(in, bam_file_name); if(idx == NULL) errx(1,"Unable to open BAM/SAM index."); bam_hdr_t *hdr = sam_hdr_read(in); int r = sam_hdr_write(disc, hdr); r = sam_hdr_write(split, hdr); bam1_t *aln = bam_init1(); int ret; while(ret = sam_read1(in, hdr, aln) >= 0) { if (((aln->core.flag) & 1294) == 0) r = sam_write1(disc, hdr, aln); uint8_t *sa = bam_aux_get(aln, "SA"); if (sa != 0) { char *sa_tag = strdup(bam_aux2Z(sa)); if ( count_tags(sa_tag) == 1) { char *chrm, strand, *cigar; uint32_t pos; split_sa_tag(sa_tag, &chrm, &pos, &strand, &cigar); struct line sa, al; calcOffsets(cigar, pos, strand, &sa); sa.chrm = chrm; sa.strand = strand; calcAlnOffsets(bam_get_cigar(aln), aln->core.n_cigar, aln->core.pos, bam_is_rev(aln) ? '-' : '+', &al); al.chrm = hdr->target_name[aln->core.tid]; al.strand = bam_is_rev(aln) ? '-' : '+'; struct line *left = &al, *right = &sa; if (left->SQO > right->SQO) { left = &sa; right = &al; } int overlap = MAX(1 + MIN(left->EQO, right->EQO) - MAX(left->SQO, right->SQO), 0); int alen1 = 1 + left->EQO - left->SQO; int alen2 = 1 + right->EQO - right->SQO; int mno = MIN(alen1-overlap, alen2-overlap); if (mno < MIN_NON_OVERLAP) continue; if ( (strcmp(left->chrm, right->chrm) == 0) && (left->strand == right->strand) ) { int leftDiag, rightDiag, insSize; if (left->strand == '-') { leftDiag = left->rapos - left->sclip; rightDiag = (right->rapos + right->raLen) - (right->sclip + right->qaLen); insSize = rightDiag - leftDiag; } else { leftDiag = (left->rapos + left->raLen) - (left->sclip + left->qaLen); rightDiag = right->rapos - right->sclip; insSize = leftDiag - rightDiag; } int desert = right->SQO - left->EQO - 1; if ((abs(insSize) < MIN_INDEL_SIZE) || ((desert > 0) && ( (desert - (int)MAX(0, insSize)) > MAX_UNMAPPED_BASES))) continue; } char *qname = bam_get_qname(aln); if ((aln->core.flag & 64) == 64) qname[0] = 'A'; else qname[0] = 'B'; r = sam_write1(split, hdr, aln); } free(sa_tag); } } bam_destroy1(aln); hts_idx_destroy(idx); bam_hdr_destroy(hdr); sam_close(in); sam_close(disc); sam_close(split); if(ret < -1) { errx(1, "lumpy_filter: error reading bam: %s\n", bam_file_name); } }