/*
 * Collect depth and quality statistics for one pileup column.
 *
 * buf/n_reads   pileup entries covering the column
 * ref_base      4-bit encoded reference base the reads are compared with
 * wanted_bases  bit mask; quality sums are only accumulated for the bases
 *               whose bit (1 << index) is set in it
 * dqs           output; zeroed first, then filled
 *
 * Deleted and unmapped entries are ignored. The per-base and total means are
 * formed by dividing the accumulated sums by the occurrence count and adding
 * .499 before the value is stored back into its field.
 */
void get_dqstats( const bam_pileup1_t* buf, int n_reads, int ref_base, uint32_t wanted_bases, dqstats_t *dqs )
{
    int read_idx, base_idx;

    memset(dqs, 0, sizeof(dqstats_t));

    for (read_idx = 0; read_idx < n_reads; ++read_idx) {
        const bam_pileup1_t *entry = &buf[read_idx];
        int base;
        int dp4_slot;

        if (entry->is_del || (entry->b->core.flag & BAM_FUNMAP))
            continue;

        ++dqs->total_depth;
        dqs->total_mean_mapQ += entry->b->core.qual;

        base = bam1_seqi(bam1_seq(entry->b), entry->qpos);

        /* dp4[0..1]: reference base per strand, dp4[2..3]: anything else per strand */
        dp4_slot = bam1_strand(entry->b);
        if (base != ref_base)
            dp4_slot = 2 + bam1_strand(entry->b);
        ++dqs->dp4[dp4_slot];

        for (base_idx = 0; base_idx < 4; ++base_idx) {
            int value = 1 << base_idx;

            if ((base & value) != base)
                continue;

            ++dqs->base_occ[base_idx];
            if (value & wanted_bases) {
                dqs->mean_baseQ[base_idx] += bam1_qual(entry->b)[entry->qpos];
                dqs->mean_mapQ[base_idx] += entry->b->core.qual;
            }
        }
    }

    /* turn the sums into means */
    for (base_idx = 0; base_idx < 4; ++base_idx) {
        if (dqs->base_occ[base_idx] <= 0)
            continue;
        dqs->mean_baseQ[base_idx] = dqs->mean_baseQ[base_idx]/(double)dqs->base_occ[base_idx] + .499;
        dqs->mean_mapQ[base_idx] = dqs->mean_mapQ[base_idx]/(double)dqs->base_occ[base_idx] + .499;
    }

    if (dqs->total_depth > 0)
        dqs->total_mean_mapQ = dqs->total_mean_mapQ / (double)dqs->total_depth + .499;
}
static int fetch_disc_read_callback(const bam1_t* alignment, void* data) { // MEI_data* mei_data = static_cast<MEI_data*>(data); std::pair<MEI_data*, UserDefinedSettings*>* env = static_cast<std::pair<MEI_data*, UserDefinedSettings*>*>(data); MEI_data* mei_data = env->first; UserDefinedSettings* userSettings = env->second; if (!(alignment->core.flag & BAM_FUNMAP || alignment->core.flag & BAM_FMUNMAP) && // Both ends are mapped. !is_concordant(alignment, mei_data->current_insert_size) && // Ends map discordantly. // Extra check for (very) large mapping distance. This is done beside the check for read // discordance to speed up computation by ignoring signals from small structural variants. (alignment->core.tid != alignment->core.mtid || abs(alignment->core.pos - alignment->core.mpos) > userSettings->MIN_DD_MAP_DISTANCE)) { // Save alignment as simple_read object. std::string read_name = enrich_read_name(bam1_qname(alignment), alignment->core.flag & BAM_FREAD1); char strand = bam1_strand(alignment)? Minus : Plus; char mate_strand = bam1_mstrand(alignment)? Minus : Plus; std::string read_group; get_read_group(alignment, read_group); std::string sample_name; get_sample_name(read_group, mei_data->sample_names, sample_name); simple_read* read = new simple_read(read_name, alignment->core.tid, alignment->core.pos, strand, sample_name, get_sequence(bam1_seq(alignment), alignment->core.l_qseq), alignment->core.mtid, alignment->core.mpos, mate_strand); mei_data->discordant_reads.push_back(read); } return 0; }
/*
 * Strict "less than" for two alignment records.
 *
 * When g_is_by_qname is set, records are ordered by strnum_cmp() on the query
 * name; ties are broken by the 0xc0 bits of the flag.
 *
 * Otherwise records are ordered by a 64-bit key: tid in the upper 32 bits,
 * (pos + 1) shifted left by one in the lower 32 bits, and the strand in bit 0.
 * The position term is computed as an unsigned 32-bit value: shifting the
 * signed `pos + 1` overflows for pos + 1 >= 2^30 and the negative result would
 * be sign-extended over the tid bits when OR-ed into the 64-bit key.
 */
static inline int bam1_lt(const bam1_p a, const bam1_p b)
{
	if (g_is_by_qname) {
		const int t = strnum_cmp(bam1_qname(a), bam1_qname(b));
		if (t != 0) return t < 0;
		return (a->core.flag&0xc0) < (b->core.flag&0xc0);
	} else {
		/* pos == -1 wraps to 0 in unsigned arithmetic, as pos + 1 did before */
		const uint64_t key_a = ((uint64_t)a->core.tid<<32)
			| (uint32_t)(((uint32_t)a->core.pos + 1u) << 1)
			| (uint64_t)bam1_strand(a);
		const uint64_t key_b = ((uint64_t)b->core.tid<<32)
			| (uint32_t)(((uint32_t)b->core.pos + 1u) << 1)
			| (uint64_t)bam1_strand(b);
		return key_a < key_b;
	}
}
static inline void pileup_seq(const bam_pileup1_t *p, int pos, int ref_len, const char *ref) { int j; if (p->is_head) { putchar('^'); putchar(p->b->core.qual > 93? 126 : p->b->core.qual + 33); } if (!p->is_del) { int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; if (ref) { int rb = pos < ref_len? ref[pos] : 'N'; if (c == '=' || bam_nt16_table[c] == bam_nt16_table[rb]) c = bam1_strand(p->b)? ',' : '.'; else c = bam1_strand(p->b)? tolower(c) : toupper(c); } else { if (c == '=') c = bam1_strand(p->b)? ',' : '.'; else c = bam1_strand(p->b)? tolower(c) : toupper(c); } putchar(c); } else putchar(p->is_refskip? (bam1_strand(p->b)? '<' : '>') : '*'); if (p->indel > 0) { putchar('+'); printw(p->indel, stdout); for (j = 1; j <= p->indel; ++j) { int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)]; putchar(bam1_strand(p->b)? tolower(c) : toupper(c)); } } else if (p->indel < 0) { printw(p->indel, stdout); for (j = 1; j <= -p->indel; ++j) { int c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N'; putchar(bam1_strand(p->b)? tolower(c) : toupper(c)); } } if (p->is_tail) putchar('$'); }
/*
 * Read up to n_needed records from the BAM stream in bs and convert them.
 *
 * bs->which selects the records: bit 1 = first in pair, bit 2 = second in
 * pair, bit 4 = records flagged as neither. The number of converted records is
 * stored in *n. Returns the array of converted records, or 0 when none was
 * read. A read error other than -1 is reported through err_fatal_simple().
 */
static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual)
{
	bwa_seq_t *seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
	bam1_t *b = bam_init1();
	long n_trimmed = 0, n_tot = 0;
	int n_seqs = 0;
	int res;

	while ((res = bam_read1(bs->fp, b)) >= 0) {
		const int is_read1 = (b->core.flag & BAM_FREAD1) != 0;
		const int is_read2 = (b->core.flag & BAM_FREAD2) != 0;
		const int wanted = ((bs->which & 1) && is_read1)
			|| ((bs->which & 2) && is_read2)
			|| ((bs->which & 4) && !is_read1 && !is_read2);
		bwa_seq_t *p;
		uint8_t *s, *q;
		int l, i;

		if (!wanted) continue;

		l = b->core.l_qseq;
		p = &seqs[n_seqs++];
		p->tid = -1; // no assigned to a thread
		p->qual = 0;
		p->full_len = p->clip_len = p->len = l;
		n_tot += p->full_len;

		s = bam1_seq(b);
		q = bam1_qual(b);
		p->seq = (ubyte_t*)calloc(p->len + 1, 1);
		p->qual = (ubyte_t*)calloc(p->len + 1, 1);
		for (i = 0; i != p->full_len; ++i) {
			const int shifted_qual = q[i] + 33;
			p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)];
			p->qual[i] = shifted_qual < 126? shifted_qual : 126; // cap at 126
		}

		if (bam1_strand(b)) { // then reverse
			seq_reverse(p->len, p->seq, 1);
			seq_reverse(p->len, p->qual, 0);
		}

		if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);

		p->rseq = (ubyte_t*)calloc(p->full_len, 1);
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);
		p->name = strdup((const char*)bam1_qname(b));

		if (n_seqs == n_needed) break;
	}

	if (res < 0 && res != -1) err_fatal_simple("Error reading bam file");

	*n = n_seqs;
	if (n_seqs && trim_qual >= 1)
		fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);

	bam_destroy1(b);
	if (n_seqs == 0) {
		free(seqs);
		return 0;
	}
	return seqs;
}
// Fill table T with the reads of BAM file `reads_fn` that overlap one of the
// intervals in `is` and lie on the interval's strand.
//
// The BAM file and its index are opened here and closed again before
// returning; failf() is called when either cannot be opened. T is created
// with one slot per target sequence and receives a strdup'ed copy of every
// target name in T->seq_names. Intervals whose sequence name is not present in
// the BAM header are skipped.
void hash_reads( table* T, const char* reads_fn, interval_stack* is )
{
    samfile_t* reads_f = samopen( reads_fn, "rb", NULL );
    if( reads_f == NULL ) {
        failf( "Can't open bam file '%s'.", reads_fn );
    }

    bam_index_t* reads_index = bam_index_load( reads_fn );
    if( reads_index == NULL ) {
        failf( "Can't open bam index '%s.bai'.", reads_fn );
    }

    bam_init_header_hash( reads_f->header );

    table_create( T, reads_f->header->n_targets );

    // copy the target names
    T->seq_names = (char**)malloc( sizeof(char*) * reads_f->header->n_targets );
    size_t name_idx = 0;
    while( name_idx < reads_f->header->n_targets ) {
        T->seq_names[name_idx] = strdup(reads_f->header->target_name[name_idx]);
        name_idx++;
    }

    log_puts( LOG_MSG, "hashing reads ... \n" );
    log_indent();

    bam1_t* read = bam_init1();

    for( interval_stack::iterator ival = is->begin(); ival != is->end(); ival++ ) {
        int tid = bam_get_tid( reads_f->header, ival->seqname );
        if( tid < 0 ) continue;

        bam_iter_t read_iter = bam_iter_query( reads_index, tid, ival->start, ival->end );

        while( bam_iter_read( reads_f->x.bam, read_iter, read ) >= 0 ) {
            // only reads on the interval's strand are counted
            if( bam1_strand(read) != ival->strand ) continue;
            table_inc( T, read );
        }

        bam_iter_destroy(read_iter);
    }

    bam_destroy1(read);

    log_unindent();
    log_printf( LOG_MSG, "done. (%zu unique reads hashed)\n", T->m );

    bam_index_destroy(reads_index);
    samclose(reads_f);
}
/*
 * Build the quality string of a record: every quality value plus 33, followed
 * by a terminating '\0'. The string is passed through _reverse() when
 * bd->reverseComplement is set and the record is on the reverse strand.
 * Returns the buffer obtained from Calloc().
 */
static char *_bamqual(const bam1_t * bam, BAM_DATA bd)
{
    const uint32_t len = bam->core.l_qseq;
    const unsigned char *src = bam1_qual(bam);
    char *s = Calloc(len + 1, char);
    char *dst = s;
    uint32_t remaining = len;

    while (remaining > 0) {
        *dst = *src + 33;
        ++dst;
        ++src;
        --remaining;
    }

    if (bd->reverseComplement) {
        if (bam1_strand(bam) == 1)
            _reverse(s, len);
    }

    s[len] = '\0';
    return s;
}
/*!
  @abstract     Get the color quality of the color encoding the previous and current base
  @param b      pointer to an alignment
  @param i      The i-th position, 0-based
  @return       color quality
  @discussion   Returns 0 if no color information is found: the "CQ" tag is
                missing, bam_aux2Z() yields no string for it, or the requested
                position lies outside the CQ string.
 */
char bam_aux_getCQi(bam1_t *b, int i)
{
	uint8_t *c = bam_aux_get(b, "CQ");
	char *cq = NULL;
	int cq_len;

	// return 0 if the tag was not found
	if(0 == c) return 0;

	cq = bam_aux2Z(c);
	// the tag exists but did not yield a string
	if(NULL == cq) return 0;

	cq_len = (int)strlen(cq);

	// adjust for strandedness
	if(bam1_strand(b)) i = cq_len - 1 - i;

	// never index outside the string
	if(i < 0 || cq_len <= i) return 0;

	return cq[i];
}
/*!
  @abstract     Get the color encoding the previous and current base
  @param b      pointer to an alignment
  @param i      The i-th position, 0-based
  @return       color
  @discussion   Returns 0 if no color information is found: the "CS" tag is
                missing, bam_aux2Z() yields no string for it, or the requested
                position lies outside the CS string.
 */
char bam_aux_getCSi(bam1_t *b, int i)
{
	uint8_t *c = bam_aux_get(b, "CS");
	char *cs = NULL;
	int cs_len;

	// return 0 if the tag was not found
	if(0 == c) return 0;

	cs = bam_aux2Z(c);
	// the tag exists but did not yield a string
	if(NULL == cs) return 0;

	cs_len = (int)strlen(cs);

	// adjust for strandedness and leading adaptor
	if(bam1_strand(b)) i = cs_len - 1 - i;
	else i++;

	// never index outside the string
	if(i < 0 || cs_len <= i) return 0;

	return cs[i];
}
/*
 * Build the base string of a record by mapping every 4-bit base code through
 * a 16-entry character table, followed by a terminating '\0'. The string is
 * passed through _reverseComplement() when bd->reverseComplement is set and
 * the record is on the reverse strand. Returns the buffer obtained from
 * Calloc().
 */
static char *_bamseq(const bam1_t * bam, BAM_DATA bd)
{
    /* character for each 4-bit code, index 0..15 */
    static const char key[] = "-ACMGRSVTWYHKDBN";

    const uint32_t len = bam->core.l_qseq;
    const unsigned char *seq = bam1_seq(bam);
    char *s = Calloc(len + 1, char);
    uint32_t idx = 0;

    while (idx < len) {
        s[idx] = key[bam1_seqi(seq, idx)];
        ++idx;
    }

    if (bd->reverseComplement) {
        if (bam1_strand(bam) == 1)
            _reverseComplement(s, len);
    }

    s[len] = '\0';
    return s;
}
// Returns true for a given read whether it is concordantly mapped together with its mate. bool is_concordant(const bam1_t* read, unsigned int insert_size) { if (read->core.flag & BAM_FUNMAP || read->core.flag & BAM_FMUNMAP) { // At least one of the pair is unmapped. return false; } if (read->core.tid != read->core.mtid) { // Reads mapping to different chromosomes. return false; } if (bam1_strand(read) == bam1_mstrand(read)) { // Reads mapping to the same strand. return false; } // Return true if insert size is as expected. This definition of concordance is // equivalent to read-pair construction elsewhere in Pindel. return (unsigned int) abs(read->core.isize) < read->core.l_qseq + 2 * insert_size; }
/*!
  @abstract     Get the color error profile at the given position
  @param b      pointer to an alignment
  @param i      The i-th position, 0-based
  @return       the original color if the color was an error, '-' (dash) otherwise
  @discussion   Returns 0 if no color information is found: the "CS" tag is
                missing, bam_aux2Z() yields no string for it, i is not a valid
                read position, or the matching color would lie outside the CS
                string.
 */
char bam_aux_getCEi(bam1_t *b, int i)
{
	int cs_i, cs_len;
	uint8_t *c = bam_aux_get(b, "CS");
	char *cs = NULL;
	char prev_b, cur_b;
	char cur_color, cor_color;

	// return 0 if the tag was not found
	if(0 == c) return 0;

	cs = bam_aux2Z(c);
	// the tag exists but did not yield a string
	if(NULL == cs) return 0;

	// the position must address a base of the read
	if(i < 0 || b->core.l_qseq <= i) return 0;

	cs_len = (int)strlen(cs);

	// adjust for strandedness and leading adaptor
	if(bam1_strand(b)) { //reverse strand
		cs_i = cs_len - 1 - i;
	}
	else {
		cs_i = i + 1;
	}

	// cs[0] is the adaptor; colors occupy cs[1..cs_len-1]
	if(cs_i < 1 || cs_len <= cs_i) return 0;

	// get current color
	cur_color = cs[cs_i];

	if(bam1_strand(b)) { //reverse strand
		// get previous base. Note: must rc adaptor
		prev_b = (cs_i == 1) ? "TGCAN"[(int)bam_aux_nt2int(cs[0])] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i+1)];
	}
	else {
		// get previous base
		prev_b = (0 == i) ? cs[0] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i-1)];
	}
	// get current base
	cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)];

	// corrected color
	cor_color = bam_aux_ntnt2cs(prev_b, cur_b);

	if(cur_color == cor_color) {
		return '-';
	}
	else {
		return cur_color;
	}
}
// Mostly stolen from bwa_read_bam. void bam1_to_seq(bam1_t *raw, bwa_seq_t *p, int is_comp, int trim_qual) { // long n_trimmed = 0; uint8_t *s, *q; int i, l = raw->core.l_qseq; p->tid = -1; // no assigned to a thread p->qual = 0; p->full_len = p->clip_len = p->len = l; // n_tot += p->full_len; s = bam1_seq(raw); q = bam1_qual(raw); p->seq = (ubyte_t*)calloc(p->len + 1, 1); p->qual = (ubyte_t*)calloc(p->len + 1, 1); for (i = 0; i != p->full_len; ++i) { p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)]; p->qual[i] = q[i] + 33 < 126? q[i] + 33 : 126; } if (bam1_strand(raw)) { // then reverse seq_reverse(p->len, p->seq, 1); seq_reverse(p->len, p->qual, 0); } if (trim_qual >= 1) /* n_trimmed += */ bwa_trim_read(trim_qual, p); p->rseq = (ubyte_t*)calloc(p->full_len, 1); memcpy(p->rseq, p->seq, p->len); seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped() seq_reverse(p->len, p->rseq, is_comp); p->max_entries = 0 ; // We don't set a name, it's contained in the original record // anyway. // p->name = strdup((const char*)bam1_qname(raw)); // No place to put the tally right now. // if (n_seqs && trim_qual >= 1) // fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot); }
// TODO soft-clipping
/*
 * Re-align read `b` against graph `g` with a heap-driven search.
 *
 * Outline, as far as visible in this function:
 *   1. reads with soft clipping at either end are returned unchanged;
 *   2. color-space data is fetched if present (sw_align_get_cs/_cq);
 *   3. the mate information of `b` is cleared;
 *   4. sw_align_bound() provides an initial best node from the original
 *      alignment;
 *   5. start nodes within `offset` of the alignment start (forward strand)
 *      or alignment end (reverse strand) are put on the heap;
 *   6. the heap is polled until empty, extending every node along its
 *      `next` (forward) or `prev` (reverse) edges and remembering the best
 *      node that reaches the last read offset;
 *   7. sw_align_update_bam() produces the returned record.
 *
 * `b` is returned as passed in (apart from the cleared mate information) when
 * a filter returns a negative value or the heap grows beyond max_heap_size.
 */
bam1_t *sw_align(graph_t *g, bam1_t *b, node_t *n, sw_heap_t *heap, char *rg_id, int32_t offset, cov_cutoffs_t *cutoffs, uint8_t correct_bases, uint8_t use_qualities, int32_t max_total_coverage, int32_t max_heap_size)
{
	char *colors = NULL;
	char *color_qualities = NULL;
	char base, qual;
	uint8_t space = SRMA_SPACE_NT;
	uint8_t strand;
	int32_t i, j, aln_start;
	int32_t num_start_nodes_added=0;
	int32_t sw_node_i=-1, sw_node_best_i=-1, sw_node_cur_i=-1, sw_node_next_i=-1;
	int32_t soft_clip_start_l = 0, soft_clip_end_l = 0;

	strand = bam1_strand(b);

	// soft-clipping
	if(1 == strand) { //reverse
		// going from 3'->5'
		soft_clip_start_l = sw_align_get_soft_clip(b, 1);
		soft_clip_end_l = sw_align_get_soft_clip(b, 0);
	}
	else {
		// going from 5'->3'
		soft_clip_start_l = sw_align_get_soft_clip(b, 0);
		soft_clip_end_l = sw_align_get_soft_clip(b, 1);
	}
	// FOR NOW
	// soft-clipped reads are not re-aligned at all
	if(0 < soft_clip_start_l || 0 < soft_clip_end_l) {
		return b;
	}

	// Check color space
	colors = sw_align_get_cs(b);
	if(NULL == colors) {
		space = SRMA_SPACE_NT;
	}
	else {
		space = SRMA_SPACE_CS;
		color_qualities = sw_align_get_cq(b);
		// Some aligners include a quality value for the adapter. A quality value
		// IMHO should not be given for an unobserved (assumed) piece of data. Trim
		// the first quality in this case
		// NOTE(review): color_qualities is passed to strlen() without a NULL
		// check -- confirm sw_align_get_cq() cannot return NULL here.
		if(strlen(colors) == strlen(color_qualities)) { // ignore leading quality
			color_qualities++;
		}
		// NOTE(review): cannot be true at this point, soft-clipped reads
		// were already returned above.
		if(0 < soft_clip_start_l || 0 < soft_clip_end_l) {
			srma_error(__func__, "Soft clipping not supported for color space", Exit, OutOfRange);
		}
	}

	// remove mate info
	b->core.flag &= ~(BAM_FPROPER_PAIR | BAM_FMREVERSE | BAM_FMUNMAP);
	b->core.mtid = -1;
	b->core.mpos = -1;
	b->core.isize = 0;

	// re-type heap
	heap->type = (1 == strand) ? SRMA_SW_HEAP_MAX : SRMA_SW_HEAP_MIN;

	// bound with original alignment
	sw_node_best_i = sw_align_bound(g, b, n, heap, strand, colors, color_qualities, space, cutoffs, use_qualities, max_total_coverage, max_heap_size);
	if(0 <= sw_node_best_i) {
		/*
		sw_heap_reset(heap); // reset the heap, keep old nodes
		fprintf(stderr, "BOUNDED score=%d coverage_sum=%hu\n", heap->nodes[sw_node_best_i].score, heap->nodes[sw_node_best_i].coverage_sum); // DEBUG
		*/
	}
	else {
		//fprintf(stderr, "NOT BOUNDED\n"); // DEBUG
		// nodes do not need to be preserved
		sw_heap_clear(heap);
	}
	//return b; // HERE DEBUG HERE BUG

	// add start nodes
	if(strand) {
		// reverse strand: start from the last base of the read and scan the
		// positions around the alignment end downwards
		if(SRMA_SPACE_CS == space) {
			base = nt2int_table[(int)colors[1]];
			qual = color_qualities[0];
		}
		else {
			base = nt4bit_to_int[bam1_seqi(bam1_seq(b), b->core.l_qseq-1)];
			qual = bam1_qual(b)[b->core.l_qseq-1] + 33;
		}
		aln_start = bam_calend(&b->core, bam1_cigar(b));
		for(i=aln_start+offset;aln_start-offset<=i;i--) {
			int32_t pos = graph_get_node_list_index_at_or_before(g, i);
			node_list_t *list = graph_get_node_list(g, pos);
			// NOTE(review): this branch tests `1 != pos` while the forward
			// branch below tests `0 != pos` -- confirm which is intended.
			if(1 != pos && NULL != list) {
				for(j=0;j<list->length;j++) {
					node_t *node = list->nodes[j];
					int32_t pass = pass_filters1(g, node, cutoffs, max_total_coverage);
					if(0 == pass) {
						sw_node_i = sw_heap_get_node_i(heap);
						sw_node_init(&heap->nodes[sw_node_i], NULL, node, node->coverage, base, qual, use_qualities, space);
						sw_heap_add_i(heap, sw_node_i);
					}
					else if(pass < 0) {
						sw_heap_clear(heap); // clear heap
						return b;
					}
					// jump the scan position down to the node that was found
					if(node->position < i) {
						i = node->position;
					}
					// counted regardless of the filter result
					num_start_nodes_added++;
				}
			}
		}
	}
	else {
		// forward strand: start from the first base of the read and scan the
		// positions around the alignment start upwards
		if(SRMA_SPACE_CS == space) {
			base = nt2int_table[(int)colors[1]];
			qual = color_qualities[0];
		}
		else {
			base = nt4bit_to_int[bam1_seqi(bam1_seq(b), 0)];
			qual = bam1_qual(b)[0] + 33;
		}
		aln_start = b->core.pos;
		for(i=aln_start-offset;i<=aln_start+offset;i++) {
			int32_t pos = graph_get_node_list_index_at_or_after(g, i);
			node_list_t *list = graph_get_node_list(g, pos);
			if(0 != pos && NULL != list) {
				for(j=0;j<list->length;j++) {
					node_t *node = list->nodes[j];
					int32_t pass = pass_filters1(g, node, cutoffs, max_total_coverage);
					if(0 == pass) {
						sw_node_i = sw_heap_get_node_i(heap);
						sw_node_init(&heap->nodes[sw_node_i], NULL, node, node->coverage, base, qual, use_qualities, space);
						sw_heap_add_i(heap, sw_node_i);
					}
					else if(pass < 0) {
						sw_heap_clear(heap); // clear heap
						return b;
					}
					// NOTE(review): same `<` comparison as in the reverse
					// scan although this loop counts upwards -- confirm.
					if(node->position < i) {
						i = node->position;
					}
					// counted regardless of the filter result
					num_start_nodes_added++;
				}
			}
		}
	}
	if(0 == num_start_nodes_added) {
		srma_error(__func__, "Did not add any start nodes", Exit, OutOfRange);
	}

	// main search loop: poll nodes until the heap is exhausted
	sw_node_cur_i = sw_heap_poll_i(heap);
	while(0 <= sw_node_cur_i) {
		if(max_heap_size < heap->queue_end - heap->queue_start + 1) {
			// too many to consider
			sw_heap_clear(heap); // clear heap
			return b;
		}

		// among heap entries that sw_node_compare() reports as equal, keep
		// only the one with the higher score (ties: higher coverage sum);
		// insertion nodes are never merged this way
		sw_node_next_i = sw_heap_peek_i(heap);
		assert(0 <= sw_node_cur_i); // DEBUG
		while(NODE_INSERTION != __node_type(heap->nodes[sw_node_cur_i].node)
				&& 0 <= sw_node_next_i
				&& 0 == sw_node_compare(&heap->nodes[sw_node_cur_i], &heap->nodes[sw_node_next_i], heap->type)) {
			if(heap->nodes[sw_node_cur_i].score < heap->nodes[sw_node_next_i].score ||
					(heap->nodes[sw_node_cur_i].score == heap->nodes[sw_node_next_i].score &&
					 heap->nodes[sw_node_cur_i].coverage_sum < heap->nodes[sw_node_next_i].coverage_sum)) {
				sw_node_cur_i = sw_heap_poll_i(heap);
			}
			else {
				// ignore the next node
				sw_heap_poll_i(heap);
			}
			sw_node_next_i = sw_heap_peek_i(heap);
		}
		sw_node_next_i = -1;

		if(heap->nodes[sw_node_cur_i].read_offset == b->core.l_qseq-1) { // found, keep best
			if(sw_node_best_i < 0 ||
					heap->nodes[sw_node_best_i].score < heap->nodes[sw_node_cur_i].score ||
					(heap->nodes[sw_node_best_i].score == heap->nodes[sw_node_cur_i].score &&
					 heap->nodes[sw_node_best_i].coverage_sum < heap->nodes[sw_node_cur_i].coverage_sum)) {
				//fprintf(stderr, "FOUND BEST\n"); // DEBUG
				sw_node_best_i = sw_node_cur_i;
			}
		}
		else if(0 <= sw_node_best_i && heap->nodes[sw_node_cur_i].score < heap->nodes[sw_node_best_i].score) {
			// ignore, under the assumption that scores can only
			// become more negative.
		}
		else {
			// extend the current node along its outgoing edges
			edge_list_t *list = NULL;
			if(1 == strand) { // reverse
				list = heap->nodes[sw_node_cur_i].node->prev;
			}
			else {
				list = heap->nodes[sw_node_cur_i].node->next;
			}
			{ // get the base and quality
				// both are taken for read offset (read_offset + 1), i.e. the
				// position the new nodes will cover
				if(SRMA_SPACE_CS == space) {
					base = nt2int_table[(int)colors[1 + (heap->nodes[sw_node_cur_i].read_offset+1)]];
					qual = color_qualities[heap->nodes[sw_node_cur_i].read_offset+1];
				}
				else {
					if(strand) {
						base = nt4bit_to_int[bam1_seqi(bam1_seq(b), b->core.l_qseq-1-heap->nodes[sw_node_cur_i].read_offset-1)];
						qual = bam1_qual(b)[b->core.l_qseq-1-heap->nodes[sw_node_cur_i].read_offset-1] + 33;
					}
					else {
						base = nt4bit_to_int[bam1_seqi(bam1_seq(b), (heap->nodes[sw_node_cur_i].read_offset+1))];
						qual = bam1_qual(b)[(heap->nodes[sw_node_cur_i].read_offset+1)] + 33;
					}
				}
			}
			/*
			node_t *node = heap->nodes[sw_node_cur_i].node;
			fprintf(stderr, "NODE %d:%d offset=%d coverage=%d base=%d\n", node->contig, node->position, node->offset, node->coverage, node->base);
			fprintf(stderr, "SW_NODE read_offset=%d score=%d coverage_sum=%d start_position=%d space=%d\n", heap->nodes[sw_node_cur_i].read_offset, heap->nodes[sw_node_cur_i].score, heap->nodes[sw_node_cur_i].coverage_sum, heap->nodes[sw_node_cur_i].start_position, space);
			*/
			for(i=0;i<list->length;i++) {
				node_t *node_cur = list->nodes[i];
				uint16_t coverage_cur = list->coverages[i];
				int32_t pass = pass_filters(g, node_cur, coverage_cur, cutoffs, max_total_coverage);
				if(0 == pass) {
					// add to the heap
					sw_node_i = sw_heap_get_node_i(heap);
					// DEBUG
					assert(0 <= sw_node_cur_i);
					assert(0 <= heap->nodes[sw_node_cur_i].read_offset);
					sw_node_init(&heap->nodes[sw_node_i], &heap->nodes[sw_node_cur_i], node_cur, coverage_cur, base, qual, use_qualities, space);
					sw_heap_add_i(heap, sw_node_i);
				}
				else if(pass < 0) {
					sw_heap_clear(heap); // clear heap
					return b;
				}
			}
		}
		// get the next node
		sw_node_cur_i = sw_heap_poll_i(heap);
	}

	/*
	fprintf(stderr, "sw_node_best_i=%d\n", sw_node_best_i); // DEBUG
	if(0 <= sw_node_best_i) {
		fprintf(stderr, "END score=%d coverage_sum=%hu\n", heap->nodes[sw_node_best_i].score, heap->nodes[sw_node_best_i].coverage_sum); // DEBUG
	}
	*/

	// update SAM/BAM
	b = sw_align_update_bam(b, rg_id, heap, sw_node_best_i, space, colors, color_qualities, strand, correct_bases);
	sw_heap_clear(heap); // clear heap
	return b;
}
/**
 * @brief Major quality check point for each read
 *
 * @param rm Empty read_metrics_t to be updated with the results
 * @param temp_read The read to be assessed
 * @param user_args User arguments to be considered during assessment
 * @param bresults Block wide results of the parsing
 * @param lpos Current position
 * @return void
 * @details Decides whether a read is used and determines its strand and
 * length. On return rm->skip is 0 for an accepted read, 1 for a filtered
 * read and -4 when no read length could be determined. The block-wide
 * counters in bresults are updated along the way.
 *
 * The duplicate counters used for the COLLAPSE option are function-static
 * and therefore persist between calls.
 */
void quality_check(read_metrics_t *rm,bam1_t *temp_read,user_arguments_t *user_args,seq_block_t *bresults,int lpos){
	static int pos_dupcounter=0,neg_dupcounter=0;

	rm->skip=0;
	rm->read_length=0;
	rm->genomic_end= bam_calend(&temp_read->core,bam1_cigar(temp_read));

	/* Block statistics that are kept for every read, filtered or not */
	if(bam1_pair(temp_read)){
		++bresults->paired;
		if (bam1_ppair(temp_read))++bresults->ppairs;
	}
	++bresults->total_reads;

	/* Mapping quality / unmapped filter */
	if(temp_read->core.qual < user_args->TMAPQ || bam1_unmapped(temp_read)){
		++bresults->lowqual;
		rm->skip=1;
		return;
	}

	/* Multi-mapper filter */
	if(user_args->UNIQUE && bam1_multimap(temp_read)){
		rm->skip=1;
		return;
	}

	/* Determine strand and read length */
	if(!user_args->PAIRED){
		rm->revcomp=bam1_strand(temp_read);
		rm->read_length=bam_cigar2qlen(&temp_read->core,bam1_cigar(temp_read));
	} else if (bam1_ppair(temp_read) && !bam1_notprimary(temp_read)){
		rm->revcomp=bam1_revpair(temp_read);
		if(!user_args->READTHROUGH){
			rm->read_length=bam_cigar2qlen(&temp_read->core,bam1_cigar(temp_read));/* sets the read length only!! */
		}else if(temp_read->core.isize!=0 ){
			/* read-through mode: one mate of each pair carries the whole insert */
			if((bam1_firstr(temp_read)&&!bam1_revpair(temp_read))||(bam1_secondr(temp_read)&&bam1_mrevpair(temp_read))){
				rm->read_length=temp_read->core.isize;
			} else {
				rm->skip=1;
				return;
			}
		} else{
			warning("ISIZE not set in SAM/BAM file. Re-run without using the readthrough_pairs option\n");
			rm->skip=-4;
			return;
		}
	} else{
		rm->skip=1;
		return;
	}

	/* Fall back to the stored query length */
	if(!rm->read_length){
		rm->read_length=temp_read->core.l_qseq;
		if(!rm->read_length){
			/* Report the CIGAR-derived length; passing the CIGAR pointer
			 * itself to %d is undefined behaviour. */
			warning("Read length neither found in core.isize=%d, core.l_qseq=%d or cigar=%d!\n",
					(int)temp_read->core.isize,
					(int)temp_read->core.l_qseq,
					(int)bam_cigar2qlen(&temp_read->core,bam1_cigar(temp_read)));
			rm->skip=-4;
			return;
		}
	}
	/* END */

	/* Strand filter */
	if(user_args->STRANDED!=0){
		if((user_args->STRANDED==-1 && !rm->revcomp) || (user_args->STRANDED==1 && rm->revcomp)){
			rm->skip=1;
			return;
		}
	}

	/* Collapse reads that start at the same position as the previous one */
	if(user_args->COLLAPSE>0){
		if(lpos==temp_read->core.pos){
			if(!rm->revcomp)++pos_dupcounter;
			else ++neg_dupcounter;
			if(pos_dupcounter>=user_args->COLLAPSE || neg_dupcounter>=user_args->COLLAPSE){
				++bresults->collapsed;
				rm->skip=1;
				return;
			}
		}else{
			pos_dupcounter=0;
			neg_dupcounter=0;
		}
	}

	/* Accepted read: update the block statistics */
	if(!rm->skip){
		if(rm->revcomp) ++bresults->neg_strand;
		else ++bresults->pos_strand;
		++bresults->filtered_reads;
		bresults->mapmass+=rm->read_length;
	}
}
/*
 * Merge n BAM files into one output file.
 *
 * by_qname  stored in g_is_by_qname
 * out       output file name, "-" for stdout
 * headers   optional SAM file whose text header replaces the output header
 * n, fn     number and names of the input files
 * flag      MERGE_* bits (MERGE_RG, MERGE_UNCOMP, MERGE_LEVEL1 are used here)
 * reg       optional region; when given, every input is read through its index
 * level     compression level (see review note on `mode` below)
 *
 * Returns 0 on success and -1 on failure.
 *
 * NOTE(review): the opening of the preprocessor conditional that ends at the
 * #endif below is not part of this excerpt; `n_threads`, used further down,
 * is presumably a parameter of the alternative signature -- confirm.
 * NOTE(review): most `return -1` paths after the allocations below leave fp,
 * heap, iter, RG, the headers and the opened files unreleased.
 */
int bam_merge_core2(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg, int level)
#endif
{
	bamFile fpout, *fp;
	heap1_t *heap;
	bam_header_t *hout = 0;
	bam_header_t *hheaders = NULL;
	int i, j, *RG_len = 0;
	uint64_t idx = 0;
	char **RG = 0, mode[8];
	bam_iter_t *iter = 0;

	// optional replacement header
	if (headers) {
		tamFile fpheaders = sam_open(headers);
		if (fpheaders == 0) {
			const char *message = strerror(errno);
			fprintf(stderr, "[bam_merge_core] cannot open '%s': %s\n", headers, message);
			return -1;
		}
		hheaders = sam_header_read(fpheaders);
		sam_close(fpheaders);
	}

	g_is_by_qname = by_qname;
	fp = (bamFile*)calloc(n, sizeof(bamFile));
	heap = (heap1_t*)calloc(n, sizeof(heap1_t));
	iter = (bam_iter_t*)calloc(n, sizeof(bam_iter_t));
	// prepare RG tag
	// (the file name without directory part and without a ".bam" suffix)
	if (flag & MERGE_RG) {
		RG = (char**)calloc(n, sizeof(void*));
		RG_len = (int*)calloc(n, sizeof(int));
		for (i = 0; i != n; ++i) {
			int l = strlen(fn[i]);
			const char *s = fn[i];
			if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4;
			for (j = l - 1; j >= 0; --j) if (s[j] == '/') break;
			++j; l -= j;
			RG[i] = calloc(l + 1, 1);
			RG_len[i] = l;
			strncpy(RG[i], s + j, l);
		}
	}
	// read the first
	// (open every input and read its header; the first header becomes the output header)
	for (i = 0; i != n; ++i) {
		bam_header_t *hin;
		fp[i] = bam_open(fn[i], "r");
		if (fp[i] == 0) {
			int j;
			fprintf(stderr, "[bam_merge_core] fail to open file %s\n", fn[i]);
			for (j = 0; j < i; ++j) bam_close(fp[j]);
			free(fp); free(heap);
			// FIXME: possible memory leak
			return -1;
		}
		hin = bam_header_read(fp[i]);
		if (i == 0) { // the first BAM
			hout = hin;
		} else { // validate multiple baf
			int min_n_targets = hout->n_targets;
			if (hin->n_targets < min_n_targets) min_n_targets = hin->n_targets;

			for (j = 0; j < min_n_targets; ++j)
				if (strcmp(hout->target_name[j], hin->target_name[j]) != 0) {
					fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'\n",
							hout->target_name[j], hin->target_name[j], fn[i]);
					return -1;
				}

			// If this input file has additional target reference sequences,
			// add them to the headers to be output
			if (hin->n_targets > hout->n_targets) {
				swap_header_targets(hout, hin);
				// FIXME Possibly we should also create @SQ text headers
				// for the newly added reference sequences
			}

			bam_header_destroy(hin);
		}
	}

	if (hheaders) {
		// If the text headers to be swapped in include any @SQ headers,
		// check that they are consistent with the existing binary list
		// of reference information.
		if (hheaders->n_targets > 0) {
			if (hout->n_targets != hheaders->n_targets) {
				fprintf(stderr, "[bam_merge_core] number of @SQ headers in '%s' differs from number of target sequences\n", headers);
				if (!reg) return -1;
			}
			// NOTE(review): when the counts differ and `reg` is set, this
			// loop still runs up to hout->n_targets and may index past the
			// end of hheaders->target_name -- confirm.
			for (j = 0; j < hout->n_targets; ++j)
				if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) {
					fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence\n", hheaders->target_name[j], headers);
					if (!reg) return -1;
				}
		}
		swap_header_text(hout, hheaders);
		bam_header_destroy(hheaders);
	}

	// region given: create one index-based iterator per input
	if (reg) {
		int tid, beg, end;
		if (bam_parse_region(hout, reg, &tid, &beg, &end) < 0) {
			fprintf(stderr, "[%s] Malformated region string or undefined reference name\n", __func__);
			return -1;
		}
		for (i = 0; i < n; ++i) {
			bam_index_t *idx;
			// NOTE(review): the result of bam_index_load() is used without
			// a NULL check.
			idx = bam_index_load(fn[i]);
			iter[i] = bam_iter_query(idx, tid, beg, end);
			bam_index_destroy(idx);
		}
	}

	// load the first record of every input into the heap; the sort key is
	// tid (upper 32 bits), pos + 1 shifted left by one, and the strand bit
	for (i = 0; i < n; ++i) {
		heap1_t *h = heap + i;
		h->i = i;
		h->b = (bam1_t*)calloc(1, sizeof(bam1_t));
		if (bam_iter_read(fp[i], iter[i], h->b) >= 0) {
			h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam1_strand(h->b);
			h->idx = idx++;
		}
		else h->pos = HEAP_EMPTY;
	}
	if (flag & MERGE_UNCOMP) level = 0;
	else if (flag & MERGE_LEVEL1) level = 1;
	strcpy(mode, "w");
	if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9);
	// NOTE(review): `mode` is built above but the output is opened with the
	// literal "w", so `level` has no visible effect here -- confirm intent.
	if ((fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w")) == 0) {
		fprintf(stderr, "[%s] fail to create the output file.\n", __func__);
		return -1;
	}
	bam_header_write(fpout, hout);
	bam_header_destroy(hout);
#ifndef _PBGZF_USE
	if (!(flag & MERGE_UNCOMP)) bgzf_mt(fpout, n_threads, 256);
#endif

	// merge loop: write the heap's top record, refill it from the same input
	ks_heapmake(heap, n, heap);
	while (heap->pos != HEAP_EMPTY) {
		bam1_t *b = heap->b;
		if (flag & MERGE_RG) {
			// replace any existing RG tag with the one derived from the file name
			uint8_t *rg = bam_aux_get(b, "RG");
			if (rg) bam_aux_del(b, rg);
			bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]);
		}
		bam_write1_core(fpout, &b->core, b->data_len, b->data);
		if ((j = bam_iter_read(fp[heap->i], iter[heap->i], b)) >= 0) {
			heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam1_strand(b);
			heap->idx = idx++;
		} else if (j == -1) {
			// this input is exhausted
			heap->pos = HEAP_EMPTY;
			free(heap->b->data); free(heap->b);
			heap->b = 0;
		} else fprintf(stderr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]);
		// NOTE(review): in the truncated case heap->pos is left unchanged,
		// so the same entry stays in the heap -- confirm this terminates.
		ks_heapadjust(heap, 0, n, heap);
	}

	// release everything
	if (flag & MERGE_RG) {
		for (i = 0; i != n; ++i) free(RG[i]);
		free(RG); free(RG_len);
	}
	for (i = 0; i != n; ++i) {
		bam_iter_destroy(iter[i]);
		bam_close(fp[i]);
	}
	bam_close(fpout);
	free(fp); free(heap); free(iter);
	return 0;
}
/* Returns the value of bam1_strand(b) as an int through a real function. */
int bam1_strand_(bam1_t *b)
{
	const int strand = bam1_strand(b);
	return strand;
}
/* Counts probability of non-match count along the read after
 * subtracting error prob at that position (using the original
 * orientation). used_pos is an array of ints indicating whether
 * position was used or not (trimmed, clipped etc). alnerrprof and
 * used_pos must be of at least length b->core.l_qseq. Note: will add
 * to alnerrprof and used_pos, i.e. arrays should be initialized to 0 if
 * you don't want aggregate values.
 *
 * ref is indexed with genome positions taken from the record's pos and
 * cigar; positions whose reference base is 'N' are not counted.
 *
 * WARNING code duplication with count_cigar_ops but merging the two
 * functions is messy.
 */
void calc_read_alnerrprof(double *alnerrprof, unsigned long int *used_pos,
                          const bam1_t *b, const char *ref)
{
     /* modelled after bam.c:bam_calend(), bam_format1_core() and
      * pysam's aligned_pairs (./pysam/csamtools.pyx) */
     uint32_t *cigar = bam1_cigar(b);
     uint32_t k, i;
     const bam1_core_t *c = &b->core;
#if 0
     int32_t qlen = (int32_t) bam_cigar2qlen(c, cigar); /* read length */
#else
     int qlen = b->core.l_qseq; /* read length */
#endif
     uint32_t pos = c->pos; /* pos on genome */
     uint32_t qpos = 0; /* pos on read/query */
     uint32_t qpos_org = bam1_strand(b) ? qlen-qpos-1 : qpos;/* original qpos before mapping as possible reverse */

     /* loop over cigar to get aligned bases
      *
      * read: bam_format1_core(NULL, b, BAM_OFDEC);
      */
     for (k=0; k < c->n_cigar; ++k) { /* n_cigar: number of cigar operations */
          int op = cigar[k] & BAM_CIGAR_MASK; /* the cigar operation */
          uint32_t l = cigar[k] >> BAM_CIGAR_SHIFT; /* length of the operation */

          /* following conditionals could be collapsed to much shorter
           * code, but we keep them as they were in pysam's
           * aligned_pairs to make later handling of indels easier */
          if (op == BAM_CMATCH || op == BAM_CDIFF) {
               for (i=pos; i<pos+l; i++) {
                    assert(qpos < qlen);
                    /* case agnostic */
                    char ref_nt = ref[i];
                    char read_nt = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), qpos)];
                    int bq = bam1_qual(b)[qpos];
#if 0
                    printf("[M]MATCH qpos,i,ref,read = %d,%d,%c,%c\n", qpos, i, ref_nt, read_nt);
#endif
                    if (ref_nt != 'N') {
                         if (ref_nt != read_nt || op == BAM_CDIFF) {
                              alnerrprof[qpos_org] += (1.0 - PHREDQUAL_TO_PROB(bq));
                         } /* otherwise leave at 0.0 but count anyway */
                         used_pos[qpos_org] += 1;
                    }
                    qpos += 1;
                    qpos_org = bam1_strand(b) ? qlen-qpos-1 : qpos;
               }
               pos += l;

          } else if (op == BAM_CINS) {
               /* NOTE(review): this branch and the deletion branch below
                * index by qpos, while the match branch uses qpos_org
                * (the original orientation) -- confirm which is intended
                * for reverse-strand reads. */
               for (i=pos; i<pos+l; i++) {
                    assert(qpos < qlen);

                    alnerrprof[qpos] += (1.0 - PHREDQUAL_TO_PROB(INDEL_QUAL_DEFAULT));
                    used_pos[qpos] += 1;
#if 0
                    printf("INS qpos,i = %d,None\n", qpos);
#endif
                    qpos += 1;
                    qpos_org = bam1_strand(b) ? qlen-qpos-1 : qpos;
               }

          } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
               /* a reference skip only advances pos; a deletion is also
                * counted once per deleted base at the current qpos */
               for (i=pos; i<pos+l; i++) {
#if 0
                    printf("DEL qpos,i = None,%d\n", i);
#endif

                    if (op == BAM_CDEL) {
                         alnerrprof[qpos] += (1.0 - PHREDQUAL_TO_PROB(INDEL_QUAL_DEFAULT));
                         used_pos[qpos] += 1;
                    }
               }
               pos += l;
               /* deletion: don't increase qpos */

          } else if (op == BAM_CSOFT_CLIP) {
#if 0
               printf("SOFT CLIP qpos = %d\n", qpos);
#endif
               /* clipped bases are skipped without being counted */
               qpos += l;
               qpos_org = bam1_strand(b) ? qlen-qpos-1 : qpos;

          } else if (op != BAM_CHARD_CLIP) {
               LOG_WARN("Unknown op %d in cigar %s\n", op, cigar_str_from_bam(b));

          }
     } /* for k */

     assert(pos == bam_calend(&b->core, bam1_cigar(b))); /* FIXME correct assert? what if hard clipped? */
     if (qpos != qlen) {
          LOG_FIXME("got qpos=%d and qlen=%d for cigar %s l_qseq %d\n", qpos, qlen, cigar_str_from_bam(b), b->core.l_qseq);
     }
     assert(qpos == qlen); /* FIXME correct assert? What if hard clipped? */

#if 0
     fprintf(stderr, "%s:", __FUNCTION__);
     for (i=0; i< b->core.l_qseq; i++) {
          fprintf(stderr, " %g/%d", alnerrprof[i], used_pos[i]);
     }
     fprintf(stderr, "\n");
#endif
}
/*
 * Add the alignment of read `b` to graph `g`, one node per aligned read
 * base (gaps in the read are skipped).
 *
 * When use_threads is 1, the adjustment of the graph's start position and
 * the initialisation of an empty graph are done while holding graph_mutex.
 * Returns the first node added for a forward-strand read and the last node
 * added for a reverse-strand read.
 */
node_t *graph_add_sam(graph_t *g, bam1_t *b, ref_t *ref, int32_t use_threads)
{
	const int32_t aln_start = b->core.pos+1;
	const int32_t aln_ref_index = b->core.tid;
	bam_aln_t *aln = bam_aln_init(b, ref);
	const uint8_t strand = bam1_strand(b);
	node_t *previous = NULL, *current = NULL, *result = NULL;
	int32_t aln_i, ref_i, k;
	uint8_t type;

	// --- SYNC ON ---
	if(1 == use_threads) pthread_mutex_lock(&graph_mutex); // synchronize start

	if(aln_start < g->position_start) {
		// the read starts before the graph: grow it and move every node list
		// up by the difference
		const int32_t diff = g->position_start - aln_start;
		graph_nodes_realloc(g, g->position_end - aln_start + 1); // alloc more memory if needed
		k = g->position_end-g->position_start;
		while(0 <= k) {
			// swap
			node_list_t *tmp = g->nodes[k+diff];
			g->nodes[k+diff] = g->nodes[k];
			g->nodes[k] = tmp;
			k--;
		}
		g->position_start = aln_start;
	}

	if(1 == g->is_empty) {
		// first read of an empty graph: clear the lists and set the bounds
		for(k=0;k<g->position_end - g->position_start + 1;k++) {
			node_list_clear(g->nodes[k]);
			assert(0 == g->nodes[k]->length); // DEBUG
		}
		g->position_start = aln_start;
		if(ALN_GAP == aln->ref[0]) {
			g->position_start--;
		}
		g->position_end = g->position_start;
		g->contig = aln_ref_index + 1;
		g->is_empty = 0;
	}

	if(1 == use_threads) pthread_mutex_unlock(&graph_mutex); // synchronize end
	// --- SYNC OFF ---

	aln_i = 0;
	ref_i = -1;
	while(aln_i < aln->length) {
		// Skip over a deletion
		while(ALN_GAP == aln->read[aln_i]) {
			aln_i++;
			ref_i++;
		}

		type = (aln->read[aln_i] == aln->ref[aln_i]) ? NODE_MATCH
			: (aln->ref[aln_i] == ALN_GAP) ? NODE_INSERTION
			: NODE_MISMATCH;

		// stay on the same reference position directly after an insertion
		if(!(NULL != previous && NODE_INSERTION == __node_type(previous))) {
			ref_i++;
		}

		current = graph_add_node(g,
				node_init(aln->read[aln_i], type, g->contig, aln_start + ref_i, previous),
				previous,
				use_threads);

		if(NULL == previous && 0 == strand) { // first node and forward strand
			result = current;
		}

		aln_i++;
		previous = current;
	}

	if(1 == strand) {
		result = current;
	}

	bam_aln_free(aln);

	return result;
}
/*
 * Length of the run of reference bases equal (case-insensitively) to
 * ref[anchor], starting at `anchor` itself and walking in direction `step`
 * (+1 or -1). At most 15 positions are examined and the walk stops at the
 * first differing base or at either end of the sequence. Returns 0 when there
 * is no reference or `anchor` lies outside [0, ref_len).
 */
static int mplp_flank_run(const char *ref, int ref_len, int anchor, int step)
{
	const int max_run = 15;
	int k, run = 0;
	if (ref == 0 || anchor < 0 || anchor >= ref_len) return 0;
	for (k = 0; k < max_run; ++k) {
		int idx = anchor + step * k;
		if (idx < 0 || idx >= ref_len) break;
		if (toupper((unsigned char)ref[idx]) != toupper((unsigned char)ref[anchor])) break; // not a strict homopolymer any more
		++run;
	}
	return run;
}

/*
 * Multi-sample pileup driver.
 *
 * conf - run-time options (region, BED filter, reference index, filters, flags).
 * n    - number of input files.
 * fn   - the n input file names; "-" reads from stdin.
 *
 * With MPLP_GLF set, one BCF record (plus an optional indel record) is written
 * per position. Otherwise one tab-separated text line is printed per position:
 * target name, 1-based position, reference base, and for each input the depth
 * followed by either "*\t*" (empty column) or: per-strand base counts, the
 * deleted / inserted sequences seen on each strand (comma-terminated), the
 * number of reads kept and struck by the filters, and the median absolute
 * deviation of the distance-to-read-end of variant-supporting reads.
 *
 * Exits the process on unreadable input, missing index or malformed region.
 * Returns 0.
 */
static int mpileup(mplp_conf_t *conf, int n, char **fn)
{
	extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
	extern void bcf_call_del_rghash(void *rghash);
	mplp_aux_t **data;
	int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len = 0, ref_tid = -1, max_depth, max_indel_depth;
	const bam_pileup1_t **plp;
	bam_mplp_t iter;
	bam_header_t *h = 0;
	char *ref;
	void *rghash = 0;

	bcf_callaux_t *bca = 0;
	bcf_callret1_t *bcr = 0;
	bcf_call_t bc;
	bcf_t *bp = 0;
	bcf_hdr_t *bh = 0;

	bam_sample_t *sm = 0;
	kstring_t buf;
	mplp_pileup_t gplp;

	// Growable scratch space for the text output, reused across columns so
	// that deep or indel-rich columns cannot overflow a fixed-size buffer.
	kstring_t del_fwd, del_rev, ins_fwd, ins_rev;
	float *qpos_buf = 0; // first half: read positions, second half: deviations
	int qpos_cap = 0;    // capacity of each half, in floats

	memset(&gplp, 0, sizeof(mplp_pileup_t));
	memset(&buf, 0, sizeof(kstring_t));
	memset(&bc, 0, sizeof(bcf_call_t));
	memset(&del_fwd, 0, sizeof(kstring_t));
	memset(&del_rev, 0, sizeof(kstring_t));
	memset(&ins_fwd, 0, sizeof(kstring_t));
	memset(&ins_rev, 0, sizeof(kstring_t));
	data = calloc(n, sizeof(void*));
	plp = calloc(n, sizeof(void*));
	n_plp = calloc(n, sizeof(int));
	sm = bam_smpl_init();

	// read the header and initialize data
	for (i = 0; i < n; ++i) {
		bam_header_t *h_tmp;
		data[i] = calloc(1, sizeof(mplp_aux_t));
		data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r");
		if (data[i]->fp == 0) {
			fprintf(stderr, "[%s] fail to open %d-th input.\n", __func__, i+1);
			exit(1);
		}
		data[i]->conf = conf;
		h_tmp = bam_header_read(data[i]->fp);
		if (h_tmp == 0) {
			fprintf(stderr, "[%s] fail to read the header of %d-th input.\n", __func__, i+1);
			exit(1);
		}
		data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet
		bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
		rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
		if (conf->reg) {
			int beg, end;
			bam_index_t *idx;
			idx = bam_index_load(fn[i]);
			if (idx == 0) {
				fprintf(stderr, "[%s] fail to load index for %d-th input.\n", __func__, i+1);
				exit(1);
			}
			if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) {
				fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1);
				exit(1);
			}
			if (i == 0) tid0 = tid, beg0 = beg, end0 = end;
			data[i]->iter = bam_iter_query(idx, tid, beg, end);
			bam_index_destroy(idx);
		}
		if (i == 0) h = h_tmp;
		else {
			// FIXME: to check consistency
			bam_header_destroy(h_tmp);
		}
	}
	gplp.n = sm->n;
	gplp.n_plp = calloc(sm->n, sizeof(int));
	gplp.m_plp = calloc(sm->n, sizeof(int));
	gplp.plp = calloc(sm->n, sizeof(void*));

	fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);
	// write the VCF header
	if (conf->flag & MPLP_GLF) {
		kstring_t s;
		bh = calloc(1, sizeof(bcf_hdr_t));
		s.l = s.m = 0; s.s = 0;
		bp = bcf_open("-", (conf->flag&MPLP_NO_COMP)? "wu" : "w");
		for (i = 0; i < h->n_targets; ++i) {
			kputs(h->target_name[i], &s);
			kputc('\0', &s);
		}
		bh->l_nm = s.l;
		bh->name = malloc(s.l);
		memcpy(bh->name, s.s, s.l);
		s.l = 0;
		for (i = 0; i < sm->n; ++i) {
			kputs(sm->smpl[i], &s);
			kputc('\0', &s);
		}
		bh->l_smpl = s.l;
		bh->sname = malloc(s.l);
		memcpy(bh->sname, s.s, s.l);
		bh->txt = malloc(strlen(BAM_VERSION) + 64);
		bh->l_txt = 1 + sprintf(bh->txt, "##samtoolsVersion=%s\n", BAM_VERSION);
		free(s.s);
		bcf_hdr_sync(bh);
		bcf_hdr_write(bp, bh);
		bca = bcf_call_init(-1., conf->min_baseQ);
		bcr = calloc(sm->n, sizeof(bcf_callret1_t));
		bca->rghash = rghash;
		bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ;
		bca->min_frac = conf->min_frac;
		bca->min_support = conf->min_support;
	}
	if (tid0 >= 0 && conf->fai) { // region is set
		ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len);
		ref_tid = tid0;
		for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0;
	} else ref_tid = -1, ref = 0;
	iter = bam_mplp_init(n, mplp_func, (void**)data);
	max_depth = conf->max_depth;
	if (max_depth * sm->n > 1<<20)
		fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__);
	if (max_depth * sm->n < 8000) {
		max_depth = 8000 / sm->n;
		fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth);
	}
	max_indel_depth = conf->max_indel_depth * sm->n;
	bam_mplp_set_maxcnt(iter, max_depth);

	while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) {
		if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
		if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
		if (tid != ref_tid) {
			// moved to another target: swap in its reference sequence
			free(ref); ref = 0;
			if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len);
			for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid;
			ref_tid = tid;
		}
		if (conf->flag & MPLP_GLF) {
			int total_depth, _ref0, ref16;
			bcf1_t *b = calloc(1, sizeof(bcf1_t));
			for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
			group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG);
			_ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
			ref16 = bam_nt16_table[_ref0];
			for (i = 0; i < gplp.n; ++i)
				bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i);
			bcf_call_combine(gplp.n, bcr, ref16, &bc);
			bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
						 (conf->flag&MPLP_FMT_SP), 0, 0);
			bcf_write(bp, bh, b);
			bcf_destroy(b);
			// call indels
			if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) {
				for (i = 0; i < gplp.n; ++i)
					bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
				if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) {
					b = calloc(1, sizeof(bcf1_t));
					bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
								 (conf->flag&MPLP_FMT_SP), bca, ref);
					bcf_write(bp, bh, b);
					bcf_destroy(b);
				}
			}
		} else {
			// Reference base of this column, 'N' when no reference is available.
			const char ref_base = (ref && pos < ref_len)? ref[pos] : 'N';
			// Homopolymer context: the longer of the runs starting one base to
			// the left and one base to the right of this position. It depends
			// only on the reference, so it is computed once per column.
			const int replC = mplp_flank_run(ref, ref_len, pos - 1, -1);
			const int reprC = mplp_flank_run(ref, ref_len, pos + 1, +1);
			const int repT = (replC < reprC)? reprC : replC;

			printf("%s\t%d\t%c", h->target_name[tid], pos + 1, ref_base);
			for (i = 0; i < n; ++i) {
				int j;
				printf("\t%d\t", n_plp[i]);
				if (n_plp[i] == 0) {
					printf("*\t*"); // FIXME: printf() is very slow...
					if (conf->flag & MPLP_PRINT_POS) printf("\t*");
				} else {
					//MDW start
					int countChars[16][2]; // [4-bit base code][strand]
					int numStruck = 0;     // reads rejected by the filters below
					int numGood = 0;       // reads that passed the filters
					int qposCnt = 0;
					int tti;
					float *qposP, *absDev;
					float medqpos = -1;
					float medAbsDev = -1;

					memset(countChars, 0, sizeof(countChars));
					del_fwd.l = del_rev.l = ins_fwd.l = ins_rev.l = 0;
					if (n_plp[i] > qpos_cap) {
						// at most one entry per read in each half
						float *grown = realloc(qpos_buf, 2 * (size_t)n_plp[i] * sizeof(float));
						if (grown == 0) {
							fprintf(stderr, "[%s] out of memory.\n", __func__);
							exit(1);
						}
						qpos_buf = grown;
						qpos_cap = n_plp[i];
					}
					qposP = qpos_buf;
					absDev = qpos_buf + qpos_cap;

					//for each position in the pileup column
					for (j = 0; j < n_plp[i]; ++j) {
						const bam_pileup1_t *p = plp[i] + j;
						const int strand = bam1_strand(p->b);
						int read_base;
						/* SAME LOGIC AS pileup_seq() */
						if (p->is_refskip) continue; // never count intron gaps in numStruck
						if (p->is_del) continue;     // skip deletion gap, after first position which is the first aligned char

						if (p->b->core.qual < conf->min_mqToCount ||            // mapping quality
							conf->maxrepC < repT ||                             // max homopolymer run
							bam1_qual(p->b)[p->qpos] < conf->min_baseQ ||       // base quality (deletions were skipped above)
							p->alignedQPosBeg <= (conf->trimEnd) ||
							p->alignedQPosEnd <= (conf->trimEnd) ||             // trimEnd is 1-based
							p->zf == 1 ||                                       // fusion tag
							p->ih > conf->maxIH ||                              // max hit index
							(p->nmd > conf->maxNM) ||                           // max mismatch
							(conf->flagFilter == 1 && !(p->b->core.flag&BAM_FPROPER_PAIR)) ||  // optionally keep only proper pairs
							(conf->flagFilter == 2 && p->b->core.flag&BAM_FSECONDARY) ||       // optionally strike secondary
							(conf->flagFilter == 3 && p->b->core.flag&BAM_FDUP) ||             // optionally strike dup
							(conf->flagFilter == 4 && (p->b->core.flag&BAM_FDUP || p->b->core.flag&BAM_FSECONDARY)) || // optionally strike secondary or dup
							(conf->flagFilter == 5 && (p->b->core.flag&BAM_FDUP || p->b->core.flag&BAM_FSECONDARY || p->b->core.flag&BAM_FQCFAIL || !(p->b->core.flag&BAM_FPROPER_PAIR))) // optionally strike secondary, dup and QCfail
						) {
							numStruck++;
							continue;
						}

						// Every surviving read is counted exactly once, as a
						// plain base, a deletion or an insertion.
						numGood++;
						read_base = bam1_seqi(bam1_seq(p->b), p->qpos);
						if (p->indel == 0) {
							countChars[read_base][strand]++;
						} else if (p->indel < 0) {
							// indel<0 refers to the next position: record the
							// deleted reference bases, 'N' where the reference
							// is missing or too short.
							kstring_t *dst = strand? &del_rev : &del_fwd;
							for (tti = 1; tti <= -p->indel; tti++)
								kputc((ref && pos + tti < ref_len)? ref[pos + tti] : 'N', dst);
							kputc(',', dst);
						} else {
							// record the inserted read bases that follow this position
							kstring_t *dst = strand? &ins_rev : &ins_fwd;
							for (tti = 1; tti <= p->indel; tti++)
								kputc(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + tti)], dst);
							kputc(',', dst);
						}

						//calculate position of variant within aligned read - no soft clips
						if (p->indel != 0 ||
							toupper((unsigned char)ref_base) != toupper((unsigned char)bam_nt16_rev_table[read_base])) {
							//distance to the nearer end of the aligned read. removes soft clips.
							int distToEnd = (p->alignedQPosBeg < p->alignedQPosEnd) ? p->alignedQPosBeg : p->alignedQPosEnd;
							qposP[qposCnt] = distToEnd;
							qposCnt++;
						}
					}

					// print per-base counts by +/- strand
					// NOTE(review): index 7 for the last pair is kept from the
					// original; confirm it is the intended base code.
					printf("\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d",
						   countChars[1][0], countChars[1][1],
						   countChars[2][0], countChars[2][1],
						   countChars[4][0], countChars[4][1],
						   countChars[8][0], countChars[8][1],
						   countChars[7][0], countChars[7][1]);
					putchar('\t');
					if (del_fwd.l) fwrite(del_fwd.s, 1, del_fwd.l, stdout);
					putchar('\t');
					if (del_rev.l) fwrite(del_rev.s, 1, del_rev.l, stdout);
					putchar('\t');
					if (ins_fwd.l) fwrite(ins_fwd.s, 1, ins_fwd.l, stdout);
					putchar('\t');
					if (ins_rev.l) fwrite(ins_rev.s, 1, ins_rev.l, stdout);
					printf("\t%d\t%d", numGood, numStruck);

					// get non-ref qpos variation
					if (qposCnt > 0) {
						medqpos = median(qposCnt, qposP);
						for (tti = 0; tti < qposCnt; tti++) {
							// floating-point absolute value; abs() would
							// truncate the deviation to an int first
							float dev = medqpos - qposP[tti];
							absDev[tti] = (dev < 0)? -dev : dev;
						}
						// NOTE(review): the count passed here is one less than
						// in the call above; kept as in the original, confirm
						// against median()'s contract.
						medAbsDev = median(qposCnt-1, absDev);
					}
					printf("\t%f", medAbsDev);
					///END MDW
				}
			}
			putchar('\n');
		}
	}

	bcf_close(bp);
	bam_smpl_destroy(sm); free(buf.s);
	free(del_fwd.s); free(del_rev.s); free(ins_fwd.s); free(ins_rev.s);
	free(qpos_buf);
	for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]);
	free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp);
	bcf_call_del_rghash(rghash);
	bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr);
	bam_mplp_destroy(iter);
	bam_header_destroy(h);
	for (i = 0; i < n; ++i) {
		bam_close(data[i]->fp);
		if (data[i]->iter) bam_iter_destroy(data[i]->iter);
		free(data[i]);
	}
	free(data); free(plp); free(ref); free(n_plp);
	return 0;
}
/*
 * Read up to n_needed reads from the BAM stream in `bs`.
 *
 * bs        - sequence reader; bs->which selects read1 (bit 1), read2 (bit 2)
 *             and/or records flagged as neither (bit 4).
 * n_needed  - maximum number of reads to return.
 * n         - out: number of reads actually returned.
 * is_comp   - forwarded to seq_reverse() for the rseq copy.
 * trim_qual - when >= 1, reads are trimmed with bwa_trim_read().
 *
 * Returns a calloc'ed array of *n reads, or 0 when nothing was read.
 * Aborts through err_fatal_simple() on a read error other than -1.
 */
static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual)
{
	bwa_seq_t *seqs, *p;
	int n_seqs, l, i;
	long n_trimmed = 0, n_tot = 0;
	bam1_t *b;
	int res;

	// Nothing requested: avoid writing into a zero-sized array below.
	if (n_needed <= 0) {
		*n = 0;
		return 0;
	}
	b = bam_init1();
	n_seqs = 0;
	seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
#ifdef USE_HTSLIB
	while ((res = sam_read1(bs->fp, bs->h, b)) >= 0) {
#else
	while ((res = bam_read1(bs->fp, b)) >= 0) {
#endif
		uint8_t *s, *q;
		int go = 0;
		if ((bs->which & 1) && (b->core.flag & BAM_FREAD1)) go = 1;
		if ((bs->which & 2) && (b->core.flag & BAM_FREAD2)) go = 1;
		if ((bs->which & 4) && !(b->core.flag& BAM_FREAD1) && !(b->core.flag& BAM_FREAD2))go = 1;
		if (go == 0) continue;
		l = b->core.l_qseq;
		p = &seqs[n_seqs++];
		p->tid = -1; // no assigned to a thread
		p->qual = 0;
		p->full_len = p->clip_len = p->len = l;
		n_tot += p->full_len;
#ifdef USE_HTSLIB
		s = bam_get_seq(b); q = bam_get_qual(b);
#else
		s = bam1_seq(b); q = bam1_qual(b);
#endif
		p->seq = (ubyte_t*)calloc(p->len + 1, 1);
		p->qual = (ubyte_t*)calloc(p->len + 1, 1);
		for (i = 0; i != p->full_len; ++i) {
#ifdef USE_HTSLIB
			p->seq[i] = bam_nt16_nt4_table[(int)bam_seqi(s, i)];
#else
			p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)];
#endif
			// shift qualities by 33, capped at 126
			p->qual[i] = q[i] + 33 < 126? q[i] + 33 : 126;
		}
#ifdef USE_HTSLIB
		if (bam_is_rev(b)) { // then reverse
#else
		if (bam1_strand(b)) { // then reverse
#endif
			seq_reverse(p->len, p->seq, 1);
			seq_reverse(p->len, p->qual, 0);
		}
		if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
		p->rseq = (ubyte_t*)calloc(p->full_len, 1);
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);
#ifdef USE_HTSLIB
		p->name = strdup((const char*)bam_get_qname(b));
#else
		p->name = strdup((const char*)bam1_qname(b));
#endif
		if (n_seqs == n_needed) break;
	}
	if (res < 0 && res != -1) err_fatal_simple("Error reading bam file");
	*n = n_seqs;
	if (n_seqs && trim_qual >= 1)
		fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
	if (n_seqs == 0) {
		free(seqs);
		bam_destroy1(b);
		return 0;
	}
	bam_destroy1(b);
	return seqs;
}

/* Barcode bases whose quality (after removing the +33 offset) is below this are lower-cased. */
#define BARCODE_LOW_QUAL 13

/*
 * Read up to n_needed reads from `bs`.
 *
 * bs        - sequence reader; BAM input is delegated to bwa_read_bam().
 * n_needed  - maximum number of reads to return.
 * n         - out: number of reads actually returned.
 * mode      - bit flags (BWA_MODE_COMPREAD, BWA_MODE_IL13, BWA_MODE_CFY); the
 *             barcode length is taken from mode>>24.
 * trim_qual - when >= 1, reads with qualities are trimmed with bwa_trim_read().
 *
 * Returns a calloc'ed array of *n reads, or 0 when nothing was read or the
 * barcode length exceeds BWA_MAX_BCLEN (in which case *n is left untouched).
 */
bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int trim_qual)
{
	bwa_seq_t *seqs, *p;
	kseq_t *seq = bs->ks;
	int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24;
	long n_trimmed = 0, n_tot = 0;

	if (l_bc > BWA_MAX_BCLEN) {
		fprintf(stderr, "[%s] the maximum barcode length is %d.\n", __func__, BWA_MAX_BCLEN);
		return 0;
	}
	if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input
	// Nothing requested: avoid writing into a zero-sized array below.
	if (n_needed <= 0) {
		*n = 0;
		return 0;
	}
	n_seqs = 0;
	seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
	while ((l = kseq_read(seq)) >= 0) {
		if ((mode & BWA_MODE_CFY) && (seq->comment.l != 0)) {
			// skip reads that are marked to be filtered by Casava
			// (strchr replaces the legacy BSD index())
			char *s = strchr(seq->comment.s, ':');
			if (s && *(++s) == 'Y') {
				continue;
			}
		}
		if (is_64 && seq->qual.l)
			for (i = 0; i < (int)seq->qual.l; ++i) seq->qual.s[i] -= 31;
		if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length
		p = &seqs[n_seqs++];
		if (l_bc) { // then trim barcode
			for (i = 0; i < l_bc; ++i)
				p->bc[i] = (seq->qual.l && seq->qual.s[i]-33 < BARCODE_LOW_QUAL)? tolower(seq->seq.s[i]) : toupper(seq->seq.s[i]);
			p->bc[i] = 0;
			// shift the remaining bases (and qualities) to the front
			for (; i < seq->seq.l; ++i)
				seq->seq.s[i - l_bc] = seq->seq.s[i];
			seq->seq.l -= l_bc; seq->seq.s[seq->seq.l] = 0;
			if (seq->qual.l) {
				for (i = l_bc; i < seq->qual.l; ++i)
					seq->qual.s[i - l_bc] = seq->qual.s[i];
				seq->qual.l -= l_bc; seq->qual.s[seq->qual.l] = 0;
			}
			l = seq->seq.l;
		} else p->bc[0] = 0;
		p->tid = -1; // no assigned to a thread
		p->qual = 0;
		p->full_len = p->clip_len = p->len = l;
		n_tot += p->full_len;
		p->seq = (ubyte_t*)calloc(p->full_len, 1);
		for (i = 0; i != p->full_len; ++i)
			p->seq[i] = nst_nt4_table[(int)seq->seq.s[i]];
		if (seq->qual.l) { // copy quality
			p->qual = (ubyte_t*)strdup((char*)seq->qual.s);
			if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
		}
		p->rseq = (ubyte_t*)calloc(p->full_len, 1);
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);
		p->name = strdup((const char*)seq->name.s);
		{ // trim /[12]$
			int t = strlen(p->name);
			if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0';
		}
		if (n_seqs == n_needed) break;
	}
	*n = n_seqs;
	if (n_seqs && trim_qual >= 1)
		fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
	if (n_seqs == 0) {
		free(seqs);
		return 0;
	}
	return seqs;
}

/*
 * Release an array of n_seqs reads together with every per-read buffer
 * (multi-hit CIGARs, name, sequences, qualities, alignments, MD, CIGAR).
 */
void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs)
{
	int i, j;
	for (i = 0; i != n_seqs; ++i) {
		bwa_seq_t *p = seqs + i;
		for (j = 0; j < p->n_multi; ++j)
			if (p->multi[j].cigar) free(p->multi[j].cigar);
		free(p->name);
		free(p->seq); free(p->rseq); free(p->qual); free(p->aln); free(p->md); free(p->multi);
		free(p->cigar);
	}
	free(seqs);
}