Пример #1
0
/*
 * Collect depth and quality statistics for one pileup column.
 *
 *   buf           pileup entries of the column
 *   n_reads       number of entries in buf
 *   ref_base      reference base, compared directly with the value that
 *                 bam1_seqi() yields for each read
 *   wanted_bases  bit mask; bit j enables quality accumulation for slot j
 *   dqs           output; zeroed first, then filled in
 *
 * Entries flagged as deletions and reads flagged BAM_FUNMAP are ignored.
 * After the scan every accumulated quality sum is replaced by
 * sum / count + .499 (slots and totals with a zero count are left at 0).
 */
void get_dqstats(
        const bam_pileup1_t* buf,
        int n_reads,
        int ref_base,
        uint32_t wanted_bases,
        dqstats_t *dqs
        )
{
    int read_idx;
    int slot;

    memset(dqs, 0, sizeof(dqstats_t));

    for (read_idx = 0; read_idx < n_reads; ++read_idx) {
        const bam_pileup1_t *entry = &buf[read_idx];
        int base;
        int dp4_slot;

        if (entry->is_del)
            continue;
        if (entry->b->core.flag & BAM_FUNMAP)
            continue;

        ++dqs->total_depth;
        dqs->total_mean_mapQ += entry->b->core.qual;

        base = bam1_seqi(bam1_seq(entry->b), entry->qpos);

        /* dp4[0..1]: reads equal to ref_base, indexed by strand;
         * dp4[2..3]: all other reads, indexed by strand. */
        dp4_slot = bam1_strand(entry->b);
        if (base != ref_base)
            dp4_slot += 2;
        ++dqs->dp4[dp4_slot];

        for (slot = 0; slot < 4; ++slot) {
            const int mask = 1 << slot;

            /* The slot is credited only when no bit of base lies outside mask. */
            if ((base & mask) != base)
                continue;

            ++dqs->base_occ[slot];

            if (!(mask & wanted_bases))
                continue;

            dqs->mean_baseQ[slot] += bam1_qual(entry->b)[entry->qpos];
            dqs->mean_mapQ[slot] += entry->b->core.qual;
        }
    }

    /* Turn the per-slot sums into means. */
    for (slot = 0; slot < 4; ++slot) {
        if (dqs->base_occ[slot] > 0) {
            dqs->mean_baseQ[slot] = dqs->mean_baseQ[slot]/(double)dqs->base_occ[slot] + .499;
            dqs->mean_mapQ[slot] = dqs->mean_mapQ[slot]/(double)dqs->base_occ[slot] + .499;
        }
    }

    if (dqs->total_depth > 0)
        dqs->total_mean_mapQ = dqs->total_mean_mapQ / (double)dqs->total_depth + .499;
}
Пример #2
0
static int fetch_disc_read_callback(const bam1_t* alignment, void* data) {
    //    MEI_data* mei_data = static_cast<MEI_data*>(data);
    std::pair<MEI_data*, UserDefinedSettings*>* env = static_cast<std::pair<MEI_data*, UserDefinedSettings*>*>(data);
    MEI_data* mei_data = env->first;
    UserDefinedSettings* userSettings = env->second;
    if (!(alignment->core.flag & BAM_FUNMAP || alignment->core.flag & BAM_FMUNMAP) && // Both ends are mapped.
        !is_concordant(alignment, mei_data->current_insert_size) &&                   // Ends map discordantly.
        // Extra check for (very) large mapping distance.  This is done beside the check for read
        // discordance to speed up computation by ignoring signals from small structural variants.
        (alignment->core.tid != alignment->core.mtid ||
         abs(alignment->core.pos - alignment->core.mpos) > userSettings->MIN_DD_MAP_DISTANCE)) {
            
            // Save alignment as simple_read object.
            std::string read_name = enrich_read_name(bam1_qname(alignment), alignment->core.flag & BAM_FREAD1);
            char strand = bam1_strand(alignment)? Minus : Plus;
            char mate_strand = bam1_mstrand(alignment)? Minus : Plus;
            std::string read_group;
            get_read_group(alignment, read_group);
            std::string sample_name;
            get_sample_name(read_group, mei_data->sample_names, sample_name);
            
            simple_read* read = new simple_read(read_name, alignment->core.tid, alignment->core.pos, strand, sample_name,
                                                get_sequence(bam1_seq(alignment), alignment->core.l_qseq),
                                                alignment->core.mtid, alignment->core.mpos, mate_strand);
            mei_data->discordant_reads.push_back(read);
        }
    return 0;
}
Пример #3
0
/*
 * Strict "less than" ordering of two alignments.
 *
 * When g_is_by_qname is set, records are ordered by strnum_cmp() on the query
 * name; equal names are ordered by the flag bits selected with mask 0xc0.
 *
 * Otherwise records are ordered by a 64-bit key: tid in the upper 32 bits,
 * (pos + 1) << 1 below it and the strand in the lowest bit.  The position
 * part is shifted as an unsigned 32-bit value: a signed int shift overflows
 * for pos + 1 >= 2^30 and the resulting negative value would be sign-extended
 * over the tid bits when widened to 64 bits.
 */
static inline int bam1_lt(const bam1_p a, const bam1_p b)
{
	if (g_is_by_qname) {
		int t = strnum_cmp(bam1_qname(a), bam1_qname(b));
		return (t < 0 || (t == 0 && (a->core.flag&0xc0) < (b->core.flag&0xc0)));
	} else {
		uint64_t key_a = (uint64_t)a->core.tid<<32 | (uint32_t)((int32_t)a->core.pos+1)<<1 | bam1_strand(a);
		uint64_t key_b = (uint64_t)b->core.tid<<32 | (uint32_t)((int32_t)b->core.pos+1)<<1 | bam1_strand(b);
		return key_a < key_b;
	}
}
Пример #4
0
/*
 * Write the pileup symbols of one read at one reference position to stdout.
 *
 *   p        pileup entry of the read
 *   pos      reference position, used as index into ref
 *   ref_len  number of usable characters in ref
 *   ref      reference sequence, may be NULL
 *
 * Output order: optional read-start marker, the base (or deletion / reference
 * skip symbol), an optional insertion or deletion description, optional
 * read-end marker.  Symbols are lower case for reverse-strand reads and upper
 * case otherwise.
 */
static inline void pileup_seq(const bam_pileup1_t *p, int pos, int ref_len, const char *ref)
{
	int k;

	/* Read start: '^' plus the mapping quality as a character, capped at 126. */
	if (p->is_head) {
		putchar('^');
		if (p->b->core.qual > 93) putchar(126);
		else putchar(p->b->core.qual + 33);
	}

	if (p->is_del) {
		/* No base here: reference skip is '<' or '>' by strand, deletion is '*'. */
		if (p->is_refskip) putchar(bam1_strand(p->b)? '<' : '>');
		else putchar('*');
	} else {
		int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
		int matches_ref;

		if (ref) {
			/* Positions beyond ref_len are treated as 'N'. */
			int rb = pos < ref_len? ref[pos] : 'N';
			matches_ref = (c == '=' || bam_nt16_table[c] == bam_nt16_table[rb]);
		} else {
			matches_ref = (c == '=');
		}

		if (matches_ref) c = bam1_strand(p->b)? ',' : '.';
		else c = bam1_strand(p->b)? tolower(c) : toupper(c);
		putchar(c);
	}

	if (p->indel > 0) {
		/* Insertion: '+', the length, then the inserted read bases. */
		putchar('+');
		printw(p->indel, stdout);
		for (k = 0; k < p->indel; ++k) {
			int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + k + 1)];
			putchar(bam1_strand(p->b)? tolower(c) : toupper(c));
		}
	} else if (p->indel < 0) {
		/* Deletion: the (negative) length, then the deleted reference bases,
		 * 'N' where no reference character is available. */
		printw(p->indel, stdout);
		for (k = 0; k < -p->indel; ++k) {
			int c = (ref && (int)pos+k+1 < ref_len)? ref[pos+k+1] : 'N';
			putchar(bam1_strand(p->b)? tolower(c) : toupper(c));
		}
	}

	if (p->is_tail) putchar('$');
}
Пример #5
0
/*
 * Read up to n_needed reads from the BAM stream in bs and convert them to
 * bwa_seq_t records.
 *
 *   bs         input; bs->which selects the records to keep
 *              (bit 1: BAM_FREAD1 set, bit 2: BAM_FREAD2 set,
 *               bit 4: neither of the two flags set)
 *   n_needed   capacity of the returned array
 *   n          receives the number of records stored
 *   is_comp    forwarded to seq_reverse() when rseq is built
 *   trim_qual  when >= 1, bwa_trim_read() is applied to every read
 *
 * Returns the array, or 0 when no read was stored.  A negative result other
 * than -1 from bam_read1() is reported through err_fatal_simple().
 */
static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual)
{
	bwa_seq_t *seqs, *cur;
	int n_seqs = 0, l, k;
	long n_trimmed = 0, n_tot = 0;
	bam1_t *b = bam_init1();
	int res;

	seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
	while ((res = bam_read1(bs->fp, b)) >= 0) {
		const int is_read1 = (b->core.flag & BAM_FREAD1) != 0;
		const int is_read2 = (b->core.flag & BAM_FREAD2) != 0;
		const int selected =
			((bs->which & 1) && is_read1) ||
			((bs->which & 2) && is_read2) ||
			((bs->which & 4) && !is_read1 && !is_read2);
		uint8_t *packed, *phred;

		if (!selected) continue;

		l = b->core.l_qseq;
		cur = &seqs[n_seqs++];
		cur->tid = -1; // not assigned to a thread
		cur->qual = 0;
		// keep this as one chained assignment: each field receives the value
		// actually stored in the field to its right
		cur->full_len = cur->clip_len = cur->len = l;
		n_tot += cur->full_len;

		packed = bam1_seq(b);
		phred = bam1_qual(b);
		cur->seq = (ubyte_t*)calloc(cur->len + 1, 1);
		cur->qual = (ubyte_t*)calloc(cur->len + 1, 1);
		for (k = 0; k != cur->full_len; ++k) {
			const int ascii_q = phred[k] + 33;
			cur->seq[k] = bam_nt16_nt4_table[(int)bam1_seqi(packed, k)];
			cur->qual[k] = ascii_q < 126? ascii_q : 126; // cap at 126
		}

		if (bam1_strand(b)) { // reverse-strand record: flip sequence and qualities
			seq_reverse(cur->len, cur->seq, 1);
			seq_reverse(cur->len, cur->qual, 0);
		}

		if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, cur);

		cur->rseq = (ubyte_t*)calloc(cur->full_len, 1);
		memcpy(cur->rseq, cur->seq, cur->len);
		seq_reverse(cur->len, cur->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(cur->len, cur->rseq, is_comp);
		cur->name = strdup((const char*)bam1_qname(b));

		if (n_seqs == n_needed) break;
	}

	if (res < 0 && res != -1) err_fatal_simple("Error reading bam file");

	*n = n_seqs;
	if (n_seqs && trim_qual >= 1)
		fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);

	bam_destroy1(b);
	if (n_seqs == 0) {
		free(seqs);
		return 0;
	}
	return seqs;
}
Пример #6
0
// Fill table T with the reads of the indexed BAM file `reads_fn` that overlap
// the intervals in `is` and lie on the interval's strand.
//
// T is created here with one slot per target sequence and receives a copy of
// every target name.  Intervals whose sequence name is unknown to the BAM
// header are ignored.  failf() is called when the file or its index cannot
// be opened.
void hash_reads( table* T, const char* reads_fn, interval_stack* is )
{
    samfile_t* reads_f = samopen( reads_fn, "rb", NULL );
    if( !reads_f ) failf( "Can't open bam file '%s'.", reads_fn );

    bam_index_t* reads_index = bam_index_load( reads_fn );
    if( !reads_index ) failf( "Can't open bam index '%s.bai'.", reads_fn );

    bam_init_header_hash( reads_f->header );

    // One table slot and one name copy per target sequence.
    table_create( T, reads_f->header->n_targets );
    T->seq_names = (char**)malloc( sizeof(char*) * reads_f->header->n_targets );
    for( size_t seq_idx = 0; seq_idx < reads_f->header->n_targets; ++seq_idx ) {
        T->seq_names[seq_idx] = strdup(reads_f->header->target_name[seq_idx]);
    }

    log_puts( LOG_MSG, "hashing reads ... \n" );
    log_indent();

    bam1_t* read = bam_init1();

    for( interval_stack::iterator iv = is->begin(); iv != is->end(); ++iv ) {
        const int tid = bam_get_tid( reads_f->header, iv->seqname );
        if( tid >= 0 ) {
            bam_iter_t read_iter = bam_iter_query( reads_index, tid,
                                                   iv->start, iv->end );

            while( bam_iter_read( reads_f->x.bam, read_iter, read ) >= 0 ) {
                // only reads on the interval's strand are counted
                if( bam1_strand(read) != iv->strand ) continue;
                table_inc( T, read );
            }

            bam_iter_destroy(read_iter);
        }
    }

    bam_destroy1(read);

    log_unindent();
    log_printf( LOG_MSG, "done. (%zu unique reads hashed)\n", T->m );

    bam_index_destroy(reads_index);
    samclose(reads_f);
}
Пример #7
0
/*
 * Return the base qualities of `bam` as a newly allocated, NUL-terminated
 * string in which every quality value is offset by 33.  The string is
 * reversed when bd->reverseComplement is set and bam1_strand(bam) is 1.
 * The buffer comes from Calloc().
 */
static char *_bamqual(const bam1_t * bam, BAM_DATA bd)
{
    const uint32_t len = bam->core.l_qseq;
    const unsigned char *src = bam1_qual(bam);
    const unsigned char *const src_end = src + len;
    char *out = Calloc(len + 1, char);
    char *dst = out;

    while (src != src_end)
        *dst++ = *src++ + 33;

    if (bd->reverseComplement && (bam1_strand(bam) == 1))
        _reverse(out, len);

    out[len] = '\0';
    return out;
}
Пример #8
0
/*!
 @abstract     Get the color quality of the color encoding the previous and current base
 @param b      pointer to an alignment
 @param i      The i-th position, 0-based
 @return       color quality

 @discussion   Returns 0 if no color quality is available: the "CQ" tag is
               missing, bam_aux2Z() yields no string for it, or the requested
               position lies outside the tag value.
 */
char bam_aux_getCQi(bam1_t *b, int i)
{
	uint8_t *c = bam_aux_get(b, "CQ");
	char *cq = NULL;
	int cq_len;

	// return 0 if the tag was not found
	if(0 == c) return 0;

	cq = bam_aux2Z(c);
	// NOTE(review): bam_aux2Z() presumably returns NULL for a tag that is not
	// a string; guard so strlen() never sees a NULL pointer
	if(NULL == cq) return 0;

	cq_len = (int)strlen(cq);
	// adjust for strandedness: reverse-strand reads are indexed from the end
	if(bam1_strand(b)) i = cq_len - 1 - i;

	// never read outside the tag value
	if(i < 0 || cq_len <= i) return 0;
	return cq[i];
}
Пример #9
0
/*!
 @abstract     Get the color encoding the previous and current base
 @param b      pointer to an alignment
 @param i      The i-th position, 0-based
 @return       color

 @discussion   Returns 0 if no color is available: the "CS" tag is missing,
               bam_aux2Z() yields no string for it, or the requested position
               lies outside the tag value.
 */
char bam_aux_getCSi(bam1_t *b, int i)
{
	uint8_t *c = bam_aux_get(b, "CS");
	char *cs = NULL;
	int cs_len;

	// return 0 if the tag was not found
	if(0 == c) return 0;

	cs = bam_aux2Z(c);
	// NOTE(review): bam_aux2Z() presumably returns NULL for a tag that is not
	// a string; guard so strlen() never sees a NULL pointer
	if(NULL == cs) return 0;

	cs_len = (int)strlen(cs);
	// adjust for strandedness and leading adaptor: reverse-strand reads are
	// indexed from the end, forward reads skip the first character
	if(bam1_strand(b)) i = cs_len - 1 - i;
	else i++;

	// never read outside the tag value
	if(i < 0 || cs_len <= i) return 0;
	return cs[i];
}
Пример #10
0
/*
 * Return the read sequence of `bam` as a newly allocated, NUL-terminated
 * string.  Each 4-bit base code obtained with bam1_seqi() is translated
 * through a 16-entry lookup table.  The string is reverse-complemented when
 * bd->reverseComplement is set and bam1_strand(bam) is 1.  The buffer comes
 * from Calloc().
 */
static char *_bamseq(const bam1_t * bam, BAM_DATA bd)
{
    /* index = 4-bit base code; only the first 16 characters are ever used */
    static const char key[] = "-ACMGRSVTWYHKDBN";

    const uint32_t len = bam->core.l_qseq;
    const unsigned char *seq = bam1_seq(bam);
    char *out = Calloc(len + 1, char);
    uint32_t k = 0;

    while (k < len) {
        out[k] = key[bam1_seqi(seq, k)];
        ++k;
    }

    if (bd->reverseComplement && (bam1_strand(bam) == 1))
        _reverseComplement(out, len);

    out[len] = '\0';
    return out;
}
Пример #11
0
// Returns true for a given read whether it is concordantly mapped together with its mate.
bool is_concordant(const bam1_t* read, unsigned int insert_size) {
    if (read->core.flag & BAM_FUNMAP || read->core.flag & BAM_FMUNMAP) {
        // At least one of the pair is unmapped.
        return false;
    }
    if (read->core.tid != read->core.mtid) {
        // Reads mapping to different chromosomes.
        return false;
    }
    if (bam1_strand(read) == bam1_mstrand(read)) {
        // Reads mapping to the same strand.
        return false;
    }
    
    // Return true if insert size is as expected.  This definition of concordance is
    // equivalent to read-pair construction elsewhere in Pindel.
    return (unsigned int) abs(read->core.isize) < read->core.l_qseq + 2 * insert_size;
}
Пример #12
0
/*!
 @abstract     Get the color error profile at the given position
 @param b      pointer to an alignment
 @param i      The i-th position, 0-based
 @return       the original color if the color was an error, '-' (dash) otherwise

 @discussion   Returns 0 if the "CS" tag is not found.  The color stored in
               the tag is compared with the color that bam_aux_ntnt2cs()
               derives from the previous and the current base of the read.
 */
char bam_aux_getCEi(bam1_t *b, int i)
{
	uint8_t *tag = bam_aux_get(b, "CS");
	char *cs = NULL;
	int cs_i;
	char prev_b, cur_b;
	char cur_color, cor_color;

	// nothing to report without the tag
	if(0 == tag) return 0;

	cs = bam_aux2Z(tag);

	// locate the color and the previous base; both depend on the strand
	if(bam1_strand(b)) { // reverse strand: the color string is indexed from its end
		cs_i = strlen(cs) - 1 - i;
		// the color next to the adaptor uses the adaptor base, which must be rc'ed
		if(1 == cs_i) prev_b = "TGCAN"[(int)bam_aux_nt2int(cs[0])];
		else prev_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i+1)];
	}
	else { // forward strand: skip the leading adaptor character
		cs_i = i + 1;
		// the first color uses the adaptor base itself
		if(0 == i) prev_b = cs[0];
		else prev_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i-1)];
	}

	// observed color and current base
	cur_color = cs[cs_i];
	cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)];

	// color implied by the two bases
	cor_color = bam_aux_ntnt2cs(prev_b, cur_b);

	return (cur_color == cor_color) ? '-' : cur_color;
}
Пример #13
0
// Convert one BAM record into a bwa_seq_t (adapted from bwa_read_bam).
//
//   raw        source record
//   p          destination; seq, qual and rseq are freshly allocated
//   is_comp    forwarded to seq_reverse() when rseq is built
//   trim_qual  when >= 1, bwa_trim_read() is applied; its result is discarded
//
// p->name is not set: the name stays available in the original record.
// No trimming statistics are kept, there is no place to put the tally.
void bam1_to_seq(bam1_t *raw, bwa_seq_t *p, int is_comp, int trim_qual)
{
    uint8_t *packed, *phred;
    int k, l = raw->core.l_qseq;

    p->tid = -1; // not assigned to a thread
    p->qual = 0;
    // keep this as one chained assignment: each field receives the value
    // actually stored in the field to its right
    p->full_len = p->clip_len = p->len = l;

    packed = bam1_seq(raw);
    phred = bam1_qual(raw);
    p->seq = (ubyte_t*)calloc(p->len + 1, 1);
    p->qual = (ubyte_t*)calloc(p->len + 1, 1);
    for (k = 0; k != p->full_len; ++k) {
        const int ascii_q = phred[k] + 33;
        p->seq[k] = bam_nt16_nt4_table[(int)bam1_seqi(packed, k)];
        p->qual[k] = ascii_q < 126? ascii_q : 126; // cap at 126
    }

    if (bam1_strand(raw)) { // reverse-strand record: flip sequence and qualities
        seq_reverse(p->len, p->seq, 1);
        seq_reverse(p->len, p->qual, 0);
    }

    if (trim_qual >= 1) bwa_trim_read(trim_qual, p);

    p->rseq = (ubyte_t*)calloc(p->full_len, 1);
    memcpy(p->rseq, p->seq, p->len);
    seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
    seq_reverse(p->len, p->rseq, is_comp);
    p->max_entries = 0 ;
}
Пример #14
0
// TODO soft-clipping
/*
 * Re-align the read `b` against the graph `g` with a heap-driven search and
 * return the (possibly updated) record.
 *
 * Outline, as implemented below:
 *   1. Reads with soft clipping at either end are returned unchanged.
 *   2. The color-space strings are fetched; their presence selects
 *      SRMA_SPACE_CS, otherwise SRMA_SPACE_NT is used.
 *   3. Mate information is removed from the record.
 *   4. sw_align_bound() is run on the original alignment; a non-negative
 *      result is kept as the initial best node.
 *   5. Start nodes within `offset` positions of the alignment start (the
 *      alignment end for reverse-strand reads) are pushed on the heap.
 *   6. The heap is polled until it is empty; every polled node either
 *      completes the read (candidate for the best node), is dropped because
 *      it already scores below the best node, or is extended along its
 *      prev/next edges.
 *   7. sw_align_update_bam() rewrites the record from the best node.
 *
 * The record is returned unmodified by the search (mate information may
 * already have been removed) when a filter reports a negative value or the
 * heap grows beyond max_heap_size.
 *
 * NOTE(review): the exact meaning of `cutoffs`, `correct_bases` and
 * `max_total_coverage` is defined by the helpers they are forwarded to and is
 * not visible here.
 */
bam1_t *sw_align(graph_t *g, bam1_t *b, node_t *n, sw_heap_t *heap, char *rg_id, int32_t offset, cov_cutoffs_t *cutoffs, uint8_t correct_bases, uint8_t use_qualities, int32_t max_total_coverage, int32_t max_heap_size)
{
	char *colors = NULL;
	char *color_qualities = NULL;
	char base, qual;
	uint8_t space = SRMA_SPACE_NT;
	uint8_t strand;
	int32_t i, j, aln_start;
	int32_t num_start_nodes_added=0;
	int32_t sw_node_i=-1, sw_node_best_i=-1, sw_node_cur_i=-1, sw_node_next_i=-1;
	int32_t soft_clip_start_l = 0, soft_clip_end_l = 0;


	strand = bam1_strand(b);

	// soft-clipping
	if(1 == strand) { //reverse
		// going from 3'->5'
		soft_clip_start_l = sw_align_get_soft_clip(b, 1); 
		soft_clip_end_l = sw_align_get_soft_clip(b, 0);
	}
	else {
		// going from 5'->3'
		soft_clip_start_l = sw_align_get_soft_clip(b, 0); 
		soft_clip_end_l = sw_align_get_soft_clip(b, 1);
	}
	// FOR NOW
	// soft-clipped reads are not re-aligned at all
	if(0 < soft_clip_start_l || 0 < soft_clip_end_l) {
		return b;
	}

	// Check color space
	colors = sw_align_get_cs(b);
	if(NULL == colors) {
		space = SRMA_SPACE_NT;
	}
	else {
		space = SRMA_SPACE_CS;
		color_qualities  = sw_align_get_cq(b);
		// Some aligners include a quality value for the adapter.  A quality value
		// IMHO should not be given for an unobserved (assumed) peice of data.  Trim
		// the first quality in this case
		// NOTE(review): color_qualities is passed to strlen() without a NULL
		// check -- confirm sw_align_get_cq() cannot return NULL when colors exist
		if(strlen(colors) == strlen(color_qualities)) {  // ignore leading quality
			color_qualities++;
		}
		// NOTE(review): cannot trigger -- soft-clipped reads already returned above
		if(0 < soft_clip_start_l || 0 < soft_clip_end_l) {
			srma_error(__func__, "Soft clipping not supported for color space", Exit, OutOfRange);
		}
	}	

	// remove mate info 
	b->core.flag &= ~(BAM_FPROPER_PAIR | BAM_FMREVERSE | BAM_FMUNMAP);
	b->core.mtid = -1;
	b->core.mpos = -1;
	b->core.isize = 0;

	// re-type heap
	// reverse-strand reads use a max heap, forward reads a min heap
	heap->type = (1 == strand) ? SRMA_SW_HEAP_MAX : SRMA_SW_HEAP_MIN;

	// bound with original alignment
	sw_node_best_i = sw_align_bound(g, b, n, heap, strand, colors, color_qualities, space, cutoffs, use_qualities, max_total_coverage, max_heap_size);
	if(0 <= sw_node_best_i) {
		/*
		sw_heap_reset(heap); // reset the heap, keep old nodes
		   fprintf(stderr, "BOUNDED score=%d coverage_sum=%hu\n", 
		   heap->nodes[sw_node_best_i].score,
		   heap->nodes[sw_node_best_i].coverage_sum); // DEBUG
		   */
	}
	else {
		//fprintf(stderr, "NOT BOUNDED\n"); // DEBUG
		// nodes do not need to be preserved
		sw_heap_clear(heap);
	}
	//return b; // HERE DEBUG HERE BUG
                                        
	// add start nodes
	if(strand) {
		// reverse strand: start from the last base of the read and the
		// alignment end, scanning positions downwards
		if(SRMA_SPACE_CS == space) {
			base = nt2int_table[(int)colors[1]];
			qual = color_qualities[0]; 
		}
		else {
			base = nt4bit_to_int[bam1_seqi(bam1_seq(b), b->core.l_qseq-1)];
			qual = bam1_qual(b)[b->core.l_qseq-1] + 33;
		}
		aln_start = bam_calend(&b->core, bam1_cigar(b));
		for(i=aln_start+offset;aln_start-offset<=i;i--) {
			int32_t pos = graph_get_node_list_index_at_or_before(g, i);
			node_list_t *list = graph_get_node_list(g, pos);
			// NOTE(review): this branch tests `1 != pos` while the forward
			// branch below tests `0 != pos` -- confirm which one is intended
			if(1 != pos && NULL != list) {
				for(j=0;j<list->length;j++) {
					node_t *node = list->nodes[j];
					int32_t pass = pass_filters1(g, node, cutoffs, max_total_coverage);
					if(0 == pass) {
						sw_node_i = sw_heap_get_node_i(heap);
						sw_node_init(&heap->nodes[sw_node_i], NULL, node, node->coverage, base, qual, use_qualities, space); 
						sw_heap_add_i(heap, sw_node_i);
					}
					else if(pass < 0) {
						sw_heap_clear(heap); // clear heap
						return b;
					}
					// jump the scan position to the node that was found
					if(node->position < i) {
						i = node->position;
					}
					// counted for every node visited, whether or not it was added
					num_start_nodes_added++;
				}
			}
		}
	}
	else {
		// forward strand: start from the first base of the read and the
		// alignment start, scanning positions upwards
		if(SRMA_SPACE_CS == space) {
			base = nt2int_table[(int)colors[1]];
			qual = color_qualities[0]; 
		}
		else {
			base = nt4bit_to_int[bam1_seqi(bam1_seq(b), 0)];
			qual = bam1_qual(b)[0] + 33;
		}
		aln_start = b->core.pos;
		for(i=aln_start-offset;i<=aln_start+offset;i++) {
			int32_t pos = graph_get_node_list_index_at_or_after(g, i);
			node_list_t *list = graph_get_node_list(g, pos);
			if(0 != pos && NULL != list) {
				for(j=0;j<list->length;j++) {
					node_t *node = list->nodes[j];
					int32_t pass = pass_filters1(g, node, cutoffs, max_total_coverage);
					if(0 == pass) {
						sw_node_i = sw_heap_get_node_i(heap);
						sw_node_init(&heap->nodes[sw_node_i], NULL, node, node->coverage, base, qual, use_qualities, space); 
						sw_heap_add_i(heap, sw_node_i);
					}
					else if(pass < 0) {
						sw_heap_clear(heap); // clear heap
						return b;
					}
					// NOTE(review): same comparison as in the reverse branch although
					// this loop counts upwards; it moves i backwards -- confirm
					if(node->position < i) {
						i = node->position;
					}
					// counted for every node visited, whether or not it was added
					num_start_nodes_added++;
				}
			}
		}
	}
	if(0 == num_start_nodes_added) {
		srma_error(__func__, "Did not add any start nodes", Exit, OutOfRange);
	}

	// main search loop: poll nodes until the heap reports none (negative index)
	sw_node_cur_i = sw_heap_poll_i(heap);
	while(0 <= sw_node_cur_i) {
                    
		if(max_heap_size < heap->queue_end - heap->queue_start + 1) {
			// too many to consider
			sw_heap_clear(heap); // clear heap
			return b;
		}

		// collapse nodes that compare equal: keep the one with the higher
		// score, ties broken by the higher coverage sum
		sw_node_next_i = sw_heap_peek_i(heap);
		assert(0 <= sw_node_cur_i); // DEBUG
		while(NODE_INSERTION != __node_type(heap->nodes[sw_node_cur_i].node)
				&& 0 <= sw_node_next_i
				&& 0 == sw_node_compare(&heap->nodes[sw_node_cur_i], &heap->nodes[sw_node_next_i], heap->type)) {
			if(heap->nodes[sw_node_cur_i].score < heap->nodes[sw_node_next_i].score ||
					(heap->nodes[sw_node_cur_i].score == heap->nodes[sw_node_next_i].score &&
					 heap->nodes[sw_node_cur_i].coverage_sum < heap->nodes[sw_node_next_i].coverage_sum)) { 
				sw_node_cur_i = sw_heap_poll_i(heap);
			}
			else {
				// ignore the next node
				sw_heap_poll_i(heap);
			}
			sw_node_next_i = sw_heap_peek_i(heap);
		}
		sw_node_next_i = -1;

		// the node covers the last read base: candidate for the final alignment
		if(heap->nodes[sw_node_cur_i].read_offset == b->core.l_qseq-1) { // found, keep best
			if(sw_node_best_i < 0 ||
					heap->nodes[sw_node_best_i].score < heap->nodes[sw_node_cur_i].score ||
					(heap->nodes[sw_node_best_i].score == heap->nodes[sw_node_cur_i].score && 
					 heap->nodes[sw_node_best_i].coverage_sum < heap->nodes[sw_node_cur_i].coverage_sum)) {
				//fprintf(stderr, "FOUND BEST\n"); // DEBUG
				sw_node_best_i = sw_node_cur_i;
			}
		}
                else if(0 <= sw_node_best_i && 
                        heap->nodes[sw_node_cur_i].score < heap->nodes[sw_node_best_i].score) {
                        // ignore, under the assumption that scores can only
                        // become more negative.
                }
		else {
			// extend the node along the edges that follow the read direction
			edge_list_t *list = NULL;
			if(1 == strand) { // reverse
				list = heap->nodes[sw_node_cur_i].node->prev;
			}
			else {
				list = heap->nodes[sw_node_cur_i].node->next;
			}
			{ // get the base and quality
				// both are taken one read offset beyond the current node
				if(SRMA_SPACE_CS == space) {
					base = nt2int_table[(int)colors[1 + (heap->nodes[sw_node_cur_i].read_offset+1)]];
					qual = color_qualities[heap->nodes[sw_node_cur_i].read_offset+1]; 
				}
				else {
					if(strand) {
						base = nt4bit_to_int[bam1_seqi(bam1_seq(b), b->core.l_qseq-1-heap->nodes[sw_node_cur_i].read_offset-1)];
						qual = bam1_qual(b)[b->core.l_qseq-1-heap->nodes[sw_node_cur_i].read_offset-1] + 33;
					}
					else {
						base = nt4bit_to_int[bam1_seqi(bam1_seq(b), (heap->nodes[sw_node_cur_i].read_offset+1))];
						qual = bam1_qual(b)[(heap->nodes[sw_node_cur_i].read_offset+1)] + 33;
					}
				}
			}
			/*
			   node_t *node = heap->nodes[sw_node_cur_i].node;
			   fprintf(stderr, "NODE %d:%d offset=%d coverage=%d base=%d\n",
			   node->contig, node->position, node->offset, node->coverage, node->base);
			   fprintf(stderr, "SW_NODE read_offset=%d score=%d coverage_sum=%d start_position=%d space=%d\n",
			   heap->nodes[sw_node_cur_i].read_offset, heap->nodes[sw_node_cur_i].score, heap->nodes[sw_node_cur_i].coverage_sum, heap->nodes[sw_node_cur_i].start_position, space);
			   */
			for(i=0;i<list->length;i++) {
				node_t *node_cur = list->nodes[i];
				uint16_t coverage_cur = list->coverages[i];
				int32_t pass = pass_filters(g, node_cur, coverage_cur, cutoffs, max_total_coverage);
				if(0 == pass) {
					// add to the heap
					sw_node_i = sw_heap_get_node_i(heap);
					// DEBUG
					assert(0 <= sw_node_cur_i);
					assert(0 <= heap->nodes[sw_node_cur_i].read_offset);
					sw_node_init(&heap->nodes[sw_node_i], &heap->nodes[sw_node_cur_i], node_cur, coverage_cur, base, qual, use_qualities, space); 
					sw_heap_add_i(heap, sw_node_i);
				}
				else if(pass < 0) {
					sw_heap_clear(heap); // clear heap
					return b;
				}
			}
		}
		// get the next node
		sw_node_cur_i = sw_heap_poll_i(heap);
	}

        /*
	fprintf(stderr, "sw_node_best_i=%d\n", sw_node_best_i); // DEBUG
	if(0 <= sw_node_best_i) {
	fprintf(stderr, "END score=%d coverage_sum=%hu\n", 
	heap->nodes[sw_node_best_i].score,
	heap->nodes[sw_node_best_i].coverage_sum); // DEBUG
	}
        */
	// update SAM/BAM
	// sw_node_best_i is still negative here when no node completed the read
	b = sw_align_update_bam(b, rg_id, heap, sw_node_best_i, space, colors, color_qualities, strand, correct_bases);
	sw_heap_clear(heap); // clear heap
	return b;
}	
Пример #15
0
/**
* @brief Major quality check point for each read
*
* @param rm Read metrics to be filled in: skip, read_length, genomic_end and revcomp
* @param temp_read The read to be assessed
* @param user_args User arguments to be considered during assessment
*        (TMAPQ, UNIQUE, PAIRED, READTHROUGH, STRANDED, COLLAPSE)
* @param bresults Block wide counters that are updated while the read is assessed
* @param lpos Position that is compared with the start of this read when
*        COLLAPSE is active
* @return void
* @details rm->skip is 0 when the read passes all checks, 1 when it is
*          filtered out and -4 when no read length could be determined.
*          The duplicate counters used for COLLAPSE are static, i.e. shared
*          between all calls of this function.
*/
void quality_check(read_metrics_t *rm,bam1_t *temp_read,user_arguments_t *user_args,seq_block_t *bresults,int lpos){
	static int pos_dupcounter=0,neg_dupcounter=0;
	rm->skip=0;
	rm->read_length=0;

	rm->genomic_end= bam_calend(&temp_read->core,bam1_cigar(temp_read));

	/* Pairing statistics are collected before any filtering */
	if(bam1_pair(temp_read)){
		++bresults->paired;
		if (bam1_ppair(temp_read))++bresults->ppairs;
	}

	++bresults->total_reads;

	/* Low mapping quality or unmapped */
	if(temp_read->core.qual < user_args->TMAPQ || bam1_unmapped(temp_read)){
		++bresults->lowqual;
		rm->skip=1;
		return;
	}

	/* Multi-mapping reads are dropped when only unique reads are wanted */
	if(user_args->UNIQUE && bam1_multimap(temp_read)){
		rm->skip=1;
		return;
	}

	/* Determine strand and read length */
	if(!user_args->PAIRED){
		rm->revcomp=bam1_strand(temp_read);
		rm->read_length=bam_cigar2qlen(&temp_read->core,bam1_cigar(temp_read));
	} else if (bam1_ppair(temp_read) && !bam1_notprimary(temp_read)){
		rm->revcomp=bam1_revpair(temp_read);
		if(!user_args->READTHROUGH){
			/* sets the read length only */
			rm->read_length=bam_cigar2qlen(&temp_read->core,bam1_cigar(temp_read));
		}else if(temp_read->core.isize!=0 ){
			/* read-through: only one mate of each pair contributes, with isize as length */
			if((bam1_firstr(temp_read)&&!bam1_revpair(temp_read))||(bam1_secondr(temp_read)&&bam1_mrevpair(temp_read))){
				rm->read_length=temp_read->core.isize;
			} else {
				rm->skip=1;
				return;
			}
		} else{
			warning("ISIZE not set in SAM/BAM file. Re-run without using the readthrough_pairs option\n");
			rm->skip=-4;
			return;
		}
	} else{
		rm->skip=1;
		return;
	}

	/* Fall back to the stored sequence length */
	if(!rm->read_length){
		rm->read_length=temp_read->core.l_qseq;
		if(!rm->read_length){
			/* report the CIGAR-derived length; the CIGAR pointer itself must not be passed to %d */
			warning("Read length neither found in core.isize=%d, core.l_qseq=%d or cigar=%d!\n",temp_read->core.isize,temp_read->core.l_qseq,bam_cigar2qlen(&temp_read->core,bam1_cigar(temp_read)));
			rm->skip=-4;
			return;
		}
	}
	/* END */

	/* Strand filter: -1 drops reads with revcomp unset, 1 drops reads with revcomp set */
	if(user_args->STRANDED!=0){
		if((user_args->STRANDED==-1 && !rm->revcomp) || (user_args->STRANDED==1 && rm->revcomp)){
			rm->skip=1;return;
		}
	}

	/* Collapse reads that start at lpos once a strand counter reaches COLLAPSE */
	if(user_args->COLLAPSE>0){
		if(lpos==temp_read->core.pos){
			if(!rm->revcomp)++pos_dupcounter;
			else ++neg_dupcounter;
			if(pos_dupcounter>=user_args->COLLAPSE || neg_dupcounter>=user_args->COLLAPSE){
				++bresults->collapsed;
				rm->skip=1;
				return;
			}
		}else{
			pos_dupcounter=0;
			neg_dupcounter=0;
		}
	}

	/* The read passed: update the block wide results */
	if(!rm->skip){
		rm->revcomp ? ++bresults->neg_strand : ++bresults->pos_strand;
		++bresults->filtered_reads;
		bresults->mapmass+=rm->read_length;
	}
}
Пример #16
0
/*
 * Merge n BAM files into one output file.
 *
 *   by_qname  stored in g_is_by_qname
 *   out       output file name, "-" for stdout
 *   headers   optional SAM file whose header text replaces the output header
 *   n, fn     number and names of the input files
 *   flag      MERGE_RG, MERGE_UNCOMP and MERGE_LEVEL1 are evaluated here
 *   reg       optional region; when given the inputs are read through their indices
 *   level     compression level, overridden by MERGE_UNCOMP / MERGE_LEVEL1
 *
 * Returns 0 on success and -1 on error.
 *
 * NOTE(review): the #endif below closes a preprocessor conditional that is
 * opened before this excerpt.
 * NOTE(review): none of the `return -1` paths after the allocations releases
 * fp, heap, iter, RG/RG_len, the headers or the files already opened.
 */
int bam_merge_core2(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg, int level)
#endif
{
	bamFile fpout, *fp;
	heap1_t *heap;
	bam_header_t *hout = 0;
	bam_header_t *hheaders = NULL;
	int i, j, *RG_len = 0;
	uint64_t idx = 0;
	char **RG = 0, mode[8];
	bam_iter_t *iter = 0;

	// optional replacement header text
	if (headers) {
		tamFile fpheaders = sam_open(headers);
		if (fpheaders == 0) {
			const char *message = strerror(errno);
			fprintf(stderr, "[bam_merge_core] cannot open '%s': %s\n", headers, message);
			return -1;
		}
		hheaders = sam_header_read(fpheaders);
		sam_close(fpheaders);
	}

	g_is_by_qname = by_qname;
	fp = (bamFile*)calloc(n, sizeof(bamFile));
	heap = (heap1_t*)calloc(n, sizeof(heap1_t));
	iter = (bam_iter_t*)calloc(n, sizeof(bam_iter_t));
	// prepare RG tag
	// the tag value is the file name without directory part and ".bam" suffix
	if (flag & MERGE_RG) {
		RG = (char**)calloc(n, sizeof(void*));
		RG_len = (int*)calloc(n, sizeof(int));
		for (i = 0; i != n; ++i) {
			int l = strlen(fn[i]);
			const char *s = fn[i];
			if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4;
			for (j = l - 1; j >= 0; --j) if (s[j] == '/') break;
			++j; l -= j;
			RG[i] = calloc(l + 1, 1);
			RG_len[i] = l;
			strncpy(RG[i], s + j, l);
		}
	}
	// read the first
	// open every input and check that the target names agree
	for (i = 0; i != n; ++i) {
		bam_header_t *hin;
		fp[i] = bam_open(fn[i], "r");
		if (fp[i] == 0) {
			int j;
			fprintf(stderr, "[bam_merge_core] fail to open file %s\n", fn[i]);
			for (j = 0; j < i; ++j) bam_close(fp[j]);
			free(fp); free(heap);
			// FIXME: possible memory leak
			return -1;
		}
		hin = bam_header_read(fp[i]);
		if (i == 0) { // the first BAM
			hout = hin;
		} else { // validate multiple baf
			int min_n_targets = hout->n_targets;
			if (hin->n_targets < min_n_targets) min_n_targets = hin->n_targets;

			for (j = 0; j < min_n_targets; ++j)
				if (strcmp(hout->target_name[j], hin->target_name[j]) != 0) {
					fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'\n",
							hout->target_name[j], hin->target_name[j], fn[i]);
					return -1;
				}

			// If this input file has additional target reference sequences,
			// add them to the headers to be output
			if (hin->n_targets > hout->n_targets) {
				swap_header_targets(hout, hin);
				// FIXME Possibly we should also create @SQ text headers
				// for the newly added reference sequences
			}

			bam_header_destroy(hin);
		}
	}

	if (hheaders) {
		// If the text headers to be swapped in include any @SQ headers,
		// check that they are consistent with the existing binary list
		// of reference information.
		if (hheaders->n_targets > 0) {
			if (hout->n_targets != hheaders->n_targets) {
				fprintf(stderr, "[bam_merge_core] number of @SQ headers in '%s' differs from number of target sequences\n", headers);
				if (!reg) return -1;
			}
			// NOTE(review): when reg is set and the counts differ, this loop still
			// indexes hheaders->target_name up to hout->n_targets -- confirm
			// hheaders cannot be the shorter list
			for (j = 0; j < hout->n_targets; ++j)
				if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) {
					fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence\n", hheaders->target_name[j], headers);
					if (!reg) return -1;
				}
		}

		swap_header_text(hout, hheaders);
		bam_header_destroy(hheaders);
	}

	// region given: create one index iterator per input; otherwise the
	// entries of iter stay zero from calloc
	if (reg) {
		int tid, beg, end;
		if (bam_parse_region(hout, reg, &tid, &beg, &end) < 0) {
			fprintf(stderr, "[%s] Malformated region string or undefined reference name\n", __func__);
			return -1;
		}
		for (i = 0; i < n; ++i) {
			bam_index_t *idx;
			// NOTE(review): the result of bam_index_load() is used unchecked
			idx = bam_index_load(fn[i]);
			iter[i] = bam_iter_query(idx, tid, beg, end);
			bam_index_destroy(idx);
		}
	}

	// load the first record of every input into the heap
	for (i = 0; i < n; ++i) {
		heap1_t *h = heap + i;
		h->i = i;
		h->b = (bam1_t*)calloc(1, sizeof(bam1_t));
		if (bam_iter_read(fp[i], iter[i], h->b) >= 0) {
			// key: tid in the upper 32 bits, (pos+1)<<1 below it, strand in the lowest bit
			h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam1_strand(h->b);
			h->idx = idx++;
		}
		else h->pos = HEAP_EMPTY;
	}
	if (flag & MERGE_UNCOMP) level = 0;
	else if (flag & MERGE_LEVEL1) level = 1;
	strcpy(mode, "w");
	if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9);
	// NOTE(review): `mode` is built above but the literal "w" is passed to
	// bam_open()/bam_dopen() -- confirm whether `mode` was intended here
	if ((fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w")) == 0) {
		fprintf(stderr, "[%s] fail to create the output file.\n", __func__);
		return -1;
	}
	bam_header_write(fpout, hout);
	bam_header_destroy(hout);
#ifndef _PBGZF_USE 
	if (!(flag & MERGE_UNCOMP)) bgzf_mt(fpout, n_threads, 256);
#endif

	// merge loop: write the record at the top of the heap, refill it from the
	// same input, restore the heap order
	ks_heapmake(heap, n, heap);
	while (heap->pos != HEAP_EMPTY) {
		bam1_t *b = heap->b;
		if (flag & MERGE_RG) {
			// replace any existing RG tag with the one derived from the file name
			uint8_t *rg = bam_aux_get(b, "RG");
			if (rg) bam_aux_del(b, rg);
			bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]);
		}
		bam_write1_core(fpout, &b->core, b->data_len, b->data);
		if ((j = bam_iter_read(fp[heap->i], iter[heap->i], b)) >= 0) {
			heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam1_strand(b);
			heap->idx = idx++;
		} else if (j == -1) {
			// input exhausted: release its record and mark the slot empty
			heap->pos = HEAP_EMPTY;
			free(heap->b->data); free(heap->b);
			heap->b = 0;
		} else fprintf(stderr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]);
		// NOTE(review): after the truncation message the slot keeps its old key
		// and record -- confirm the loop cannot write that record again
		ks_heapadjust(heap, 0, n, heap);
	}

	// release everything
	if (flag & MERGE_RG) {
		for (i = 0; i != n; ++i) free(RG[i]);
		free(RG); free(RG_len);
	}
	for (i = 0; i != n; ++i) {
		bam_iter_destroy(iter[i]);
		bam_close(fp[i]);
	}
	bam_close(fpout);
	free(fp); free(heap); free(iter);
	return 0;
}
Пример #17
0
/* Function wrapper around bam1_strand(): returns its value for record b. */
int bam1_strand_(bam1_t *b)
{
	const int strand = bam1_strand(b);
	return strand;
}
Пример #18
0
/* Accumulates a per-read-position alignment error profile for one read.
 *
 * Walks the CIGAR of b and, for every query position, adds to
 * alnerrprof[] and used_pos[] indexed by the position in the read's
 * ORIGINAL orientation (i.e. mirrored for reverse-strand reads):
 *  - aligned base (M/X) against a non-'N' reference base: used_pos is
 *    incremented; if the base differs from the reference (or the op is
 *    X) alnerrprof gets 1.0 - PHREDQUAL_TO_PROB(base quality) added.
 *  - inserted base (I): used_pos is incremented and alnerrprof gets
 *    1.0 - PHREDQUAL_TO_PROB(INDEL_QUAL_DEFAULT) added.
 *  - deleted base (D): same as insertion, charged once per deleted
 *    reference base to the query position following the deletion.
 *  - N (ref skip), S and H contribute nothing.
 *
 * alnerrprof and used_pos must be of at least length b->core.l_qseq.
 * Both arrays are added to, i.e. they should be initialized to 0 if
 * you don't want aggregate values. ref is indexed by genome position
 * and compared verbatim against the read base.
 *
 * WARNING code duplication with count_cigar_ops but merging the two
 * functions is messy.
 */
void
calc_read_alnerrprof(double *alnerrprof, unsigned long int *used_pos, 
                   const bam1_t *b, const char *ref)
{
     /* modelled after bam.c:bam_calend(), bam_format1_core() and
      * pysam's aligned_pairs (./pysam/csamtools.pyx)
      */
     uint32_t *cigar = bam1_cigar(b);
     uint32_t k, i;
     const bam1_core_t *c = &b->core;
#if 0
     int32_t qlen = (int32_t) bam_cigar2qlen(c, cigar); /* read length */
#else
     int qlen = b->core.l_qseq; /* read length */
#endif
     uint32_t pos = c->pos; /* pos on genome */
     uint32_t qpos = 0; /* pos on read/query */
     uint32_t qpos_org = bam1_strand(b) ? qlen-qpos-1 : qpos;/* original qpos before mapping as possible reverse */


     /* loop over cigar to get aligned bases
      *
      * read: bam_format1_core(NULL, b, BAM_OFDEC);
      */
     for (k=0; k < c->n_cigar; ++k) { /* n_cigar: number of cigar operations */
          int op = cigar[k] & BAM_CIGAR_MASK; /* the cigar operation */
          uint32_t l = cigar[k] >> BAM_CIGAR_SHIFT;

          /* following conditionals could be collapsed to much shorter
           * code, but we keep them as they were in pysam's
           * aligned_pairs to make later handling of indels easier
           */
          if (op == BAM_CMATCH || op == BAM_CDIFF) {
               for (i=pos; i<pos+l; i++) {                             
                    assert(qpos < qlen);
                    /* case agnostic */
                    char ref_nt = ref[i];
                    char read_nt = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), qpos)];
                    int bq = bam1_qual(b)[qpos];
#if 0
                    printf("[M]MATCH qpos,i,ref,read = %d,%d,%c,%c\n", qpos, i, ref_nt, read_nt);
#endif                    

                    if (ref_nt != 'N') {
                         if (ref_nt != read_nt || op == BAM_CDIFF) {
                              alnerrprof[qpos_org] += (1.0 - PHREDQUAL_TO_PROB(bq));
                         } /* otherwise leave at 0.0 but count anyway */
                         used_pos[qpos_org] += 1;
                    }
                    qpos += 1;
                    qpos_org = bam1_strand(b) ? qlen-qpos-1 : qpos;
               }
               pos += l;

          } else if (op == BAM_CINS) {
               for (i=pos; i<pos+l; i++) {
                    assert(qpos < qlen);

                    /* index by original orientation, exactly like the
                     * match branch; using the mapped-orientation qpos
                     * here mirrored the profile for reverse reads */
                    alnerrprof[qpos_org] += (1.0 - PHREDQUAL_TO_PROB(INDEL_QUAL_DEFAULT));
                    used_pos[qpos_org] += 1;
#if 0
                    printf("INS qpos,i = %d,None\n", qpos);
#endif
                    qpos += 1;
                    qpos_org = bam1_strand(b) ? qlen-qpos-1 : qpos;
               }
               
          } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
               for (i=pos; i<pos+l; i++) {
#if 0
                    printf("DEL qpos,i = None,%d\n", i);
#endif

                    /* a deletion after the last query base has no
                     * read position to be charged to: skip it instead
                     * of writing one element past the arrays */
                    if (op == BAM_CDEL && qpos < (uint32_t)qlen) {
                         alnerrprof[qpos_org] += (1.0 - PHREDQUAL_TO_PROB(INDEL_QUAL_DEFAULT));
                         used_pos[qpos_org] += 1;
                    }
               }
               pos += l;
               /* deletion: don't increase qpos */

          } else if (op == BAM_CSOFT_CLIP) {
#if 0
               printf("SOFT CLIP qpos = %d\n", qpos);
#endif
               qpos += l;
               qpos_org = bam1_strand(b) ? qlen-qpos-1 : qpos;

          } else if (op != BAM_CHARD_CLIP) {
               LOG_WARN("Unknown op %d in cigar %s\n", op, cigar_str_from_bam(b));

          }
     } /* for k */
     assert(pos == bam_calend(&b->core, bam1_cigar(b))); /* FIXME correct assert? what if hard clipped? */
     if (qpos != qlen) {
          LOG_FIXME("got qpos=%d and qlen=%d for cigar %s l_qseq %d\n", qpos, qlen, cigar_str_from_bam(b), b->core.l_qseq);
     }
     assert(qpos == qlen); /* FIXME correct assert? What if hard clipped? */

#if 0
     fprintf(stderr, "%s:", __FUNCTION__);
     for (i=0; i< b->core.l_qseq; i++) {
          fprintf(stderr, " %g/%d", alnerrprof[i], used_pos[i]);
     }
     fprintf(stderr, "\n");
#endif
}
Пример #19
0
Файл: graph.c Проект: nh13/SRMA
/*
 * Adds the alignment of BAM record `b` to graph `g`, one node per read base.
 *
 * The graph's start position is lowered (and its node lists shifted up)
 * when the alignment starts before it, and an empty graph is re-anchored
 * at the alignment start; both updates happen under graph_mutex when
 * use_threads == 1.  Each non-gap read base is then classified as match,
 * mismatch or insertion against the aligned reference and passed to
 * graph_add_node(), chained to the previously added node.
 *
 * Returns the first node added for a forward-strand read and the last
 * node added for a reverse-strand read (NULL if no node was added).
 */
node_t *graph_add_sam(graph_t *g, bam1_t *b, ref_t *ref, int32_t use_threads)
{
	bam_aln_t *aln = NULL;
	int32_t aln_start, aln_index, ref_index, aln_ref_index;
	int32_t i;
	node_t *prev_node=NULL, *cur_node=NULL, *ret_node=NULL;
	uint8_t type, strand;

	aln_start = b->core.pos+1; // 1-based start of the alignment
	aln_ref_index = b->core.tid;
	aln = bam_aln_init(b, ref);
	strand =  bam1_strand(b);

	// --- SYNC ON --- 
	if(1 == use_threads) pthread_mutex_lock(&graph_mutex); // synchronize start
	if(aln_start < g->position_start) {
		int32_t diff = g->position_start - aln_start;
		graph_nodes_realloc(g, g->position_end - aln_start + 1); // alloc more memory if needed
		// shift up: walk from the top so every list is moved exactly once
		for(i=g->position_end-g->position_start;0<=i;i--) {
			// swap
			node_list_t *list = g->nodes[i+diff];
			g->nodes[i+diff] = g->nodes[i];
			g->nodes[i] = list;
		}
		g->position_start = aln_start;
	}

	if(1 == g->is_empty) {
		for(i=0;i<g->position_end - g->position_start + 1;i++) {
			node_list_clear(g->nodes[i]);
			assert(0 == g->nodes[i]->length); // DEBUG
		}
		g->position_start = aln_start;
		if(ALN_GAP == aln->ref[0]) { // alignment opens with an insertion
			g->position_start--;
		}
		g->position_end = g->position_start;
		g->contig = aln_ref_index + 1;
		g->is_empty = 0;
	}
	if(1 == use_threads) pthread_mutex_unlock(&graph_mutex); // synchronize end 
	// --- SYNC OFF --- 
	
	for(aln_index=0,ref_index=-1;aln_index<aln->length;aln_index++,prev_node=cur_node) {

		// Skip over a deletion, without ever stepping past the end of the alignment
		while(aln_index < aln->length && ALN_GAP == aln->read[aln_index]) {
			aln_index++;
			ref_index++;
		}
		if(aln->length <= aln_index) { // the alignment ended inside a deletion: nothing left to add
			break;
		}

		if(aln->read[aln_index] == aln->ref[aln_index]) { // match
			type = NODE_MATCH;
		}
		else if(aln->ref[aln_index] == ALN_GAP) { // insertion
			type = NODE_INSERTION;
		}
		else { // mismatch
			type = NODE_MISMATCH;
		}
		// advance on the reference unless the previous node was an insertion,
		// in which case we are already on the position
		if(NULL == prev_node || NODE_INSERTION != __node_type(prev_node)) {
			ref_index++;
		}

		cur_node = graph_add_node(g, 
				node_init(aln->read[aln_index], type, g->contig, aln_start + ref_index, prev_node),
				prev_node,
				use_threads);
	
		if(NULL == prev_node && 0 == strand) { // first node and forward strand
			ret_node = cur_node;
		}
	}

	if(1 == strand) { // reverse strand: hand back the last node added
		ret_node = cur_node;
	}

	bam_aln_free(aln);
	
	return ret_node;
}
Пример #20
0
// Reference base at `pos`, or 'N' when no reference sequence is loaded or
// `pos` lies outside [0, ref_len).
static int mplp_ref_base(const char *ref, int ref_len, int pos)
{
	return (ref && pos >= 0 && pos < ref_len)? ref[pos] : 'N';
}

// Counts how many of ref[pos+step], ref[pos+2*step], ... (at most 15 of them)
// equal ref[anchor] case-insensitively, stopping at the first base that
// differs or at the first index that falls outside the reference.
// Returns 0 when there is no reference or the anchor itself is out of range.
static int mplp_repeat_run(const char *ref, int ref_len, int anchor, int pos, int step)
{
	int k, run = 0;
	if (ref == 0 || anchor < 0 || anchor >= ref_len) return 0;
	for (k = 1; k <= 15; ++k) {
		int idx = pos + k * step;
		if (idx < 0 || idx >= ref_len) break;
		if (toupper((unsigned char)ref[anchor]) != toupper((unsigned char)ref[idx])) break;
		++run;
	}
	return run;
}

// Multi-input pileup driver.
//
// Opens the n BAM inputs in fn ("-" means stdin), optionally restricts them
// to conf->reg, and walks all pileup columns with bam_mplp_auto().  With
// MPLP_GLF set, genotype likelihoods (and indel calls) are written as BCF to
// stdout.  Otherwise one text line per position is printed: contig,
// 1-based position, reference base, and for every input
//   depth, per-strand counts for base codes 1,2,4,8,7, the deleted sequences
//   (forward, reverse) and inserted sequences (forward, reverse) each
//   terminated by ',', the number of reads counted, the number of reads
//   struck by the filters, and the median absolute deviation of the
//   distance-to-read-end of the non-reference observations (-1 if none).
// Always returns 0; unrecoverable set-up errors call exit(1).
static int mpileup(mplp_conf_t *conf, int n, char **fn)
{
	extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
	extern void bcf_call_del_rghash(void *rghash);
	mplp_aux_t **data;
	// ref_len starts at 0 so that it is never read uninitialised when no FASTA is given
	int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len = 0, ref_tid = -1, max_depth, max_indel_depth;
	const bam_pileup1_t **plp;
	bam_mplp_t iter;
	bam_header_t *h = 0;
	char *ref;
	void *rghash = 0;

	bcf_callaux_t *bca = 0;
	bcf_callret1_t *bcr = 0;
	bcf_call_t bc;
	bcf_t *bp = 0;
	bcf_hdr_t *bh = 0;

	bam_sample_t *sm = 0;
	kstring_t buf;
	mplp_pileup_t gplp;

	// per-strand indel text and distance buffer for the text output; they grow
	// on demand and are reused for every column instead of fixed-size stack arrays
	kstring_t del_str[2], ins_str[2];
	float *qposP = 0;
	int m_qposP = 0;

	memset(&gplp, 0, sizeof(mplp_pileup_t));
	memset(&buf, 0, sizeof(kstring_t));
	memset(&bc, 0, sizeof(bcf_call_t));
	memset(del_str, 0, sizeof(del_str));
	memset(ins_str, 0, sizeof(ins_str));
	data = calloc(n, sizeof(void*));
	plp = calloc(n, sizeof(void*));
	n_plp = calloc(n, sizeof(int));
	sm = bam_smpl_init();

	// read the header and initialize data
	for (i = 0; i < n; ++i) {
		bam_header_t *h_tmp;
		data[i] = calloc(1, sizeof(mplp_aux_t));
		data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r");
		data[i]->conf = conf;
		h_tmp = bam_header_read(data[i]->fp);
		data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet
		bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
		rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
		if (conf->reg) {
			int beg, end;
			bam_index_t *idx;
			idx = bam_index_load(fn[i]);
			if (idx == 0) {
				fprintf(stderr, "[%s] fail to load index for %d-th input.\n", __func__, i+1);
				exit(1);
			}
			if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) {
				fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1);
				exit(1);
			}
			if (i == 0) tid0 = tid, beg0 = beg, end0 = end;
			data[i]->iter = bam_iter_query(idx, tid, beg, end);
			bam_index_destroy(idx);
		}
		if (i == 0) h = h_tmp;
		else {
			// FIXME: to check consistency
			bam_header_destroy(h_tmp);
		}
	}
	gplp.n = sm->n;
	gplp.n_plp = calloc(sm->n, sizeof(int));
	gplp.m_plp = calloc(sm->n, sizeof(int));
	gplp.plp = calloc(sm->n, sizeof(void*));

	fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);
	// write the VCF header
	if (conf->flag & MPLP_GLF) {
		kstring_t s;
		bh = calloc(1, sizeof(bcf_hdr_t));
		s.l = s.m = 0; s.s = 0;
		bp = bcf_open("-", (conf->flag&MPLP_NO_COMP)? "wu" : "w");
		for (i = 0; i < h->n_targets; ++i) {
			kputs(h->target_name[i], &s);
			kputc('\0', &s);
		}
		bh->l_nm = s.l;
		bh->name = malloc(s.l);
		memcpy(bh->name, s.s, s.l);
		s.l = 0;
		for (i = 0; i < sm->n; ++i) {
			kputs(sm->smpl[i], &s); kputc('\0', &s);
		}
		bh->l_smpl = s.l;
		bh->sname = malloc(s.l);
		memcpy(bh->sname, s.s, s.l);
		bh->txt = malloc(strlen(BAM_VERSION) + 64);
		bh->l_txt = 1 + sprintf(bh->txt, "##samtoolsVersion=%s\n", BAM_VERSION);
		free(s.s);
		bcf_hdr_sync(bh);
		bcf_hdr_write(bp, bh);
		bca = bcf_call_init(-1., conf->min_baseQ);
		bcr = calloc(sm->n, sizeof(bcf_callret1_t));
		bca->rghash = rghash;
		bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ;
		bca->min_frac = conf->min_frac;
		bca->min_support = conf->min_support;
	}
	if (tid0 >= 0 && conf->fai) { // region is set
		ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len);
		ref_tid = tid0;
		for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0;
	} else ref_tid = -1, ref = 0;
	iter = bam_mplp_init(n, mplp_func, (void**)data);
	max_depth = conf->max_depth;
	if (max_depth * sm->n > 1<<20)
		fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__);
	if (max_depth * sm->n < 8000) {
		max_depth = 8000 / sm->n;
		fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth);
	}
	max_indel_depth = conf->max_indel_depth * sm->n;
	bam_mplp_set_maxcnt(iter, max_depth);

	while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) {
		if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
		if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
		if (tid != ref_tid) {
			free(ref); ref = 0;
			if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len);
			for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid;
			ref_tid = tid;
		}
		if (conf->flag & MPLP_GLF) {
			int total_depth, _ref0, ref16;
			bcf1_t *b = calloc(1, sizeof(bcf1_t));
			for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
			group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG);
			_ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
			ref16 = bam_nt16_table[_ref0];
			for (i = 0; i < gplp.n; ++i)
				bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i);
			bcf_call_combine(gplp.n, bcr, ref16, &bc);
			bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
						 (conf->flag&MPLP_FMT_SP), 0, 0);
			bcf_write(bp, bh, b);
			bcf_destroy(b);
			// call indels
			if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) {
				for (i = 0; i < gplp.n; ++i)
					bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
				if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) {
					b = calloc(1, sizeof(bcf1_t));
					bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
								 (conf->flag&MPLP_FMT_SP), bca, ref);
					bcf_write(bp, bh, b);
					bcf_destroy(b);
				}
			}
		} else {
			// The repeat context depends only on the position, so it is computed once
			// per column.  Left run: bases at pos-1, pos-2, ... equal to ref[pos-1];
			// right run: bases at pos+1, pos+2, ... equal to ref[pos+1] (not a strict
			// homopolymer test on ref[pos] itself).  Both are 0 without a reference.
			const int replC = mplp_repeat_run(ref, ref_len, pos - 1, pos, -1);
			const int reprC = mplp_repeat_run(ref, ref_len, pos + 1, pos, +1);
			const int repT = replC < reprC? reprC : replC;
			const int ref_here = mplp_ref_base(ref, ref_len, pos);

			printf("%s\t%d\t%c", h->target_name[tid], pos + 1, ref_here);
			for (i = 0; i < n; ++i) {
				int j;
				printf("\t%d\t", n_plp[i]);
				if (n_plp[i] == 0) {
					printf("*\t*"); // FIXME: printf() is very slow...
					if (conf->flag & MPLP_PRINT_POS) printf("\t*");
				} else {
					//MDW start
					int countChars[16][2]; // [4-bit base code][strand]
					int numStruck = 0;
					int numGood = 0;
					int qposCnt = 0;
					int tti;
					float medAbsDev = -1;

					memset(countChars, 0, sizeof(countChars));
					del_str[0].l = del_str[1].l = 0;
					ins_str[0].l = ins_str[1].l = 0;

					// at most one distance is recorded per read of this input
					if (n_plp[i] > m_qposP) {
						float *grown = realloc(qposP, (size_t)n_plp[i] * sizeof(float));
						if (grown == 0) {
							fprintf(stderr, "[%s] insufficient memory for a pileup of %d reads.\n", __func__, n_plp[i]);
							exit(1);
						}
						qposP = grown;
						m_qposP = n_plp[i];
					}

					//for each read in the pileup column
					for (j = 0; j < n_plp[i]; ++j){
						const bam_pileup1_t *p = plp[i] + j;
						unsigned rflag;
						int strand, read_base;

						/*
						SAME LOGIC AS pileup_seq()
						*/

						// intron gaps and the interior of a deletion (everything after
						// its first aligned position) are neither counted nor struck
						if (p->is_refskip || p->is_del) continue;

						rflag = p->b->core.flag;
						if (	p->b->core.qual < conf->min_mqToCount || // mapping quality
							conf->maxrepC < repT || // max repeat run around this position
							bam1_qual(p->b)[p->qpos] < conf->min_baseQ || // base quality
							p->alignedQPosBeg <= conf->trimEnd || p->alignedQPosEnd <= conf->trimEnd || // trimEnd is 1-based
							p->zf == 1 || // fusion tag
							p->ih > conf->maxIH || // max hit index
							p->nmd > conf->maxNM || // max mismatch
							(conf->flagFilter == 1 && !(rflag&BAM_FPROPER_PAIR)) || // optionally keep only proper pairs
							(conf->flagFilter == 2 && (rflag&BAM_FSECONDARY)) || // optionally strike secondary
							(conf->flagFilter == 3 && (rflag&BAM_FDUP)) || // optionally strike dup
							(conf->flagFilter == 4 && ((rflag&BAM_FDUP) || (rflag&BAM_FSECONDARY))) || // optionally strike secondary or dup
							(conf->flagFilter == 5 && ((rflag&BAM_FDUP) || (rflag&BAM_FSECONDARY) || (rflag&BAM_FQCFAIL) || !(rflag&BAM_FPROPER_PAIR))) // optionally strike secondary, dup, QCfail and improper pairs
						){
							numStruck++;
							continue;
						}

						strand = bam1_strand(p->b) != 0;
						read_base = bam1_seqi(bam1_seq(p->b), p->qpos);

						if (p->indel == 0) {
							countChars[read_base][strand]++;
							numGood++;
						} else {
							// indel != 0 describes the event that starts right after this
							// position; its sequence is appended to the per-strand list
							kstring_t *dst = p->indel < 0? &del_str[strand] : &ins_str[strand];
							numGood++;
							if (p->indel < 0) {
								for (tti = 1; tti <= -p->indel; tti++)
									kputc(mplp_ref_base(ref, ref_len, pos + tti), dst);
							} else {
								for (tti = 1; tti <= p->indel; tti++)
									kputc(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + tti)], dst);
							}
							kputc(',', dst);
						}

						//calculate position of variant within aligned read - no soft clips
						if (toupper((unsigned char)ref_here) != toupper((unsigned char)bam_nt16_rev_table[read_base]) || p->indel != 0) {
							//distance to the nearer end of the aligned read.  removes soft clips.
							int distToEnd = (p->alignedQPosBeg < p->alignedQPosEnd) ? p->alignedQPosBeg : p->alignedQPosEnd;
							qposP[qposCnt++] = distToEnd;
						}
					}

					//print A,C,G,T, by +/-
					// NOTE(review): the fifth pair uses base code 7; in the 4-bit BAM
					// encoding N is 15 - confirm which column was intended.
					printf("\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d", 	countChars[1][0],countChars[1][1],
											countChars[2][0],countChars[2][1],
											countChars[4][0],countChars[4][1],
											countChars[8][0],countChars[8][1],
											countChars[7][0],countChars[7][1]);

					putchar('\t');
					if (del_str[0].l) fwrite(del_str[0].s, 1, del_str[0].l, stdout);
					putchar('\t');
					if (del_str[1].l) fwrite(del_str[1].s, 1, del_str[1].l, stdout);
					putchar('\t');
					if (ins_str[0].l) fwrite(ins_str[0].s, 1, ins_str[0].l, stdout);
					putchar('\t');
					if (ins_str[1].l) fwrite(ins_str[1].s, 1, ins_str[1].l, stdout);

					printf("\t%d\t%d",numGood,numStruck);

					// median absolute deviation of the non-ref distances
					if (qposCnt > 0) {
						const float medqpos = median(qposCnt, qposP);
						for (tti = 0; tti < qposCnt; tti++) {
							// float absolute value: integer abs() would truncate the x.5
							// deviations produced by an even-sized median
							const float dev = medqpos - qposP[tti];
							qposP[tti] = dev < 0? -dev : dev;
						}
						// all qposCnt deviations take part (the count used to be
						// qposCnt-1, i.e. 0 elements for a single observation)
						medAbsDev = median(qposCnt, qposP);
					}
					printf("\t%f",medAbsDev);

					///END MDW
				}
			}
			putchar('\n');
		}
	}

	bcf_close(bp);
	bam_smpl_destroy(sm); free(buf.s);
	for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]);
	free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp);
	bcf_call_del_rghash(rghash);
	bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr);
	bam_mplp_destroy(iter);
	bam_header_destroy(h);
	for (i = 0; i < n; ++i) {
		bam_close(data[i]->fp);
		if (data[i]->iter) bam_iter_destroy(data[i]->iter);
		free(data[i]);
	}
	free(del_str[0].s); free(del_str[1].s);
	free(ins_str[0].s); free(ins_str[1].s);
	free(qposP);
	free(data); free(plp); free(ref); free(n_plp);
	return 0;
}
Пример #21
0
/*
 * Reads up to n_needed records from the BAM stream in `bs` and converts them
 * to bwa_seq_t entries.
 *
 * Records are selected by bs->which: bit 1 accepts first-in-pair reads,
 * bit 2 second-in-pair reads, bit 4 reads flagged as neither.  Bases go
 * through bam_nt16_nt4_table, qualities are shifted by +33 and capped at 126,
 * and reverse-strand records are flipped (sequence with complement flag 1,
 * qualities with 0).  When trim_qual >= 1 each read is passed to
 * bwa_trim_read().
 *
 * Stores the number of reads produced in *n and returns the array, or 0 when
 * no read was produced.  A read result below -1 is reported through
 * err_fatal_simple().
 */
static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual)
{
	bwa_seq_t *seqs, *p;
	int n_seqs = 0, l, i;
	long n_trimmed = 0, n_tot = 0;
	bam1_t *b = bam_init1();
	int res;

	seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
	for (;;) {
		uint8_t *s, *q;
		int first_ok, second_ok, single_ok, reversed;

#ifdef USE_HTSLIB
		res = sam_read1(bs->fp, bs->h, b);
#else
		res = bam_read1(bs->fp, b);
#endif
		if (res < 0) break;

		// keep the record only if its pairing flags match what the caller asked for
		first_ok = (bs->which & 1) && (b->core.flag & BAM_FREAD1);
		second_ok = (bs->which & 2) && (b->core.flag & BAM_FREAD2);
		single_ok = (bs->which & 4) && !(b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2);
		if (!first_ok && !second_ok && !single_ok) continue;

		l = b->core.l_qseq;
		p = &seqs[n_seqs++];
		p->tid = -1; // no assigned to a thread
		p->qual = 0;
		p->full_len = p->clip_len = p->len = l;
		n_tot += p->full_len;
#ifdef USE_HTSLIB
		s = bam_get_seq(b);
		q = bam_get_qual(b);
#else
		s = bam1_seq(b);
		q = bam1_qual(b);
#endif
		p->seq = (ubyte_t*)calloc(p->len + 1, 1);
		p->qual = (ubyte_t*)calloc(p->len + 1, 1);
		for (i = 0; i != p->full_len; ++i) {
#ifdef USE_HTSLIB
			p->seq[i] = bam_nt16_nt4_table[(int)bam_seqi(s, i)];
#else
			p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)];
#endif
			p->qual[i] = q[i] + 33 < 126? q[i] + 33 : 126;
		}
#ifdef USE_HTSLIB
		reversed = bam_is_rev(b);
#else
		reversed = bam1_strand(b);
#endif
		if (reversed) { // then reverse
			seq_reverse(p->len, p->seq, 1);
			seq_reverse(p->len, p->qual, 0);
		}
		if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
		p->rseq = (ubyte_t*)calloc(p->full_len, 1);
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);
#ifdef USE_HTSLIB
		p->name = strdup((const char*)bam_get_qname(b));
#else
		p->name = strdup((const char*)bam1_qname(b));
#endif
		if (n_seqs == n_needed) break;
	}
	if (res < -1) err_fatal_simple("Error reading bam file");
	*n = n_seqs;
	if (n_seqs && trim_qual >= 1)
		fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
	bam_destroy1(b);
	if (n_seqs == 0) {
		free(seqs);
		return 0;
	}
	return seqs;
}

/* Barcode bases whose quality (after removing the +33 offset) is below this
 * value are stored in lower case. */
#define BARCODE_LOW_QUAL 13

/*
 * Reads up to n_needed sequences from `bs` into a newly allocated array.
 *
 * mode carries BWA_MODE_COMPREAD (complement flag handed to seq_reverse for
 * rseq), BWA_MODE_IL13 (31 is subtracted from every quality character),
 * BWA_MODE_CFY (skip reads whose comment has 'Y' right after its first ':')
 * and, in its top 8 bits, the barcode length l_bc.  When l_bc > 0 the first
 * l_bc bases are moved into p->bc and removed from sequence and quality;
 * reads not longer than the barcode are skipped.  A trailing "/1" or "/2" is
 * stripped from the read name.  BAM input is delegated to bwa_read_bam().
 *
 * Stores the number of reads produced in *n.  Returns the array, or 0 when
 * nothing was read or l_bc exceeds BWA_MAX_BCLEN (in which case *n is left
 * untouched).
 */
bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int trim_qual)
{
	bwa_seq_t *seqs, *p;
	kseq_t *seq = bs->ks;
	int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24;
	long n_trimmed = 0, n_tot = 0;

	if (l_bc > BWA_MAX_BCLEN) {
		fprintf(stderr, "[%s] the maximum barcode length is %d.\n", __func__, BWA_MAX_BCLEN);
		return 0;
	}
	if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input
	n_seqs = 0;
	seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
	while ((l = kseq_read(seq)) >= 0) {
		if ((mode & BWA_MODE_CFY) && (seq->comment.l != 0)) {
			// skip reads that are marked to be filtered by Casava
			char *s = strchr(seq->comment.s, ':'); // strchr: standard replacement for the legacy index()
			if (s && *(++s) == 'Y') {
				continue;
			}
		}
		if (is_64 && seq->qual.l)
			for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31;
		if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length
		p = &seqs[n_seqs++];
		if (l_bc) { // then trim barcode
			for (i = 0; i < l_bc; ++i) {
				// the ctype functions require an unsigned char value
				const unsigned char base = (unsigned char)seq->seq.s[i];
				p->bc[i] = (seq->qual.l && seq->qual.s[i]-33 < BARCODE_LOW_QUAL)? tolower(base) : toupper(base);
			}
			p->bc[i] = 0;
			for (; i < seq->seq.l; ++i)
				seq->seq.s[i - l_bc] = seq->seq.s[i];
			seq->seq.l -= l_bc; seq->seq.s[seq->seq.l] = 0;
			if (seq->qual.l) {
				for (i = l_bc; i < seq->qual.l; ++i)
					seq->qual.s[i - l_bc] = seq->qual.s[i];
				seq->qual.l -= l_bc; seq->qual.s[seq->qual.l] = 0;
			}
			l = seq->seq.l;
		} else p->bc[0] = 0;
		p->tid = -1; // no assigned to a thread
		p->qual = 0;
		p->full_len = p->clip_len = p->len = l;
		n_tot += p->full_len;
		p->seq = (ubyte_t*)calloc(p->full_len, 1);
		for (i = 0; i != p->full_len; ++i)
			p->seq[i] = nst_nt4_table[(unsigned char)seq->seq.s[i]]; // never a negative index for bytes >= 0x80
		if (seq->qual.l) { // copy quality
			p->qual = (ubyte_t*)strdup((char*)seq->qual.s);
			if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
		}
		p->rseq = (ubyte_t*)calloc(p->full_len, 1);
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);
		p->name = strdup((const char*)seq->name.s);
		{ // trim /[12]$
			int t = strlen(p->name);
			if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0';
		}
		if (n_seqs == n_needed) break;
	}
	// The loop is only left with l < 0 when kseq_read() itself failed; -1 is a
	// regular end of file, anything lower is a malformed/truncated record.  The
	// reads collected so far are still returned, but no longer silently.
	if (l < -1)
		fprintf(stderr, "[%s] input ended with a malformed or truncated record (kseq_read returned %d).\n", __func__, l);
	*n = n_seqs;
	if (n_seqs && trim_qual >= 1)
		fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
	if (n_seqs == 0) {
		free(seqs);
		return 0;
	}
	return seqs;
}

/*
 * Releases an array of n_seqs reads: the cigar of every multi-hit entry,
 * then each read's name, seq, rseq, qual, aln, md, multi and cigar buffers,
 * and finally the array itself.
 */
void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs)
{
	int r = 0;
	while (r != n_seqs) {
		bwa_seq_t *rec = &seqs[r];
		int m;
		// free() ignores null pointers, so unset cigars need no special case
		for (m = 0; m < rec->n_multi; ++m)
			free(rec->multi[m].cigar);
		free(rec->name);
		free(rec->seq);
		free(rec->rseq);
		free(rec->qual);
		free(rec->aln);
		free(rec->md);
		free(rec->multi);
		free(rec->cigar);
		++r;
	}
	free(seqs);
}