Ejemplo n.º 1
0
static inline void pileup_seq(FILE *fp, const bam_pileup1_t *p, int pos, int ref_len, const char *ref)
{
    int j;
    if (p->is_head) {
        putc('^', fp);
        putc(p->b->core.qual > 93? 126 : p->b->core.qual + 33, fp);
    }
    if (!p->is_del) {
        int c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos)];
        if (ref) {
            int rb = pos < ref_len? ref[pos] : 'N';
            if (c == '=' || seq_nt16_table[c] == seq_nt16_table[rb]) c = bam_is_rev(p->b)? ',' : '.';
            else c = bam_is_rev(p->b)? tolower(c) : toupper(c);
        } else {
            if (c == '=') c = bam_is_rev(p->b)? ',' : '.';
            else c = bam_is_rev(p->b)? tolower(c) : toupper(c);
        }
        putc(c, fp);
    } else putc(p->is_refskip? (bam_is_rev(p->b)? '<' : '>') : '*', fp);
    if (p->indel > 0) {
        putc('+', fp); printw(p->indel, fp);
        for (j = 1; j <= p->indel; ++j) {
            int c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos + j)];
            putc(bam_is_rev(p->b)? tolower(c) : toupper(c), fp);
        }
    } else if (p->indel < 0) {
        printw(p->indel, fp);
        for (j = 1; j <= -p->indel; ++j) {
            int c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N';
            putc(bam_is_rev(p->b)? tolower(c) : toupper(c), fp);
        }
    }
    if (p->is_tail) putc('$', fp);
}
Ejemplo n.º 2
0
/**
 * Gets the read sequence from a bam record
 */
void bam_get_seq_string(bam1_t *s, kstring_t *seq)
{
    seq->l=0;
    uint8_t* sq = bam_get_seq(s);
    for (uint16_t i = 0; i < bam_get_l_qseq(s); ++i)
    {
        kputc("=ACMGRSVTWYHKDBN"[bam_seqi(sq, i)], seq);
    }
};
Ejemplo n.º 3
0
void dump_read(bam1_t* b) {
    printf("->core.tid:(%d)\n", b->core.tid);
    printf("->core.pos:(%d)\n", b->core.pos);
    printf("->core.bin:(%d)\n", b->core.bin);
    printf("->core.qual:(%d)\n", b->core.qual);
    printf("->core.l_qname:(%d)\n", b->core.l_qname);
    printf("->core.flag:(%d)\n", b->core.flag);
    printf("->core.n_cigar:(%d)\n", b->core.n_cigar);
    printf("->core.l_qseq:(%d)\n", b->core.l_qseq);
    printf("->core.mtid:(%d)\n", b->core.mtid);
    printf("->core.mpos:(%d)\n", b->core.mpos);
    printf("->core.isize:(%d)\n", b->core.isize);
    if (b->data) {
        printf("->data:");
        int i;
        for (i = 0; i < b->l_data; ++i) {
            printf("%x ", b->data[i]);
        }
        printf("\n");
    }
    if (b->core.l_qname) {
        printf("qname: %s\n",bam_get_qname(b));
    }
    if (b->core.l_qseq) {
        printf("qseq:");
        int i;
        for (i = 0; i < b->core.l_qseq; ++i) {
            printf("%c",seq_nt16_str[seq_nt16_table[bam_seqi(bam_get_seq(b),i)]]);
        }
        printf("\n");
        printf("qual:");
        for (i = 0; i < b->core.l_qseq; ++i) {
            printf("%c",bam_get_qual(b)[i]);
        }
        printf("\n");

    }

    if (bam_get_l_aux(b)) {
        int i = 0;
        uint8_t* aux = bam_get_aux(b);

        while (i < bam_get_l_aux(b)) {
            printf("%.2s:%c:",aux+i,*(aux+i+2));
            i += 2;
            switch (*(aux+i)) {
                case 'Z':
                    while (*(aux+1+i) != '\0') { putc(*(aux+1+i), stdout); ++i; }
                    break;
            }
            putc('\n',stdout);
            ++i;++i;
        }
    }
    printf("\n");
}
Ejemplo n.º 4
0
Alignment bam_to_alignment(const bam1_t *b, map<string, string>& rg_sample) {

    Alignment alignment;

    // get the sequence and qual
    int32_t lqseq = b->core.l_qseq;
    string sequence; sequence.resize(lqseq);

    uint8_t* qualptr = bam_get_qual(b);
    string quality;//(lqseq, 0);
    quality.assign((char*)qualptr, lqseq);

    // process the sequence into chars
    uint8_t* seqptr = bam_get_seq(b);
    for (int i = 0; i < lqseq; ++i) {
        sequence[i] = "=ACMGRSVTWYHKDBN"[bam_seqi(seqptr, i)];
    }

    // get the read group and sample name
    uint8_t *rgptr = bam_aux_get(b, "RG");
    char* rg = (char*) (rgptr+1);
    //if (!rg_sample
    string sname;
    if (!rg_sample.empty()) {
        sname = rg_sample[string(rg)];
    }

    // Now name the read after the scaffold
    string read_name = bam_get_qname(b);

    // Decide if we are a first read (/1) or second (last) read (/2)
    if(b->core.flag & BAM_FREAD1) {
        read_name += "/1";
    }
    if(b->core.flag & BAM_FREAD2) {
        read_name += "/2";
    }
    
    // If we are marked as both first and last we get /1/2, and if we are marked
    // as neither the scaffold name comes through unchanged as the read name.
    // TODO: produce correct names for intermediate reads on >2 read scaffolds.

    // add features to the alignment
    alignment.set_name(read_name);
    alignment.set_sequence(sequence);
    alignment.set_quality(quality);
    
    // TODO: htslib doesn't wrap this flag for some reason.
    alignment.set_is_secondary(b->core.flag & BAM_FSECONDARY);
    if (sname.size()) {
        alignment.set_sample_name(sname);
        alignment.set_read_group(rg);
    }

    return alignment;
}
Ejemplo n.º 5
0
char *get_sequence(const bam1_t *entry) {
  int i;
  int length = entry->core.l_qseq;
  char *seq = malloc(length + 1);
  unsigned char *bam_seq = bam_get_seq(entry);
  /* The for could be factored but this prevents from having a if in the
     loop */
  if (entry->core.flag & BAM_FREVERSE) {
    for (i = 0; i < length; i++) {
      seq[length - i - 1] = comp_seq_nt16_str[bam_seqi(bam_seq, i)];
    }
  } else {
    for (i = 0; i < length; ++i) {
      seq[i] = seq_nt16_str[bam_seqi(bam_seq,i)];
    }
  }
  seq[i] = 0;
  return seq;
}
Ejemplo n.º 6
0
const char* get_sequence(const bam1_t *b) {
    if(b == NULL) die("get_sequence: parameter error\n");
    const uint8_t *seq = bam_get_seq(b);
    size_t len = b->core.l_qseq;
    char* sequence;
    sequence = malloc(len*sizeof(char));
    uint8_t offset = (b->core.flag & BAM_FREVERSE) ? 16 : 0;
    size_t i;
    for (i=0; i<len; i++) {
        switch(bam_seqi(seq, i) + offset)
        {
        case 1:
            sequence[i] = 'A';
            break;
        case 2:
            sequence[i] = 'C';
            break;
        case 4:
            sequence[i] = 'G';
            break;
        case 8:
            sequence[i] = 'T';
            break;
        case 15:
            sequence[i] = 'N';
            break;
        //Complements (original index + 16)
        case 17:
            sequence[i] = 'T';
            break;
        case 18:
            sequence[i] = 'G';
            break;
        case 20:
            sequence[i] = 'C';
            break;
        case 24:
            sequence[i] = 'A';
            break;
        case 31:
            sequence[i] = 'N';
            break;
        default:
            sequence[i] = 'N';
            break;
        }
    }
    if (offset) sequence = strrev(sequence);
    return sequence;
}
Ejemplo n.º 7
0
static int unpad_seq(bam1_t *b, kstring_t *s)
{
    // Returns 0 on success, -1 on an error
    int k, j, i;
    int length;
    int cigar_n_warning = 0; /* Make this a global and limit to one CIGAR N warning? */
    uint32_t *cigar = bam_get_cigar(b);
    uint8_t *seq = bam_get_seq(b);

    // b->core.l_qseq gives length of the SEQ entry (including soft clips, S)
    // We need the padded length after alignment from the CIGAR (excluding
    // soft clips S, but including pads from CIGAR D operations)
    length = bam_cigar2rlen(b->core.n_cigar, cigar);
    ks_resize(s, length);
    for (k = 0, s->l = 0, j = 0; k < b->core.n_cigar; ++k) {
        int op, ol;
        op = bam_cigar_op(cigar[k]);
        ol = bam_cigar_oplen(cigar[k]);
        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
            for (i = 0; i < ol; ++i, ++j) s->s[s->l++] = bam_seqi(seq, j);
        } else if (op == BAM_CSOFT_CLIP) {
            j += ol;
        } else if (op == BAM_CHARD_CLIP) {
            /* do nothing */
        } else if (op == BAM_CDEL) {
            for (i = 0; i < ol; ++i) s->s[s->l++] = 0;
        } else if (op == BAM_CREF_SKIP) {
            /* Treat CIGAR N as D (not ideal, but better than ignoring it) */
            for (i = 0; i < ol; ++i) s->s[s->l++] = 0;
            if (0 == cigar_n_warning) {
                cigar_n_warning = -1;
                fprintf(stderr, "[depad] WARNING: CIGAR op N treated as op D in read %s\n", bam_get_qname(b));
            }
        } else {
            fprintf(stderr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam_get_qname(b));
            return -1;
        }
    }
    return length != s->l;
}
Ejemplo n.º 8
0
 Mapping(const bam_hdr_t * hdr_p, bam1_t * rec_p)
     : _rec_p(rec_p)
 {
     _query_name = bam_get_qname(rec_p);
     _flag = rec_p->core.flag;
     for (int i = 0; i < rec_p->core.l_qseq; ++i)
     {
         _seq += seq_nt16_str[bam_seqi(bam_get_seq(rec_p), i)];
     }
     if (is_mapped())
     {
         _chr_name = hdr_p->target_name[rec_p->core.tid];
         _rf_start = rec_p->core.pos;
         _cigar = Cigar(bam_get_cigar(rec_p), rec_p->core.n_cigar);
         _rf_len = _cigar.rf_len();
     }
     if (is_paired() and mp_is_mapped())
     {
         _mp_chr_name = hdr_p->target_name[rec_p->core.mtid];
         _mp_rf_start = rec_p->core.mpos;
     }
 }
Ejemplo n.º 9
0
	int baseCodeAt(int pos) const {
		assert(pos >= 0 && pos < len);
		return (return_current ? codes[bam_seqi(seq, pos)] : rcodes[bam_seqi(seq, len - pos - 1)]);
	}
Ejemplo n.º 10
0
	char baseAt(int pos) const {
		assert(pos >= 0 && pos < len);
		return (return_current ? decode[bam_seqi(seq, pos)] : decode_r[bam_seqi(seq, len - pos - 1)]);
	}
Ejemplo n.º 11
0
void bam_fillmd1_core(bam1_t *b, char *ref, int ref_len, int flag, int max_nm)
{
    uint8_t *seq = bam_get_seq(b);
    uint32_t *cigar = bam_get_cigar(b);
    bam1_core_t *c = &b->core;
    int i, x, y, u = 0;
    kstring_t *str;
    int32_t old_nm_i = -1, nm = 0;

    str = (kstring_t*)calloc(1, sizeof(kstring_t));
    for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
        int j, l = cigar[i]>>4, op = cigar[i]&0xf;
        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
            for (j = 0; j < l; ++j) {
                int c1, c2, z = y + j;
                if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
                c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]];
                if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
                    if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
                    ++u;
                } else {
                    kputw(u, str);
                    kputc(ref[x+j], str);
                    u = 0;
                    ++nm;
                }
            }
            if (j < l) break;
            x += l;
            y += l;
        } else if (op == BAM_CDEL) {
            kputw(u, str);
            kputc('^', str);
            for (j = 0; j < l; ++j) {
                if (x+j >= ref_len || ref[x+j] == '\0') break;
                kputc(ref[x+j], str);
            }
            u = 0;
            x += j;
            nm += j;
            if (j < l) break;
        } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
            y += l;
            if (op == BAM_CINS) nm += l;
        } else if (op == BAM_CREF_SKIP) {
            x += l;
        }
    }
    kputw(u, str);
    // apply max_nm
    if (max_nm > 0 && nm >= max_nm) {
        for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
            int j, l = cigar[i]>>4, op = cigar[i]&0xf;
            if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
                for (j = 0; j < l; ++j) {
                    int c1, c2, z = y + j;
                    if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
                    c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]];
                    if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
                        seq[z/2] |= (z&1)? 0x0f : 0xf0;
                        bam_get_qual(b)[z] = 0;
                    }
                }
                if (j < l) break;
                x += l;
                y += l;
            } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
            else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
        }
    }
    // update NM
    if ((flag & UPDATE_NM) && !(c->flag & BAM_FUNMAP)) {
        uint8_t *old_nm = bam_aux_get(b, "NM");
        if (old_nm) old_nm_i = bam_aux2i(old_nm);
        if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
        else if (nm != old_nm_i) {
            fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam_get_qname(b), old_nm_i, nm);
            bam_aux_del(b, old_nm);
            bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
        }
    }
    // update MD
    if ((flag & UPDATE_MD) && !(c->flag & BAM_FUNMAP)) {
        uint8_t *old_md = bam_aux_get(b, "MD");
        if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
        else {
            int is_diff = 0;
            if (strlen((char*)old_md+1) == str->l) {
                for (i = 0; i < str->l; ++i)
                    if (toupper(old_md[i+1]) != toupper(str->s[i]))
                        break;
                if (i < str->l) is_diff = 1;
            } else is_diff = 1;
            if (is_diff) {
                fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str->s);
                bam_aux_del(b, old_md);
                bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
            }
        }
    }

    // drop all tags but RG
    if (flag&DROP_TAG) {
        uint8_t *q = bam_aux_get(b, "RG");
        bam_aux_drop_other(b, q);
    }
    // reduce the resolution of base quality
    if (flag&BIN_QUAL) {
        uint8_t *qual = bam_get_qual(b);
        for (i = 0; i < b->core.l_qseq; ++i)
            if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7;
    }

    free(str->s);
    free(str);
}
Ejemplo n.º 12
0
// Transform a bam1_t record into a string with the FASTQ representation of it
// @returns false for error, true for success
static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state)
{
    int i;
    int32_t qlen = b->core.l_qseq;
    assert(qlen >= 0);
    uint8_t *seq;
    uint8_t *qual = bam_get_qual(b);
    const uint8_t *oq = NULL;
    if (state->use_oq) {
        oq = bam_aux_get(b, "OQ");
        if (oq) oq++; // skip tag type
    }
    bool has_qual = (qual[0] != 0xff || (state->use_oq && oq)); // test if there is quality

    linebuf->l = 0;
    // Write read name
    readpart readpart = which_readpart(b);
    kputc(state->filetype == FASTA? '>' : '@', linebuf);
    kputs(bam_get_qname(b), linebuf);
    // Add the /1 /2 if requested
    if (state->has12) {
        if (readpart == READ_1) kputs("/1", linebuf);
        else if (readpart == READ_2) kputs("/2", linebuf);
    }
    if (state->copy_tags) {
        for (i = 0; copied_tags[i]; ++i) {
            uint8_t *s;
            if ((s = bam_aux_get(b, copied_tags[i])) != 0) {
                kputc('\t', linebuf);
                kputsn(copied_tags[i], 2, linebuf);
                kputsn(":Z:", 3, linebuf);
                kputs(bam_aux2Z(s), linebuf);
            }
        }
    }
    kputc('\n', linebuf);

    seq = bam_get_seq(b);

    if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
        for (i = qlen-1; i > -1; --i) {
            char c = seq_nt16_str[seq_comp_table[bam_seqi(seq,i)]];
            kputc(c, linebuf);
        }
    } else {
        for (i = 0; i < qlen; ++i) {
            char c = seq_nt16_str[bam_seqi(seq,i)];
            kputc(c, linebuf);
        }
    }
    kputc('\n', linebuf);

    if (state->filetype == FASTQ) {
        // Write quality
        kputs("+\n", linebuf);
        if (has_qual) {
            if (state->use_oq && oq) {
                if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
                    for (i = qlen-1; i > -1; --i) {
                        kputc(oq[i], linebuf);
                    }
                } else {
                    kputs((char*)oq, linebuf);
                }
            } else {
                if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
                    for (i = qlen-1; i > -1; --i) {
                        kputc(33 + qual[i], linebuf);
                    }
                } else {
                    for (i = 0; i < qlen; ++i) {
                        kputc(33 + qual[i], linebuf);
                    }
                }
            }
        } else {
            for (i = 0; i < qlen; ++i) {
                kputc(33 + state->def_qual, linebuf);
            }
        }
        kputc('\n', linebuf);
    }
    return true;
}
Ejemplo n.º 13
0
static int trim_ns(bam1_t *b, void *data) {
    int ret = 0;
    opts_t *op((opts_t *)data);
    std::vector<uint8_t> aux(bam_get_aux(b), bam_get_aux(b) + bam_get_l_aux(b));
    int tmp;
    uint8_t *const seq(bam_get_seq(b));
    uint32_t *const cigar(bam_get_cigar(b));
    //op->n_cigar = b->core.n_cigar;
    op->resize(b->l_data); // Make sure it's big enough to hold everything.
    memcpy(op->data, b->data, b->core.l_qname);

    // Get #Ns at the beginning
    for(tmp = 0; bam_seqi(seq, tmp) == dlib::htseq::HTS_N; ++tmp);
    const int n_start(tmp);

    if(tmp == b->core.l_qseq - 1) // all bases are N -- garbage read
         ret |= op->skip_all_ns;

    // Get #Ns at the end
    for(tmp = b->core.l_qseq - 1; bam_seqi(seq, tmp) == dlib::htseq::HTS_N; --tmp);
    const int n_end(b->core.l_qseq - 1 - tmp);

    // Get new length for read
    int final_len(b->core.l_qseq - n_end - n_start);
    if(final_len < 0) final_len = 0;
    if(final_len < op->min_trimmed_len) // Too short.
        ret |= 1;
    // Copy in qual and all of aux.

    if(n_end) {
        if((tmp = bam_cigar_oplen(cigar[b->core.n_cigar - 1]) - n_end) == 0) {
            LOG_DEBUG("Entire cigar operation is the softclip. Decrease the number of new cigar operations.\n");
            --b->core.n_cigar;
        } else {
            LOG_DEBUG("Updating second cigar operation in-place.\n");
            cigar[b->core.n_cigar - 1] = bam_cigar_gen(tmp, BAM_CSOFT_CLIP);
        }
    }

    // Get new n_cigar.
    if((tmp = bam_cigar_oplen(*cigar) - n_start) == 0) {
        memcpy(op->data + b->core.l_qname, cigar + 1, (--b->core.n_cigar) << 2); // << 2 for 4 bit per cigar op
    } else {
        if(n_start) *cigar = bam_cigar_gen(tmp, BAM_CSOFT_CLIP);
        memcpy(op->data + b->core.l_qname, cigar, b->core.n_cigar << 2);
    }
    uint8_t *opseq(op->data + b->core.l_qname + (b->core.n_cigar << 2)); // Pointer to the seq region of new data field.
    for(tmp = 0; tmp < final_len >> 1; ++tmp)
        opseq[tmp] = (bam_seqi(seq, ((tmp << 1) + n_start)) << 4) | (bam_seqi(seq, (tmp << 1) + n_start + 1));
    if(final_len & 1)
        opseq[tmp] = (bam_seqi(seq, ((tmp << 1) + n_start)) << 4);

    tmp = bam_get_l_aux(b);
    memcpy(opseq + ((final_len + 1) >> 1), bam_get_qual(b) + n_start, final_len + tmp);
    // Switch data strings
    std::swap(op->data, b->data);
    b->core.l_qseq = final_len;
    memcpy(bam_get_aux(b), aux.data(), aux.size());
    b->l_data = (bam_get_aux(b) - b->data) + aux.size();
    if(n_end) bam_aux_append(b, "NE", 'i', sizeof(int), (uint8_t *)&n_end);
    if(n_start) bam_aux_append(b, "NS", 'i', sizeof(int), (uint8_t *)&n_start);
    const uint32_t *pvar((uint32_t *)dlib::array_tag(b, "PV"));
    tmp = b->core.flag & BAM_FREVERSE ? n_end: n_start;
    if(pvar) {
        std::vector<uint32_t>pvals(pvar + tmp, pvar + final_len + tmp);
        bam_aux_del(b, (uint8_t *)(pvar) - 6);
        dlib::bam_aux_array_append(b, "PV", 'I', sizeof(uint32_t), final_len, (uint8_t *)pvals.data());
    }
    const uint32_t *fvar((uint32_t *)dlib::array_tag(b, "FA"));
    if(fvar) {
        std::vector<uint32_t>fvals(fvar + tmp, fvar + final_len + tmp);
        bam_aux_del(b, (uint8_t *)(fvar) - 6);
        dlib::bam_aux_array_append(b, "FA", 'I', sizeof(uint32_t), final_len, (uint8_t *)fvals.data());
    }
    return ret;
}
Ejemplo n.º 14
0
static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual)
{
	bwa_seq_t *seqs, *p;
	int n_seqs, l, i;
	long n_trimmed = 0, n_tot = 0;
	bam1_t *b;
	int res;

	b = bam_init1();
	n_seqs = 0;
	seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
#ifdef USE_HTSLIB
	while ((res = sam_read1(bs->fp, bs->h, b)) >= 0) {
#else
	while ((res = bam_read1(bs->fp, b)) >= 0) {
#endif
		uint8_t *s, *q;
		int go = 0;
		if ((bs->which & 1) && (b->core.flag & BAM_FREAD1)) go = 1;
		if ((bs->which & 2) && (b->core.flag & BAM_FREAD2)) go = 1;
		if ((bs->which & 4) && !(b->core.flag& BAM_FREAD1) && !(b->core.flag& BAM_FREAD2))go = 1;
		if (go == 0) continue;
		l = b->core.l_qseq;
		p = &seqs[n_seqs++];
		p->tid = -1; // no assigned to a thread
		p->qual = 0;
		p->full_len = p->clip_len = p->len = l;
		n_tot += p->full_len;
#ifdef USE_HTSLIB
		s = bam_get_seq(b); q = bam_get_qual(b);
#else
		s = bam1_seq(b); q = bam1_qual(b);
#endif
		p->seq = (ubyte_t*)calloc(p->len + 1, 1);
		p->qual = (ubyte_t*)calloc(p->len + 1, 1);
		for (i = 0; i != p->full_len; ++i) {
#ifdef USE_HTSLIB
			p->seq[i] = bam_nt16_nt4_table[(int)bam_seqi(s, i)];
#else
			p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)];
#endif
			p->qual[i] = q[i] + 33 < 126? q[i] + 33 : 126;
		}
#ifdef USE_HTSLIB
		if (bam_is_rev(b)) { // then reverse 
#else
		if (bam1_strand(b)) { // then reverse 
#endif
			seq_reverse(p->len, p->seq, 1);
			seq_reverse(p->len, p->qual, 0);
		}
		if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
		p->rseq = (ubyte_t*)calloc(p->full_len, 1);
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);
#ifdef USE_HTSLIB
		p->name = strdup((const char*)bam_get_qname(b));
#else
		p->name = strdup((const char*)bam1_qname(b));
#endif
		if (n_seqs == n_needed) break;
	}
	if (res < 0 && res != -1) err_fatal_simple("Error reading bam file");
	*n = n_seqs;
	if (n_seqs && trim_qual >= 1)
		fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
	if (n_seqs == 0) {
		free(seqs);
		bam_destroy1(b);
		return 0;
	}
	bam_destroy1(b);
	return seqs;
}

#define BARCODE_LOW_QUAL 13

bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int trim_qual)
{
	bwa_seq_t *seqs, *p;
	kseq_t *seq = bs->ks;
	int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24;
	long n_trimmed = 0, n_tot = 0;

	if (l_bc > BWA_MAX_BCLEN) {
		fprintf(stderr, "[%s] the maximum barcode length is %d.\n", __func__, BWA_MAX_BCLEN);
		return 0;
	}
	if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input
	n_seqs = 0;
	seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
	while ((l = kseq_read(seq)) >= 0) {
		if ((mode & BWA_MODE_CFY) && (seq->comment.l != 0)) {
			// skip reads that are marked to be filtered by Casava
			char *s = index(seq->comment.s, ':');
			if (s && *(++s) == 'Y') {
				continue;
			}
		}
		if (is_64 && seq->qual.l)
			for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31;
		if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length
		p = &seqs[n_seqs++];
		if (l_bc) { // then trim barcode
			for (i = 0; i < l_bc; ++i)
				p->bc[i] = (seq->qual.l && seq->qual.s[i]-33 < BARCODE_LOW_QUAL)? tolower(seq->seq.s[i]) : toupper(seq->seq.s[i]);
			p->bc[i] = 0;
			for (; i < seq->seq.l; ++i)
				seq->seq.s[i - l_bc] = seq->seq.s[i];
			seq->seq.l -= l_bc; seq->seq.s[seq->seq.l] = 0;
			if (seq->qual.l) {
				for (i = l_bc; i < seq->qual.l; ++i)
					seq->qual.s[i - l_bc] = seq->qual.s[i];
				seq->qual.l -= l_bc; seq->qual.s[seq->qual.l] = 0;
			}
			l = seq->seq.l;
		} else p->bc[0] = 0;
		p->tid = -1; // no assigned to a thread
		p->qual = 0;
		p->full_len = p->clip_len = p->len = l;
		n_tot += p->full_len;
		p->seq = (ubyte_t*)calloc(p->full_len, 1);
		for (i = 0; i != p->full_len; ++i)
			p->seq[i] = nst_nt4_table[(int)seq->seq.s[i]];
		if (seq->qual.l) { // copy quality
			p->qual = (ubyte_t*)strdup((char*)seq->qual.s);
			if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
		}
		p->rseq = (ubyte_t*)calloc(p->full_len, 1);
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);
		p->name = strdup((const char*)seq->name.s);
		{ // trim /[12]$
			int t = strlen(p->name);
			if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0';
		}
		if (n_seqs == n_needed) break;
	}
	*n = n_seqs;
	if (n_seqs && trim_qual >= 1)
		fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
	if (n_seqs == 0) {
		free(seqs);
		return 0;
	}
	return seqs;
}

void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs)
{
	int i, j;
	for (i = 0; i != n_seqs; ++i) {
		bwa_seq_t *p = seqs + i;
		for (j = 0; j < p->n_multi; ++j)
			if (p->multi[j].cigar) free(p->multi[j].cigar);
		free(p->name);
		free(p->seq); free(p->rseq); free(p->qual); free(p->aln); free(p->md); free(p->multi);
		free(p->cigar);
	}
	free(seqs);
}
Ejemplo n.º 15
0
int main_bam2fq(int argc, char *argv[])
{
    BGZF *fp, *fpse = 0;
    bam1_t *b;
    uint8_t *buf;
    int max_buf, c, has12 = 0;
    kstring_t str;
    int64_t n_singletons = 0, n_reads = 0;
    char last[512], *fnse = 0;

    while ((c = getopt(argc, argv, "as:")) > 0)
        if (c == 'a') has12 = 1;
        else if (c == 's') fnse = optarg;
    if (argc == optind) {
        fprintf(stderr, "\nUsage:   bam2fq [-a] [-s outSE] <in.bam>\n\n");
        fprintf(stderr, "Options: -a        append /1 and /2 to the read name\n");
        fprintf(stderr, "         -s FILE   write singleton reads to FILE [assume single-end]\n");
        fprintf(stderr, "\n");
        return 1;
    }
    fp = strcmp(argv[optind], "-")? bgzf_open(argv[optind], "r") : bgzf_dopen(fileno(stdin), "r");
    assert(fp);
    bam_hdr_destroy(bam_hdr_read(fp));
    buf = 0;
    max_buf = 0;
    str.l = str.m = 0;
    str.s = 0;
    last[0] = 0;
    if (fnse) fpse = bgzf_open(fnse, "w1");

    b = bam_init1();
    while (bam_read1(fp, b) >= 0) {
        int i, qlen = b->core.l_qseq, is_print = 0;
        uint8_t *qual, *seq;
        if (b->flag&BAM_FSECONDARY) continue; // skip secondary alignments
        ++n_reads;
        if (fpse) {
            if (str.l && strcmp(last, bam_get_qname(b))) {
                bgzf_write(fpse, str.s, str.l);
                str.l = 0;
                ++n_singletons;
            }
            if (str.l) is_print = 1;
            strcpy(last, bam_get_qname(b));
        } else is_print = 1;
        qual = bam_get_qual(b);
        kputc(qual[0] == 0xff? '>' : '@', &str);
        kputsn(bam_get_qname(b), b->core.l_qname - 1, &str);
        if (has12) {
            kputc('/', &str);
            kputw(b->core.flag>>6&3, &str);
        }
        kputc('\n', &str);
        if (max_buf < qlen + 1) {
            max_buf = qlen + 1;
            kroundup32(max_buf);
            buf = (uint8_t*)realloc(buf, max_buf);
        }
        buf[qlen] = 0;
        seq = bam_get_seq(b);
        for (i = 0; i < qlen; ++i) buf[i] = bam_seqi(seq, i); // copy the sequence
        if (bam_is_rev(b)) { // reverse complement
            for (i = 0; i < qlen>>1; ++i) {
                int8_t t = seq_comp_table[buf[qlen - 1 - i]];
                buf[qlen - 1 - i] = seq_comp_table[buf[i]];
                buf[i] = t;
            }
            if (qlen&1) buf[i] = seq_comp_table[buf[i]];
        }
        for (i = 0; i < qlen; ++i) buf[i] = seq_nt16_str[buf[i]];
        kputsn((char*)buf, qlen, &str);
        kputc('\n', &str);
        if (qual[0] != 0xff) {
            kputsn("+\n", 2, &str);
            for (i = 0; i < qlen; ++i) buf[i] = 33 + qual[i];
            if (bam_is_rev(b)) { // reverse
                for (i = 0; i < qlen>>1; ++i) {
                    uint8_t t = buf[qlen - 1 - i];
                    buf[qlen - 1 - i] = buf[i];
                    buf[i] = t;
                }
            }
        }
        kputsn((char*)buf, qlen, &str);
        kputc('\n', &str);
        if (is_print) {
            fwrite(str.s, 1, str.l, stdout);
            str.l = 0;
        }
    }
    if (fpse) {
        if (str.l) {
            bgzf_write(fpse, str.s, str.l);
            ++n_singletons;
        }
        fprintf(stderr, "[M::%s] discarded %lld singletons\n", __func__, (long long)n_singletons);
        bgzf_close(fpse);
    }
    fprintf(stderr, "[M::%s] processed %lld reads\n", __func__, (long long)n_reads);
    free(buf);
    free(str.s);
    bam_destroy1(b);
    bgzf_close(fp);
    return 0;
}
Ejemplo n.º 16
0
BM_mappedRead * extractReads(char * bamFile,
                             char ** contigs,
                             int numContigs,
                             uint16_t * groups,
                             char * prettyName,
                             int headersOnly,
                             int minMapQual,
                             int maxMisMatches,
                             int ignoreSuppAlignments,
                             int ignoreSecondaryAlignments) {
    //-----
    // code uses the pattern outlined in samtools view (sam_view.c)
    // thanks lh3!
    //
    int i = 0;
    int result = -1;
    int hh = 0;

    int supp_check = 0x0; // include supp mappings
    if (ignoreSuppAlignments) {
        supp_check |= BAM_FSUPPLEMENTARY;
    }
    if (ignoreSecondaryAlignments) {
        supp_check |= BAM_FSECONDARY;
    }

    // we need to let the users know if their pairings
    // will be corrupted
    int p_corrupt = 0;

    // helper variables
    samFile *in = 0;
    bam_hdr_t *header = NULL;
    bam1_t *b = bam_init1();

    BM_mappedRead * root = 0;
    BM_mappedRead * prev = 0;

    // open file handlers
    if ((in = sam_open(bamFile, "r")) == 0) {
        fprintf(stderr,
                "ERROR: Failed to open \"%s\" for reading.\n",
                bamFile);
    }
    else {
        // retrieve the header
        if ((header = sam_hdr_read(in)) == 0) {
            fprintf(stderr,
                    "ERROR: Failed to read the header from \"%s\".\n",
                    bamFile);
        }
        else {
            // check the index is intact
            hts_idx_t *idx = sam_index_load(in, bamFile); // load index
            if (idx == 0) { // index is unavailable
                fprintf(stderr,
                        "ERROR: Random retrieval only works "\
                        "for indexed files.\n");
            }
            else {
                cfuhash_table_t *pair_buffer = \
                    cfuhash_new_with_initial_size(1000000);
                cfuhash_set_flag(pair_buffer, CFUHASH_FROZEN_UNTIL_GROWS);

                for (hh = 0; hh < numContigs; ++hh) {
                    // parse a region in the format like `chr2:100-200'
                    hts_itr_t *iter = sam_itr_querys(idx, header, contigs[hh]);
                    if (iter == NULL) { // reference name is not found
                        fprintf(stderr,
                                "WARNING: Could not find contig: "\
                                "[%s] in BAM: [%s].\n",
                                contigs[hh],
                                bamFile);
                    }

                    // fetch alignments
                    int line = 0;
                    while ((result = sam_itr_next(in, iter, b)) >= 0) {
                        bam1_core_t core = b->core;
                        line += 1;
                        // only high quality?, primary? mappings
                        if ( core.qual < minMapQual)
                            continue;
                        if ((core.flag & supp_check) != 0)
                            continue;
                        if(bam_aux2i(bam_aux_get(b, "NM")) > maxMisMatches) {
                            continue;
                        }

                        char * seqId = bam_get_qname(b);
                        char * seq = 0;
                        char * qual = 0;
                        int qual_len = 0;
                        int seq_len = 0;

                        // get sequence and quality
                        if(0 == headersOnly) {
                            // no point allocating unused space
                            seq = calloc(core.l_qseq+1, sizeof(char));
                            qual = calloc(core.l_qseq+1, sizeof(char));
                            uint8_t *s = bam_get_seq(b);
                            if (core.flag&BAM_FREVERSE) {
                                // reverse the read
                                int r = 0;
                                for (i = core.l_qseq-1; i >=0 ; --i) {
                                    seq[r]="=TGKCYSBAWRDMHVN"[bam_seqi(s,
                                                                       i)];
                                    ++r;
                                }
                            }
                            else {
                                for (i = 0; i < core.l_qseq; ++i) {
                                    seq[i]="=ACMGRSVTWYHKDBN"[bam_seqi(s,
                                                                       i)];
                                }
                            }
                            seq_len = core.l_qseq;

                            s = bam_get_qual(b);
                            if (s[0] != 0xff) {
                                qual_len = core.l_qseq;
                                for (i = 0; i < core.l_qseq; ++i) {
                                    qual[i] = (char)(s[i] + 33);
                                }
                            }
                            else if (qual != 0) {
                                free(qual);
                                qual = 0;
                            }
                        }

                        // work out pairing information
                        uint8_t rpi = RPI_ERROR;
                        if (core.flag&BAM_FPAIRED) {
                            if(core.flag&BAM_FMUNMAP) {
                                if (core.flag&BAM_FREAD1) {
                                    rpi = RPI_SNGL_FIR;
                                }
                                else if (core.flag&BAM_FREAD2) {
                                    rpi = RPI_SNGL_SEC;
                                }
                            }
                            else {
                                if (core.flag&BAM_FREAD1) {
                                    rpi = RPI_FIR;
                                }
                                else if (core.flag&BAM_FREAD2) {
                                    rpi = RPI_SEC;
                                }
                            }
                        }
                        else {
                            rpi = RPI_SNGL;
                        }

                        // make the funky Id
                        #define MAX_SEQ_ID_LEN 80
                        char * seq_id = calloc(MAX_SEQ_ID_LEN,
                                               sizeof(char));
                        // allocate the string to the buffer but check to
                        // ensure we're not cutting anything off
                        int id_len = snprintf(seq_id,
                                              MAX_SEQ_ID_LEN,
                                              "b_%s;c_%s;r_%s",
                                              prettyName,
                                              contigs[hh],
                                              seqId);
                        if(id_len >= MAX_SEQ_ID_LEN) {
                            seq_id = calloc(id_len+1, sizeof(char));
                            snprintf(seq_id,
                                     id_len+1, // don't forget the NULL!
                                     "b_%s;c_%s;r_%s",
                                     prettyName,
                                     contigs[hh],
                                     seqId);
                        }

                        // make the mapped read struct
                        prev = makeMappedRead(seq_id,
                                              seq,
                                              qual,
                                              id_len,
                                              seq_len,
                                              qual_len,
                                              rpi,
                                              groups[hh],
                                              prev);

                        if (0 == root) { root = prev; }

                        if(rpi == RPI_SNGL || \
                           rpi == RPI_SNGL_FIR || \
                           rpi == RPI_SNGL_SEC) {
                            // we can just add away
                            // indicate singleton reads by pointing the
                            // partner pointer to itself
                            prev->partnerRead = prev;
                        }
                        else {
                            // RPI_FIR or RPI_SEC
                            // work out pairing information using the hash
                            // we append a 1 or 2 to the end so that
                            // we don't accidentally pair 1's with 1's etc.
                            char * stripped_result;
                            if(rpi == RPI_FIR) {
                                stripped_result = \
                                    pairStripper(seqId,
                                                 core.l_qname-1,
                                                 '2');
                            }
                            else {
                                stripped_result = \
                                    pairStripper(seqId,
                                                 core.l_qname-1,
                                                 '1');
                            }

                            char * stripped = seqId;
                            if(stripped_result)
                                stripped = stripped_result;

                            //fprintf(stdout, "SEARCH %s\n", stripped);
                            // now stripped always holds a stripped value
                            // see if it is in the hash already
                            BM_mappedRead * stored_MR = \
                                cfuhash_get(pair_buffer,
                                            stripped);

                            if (0 != stored_MR) {
                                // exists in the hash -> Add the pair info
                                if(rpi == RPI_FIR) {
                                    prev->partnerRead = stored_MR;
                                }
                                else {
                                    stored_MR->partnerRead = prev;
                                }

                                // delete the entry from the hash
                                cfuhash_delete(pair_buffer,
                                               stripped);
                            }
                            else {
                                // we should put it in the hash
                                // make sure to change it into something
                                // we will find next time
                                if(rpi == RPI_FIR)
                                    stripped[strlen(stripped)-1] = '1';
                                else
                                    stripped[strlen(stripped)-1] = '2';

                                // check to make sure we're not overwriting
                                // anything important. cfuhash overwrites
                                // duplicate entries, so we need to grab
                                // it and put it to "SNGL_XXX" before we
                                // lose the pointer
                                BM_mappedRead * OWMMR = \
                                    cfuhash_put(pair_buffer,
                                                stripped, prev);
                                if(OWMMR) {
                                    if(OWMMR->rpi == RPI_FIR)
                                        OWMMR->rpi = RPI_SNGL_FIR;
                                    else
                                        OWMMR->rpi = RPI_SNGL_SEC;
                                    OWMMR->partnerRead = OWMMR;
                                    printPairCorruptionWarning(p_corrupt);
                                    p_corrupt = 1;
                                }


                            }

                            if(stripped_result != 0) { // free this!
                                free(stripped_result);
                                stripped_result = 0;
                            }
                        }
                    }
                    hts_itr_destroy(iter);
                    if (result < -1) {
                        fprintf(stderr, "ERROR: retrieval of reads from "\
                                        "contig:  \"%s\" failed due to "\
                                        "truncated file or corrupt BAM index "\
                                        "file\n", header->target_name[hh]);
                        break;
                    }
                }

                // any entries left in the hash are pairs whose mates did
                // not meet quality standards
                size_t key_size = 0;
                char * key;
                BM_mappedRead * LOMMR;
                size_t pr_size = 1;
                if(cfuhash_each_data(pair_buffer,
                                     (void**)&key,
                                     &key_size,
                                     (void**)&LOMMR,
                                     &pr_size)) {
                    do {
                        // get the mapped read
                        // update it's pairing so we know it's really single
                        if (LOMMR->rpi == RPI_FIR)
                            LOMMR->rpi = RPI_SNGL_FIR;
                        else if (LOMMR->rpi == RPI_SEC)
                            LOMMR->rpi = RPI_SNGL_SEC;

                        // indicate singleton reads by pointing the
                        // partner pointer to itself
                        LOMMR->partnerRead = LOMMR;

                    } while(cfuhash_next_data(pair_buffer,
                                              (void**)&key,
                                              &key_size,
                                              (void**)&LOMMR,
                                              &pr_size));
                }

                cfuhash_clear(pair_buffer);
                cfuhash_destroy(pair_buffer);
            }
            hts_idx_destroy(idx); // destroy the BAM index
        }
    }
    // always do this
    if (in) sam_close(in);
    bam_destroy1(b);
    if ( header ) bam_hdr_destroy(header);

    return root;
}
Ejemplo n.º 17
0
loci_stats *bam_access_get_position_base_counts(char *chr, int posn){
	char *region = NULL;
	hts_itr_t *iter = NULL;
	bam1_t* b = NULL;
	bam_plp_t buf;
	loci_stats *stats = malloc(sizeof(loci_stats *));
	check_mem(stats);
	stats->base_counts = malloc(sizeof(int) * 4);
	check_mem(stats->base_counts);
	stats->base_counts[0] = 0;
	stats->base_counts[1] = 0;
	stats->base_counts[2] = 0;
	stats->base_counts[3] = 0;
	fholder->stats = stats;

	region = malloc((sizeof(char *) * (strlen(chr)+1))+sizeof(":")+sizeof("-")+(sizeof(char)*((no_of_digits(posn)*2)+1)));
	sprintf(region,"%s:%d-%d",chr,posn,posn);
	fholder->beg = posn;
	fholder->end = posn;

  // initialize pileup
	buf = bam_plp_init(pileup_func, (void *)fholder);
	bam_plp_set_maxcnt(buf,maxitercnt);

  /*
  sam_fetch(fholder->in, fholder->idx, ref, fholder->beg, fholder->end, buf, fetch_algo_func);
  */
  //Replace fetch with iterator for htslib compatibility.
  b = bam_init1();
  iter = sam_itr_querys(fholder->idx, fholder->head, region);
  int result;
  int count = 0;
  while ((result = sam_itr_next(fholder->in, iter, b)) >= 0) {
    if(b->core.qual < min_map_qual || (b->core.flag & BAM_FUNMAP)
			|| !(b->core.flag & BAM_FPROPER_PAIR) || (b->core.flag & BAM_FMUNMAP)//Proper pair and mate unmapped
			|| (b->core.flag & BAM_FDUP)//1024 is PCR/optical duplicate
			|| (b->core.flag & BAM_FSECONDARY) || (b->core.flag & BAM_FQCFAIL)//Secondary alignment, quality fail
			|| (b->core.flag & BAM_FSUPPLEMENTARY) ) continue;
    count++;
    bam_plp_push(buf, b);
  }
  sam_itr_destroy(iter);
  bam_plp_push(buf, 0);
  int tid, pos, n_plp = -1;
  const bam_pileup1_t *pil;
  while ( (pil=bam_plp_next(buf, &tid, &pos, &n_plp)) > 0) {
    if((pos+1) != posn) continue;
    int i=0;
   	for(i=0;i<n_plp;i++){
      const bam_pileup1_t *p = pil + i;
      int qual = bam_get_qual(p->b)[p->qpos];
      uint8_t c = bam_seqi(bam_get_seq(p->b), p->qpos);
      if(!(p->is_del) &&  qual >= min_base_qual
            &&  p->b->core.qual >= min_map_qual){
           //&& (c == 1 /*A*/|| c == 2 /*C*/|| c == 4 /*G*/|| c == 8 /*T*/)){
        //Now we add a new read pos struct to the list since the read is valid.
        //char cbase = toupper(bam_nt16_rev_table[c]);
        switch(c){
          case 1:
            fholder->stats->base_counts[0]++;
            break;

          case 2:
            fholder->stats->base_counts[1]++;
            break;

          case 4:
            fholder->stats->base_counts[2]++;
            break;

          case 8:
            fholder->stats->base_counts[3]++;
            break;

          default:
            break;

          }; // End of args switch statement */
   	    }
   	  }
  } //End of iteration through pileup
	//bam_plp_push(buf, 0); // finalize pileup
  bam_plp_destroy(buf);
	free(region);
	return fholder->stats;

error:
	//if(region) free(region);
	if(fholder->stats){
		if(fholder->stats->base_counts) free(fholder->stats->base_counts);
		free(fholder->stats);
	}
	if(iter) sam_itr_destroy(iter);
	if(b) bam_destroy1(b);
	if(region) free(region);
	return NULL;
}