Esempio n. 1
0
static inline void pileup_seq(FILE *fp, const bam_pileup1_t *p, int pos, int ref_len, const char *ref)
{
    int j;
    if (p->is_head) {
        putc('^', fp);
        putc(p->b->core.qual > 93? 126 : p->b->core.qual + 33, fp);
    }
    if (!p->is_del) {
        int c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos)];
        if (ref) {
            int rb = pos < ref_len? ref[pos] : 'N';
            if (c == '=' || seq_nt16_table[c] == seq_nt16_table[rb]) c = bam_is_rev(p->b)? ',' : '.';
            else c = bam_is_rev(p->b)? tolower(c) : toupper(c);
        } else {
            if (c == '=') c = bam_is_rev(p->b)? ',' : '.';
            else c = bam_is_rev(p->b)? tolower(c) : toupper(c);
        }
        putc(c, fp);
    } else putc(p->is_refskip? (bam_is_rev(p->b)? '<' : '>') : '*', fp);
    if (p->indel > 0) {
        putc('+', fp); printw(p->indel, fp);
        for (j = 1; j <= p->indel; ++j) {
            int c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos + j)];
            putc(bam_is_rev(p->b)? tolower(c) : toupper(c), fp);
        }
    } else if (p->indel < 0) {
        printw(p->indel, fp);
        for (j = 1; j <= -p->indel; ++j) {
            int c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N';
            putc(bam_is_rev(p->b)? tolower(c) : toupper(c), fp);
        }
    }
    if (p->is_tail) putc('$', fp);
}
Esempio n. 2
0
/**
 * Gets the read sequence from a bam record
 */
void bam_get_seq_string(bam1_t *s, kstring_t *seq)
{
    seq->l=0;
    uint8_t* sq = bam_get_seq(s);
    for (uint16_t i = 0; i < bam_get_l_qseq(s); ++i)
    {
        kputc("=ACMGRSVTWYHKDBN"[bam_seqi(sq, i)], seq);
    }
};
Esempio n. 3
0
void dump_read(bam1_t* b) {
    printf("->core.tid:(%d)\n", b->core.tid);
    printf("->core.pos:(%d)\n", b->core.pos);
    printf("->core.bin:(%d)\n", b->core.bin);
    printf("->core.qual:(%d)\n", b->core.qual);
    printf("->core.l_qname:(%d)\n", b->core.l_qname);
    printf("->core.flag:(%d)\n", b->core.flag);
    printf("->core.n_cigar:(%d)\n", b->core.n_cigar);
    printf("->core.l_qseq:(%d)\n", b->core.l_qseq);
    printf("->core.mtid:(%d)\n", b->core.mtid);
    printf("->core.mpos:(%d)\n", b->core.mpos);
    printf("->core.isize:(%d)\n", b->core.isize);
    if (b->data) {
        printf("->data:");
        int i;
        for (i = 0; i < b->l_data; ++i) {
            printf("%x ", b->data[i]);
        }
        printf("\n");
    }
    if (b->core.l_qname) {
        printf("qname: %s\n",bam_get_qname(b));
    }
    if (b->core.l_qseq) {
        printf("qseq:");
        int i;
        for (i = 0; i < b->core.l_qseq; ++i) {
            printf("%c",seq_nt16_str[seq_nt16_table[bam_seqi(bam_get_seq(b),i)]]);
        }
        printf("\n");
        printf("qual:");
        for (i = 0; i < b->core.l_qseq; ++i) {
            printf("%c",bam_get_qual(b)[i]);
        }
        printf("\n");

    }

    if (bam_get_l_aux(b)) {
        int i = 0;
        uint8_t* aux = bam_get_aux(b);

        while (i < bam_get_l_aux(b)) {
            printf("%.2s:%c:",aux+i,*(aux+i+2));
            i += 2;
            switch (*(aux+i)) {
                case 'Z':
                    while (*(aux+1+i) != '\0') { putc(*(aux+1+i), stdout); ++i; }
                    break;
            }
            putc('\n',stdout);
            ++i;++i;
        }
    }
    printf("\n");
}
Esempio n. 4
0
Alignment bam_to_alignment(const bam1_t *b, map<string, string>& rg_sample) {

    Alignment alignment;

    // get the sequence and qual
    int32_t lqseq = b->core.l_qseq;
    string sequence; sequence.resize(lqseq);

    uint8_t* qualptr = bam_get_qual(b);
    string quality;//(lqseq, 0);
    quality.assign((char*)qualptr, lqseq);

    // process the sequence into chars
    uint8_t* seqptr = bam_get_seq(b);
    for (int i = 0; i < lqseq; ++i) {
        sequence[i] = "=ACMGRSVTWYHKDBN"[bam_seqi(seqptr, i)];
    }

    // get the read group and sample name
    uint8_t *rgptr = bam_aux_get(b, "RG");
    char* rg = (char*) (rgptr+1);
    //if (!rg_sample
    string sname;
    if (!rg_sample.empty()) {
        sname = rg_sample[string(rg)];
    }

    // Now name the read after the scaffold
    string read_name = bam_get_qname(b);

    // Decide if we are a first read (/1) or second (last) read (/2)
    if(b->core.flag & BAM_FREAD1) {
        read_name += "/1";
    }
    if(b->core.flag & BAM_FREAD2) {
        read_name += "/2";
    }
    
    // If we are marked as both first and last we get /1/2, and if we are marked
    // as neither the scaffold name comes through unchanged as the read name.
    // TODO: produce correct names for intermediate reads on >2 read scaffolds.

    // add features to the alignment
    alignment.set_name(read_name);
    alignment.set_sequence(sequence);
    alignment.set_quality(quality);
    
    // TODO: htslib doesn't wrap this flag for some reason.
    alignment.set_is_secondary(b->core.flag & BAM_FSECONDARY);
    if (sname.size()) {
        alignment.set_sample_name(sname);
        alignment.set_read_group(rg);
    }

    return alignment;
}
Esempio n. 5
0
const char* get_sequence(const bam1_t *b) {
    if(b == NULL) die("get_sequence: parameter error\n");
    const uint8_t *seq = bam_get_seq(b);
    size_t len = b->core.l_qseq;
    char* sequence;
    sequence = malloc(len*sizeof(char));
    uint8_t offset = (b->core.flag & BAM_FREVERSE) ? 16 : 0;
    size_t i;
    for (i=0; i<len; i++) {
        switch(bam_seqi(seq, i) + offset)
        {
        case 1:
            sequence[i] = 'A';
            break;
        case 2:
            sequence[i] = 'C';
            break;
        case 4:
            sequence[i] = 'G';
            break;
        case 8:
            sequence[i] = 'T';
            break;
        case 15:
            sequence[i] = 'N';
            break;
        //Complements (original index + 16)
        case 17:
            sequence[i] = 'T';
            break;
        case 18:
            sequence[i] = 'G';
            break;
        case 20:
            sequence[i] = 'C';
            break;
        case 24:
            sequence[i] = 'A';
            break;
        case 31:
            sequence[i] = 'N';
            break;
        default:
            sequence[i] = 'N';
            break;
        }
    }
    if (offset) sequence = strrev(sequence);
    return sequence;
}
Esempio n. 6
0
char *get_sequence(const bam1_t *entry) {
  int i;
  int length = entry->core.l_qseq;
  char *seq = malloc(length + 1);
  unsigned char *bam_seq = bam_get_seq(entry);
  /* The for could be factored but this prevents from having a if in the
     loop */
  if (entry->core.flag & BAM_FREVERSE) {
    for (i = 0; i < length; i++) {
      seq[length - i - 1] = comp_seq_nt16_str[bam_seqi(bam_seq, i)];
    }
  } else {
    for (i = 0; i < length; ++i) {
      seq[i] = seq_nt16_str[bam_seqi(bam_seq,i)];
    }
  }
  seq[i] = 0;
  return seq;
}
Esempio n. 7
0
static int unpad_seq(bam1_t *b, kstring_t *s)
{
    // Returns 0 on success, -1 on an error
    int k, j, i;
    int length;
    int cigar_n_warning = 0; /* Make this a global and limit to one CIGAR N warning? */
    uint32_t *cigar = bam_get_cigar(b);
    uint8_t *seq = bam_get_seq(b);

    // b->core.l_qseq gives length of the SEQ entry (including soft clips, S)
    // We need the padded length after alignment from the CIGAR (excluding
    // soft clips S, but including pads from CIGAR D operations)
    length = bam_cigar2rlen(b->core.n_cigar, cigar);
    ks_resize(s, length);
    for (k = 0, s->l = 0, j = 0; k < b->core.n_cigar; ++k) {
        int op, ol;
        op = bam_cigar_op(cigar[k]);
        ol = bam_cigar_oplen(cigar[k]);
        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
            for (i = 0; i < ol; ++i, ++j) s->s[s->l++] = bam_seqi(seq, j);
        } else if (op == BAM_CSOFT_CLIP) {
            j += ol;
        } else if (op == BAM_CHARD_CLIP) {
            /* do nothing */
        } else if (op == BAM_CDEL) {
            for (i = 0; i < ol; ++i) s->s[s->l++] = 0;
        } else if (op == BAM_CREF_SKIP) {
            /* Treat CIGAR N as D (not ideal, but better than ignoring it) */
            for (i = 0; i < ol; ++i) s->s[s->l++] = 0;
            if (0 == cigar_n_warning) {
                cigar_n_warning = -1;
                fprintf(stderr, "[depad] WARNING: CIGAR op N treated as op D in read %s\n", bam_get_qname(b));
            }
        } else {
            fprintf(stderr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam_get_qname(b));
            return -1;
        }
    }
    return length != s->l;
}
Esempio n. 8
0
 Mapping(const bam_hdr_t * hdr_p, bam1_t * rec_p)
     : _rec_p(rec_p)
 {
     _query_name = bam_get_qname(rec_p);
     _flag = rec_p->core.flag;
     for (int i = 0; i < rec_p->core.l_qseq; ++i)
     {
         _seq += seq_nt16_str[bam_seqi(bam_get_seq(rec_p), i)];
     }
     if (is_mapped())
     {
         _chr_name = hdr_p->target_name[rec_p->core.tid];
         _rf_start = rec_p->core.pos;
         _cigar = Cigar(bam_get_cigar(rec_p), rec_p->core.n_cigar);
         _rf_len = _cigar.rf_len();
     }
     if (is_paired() and mp_is_mapped())
     {
         _mp_chr_name = hdr_p->target_name[rec_p->core.mtid];
         _mp_rf_start = rec_p->core.mpos;
     }
 }
Esempio n. 9
0
BM_mappedRead * extractReads(char * bamFile,
                             char ** contigs,
                             int numContigs,
                             uint16_t * groups,
                             char * prettyName,
                             int headersOnly,
                             int minMapQual,
                             int maxMisMatches,
                             int ignoreSuppAlignments,
                             int ignoreSecondaryAlignments) {
    //-----
    // code uses the pattern outlined in samtools view (sam_view.c)
    // thanks lh3!
    //
    int i = 0;
    int result = -1;
    int hh = 0;

    int supp_check = 0x0; // include supp mappings
    if (ignoreSuppAlignments) {
        supp_check |= BAM_FSUPPLEMENTARY;
    }
    if (ignoreSecondaryAlignments) {
        supp_check |= BAM_FSECONDARY;
    }

    // we need to let the users know if their pairings
    // will be corrupted
    int p_corrupt = 0;

    // helper variables
    samFile *in = 0;
    bam_hdr_t *header = NULL;
    bam1_t *b = bam_init1();

    BM_mappedRead * root = 0;
    BM_mappedRead * prev = 0;

    // open file handlers
    if ((in = sam_open(bamFile, "r")) == 0) {
        fprintf(stderr,
                "ERROR: Failed to open \"%s\" for reading.\n",
                bamFile);
    }
    else {
        // retrieve the header
        if ((header = sam_hdr_read(in)) == 0) {
            fprintf(stderr,
                    "ERROR: Failed to read the header from \"%s\".\n",
                    bamFile);
        }
        else {
            // check the index is intact
            hts_idx_t *idx = sam_index_load(in, bamFile); // load index
            if (idx == 0) { // index is unavailable
                fprintf(stderr,
                        "ERROR: Random retrieval only works "\
                        "for indexed files.\n");
            }
            else {
                cfuhash_table_t *pair_buffer = \
                    cfuhash_new_with_initial_size(1000000);
                cfuhash_set_flag(pair_buffer, CFUHASH_FROZEN_UNTIL_GROWS);

                for (hh = 0; hh < numContigs; ++hh) {
                    // parse a region in the format like `chr2:100-200'
                    hts_itr_t *iter = sam_itr_querys(idx, header, contigs[hh]);
                    if (iter == NULL) { // reference name is not found
                        fprintf(stderr,
                                "WARNING: Could not find contig: "\
                                "[%s] in BAM: [%s].\n",
                                contigs[hh],
                                bamFile);
                    }

                    // fetch alignments
                    int line = 0;
                    while ((result = sam_itr_next(in, iter, b)) >= 0) {
                        bam1_core_t core = b->core;
                        line += 1;
                        // only high quality?, primary? mappings
                        if ( core.qual < minMapQual)
                            continue;
                        if ((core.flag & supp_check) != 0)
                            continue;
                        if(bam_aux2i(bam_aux_get(b, "NM")) > maxMisMatches) {
                            continue;
                        }

                        char * seqId = bam_get_qname(b);
                        char * seq = 0;
                        char * qual = 0;
                        int qual_len = 0;
                        int seq_len = 0;

                        // get sequence and quality
                        if(0 == headersOnly) {
                            // no point allocating unused space
                            seq = calloc(core.l_qseq+1, sizeof(char));
                            qual = calloc(core.l_qseq+1, sizeof(char));
                            uint8_t *s = bam_get_seq(b);
                            if (core.flag&BAM_FREVERSE) {
                                // reverse the read
                                int r = 0;
                                for (i = core.l_qseq-1; i >=0 ; --i) {
                                    seq[r]="=TGKCYSBAWRDMHVN"[bam_seqi(s,
                                                                       i)];
                                    ++r;
                                }
                            }
                            else {
                                for (i = 0; i < core.l_qseq; ++i) {
                                    seq[i]="=ACMGRSVTWYHKDBN"[bam_seqi(s,
                                                                       i)];
                                }
                            }
                            seq_len = core.l_qseq;

                            s = bam_get_qual(b);
                            if (s[0] != 0xff) {
                                qual_len = core.l_qseq;
                                for (i = 0; i < core.l_qseq; ++i) {
                                    qual[i] = (char)(s[i] + 33);
                                }
                            }
                            else if (qual != 0) {
                                free(qual);
                                qual = 0;
                            }
                        }

                        // work out pairing information
                        uint8_t rpi = RPI_ERROR;
                        if (core.flag&BAM_FPAIRED) {
                            if(core.flag&BAM_FMUNMAP) {
                                if (core.flag&BAM_FREAD1) {
                                    rpi = RPI_SNGL_FIR;
                                }
                                else if (core.flag&BAM_FREAD2) {
                                    rpi = RPI_SNGL_SEC;
                                }
                            }
                            else {
                                if (core.flag&BAM_FREAD1) {
                                    rpi = RPI_FIR;
                                }
                                else if (core.flag&BAM_FREAD2) {
                                    rpi = RPI_SEC;
                                }
                            }
                        }
                        else {
                            rpi = RPI_SNGL;
                        }

                        // make the funky Id
                        #define MAX_SEQ_ID_LEN 80
                        char * seq_id = calloc(MAX_SEQ_ID_LEN,
                                               sizeof(char));
                        // allocate the string to the buffer but check to
                        // ensure we're not cutting anything off
                        int id_len = snprintf(seq_id,
                                              MAX_SEQ_ID_LEN,
                                              "b_%s;c_%s;r_%s",
                                              prettyName,
                                              contigs[hh],
                                              seqId);
                        if(id_len >= MAX_SEQ_ID_LEN) {
                            seq_id = calloc(id_len+1, sizeof(char));
                            snprintf(seq_id,
                                     id_len+1, // don't forget the NULL!
                                     "b_%s;c_%s;r_%s",
                                     prettyName,
                                     contigs[hh],
                                     seqId);
                        }

                        // make the mapped read struct
                        prev = makeMappedRead(seq_id,
                                              seq,
                                              qual,
                                              id_len,
                                              seq_len,
                                              qual_len,
                                              rpi,
                                              groups[hh],
                                              prev);

                        if (0 == root) { root = prev; }

                        if(rpi == RPI_SNGL || \
                           rpi == RPI_SNGL_FIR || \
                           rpi == RPI_SNGL_SEC) {
                            // we can just add away
                            // indicate singleton reads by pointing the
                            // partner pointer to itself
                            prev->partnerRead = prev;
                        }
                        else {
                            // RPI_FIR or RPI_SEC
                            // work out pairing information using the hash
                            // we append a 1 or 2 to the end so that
                            // we don't accidentally pair 1's with 1's etc.
                            char * stripped_result;
                            if(rpi == RPI_FIR) {
                                stripped_result = \
                                    pairStripper(seqId,
                                                 core.l_qname-1,
                                                 '2');
                            }
                            else {
                                stripped_result = \
                                    pairStripper(seqId,
                                                 core.l_qname-1,
                                                 '1');
                            }

                            char * stripped = seqId;
                            if(stripped_result)
                                stripped = stripped_result;

                            //fprintf(stdout, "SEARCH %s\n", stripped);
                            // now stripped always holds a stripped value
                            // see if it is in the hash already
                            BM_mappedRead * stored_MR = \
                                cfuhash_get(pair_buffer,
                                            stripped);

                            if (0 != stored_MR) {
                                // exists in the hash -> Add the pair info
                                if(rpi == RPI_FIR) {
                                    prev->partnerRead = stored_MR;
                                }
                                else {
                                    stored_MR->partnerRead = prev;
                                }

                                // delete the entry from the hash
                                cfuhash_delete(pair_buffer,
                                               stripped);
                            }
                            else {
                                // we should put it in the hash
                                // make sure to change it into something
                                // we will find next time
                                if(rpi == RPI_FIR)
                                    stripped[strlen(stripped)-1] = '1';
                                else
                                    stripped[strlen(stripped)-1] = '2';

                                // check to make sure we're not overwriting
                                // anything important. cfuhash overwrites
                                // duplicate entries, so we need to grab
                                // it and put it to "SNGL_XXX" before we
                                // lose the pointer
                                BM_mappedRead * OWMMR = \
                                    cfuhash_put(pair_buffer,
                                                stripped, prev);
                                if(OWMMR) {
                                    if(OWMMR->rpi == RPI_FIR)
                                        OWMMR->rpi = RPI_SNGL_FIR;
                                    else
                                        OWMMR->rpi = RPI_SNGL_SEC;
                                    OWMMR->partnerRead = OWMMR;
                                    printPairCorruptionWarning(p_corrupt);
                                    p_corrupt = 1;
                                }


                            }

                            if(stripped_result != 0) { // free this!
                                free(stripped_result);
                                stripped_result = 0;
                            }
                        }
                    }
                    hts_itr_destroy(iter);
                    if (result < -1) {
                        fprintf(stderr, "ERROR: retrieval of reads from "\
                                        "contig:  \"%s\" failed due to "\
                                        "truncated file or corrupt BAM index "\
                                        "file\n", header->target_name[hh]);
                        break;
                    }
                }

                // any entries left in the hash are pairs whose mates did
                // not meet quality standards
                size_t key_size = 0;
                char * key;
                BM_mappedRead * LOMMR;
                size_t pr_size = 1;
                if(cfuhash_each_data(pair_buffer,
                                     (void**)&key,
                                     &key_size,
                                     (void**)&LOMMR,
                                     &pr_size)) {
                    do {
                        // get the mapped read
                        // update it's pairing so we know it's really single
                        if (LOMMR->rpi == RPI_FIR)
                            LOMMR->rpi = RPI_SNGL_FIR;
                        else if (LOMMR->rpi == RPI_SEC)
                            LOMMR->rpi = RPI_SNGL_SEC;

                        // indicate singleton reads by pointing the
                        // partner pointer to itself
                        LOMMR->partnerRead = LOMMR;

                    } while(cfuhash_next_data(pair_buffer,
                                              (void**)&key,
                                              &key_size,
                                              (void**)&LOMMR,
                                              &pr_size));
                }

                cfuhash_clear(pair_buffer);
                cfuhash_destroy(pair_buffer);
            }
            hts_idx_destroy(idx); // destroy the BAM index
        }
    }
    // always do this
    if (in) sam_close(in);
    bam_destroy1(b);
    if ( header ) bam_hdr_destroy(header);

    return root;
}
Esempio n. 10
0
void bam_fillmd1_core(bam1_t *b, char *ref, int ref_len, int flag, int max_nm)
{
    uint8_t *seq = bam_get_seq(b);
    uint32_t *cigar = bam_get_cigar(b);
    bam1_core_t *c = &b->core;
    int i, x, y, u = 0;
    kstring_t *str;
    int32_t old_nm_i = -1, nm = 0;

    str = (kstring_t*)calloc(1, sizeof(kstring_t));
    for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
        int j, l = cigar[i]>>4, op = cigar[i]&0xf;
        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
            for (j = 0; j < l; ++j) {
                int c1, c2, z = y + j;
                if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
                c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]];
                if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
                    if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
                    ++u;
                } else {
                    kputw(u, str);
                    kputc(ref[x+j], str);
                    u = 0;
                    ++nm;
                }
            }
            if (j < l) break;
            x += l;
            y += l;
        } else if (op == BAM_CDEL) {
            kputw(u, str);
            kputc('^', str);
            for (j = 0; j < l; ++j) {
                if (x+j >= ref_len || ref[x+j] == '\0') break;
                kputc(ref[x+j], str);
            }
            u = 0;
            x += j;
            nm += j;
            if (j < l) break;
        } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
            y += l;
            if (op == BAM_CINS) nm += l;
        } else if (op == BAM_CREF_SKIP) {
            x += l;
        }
    }
    kputw(u, str);
    // apply max_nm
    if (max_nm > 0 && nm >= max_nm) {
        for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
            int j, l = cigar[i]>>4, op = cigar[i]&0xf;
            if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
                for (j = 0; j < l; ++j) {
                    int c1, c2, z = y + j;
                    if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
                    c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]];
                    if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
                        seq[z/2] |= (z&1)? 0x0f : 0xf0;
                        bam_get_qual(b)[z] = 0;
                    }
                }
                if (j < l) break;
                x += l;
                y += l;
            } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
            else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
        }
    }
    // update NM
    if ((flag & UPDATE_NM) && !(c->flag & BAM_FUNMAP)) {
        uint8_t *old_nm = bam_aux_get(b, "NM");
        if (old_nm) old_nm_i = bam_aux2i(old_nm);
        if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
        else if (nm != old_nm_i) {
            fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam_get_qname(b), old_nm_i, nm);
            bam_aux_del(b, old_nm);
            bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
        }
    }
    // update MD
    if ((flag & UPDATE_MD) && !(c->flag & BAM_FUNMAP)) {
        uint8_t *old_md = bam_aux_get(b, "MD");
        if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
        else {
            int is_diff = 0;
            if (strlen((char*)old_md+1) == str->l) {
                for (i = 0; i < str->l; ++i)
                    if (toupper(old_md[i+1]) != toupper(str->s[i]))
                        break;
                if (i < str->l) is_diff = 1;
            } else is_diff = 1;
            if (is_diff) {
                fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str->s);
                bam_aux_del(b, old_md);
                bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
            }
        }
    }

    // drop all tags but RG
    if (flag&DROP_TAG) {
        uint8_t *q = bam_aux_get(b, "RG");
        bam_aux_drop_other(b, q);
    }
    // reduce the resolution of base quality
    if (flag&BIN_QUAL) {
        uint8_t *qual = bam_get_qual(b);
        for (i = 0; i < b->core.l_qseq; ++i)
            if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7;
    }

    free(str->s);
    free(str);
}
Esempio n. 11
0
// Transform a bam1_t record into a string with the FASTQ representation of it
// @returns false for error, true for success
static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state)
{
    int i;
    int32_t qlen = b->core.l_qseq;
    assert(qlen >= 0);
    uint8_t *seq;
    uint8_t *qual = bam_get_qual(b);
    const uint8_t *oq = NULL;
    if (state->use_oq) {
        oq = bam_aux_get(b, "OQ");
        if (oq) oq++; // skip tag type
    }
    bool has_qual = (qual[0] != 0xff || (state->use_oq && oq)); // test if there is quality

    linebuf->l = 0;
    // Write read name
    readpart readpart = which_readpart(b);
    kputc(state->filetype == FASTA? '>' : '@', linebuf);
    kputs(bam_get_qname(b), linebuf);
    // Add the /1 /2 if requested
    if (state->has12) {
        if (readpart == READ_1) kputs("/1", linebuf);
        else if (readpart == READ_2) kputs("/2", linebuf);
    }
    if (state->copy_tags) {
        for (i = 0; copied_tags[i]; ++i) {
            uint8_t *s;
            if ((s = bam_aux_get(b, copied_tags[i])) != 0) {
                kputc('\t', linebuf);
                kputsn(copied_tags[i], 2, linebuf);
                kputsn(":Z:", 3, linebuf);
                kputs(bam_aux2Z(s), linebuf);
            }
        }
    }
    kputc('\n', linebuf);

    seq = bam_get_seq(b);

    if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
        for (i = qlen-1; i > -1; --i) {
            char c = seq_nt16_str[seq_comp_table[bam_seqi(seq,i)]];
            kputc(c, linebuf);
        }
    } else {
        for (i = 0; i < qlen; ++i) {
            char c = seq_nt16_str[bam_seqi(seq,i)];
            kputc(c, linebuf);
        }
    }
    kputc('\n', linebuf);

    if (state->filetype == FASTQ) {
        // Write quality
        kputs("+\n", linebuf);
        if (has_qual) {
            if (state->use_oq && oq) {
                if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
                    for (i = qlen-1; i > -1; --i) {
                        kputc(oq[i], linebuf);
                    }
                } else {
                    kputs((char*)oq, linebuf);
                }
            } else {
                if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
                    for (i = qlen-1; i > -1; --i) {
                        kputc(33 + qual[i], linebuf);
                    }
                } else {
                    for (i = 0; i < qlen; ++i) {
                        kputc(33 + qual[i], linebuf);
                    }
                }
            }
        } else {
            for (i = 0; i < qlen; ++i) {
                kputc(33 + state->def_qual, linebuf);
            }
        }
        kputc('\n', linebuf);
    }
    return true;
}
Esempio n. 12
0
int main(int argc, char **argv) {
    cram_fd *out;
    bam_file_t *in;
    bam_seq_t *s = NULL;
    char *out_fn;
    int level = '\0'; // nul terminate string => auto level
    char out_mode[4];
    int c, verbose = 0;
    int s_opt = 0, S_opt = 0, embed_ref = 0;
    char *arg_list, *ref_fn = NULL;

    while ((c = getopt(argc, argv, "u0123456789hvs:S:V:r:X")) != -1) {
	switch (c) {
	case '0': case '1': case '2': case '3': case '4':
	case '5': case '6': case '7': case '8': case '9':
	    level = c;
	    break;
	    
	case 'u':
	    level = '0';
	    break;

	case 'h':
	    usage(stdout);
	    return 0;

	case 'v':
	    verbose++;
	    break;

	case 's':
	    s_opt = atoi(optarg);
	    break;

	case 'S':
	    S_opt = atoi(optarg);
	    break;

	case 'V':
	    cram_set_option(NULL, CRAM_OPT_VERSION, optarg);
	    break;

	case 'r':
	    ref_fn = optarg;
	    break;

	case 'X':
	    embed_ref = 1;
	    break;

	case '?':
	    fprintf(stderr, "Unrecognised option: -%c\n", optopt);
	    usage(stderr);
	    return 1;
	}
    }

    if (argc - optind != 1 && argc - optind != 2) {
	usage(stderr);
	return 1;
    }

    /* opening */
    if (NULL == (in = bam_open(argv[optind], "rb"))) {
	perror(argv[optind]);
	return 1;
    }

    out_fn = argc - optind == 2 ? argv[optind+1] : "-";
    sprintf(out_mode, "wb%c", level);
    if (NULL == (out = cram_open(out_fn, out_mode))) {
	fprintf(stderr, "Error opening CRAM file '%s'.\n", out_fn);
	return 1;
    }

    /* SAM Header */
    if (!(arg_list = stringify_argv(argc, argv)))
	return 1;
    sam_hdr_add_PG(in->header, "sam_to_cram",
		   "VN", PACKAGE_VERSION,
		   "CL", arg_list, NULL);
    free(arg_list);

    /* Find and load reference */
    if (!ref_fn) {
	SAM_hdr_type *ty = sam_hdr_find(in->header, "SQ", NULL, NULL);
	if (ty) {
	    SAM_hdr_tag *tag;

	    if ((tag = sam_hdr_find_key(in->header, ty, "UR", NULL))) {
		ref_fn  = tag->str + 3;
		if (strncmp(ref_fn, "file:", 5) == 0)
		    ref_fn += 5;
	    }
	}
    }

    out->header = in->header;
    if (ref_fn)
	cram_load_reference(out, ref_fn);

    if (!out->refs) {
	fprintf(stderr, "Unable to open reference.\n"
		"Please specify a valid reference with -r ref.fa option.\n");
	return 1;
    }
    refs2id(out->refs, out->header);

    if (-1 == cram_write_SAM_hdr(out, in->header))
	return 1;

    cram_set_option(out, CRAM_OPT_VERBOSITY, verbose);
    if (s_opt)
	cram_set_option(out, CRAM_OPT_SEQS_PER_SLICE, s_opt);

    if (S_opt)
	cram_set_option(out, CRAM_OPT_SLICES_PER_CONTAINER, S_opt);

    if (embed_ref)
	cram_set_option(out, CRAM_OPT_EMBED_REF, embed_ref);

    /* Sequence iterators */
    while (bam_get_seq(in, &s) > 0) {
	if (-1 == cram_put_bam_seq(out, s)) {
	    fprintf(stderr, "Failed in cram_put_bam_seq()\n");
	    return 1;
	}
    }

    bam_close(in);
    out->header = NULL; // freed by bam_close()
    if (-1 == cram_close(out)) {
	fprintf(stderr, "Failed in cram_close()\n");
	return 1;
    }

    if (s)
	free(s);

    return 0;
}
Esempio n. 13
0
static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual)
{
	bwa_seq_t *seqs, *p;
	int n_seqs, l, i;
	long n_trimmed = 0, n_tot = 0;
	bam1_t *b;
	int res;

	b = bam_init1();
	n_seqs = 0;
	seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
#ifdef USE_HTSLIB
	while ((res = sam_read1(bs->fp, bs->h, b)) >= 0) {
#else
	while ((res = bam_read1(bs->fp, b)) >= 0) {
#endif
		uint8_t *s, *q;
		int go = 0;
		if ((bs->which & 1) && (b->core.flag & BAM_FREAD1)) go = 1;
		if ((bs->which & 2) && (b->core.flag & BAM_FREAD2)) go = 1;
		if ((bs->which & 4) && !(b->core.flag& BAM_FREAD1) && !(b->core.flag& BAM_FREAD2))go = 1;
		if (go == 0) continue;
		l = b->core.l_qseq;
		p = &seqs[n_seqs++];
		p->tid = -1; // no assigned to a thread
		p->qual = 0;
		p->full_len = p->clip_len = p->len = l;
		n_tot += p->full_len;
#ifdef USE_HTSLIB
		s = bam_get_seq(b); q = bam_get_qual(b);
#else
		s = bam1_seq(b); q = bam1_qual(b);
#endif
		p->seq = (ubyte_t*)calloc(p->len + 1, 1);
		p->qual = (ubyte_t*)calloc(p->len + 1, 1);
		for (i = 0; i != p->full_len; ++i) {
#ifdef USE_HTSLIB
			p->seq[i] = bam_nt16_nt4_table[(int)bam_seqi(s, i)];
#else
			p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)];
#endif
			p->qual[i] = q[i] + 33 < 126? q[i] + 33 : 126;
		}
#ifdef USE_HTSLIB
		if (bam_is_rev(b)) { // then reverse 
#else
		if (bam1_strand(b)) { // then reverse 
#endif
			seq_reverse(p->len, p->seq, 1);
			seq_reverse(p->len, p->qual, 0);
		}
		if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
		p->rseq = (ubyte_t*)calloc(p->full_len, 1);
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);
#ifdef USE_HTSLIB
		p->name = strdup((const char*)bam_get_qname(b));
#else
		p->name = strdup((const char*)bam1_qname(b));
#endif
		if (n_seqs == n_needed) break;
	}
	if (res < 0 && res != -1) err_fatal_simple("Error reading bam file");
	*n = n_seqs;
	if (n_seqs && trim_qual >= 1)
		fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
	if (n_seqs == 0) {
		free(seqs);
		bam_destroy1(b);
		return 0;
	}
	bam_destroy1(b);
	return seqs;
}

#define BARCODE_LOW_QUAL 13

bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int trim_qual)
{
	bwa_seq_t *seqs, *p;
	kseq_t *seq = bs->ks;
	int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24;
	long n_trimmed = 0, n_tot = 0;

	if (l_bc > BWA_MAX_BCLEN) {
		fprintf(stderr, "[%s] the maximum barcode length is %d.\n", __func__, BWA_MAX_BCLEN);
		return 0;
	}
	if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input
	n_seqs = 0;
	seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
	while ((l = kseq_read(seq)) >= 0) {
		if ((mode & BWA_MODE_CFY) && (seq->comment.l != 0)) {
			// skip reads that are marked to be filtered by Casava
			char *s = index(seq->comment.s, ':');
			if (s && *(++s) == 'Y') {
				continue;
			}
		}
		if (is_64 && seq->qual.l)
			for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31;
		if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length
		p = &seqs[n_seqs++];
		if (l_bc) { // then trim barcode
			for (i = 0; i < l_bc; ++i)
				p->bc[i] = (seq->qual.l && seq->qual.s[i]-33 < BARCODE_LOW_QUAL)? tolower(seq->seq.s[i]) : toupper(seq->seq.s[i]);
			p->bc[i] = 0;
			for (; i < seq->seq.l; ++i)
				seq->seq.s[i - l_bc] = seq->seq.s[i];
			seq->seq.l -= l_bc; seq->seq.s[seq->seq.l] = 0;
			if (seq->qual.l) {
				for (i = l_bc; i < seq->qual.l; ++i)
					seq->qual.s[i - l_bc] = seq->qual.s[i];
				seq->qual.l -= l_bc; seq->qual.s[seq->qual.l] = 0;
			}
			l = seq->seq.l;
		} else p->bc[0] = 0;
		p->tid = -1; // no assigned to a thread
		p->qual = 0;
		p->full_len = p->clip_len = p->len = l;
		n_tot += p->full_len;
		p->seq = (ubyte_t*)calloc(p->full_len, 1);
		for (i = 0; i != p->full_len; ++i)
			p->seq[i] = nst_nt4_table[(int)seq->seq.s[i]];
		if (seq->qual.l) { // copy quality
			p->qual = (ubyte_t*)strdup((char*)seq->qual.s);
			if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
		}
		p->rseq = (ubyte_t*)calloc(p->full_len, 1);
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);
		p->name = strdup((const char*)seq->name.s);
		{ // trim /[12]$
			int t = strlen(p->name);
			if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0';
		}
		if (n_seqs == n_needed) break;
	}
	*n = n_seqs;
	if (n_seqs && trim_qual >= 1)
		fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
	if (n_seqs == 0) {
		free(seqs);
		return 0;
	}
	return seqs;
}

void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs)
{
	int i, j;
	for (i = 0; i != n_seqs; ++i) {
		bwa_seq_t *p = seqs + i;
		for (j = 0; j < p->n_multi; ++j)
			if (p->multi[j].cigar) free(p->multi[j].cigar);
		free(p->name);
		free(p->seq); free(p->rseq); free(p->qual); free(p->aln); free(p->md); free(p->multi);
		free(p->cigar);
	}
	free(seqs);
}
Esempio n. 14
0
int main_bam2fq(int argc, char *argv[])
{
    BGZF *fp, *fpse = 0;
    bam1_t *b;
    uint8_t *buf;
    int max_buf, c, has12 = 0;
    kstring_t str;
    int64_t n_singletons = 0, n_reads = 0;
    char last[512], *fnse = 0;

    while ((c = getopt(argc, argv, "as:")) > 0)
        if (c == 'a') has12 = 1;
        else if (c == 's') fnse = optarg;
    if (argc == optind) {
        fprintf(stderr, "\nUsage:   bam2fq [-a] [-s outSE] <in.bam>\n\n");
        fprintf(stderr, "Options: -a        append /1 and /2 to the read name\n");
        fprintf(stderr, "         -s FILE   write singleton reads to FILE [assume single-end]\n");
        fprintf(stderr, "\n");
        return 1;
    }
    fp = strcmp(argv[optind], "-")? bgzf_open(argv[optind], "r") : bgzf_dopen(fileno(stdin), "r");
    assert(fp);
    bam_hdr_destroy(bam_hdr_read(fp));
    buf = 0;
    max_buf = 0;
    str.l = str.m = 0;
    str.s = 0;
    last[0] = 0;
    if (fnse) fpse = bgzf_open(fnse, "w1");

    b = bam_init1();
    while (bam_read1(fp, b) >= 0) {
        int i, qlen = b->core.l_qseq, is_print = 0;
        uint8_t *qual, *seq;
        if (b->flag&BAM_FSECONDARY) continue; // skip secondary alignments
        ++n_reads;
        if (fpse) {
            if (str.l && strcmp(last, bam_get_qname(b))) {
                bgzf_write(fpse, str.s, str.l);
                str.l = 0;
                ++n_singletons;
            }
            if (str.l) is_print = 1;
            strcpy(last, bam_get_qname(b));
        } else is_print = 1;
        qual = bam_get_qual(b);
        kputc(qual[0] == 0xff? '>' : '@', &str);
        kputsn(bam_get_qname(b), b->core.l_qname - 1, &str);
        if (has12) {
            kputc('/', &str);
            kputw(b->core.flag>>6&3, &str);
        }
        kputc('\n', &str);
        if (max_buf < qlen + 1) {
            max_buf = qlen + 1;
            kroundup32(max_buf);
            buf = (uint8_t*)realloc(buf, max_buf);
        }
        buf[qlen] = 0;
        seq = bam_get_seq(b);
        for (i = 0; i < qlen; ++i) buf[i] = bam_seqi(seq, i); // copy the sequence
        if (bam_is_rev(b)) { // reverse complement
            for (i = 0; i < qlen>>1; ++i) {
                int8_t t = seq_comp_table[buf[qlen - 1 - i]];
                buf[qlen - 1 - i] = seq_comp_table[buf[i]];
                buf[i] = t;
            }
            if (qlen&1) buf[i] = seq_comp_table[buf[i]];
        }
        for (i = 0; i < qlen; ++i) buf[i] = seq_nt16_str[buf[i]];
        kputsn((char*)buf, qlen, &str);
        kputc('\n', &str);
        if (qual[0] != 0xff) {
            kputsn("+\n", 2, &str);
            for (i = 0; i < qlen; ++i) buf[i] = 33 + qual[i];
            if (bam_is_rev(b)) { // reverse
                for (i = 0; i < qlen>>1; ++i) {
                    uint8_t t = buf[qlen - 1 - i];
                    buf[qlen - 1 - i] = buf[i];
                    buf[i] = t;
                }
            }
        }
        kputsn((char*)buf, qlen, &str);
        kputc('\n', &str);
        if (is_print) {
            fwrite(str.s, 1, str.l, stdout);
            str.l = 0;
        }
    }
    if (fpse) {
        if (str.l) {
            bgzf_write(fpse, str.s, str.l);
            ++n_singletons;
        }
        fprintf(stderr, "[M::%s] discarded %lld singletons\n", __func__, (long long)n_singletons);
        bgzf_close(fpse);
    }
    fprintf(stderr, "[M::%s] processed %lld reads\n", __func__, (long long)n_reads);
    free(buf);
    free(str.s);
    bam_destroy1(b);
    bgzf_close(fp);
    return 0;
}
Esempio n. 15
0
loci_stats *bam_access_get_position_base_counts(char *chr, int posn){
	char *region = NULL;
	hts_itr_t *iter = NULL;
	bam1_t* b = NULL;
	bam_plp_t buf;
	loci_stats *stats = malloc(sizeof(loci_stats *));
	check_mem(stats);
	stats->base_counts = malloc(sizeof(int) * 4);
	check_mem(stats->base_counts);
	stats->base_counts[0] = 0;
	stats->base_counts[1] = 0;
	stats->base_counts[2] = 0;
	stats->base_counts[3] = 0;
	fholder->stats = stats;

	region = malloc((sizeof(char *) * (strlen(chr)+1))+sizeof(":")+sizeof("-")+(sizeof(char)*((no_of_digits(posn)*2)+1)));
	sprintf(region,"%s:%d-%d",chr,posn,posn);
	fholder->beg = posn;
	fholder->end = posn;

  // initialize pileup
	buf = bam_plp_init(pileup_func, (void *)fholder);
	bam_plp_set_maxcnt(buf,maxitercnt);

  /*
  sam_fetch(fholder->in, fholder->idx, ref, fholder->beg, fholder->end, buf, fetch_algo_func);
  */
  //Replace fetch with iterator for htslib compatibility.
  b = bam_init1();
  iter = sam_itr_querys(fholder->idx, fholder->head, region);
  int result;
  int count = 0;
  while ((result = sam_itr_next(fholder->in, iter, b)) >= 0) {
    if(b->core.qual < min_map_qual || (b->core.flag & BAM_FUNMAP)
			|| !(b->core.flag & BAM_FPROPER_PAIR) || (b->core.flag & BAM_FMUNMAP)//Proper pair and mate unmapped
			|| (b->core.flag & BAM_FDUP)//1024 is PCR/optical duplicate
			|| (b->core.flag & BAM_FSECONDARY) || (b->core.flag & BAM_FQCFAIL)//Secondary alignment, quality fail
			|| (b->core.flag & BAM_FSUPPLEMENTARY) ) continue;
    count++;
    bam_plp_push(buf, b);
  }
  sam_itr_destroy(iter);
  bam_plp_push(buf, 0);
  int tid, pos, n_plp = -1;
  const bam_pileup1_t *pil;
  while ( (pil=bam_plp_next(buf, &tid, &pos, &n_plp)) > 0) {
    if((pos+1) != posn) continue;
    int i=0;
   	for(i=0;i<n_plp;i++){
      const bam_pileup1_t *p = pil + i;
      int qual = bam_get_qual(p->b)[p->qpos];
      uint8_t c = bam_seqi(bam_get_seq(p->b), p->qpos);
      if(!(p->is_del) &&  qual >= min_base_qual
            &&  p->b->core.qual >= min_map_qual){
           //&& (c == 1 /*A*/|| c == 2 /*C*/|| c == 4 /*G*/|| c == 8 /*T*/)){
        //Now we add a new read pos struct to the list since the read is valid.
        //char cbase = toupper(bam_nt16_rev_table[c]);
        switch(c){
          case 1:
            fholder->stats->base_counts[0]++;
            break;

          case 2:
            fholder->stats->base_counts[1]++;
            break;

          case 4:
            fholder->stats->base_counts[2]++;
            break;

          case 8:
            fholder->stats->base_counts[3]++;
            break;

          default:
            break;

          }; // End of args switch statement */
   	    }
   	  }
  } //End of iteration through pileup
	//bam_plp_push(buf, 0); // finalize pileup
  bam_plp_destroy(buf);
	free(region);
	return fholder->stats;

error:
	//if(region) free(region);
	if(fholder->stats){
		if(fholder->stats->base_counts) free(fholder->stats->base_counts);
		free(fholder->stats);
	}
	if(iter) sam_itr_destroy(iter);
	if(b) bam_destroy1(b);
	if(region) free(region);
	return NULL;
}