Esempio n. 1
0
/*
 * Read-fetch callback: loads into `b` the next alignment that survives every
 * filter configured in the mplp_aux_t handed over through `data`.
 *
 * Records come from ma->iter when an iterator is set, otherwise they are read
 * sequentially from ma->fp.  A rejected record is dropped (only the
 * out-of-reference case prints a diagnostic) and the next one is tried.
 *
 * Returns the value of the last read call: non-negative when `b` holds an
 * accepted record, negative once the reader reports no further record.
 */
static int mplp_func(void *data, bam1_t *b)
{
    mplp_aux_t *ma = (mplp_aux_t*)data;
    char *ref;
    int ref_len, ret;

    for (;;) {
        int has_ref = 0;

        if (ma->iter) ret = sam_itr_next(ma->fp, ma->iter, b);
        else ret = sam_read1(ma->fp, ma->h, b);
        if (ret < 0) break;

        // The 'B' cigar operation is not part of the specification and is
        // considered obsolete, so no bam_remove_B() pass is done here.

        // Unmapped reads are never handed back to the caller.
        if (b->core.tid < 0 || (b->core.flag & BAM_FUNMAP)) continue;

        // Flag masks: each test is active only when its mask is non-zero.
        if (ma->conf->rflag_require && !(ma->conf->rflag_require & b->core.flag)) continue;
        if (ma->conf->rflag_filter && (ma->conf->rflag_filter & b->core.flag)) continue;

        // BED overlap test; bypassed when conf->all is non-zero.
        if (ma->conf->bed && ma->conf->all == 0) {
            if (!bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid], b->core.pos, bam_endpos(b)))
                continue;
        }

        // Exclude read groups: drop the read when its RG value is looked up
        // successfully (return value 0) in rghash.
        if (ma->conf->rghash) {
            uint8_t *rg = bam_aux_get(b, "RG");
            if (rg && khash_str2int_get(ma->conf->rghash, (const char*)(rg+1), NULL) == 0)
                continue;
        }

        // Illumina 1.3 input: lower every base quality by 31, flooring at 0.
        if (ma->conf->flag & MPLP_ILLUMINA13) {
            uint8_t *qual = bam_get_qual(b);
            int k;
            for (k = 0; k < b->core.l_qseq; ++k) {
                if (qual[k] > 31) qual[k] -= 31;
                else qual[k] = 0;
            }
        }

        // Fetch the reference only when a FASTA index is configured.
        if (ma->conf->fai && b->core.tid >= 0) {
            has_ref = mplp_get_ref(ma, b->core.tid, &ref, &ref_len);
            if (has_ref && b->core.pos >= ref_len) { // read starts beyond the reference sequence
                fprintf(stderr,"[%s] Skipping because %d is outside of %d [ref:%d]\n",
                        __func__, b->core.pos, ref_len, b->core.tid);
                continue;
            }
        }

        if (has_ref) {
            if (ma->conf->flag & MPLP_REALN) {
                int realn_flag = (ma->conf->flag & MPLP_REDO_BAQ) ? 7 : 3;
                sam_prob_realn(b, ref, ref_len, realn_flag);
            }
            if (ma->conf->capQ_thres > 10) {
                int capped = sam_cap_mapq(b, ref, ref_len, ma->conf->capQ_thres);
                if (capped < 0) continue;   // negative cap: reject the read
                if (b->core.qual > capped) b->core.qual = capped;
            }
        }

        // Final mapping-quality and orphan filters.
        if (b->core.qual < ma->conf->min_mq) continue;
        if ((ma->conf->flag & MPLP_NO_ORPHAN) && (b->core.flag & BAM_FPAIRED)
            && !(b->core.flag & BAM_FPROPER_PAIR)) continue;

        break;  // record accepted
    }
    return ret;
}
Esempio n. 2
0
/*
 * Entry point of the "calmd" command.
 *
 * Reads alignments from argv[optind], optionally realigns them
 * (sam_prob_realn), caps their mapping quality (sam_cap_mapq), runs
 * bam_fillmd1_core() on them against the reference, and writes every
 * record to standard output ("-").
 *
 * The reference comes from the second positional argument when present,
 * otherwise from the global-options reference (ga.reference).
 *
 * Options handled here (see the switch below for the exact effect):
 *   -r       enable sam_prob_realn()
 *   -e -d -q -h   set USE_EQUAL / DROP_TAG / BIN_QUAL / HASH_QNM in flt_flag
 *   -N       clear UPDATE_MD and UPDATE_NM from flt_flag
 *   -b / -u  BAM output / uncompressed BAM output
 *   -S       accepted and ignored
 *   -n INT   max_nm passed to bam_fillmd1_core()
 *   -C INT   threshold passed to sam_cap_mapq(); active only when > 10
 *   -A / -E  set bit 1 / bit 2 of the flag passed to sam_prob_realn()
 *
 * Returns 0 on success, 1 on any failure; usage errors return the value
 * of calmd_usage().
 */
int bam_fillmd(int argc, char *argv[])
{
    int c, flt_flag, tid = -2, ret, len, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag;
    samFile *fp = NULL, *fpout = NULL;
    bam_hdr_t *header = NULL;
    faidx_t *fai = NULL;
    char *ref = NULL, mode_w[8], *ref_file;
    bam1_t *b = NULL;
    sam_global_args ga = SAM_GLOBAL_ARGS_INIT;

    static const struct option lopts[] = {
        SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
        { NULL, 0, NULL, 0 }
    };

    flt_flag = UPDATE_NM | UPDATE_MD;
    is_bam_out = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0;
    strcpy(mode_w, "w");
    while ((c = getopt_long(argc, argv, "EqreuNhbSC:n:Ad", lopts, NULL)) >= 0) {
        switch (c) {
        case 'r':
            is_realn = 1;
            break;
        case 'e':
            flt_flag |= USE_EQUAL;
            break;
        case 'd':
            flt_flag |= DROP_TAG;
            break;
        case 'q':
            flt_flag |= BIN_QUAL;
            break;
        case 'h':
            flt_flag |= HASH_QNM;
            break;
        case 'N':
            flt_flag &= ~(UPDATE_MD|UPDATE_NM);
            break;
        case 'b':
            is_bam_out = 1;
            break;
        case 'u':
            is_uncompressed = is_bam_out = 1;
            break;
        case 'S':
            break;
        case 'n':
            max_nm = atoi(optarg);
            break;
        case 'C':
            capQ = atoi(optarg);
            break;
        case 'A':
            baq_flag |= 1;
            break;
        case 'E':
            baq_flag |= 2;
            break;
        default:
            if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
            fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n\n", c);
        /* else fall-through */
        case '?':
            return calmd_usage();
        }
    }
    // Build the output mode string: "wb" or "wh", plus "0" when uncompressed.
    if (is_bam_out) strcat(mode_w, "b");
    else strcat(mode_w, "h");
    if (is_uncompressed) strcat(mode_w, "0");
    // One positional argument is enough when a global reference was given,
    // otherwise two (input file and reference) are required.
    if (optind + (ga.reference == NULL) >= argc)
        return calmd_usage();
    fp = sam_open_format(argv[optind], "r", &ga.in);
    if (fp == NULL) {
        print_error_errno("calmd", "Failed to open input file '%s'", argv[optind]);
        return 1;
    }

    header = sam_hdr_read(fp);
    if (header == NULL || header->n_targets == 0) {
        fprintf(stderr, "[bam_fillmd] input SAM does not have header. Abort!\n");
        goto fail;
    }

    fpout = sam_open_format("-", mode_w, &ga.out);
    if (fpout == NULL) {
        print_error_errno("calmd", "Failed to open output");
        goto fail;
    }
    if (sam_hdr_write(fpout, header) < 0) {
        print_error_errno("calmd", "Failed to write sam header");
        goto fail;
    }

    // A positional reference takes precedence over the global-options one.
    ref_file = argc > optind + 1 ? argv[optind+1] : ga.reference;
    fai = fai_load(ref_file);

    if (!fai) {
        print_error_errno("calmd", "Failed to open reference file '%s'", ref_file);
        goto fail;
    }

    b = bam_init1();
    if (!b) {
        fprintf(stderr, "[bam_fillmd] Failed to allocate bam struct\n");
        goto fail;
    }
    while ((ret = sam_read1(fp, header, b)) >= 0) {
        if (b->core.tid >= 0) {
            // Re-fetch the reference sequence only when the target changes;
            // `tid` starts at -2 so the first mapped record always fetches.
            if (tid != b->core.tid) {
                free(ref);
                ref = fai_fetch(fai, header->target_name[b->core.tid], &len);
                tid = b->core.tid;
                if (ref == 0) { // FIXME: Should this always be fatal?
                    fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n",
                            header->target_name[tid]);
                    if (is_realn || capQ > 10) goto fail; // Would otherwise crash
                }
            }
            if (is_realn) sam_prob_realn(b, ref, len, baq_flag);
            if (capQ > 10) {
                int q = sam_cap_mapq(b, ref, len, capQ);
                // sam_cap_mapq() can return a negative value (the mpileup
                // reader treats that as "reject the read").  Assigning it to
                // the MAPQ field (uint8_t in htslib's bam1_core_t) would wrap
                // to 255 instead of lowering the quality, so floor it at 0.
                if (q < 0) q = 0;
                if (b->core.qual > q) b->core.qual = q;
            }
            // With a missing reference the record is passed through untouched.
            if (ref) bam_fillmd1_core(b, ref, len, flt_flag, max_nm);
        }
        if (sam_write1(fpout, header, b) < 0) {
            print_error_errno("calmd", "failed to write to output file");
            goto fail;
        }
    }
    // -1 is the normal end of input; anything lower is a read error.
    if (ret < -1) {
        fprintf(stderr, "[bam_fillmd] Error reading input.\n");
        goto fail;
    }
    bam_destroy1(b);
    bam_hdr_destroy(header);

    free(ref);
    fai_destroy(fai);
    sam_close(fp);
    // Closing the output flushes it, so a failure here is reported.
    if (sam_close(fpout) < 0) {
        fprintf(stderr, "[bam_fillmd] error when closing output file\n");
        return 1;
    }
    return 0;

fail:
    // Release whatever had been acquired before the failure.
    free(ref);
    if (b) bam_destroy1(b);
    if (header) bam_hdr_destroy(header);
    if (fai) fai_destroy(fai);
    if (fp) sam_close(fp);
    if (fpout) sam_close(fpout);
    return 1;
}
Esempio n. 3
0
/*
 * Read-fetch callback: loads into `b` the next alignment that passes every
 * filter configured in the mplp_aux_t handed over through `data`.
 *
 * Records come from ma->iter when an iterator is set, otherwise they are read
 * sequentially from ma->fp.  A rejected record is dropped (only the
 * out-of-reference case prints a diagnostic) and the next one is tried.
 *
 * Returns the value of the last read call: non-negative when `b` holds an
 * accepted record, negative once the reader reports no further record.
 */
static int mplp_func(void *data, bam1_t *b)
{
    mplp_aux_t *ma = (mplp_aux_t*)data;
    char *ref;
    int ref_len, ret;

    for (;;) {
        int has_ref = 0;

        if (ma->iter) ret = sam_itr_next(ma->fp, ma->iter, b);
        else ret = sam_read1(ma->fp, ma->h, b);
        if (ret < 0) break;

        // The 'B' cigar operation is not part of the specification and is
        // considered obsolete, so no bam_remove_B() pass is done here.

        // Unmapped reads are never handed back to the caller.
        if (b->core.tid < 0 || (b->core.flag & BAM_FUNMAP)) continue;

        // Flag masks: each test is active only when its mask is non-zero.
        if (ma->conf->rflag_require && !(ma->conf->rflag_require & b->core.flag)) continue;
        if (ma->conf->rflag_filter && (ma->conf->rflag_filter & b->core.flag)) continue;

        if (ma->conf->bed) {
            // Region test against the BED index, using the shared iterator.
            regitr_t *itr = ma->conf->bed_itr;
            int beg = b->core.pos, end = bam_endpos(b)-1;
            int overlap = regidx_overlap(ma->conf->bed, ma->h->target_name[b->core.tid],beg,end, itr);

            if (!overlap && !ma->conf->bed_logic) {
                // exclude only reads which are fully contained in the region:
                // a read sticking out on either side of a region is kept.
                // NOTE(review): this loop runs only when regidx_overlap()
                // reported no overlap -- confirm that is intended.
                while (regitr_overlap(itr)) {
                    if (beg < itr->beg || end > itr->end) {
                        overlap = 1;
                        break;
                    }
                }
            }
            if (!overlap) continue;
        }

        // Drop reads for which no sample id is returned.
        if (bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, b) < 0) continue;

        // Illumina 1.3 input: lower every base quality by 31, flooring at 0.
        if (ma->conf->flag & MPLP_ILLUMINA13) {
            uint8_t *qual = bam_get_qual(b);
            int k;
            for (k = 0; k < b->core.l_qseq; ++k) {
                if (qual[k] > 31) qual[k] -= 31;
                else qual[k] = 0;
            }
        }

        // Fetch the reference only when a FASTA index is configured.
        if (ma->conf->fai && b->core.tid >= 0) {
            has_ref = mplp_get_ref(ma, b->core.tid, &ref, &ref_len);
            if (has_ref && b->core.pos >= ref_len) { // read starts beyond the reference sequence
                fprintf(stderr,"[%s] Skipping because %d is outside of %d [ref:%d]\n",
                        __func__, b->core.pos, ref_len, b->core.tid);
                continue;
            }
        }

        if (has_ref) {
            if (ma->conf->flag & MPLP_REALN) {
                int realn_flag = (ma->conf->flag & MPLP_REDO_BAQ) ? 7 : 3;
                sam_prob_realn(b, ref, ref_len, realn_flag);
            }
            if (ma->conf->capQ_thres > 10) {
                int capped = sam_cap_mapq(b, ref, ref_len, ma->conf->capQ_thres);
                if (capped < 0) continue;   // negative cap: reject the read
                if (b->core.qual > capped) b->core.qual = capped;
            }
        }

        // Final mapping-quality and orphan filters.
        if (b->core.qual < ma->conf->min_mq) continue;
        if ((ma->conf->flag & MPLP_NO_ORPHAN) && (b->core.flag & BAM_FPAIRED)
            && !(b->core.flag & BAM_FPROPER_PAIR)) continue;

        break;  // record accepted
    }
    return ret;
}