Пример #1
0
static void phased_push(args_t *args, bcf1_t *arec, bcf1_t *brec)
{
    if ( arec && arec->errcode )
        error("Parse error at %s:%d, cannot proceed: %s\n", bcf_seqname(args->files->readers[0].header,arec),arec->pos+1, args->files->readers[0].fname);
    if ( brec && brec->errcode )
        error("Parse error at %s:%d, cannot proceed: %s\n", bcf_seqname(args->files->readers[1].header,brec),brec->pos+1, args->files->readers[1].fname);

    int i, nsmpl = bcf_hdr_nsamples(args->out_hdr);
    int chr_id = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[0].header,arec));
    if ( args->prev_chr<0 || args->prev_chr!=chr_id )
    {
        if ( args->prev_chr>=0 ) phased_flush(args);

        for (i=0; i<nsmpl; i++)
            args->phase_set[i] = arec->pos+1;
        args->phase_set_changed = 1;

        if ( args->seen_seq[chr_id] ) error("The chromosome block %s is not contiguous\n", bcf_seqname(args->files->readers[0].header,arec));
        args->seen_seq[chr_id] = 1;
        args->prev_chr = chr_id;
        args->prev_pos_check = -1;
    }

    if ( !brec )
    {
        bcf_translate(args->out_hdr, args->files->readers[0].header, arec);
        if ( args->nswap )
            phase_update(args, args->out_hdr, arec);
        if ( !args->compact_PS || args->phase_set_changed )
        {
            bcf_update_format_int32(args->out_hdr,arec,"PS",args->phase_set,nsmpl);
            args->phase_set_changed = 0;
        }
        bcf_write(args->out_fh, args->out_hdr, arec);

        if ( arec->pos < args->prev_pos_check )
            error("FIXME, disorder: %s:%d in %s vs %d written  [3]\n", bcf_seqname(args->files->readers[0].header,arec), arec->pos+1,args->files->readers[0].fname, args->prev_pos_check+1);
        args->prev_pos_check = arec->pos;
        return;
    }

    int m = args->mbuf;
    args->nbuf += 2;
    hts_expand(bcf1_t*,args->nbuf,args->mbuf,args->buf);
    for (i=m; i<args->mbuf; i++)
        args->buf[i] = bcf_init1();

    SWAP(bcf1_t*, args->files->readers[0].buffer[0], args->buf[args->nbuf-2]);
    SWAP(bcf1_t*, args->files->readers[1].buffer[0], args->buf[args->nbuf-1]);
}
Пример #2
0
/*
    Called for each VCF record after all standard annotation things are finished.
    Return 0 on success, 1 to suppress the line from printing, -1 on critical errors.
*/
int process(bcf1_t *rec)
{
    int i, ret;

    printf("%s\t%d\t%s\t%s", bcf_seqname(in_hdr,rec),rec->pos+1,rec->d.allele[0],rec->n_allele>1 ? rec->d.allele[1] : ".");
    if ( rec->n_allele==1 )
    {
        for (i=0; i<rec->n_sample; i++) printf("\t0.0");
    }
    else
    {
        for (i=0; i<nhandlers; i++)
        {
            ret = handlers[i](rec);
            if ( !ret ) break;  // successfully printed
        }
        if ( i==nhandlers )
        {
            // none of the annotations present
            for (i=0; i<rec->n_sample; i++) printf("\t-1.0");
        }
    }
    printf("\n");

    return 1;
}
Пример #3
0
static int load_genmap(args_t *args, bcf1_t *line)
{
    if ( !args->genmap_fname ) { args->ngenmap = 0; return 0; }

    kstring_t str = {0,0,0};
    char *fname = strstr(args->genmap_fname,"{CHROM}");
    if ( fname )
    {
        kputsn(args->genmap_fname, fname - args->genmap_fname, &str);
        kputs(bcf_seqname(args->hdr,line), &str);
        kputs(fname+7,&str);
        fname = str.s;
    }
    else
        fname = args->genmap_fname;

    htsFile *fp = hts_open(fname, "rb");
    if ( !fp )
    {
        args->ngenmap = 0;
        return -1;
    }

    hts_getline(fp, KS_SEP_LINE, &str);
    if ( strcmp(str.s,"position COMBINED_rate(cM/Mb) Genetic_Map(cM)") )
        error("Unexpected header, found:\n\t[%s], but expected:\n\t[position COMBINED_rate(cM/Mb) Genetic_Map(cM)]\n", fname, str.s);

    args->ngenmap = args->igenmap = 0;
    while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
    {
        args->ngenmap++;
        hts_expand(genmap_t,args->ngenmap,args->mgenmap,args->genmap);
        genmap_t *gm = &args->genmap[args->ngenmap-1];

        char *tmp, *end;
        gm->pos = strtol(str.s, &tmp, 10);
        if ( str.s==tmp ) error("Could not parse %s: %s\n", fname, str.s);

        // skip second column
        tmp++;
        while ( *tmp && !isspace(*tmp) ) tmp++;

        // read the genetic map in cM
        gm->rate = strtod(tmp+1, &end);
        if ( tmp+1==end ) error("Could not parse %s: %s\n", fname, str.s);
    }
    if ( !args->ngenmap ) error("Genetic map empty?\n");
    int i;
    for (i=0; i<args->ngenmap; i++) args->genmap[i].rate /= args->genmap[args->ngenmap-1].rate; // scale to 1
    if ( hts_close(fp) ) error("Close failed\n");
    free(str.s);
    return 0;
}
Пример #4
0
static void unread_vcf_line(args_t *args, bcf1_t **rec_ptr)
{
    bcf1_t *rec = *rec_ptr;
    if ( args->vcf_rbuf.n >= args->vcf_rbuf.m )
        error("FIXME: too many overlapping records near %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);

    // Insert the new record in the buffer. The line would be overwritten in
    // the next bcf_sr_next_line call, therefore we need to swap it with an
    // unused one
    int i = rbuf_append(&args->vcf_rbuf);
    if ( !args->vcf_buf[i] ) args->vcf_buf[i] = bcf_init1();
    bcf1_t *tmp = rec; *rec_ptr = args->vcf_buf[i]; args->vcf_buf[i] = tmp;
}
Пример #5
0
int process(bcf1_t *rec)
{
    if ( rec->n_allele<2 ) return 0;    // not a variant

    int type = bcf_get_variant_types(rec);
    if ( !(type&VCF_INDEL) ) return 0;  // not an indel

    int i, len = 0;
    for (i=1; i<rec->n_allele; i++)
        if ( len > rec->d.var[i].n ) len = rec->d.var[i].n;

    int pos_to = len!=0 ? rec->pos : rec->pos - len;    // len is negative
    if ( bcf_sr_regions_overlap(exons, bcf_seqname(in_hdr,rec),rec->pos,pos_to) ) return 0;  // no overlap

    hts_expand(int32_t,rec->n_allele-1,nfrm,frm);
    for (i=1; i<rec->n_allele; i++)
    {
        if ( rec->d.var[i].type!=VCF_INDEL ) {
            frm[i-1] = -1;
            continue;
        }

        int len = rec->d.var[i].n, tlen = 0;
        if ( len>0 )
        {
            // insertion
            if ( exons->start <= rec->pos && exons->end > rec->pos ) tlen = abs(len);
        }
        else if ( exons->start <= rec->pos + abs(len) )
        {
            // deletion
            tlen = abs(len);
            if ( rec->pos < exons->start )              // trim the beginning
                tlen -= exons->start - rec->pos + 1;
            if ( exons->end < rec->pos + abs(len) )     // trim the end
                tlen -= rec->pos + abs(len) - exons->end;
        }
        if ( tlen )     // there are some deleted/inserted bases in the exon
        {
            if ( tlen%3 ) frm[i-1] = 1; // out-of-frame
            else frm[i-1] = 0;          // in-frame
        }
        else frm[i-1] = -1;             // not applicable (is outside)
    }

    if ( bcf_update_info_int32(out_hdr,rec,"OOF",frm,rec->n_allele-1)<0 ) return -1;
    return 0;
}
Пример #6
0
static void set_ploidy(args_t *args, bcf1_t *rec)
{
    ploidy_query(args->ploidy,(char*)bcf_seqname(args->aux.hdr,rec),rec->pos,args->sex2ploidy,NULL,NULL);

    int i;
    for (i=0; i<args->nsex; i++)
        if ( args->sex2ploidy[i]!=args->sex2ploidy_prev[i] ) break;

    if ( i==args->nsex ) return;    // ploidy same as previously

    for (i=0; i<args->nsamples; i++)
    {
        if ( args->sample2sex[i]<0 )
            args->aux.ploidy[i] = -1*args->sample2sex[i];
        else
            args->aux.ploidy[i] = args->sex2ploidy[args->sample2sex[i]];
    }

    int *tmp = args->sex2ploidy; args->sex2ploidy = args->sex2ploidy_prev; args->sex2ploidy_prev = tmp;
}
Пример #7
0
void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask)
{
    int *map = (int*) calloc(line->n_allele, sizeof(int));

    // create map of indexes from old to new ALT numbering and modify ALT
    kstring_t str = {0,0,0};
    kputs(line->d.allele[0], &str);

    int nrm = 0, i,j;  // i: ori alleles, j: new alleles
    for (i=1, j=1; i<line->n_allele; i++) 
    {
        if ( rm_mask & 1<<i )
        {
            // remove this allele
            line->d.allele[i] = NULL;
            nrm++;
            continue;
        }
        kputc(',', &str);
        kputs(line->d.allele[i], &str);
        map[i] = j;
        j++;
    }
    if ( !nrm ) { free(map); free(str.s); return; }

    int nR_ori = line->n_allele;
    int nR_new = line->n_allele-nrm;
    assert(nR_new > 0); // should not be able to remove reference allele
    int nA_ori = nR_ori-1;
    int nA_new = nR_new-1;

    int nG_ori = nR_ori*(nR_ori + 1)/2;
    int nG_new = nR_new*(nR_new + 1)/2;

    bcf_update_alleles_str(header, line, str.s);

    // remove from Number=G, Number=R and Number=A INFO fields.
    uint8_t *dat = NULL;
    int mdat = 0, ndat = 0, mdat_bytes = 0, nret;
    for (i=0; i<line->n_info; i++)
    {
        bcf_info_t *info = &line->d.info[i];
        int vlen = bcf_hdr_id2length(header,BCF_HL_INFO,info->key);
        
        if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change

        int type = bcf_hdr_id2type(header,BCF_HL_INFO,info->key);
        if ( type==BCF_HT_FLAG ) continue;
        int size = 1;
        if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4;

        mdat = mdat_bytes / size;
        nret = bcf_get_info_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void**)&dat, &mdat, type);
        mdat_bytes = mdat * size;
        if ( nret<0 ) 
        { 
            fprintf(stderr,"[%s:%d %s] Could not access INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, 
                bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); 
            exit(1);
        }
        if ( type==BCF_HT_STR ) 
        { 
            str.l = 0;
            char *ss = (char*) dat, *se = (char*) dat;
            if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
            {
                int nexp, inc = 0;
                if ( vlen==BCF_VL_A )
                {
                    nexp = nA_ori;
                    inc  = 1;
                }
                else
                    nexp = nR_ori;
                for (j=0; j<nexp; j++)
                {
                    if ( !*se ) break;
                    while ( *se && *se!=',' ) se++;
                    if ( rm_mask & 1<<(j+inc) ) 
                    { 
                        if ( *se ) se++;
                        ss = se; 
                        continue; 
                    }
                    if ( str.l ) kputc(',',&str);
                    kputsn(ss,se-ss,&str);
                    if ( *se ) se++;
                    ss = se;
                }
                assert( j==nexp );
            }
            else    // Number=G, assuming diploid genotype
            {
                int k = 0, n = 0;
                for (j=0; j<nR_ori; j++)
                {
                    for (k=0; k<=j; k++)
                    {
                        if ( !*se ) break;
                        while ( *se && *se!=',' ) se++;
                        n++;
                        if ( rm_mask & 1<<j || rm_mask & 1<<k ) 
                        { 
                            if ( *se ) se++;
                            ss = se; 
                            continue; 
                        }
                        if ( str.l ) kputc(',',&str);
                        kputsn(ss,se-ss,&str);
                        if ( *se ) se++;
                        ss = se;
                    }
                    if ( !*se ) break;
                }
                assert( n=nG_ori );
            }

            nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)str.s, str.l, type);
            if ( nret<0 )
            {
                fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                        bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
                exit(1);
            }
            continue; 
        }
        
        if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
        {
            int inc = 0, ntop;
            if ( vlen==BCF_VL_A )
            {
                assert( nret==nA_ori );
                ntop = nA_ori;
                ndat = nA_new;
                inc  = 1;
            }
            else
            {
                assert( nret==nR_ori );
                ntop = nR_ori;
                ndat = nR_new;
            }
            int k = 0;

            #define BRANCH(type_t,is_vector_end) \
            { \
                type_t *ptr = (type_t*) dat; \
                int size = sizeof(type_t); \
                for (j=0; j<ntop; j++) /* j:ori, k:new */ \
                { \
                    if ( is_vector_end ) { memcpy(dat+k*size, dat+j*size, size); break; } \
                    if ( rm_mask & 1<<(j+inc) ) continue; \
                    if ( j!=k ) memcpy(dat+k*size, dat+j*size, size); \
                    k++; \
                } \
            }
            switch (type) 
            {
                case BCF_HT_INT:  BRANCH(int32_t,ptr[j]==bcf_int32_vector_end); break;
                case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[j])); break;
            }
            #undef BRANCH
        }
        else    // Number=G
        {
            assert( nret==nG_ori );
            int k, l_ori = -1, l_new = 0;
            ndat = nG_new;

            #define BRANCH(type_t,is_vector_end) \
            { \
                type_t *ptr = (type_t*) dat; \
                int size = sizeof(type_t); \
                for (j=0; j<nR_ori; j++) \
                { \
                    for (k=0; k<=j; k++) \
                    { \
                        l_ori++; \
                        if ( is_vector_end ) { memcpy(dat+l_new*size, dat+l_ori*size, size); break; } \
                        if ( rm_mask & 1<<j || rm_mask & 1<<k ) continue; \
                        if ( l_ori!=l_new ) memcpy(dat+l_new*size, dat+l_ori*size, size); \
                        l_new++; \
                    } \
                } \
            }
            switch (type) 
            {
                case BCF_HT_INT:  BRANCH(int32_t,ptr[l_ori]==bcf_int32_vector_end); break;
                case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[l_ori])); break;
            }
            #undef BRANCH
        }

        nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)dat, ndat, type);
        if ( nret<0 )
        {
            fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                    bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
            exit(1);
        }
    }

    // Update GT fields, the allele indexes might have changed
    for (i=1; i<line->n_allele; i++) if ( map[i]!=i ) break;
    if ( i<line->n_allele )
    {
        mdat = mdat_bytes / 4;  // sizeof(int32_t)
        nret = bcf_get_genotypes(header,line,(void**)&dat,&mdat);
        mdat_bytes = mdat * 4;
        if ( nret>0 )
        {
            nret /= line->n_sample;
            int32_t *ptr = (int32_t*) dat;
            for (i=0; i<line->n_sample; i++)
            {
                for (j=0; j<nret; j++)
                {
                    if ( ptr[j]==bcf_gt_missing ) continue;
                    if ( ptr[j]==bcf_int32_vector_end ) break;
                    int al = bcf_gt_allele(ptr[j]);
                    assert( al<nR_ori && map[al]>=0 );
                    ptr[j] = (map[al]+1)<<1 | (ptr[j]&1);
                }
                ptr += nret;
            }
            bcf_update_genotypes(header, line, (void*)dat, nret*line->n_sample);
        }
    }

    // Remove from Number=G, Number=R and Number=A FORMAT fields. 
    // Assuming haploid or diploid GTs
    for (i=0; i<line->n_fmt; i++)
    {
        bcf_fmt_t *fmt = &line->d.fmt[i];
        int vlen = bcf_hdr_id2length(header,BCF_HL_FMT,fmt->id);

        if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change

        int type = bcf_hdr_id2type(header,BCF_HL_FMT,fmt->id);
        if ( type==BCF_HT_FLAG ) continue;

        int size = 1;
        if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4;

        mdat = mdat_bytes / size;
        nret = bcf_get_format_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void**)&dat, &mdat, type);
        mdat_bytes = mdat * size;
        if ( nret<0 ) 
        { 
            fprintf(stderr,"[%s:%d %s] Could not access FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, 
                    bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); 
            exit(1);
        }

        if ( type==BCF_HT_STR ) 
        {
            int size = nret/line->n_sample;     // number of bytes per sample
            str.l = 0;
            if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
            {
                int nexp, inc = 0;
                if ( vlen==BCF_VL_A )
                {
                    nexp = nA_ori;
                    inc  = 1;
                }
                else
                    nexp = nR_ori;
                for (j=0; j<line->n_sample; j++)
                {
                    char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss;
                    int k_src = 0, k_dst = 0, l = str.l;
                    for (k_src=0; k_src<nexp; k_src++)
                    {
                        if ( ptr>=se || !*ptr) break;
                        while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
                        if ( rm_mask & 1<<(k_src+inc) )
                        {
                            ss = ++ptr;
                            continue;
                        }
                        if ( k_dst ) kputc(',',&str);
                        kputsn(ss,ptr-ss,&str);
                        ss = ++ptr;
                        k_dst++;
                    }
                    assert( k_src==nexp );
                    l = str.l - l;
                    for (; l<size; l++) kputc(0, &str);
                }
            }
            else    // Number=G, diploid or haploid
            {
                for (j=0; j<line->n_sample; j++)
                {
                    char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss;
                    int k_src = 0, k_dst = 0, l = str.l;
                    int nexp = 0; // diploid or haploid?
                    while ( ptr<se )
                    {
                        if ( !*ptr ) break;
                        if ( *ptr==',' ) nexp++;
                        ptr++;
                    }
                    if ( ptr!=ss ) nexp++;
                    assert( nexp==nG_ori || nexp==nR_ori );
                    ptr = ss;
                    if ( nexp==nG_ori ) // diploid
                    {
                        int ia, ib;
                        for (ia=0; ia<nR_ori; ia++)
                        {
                            for (ib=0; ib<=ia; ib++)
                            {
                                if ( ptr>=se || !*ptr ) break;
                                while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
                                if ( rm_mask & 1<<ia || rm_mask & 1<<ib )
                                {
                                    ss = ++ptr;
                                    continue;
                                }
                                if ( k_dst ) kputc(',',&str);
                                kputsn(ss,ptr-ss,&str);
                                ss = ++ptr;
                                k_dst++;
                            }
                            if ( ptr>=se || !*ptr ) break;
                        }
                    }
                    else    // haploid
                    {
                        for (k_src=0; k_src<nR_ori; k_src++)
                        {
                            if ( ptr>=se || !*ptr ) break;
                            while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
                            if ( rm_mask & 1<<k_src )
                            {
                                ss = ++ptr;
                                continue;
                            }
                            if ( k_dst ) kputc(',',&str);
                            kputsn(ss,ptr-ss,&str);
                            ss = ++ptr;
                            k_dst++;
                        }
                        assert( k_src==nR_ori );
                        l = str.l - l;
                        for (; l<size; l++) kputc(0, &str);
                    }
                }
            }
            nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)str.s, str.l, type);
            if ( nret<0 )
            {
                fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                        bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
                exit(1);
            }
            continue;
        }

        int nori = nret / line->n_sample;
        if ( vlen==BCF_VL_A || vlen==BCF_VL_R || (vlen==BCF_VL_G && nori==nR_ori) ) // Number=A, R or haploid Number=G
        {
            int ntop, inc = 0;
            if ( vlen==BCF_VL_A )
            {
                assert( nori==nA_ori );     // todo: will fail if all values are missing
                ntop = nA_ori;
                ndat = nA_new*line->n_sample;
                inc  = 1;
            }
            else
            {
                assert( nori==nR_ori );     // todo: will fail if all values are missing
                ntop = nR_ori;
                ndat = nR_new*line->n_sample;
            }

            #define BRANCH(type_t,is_vector_end) \
            { \
                for (j=0; j<line->n_sample; j++) \
                { \
                    type_t *ptr_src = ((type_t*)dat) + j*nori; \
                    type_t *ptr_dst = ((type_t*)dat) + j*nA_new; \
                    int size = sizeof(type_t); \
                    int k_src, k_dst = 0; \
                    for (k_src=0; k_src<ntop; k_src++) \
                    { \
                        if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); break; } \
                        if ( rm_mask & 1<<(k_src+inc) ) continue; \
                        if ( k_src!=k_dst ) memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
                        k_dst++; \
                    } \
                } \
            }
            switch (type) 
            {
                case BCF_HT_INT:  BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break;
                case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break;
            }
            #undef BRANCH
        }
        else    // Number=G, diploid or mixture of haploid+diploid
        {
            assert( nori==nG_ori );
            ndat = nG_new*line->n_sample;

            #define BRANCH(type_t,is_vector_end) \
            { \
                for (j=0; j<line->n_sample; j++) \
                { \
                    type_t *ptr_src = ((type_t*)dat) + j*nori; \
                    type_t *ptr_dst = ((type_t*)dat) + j*nG_new; \
                    int size = sizeof(type_t); \
                    int ia, ib, k_dst = 0, k_src; \
                    int nset = 0;   /* haploid or diploid? */ \
                    for (k_src=0; k_src<nG_ori; k_src++) { if ( is_vector_end ) break; nset++; } \
                    if ( nset==nR_ori ) /* haploid */ \
                    { \
                        for (k_src=0; k_src<nR_ori; k_src++) \
                        { \
                            if ( rm_mask & 1<<k_src ) continue; \
                            if ( k_src!=k_dst ) memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
                            k_dst++; \
                        } \
                        memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
                    } \
                    else /* diploid */ \
                    { \
                        k_src = -1; \
                        for (ia=0; ia<nR_ori; ia++) \
                        { \
                            for (ib=0; ib<=ia; ib++) \
                            { \
                                k_src++; \
                                if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); ia = nR_ori; break; } \
                                if ( rm_mask & 1<<ia || rm_mask & 1<<ib ) continue; \
                                if ( k_src!=k_dst ) memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
                                k_dst++; \
                            } \
                        } \
                    } \
                } \
            }
            switch (type) 
            {
                case BCF_HT_INT:  BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break;
                case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break;
            }
            #undef BRANCH
        }
        nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)dat, ndat, type);
        if ( nret<0 )
        {
            fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                    bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
            exit(1);
        }
    }
    free(dat);
    free(str.s);
    free(map);
}
Пример #8
0
bcf1_t *process(bcf1_t *rec)
{
    bcf1_t *dflt = args.mode&MODE_LIST_GOOD ? rec : NULL;
    args.nrec++;

    if ( rec->n_allele > 63 ) return dflt;      // we use 64bit bitmask below

    int ngt = bcf_get_genotypes(args.hdr, rec, &args.gt_arr, &args.ngt_arr);
    if ( ngt<0 ) return dflt;
    if ( ngt!=2*bcf_hdr_nsamples(args.hdr) && ngt!=bcf_hdr_nsamples(args.hdr) ) return dflt;
    ngt /= bcf_hdr_nsamples(args.hdr);

    int itr_set = regidx_overlap(args.rules, bcf_seqname(args.hdr,rec),rec->pos,rec->pos, args.itr_ori);

    int i, has_bad = 0, needs_update = 0;
    for (i=0; i<args.ntrios; i++)
    {
        int32_t a,b,c,d,e,f;
        trio_t *trio = &args.trios[i];

        a = args.gt_arr[ngt*trio->imother];
        b = ngt==2 ? args.gt_arr[ngt*trio->imother+1] : bcf_int32_vector_end;
        c = args.gt_arr[ngt*trio->ifather];
        d = ngt==2 ? args.gt_arr[ngt*trio->ifather+1] : bcf_int32_vector_end;
        e = args.gt_arr[ngt*trio->ichild];
        f = ngt==2 ? args.gt_arr[ngt*trio->ichild+1] : bcf_int32_vector_end;

        // skip sites with missing data in child
        if ( bcf_gt_is_missing(e) || bcf_gt_is_missing(f) ) continue;

        uint64_t mother = 0, father = 0,child1,child2;

        int is_ok = 0;
        if ( !itr_set )
        {
            if ( f==bcf_int32_vector_end ) { warn_ploidy(rec); continue; }

            // All M,F,C genotypes are diploid. Missing data are considered consistent.
            child1 = 1<<bcf_gt_allele(e);
            child2 = 1<<bcf_gt_allele(f);
            mother  = bcf_gt_is_missing(a) ? child1|child2 : 1<<bcf_gt_allele(a);
            mother |= bcf_gt_is_missing(b) || b==bcf_int32_vector_end ? child1|child2 : 1<<bcf_gt_allele(b);
            father  = bcf_gt_is_missing(c) ? child1|child2 : 1<<bcf_gt_allele(c);
            father |= bcf_gt_is_missing(d) || d==bcf_int32_vector_end ? child1|child2 : 1<<bcf_gt_allele(d);

            if ( (mother&child1 && father&child2) || (mother&child2 && father&child1) ) is_ok = 1;
        }
        else
        {
            child1  = 1<<bcf_gt_allele(e);
            child2  = bcf_gt_is_missing(f) || f==bcf_int32_vector_end ? 0 : 1<<bcf_gt_allele(f);
            mother |= bcf_gt_is_missing(a) ? 0 : 1<<bcf_gt_allele(a);
            mother |= bcf_gt_is_missing(b) || b==bcf_int32_vector_end ? 0 : 1<<bcf_gt_allele(b);
            father |= bcf_gt_is_missing(c) ? 0 : 1<<bcf_gt_allele(c);
            father |= bcf_gt_is_missing(d) || d==bcf_int32_vector_end ? 0 : 1<<bcf_gt_allele(d);

            regitr_copy(args.itr, args.itr_ori);
            while ( !is_ok && regitr_overlap(args.itr) )
            {
                rule_t *rule = &regitr_payload(args.itr,rule_t);
                if ( child1 && child2 )
                {
                    if ( !rule->mal || !rule->fal ) continue;   // wrong rule (haploid), but this is a diploid GT
                    if ( !mother ) mother = child1|child2;
                    if ( !father ) father = child1|child2;
                    if ( (mother&child1 && father&child2) || (mother&child2 && father&child1) ) is_ok = 1; 
                    continue;
                }
                if ( rule->mal )
                {
                    if ( mother && !(child1&mother) ) continue;
                }
                if ( rule->fal )
                {
                    if ( father && !(child1&father) ) continue;
                }
                is_ok = 1;
            }
        }
        if ( is_ok )
        {
            trio->nok++;
        }
        else
        {
            trio->nbad++;
            has_bad = 1;
            if ( args.mode&MODE_DELETE )
            {
                args.gt_arr[ngt*trio->imother] = bcf_gt_missing;
                if ( b!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->imother+1] = bcf_gt_missing; // should be always true 
                args.gt_arr[ngt*trio->ifather] = bcf_gt_missing;
                if ( d!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->ifather+1] = bcf_gt_missing;
                args.gt_arr[ngt*trio->ichild] = bcf_gt_missing;
                if ( f!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->ichild+1]  = bcf_gt_missing;
                needs_update = 1;
            }
        }
    }

    if ( needs_update && bcf_update_genotypes(args.hdr,rec,args.gt_arr,ngt*bcf_hdr_nsamples(args.hdr)) )
        error("Could not update GT field at %s:%d\n", bcf_seqname(args.hdr,rec),rec->pos+1);

    if ( args.mode&MODE_DELETE ) return rec;
    if ( args.mode&MODE_LIST_GOOD ) return has_bad ? NULL : rec;
    if ( args.mode&MODE_LIST_BAD ) return has_bad ? rec : NULL;

    return NULL;
}
Пример #9
0
bcf1_t *process(bcf1_t *rec)
{
    int i, ns = 0;

    bcf_unpack(rec, BCF_UN_FMT);
    bcf_fmt_t *fmt_gt = NULL;
    for (i=0; i<rec->n_fmt; i++)
        if ( rec->d.fmt[i].id==args.gt_id ) { fmt_gt = &rec->d.fmt[i]; break; }
    if ( !fmt_gt ) return rec;    // no GT tag

    hts_expand(int32_t,rec->n_allele,args.marr,args.arr);
    hts_expand(float,rec->n_allele,args.mfarr,args.farr);
    hts_expand(counts_t,rec->n_allele,args.mcounts,args.counts);
    memset(args.arr,0,sizeof(*args.arr)*rec->n_allele);
    memset(args.counts,0,sizeof(*args.counts)*rec->n_allele);

    #define BRANCH_INT(type_t,vector_end) { \
        for (i=0; i<rec->n_sample; i++) \
        { \
            type_t *p = (type_t*) (fmt_gt->p + i*fmt_gt->size); \
            int ial, als = 0; \
            for (ial=0; ial<fmt_gt->n; ial++) \
            { \
                if ( p[ial]==vector_end ) break; /* smaller ploidy */ \
                if ( bcf_gt_is_missing(p[ial]) ) break; /* missing allele */ \
                int idx = bcf_gt_allele(p[ial]); \
                \
                if ( idx >= rec->n_allele ) \
                    error("Incorrect allele (\"%d\") in %s at %s:%d\n",idx,args.in_hdr->samples[i],bcf_seqname(args.in_hdr,rec),rec->pos+1); \
                als |= (1<<idx);  /* this breaks with too many alleles */ \
            } \
            if ( ial==0 ) continue; /* missing alleles */ \
            ns++; \
            int is_hom  = als && !(als & (als-1)); /* only one bit is set */ \
            int is_hemi = ial==1; \
            for (ial=0; als; ial++) \
            { \
                if ( als&1 ) \
                { \
                    if ( !is_hom ) \
                        args.counts[ial].nhet++; \
                    else if ( !is_hemi ) \
                        args.counts[ial].nhom += 2; \
                    else \
                        args.counts[ial].nhemi++; \
                } \
                als >>= 1; \
            } \
        } \
    }
    switch (fmt_gt->type) {
        case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_vector_end); break;
        case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
        case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
        default: error("The GT type is not recognised: %d at %s:%d\n",fmt_gt->type, bcf_seqname(args.in_hdr,rec),rec->pos+1); break;
    }
    #undef BRANCH_INT
    if ( args.tags&SET_NS )
    {
        if ( bcf_update_info_int32(args.out_hdr,rec,"NS",&ns,1)!=0 )
            error("Error occurred while updating NS at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1);
    }
    if ( args.tags&SET_AN )
    {
        args.arr[0] = 0;
        for (i=0; i<rec->n_allele; i++)
            args.arr[0] += args.counts[i].nhet + args.counts[i].nhom + args.counts[i].nhemi;
        if ( bcf_update_info_int32(args.out_hdr,rec,"AN",args.arr,1)!=0 )
            error("Error occurred while updating AN at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1);
    }
    if ( args.tags&SET_AF )
    {
        int n = rec->n_allele-1;
        if ( n>0 )
        {
            args.arr[0] = 0;
            for (i=0; i<rec->n_allele; i++)
                args.arr[0] += args.counts[i].nhet + args.counts[i].nhom + args.counts[i].nhemi;
            for (i=1; i<rec->n_allele; i++)
                args.farr[i] = (args.counts[i].nhet + args.counts[i].nhom + args.counts[i].nhemi)*1.0/args.arr[0];
        }
        if ( args.arr[0] )
        {
            if ( bcf_update_info_float(args.out_hdr,rec,"AF",args.farr+1,n)!=0 )
                error("Error occurred while updating AF at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1);
        }
    }
    if ( args.tags&SET_AC )
    {
        int n = rec->n_allele-1;
        if ( n>0 )
        {
            memset(args.arr,0,sizeof(*args.arr)*rec->n_allele);
            for (i=1; i<rec->n_allele; i++)
                args.arr[i] = args.counts[i].nhet + args.counts[i].nhom + args.counts[i].nhemi;
        }
        if ( bcf_update_info_int32(args.out_hdr,rec,"AC",args.arr+1,n)!=0 )
            error("Error occurred while updating AC at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1);
    }
    if ( args.tags&SET_AC_Het )
    {
        int n = rec->n_allele-1;
        if ( n>0 )
        {
            memset(args.arr,0,sizeof(*args.arr)*rec->n_allele);
            for (i=1; i<rec->n_allele; i++)
                args.arr[i] += args.counts[i].nhet;
        }
        if ( bcf_update_info_int32(args.out_hdr,rec,"AC_Het",args.arr+1,n)!=0 )
            error("Error occurred while updating AC_Het at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1);
    }
    if ( args.tags&SET_AC_Hom )
    {
        int n = rec->n_allele-1;
        if ( n>0 )
        {
            memset(args.arr,0,sizeof(*args.arr)*rec->n_allele);
            for (i=1; i<rec->n_allele; i++)
                args.arr[i] += args.counts[i].nhom;
        }
        if ( bcf_update_info_int32(args.out_hdr,rec,"AC_Hom",args.arr+1,n)!=0 )
            error("Error occurred while updating AC_Hom at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1);
    }
    if ( args.tags&SET_AC_Hemi )
    {
        int n = rec->n_allele-1;
        if ( n>0 )
        {
            memset(args.arr,0,sizeof(*args.arr)*rec->n_allele);
            for (i=1; i<rec->n_allele; i++)
                args.arr[i] += args.counts[i].nhemi;
        }
        if ( bcf_update_info_int32(args.out_hdr,rec,"AC_Hemi",args.arr+1,n)!=0 )
            error("Error occurred while updating AC_Hemi at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1);
    }
    return rec;
}
Пример #10
0
static void init_data(args_t *args)
{
    bcf1_t *line = NULL;

    // With phased concat, the chunks overlap and come in the right order.  To
    // avoid opening all files at once, store start positions to recognise need
    // for the next one. This way we can keep only two open chunks at once.
    if ( args->phased_concat )
    {
        args->start_pos = (int*) malloc(sizeof(int)*args->nfnames);
        line = bcf_init();
    }

    kstring_t str = {0,0,0};
    int i, prev_chrid = -1;
    for (i=0; i<args->nfnames; i++)
    {
        htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]);
        bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]);
        args->out_hdr = bcf_hdr_merge(args->out_hdr,hdr);
        if ( bcf_hdr_nsamples(hdr) != bcf_hdr_nsamples(args->out_hdr) )
            error("Different number of samples in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]);

        int j;
        for (j=0; j<bcf_hdr_nsamples(hdr); j++)
            if ( strcmp(args->out_hdr->samples[j],hdr->samples[j]) )
                error("Different sample names in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]);

        if ( args->phased_concat )
        {
            int ret = bcf_read(fp, hdr, line);
            if ( ret!=0 ) args->start_pos[i] = -2;  // empty file
            else
            {
                int chrid = bcf_hdr_id2int(args->out_hdr,BCF_DT_CTG,bcf_seqname(hdr,line));
                args->start_pos[i] = chrid==prev_chrid ? line->pos : -1;
                prev_chrid = chrid;
            }
        }
        bcf_hdr_destroy(hdr);
        hts_close(fp);
    }
    free(str.s);
    if ( line ) bcf_destroy(line);

    args->seen_seq = (int*) calloc(args->out_hdr->n[BCF_DT_CTG],sizeof(int));

    if ( args->phased_concat )
    {
        bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PQ,Number=1,Type=Integer,Description=\"Phasing Quality (bigger is better)\">");
        bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PS,Number=1,Type=Integer,Description=\"Phase Set\">");
    }
    if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat");
    args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
    if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
    if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);

    bcf_hdr_write(args->out_fh, args->out_hdr);

    if ( args->allow_overlaps )
    {
        args->files = bcf_sr_init();
        args->files->require_index = 1;
        if ( args->regions_list )
        {
            if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 )
                error("Failed to read the regions: %s\n", args->regions_list);
        }
        if ( args->remove_dups )
        {
            if ( !strcmp(args->remove_dups,"snps") ) args->files->collapse |= COLLAPSE_SNPS;
            else if ( !strcmp(args->remove_dups,"indels") ) args->files->collapse |= COLLAPSE_INDELS;
            else if ( !strcmp(args->remove_dups,"both") ) args->files->collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS;
            else if ( !strcmp(args->remove_dups,"any") ) args->files->collapse |= COLLAPSE_ANY;
            else if ( !strcmp(args->remove_dups,"all") ) args->files->collapse |= COLLAPSE_ANY;
            else if ( !strcmp(args->remove_dups,"none") ) args->files->collapse = COLLAPSE_NONE;
            else error("The -D string \"%s\" not recognised.\n", args->remove_dups);
        }
        for (i=0; i<args->nfnames; i++)
            if ( !bcf_sr_add_reader(args->files,args->fnames[i]) ) error("Failed to open %s: %s\n", args->fnames[i],bcf_sr_strerror(args->files->errnum));
    }
    else if ( args->phased_concat )
    {
        // Remove empty files from the list
        int nok = 0;
        while (1)
        {
            while ( nok<args->nfnames && args->start_pos[nok]!=-2 ) nok++;
            if ( nok==args->nfnames ) break;

            i = nok;
            while ( i<args->nfnames && args->start_pos[i]==-2 ) i++;
            if ( i==args->nfnames ) break;

            int tmp = args->start_pos[nok]; args->start_pos[nok] = args->start_pos[i]; args->start_pos[i] = tmp;
            char *str = args->fnames[nok]; args->fnames[nok] = args->fnames[i]; args->fnames[i] = str;
        }
        for (i=nok; i<args->nfnames; i++) free(args->fnames[i]);
        args->nfnames = nok;

        for (i=1; i<args->nfnames; i++)
            if ( args->start_pos[i-1]!=-1 && args->start_pos[i]!=-1 && args->start_pos[i]<args->start_pos[i-1] )
                error("The files not in ascending order: %d in %s, %d in %s\n", args->start_pos[i-1]+1,args->fnames[i-1],args->start_pos[i]+1,args->fnames[i]);

        args->prev_chr = -1;
        args->swap_phase = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
        args->nmatch = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
        args->nmism  = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
        args->phase_qual = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t));
        args->phase_set  = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t));
        args->files = bcf_sr_init();
        args->files->require_index = 1;
        args->ifname = 0;
    }
}
Пример #11
0
int subset_vcf(args_t *args, bcf1_t *line)
{
    if ( args->min_alleles && line->n_allele < args->min_alleles ) return 0; // min alleles
    if ( args->max_alleles && line->n_allele > args->max_alleles ) return 0; // max alleles
    if (args->novel || args->known)
    {
        if ( args->novel && (line->d.id[0]!='.' || line->d.id[1]!=0) ) return 0; // skip sites which are known, ID != '.'
        if ( args->known && line->d.id[0]=='.' && line->d.id[1]==0 ) return 0;  // skip sites which are novel, ID == '.'
    }

    if (args->include || args->exclude)
    {
        int line_type = bcf_get_variant_types(line);
        if ( args->include && !(line_type&args->include) ) return 0; // include only given variant types
        if ( args->exclude &&   line_type&args->exclude  ) return 0; // exclude given variant types
    }

    if ( args->filter )
    {
        int ret = filter_test(args->filter, line, NULL);
        if ( args->filter_logic==FLT_INCLUDE ) { if ( !ret ) return 0; }
        else if ( ret ) return 0;
    }

    hts_expand(int, line->n_allele, args->mac, args->ac);
    int i, an = 0, non_ref_ac = 0;
    if (args->calc_ac) {
        bcf_calc_ac(args->hdr, line, args->ac, BCF_UN_INFO|BCF_UN_FMT); // get original AC and AN values from INFO field if available, otherwise calculate
        for (i=1; i<line->n_allele; i++)
            non_ref_ac += args->ac[i];
        for (i=0; i<line->n_allele; i++)
            an += args->ac[i];
    }

    if (args->n_samples)
    {
        int non_ref_ac_sub = 0, *ac_sub = (int*) calloc(line->n_allele,sizeof(int));
        bcf_subset(args->hdr, line, args->n_samples, args->imap);
        if (args->calc_ac) {
            bcf_calc_ac(args->hsub, line, ac_sub, BCF_UN_FMT); // recalculate AC and AN
            an = 0;
            for (i=0; i<line->n_allele; i++) {
                args->ac[i] = ac_sub[i];
                an += ac_sub[i];
            }
            for (i=1; i<line->n_allele; i++)
                non_ref_ac_sub += ac_sub[i];
            if (args->private_vars) {
                if (args->private_vars == FLT_INCLUDE && !(non_ref_ac_sub > 0 && non_ref_ac == non_ref_ac_sub)) { free(ac_sub); return 0; } // select private sites
                if (args->private_vars == FLT_EXCLUDE && non_ref_ac_sub > 0 && non_ref_ac == non_ref_ac_sub) { free(ac_sub); return 0; } // exclude private sites
            }
            non_ref_ac = non_ref_ac_sub;
        }
        free(ac_sub);
    }

    bcf_fmt_t *gt_fmt;
    if ( args->gt_type && (gt_fmt=bcf_get_fmt(args->hdr,line,"GT")) )
    {
        int nhet = 0, nhom = 0, nmiss = 0;
        for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
        {
            int type = bcf_gt_type(gt_fmt,i,NULL,NULL);
            if ( type==GT_HET_RA || type==GT_HET_AA )
            {
                if ( args->gt_type==GT_NO_HET ) return 0;
                nhet = 1;
            }
            else if ( type==GT_UNKN )
            {
                if ( args->gt_type==GT_NO_MISSING ) return 0;
                nmiss = 1;
            }
            else
            {
                if ( args->gt_type==GT_NO_HOM ) return 0;
                nhom = 1;
            }
        }
        if ( args->gt_type==GT_NEED_HOM && !nhom ) return 0;
        else if ( args->gt_type==GT_NEED_HET && !nhet ) return 0;
        else if ( args->gt_type==GT_NEED_MISSING && !nmiss ) return 0;
    }

    int minor_ac = 0;
    int major_ac = 0;
    if ( args->calc_ac )
    {
        minor_ac = args->ac[0];
        major_ac = args->ac[0];
        for (i=1; i<line->n_allele; i++){
            if (args->ac[i] < minor_ac) { minor_ac = args->ac[i]; }
            if (args->ac[i] > major_ac) { major_ac = args->ac[i]; }
        }
    }

    if (args->min_ac)
    {
        if (args->min_ac_type == ALLELE_NONREF && args->min_ac>non_ref_ac) return 0; // min AC
        else if (args->min_ac_type == ALLELE_MINOR && args->min_ac>minor_ac) return 0; // min minor AC
        else if (args->min_ac_type == ALLELE_ALT1 && args->min_ac>args->ac[1]) return 0; // min 1st alternate AC
        else if (args->min_ac_type == ALLELE_MAJOR && args->min_ac > major_ac) return 0; // min major AC
        else if (args->min_ac_type == ALLELE_NONMAJOR && args->min_ac > an-major_ac) return 0; // min non-major AC
    }
    if (args->max_ac)
    {
        if (args->max_ac_type == ALLELE_NONREF && args->max_ac<non_ref_ac) return 0; // max AC
        else if (args->max_ac_type == ALLELE_MINOR && args->max_ac<minor_ac) return 0; // max minor AC
        else if (args->max_ac_type == ALLELE_ALT1 && args->max_ac<args->ac[1]) return 0; // max 1st alternate AC
        else if (args->max_ac_type == ALLELE_MAJOR && args->max_ac < major_ac) return 0; // max major AC
        else if (args->max_ac_type == ALLELE_NONMAJOR && args->max_ac < an-major_ac) return 0; // max non-major AC
    }
    if (args->min_af)
    {
        if (an == 0) return 0; // freq not defined, skip site
        if (args->min_af_type == ALLELE_NONREF && args->min_af>non_ref_ac/(double)an) return 0; // min AF
        else if (args->min_af_type == ALLELE_MINOR && args->min_af>minor_ac/(double)an) return 0; // min minor AF
        else if (args->min_af_type == ALLELE_ALT1 && args->min_af>args->ac[1]/(double)an) return 0; // min 1st alternate AF
        else if (args->min_af_type == ALLELE_MAJOR && args->min_af > major_ac/(double)an) return 0; // min major AF
        else if (args->min_af_type == ALLELE_NONMAJOR && args->min_af > (an-major_ac)/(double)an) return 0; // min non-major AF
    }
    if (args->max_af)
    {
        if (an == 0) return 0; // freq not defined, skip site
        if (args->max_af_type == ALLELE_NONREF && args->max_af<non_ref_ac/(double)an) return 0; // max AF
        else if (args->max_af_type == ALLELE_MINOR && args->max_af<minor_ac/(double)an) return 0; // max minor AF
        else if (args->max_af_type == ALLELE_ALT1 && args->max_af<args->ac[1]/(double)an) return 0; // max 1st alternate AF
        else if (args->max_af_type == ALLELE_MAJOR && args->max_af < major_ac/(double)an) return 0; // max major AF
        else if (args->max_af_type == ALLELE_NONMAJOR && args->max_af < (an-major_ac)/(double)an) return 0; // max non-major AF
    }
    if (args->uncalled) {
        if (args->uncalled == FLT_INCLUDE && an > 0) return 0; // select uncalled
        if (args->uncalled == FLT_EXCLUDE && an == 0) return 0; // skip if uncalled
    }
    if (args->calc_ac && args->update_info) {
        bcf_update_info_int32(args->hdr, line, "AC", &args->ac[1], line->n_allele-1);
        bcf_update_info_int32(args->hdr, line, "AN", &an, 1);
    }
    if (args->trim_alts)
    {
        int ret = bcf_trim_alleles(args->hsub ? args->hsub : args->hdr, line);
        if ( ret==-1 ) error("Error: some GT index is out of bounds at %s:%d\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), line->pos+1);
    }
    if (args->phased) {
        int phased = bcf_all_phased(args->hdr, line);
        if (args->phased == FLT_INCLUDE && !phased) { return 0; } // skip unphased
        if (args->phased == FLT_EXCLUDE && phased) { return 0; } // skip phased
    }
    if (args->sites_only) bcf_subset(args->hsub ? args->hsub : args->hdr, line, 0, 0);
    return 1;
}
Пример #12
0
static void vcfroh(args_t *args, bcf1_t *line)
{
    // Are we done?
    if ( !line )
    { 
        flush_viterbi(args);
        return; 
    }
    args->ntot++;

    // Skip unwanted lines
    if ( line->rid == args->skip_rid ) return;
    if ( line->n_allele==1 ) return;    // no ALT allele
    if ( line->n_allele!=2 ) return;    // only biallelic sites
    if ( args->snps_only && !bcf_is_snp(line) ) return;

    // Initialize genetic map
    int skip_rid = 0;
    if ( args->prev_rid<0 )
    {
        args->prev_rid = line->rid;
        args->prev_pos = line->pos;
        skip_rid = load_genmap(args, line);
        if ( !skip_rid && args->vi_training ) push_rid(args, line->rid);
    }

    // New chromosome?
    if ( args->prev_rid!=line->rid )
    {
        skip_rid = load_genmap(args, line);
        if ( args->vi_training )
        {
            if ( !skip_rid ) push_rid(args, line->rid);
        }
        else
        {
            flush_viterbi(args);
            args->nsites = 0;
        }
        args->prev_rid = line->rid;
        args->prev_pos = line->pos;
    }

    if ( skip_rid )
    {
        fprintf(pysamerr,"Skipping the sequence, no genmap for %s\n", bcf_seqname(args->hdr,line));
        args->skip_rid = line->rid;
        return;
    }
    if ( args->prev_pos > line->pos ) error("The file is not sorted?!\n");

    args->prev_rid = line->rid;
    args->prev_pos = line->pos;


    // Ready for the new site
    int m = args->msites;
    hts_expand(uint32_t,args->nsites+1,args->msites,args->sites);
    if ( args->msites!=m )
        args->eprob = (double*) realloc(args->eprob,sizeof(double)*args->msites*2);

    // Set likelihoods and alternate allele frequencies
    double alt_freq, pdg[3];
    if ( parse_line(args, line, &alt_freq, pdg)<0 ) return; // something went wrong

    args->nused++;

    // Calculate emission probabilities P(D|AZ) and P(D|HW)
    double *eprob = &args->eprob[2*args->nsites];
    eprob[STATE_AZ] = pdg[0]*(1-alt_freq) + pdg[2]*alt_freq;
    eprob[STATE_HW] = pdg[0]*(1-alt_freq)*(1-alt_freq) + 2*pdg[1]*(1-alt_freq)*alt_freq + pdg[2]*alt_freq*alt_freq;

    args->sites[args->nsites] = line->pos;
    args->nsites++;
}
Пример #13
0
int parse_line(args_t *args, bcf1_t *line, double *alt_freq, double *pdg)
{
    args->nitmp = 0;

    // Set allele frequency
    int ret;
    if ( args->af_tag )
    {
        // Use an INFO tag provided by the user
        ret = bcf_get_info_float(args->hdr, line, args->af_tag, &args->AFs, &args->mAFs);
        if ( ret==1 )
            *alt_freq = args->AFs[0];
        if ( ret==-2 )
            error("Type mismatch for INFO/%s tag at %s:%d\n", args->af_tag, bcf_seqname(args->hdr,line), line->pos+1);
    }
    else if ( args->af_fname ) 
    {
        // Read AF from a file
        ret = read_AF(args->files->targets, line, alt_freq);
    }
    else
    {
        // Use GTs or AC/AN: GTs when AC/AN not present or when GTs explicitly requested by --estimate-AF
        ret = -1;
        if ( !args->estimate_AF )
        {
            int AC = -1, AN = 0;
            ret = bcf_get_info_int32(args->hdr, line, "AN", &args->itmp, &args->mitmp);
            if ( ret==1 )
            {
                AN = args->itmp[0];
                ret = bcf_get_info_int32(args->hdr, line, "AC", &args->itmp, &args->mitmp);
                if ( ret>0 )
                    AC = args->itmp[0];
            }
            if ( AN<=0 || AC<0 ) 
                ret = -1;
            else 
                *alt_freq = (double) AC/AN;
        }
        if ( ret==-1 )
            ret = estimate_AF(args, line, alt_freq);    // reads GTs into args->itmp
    }

    if ( ret<0 ) return ret;
    if ( *alt_freq==0.0 )
    {
        if ( args->dflt_AF==0 ) return -1;       // we skip sites with AF=0
        *alt_freq = args->dflt_AF;
    }

    // Set P(D|G)
    if ( args->fake_PLs )
    {
        if ( !args->nitmp )
        {
            args->nitmp = bcf_get_genotypes(args->hdr, line, &args->itmp, &args->mitmp);
            if ( args->nitmp != 2*args->nsmpl ) return -1;     // not diploid?
            args->nitmp /= args->nsmpl;
        }

        int32_t *gt = &args->itmp[args->ismpl*args->nitmp];
        if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) return -1;

        int a = bcf_gt_allele(gt[0]);
        int b = bcf_gt_allele(gt[1]);
        if ( a!=b )
        {
            pdg[0] = pdg[2] = args->unseen_PL;
            pdg[1] = 1 - 2*args->unseen_PL;
        }
        else if ( a==0 )
        {
            pdg[0] = 1 - 2*args->unseen_PL;
            pdg[1] = pdg[2] = args->unseen_PL;
        }
        else
        {
            pdg[0] = pdg[1] = args->unseen_PL;
            pdg[2] = 1 - 2*args->unseen_PL;
        }
    }
    else
    {
        args->nitmp = bcf_get_format_int32(args->hdr, line, "PL", &args->itmp, &args->mitmp);
        if ( args->nitmp != args->nsmpl*line->n_allele*(line->n_allele+1)/2. ) return -1;     // not diploid?
        args->nitmp /= args->nsmpl;

        int32_t *pl = &args->itmp[args->ismpl*args->nitmp];
        pdg[0] = pl[0] < 256 ? args->pl2p[ pl[0] ] : 1.0;
        pdg[1] = pl[1] < 256 ? args->pl2p[ pl[1] ] : 1.0;
        pdg[2] = pl[2] < 256 ? args->pl2p[ pl[2] ] : 1.0;

        double sum = pdg[0] + pdg[1] + pdg[2];
        if ( !sum ) return -1;
        pdg[0] /= sum;
        pdg[1] /= sum;
        pdg[2] /= sum;
    }

    return 0;
}
Пример #14
0
int beds_fill_buffer(struct beds_anno_file *file, bcf_hdr_t *hdr_out, bcf1_t *line)
{
    assert(file->idx);
    int tid = tbx_name2id(file->idx, bcf_seqname(hdr_out, line));
    // if cached this region already, just skip refill. this is different from vcfs_fill_buffer()
    if ( tid == file->last_id && file->last_start <= line->pos + 1 && file->last_end > line->pos)
	return -1;

    if ( tid == -1 ) {
        if ( file->no_such_chrom == 0 ) {
            warnings("no chromosome %s found in databases %s.", bcf_seqname(hdr_out, line), file->fname);
            file->no_such_chrom = 1;
        }
	return 1;
    } else {
        file->no_such_chrom = 0;
    }

    // empty cache
    file->cached = 0;
    int i;
    hts_itr_t *itr = tbx_itr_queryi(file->idx, tid, line->pos, line->pos + line->rlen);
    if ( itr == NULL )
	return 1;
    // if buffer refilled, init last start and end
    file->last_id = tid;
    file->last_start = -1;
    file->last_end = -1;    
    while (1) {
	if ( file->cached == file->max ) {
	    file->max += 8;
	    file->buffer = (struct beds_anno_tsv**)realloc(file->buffer, sizeof(struct beds_anno_tsv*)*file->max);
	    for (i = 8; i > 0; --i)
		file->buffer[file->max - i] = beds_anno_tsv_init();
	}

	if ( tbx_itr_next(file->fp, file->idx, itr, &file->buffer[file->cached]->string) < 0)
	    break;
	struct beds_anno_tsv *tsv = file->buffer[file->cached];
	convert_string_tsv(tsv);
        // Skip if variant located outside of region.
        
        if (line->pos < tsv->start || line->pos >= tsv->end)
            continue;
        if (tsv->end - tsv->start == 1 && line->pos != tsv->start)
            continue;
        file->cached++;
	if ( file->last_end == -1 ) {
	    file->last_end = tsv->end;
	    file->last_start = tsv->start;
	    continue;
	} 
	if ( file->last_end < tsv->end )
	    file->last_end = tsv->end;
	if ( file->last_start > tsv->start )
	    file->last_start = tsv->start;
        
    }
    // if buffer is filled return 0, else return 1
    return file->cached ? 0 : 1;    
}
Пример #15
0
static void apply_variant(args_t *args, bcf1_t *rec)
{
    if ( rec->n_allele==1 ) return;

    if ( rec->pos <= args->fa_frz_pos )
    {
        fprintf(pysamerr,"The site %s:%d overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),rec->pos+1);
        return;
    }
    if ( args->mask )
    {
        char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
        int start = rec->pos;
        int end   = rec->pos + rec->rlen - 1;
        if ( regidx_overlap(args->mask, chr,start,end,NULL) ) return;
    }

    int i, ialt = 1;
    if ( args->isample >= 0 )
    {
        bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT");
        if ( !fmt ) return;
        if ( args->haplotype )
        {
            if ( args->haplotype > fmt->n ) error("Can't apply %d-th haplotype at %s:%d\n", args->haplotype,bcf_seqname(args->hdr,rec),rec->pos+1);
            uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + args->haplotype - 1;
            ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
            if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
            ialt = bcf_gt_allele(ialt);
        }
        else if ( args->output_iupac ) 
        {
            uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample;
            ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
            if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
            ialt = bcf_gt_allele(ialt);

            int jalt;
            if ( fmt->n>1 )
            {
                ptr = fmt->p + fmt->size*args->isample + 1;
                jalt = bcf_dec_int1(ptr, fmt->type, &ignore);
                if ( bcf_gt_is_missing(jalt) || jalt==bcf_int32_vector_end ) jalt = ialt;
                else jalt = bcf_gt_allele(jalt);
            }
            else jalt = ialt;
            if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
            if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp?
            {
                char ial = rec->d.allele[ialt][0];
                char jal = rec->d.allele[jalt][0];
                rec->d.allele[ialt][0] = gt2iupac(ial,jal);
            }
        }
        else
        {
            for (i=0; i<fmt->n; i++)
            {
                uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + i;
                ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
                if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
                ialt = bcf_gt_allele(ialt);
                if ( ialt ) break;
            }
        }
        if ( !ialt ) return;  // ref allele
        if ( rec->n_allele <= ialt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
    }
    else if ( args->output_iupac && !rec->d.allele[0][1] && !rec->d.allele[1][1] )
    {
        char ial = rec->d.allele[0][0];
        char jal = rec->d.allele[1][0];
        rec->d.allele[1][0] = gt2iupac(ial,jal);
    }

    int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off;
    if ( idx<0 || idx>=args->fa_buf.l ) 
        error("FIXME: %s:%d .. idx=%d, ori_pos=%d, len=%d, off=%d\n",bcf_seqname(args->hdr,rec),rec->pos+1,idx,args->fa_ori_pos,args->fa_buf.l,args->fa_mod_off);

    // sanity check the reference base
    int len_diff = 0, alen = 0;
    if ( rec->d.allele[ialt][0]=='<' )
    {
        if ( strcasecmp(rec->d.allele[ialt], "<DEL>") )
            error("Symbolic alleles other than <DEL> are currently not supported: %s at %s:%d\n",rec->d.allele[ialt],bcf_seqname(args->hdr,rec),rec->pos+1);
        assert( rec->d.allele[0][1]==0 );           // todo: for now expecting strlen(REF) = 1
        len_diff = 1-rec->rlen;
        rec->d.allele[ialt] = rec->d.allele[0];     // according to VCF spec, REF must precede the event
        alen = strlen(rec->d.allele[ialt]);
    }
    else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) )
    {
        // fprintf(pysamerr,"%d .. [%s], idx=%d ori=%d off=%d\n",args->fa_ori_pos,args->fa_buf.s,idx,args->fa_ori_pos,args->fa_mod_off);
        char tmp = 0;
        if ( args->fa_buf.l - idx > rec->rlen ) 
        { 
            tmp = args->fa_buf.s[idx+rec->rlen];
            args->fa_buf.s[idx+rec->rlen] = 0;
        }
        error(
            "The fasta sequence does not match the REF allele at %s:%d:\n"
            "   .vcf: [%s]\n" 
            "   .vcf: [%s] <- (ALT)\n" 
            "   .fa:  [%s]%c%s\n",
            bcf_seqname(args->hdr,rec),rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx, 
            tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:""
            );
    }
    else
    {
        alen = strlen(rec->d.allele[ialt]);
        len_diff = alen - rec->rlen;
    }

    if ( args->fa_case )
        for (i=0; i<alen; i++) rec->d.allele[ialt][i] = toupper(rec->d.allele[ialt][i]);
    else
        for (i=0; i<alen; i++) rec->d.allele[ialt][i] = tolower(rec->d.allele[ialt][i]);

    if ( len_diff <= 0 )
    {
        // deletion or same size event
        for (i=0; i<alen; i++)
            args->fa_buf.s[idx+i] = rec->d.allele[ialt][i];
        if ( len_diff )
            memmove(args->fa_buf.s+idx+alen,args->fa_buf.s+idx+rec->rlen,args->fa_buf.l-idx-rec->rlen);
    }
    else
    {
        // insertion
        ks_resize(&args->fa_buf, args->fa_buf.l + len_diff);
        memmove(args->fa_buf.s + idx + rec->rlen + len_diff, args->fa_buf.s + idx + rec->rlen, args->fa_buf.l - idx - rec->rlen);
        for (i=0; i<alen; i++)
            args->fa_buf.s[idx+i] = rec->d.allele[ialt][i];
    }
    if (args->chain && len_diff != 0)
    {
        // If first nucleotide of both REF and ALT are the same... (indels typically include the nucleotide before the variant)
        if ( strncasecmp(rec->d.allele[0],rec->d.allele[ialt],1) == 0)
        {
            // ...extend the block by 1 bp: start is 1 bp further and alleles are 1 bp shorter
            push_chain_gap(args->chain, rec->pos + 1, rec->rlen - 1, rec->pos + 1 + args->fa_mod_off, alen - 1);
        }
        else
        {
            // otherwise, just the coordinates of the variant as given
            push_chain_gap(args->chain, rec->pos, rec->rlen, rec->pos + args->fa_mod_off, alen);
        }
    }
    args->fa_buf.l += len_diff;
    args->fa_mod_off += len_diff;
    args->fa_frz_pos  = rec->pos + rec->rlen - 1;
}
Пример #16
0
static void phased_flush(args_t *args)
{
    if ( !args->nbuf ) return;

    bcf_hdr_t *ahdr = args->files->readers[0].header;
    bcf_hdr_t *bhdr = args->files->readers[1].header;

    int i, j, nsmpl = bcf_hdr_nsamples(args->out_hdr);
    static int gt_absent_warned = 0;

    for (i=0; i<args->nbuf; i+=2)
    {
        bcf1_t *arec = args->buf[i];
        bcf1_t *brec = args->buf[i+1];

        int nGTs = bcf_get_genotypes(ahdr, arec, &args->GTa, &args->mGTa);
        if ( nGTs < 0 ) 
        {
            if ( !gt_absent_warned )
            {
                fprintf(stderr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(ahdr,arec), arec->pos+1);
                gt_absent_warned = 1;
            }
            continue;
        }
        if ( nGTs != 2*nsmpl ) continue;    // not diploid
        nGTs = bcf_get_genotypes(bhdr, brec, &args->GTb, &args->mGTb);
        if ( nGTs < 0 )
        {
            if ( !gt_absent_warned )
            {
                fprintf(stderr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(bhdr,brec), brec->pos+1);
                gt_absent_warned = 1;
            }
            continue;
        }
        if ( nGTs != 2*nsmpl ) continue;    // not diploid

        for (j=0; j<nsmpl; j++)
        {
            int *gta = &args->GTa[j*2];
            int *gtb = &args->GTb[j*2];
            if ( gta[1]==bcf_int32_vector_end || gtb[1]==bcf_int32_vector_end ) continue;
            if ( bcf_gt_is_missing(gta[0]) || bcf_gt_is_missing(gta[1]) || bcf_gt_is_missing(gtb[0]) || bcf_gt_is_missing(gtb[1]) ) continue;
            if ( !bcf_gt_is_phased(gta[1]) || !bcf_gt_is_phased(gtb[1]) ) continue;
            if ( bcf_gt_allele(gta[0])==bcf_gt_allele(gta[1]) || bcf_gt_allele(gtb[0])==bcf_gt_allele(gtb[1]) ) continue;
            if ( bcf_gt_allele(gta[0])==bcf_gt_allele(gtb[0]) && bcf_gt_allele(gta[1])==bcf_gt_allele(gtb[1]) )
            {
                if ( args->swap_phase[j] ) args->nmism[j]++; else args->nmatch[j]++;
            }
            if ( bcf_gt_allele(gta[0])==bcf_gt_allele(gtb[1]) && bcf_gt_allele(gta[1])==bcf_gt_allele(gtb[0]) )
            {
                if ( args->swap_phase[j] ) args->nmatch[j]++; else args->nmism[j]++;
            }
        }
    }
    for (i=0; i<args->nbuf/2; i+=2)
    {
        bcf1_t *arec = args->buf[i];
        bcf_translate(args->out_hdr, args->files->readers[0].header, arec);
        if ( args->nswap )
            phase_update(args, args->out_hdr, arec);
        if ( !args->compact_PS || args->phase_set_changed )
        {
            bcf_update_format_int32(args->out_hdr,arec,"PS",args->phase_set,nsmpl);
            args->phase_set_changed = 0;
        }
        bcf_write(args->out_fh, args->out_hdr, arec);

        if ( arec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%d vs %d  [1]\n", bcf_seqname(args->files->readers[0].header,arec),arec->pos+1,args->prev_pos_check+1);
        args->prev_pos_check = arec->pos;
    }
    args->nswap = 0;
    for (j=0; j<nsmpl; j++)
    {
        if ( args->nmatch[j] >= args->nmism[j] )
            args->swap_phase[j] = 0;
        else
        {
            args->swap_phase[j] = 1;
            args->nswap++;
        }
        if ( args->nmatch[j] && args->nmism[j] )
        {
            // Entropy-inspired quality. The factor 0.7 shifts and scales to (0,1)
            double f = (double)args->nmatch[j]/(args->nmatch[j]+args->nmism[j]);
            args->phase_qual[j] = 99*(0.7 + f*log(f) + (1-f)*log(1-f))/0.7;
        }
        else
            args->phase_qual[j] = 99;
        args->nmatch[j] = 0;
        args->nmism[j]  = 0;
    }
    int PQ_printed = 0;
    for (; i<args->nbuf; i+=2)
    {
        bcf1_t *brec = args->buf[i+1];
        bcf_translate(args->out_hdr, args->files->readers[1].header, brec);
        if ( !PQ_printed )
        {
            bcf_update_format_int32(args->out_hdr,brec,"PQ",args->phase_qual,nsmpl);
            PQ_printed = 1;
            for (j=0; j<nsmpl; j++)
                if ( args->phase_qual[j] < args->min_PQ ) 
                {
                    args->phase_set[j] = brec->pos+1;
                    args->phase_set_changed = 1;
                }
                else if ( args->compact_PS ) args->phase_set[j] = bcf_int32_missing;
        }
        if ( args->nswap )
            phase_update(args, args->out_hdr, brec);
        if ( !args->compact_PS || args->phase_set_changed )
        {
            bcf_update_format_int32(args->out_hdr,brec,"PS",args->phase_set,nsmpl);
            args->phase_set_changed = 0;
        }
        bcf_write(args->out_fh, args->out_hdr, brec);

        if ( brec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%d vs %d  [2]\n", bcf_seqname(args->files->readers[1].header,brec),brec->pos+1,args->prev_pos_check+1);
        args->prev_pos_check = brec->pos;
    }
    args->nbuf = 0;
}
Пример #17
0
int _reader_next_line(bcf_srs_t *files)
{
    int i, min_pos = INT_MAX;

    // Loop until next suitable line is found or all readers have finished
    while ( 1 )
    {
        // Get all readers ready for the next region.
        if ( files->regions && _readers_next_region(files)<0 ) break;

        // Fill buffers
        const char *chr = NULL;
        for (i=0; i<files->nreaders; i++)
        {
            _reader_fill_buffer(files, &files->readers[i]);

            // Update the minimum coordinate
            if ( !files->readers[i].nbuffer ) continue;
            if ( min_pos > files->readers[i].buffer[1]->pos )
            {
                min_pos = files->readers[i].buffer[1]->pos;
                chr = bcf_seqname(files->readers[i].header, files->readers[i].buffer[1]);
            }
        }
        if ( min_pos==INT_MAX )
        {
            if ( !files->regions ) break;
            continue;
        }

        // Skip this position if not present in targets
        if ( files->targets )
        {
            int ret = bcf_sr_regions_overlap(files->targets, chr, min_pos, min_pos);
            if ( (!files->targets_exclude && ret<0) || (files->targets_exclude && !ret) )
            {
                // Remove all lines with this position from the buffer
                for (i=0; i<files->nreaders; i++)
                    if ( files->readers[i].nbuffer && files->readers[i].buffer[1]->pos==min_pos )
                        _reader_shift_buffer(&files->readers[i]);
                min_pos = INT_MAX;
                continue;
            }
        }

        break;  // done: min_pos is set
    }

    // There can be records with duplicate positions. Set the active line intelligently so that
    // the alleles match.
    int nret = 0;   // number of readers sharing the position
    bcf1_t *first = NULL;   // record which will be used for allele matching
    for (i=0; i<files->nreaders; i++)
    {
        files->has_line[i] = 0;

        // Skip readers with no records at this position
        if ( !files->readers[i].nbuffer || files->readers[i].buffer[1]->pos!=min_pos ) continue;

        // Until now buffer[0] of all reader was empty and the lines started at buffer[1].
        // Now lines which are ready to be output will be moved to buffer[0].
        if ( _reader_match_alleles(files, &files->readers[i], first) < 0 ) continue;
        if ( !first ) first = files->readers[i].buffer[0];

        nret++;
        files->has_line[i] = 1;
    }
    return nret;
}
Пример #18
0
static void concat(args_t *args)
{
    int i;
    if ( args->phased_concat )  // phased concat
    {
        // keep only two open files at a time
        while ( args->ifname < args->nfnames )
        {
            int new_file = 0;
            while ( args->files->nreaders < 2 && args->ifname < args->nfnames )
            {
                if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) ) error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum));
                new_file = 1;

                args->ifname++;
                if ( args->start_pos[args->ifname-1]==-1 ) break;   // new chromosome, start with only one file open
                if ( args->ifname < args->nfnames && args->start_pos[args->ifname]==-1 ) break; // next file starts on a different chromosome
            }

            // is there a line from the previous run? Seek the newly opened reader to that position
            int seek_pos = -1;
            int seek_chr = -1;
            if ( bcf_sr_has_line(args->files,0) )
            {
                bcf1_t *line = bcf_sr_get_line(args->files,0);
                bcf_sr_seek(args->files, bcf_seqname(args->files->readers[0].header,line), line->pos);
                seek_pos = line->pos;
                seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[0].header,line));
            }
            else if ( new_file )
                bcf_sr_seek(args->files,NULL,0);  // set to start

            int nret;
            while ( (nret = bcf_sr_next_line(args->files)) )
            {
                if ( !bcf_sr_has_line(args->files,0) )  // no input from the first reader
                {
                    // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped
                    if ( ! bcf_sr_region_done(args->files,0) ) continue;

                    phased_flush(args);
                    bcf_sr_remove_reader(args->files, 0);
                }

                // Get a line to learn about current position
                for (i=0; i<args->files->nreaders; i++)
                    if ( bcf_sr_has_line(args->files,i) ) break;
                bcf1_t *line = bcf_sr_get_line(args->files,i);

                // This can happen after bcf_sr_seek: indel may start before the coordinate which we seek to.
                if ( seek_chr>=0 && seek_pos>line->pos && seek_chr==bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line)) ) continue;
                seek_pos = seek_chr = -1;

                //  Check if the position overlaps with the next, yet unopened, reader
                int must_seek = 0;
                while ( args->ifname < args->nfnames && args->start_pos[args->ifname]!=-1 && line->pos >= args->start_pos[args->ifname] )
                {
                    must_seek = 1;
                    if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) ) error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum));
                    args->ifname++;
                }
                if ( must_seek )
                {
                    bcf_sr_seek(args->files, bcf_seqname(args->files->readers[i].header,line), line->pos);
                    seek_pos = line->pos;
                    seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line));
                    continue;
                }

                // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped
                if ( args->files->nreaders>1 && !bcf_sr_has_line(args->files,1) && !bcf_sr_region_done(args->files,1) ) continue;

                phased_push(args, bcf_sr_get_line(args->files,0), args->files->nreaders>1 ? bcf_sr_get_line(args->files,1) : NULL);
            }

            if ( args->files->nreaders )
            {
                phased_flush(args);
                while ( args->files->nreaders )
                    bcf_sr_remove_reader(args->files, 0);
            }
        }
    }
    else if ( args->files )  // combining overlapping files, using synced reader
    {
        while ( bcf_sr_next_line(args->files) )
        {
            for (i=0; i<args->files->nreaders; i++)
            {
                bcf1_t *line = bcf_sr_get_line(args->files,i);
                if ( !line ) continue;
                bcf_translate(args->out_hdr, args->files->readers[i].header, line);
                bcf_write1(args->out_fh, args->out_hdr, line);
                if ( args->remove_dups ) break;
            }
        }
    }
    else    // concatenating
    {
        kstring_t tmp = {0,0,0};
        int prev_chr_id = -1, prev_pos;
        bcf1_t *line = bcf_init();
        for (i=0; i<args->nfnames; i++)
        {
            htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]);
            bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]);
            if ( !fp->is_bin && args->output_type&FT_VCF )
            {
                line->max_unpack = BCF_UN_STR;
                // if VCF is on both input and output, avoid VCF to BCF conversion
                while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 )
                {
                    char *str = fp->line.s;
                    while ( *str && *str!='\t' ) str++;
                    tmp.l = 0;
                    kputsn(fp->line.s,str-fp->line.s,&tmp);
                    int chr_id = bcf_hdr_name2id(args->out_hdr, tmp.s);
                    if ( chr_id<0 ) error("The sequence \"%s\" not defined in the header: %s\n(Quick workaround: index the file.)\n", tmp.s, args->fnames[i]);
                    if ( prev_chr_id!=chr_id )
                    {
                        prev_pos = -1;
                        if ( args->seen_seq[chr_id] )
                            error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", tmp.s);
                    }
                    char *end;
                    int pos = strtol(str+1,&end,10) - 1;
                    if ( end==str+1 ) error("Could not parse line: %s\n", fp->line.s);
                    if ( prev_pos > pos )
                        error("The chromosome block %s is not sorted, consider running with -a.\n", tmp.s);
                    args->seen_seq[chr_id] = 1;
                    prev_chr_id = chr_id;

                    if ( vcf_write_line(args->out_fh, &fp->line)!=0 ) error("Failed to write %d bytes\n", fp->line.l);
                }
            }
            else
            {
                // BCF conversion is required
                line->max_unpack = 0;
                while ( bcf_read(fp, hdr, line)==0 )
                {
                    bcf_translate(args->out_hdr, hdr, line);

                    if ( prev_chr_id!=line->rid )
                    {
                        prev_pos = -1;
                        if ( args->seen_seq[line->rid] )
                            error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", bcf_seqname(args->out_hdr, line));
                    }
                    if ( prev_pos > line->pos )
                        error("The chromosome block %s is not sorted, consider running with -a.\n", bcf_seqname(args->out_hdr, line));
                    args->seen_seq[line->rid] = 1;
                    prev_chr_id = line->rid;

                    if ( bcf_write(args->out_fh, args->out_hdr, line)!=0 ) error("Failed to write\n");
                }
            }
            bcf_hdr_destroy(hdr);
            hts_close(fp);
        }
        bcf_destroy(line);
        free(tmp.s);
    }
}
Пример #19
0
static int query_regions(args_t *args, char *fname, char **regs, int nregs)
{
    int i;
    htsFile *fp = hts_open(fname,"r");
    if ( !fp ) error("Could not read %s\n", fname);
    enum htsExactFormat format = hts_get_format(fp)->format;

    regidx_t *reg_idx = NULL;
    if ( args->targets_fname )
    {
        reg_idx = regidx_init(args->targets_fname, NULL, NULL, 0, NULL);
        if ( !reg_idx ) error("Could not read %s\n", args->targets_fname);
    }

    if ( format == bcf )
    {
        htsFile *out = hts_open("-","w");
        if ( !out ) error("Could not open stdout\n", fname);
        hts_idx_t *idx = bcf_index_load(fname);
        if ( !idx ) error("Could not load .csi index of %s\n", fname);
        bcf_hdr_t *hdr = bcf_hdr_read(fp);
        if ( !hdr ) error("Could not read the header: %s\n", fname);
        if ( args->print_header )
            bcf_hdr_write(out,hdr);
        if ( !args->header_only )
        {
            bcf1_t *rec = bcf_init();
            for (i=0; i<nregs; i++)
            {
                hts_itr_t *itr = bcf_itr_querys(idx,hdr,regs[i]);
                while ( bcf_itr_next(fp, itr, rec) >=0 )
                {
                    if ( reg_idx && !regidx_overlap(reg_idx, bcf_seqname(hdr,rec),rec->pos,rec->pos+rec->rlen-1, NULL) ) continue;
                    bcf_write(out,hdr,rec);
                }
                tbx_itr_destroy(itr);
            }
            bcf_destroy(rec);
        }
        if ( hts_close(out) ) error("hts_close returned non-zero status for stdout\n");
        bcf_hdr_destroy(hdr);
        hts_idx_destroy(idx);
    }
    else if ( format==vcf || format==sam || format==unknown_format )
    {
        tbx_t *tbx = tbx_index_load(fname);
        if ( !tbx ) error("Could not load .tbi/.csi index of %s\n", fname);
        kstring_t str = {0,0,0};
        if ( args->print_header )
        {
            while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 )
            {
                if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break;
                puts(str.s);
            }
        }
        if ( !args->header_only )
        {
            int nseq;
            const char **seq = NULL;
            if ( reg_idx ) seq = tbx_seqnames(tbx, &nseq);
            for (i=0; i<nregs; i++)
            {
                hts_itr_t *itr = tbx_itr_querys(tbx, regs[i]);
                if ( !itr ) continue;
                while (tbx_itr_next(fp, tbx, itr, &str) >= 0)
                {
                    if ( reg_idx && !regidx_overlap(reg_idx,seq[itr->curr_tid],itr->curr_beg,itr->curr_end, NULL) ) continue;
                    puts(str.s);
                }
                tbx_itr_destroy(itr);
            }
            free(seq);
        }
        free(str.s);
        tbx_destroy(tbx);
    }
    else if ( format==bam )
        error("Please use \"samtools view\" for querying BAM files.\n");

    if ( reg_idx ) regidx_destroy(reg_idx);
    if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname);

    for (i=0; i<nregs; i++) free(regs[i]);
    free(regs);
    return 0;
}
Пример #20
0
static void init_data(args_t *args)
{
    bcf_srs_t *files = bcf_sr_init();
    if ( args->regions_list )
    {
        if ( bcf_sr_set_regions(files, args->regions_list, args->regions_is_file)<0 )
            error("Failed to read the regions: %s\n", args->regions_list);
    }
    if ( args->targets_list )
    {
        if ( bcf_sr_set_targets(files, args->targets_list, args->targets_is_file, 0)<0 )
            error("Failed to read the targets: %s\n", args->targets_list);
    }
    if ( !bcf_sr_add_reader(files, args->fname) ) error("Failed to open %s: %s\n", args->fname,bcf_sr_strerror(files->errnum));
    bcf_hdr_t *hdr = files->readers[0].header;
    if ( !args->sample )
    {
        if ( bcf_hdr_nsamples(hdr)>1 ) error("Missing the option -s, --sample\n");
        args->sample = hdr->samples[0];
    }
    else if ( bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,args->sample)<0 ) error("No such sample: %s\n", args->sample);
    int ret = bcf_hdr_set_samples(hdr, args->sample, 0);
    if ( ret<0 ) error("Error setting the sample: %s\n", args->sample);

    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,bcf_hdr_id2int(hdr,BCF_DT_ID,"BAF")) )
        error("The tag FORMAT/BAF is not present in the VCF: %s\n", args->fname);

    int i;
    args->xvals = (double*) calloc(args->nbins,sizeof(double));
    for (i=0; i<args->nbins; i++) args->xvals[i] = 1.0*i/(args->nbins-1);

    // collect BAF distributions for all chromosomes
    int idist = -1, nbaf = 0, nprocessed = 0, ntotal = 0, prev_chr = -1;
    float *baf = NULL;
    while ( bcf_sr_next_line(files) )
    {
        ntotal++;

        bcf1_t *line = bcf_sr_get_line(files,0);
        if ( bcf_get_format_float(hdr,line,"BAF",&baf,&nbaf) != 1 ) continue;
        if ( bcf_float_is_missing(baf[0]) ) continue;

        nprocessed++;

        if ( prev_chr==-1 || prev_chr!=line->rid )
        {
            // new chromosome
            idist = args->ndist++;
            args->dist = (dist_t*) realloc(args->dist, sizeof(dist_t)*args->ndist);
            memset(&args->dist[idist],0,sizeof(dist_t));
            args->dist[idist].chr   = strdup(bcf_seqname(hdr,line));
            args->dist[idist].yvals = (double*) calloc(args->nbins,sizeof(double));
            args->dist[idist].xvals = args->xvals;
            args->dist[idist].nvals = args->nbins;
            prev_chr = line->rid;
        }
        int bin = baf[0]*(args->nbins-1);
        args->dist[idist].yvals[bin]++;   // the distribution
    }
    free(baf);
    bcf_sr_destroy(files);

    for (idist=0; idist<args->ndist; idist++)
    {
        #if 0
            int j;
            for (j=0; j<args->nbins; j++)
            {
                double x = args->dist[idist].xvals[j];
                args->dist[idist].yvals[j] = exp(-(x-0.5)*(x-0.5)/1e-3);
            }
        #endif
        init_dist(args, &args->dist[idist],args->verbose);
    }

    args->dat_fp = open_file(&args->dat_fname,"w","%s/dist.dat", args->output_dir);
    fprintf(args->dat_fp, "# This file was produced by: bcftools polysomy(%s+htslib-%s), the command line was:\n", bcftools_version(),hts_version());
    fprintf(args->dat_fp, "# \t bcftools %s ", args->argv[0]);
    for (i=1; i<args->argc; i++)
        fprintf(args->dat_fp, " %s",args->argv[i]);
    fprintf(args->dat_fp,"\n#\n");
    fprintf(args->dat_fp,"# DIST\t[2]Chrom\t[3]BAF\t[4]Normalized Count\n");
    fprintf(args->dat_fp,"# FIT\t[2]Goodness of Fit\t[3]iFrom\t[4]iTo\t[5]The Fitted Function\n");
    fprintf(args->dat_fp,"# CN\t[2]Chrom\t[3]Estimated Copy Number\t[4]Absolute fit deviation\n");

    char *fname = NULL;
    FILE *fp = open_file(&fname,"w","%s/dist.py", args->output_dir);
//-------- matplotlib script --------------
    fprintf(fp,
        "#!/usr/bin/env python\n"
        "#\n"
        "import matplotlib as mpl\n"
        "mpl.use('Agg')\n"
        "import matplotlib.pyplot as plt\n"
        "import csv,sys,argparse\n"
        "from math import exp\n"
        "\n"
        "outdir = '%s'\n"
        "\n"
        "def read_dat(dat,fit,cn):\n"
        "   csv.register_dialect('tab', delimiter='\t', quoting=csv.QUOTE_NONE)\n"
        "   with open(outdir+'/dist.dat', 'rb') as f:\n"
        "      reader = csv.reader(f, 'tab')\n"
        "      for row in reader:\n"
        "          if row[0][0]=='#': continue\n"
        "          type = row[0]\n"
        "          chr  = row[1]\n"
        "          if type=='DIST':\n"
        "              if chr not in dat: dat[chr] = []\n"
        "              dat[chr].append(row)\n"
        "          elif type=='FIT':\n"
        "              if chr not in fit: fit[chr] = []\n"
        "              fit[chr].append(row)\n"
        "          elif type=='CN':\n"
        "              cn[chr] = row[2]\n"
        "\n"
        "def plot_dist(dat,fit,chr):\n"
        "   fig, ax = plt.subplots(1, 1, figsize=(7,5))\n"
        "   ax.plot([x[2] for x in dat[chr]],[x[3] for x in dat[chr]],'k-',label='Distribution')\n"
        "   if chr in fit:\n"
        "       for i in range(len(fit[chr])):\n"
        "           pfit = fit[chr][i]\n"
        "           exec('def xfit(x): return '+pfit[5])\n"
        "           istart = int(pfit[3])\n"
        "           iend   = int(pfit[4])+1\n"
        "           vals   = dat[chr][istart:iend]\n"
        "           args   = {}\n"
        "           if i==0: args = {'label':'Target to Fit'}\n"
        "           ax.plot([x[2] for x in vals],[x[3] for x in vals],'r-',**args)\n"
        "           if i==0: args = {'label':'Best Fit'}\n"
        "           ax.plot([x[2] for x in vals],[xfit(float(x[2])) for x in vals],'g-',**args)\n"
        "   ax.set_title('BAF distribution, chr'+chr)\n"
        "   ax.set_xlabel('BAF')\n"
        "   ax.set_ylabel('Frequency')\n"
        "   ax.legend(loc='best',prop={'size':7},frameon=False)\n"
        "   plt.savefig(outdir+'/dist.chr'+chr+'.png')\n"
        "   plt.close()\n"
        "\n"
        "def plot_copy_number(cn):\n"
        "   fig, ax = plt.subplots(1, 1, figsize=(7,5))\n"
        "   xlabels = sorted(cn.keys())\n"
        "   xvals = range(len(xlabels))\n"
        "   yvals = [float(cn[x]) for x in xlabels]\n"
        "   ax.plot(xvals,yvals,'o',color='red')\n"
        "   for i in range(len(xvals)):\n"
        "       if yvals[i]==-1: ax.annotate('?', xy=(xvals[i],0.5),va='center',ha='center',color='red',fontweight='bold')\n"
        "   ax.tick_params(axis='both', which='major', labelsize=9)\n"
        "   ax.set_xticks(xvals)\n"
        "   ax.set_xticklabels(xlabels,rotation=45)\n"
        "   ax.set_xlim(-1,len(xlabels))\n"
        "   ax.set_ylim(0,5.0)\n"
        "   ax.set_yticks([1.0,2.0,3.0,4.0])\n"
        "   ax.set_xlabel('Chromosome')\n"
        "   ax.set_ylabel('Copy Number')\n"
        "   plt.savefig(outdir+'/copy-number.png')\n"
        "   plt.close()\n"
        "\n"
        "class myParser(argparse.ArgumentParser):\n"
        "   def error(self, message):\n"
        "       self.print_help()\n"
        "       sys.stderr.write('error: %%s\\n' %% message)\n"
        "       sys.exit(2)\n"
        "\n"
        "def main():\n"
        "   parser = myParser()\n"
        "   parser.add_argument('-a', '--all', action='store_true', help='Create all plots')\n"
        "   parser.add_argument('-c', '--copy-number', action='store_true', help='Create copy-number plot')\n"
        "   parser.add_argument('-d', '--distrib', metavar='CHR', help='Plot BAF distribution of a single chromosome')\n"
        "   args = parser.parse_args()\n"
        "   dat = {}; fit = {}; cn = {}\n"
        "   read_dat(dat,fit,cn)\n"
        "   if args.distrib!=None:\n"
        "       plot_dist(dat,fit,args.distrib)\n"
        "   if args.all:\n"
        "       for chr in dat: plot_dist(dat,fit,chr)\n"
        "       plot_copy_number(cn)\n"
        "   elif args.copy_number:\n"
        "       plot_copy_number(cn)\n"
        "   else:\n"
        "       for chr in dat: plot_dist(dat,fit,chr)\n"
        "\n"
        "if __name__ == '__main__':\n"
        "   main()\n",
        args->output_dir);
//---------------------------------------
    chmod(fname, S_IWUSR|S_IRUSR|S_IRGRP|S_IROTH|S_IXUSR|S_IXGRP|S_IXOTH);
    free(fname);
    fclose(fp);
}
Пример #21
0
static void warn_ploidy(bcf1_t *rec)
{
    static int warned = 0;
    if ( warned ) return;
    fprintf(stderr,"Incorrect ploidy at %s:%d, skipping the trio. (This warning is printed only once.)\n", bcf_seqname(args.hdr,rec),rec->pos+1);
    warned = 1;
}