Пример #1
0
int process_region_precise(args_t *args, char *seq, regitr_t *itr)
{
    int k = 1;
    uint32_t start = itr->reg[itr->i].start, end = itr->reg[itr->i].end;
    while ( itr->i+k<itr->n && start==itr->reg[itr->i+k].start && end==itr->reg[itr->i+k].end ) k++;
    
    int ret = ploidy_query(args->ploidy, seq, start, args->sex2ploidy, NULL, NULL);
    assert(ret);

    memset(args->counts,0,args->ncounts*sizeof(int));

    // Select 'nsites' sites spaced so that they evenly cover the whole region 
    // to get a representative sample. We index-jump as we should be checking
    // a few sites only.
    int i, rid = -1, pos, prev_pos = -1, ismpl;
    for (i=0; i<args->nsites; i++)
    {
        rid = -1;
        pos = ((i+1.0)/(args->nsites+1))*(end - start) + start;
        if ( i>0 && pos <= prev_pos ) continue;     // the vcf is too sparse
        if ( bcf_sr_seek(args->sr,seq,pos)!=0 ) return k;   // sequence not present
        if ( !bcf_sr_next_line(args->sr) ) return k;        // no sites found
        bcf1_t *rec = bcf_sr_get_line(args->sr,0);
        if ( rid==-1 ) rid = rec->rid;
        if ( rid!=rec->rid || rec->pos > end ) break;
        prev_pos = rec->pos;

        int ngts = bcf_get_genotypes(args->hdr,rec,&args->gts,&args->ngts);
        ngts /= args->nsample;
        for (ismpl=0; ismpl<args->nsample; ismpl++)
        {
            int32_t *gts = args->gts + ngts*ismpl;
            int igt, ploidy = 0;
            for (igt=0; igt<ngts; igt++)
            {
                if ( gts[igt]==bcf_int32_vector_end || bcf_gt_is_missing(gts[igt]) ) break;
                else ploidy++;
            }
            args->counts[ismpl*(args->max_ploidy+1) + ploidy]++;
            if ( args->verbose )
                fprintf(stderr,"%s:%d\t%s\tploidy=%d\n", seq,rec->pos+1,args->hdr->samples[ismpl],ploidy);
        }
    }

    for (ismpl=0; ismpl<args->nsample; ismpl++)
    {
        float sum = 0, *probs = args->sex2prob + ismpl*args->nsex;
        int *counts = args->counts + ismpl*(args->max_ploidy+1);
        for (i=0; i<args->max_ploidy+1; i++) sum += counts[i];
        if ( !sum ) continue;
        for (i=0; i<args->nsex; i++)
        {
            int ploidy = args->sex2ploidy[i];
            probs[i] *= counts[ploidy]/sum;
        }
    }

    return k;
}
Пример #2
0
static void set_ploidy(args_t *args, bcf1_t *rec)
{
    ploidy_query(args->ploidy,(char*)bcf_seqname(args->aux.hdr,rec),rec->pos,args->sex2ploidy,NULL,NULL);

    int i;
    for (i=0; i<args->nsex; i++)
        if ( args->sex2ploidy[i]!=args->sex2ploidy_prev[i] ) break;

    if ( i==args->nsex ) return;    // ploidy same as previously

    for (i=0; i<args->nsamples; i++)
    {
        if ( args->sample2sex[i]<0 )
            args->aux.ploidy[i] = -1*args->sample2sex[i];
        else
            args->aux.ploidy[i] = args->sex2ploidy[args->sample2sex[i]];
    }

    int *tmp = args->sex2ploidy; args->sex2ploidy = args->sex2ploidy_prev; args->sex2ploidy_prev = tmp;
}
Пример #3
0
int process_region_guess(args_t *args, char *seq, regitr_t *itr)
{
    int kitr = 1;
    uint32_t start = 0, end = INT_MAX;
    reg_stats_t *stats = NULL;

    // set the start and the end position
    if ( itr )
    {
        start = itr->reg[itr->i].start;
        end   = itr->reg[itr->i].end;

        // flush all records with the same coordinates
        while ( itr->i+kitr<itr->n && start==itr->reg[itr->i+kitr].start && end==itr->reg[itr->i+kitr].end ) kitr++;

        int min,max,ret = ploidy_query(args->ploidy, seq, start, args->sex2ploidy, &min, &max);
        assert(ret);
        stats = expand_regs(args, seq,start,end);
    }
    else
    {
        // background region
        int spos, epos;
        const char *ptr = hts_parse_reg(args->background, &spos, &epos);
        if ( !ptr )
            error("Could not parse the region: %s\n", args->background);
        seq = (char*) malloc(ptr - args->background + 1);
        memcpy(seq,args->background,ptr-args->background);
        seq[ptr-args->background] = 0;
        start = spos;
        end   = epos;
    }

    if ( bcf_sr_seek(args->sr,seq,start)!=0 ) 
    {
        // sequence not present
        if ( !itr ) free(seq);
        return kitr;
    }

    int ismpl, rid = bcf_hdr_name2id(args->hdr,seq);
    if ( !itr ) free(seq);

    while ( bcf_sr_next_line(args->sr) )
    {
        bcf1_t *rec = bcf_sr_get_line(args->sr,0);
        if ( rec->rid!=rid || rec->pos > end ) break;

        if ( args->guess & GUESS_GT )   // use GTs to guess the ploidy
        {
            bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT");
            if ( !fmt ) continue;
            for (ismpl=0; ismpl<args->nsample; ismpl++)
            {
                count_t *counts = stats ? &stats->counts[ismpl] : &args->bg_counts[ismpl];
                int gt = bcf_gt_type(fmt, ismpl, NULL,NULL);
                if ( gt==GT_UNKN ) counts->nmiss++;
                else if ( gt==GT_HET_RA || gt==GT_HET_AA ) counts->nhet++;
                else counts->nhom++;
            }
        }
        else    // use PLs to guess the ploidy
        {
            int gl2pl = args->guess & GUESS_PL ? 1 : -1;
            int npl = bcf_get_format_int32(args->hdr,rec,args->guess&GUESS_PL?"PL":"GL",&args->pls,&args->npls);
            if ( npl<=0 ) continue;
            npl /= args->nsample;
            for (ismpl=0; ismpl<args->nsample; ismpl++)
            {
                int32_t *ptr = args->pls + ismpl*npl;
                int phom = INT_MAX, phet = INT_MAX, ial, jal, k = 0;
                for (ial=0; ial<rec->n_allele; ial++)
                {
                    for (jal=0; jal<ial; jal++)
                    {
                        if ( ptr[k] == bcf_int32_missing || ptr[k] == bcf_int32_vector_end )  break;
                        ptr[k] *= gl2pl;
                        if ( phet > ptr[k] ) phet = ptr[k];
                        k++;
                    }
                    if ( ptr[k] == bcf_int32_missing || ptr[k] == bcf_int32_vector_end )  break;
                    ptr[k] *= gl2pl;
                    if ( phom > ptr[k] ) phom = ptr[k];
                    k++;
                }
                count_t *counts = stats ? &stats->counts[ismpl] : &args->bg_counts[ismpl];
                if ( k == rec->n_allele ) counts->nhom++;   // haploid
                else if ( phet == phom || k != rec->n_allele*(rec->n_allele+1)/2 ) counts->nmiss++;
                else if ( phet < phom ) counts->nhet++;
                else counts->nhom++;
            }
        }
    }
    return kitr;
}