예제 #1
0
/*
 *  flush_viterbi() - decode the buffered sites and print the per-site calls
 *  @args:  program state holding the HMM, the buffered site positions
 *          (args->sites) and their emission probabilities (args->eprob)
 *
 *  Returns immediately when no sites are buffered.
 *
 *  Without Viterbi training (args->vi_training unset) the whole buffer is
 *  decoded in one pass and every site is printed to stdout as
 *      chr, position+1, 0/1 call, phred_score(1 - P)
 *  where the call is 1 for STATE_AZ and P is the forward-backward value of
 *  the called state.
 *
 *  With training, the buffer holds several chromosomes delimited by
 *  args->rid_offs. Transition probabilities are re-estimated from the
 *  transitions seen on the Viterbi paths until two consecutive iterations
 *  give identical values; progress goes to pysamerr. The sites are then
 *  decoded once more and printed with ".." in the score column.
 */
static void flush_viterbi(args_t *args)
{
    int irid, isite;

    // nothing has been buffered
    if ( !args->nsites ) return;

    if ( !args->vi_training )
    {
        // one decoding pass over the whole buffer
        hmm_run_viterbi(args->hmm, args->nsites, args->eprob, args->sites);
        hmm_run_fwd_bwd(args->hmm, args->nsites, args->eprob, args->sites);
        double *posterior = hmm_get_fwd_bwd_prob(args->hmm);

        const char *chr_name = bcf_hdr_id2name(args->hdr,args->prev_rid);
        uint8_t *path = hmm_get_viterbi_path(args->hmm);
        for (isite=0; isite<args->nsites; isite++)
        {
            // two values per site in both arrays
            int is_az = 0;
            if ( path[isite*2]==STATE_AZ ) is_az = 1;
            double *site_prob = posterior + isite*2;
            printf("%s\t%d\t%d\t%.1f\n", chr_name,args->sites[isite]+1, is_az, phred_score(1.0-site_prob[is_az]));
        }
        return;
    }

    // Viterbi training across all buffered chromosomes.
    // NOTE(review): iteration stops only when both deltas are exactly zero;
    // there is no tolerance or iteration cap - confirm this always terminates.
    int niter = 0;
    for (;;)
    {
        // remember the current estimates so that the change can be measured
        double *tprob_old = hmm_get_tprob(args->hmm);
        double last_t2az = MAT(tprob_old,2,1,0);
        double last_t2hw = MAT(tprob_old,2,0,1);

        double tcounts[4] = { 0,0,0,0 };
        for (irid=0; irid<args->nrids; irid++)
        {
            // slice of eprob/sites belonging to this chromosome; the last
            // slice runs up to the end of the buffer
            int ioff = args->rid_offs[irid];
            int nsites = (irid+1==args->nrids ? args->nsites : args->rid_offs[irid+1]) - ioff;
            hmm_run_viterbi(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff);

            // tally every transition between neighbouring sites on the path
            uint8_t *path = hmm_get_viterbi_path(args->hmm);
            for (isite=0; isite+1<nsites; isite++)
            {
                int state_from = path[2*isite];
                int state_to   = path[2*(isite+1)];
                MAT(tcounts,2,state_to,state_from) += 1;
            }
        }

        // turn the counts into probabilities, one row at a time
        int row, col;
        for (row=0; row<2; row++)
        {
            int total = 0;
            for (col=0; col<2; col++) total += MAT(tcounts,2,row,col);
            if ( !total ) error("fixme: state %d not observed\n", row+1);
            for (col=0; col<2; col++) MAT(tcounts,2,row,col) /= total;
        }

        // third argument is 0 when a genetic map or recombination rate is
        // in use, 10000 otherwise
        int ntprob = ( args->genmap_fname || args->rec_rate > 0 ) ? 0 : 10000;
        hmm_set_tprob(args->hmm, tcounts, ntprob);

        double *tprob_new = hmm_get_tprob(args->hmm);
        double diff_az = fabs(MAT(tprob_new,2,1,0)-last_t2az);
        double diff_hw = fabs(MAT(tprob_new,2,0,1)-last_t2hw);
        niter++;

        fprintf(pysamerr,"%d: %f %f\n", niter,diff_az,diff_hw);

        // converged once neither transition probability moved
        if ( !(diff_az > 0.0) && !(diff_hw > 0.0) ) break;
    }

    fprintf(pysamerr, "Viterbi training converged in %d iterations to", niter);
    double *tprob_final = hmm_get_tprob(args->hmm);
    int r, c;
    for (r=0; r<2; r++)
        for (c=0; c<2; c++)
            fprintf(pysamerr, " %f", MAT(tprob_final,2,r,c));
    fprintf(pysamerr, "\n");

    // final decoding with the trained transition probabilities
    for (irid=0; irid<args->nrids; irid++)
    {
        int ioff = args->rid_offs[irid];
        int nsites = (irid+1==args->nrids ? args->nsites : args->rid_offs[irid+1]) - ioff;
        hmm_run_viterbi(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff);
        uint8_t *path = hmm_get_viterbi_path(args->hmm);

        const char *chr_name = bcf_hdr_id2name(args->hdr,args->rids[irid]);
        for (isite=0; isite<nsites; isite++)
        {
            int is_az = path[isite*2]==STATE_AZ ? 1 : 0;
            printf("%s\t%d\t%d\t..\n", chr_name,args->sites[ioff+isite]+1,is_az);
        }
    }
}
예제 #2
0
/*
 *  flush_viterbi() - decode the buffered sites, write segments and switch counts
 *  @args:  program state; reads the HMM, the buffered sites and the sample
 *          indexes, writes to args->fp and resets the per-chromosome counters
 *
 *  On the first call the output file "<prefix>.dat" is opened and the header
 *  lines are written. The Viterbi path over args->nsites sites is then scanned
 *  and one "SG" line is written for each completed run of identical states,
 *  followed by "SW" lines with the number of haplotype switches. Finally
 *  args->nsites and the het counters are reset to zero.
 *
 *  Calls error() when the file cannot be opened or when args->mode is neither
 *  C_UNRL nor C_TRIO.
 */
void flush_viterbi(args_t *args)
{
    // all three start as NULL so that an unset name is never read uninitialised
    const char *s1 = NULL, *s2 = NULL, *s3 = NULL;
    if ( args->mode==C_UNRL )
    {
        s1 = bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,args->isample);
        s2 = bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,args->jsample);
    }
    else if ( args->mode==C_TRIO )
    {
        s1 = bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,args->imother);
        s3 = bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,args->ifather);
        s2 = bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,args->ichild);
    }
    else
    {
        // without sample names every fprintf below would receive an invalid pointer
        error("Unexpected mode: %d\n", (int)args->mode);
        return;
    }

    if ( !args->fp )
    {
        // first flush: create the output file and write the header
        kstring_t str = {0,0,0};
        kputs(args->prefix, &str);
        kputs(".dat", &str);
        args->fp = fopen(str.s,"w");
        if ( !args->fp ) error("%s: %s\n", str.s,strerror(errno));
        free(str.s);
        fprintf(args->fp,"# SG, shared segment\t[2]Chromosome\t[3]Start\t[4]End\t[5]%s:1\t[6]%s:2\n",s2,s2);
        fprintf(args->fp,"# SW, number of switches\t[3]Sample\t[4]Chromosome\t[5]nHets\t[5]nSwitches\t[6]switch rate\n");
    }

    hmm_run_viterbi(args->hmm,args->nsites,args->eprob,args->sites);
    uint8_t *vpath = hmm_get_viterbi_path(args->hmm);
    int i, iprev = -1, prev_state = -1, nstates = hmm_get_nstates(args->hmm);
    int nswitch_mother = 0, nswitch_father = 0;
    const char *chr = bcf_hdr_id2name(args->hdr,args->prev_rid);
    for (i=0; i<args->nsites; i++)
    {
        int state = vpath[i*nstates];

        // a segment is closed when the state changes or the last site is reached
        if ( state!=prev_state || i+1==args->nsites )
        {
            uint32_t start = iprev>=0 ? args->sites[iprev]+1 : 1, end = i>0 ? args->sites[i-1] : 1;
            if ( args->mode==C_UNRL )
            {
                switch (prev_state)
                {
                    case UNRL_0x0x:
                        fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:1\t-\n", chr,start,end,s1); break;
                    case UNRL_0xx0:
                        fprintf(args->fp,"SG\t%s\t%d\t%d\t-\t%s:1\n", chr,start,end,s1); break;
                    case UNRL_x00x:
                        fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:2\t-\n", chr,start,end,s1); break;
                    case UNRL_x0x0:
                        fprintf(args->fp,"SG\t%s\t%d\t%d\t-\t%s:2\n", chr,start,end,s1); break;
                    case UNRL_0101:
                        fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:1\t%s:2\n", chr,start,end,s1,s1); break;
                    case UNRL_0110:
                        fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:2\t%s:1\n", chr,start,end,s1,s1); break;
                }
            }
            else if ( args->mode==C_TRIO )
            {
                switch (prev_state)
                {
                    case TRIO_AC:
                        fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:1\t%s:1\n", chr,start,end,s1,s3); break;
                    case TRIO_AD:
                        fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:1\t%s:2\n", chr,start,end,s1,s3); break;
                    case TRIO_BC:
                        fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:2\t%s:1\n", chr,start,end,s1,s3); break;
                    case TRIO_BD:
                        fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:2\t%s:2\n", chr,start,end,s1,s3); break;
                    case TRIO_CA:
                        fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:1\t%s:1\n", chr,start,end,s3,s1); break;
                    case TRIO_DA:
                        fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:2\t%s:1\n", chr,start,end,s3,s1); break;
                    case TRIO_CB:
                        fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:1\t%s:2\n", chr,start,end,s3,s1); break;
                    case TRIO_DB:
                        fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:2\t%s:2\n", chr,start,end,s3,s1); break;
                }
                // prev_state is -1 before the first site; there is no
                // transition yet and -1 is not a valid hap_switch index
                if ( prev_state>=0 )
                {
                    if ( hap_switch[state][prev_state] & SW_MOTHER ) nswitch_mother++;
                    if ( hap_switch[state][prev_state] & SW_FATHER ) nswitch_father++;
                }
            }
            iprev = i-1;
        }
        prev_state = state;
    }

    // switch rate is relative to the number of intervals between het sites
    float mrate = args->nhet_mother>1 ? (float)nswitch_mother/(args->nhet_mother-1) : 0;
    float frate = args->nhet_father>1 ? (float)nswitch_father/(args->nhet_father-1) : 0;
    fprintf(args->fp,"SW\t%s\t%s\t%d\t%d\t%f\n", s1,chr,args->nhet_mother,nswitch_mother,mrate);
    // s3 is set only in C_TRIO mode; passing NULL to %s is undefined
    if ( s3 )
        fprintf(args->fp,"SW\t%s\t%s\t%d\t%d\t%f\n", s3,chr,args->nhet_father,nswitch_father,frate);

    // start the next chromosome with an empty buffer
    args->nsites = 0;
    args->nhet_father = args->nhet_mother = 0;
}