/*
 *  process_region_precise() - weigh each sample's per-sex probabilities by the
 *  genotype ploidy observed at up to args->nsites sites of one region.
 *
 *  @args:  program state; reads ploidy, sr, hdr, nsites, nsample, nsex,
 *          max_ploidy and verbose, and updates sex2ploidy, counts, gts/ngts
 *          and sex2prob
 *  @seq:   name of the sequence the region lies on
 *  @itr:   region iterator; the region processed is itr->reg[itr->i]
 *
 *  Returns the number of consecutive iterator entries, starting at itr->i,
 *  that share the same start and end coordinates (always at least 1).
 *
 *  The function returns early, leaving args->sex2prob untouched, when the
 *  reader cannot seek to the sequence or yields no record after a seek.
 */
int process_region_precise(args_t *args, char *seq, regitr_t *itr)
{
    // Count the iterator entries with coordinates identical to the current one
    int k = 1;
    uint32_t start = itr->reg[itr->i].start, end = itr->reg[itr->i].end;
    while ( itr->i+k<itr->n && start==itr->reg[itr->i+k].start && end==itr->reg[itr->i+k].end ) k++;

    // Fills args->sex2ploidy; the per-sex values are used as ploidies below
    int ret = ploidy_query(args->ploidy, seq, start, args->sex2ploidy, NULL, NULL);
    assert(ret);
    (void) ret;     // only read by the assert, keep NDEBUG builds warning-free

    memset(args->counts,0,args->ncounts*sizeof(int));

    // Select 'nsites' sites spaced so that they evenly cover the whole region
    // to get a representative sample. We index-jump as we should be checking
    // a few sites only.
    int i, rid = -1, pos, prev_pos = -1, ismpl;
    for (i=0; i<args->nsites; i++)
    {
        pos = ((i+1.0)/(args->nsites+1))*(end - start) + start;
        if ( i>0 && pos <= prev_pos ) continue;     // the vcf is too sparse

        if ( bcf_sr_seek(args->sr,seq,pos)!=0 ) return k;   // sequence not present
        if ( !bcf_sr_next_line(args->sr) ) return k;        // no sites found

        bcf1_t *rec = bcf_sr_get_line(args->sr,0);

        // Remember the sequence id of the first record and stop as soon as a
        // record from another sequence, or one past the region end, shows up.
        // (rid must not be reset per iteration or this guard is a no-op.)
        if ( rid==-1 ) rid = rec->rid;
        if ( rid!=rec->rid || rec->pos > end ) break;
        prev_pos = rec->pos;

        // A non-positive return means there are no genotype values to look
        // at; skip the site rather than recording ploidy 0 for every sample.
        int ngts = bcf_get_genotypes(args->hdr,rec,&args->gts,&args->ngts);
        if ( ngts<=0 ) continue;
        ngts /= args->nsample;      // number of GT values per sample

        for (ismpl=0; ismpl<args->nsample; ismpl++)
        {
            // The ploidy is the number of leading alleles which are neither
            // missing nor the vector-end marker
            int32_t *gts = args->gts + ngts*ismpl;
            int igt, ploidy = 0;
            for (igt=0; igt<ngts; igt++)
            {
                if ( gts[igt]==bcf_int32_vector_end || bcf_gt_is_missing(gts[igt]) ) break;
                ploidy++;
            }

            // Each sample owns max_ploidy+1 counters. A higher observed
            // ploidy has no counter of its own and must not spill into the
            // neighbouring sample's block or past the end of the array.
            if ( ploidy <= args->max_ploidy )
                args->counts[ismpl*(args->max_ploidy+1) + ploidy]++;

            // The cast keeps the argument matching %d whatever integer type
            // rec->pos has
            if ( args->verbose )
                fprintf(stderr,"%s:%d\t%s\tploidy=%d\n", seq,(int)(rec->pos+1),args->hdr->samples[ismpl],ploidy);
        }
    }

    // Multiply each sample's probability of sex i by the fraction of counted
    // sites whose observed ploidy equals the ploidy of sex i in this region
    for (ismpl=0; ismpl<args->nsample; ismpl++)
    {
        float sum = 0, *probs = args->sex2prob + ismpl*args->nsex;
        int *counts = args->counts + ismpl*(args->max_ploidy+1);
        for (i=0; i<args->max_ploidy+1; i++) sum += counts[i];
        if ( !sum ) continue;       // nothing counted for this sample
        for (i=0; i<args->nsex; i++)
        {
            int ploidy = args->sex2ploidy[i];
            probs[i] *= counts[ploidy]/sum;
        }
    }

    return k;
}
/*
 *  set_ploidy() - refresh the per-sample ploidy array for the record's position.
 *
 *  @args:  program state; reads ploidy, aux.hdr, nsex, nsamples and
 *          sample2sex, and updates sex2ploidy, sex2ploidy_prev and aux.ploidy
 *  @rec:   record whose sequence name and position are passed to ploidy_query()
 *
 *  args->aux.ploidy is rewritten only when at least one of the per-sex values
 *  just written to args->sex2ploidy differs from args->sex2ploidy_prev.
 */
static void set_ploidy(args_t *args, bcf1_t *rec)
{
    const char *chrom = bcf_seqname(args->aux.hdr, rec);
    ploidy_query(args->ploidy, (char*)chrom, rec->pos, args->sex2ploidy, NULL, NULL);

    // Find the first sex whose ploidy differs from the previously stored one
    int isex = 0;
    while ( isex < args->nsex && args->sex2ploidy[isex] == args->sex2ploidy_prev[isex] )
        isex++;

    // Every entry matched: ploidy same as previously, nothing to update
    if ( isex == args->nsex )
        return;

    // A negative sample2sex value is stored negated as the sample's ploidy,
    // a non-negative one is used as an index into sex2ploidy
    int ismpl;
    for (ismpl = 0; ismpl < args->nsamples; ismpl++)
    {
        args->aux.ploidy[ismpl] = args->sample2sex[ismpl] < 0
            ? -1*args->sample2sex[ismpl]
            : args->sex2ploidy[args->sample2sex[ismpl]];
    }

    // Swap the two buffers so that the values just used become the "previous" set
    int *current = args->sex2ploidy;
    args->sex2ploidy      = args->sex2ploidy_prev;
    args->sex2ploidy_prev = current;
}
/*
 *  process_region_guess() - tally per-sample het/hom/missing calls across a
 *  region, using either genotypes (GT) or genotype likelihoods (PL or GL).
 *
 *  @args:  program state; reads ploidy, sr, hdr, guess, nsample and
 *          background, and updates sex2ploidy, pls/npls and the counters
 *  @seq:   name of the sequence the region lies on; ignored when itr is NULL
 *  @itr:   region iterator, or NULL to process the region given by the
 *          args->background string instead
 *
 *  With an iterator the tallies go to the counts of the reg_stats_t returned
 *  by expand_regs() (or to args->bg_counts should that be NULL); without one
 *  they go to args->bg_counts.
 *
 *  Returns the number of consecutive iterator entries, starting at itr->i,
 *  that share the same start and end coordinates; 1 when itr is NULL.
 */
int process_region_guess(args_t *args, char *seq, regitr_t *itr)
{
    int kitr = 1;
    uint32_t start = 0, end = INT_MAX;
    reg_stats_t *stats = NULL;

    // set the start and the end position
    if ( itr )
    {
        start = itr->reg[itr->i].start;
        end   = itr->reg[itr->i].end;

        // flush all records with the same coordinates
        while ( itr->i+kitr<itr->n && start==itr->reg[itr->i+kitr].start && end==itr->reg[itr->i+kitr].end ) kitr++;

        int min,max,ret = ploidy_query(args->ploidy, seq, start, args->sex2ploidy, &min, &max);
        assert(ret);
        (void) ret;     // only read by the assert, keep NDEBUG builds warning-free

        stats = expand_regs(args, seq,start,end);
    }
    else
    {
        // background region: everything up to the pointer returned by
        // hts_parse_reg() is copied out as the sequence name
        int spos, epos;
        const char *name_end = hts_parse_reg(args->background, &spos, &epos);
        if ( !name_end ) error("Could not parse the region: %s\n", args->background);

        size_t name_len = name_end - args->background;
        seq = (char*) malloc(name_len + 1);
        // error() is relied on not to return, as the parse check above already does
        if ( !seq ) error("Could not allocate memory for the region: %s\n", args->background);
        memcpy(seq,args->background,name_len);
        seq[name_len] = 0;

        start = spos;
        end   = epos;
    }

    if ( bcf_sr_seek(args->sr,seq,start)!=0 )
    {
        // sequence not present
        if ( !itr ) free(seq);
        return kitr;
    }

    // seq is heap-allocated only in the background case and is not needed
    // once the sequence id has been looked up
    int ismpl, rid = bcf_hdr_name2id(args->hdr,seq);
    if ( !itr ) free(seq);

    while ( bcf_sr_next_line(args->sr) )
    {
        bcf1_t *rec = bcf_sr_get_line(args->sr,0);
        if ( rec->rid!=rid || rec->pos > end ) break;

        if ( args->guess & GUESS_GT )   // use GTs to guess the ploidy
        {
            bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT");
            if ( !fmt ) continue;
            for (ismpl=0; ismpl<args->nsample; ismpl++)
            {
                count_t *counts = stats ? &stats->counts[ismpl] : &args->bg_counts[ismpl];
                int gt = bcf_gt_type(fmt, ismpl, NULL,NULL);
                if ( gt==GT_UNKN ) counts->nmiss++;
                else if ( gt==GT_HET_RA || gt==GT_HET_AA ) counts->nhet++;
                else counts->nhom++;
            }
        }
        else    // use PLs to guess the ploidy
        {
            // Values are multiplied by -1 when read from GL, so that in both
            // cases the smallest value wins in the comparisons below
            int gl2pl = args->guess & GUESS_PL ? 1 : -1;
            int npl = bcf_get_format_int32(args->hdr,rec,args->guess&GUESS_PL?"PL":"GL",&args->pls,&args->npls);
            if ( npl<=0 ) continue;
            npl /= args->nsample;       // number of values per sample

            for (ismpl=0; ismpl<args->nsample; ismpl++)
            {
                int32_t *pl = args->pls + ismpl*npl;
                int phom = INT_MAX, phet = INT_MAX, ial, jal, k = 0;

                // Walk the values in diploid genotype order: for each allele
                // the heterozygous pairs (jal<ial) first, then the homozygote.
                // The scan stops at the first missing or vector-end value and,
                // crucially, at the end of this sample's npl values: a vector
                // shorter than the diploid layout is not necessarily padded,
                // and running past it would read and rescale the next
                // sample's values or leave the buffer altogether.
                for (ial=0; ial<rec->n_allele; ial++)
                {
                    for (jal=0; jal<ial; jal++)
                    {
                        if ( k >= npl || pl[k] == bcf_int32_missing || pl[k] == bcf_int32_vector_end ) break;
                        pl[k] *= gl2pl;
                        if ( phet > pl[k] ) phet = pl[k];
                        k++;
                    }
                    if ( k >= npl || pl[k] == bcf_int32_missing || pl[k] == bcf_int32_vector_end ) break;
                    pl[k] *= gl2pl;
                    if ( phom > pl[k] ) phom = pl[k];
                    k++;
                }

                // k is the number of values consumed for this sample
                count_t *counts = stats ? &stats->counts[ismpl] : &args->bg_counts[ismpl];
                if ( k == rec->n_allele ) counts->nhom++;   // haploid
                else if ( phet == phom || k != rec->n_allele*(rec->n_allele+1)/2 ) counts->nmiss++;
                else if ( phet < phom ) counts->nhet++;
                else counts->nhom++;
            }
        }
    }
    return kitr;
}