/*
 *  fake_PLs() - synthesize PL values from FORMAT/GT when PL is not available
 *
 *  @args:  args->no_PLs selects the penalty assigned to non-called genotypes
 *          (99 is used when it is 0); args->tmp_arr receives the raw GT
 *          values and args->pl_arr the synthesized PLs (grown as needed).
 *  @hdr:   header of the file @line comes from
 *  @line:  the record to process
 *
 *  For every sample with a usable diploid genotype the PL of the called
 *  genotype is set to 0 and all other PLs to the penalty. Samples with a
 *  missing, haploid or out-of-range genotype get -1 in every PL slot.
 *
 *  Returns the number of PL values per sample. Calls error() when GT is
 *  not present in the record.
 */
static int fake_PLs(args_t *args, bcf_hdr_t *hdr, bcf1_t *line)
{
    // PLs not present, use GTs instead.
    int fake_PL = args->no_PLs ? args->no_PLs : 99;    // with 1, discordance is the number of non-matching GTs
    int nsmpl   = bcf_hdr_nsamples(hdr);

    int nsm_gt = bcf_get_genotypes(hdr, line, &args->tmp_arr, &args->ntmp_arr);
    if ( nsm_gt <= 0 )
        error("GT not present at %s:%d?\n", hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1);
    nsm_gt /= nsmpl;    // number of GT values per sample

    int npl = line->n_allele*(line->n_allele+1)/2;
    hts_expand(int,npl*nsmpl,args->npl_arr,args->pl_arr);

    int i, j;
    for (i=0; i<nsmpl; i++)
    {
        int *gt_ptr = args->tmp_arr + i*nsm_gt;
        int *pl_ptr = args->pl_arr + i*npl;
        int idx = -1;

        // The second allele may be read only when the record stores at least two
        // values per sample and the slot is not a padding of a haploid genotype
        if ( nsm_gt >= 2 && gt_ptr[1]!=bcf_int32_vector_end
                && !bcf_gt_is_missing(gt_ptr[0]) && !bcf_gt_is_missing(gt_ptr[1]) )
        {
            int a = bcf_gt_allele(gt_ptr[0]);
            int b = bcf_gt_allele(gt_ptr[1]);
            idx = bcf_alleles2gt(a,b);
        }

        if ( idx<0 || idx>=npl )    // missing, haploid or allele index beyond n_allele
        {
            for (j=0; j<npl; j++) pl_ptr[j] = -1;
            continue;
        }

        for (j=0; j<npl; j++) pl_ptr[j] = fake_PL;
        pl_ptr[idx] = 0;
    }
    return npl;
}
/*
 *  estimate_AF() - estimate the alternate allele frequency from genotypes
 *
 *  @args:      args->itmp/args->nitmp cache the genotypes of the record; they
 *              are (re)read only when args->nitmp is zero on entry
 *  @line:      the record to process
 *  @alt_freq:  filled with nalt/(nalt+nref) on success
 *
 *  Any non-reference allele counts as ALT. Samples with a missing allele or
 *  a haploid genotype (second slot is the vector-end padding) are skipped.
 *
 *  Returns 0 on success, -1 when the record is not diploid or when no
 *  sample has a usable genotype.
 */
int estimate_AF(args_t *args, bcf1_t *line, double *alt_freq)
{
    if ( !args->nitmp )
    {
        args->nitmp = bcf_get_genotypes(args->hdr, line, &args->itmp, &args->mitmp);
        // NOTE(review): on this failure path args->nitmp keeps the raw return value;
        // callers are presumably expected to reset it for every record - confirm
        if ( args->nitmp != 2*args->nsmpl ) return -1;     // not diploid?
        args->nitmp /= args->nsmpl;
    }

    int i, nalt = 0, nref = 0;
    for (i=0; i<args->nsmpl; i++)
    {
        int32_t *gt = &args->itmp[i*args->nitmp];

        if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) continue;

        // The vector-end padding is not "missing" for bcf_gt_is_missing() and
        // would otherwise be decoded as a non-reference allele
        if ( gt[1]==bcf_int32_vector_end ) continue;

        if ( bcf_gt_allele(gt[0]) ) nalt++;
        else nref++;

        if ( bcf_gt_allele(gt[1]) ) nalt++;
        else nref++;
    }
    if ( !nalt && !nref ) return -1;

    *alt_freq = (double)nalt / (nalt + nref);
    return 0;
}
/*
 *  process_GT() - update pairwise genotype comparison counters for one record
 *
 *  @args:  args->sm_hdr is used to read the genotypes into args->tmp_arr,
 *          args->nsmpl is the number of samples
 *  @line:  the record to process
 *  @ntot:  per-pair number of sites where both genotypes were usable
 *  @ndif:  per-pair number of sites where the two genotypes differ
 *
 *  Pairs are laid out as (1,0), (2,0), (2,1), (3,0), ... Genotypes are
 *  compared as unordered sets of alleles.
 *
 *  Returns 0 on success, 1 when GT is not present, 2 when the record is
 *  not diploid.
 */
int process_GT(args_t *args, bcf1_t *line, uint32_t *ntot, uint32_t *ndif)
{
    int nvals = bcf_get_genotypes(args->sm_hdr, line, &args->tmp_arr, &args->ntmp_arr);
    if ( nvals<=0 ) return 1;               // GT not present
    if ( nvals!=args->nsmpl*2 ) return 2;   // not diploid

    const int stride = nvals / args->nsmpl;
    int pair = 0;       // running index into ntot[] and ndif[]
    int hi, lo;

    for (hi=1; hi<args->nsmpl; hi++)
    {
        int32_t *gt_hi = args->tmp_arr + hi*stride;
        int hi_usable = !bcf_gt_is_missing(gt_hi[0]) && !bcf_gt_is_missing(gt_hi[1])
                        && gt_hi[1]!=bcf_int32_vector_end;
        if ( !hi_usable )
        {
            // none of the pairs (hi,0)..(hi,hi-1) can be evaluated
            pair += hi;
            continue;
        }
        int mask_hi = 1<<bcf_gt_allele(gt_hi[0]) | 1<<bcf_gt_allele(gt_hi[1]);

        for (lo=0; lo<hi; lo++, pair++)
        {
            int32_t *gt_lo = args->tmp_arr + lo*stride;
            int lo_usable = !bcf_gt_is_missing(gt_lo[0]) && !bcf_gt_is_missing(gt_lo[1])
                            && gt_lo[1]!=bcf_int32_vector_end;
            if ( !lo_usable ) continue;

            int mask_lo = 1<<bcf_gt_allele(gt_lo[0]) | 1<<bcf_gt_allele(gt_lo[1]);
            ntot[pair]++;
            if ( mask_hi!=mask_lo ) ndif[pair]++;
        }
    }
    return 0;
}
/*
 *  set_observed_prob_unrelated() - add one site to the emission probability
 *  table for a pair of samples
 *
 *  Reads the genotypes of args.isample and args.jsample from @rec. The site
 *  is used only when the record holds exactly four GT values, none of the
 *  four alleles is missing and both genotypes carry a phase flag. The site
 *  position is appended to args.sites and a row of args.nstates
 *  probabilities (the UNRL_* states) is written to args.eprob.
 */
static void set_observed_prob_unrelated(bcf1_t *rec)
{
    float af = 0.5;  // alternate allele frequency

    int nvals = bcf_get_genotypes(args.hdr, rec, &args.gt_arr, &args.ngt_arr);
    if ( nvals!=4 ) return;     // GT absent (negative) or not two diploid samples, e.g. chrX

    int32_t a = args.gt_arr[2*args.isample];
    int32_t b = args.gt_arr[2*args.isample+1];
    int32_t c = args.gt_arr[2*args.jsample];
    int32_t d = args.gt_arr[2*args.jsample+1];

    int any_missing = bcf_gt_is_missing(a) || bcf_gt_is_missing(b)
                   || bcf_gt_is_missing(c) || bcf_gt_is_missing(d);
    if ( any_missing ) return;

    // only the second allele should be set when phased
    if ( !(bcf_gt_is_phased(a) || bcf_gt_is_phased(b)) ) return;
    if ( !(bcf_gt_is_phased(c) || bcf_gt_is_phased(d)) ) return;

    a = bcf_gt_allele(a);
    b = bcf_gt_allele(b);
    c = bcf_gt_allele(c);
    d = bcf_gt_allele(d);

    // Grow the site list; the probability table follows whenever the capacity changes
    int old_msites = args.msites;
    args.nsites++;
    hts_expand(uint32_t,args.nsites,args.msites,args.sites);
    if ( old_msites!=args.msites )
        args.eprob = (double*) realloc(args.eprob, sizeof(double)*args.msites*args.nstates);

    int isite = args.nsites - 1;
    args.sites[isite] = rec->pos;

    double *prob = args.eprob + args.nstates*isite;
    prob[UNRL_xxxx] = prob_not_shared(af,a,c) * prob_not_shared(af,a,d) * prob_not_shared(af,b,c) * prob_not_shared(af,b,d);
    prob[UNRL_0x0x] = prob_shared(af,a,c) * prob_not_shared(af,b,d);
    prob[UNRL_0xx0] = prob_shared(af,a,d) * prob_not_shared(af,b,c);
    prob[UNRL_x00x] = prob_shared(af,b,c) * prob_not_shared(af,a,d);
    prob[UNRL_x0x0] = prob_shared(af,b,d) * prob_not_shared(af,a,c);
    prob[UNRL_0101] = prob_shared(af,a,c) * prob_shared(af,b,d);
    prob[UNRL_0110] = prob_shared(af,a,d) * prob_shared(af,b,c);

#if 0
    static int x = 0;
    if ( !x++)
    {
        printf("p(0==0) .. %f\n", prob_shared(af,0,0));
        printf("p(0!=0) .. %f\n", prob_not_shared(af,0,0));
        printf("p(0==1) .. %f\n", prob_shared(af,0,1));
        printf("p(0!=1) .. %f\n", prob_not_shared(af,0,1));
    }
    printf("%d|%d %d|%d x:%f 11:%f 12:%f 21:%f 22:%f 11,22:%f 12,21:%f %d\n", a,b,c,d,
        prob[UNRL_xxxx], prob[UNRL_0x0x], prob[UNRL_0xx0], prob[UNRL_x00x], prob[UNRL_x0x0], prob[UNRL_0101], prob[UNRL_0110], rec->pos+1);
#endif
}
/*
 *  process_region_precise() - update per-sample sex probabilities using the
 *  observed ploidy at a handful of sites sampled from one region
 *
 *  @args:  args->sr is the reader used for index jumps, args->counts is a
 *          [nsample x (max_ploidy+1)] scratch table which is zeroed here,
 *          args->sex2prob holds the [nsample x nsex] probabilities
 *  @seq:   name of the sequence the region lies on
 *  @itr:   region iterator positioned on the region to process
 *
 *  The ploidy of a genotype is the number of leading alleles which are
 *  neither missing nor the vector-end padding. For each sample with at least
 *  one counted site, the probability of each sex is multiplied by the
 *  fraction of sites showing the ploidy expected for that sex.
 *
 *  Returns the number of consecutive iterator entries sharing the same
 *  start and end coordinates, i.e. how many entries the caller can skip.
 */
int process_region_precise(args_t *args, char *seq, regitr_t *itr)
{
    int k = 1;
    uint32_t start = itr->reg[itr->i].start, end = itr->reg[itr->i].end;
    while ( itr->i+k<itr->n && start==itr->reg[itr->i+k].start && end==itr->reg[itr->i+k].end ) k++;

    int ret = ploidy_query(args->ploidy, seq, start, args->sex2ploidy, NULL, NULL);
    assert(ret);
    (void) ret;     // silence the unused variable warning with NDEBUG

    memset(args->counts,0,args->ncounts*sizeof(int));

    // Select 'nsites' sites spaced so that they evenly cover the whole region
    // to get a representative sample. We index-jump as we should be checking
    // a few sites only.
    int i, pos, prev_pos = -1, ismpl;
    for (i=0; i<args->nsites; i++)
    {
        pos = ((i+1.0)/(args->nsites+1))*(end - start) + start;
        if ( i>0 && pos <= prev_pos ) continue;     // the vcf is too sparse
        if ( bcf_sr_seek(args->sr,seq,pos)!=0 ) return k;   // sequence not present
        if ( !bcf_sr_next_line(args->sr) ) return k;        // no sites found
        bcf1_t *rec = bcf_sr_get_line(args->sr,0);

        // The original also compared rec->rid against a variable which was
        // reset on every iteration; that test could never fire and was removed
        if ( rec->pos > end ) break;
        prev_pos = rec->pos;

        int ngts = bcf_get_genotypes(args->hdr,rec,&args->gts,&args->ngts);
        ngts /= args->nsample;
        for (ismpl=0; ismpl<args->nsample; ismpl++)
        {
            int32_t *gts = args->gts + ngts*ismpl;
            int igt, ploidy = 0;
            for (igt=0; igt<ngts; igt++)
            {
                if ( gts[igt]==bcf_int32_vector_end || bcf_gt_is_missing(gts[igt]) ) break;
                ploidy++;
            }

            // Each sample owns max_ploidy+1 counters; a genotype with more alleles
            // must not spill into the counters of the next sample
            if ( ploidy <= args->max_ploidy )
                args->counts[ismpl*(args->max_ploidy+1) + ploidy]++;

            if ( args->verbose )
                fprintf(stderr,"%s:%d\t%s\tploidy=%d\n", seq,rec->pos+1,args->hdr->samples[ismpl],ploidy);
        }
    }

    for (ismpl=0; ismpl<args->nsample; ismpl++)
    {
        float sum = 0, *probs = args->sex2prob + ismpl*args->nsex;
        int *counts = args->counts + ismpl*(args->max_ploidy+1);
        for (i=0; i<args->max_ploidy+1; i++) sum += counts[i];
        if ( !sum ) continue;   // no site was counted for this sample
        for (i=0; i<args->nsex; i++)
        {
            int ploidy = args->sex2ploidy[i];
            probs[i] *= counts[ploidy]/sum;
        }
    }

    return k;
}
/*
 *  phase_update() - swap the two haplotypes of the samples flagged in
 *  args->swap_phase and write the genotypes back to the record
 *
 *  @args:  args->swap_phase[i] is non-zero for samples whose alleles are to
 *          be swapped; args->GTa/args->mGTa serve as the genotype buffer
 *  @hdr:   header describing @rec
 *  @rec:   the record to modify in place
 *
 *  The record is left untouched when GT is not present or when it is not
 *  diploid, because the genotypes are addressed with a stride of two.
 *  Genotypes which are missing, haploid or unphased are not swapped.
 */
static void phase_update(args_t *args, bcf_hdr_t *hdr, bcf1_t *rec)
{
    int i, nsmpl = bcf_hdr_nsamples(hdr);
    int nGTs = bcf_get_genotypes(hdr, rec, &args->GTa, &args->mGTa);
    if ( nGTs <= 0 ) return;            // GT field is not present
    if ( nGTs != 2*nsmpl ) return;      // not diploid

    for (i=0; i<nsmpl; i++)
    {
        if ( !args->swap_phase[i] ) continue;
        int *gt = &args->GTa[i*2];
        if ( bcf_gt_is_missing(gt[0]) || gt[1]==bcf_int32_vector_end ) continue;

        // Swapping an unphased genotype and flagging it as phased would
        // fabricate phase information
        if ( !bcf_gt_is_phased(gt[1]) ) continue;

        SWAP(int, gt[0], gt[1]);
        gt[1] |= 1;     // the phase flag is carried by the second allele
    }
    bcf_update_genotypes(hdr,rec,args->GTa,nGTs);
}
/*
 *  set_observed_prob_trio() - add one site to the emission probability table
 *  for a mother-father-child trio
 *
 *  Reads the genotypes of args.imother, args.ifather and args.ichild from
 *  @rec. The site is used only when the record holds exactly six GT values,
 *  no allele is missing, all three genotypes carry a phase flag and the
 *  child shares at least one allele with each parent. The counters
 *  args.nhet_mother and args.nhet_father are incremented for heterozygous
 *  parents, the site position is appended to args.sites and a row of
 *  args.nstates probabilities (the TRIO_* states) is written to args.eprob.
 */
static void set_observed_prob_trio(bcf1_t *rec)
{
    int nvals = bcf_get_genotypes(args.hdr, rec, &args.gt_arr, &args.ngt_arr);
    if ( nvals!=6 ) return;     // GT absent (negative) or not three diploid samples, e.g. chrX

    int32_t a = args.gt_arr[2*args.imother];
    int32_t b = args.gt_arr[2*args.imother+1];
    int32_t c = args.gt_arr[2*args.ifather];
    int32_t d = args.gt_arr[2*args.ifather+1];
    int32_t e = args.gt_arr[2*args.ichild];
    int32_t f = args.gt_arr[2*args.ichild+1];

    int any_missing = bcf_gt_is_missing(a) || bcf_gt_is_missing(b)
                   || bcf_gt_is_missing(c) || bcf_gt_is_missing(d)
                   || bcf_gt_is_missing(e) || bcf_gt_is_missing(f);
    if ( any_missing ) return;

    // only the second allele should be set when phased
    if ( !(bcf_gt_is_phased(a) || bcf_gt_is_phased(b)) ) return;
    if ( !(bcf_gt_is_phased(c) || bcf_gt_is_phased(d)) ) return;
    if ( !(bcf_gt_is_phased(e) || bcf_gt_is_phased(f)) ) return;

    a = bcf_gt_allele(a);
    b = bcf_gt_allele(b);
    c = bcf_gt_allele(c);
    d = bcf_gt_allele(d);
    e = bcf_gt_allele(e);
    f = bcf_gt_allele(f);

    // Allele sets as bit masks
    int mother = (1<<a) | (1<<b);
    int father = (1<<c) | (1<<d);
    int child  = (1<<e) | (1<<f);
    if ( !(mother&child) || !(father&child) ) return;   // Mendelian-inconsistent site, skip

    if ( a!=b ) args.nhet_mother++;
    if ( c!=d ) args.nhet_father++;

    // Grow the site list; the probability table follows whenever the capacity changes
    int old_msites = args.msites;
    args.nsites++;
    hts_expand(uint32_t,args.nsites,args.msites,args.sites);
    if ( old_msites!=args.msites )
        args.eprob = (double*) realloc(args.eprob, sizeof(double)*args.msites*args.nstates);

    int isite = args.nsites - 1;
    args.sites[isite] = rec->pos;

    double *prob = args.eprob + args.nstates*isite;
    prob[TRIO_AC] = prob_shared(0,e,a) * prob_shared(0,f,c);
    prob[TRIO_AD] = prob_shared(0,e,a) * prob_shared(0,f,d);
    prob[TRIO_BC] = prob_shared(0,e,b) * prob_shared(0,f,c);
    prob[TRIO_BD] = prob_shared(0,e,b) * prob_shared(0,f,d);
    prob[TRIO_CA] = prob_shared(0,e,c) * prob_shared(0,f,a);
    prob[TRIO_DA] = prob_shared(0,e,d) * prob_shared(0,f,a);
    prob[TRIO_CB] = prob_shared(0,e,c) * prob_shared(0,f,b);
    prob[TRIO_DB] = prob_shared(0,e,d) * prob_shared(0,f,b);
}
static void phased_flush(args_t *args) { if ( !args->nbuf ) return; bcf_hdr_t *ahdr = args->files->readers[0].header; bcf_hdr_t *bhdr = args->files->readers[1].header; int i, j, nsmpl = bcf_hdr_nsamples(args->out_hdr); static int gt_absent_warned = 0; for (i=0; i<args->nbuf; i+=2) { bcf1_t *arec = args->buf[i]; bcf1_t *brec = args->buf[i+1]; int nGTs = bcf_get_genotypes(ahdr, arec, &args->GTa, &args->mGTa); if ( nGTs < 0 ) { if ( !gt_absent_warned ) { fprintf(stderr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(ahdr,arec), arec->pos+1); gt_absent_warned = 1; } continue; } if ( nGTs != 2*nsmpl ) continue; // not diploid nGTs = bcf_get_genotypes(bhdr, brec, &args->GTb, &args->mGTb); if ( nGTs < 0 ) { if ( !gt_absent_warned ) { fprintf(stderr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(bhdr,brec), brec->pos+1); gt_absent_warned = 1; } continue; } if ( nGTs != 2*nsmpl ) continue; // not diploid for (j=0; j<nsmpl; j++) { int *gta = &args->GTa[j*2]; int *gtb = &args->GTb[j*2]; if ( gta[1]==bcf_int32_vector_end || gtb[1]==bcf_int32_vector_end ) continue; if ( bcf_gt_is_missing(gta[0]) || bcf_gt_is_missing(gta[1]) || bcf_gt_is_missing(gtb[0]) || bcf_gt_is_missing(gtb[1]) ) continue; if ( !bcf_gt_is_phased(gta[1]) || !bcf_gt_is_phased(gtb[1]) ) continue; if ( bcf_gt_allele(gta[0])==bcf_gt_allele(gta[1]) || bcf_gt_allele(gtb[0])==bcf_gt_allele(gtb[1]) ) continue; if ( bcf_gt_allele(gta[0])==bcf_gt_allele(gtb[0]) && bcf_gt_allele(gta[1])==bcf_gt_allele(gtb[1]) ) { if ( args->swap_phase[j] ) args->nmism[j]++; else args->nmatch[j]++; } if ( bcf_gt_allele(gta[0])==bcf_gt_allele(gtb[1]) && bcf_gt_allele(gta[1])==bcf_gt_allele(gtb[0]) ) { if ( args->swap_phase[j] ) args->nmatch[j]++; else args->nmism[j]++; } } } for (i=0; i<args->nbuf/2; i+=2) { bcf1_t *arec = args->buf[i]; bcf_translate(args->out_hdr, args->files->readers[0].header, arec); if ( args->nswap ) phase_update(args, args->out_hdr, arec); if ( 
!args->compact_PS || args->phase_set_changed ) { bcf_update_format_int32(args->out_hdr,arec,"PS",args->phase_set,nsmpl); args->phase_set_changed = 0; } bcf_write(args->out_fh, args->out_hdr, arec); if ( arec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%d vs %d [1]\n", bcf_seqname(args->files->readers[0].header,arec),arec->pos+1,args->prev_pos_check+1); args->prev_pos_check = arec->pos; } args->nswap = 0; for (j=0; j<nsmpl; j++) { if ( args->nmatch[j] >= args->nmism[j] ) args->swap_phase[j] = 0; else { args->swap_phase[j] = 1; args->nswap++; } if ( args->nmatch[j] && args->nmism[j] ) { // Entropy-inspired quality. The factor 0.7 shifts and scales to (0,1) double f = (double)args->nmatch[j]/(args->nmatch[j]+args->nmism[j]); args->phase_qual[j] = 99*(0.7 + f*log(f) + (1-f)*log(1-f))/0.7; } else args->phase_qual[j] = 99; args->nmatch[j] = 0; args->nmism[j] = 0; } int PQ_printed = 0; for (; i<args->nbuf; i+=2) { bcf1_t *brec = args->buf[i+1]; bcf_translate(args->out_hdr, args->files->readers[1].header, brec); if ( !PQ_printed ) { bcf_update_format_int32(args->out_hdr,brec,"PQ",args->phase_qual,nsmpl); PQ_printed = 1; for (j=0; j<nsmpl; j++) if ( args->phase_qual[j] < args->min_PQ ) { args->phase_set[j] = brec->pos+1; args->phase_set_changed = 1; } else if ( args->compact_PS ) args->phase_set[j] = bcf_int32_missing; } if ( args->nswap ) phase_update(args, args->out_hdr, brec); if ( !args->compact_PS || args->phase_set_changed ) { bcf_update_format_int32(args->out_hdr,brec,"PS",args->phase_set,nsmpl); args->phase_set_changed = 0; } bcf_write(args->out_fh, args->out_hdr, brec); if ( brec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%d vs %d [2]\n", bcf_seqname(args->files->readers[1].header,brec),brec->pos+1,args->prev_pos_check+1); args->prev_pos_check = brec->pos; } args->nbuf = 0; }
static void apply_variant(args_t *args, bcf1_t *rec) { if ( rec->n_allele==1 ) return; if ( rec->pos <= args->fa_frz_pos ) { fprintf(pysamerr,"The site %s:%d overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),rec->pos+1); return; } if ( args->mask ) { char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid); int start = rec->pos; int end = rec->pos + rec->rlen - 1; if ( regidx_overlap(args->mask, chr,start,end,NULL) ) return; } int i, ialt = 1; if ( args->isample >= 0 ) { bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT"); if ( !fmt ) return; if ( args->haplotype ) { if ( args->haplotype > fmt->n ) error("Can't apply %d-th haplotype at %s:%d\n", args->haplotype,bcf_seqname(args->hdr,rec),rec->pos+1); uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + args->haplotype - 1; ialt = bcf_dec_int1(ptr, fmt->type, &ignore); if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return; ialt = bcf_gt_allele(ialt); } else if ( args->output_iupac ) { uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample; ialt = bcf_dec_int1(ptr, fmt->type, &ignore); if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return; ialt = bcf_gt_allele(ialt); int jalt; if ( fmt->n>1 ) { ptr = fmt->p + fmt->size*args->isample + 1; jalt = bcf_dec_int1(ptr, fmt->type, &ignore); if ( bcf_gt_is_missing(jalt) || jalt==bcf_int32_vector_end ) jalt = ialt; else jalt = bcf_gt_allele(jalt); } else jalt = ialt; if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp? 
{ char ial = rec->d.allele[ialt][0]; char jal = rec->d.allele[jalt][0]; rec->d.allele[ialt][0] = gt2iupac(ial,jal); } } else { for (i=0; i<fmt->n; i++) { uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + i; ialt = bcf_dec_int1(ptr, fmt->type, &ignore); if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return; ialt = bcf_gt_allele(ialt); if ( ialt ) break; } } if ( !ialt ) return; // ref allele if ( rec->n_allele <= ialt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); } else if ( args->output_iupac && !rec->d.allele[0][1] && !rec->d.allele[1][1] ) { char ial = rec->d.allele[0][0]; char jal = rec->d.allele[1][0]; rec->d.allele[1][0] = gt2iupac(ial,jal); } int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off; if ( idx<0 || idx>=args->fa_buf.l ) error("FIXME: %s:%d .. idx=%d, ori_pos=%d, len=%d, off=%d\n",bcf_seqname(args->hdr,rec),rec->pos+1,idx,args->fa_ori_pos,args->fa_buf.l,args->fa_mod_off); // sanity check the reference base int len_diff = 0, alen = 0; if ( rec->d.allele[ialt][0]=='<' ) { if ( strcasecmp(rec->d.allele[ialt], "<DEL>") ) error("Symbolic alleles other than <DEL> are currently not supported: %s at %s:%d\n",rec->d.allele[ialt],bcf_seqname(args->hdr,rec),rec->pos+1); assert( rec->d.allele[0][1]==0 ); // todo: for now expecting strlen(REF) = 1 len_diff = 1-rec->rlen; rec->d.allele[ialt] = rec->d.allele[0]; // according to VCF spec, REF must precede the event alen = strlen(rec->d.allele[ialt]); } else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) ) { // fprintf(pysamerr,"%d .. 
[%s], idx=%d ori=%d off=%d\n",args->fa_ori_pos,args->fa_buf.s,idx,args->fa_ori_pos,args->fa_mod_off); char tmp = 0; if ( args->fa_buf.l - idx > rec->rlen ) { tmp = args->fa_buf.s[idx+rec->rlen]; args->fa_buf.s[idx+rec->rlen] = 0; } error( "The fasta sequence does not match the REF allele at %s:%d:\n" " .vcf: [%s]\n" " .vcf: [%s] <- (ALT)\n" " .fa: [%s]%c%s\n", bcf_seqname(args->hdr,rec),rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx, tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:"" ); } else { alen = strlen(rec->d.allele[ialt]); len_diff = alen - rec->rlen; } if ( args->fa_case ) for (i=0; i<alen; i++) rec->d.allele[ialt][i] = toupper(rec->d.allele[ialt][i]); else for (i=0; i<alen; i++) rec->d.allele[ialt][i] = tolower(rec->d.allele[ialt][i]); if ( len_diff <= 0 ) { // deletion or same size event for (i=0; i<alen; i++) args->fa_buf.s[idx+i] = rec->d.allele[ialt][i]; if ( len_diff ) memmove(args->fa_buf.s+idx+alen,args->fa_buf.s+idx+rec->rlen,args->fa_buf.l-idx-rec->rlen); } else { // insertion ks_resize(&args->fa_buf, args->fa_buf.l + len_diff); memmove(args->fa_buf.s + idx + rec->rlen + len_diff, args->fa_buf.s + idx + rec->rlen, args->fa_buf.l - idx - rec->rlen); for (i=0; i<alen; i++) args->fa_buf.s[idx+i] = rec->d.allele[ialt][i]; } if (args->chain && len_diff != 0) { // If first nucleotide of both REF and ALT are the same... (indels typically include the nucleotide before the variant) if ( strncasecmp(rec->d.allele[0],rec->d.allele[ialt],1) == 0) { // ...extend the block by 1 bp: start is 1 bp further and alleles are 1 bp shorter push_chain_gap(args->chain, rec->pos + 1, rec->rlen - 1, rec->pos + 1 + args->fa_mod_off, alen - 1); } else { // otherwise, just the coordinates of the variant as given push_chain_gap(args->chain, rec->pos, rec->rlen, rec->pos + args->fa_mod_off, alen); } } args->fa_buf.l += len_diff; args->fa_mod_off += len_diff; args->fa_frz_pos = rec->pos + rec->rlen - 1; }
/*
 *  check_gt() - compare one query sample against the genotypes of all
 *  samples in the -g target file and print the discordance table
 *
 *  @args:  args->files provides the query file (reader 0, header
 *          args->sm_hdr) and the target file (reader 1, header args->gt_hdr);
 *          args->lks and args->sites accumulate, per target sample, the log
 *          likelihood and the number of compared sites
 *
 *  Only sites present in both files with diploid target genotypes are used.
 *  The query PLs are read from FORMAT/PL, or synthesized from GT when PL is
 *  not defined in the header or when args->no_PLs is set. With
 *  args->all_sites a per-site "SC" line is printed as well. The "CN" lines
 *  are printed sorted with cmp_doubleptr. Output goes to stdout, or to
 *  "<args->plot>.tab" followed by a call of plot_check() when args->plot
 *  is set.
 *
 *  Frees args->pl_arr and args->tmp_arr. Calls error() on missing tags and
 *  unknown sample names.
 */
static void check_gt(args_t *args)
{
    int i,ret, *gt2ipl = NULL, m_gt2ipl = 0, *gt_arr = NULL, ngt_arr = 0;
    int fake_pls = args->no_PLs;

    // Initialize things: check which tags are defined in the header, sample names etc.
    if ( bcf_hdr_id2int(args->gt_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] GT not present in the header of %s?\n", __func__, args->files->readers[1].fname);
    if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 )
    {
        if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
            error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
        if ( !args->no_PLs )
            fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
        fake_pls = 1;
    }

    FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout;
    print_header(args, fp);

    int tgt_isample = -1, query_isample = 0;
    if ( args->target_sample )
    {
        tgt_isample = bcf_hdr_id2int(args->gt_hdr, BCF_DT_SAMPLE, args->target_sample);
        if ( tgt_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[1].fname, args->target_sample);
    }
    if ( args->all_sites )
    {
        if ( tgt_isample==-1 )
        {
            fprintf(stderr,"No target sample selected for comparison, using the first sample in %s: %s\n", args->gt_fname,args->gt_hdr->samples[0]);
            tgt_isample = 0;
        }
    }
    if ( args->query_sample )
    {
        query_isample = bcf_hdr_id2int(args->sm_hdr, BCF_DT_SAMPLE, args->query_sample);
        if ( query_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[0].fname, args->query_sample);
    }
    if ( args->all_sites )
        fprintf(fp, "# [1]SC, Site by Site Comparison\t[2]Chromosome\t[3]Position\t[4]-g alleles\t[5]-g GT (%s)\t[6]match log LK\t[7]Query alleles\t[8-]Query PLs (%s)\n",
                args->gt_hdr->samples[tgt_isample],args->sm_hdr->samples[query_isample]);

    // Main loop
    float prev_lk = 0;
    while ( (ret=bcf_sr_next_line(args->files)) )
    {
        if ( ret!=2 ) continue;     // the site must be present in both files
        bcf1_t *sm_line = args->files->readers[0].buffer[0];    // the query file
        bcf1_t *gt_line = args->files->readers[1].buffer[0];    // the -g target file
        bcf_unpack(sm_line, BCF_UN_FMT);
        bcf_unpack(gt_line, BCF_UN_FMT);

        // Init mapping from target genotype index to the sample's PL fields
        int n_gt2ipl = gt_line->n_allele*(gt_line->n_allele + 1)/2;
        if ( n_gt2ipl > m_gt2ipl )
        {
            m_gt2ipl = n_gt2ipl;
            gt2ipl   = (int*) realloc(gt2ipl, sizeof(int)*m_gt2ipl);
        }
        if ( !init_gt2ipl(args, gt_line, sm_line, gt2ipl, n_gt2ipl) ) continue;

        // Target genotypes
        int ngt, npl;
        if ( (ngt=bcf_get_genotypes(args->gt_hdr, gt_line, &gt_arr, &ngt_arr)) <= 0 )
            error("GT not present at %s:%d?", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1);
        ngt /= bcf_hdr_nsamples(args->gt_hdr);
        if ( ngt!=2 ) continue; // checking only diploid genotypes

        // Sample PLs
        if ( !fake_pls )
        {
            if ( (npl=bcf_get_format_int32(args->sm_hdr, sm_line, "PL", &args->pl_arr, &args->npl_arr)) <= 0 )
                error("PL not present at %s:%d?", args->sm_hdr->id[BCF_DT_CTG][sm_line->rid].key, sm_line->pos+1);
            npl /= bcf_hdr_nsamples(args->sm_hdr);
        }
        else
            npl = fake_PLs(args, args->sm_hdr, sm_line);

        // Calculate likelihoods for all samples, assuming diploid genotypes

        // For faster access to genotype likelihoods (PLs) of the query sample
        int max_ipl, *pl_ptr = args->pl_arr + query_isample*npl;
        double sum_pl = 0; // for converting PLs to probs
        for (max_ipl=0; max_ipl<npl; max_ipl++)
        {
            if ( pl_ptr[max_ipl]==bcf_int32_vector_end ) break;
            if ( pl_ptr[max_ipl]==bcf_int32_missing ) continue;
            sum_pl += pow(10, -0.1*pl_ptr[max_ipl]);
        }
        if ( sum_pl==0 ) continue; // no PLs present
        if ( fake_pls && args->no_PLs==1 ) sum_pl = -1;     // negative sum_pl: count mismatching GTs instead

        // The main stats: concordance of the query sample with the target -g samples
        for (i=0; i<bcf_hdr_nsamples(args->gt_hdr); i++)
        {
            int *gt_ptr = gt_arr + i*ngt;
            if ( gt_ptr[1]==bcf_int32_vector_end ) continue;    // skip haploid genotypes
            if ( bcf_gt_is_missing(gt_ptr[0]) || bcf_gt_is_missing(gt_ptr[1]) ) continue;
            int a = bcf_gt_allele(gt_ptr[0]);
            int b = bcf_gt_allele(gt_ptr[1]);
            if ( args->hom_only && a!=b ) continue; // heterozygous genotype
            int igt_tgt = bcf_alleles2gt(a,b);  // genotype index in the target file
            int igt_qry = gt2ipl[igt_tgt];      // corresponding genotype in query file
            if ( igt_qry>=max_ipl || pl_ptr[igt_qry]<0 ) continue;   // genotype not present in query sample: haploid or missing
            args->lks[i] += sum_pl<0 ? -pl_ptr[igt_qry] : log(pow(10, -0.1*pl_ptr[igt_qry])/sum_pl);
            args->sites[i]++;
        }
        if ( args->all_sites )
        {
            // Print LKs at all sites for debugging
            int *gt_ptr = gt_arr + tgt_isample*ngt;
            if ( gt_ptr[1]==bcf_int32_vector_end ) continue;    // skip haploid genotypes
            int a = bcf_gt_allele(gt_ptr[0]);
            int b = bcf_gt_allele(gt_ptr[1]);
            if ( args->hom_only && a!=b ) continue; // heterozygous genotype
            fprintf(fp, "SC\t%s\t%d", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1);
            for (i=0; i<gt_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', gt_line->d.allele[i]);
            fprintf(fp, "\t%s/%s", a>=0 ? gt_line->d.allele[a] : ".", b>=0 ? gt_line->d.allele[b] : ".");
            // NOTE(review): lks[] is filled per target sample above, yet it is indexed
            // by the query sample here; tgt_isample may have been intended - confirm
            fprintf(fp, "\t%f", args->lks[query_isample]-prev_lk);
            prev_lk = args->lks[query_isample];

            int igt;    // PLs of the query sample follow
            for (i=0; i<sm_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', sm_line->d.allele[i]);
            for (igt=0; igt<npl; igt++)
            {
                if ( pl_ptr[igt]==bcf_int32_vector_end ) break;
                // a missing PL is a column of its own and needs the separator too
                if ( pl_ptr[igt]==bcf_int32_missing ) fprintf(fp, "\t.");
                else fprintf(fp, "\t%d", pl_ptr[igt]);
            }
            fprintf(fp, "\n");
        }
    }
    free(gt2ipl);
    free(gt_arr);
    free(args->pl_arr);
    free(args->tmp_arr);

    // To be able to plot total discordance (=number of mismatching GTs with -G1) in the same
    // plot as discordance per site, the latter must be scaled to the same range
    int nsamples = bcf_hdr_nsamples(args->gt_hdr);
    double extreme_lk = 0, extreme_lk_per_site = 0;
    for (i=0; i<nsamples; i++)
    {
        if ( args->lks[i] < extreme_lk ) extreme_lk = args->lks[i];
        if ( args->sites[i] && args->lks[i]/args->sites[i] < extreme_lk_per_site ) extreme_lk_per_site = args->lks[i]/args->sites[i];
    }

    // Sorted output
    double **p = (double**) malloc(sizeof(double*)*nsamples);
    for (i=0; i<nsamples; i++) p[i] = &args->lks[i];
    qsort(p, nsamples, sizeof(*p), cmp_doubleptr);

    fprintf(fp, "# [1]CN\t[2]Discordance with %s (total)\t[3]Discordance (score per site)\t[4]Number of sites compared\t[5]Sample\t[6]Sample ID\n", args->sm_hdr->samples[query_isample]);
    for (i=0; i<nsamples; i++)
    {
        int idx = p[i] - args->lks;     // index of the sample the sorted pointer belongs to
        double per_site = 0;
        if ( args->sites[idx] && extreme_lk_per_site )
        {
            per_site  = args->lks[idx]/args->sites[idx];
            per_site *= extreme_lk / extreme_lk_per_site;
        }
        fprintf(fp, "CN\t%e\t%e\t%.0f\t%s\t%d\n", fabs(args->lks[idx]), fabs(per_site), args->sites[idx], args->gt_hdr->samples[idx], i);
    }
    free(p);

    if ( args->plot )
    {
        fclose(fp);
        plot_check(args, args->target_sample ? args->target_sample : "", args->sm_hdr->samples[query_isample]);
    }
}
void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask) { int *map = (int*) calloc(line->n_allele, sizeof(int)); // create map of indexes from old to new ALT numbering and modify ALT kstring_t str = {0,0,0}; kputs(line->d.allele[0], &str); int nrm = 0, i,j; // i: ori alleles, j: new alleles for (i=1, j=1; i<line->n_allele; i++) { if ( rm_mask & 1<<i ) { // remove this allele line->d.allele[i] = NULL; nrm++; continue; } kputc(',', &str); kputs(line->d.allele[i], &str); map[i] = j; j++; } if ( !nrm ) { free(map); free(str.s); return; } int nR_ori = line->n_allele; int nR_new = line->n_allele-nrm; assert(nR_new > 0); // should not be able to remove reference allele int nA_ori = nR_ori-1; int nA_new = nR_new-1; int nG_ori = nR_ori*(nR_ori + 1)/2; int nG_new = nR_new*(nR_new + 1)/2; bcf_update_alleles_str(header, line, str.s); // remove from Number=G, Number=R and Number=A INFO fields. uint8_t *dat = NULL; int mdat = 0, ndat = 0, mdat_bytes = 0, nret; for (i=0; i<line->n_info; i++) { bcf_info_t *info = &line->d.info[i]; int vlen = bcf_hdr_id2length(header,BCF_HL_INFO,info->key); if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change int type = bcf_hdr_id2type(header,BCF_HL_INFO,info->key); if ( type==BCF_HT_FLAG ) continue; int size = 1; if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4; mdat = mdat_bytes / size; nret = bcf_get_info_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void**)&dat, &mdat, type); mdat_bytes = mdat * size; if ( nret<0 ) { fprintf(stderr,"[%s:%d %s] Could not access INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); exit(1); } if ( type==BCF_HT_STR ) { str.l = 0; char *ss = (char*) dat, *se = (char*) dat; if ( vlen==BCF_VL_A || vlen==BCF_VL_R ) { int nexp, inc = 0; if ( vlen==BCF_VL_A ) { nexp = nA_ori; inc = 1; } else nexp = nR_ori; for (j=0; j<nexp; j++) { if ( !*se ) 
break; while ( *se && *se!=',' ) se++; if ( rm_mask & 1<<(j+inc) ) { if ( *se ) se++; ss = se; continue; } if ( str.l ) kputc(',',&str); kputsn(ss,se-ss,&str); if ( *se ) se++; ss = se; } assert( j==nexp ); } else // Number=G, assuming diploid genotype { int k = 0, n = 0; for (j=0; j<nR_ori; j++) { for (k=0; k<=j; k++) { if ( !*se ) break; while ( *se && *se!=',' ) se++; n++; if ( rm_mask & 1<<j || rm_mask & 1<<k ) { if ( *se ) se++; ss = se; continue; } if ( str.l ) kputc(',',&str); kputsn(ss,se-ss,&str); if ( *se ) se++; ss = se; } if ( !*se ) break; } assert( n=nG_ori ); } nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)str.s, str.l, type); if ( nret<0 ) { fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); exit(1); } continue; } if ( vlen==BCF_VL_A || vlen==BCF_VL_R ) { int inc = 0, ntop; if ( vlen==BCF_VL_A ) { assert( nret==nA_ori ); ntop = nA_ori; ndat = nA_new; inc = 1; } else { assert( nret==nR_ori ); ntop = nR_ori; ndat = nR_new; } int k = 0; #define BRANCH(type_t,is_vector_end) \ { \ type_t *ptr = (type_t*) dat; \ int size = sizeof(type_t); \ for (j=0; j<ntop; j++) /* j:ori, k:new */ \ { \ if ( is_vector_end ) { memcpy(dat+k*size, dat+j*size, size); break; } \ if ( rm_mask & 1<<(j+inc) ) continue; \ if ( j!=k ) memcpy(dat+k*size, dat+j*size, size); \ k++; \ } \ } switch (type) { case BCF_HT_INT: BRANCH(int32_t,ptr[j]==bcf_int32_vector_end); break; case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[j])); break; } #undef BRANCH } else // Number=G { assert( nret==nG_ori ); int k, l_ori = -1, l_new = 0; ndat = nG_new; #define BRANCH(type_t,is_vector_end) \ { \ type_t *ptr = (type_t*) dat; \ int size = sizeof(type_t); \ for (j=0; j<nR_ori; j++) \ { \ for (k=0; k<=j; k++) \ { \ l_ori++; \ if ( is_vector_end ) { memcpy(dat+l_new*size, dat+l_ori*size, size); break; } \ if ( 
rm_mask & 1<<j || rm_mask & 1<<k ) continue; \ if ( l_ori!=l_new ) memcpy(dat+l_new*size, dat+l_ori*size, size); \ l_new++; \ } \ } \ } switch (type) { case BCF_HT_INT: BRANCH(int32_t,ptr[l_ori]==bcf_int32_vector_end); break; case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[l_ori])); break; } #undef BRANCH } nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)dat, ndat, type); if ( nret<0 ) { fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); exit(1); } } // Update GT fields, the allele indexes might have changed for (i=1; i<line->n_allele; i++) if ( map[i]!=i ) break; if ( i<line->n_allele ) { mdat = mdat_bytes / 4; // sizeof(int32_t) nret = bcf_get_genotypes(header,line,(void**)&dat,&mdat); mdat_bytes = mdat * 4; if ( nret>0 ) { nret /= line->n_sample; int32_t *ptr = (int32_t*) dat; for (i=0; i<line->n_sample; i++) { for (j=0; j<nret; j++) { if ( bcf_gt_is_missing(ptr[j]) ) continue; if ( ptr[j]==bcf_int32_vector_end ) break; int al = bcf_gt_allele(ptr[j]); assert( al<nR_ori && map[al]>=0 ); ptr[j] = (map[al]+1)<<1 | (ptr[j]&1); } ptr += nret; } bcf_update_genotypes(header, line, (void*)dat, nret*line->n_sample); } } // Remove from Number=G, Number=R and Number=A FORMAT fields. 
// Assuming haploid or diploid GTs for (i=0; i<line->n_fmt; i++) { bcf_fmt_t *fmt = &line->d.fmt[i]; int vlen = bcf_hdr_id2length(header,BCF_HL_FMT,fmt->id); if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change int type = bcf_hdr_id2type(header,BCF_HL_FMT,fmt->id); if ( type==BCF_HT_FLAG ) continue; int size = 1; if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4; mdat = mdat_bytes / size; nret = bcf_get_format_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void**)&dat, &mdat, type); mdat_bytes = mdat * size; if ( nret<0 ) { fprintf(stderr,"[%s:%d %s] Could not access FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); exit(1); } if ( type==BCF_HT_STR ) { int size = nret/line->n_sample; // number of bytes per sample str.l = 0; if ( vlen==BCF_VL_A || vlen==BCF_VL_R ) { int nexp, inc = 0; if ( vlen==BCF_VL_A ) { nexp = nA_ori; inc = 1; } else nexp = nR_ori; for (j=0; j<line->n_sample; j++) { char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss; int k_src = 0, k_dst = 0, l = str.l; for (k_src=0; k_src<nexp; k_src++) { if ( ptr>=se || !*ptr) break; while ( ptr<se && *ptr && *ptr!=',' ) ptr++; if ( rm_mask & 1<<(k_src+inc) ) { ss = ++ptr; continue; } if ( k_dst ) kputc(',',&str); kputsn(ss,ptr-ss,&str); ss = ++ptr; k_dst++; } assert( k_src==nexp ); l = str.l - l; for (; l<size; l++) kputc(0, &str); } } else // Number=G, diploid or haploid { for (j=0; j<line->n_sample; j++) { char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss; int k_src = 0, k_dst = 0, l = str.l; int nexp = 0; // diploid or haploid? 
while ( ptr<se ) { if ( !*ptr ) break; if ( *ptr==',' ) nexp++; ptr++; } if ( ptr!=ss ) nexp++; assert( nexp==nG_ori || nexp==nR_ori ); ptr = ss; if ( nexp==nG_ori ) // diploid { int ia, ib; for (ia=0; ia<nR_ori; ia++) { for (ib=0; ib<=ia; ib++) { if ( ptr>=se || !*ptr ) break; while ( ptr<se && *ptr && *ptr!=',' ) ptr++; if ( rm_mask & 1<<ia || rm_mask & 1<<ib ) { ss = ++ptr; continue; } if ( k_dst ) kputc(',',&str); kputsn(ss,ptr-ss,&str); ss = ++ptr; k_dst++; } if ( ptr>=se || !*ptr ) break; } } else // haploid { for (k_src=0; k_src<nR_ori; k_src++) { if ( ptr>=se || !*ptr ) break; while ( ptr<se && *ptr && *ptr!=',' ) ptr++; if ( rm_mask & 1<<k_src ) { ss = ++ptr; continue; } if ( k_dst ) kputc(',',&str); kputsn(ss,ptr-ss,&str); ss = ++ptr; k_dst++; } assert( k_src==nR_ori ); l = str.l - l; for (; l<size; l++) kputc(0, &str); } } } nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)str.s, str.l, type); if ( nret<0 ) { fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); exit(1); } continue; } int nori = nret / line->n_sample; if ( vlen==BCF_VL_A || vlen==BCF_VL_R || (vlen==BCF_VL_G && nori==nR_ori) ) // Number=A, R or haploid Number=G { int inc = 0, nnew; if ( vlen==BCF_VL_A ) { assert( nori==nA_ori ); // todo: will fail if all values are missing ndat = nA_new*line->n_sample; nnew = nA_new; inc = 1; } else { assert( nori==nR_ori ); // todo: will fail if all values are missing ndat = nR_new*line->n_sample; nnew = nR_new; } #define BRANCH(type_t,is_vector_end) \ { \ for (j=0; j<line->n_sample; j++) \ { \ type_t *ptr_src = ((type_t*)dat) + j*nori; \ type_t *ptr_dst = ((type_t*)dat) + j*nnew; \ int size = sizeof(type_t); \ int k_src, k_dst = 0; \ for (k_src=0; k_src<nori; k_src++) \ { \ if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); break; } \ if ( rm_mask & 1<<(k_src+inc) 
) continue; \ memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \ k_dst++; \ } \ } \ } switch (type) { case BCF_HT_INT: BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break; case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break; } #undef BRANCH } else // Number=G, diploid or mixture of haploid+diploid { assert( nori==nG_ori ); ndat = nG_new*line->n_sample; #define BRANCH(type_t,is_vector_end) \ { \ for (j=0; j<line->n_sample; j++) \ { \ type_t *ptr_src = ((type_t*)dat) + j*nori; \ type_t *ptr_dst = ((type_t*)dat) + j*nG_new; \ int size = sizeof(type_t); \ int ia, ib, k_dst = 0, k_src; \ int nset = 0; /* haploid or diploid? */ \ for (k_src=0; k_src<nG_ori; k_src++) { if ( is_vector_end ) break; nset++; } \ if ( nset==nR_ori ) /* haploid */ \ { \ for (k_src=0; k_src<nR_ori; k_src++) \ { \ if ( rm_mask & 1<<k_src ) continue; \ memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \ k_dst++; \ } \ memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \ } \ else /* diploid */ \ { \ k_src = -1; \ for (ia=0; ia<nR_ori; ia++) \ { \ for (ib=0; ib<=ia; ib++) \ { \ k_src++; \ if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); ia = nR_ori; break; } \ if ( rm_mask & 1<<ia || rm_mask & 1<<ib ) continue; \ memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \ k_dst++; \ } \ } \ } \ } \ } switch (type) { case BCF_HT_INT: BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break; case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break; } #undef BRANCH } nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)dat, ndat, type); if ( nret<0 ) { fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); exit(1); } } free(dat); free(str.s); free(map); }
/*
 *  process() - check the genotypes of every trio at one record for
 *  Mendelian consistency.
 *
 *  For each trio the mother's, father's and child's GT alleles are turned
 *  into 64-bit allele bitmasks and tested for compatibility.  trio->nok or
 *  trio->nbad is incremented accordingly.  With MODE_DELETE the genotypes of
 *  inconsistent trios are overwritten with bcf_gt_missing and the record's
 *  GT field is updated.
 *
 *  Returns:
 *      - on the early exits (more than 63 alleles, GT cannot be read, GT is
 *        neither haploid nor diploid): rec in MODE_LIST_GOOD, NULL otherwise
 *      - MODE_DELETE:    rec
 *      - MODE_LIST_GOOD: rec if no trio was inconsistent, NULL otherwise
 *      - MODE_LIST_BAD:  rec if some trio was inconsistent, NULL otherwise
 *      - no mode bit set: NULL
 */
bcf1_t *process(bcf1_t *rec)
{
    bcf1_t *dflt = args.mode&MODE_LIST_GOOD ? rec : NULL;
    args.nrec++;

    if ( rec->n_allele > 63 ) return dflt;      // allele sets are kept in 64-bit bitmasks below

    int ngt = bcf_get_genotypes(args.hdr, rec, &args.gt_arr, &args.ngt_arr);
    if ( ngt<0 ) return dflt;
    if ( ngt!=2*bcf_hdr_nsamples(args.hdr) && ngt!=bcf_hdr_nsamples(args.hdr) ) return dflt;
    ngt /= bcf_hdr_nsamples(args.hdr);          // GT values per sample: 1 or 2

    // Non-zero itr_set selects the rule-based branch below.
    // NOTE(review): presumably set when a ploidy rule overlaps this position - confirm against regidx
    int itr_set = regidx_overlap(args.rules, bcf_seqname(args.hdr,rec),rec->pos,rec->pos, args.itr_ori);

    int i, has_bad = 0, needs_update = 0;
    for (i=0; i<args.ntrios; i++)
    {
        trio_t *trio = &args.trios[i];

        // a,b: mother; c,d: father; e,f: child. The second allele is set to
        // bcf_int32_vector_end when only one GT value per sample is stored.
        int32_t a,b,c,d,e,f;
        a = args.gt_arr[ngt*trio->imother];
        b = ngt==2 ? args.gt_arr[ngt*trio->imother+1] : bcf_int32_vector_end;
        c = args.gt_arr[ngt*trio->ifather];
        d = ngt==2 ? args.gt_arr[ngt*trio->ifather+1] : bcf_int32_vector_end;
        e = args.gt_arr[ngt*trio->ichild];
        f = ngt==2 ? args.gt_arr[ngt*trio->ichild+1] : bcf_int32_vector_end;

        // skip sites with missing data in child
        if ( bcf_gt_is_missing(e) || bcf_gt_is_missing(f) ) continue;

        // The masks are 64 bits wide, therefore the shifted constant must be
        // 64-bit as well: shifting a plain int by 31 or more is undefined.
        uint64_t mother = 0, father = 0, child1, child2;
        int is_ok = 0;

        if ( !itr_set )
        {
            if ( f==bcf_int32_vector_end )
            {
                warn_ploidy(rec);
                continue;
            }

            // All M,F,C genotypes are diploid. Missing data are considered consistent.
            child1  = 1ULL<<bcf_gt_allele(e);
            child2  = 1ULL<<bcf_gt_allele(f);
            mother  = bcf_gt_is_missing(a) ? child1|child2 : 1ULL<<bcf_gt_allele(a);
            mother |= bcf_gt_is_missing(b) || b==bcf_int32_vector_end ? child1|child2 : 1ULL<<bcf_gt_allele(b);
            father  = bcf_gt_is_missing(c) ? child1|child2 : 1ULL<<bcf_gt_allele(c);
            father |= bcf_gt_is_missing(d) || d==bcf_int32_vector_end ? child1|child2 : 1ULL<<bcf_gt_allele(d);

            // one child allele must come from each parent
            if ( (mother&child1 && father&child2) || (mother&child2 && father&child1) ) is_ok = 1;
        }
        else
        {
            // Missing or absent alleles contribute no bits here; an empty
            // parental mask means "nothing known about this parent".
            child1  = 1ULL<<bcf_gt_allele(e);
            child2  = bcf_gt_is_missing(f) || f==bcf_int32_vector_end ? 0 : 1ULL<<bcf_gt_allele(f);
            mother |= bcf_gt_is_missing(a) ? 0 : 1ULL<<bcf_gt_allele(a);
            mother |= bcf_gt_is_missing(b) || b==bcf_int32_vector_end ? 0 : 1ULL<<bcf_gt_allele(b);
            father |= bcf_gt_is_missing(c) ? 0 : 1ULL<<bcf_gt_allele(c);
            father |= bcf_gt_is_missing(d) || d==bcf_int32_vector_end ? 0 : 1ULL<<bcf_gt_allele(d);

            // Try the overlapping rules one by one until one accepts the trio
            regitr_copy(args.itr, args.itr_ori);
            while ( !is_ok && regitr_overlap(args.itr) )
            {
                rule_t *rule = &regitr_payload(args.itr,rule_t);
                if ( child1 && child2 )
                {
                    if ( !rule->mal || !rule->fal ) continue;   // wrong rule (haploid), but this is a diploid GT
                    if ( !mother ) mother = child1|child2;
                    if ( !father ) father = child1|child2;
                    if ( (mother&child1 && father&child2) || (mother&child2 && father&child1) ) is_ok = 1;
                    continue;
                }
                // Haploid child: the single allele must be present in every
                // parent the rule refers to, unless that parent's GT is unknown
                if ( rule->mal )
                {
                    if ( mother && !(child1&mother) ) continue;
                }
                if ( rule->fal )
                {
                    if ( father && !(child1&father) ) continue;
                }
                is_ok = 1;
            }
        }

        if ( is_ok )
        {
            trio->nok++;
            continue;
        }

        trio->nbad++;
        has_bad = 1;
        if ( args.mode&MODE_DELETE )
        {
            // Blank the genotypes of the whole trio; second alleles only if they are stored
            args.gt_arr[ngt*trio->imother] = bcf_gt_missing;
            if ( b!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->imother+1] = bcf_gt_missing; // should be always true
            args.gt_arr[ngt*trio->ifather] = bcf_gt_missing;
            if ( d!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->ifather+1] = bcf_gt_missing;
            args.gt_arr[ngt*trio->ichild] = bcf_gt_missing;
            if ( f!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->ichild+1] = bcf_gt_missing;
            needs_update = 1;
        }
    }

    if ( needs_update && bcf_update_genotypes(args.hdr,rec,args.gt_arr,ngt*bcf_hdr_nsamples(args.hdr)) )
        error("Could not update GT field at %s:%d\n", bcf_seqname(args.hdr,rec),rec->pos+1);

    if ( args.mode&MODE_DELETE ) return rec;
    if ( args.mode&MODE_LIST_GOOD ) return has_bad ? NULL : rec;
    if ( args.mode&MODE_LIST_BAD ) return has_bad ? rec : NULL;

    return NULL;
}
/*
 *  parse_line() - obtain the ALT allele frequency and the genotype
 *  likelihoods P(D|G) of sample args->ismpl at one site.
 *
 *  @alt_freq:  filled with the ALT allele frequency; a frequency of zero is
 *              replaced by args->dflt_AF when that is non-zero
 *  @pdg:       array of three values filled with the likelihoods of the
 *              genotypes 0/0, 0/1 and 1/1
 *
 *  The frequency is taken from, in this order: the INFO tag args->af_tag, the
 *  file args->af_fname (via read_AF), INFO/AC and INFO/AN unless
 *  args->estimate_AF is set, and finally the genotypes (estimate_AF).
 *
 *  Returns 0 on success and a negative value when the site cannot be used.
 *  A type mismatch of the AF tag is reported through error().
 */
int parse_line(args_t *args, bcf1_t *line, double *alt_freq, double *pdg)
{
    args->nitmp = 0;    // args->itmp holds no genotypes yet

    // Set allele frequency
    int ret;
    if ( args->af_tag )
    {
        // Use an INFO tag provided by the user
        ret = bcf_get_info_float(args->hdr, line, args->af_tag, &args->AFs, &args->mAFs);
        if ( ret==-2 )
            error("Type mismatch for INFO/%s tag at %s:%d\n", args->af_tag, bcf_seqname(args->hdr,line), line->pos+1);
        // The first value is used when several are present; pdg[] below is
        // also built from the first three PL values only. An empty tag must
        // not let *alt_freq pass through unset.
        if ( ret>0 ) *alt_freq = args->AFs[0];
        else if ( ret==0 ) ret = -1;
    }
    else if ( args->af_fname )
    {
        // Read AF from a file
        ret = read_AF(args->files->targets, line, alt_freq);
    }
    else
    {
        // Use GTs or AC/AN: GTs when AC/AN not present or when GTs explicitly requested by --estimate-AF
        ret = -1;
        if ( !args->estimate_AF )
        {
            int AC = -1, AN = 0;
            ret = bcf_get_info_int32(args->hdr, line, "AN", &args->itmp, &args->mitmp);
            if ( ret==1 )
            {
                AN = args->itmp[0];
                ret = bcf_get_info_int32(args->hdr, line, "AC", &args->itmp, &args->mitmp);
                if ( ret>0 ) AC = args->itmp[0];
            }
            if ( AN<=0 || AC<0 ) ret = -1;
            else *alt_freq = (double) AC/AN;
        }
        if ( ret==-1 ) ret = estimate_AF(args, line, alt_freq);     // reads GTs into args->itmp
    }
    if ( ret<0 ) return ret;

    if ( *alt_freq==0.0 )
    {
        if ( args->dflt_AF==0 ) return -1;  // we skip sites with AF=0
        *alt_freq = args->dflt_AF;
    }

    // Set P(D|G)
    if ( args->fake_PLs )
    {
        // No PLs: derive the likelihoods from the genotype call
        if ( !args->nitmp )
        {
            args->nitmp = bcf_get_genotypes(args->hdr, line, &args->itmp, &args->mitmp);
            if ( args->nitmp != 2*args->nsmpl ) return -1;  // not diploid?
            args->nitmp /= args->nsmpl;
        }

        int32_t *gt = &args->itmp[args->ismpl*args->nitmp];
        if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) return -1;
        // A haploid call padded with vector_end is not a diploid genotype and
        // must not be decoded as an allele
        if ( gt[1]==bcf_int32_vector_end ) return -1;

        int a = bcf_gt_allele(gt[0]);
        int b = bcf_gt_allele(gt[1]);
        if ( a!=b )
        {
            // heterozygous
            pdg[0] = pdg[2] = args->unseen_PL;
            pdg[1] = 1 - 2*args->unseen_PL;
        }
        else if ( a==0 )
        {
            // homozygous for allele 0
            pdg[0] = 1 - 2*args->unseen_PL;
            pdg[1] = pdg[2] = args->unseen_PL;
        }
        else
        {
            // homozygous for a non-zero allele
            pdg[0] = pdg[1] = args->unseen_PL;
            pdg[2] = 1 - 2*args->unseen_PL;
        }
    }
    else
    {
        // n_allele*(n_allele+1) is always even, the division is exact
        int npl_expected = args->nsmpl*line->n_allele*(line->n_allele+1)/2;
        args->nitmp = bcf_get_format_int32(args->hdr, line, "PL", &args->itmp, &args->mitmp);
        if ( args->nitmp != npl_expected ) return -1;   // not diploid?
        args->nitmp /= args->nsmpl;
        if ( args->nitmp < 3 ) return -1;               // three values per sample are read below

        int32_t *pl = &args->itmp[args->ismpl*args->nitmp];
        int i;
        for (i=0; i<3; i++)
        {
            // htslib's missing and vector-end markers are negative and would
            // index in front of the table
            if ( pl[i]<0 ) return -1;
            // Clamp to the last entry: a large PL is the least likely
            // genotype, not one with probability 1.
            // NOTE(review): assumes pl2p has 256 entries, as the original "<256" test implies - confirm
            pdg[i] = args->pl2p[ pl[i] < 256 ? pl[i] : 255 ];
        }

        // Normalise so that the three likelihoods sum to one
        double sum = pdg[0] + pdg[1] + pdg[2];
        if ( !sum ) return -1;
        pdg[0] /= sum;
        pdg[1] /= sum;
        pdg[2] /= sum;
    }

    return 0;
}