/*
 * Process one step of the phased concatenation.
 *
 * arec is a record read through readers[0]; brec is a record read through
 * readers[1], or NULL when there is none.
 *
 *  - A parse error flagged on either record is fatal.
 *  - When the chromosome of arec differs from the previous one, pending
 *    records are flushed with phased_flush(), every sample's phase set is
 *    reset to the 1-based position of arec, and the chromosome is marked as
 *    seen.  Meeting a chromosome that was seen before is fatal.
 *  - Without brec the record is translated to the output header, its phase is
 *    updated when swaps are pending, PS is set (always, or only after a phase
 *    set change with compact_PS) and the record is written immediately.
 *  - With brec the current buffer[0] records of both readers are moved into
 *    args->buf (presumably these are arec and brec themselves - the swap
 *    works on the readers' buffers directly; confirm against the caller).
 */
static void phased_push(args_t *args, bcf1_t *arec, bcf1_t *brec)
{
    if ( arec && arec->errcode )
        error("Parse error at %s:%d, cannot proceed: %s\n", bcf_seqname(args->files->readers[0].header,arec),arec->pos+1, args->files->readers[0].fname);
    if ( brec && brec->errcode )
        error("Parse error at %s:%d, cannot proceed: %s\n", bcf_seqname(args->files->readers[1].header,brec),brec->pos+1, args->files->readers[1].fname);

    int i, nsmpl = bcf_hdr_nsamples(args->out_hdr);
    int chr_id = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[0].header,arec));
    if ( args->prev_chr<0 || args->prev_chr!=chr_id )
    {
        // New chromosome: finish the previous one and start fresh phase sets
        if ( args->prev_chr>=0 ) phased_flush(args);

        for (i=0; i<nsmpl; i++)
            args->phase_set[i] = arec->pos+1;
        args->phase_set_changed = 1;

        if ( args->seen_seq[chr_id] )
            error("The chromosome block %s is not contiguous\n", bcf_seqname(args->files->readers[0].header,arec));
        args->seen_seq[chr_id] = 1;
        args->prev_chr = chr_id;
        args->prev_pos_check = -1;
    }

    if ( !brec )
    {
        // Only one chunk covers this site, the record can go out directly
        bcf_translate(args->out_hdr, args->files->readers[0].header, arec);
        if ( args->nswap )
            phase_update(args, args->out_hdr, arec);
        if ( !args->compact_PS || args->phase_set_changed )
        {
            bcf_update_format_int32(args->out_hdr,arec,"PS",args->phase_set,nsmpl);
            args->phase_set_changed = 0;
        }

        // A failed write must not go unnoticed, the output would be silently truncated
        if ( bcf_write(args->out_fh, args->out_hdr, arec)!=0 )
            error("[%s] Error: cannot write to %s\n", __func__, args->output_fname);

        if ( arec->pos < args->prev_pos_check )
            error("FIXME, disorder: %s:%d in %s vs %d written  [3]\n", bcf_seqname(args->files->readers[0].header,arec), arec->pos+1,args->files->readers[0].fname, args->prev_pos_check+1);
        args->prev_pos_check = arec->pos;
        return;
    }

    // Both chunks have a record: buffer the pair.  Newly allocated slots are
    // initialised so that the swaps below hand valid records back to the readers.
    int m = args->mbuf;
    args->nbuf += 2;
    hts_expand(bcf1_t*,args->nbuf,args->mbuf,args->buf);
    for (i=m; i<args->mbuf; i++)
        args->buf[i] = bcf_init1();

    SWAP(bcf1_t*, args->files->readers[0].buffer[0], args->buf[args->nbuf-2]);
    SWAP(bcf1_t*, args->files->readers[1].buffer[0], args->buf[args->nbuf-1]);
}
/*
 * Per-record callback, invoked once the standard annotation work is done.
 *
 * Prints one tab-separated line to stdout: chromosome, 1-based position, REF,
 * the first ALT (or "." when there is none), followed by one value per sample.
 * Sites with a single allele get "0.0" for every sample.  Otherwise the
 * registered handlers are tried in order until one returns 0; if none does,
 * "-1.0" is printed for every sample.
 *
 * Always returns 1.  (By the calling convention 0 means success, 1 suppresses
 * printing of the line and -1 signals a critical error.)
 */
int process(bcf1_t *rec)
{
    const char *first_alt = ".";
    if ( rec->n_allele>1 ) first_alt = rec->d.allele[1];

    printf("%s\t%d\t%s\t%s", bcf_seqname(in_hdr,rec), rec->pos+1, rec->d.allele[0], first_alt);

    int ismpl;
    if ( rec->n_allele==1 )
    {
        // Reference-only site
        for (ismpl=0; ismpl<rec->n_sample; ismpl++)
            printf("\t0.0");
    }
    else
    {
        // The first handler returning 0 has printed the values
        int ihandler = 0, printed = 0;
        while ( !printed && ihandler<nhandlers )
        {
            if ( handlers[ihandler](rec)==0 )
                printed = 1;
            else
                ihandler++;
        }

        if ( !printed )
        {
            // None of the annotations is present
            for (ismpl=0; ismpl<rec->n_sample; ismpl++)
                printf("\t-1.0");
        }
    }

    printf("\n");
    return 1;
}
/*
 * Load the genetic map for the chromosome of the given record.
 *
 * args->genmap_fname may contain the string "{CHROM}", which is replaced by
 * the sequence name of `line`.  The file must start with the header line
 * "position COMBINED_rate(cM/Mb) Genetic_Map(cM)"; from each following line
 * the first column (position) and the third column (genetic map in cM) are
 * read.  After loading, the rates are divided by the last rate so that they
 * are scaled to 1.
 *
 * Returns 0 on success or when no map was requested (args->ngenmap is set to
 * 0 in the latter case), -1 when the file could not be opened.  Malformed
 * content is a fatal error.
 */
static int load_genmap(args_t *args, bcf1_t *line)
{
    if ( !args->genmap_fname ) { args->ngenmap = 0; return 0; }

    // The expanded file name lives in its own buffer: `str` is reused for
    // reading lines and must not invalidate the name used in error messages.
    kstring_t fname_buf = {0,0,0};
    kstring_t str = {0,0,0};
    const char *fname = args->genmap_fname;
    char *chrom_tag = strstr(args->genmap_fname,"{CHROM}");
    if ( chrom_tag )
    {
        kputsn(args->genmap_fname, chrom_tag - args->genmap_fname, &fname_buf);
        kputs(bcf_seqname(args->hdr,line), &fname_buf);
        kputs(chrom_tag+7,&fname_buf);  // 7 = strlen("{CHROM}")
        fname = fname_buf.s;
    }

    htsFile *fp = hts_open(fname, "rb");
    if ( !fp )
    {
        free(fname_buf.s);
        args->ngenmap = 0;
        return -1;
    }

    // An empty file leaves the buffer untouched (possibly NULL), treat it as an empty header
    const char *hdr_line = "";
    if ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 && str.s ) hdr_line = str.s;
    if ( strcmp(hdr_line,"position COMBINED_rate(cM/Mb) Genetic_Map(cM)") )
        error("Unexpected header, found:\n\t[%s], but expected:\n\t[position COMBINED_rate(cM/Mb) Genetic_Map(cM)]\n", hdr_line);

    args->ngenmap = args->igenmap = 0;
    while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
    {
        args->ngenmap++;
        hts_expand(genmap_t,args->ngenmap,args->mgenmap,args->genmap);
        genmap_t *gm = &args->genmap[args->ngenmap-1];

        char *tmp, *end;
        gm->pos = strtol(str.s, &tmp, 10);
        if ( str.s==tmp || !*tmp ) error("Could not parse %s: %s\n", fname, str.s);

        // skip second column; the line must not end before the third one
        tmp++;
        while ( *tmp && !isspace((unsigned char)*tmp) ) tmp++;
        if ( !*tmp ) error("Could not parse %s: %s\n", fname, str.s);

        // read the genetic map in cM
        gm->rate = strtod(tmp+1, &end);
        if ( tmp+1==end ) error("Could not parse %s: %s\n", fname, str.s);
    }
    if ( !args->ngenmap ) error("Genetic map empty?\n");

    int i;
    for (i=0; i<args->ngenmap; i++)
        args->genmap[i].rate /= args->genmap[args->ngenmap-1].rate; // scale to 1

    if ( hts_close(fp) ) error("Close failed\n");
    free(str.s);
    free(fname_buf.s);
    return 0;
}
/*
 * Push a record back into the ring buffer of pending VCF lines.
 *
 * The record *rec_ptr is stored in a newly appended slot of args->vcf_buf and
 * *rec_ptr is pointed at the record that previously occupied that slot (a
 * fresh one is allocated if the slot was empty).  The exchange is needed
 * because the line would be overwritten by the next bcf_sr_next_line call.
 * A full buffer is a fatal error.
 */
static void unread_vcf_line(args_t *args, bcf1_t **rec_ptr)
{
    bcf1_t *pushed = *rec_ptr;

    if ( args->vcf_rbuf.n >= args->vcf_rbuf.m )
        error("FIXME: too many overlapping records near %s:%d\n", bcf_seqname(args->hdr,pushed),pushed->pos+1);

    // Claim a slot and make sure it holds a spare record to hand back
    int slot = rbuf_append(&args->vcf_rbuf);
    if ( args->vcf_buf[slot]==NULL )
        args->vcf_buf[slot] = bcf_init1();

    // The caller gets the spare record, the buffer keeps the pushed one
    *rec_ptr = args->vcf_buf[slot];
    args->vcf_buf[slot] = pushed;
}
/*
 * Annotate indels overlapping an exon with INFO/OOF: one value per ALT
 * allele, 1 for out-of-frame, 0 for in-frame and -1 when not applicable
 * (the allele is not an indel or none of its bases fall into the exon).
 *
 * Records which are not indel variants or which do not overlap any region in
 * `exons` are left untouched.  Returns 0 on success, -1 when the INFO tag
 * could not be updated.
 */
int process(bcf1_t *rec)
{
    if ( rec->n_allele<2 ) return 0;    // not a variant

    int type = bcf_get_variant_types(rec);
    if ( !(type&VCF_INDEL) ) return 0;  // not an indel

    // Length of the longest deletion among the ALT alleles: negative, or 0 if there is none
    int i, len = 0;
    for (i=1; i<rec->n_allele; i++)
        if ( len > rec->d.var[i].n ) len = rec->d.var[i].n;

    // The query must cover the deleted bases as well, otherwise deletions
    // starting in front of an exon and reaching into it would be missed.
    // len is negative or zero, hence the subtraction.
    int pos_to = rec->pos - len;
    if ( bcf_sr_regions_overlap(exons, bcf_seqname(in_hdr,rec),rec->pos,pos_to) ) return 0; // no overlap

    hts_expand(int32_t,rec->n_allele-1,nfrm,frm);
    for (i=1; i<rec->n_allele; i++)
    {
        if ( rec->d.var[i].type!=VCF_INDEL ) { frm[i-1] = -1; continue; }

        // tlen: number of inserted/deleted bases which fall into the exon
        int alen = rec->d.var[i].n, tlen = 0;
        if ( alen>0 )
        {
            // insertion
            if ( exons->start <= rec->pos && exons->end > rec->pos ) tlen = abs(alen);
        }
        else if ( exons->start <= rec->pos + abs(alen) )
        {
            // deletion
            tlen = abs(alen);
            if ( rec->pos < exons->start )              // trim the beginning
                tlen -= exons->start - rec->pos + 1;
            if ( exons->end < rec->pos + abs(alen) )    // trim the end
                tlen -= rec->pos + abs(alen) - exons->end;
        }

        if ( tlen )     // there are some deleted/inserted bases in the exon
        {
            if ( tlen%3 ) frm[i-1] = 1; // out-of-frame
            else frm[i-1] = 0;          // in-frame
        }
        else frm[i-1] = -1;             // not applicable (is outside)
    }

    if ( bcf_update_info_int32(out_hdr,rec,"OOF",frm,rec->n_allele-1)<0 ) return -1;
    return 0;
}
/*
 * Refresh the per-sample ploidy for the position of the given record.
 *
 * The ploidy of each sex at rec's sequence and position is queried into
 * args->sex2ploidy.  If it equals the previous query (args->sex2ploidy_prev)
 * nothing else is done.  Otherwise args->aux.ploidy is recomputed for all
 * samples and the two sex2ploidy arrays are exchanged, so that the current
 * values serve as "previous" for the next call.
 */
static void set_ploidy(args_t *args, bcf1_t *rec)
{
    ploidy_query(args->ploidy,(char*)bcf_seqname(args->aux.hdr,rec),rec->pos,args->sex2ploidy,NULL,NULL);

    // Find the first sex whose ploidy differs from the previous query
    int isex = 0;
    while ( isex<args->nsex && args->sex2ploidy[isex]==args->sex2ploidy_prev[isex] )
        isex++;
    if ( isex==args->nsex ) return;    // ploidy same as previously

    int ismpl;
    for (ismpl=0; ismpl<args->nsamples; ismpl++)
    {
        // A negative sample2sex value holds the negated ploidy itself,
        // a non-negative one is an index into sex2ploidy
        if ( args->sample2sex[ismpl]>=0 )
            args->aux.ploidy[ismpl] = args->sex2ploidy[args->sample2sex[ismpl]];
        else
            args->aux.ploidy[ismpl] = -1*args->sample2sex[ismpl];
    }

    int *current = args->sex2ploidy;
    args->sex2ploidy = args->sex2ploidy_prev;
    args->sex2ploidy_prev = current;
}
/*
 * Remove alleles from a VCF/BCF record and update all dependent fields.
 *
 * rm_mask selects the alleles to remove: allele i is removed when bit i is
 * set.  The reference allele (bit 0) cannot be removed.  If no ALT allele is
 * selected the record is left unchanged.
 *
 * Updated are
 *   - the allele list,
 *   - INFO tags declared as Number=A, R or G (string, integer and float),
 *   - GT, when the indexes of the remaining alleles have changed,
 *   - FORMAT tags declared as Number=A, R or G; Number=G values are expected
 *     to be haploid or diploid.
 *
 * A failure to read or update a tag is reported on stderr and terminates the
 * program with exit(1).
 */
void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask)
{
    int *map = (int*) calloc(line->n_allele, sizeof(int));

    // create map of indexes from old to new ALT numbering and modify ALT
    kstring_t str = {0,0,0};
    kputs(line->d.allele[0], &str);

    int nrm = 0, i,j;  // i: ori alleles, j: new alleles
    for (i=1, j=1; i<line->n_allele; i++)
    {
        if ( rm_mask & 1<<i )
        {
            // remove this allele
            line->d.allele[i] = NULL;
            nrm++;
            continue;
        }
        kputc(',', &str);
        kputs(line->d.allele[i], &str);
        map[i] = j;
        j++;
    }
    if ( !nrm ) { free(map); free(str.s); return; }

    // Numbers of values expected for Number=R, A and G before and after the removal
    int nR_ori = line->n_allele;
    int nR_new = line->n_allele-nrm;
    assert(nR_new > 0); // should not be able to remove reference allele
    int nA_ori = nR_ori-1;
    int nA_new = nR_new-1;

    int nG_ori = nR_ori*(nR_ori + 1)/2;
    int nG_new = nR_new*(nR_new + 1)/2;

    bcf_update_alleles_str(header, line, str.s);

    // remove from Number=G, Number=R and Number=A INFO fields.
    // `dat` is a scratch buffer shared by all tags, mdat_bytes tracks its size in bytes.
    uint8_t *dat = NULL;
    int mdat = 0, ndat = 0, mdat_bytes = 0, nret;
    for (i=0; i<line->n_info; i++)
    {
        bcf_info_t *info = &line->d.info[i];
        int vlen = bcf_hdr_id2length(header,BCF_HL_INFO,info->key);

        if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change

        int type = bcf_hdr_id2type(header,BCF_HL_INFO,info->key);
        if ( type==BCF_HT_FLAG ) continue;
        int size = 1;
        if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4;

        mdat = mdat_bytes / size;
        nret = bcf_get_info_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void**)&dat, &mdat, type);
        mdat_bytes = mdat * size;
        if ( nret<0 )
        {
            fprintf(stderr,"[%s:%d %s] Could not access INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                    bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
            exit(1);
        }

        if ( type==BCF_HT_STR )
        {
            // Comma-separated list: copy the fields of the kept alleles into str.
            // ss marks the start of the current field, se its end.
            str.l = 0;
            char *ss = (char*) dat, *se = (char*) dat;
            if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
            {
                int nexp, inc = 0;  // inc: offset between the field index and the allele index
                if ( vlen==BCF_VL_A )
                {
                    nexp = nA_ori;
                    inc  = 1;
                }
                else
                    nexp = nR_ori;
                for (j=0; j<nexp; j++)
                {
                    if ( !*se ) break;
                    while ( *se && *se!=',' ) se++;
                    if ( rm_mask & 1<<(j+inc) )
                    {
                        if ( *se ) se++;
                        ss = se;
                        continue;
                    }
                    if ( str.l ) kputc(',',&str);
                    kputsn(ss,se-ss,&str);
                    if ( *se ) se++;
                    ss = se;
                }
                assert( j==nexp );
            }
            else    // Number=G, assuming diploid genotype
            {
                int k = 0, n = 0;   // n: number of fields seen
                for (j=0; j<nR_ori; j++)
                {
                    for (k=0; k<=j; k++)
                    {
                        if ( !*se ) break;
                        while ( *se && *se!=',' ) se++;
                        n++;
                        if ( rm_mask & 1<<j || rm_mask & 1<<k )
                        {
                            if ( *se ) se++;
                            ss = se;
                            continue;
                        }
                        if ( str.l ) kputc(',',&str);
                        kputsn(ss,se-ss,&str);
                        if ( *se ) se++;
                        ss = se;
                    }
                    if ( !*se ) break;
                }
                // Must be a comparison: an assignment would make the check always pass
                assert( n==nG_ori );
            }

            nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)str.s, str.l, type);
            if ( nret<0 )
            {
                fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                        bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
                exit(1);
            }
            continue;
        }

        // Numeric values are compacted in place, kept values are moved towards the front of dat
        if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
        {
            int inc = 0, ntop;
            if ( vlen==BCF_VL_A )
            {
                assert( nret==nA_ori );
                ntop = nA_ori;
                ndat = nA_new;
                inc  = 1;
            }
            else
            {
                assert( nret==nR_ori );
                ntop = nR_ori;
                ndat = nR_new;
            }
            int k = 0;

            #define BRANCH(type_t,is_vector_end) \
            { \
                type_t *ptr = (type_t*) dat; \
                int size = sizeof(type_t); \
                for (j=0; j<ntop; j++) /* j:ori, k:new */ \
                { \
                    if ( is_vector_end ) { memcpy(dat+k*size, dat+j*size, size); break; } \
                    if ( rm_mask & 1<<(j+inc) ) continue; \
                    if ( j!=k ) memcpy(dat+k*size, dat+j*size, size); \
                    k++; \
                } \
            }
            switch (type)
            {
                case BCF_HT_INT:  BRANCH(int32_t,ptr[j]==bcf_int32_vector_end); break;
                case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[j])); break;
            }
            #undef BRANCH
        }
        else    // Number=G
        {
            assert( nret==nG_ori );
            int k, l_ori = -1, l_new = 0;
            ndat = nG_new;

            #define BRANCH(type_t,is_vector_end) \
            { \
                type_t *ptr = (type_t*) dat; \
                int size = sizeof(type_t); \
                for (j=0; j<nR_ori; j++) \
                { \
                    for (k=0; k<=j; k++) \
                    { \
                        l_ori++; \
                        if ( is_vector_end ) { memcpy(dat+l_new*size, dat+l_ori*size, size); break; } \
                        if ( rm_mask & 1<<j || rm_mask & 1<<k ) continue; \
                        if ( l_ori!=l_new ) memcpy(dat+l_new*size, dat+l_ori*size, size); \
                        l_new++; \
                    } \
                } \
            }
            switch (type)
            {
                case BCF_HT_INT:  BRANCH(int32_t,ptr[l_ori]==bcf_int32_vector_end); break;
                case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[l_ori])); break;
            }
            #undef BRANCH
        }

        nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)dat, ndat, type);
        if ( nret<0 )
        {
            fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                    bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
            exit(1);
        }
    }

    // Update GT fields, the allele indexes might have changed
    for (i=1; i<line->n_allele; i++) if ( map[i]!=i ) break;
    if ( i<line->n_allele )
    {
        mdat = mdat_bytes / 4;  // sizeof(int32_t)
        nret = bcf_get_genotypes(header,line,(void**)&dat,&mdat);
        mdat_bytes = mdat * 4;
        if ( nret>0 )
        {
            nret /= line->n_sample;
            int32_t *ptr = (int32_t*) dat;
            for (i=0; i<line->n_sample; i++)
            {
                for (j=0; j<nret; j++)
                {
                    // Missing alleles can carry the phase bit as well; they must
                    // all be skipped, otherwise map[] would be indexed with -1
                    if ( bcf_gt_is_missing(ptr[j]) ) continue;
                    if ( ptr[j]==bcf_int32_vector_end ) break;
                    int al = bcf_gt_allele(ptr[j]);
                    assert( al<nR_ori && map[al]>=0 );
                    ptr[j] = (map[al]+1)<<1 | (ptr[j]&1);   // keep the phase bit
                }
                ptr += nret;
            }
            bcf_update_genotypes(header, line, (void*)dat, nret*line->n_sample);
        }
    }

    // Remove from Number=G, Number=R and Number=A FORMAT fields.
    // Assuming haploid or diploid GTs
    for (i=0; i<line->n_fmt; i++)
    {
        bcf_fmt_t *fmt = &line->d.fmt[i];
        int vlen = bcf_hdr_id2length(header,BCF_HL_FMT,fmt->id);

        if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change

        int type = bcf_hdr_id2type(header,BCF_HL_FMT,fmt->id);
        if ( type==BCF_HT_FLAG ) continue;

        int size = 1;
        if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4;

        mdat = mdat_bytes / size;
        nret = bcf_get_format_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void**)&dat, &mdat, type);
        mdat_bytes = mdat * size;
        if ( nret<0 )
        {
            fprintf(stderr,"[%s:%d %s] Could not access FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                    bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
            exit(1);
        }

        if ( type==BCF_HT_STR )
        {
            // Each sample occupies a fixed number of bytes.  The shortened
            // strings are padded with zeros to the original width so that the
            // per-sample layout stays regular.
            int size = nret/line->n_sample;     // number of bytes per sample
            str.l = 0;
            if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
            {
                int nexp, inc = 0;
                if ( vlen==BCF_VL_A )
                {
                    nexp = nA_ori;
                    inc  = 1;
                }
                else
                    nexp = nR_ori;
                for (j=0; j<line->n_sample; j++)
                {
                    char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss;
                    int k_src = 0, k_dst = 0, l = str.l;
                    for (k_src=0; k_src<nexp; k_src++)
                    {
                        if ( ptr>=se || !*ptr) break;
                        while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
                        if ( rm_mask & 1<<(k_src+inc) )
                        {
                            ss = ++ptr;
                            continue;
                        }
                        if ( k_dst ) kputc(',',&str);
                        kputsn(ss,ptr-ss,&str);
                        ss = ++ptr;
                        k_dst++;
                    }
                    assert( k_src==nexp );
                    l = str.l - l;
                    for (; l<size; l++) kputc(0, &str);
                }
            }
            else    // Number=G, diploid or haploid
            {
                for (j=0; j<line->n_sample; j++)
                {
                    char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss;
                    int k_src = 0, k_dst = 0, l = str.l;
                    int nexp = 0; // diploid or haploid?
                    while ( ptr<se )
                    {
                        if ( !*ptr ) break;
                        if ( *ptr==',' ) nexp++;
                        ptr++;
                    }
                    if ( ptr!=ss ) nexp++;
                    assert( nexp==nG_ori || nexp==nR_ori );
                    ptr = ss;
                    if ( nexp==nG_ori ) // diploid
                    {
                        int ia, ib;
                        for (ia=0; ia<nR_ori; ia++)
                        {
                            for (ib=0; ib<=ia; ib++)
                            {
                                if ( ptr>=se || !*ptr ) break;
                                while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
                                if ( rm_mask & 1<<ia || rm_mask & 1<<ib )
                                {
                                    ss = ++ptr;
                                    continue;
                                }
                                if ( k_dst ) kputc(',',&str);
                                kputsn(ss,ptr-ss,&str);
                                ss = ++ptr;
                                k_dst++;
                            }
                            if ( ptr>=se || !*ptr ) break;
                        }
                    }
                    else    // haploid
                    {
                        for (k_src=0; k_src<nR_ori; k_src++)
                        {
                            if ( ptr>=se || !*ptr ) break;
                            while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
                            if ( rm_mask & 1<<k_src )
                            {
                                ss = ++ptr;
                                continue;
                            }
                            if ( k_dst ) kputc(',',&str);
                            kputsn(ss,ptr-ss,&str);
                            ss = ++ptr;
                            k_dst++;
                        }
                        assert( k_src==nR_ori );
                    }
                    // Pad diploid as well as haploid samples to the fixed width
                    l = str.l - l;
                    for (; l<size; l++) kputc(0, &str);
                }
            }
            nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)str.s, str.l, type);
            if ( nret<0 )
            {
                fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                        bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
                exit(1);
            }
            continue;
        }

        int nori = nret / line->n_sample;   // number of values per sample
        if ( vlen==BCF_VL_A || vlen==BCF_VL_R || (vlen==BCF_VL_G && nori==nR_ori) )    // Number=A, R or haploid Number=G
        {
            // nnew is the per-sample stride of the compacted data.  It must
            // agree with ndat: nA_new for Number=A, nR_new otherwise.
            int ntop, nnew, inc = 0;
            if ( vlen==BCF_VL_A )
            {
                assert( nori==nA_ori );     // todo: will fail if all values are missing
                ntop = nA_ori;
                nnew = nA_new;
                ndat = nA_new*line->n_sample;
                inc  = 1;
            }
            else
            {
                assert( nori==nR_ori );     // todo: will fail if all values are missing
                ntop = nR_ori;
                nnew = nR_new;
                ndat = nR_new*line->n_sample;
            }

            #define BRANCH(type_t,is_vector_end) \
            { \
                for (j=0; j<line->n_sample; j++) \
                { \
                    type_t *ptr_src = ((type_t*)dat) + j*nori; \
                    type_t *ptr_dst = ((type_t*)dat) + j*nnew; \
                    int size = sizeof(type_t); \
                    int k_src, k_dst = 0; \
                    for (k_src=0; k_src<ntop; k_src++) \
                    { \
                        if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); break; } \
                        if ( rm_mask & 1<<(k_src+inc) ) continue; \
                        if ( k_src!=k_dst ) memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
                        k_dst++; \
                    } \
                } \
            }
            switch (type)
            {
                case BCF_HT_INT:  BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break;
                case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break;
            }
            #undef BRANCH
        }
        else    // Number=G, diploid or mixture of haploid+diploid
        {
            assert( nori==nG_ori );
            ndat = nG_new*line->n_sample;

            #define BRANCH(type_t,is_vector_end) \
            { \
                for (j=0; j<line->n_sample; j++) \
                { \
                    type_t *ptr_src = ((type_t*)dat) + j*nori; \
                    type_t *ptr_dst = ((type_t*)dat) + j*nG_new; \
                    int size = sizeof(type_t); \
                    int ia, ib, k_dst = 0, k_src; \
                    int nset = 0;   /* haploid or diploid? */ \
                    for (k_src=0; k_src<nG_ori; k_src++) { if ( is_vector_end ) break; nset++; } \
                    if ( nset==nR_ori ) /* haploid */ \
                    { \
                        for (k_src=0; k_src<nR_ori; k_src++) \
                        { \
                            if ( rm_mask & 1<<k_src ) continue; \
                            if ( k_src!=k_dst ) memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
                            k_dst++; \
                        } \
                        memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
                    } \
                    else /* diploid */ \
                    { \
                        k_src = -1; \
                        for (ia=0; ia<nR_ori; ia++) \
                        { \
                            for (ib=0; ib<=ia; ib++) \
                            { \
                                k_src++; \
                                if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); ia = nR_ori; break; } \
                                if ( rm_mask & 1<<ia || rm_mask & 1<<ib ) continue; \
                                if ( k_src!=k_dst ) memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
                                k_dst++; \
                            } \
                        } \
                    } \
                } \
            }
            switch (type)
            {
                case BCF_HT_INT:  BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break;
                case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break;
            }
            #undef BRANCH
        }
        nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)dat, ndat, type);
        if ( nret<0 )
        {
            fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                    bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
            exit(1);
        }
    }
    free(dat);
    free(str.s);
    free(map);
}
/*
 * Check the genotypes of all trios at one site for Mendelian consistency.
 *
 * For each trio the alleles of mother, father and child are turned into
 * 64-bit masks (bit i set = allele i present).  A trio is consistent when one
 * child allele can come from the mother and the other from the father.  If
 * rules overlap the site (args.rules), they are walked to decide how haploid
 * genotypes are to be treated; rule->mal / rule->fal presumably tell whether
 * the mother / the father passes an allele in the region - confirm against
 * the rule parser.  Trios with missing child data are skipped.
 *
 * Per-trio counters nok/nbad are updated.  With MODE_DELETE the genotypes of
 * inconsistent trios are set to missing and the record is always returned.
 * With MODE_LIST_GOOD the record is returned only when no trio is
 * inconsistent, with MODE_LIST_BAD only when some trio is.  Records which
 * cannot be evaluated (more than 63 alleles, no or unexpected GT ploidy) are
 * returned in MODE_LIST_GOOD and dropped (NULL) otherwise.
 */
bcf1_t *process(bcf1_t *rec)
{
    bcf1_t *dflt = args.mode&MODE_LIST_GOOD ? rec : NULL;
    args.nrec++;

    if ( rec->n_allele > 63 ) return dflt;      // we use 64bit bitmask below

    int ngt = bcf_get_genotypes(args.hdr, rec, &args.gt_arr, &args.ngt_arr);
    if ( ngt<0 ) return dflt;
    if ( ngt!=2*bcf_hdr_nsamples(args.hdr) && ngt!=bcf_hdr_nsamples(args.hdr) ) return dflt;
    ngt /= bcf_hdr_nsamples(args.hdr);  // from now on the number of GT values per sample (1 or 2)

    int itr_set = regidx_overlap(args.rules, bcf_seqname(args.hdr,rec),rec->pos,rec->pos, args.itr_ori);

    int i, has_bad = 0, needs_update = 0;
    for (i=0; i<args.ntrios; i++)
    {
        // a,b: mother; c,d: father; e,f: child.  The second value is set to
        // vector_end when the site has haploid genotypes only.
        int32_t a,b,c,d,e,f;
        trio_t *trio = &args.trios[i];
        a = args.gt_arr[ngt*trio->imother];
        b = ngt==2 ? args.gt_arr[ngt*trio->imother+1] : bcf_int32_vector_end;
        c = args.gt_arr[ngt*trio->ifather];
        d = ngt==2 ? args.gt_arr[ngt*trio->ifather+1] : bcf_int32_vector_end;
        e = args.gt_arr[ngt*trio->ichild];
        f = ngt==2 ? args.gt_arr[ngt*trio->ichild+1] : bcf_int32_vector_end;

        // skip sites with missing data in child
        if ( bcf_gt_is_missing(e) || bcf_gt_is_missing(f) ) continue;

        // The masks are 64 bits wide, so the shifted constant must be 64-bit
        // as well: a plain int 1 overflows for allele indexes of 31 and more.
        uint64_t mother = 0, father = 0,child1,child2;

        int is_ok = 0;
        if ( !itr_set )
        {
            if ( f==bcf_int32_vector_end )
            {
                warn_ploidy(rec);
                continue;
            }

            // All M,F,C genotypes are diploid. Missing data are considered consistent.
            child1  = (uint64_t)1<<bcf_gt_allele(e);
            child2  = (uint64_t)1<<bcf_gt_allele(f);
            mother  = bcf_gt_is_missing(a) ? child1|child2 : (uint64_t)1<<bcf_gt_allele(a);
            mother |= bcf_gt_is_missing(b) || b==bcf_int32_vector_end ? child1|child2 : (uint64_t)1<<bcf_gt_allele(b);
            father  = bcf_gt_is_missing(c) ? child1|child2 : (uint64_t)1<<bcf_gt_allele(c);
            father |= bcf_gt_is_missing(d) || d==bcf_int32_vector_end ? child1|child2 : (uint64_t)1<<bcf_gt_allele(d);

            if ( (mother&child1 && father&child2) || (mother&child2 && father&child1) ) is_ok = 1;
        }
        else
        {
            child1  = (uint64_t)1<<bcf_gt_allele(e);
            child2  = bcf_gt_is_missing(f) || f==bcf_int32_vector_end ? 0 : (uint64_t)1<<bcf_gt_allele(f);
            mother |= bcf_gt_is_missing(a) ? 0 : (uint64_t)1<<bcf_gt_allele(a);
            mother |= bcf_gt_is_missing(b) || b==bcf_int32_vector_end ? 0 : (uint64_t)1<<bcf_gt_allele(b);
            father |= bcf_gt_is_missing(c) ? 0 : (uint64_t)1<<bcf_gt_allele(c);
            father |= bcf_gt_is_missing(d) || d==bcf_int32_vector_end ? 0 : (uint64_t)1<<bcf_gt_allele(d);

            // Walk the overlapping rules on a copy of the iterator, the
            // original is needed again for the next trio
            regitr_copy(args.itr, args.itr_ori);
            while ( !is_ok && regitr_overlap(args.itr) )
            {
                rule_t *rule = &regitr_payload(args.itr,rule_t);
                if ( child1 && child2 )
                {
                    if ( !rule->mal || !rule->fal ) continue; // wrong rule (haploid), but this is a diploid GT
                    if ( !mother ) mother = child1|child2;
                    if ( !father ) father = child1|child2;
                    if ( (mother&child1 && father&child2) || (mother&child2 && father&child1) ) is_ok = 1;
                    continue;
                }
                if ( rule->mal )
                {
                    if ( mother && !(child1&mother) ) continue;
                }
                if ( rule->fal )
                {
                    if ( father && !(child1&father) ) continue;
                }
                is_ok = 1;
            }
        }
        if ( is_ok )
        {
            trio->nok++;
        }
        else
        {
            trio->nbad++;
            has_bad = 1;
            if ( args.mode&MODE_DELETE )
            {
                args.gt_arr[ngt*trio->imother] = bcf_gt_missing;
                if ( b!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->imother+1] = bcf_gt_missing; // should be always true
                args.gt_arr[ngt*trio->ifather] = bcf_gt_missing;
                if ( d!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->ifather+1] = bcf_gt_missing;
                args.gt_arr[ngt*trio->ichild] = bcf_gt_missing;
                if ( f!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->ichild+1] = bcf_gt_missing;
                needs_update = 1;
            }
        }
    }

    if ( needs_update && bcf_update_genotypes(args.hdr,rec,args.gt_arr,ngt*bcf_hdr_nsamples(args.hdr)) )
        error("Could not update GT field at %s:%d\n", bcf_seqname(args.hdr,rec),rec->pos+1);

    if ( args.mode&MODE_DELETE ) return rec;
    if ( args.mode&MODE_LIST_GOOD ) return has_bad ? NULL : rec;
    if ( args.mode&MODE_LIST_BAD ) return has_bad ? rec : NULL;

    return NULL;
}
/*
 * Recompute the INFO tags selected in args.tags (NS, AN, AF, AC, AC_Het,
 * AC_Hom, AC_Hemi) from the genotypes of the record.
 *
 * For each sample the set of called alleles is collected; reading stops at
 * the first missing allele or at the end of a shorter-ploidy genotype.
 * Samples with no called allele are skipped.  For every allele of a sample
 *   - nhet  is increased by 1 when the sample carries more than one distinct allele,
 *   - nhom  is increased by 2 when it carries one distinct allele in several copies,
 *   - nhemi is increased by 1 when it has a single called allele.
 *
 * Records without the GT tag are returned unchanged.  Malformed genotypes and
 * failed updates are fatal errors.  The record is always returned.
 */
bcf1_t *process(bcf1_t *rec)
{
    int i, ns = 0;
    bcf_unpack(rec, BCF_UN_FMT);
    bcf_fmt_t *fmt_gt = NULL;
    for (i=0; i<rec->n_fmt; i++)
        if ( rec->d.fmt[i].id==args.gt_id ) { fmt_gt = &rec->d.fmt[i]; break; }
    if ( !fmt_gt ) return rec;    // no GT tag

    hts_expand(int32_t,rec->n_allele,args.marr,args.arr);
    hts_expand(float,rec->n_allele,args.mfarr,args.farr);
    hts_expand(counts_t,rec->n_allele,args.mcounts,args.counts);
    memset(args.arr,0,sizeof(*args.arr)*rec->n_allele);
    memset(args.counts,0,sizeof(*args.counts)*rec->n_allele);

    /*
     * The alleles of a sample are collected in a 64-bit mask.  The mask is
     * unsigned so that the counting loop below terminates for any bit
     * pattern, and the allele index is validated before it is used as a shift
     * count: a negative index or one beyond the mask width would be undefined.
     */
    #define BRANCH_INT(type_t,vector_end) { \
        for (i=0; i<rec->n_sample; i++) \
        { \
            type_t *p = (type_t*) (fmt_gt->p + i*fmt_gt->size); \
            uint64_t als = 0; \
            int ial; \
            for (ial=0; ial<fmt_gt->n; ial++) \
            { \
                if ( p[ial]==vector_end ) break; /* smaller ploidy */ \
                if ( bcf_gt_is_missing(p[ial]) ) break; /* missing allele */ \
                int idx = bcf_gt_allele(p[ial]); \
                \
                if ( idx < 0 || idx >= rec->n_allele ) \
                    error("Incorrect allele (\"%d\") in %s at %s:%d\n",idx,args.in_hdr->samples[i],bcf_seqname(args.in_hdr,rec),rec->pos+1); \
                if ( idx >= 64 ) \
                    error("Too many alleles at %s:%d, at most 64 are supported\n",bcf_seqname(args.in_hdr,rec),rec->pos+1); \
                als |= (uint64_t)1<<idx; \
            } \
            if ( ial==0 ) continue; /* missing alleles */ \
            ns++; \
            int is_hom = als && !(als & (als-1)); /* only one bit is set */ \
            int is_hemi = ial==1; \
            for (ial=0; als; ial++) \
            { \
                if ( als&1 ) \
                { \
                    if ( !is_hom ) \
                        args.counts[ial].nhet++; \
                    else if ( !is_hemi ) \
                        args.counts[ial].nhom += 2; \
                    else \
                        args.counts[ial].nhemi++; \
                } \
                als >>= 1; \
            } \
        } \
    }
    switch (fmt_gt->type) {
        case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_vector_end); break;
        case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
        case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
        default: error("The GT type is not recognised: %d at %s:%d\n",fmt_gt->type, bcf_seqname(args.in_hdr,rec),rec->pos+1); break;
    }
    #undef BRANCH_INT

    if ( args.tags&SET_NS )
    {
        if ( bcf_update_info_int32(args.out_hdr,rec,"NS",&ns,1)!=0 )
            error("Error occurred while updating NS at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1);
    }
    if ( args.tags&SET_AN )
    {
        // args.arr[0] serves as the accumulator of the total allele count
        args.arr[0] = 0;
        for (i=0; i<rec->n_allele; i++)
            args.arr[0] += args.counts[i].nhet + args.counts[i].nhom + args.counts[i].nhemi;
        if ( bcf_update_info_int32(args.out_hdr,rec,"AN",args.arr,1)!=0 )
            error("Error occurred while updating AN at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1);
    }
    if ( args.tags&SET_AF )
    {
        int n = rec->n_allele-1;
        if ( n>0 )
        {
            args.arr[0] = 0;
            for (i=0; i<rec->n_allele; i++)
                args.arr[0] += args.counts[i].nhet + args.counts[i].nhom + args.counts[i].nhemi;
            for (i=1; i<rec->n_allele; i++)
                args.farr[i] = (args.counts[i].nhet + args.counts[i].nhom + args.counts[i].nhemi)*1.0/args.arr[0];
        }
        // The frequency is written only when the total count in arr[0] is non-zero
        if ( args.arr[0] )
        {
            if ( bcf_update_info_float(args.out_hdr,rec,"AF",args.farr+1,n)!=0 )
                error("Error occurred while updating AF at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1);
        }
    }
    if ( args.tags&SET_AC )
    {
        int n = rec->n_allele-1;
        if ( n>0 )
        {
            memset(args.arr,0,sizeof(*args.arr)*rec->n_allele);
            for (i=1; i<rec->n_allele; i++)
                args.arr[i] = args.counts[i].nhet + args.counts[i].nhom + args.counts[i].nhemi;
        }
        if ( bcf_update_info_int32(args.out_hdr,rec,"AC",args.arr+1,n)!=0 )
            error("Error occurred while updating AC at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1);
    }
    if ( args.tags&SET_AC_Het )
    {
        int n = rec->n_allele-1;
        if ( n>0 )
        {
            memset(args.arr,0,sizeof(*args.arr)*rec->n_allele);
            for (i=1; i<rec->n_allele; i++)
                args.arr[i] += args.counts[i].nhet;
        }
        if ( bcf_update_info_int32(args.out_hdr,rec,"AC_Het",args.arr+1,n)!=0 )
            error("Error occurred while updating AC_Het at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1);
    }
    if ( args.tags&SET_AC_Hom )
    {
        int n = rec->n_allele-1;
        if ( n>0 )
        {
            memset(args.arr,0,sizeof(*args.arr)*rec->n_allele);
            for (i=1; i<rec->n_allele; i++)
                args.arr[i] += args.counts[i].nhom;
        }
        if ( bcf_update_info_int32(args.out_hdr,rec,"AC_Hom",args.arr+1,n)!=0 )
            error("Error occurred while updating AC_Hom at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1);
    }
    if ( args.tags&SET_AC_Hemi )
    {
        int n = rec->n_allele-1;
        if ( n>0 )
        {
            memset(args.arr,0,sizeof(*args.arr)*rec->n_allele);
            for (i=1; i<rec->n_allele; i++)
                args.arr[i] += args.counts[i].nhemi;
        }
        if ( bcf_update_info_int32(args.out_hdr,rec,"AC_Hemi",args.arr+1,n)!=0 )
            error("Error occurred while updating AC_Hemi at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1);
    }
    return rec;
}
/*
 * Prepare the concatenation: merge the headers of all input files into
 * args->out_hdr, open the output and write its header, and set up the state
 * for the selected mode.
 *
 *  - All files must have the same samples in the same order, otherwise the
 *    program exits with an error.
 *  - With phased_concat the position of the first record of each file is
 *    stored in args->start_pos: -2 for a file without records, -1 when the
 *    file starts on a different chromosome than the previous file, the
 *    record position otherwise.  Files without records are then removed from
 *    the list and the remaining ones are checked to be in ascending order.
 *  - With allow_overlaps a synced reader is created over all files, with
 *    optional regions and the duplicate-removal mode from args->remove_dups.
 */
static void init_data(args_t *args)
{
    bcf1_t *line = NULL;

    // With phased concat, the chunks overlap and come in the right order.  To
    // avoid opening all files at once, store start positions to recognise need
    // for the next one. This way we can keep only two open chunks at once.
    if ( args->phased_concat )
    {
        args->start_pos = (int*) malloc(sizeof(int)*args->nfnames);
        line = bcf_init();
    }

    int i, prev_chrid = -1;
    for (i=0; i<args->nfnames; i++)
    {
        htsFile *fp = hts_open(args->fnames[i], "r");
        if ( !fp ) error("Failed to open: %s\n", args->fnames[i]);
        bcf_hdr_t *hdr = bcf_hdr_read(fp);
        if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]);
        args->out_hdr = bcf_hdr_merge(args->out_hdr,hdr);
        if ( bcf_hdr_nsamples(hdr) != bcf_hdr_nsamples(args->out_hdr) )
            error("Different number of samples in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]);

        int j;
        for (j=0; j<bcf_hdr_nsamples(hdr); j++)
            if ( strcmp(args->out_hdr->samples[j],hdr->samples[j]) )
                error("Different sample names in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]);

        if ( args->phased_concat )
        {
            int ret = bcf_read(fp, hdr, line);
            if ( ret!=0 ) args->start_pos[i] = -2;  // empty file
            else
            {
                int chrid = bcf_hdr_id2int(args->out_hdr,BCF_DT_CTG,bcf_seqname(hdr,line));
                args->start_pos[i] = chrid==prev_chrid ? line->pos : -1;
                prev_chrid = chrid;
            }
        }
        bcf_hdr_destroy(hdr);
        if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__, args->fnames[i]);
    }
    if ( line ) bcf_destroy(line);

    args->seen_seq = (int*) calloc(args->out_hdr->n[BCF_DT_CTG],sizeof(int));

    if ( args->phased_concat )
    {
        bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PQ,Number=1,Type=Integer,Description=\"Phasing Quality (bigger is better)\">");
        bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PS,Number=1,Type=Integer,Description=\"Phase Set\">");
    }
    if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat");
    args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
    if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
    if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);

    // An unwritten header would make the whole output unusable, do not ignore the failure
    if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 )
        error("[%s] Error: cannot write the header to %s\n", __func__, args->output_fname);

    if ( args->allow_overlaps )
    {
        args->files = bcf_sr_init();
        args->files->require_index = 1;
        if ( args->regions_list )
        {
            if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 )
                error("Failed to read the regions: %s\n", args->regions_list);
        }
        if ( args->remove_dups )
        {
            if ( !strcmp(args->remove_dups,"snps") ) args->files->collapse |= COLLAPSE_SNPS;
            else if ( !strcmp(args->remove_dups,"indels") ) args->files->collapse |= COLLAPSE_INDELS;
            else if ( !strcmp(args->remove_dups,"both") ) args->files->collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS;
            else if ( !strcmp(args->remove_dups,"any") ) args->files->collapse |= COLLAPSE_ANY;
            else if ( !strcmp(args->remove_dups,"all") ) args->files->collapse |= COLLAPSE_ANY;
            else if ( !strcmp(args->remove_dups,"none") ) args->files->collapse = COLLAPSE_NONE;
            else error("The -D string \"%s\" not recognised.\n", args->remove_dups);
        }
        for (i=0; i<args->nfnames; i++)
            if ( !bcf_sr_add_reader(args->files,args->fnames[i]) )
                error("Failed to open %s: %s\n", args->fnames[i],bcf_sr_strerror(args->files->errnum));
    }
    else if ( args->phased_concat )
    {
        // Remove empty files from the list: repeatedly swap the first empty
        // entry with the next non-empty one, so that the non-empty files keep
        // their order and all empty ones end up at the tail
        int nok = 0;
        while (1)
        {
            while ( nok<args->nfnames && args->start_pos[nok]!=-2 ) nok++;
            if ( nok==args->nfnames ) break;

            i = nok;
            while ( i<args->nfnames && args->start_pos[i]==-2 ) i++;
            if ( i==args->nfnames ) break;

            int tmp_pos = args->start_pos[nok];
            args->start_pos[nok] = args->start_pos[i];
            args->start_pos[i] = tmp_pos;

            char *tmp_fname = args->fnames[nok];
            args->fnames[nok] = args->fnames[i];
            args->fnames[i] = tmp_fname;
        }
        for (i=nok; i<args->nfnames; i++) free(args->fnames[i]);
        args->nfnames = nok;

        // Chunks on the same chromosome must come sorted by their start position
        for (i=1; i<args->nfnames; i++)
            if ( args->start_pos[i-1]!=-1 && args->start_pos[i]!=-1 && args->start_pos[i]<args->start_pos[i-1] )
                error("The files not in ascending order: %d in %s, %d in %s\n", args->start_pos[i-1]+1,args->fnames[i-1],args->start_pos[i]+1,args->fnames[i]);

        args->prev_chr = -1;
        args->swap_phase = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
        args->nmatch = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
        args->nmism  = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
        args->phase_qual = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t));
        args->phase_set  = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t));
        args->files = bcf_sr_init();
        args->files->require_index = 1;
        args->ifname = 0;
    }
}
/*
 *  subset_vcf() - apply the site-level filters configured in args to one
 *  record and, when requested, subset its samples and refresh INFO/AC,AN.
 *
 *  The record is modified in place: samples may be subset (args->n_samples),
 *  INFO/AC and INFO/AN rewritten (args->update_info), unused alleles trimmed
 *  (args->trim_alts) and all genotype columns dropped (args->sites_only).
 *
 *  Returns 1 when the record passed all filters, 0 when it should be skipped.
 */
int subset_vcf(args_t *args, bcf1_t *line)
{
    if ( args->min_alleles && line->n_allele < args->min_alleles ) return 0; // min alleles
    if ( args->max_alleles && line->n_allele > args->max_alleles ) return 0; // max alleles
    if ( args->novel || args->known )
    {
        if ( args->novel && (line->d.id[0]!='.' || line->d.id[1]!=0) ) return 0; // skip sites which are known, ID != '.'
        if ( args->known && line->d.id[0]=='.' && line->d.id[1]==0 ) return 0;   // skip sites which are novel, ID == '.'
    }

    if ( args->include || args->exclude )
    {
        int line_type = bcf_get_variant_types(line);
        if ( args->include && !(line_type & args->include) ) return 0; // include only given variant types
        if ( args->exclude &&  (line_type & args->exclude) ) return 0; // exclude given variant types
    }

    if ( args->filter )
    {
        int ret = filter_test(args->filter, line, NULL);
        if ( args->filter_logic==FLT_INCLUDE ) { if ( !ret ) return 0; }
        else if ( ret ) return 0;
    }

    hts_expand(int, line->n_allele, args->mac, args->ac);
    int i, an = 0, non_ref_ac = 0;
    if ( args->calc_ac )
    {
        bcf_calc_ac(args->hdr, line, args->ac, BCF_UN_INFO|BCF_UN_FMT); // get original AC and AN values from INFO field if available, otherwise calculate
        for (i=1; i<line->n_allele; i++) non_ref_ac += args->ac[i];
        for (i=0; i<line->n_allele; i++) an += args->ac[i];
    }

    if ( args->n_samples )
    {
        int non_ref_ac_sub = 0;
        int *ac_sub = (int*) calloc(line->n_allele, sizeof(int));
        if ( !ac_sub ) error("Could not allocate memory\n");

        bcf_subset(args->hdr, line, args->n_samples, args->imap);
        if ( args->calc_ac )
        {
            bcf_calc_ac(args->hsub, line, ac_sub, BCF_UN_FMT); // recalculate AC and AN
            an = 0;
            for (i=0; i<line->n_allele; i++)
            {
                args->ac[i] = ac_sub[i];
                an += ac_sub[i];
            }
            for (i=1; i<line->n_allele; i++) non_ref_ac_sub += ac_sub[i];
            if ( args->private_vars )
            {
                // a site is "private" when all non-ref alleles are carried by the subset
                int is_private = non_ref_ac_sub > 0 && non_ref_ac == non_ref_ac_sub;
                if ( args->private_vars == FLT_INCLUDE && !is_private ) { free(ac_sub); return 0; } // select private sites
                if ( args->private_vars == FLT_EXCLUDE &&  is_private ) { free(ac_sub); return 0; } // exclude private sites
            }
            non_ref_ac = non_ref_ac_sub;
        }
        free(ac_sub);
    }

    bcf_fmt_t *gt_fmt;
    if ( args->gt_type && (gt_fmt=bcf_get_fmt(args->hdr,line,"GT")) )
    {
        int nhet = 0, nhom = 0, nmiss = 0;
        // Iterate over the samples actually stored in the record: after
        // bcf_subset() above this can be fewer than the samples in args->hdr,
        // and walking the full header count would read past the FORMAT data.
        for (i=0; i<(int)line->n_sample; i++)
        {
            int type = bcf_gt_type(gt_fmt,i,NULL,NULL);
            if ( type==GT_HET_RA || type==GT_HET_AA )
            {
                if ( args->gt_type==GT_NO_HET ) return 0;
                nhet = 1;
            }
            else if ( type==GT_UNKN )
            {
                if ( args->gt_type==GT_NO_MISSING ) return 0;
                nmiss = 1;
            }
            else
            {
                if ( args->gt_type==GT_NO_HOM ) return 0;
                nhom = 1;
            }
        }
        if ( args->gt_type==GT_NEED_HOM && !nhom ) return 0;
        else if ( args->gt_type==GT_NEED_HET && !nhet ) return 0;
        else if ( args->gt_type==GT_NEED_MISSING && !nmiss ) return 0;
    }

    int minor_ac = 0;
    int major_ac = 0;
    int alt1_ac  = 0;   // count of the first ALT allele; 0 when the site has no ALT
    if ( args->calc_ac )
    {
        minor_ac = args->ac[0];
        major_ac = args->ac[0];
        for (i=1; i<line->n_allele; i++)
        {
            if ( args->ac[i] < minor_ac ) minor_ac = args->ac[i];
            if ( args->ac[i] > major_ac ) major_ac = args->ac[i];
        }
        // args->ac holds only n_allele entries, ac[1] does not exist for REF-only sites
        if ( line->n_allele > 1 ) alt1_ac = args->ac[1];
    }

    if ( args->min_ac )
    {
        if ( args->min_ac_type == ALLELE_NONREF && args->min_ac>non_ref_ac ) return 0;              // min AC
        else if ( args->min_ac_type == ALLELE_MINOR && args->min_ac>minor_ac ) return 0;            // min minor AC
        else if ( args->min_ac_type == ALLELE_ALT1 && args->min_ac>alt1_ac ) return 0;              // min 1st alternate AC
        else if ( args->min_ac_type == ALLELE_MAJOR && args->min_ac > major_ac ) return 0;          // min major AC
        else if ( args->min_ac_type == ALLELE_NONMAJOR && args->min_ac > an-major_ac ) return 0;    // min non-major AC
    }
    if ( args->max_ac )
    {
        if ( args->max_ac_type == ALLELE_NONREF && args->max_ac<non_ref_ac ) return 0;              // max AC
        else if ( args->max_ac_type == ALLELE_MINOR && args->max_ac<minor_ac ) return 0;            // max minor AC
        else if ( args->max_ac_type == ALLELE_ALT1 && args->max_ac<alt1_ac ) return 0;              // max 1st alternate AC
        else if ( args->max_ac_type == ALLELE_MAJOR && args->max_ac < major_ac ) return 0;          // max major AC
        else if ( args->max_ac_type == ALLELE_NONMAJOR && args->max_ac < an-major_ac ) return 0;    // max non-major AC
    }
    if ( args->min_af )
    {
        if ( an == 0 ) return 0; // freq not defined, skip site
        if ( args->min_af_type == ALLELE_NONREF && args->min_af>non_ref_ac/(double)an ) return 0;               // min AF
        else if ( args->min_af_type == ALLELE_MINOR && args->min_af>minor_ac/(double)an ) return 0;             // min minor AF
        else if ( args->min_af_type == ALLELE_ALT1 && args->min_af>alt1_ac/(double)an ) return 0;               // min 1st alternate AF
        else if ( args->min_af_type == ALLELE_MAJOR && args->min_af > major_ac/(double)an ) return 0;           // min major AF
        else if ( args->min_af_type == ALLELE_NONMAJOR && args->min_af > (an-major_ac)/(double)an ) return 0;   // min non-major AF
    }
    if ( args->max_af )
    {
        if ( an == 0 ) return 0; // freq not defined, skip site
        if ( args->max_af_type == ALLELE_NONREF && args->max_af<non_ref_ac/(double)an ) return 0;               // max AF
        else if ( args->max_af_type == ALLELE_MINOR && args->max_af<minor_ac/(double)an ) return 0;             // max minor AF
        else if ( args->max_af_type == ALLELE_ALT1 && args->max_af<alt1_ac/(double)an ) return 0;               // max 1st alternate AF
        else if ( args->max_af_type == ALLELE_MAJOR && args->max_af < major_ac/(double)an ) return 0;           // max major AF
        else if ( args->max_af_type == ALLELE_NONMAJOR && args->max_af < (an-major_ac)/(double)an ) return 0;   // max non-major AF
    }
    if ( args->uncalled )
    {
        if ( args->uncalled == FLT_INCLUDE && an > 0 ) return 0;  // select uncalled
        if ( args->uncalled == FLT_EXCLUDE && an == 0 ) return 0; // skip if uncalled
    }
    if ( args->calc_ac && args->update_info )
    {
        bcf_update_info_int32(args->hdr, line, "AC", &args->ac[1], line->n_allele-1);
        bcf_update_info_int32(args->hdr, line, "AN", &an, 1);
    }
    if ( args->trim_alts )
    {
        int ret = bcf_trim_alleles(args->hsub ? args->hsub : args->hdr, line);
        if ( ret==-1 ) error("Error: some GT index is out of bounds at %s:%d\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), line->pos+1);
    }
    if ( args->phased )
    {
        int phased = bcf_all_phased(args->hdr, line);
        if ( args->phased == FLT_INCLUDE && !phased ) { return 0; } // skip unphased
        if ( args->phased == FLT_EXCLUDE && phased ) { return 0; }  // skip phased
    }
    if ( args->sites_only ) bcf_subset(args->hsub ? args->hsub : args->hdr, line, 0, 0);
    return 1;
}
/*
 *  vcfroh() - buffer one site for the Viterbi run.
 *
 *  Called once per record; a NULL line signals the end of input and flushes
 *  whatever has been buffered.  For every usable biallelic site the position
 *  is stored in args->sites and the two emission probabilities, P(D|AZ) and
 *  P(D|HW), are stored in args->eprob.  On a chromosome change the genetic map
 *  is reloaded and, unless args->vi_training is set, the buffered sites are
 *  flushed first.
 */
static void vcfroh(args_t *args, bcf1_t *line)
{
    // Are we done?
    if ( !line )
    {
        flush_viterbi(args);
        return;
    }
    args->ntot++;

    // Skip unwanted lines
    if ( line->rid == args->skip_rid ) return;
    if ( line->n_allele!=2 ) return;    // no ALT allele or not a biallelic site
    if ( args->snps_only && !bcf_is_snp(line) ) return;

    // Initialize genetic map
    int skip_rid = 0;
    if ( args->prev_rid<0 )
    {
        args->prev_rid = line->rid;
        args->prev_pos = line->pos;
        skip_rid = load_genmap(args, line);
        if ( !skip_rid && args->vi_training ) push_rid(args, line->rid);
    }

    // New chromosome?
    if ( args->prev_rid!=line->rid )
    {
        skip_rid = load_genmap(args, line);
        if ( args->vi_training )
        {
            if ( !skip_rid ) push_rid(args, line->rid);
        }
        else
        {
            flush_viterbi(args);
            args->nsites = 0;
        }
        args->prev_rid = line->rid;
        args->prev_pos = line->pos;
    }

    if ( skip_rid )
    {
        fprintf(pysamerr,"Skipping the sequence, no genmap for %s\n", bcf_seqname(args->hdr,line));
        args->skip_rid = line->rid;
        return;
    }
    if ( args->prev_pos > line->pos ) error("The file is not sorted?!\n");

    args->prev_rid = line->rid;
    args->prev_pos = line->pos;

    // Ready for the new site
    int m = args->msites;
    hts_expand(uint32_t,args->nsites+1,args->msites,args->sites);
    if ( args->msites!=m )
    {
        // Two emission probabilities per site. Go through a temporary so that
        // a failed realloc is reported rather than dereferenced as NULL.
        double *eprob_new = (double*) realloc(args->eprob, sizeof(double)*args->msites*2);
        if ( !eprob_new ) error("Could not allocate memory for the emission probabilities\n");
        args->eprob = eprob_new;
    }

    // Set likelihoods and alternate allele frequencies
    double alt_freq, pdg[3];
    if ( parse_line(args, line, &alt_freq, pdg)<0 ) return; // something went wrong

    args->nused++;

    // Calculate emission probabilities P(D|AZ) and P(D|HW)
    double ref_freq = 1 - alt_freq;
    double *eprob = &args->eprob[2*args->nsites];
    eprob[STATE_AZ] = pdg[0]*ref_freq + pdg[2]*alt_freq;
    eprob[STATE_HW] = pdg[0]*ref_freq*ref_freq + 2*pdg[1]*ref_freq*alt_freq + pdg[2]*alt_freq*alt_freq;

    args->sites[args->nsites] = line->pos;
    args->nsites++;
}
/*
 *  parse_line() - obtain the alternate allele frequency and the genotype
 *  likelihoods P(D|G) of sample args->ismpl at one site.
 *
 *  @alt_freq:  filled with the ALT allele frequency, taken (in this order of
 *              precedence) from the user-supplied INFO tag, from an external
 *              file, from INFO/AC,AN, or estimated from genotypes.  A zero
 *              frequency is replaced by args->dflt_AF when that is non-zero.
 *  @pdg:       array of three doubles filled with P(D|RR), P(D|RA), P(D|AA)
 *
 *  Returns 0 on success and a negative value when the site cannot be used
 *  (no frequency, AF=0 without a default, missing or non-diploid data).
 */
int parse_line(args_t *args, bcf1_t *line, double *alt_freq, double *pdg)
{
    args->nitmp = 0;

    // Set allele frequency
    int ret;
    if ( args->af_tag )
    {
        // Use an INFO tag provided by the user
        ret = bcf_get_info_float(args->hdr, line, args->af_tag, &args->AFs, &args->mAFs);
        if ( ret==-2 )
            error("Type mismatch for INFO/%s tag at %s:%d\n", args->af_tag, bcf_seqname(args->hdr,line), line->pos+1);
        if ( ret==1 )
            *alt_freq = args->AFs[0];
        else if ( ret>=0 )
            return -1;  // unexpected number of values; *alt_freq would otherwise be read uninitialized
    }
    else if ( args->af_fname )
    {
        // Read AF from a file
        ret = read_AF(args->files->targets, line, alt_freq);
    }
    else
    {
        // Use GTs or AC/AN: GTs when AC/AN not present or when GTs explicitly requested by --estimate-AF
        ret = -1;
        if ( !args->estimate_AF )
        {
            int AC = -1, AN = 0;
            ret = bcf_get_info_int32(args->hdr, line, "AN", &args->itmp, &args->mitmp);
            if ( ret==1 )
            {
                AN = args->itmp[0];
                ret = bcf_get_info_int32(args->hdr, line, "AC", &args->itmp, &args->mitmp);
                if ( ret>0 )
                    AC = args->itmp[0];
            }
            if ( AN<=0 || AC<0 )
                ret = -1;
            else
                *alt_freq = (double) AC/AN;
        }
        if ( ret==-1 )
            ret = estimate_AF(args, line, alt_freq);    // reads GTs into args->itmp
    }

    if ( ret<0 ) return ret;
    if ( *alt_freq==0.0 )
    {
        if ( args->dflt_AF==0 ) return -1;  // we skip sites with AF=0
        *alt_freq = args->dflt_AF;
    }

    // Set P(D|G)
    if ( args->fake_PLs )
    {
        if ( !args->nitmp )
        {
            args->nitmp = bcf_get_genotypes(args->hdr, line, &args->itmp, &args->mitmp);
            if ( args->nitmp != 2*args->nsmpl ) return -1;  // not diploid?
            args->nitmp /= args->nsmpl;
        }

        int32_t *gt = &args->itmp[args->ismpl*args->nitmp];
        if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) return -1;
        // A haploid call in a diploid-padded column has vector_end in the
        // second slot; decoding it as an allele would make it look like a het.
        if ( gt[1]==bcf_int32_vector_end ) return -1;

        int a = bcf_gt_allele(gt[0]);
        int b = bcf_gt_allele(gt[1]);
        if ( a!=b )
        {
            pdg[0] = pdg[2] = args->unseen_PL;
            pdg[1] = 1 - 2*args->unseen_PL;
        }
        else if ( a==0 )
        {
            pdg[0] = 1 - 2*args->unseen_PL;
            pdg[1] = pdg[2] = args->unseen_PL;
        }
        else
        {
            pdg[0] = pdg[1] = args->unseen_PL;
            pdg[2] = 1 - 2*args->unseen_PL;
        }
    }
    else
    {
        args->nitmp = bcf_get_format_int32(args->hdr, line, "PL", &args->itmp, &args->mitmp);
        if ( args->nitmp != args->nsmpl*line->n_allele*(line->n_allele+1)/2. ) return -1;     // not diploid?
        args->nitmp /= args->nsmpl;

        int32_t *pl = &args->itmp[args->ismpl*args->nitmp];

        // Missing PLs are encoded as large negative integers and must not be
        // used as an index into the lookup table.
        if ( pl[0]<0 || pl[1]<0 || pl[2]<0 ) return -1;

        // The lookup table is indexed 0..255. Values beyond the table are
        // clamped to its last entry: a large PL denotes an unlikely genotype,
        // so it must not be turned into probability 1.
        // NOTE(review): assumes pl2p[] decreases with PL -- confirm against
        // the table initialisation.
        int k;
        for (k=0; k<3; k++)
            pdg[k] = pl[k] < 256 ? args->pl2p[ pl[k] ] : args->pl2p[255];

        double sum = pdg[0] + pdg[1] + pdg[2];
        if ( !sum ) return -1;
        pdg[0] /= sum;
        pdg[1] /= sum;
        pdg[2] /= sum;
    }

    return 0;
}
/*
 *  beds_fill_buffer() - load the records of a tabix-indexed annotation file
 *  which cover the position of `line` into file->buffer.
 *
 *  The span of the cached records is remembered in file->last_id,
 *  file->last_start and file->last_end, so that a following call for a
 *  position inside that span can reuse the buffer without touching the index.
 *
 *  Returns -1 when the cached records are reused (buffer left untouched),
 *           0 when the buffer was refilled with at least one record,
 *           1 when no record is available (unknown chromosome, failed query
 *             or nothing overlapping).
 */
int beds_fill_buffer(struct beds_anno_file *file, bcf_hdr_t *hdr_out, bcf1_t *line)
{
    assert(file->idx);
    int tid = tbx_name2id(file->idx, bcf_seqname(hdr_out, line));

    // if cached this region already, just skip refill. this is different from vcfs_fill_buffer()
    if ( tid == file->last_id && file->last_start <= line->pos + 1 && file->last_end > line->pos )
        return -1;

    if ( tid == -1 )
    {
        // warn only once per run of unknown chromosomes
        if ( file->no_such_chrom == 0 )
        {
            warnings("no chromosome %s found in databases %s.", bcf_seqname(hdr_out, line), file->fname);
            file->no_such_chrom = 1;
        }
        return 1;
    }
    else
    {
        file->no_such_chrom = 0;
    }

    // empty cache
    file->cached = 0;
    int i;
    hts_itr_t *itr = tbx_itr_queryi(file->idx, tid, line->pos, line->pos + line->rlen);
    if ( itr == NULL )
        return 1;

    // if buffer refilled, init last start and end
    file->last_id = tid;
    file->last_start = -1;
    file->last_end = -1;

    while (1)
    {
        // grow the buffer in steps of 8 pre-initialized records
        if ( file->cached == file->max )
        {
            file->max += 8;
            file->buffer = (struct beds_anno_tsv**)realloc(file->buffer, sizeof(struct beds_anno_tsv*)*file->max);
            for (i = 8; i > 0; --i)
                file->buffer[file->max - i] = beds_anno_tsv_init();
        }

        if ( tbx_itr_next(file->fp, file->idx, itr, &file->buffer[file->cached]->string) < 0 )
            break;

        struct beds_anno_tsv *tsv = file->buffer[file->cached];
        convert_string_tsv(tsv);

        // Skip if variant located outside of region. The slot is not
        // advanced, so the next record overwrites the skipped one.
        if ( line->pos < tsv->start || line->pos >= tsv->end )
            continue;
        if ( tsv->end - tsv->start == 1 && line->pos != tsv->start )
            continue;

        file->cached++;

        if ( file->last_end == -1 )
        {
            file->last_end = tsv->end;
            file->last_start = tsv->start;
            continue;
        }
        if ( file->last_end < tsv->end )
            file->last_end = tsv->end;
        if ( file->last_start > tsv->start )
            file->last_start = tsv->start;
    }

    // the iterator was allocated by tbx_itr_queryi() and is owned by us
    tbx_itr_destroy(itr);

    // if buffer is filled return 0, else return 1
    return file->cached ? 0 : 1;
}
/*
 *  apply_variant() - apply one VCF record to the FASTA buffer args->fa_buf.
 *
 *  The allele to apply is selected as follows:
 *    - with a sample (args->isample>=0) and args->haplotype, the allele of
 *      the requested haplotype;
 *    - with a sample and args->output_iupac, the first allele of the
 *      genotype, a heterozygous SNP being replaced by its IUPAC code;
 *    - with a sample otherwise, the first non-reference allele of the
 *      genotype;
 *    - without a sample, the first ALT (IUPAC-encoded together with REF when
 *      args->output_iupac is set and both are single bases).
 *
 *  Records are skipped when they have no ALT, start at or before the last
 *  modified position (args->fa_frz_pos), overlap the mask, lack GT, or the
 *  selected genotype is missing or reference.  On success the buffer length,
 *  the coordinate offset (args->fa_mod_off) and args->fa_frz_pos are updated
 *  and, if args->chain is set, a gap is recorded for length-changing events.
 *
 *  Note that the allele strings of `rec` are modified in place (IUPAC code,
 *  letter case, <DEL> replaced by REF).
 */
static void apply_variant(args_t *args, bcf1_t *rec)
{
    if ( rec->n_allele==1 ) return;

    if ( rec->pos <= args->fa_frz_pos )
    {
        fprintf(pysamerr,"The site %s:%d overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),rec->pos+1);
        return;
    }
    if ( args->mask )
    {
        char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
        int start = rec->pos;
        int end   = rec->pos + rec->rlen - 1;
        if ( regidx_overlap(args->mask, chr,start,end,NULL) ) return;
    }

    int i, ialt = 1;
    if ( args->isample >= 0 )
    {
        bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT");
        if ( !fmt ) return;

        if ( args->haplotype )
        {
            if ( args->haplotype > fmt->n ) error("Can't apply %d-th haplotype at %s:%d\n", args->haplotype,bcf_seqname(args->hdr,rec),rec->pos+1);
            uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + args->haplotype - 1;
            ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
            if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
            ialt = bcf_gt_allele(ialt);
        }
        else if ( args->output_iupac )
        {
            uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample;
            ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
            if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
            ialt = bcf_gt_allele(ialt);

            int jalt;
            if ( fmt->n>1 )
            {
                ptr = fmt->p + fmt->size*args->isample + 1;
                jalt = bcf_dec_int1(ptr, fmt->type, &ignore);
                if ( bcf_gt_is_missing(jalt) || jalt==bcf_int32_vector_end ) jalt = ialt;
                else jalt = bcf_gt_allele(jalt);
            }
            else jalt = ialt;
            if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
            if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp?
            {
                char ial = rec->d.allele[ialt][0];
                char jal = rec->d.allele[jalt][0];
                // Only ialt is applied below and ialt==0 means "reference, do
                // nothing". Make sure a 0/1 het is encoded the same way as
                // 1/0 rather than silently dropped with the code written
                // into REF.
                if ( !ialt ) ialt = jalt;
                rec->d.allele[ialt][0] = gt2iupac(ial,jal);
            }
        }
        else
        {
            // take the first non-reference allele of the genotype
            for (i=0; i<fmt->n; i++)
            {
                uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + i;
                ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
                if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
                ialt = bcf_gt_allele(ialt);
                if ( ialt ) break;
            }
        }
        if ( !ialt ) return;  // ref allele
        if ( rec->n_allele <= ialt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
    }
    else if ( args->output_iupac && !rec->d.allele[0][1] && !rec->d.allele[1][1] )
    {
        char ial = rec->d.allele[0][0];
        char jal = rec->d.allele[1][0];
        rec->d.allele[1][0] = gt2iupac(ial,jal);
    }

    // position of the record within the (possibly already modified) buffer
    int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off;
    if ( idx<0 || idx>=args->fa_buf.l )
        error("FIXME: %s:%d .. idx=%d, ori_pos=%d, len=%d, off=%d\n",bcf_seqname(args->hdr,rec),rec->pos+1,idx,args->fa_ori_pos,(int)args->fa_buf.l,args->fa_mod_off);

    // sanity check the reference base
    int len_diff = 0, alen = 0;
    if ( rec->d.allele[ialt][0]=='<' )
    {
        if ( strcasecmp(rec->d.allele[ialt], "<DEL>") )
            error("Symbolic alleles other than <DEL> are currently not supported: %s at %s:%d\n",rec->d.allele[ialt],bcf_seqname(args->hdr,rec),rec->pos+1);
        assert( rec->d.allele[0][1]==0 );           // todo: for now expecting strlen(REF) = 1
        len_diff = 1-rec->rlen;
        rec->d.allele[ialt] = rec->d.allele[0];     // according to VCF spec, REF must precede the event
        alen = strlen(rec->d.allele[ialt]);
    }
    else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) )
    {
        // Terminate the buffer right after the REF span so that the message
        // shows the mismatching bases in brackets, followed by the rest.
        char tmp = 0;
        if ( args->fa_buf.l - idx > rec->rlen )
        {
            tmp = args->fa_buf.s[idx+rec->rlen];
            args->fa_buf.s[idx+rec->rlen] = 0;
        }
        error(
            "The fasta sequence does not match the REF allele at %s:%d:\n"
            "   .vcf: [%s]\n"
            "   .vcf: [%s] <- (ALT)\n"
            "   .fa:  [%s]%c%s\n",
            bcf_seqname(args->hdr,rec),rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx,
            tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:""
            );
    }
    else
    {
        alen = strlen(rec->d.allele[ialt]);
        len_diff = alen - rec->rlen;
    }

    // <ctype.h> functions require a value representable as unsigned char
    if ( args->fa_case )
        for (i=0; i<alen; i++) rec->d.allele[ialt][i] = toupper((unsigned char)rec->d.allele[ialt][i]);
    else
        for (i=0; i<alen; i++) rec->d.allele[ialt][i] = tolower((unsigned char)rec->d.allele[ialt][i]);

    if ( len_diff <= 0 )
    {
        // deletion or same size event
        for (i=0; i<alen; i++)
            args->fa_buf.s[idx+i] = rec->d.allele[ialt][i];
        if ( len_diff )
            memmove(args->fa_buf.s+idx+alen,args->fa_buf.s+idx+rec->rlen,args->fa_buf.l-idx-rec->rlen);
    }
    else
    {
        // insertion
        ks_resize(&args->fa_buf, args->fa_buf.l + len_diff);
        memmove(args->fa_buf.s + idx + rec->rlen + len_diff, args->fa_buf.s + idx + rec->rlen, args->fa_buf.l - idx - rec->rlen);
        for (i=0; i<alen; i++)
            args->fa_buf.s[idx+i] = rec->d.allele[ialt][i];
    }
    if ( args->chain && len_diff != 0 )
    {
        // If first nucleotide of both REF and ALT are the same... (indels typically include the nucleotide before the variant)
        if ( strncasecmp(rec->d.allele[0],rec->d.allele[ialt],1) == 0 )
        {
            // ...extend the block by 1 bp: start is 1 bp further and alleles are 1 bp shorter
            push_chain_gap(args->chain, rec->pos + 1, rec->rlen - 1, rec->pos + 1 + args->fa_mod_off, alen - 1);
        }
        else
        {
            // otherwise, just the coordinates of the variant as given
            push_chain_gap(args->chain, rec->pos, rec->rlen, rec->pos + args->fa_mod_off, alen);
        }
    }
    args->fa_buf.l += len_diff;
    args->fa_mod_off += len_diff;
    args->fa_frz_pos  = rec->pos + rec->rlen - 1;
}
/*
 *  phased_flush() - write out the buffered overlap between two phased files.
 *
 *  args->buf holds pairs of records, (buf[i], buf[i+1]) being the same site
 *  read from the first and the second reader.  The function
 *    1) counts, per sample, at how many heterozygous phased sites the two
 *       files agree and disagree in phase,
 *    2) writes the first part of the overlap from the first file,
 *    3) decides per sample whether the phase of the second file must be
 *       swapped and derives the phase quality (PQ),
 *    4) writes the rest of the overlap from the second file; PQ is attached
 *       to the first of these records and a new phase set is started for
 *       samples whose PQ is below args->min_PQ.
 *  The buffer is empty on return.
 */
static void phased_flush(args_t *args)
{
    if ( !args->nbuf ) return;

    bcf_hdr_t *ahdr = args->files->readers[0].header;
    bcf_hdr_t *bhdr = args->files->readers[1].header;

    int i, j, nsmpl = bcf_hdr_nsamples(args->out_hdr);
    static int gt_absent_warned = 0;

    // 1) collect the phase agreement statistics
    for (i=0; i<args->nbuf; i+=2)
    {
        bcf1_t *arec = args->buf[i];
        bcf1_t *brec = args->buf[i+1];

        int nGTs = bcf_get_genotypes(ahdr, arec, &args->GTa, &args->mGTa);
        if ( nGTs < 0 )
        {
            if ( !gt_absent_warned )
            {
                fprintf(stderr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(ahdr,arec), arec->pos+1);
                gt_absent_warned = 1;
            }
            continue;
        }
        if ( nGTs != 2*nsmpl ) continue;    // not diploid
        nGTs = bcf_get_genotypes(bhdr, brec, &args->GTb, &args->mGTb);
        if ( nGTs < 0 )
        {
            if ( !gt_absent_warned )
            {
                fprintf(stderr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(bhdr,brec), brec->pos+1);
                gt_absent_warned = 1;
            }
            continue;
        }
        if ( nGTs != 2*nsmpl ) continue;    // not diploid

        for (j=0; j<nsmpl; j++)
        {
            int *gta = &args->GTa[j*2];
            int *gtb = &args->GTb[j*2];
            // only diploid, non-missing, phased heterozygous genotypes are informative
            if ( gta[1]==bcf_int32_vector_end || gtb[1]==bcf_int32_vector_end ) continue;
            if ( bcf_gt_is_missing(gta[0]) || bcf_gt_is_missing(gta[1]) || bcf_gt_is_missing(gtb[0]) || bcf_gt_is_missing(gtb[1]) ) continue;
            if ( !bcf_gt_is_phased(gta[1]) || !bcf_gt_is_phased(gtb[1]) ) continue;
            if ( bcf_gt_allele(gta[0])==bcf_gt_allele(gta[1]) || bcf_gt_allele(gtb[0])==bcf_gt_allele(gtb[1]) ) continue;
            if ( bcf_gt_allele(gta[0])==bcf_gt_allele(gtb[0]) && bcf_gt_allele(gta[1])==bcf_gt_allele(gtb[1]) )
            {
                if ( args->swap_phase[j] ) args->nmism[j]++; else args->nmatch[j]++;
            }
            if ( bcf_gt_allele(gta[0])==bcf_gt_allele(gtb[1]) && bcf_gt_allele(gta[1])==bcf_gt_allele(gtb[0]) )
            {
                if ( args->swap_phase[j] ) args->nmatch[j]++; else args->nmism[j]++;
            }
        }
    }

    // 2) first part of the overlap comes from the first file
    for (i=0; i<args->nbuf/2; i+=2)
    {
        bcf1_t *arec = args->buf[i];
        bcf_translate(args->out_hdr, args->files->readers[0].header, arec);
        if ( args->nswap )
            phase_update(args, args->out_hdr, arec);
        if ( !args->compact_PS || args->phase_set_changed )
        {
            bcf_update_format_int32(args->out_hdr,arec,"PS",args->phase_set,nsmpl);
            args->phase_set_changed = 0;
        }
        if ( bcf_write(args->out_fh, args->out_hdr, arec)!=0 ) error("Failed to write\n");

        if ( arec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%d vs %d  [1]\n", bcf_seqname(args->files->readers[0].header,arec),arec->pos+1,args->prev_pos_check+1);
        args->prev_pos_check = arec->pos;
    }

    // 3) decide on the phase swap and set the phase quality, reset the counters
    args->nswap = 0;
    for (j=0; j<nsmpl; j++)
    {
        if ( args->nmatch[j] >= args->nmism[j] )
            args->swap_phase[j] = 0;
        else
        {
            args->swap_phase[j] = 1;
            args->nswap++;
        }
        if ( args->nmatch[j] && args->nmism[j] )
        {
            // Entropy-inspired quality. The factor 0.7 shifts and scales to (0,1)
            double f = (double)args->nmatch[j]/(args->nmatch[j]+args->nmism[j]);
            args->phase_qual[j] = 99*(0.7 + f*log(f) + (1-f)*log(1-f))/0.7;
        }
        else
            args->phase_qual[j] = 99;
        args->nmatch[j] = 0;
        args->nmism[j]  = 0;
    }

    // 4) the rest of the overlap comes from the second file; i continues
    //    from where the loop over the first file stopped
    int PQ_printed = 0;
    for (; i<args->nbuf; i+=2)
    {
        bcf1_t *brec = args->buf[i+1];
        bcf_translate(args->out_hdr, args->files->readers[1].header, brec);
        if ( !PQ_printed )
        {
            bcf_update_format_int32(args->out_hdr,brec,"PQ",args->phase_qual,nsmpl);
            PQ_printed = 1;
            for (j=0; j<nsmpl; j++)
                if ( args->phase_qual[j] < args->min_PQ )
                {
                    args->phase_set[j] = brec->pos+1;
                    args->phase_set_changed = 1;
                }
                else if ( args->compact_PS )
                    args->phase_set[j] = bcf_int32_missing;
        }
        if ( args->nswap )
            phase_update(args, args->out_hdr, brec);
        if ( !args->compact_PS || args->phase_set_changed )
        {
            bcf_update_format_int32(args->out_hdr,brec,"PS",args->phase_set,nsmpl);
            args->phase_set_changed = 0;
        }
        if ( bcf_write(args->out_fh, args->out_hdr, brec)!=0 ) error("Failed to write\n");

        if ( brec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%d vs %d  [2]\n", bcf_seqname(args->files->readers[1].header,brec),brec->pos+1,args->prev_pos_check+1);
        args->prev_pos_check = brec->pos;
    }
    args->nbuf = 0;
}
/*
 *  _reader_next_line() - advance the synced readers to the next position.
 *
 *  Buffers of all readers are refilled and the smallest position found among
 *  them is selected, positions rejected by the targets being dropped.  For
 *  each reader holding a record at the selected position with matching
 *  alleles, files->has_line[] is set to 1 (0 for all others).
 *
 *  Returns the number of readers which have a line at the selected position;
 *  0 when there is nothing left to read.
 */
int _reader_next_line(bcf_srs_t *files)
{
    int ir;
    int lowest = INT_MAX;   // smallest buffered position, INT_MAX while none was found

    // Keep going until a suitable position turns up or the readers run dry
    for (;;)
    {
        // Move all readers to the next region when regions are in use
        if ( files->regions && _readers_next_region(files)<0 ) break;

        // Top up the buffers and pick the smallest coordinate among them
        const char *seq = NULL;
        for (ir=0; ir<files->nreaders; ir++)
        {
            _reader_fill_buffer(files, &files->readers[ir]);

            if ( files->readers[ir].nbuffer && files->readers[ir].buffer[1]->pos < lowest )
            {
                lowest = files->readers[ir].buffer[1]->pos;
                seq    = bcf_seqname(files->readers[ir].header, files->readers[ir].buffer[1]);
            }
        }

        if ( lowest==INT_MAX )
        {
            // Nothing buffered: with regions there may be more to come
            if ( files->regions ) continue;
            break;
        }

        if ( !files->targets ) break;   // no targets to check against, lowest is set

        // Is the position rejected by the targets?
        int hit     = bcf_sr_regions_overlap(files->targets, seq, lowest, lowest);
        int discard = files->targets_exclude ? !hit : hit<0;
        if ( !discard ) break;          // done, lowest is set

        // Drop every line at this position and look for the next one
        for (ir=0; ir<files->nreaders; ir++)
        {
            if ( !files->readers[ir].nbuffer ) continue;
            if ( files->readers[ir].buffer[1]->pos!=lowest ) continue;
            _reader_shift_buffer(&files->readers[ir]);
        }
        lowest = INT_MAX;
    }

    // Several records may share the position. Up to this point buffer[0] of
    // every reader was empty and the lines started at buffer[1]; the allele
    // matching moves the line to be output into buffer[0]. The first record
    // selected this way is the one against which the others are matched.
    bcf1_t *anchor = NULL;
    int nshared = 0;        // number of readers with a line at this position
    for (ir=0; ir<files->nreaders; ir++)
    {
        files->has_line[ir] = 0;

        int at_pos = files->readers[ir].nbuffer && files->readers[ir].buffer[1]->pos==lowest;
        if ( at_pos && _reader_match_alleles(files, &files->readers[ir], anchor) >= 0 )
        {
            if ( !anchor ) anchor = files->readers[ir].buffer[0];
            files->has_line[ir] = 1;
            nshared++;
        }
    }
    return nshared;
}
/*
 *  concat() - the main loop, one of three modes:
 *
 *    - args->phased_concat: the input files are phased and overlapping. At
 *      most two files are kept open at a time; sites from the overlap are
 *      passed in pairs to phased_push() and written by phased_flush().
 *    - args->files set otherwise: overlapping files are merged using the
 *      synced reader, optionally (args->remove_dups) printing only the first
 *      record at each position.
 *    - otherwise: the files are simply concatenated in the given order.
 *      Chromosome blocks must be contiguous and positions within a block
 *      sorted, or the program terminates with an error.
 */
static void concat(args_t *args)
{
    int i;
    if ( args->phased_concat )  // phased concat
    {
        // keep only two open files at a time
        while ( args->ifname < args->nfnames )
        {
            int new_file = 0;
            while ( args->files->nreaders < 2 && args->ifname < args->nfnames )
            {
                if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) ) error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum));
                new_file = 1;

                args->ifname++;
                if ( args->start_pos[args->ifname-1]==-1 ) break;   // new chromosome, start with only one file open
                if ( args->ifname < args->nfnames && args->start_pos[args->ifname]==-1 ) break; // next file starts on a different chromosome
            }

            // is there a line from the previous run? Seek the newly opened reader to that position
            int seek_pos = -1;
            int seek_chr = -1;
            if ( bcf_sr_has_line(args->files,0) )
            {
                bcf1_t *line = bcf_sr_get_line(args->files,0);
                bcf_sr_seek(args->files, bcf_seqname(args->files->readers[0].header,line), line->pos);
                seek_pos = line->pos;
                seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[0].header,line));
            }
            else if ( new_file )
                bcf_sr_seek(args->files,NULL,0);  // set to start

            int nret;
            while ( (nret = bcf_sr_next_line(args->files)) )
            {
                if ( !bcf_sr_has_line(args->files,0) )  // no input from the first reader
                {
                    // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped
                    if ( ! bcf_sr_region_done(args->files,0) ) continue;

                    phased_flush(args);
                    bcf_sr_remove_reader(args->files, 0);
                }

                // Get a line to learn about current position
                for (i=0; i<args->files->nreaders; i++)
                    if ( bcf_sr_has_line(args->files,i) ) break;
                bcf1_t *line = bcf_sr_get_line(args->files,i);

                // This can happen after bcf_sr_seek: indel may start before the coordinate which we seek to.
                if ( seek_chr>=0 && seek_pos>line->pos && seek_chr==bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line)) ) continue;
                seek_pos = seek_chr = -1;

                // Check if the position overlaps with the next, yet unopened, reader
                int must_seek = 0;
                while ( args->ifname < args->nfnames && args->start_pos[args->ifname]!=-1 && line->pos >= args->start_pos[args->ifname] )
                {
                    must_seek = 1;
                    if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) ) error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum));
                    args->ifname++;
                }
                if ( must_seek )
                {
                    bcf_sr_seek(args->files, bcf_seqname(args->files->readers[i].header,line), line->pos);
                    seek_pos = line->pos;
                    seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line));
                    continue;
                }

                // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped
                if ( args->files->nreaders>1 && !bcf_sr_has_line(args->files,1) && !bcf_sr_region_done(args->files,1) ) continue;

                phased_push(args, bcf_sr_get_line(args->files,0), args->files->nreaders>1 ? bcf_sr_get_line(args->files,1) : NULL);
            }

            if ( args->files->nreaders )
            {
                phased_flush(args);
                while ( args->files->nreaders )
                    bcf_sr_remove_reader(args->files, 0);
            }
        }
    }
    else if ( args->files )  // combining overlapping files, using synced reader
    {
        while ( bcf_sr_next_line(args->files) )
        {
            for (i=0; i<args->files->nreaders; i++)
            {
                bcf1_t *line = bcf_sr_get_line(args->files,i);
                if ( !line ) continue;
                bcf_translate(args->out_hdr, args->files->readers[i].header, line);
                if ( bcf_write1(args->out_fh, args->out_hdr, line)!=0 ) error("Failed to write\n");
                if ( args->remove_dups ) break;
            }
        }
    }
    else    // concatenating
    {
        kstring_t tmp = {0,0,0};
        // The previous chromosome and position are kept across files so that
        // the order is checked also on file boundaries.
        int prev_chr_id = -1, prev_pos = -1;
        bcf1_t *line = bcf_init();
        for (i=0; i<args->nfnames; i++)
        {
            htsFile *fp = hts_open(args->fnames[i], "r");
            if ( !fp ) error("Failed to open: %s\n", args->fnames[i]);
            bcf_hdr_t *hdr = bcf_hdr_read(fp);
            if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]);
            if ( !fp->is_bin && args->output_type&FT_VCF )
            {
                line->max_unpack = BCF_UN_STR;
                // if VCF is on both input and output, avoid VCF to BCF conversion
                while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 )
                {
                    // the first column is the chromosome name
                    char *str = fp->line.s;
                    while ( *str && *str!='\t' ) str++;
                    tmp.l = 0;
                    kputsn(fp->line.s,str-fp->line.s,&tmp);
                    int chr_id = bcf_hdr_name2id(args->out_hdr, tmp.s);
                    if ( chr_id<0 ) error("The sequence \"%s\" not defined in the header: %s\n(Quick workaround: index the file.)\n", tmp.s, args->fnames[i]);
                    if ( prev_chr_id!=chr_id )
                    {
                        prev_pos = -1;
                        if ( args->seen_seq[chr_id] )
                            error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", tmp.s);
                    }
                    // the second column is the 1-based position
                    char *end;
                    int pos = strtol(str+1,&end,10) - 1;
                    if ( end==str+1 ) error("Could not parse line: %s\n", fp->line.s);
                    if ( prev_pos > pos )
                        error("The chromosome block %s is not sorted, consider running with -a.\n", tmp.s);
                    args->seen_seq[chr_id] = 1;
                    prev_chr_id = chr_id;
                    prev_pos = pos;     // without this the sort check above can never trigger

                    if ( vcf_write_line(args->out_fh, &fp->line)!=0 ) error("Failed to write %d bytes\n", (int)fp->line.l);
                }
            }
            else
            {
                // BCF conversion is required
                line->max_unpack = 0;
                while ( bcf_read(fp, hdr, line)==0 )
                {
                    bcf_translate(args->out_hdr, hdr, line);

                    if ( prev_chr_id!=line->rid )
                    {
                        prev_pos = -1;
                        if ( args->seen_seq[line->rid] )
                            error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", bcf_seqname(args->out_hdr, line));
                    }
                    if ( prev_pos > line->pos )
                        error("The chromosome block %s is not sorted, consider running with -a.\n", bcf_seqname(args->out_hdr, line));
                    args->seen_seq[line->rid] = 1;
                    prev_chr_id = line->rid;
                    prev_pos = line->pos;   // without this the sort check above can never trigger

                    if ( bcf_write(args->out_fh, args->out_hdr, line)!=0 ) error("Failed to write\n");
                }
            }
            bcf_hdr_destroy(hdr);
            hts_close(fp);
        }
        bcf_destroy(line);
        free(tmp.s);
    }
}
/*
 *  query_regions() - print the records of an indexed file which overlap the
 *  given regions to standard output.
 *
 *  @fname:  the file to query; BCF files are read through their .csi index
 *           and written as BCF/VCF records, other supported formats are read
 *           through a tabix index and printed line by line
 *  @regs:   array of region strings; the strings and the array itself are
 *           freed before returning
 *  @nregs:  number of regions
 *
 *  If args->targets_fname is set, only records overlapping the targets are
 *  printed.  The header is printed when args->print_header is set and the
 *  records are skipped when args->header_only is set.
 *
 *  Returns 0; all failures terminate the program via error().
 */
static int query_regions(args_t *args, char *fname, char **regs, int nregs)
{
    int i;

    htsFile *fp = hts_open(fname,"r");
    if ( !fp ) error("Could not read %s\n", fname);
    enum htsExactFormat format = hts_get_format(fp)->format;

    regidx_t *reg_idx = NULL;
    if ( args->targets_fname )
    {
        reg_idx = regidx_init(args->targets_fname, NULL, NULL, 0, NULL);
        if ( !reg_idx ) error("Could not read %s\n", args->targets_fname);
    }

    if ( format == bcf )
    {
        htsFile *out = hts_open("-","w");
        if ( !out ) error("Could not open stdout\n");
        hts_idx_t *idx = bcf_index_load(fname);
        if ( !idx ) error("Could not load .csi index of %s\n", fname);
        bcf_hdr_t *hdr = bcf_hdr_read(fp);
        if ( !hdr ) error("Could not read the header: %s\n", fname);
        if ( args->print_header && bcf_hdr_write(out,hdr)!=0 )
            error("Failed to write the header\n");
        if ( !args->header_only )
        {
            bcf1_t *rec = bcf_init();
            for (i=0; i<nregs; i++)
            {
                hts_itr_t *itr = bcf_itr_querys(idx,hdr,regs[i]);
                if ( !itr ) continue;   // nothing to iterate over, same as in the tabix branch
                while ( bcf_itr_next(fp, itr, rec) >=0 )
                {
                    if ( reg_idx && !regidx_overlap(reg_idx, bcf_seqname(hdr,rec),rec->pos,rec->pos+rec->rlen-1, NULL) ) continue;
                    if ( bcf_write(out,hdr,rec)!=0 ) error("Failed to write\n");
                }
                tbx_itr_destroy(itr);
            }
            bcf_destroy(rec);
        }
        if ( hts_close(out) ) error("hts_close returned non-zero status for stdout\n");
        bcf_hdr_destroy(hdr);
        hts_idx_destroy(idx);
    }
    else if ( format==vcf || format==sam || format==unknown_format )
    {
        tbx_t *tbx = tbx_index_load(fname);
        if ( !tbx ) error("Could not load .tbi/.csi index of %s\n", fname);
        kstring_t str = {0,0,0};
        if ( args->print_header )
        {
            // header lines are the leading lines starting with the meta character
            while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 )
            {
                if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break;
                puts(str.s);
            }
        }
        if ( !args->header_only )
        {
            // sequence names are needed only for the targets lookup
            int nseq;
            const char **seq = NULL;
            if ( reg_idx ) seq = tbx_seqnames(tbx, &nseq);
            for (i=0; i<nregs; i++)
            {
                hts_itr_t *itr = tbx_itr_querys(tbx, regs[i]);
                if ( !itr ) continue;
                while ( tbx_itr_next(fp, tbx, itr, &str) >= 0 )
                {
                    if ( reg_idx && !regidx_overlap(reg_idx,seq[itr->curr_tid],itr->curr_beg,itr->curr_end, NULL) ) continue;
                    puts(str.s);
                }
                tbx_itr_destroy(itr);
            }
            free(seq);
        }
        free(str.s);
        tbx_destroy(tbx);
    }
    else if ( format==bam )
        error("Please use \"samtools view\" for querying BAM files.\n");

    if ( reg_idx ) regidx_destroy(reg_idx);
    if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname);

    // the regions are owned by this function
    for (i=0; i<nregs; i++) free(regs[i]);
    free(regs);
    return 0;
}
/*
 *  init_data() - read the input VCF/BCF and prepare everything needed for fitting.
 *
 *  The function
 *    - opens args->fname through the synced reader, applying the optional
 *      regions and targets restrictions
 *    - selects the sample (args->sample, or the only sample in the file)
 *    - builds one histogram of FORMAT/BAF values per chromosome (args->dist,
 *      args->ndist) with args->nbins bins sharing the x-axis args->xvals
 *    - calls init_dist() on each histogram
 *    - opens <output_dir>/dist.dat (kept open in args->dat_fp) and writes its header
 *    - writes the executable plotting script <output_dir>/dist.py
 *
 *  All failures are reported through error(), which is defined elsewhere; the
 *  code relies on it not returning.
 */
static void init_data(args_t *args)
{
    int i;

    // The x-axis computation below divides by nbins-1
    if ( args->nbins < 2 ) error("At least two bins are required, got %d\n", args->nbins);

    bcf_srs_t *files = bcf_sr_init();
    if ( args->regions_list )
    {
        if ( bcf_sr_set_regions(files, args->regions_list, args->regions_is_file)<0 )
            error("Failed to read the regions: %s\n", args->regions_list);
    }
    if ( args->targets_list )
    {
        if ( bcf_sr_set_targets(files, args->targets_list, args->targets_is_file, 0)<0 )
            error("Failed to read the targets: %s\n", args->targets_list);
    }
    if ( !bcf_sr_add_reader(files, args->fname) ) error("Failed to open %s: %s\n", args->fname,bcf_sr_strerror(files->errnum));
    bcf_hdr_t *hdr = files->readers[0].header;

    if ( !args->sample )
    {
        if ( bcf_hdr_nsamples(hdr)<1 ) error("No samples in the VCF: %s\n", args->fname);
        if ( bcf_hdr_nsamples(hdr)>1 ) error("Missing the option -s, --sample\n");
        // NOTE(review): this pointer refers to memory owned by the header, which is released by
        // bcf_sr_destroy() below - confirm that args->sample is not used after init_data()
        args->sample = hdr->samples[0];
    }
    else if ( bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,args->sample)<0 ) error("No such sample: %s\n", args->sample);
    int ret = bcf_hdr_set_samples(hdr, args->sample, 0);
    if ( ret<0 ) error("Error setting the sample: %s\n", args->sample);

    // Look the tag up first so that a negative id is never used as an index by the header macro
    int baf_id = bcf_hdr_id2int(hdr,BCF_DT_ID,"BAF");
    if ( baf_id<0 || !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,baf_id) )
        error("The tag FORMAT/BAF is not present in the VCF: %s\n", args->fname);

    // Bin centres, evenly spaced from 0 to 1 inclusive
    args->xvals = calloc(args->nbins,sizeof(*args->xvals));
    if ( !args->xvals ) error("Could not allocate %d bins\n", args->nbins);
    for (i=0; i<args->nbins; i++) args->xvals[i] = 1.0*i/(args->nbins-1);

    // collect BAF distributions for all chromosomes
    int idist = -1, nbaf = 0, prev_chr = -1;
    float *baf = NULL;
    while ( bcf_sr_next_line(files) )
    {
        bcf1_t *line = bcf_sr_get_line(files,0);
        if ( bcf_get_format_float(hdr,line,"BAF",&baf,&nbaf) != 1 ) continue;
        if ( bcf_float_is_missing(baf[0]) ) continue;

        // Values outside of [0,1] (and NaN, which fails both comparisons) would index
        // outside of the histogram, skip them
        if ( !(baf[0]>=0 && baf[0]<=1) ) continue;

        if ( prev_chr==-1 || prev_chr!=line->rid )
        {
            // new chromosome
            idist = args->ndist++;
            dist_t *grown = realloc(args->dist, sizeof(*grown)*args->ndist);
            if ( !grown ) error("Could not allocate memory for %d distributions\n", args->ndist);
            args->dist = grown;
            memset(&args->dist[idist],0,sizeof(dist_t));
            args->dist[idist].chr   = strdup(bcf_seqname(hdr,line));
            args->dist[idist].yvals = calloc(args->nbins,sizeof(double));
            if ( !args->dist[idist].chr || !args->dist[idist].yvals )
                error("Could not allocate memory for the distribution of %s\n", bcf_seqname(hdr,line));
            args->dist[idist].xvals = args->xvals;  // shared by all distributions
            args->dist[idist].nvals = args->nbins;
            prev_chr = line->rid;
        }
        int bin = baf[0]*(args->nbins-1);
        args->dist[idist].yvals[bin]++;   // the distribution
    }
    free(baf);
    bcf_sr_destroy(files);

    for (idist=0; idist<args->ndist; idist++)
    {
        #if 0
            int j;
            for (j=0; j<args->nbins; j++)
            {
                double x = args->dist[idist].xvals[j];
                args->dist[idist].yvals[j] = exp(-(x-0.5)*(x-0.5)/1e-3);
            }
        #endif
        init_dist(args, &args->dist[idist],args->verbose);
    }

    // open_file() is defined elsewhere; guard against a NULL stream in case that is how it reports failure
    args->dat_fp = open_file(&args->dat_fname,"w","%s/dist.dat", args->output_dir);
    if ( !args->dat_fp ) error("Could not open %s/dist.dat for writing\n", args->output_dir);
    fprintf(args->dat_fp, "# This file was produced by: bcftools polysomy(%s+htslib-%s), the command line was:\n", bcftools_version(),hts_version());
    fprintf(args->dat_fp, "# \t bcftools %s ", args->argv[0]);
    for (i=1; i<args->argc; i++)
        fprintf(args->dat_fp, " %s",args->argv[i]);
    fprintf(args->dat_fp,"\n#\n");
    fprintf(args->dat_fp,"# DIST\t[2]Chrom\t[3]BAF\t[4]Normalized Count\n");
    fprintf(args->dat_fp,"# FIT\t[2]Goodness of Fit\t[3]iFrom\t[4]iTo\t[5]The Fitted Function\n");
    fprintf(args->dat_fp,"# CN\t[2]Chrom\t[3]Estimated Copy Number\t[4]Absolute fit deviation\n");

    char *fname = NULL;
    FILE *fp = open_file(&fname,"w","%s/dist.py", args->output_dir);
    if ( !fp ) error("Could not open %s/dist.py for writing\n", args->output_dir);

    //-------- matplotlib script --------------
    // Python is indentation-sensitive, the leading spaces inside the literals are significant.
    // The only conversion in the format is the output directory; %% yields a literal percent sign.
    fprintf(fp,
        "#!/usr/bin/env python\n"
        "#\n"
        "import matplotlib as mpl\n"
        "mpl.use('Agg')\n"
        "import matplotlib.pyplot as plt\n"
        "import csv,sys,argparse\n"
        "from math import exp\n"
        "\n"
        "outdir = '%s'\n"
        "\n"
        "def read_dat(dat,fit,cn):\n"
        "    csv.register_dialect('tab', delimiter='\t', quoting=csv.QUOTE_NONE)\n"
        "    with open(outdir+'/dist.dat', 'rb') as f:\n"
        "        reader = csv.reader(f, 'tab')\n"
        "        for row in reader:\n"
        "            if row[0][0]=='#': continue\n"
        "            type = row[0]\n"
        "            chr = row[1]\n"
        "            if type=='DIST':\n"
        "                if chr not in dat: dat[chr] = []\n"
        "                dat[chr].append(row)\n"
        "            elif type=='FIT':\n"
        "                if chr not in fit: fit[chr] = []\n"
        "                fit[chr].append(row)\n"
        "            elif type=='CN':\n"
        "                cn[chr] = row[2]\n"
        "\n"
        "def plot_dist(dat,fit,chr):\n"
        "    fig, ax = plt.subplots(1, 1, figsize=(7,5))\n"
        "    ax.plot([x[2] for x in dat[chr]],[x[3] for x in dat[chr]],'k-',label='Distribution')\n"
        "    if chr in fit:\n"
        "        for i in range(len(fit[chr])):\n"
        "            pfit = fit[chr][i]\n"
        "            exec('def xfit(x): return '+pfit[5])\n"
        "            istart = int(pfit[3])\n"
        "            iend = int(pfit[4])+1\n"
        "            vals = dat[chr][istart:iend]\n"
        "            args = {}\n"
        "            if i==0: args = {'label':'Target to Fit'}\n"
        "            ax.plot([x[2] for x in vals],[x[3] for x in vals],'r-',**args)\n"
        "            if i==0: args = {'label':'Best Fit'}\n"
        "            ax.plot([x[2] for x in vals],[xfit(float(x[2])) for x in vals],'g-',**args)\n"
        "    ax.set_title('BAF distribution, chr'+chr)\n"
        "    ax.set_xlabel('BAF')\n"
        "    ax.set_ylabel('Frequency')\n"
        "    ax.legend(loc='best',prop={'size':7},frameon=False)\n"
        "    plt.savefig(outdir+'/dist.chr'+chr+'.png')\n"
        "    plt.close()\n"
        "\n"
        "def plot_copy_number(cn):\n"
        "    fig, ax = plt.subplots(1, 1, figsize=(7,5))\n"
        "    xlabels = sorted(cn.keys())\n"
        "    xvals = range(len(xlabels))\n"
        "    yvals = [float(cn[x]) for x in xlabels]\n"
        "    ax.plot(xvals,yvals,'o',color='red')\n"
        "    for i in range(len(xvals)):\n"
        "        if yvals[i]==-1: ax.annotate('?', xy=(xvals[i],0.5),va='center',ha='center',color='red',fontweight='bold')\n"
        "    ax.tick_params(axis='both', which='major', labelsize=9)\n"
        "    ax.set_xticks(xvals)\n"
        "    ax.set_xticklabels(xlabels,rotation=45)\n"
        "    ax.set_xlim(-1,len(xlabels))\n"
        "    ax.set_ylim(0,5.0)\n"
        "    ax.set_yticks([1.0,2.0,3.0,4.0])\n"
        "    ax.set_xlabel('Chromosome')\n"
        "    ax.set_ylabel('Copy Number')\n"
        "    plt.savefig(outdir+'/copy-number.png')\n"
        "    plt.close()\n"
        "\n"
        "class myParser(argparse.ArgumentParser):\n"
        "    def error(self, message):\n"
        "        self.print_help()\n"
        "        sys.stderr.write('error: %%s\\n' %% message)\n"
        "        sys.exit(2)\n"
        "\n"
        "def main():\n"
        "    parser = myParser()\n"
        "    parser.add_argument('-a', '--all', action='store_true', help='Create all plots')\n"
        "    parser.add_argument('-c', '--copy-number', action='store_true', help='Create copy-number plot')\n"
        "    parser.add_argument('-d', '--distrib', metavar='CHR', help='Plot BAF distribution of a single chromosome')\n"
        "    args = parser.parse_args()\n"
        "    dat = {}; fit = {}; cn = {}\n"
        "    read_dat(dat,fit,cn)\n"
        "    if args.distrib!=None:\n"
        "        plot_dist(dat,fit,args.distrib)\n"
        "    if args.all:\n"
        "        for chr in dat: plot_dist(dat,fit,chr)\n"
        "        plot_copy_number(cn)\n"
        "    elif args.copy_number:\n"
        "        plot_copy_number(cn)\n"
        "    else:\n"
        "        for chr in dat: plot_dist(dat,fit,chr)\n"
        "\n"
        "if __name__ == '__main__':\n"
        "    main()\n",
        args->output_dir);
    //---------------------------------------

    // fclose flushes the buffered script, a failure here means the file is incomplete
    if ( fclose(fp)!=0 ) error("Failed to write %s\n", fname);
    chmod(fname, S_IWUSR|S_IRUSR|S_IRGRP|S_IROTH|S_IXUSR|S_IXGRP|S_IXOTH);
    free(fname);
}
/*
 *  warn_ploidy() - report an unexpected ploidy at the position of rec.
 *
 *  The message goes to stderr and is emitted on the first call only; the
 *  function-local static flag turns every later call into a no-op.
 */
static void warn_ploidy(bcf1_t *rec)
{
    static int already_reported = 0;

    if ( !already_reported )
    {
        fprintf(stderr,
                "Incorrect ploidy at %s:%d, skipping the trio. (This warning is printed only once.)\n",
                bcf_seqname(args.hdr,rec), rec->pos+1);
        already_reported = 1;
    }
}