/**
 * Classify the genotype of one sample stored in a GT FORMAT field.
 *
 * Each stored value v is interpreted through v>>1: 0 means a missing allele
 * (whatever the lowest bit is), 1 is counted as the reference allele and any
 * larger value as an alternate allele.  Scanning stops at the type's
 * vector-end marker (sample with smaller ploidy).
 *
 * @param fmt_ptr  GT field; fmt_ptr->n values per sample, samples are
 *                 fmt_ptr->size bytes apart in fmt_ptr->p
 * @param isample  index of the sample to inspect
 * @param _ial     if non-NULL, receives (v>>1)-1 of the alternate allele
 *                 seen, or of the smaller one when two different alternate
 *                 alleles were seen; 0 when no alternate allele was seen
 * @param _jal     if non-NULL, receives (v>>1)-1 of the second, different
 *                 alternate allele; 0 when there is none
 *
 * @return GT_UNKN    no non-missing allele
 *         GT_HAPL_R  exactly one non-missing allele, reference
 *         GT_HAPL_A  exactly one non-missing allele, alternate
 *         GT_HOM_AA  no reference allele, a single kind of alternate allele
 *         GT_HET_AA  no reference allele, two different alternate alleles
 *         GT_HOM_RR  only reference alleles
 *         GT_HET_RA  both reference and alternate alleles
 *
 * Unsupported fmt_ptr->type values print a message and terminate the program.
 */
int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *_ial, int *_jal)
{
    int i, nals = 0, has_ref = 0, has_alt = 0, ial = 0, jal = 0;

    #define BRANCH_INT(type_t,missing,vector_end) { \
        type_t *p = (type_t*) (fmt_ptr->p + isample*fmt_ptr->size); \
        for (i=0; i<fmt_ptr->n; i++) \
        { \
            if ( p[i] == vector_end ) break; /* smaller ploidy */ \
            /* test v>>1 rather than v: a missing allele may carry the phase bit */ \
            if ( !(p[i]>>1) || p[i] == missing ) continue; /* missing allele */ \
            int tmp = p[i]>>1; \
            if ( tmp>1 ) \
            { \
                if ( !ial ) { ial = tmp; has_alt = 1; } \
                else if ( tmp!=ial ) \
                { \
                    /* keep the smaller alternate allele in ial */ \
                    if ( tmp<ial ) \
                    { \
                        jal = ial; \
                        ial = tmp; \
                    } \
                    else \
                    { \
                        jal = tmp; \
                    } \
                    has_alt = 2; \
                } \
            } \
            else has_ref = 1; \
            nals++; \
        } \
    }
    switch (fmt_ptr->type) {
        case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_missing,  bcf_int8_vector_end); break;
        case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
        case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
        default: fprintf(stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break;
    }
    #undef BRANCH_INT

    /* stored values are offset by one relative to what is reported */
    if ( _ial ) *_ial = ial>0 ? ial-1 : ial;
    if ( _jal ) *_jal = jal>0 ? jal-1 : jal;

    if ( !nals ) return GT_UNKN;
    if ( nals==1 )
        return has_ref ? GT_HAPL_R : GT_HAPL_A;
    if ( !has_ref )
        return has_alt==1 ? GT_HOM_AA : GT_HET_AA;
    if ( !has_alt )
        return GT_HOM_RR;
    return GT_HET_RA;
}
// true if all samples are phased. // haploid genotypes are considered phased // ./. => not phased, .|. => phased int bcf_all_phased(const bcf_hdr_t *header, bcf1_t *line) { bcf_unpack(line, BCF_UN_FMT); bcf_fmt_t *fmt_ptr = bcf_get_fmt(header, line, "GT"); int all_phased = 1; if ( fmt_ptr ) { int i, isample; for (isample=0; isample<line->n_sample; isample++) { int sample_phased = 0; #define BRANCH_INT(type_t,vector_end) { \ type_t *p = (type_t*) (fmt_ptr->p + isample*fmt_ptr->size); \ for (i=0; i<fmt_ptr->n; i++) \ { \ if (fmt_ptr->n == 1 || (p[i] == vector_end && i == 1)) { sample_phased = 1; break; } /* haploid phased by definition */ \ if ( p[i] == vector_end ) { break; }; /* smaller ploidy */ \ if ( bcf_gt_is_missing(p[i]) ) continue; /* missing allele */ \ if ((p[i])&1) { \ sample_phased = 1; \ break; \ } \ } \ } switch (fmt_ptr->type) { case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; default: fprintf(stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break; } #undef BRANCH_INT if (!sample_phased) { all_phased = 0; break; } } } return all_phased; }
int calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) { int i; for (i=0; i<line->n_allele; i++) ac[i]=0; // Use INFO/AC,AN field only when asked if ( which&BCF_UN_INFO ) { bcf_unpack(line, BCF_UN_INFO); int an_id = bcf_id2int(header, BCF_DT_ID, "AN"); int ac_id = bcf_id2int(header, BCF_DT_ID, "AC"); if ( an_id>=0 && ac_id>=0 ) { int i, an=0, ac_len=0, ac_type=0; uint8_t *ac_ptr=NULL; for (i=0; i<line->n_info; i++) { bcf_info_t *z = &line->d.info[i]; if ( z->key == an_id ) an = z->v1.i; else if ( z->key == ac_id ) { ac_ptr = z->vptr; ac_len = z->len; ac_type = z->type; } } int nac = 0; #define BRANCH_INT(type_t) { \ type_t *p = (type_t *) ac_ptr; \ for (i=0; i<ac_len; i++) \ { \ ac[i+1] = p[i]; \ nac += p[i]; \ } \ } if ( ac_type==BCF_BT_INT8 ) { BRANCH_INT(uint8_t) } else if ( ac_type==BCF_BT_INT16 ) { BRANCH_INT(uint16_t) } else if ( ac_type==BCF_BT_INT32 ) { BRANCH_INT(uint32_t) } #undef BRANCH_INT ac[0] = an - nac; return 1; }
/**
 * Fill ac[] with per-allele counts for one record.
 *
 * @param header  header used to look up the AN, AC and GT ids
 * @param line    record to inspect; unpacked here as far as needed
 * @param ac      output array with room for line->n_allele ints; zeroed first
 * @param which   bit mask: BCF_UN_INFO allows taking the counts from
 *                INFO/AN and INFO/AC, BCF_UN_FMT allows counting them from
 *                the GT FORMAT field.  INFO is tried first when both are set.
 *
 * @return 1 if ac[] was filled, 0 if none of the requested sources was usable.
 *
 * INFO path: ac[k+1] is the k-th AC value and ac[0] is AN minus the sum of
 * the AC values.  An AC vector with more values than fit into
 * ac[1..n_allele-1] is treated as unusable (the FORMAT path is tried next if
 * it was requested) instead of being written past the end of ac[].
 *
 * FORMAT path: every non-missing allele of every sample increments
 * ac[(v>>1)-1].
 *
 * Inconsistent data (AN smaller than the AC sum, an allele index outside
 * 0..n_allele-1) and unsupported storage types print a message and terminate
 * the program.
 */
int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which)
{
    int i;
    for (i=0; i<line->n_allele; i++) ac[i]=0;

    // Use INFO/AC,AN field only when asked
    if ( which&BCF_UN_INFO )
    {
        bcf_unpack(line, BCF_UN_INFO);
        int an_id = bcf_hdr_id2int(header, BCF_DT_ID, "AN");
        int ac_id = bcf_hdr_id2int(header, BCF_DT_ID, "AC");
        int an=-1, ac_len=0, ac_type=0;
        uint8_t *ac_ptr=NULL;
        if ( an_id>=0 && ac_id>=0 )
        {
            for (i=0; i<line->n_info; i++)
            {
                bcf_info_t *z = &line->d.info[i];
                if ( z->key == an_id ) an = z->v1.i;
                else if ( z->key == ac_id ) { ac_ptr = z->vptr; ac_len = z->len; ac_type = z->type; }
            }
        }
        // The AC values go to ac[1..ac_len]; they must fit into the caller's array
        if ( an>=0 && ac_ptr && ac_len < (int)line->n_allele )
        {
            int nac = 0;
            #define BRANCH_INT(type_t) { \
                type_t *p = (type_t *) ac_ptr; \
                for (i=0; i<ac_len; i++) \
                { \
                    ac[i+1] = p[i]; \
                    nac += p[i]; \
                } \
            }
            switch (ac_type) {
                case BCF_BT_INT8:  BRANCH_INT(int8_t); break;
                case BCF_BT_INT16: BRANCH_INT(int16_t); break;
                case BCF_BT_INT32: BRANCH_INT(int32_t); break;
                default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, ac_type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break;
            }
            #undef BRANCH_INT

            // Sanity check; done at run time so that it is not compiled out with NDEBUG
            if ( an<nac )
            {
                fprintf(stderr, "[E::%s] Incorrect AN/AC counts at %s:%d\n", __func__, header->id[BCF_DT_CTG][line->rid].key, line->pos+1);
                exit(1);
            }
            ac[0] = an - nac;
            return 1;
        }
    }

    // Split genotype fields only when asked
    if ( which&BCF_UN_FMT )
    {
        int gt_id = bcf_hdr_id2int(header,BCF_DT_ID,"GT");
        if ( gt_id<0 ) return 0;
        bcf_unpack(line, BCF_UN_FMT);
        bcf_fmt_t *fmt_gt = NULL;
        for (i=0; i<(int)line->n_fmt; i++)
            if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; }
        if ( !fmt_gt ) return 0;

        #define BRANCH_INT(type_t,missing,vector_end) { \
            for (i=0; i<line->n_sample; i++) \
            { \
                type_t *p = (type_t*) (fmt_gt->p + i*fmt_gt->size); \
                int ial; \
                for (ial=0; ial<fmt_gt->n; ial++) \
                { \
                    if ( p[ial]==vector_end ) break; /* smaller ploidy */ \
                    if ( !(p[ial]>>1) || p[ial]==missing ) continue; /* missing allele */ \
                    int idx = (p[ial]>>1) - 1; \
                    if ( idx<0 || idx>=(int)line->n_allele ) /* would write outside ac[] */ \
                    { \
                        fprintf(stderr, "[E::%s] Incorrect allele (\"%d\") at %s:%d\n", __func__, idx, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \
                        exit(1); \
                    } \
                    ac[idx]++; \
                } \
            } \
        }
        switch (fmt_gt->type) {
            case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_missing,  bcf_int8_vector_end); break;
            case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
            case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
            default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, fmt_gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break;
        }
        #undef BRANCH_INT
        return 1;
    }
    return 0;
}
/**
 * Recompute the allele-count INFO tags of one record from its GT field.
 *
 * For every sample the alleles are read up to the first vector-end marker or
 * missing allele.  Samples with no allele before that point are ignored; the
 * others are counted in NS and classified:
 *   - one allele read                      -> nhemi of that allele += 1
 *   - several alleles read, all identical  -> nhom  of that allele += 2
 *   - several alleles read, not identical  -> nhet  of each distinct allele += 1
 * The classification compares allele indices directly, so it does not depend
 * on the number of alleles at the site.
 *
 * Depending on the bits set in args.tags the tags NS, AN, AF, AC, AC_Het,
 * AC_Hom and AC_Hemi are then updated on the record.  AF is only updated when
 * the total allele number is non-zero.
 *
 * @param rec  record to annotate; modified in place
 * @return rec (unchanged if the record has no GT field)
 *
 * Failures are reported through error().
 */
bcf1_t *process(bcf1_t *rec)
{
    int i, ns = 0;
    bcf_unpack(rec, BCF_UN_FMT);
    bcf_fmt_t *fmt_gt = NULL;
    for (i=0; i<rec->n_fmt; i++)
        if ( rec->d.fmt[i].id==args.gt_id ) { fmt_gt = &rec->d.fmt[i]; break; }
    if ( !fmt_gt ) return rec;    // no GT tag

    hts_expand(int32_t,rec->n_allele,args.marr,args.arr);
    hts_expand(float,rec->n_allele,args.mfarr,args.farr);
    hts_expand(counts_t,rec->n_allele,args.mcounts,args.counts);
    memset(args.arr,0,sizeof(*args.arr)*rec->n_allele);
    memset(args.counts,0,sizeof(*args.counts)*rec->n_allele);

    #define BRANCH_INT(type_t,vector_end) { \
        for (i=0; i<rec->n_sample; i++) \
        { \
            type_t *p = (type_t*) (fmt_gt->p + i*fmt_gt->size); \
            int ial, jal, nal, first = -1, is_hom = 1; \
            for (ial=0; ial<fmt_gt->n; ial++) \
            { \
                if ( p[ial]==vector_end ) break; /* smaller ploidy */ \
                if ( bcf_gt_is_missing(p[ial]) ) break; /* missing allele */ \
                int idx = bcf_gt_allele(p[ial]); \
                \
                if ( idx<0 || idx >= rec->n_allele ) \
                    error("Incorrect allele (\"%d\") in %s at %s:%d\n",idx,args.in_hdr->samples[i],bcf_seqname(args.in_hdr,rec),rec->pos+1); \
                if ( ial==0 ) first = idx; \
                else if ( idx!=first ) is_hom = 0; \
            } \
            nal = ial; \
            if ( nal==0 ) continue; /* missing alleles */ \
            ns++; \
            if ( is_hom ) \
            { \
                if ( nal==1 ) args.counts[first].nhemi++; \
                else args.counts[first].nhom += 2; \
            } \
            else \
            { \
                for (ial=0; ial<nal; ial++) \
                { \
                    int idx = bcf_gt_allele(p[ial]); \
                    /* count each distinct allele once: skip if seen earlier in this sample */ \
                    for (jal=0; jal<ial; jal++) \
                        if ( bcf_gt_allele(p[jal])==idx ) break; \
                    if ( jal==ial ) args.counts[idx].nhet++; \
                } \
            } \
        } \
    }
    switch (fmt_gt->type) {
        case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_vector_end); break;
        case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
        case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
        default: error("The GT type is not recognised: %d at %s:%d\n",fmt_gt->type, bcf_seqname(args.in_hdr,rec),rec->pos+1); break;
    }
    #undef BRANCH_INT

    // Total number of counted alleles; computed once so that the AF branch
    // does not depend on what an earlier branch left in args.arr[0]
    int32_t an = 0;
    for (i=0; i<rec->n_allele; i++)
        an += args.counts[i].nhet + args.counts[i].nhom + args.counts[i].nhemi;

    if ( args.tags&SET_NS )
    {
        if ( bcf_update_info_int32(args.out_hdr,rec,"NS",&ns,1)!=0 )
            error("Error occurred while updating NS at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1);
    }
    if ( args.tags&SET_AN )
    {
        args.arr[0] = an;
        if ( bcf_update_info_int32(args.out_hdr,rec,"AN",args.arr,1)!=0 )
            error("Error occurred while updating AN at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1);
    }
    if ( args.tags&SET_AF )
    {
        int n = rec->n_allele-1;
        if ( an )   // no frequencies without any counted allele
        {
            for (i=1; i<rec->n_allele; i++)
                args.farr[i] = (args.counts[i].nhet + args.counts[i].nhom + args.counts[i].nhemi)*1.0/an;
            // NOTE(review): with n==0 this passes zero values; presumably that clears the tag - confirm
            if ( bcf_update_info_float(args.out_hdr,rec,"AF",args.farr+1,n)!=0 )
                error("Error occurred while updating AF at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1);
        }
    }
    if ( args.tags&SET_AC )
    {
        int n = rec->n_allele-1;
        if ( n>0 )
        {
            memset(args.arr,0,sizeof(*args.arr)*rec->n_allele);
            for (i=1; i<rec->n_allele; i++)
                args.arr[i] = args.counts[i].nhet + args.counts[i].nhom + args.counts[i].nhemi;
        }
        if ( bcf_update_info_int32(args.out_hdr,rec,"AC",args.arr+1,n)!=0 )
            error("Error occurred while updating AC at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1);
    }
    if ( args.tags&SET_AC_Het )
    {
        int n = rec->n_allele-1;
        if ( n>0 )
        {
            memset(args.arr,0,sizeof(*args.arr)*rec->n_allele);
            for (i=1; i<rec->n_allele; i++)
                args.arr[i] += args.counts[i].nhet;
        }
        if ( bcf_update_info_int32(args.out_hdr,rec,"AC_Het",args.arr+1,n)!=0 )
            error("Error occurred while updating AC_Het at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1);
    }
    if ( args.tags&SET_AC_Hom )
    {
        int n = rec->n_allele-1;
        if ( n>0 )
        {
            memset(args.arr,0,sizeof(*args.arr)*rec->n_allele);
            for (i=1; i<rec->n_allele; i++)
                args.arr[i] += args.counts[i].nhom;
        }
        if ( bcf_update_info_int32(args.out_hdr,rec,"AC_Hom",args.arr+1,n)!=0 )
            error("Error occurred while updating AC_Hom at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1);
    }
    if ( args.tags&SET_AC_Hemi )
    {
        int n = rec->n_allele-1;
        if ( n>0 )
        {
            memset(args.arr,0,sizeof(*args.arr)*rec->n_allele);
            for (i=1; i<rec->n_allele; i++)
                args.arr[i] += args.counts[i].nhemi;
        }
        if ( bcf_update_info_int32(args.out_hdr,rec,"AC_Hemi",args.arr+1,n)!=0 )
            error("Error occurred while updating AC_Hemi at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1);
    }
    return rec;
}