// add ROC decision point void BlockQuantify::addROCValue(std::string const & roc_identifier, roc::DecisionType dt, double level, uint64_t n, bcf1_t * v) { // add observation to a roc auto observe = [this, level, dt, n, v](std::string const & name, bool f) { roc::DecisionType final_dt = dt; if(f) { if(dt == roc::DecisionType::TP) { // filter-failed TPs become FNs final_dt = roc::DecisionType::FN; } else if(dt == roc::DecisionType::TP2) { // filter-failed TPs become FNs final_dt = roc::DecisionType::FN2; } else if(dt != roc::DecisionType::FN) { // filter-failed FPs / UNKs become Ns final_dt = roc::DecisionType::N; } } uint64_t flags = 0; switch(final_dt) { case roc::DecisionType::FN: case roc::DecisionType::TP: { const std::string bk_truth = bcfhelpers::getFormatString(_impl->hdr, v, "BK", 0, "."); const std::string bi_truth = bcfhelpers::getFormatString(_impl->hdr, v, "BI", 0, "."); const std::string blt_truth = bcfhelpers::getFormatString(_impl->hdr, v, "BLT", 0, "."); flags = roc::makeObservationFlags(bk_truth, bi_truth, blt_truth); break; } case roc::DecisionType::FP: case roc::DecisionType::UNK: case roc::DecisionType::TP2: case roc::DecisionType::FN2: { const std::string bk_query = bcfhelpers::getFormatString(_impl->hdr, v, "BK", 1, "."); const std::string bi_query = bcfhelpers::getFormatString(_impl->hdr, v, "BI", 1, "."); const std::string blt_query = bcfhelpers::getFormatString(_impl->hdr, v, "BLT", 1, "."); flags = roc::makeObservationFlags(bk_query, bi_query, blt_query); break; } default: break; } auto it = _impl->rocs.find(name); if(it == _impl->rocs.end()) { it = _impl->rocs.insert(std::make_pair(name, roc::Roc())).first; } // make sure FN and N always come first if( ( final_dt == roc::DecisionType::FN || final_dt == roc::DecisionType::FN2 || final_dt == roc::DecisionType::N ) && level == 0 ) { it->second.add(roc::Observation{std::numeric_limits<double>::min(), final_dt, n, flags}); } else { it->second.add(roc::Observation{level, final_dt, n, flags}); } }; bcf_unpack(v, BCF_UN_FLT); bool fail = false; // fails any of the non-blocked filters bool fail_any = false; // fails any filter for(int j = 0; j < v->d.n_flt; ++j) { std::string filter = "PASS"; int k = v->d.flt[j]; if(k >= 0) { filter = bcf_hdr_int2id(_impl->hdr, BCF_DT_ID, v->d.flt[j]); } if(filter != "PASS" && filter != "") { if(!_impl->filters_to_ignore.count("*") && !_impl->filters_to_ignore.count(filter)) { fail = true; observe("f:" + roc_identifier + ":" + filter, false); } else { observe("f:" + roc_identifier + ":SEL_IGN_" + filter, false); } fail_any = true; } } observe("a:" + roc_identifier + ":PASS", fail_any); // selectively-filtered ROCs if(!_impl->filters_to_ignore.empty()) { observe("a:" + roc_identifier + ":SEL", fail); } observe("a:" + roc_identifier + ":ALL", false); const std::string regions = bcfhelpers::getInfoString(_impl->hdr, v, "Regions", ""); if(!regions.empty() && regions != "CONF") { std::vector<std::string> rs; stringutil::split(regions, rs, ","); for(auto const & r : rs) { if(r == "CONF") { continue; } observe("s|" + r + ":" + roc_identifier + ":PASS", fail_any); if(!_impl->filters_to_ignore.empty()) { observe("s|" + r + ":" + roc_identifier + ":SEL", fail); } observe("s|" + r + ":" + roc_identifier + ":ALL", false); } } }
void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask) { int *map = (int*) calloc(line->n_allele, sizeof(int)); // create map of indexes from old to new ALT numbering and modify ALT kstring_t str = {0,0,0}; kputs(line->d.allele[0], &str); int nrm = 0, i,j; // i: ori alleles, j: new alleles for (i=1, j=1; i<line->n_allele; i++) { if ( rm_mask & 1<<i ) { // remove this allele line->d.allele[i] = NULL; nrm++; continue; } kputc(',', &str); kputs(line->d.allele[i], &str); map[i] = j; j++; } if ( !nrm ) { free(map); free(str.s); return; } int nR_ori = line->n_allele; int nR_new = line->n_allele-nrm; assert(nR_new > 0); // should not be able to remove reference allele int nA_ori = nR_ori-1; int nA_new = nR_new-1; int nG_ori = nR_ori*(nR_ori + 1)/2; int nG_new = nR_new*(nR_new + 1)/2; bcf_update_alleles_str(header, line, str.s); // remove from Number=G, Number=R and Number=A INFO fields. uint8_t *dat = NULL; int mdat = 0, ndat = 0, mdat_bytes = 0, nret; for (i=0; i<line->n_info; i++) { bcf_info_t *info = &line->d.info[i]; int vlen = bcf_hdr_id2length(header,BCF_HL_INFO,info->key); if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change int type = bcf_hdr_id2type(header,BCF_HL_INFO,info->key); if ( type==BCF_HT_FLAG ) continue; int size = 1; if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4; mdat = mdat_bytes / size; nret = bcf_get_info_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void**)&dat, &mdat, type); mdat_bytes = mdat * size; if ( nret<0 ) { fprintf(stderr,"[%s:%d %s] Could not access INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); exit(1); } if ( type==BCF_HT_STR ) { str.l = 0; char *ss = (char*) dat, *se = (char*) dat; if ( vlen==BCF_VL_A || vlen==BCF_VL_R ) { int nexp, inc = 0; if ( vlen==BCF_VL_A ) { nexp = nA_ori; inc = 1; } else nexp = nR_ori; for (j=0; j<nexp; j++) { if ( !*se ) break; while ( *se && *se!=',' ) se++; if ( rm_mask & 1<<(j+inc) ) { if ( *se ) se++; ss = se; continue; } if ( str.l ) kputc(',',&str); kputsn(ss,se-ss,&str); if ( *se ) se++; ss = se; } assert( j==nexp ); } else // Number=G, assuming diploid genotype { int k = 0, n = 0; for (j=0; j<nR_ori; j++) { for (k=0; k<=j; k++) { if ( !*se ) break; while ( *se && *se!=',' ) se++; n++; if ( rm_mask & 1<<j || rm_mask & 1<<k ) { if ( *se ) se++; ss = se; continue; } if ( str.l ) kputc(',',&str); kputsn(ss,se-ss,&str); if ( *se ) se++; ss = se; } if ( !*se ) break; } assert( n=nG_ori ); } nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)str.s, str.l, type); if ( nret<0 ) { fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); exit(1); } continue; } if ( vlen==BCF_VL_A || vlen==BCF_VL_R ) { int inc = 0, ntop; if ( vlen==BCF_VL_A ) { assert( nret==nA_ori ); ntop = nA_ori; ndat = nA_new; inc = 1; } else { assert( nret==nR_ori ); ntop = nR_ori; ndat = nR_new; } int k = 0; #define BRANCH(type_t,is_vector_end) \ { \ type_t *ptr = (type_t*) dat; \ int size = sizeof(type_t); \ for (j=0; j<ntop; j++) /* j:ori, k:new */ \ { \ if ( is_vector_end ) { memcpy(dat+k*size, dat+j*size, size); break; } \ if ( rm_mask & 1<<(j+inc) ) continue; \ if ( j!=k ) memcpy(dat+k*size, dat+j*size, size); \ k++; \ } \ } switch (type) { case BCF_HT_INT: BRANCH(int32_t,ptr[j]==bcf_int32_vector_end); break; case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[j])); break; } #undef BRANCH } else // Number=G { assert( nret==nG_ori ); int k, l_ori = -1, l_new = 0; ndat = nG_new; #define BRANCH(type_t,is_vector_end) \ { \ type_t *ptr = (type_t*) dat; \ int size = sizeof(type_t); \ for (j=0; j<nR_ori; j++) \ { \ for (k=0; k<=j; k++) \ { \ l_ori++; \ if ( is_vector_end ) { memcpy(dat+l_new*size, dat+l_ori*size, size); break; } \ if ( rm_mask & 1<<j || rm_mask & 1<<k ) continue; \ if ( l_ori!=l_new ) memcpy(dat+l_new*size, dat+l_ori*size, size); \ l_new++; \ } \ } \ } switch (type) { case BCF_HT_INT: BRANCH(int32_t,ptr[l_ori]==bcf_int32_vector_end); break; case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[l_ori])); break; } #undef BRANCH } nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)dat, ndat, type); if ( nret<0 ) { fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); exit(1); } } // Update GT fields, the allele indexes might have changed for (i=1; i<line->n_allele; i++) if ( map[i]!=i ) break; if ( i<line->n_allele ) { mdat = mdat_bytes / 4; // sizeof(int32_t) nret = bcf_get_genotypes(header,line,(void**)&dat,&mdat); mdat_bytes = mdat * 4; if ( nret>0 ) { nret /= line->n_sample; int32_t *ptr = (int32_t*) dat; for (i=0; i<line->n_sample; i++) { for (j=0; j<nret; j++) { if ( ptr[j]==bcf_gt_missing ) continue; if ( ptr[j]==bcf_int32_vector_end ) break; int al = bcf_gt_allele(ptr[j]); assert( al<nR_ori && map[al]>=0 ); ptr[j] = (map[al]+1)<<1 | (ptr[j]&1); } ptr += nret; } bcf_update_genotypes(header, line, (void*)dat, nret*line->n_sample); } } // Remove from Number=G, Number=R and Number=A FORMAT fields. // Assuming haploid or diploid GTs for (i=0; i<line->n_fmt; i++) { bcf_fmt_t *fmt = &line->d.fmt[i]; int vlen = bcf_hdr_id2length(header,BCF_HL_FMT,fmt->id); if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change int type = bcf_hdr_id2type(header,BCF_HL_FMT,fmt->id); if ( type==BCF_HT_FLAG ) continue; int size = 1; if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4; mdat = mdat_bytes / size; nret = bcf_get_format_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void**)&dat, &mdat, type); mdat_bytes = mdat * size; if ( nret<0 ) { fprintf(stderr,"[%s:%d %s] Could not access FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); exit(1); } if ( type==BCF_HT_STR ) { int size = nret/line->n_sample; // number of bytes per sample str.l = 0; if ( vlen==BCF_VL_A || vlen==BCF_VL_R ) { int nexp, inc = 0; if ( vlen==BCF_VL_A ) { nexp = nA_ori; inc = 1; } else nexp = nR_ori; for (j=0; j<line->n_sample; j++) { char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss; int k_src = 0, k_dst = 0, l = str.l; for (k_src=0; k_src<nexp; k_src++) { if ( ptr>=se || !*ptr) break; while ( ptr<se && *ptr && *ptr!=',' ) ptr++; if ( rm_mask & 1<<(k_src+inc) ) { ss = ++ptr; continue; } if ( k_dst ) kputc(',',&str); kputsn(ss,ptr-ss,&str); ss = ++ptr; k_dst++; } assert( k_src==nexp ); l = str.l - l; for (; l<size; l++) kputc(0, &str); } } else // Number=G, diploid or haploid { for (j=0; j<line->n_sample; j++) { char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss; int k_src = 0, k_dst = 0, l = str.l; int nexp = 0; // diploid or haploid? while ( ptr<se ) { if ( !*ptr ) break; if ( *ptr==',' ) nexp++; ptr++; } if ( ptr!=ss ) nexp++; assert( nexp==nG_ori || nexp==nR_ori ); ptr = ss; if ( nexp==nG_ori ) // diploid { int ia, ib; for (ia=0; ia<nR_ori; ia++) { for (ib=0; ib<=ia; ib++) { if ( ptr>=se || !*ptr ) break; while ( ptr<se && *ptr && *ptr!=',' ) ptr++; if ( rm_mask & 1<<ia || rm_mask & 1<<ib ) { ss = ++ptr; continue; } if ( k_dst ) kputc(',',&str); kputsn(ss,ptr-ss,&str); ss = ++ptr; k_dst++; } if ( ptr>=se || !*ptr ) break; } } else // haploid { for (k_src=0; k_src<nR_ori; k_src++) { if ( ptr>=se || !*ptr ) break; while ( ptr<se && *ptr && *ptr!=',' ) ptr++; if ( rm_mask & 1<<k_src ) { ss = ++ptr; continue; } if ( k_dst ) kputc(',',&str); kputsn(ss,ptr-ss,&str); ss = ++ptr; k_dst++; } assert( k_src==nR_ori ); l = str.l - l; for (; l<size; l++) kputc(0, &str); } } } nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)str.s, str.l, type); if ( nret<0 ) { fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); exit(1); } continue; } int nori = nret / line->n_sample; if ( vlen==BCF_VL_A || vlen==BCF_VL_R || (vlen==BCF_VL_G && nori==nR_ori) ) // Number=A, R or haploid Number=G { int ntop, inc = 0; if ( vlen==BCF_VL_A ) { assert( nori==nA_ori ); // todo: will fail if all values are missing ntop = nA_ori; ndat = nA_new*line->n_sample; inc = 1; } else { assert( nori==nR_ori ); // todo: will fail if all values are missing ntop = nR_ori; ndat = nR_new*line->n_sample; } #define BRANCH(type_t,is_vector_end) \ { \ for (j=0; j<line->n_sample; j++) \ { \ type_t *ptr_src = ((type_t*)dat) + j*nori; \ type_t *ptr_dst = ((type_t*)dat) + j*nA_new; \ int size = sizeof(type_t); \ int k_src, k_dst = 0; \ for (k_src=0; k_src<ntop; k_src++) \ { \ if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); break; } \ if ( rm_mask & 1<<(k_src+inc) ) continue; \ if ( k_src!=k_dst ) memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \ k_dst++; \ } \ } \ } switch (type) { case BCF_HT_INT: BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break; case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break; } #undef BRANCH } else // Number=G, diploid or mixture of haploid+diploid { assert( nori==nG_ori ); ndat = nG_new*line->n_sample; #define BRANCH(type_t,is_vector_end) \ { \ for (j=0; j<line->n_sample; j++) \ { \ type_t *ptr_src = ((type_t*)dat) + j*nori; \ type_t *ptr_dst = ((type_t*)dat) + j*nG_new; \ int size = sizeof(type_t); \ int ia, ib, k_dst = 0, k_src; \ int nset = 0; /* haploid or diploid? */ \ for (k_src=0; k_src<nG_ori; k_src++) { if ( is_vector_end ) break; nset++; } \ if ( nset==nR_ori ) /* haploid */ \ { \ for (k_src=0; k_src<nR_ori; k_src++) \ { \ if ( rm_mask & 1<<k_src ) continue; \ if ( k_src!=k_dst ) memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \ k_dst++; \ } \ memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \ } \ else /* diploid */ \ { \ k_src = -1; \ for (ia=0; ia<nR_ori; ia++) \ { \ for (ib=0; ib<=ia; ib++) \ { \ k_src++; \ if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); ia = nR_ori; break; } \ if ( rm_mask & 1<<ia || rm_mask & 1<<ib ) continue; \ if ( k_src!=k_dst ) memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \ k_dst++; \ } \ } \ } \ } \ } switch (type) { case BCF_HT_INT: BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break; case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break; } #undef BRANCH } nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)dat, ndat, type); if ( nret<0 ) { fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); exit(1); } } free(dat); free(str.s); free(map); }
int run(int argc, char **argv) { char *trio_samples = NULL, *trio_file = NULL, *rules_fname = NULL, *rules_string = NULL; memset(&args,0,sizeof(args_t)); args.mode = 0; args.output_fname = "-"; static struct option loptions[] = { {"trio",1,0,'t'}, {"trio-file",1,0,'T'}, {"delete",0,0,'d'}, {"list",1,0,'l'}, {"count",0,0,'c'}, {"rules",1,0,'r'}, {"rules-file",1,0,'R'}, {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, {0,0,0,0} }; int c; while ((c = getopt_long(argc, argv, "?ht:T:l:cdr:R:o:O:",loptions,NULL)) >= 0) { switch (c) { case 'o': args.output_fname = optarg; break; case 'O': switch (optarg[0]) { case 'b': args.output_type = FT_BCF_GZ; break; case 'u': args.output_type = FT_BCF; break; case 'z': args.output_type = FT_VCF_GZ; break; case 'v': args.output_type = FT_VCF; break; default: error("The output type \"%s\" not recognised\n", optarg); }; break; case 'R': rules_fname = optarg; break; case 'r': rules_string = optarg; break; case 'd': args.mode |= MODE_DELETE; break; case 'c': args.mode |= MODE_COUNT; break; case 'l': if ( !strcmp("+",optarg) ) args.mode |= MODE_LIST_GOOD; else if ( !strcmp("x",optarg) ) args.mode |= MODE_LIST_BAD; else error("The argument not recognised: --list %s\n", optarg); break; case 't': trio_samples = optarg; break; case 'T': trio_file = optarg; break; case 'h': case '?': default: error("%s",usage()); break; } } if ( rules_fname ) args.rules = regidx_init(rules_fname, parse_rules, NULL, sizeof(rule_t), &args); else args.rules = init_rules(&args, rules_string); if ( !args.rules ) return -1; args.itr = regitr_init(args.rules); args.itr_ori = regitr_init(args.rules); char *fname = NULL; if ( optind>=argc || argv[optind][0]=='-' ) { if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin else error("%s",usage()); } else fname = argv[optind]; if ( !trio_samples && !trio_file ) error("Expected the -t/T option\n"); if ( !args.mode ) error("Expected one of the -c, -d or -l options\n"); if ( args.mode&MODE_DELETE && !(args.mode&(MODE_LIST_GOOD|MODE_LIST_BAD)) ) args.mode |= MODE_LIST_GOOD|MODE_LIST_BAD; args.sr = bcf_sr_init(); if ( !bcf_sr_add_reader(args.sr, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args.sr->errnum)); args.hdr = bcf_sr_get_header(args.sr, 0); args.out_fh = hts_open(args.output_fname,hts_bcf_wmode(args.output_type)); if ( args.out_fh == NULL ) error("Can't write to \"%s\": %s\n", args.output_fname, strerror(errno)); bcf_hdr_write(args.out_fh, args.hdr); int i, n = 0; char **list; if ( trio_samples ) { args.ntrios = 1; args.trios = (trio_t*) calloc(1,sizeof(trio_t)); list = hts_readlist(trio_samples, 0, &n); if ( n!=3 ) error("Expected three sample names with -t\n"); args.trios[0].imother = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[0]); args.trios[0].ifather = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[1]); args.trios[0].ichild = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[2]); for (i=0; i<n; i++) free(list[i]); free(list); } if ( trio_file ) { list = hts_readlist(trio_file, 1, &n); args.ntrios = n; args.trios = (trio_t*) calloc(n,sizeof(trio_t)); for (i=0; i<n; i++) { char *ss = list[i], *se; se = strchr(ss, ','); if ( !se ) error("Could not parse %s: %s\n",trio_file, ss); *se = 0; args.trios[i].imother = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss); if ( args.trios[i].imother<0 ) error("No such sample: \"%s\"\n", ss); ss = ++se; se = strchr(ss, ','); if ( !se ) error("Could not parse %s\n",trio_file); *se = 0; args.trios[i].ifather = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss); if ( args.trios[i].ifather<0 ) error("No such sample: \"%s\"\n", ss); ss = ++se; if ( *ss=='\0' ) error("Could not parse %s\n",trio_file); args.trios[i].ichild = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss); if ( args.trios[i].ichild<0 ) error("No such sample: \"%s\"\n", ss); free(list[i]); } free(list); } while ( bcf_sr_next_line(args.sr) ) { bcf1_t *line = bcf_sr_get_line(args.sr,0); line = process(line); if ( line ) { if ( line->errcode ) error("TODO: Unchecked error (%d), exiting\n",line->errcode); bcf_write1(args.out_fh, args.hdr, line); } } fprintf(stderr,"# [1]nOK\t[2]nBad\t[3]nSkipped\t[4]Trio\n"); for (i=0; i<args.ntrios; i++) { trio_t *trio = &args.trios[i]; fprintf(stderr,"%d\t%d\t%d\t%s,%s,%s\n", trio->nok,trio->nbad,args.nrec-(trio->nok+trio->nbad), bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->imother), bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ifather), bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ichild) ); } free(args.gt_arr); free(args.trios); regitr_destroy(args.itr); regitr_destroy(args.itr_ori); regidx_destroy(args.rules); bcf_sr_destroy(args.sr); if ( hts_close(args.out_fh)!=0 ) error("Error: close failed\n"); return 0; }
static void reheader_bcf(args_t *args, int is_compressed) { htsFile *fp = hts_open(args->fname, "r"); if ( !fp ) error("Failed to open: %s\n", args->fname); bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to read the header: %s\n", args->fname); kstring_t htxt = {0,0,0}; int hlen; htxt.s = bcf_hdr_fmt_text(hdr, 1, &hlen); htxt.l = hlen; int i, nsamples = 0; char **samples = NULL; if ( args->samples_fname ) samples = hts_readlines(args->samples_fname, &nsamples); if ( args->header_fname ) { free(htxt.s); htxt.s = NULL; htxt.l = htxt.m = 0; read_header_file(args->header_fname, &htxt); } if ( samples ) { set_samples(samples, nsamples, &htxt); for (i=0; i<nsamples; i++) free(samples[i]); free(samples); } bcf_hdr_t *hdr_out = bcf_hdr_init("r"); bcf_hdr_parse(hdr_out, htxt.s); if ( args->header_fname ) hdr_out = strip_header(hdr, hdr_out); // write the header and the body htsFile *fp_out = hts_open("-",is_compressed ? "wb" : "wbu"); bcf_hdr_write(fp_out, hdr_out); bcf1_t *rec = bcf_init(); while ( bcf_read(fp, hdr, rec)==0 ) { // sanity checking, this slows things down. Make it optional? bcf_unpack(rec, BCF_UN_ALL); if ( rec->rid >= hdr_out->n[BCF_DT_CTG] || strcmp(bcf_hdr_int2id(hdr,BCF_DT_CTG,rec->rid),bcf_hdr_int2id(hdr_out,BCF_DT_CTG,rec->rid)) ) error("The CHROM is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_CTG,rec->rid)); for (i=0; i<rec->d.n_flt; i++) { int id = rec->d.flt[i]; if ( id >= hdr_out->n[BCF_DT_ID] ) break; if ( !bcf_hdr_idinfo_exists(hdr_out,BCF_HL_FLT,id) ) break; if ( strcmp(hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key) ) error("FIXME: Broken FILTER ids: %s vs %s\n", hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key); } if ( i!=rec->d.n_flt ) error("The FILTER is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_ID,rec->d.flt[i])); for (i=0; i<rec->n_info; i++) { int id = rec->d.info[i].key; if ( id >= hdr_out->n[BCF_DT_ID] ) break; if ( !hdr_out->id[BCF_DT_ID][id].key ) break; if ( !bcf_hdr_idinfo_exists(hdr_out,BCF_HL_INFO,id) ) break; if ( strcmp(hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key) ) error("FIXME: Broken INFO ids: %s vs %s\n", hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key); } if ( i!=rec->n_info ) error("The INFO tag is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_ID,rec->d.info[i].key)); for (i=0; i<rec->n_fmt; i++) { int id = rec->d.fmt[i].id; if ( id >= hdr_out->n[BCF_DT_ID] ) break; if ( !hdr_out->id[BCF_DT_ID][id].key ) break; if ( !bcf_hdr_idinfo_exists(hdr_out,BCF_HL_FMT,id) ) break; if ( strcmp(hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key) ) error("FIXME: Broken FORMAT ids: %s vs %s\n", hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key); } if ( i!=rec->n_fmt ) error("The FORMAT tag is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_ID,rec->d.fmt[i].id)); bcf_write(fp_out,hdr_out,rec); } bcf_destroy(rec); free(htxt.s); hts_close(fp_out); hts_close(fp); bcf_hdr_destroy(hdr_out); bcf_hdr_destroy(hdr); }
static void init_data(args_t *args) { int i; args->hdr = args->files->readers[0].header; if (args->calc_ac && args->update_info) { bcf_hdr_append(args->hdr,"##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes\">"); bcf_hdr_append(args->hdr,"##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">"); } bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_view"); // setup sample data if (args->sample_names) { void *hdr_samples = khash_str2int_init(); for (i=0; i<bcf_hdr_nsamples(args->hdr); i++) khash_str2int_inc(hdr_samples, bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i)); void *exclude = (args->sample_names[0]=='^') ? khash_str2int_init() : NULL; int nsmpl; char **smpl = NULL; args->samples = NULL; args->n_samples = 0; smpl = hts_readlist(exclude ? &args->sample_names[1] : args->sample_names, args->sample_is_file, &nsmpl); if ( !smpl ) { error("Could not read the list: \"%s\"\n", exclude ? &args->sample_names[1] : args->sample_names); } if ( exclude ) { for (i=0; i<nsmpl; i++) { if (!khash_str2int_has_key(hdr_samples,smpl[i])) { if (args->force_samples) { fprintf(stderr, "Warn: exclude called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]); } else { error("Error: exclude called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]); } } khash_str2int_inc(exclude, smpl[i]); } for (i=0; i<bcf_hdr_nsamples(args->hdr); i++) { if ( exclude && khash_str2int_has_key(exclude,bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i)) ) continue; args->samples = (char**) realloc(args->samples, (args->n_samples+1)*sizeof(const char*)); args->samples[args->n_samples++] = strdup(bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i)); } khash_str2int_destroy(exclude); } else { for (i=0; i<nsmpl; i++) { if (!khash_str2int_has_key(hdr_samples,smpl[i])) { if (args->force_samples) { fprintf(stderr, "Warn: subset called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]); continue; } else { error("Error: subset called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]); } } args->samples = (char**) realloc(args->samples, (args->n_samples+1)*sizeof(const char*)); args->samples[args->n_samples++] = strdup(smpl[i]); } } for (i=0; i<nsmpl; i++) free(smpl[i]); free(smpl); khash_str2int_destroy(hdr_samples); if (args->n_samples == 0) { fprintf(stderr, "Warn: subsetting has removed all samples\n"); args->sites_only = 1; } } if (args->n_samples) args->imap = (int*)malloc(args->n_samples * sizeof(int)); // determine variant types to include/exclude if (args->include_types || args->exclude_types) { if (args->include_types && args->exclude_types) { fprintf(stderr, "Error: only supply one of --include-types, --exclude-types options\n"); exit(1); } char **type_list = 0; int m = 0, n = 0; const char *q, *p; for (q = p = args->include_types ? args->include_types : args->exclude_types;; ++p) { if (*p == ',' || *p == 0) { if (m == n) { m = m? m<<1 : 16; type_list = (char**)realloc(type_list, m * sizeof(char*)); } type_list[n] = (char*)calloc(p - q + 1, 1); strncpy(type_list[n++], q, p - q); q = p + 1; if (*p == 0) break; } } type_list = (char**)realloc(type_list, n * sizeof(char*)); if (args->include_types) { args->include = 0; for (i = 0; i < n; ++i) { if (strcmp(type_list[i], "snps") == 0) args->include |= VCF_SNP; else if (strcmp(type_list[i], "indels") == 0) args->include |= VCF_INDEL; else if (strcmp(type_list[i], "mnps") == 0) args->include |= VCF_MNP; else if (strcmp(type_list[i], "other") == 0) args->include |= VCF_OTHER; else { fprintf(stderr, "[E::%s] unknown type\n", type_list[i]); fprintf(stderr, "Accepted types are snps, indels, mnps, other\n"); exit(1); } } } if (args->exclude_types) { args->exclude = 0; for (i = 0; i < n; ++i) { if (strcmp(type_list[i], "snps") == 0) args->exclude |= VCF_SNP; else if (strcmp(type_list[i], "indels") == 0) args->exclude |= VCF_INDEL; else if (strcmp(type_list[i], "mnps") == 0) args->exclude |= VCF_MNP; else if (strcmp(type_list[i], "other") == 0) args->exclude |= VCF_OTHER; else { fprintf(stderr, "[E::%s] unknown type\n", type_list[i]); fprintf(stderr, "Accepted types are snps, indels, mnps, other\n"); exit(1); } } } for (i = 0; i < n; ++i) free(type_list[i]); free(type_list); } // setup output char modew[8]; strcpy(modew, "w"); if (args->clevel >= 0 && args->clevel <= 9) sprintf(modew + 1, "%d", args->clevel); if (args->output_type==FT_BCF) strcat(modew, "bu"); // uncompressed BCF else if (args->output_type & FT_BCF) strcat(modew, "b"); // compressed BCF else if (args->output_type & FT_GZ) strcat(modew,"z"); // compressed VCF args->out = hts_open(args->fn_out ? args->fn_out : "-", modew); if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno)); // headers: hdr=full header, hsub=subset header, hnull=sites only header if (args->sites_only){ args->hnull = bcf_hdr_subset(args->hdr, 0, 0, 0); bcf_hdr_remove(args->hnull, BCF_HL_FMT, NULL); } if (args->n_samples > 0) { args->hsub = bcf_hdr_subset(args->hdr, args->n_samples, args->samples, args->imap); if ( !args->hsub ) error("Error occurred while subsetting samples\n"); if ( args->n_samples != bcf_hdr_nsamples(args->hsub) ) { int i; for (i=0; i<args->n_samples; i++) if ( args->imap[i]<0 ) error("Error: No such sample: \"%s\"\n", args->samples[i]); } } if ( args->filter_str ) args->filter = filter_init(args->hdr, args->filter_str); }
void flush_viterbi(args_t *args) { const char *s1, *s2, *s3 = NULL; if ( args->mode==C_UNRL ) { s1 = bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,args->isample); s2 = bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,args->jsample); } else if ( args->mode==C_TRIO ) { s1 = bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,args->imother); s3 = bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,args->ifather); s2 = bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,args->ichild); } if ( !args->fp ) { kstring_t str = {0,0,0}; kputs(args->prefix, &str); kputs(".dat", &str); args->fp = fopen(str.s,"w"); if ( !args->fp ) error("%s: %s\n", str.s,strerror(errno)); free(str.s); fprintf(args->fp,"# SG, shared segment\t[2]Chromosome\t[3]Start\t[4]End\t[5]%s:1\t[6]%s:2\n",s2,s2); fprintf(args->fp,"# SW, number of switches\t[3]Sample\t[4]Chromosome\t[5]nHets\t[5]nSwitches\t[6]switch rate\n"); } hmm_run_viterbi(args->hmm,args->nsites,args->eprob,args->sites); uint8_t *vpath = hmm_get_viterbi_path(args->hmm); int i, iprev = -1, prev_state = -1, nstates = hmm_get_nstates(args->hmm); int nswitch_mother = 0, nswitch_father = 0; for (i=0; i<args->nsites; i++) { int state = vpath[i*nstates]; if ( state!=prev_state || i+1==args->nsites ) { uint32_t start = iprev>=0 ? args->sites[iprev]+1 : 1, end = i>0 ? args->sites[i-1] : 1; const char *chr = bcf_hdr_id2name(args->hdr,args->prev_rid); if ( args->mode==C_UNRL ) { switch (prev_state) { case UNRL_0x0x: fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:1\t-\n", chr,start,end,s1); break; case UNRL_0xx0: fprintf(args->fp,"SG\t%s\t%d\t%d\t-\t%s:1\n", chr,start,end,s1); break; case UNRL_x00x: fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:2\t-\n", chr,start,end,s1); break; case UNRL_x0x0: fprintf(args->fp,"SG\t%s\t%d\t%d\t-\t%s:2\n", chr,start,end,s1); break; case UNRL_0101: fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:1\t%s:2\n", chr,start,end,s1,s1); break; case UNRL_0110: fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:2\t%s:1\n", chr,start,end,s1,s1); break; } } else if ( args->mode==C_TRIO ) { switch (prev_state) { case TRIO_AC: fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:1\t%s:1\n", chr,start,end,s1,s3); break; case TRIO_AD: fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:1\t%s:2\n", chr,start,end,s1,s3); break; case TRIO_BC: fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:2\t%s:1\n", chr,start,end,s1,s3); break; case TRIO_BD: fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:2\t%s:2\n", chr,start,end,s1,s3); break; case TRIO_CA: fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:1\t%s:1\n", chr,start,end,s3,s1); break; case TRIO_DA: fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:2\t%s:1\n", chr,start,end,s3,s1); break; case TRIO_CB: fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:1\t%s:2\n", chr,start,end,s3,s1); break; case TRIO_DB: fprintf(args->fp,"SG\t%s\t%d\t%d\t%s:2\t%s:2\n", chr,start,end,s3,s1); break; } if ( hap_switch[state][prev_state] & SW_MOTHER ) nswitch_mother++; if ( hap_switch[state][prev_state] & SW_FATHER ) nswitch_father++; } iprev = i-1; } prev_state = state; } float mrate = args->nhet_mother>1 ? (float)nswitch_mother/(args->nhet_mother-1) : 0; float frate = args->nhet_father>1 ? (float)nswitch_father/(args->nhet_father-1) : 0; fprintf(args->fp,"SW\t%s\t%s\t%d\t%d\t%f\n", s1,bcf_hdr_id2name(args->hdr,args->prev_rid),args->nhet_mother,nswitch_mother,mrate); fprintf(args->fp,"SW\t%s\t%s\t%d\t%d\t%f\n", s3,bcf_hdr_id2name(args->hdr,args->prev_rid),args->nhet_father,nswitch_father,frate); args->nsites = 0; args->nhet_father = args->nhet_mother = 0; }