/**
 * Reads the genotypes found in +region+ of the indexed VCF/BCF file +fvcf+.
 *
 * Resets and then fills genotype_id, genotype_chr, genotype_start,
 * genotype_end, genotype_val, genotype_id_to_idx and genotype_count.
 *
 * A record is kept only if:
 *   - it has exactly two alleles,
 *   - it carries either one DS value per sample or two GT values per sample,
 *   - its ID is accepted by filter_genotype.check().
 *
 * Values are stored per sample at the index returned by findSample();
 * samples for which findSample() returns a negative index are skipped.
 *
 * @param fvcf    path of the VCF/BCF file
 * @param region  region string handed to bcf_sr_set_regions()
 */
void union_data::readGenotypesVCF(string fvcf,string region) {
    int n_includedG = 0;
    int n_excludedG_mult = 0;
    int n_excludedG_void = 0;
    int n_excludedG_user = 0;
    int n_includedS = 0;
    vector < int > mappingS;

    genotype_id.clear();
    genotype_chr.clear();
    genotype_start.clear();
    genotype_end.clear();
    genotype_val.clear();
    genotype_count = 0;
    genotype_id_to_idx.clear();

    //Opening files
    bcf_srs_t * sr = bcf_sr_init();
    //vrb.bullet("target region [" + regionGenotype.get() + "]");
    //if (bcf_sr_set_regions(sr, regionGenotype.get().c_str(), 0) == -1) vrb.error("Cannot jump to region!");
    bcf_sr_set_regions(sr, region.c_str(), 0);
    if (!(bcf_sr_add_reader (sr, fvcf.c_str()))) {
        // vrb.error() presumably terminates; the breaks only guard against
        // cascading messages should it ever return.
        switch (sr->errnum) {
        case not_bgzf: vrb.error("File not compressed with bgzip!"); break;
        case idx_load_failed: vrb.error("Impossible to load index file!"); break;
        case file_type_error: vrb.error("File format not detected by htslib!"); break;
        default : vrb.error("Unknown error!"); break;
        }
    }

    //Sample processing
    int n_samples = bcf_hdr_nsamples(sr->readers[0].header);
    for (int i0 = 0 ; i0 < n_samples ; i0 ++) {
        mappingS.push_back(findSample(string(sr->readers[0].header->samples[i0])));
        if (mappingS.back() >= 0) n_includedS++;
    }

    //Read genotype data
    int ngt, ngt_arr = 0, * gt_arr = NULL;
    int nds, nds_arr = 0;
    int nsl, nsl_arr = 0, * sl_arr = NULL;
    float * ds_arr = NULL;
    bcf1_t * line;
    unsigned int linecount = 0;
    while (bcf_sr_next_line (sr)) {
        linecount ++;
        if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
        line = bcf_sr_get_line(sr, 0);

        if (line->n_allele != 2) {
            n_excludedG_mult ++;
            continue;
        }

        ngt = bcf_get_genotypes(sr->readers[0].header, line, &gt_arr, &ngt_arr);
        nds = bcf_get_format_float(sr->readers[0].header, line,"DS", &ds_arr, &nds_arr);
        if (nds != n_samples && ngt != 2*n_samples) {
            n_excludedG_void ++;
            continue;
        }

        bcf_unpack(line, BCF_UN_STR);
        string sid = string(line->d.id);
        if (!filter_genotype.check(sid)) {
            n_excludedG_user ++;
            continue;
        }

        genotype_id.push_back(sid);
        genotype_chr.push_back(string(bcf_hdr_id2name(sr->readers[0].header, line->rid)));
        string genotype_ref = string(line->d.allele[0]);
        genotype_start.push_back(line->pos + 1);

        // Use INFO/END when exactly one value was returned; otherwise derive
        // the end from the REF allele length. The test is on the return value
        // (number of values), not on nsl_arr, which is the buffer capacity.
        nsl = bcf_get_info_int32(sr->readers[0].header, line, "END", &sl_arr, &nsl_arr);
        if (nsl == 1) genotype_end.push_back(sl_arr[0]);
        else genotype_end.push_back(genotype_start.back() + genotype_ref.size() - 1);

        genotype_val.push_back(vector < float > (sample_count, 0.0));

        // Dosages are only indexable per sample when there is exactly one
        // value per sample; otherwise the check above guarantees diploid GTs.
        const bool use_dosage = (nds == n_samples);
        for (int i = 0 ; i < n_samples ; i ++) {
            if (mappingS[i] < 0) continue;
            float & dst = genotype_val.back()[mappingS[i]];
            if (use_dosage) dst = ds_arr[i];
            else if (gt_arr[2*i+0] == bcf_gt_missing || gt_arr[2*i+1] == bcf_gt_missing) dst = bcf_float_missing;
            else dst = bcf_gt_allele(gt_arr[2*i+0]) + bcf_gt_allele(gt_arr[2*i+1]);
        }

        pair < string, int > temp (sid,n_includedG);
        genotype_id_to_idx.insert(temp);
        n_includedG++;
    }

    //Finalize
    free(gt_arr);
    free(ds_arr);
    free(sl_arr);
    bcf_sr_destroy(sr);
    genotype_count = n_includedG;
    //vrb.bullet(stb.str(n_includedG) + " variants included");
    //if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
    //if (n_excludedG_mult > 0) vrb.bullet(stb.str(n_excludedG_mult) + " multi-allelic variants excluded");
    //if (n_excludedG_void > 0) vrb.bullet(stb.str(n_excludedG_void) + " uninformative variants excluded [no GT/DS]");
    //if (genotype_count == 0) vrb.leave("Cannot find genotypes in target region!");
}
int parse_line(args_t *args, bcf1_t *line, double *alt_freq, double *pdg) { args->nitmp = 0; // Set allele frequency int ret; if ( args->af_tag ) { // Use an INFO tag provided by the user ret = bcf_get_info_float(args->hdr, line, args->af_tag, &args->AFs, &args->mAFs); if ( ret==1 ) *alt_freq = args->AFs[0]; if ( ret==-2 ) error("Type mismatch for INFO/%s tag at %s:%d\n", args->af_tag, bcf_seqname(args->hdr,line), line->pos+1); } else if ( args->af_fname ) { // Read AF from a file ret = read_AF(args->files->targets, line, alt_freq); } else { // Use GTs or AC/AN: GTs when AC/AN not present or when GTs explicitly requested by --estimate-AF ret = -1; if ( !args->estimate_AF ) { int AC = -1, AN = 0; ret = bcf_get_info_int32(args->hdr, line, "AN", &args->itmp, &args->mitmp); if ( ret==1 ) { AN = args->itmp[0]; ret = bcf_get_info_int32(args->hdr, line, "AC", &args->itmp, &args->mitmp); if ( ret>0 ) AC = args->itmp[0]; } if ( AN<=0 || AC<0 ) ret = -1; else *alt_freq = (double) AC/AN; } if ( ret==-1 ) ret = estimate_AF(args, line, alt_freq); // reads GTs into args->itmp } if ( ret<0 ) return ret; if ( *alt_freq==0.0 ) { if ( args->dflt_AF==0 ) return -1; // we skip sites with AF=0 *alt_freq = args->dflt_AF; } // Set P(D|G) if ( args->fake_PLs ) { if ( !args->nitmp ) { args->nitmp = bcf_get_genotypes(args->hdr, line, &args->itmp, &args->mitmp); if ( args->nitmp != 2*args->nsmpl ) return -1; // not diploid? 
args->nitmp /= args->nsmpl; } int32_t *gt = &args->itmp[args->ismpl*args->nitmp]; if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) return -1; int a = bcf_gt_allele(gt[0]); int b = bcf_gt_allele(gt[1]); if ( a!=b ) { pdg[0] = pdg[2] = args->unseen_PL; pdg[1] = 1 - 2*args->unseen_PL; } else if ( a==0 ) { pdg[0] = 1 - 2*args->unseen_PL; pdg[1] = pdg[2] = args->unseen_PL; } else { pdg[0] = pdg[1] = args->unseen_PL; pdg[2] = 1 - 2*args->unseen_PL; } } else { args->nitmp = bcf_get_format_int32(args->hdr, line, "PL", &args->itmp, &args->mitmp); if ( args->nitmp != args->nsmpl*line->n_allele*(line->n_allele+1)/2. ) return -1; // not diploid? args->nitmp /= args->nsmpl; int32_t *pl = &args->itmp[args->ismpl*args->nitmp]; pdg[0] = pl[0] < 256 ? args->pl2p[ pl[0] ] : 1.0; pdg[1] = pl[1] < 256 ? args->pl2p[ pl[1] ] : 1.0; pdg[2] = pl[2] < 256 ? args->pl2p[ pl[2] ] : 1.0; double sum = pdg[0] + pdg[1] + pdg[2]; if ( !sum ) return -1; pdg[0] /= sum; pdg[1] /= sum; pdg[2] /= sum; } return 0; }
/** * Evaluates the actions for this node. */ void Node::evaluate(bcf_hdr_t *h, bcf1_t *v, Variant *variant, bool debug) { if (debug) std::cerr << "evaluation " << type << "\n"; if (type&VT_LOGIC_OP) { if (type==VT_NOT) { if (debug) std::cerr << "\tVT_NOT " << left->value << " \n"; value = !(left->value); } else if (type==VT_AND) { if (debug) std::cerr << "\tVT_AND " << left->value << "&" << right->value << " \n"; value = (left->value && right->value); } else if (type==VT_OR) { value = (left->value || right->value); } } else if (type&VT_MATH_CMP) { if (type==VT_EQ) { if ((left->type&VT_INT)) { if ((right->type&VT_INT)) { if (debug) std::cerr << "\tVT_EQ " << left->i << "&" << right->i << " \n"; value = (left->i==right->i); return; } else if ((right->type&VT_FLT)) { if (debug) std::cerr << "\tVT_EQ " << left->i << "&" << right->f << " \n"; value = (left->i==right->f); return; } } else if ((left->type&VT_FLT)) { if ((right->type&VT_INT)) { if (debug) std::cerr << "\tVT_EQ " << left->f << "&" << right->i << " \n"; value = (left->f==right->i); return; } else if ((right->type&VT_FLT)) { if (debug) std::cerr << "\tVT_EQ " << left->f << "&" << right->f << " \n"; value = (left->f==right->f); return; } } else if ((left->type&VT_STR) && (right->type&VT_STR)) { if (debug) std::cerr << "\tVT_EQ " << left->tag.s << "&" << right->tag.s << " \n"; value = strcmp(left->tag.s, right->tag.s)==0 ? 
true : false; return; } fprintf(stderr, "[%s:%d %s] evaluation not supported : == %d %d\n", __FILE__, __LINE__, __FUNCTION__, left->type, right->type); exit(1); } else if (type==VT_NE) { if ((left->type&VT_INT)) { if ((right->type&VT_INT)) { value = (left->i!=right->i); return; } else if ((right->type&VT_FLT)) { value = (left->i!=right->f); return; } } else if ((left->type&VT_FLT)) { if ((right->type&VT_INT)) { value = (left->f!=right->i); return; } else if ((right->type&VT_FLT)) { value = (left->f!=right->f); return; } } else if ((left->type&VT_STR) && (right->type&VT_STR)) { value = strcmp(left->tag.s, right->tag.s)==0 ? false : true; return; } fprintf(stderr, "[%s:%d %s] evaluation not supported: %d %d: !=\n", __FILE__, __LINE__, __FUNCTION__, left->type, right->type); exit(1); } else if (type==VT_LE) { if ((left->type&VT_INT)) { if ((right->type&VT_INT)) { value = (left->i<=right->i); return; } else if ((right->type&VT_FLT)) { value = (left->i<=right->f); return; } } else if ((left->type&VT_FLT)) { if ((right->type&VT_INT)) { type |= VT_INT; value = (left->f<=right->i); return; } else if ((right->type&VT_FLT)) { value = (left->f<=right->f); return; } } else if ((left->type&VT_STR) && (right->type&VT_STR)) { value = strcmp(left->tag.s, right->tag.s)<=0 ? true : false; return; } fprintf(stderr, "[%s:%d %s] evaluation not supported: %d %d: <=\n", __FILE__, __LINE__, __FUNCTION__, left->type, right->type); exit(1); } else if (type==VT_GE) { if ((left->type&VT_INT)) { if ((right->type&VT_INT)) { value = (left->i>=right->i); return; } else if ((right->type&VT_FLT)) { value = (left->i>=right->f); return; } } else if ((left->type&VT_FLT)) { if ((right->type&VT_INT)) { value = (left->f>=right->i); return; } else if ((right->type&VT_FLT)) { value = (left->f>=right->f); return; } } else if ((left->type&VT_STR) && (right->type&VT_STR)) { value = strcmp(left->tag.s, right->tag.s)>=0 ? 
true : false; return; } fprintf(stderr, "[%s:%d %s] evaluation not supported: %d %d: >=\n", __FILE__, __LINE__, __FUNCTION__, left->type, right->type); exit(1); } else if (type==VT_GT) { if ((left->type&VT_INT)) { if ((right->type&VT_INT)) { value = (left->i>right->i); return; } else if ((right->type&VT_FLT)) { value = (left->i>right->f); return; } } else if ((left->type&VT_FLT)) { if ((right->type&VT_INT)) { value = (left->f>right->i); return; } else if ((right->type&VT_FLT)) { value = (left->f>right->f); return; } } else if ((left->type&VT_STR) && (right->type&VT_STR)) { value = strcmp(left->tag.s, right->tag.s)>0 ? true : false; return; } fprintf(stderr, "[%s:%d %s] evaluation not supported: %d %d: >\n", __FILE__, __LINE__, __FUNCTION__, left->type, right->type); exit(1); } else if (type==VT_LT) { if ((left->type&VT_INT)) { if ((right->type&VT_INT)) { value = (left->i<right->i); return; } else if ((right->type&VT_FLT)) { value = (left->i<right->f); return; } } else if ((left->type&VT_FLT)) { if ((right->type&VT_INT)) { value = (left->f<right->i); return; } else if ((right->type&VT_FLT)) { value = (left->f<right->f); return; } } else if ((left->type&VT_STR) && (right->type&VT_STR)) { value = strcmp(left->tag.s, right->tag.s)<0 ? 
true : false; return; } fprintf(stderr, "[%s:%d %s] evaluation not supported: %d %d: <\n", __FILE__, __LINE__, __FUNCTION__, left->type, right->type); exit(1); } } else if (type&VT_BCF_OP) { if (type==VT_FILTER) { if (bcf_has_filter(h, v, tag.s)!=1) { value = false; } else { value = true; } } else if (type==VT_INFO) { int32_t *data = NULL; int32_t n=0; if (bcf_get_info_int32(h, v, tag.s, &data, &n)>0) { type |= VT_INT; i = *data; f = (float)i; } else if (bcf_get_info_float(h, v, tag.s, &data, &n)>0) { type |= VT_FLT; f = (float)(*data); } else if (bcf_get_info_string(h, v, tag.s, &data, &n)>0) { type |= VT_STR; s.l=0; for (int32_t i=0; i<n; ++i) { kputc(data[i], &s); } } else if (bcf_get_info_flag(h, v, tag.s, 0, 0)>0) { type |= VT_FLG; i = 1; f = 1; b = true; value = true; s.l=0; } else { i = 0; f = 0; b = false; value = false; s.l=0; } if (n) free(data); } else if (type==(VT_INFO|VT_INT)) { int32_t *data = NULL; int32_t n=0; if (bcf_get_info_int32(h, v, tag.s, &data, &n)>0) { i = *((int*)data); } if (n) free(data); } else if (type==(VT_INFO|VT_FLT)) { int32_t *data = NULL; int32_t n=0; if (bcf_get_info_float(h, v, tag.s, &data, &n)>0) { f = *((float*)data); } if (n) free(data); } else if (type==(VT_INFO|VT_STR)) { int32_t *data = NULL; int32_t n=0; if (bcf_get_info_string(h, v, tag.s, &data, &n)>0) { s.l=0; for (int32_t i=0; i<n; ++i) { kputc(data[i], &s); } } if (n) free(data); } else if (type==(VT_INFO|VT_FLG)) { if (bcf_get_info_flag(h, v, tag.s, 0, 0)>0) { i = 1; f = 1; b = true; value = true; //s.l=0; kputc('1', &s); } else { i = 0; f = 0; b = false; value = false; s.l=0; } if (debug) std::cerr << "\tVT_INFO|VT_FLG " << i << " " << f << " " << b << " " << value << " " << s.s << " \n"; } else if (type==VT_VARIANT_TYPE) { if (debug) std::cerr << "\tVTYPE " << variant->vtype2string(variant->type) << " \n"; i = variant->type; value = i; } else if (type==VT_VARIANT_DLEN) { if (debug) std::cerr << "\tDLEN " << variant->alleles[0].dlen << " \n"; i = 
variant->alleles[0].dlen; value = i; } else if (type==VT_VARIANT_LEN) { if (debug) std::cerr << "\tLEN " << abs(variant->alleles[0].dlen) << " \n"; i = abs(variant->alleles[0].dlen); value = i; } else if (type==VT_N_ALLELE) { if (debug) std::cerr << "\tN_ALLELE " << bcf_get_n_allele(v) << " \n"; i = bcf_get_n_allele(v); } } else if (type&VT_MATH_OP) { if ((type&8207)==VT_ADD) { if ((left->type&VT_INT)) { if ((right->type&VT_INT)) { type |= VT_INT; i = (left->i+right->i); return; } else if ((right->type&VT_FLT)) { type |= VT_FLT; f = (left->i+right->f); return; } } else if ((left->type&VT_FLT)) { if ((right->type&VT_INT)) { type |= VT_FLT; f = (left->f+right->i); return; } else if ((right->type&VT_FLT)) { type |= VT_FLT; f = (left->f+right->f); return; } } fprintf(stderr, "[%s:%d %s] evaluation not supported : +\n", __FILE__, __LINE__, __FUNCTION__); exit(1); } else if ((type&8207)==VT_SUB) { if ((left->type&VT_INT)) { if ((right->type&VT_INT)) { type |= VT_INT; i = (left->i-right->i); return; } else if ((right->type&VT_FLT)) { type |= VT_FLT; f = (left->i-right->f); return; } } else if ((left->type&VT_FLT)) { if ((right->type&VT_INT)) { type |= VT_FLT; f = (left->f-right->i); return; } else if ((right->type&VT_FLT)) { type |= VT_FLT; f = (left->f-right->f); return; } } fprintf(stderr, "[%s:%d %s] evaluation not supported : -\n", __FILE__, __LINE__, __FUNCTION__); exit(1); } else if ((type&8207)==VT_MUL) { if ((left->type&VT_INT)) { if ((right->type&VT_INT)) { type |= VT_INT; i = (left->i*right->i); return; } else if ((right->type&VT_FLT)) { type |= VT_FLT; f = (left->i*right->f); return; } } else if ((left->type&VT_FLT)) { if ((right->type&VT_INT)) { type |= VT_FLT; f = (left->f*right->i); return; } else if ((right->type&VT_FLT)) { type |= VT_FLT; f = (left->f*right->f); return; } } fprintf(stderr, "[%s:%d %s] evaluation not supported : *\n", __FILE__, __LINE__, __FUNCTION__); exit(1); } else if ((type&8207)==VT_DIV) { if (left->type&VT_INT) { if 
(right->type&VT_INT) { type |= VT_FLT; f = ((float)left->i/right->i); return; } else if (right->type&VT_FLT) { type |= VT_FLT; f = (left->i/right->f); return; } } else if (left->type&VT_FLT) { if (right->type&VT_INT) { type |= VT_FLT; f = (left->f/right->i); return; } else if (right->type&VT_FLT) { type |= VT_FLT; f = (left->f/right->f); return; } } fprintf(stderr, "[%s:%d %s] evaluation not supported : /\n", __FILE__, __LINE__, __FUNCTION__); exit(1); } else if (type==VT_BIT_AND) { if ((left->type&VT_INT) && (right->type&VT_INT)) { i = (left->i & right->i); value = i; return; } fprintf(stderr, "[%s:%d %s] evaluation not supported for & : %d %d\n", __FILE__, __LINE__, __FUNCTION__, left->type, right->type); exit(1); } else if (type==VT_BIT_OR) { if ((left->type&VT_INT) && (right->type&VT_INT)) { i = (left->i | right->i); value = i; return; } fprintf(stderr, "[%s:%d %s] evaluation not supported for | : %d %d\n", __FILE__, __LINE__, __FUNCTION__, left->type, right->type); exit(1); } else { fprintf(stderr, "[%s:%d %s] math op not supported : %d\n", __FILE__, __LINE__, __FUNCTION__, (type&15)); exit(1); } } }
/**
 * Constructor.
 *
 * Initialises the record from a VCF line. Which fields are set depends on
 * +vtype+:
 *  - VT_SNP   (2 alleles): beg1 and end1 are both the record position.
 *  - VT_INDEL (2 alleles): dlen/len, exact flanks (INFO/FLANKS) and fuzzy
 *                          flanks (INFO/FZ_FLANKS), falling back to the
 *                          positions adjacent to the record when a tag does
 *                          not provide two values; beg1/end1 extend 2 bases
 *                          beyond the outermost flanks; indel receives the
 *                          longer allele without its first base.
 *  - VT_VNTR             : beg1/end1 are the positions adjacent to the record,
 *                          motif is read from INFO/MOTIF when present.
 * Any other combination only sets h, v, rid, pos1 and vtype.
 *
 * @h     - VCF header.
 * @v     - VCF record.
 * @vtype - variant type of the record.
 */
GenotypingRecord::GenotypingRecord(bcf_hdr_t *h, bcf1_t *v, int32_t vtype)
{
    clear();

    this->h = h;
    this->v = v;
    rid = bcf_get_rid(v);
    pos1 = bcf_get_pos1(v);
    this->vtype = vtype;

    const int32_t n_allele = bcf_get_n_allele(v);

    if (vtype==VT_SNP && n_allele==2)
    {
        beg1 = bcf_get_pos1(v);
        end1 = beg1;
    }
    else if (vtype==VT_INDEL && n_allele==2)
    {
        char** alleles = bcf_get_allele(v);
        // cast before subtracting so that deletions do not go through unsigned wrap-around
        dlen = (int32_t) strlen(alleles[1]) - (int32_t) strlen(alleles[0]);
        len = abs(dlen);

        // Two values are read from each flank tag, so at least two must have
        // been returned; otherwise fall back to the positions next to the record.
        int32_t *flanks = NULL;
        int32_t n = 0;
        if (bcf_get_info_int32(h, v, "FLANKS", &flanks, &n)>=2)
        {
            lend1 = flanks[0];
            rbeg1 = flanks[1];
        }
        else
        {
            lend1 = bcf_get_pos1(v) - 1;
            rbeg1 = bcf_get_end_pos1(v) + 1;
        }
        free(flanks);

        int32_t *fuzzy_flanks = NULL;
        n = 0;
        if (bcf_get_info_int32(h, v, "FZ_FLANKS", &fuzzy_flanks, &n)>=2)
        {
            fuzzy_lend1 = fuzzy_flanks[0];
            fuzzy_rbeg1 = fuzzy_flanks[1];
        }
        else
        {
            fuzzy_lend1 = bcf_get_pos1(v) - 1;
            fuzzy_rbeg1 = bcf_get_end_pos1(v) + 1;
        }
        free(fuzzy_flanks);

        beg1 = std::min(lend1-2, fuzzy_lend1-2);
        end1 = std::max(rbeg1+2, fuzzy_rbeg1+2);

        // Store the longer allele minus its first base: ALT when dlen is
        // positive, REF otherwise.
        if (dlen>0)
        {
            indel.append(&alleles[1][1]);
        }
        else
        {
            indel.append(&alleles[0][1]);
        }
    }
    else if (vtype==VT_VNTR)
    {
        beg1 = bcf_get_pos1(v) - 1;
        end1 = bcf_get_end_pos1(v) + 1;

        char *motif = NULL;
        int32_t n = 0;
        if (bcf_get_info_string(h, v, "MOTIF", &motif, &n)>0)
        {
            this->motif.assign(motif);
        }
        free(motif);
    }
}