/**
 * Gets a sorted string representation of the alleles of a variant.
 *
 * The reference allele is always written first; the alternative alleles
 * follow, comma separated, in the order imposed by cmpstr.
 *
 * @h   - VCF header (not used).
 * @v   - VCF record, unpacked up to the allele strings by this call.
 * @var - output string, overwritten.
 */
void bcf_alleles2string_sorted(bcf_hdr_t *h, bcf1_t *v, kstring_t *var)
{
    bcf_unpack(v, BCF_UN_STR);
    var->l = 0;

    //biallelic record: a single alternative allele needs no sorting
    if (v->n_allele==2)
    {
        kputs(bcf_get_alt(v, 0), var);
        kputc(',', var);
        kputs(bcf_get_alt(v, 1), var);
        return;
    }

    //copy the alternative allele pointers aside so that the record's
    //own allele order is left untouched by the sort
    char** alleles = bcf_get_allele(v);
    char** sorted_alts = (char**) malloc((bcf_get_n_allele(v)-1)*sizeof(char*));
    for (int32_t k=0; k<v->n_allele-1; ++k)
    {
        sorted_alts[k] = alleles[k+1];
    }

    std::qsort(sorted_alts, bcf_get_n_allele(v)-1, sizeof(char*), cmpstr);

    kputs(bcf_get_alt(v, 0), var);
    for (int32_t k=0; k<v->n_allele-1; ++k)
    {
        kputc(',', var);
        kputs(sorted_alts[k], var);
    }

    free(sorted_alts);
}
/** * Extract reference sequence region for motif discovery in a fuzzy fashion. */ void CandidateRegionExtractor::extract_regions_by_fuzzy_alignment(bcf_hdr_t* h, bcf1_t* v, Variant& variant) { if (debug) { if (debug) std::cerr << "********************************************\n"; std::cerr << "EXTRACTIING REGION BY FUZZY ALIGNMENT\n\n"; } VNTR& vntr = variant.vntr; const char* chrom = bcf_get_chrom(h, v); int32_t min_beg1 = bcf_get_pos1(v); int32_t max_end1 = min_beg1; //merge candidate search region for (size_t i=1; i<bcf_get_n_allele(v); ++i) { std::string ref(bcf_get_alt(v, 0)); std::string alt(bcf_get_alt(v, i)); int32_t pos1 = bcf_get_pos1(v); trim(pos1, ref, alt); if (debug) { std::cerr << "indel fragment : " << (ref.size()<alt.size()? alt : ref) << "\n"; std::cerr << " : " << ref << ":" << alt << "\n"; } min_beg1 = fuzzy_left_align(chrom, pos1, ref, alt, 3); max_end1 = fuzzy_right_align(chrom, pos1 + ref.size() - 1, ref, alt, 3); int32_t seq_len; char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len); if (debug) { std::cerr << "FUZZY REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") " << "\n"; std::cerr << " " << seq << "\n"; } if (seq_len) free(seq); } int32_t seq_len; char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len); if (debug) { std::cerr << "FINAL FUZZY REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") " << "\n"; std::cerr << " " << seq << "\n"; } vntr.exact_repeat_tract = seq; vntr.exact_rbeg1 = min_beg1; if (seq_len) free(seq); }
/**
 * Checks if a vntr is a homopolymer.
 *
 * NOTE: the check is not implemented.  This function never inspected the
 * alleles in a way that could change its answer and always returns false;
 * the unused per allele string copies that used to be made here have been
 * removed.
 *
 * @h - VCF header (not used).
 * @v - VCF record (not used).
 *
 * @return false, always.
 */
bool CandidateRegionExtractor::is_homopolymer(bcf_hdr_t* h, bcf1_t* v)
{
    (void) h;
    (void) v;

    return false;
}
/** * Gets a string representation of the variant. */ std::string Variant::get_variant_string() { kstring_t var = {0,0,0}; bcf_unpack(v, BCF_UN_STR); var.l = 0; kputs(bcf_get_chrom(h, v), &var); kputc(':', &var); kputw(bcf_get_pos1(v), &var); kputc(':', &var); for (size_t i=0; i<bcf_get_n_allele(v); ++i) { if (i) kputc('/', &var); kputs(bcf_get_alt(v, i), &var); } std::string str(var.s); if (var.m) free(var.s); return str; }
/** * Classifies variants. */ int32_t VariantManip::classify_variant(bcf_hdr_t *h, bcf1_t *v, Variant& var) { bcf_unpack(v, BCF_UN_STR); const char* chrom = bcf_get_chrom(h, v); uint32_t pos1 = bcf_get_pos1(v); char** allele = bcf_get_allele(v); int32_t n_allele = bcf_get_n_allele(v); int32_t pos0 = pos1-1; var.ts = 0; var.tv = 0; var.ins = 0; var.del = 0; var.clear(); // this sets the type to VT_REF by default. bool homogeneous_length = true; char* ref = allele[0]; int32_t rlen = strlen(ref); //if only ref allele, skip this entire for loop for (size_t i=1; i<n_allele; ++i) { int32_t type = VT_REF; //check for tags if (strchr(allele[i],'<')) { size_t len = strlen(allele[i]); if (len>=5) { //VN/d+ if (allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][len-1]=='>' ) { for (size_t j=3; j<len-1; ++j) { if (allele[i][j]<'0' || allele[i][j]>'9') { type = VT_VNTR; } } } //VNTR else if (len==6 && allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][3]=='T' && allele[i][4]=='R' && allele[i][5]=='>' ) { type = VT_VNTR; } //ST/d+ else if (allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][len-1]=='>' ) { for (size_t j=3; j<len-1; ++j) { if (allele[i][j]<'0' || allele[i][j]>'9') { type = VT_VNTR; } } } //STR else if (len==5 && allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][3]=='R' && allele[i][4]=='>' ) { type = VT_VNTR; } } if (type==VT_VNTR) { type = VT_VNTR; var.type |= type; var.alleles.push_back(Allele(type)); } else { type = VT_SV; var.type |= type; std::string sv_type(allele[i]); var.alleles.push_back(Allele(type, sv_type)); } } else if (allele[i][0]=='.' 
|| strcmp(allele[i],allele[0])==0) { type = VT_REF; } else { kstring_t REF = {0,0,0}; kstring_t ALT = {0,0,0}; ref = allele[0]; char* alt = allele[i]; int32_t alen = strlen(alt); if (rlen!=alen) { homogeneous_length = false; } //trimming //this is required in particular for the //characterization of multiallelics and //in general, any unnormalized variant int32_t rl = rlen; int32_t al = alen; //trim right while (rl!=1 && al!=1) { if (ref[rl-1]==alt[al-1]) { --rl; --al; } else { break; } } //trim left while (rl !=1 && al!=1) { if (ref[0]==alt[0]) { ++ref; ++alt; --rl; --al; } else { break; } } kputsn(ref, rl, &REF); kputsn(alt, al, &ALT); ref = REF.s; alt = ALT.s; int32_t mlen = std::min(rl, al); int32_t dlen = al-rl; int32_t diff = 0; int32_t ts = 0; int32_t tv = 0; if (mlen==1 && dlen) { char ls, le, ss; if (rl>al) { ls = ref[0]; le = ref[rl-1]; ss = alt[0]; } else { ls = alt[0]; le = alt[al-1]; ss = ref[0]; } if (ls!=ss && le!=ss) { ++diff; if ((ls=='G' && ss=='A') || (ls=='A' && ss=='G') || (ls=='C' && ss=='T') || (ls=='T' && ss=='C')) { ++ts; } else { ++tv; } } } else { for (int32_t j=0; j<mlen; ++j) { if (ref[j]!=alt[j]) { ++diff; if ((ref[j]=='G' && alt[j]=='A') || (ref[j]=='A' && alt[j]=='G') || (ref[j]=='C' && alt[j]=='T') || (ref[j]=='T' && alt[j]=='C')) { ++ts; } else { ++tv; } } } } //substitution variants if (mlen==diff) { type |= mlen==1 ? VT_SNP : VT_MNP; } //indel variants if (dlen) { type |= VT_INDEL; } //clumped SNPs and MNPs if (diff && diff < mlen) //internal gaps { type |= VT_CLUMPED; } var.type |= type; var.alleles.push_back(Allele(type, diff, alen, dlen, mlen, ts, tv)); var.ts += ts; var.tv += tv; var.ins = dlen>0?1:0; var.del = dlen<0?1:0; if (REF.m) free(REF.s); if (ALT.m) free(ALT.s); } } if (var.type==VT_VNTR) { bcf_unpack(v, BCF_UN_INFO); //populate motif, motif len etc. etc. 
// char* str = NULL; // int32_t n = 0; // int32_t ret = bcf_get_info_string(h, v, "MOTIF", &str, &n); // if (ret>0) // { // var.motif = std::string(str); // var.mlen = var.motif.size(); // } // ret = bcf_get_info_string(h, v, "RU", &str, &n); // if (ret>0) // { // var.ru = std::string(str); // var.mlen = var.ru.size(); // } // if (n) free(str); // // int32_t* no = NULL; // n = 0; // ret = bcf_get_info_int32(h, v, "RL", &no, &n); // if (ret>0) var.rlen = *no; // if (n) free(no); // // int32_t* fl = NULL; // n = 0; // ret = bcf_get_info_int32(h, v, "REF", &fl, &n); // if (ret>0) var.rcn = *fl; // if (n) free(fl); } //additionally define MNPs by length of all alleles if (!(var.type&(VT_VNTR|VT_SV)) && var.type!=VT_REF) { if (homogeneous_length && rlen>1 && n_allele>1) { var.type |= VT_MNP; } } return var.type; }
/**
 * Checks if a variant is normalized.
 * Ignores if entry is not a variant.
 *
 * A record is reported as not normalized when all alleles end with the same
 * base, or when all alleles start with the same base and no allele has
 * length one.  Records with fewer than two alleles, biallelic SNPs and
 * records whose alternative alleles all equal the reference are reported as
 * normalized.
 *
 * @v - VCF record with its allele strings available.
 */
bool VariantManip::is_normalized(bcf1_t *v)
{
    char** alleles = bcf_get_allele(v);
    const int32_t n_allele = bcf_get_n_allele(v);

    //nothing to normalize without an alternative allele
    if (n_allele<2)
    {
        return true;
    }

    //shortcuts for biallelic records
    if (n_allele==2)
    {
        const size_t rlen = strlen(alleles[0]);
        const size_t alen = strlen(alleles[1]);

        //SNP
        if (rlen==1 && alen==1)
        {
            return true;
        }

        //reference entry
        if (rlen==alen && strcmp(alleles[0], alleles[1])==0)
        {
            return true;
        }
    }

    //reference allele
    const size_t ref_len = strlen(alleles[0]);
    const char first_base = alleles[0][0];
    const char last_base = alleles[0][ref_len-1];

    bool has_len_one_allele = (ref_len==1);
    bool share_first_base = true;
    bool share_last_base = true;
    bool all_equal_ref = true;

    //alternative alleles
    for (int32_t k=1; k<n_allele; ++k)
    {
        const char* alt = alleles[k];
        const size_t alt_len = strlen(alt);

        has_len_one_allele = has_len_one_allele || alt_len==1;
        share_first_base = share_first_base && first_base==alt[0];
        share_last_base = share_last_base && last_base==alt[alt_len-1];
        all_equal_ref = all_equal_ref && strcmp(alt, alleles[0])==0;
    }

    //reference entry
    if (all_equal_ref)
    {
        return true;
    }

    const bool right_trimmable = share_last_base;
    const bool left_trimmable = !has_len_one_allele && share_first_base;

    return !(right_trimmable || left_trimmable);
}
/** * Extract reference sequence region for motif discovery. * * The input is a VCF record that contains an indel. * * If the the indel has multiple alleles, it will examine all * alleles. * * todo: is might be a good idea to combine this step with motif detection * since there seems to be a need to have an iterative process here * to ensure a good candidate motif is chosen. * */ void CandidateRegionExtractor::extract_regions_by_exact_alignment(bcf_hdr_t* h, bcf1_t* v, Variant& variant) { if (debug) { if (debug) std::cerr << "********************************************\n"; std::cerr << "EXTRACTIING REGION BY EXACT LEFT AND RIGHT ALIGNMENT\n\n"; } VNTR& vntr = variant.vntr; const char* chrom = bcf_get_chrom(h, v); int32_t min_beg1 = bcf_get_pos1(v); int32_t max_end1 = min_beg1; if (debug) { bcf_print_liten(h, v); } //merge candidate search region for (size_t i=1; i<bcf_get_n_allele(v); ++i) { std::string ref(bcf_get_alt(v, 0)); std::string alt(bcf_get_alt(v, i)); int32_t pos1 = bcf_get_pos1(v); //this prevents introduction of flanks that do not harbour the repeat unit trim(pos1, ref, alt); int32_t end1 = pos1 + ref.size() - 1; right_align(chrom, end1, ref, alt); int32_t beg1 = end1 - ref.size() + 1; left_align(chrom, beg1, ref, alt); min_beg1 = beg1<min_beg1 ? beg1 : min_beg1; max_end1 = end1>max_end1 ? 
end1 : max_end1; int32_t seq_len; char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len); if (debug) { std::cerr << "EXACT REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") from " << pos1 << ":" << ref << ":" << alt << "\n"; std::cerr << " " << seq << "\n"; } if (seq_len) free(seq); } int32_t seq_len; char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len); if (debug) { std::cerr << "FINAL EXACT REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") " << "\n"; std::cerr << " " << seq << "\n"; } vntr.exact_repeat_tract = seq; vntr.rid = bcf_get_rid(v); vntr.exact_rbeg1 = min_beg1; vntr.exact_rend1 = max_end1; if (seq_len) free(seq); }
/** * Evaluates the actions for this node. */ void Node::evaluate(bcf_hdr_t *h, bcf1_t *v, Variant *variant, bool debug) { if (debug) std::cerr << "evaluation " << type << "\n"; if (type&VT_LOGIC_OP) { if (type==VT_NOT) { if (debug) std::cerr << "\tVT_NOT " << left->value << " \n"; value = !(left->value); } else if (type==VT_AND) { if (debug) std::cerr << "\tVT_AND " << left->value << "&" << right->value << " \n"; value = (left->value && right->value); } else if (type==VT_OR) { value = (left->value || right->value); } } else if (type&VT_MATH_CMP) { if (type==VT_EQ) { if ((left->type&VT_INT)) { if ((right->type&VT_INT)) { if (debug) std::cerr << "\tVT_EQ " << left->i << "&" << right->i << " \n"; value = (left->i==right->i); return; } else if ((right->type&VT_FLT)) { if (debug) std::cerr << "\tVT_EQ " << left->i << "&" << right->f << " \n"; value = (left->i==right->f); return; } } else if ((left->type&VT_FLT)) { if ((right->type&VT_INT)) { if (debug) std::cerr << "\tVT_EQ " << left->f << "&" << right->i << " \n"; value = (left->f==right->i); return; } else if ((right->type&VT_FLT)) { if (debug) std::cerr << "\tVT_EQ " << left->f << "&" << right->f << " \n"; value = (left->f==right->f); return; } } else if ((left->type&VT_STR) && (right->type&VT_STR)) { if (debug) std::cerr << "\tVT_EQ " << left->tag.s << "&" << right->tag.s << " \n"; value = strcmp(left->tag.s, right->tag.s)==0 ? 
true : false; return; } fprintf(stderr, "[%s:%d %s] evaluation not supported : == %d %d\n", __FILE__, __LINE__, __FUNCTION__, left->type, right->type); exit(1); } else if (type==VT_NE) { if ((left->type&VT_INT)) { if ((right->type&VT_INT)) { value = (left->i!=right->i); return; } else if ((right->type&VT_FLT)) { value = (left->i!=right->f); return; } } else if ((left->type&VT_FLT)) { if ((right->type&VT_INT)) { value = (left->f!=right->i); return; } else if ((right->type&VT_FLT)) { value = (left->f!=right->f); return; } } else if ((left->type&VT_STR) && (right->type&VT_STR)) { value = strcmp(left->tag.s, right->tag.s)==0 ? false : true; return; } fprintf(stderr, "[%s:%d %s] evaluation not supported: %d %d: !=\n", __FILE__, __LINE__, __FUNCTION__, left->type, right->type); exit(1); } else if (type==VT_LE) { if ((left->type&VT_INT)) { if ((right->type&VT_INT)) { value = (left->i<=right->i); return; } else if ((right->type&VT_FLT)) { value = (left->i<=right->f); return; } } else if ((left->type&VT_FLT)) { if ((right->type&VT_INT)) { type |= VT_INT; value = (left->f<=right->i); return; } else if ((right->type&VT_FLT)) { value = (left->f<=right->f); return; } } else if ((left->type&VT_STR) && (right->type&VT_STR)) { value = strcmp(left->tag.s, right->tag.s)<=0 ? true : false; return; } fprintf(stderr, "[%s:%d %s] evaluation not supported: %d %d: <=\n", __FILE__, __LINE__, __FUNCTION__, left->type, right->type); exit(1); } else if (type==VT_GE) { if ((left->type&VT_INT)) { if ((right->type&VT_INT)) { value = (left->i>=right->i); return; } else if ((right->type&VT_FLT)) { value = (left->i>=right->f); return; } } else if ((left->type&VT_FLT)) { if ((right->type&VT_INT)) { value = (left->f>=right->i); return; } else if ((right->type&VT_FLT)) { value = (left->f>=right->f); return; } } else if ((left->type&VT_STR) && (right->type&VT_STR)) { value = strcmp(left->tag.s, right->tag.s)>=0 ? 
true : false; return; } fprintf(stderr, "[%s:%d %s] evaluation not supported: %d %d: >=\n", __FILE__, __LINE__, __FUNCTION__, left->type, right->type); exit(1); } else if (type==VT_GT) { if ((left->type&VT_INT)) { if ((right->type&VT_INT)) { value = (left->i>right->i); return; } else if ((right->type&VT_FLT)) { value = (left->i>right->f); return; } } else if ((left->type&VT_FLT)) { if ((right->type&VT_INT)) { value = (left->f>right->i); return; } else if ((right->type&VT_FLT)) { value = (left->f>right->f); return; } } else if ((left->type&VT_STR) && (right->type&VT_STR)) { value = strcmp(left->tag.s, right->tag.s)>0 ? true : false; return; } fprintf(stderr, "[%s:%d %s] evaluation not supported: %d %d: >\n", __FILE__, __LINE__, __FUNCTION__, left->type, right->type); exit(1); } else if (type==VT_LT) { if ((left->type&VT_INT)) { if ((right->type&VT_INT)) { value = (left->i<right->i); return; } else if ((right->type&VT_FLT)) { value = (left->i<right->f); return; } } else if ((left->type&VT_FLT)) { if ((right->type&VT_INT)) { value = (left->f<right->i); return; } else if ((right->type&VT_FLT)) { value = (left->f<right->f); return; } } else if ((left->type&VT_STR) && (right->type&VT_STR)) { value = strcmp(left->tag.s, right->tag.s)<0 ? 
true : false; return; } fprintf(stderr, "[%s:%d %s] evaluation not supported: %d %d: <\n", __FILE__, __LINE__, __FUNCTION__, left->type, right->type); exit(1); } } else if (type&VT_BCF_OP) { if (type==VT_FILTER) { if (bcf_has_filter(h, v, tag.s)!=1) { value = false; } else { value = true; } } else if (type==VT_INFO) { int32_t *data = NULL; int32_t n=0; if (bcf_get_info_int32(h, v, tag.s, &data, &n)>0) { type |= VT_INT; i = *data; f = (float)i; } else if (bcf_get_info_float(h, v, tag.s, &data, &n)>0) { type |= VT_FLT; f = (float)(*data); } else if (bcf_get_info_string(h, v, tag.s, &data, &n)>0) { type |= VT_STR; s.l=0; for (int32_t i=0; i<n; ++i) { kputc(data[i], &s); } } else if (bcf_get_info_flag(h, v, tag.s, 0, 0)>0) { type |= VT_FLG; i = 1; f = 1; b = true; value = true; s.l=0; } else { i = 0; f = 0; b = false; value = false; s.l=0; } if (n) free(data); } else if (type==(VT_INFO|VT_INT)) { int32_t *data = NULL; int32_t n=0; if (bcf_get_info_int32(h, v, tag.s, &data, &n)>0) { i = *((int*)data); } if (n) free(data); } else if (type==(VT_INFO|VT_FLT)) { int32_t *data = NULL; int32_t n=0; if (bcf_get_info_float(h, v, tag.s, &data, &n)>0) { f = *((float*)data); } if (n) free(data); } else if (type==(VT_INFO|VT_STR)) { int32_t *data = NULL; int32_t n=0; if (bcf_get_info_string(h, v, tag.s, &data, &n)>0) { s.l=0; for (int32_t i=0; i<n; ++i) { kputc(data[i], &s); } } if (n) free(data); } else if (type==(VT_INFO|VT_FLG)) { if (bcf_get_info_flag(h, v, tag.s, 0, 0)>0) { i = 1; f = 1; b = true; value = true; //s.l=0; kputc('1', &s); } else { i = 0; f = 0; b = false; value = false; s.l=0; } if (debug) std::cerr << "\tVT_INFO|VT_FLG " << i << " " << f << " " << b << " " << value << " " << s.s << " \n"; } else if (type==VT_VARIANT_TYPE) { if (debug) std::cerr << "\tVTYPE " << variant->vtype2string(variant->type) << " \n"; i = variant->type; value = i; } else if (type==VT_VARIANT_DLEN) { if (debug) std::cerr << "\tDLEN " << variant->alleles[0].dlen << " \n"; i = 
variant->alleles[0].dlen; value = i; } else if (type==VT_VARIANT_LEN) { if (debug) std::cerr << "\tLEN " << abs(variant->alleles[0].dlen) << " \n"; i = abs(variant->alleles[0].dlen); value = i; } else if (type==VT_N_ALLELE) { if (debug) std::cerr << "\tN_ALLELE " << bcf_get_n_allele(v) << " \n"; i = bcf_get_n_allele(v); } } else if (type&VT_MATH_OP) { if ((type&8207)==VT_ADD) { if ((left->type&VT_INT)) { if ((right->type&VT_INT)) { type |= VT_INT; i = (left->i+right->i); return; } else if ((right->type&VT_FLT)) { type |= VT_FLT; f = (left->i+right->f); return; } } else if ((left->type&VT_FLT)) { if ((right->type&VT_INT)) { type |= VT_FLT; f = (left->f+right->i); return; } else if ((right->type&VT_FLT)) { type |= VT_FLT; f = (left->f+right->f); return; } } fprintf(stderr, "[%s:%d %s] evaluation not supported : +\n", __FILE__, __LINE__, __FUNCTION__); exit(1); } else if ((type&8207)==VT_SUB) { if ((left->type&VT_INT)) { if ((right->type&VT_INT)) { type |= VT_INT; i = (left->i-right->i); return; } else if ((right->type&VT_FLT)) { type |= VT_FLT; f = (left->i-right->f); return; } } else if ((left->type&VT_FLT)) { if ((right->type&VT_INT)) { type |= VT_FLT; f = (left->f-right->i); return; } else if ((right->type&VT_FLT)) { type |= VT_FLT; f = (left->f-right->f); return; } } fprintf(stderr, "[%s:%d %s] evaluation not supported : -\n", __FILE__, __LINE__, __FUNCTION__); exit(1); } else if ((type&8207)==VT_MUL) { if ((left->type&VT_INT)) { if ((right->type&VT_INT)) { type |= VT_INT; i = (left->i*right->i); return; } else if ((right->type&VT_FLT)) { type |= VT_FLT; f = (left->i*right->f); return; } } else if ((left->type&VT_FLT)) { if ((right->type&VT_INT)) { type |= VT_FLT; f = (left->f*right->i); return; } else if ((right->type&VT_FLT)) { type |= VT_FLT; f = (left->f*right->f); return; } } fprintf(stderr, "[%s:%d %s] evaluation not supported : *\n", __FILE__, __LINE__, __FUNCTION__); exit(1); } else if ((type&8207)==VT_DIV) { if (left->type&VT_INT) { if 
(right->type&VT_INT) { type |= VT_FLT; f = ((float)left->i/right->i); return; } else if (right->type&VT_FLT) { type |= VT_FLT; f = (left->i/right->f); return; } } else if (left->type&VT_FLT) { if (right->type&VT_INT) { type |= VT_FLT; f = (left->f/right->i); return; } else if (right->type&VT_FLT) { type |= VT_FLT; f = (left->f/right->f); return; } } fprintf(stderr, "[%s:%d %s] evaluation not supported : /\n", __FILE__, __LINE__, __FUNCTION__); exit(1); } else if (type==VT_BIT_AND) { if ((left->type&VT_INT) && (right->type&VT_INT)) { i = (left->i & right->i); value = i; return; } fprintf(stderr, "[%s:%d %s] evaluation not supported for & : %d %d\n", __FILE__, __LINE__, __FUNCTION__, left->type, right->type); exit(1); } else if (type==VT_BIT_OR) { if ((left->type&VT_INT) && (right->type&VT_INT)) { i = (left->i | right->i); value = i; return; } fprintf(stderr, "[%s:%d %s] evaluation not supported for | : %d %d\n", __FILE__, __LINE__, __FUNCTION__, left->type, right->type); exit(1); } else { fprintf(stderr, "[%s:%d %s] math op not supported : %d\n", __FILE__, __LINE__, __FUNCTION__, (type&15)); exit(1); } } }
/** * Classifies variants. */ int32_t Variant::classify(bcf_hdr_t *h, bcf1_t *v) { clear(); this->h = h; this->v = v; bcf_unpack(v, BCF_UN_STR); chrom.assign(bcf_get_chrom(h, v)); rid = bcf_get_rid(v); pos1 = bcf_get_pos1(v); end1 = bcf_get_end1(v); char** allele = bcf_get_allele(v); int32_t n_allele = bcf_get_n_allele(v); int32_t pos0 = pos1-1; bool homogeneous_length = true; char* ref = allele[0]; int32_t rlen = strlen(ref); if (strchr(ref, 'N')) { contains_N = true; } //if only ref allele, skip this entire for loop for (size_t i=1; i<n_allele; ++i) { int32_t allele_type = VT_REF; //check for symbolic alternative alleles if (strchr(allele[i],'<')) { size_t len = strlen(allele[i]); if (len>=5) { //VN/d+ if (allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][len-1]=='>' ) { for (size_t j=3; j<len-1; ++j) { if (allele[i][j]<'0' || allele[i][j]>'9') { allele_type = VT_VNTR; } } } //VNTR else if (len==6 && allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][3]=='T' && allele[i][4]=='R' && allele[i][5]=='>' ) { allele_type = VT_VNTR; } //STR else if (len==5 && allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][3]=='R' && allele[i][4]=='>' ) { allele_type = VT_VNTR; } //ST/d+ else if (allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][len-1]=='>' ) { type = VT_VNTR; for (size_t j=3; j<len-1; ++j) { if ((allele[i][j]<'0' || allele[i][j]>'9') && allele[i][j]!='.') { type = VT_SV; } } } } if (allele_type==VT_VNTR) { allele_type = VT_VNTR; type |= allele_type; alleles.push_back(Allele(allele_type)); } else { allele_type = VT_SV; type |= allele_type; std::string sv_type(allele[i]); alleles.push_back(Allele(allele_type, sv_type)); } } //checks for chromosomal breakpoints else if (strchr(allele[i],'[')||strchr(allele[i],']')) { allele_type = VT_SV; type |= allele_type; std::string sv_type("<BND>"); alleles.push_back(Allele(allele_type, sv_type)); } //non variant record else if 
(allele[i][0]=='.' || strcmp(allele[i],allele[0])==0) { type = VT_REF; } //explicit sequence of bases else { kstring_t REF = {0,0,0}; kstring_t ALT = {0,0,0}; ref = allele[0]; char* alt = allele[i]; int32_t alen = strlen(alt); if (strchr(alt, 'N')) { contains_N = true; } if (rlen!=alen) { homogeneous_length = false; } //trimming //this is required in particular for the //characterization of multiallelics and //in general, any unnormalized variant int32_t rl = rlen; int32_t al = alen; //trim right while (rl!=1 && al!=1) { if (ref[rl-1]==alt[al-1]) { --rl; --al; } else { break; } } //trim left while (rl !=1 && al!=1) { if (ref[0]==alt[0]) { ++ref; ++alt; --rl; --al; } else { break; } } kputsn(ref, rl, &REF); kputsn(alt, al, &ALT); ref = REF.s; alt = ALT.s; int32_t mlen = std::min(rl, al); int32_t dlen = al-rl; int32_t diff = 0; int32_t ts = 0; int32_t tv = 0; if (mlen==1 && dlen) { char ls, le, ss; if (rl>al) { ls = ref[0]; le = ref[rl-1]; ss = alt[0]; } else { ls = alt[0]; le = alt[al-1]; ss = ref[0]; } if (ls!=ss && le!=ss) { ++diff; if ((ls=='G' && ss=='A') || (ls=='A' && ss=='G') || (ls=='C' && ss=='T') || (ls=='T' && ss=='C')) { ++ts; } else { ++tv; } } } else { for (int32_t j=0; j<mlen; ++j) { if (ref[j]!=alt[j]) { ++diff; if ((ref[j]=='G' && alt[j]=='A') || (ref[j]=='A' && alt[j]=='G') || (ref[j]=='C' && alt[j]=='T') || (ref[j]=='T' && alt[j]=='C')) { ++ts; } else { ++tv; } } } } //substitution variants if (mlen==diff) { allele_type |= mlen==1 ? 
VT_SNP : VT_MNP; } //indel variants if (dlen) { allele_type |= VT_INDEL; } //clumped SNPs and MNPs if (diff && diff < mlen) //internal gaps { allele_type |= VT_CLUMPED; } type |= allele_type; alleles.push_back(Allele(type, diff, alen, dlen, mlen, ts, tv)); ts += ts; tv += tv; ins = dlen>0?1:0; del = dlen<0?1:0; if (REF.m) free(REF.s); if (ALT.m) free(ALT.s); } } if (type==VT_VNTR) { update_vntr_from_info_fields(h, v); } //additionally define MNPs by length of all alleles if (!(type&(VT_VNTR|VT_SV)) && type!=VT_REF) { if (homogeneous_length && rlen>1 && n_allele>1) { type |= VT_MNP; } } return type; }
/**
 * Constructor.
 *
 * Records the header, record, reference id and position, then sets up the
 * fields that depend on the variant type:
 *
 *  - biallelic VT_SNP   : beg1 and end1 are the record position,
 *  - biallelic VT_INDEL : dlen, len, the exact and fuzzy flanks (taken from
 *                         the FLANKS and FZ_FLANKS INFO fields when they
 *                         hold at least two values, else derived from the
 *                         record coordinates), beg1/end1 two bases outside
 *                         the wider flanks, and the indel sequence without
 *                         its leading base,
 *  - VT_VNTR            : beg1/end1 one base outside the record and the
 *                         motif from the MOTIF INFO field if present.
 *
 * Any other combination leaves the remaining fields as set by clear().
 *
 * @h     - VCF header.
 * @v     - VCF record.
 * @vtype - variant type of the record.
 */
GenotypingRecord::GenotypingRecord(bcf_hdr_t *h, bcf1_t *v, int32_t vtype)
{
    clear();

    this->h = h;
    this->v = v;
    rid = bcf_get_rid(v);
    pos1 = bcf_get_pos1(v);
    this->vtype = vtype;

    const int32_t n_allele = bcf_get_n_allele(v);

    if (vtype==VT_SNP && n_allele==2)
    {
        beg1 = bcf_get_pos1(v);
        end1 = beg1;
    }
    else if (vtype==VT_INDEL && n_allele==2)
    {
        char** alleles = bcf_get_allele(v);
        dlen = strlen(alleles[1])-strlen(alleles[0]);
        len = abs(dlen);

        //both flank fields are read as pairs, so at least two values are
        //required before the second one may be touched
        int32_t *flanks = NULL;
        int32_t n = 0;
        if (bcf_get_info_int32(h, v, "FLANKS", &flanks, &n)>=2)
        {
            lend1 = flanks[0];
            rbeg1 = flanks[1];
        }
        else
        {
            lend1 = bcf_get_pos1(v) - 1;
            rbeg1 = bcf_get_end_pos1(v) + 1;
        }
        //released on every path; free(NULL) is a no-op
        free(flanks);

        int32_t *fuzzy_flanks = NULL;
        n = 0;
        if (bcf_get_info_int32(h, v, "FZ_FLANKS", &fuzzy_flanks, &n)>=2)
        {
            fuzzy_lend1 = fuzzy_flanks[0];
            fuzzy_rbeg1 = fuzzy_flanks[1];
        }
        else
        {
            fuzzy_lend1 = bcf_get_pos1(v) - 1;
            fuzzy_rbeg1 = bcf_get_end_pos1(v) + 1;
        }
        free(fuzzy_flanks);

        beg1 = std::min(lend1-2, fuzzy_lend1-2);
        end1 = std::max(rbeg1+2, fuzzy_rbeg1+2);

        //the indel is the longer allele without its first base
        if (dlen>0)
        {
            indel.append(&alleles[1][1]);
        }
        else
        {
            indel.append(&alleles[0][1]);
        }
    }
    else if (vtype==VT_VNTR)
    {
        beg1 = bcf_get_pos1(v) - 1;
        end1 = bcf_get_end_pos1(v) + 1;

        char *motif = NULL;
        int32_t n = 0;
        if (bcf_get_info_string(h, v, "MOTIF", &motif, &n)>0)
        {
            this->motif.assign(motif);
        }
        free(motif);
    }
}