/**
 * Gets a sorted string representation of the alleles of a variant.
 *
 * The result is the first allele followed by the remaining alleles ordered
 * with cmpstr (defined elsewhere), all separated by commas.
 *
 * @h   - VCF header (not used by this function)
 * @v   - VCF record, unpacked up to BCF_UN_STR by this call
 * @var - output string, any previous content is overwritten
 */
void bcf_alleles2string_sorted(bcf_hdr_t *h, bcf1_t *v, kstring_t *var)
{
    bcf_unpack(v, BCF_UN_STR);
    var->l = 0;

    char** allele = bcf_get_allele(v);
    const int32_t n_alt = bcf_get_n_allele(v) - 1;

    //scratch copy of the alternative allele pointers; the vector owns the
    //storage so there is no unchecked malloc and nothing to free by hand
    std::vector<char*> sorted_alts;
    if (n_alt>0)
    {
        sorted_alts.assign(allele+1, allele+1+n_alt);
    }

    //a single alternative allele is trivially sorted
    if (sorted_alts.size()>1)
    {
        std::qsort(&sorted_alts[0], sorted_alts.size(), sizeof(char*), cmpstr);
    }

    kputs(bcf_get_alt(v, 0), var);
    for (size_t i=0; i<sorted_alts.size(); ++i)
    {
        kputc(',', var);
        kputs(sorted_alts[i], var);
    }
}
/** * Pick candidate motifs in different modes. * Invokes motif tree and the candidate motifs are stored in a * heap within the motif tree. */ void CandidateMotifPicker::generate_candidate_motifs(bcf_hdr_t* h, bcf1_t* v, Variant& variant) { if (debug) { std::cerr << "********************************************\n"; std::cerr << "PICK CANDIDATE MOTIFS\n\n"; } if (variant.ins) { char** alleles = bcf_get_allele(v); if (debug) { const char* repeat_tract = variant.vntr.exact_repeat_tract.c_str(); std::cerr << "Longest Allele : " << alleles[0][0] << "[" << &alleles[1][1] << "]" << &repeat_tract[1] << "\n"; } //spike in inserted allele std::string spiked_seq(alleles[1]); std::string insertion = variant.vntr.exact_repeat_tract.substr(strlen(alleles[0]), variant.vntr.exact_repeat_tract.size()-strlen(alleles[0])); spiked_seq.append(insertion); mt->detect_candidate_motifs(spiked_seq); indel_sequence.assign(&alleles[1][1]); } else { mt->detect_candidate_motifs(variant.vntr.exact_repeat_tract); char** alleles = bcf_get_allele(v); indel_sequence.assign(&alleles[0][1]); } if (debug) { std::cerr << "Indel : " << indel_sequence << "\n"; } }
/**
 * Pick candidate region.
 *
 * @h       - VCF header
 * @v       - VCF record
 * @variant - variant whose vntr region fields are filled in
 * @mode    - REFERENCE                  use reference field
 *          - EXACT_LEFT_RIGHT_ALIGNMENT by exact alignment
 *          - FUZZY_LEFT_RIGHT_ALIGNMENT by fuzzy alignment
 *          any other value leaves the variant untouched
 */
void CandidateRegionExtractor::pick_candidate_region(bcf_hdr_t* h, bcf1_t* v, Variant& variant, uint32_t mode)
{
    if (mode==REFERENCE)
    {
        VNTR& region = variant.vntr;
        char** alleles = bcf_get_allele(v);

        region.exact_repeat_tract.assign(bcf_get_ref(v));
        region.exact_rbeg1 = bcf_get_pos1(v);
        //NOTE(review): this stores the length of the reference allele, not
        //a position derived from exact_rbeg1 - confirm this is what the
        //consumers of exact_rend1 expect
        region.exact_rend1 = strlen(alleles[0]);

        //the fuzzy region mirrors the exact one in this mode
        region.fuzzy_rbeg1 = region.exact_rbeg1;
        region.fuzzy_rend1 = region.exact_rend1;
        return;
    }

    if (mode==EXACT_LEFT_RIGHT_ALIGNMENT)
    {
        extract_regions_by_exact_alignment(h, v, variant);
        return;
    }

    if (mode==FUZZY_LEFT_RIGHT_ALIGNMENT)
    {
        extract_regions_by_fuzzy_alignment(h, v, variant);
    }
}
/** * Classifies variants. */ int32_t VariantManip::classify_variant(bcf_hdr_t *h, bcf1_t *v, Variant& var) { bcf_unpack(v, BCF_UN_STR); const char* chrom = bcf_get_chrom(h, v); uint32_t pos1 = bcf_get_pos1(v); char** allele = bcf_get_allele(v); int32_t n_allele = bcf_get_n_allele(v); int32_t pos0 = pos1-1; var.ts = 0; var.tv = 0; var.ins = 0; var.del = 0; var.clear(); // this sets the type to VT_REF by default. bool homogeneous_length = true; char* ref = allele[0]; int32_t rlen = strlen(ref); //if only ref allele, skip this entire for loop for (size_t i=1; i<n_allele; ++i) { int32_t type = VT_REF; //check for tags if (strchr(allele[i],'<')) { size_t len = strlen(allele[i]); if (len>=5) { //VN/d+ if (allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][len-1]=='>' ) { for (size_t j=3; j<len-1; ++j) { if (allele[i][j]<'0' || allele[i][j]>'9') { type = VT_VNTR; } } } //VNTR else if (len==6 && allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][3]=='T' && allele[i][4]=='R' && allele[i][5]=='>' ) { type = VT_VNTR; } //ST/d+ else if (allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][len-1]=='>' ) { for (size_t j=3; j<len-1; ++j) { if (allele[i][j]<'0' || allele[i][j]>'9') { type = VT_VNTR; } } } //STR else if (len==5 && allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][3]=='R' && allele[i][4]=='>' ) { type = VT_VNTR; } } if (type==VT_VNTR) { type = VT_VNTR; var.type |= type; var.alleles.push_back(Allele(type)); } else { type = VT_SV; var.type |= type; std::string sv_type(allele[i]); var.alleles.push_back(Allele(type, sv_type)); } } else if (allele[i][0]=='.' 
|| strcmp(allele[i],allele[0])==0) { type = VT_REF; } else { kstring_t REF = {0,0,0}; kstring_t ALT = {0,0,0}; ref = allele[0]; char* alt = allele[i]; int32_t alen = strlen(alt); if (rlen!=alen) { homogeneous_length = false; } //trimming //this is required in particular for the //characterization of multiallelics and //in general, any unnormalized variant int32_t rl = rlen; int32_t al = alen; //trim right while (rl!=1 && al!=1) { if (ref[rl-1]==alt[al-1]) { --rl; --al; } else { break; } } //trim left while (rl !=1 && al!=1) { if (ref[0]==alt[0]) { ++ref; ++alt; --rl; --al; } else { break; } } kputsn(ref, rl, &REF); kputsn(alt, al, &ALT); ref = REF.s; alt = ALT.s; int32_t mlen = std::min(rl, al); int32_t dlen = al-rl; int32_t diff = 0; int32_t ts = 0; int32_t tv = 0; if (mlen==1 && dlen) { char ls, le, ss; if (rl>al) { ls = ref[0]; le = ref[rl-1]; ss = alt[0]; } else { ls = alt[0]; le = alt[al-1]; ss = ref[0]; } if (ls!=ss && le!=ss) { ++diff; if ((ls=='G' && ss=='A') || (ls=='A' && ss=='G') || (ls=='C' && ss=='T') || (ls=='T' && ss=='C')) { ++ts; } else { ++tv; } } } else { for (int32_t j=0; j<mlen; ++j) { if (ref[j]!=alt[j]) { ++diff; if ((ref[j]=='G' && alt[j]=='A') || (ref[j]=='A' && alt[j]=='G') || (ref[j]=='C' && alt[j]=='T') || (ref[j]=='T' && alt[j]=='C')) { ++ts; } else { ++tv; } } } } //substitution variants if (mlen==diff) { type |= mlen==1 ? VT_SNP : VT_MNP; } //indel variants if (dlen) { type |= VT_INDEL; } //clumped SNPs and MNPs if (diff && diff < mlen) //internal gaps { type |= VT_CLUMPED; } var.type |= type; var.alleles.push_back(Allele(type, diff, alen, dlen, mlen, ts, tv)); var.ts += ts; var.tv += tv; var.ins = dlen>0?1:0; var.del = dlen<0?1:0; if (REF.m) free(REF.s); if (ALT.m) free(ALT.s); } } if (var.type==VT_VNTR) { bcf_unpack(v, BCF_UN_INFO); //populate motif, motif len etc. etc. 
// char* str = NULL; // int32_t n = 0; // int32_t ret = bcf_get_info_string(h, v, "MOTIF", &str, &n); // if (ret>0) // { // var.motif = std::string(str); // var.mlen = var.motif.size(); // } // ret = bcf_get_info_string(h, v, "RU", &str, &n); // if (ret>0) // { // var.ru = std::string(str); // var.mlen = var.ru.size(); // } // if (n) free(str); // // int32_t* no = NULL; // n = 0; // ret = bcf_get_info_int32(h, v, "RL", &no, &n); // if (ret>0) var.rlen = *no; // if (n) free(no); // // int32_t* fl = NULL; // n = 0; // ret = bcf_get_info_int32(h, v, "REF", &fl, &n); // if (ret>0) var.rcn = *fl; // if (n) free(fl); } //additionally define MNPs by length of all alleles if (!(var.type&(VT_VNTR|VT_SV)) && var.type!=VT_REF) { if (homogeneous_length && rlen>1 && n_allele>1) { var.type |= VT_MNP; } } return var.type; }
/**
 * Checks if a variant is normalized.
 * Ignores if entry is not a variant.
 *
 * A record is reported as not normalized when all alleles end with the
 * same base, or when all alleles start with the same base and none of
 * them has length one.  Records with a single allele, or whose alleles
 * are all identical to the reference, are reported as normalized.
 *
 * @v - VCF record with the allele strings available
 */
bool VariantManip::is_normalized(bcf1_t *v)
{
    char** alleles = bcf_get_allele(v);
    const int32_t n_allele = bcf_get_n_allele(v);

    //nothing to compare against
    if (n_allele<=1)
    {
        return true;
    }

    const char* ref = alleles[0];
    const size_t ref_len = strlen(ref);
    const char ref_first = ref[0];
    const char ref_last = ref[ref_len-1];

    bool has_single_base_allele = (ref_len==1);
    bool share_first_base = true;
    bool share_last_base = true;
    bool all_equal_to_ref = true;

    for (int32_t i=1; i<n_allele; ++i)
    {
        const char* alt = alleles[i];
        const size_t alt_len = strlen(alt);

        if (alt_len==1)
        {
            has_single_base_allele = true;
        }
        if (alt[0]!=ref_first)
        {
            share_first_base = false;
        }
        if (alt[alt_len-1]!=ref_last)
        {
            share_last_base = false;
        }
        if (strcmp(alt, ref)!=0)
        {
            all_equal_to_ref = false;
        }
    }

    //reference entry
    if (all_equal_to_ref)
    {
        return true;
    }

    const bool right_trimmable = share_last_base;
    const bool left_trimmable = !has_single_base_allele && share_first_base;

    return !(right_trimmable || left_trimmable);
}
/** * Updates VNTR related information from INFO fields. */ void Variant::update_vntr_from_info_fields() { vntr.motif = bcf_get_rid(v); char** allele = bcf_get_allele(v); // vntr.exact_repeat_tract.assign(allele[0]); // std::string tags[16] = {"MOTIF", "RU", "BASIS", "MLEN", "BLEN", "REPEAT_TRACT", "COMP", "ENTROPY", "ENTROPY2", "KL_DIVERGENCE", "KL_DIVERGENCE2", "RL", "LL", "RU_COUNTS", "SCORE", "TRF_SCORE"}; vntr.motif = bcf_get_info_str(h, v, "MOTIF"); vntr.ru = bcf_get_info_str(h, v, "RU"); vntr.basis = bcf_get_info_str(h, v, "BASIS"); if (vntr.basis=="") vntr.basis = VNTR::get_basis(vntr.motif); vntr.mlen = vntr.motif.size(); vntr.blen = (int32_t) vntr.basis.size(); std::vector<int32_t> i_vec = bcf_get_info_int_vec(h, v, "REPEAT_TRACT", 2, 0); vntr.beg1 = i_vec[0]; vntr.end1 = i_vec[1]; i_vec = bcf_get_info_int_vec(h, v, "COMP", 4, 0); vntr.comp[0] = i_vec[0]; vntr.comp[1] = i_vec[1]; vntr.comp[2] = i_vec[2]; vntr.comp[3] = i_vec[3]; vntr.entropy = bcf_get_info_flt(h, v, "ENTROPY"); vntr.entropy2 = bcf_get_info_flt(h, v, "ENTROPY2"); vntr.kl_divergence = bcf_get_info_flt(h, v, "KL_DIVERGENCE"); vntr.kl_divergence2 = bcf_get_info_flt(h, v, "KL_DIVERGENCE2"); vntr.rl = bcf_get_info_int(h, v, "RL"); vntr.ll = bcf_get_info_int(h, v, "LL"); i_vec = bcf_get_info_int_vec(h, v, "RU_COUNTS", 2, 0); vntr.no_perfect_ru = i_vec[0]; vntr.no_ru = i_vec[1]; vntr.score = bcf_get_info_flt(h, v, "SCORE"); vntr.trf_score = bcf_get_info_int(h, v, "TRF_SCORE"); vntr.exact_motif = bcf_get_info_str(h, v, "EX_MOTIF"); vntr.exact_ru = bcf_get_info_str(h, v, "EX_RU"); vntr.exact_basis = bcf_get_info_str(h, v, "EX_BASIS"); vntr.exact_mlen = (int32_t) vntr.exact_motif.size(); vntr.exact_blen = (int32_t) vntr.exact_basis.size(); i_vec = bcf_get_info_int_vec(h, v, "EX_REPEAT_TRACT", 2, 0); vntr.exact_beg1 = i_vec[0]; vntr.exact_end1 = i_vec[1]; i_vec = bcf_get_info_int_vec(h, v, "EX_COMP", 4, 0); vntr.exact_comp[0] = i_vec[0]; vntr.exact_comp[1] = i_vec[1]; vntr.exact_comp[2] = i_vec[2]; 
vntr.exact_comp[3] = i_vec[3]; vntr.exact_entropy = bcf_get_info_flt(h, v, "EX_ENTROPY"); vntr.exact_entropy2 = bcf_get_info_flt(h, v, "EX_ENTROPY2"); vntr.exact_kl_divergence = bcf_get_info_flt(h, v, "EX_KL_DIVERGENCE"); vntr.exact_kl_divergence2 = bcf_get_info_flt(h, v, "EX_KL_DIVERGENCE2"); vntr.exact_rl = bcf_get_info_int(h, v, "EX_RL"); vntr.exact_ll = bcf_get_info_int(h, v, "EX_LL"); i_vec = bcf_get_info_int_vec(h, v, "EX_RU_COUNTS", 2, 0); vntr.exact_no_perfect_ru = i_vec[0]; vntr.exact_no_ru = i_vec[1]; vntr.exact_score = bcf_get_info_flt(h, v, "EX_SCORE"); vntr.exact_trf_score = bcf_get_info_int(h, v, "EX_TRF_SCORE"); vntr.fuzzy_motif = bcf_get_info_str(h, v, "FZ_MOTIF"); vntr.fuzzy_ru = bcf_get_info_str(h, v, "FZ_RU"); vntr.fuzzy_basis = bcf_get_info_str(h, v, "FZ_BASIS"); vntr.fuzzy_mlen = (int32_t) vntr.fuzzy_motif.size(); vntr.fuzzy_blen = (int32_t) vntr.fuzzy_basis.size(); i_vec = bcf_get_info_int_vec(h, v, "FZ_REPEAT_TRACT", 2, 0); vntr.fuzzy_beg1 = i_vec[0]; vntr.fuzzy_end1 = i_vec[1]; i_vec = bcf_get_info_int_vec(h, v, "FZ_COMP", 4, 0); vntr.fuzzy_comp[0] = i_vec[0]; vntr.fuzzy_comp[1] = i_vec[1]; vntr.fuzzy_comp[2] = i_vec[2]; vntr.fuzzy_comp[3] = i_vec[3]; vntr.fuzzy_entropy = bcf_get_info_flt(h, v, "FZ_ENTROPY"); vntr.fuzzy_entropy2 = bcf_get_info_flt(h, v, "FZ_ENTROPY2"); vntr.fuzzy_kl_divergence = bcf_get_info_flt(h, v, "FZ_KL_DIVERGENCE"); vntr.fuzzy_kl_divergence2 = bcf_get_info_flt(h, v, "FZ_KL_DIVERGENCE2"); vntr.fuzzy_rl = bcf_get_info_int(h, v, "FZ_RL"); vntr.fuzzy_ll = bcf_get_info_int(h, v, "FZ_LL"); i_vec = bcf_get_info_int_vec(h, v, "FZ_RU_COUNTS", 2, 0); vntr.fuzzy_no_perfect_ru = i_vec[0]; vntr.fuzzy_no_ru = i_vec[1]; vntr.fuzzy_score = bcf_get_info_flt(h, v, "FZ_SCORE"); vntr.fuzzy_trf_score = bcf_get_info_int(h, v, "FZ_TRF_SCORE"); }
/** * Classifies variants. */ int32_t Variant::classify(bcf_hdr_t *h, bcf1_t *v) { clear(); this->h = h; this->v = v; bcf_unpack(v, BCF_UN_STR); chrom.assign(bcf_get_chrom(h, v)); rid = bcf_get_rid(v); pos1 = bcf_get_pos1(v); end1 = bcf_get_end1(v); char** allele = bcf_get_allele(v); int32_t n_allele = bcf_get_n_allele(v); int32_t pos0 = pos1-1; bool homogeneous_length = true; char* ref = allele[0]; int32_t rlen = strlen(ref); if (strchr(ref, 'N')) { contains_N = true; } //if only ref allele, skip this entire for loop for (size_t i=1; i<n_allele; ++i) { int32_t allele_type = VT_REF; //check for symbolic alternative alleles if (strchr(allele[i],'<')) { size_t len = strlen(allele[i]); if (len>=5) { //VN/d+ if (allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][len-1]=='>' ) { for (size_t j=3; j<len-1; ++j) { if (allele[i][j]<'0' || allele[i][j]>'9') { allele_type = VT_VNTR; } } } //VNTR else if (len==6 && allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][3]=='T' && allele[i][4]=='R' && allele[i][5]=='>' ) { allele_type = VT_VNTR; } //STR else if (len==5 && allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][3]=='R' && allele[i][4]=='>' ) { allele_type = VT_VNTR; } //ST/d+ else if (allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][len-1]=='>' ) { type = VT_VNTR; for (size_t j=3; j<len-1; ++j) { if ((allele[i][j]<'0' || allele[i][j]>'9') && allele[i][j]!='.') { type = VT_SV; } } } } if (allele_type==VT_VNTR) { allele_type = VT_VNTR; type |= allele_type; alleles.push_back(Allele(allele_type)); } else { allele_type = VT_SV; type |= allele_type; std::string sv_type(allele[i]); alleles.push_back(Allele(allele_type, sv_type)); } } //checks for chromosomal breakpoints else if (strchr(allele[i],'[')||strchr(allele[i],']')) { allele_type = VT_SV; type |= allele_type; std::string sv_type("<BND>"); alleles.push_back(Allele(allele_type, sv_type)); } //non variant record else if 
(allele[i][0]=='.' || strcmp(allele[i],allele[0])==0) { type = VT_REF; } //explicit sequence of bases else { kstring_t REF = {0,0,0}; kstring_t ALT = {0,0,0}; ref = allele[0]; char* alt = allele[i]; int32_t alen = strlen(alt); if (strchr(alt, 'N')) { contains_N = true; } if (rlen!=alen) { homogeneous_length = false; } //trimming //this is required in particular for the //characterization of multiallelics and //in general, any unnormalized variant int32_t rl = rlen; int32_t al = alen; //trim right while (rl!=1 && al!=1) { if (ref[rl-1]==alt[al-1]) { --rl; --al; } else { break; } } //trim left while (rl !=1 && al!=1) { if (ref[0]==alt[0]) { ++ref; ++alt; --rl; --al; } else { break; } } kputsn(ref, rl, &REF); kputsn(alt, al, &ALT); ref = REF.s; alt = ALT.s; int32_t mlen = std::min(rl, al); int32_t dlen = al-rl; int32_t diff = 0; int32_t ts = 0; int32_t tv = 0; if (mlen==1 && dlen) { char ls, le, ss; if (rl>al) { ls = ref[0]; le = ref[rl-1]; ss = alt[0]; } else { ls = alt[0]; le = alt[al-1]; ss = ref[0]; } if (ls!=ss && le!=ss) { ++diff; if ((ls=='G' && ss=='A') || (ls=='A' && ss=='G') || (ls=='C' && ss=='T') || (ls=='T' && ss=='C')) { ++ts; } else { ++tv; } } } else { for (int32_t j=0; j<mlen; ++j) { if (ref[j]!=alt[j]) { ++diff; if ((ref[j]=='G' && alt[j]=='A') || (ref[j]=='A' && alt[j]=='G') || (ref[j]=='C' && alt[j]=='T') || (ref[j]=='T' && alt[j]=='C')) { ++ts; } else { ++tv; } } } } //substitution variants if (mlen==diff) { allele_type |= mlen==1 ? 
VT_SNP : VT_MNP; } //indel variants if (dlen) { allele_type |= VT_INDEL; } //clumped SNPs and MNPs if (diff && diff < mlen) //internal gaps { allele_type |= VT_CLUMPED; } type |= allele_type; alleles.push_back(Allele(type, diff, alen, dlen, mlen, ts, tv)); ts += ts; tv += tv; ins = dlen>0?1:0; del = dlen<0?1:0; if (REF.m) free(REF.s); if (ALT.m) free(ALT.s); } } if (type==VT_VNTR) { update_vntr_from_info_fields(h, v); } //additionally define MNPs by length of all alleles if (!(type&(VT_VNTR|VT_SV)) && type!=VT_REF) { if (homogeneous_length && rlen>1 && n_allele>1) { type |= VT_MNP; } } return type; }
/**
 * Constructor.
 *
 * Only biallelic SNPs, biallelic indels and VNTRs get their region fields
 * initialized; for any other combination only h, v, rid, pos1 and vtype
 * are set after clear().
 *
 * @h     - VCF header.
 * @v     - VCF record.
 * @vtype - variant type of the record (VT_SNP, VT_INDEL, VT_VNTR, ...).
 */
GenotypingRecord::GenotypingRecord(bcf_hdr_t *h, bcf1_t *v, int32_t vtype)
{
    clear();

    this->h = h;
    this->v = v;
    rid = bcf_get_rid(v);
    pos1 = bcf_get_pos1(v);
    this->vtype = vtype;

    int32_t n_allele = bcf_get_n_allele(v);

    if (vtype==VT_SNP && n_allele==2)
    {
        beg1 = bcf_get_pos1(v);
        end1 = beg1;
    }
    else if (vtype==VT_INDEL && n_allele==2)
    {
        char** alleles = bcf_get_allele(v);

        //signed difference, positive for insertions
        dlen = static_cast<int32_t>(strlen(alleles[1])) - static_cast<int32_t>(strlen(alleles[0]));
        len = abs(dlen);

        //exact flanks, two values are required; otherwise fall back to
        //the positions adjacent to the record
        int32_t *flanks = NULL;
        int32_t n = 0;
        if (bcf_get_info_int32(h, v, "FLANKS", &flanks, &n)>=2)
        {
            lend1 = flanks[0];
            rbeg1 = flanks[1];
        }
        else
        {
            lend1 = bcf_get_pos1(v) - 1;
            rbeg1 = bcf_get_end_pos1(v) + 1;
        }
        //released on every path, free(NULL) is a no-op
        free(flanks);

        //fuzzy flanks, same rules as above
        int32_t *fuzzy_flanks = NULL;
        n = 0;
        if (bcf_get_info_int32(h, v, "FZ_FLANKS", &fuzzy_flanks, &n)>=2)
        {
            fuzzy_lend1 = fuzzy_flanks[0];
            fuzzy_rbeg1 = fuzzy_flanks[1];
        }
        else
        {
            fuzzy_lend1 = bcf_get_pos1(v) - 1;
            fuzzy_rbeg1 = bcf_get_end_pos1(v) + 1;
        }
        free(fuzzy_flanks);

        //region covers both pairs of flanks plus 2 bases on each side
        beg1 = std::min(lend1-2, fuzzy_lend1-2);
        end1 = std::max(rbeg1+2, fuzzy_rbeg1+2);

        //alleles are not constructed from the reference sequence here

        //indel sequence is the longer allele without its first base
        if (dlen>0)
        {
            indel.append(&alleles[1][1]);
        }
        else
        {
            indel.append(&alleles[0][1]);
        }
    }
    else if (vtype==VT_VNTR)
    {
        beg1 = bcf_get_pos1(v) - 1;
        end1 = bcf_get_end_pos1(v) + 1;

        char *motif = NULL;
        int32_t n = 0;
        if (bcf_get_info_string(h, v, "MOTIF", &motif, &n)>0)
        {
            this->motif.assign(motif);
        }
        //released on every path, free(NULL) is a no-op
        free(motif);
    }
}