/**
 * Gets a sorted string representation of a variant.
 *
 * Writes "<chrom>:<pos1>:<ref>,<alt>[,<alt>...]" into var, replacing any
 * previous contents. The reference allele always comes first; the
 * alternative alleles follow in the order imposed by cmpstr.
 *
 * @h   - header used to look up the chromosome name
 * @v   - record to describe; unpacked up to BCF_UN_STR by this call
 * @var - output string
 */
void bcf_variant2string_sorted(bcf_hdr_t *h, bcf1_t *v, kstring_t *var)
{
    // NOTE(review): this prints the record on every call; it looks like
    // leftover debugging output - confirm before removing.
    bcf_print_liten(h,v);

    bcf_unpack(v, BCF_UN_STR);
    var->l = 0;
    kputs(bcf_get_chrom(h, v), var);
    kputc(':', var);
    kputw(bcf_get_pos1(v), var);
    kputc(':', var);

    const int32_t n_alt = bcf_get_n_allele(v) - 1;

    kputs(bcf_get_alt(v, 0), var);

    if (n_alt == 1)
    {
        // a single alternative allele needs no sorting
        kputc(',', var);
        kputs(bcf_get_alt(v, 1), var);
    }
    else if (n_alt > 1)
    {
        char** allele = bcf_get_allele(v);

        // The scratch array holds exactly the n_alt alternative alleles, so
        // it is filled from index 0 and sorted with a count of n_alt.
        // Indexing it by the allele index (1..n_alt) would write one slot
        // past the end and leave slot 0 uninitialised.
        char** sorted = static_cast<char**>(malloc(n_alt*sizeof(char*)));
        for (int32_t i=0; i<n_alt; ++i)
        {
            sorted[i] = allele[i+1];
        }

        std::qsort(sorted, n_alt, sizeof(char*), cmpstr);

        for (int32_t i=0; i<n_alt; ++i)
        {
            kputc(',', var);
            kputs(sorted[i], var);
        }

        free(sorted);
    }
}
/** * Extract reference sequence region for motif discovery in a fuzzy fashion. */ void CandidateRegionExtractor::extract_regions_by_fuzzy_alignment(bcf_hdr_t* h, bcf1_t* v, Variant& variant) { if (debug) { if (debug) std::cerr << "********************************************\n"; std::cerr << "EXTRACTIING REGION BY FUZZY ALIGNMENT\n\n"; } VNTR& vntr = variant.vntr; const char* chrom = bcf_get_chrom(h, v); int32_t min_beg1 = bcf_get_pos1(v); int32_t max_end1 = min_beg1; //merge candidate search region for (size_t i=1; i<bcf_get_n_allele(v); ++i) { std::string ref(bcf_get_alt(v, 0)); std::string alt(bcf_get_alt(v, i)); int32_t pos1 = bcf_get_pos1(v); trim(pos1, ref, alt); if (debug) { std::cerr << "indel fragment : " << (ref.size()<alt.size()? alt : ref) << "\n"; std::cerr << " : " << ref << ":" << alt << "\n"; } min_beg1 = fuzzy_left_align(chrom, pos1, ref, alt, 3); max_end1 = fuzzy_right_align(chrom, pos1 + ref.size() - 1, ref, alt, 3); int32_t seq_len; char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len); if (debug) { std::cerr << "FUZZY REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") " << "\n"; std::cerr << " " << seq << "\n"; } if (seq_len) free(seq); } int32_t seq_len; char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len); if (debug) { std::cerr << "FINAL FUZZY REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") " << "\n"; std::cerr << " " << seq << "\n"; } vntr.exact_repeat_tract = seq; vntr.exact_rbeg1 = min_beg1; if (seq_len) free(seq); }
/**
 * Gets a string representation of a variant.
 *
 * Writes "<chrom>:<pos1>:<allele>[,<allele>...]" into var, replacing any
 * previous contents. Alleles appear in record order.
 *
 * @h   - header used to look up the chromosome name
 * @v   - record to describe; unpacked up to BCF_UN_STR by this call
 * @var - output string
 */
void bcf_variant2string(bcf_hdr_t *h, bcf1_t *v, kstring_t *var)
{
    bcf_unpack(v, BCF_UN_STR);

    // start from an empty string, then emit the location prefix
    var->l = 0;
    kputs(bcf_get_chrom(h, v), var);
    kputc(':', var);
    kputw(bcf_get_pos1(v), var);
    kputc(':', var);

    // comma separated allele list
    int32_t idx = 0;
    while (idx < v->n_allele)
    {
        if (idx > 0)
        {
            kputc(',', var);
        }
        kputs(bcf_get_alt(v, idx), var);
        ++idx;
    }
}
/** * Gets a string representation of the variant. */ std::string Variant::get_variant_string() { kstring_t var = {0,0,0}; bcf_unpack(v, BCF_UN_STR); var.l = 0; kputs(bcf_get_chrom(h, v), &var); kputc(':', &var); kputw(bcf_get_pos1(v), &var); kputc(':', &var); for (size_t i=0; i<bcf_get_n_allele(v); ++i) { if (i) kputc('/', &var); kputs(bcf_get_alt(v, i), &var); } std::string str(var.s); if (var.m) free(var.s); return str; }
/** * Classifies variants. */ int32_t VariantManip::classify_variant(bcf_hdr_t *h, bcf1_t *v, Variant& var) { bcf_unpack(v, BCF_UN_STR); const char* chrom = bcf_get_chrom(h, v); uint32_t pos1 = bcf_get_pos1(v); char** allele = bcf_get_allele(v); int32_t n_allele = bcf_get_n_allele(v); int32_t pos0 = pos1-1; var.ts = 0; var.tv = 0; var.ins = 0; var.del = 0; var.clear(); // this sets the type to VT_REF by default. bool homogeneous_length = true; char* ref = allele[0]; int32_t rlen = strlen(ref); //if only ref allele, skip this entire for loop for (size_t i=1; i<n_allele; ++i) { int32_t type = VT_REF; //check for tags if (strchr(allele[i],'<')) { size_t len = strlen(allele[i]); if (len>=5) { //VN/d+ if (allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][len-1]=='>' ) { for (size_t j=3; j<len-1; ++j) { if (allele[i][j]<'0' || allele[i][j]>'9') { type = VT_VNTR; } } } //VNTR else if (len==6 && allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][3]=='T' && allele[i][4]=='R' && allele[i][5]=='>' ) { type = VT_VNTR; } //ST/d+ else if (allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][len-1]=='>' ) { for (size_t j=3; j<len-1; ++j) { if (allele[i][j]<'0' || allele[i][j]>'9') { type = VT_VNTR; } } } //STR else if (len==5 && allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][3]=='R' && allele[i][4]=='>' ) { type = VT_VNTR; } } if (type==VT_VNTR) { type = VT_VNTR; var.type |= type; var.alleles.push_back(Allele(type)); } else { type = VT_SV; var.type |= type; std::string sv_type(allele[i]); var.alleles.push_back(Allele(type, sv_type)); } } else if (allele[i][0]=='.' 
|| strcmp(allele[i],allele[0])==0) { type = VT_REF; } else { kstring_t REF = {0,0,0}; kstring_t ALT = {0,0,0}; ref = allele[0]; char* alt = allele[i]; int32_t alen = strlen(alt); if (rlen!=alen) { homogeneous_length = false; } //trimming //this is required in particular for the //characterization of multiallelics and //in general, any unnormalized variant int32_t rl = rlen; int32_t al = alen; //trim right while (rl!=1 && al!=1) { if (ref[rl-1]==alt[al-1]) { --rl; --al; } else { break; } } //trim left while (rl !=1 && al!=1) { if (ref[0]==alt[0]) { ++ref; ++alt; --rl; --al; } else { break; } } kputsn(ref, rl, &REF); kputsn(alt, al, &ALT); ref = REF.s; alt = ALT.s; int32_t mlen = std::min(rl, al); int32_t dlen = al-rl; int32_t diff = 0; int32_t ts = 0; int32_t tv = 0; if (mlen==1 && dlen) { char ls, le, ss; if (rl>al) { ls = ref[0]; le = ref[rl-1]; ss = alt[0]; } else { ls = alt[0]; le = alt[al-1]; ss = ref[0]; } if (ls!=ss && le!=ss) { ++diff; if ((ls=='G' && ss=='A') || (ls=='A' && ss=='G') || (ls=='C' && ss=='T') || (ls=='T' && ss=='C')) { ++ts; } else { ++tv; } } } else { for (int32_t j=0; j<mlen; ++j) { if (ref[j]!=alt[j]) { ++diff; if ((ref[j]=='G' && alt[j]=='A') || (ref[j]=='A' && alt[j]=='G') || (ref[j]=='C' && alt[j]=='T') || (ref[j]=='T' && alt[j]=='C')) { ++ts; } else { ++tv; } } } } //substitution variants if (mlen==diff) { type |= mlen==1 ? VT_SNP : VT_MNP; } //indel variants if (dlen) { type |= VT_INDEL; } //clumped SNPs and MNPs if (diff && diff < mlen) //internal gaps { type |= VT_CLUMPED; } var.type |= type; var.alleles.push_back(Allele(type, diff, alen, dlen, mlen, ts, tv)); var.ts += ts; var.tv += tv; var.ins = dlen>0?1:0; var.del = dlen<0?1:0; if (REF.m) free(REF.s); if (ALT.m) free(ALT.s); } } if (var.type==VT_VNTR) { bcf_unpack(v, BCF_UN_INFO); //populate motif, motif len etc. etc. 
// char* str = NULL; // int32_t n = 0; // int32_t ret = bcf_get_info_string(h, v, "MOTIF", &str, &n); // if (ret>0) // { // var.motif = std::string(str); // var.mlen = var.motif.size(); // } // ret = bcf_get_info_string(h, v, "RU", &str, &n); // if (ret>0) // { // var.ru = std::string(str); // var.mlen = var.ru.size(); // } // if (n) free(str); // // int32_t* no = NULL; // n = 0; // ret = bcf_get_info_int32(h, v, "RL", &no, &n); // if (ret>0) var.rlen = *no; // if (n) free(no); // // int32_t* fl = NULL; // n = 0; // ret = bcf_get_info_int32(h, v, "REF", &fl, &n); // if (ret>0) var.rcn = *fl; // if (n) free(fl); } //additionally define MNPs by length of all alleles if (!(var.type&(VT_VNTR|VT_SV)) && var.type!=VT_REF) { if (homogeneous_length && rlen>1 && n_allele>1) { var.type |= VT_MNP; } } return var.type; }
/**
 * Detects nearby STRs.
 *
 * Convenience overload: takes the chromosome name and 1-based position
 * from the record and forwards to the (chrom, pos1, variant) overload.
 *
 * @h       - header used to look up the chromosome name
 * @v       - record providing the location
 * @variant - passed through to the forwarded call
 *
 * @return the result of the forwarded call
 */
bool VariantManip::detect_str(bcf_hdr_t *h, bcf1_t *v, Variant& variant)
{
    const char* seqname = bcf_get_chrom(h, v);

    return detect_str(seqname, bcf_get_pos1(v), variant);
}
/** * Extract reference sequence region for motif discovery. * * The input is a VCF record that contains an indel. * * If the the indel has multiple alleles, it will examine all * alleles. * * todo: is might be a good idea to combine this step with motif detection * since there seems to be a need to have an iterative process here * to ensure a good candidate motif is chosen. * */ void CandidateRegionExtractor::extract_regions_by_exact_alignment(bcf_hdr_t* h, bcf1_t* v, Variant& variant) { if (debug) { if (debug) std::cerr << "********************************************\n"; std::cerr << "EXTRACTIING REGION BY EXACT LEFT AND RIGHT ALIGNMENT\n\n"; } VNTR& vntr = variant.vntr; const char* chrom = bcf_get_chrom(h, v); int32_t min_beg1 = bcf_get_pos1(v); int32_t max_end1 = min_beg1; if (debug) { bcf_print_liten(h, v); } //merge candidate search region for (size_t i=1; i<bcf_get_n_allele(v); ++i) { std::string ref(bcf_get_alt(v, 0)); std::string alt(bcf_get_alt(v, i)); int32_t pos1 = bcf_get_pos1(v); //this prevents introduction of flanks that do not harbour the repeat unit trim(pos1, ref, alt); int32_t end1 = pos1 + ref.size() - 1; right_align(chrom, end1, ref, alt); int32_t beg1 = end1 - ref.size() + 1; left_align(chrom, beg1, ref, alt); min_beg1 = beg1<min_beg1 ? beg1 : min_beg1; max_end1 = end1>max_end1 ? 
end1 : max_end1; int32_t seq_len; char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len); if (debug) { std::cerr << "EXACT REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") from " << pos1 << ":" << ref << ":" << alt << "\n"; std::cerr << " " << seq << "\n"; } if (seq_len) free(seq); } int32_t seq_len; char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len); if (debug) { std::cerr << "FINAL EXACT REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") " << "\n"; std::cerr << " " << seq << "\n"; } vntr.exact_repeat_tract = seq; vntr.rid = bcf_get_rid(v); vntr.exact_rbeg1 = min_beg1; vntr.exact_rend1 = max_end1; if (seq_len) free(seq); }
/**
 * Gets sequence name of a record.
 *
 * @i - index into hdrs selecting the header that belongs to the record
 * @v - record whose sequence name is wanted
 *
 * @return the chromosome name reported by bcf_get_chrom for that header
 *         and record
 */
const char* BCFSyncedStreamReader::get_seqname(int32_t i, bcf1_t *v)
{
    const char* seqname = bcf_get_chrom(hdrs[i], v);

    return seqname;
}
/**
 * Constructor.
 *
 * Classifies the record, stores its location and resets the overlap
 * counters, then derives the [beg1, end1] span:
 *  - pure SNPs span a single position;
 *  - indels, complex variants, VNTRs and SVs end at the value read for the
 *    "END" INFO field, or at bcf_get_end1(v) when that value is 0.
 * Pure VNTRs are additionally populated from their INFO fields and the
 * record is appended to vs and vntr_vs.
 *
 * Any other type is reported on std::cerr and terminates the program.
 *
 * @h - header of the record
 * @v - record this variant is built from
 */
Variant::Variant(bcf_hdr_t* h, bcf1_t* v)
{
    this->h = h;
    this->v = v;

    type = classify(h, v);

    chrom = bcf_get_chrom(h, v);
    rid = bcf_get_rid(v);
    pos1 = bcf_get_pos1(v);
    no_overlapping_snps = 0;
    no_overlapping_indels = 0;
    no_overlapping_vntrs = 0;
    is_new_multiallelic = false;

    //attempts to update relevant information on variants
    if (type==VT_SNP)
    {
        beg1 = bcf_get_pos1(v);
        end1 = bcf_get_pos1(v);
        return;
    }

    // precedence matters: a type is only treated as VNTR or SV when it did
    // not already qualify as an indel or complex variant
    const bool indel_or_complex = type==VT_INDEL || (type & (VT_SNP|VT_MNP|VT_INDEL|VT_CLUMPED))!=0;
    const bool pure_vntr = !indel_or_complex && type==VT_VNTR;
    const bool pure_sv = !indel_or_complex && !pure_vntr && type==VT_SV;

    if (!indel_or_complex && !pure_vntr && !pure_sv)
    {
        std::cerr << "unexpected type in variant construction\n";
        print();
        exit(1);
    }

    beg1 = bcf_get_pos1(v);
    end1 = bcf_get_info_int(h, v, "END", 0);
    if (!end1)
    {
        //annotate ends
        end1 = bcf_get_end1(v);
    }

    if (pure_vntr)
    {
        update_vntr_from_info_fields(h, v);

        vs.push_back(v);
        vntr_vs.push_back(v);
    }
}
/** * Classifies variants. */ int32_t Variant::classify(bcf_hdr_t *h, bcf1_t *v) { clear(); this->h = h; this->v = v; bcf_unpack(v, BCF_UN_STR); chrom.assign(bcf_get_chrom(h, v)); rid = bcf_get_rid(v); pos1 = bcf_get_pos1(v); end1 = bcf_get_end1(v); char** allele = bcf_get_allele(v); int32_t n_allele = bcf_get_n_allele(v); int32_t pos0 = pos1-1; bool homogeneous_length = true; char* ref = allele[0]; int32_t rlen = strlen(ref); if (strchr(ref, 'N')) { contains_N = true; } //if only ref allele, skip this entire for loop for (size_t i=1; i<n_allele; ++i) { int32_t allele_type = VT_REF; //check for symbolic alternative alleles if (strchr(allele[i],'<')) { size_t len = strlen(allele[i]); if (len>=5) { //VN/d+ if (allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][len-1]=='>' ) { for (size_t j=3; j<len-1; ++j) { if (allele[i][j]<'0' || allele[i][j]>'9') { allele_type = VT_VNTR; } } } //VNTR else if (len==6 && allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][3]=='T' && allele[i][4]=='R' && allele[i][5]=='>' ) { allele_type = VT_VNTR; } //STR else if (len==5 && allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][3]=='R' && allele[i][4]=='>' ) { allele_type = VT_VNTR; } //ST/d+ else if (allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][len-1]=='>' ) { type = VT_VNTR; for (size_t j=3; j<len-1; ++j) { if ((allele[i][j]<'0' || allele[i][j]>'9') && allele[i][j]!='.') { type = VT_SV; } } } } if (allele_type==VT_VNTR) { allele_type = VT_VNTR; type |= allele_type; alleles.push_back(Allele(allele_type)); } else { allele_type = VT_SV; type |= allele_type; std::string sv_type(allele[i]); alleles.push_back(Allele(allele_type, sv_type)); } } //checks for chromosomal breakpoints else if (strchr(allele[i],'[')||strchr(allele[i],']')) { allele_type = VT_SV; type |= allele_type; std::string sv_type("<BND>"); alleles.push_back(Allele(allele_type, sv_type)); } //non variant record else if 
(allele[i][0]=='.' || strcmp(allele[i],allele[0])==0) { type = VT_REF; } //explicit sequence of bases else { kstring_t REF = {0,0,0}; kstring_t ALT = {0,0,0}; ref = allele[0]; char* alt = allele[i]; int32_t alen = strlen(alt); if (strchr(alt, 'N')) { contains_N = true; } if (rlen!=alen) { homogeneous_length = false; } //trimming //this is required in particular for the //characterization of multiallelics and //in general, any unnormalized variant int32_t rl = rlen; int32_t al = alen; //trim right while (rl!=1 && al!=1) { if (ref[rl-1]==alt[al-1]) { --rl; --al; } else { break; } } //trim left while (rl !=1 && al!=1) { if (ref[0]==alt[0]) { ++ref; ++alt; --rl; --al; } else { break; } } kputsn(ref, rl, &REF); kputsn(alt, al, &ALT); ref = REF.s; alt = ALT.s; int32_t mlen = std::min(rl, al); int32_t dlen = al-rl; int32_t diff = 0; int32_t ts = 0; int32_t tv = 0; if (mlen==1 && dlen) { char ls, le, ss; if (rl>al) { ls = ref[0]; le = ref[rl-1]; ss = alt[0]; } else { ls = alt[0]; le = alt[al-1]; ss = ref[0]; } if (ls!=ss && le!=ss) { ++diff; if ((ls=='G' && ss=='A') || (ls=='A' && ss=='G') || (ls=='C' && ss=='T') || (ls=='T' && ss=='C')) { ++ts; } else { ++tv; } } } else { for (int32_t j=0; j<mlen; ++j) { if (ref[j]!=alt[j]) { ++diff; if ((ref[j]=='G' && alt[j]=='A') || (ref[j]=='A' && alt[j]=='G') || (ref[j]=='C' && alt[j]=='T') || (ref[j]=='T' && alt[j]=='C')) { ++ts; } else { ++tv; } } } } //substitution variants if (mlen==diff) { allele_type |= mlen==1 ? 
VT_SNP : VT_MNP; } //indel variants if (dlen) { allele_type |= VT_INDEL; } //clumped SNPs and MNPs if (diff && diff < mlen) //internal gaps { allele_type |= VT_CLUMPED; } type |= allele_type; alleles.push_back(Allele(type, diff, alen, dlen, mlen, ts, tv)); ts += ts; tv += tv; ins = dlen>0?1:0; del = dlen<0?1:0; if (REF.m) free(REF.s); if (ALT.m) free(ALT.s); } } if (type==VT_VNTR) { update_vntr_from_info_fields(h, v); } //additionally define MNPs by length of all alleles if (!(type&(VT_VNTR|VT_SV)) && type!=VT_REF) { if (homogeneous_length && rlen>1 && n_allele>1) { type |= VT_MNP; } } return type; }
/**
 * Gets sequence name of a record.
 *
 * @v - record whose sequence name is wanted
 *
 * @return the chromosome name reported by bcf_get_chrom for this reader's
 *         header (hdr) and the record
 */
const char* BCFOrderedReader::get_seqname(bcf1_t *v)
{
    const char* seqname = bcf_get_chrom(hdr, v);

    return seqname;
}