/** * Extract reference sequence region for motif discovery. * * The input is a VCF record that contains an indel. * * If the the indel has multiple alleles, it will examine all * alleles. * * todo: is might be a good idea to combine this step with motif detection * since there seems to be a need to have an iterative process here * to ensure a good candidate motif is chosen. * */ void CandidateRegionExtractor::extract_regions_by_exact_alignment(bcf_hdr_t* h, bcf1_t* v, Variant& variant) { if (debug) { if (debug) std::cerr << "********************************************\n"; std::cerr << "EXTRACTIING REGION BY EXACT LEFT AND RIGHT ALIGNMENT\n\n"; } VNTR& vntr = variant.vntr; const char* chrom = bcf_get_chrom(h, v); int32_t min_beg1 = bcf_get_pos1(v); int32_t max_end1 = min_beg1; if (debug) { bcf_print_liten(h, v); } //merge candidate search region for (size_t i=1; i<bcf_get_n_allele(v); ++i) { std::string ref(bcf_get_alt(v, 0)); std::string alt(bcf_get_alt(v, i)); int32_t pos1 = bcf_get_pos1(v); //this prevents introduction of flanks that do not harbour the repeat unit trim(pos1, ref, alt); int32_t end1 = pos1 + ref.size() - 1; right_align(chrom, end1, ref, alt); int32_t beg1 = end1 - ref.size() + 1; left_align(chrom, beg1, ref, alt); min_beg1 = beg1<min_beg1 ? beg1 : min_beg1; max_end1 = end1>max_end1 ? 
end1 : max_end1; int32_t seq_len; char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len); if (debug) { std::cerr << "EXACT REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") from " << pos1 << ":" << ref << ":" << alt << "\n"; std::cerr << " " << seq << "\n"; } if (seq_len) free(seq); } int32_t seq_len; char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len); if (debug) { std::cerr << "FINAL EXACT REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") " << "\n"; std::cerr << " " << seq << "\n"; } vntr.exact_repeat_tract = seq; vntr.rid = bcf_get_rid(v); vntr.exact_rbeg1 = min_beg1; vntr.exact_rend1 = max_end1; if (seq_len) free(seq); }
/**
 * Constructor.
 *
 * Classifies the record and then fills in the 1-based span [beg1, end1]:
 *   - SNP                      : both ends are the record position
 *   - indel, complex, VNTR, SV : beg1 is the record position, end1 is the
 *                                END INFO value or, when that is 0, the
 *                                value of bcf_get_end1()
 * VNTR records additionally have their VNTR fields read from INFO and are
 * appended to vs and vntr_vs.
 *
 * Any other type is reported on stderr and terminates the program.
 *
 * @h - VCF header
 * @v - VCF record
 */
Variant::Variant(bcf_hdr_t* h, bcf1_t* v)
{
    this->h = h;
    this->v = v;

    type = classify(h, v);

    chrom = bcf_get_chrom(h, v);
    rid = bcf_get_rid(v);
    pos1 = bcf_get_pos1(v);

    no_overlapping_snps = 0;
    no_overlapping_indels = 0;
    no_overlapping_vntrs = 0;

    is_new_multiallelic = false;

    //the categories are tested in a fixed order and are mutually exclusive,
    //a later category only applies when every earlier one was rejected
    const bool is_snp     = (type==VT_SNP);
    const bool is_indel   = !is_snp && (type==VT_INDEL);
    const bool is_complex = !is_snp && !is_indel && ((type & (VT_SNP|VT_MNP|VT_INDEL|VT_CLUMPED))!=0);
    const bool is_vntr    = !is_snp && !is_indel && !is_complex && (type==VT_VNTR);
    const bool is_sv      = !is_snp && !is_indel && !is_complex && !is_vntr && (type==VT_SV);

    if (!(is_snp || is_indel || is_complex || is_vntr || is_sv))
    {
        std::cerr << "unexpected type in variant construction\n";
        print();
        exit(1);
    }

    //attempts to update relevant information on variants
    beg1 = bcf_get_pos1(v);

    if (is_snp)
    {
        end1 = bcf_get_pos1(v);
        return;
    }

    //annotate ends
    end1 = bcf_get_info_int(h, v, "END", 0);
    if (!end1)
    {
        end1 = bcf_get_end1(v);
    }

    if (is_vntr)
    {
        update_vntr_from_info_fields(h, v);

        vs.push_back(v);
        vntr_vs.push_back(v);
    }
}
/** * Updates VNTR related information from INFO fields. */ void Variant::update_vntr_from_info_fields() { vntr.motif = bcf_get_rid(v); char** allele = bcf_get_allele(v); // vntr.exact_repeat_tract.assign(allele[0]); // std::string tags[16] = {"MOTIF", "RU", "BASIS", "MLEN", "BLEN", "REPEAT_TRACT", "COMP", "ENTROPY", "ENTROPY2", "KL_DIVERGENCE", "KL_DIVERGENCE2", "RL", "LL", "RU_COUNTS", "SCORE", "TRF_SCORE"}; vntr.motif = bcf_get_info_str(h, v, "MOTIF"); vntr.ru = bcf_get_info_str(h, v, "RU"); vntr.basis = bcf_get_info_str(h, v, "BASIS"); if (vntr.basis=="") vntr.basis = VNTR::get_basis(vntr.motif); vntr.mlen = vntr.motif.size(); vntr.blen = (int32_t) vntr.basis.size(); std::vector<int32_t> i_vec = bcf_get_info_int_vec(h, v, "REPEAT_TRACT", 2, 0); vntr.beg1 = i_vec[0]; vntr.end1 = i_vec[1]; i_vec = bcf_get_info_int_vec(h, v, "COMP", 4, 0); vntr.comp[0] = i_vec[0]; vntr.comp[1] = i_vec[1]; vntr.comp[2] = i_vec[2]; vntr.comp[3] = i_vec[3]; vntr.entropy = bcf_get_info_flt(h, v, "ENTROPY"); vntr.entropy2 = bcf_get_info_flt(h, v, "ENTROPY2"); vntr.kl_divergence = bcf_get_info_flt(h, v, "KL_DIVERGENCE"); vntr.kl_divergence2 = bcf_get_info_flt(h, v, "KL_DIVERGENCE2"); vntr.rl = bcf_get_info_int(h, v, "RL"); vntr.ll = bcf_get_info_int(h, v, "LL"); i_vec = bcf_get_info_int_vec(h, v, "RU_COUNTS", 2, 0); vntr.no_perfect_ru = i_vec[0]; vntr.no_ru = i_vec[1]; vntr.score = bcf_get_info_flt(h, v, "SCORE"); vntr.trf_score = bcf_get_info_int(h, v, "TRF_SCORE"); vntr.exact_motif = bcf_get_info_str(h, v, "EX_MOTIF"); vntr.exact_ru = bcf_get_info_str(h, v, "EX_RU"); vntr.exact_basis = bcf_get_info_str(h, v, "EX_BASIS"); vntr.exact_mlen = (int32_t) vntr.exact_motif.size(); vntr.exact_blen = (int32_t) vntr.exact_basis.size(); i_vec = bcf_get_info_int_vec(h, v, "EX_REPEAT_TRACT", 2, 0); vntr.exact_beg1 = i_vec[0]; vntr.exact_end1 = i_vec[1]; i_vec = bcf_get_info_int_vec(h, v, "EX_COMP", 4, 0); vntr.exact_comp[0] = i_vec[0]; vntr.exact_comp[1] = i_vec[1]; vntr.exact_comp[2] = i_vec[2]; 
vntr.exact_comp[3] = i_vec[3]; vntr.exact_entropy = bcf_get_info_flt(h, v, "EX_ENTROPY"); vntr.exact_entropy2 = bcf_get_info_flt(h, v, "EX_ENTROPY2"); vntr.exact_kl_divergence = bcf_get_info_flt(h, v, "EX_KL_DIVERGENCE"); vntr.exact_kl_divergence2 = bcf_get_info_flt(h, v, "EX_KL_DIVERGENCE2"); vntr.exact_rl = bcf_get_info_int(h, v, "EX_RL"); vntr.exact_ll = bcf_get_info_int(h, v, "EX_LL"); i_vec = bcf_get_info_int_vec(h, v, "EX_RU_COUNTS", 2, 0); vntr.exact_no_perfect_ru = i_vec[0]; vntr.exact_no_ru = i_vec[1]; vntr.exact_score = bcf_get_info_flt(h, v, "EX_SCORE"); vntr.exact_trf_score = bcf_get_info_int(h, v, "EX_TRF_SCORE"); vntr.fuzzy_motif = bcf_get_info_str(h, v, "FZ_MOTIF"); vntr.fuzzy_ru = bcf_get_info_str(h, v, "FZ_RU"); vntr.fuzzy_basis = bcf_get_info_str(h, v, "FZ_BASIS"); vntr.fuzzy_mlen = (int32_t) vntr.fuzzy_motif.size(); vntr.fuzzy_blen = (int32_t) vntr.fuzzy_basis.size(); i_vec = bcf_get_info_int_vec(h, v, "FZ_REPEAT_TRACT", 2, 0); vntr.fuzzy_beg1 = i_vec[0]; vntr.fuzzy_end1 = i_vec[1]; i_vec = bcf_get_info_int_vec(h, v, "FZ_COMP", 4, 0); vntr.fuzzy_comp[0] = i_vec[0]; vntr.fuzzy_comp[1] = i_vec[1]; vntr.fuzzy_comp[2] = i_vec[2]; vntr.fuzzy_comp[3] = i_vec[3]; vntr.fuzzy_entropy = bcf_get_info_flt(h, v, "FZ_ENTROPY"); vntr.fuzzy_entropy2 = bcf_get_info_flt(h, v, "FZ_ENTROPY2"); vntr.fuzzy_kl_divergence = bcf_get_info_flt(h, v, "FZ_KL_DIVERGENCE"); vntr.fuzzy_kl_divergence2 = bcf_get_info_flt(h, v, "FZ_KL_DIVERGENCE2"); vntr.fuzzy_rl = bcf_get_info_int(h, v, "FZ_RL"); vntr.fuzzy_ll = bcf_get_info_int(h, v, "FZ_LL"); i_vec = bcf_get_info_int_vec(h, v, "FZ_RU_COUNTS", 2, 0); vntr.fuzzy_no_perfect_ru = i_vec[0]; vntr.fuzzy_no_ru = i_vec[1]; vntr.fuzzy_score = bcf_get_info_flt(h, v, "FZ_SCORE"); vntr.fuzzy_trf_score = bcf_get_info_int(h, v, "FZ_TRF_SCORE"); }
/** * Classifies variants. */ int32_t Variant::classify(bcf_hdr_t *h, bcf1_t *v) { clear(); this->h = h; this->v = v; bcf_unpack(v, BCF_UN_STR); chrom.assign(bcf_get_chrom(h, v)); rid = bcf_get_rid(v); pos1 = bcf_get_pos1(v); end1 = bcf_get_end1(v); char** allele = bcf_get_allele(v); int32_t n_allele = bcf_get_n_allele(v); int32_t pos0 = pos1-1; bool homogeneous_length = true; char* ref = allele[0]; int32_t rlen = strlen(ref); if (strchr(ref, 'N')) { contains_N = true; } //if only ref allele, skip this entire for loop for (size_t i=1; i<n_allele; ++i) { int32_t allele_type = VT_REF; //check for symbolic alternative alleles if (strchr(allele[i],'<')) { size_t len = strlen(allele[i]); if (len>=5) { //VN/d+ if (allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][len-1]=='>' ) { for (size_t j=3; j<len-1; ++j) { if (allele[i][j]<'0' || allele[i][j]>'9') { allele_type = VT_VNTR; } } } //VNTR else if (len==6 && allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][3]=='T' && allele[i][4]=='R' && allele[i][5]=='>' ) { allele_type = VT_VNTR; } //STR else if (len==5 && allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][3]=='R' && allele[i][4]=='>' ) { allele_type = VT_VNTR; } //ST/d+ else if (allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][len-1]=='>' ) { type = VT_VNTR; for (size_t j=3; j<len-1; ++j) { if ((allele[i][j]<'0' || allele[i][j]>'9') && allele[i][j]!='.') { type = VT_SV; } } } } if (allele_type==VT_VNTR) { allele_type = VT_VNTR; type |= allele_type; alleles.push_back(Allele(allele_type)); } else { allele_type = VT_SV; type |= allele_type; std::string sv_type(allele[i]); alleles.push_back(Allele(allele_type, sv_type)); } } //checks for chromosomal breakpoints else if (strchr(allele[i],'[')||strchr(allele[i],']')) { allele_type = VT_SV; type |= allele_type; std::string sv_type("<BND>"); alleles.push_back(Allele(allele_type, sv_type)); } //non variant record else if 
(allele[i][0]=='.' || strcmp(allele[i],allele[0])==0) { type = VT_REF; } //explicit sequence of bases else { kstring_t REF = {0,0,0}; kstring_t ALT = {0,0,0}; ref = allele[0]; char* alt = allele[i]; int32_t alen = strlen(alt); if (strchr(alt, 'N')) { contains_N = true; } if (rlen!=alen) { homogeneous_length = false; } //trimming //this is required in particular for the //characterization of multiallelics and //in general, any unnormalized variant int32_t rl = rlen; int32_t al = alen; //trim right while (rl!=1 && al!=1) { if (ref[rl-1]==alt[al-1]) { --rl; --al; } else { break; } } //trim left while (rl !=1 && al!=1) { if (ref[0]==alt[0]) { ++ref; ++alt; --rl; --al; } else { break; } } kputsn(ref, rl, &REF); kputsn(alt, al, &ALT); ref = REF.s; alt = ALT.s; int32_t mlen = std::min(rl, al); int32_t dlen = al-rl; int32_t diff = 0; int32_t ts = 0; int32_t tv = 0; if (mlen==1 && dlen) { char ls, le, ss; if (rl>al) { ls = ref[0]; le = ref[rl-1]; ss = alt[0]; } else { ls = alt[0]; le = alt[al-1]; ss = ref[0]; } if (ls!=ss && le!=ss) { ++diff; if ((ls=='G' && ss=='A') || (ls=='A' && ss=='G') || (ls=='C' && ss=='T') || (ls=='T' && ss=='C')) { ++ts; } else { ++tv; } } } else { for (int32_t j=0; j<mlen; ++j) { if (ref[j]!=alt[j]) { ++diff; if ((ref[j]=='G' && alt[j]=='A') || (ref[j]=='A' && alt[j]=='G') || (ref[j]=='C' && alt[j]=='T') || (ref[j]=='T' && alt[j]=='C')) { ++ts; } else { ++tv; } } } } //substitution variants if (mlen==diff) { allele_type |= mlen==1 ? 
VT_SNP : VT_MNP; } //indel variants if (dlen) { allele_type |= VT_INDEL; } //clumped SNPs and MNPs if (diff && diff < mlen) //internal gaps { allele_type |= VT_CLUMPED; } type |= allele_type; alleles.push_back(Allele(type, diff, alen, dlen, mlen, ts, tv)); ts += ts; tv += tv; ins = dlen>0?1:0; del = dlen<0?1:0; if (REF.m) free(REF.s); if (ALT.m) free(ALT.s); } } if (type==VT_VNTR) { update_vntr_from_info_fields(h, v); } //additionally define MNPs by length of all alleles if (!(type&(VT_VNTR|VT_SV)) && type!=VT_REF) { if (homogeneous_length && rlen>1 && n_allele>1) { type |= VT_MNP; } } return type; }
/** * Annotates VNTR characteristics. * @mode - */ void VNTRAnnotator::annotate(bcf_hdr_t* h, bcf1_t* v, Variant& variant, std::string mode) { VNTR& vntr = variant.vntr; //update chromosome and position variant.rid = bcf_get_rid(v); variant.pos1 = bcf_get_pos1(v); //this is for reannotating an VNTR record //this is more for the purpose of evaluation to //check if vt's algorithm is concordant with //VNTRs from other sources. if (variant.type==VT_VNTR) { if (debug) std::cerr << "ANNOTATING VNTR/STR \n"; //1. pick candidate region cre->pick_candidate_region(h, v, variant, REFERENCE); //2. detect candidate motifs from a reference seqeuence cmp->generate_candidate_motifs(h, v, variant); cmp->next_motif(h, v, variant); } //main purpose - annotation of Indels. else if (variant.type&VT_INDEL) { //the basic steps in annotating a TR // //1. extract a region that has a chance of containing the repeat units //2. choose a set of candidate motifs and pick motif //3. detect repeat region and evaluate //4. iterate 2 and 3 //EXACT MODE if (mode=="e") { if (debug) std::cerr << "============================================\n"; if (debug) std::cerr << "ANNOTATING INDEL EXACTLY\n"; //1. pick candidate region using exact left and right alignment cre->pick_candidate_region(h, v, variant, EXACT_LEFT_RIGHT_ALIGNMENT); //2. evaluate reference length fd->detect_flanks(h, v, variant, CLIP_ENDS); if (debug) std::cerr << "============================================\n"; return; } //FUZZY DETECTION else if (mode=="f") { if (debug) std::cerr << "============================================\n"; if (debug) std::cerr << "ANNOTATING INDEL FUZZILY\n"; //1. selects candidate region by fuzzy left and right alignment cre->pick_candidate_region(h, v, variant, EXACT_LEFT_RIGHT_ALIGNMENT); //2. detect candidate motifs from a reference sequence cmp->generate_candidate_motifs(h, v, variant); if (!cmp->next_motif(h, v, variant)) { std::cerr << "oops, no candidate motif for next step\n"; } //3. 
evaluate reference length fd->detect_flanks(h, v, variant, FRAHMM); //introduce reiteration based on concordance and exact concordance. if (debug) std::cerr << "============================================\n"; return; } } }
/** * Gets records for the most recent position and fills up the buffer from file i. * returns true if buffer is filled or it is not necessary to fill buffer. * returns false if no more records are found to fill buffer */ void BCFSyncedReader::fill_buffer(int32_t i) { if (buffer[i].size()>=2) return; if (random_access) { int32_t pos1 = buffer[i].size()==0 ? 0 : bcf_get_pos1(buffer[i].front()); if (ftypes[i].format==bcf) { bcf1_t *v = get_bcf1_from_pool(); bool populated = false; while (itrs[i] && bcf_itr_next(files[i], itrs[i], v)>=0) { populated = true; bcf_unpack(v, BCF_UN_STR); //check to ensure order if (!buffer[i].empty()) { if (!bcf_is_in_order(buffer[i].back(), v)) { fprintf(stderr, "[E:%s:%d %s] VCF file not in order: %s\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str()); exit(1); } } buffer[i].push_back(v); insert_into_pq(i, v); if (pos1==0) { pos1 = bcf_get_pos1(v); } if (bcf_get_pos1(v)!=pos1) { break; } v = get_bcf1_from_pool(); populated = false; } if (!populated) store_bcf1_into_pool(v); } else if (ftypes[i].format==vcf) { while (itrs[i] && tbx_itr_next(files[i], tbxs[i], itrs[i], &s)>=0) { bcf1_t *v = get_bcf1_from_pool(); vcf_parse(&s, hdrs[i], v); bcf_unpack(v, BCF_UN_STR); //check to ensure order if (!buffer[i].empty()) { if (!bcf_is_in_order(buffer[i].back(), v)) { fprintf(stderr, "[E:%s:%d %s] VCF file not in order: %s\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str()); exit(1); } } buffer[i].push_back(v); insert_into_pq(i, v); if (pos1==0) { pos1 = bcf_get_pos1(v); } if (bcf_get_pos1(v)!=pos1) { break; } } } } else { int32_t rid = buffer[i].size()==0 ? -1 : bcf_get_rid(buffer[i].front()); int32_t pos1 = buffer[i].size()==0 ? 
0 : bcf_get_pos1(buffer[i].front()); bcf1_t *v = get_bcf1_from_pool(); bool populated = false; while (bcf_read(files[i], hdrs[i], v)>=0) { populated = true; bcf_unpack(v, BCF_UN_STR); //check to ensure order if (!buffer[i].empty()) { if (!bcf_is_in_order(buffer[i].back(), v)) { fprintf(stderr, "[E:%s:%d %s] VCF file not in order: %s\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str()); exit(1); } } buffer[i].push_back(v); insert_into_pq(i, v); if (rid==-1) { rid = bcf_get_rid(v); pos1 = bcf_get_pos1(v); } if (bcf_get_rid(v)!=rid || bcf_get_pos1(v)!=pos1) { break; } v = get_bcf1_from_pool(); populated = false; } if (!populated) store_bcf1_into_pool(v); } }
/** * Inserts a record into pq. */ void BCFSyncedReader::insert_into_pq(int32_t i, bcf1_t *v) { pq.push(new bcfptr(i, bcf_get_rid(v), bcf_get_pos1(v), hdrs[i], v, sync_by_pos)); }
/**
 * Constructor.
 *
 * Resets the record via clear() and stores the header, record, chromosome
 * id, position and variant type. Further fields depend on the type:
 *   - biallelic SNP   : beg1 and end1 are the record position
 *   - biallelic indel : dlen, len, the exact and fuzzy flanks, a span
 *                       padded by 2 bases around the outermost flanks and
 *                       the indel sequence without its first base
 *   - VNTR            : a span padded by 1 base and the MOTIF INFO value
 * Any other combination leaves the remaining fields as set by clear().
 *
 * @h     - VCF header
 * @v     - VCF record
 * @vtype - variant type
 */
GenotypingRecord::GenotypingRecord(bcf_hdr_t *h, bcf1_t *v, int32_t vtype)
{
    clear();

    this->h = h;
    this->v = v;
    rid = bcf_get_rid(v);
    pos1 = bcf_get_pos1(v);
    this->vtype = vtype;

    int32_t n_allele = bcf_get_n_allele(v);

    if (vtype==VT_SNP && n_allele==2)
    {
        beg1 = bcf_get_pos1(v);
        end1 = beg1;
    }
    else if (vtype==VT_INDEL && n_allele==2)
    {
        char** alleles = bcf_get_allele(v);

        //convert before subtracting, size_t arithmetic wraps around for deletions
        dlen = (int32_t) strlen(alleles[1]) - (int32_t) strlen(alleles[0]);
        len = abs(dlen);

        //exact flanks, both values are required before either is read
        int32_t *flanks = NULL;
        int32_t n = 0;
        if (bcf_get_info_int32(h, v, "FLANKS", &flanks, &n)>=2)
        {
            lend1 = flanks[0];
            rbeg1 = flanks[1];
        }
        else
        {
            lend1 = bcf_get_pos1(v) - 1;
            rbeg1 = bcf_get_end_pos1(v) + 1;
        }
        //release the buffer on every path, free(NULL) is a no-op
        free(flanks);

        //fuzzy flanks, same rules as for the exact flanks
        int32_t *fuzzy_flanks = NULL;
        n = 0;
        if (bcf_get_info_int32(h, v, "FZ_FLANKS", &fuzzy_flanks, &n)>=2)
        {
            fuzzy_lend1 = fuzzy_flanks[0];
            fuzzy_rbeg1 = fuzzy_flanks[1];
        }
        else
        {
            fuzzy_lend1 = bcf_get_pos1(v) - 1;
            fuzzy_rbeg1 = bcf_get_end_pos1(v) + 1;
        }
        free(fuzzy_flanks);

        beg1 = std::min(lend1-2, fuzzy_lend1-2);
        end1 = std::max(rbeg1+2, fuzzy_rbeg1+2);

        //todo: construct alleles from the reference sequence between the flanks

        //the inserted or deleted bases are the longer allele without its first base
        if (dlen>0)
        {
            indel.append(&alleles[1][1]);
        }
        else
        {
            indel.append(&alleles[0][1]);
        }
    }
    else if (vtype==VT_VNTR)
    {
        beg1 = bcf_get_pos1(v) - 1;
        end1 = bcf_get_end_pos1(v) + 1;

        char *motif = NULL;
        int32_t n = 0;
        if (bcf_get_info_string(h, v, "MOTIF", &motif, &n)>0)
        {
            this->motif.assign(motif);
        }
        //release the buffer on every path, free(NULL) is a no-op
        free(motif);
    }
}