/** * Extract reference sequence region for motif discovery in a fuzzy fashion. */ void CandidateRegionExtractor::extract_regions_by_fuzzy_alignment(bcf_hdr_t* h, bcf1_t* v, Variant& variant) { if (debug) { if (debug) std::cerr << "********************************************\n"; std::cerr << "EXTRACTIING REGION BY FUZZY ALIGNMENT\n\n"; } VNTR& vntr = variant.vntr; const char* chrom = bcf_get_chrom(h, v); int32_t min_beg1 = bcf_get_pos1(v); int32_t max_end1 = min_beg1; //merge candidate search region for (size_t i=1; i<bcf_get_n_allele(v); ++i) { std::string ref(bcf_get_alt(v, 0)); std::string alt(bcf_get_alt(v, i)); int32_t pos1 = bcf_get_pos1(v); trim(pos1, ref, alt); if (debug) { std::cerr << "indel fragment : " << (ref.size()<alt.size()? alt : ref) << "\n"; std::cerr << " : " << ref << ":" << alt << "\n"; } min_beg1 = fuzzy_left_align(chrom, pos1, ref, alt, 3); max_end1 = fuzzy_right_align(chrom, pos1 + ref.size() - 1, ref, alt, 3); int32_t seq_len; char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len); if (debug) { std::cerr << "FUZZY REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") " << "\n"; std::cerr << " " << seq << "\n"; } if (seq_len) free(seq); } int32_t seq_len; char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len); if (debug) { std::cerr << "FINAL FUZZY REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") " << "\n"; std::cerr << " " << seq << "\n"; } vntr.exact_repeat_tract = seq; vntr.exact_rbeg1 = min_beg1; if (seq_len) free(seq); }
/** * Gets records for the most recent position and fills up the buffer from file i. * returns true if buffer is filled or it is not necessary to fill buffer. * returns false if no more records are found to fill buffer */ void BCFSyncedStreamReader::fill_buffer(int32_t i) { //not necessary to fill buffer if (buffer[i].size()>=2) return; int32_t pos1 = buffer[i].size()==0 ? 0 : bcf_get_pos1(buffer[i].front()); if (ftypes[i]==FT_BCF_GZ) { bcf1_t *v = get_bcf1_from_pool(); bool populated = false; while (itrs[i] && bcf_itr_next(vcfs[i], itrs[i], v) >= 0) { populated = true; bcf_unpack(v, BCF_UN_STR); buffer[i].push_back(v); insert_into_pq(i, v); if (pos1==0) { pos1 = bcf_get_pos1(v); } if (bcf_get_pos1(v)!=pos1) { break; } v = get_bcf1_from_pool(); populated = false; } if (!populated) store_bcf1_into_pool(v); } else if (ftypes[i]==FT_VCF_GZ) { while (itrs[i] && tbx_itr_next(vcfs[i], tbxs[i], itrs[i], &s) >= 0) { bcf1_t *v = get_bcf1_from_pool(); vcf_parse(&s, hdrs[i], v); bcf_unpack(v, BCF_UN_STR); buffer[i].push_back(v); insert_into_pq(i, v); if (pos1==0) { pos1 = bcf_get_pos1(v); } if (bcf_get_pos1(v)!=pos1) { break; } } } }
/** * Returns true if chrom:start1-end1 overlaps with a region in the file. */ bool OrderedBCFOverlapMatcher::overlaps_with(std::string& chrom, int32_t start1, int32_t end1) { bool overlaps = false; //moves to new chromosome if (current_interval.seq!=chrom) { buffer.clear(); current_interval.set(chrom); odr->jump_to_interval(current_interval); std::cerr << "Jumped to chromosome " << chrom << "\n"; while (odr->read(v)) { if (bcf_get_end_pos1(v)<start1) continue; overlaps = overlaps || (bcf_get_pos1(v)<=end1); buffer.push_back(v); if (bcf_get_pos1(v)>end1) break; v = bcf_init(); } } else { //scythe preceding bed records std::list<bcf1_t*>::iterator i = buffer.begin(); while (i!=buffer.end()) { if (bcf_get_end_pos1(*i)<start1) { bcf_destroy(*i); i = buffer.erase(i); continue; } overlaps = (bcf_get_pos1(*i)<=end1); break; } if (!overlaps) { if (buffer.empty()) { while (odr->read(v)) { if (bcf_get_end_pos1(*i)<start1) continue; overlaps = overlaps || (bcf_get_pos1(*i)<=end1); buffer.push_back(v); if (bcf_get_pos1(v)>end1) break; v = bcf_init(); } } } } return overlaps; };
/**
 * Gets a sorted string representation of a variant.
 *
 * The result has the form <chrom>:<pos1>:<ref>,<alt>[,<alt>...]. The
 * reference allele always comes first; when there are two or more alternate
 * alleles they are ordered with cmpstr.
 *
 * @h   - VCF header, used to resolve the chromosome name
 * @v   - VCF record, unpacked up to the allele strings by this call
 * @var - output string, any previous content is overwritten
 */
void bcf_variant2string_sorted(bcf_hdr_t *h, bcf1_t *v, kstring_t *var)
{
    bcf_unpack(v, BCF_UN_STR);
    var->l = 0;
    kputs(bcf_get_chrom(h, v), var);
    kputc(':', var);
    kputw(bcf_get_pos1(v), var);
    kputc(':', var);

    if (v->n_allele==2)
    {
        kputs(bcf_get_alt(v, 0), var);
        kputc(',', var);
        kputs(bcf_get_alt(v, 1), var);
    }
    else
    {
        char** allele = bcf_get_allele(v);
        const int32_t n_alt = bcf_get_n_allele(v)-1;

        kputs(bcf_get_alt(v, 0), var);

        if (n_alt>0)
        {
            //temp holds the alternate alleles only, so allele[i] goes to slot i-1
            char** temp = (char**) malloc(n_alt*sizeof(char*));
            for (int32_t i=1; i<=n_alt; ++i)
            {
                temp[i-1] = allele[i];
            }

            std::qsort(temp, n_alt, sizeof(char*), cmpstr);

            for (int32_t i=0; i<n_alt; ++i)
            {
                kputc(',', var);
                kputs(temp[i], var);
            }

            free(temp);
        }
    }
}
/**
 * Checks if a vntr is a homopolymer.
 *
 * TODO: the check is not implemented. No property of the record is
 * examined and false is returned for every input.
 *
 * @h - VCF header (unused)
 * @v - VCF record (unused)
 */
bool CandidateRegionExtractor::is_homopolymer(bcf_hdr_t* h, bcf1_t* v)
{
    //silence unused parameter warnings until the check is written
    (void) h;
    (void) v;

    return false;
}
/**
 * Writes a variant as <chrom>:<pos1>:<allele>[,<allele>...] into var.
 *
 * Alleles appear in record order, reference allele first.
 *
 * @h   - VCF header, used to resolve the chromosome name
 * @v   - VCF record, unpacked up to the allele strings by this call
 * @var - output string, any previous content is overwritten
 */
void bcf_variant2string(bcf_hdr_t *h, bcf1_t *v, kstring_t *var)
{
    bcf_unpack(v, BCF_UN_STR);

    //start from an empty string
    var->l = 0;

    //locus
    kputs(bcf_get_chrom(h, v), var);
    kputc(':', var);
    kputw(bcf_get_pos1(v), var);
    kputc(':', var);

    //comma separated alleles, no separator in front of the first one
    const int32_t allele_count = v->n_allele;
    int32_t k = 0;
    while (k<allele_count)
    {
        if (k!=0)
        {
            kputc(',', var);
        }
        kputs(bcf_get_alt(v, k), var);
        ++k;
    }
}
/** * Gets a string representation of the variant. */ std::string Variant::get_variant_string() { kstring_t var = {0,0,0}; bcf_unpack(v, BCF_UN_STR); var.l = 0; kputs(bcf_get_chrom(h, v), &var); kputc(':', &var); kputw(bcf_get_pos1(v), &var); kputc(':', &var); for (size_t i=0; i<bcf_get_n_allele(v); ++i) { if (i) kputc('/', &var); kputs(bcf_get_alt(v, i), &var); } std::string str(var.s); if (var.m) free(var.s); return str; }
/**
 * Pick candidate region.
 *
 * @h       - VCF header
 * @v       - VCF record
 * @variant - variant.vntr receives the candidate region
 * @mode    - REFERENCE                  use the reference field of the record
 *          - EXACT_LEFT_RIGHT_ALIGNMENT by exact alignment
 *          - FUZZY_LEFT_RIGHT_ALIGNMENT by fuzzy alignment
 *            any other value leaves variant untouched
 */
void CandidateRegionExtractor::pick_candidate_region(bcf_hdr_t* h, bcf1_t* v, Variant& variant, uint32_t mode)
{
    if (mode==REFERENCE)
    {
        VNTR& vntr = variant.vntr;
        vntr.exact_repeat_tract.assign(bcf_get_ref(v));
        vntr.exact_rbeg1 = bcf_get_pos1(v);

        //the end is a 1 based inclusive coordinate, not the length of the
        //reference allele: beg + len - 1
        char** alleles = bcf_get_allele(v);
        vntr.exact_rend1 = vntr.exact_rbeg1 + static_cast<int32_t>(strlen(alleles[0])) - 1;

        //in this mode the fuzzy region is the exact region
        vntr.fuzzy_rbeg1 = vntr.exact_rbeg1;
        vntr.fuzzy_rend1 = vntr.exact_rend1;
    }
    else if (mode==EXACT_LEFT_RIGHT_ALIGNMENT)
    {
        extract_regions_by_exact_alignment(h, v, variant);
    }
    else if (mode==FUZZY_LEFT_RIGHT_ALIGNMENT)
    {
        extract_regions_by_fuzzy_alignment(h, v, variant);
    }
}
/** * Extract reference sequence region for motif discovery. * * The input is a VCF record that contains an indel. * * If the the indel has multiple alleles, it will examine all * alleles. * * todo: is might be a good idea to combine this step with motif detection * since there seems to be a need to have an iterative process here * to ensure a good candidate motif is chosen. * */ void CandidateRegionExtractor::extract_regions_by_exact_alignment(bcf_hdr_t* h, bcf1_t* v, Variant& variant) { if (debug) { if (debug) std::cerr << "********************************************\n"; std::cerr << "EXTRACTIING REGION BY EXACT LEFT AND RIGHT ALIGNMENT\n\n"; } VNTR& vntr = variant.vntr; const char* chrom = bcf_get_chrom(h, v); int32_t min_beg1 = bcf_get_pos1(v); int32_t max_end1 = min_beg1; if (debug) { bcf_print_liten(h, v); } //merge candidate search region for (size_t i=1; i<bcf_get_n_allele(v); ++i) { std::string ref(bcf_get_alt(v, 0)); std::string alt(bcf_get_alt(v, i)); int32_t pos1 = bcf_get_pos1(v); //this prevents introduction of flanks that do not harbour the repeat unit trim(pos1, ref, alt); int32_t end1 = pos1 + ref.size() - 1; right_align(chrom, end1, ref, alt); int32_t beg1 = end1 - ref.size() + 1; left_align(chrom, beg1, ref, alt); min_beg1 = beg1<min_beg1 ? beg1 : min_beg1; max_end1 = end1>max_end1 ? 
end1 : max_end1; int32_t seq_len; char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len); if (debug) { std::cerr << "EXACT REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") from " << pos1 << ":" << ref << ":" << alt << "\n"; std::cerr << " " << seq << "\n"; } if (seq_len) free(seq); } int32_t seq_len; char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len); if (debug) { std::cerr << "FINAL EXACT REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") " << "\n"; std::cerr << " " << seq << "\n"; } vntr.exact_repeat_tract = seq; vntr.rid = bcf_get_rid(v); vntr.exact_rbeg1 = min_beg1; vntr.exact_rend1 = max_end1; if (seq_len) free(seq); }
/** * Gets records for the most recent position and fills up the buffer from file i. * returns true if buffer is filled or it is not necessary to fill buffer. * returns false if no more records are found to fill buffer */ void BCFSyncedReader::fill_buffer(int32_t i) { if (buffer[i].size()>=2) return; if (random_access) { int32_t pos1 = buffer[i].size()==0 ? 0 : bcf_get_pos1(buffer[i].front()); if (ftypes[i].format==bcf) { bcf1_t *v = get_bcf1_from_pool(); bool populated = false; while (itrs[i] && bcf_itr_next(files[i], itrs[i], v)>=0) { populated = true; bcf_unpack(v, BCF_UN_STR); //check to ensure order if (!buffer[i].empty()) { if (!bcf_is_in_order(buffer[i].back(), v)) { fprintf(stderr, "[E:%s:%d %s] VCF file not in order: %s\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str()); exit(1); } } buffer[i].push_back(v); insert_into_pq(i, v); if (pos1==0) { pos1 = bcf_get_pos1(v); } if (bcf_get_pos1(v)!=pos1) { break; } v = get_bcf1_from_pool(); populated = false; } if (!populated) store_bcf1_into_pool(v); } else if (ftypes[i].format==vcf) { while (itrs[i] && tbx_itr_next(files[i], tbxs[i], itrs[i], &s)>=0) { bcf1_t *v = get_bcf1_from_pool(); vcf_parse(&s, hdrs[i], v); bcf_unpack(v, BCF_UN_STR); //check to ensure order if (!buffer[i].empty()) { if (!bcf_is_in_order(buffer[i].back(), v)) { fprintf(stderr, "[E:%s:%d %s] VCF file not in order: %s\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str()); exit(1); } } buffer[i].push_back(v); insert_into_pq(i, v); if (pos1==0) { pos1 = bcf_get_pos1(v); } if (bcf_get_pos1(v)!=pos1) { break; } } } } else { int32_t rid = buffer[i].size()==0 ? -1 : bcf_get_rid(buffer[i].front()); int32_t pos1 = buffer[i].size()==0 ? 
0 : bcf_get_pos1(buffer[i].front()); bcf1_t *v = get_bcf1_from_pool(); bool populated = false; while (bcf_read(files[i], hdrs[i], v)>=0) { populated = true; bcf_unpack(v, BCF_UN_STR); //check to ensure order if (!buffer[i].empty()) { if (!bcf_is_in_order(buffer[i].back(), v)) { fprintf(stderr, "[E:%s:%d %s] VCF file not in order: %s\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str()); exit(1); } } buffer[i].push_back(v); insert_into_pq(i, v); if (rid==-1) { rid = bcf_get_rid(v); pos1 = bcf_get_pos1(v); } if (bcf_get_rid(v)!=rid || bcf_get_pos1(v)!=pos1) { break; } v = get_bcf1_from_pool(); populated = false; } if (!populated) store_bcf1_into_pool(v); } }
/**
 * Constructor.
 *
 * Classifies the record and records its contig, position and span.
 * For a pure SNP the span is the position itself. For indels, complex
 * variants, VNTRs and SVs the end is taken from the END INFO field, or from
 * bcf_get_end1() when that yields 0. A pure VNTR additionally has its VNTR
 * fields populated from the INFO fields and the record is appended to vs
 * and vntr_vs. Any other type is reported and the program exits with
 * status 1.
 *
 * @h - VCF header
 * @v - VCF record
 */
Variant::Variant(bcf_hdr_t* h, bcf1_t* v)
{
    this->h = h;
    this->v = v;

    type = classify(h, v);

    chrom = bcf_get_chrom(h, v);
    rid = bcf_get_rid(v);
    pos1 = bcf_get_pos1(v);

    no_overlapping_snps = 0;
    no_overlapping_indels = 0;
    no_overlapping_vntrs = 0;

    is_new_multiallelic = false;

    //the categories are tested in this order and the first match wins
    const bool pure_snp = (type==VT_SNP);
    const bool sequence_variant = !pure_snp && (type==VT_INDEL || (type & (VT_SNP|VT_MNP|VT_INDEL|VT_CLUMPED)));
    const bool pure_vntr = !pure_snp && !sequence_variant && type==VT_VNTR;
    const bool pure_sv = !pure_snp && !sequence_variant && !pure_vntr && type==VT_SV;

    if (!pure_snp && !sequence_variant && !pure_vntr && !pure_sv)
    {
        std::cerr << "unexpected type in variant construction\n";
        print();
        exit(1);
    }

    //attempts to update relevant information on variants
    beg1 = bcf_get_pos1(v);

    if (pure_snp)
    {
        end1 = bcf_get_pos1(v);
    }
    else
    {
        //annotate ends
        end1 = bcf_get_info_int(h, v, "END", 0);
        if (!end1)
        {
            end1 = bcf_get_end1(v);
        }
    }

    if (pure_vntr)
    {
        update_vntr_from_info_fields(h, v);

        vs.push_back(v);
        vntr_vs.push_back(v);
    }
}
/** * Classifies variants. */ int32_t Variant::classify(bcf_hdr_t *h, bcf1_t *v) { clear(); this->h = h; this->v = v; bcf_unpack(v, BCF_UN_STR); chrom.assign(bcf_get_chrom(h, v)); rid = bcf_get_rid(v); pos1 = bcf_get_pos1(v); end1 = bcf_get_end1(v); char** allele = bcf_get_allele(v); int32_t n_allele = bcf_get_n_allele(v); int32_t pos0 = pos1-1; bool homogeneous_length = true; char* ref = allele[0]; int32_t rlen = strlen(ref); if (strchr(ref, 'N')) { contains_N = true; } //if only ref allele, skip this entire for loop for (size_t i=1; i<n_allele; ++i) { int32_t allele_type = VT_REF; //check for symbolic alternative alleles if (strchr(allele[i],'<')) { size_t len = strlen(allele[i]); if (len>=5) { //VN/d+ if (allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][len-1]=='>' ) { for (size_t j=3; j<len-1; ++j) { if (allele[i][j]<'0' || allele[i][j]>'9') { allele_type = VT_VNTR; } } } //VNTR else if (len==6 && allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][3]=='T' && allele[i][4]=='R' && allele[i][5]=='>' ) { allele_type = VT_VNTR; } //STR else if (len==5 && allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][3]=='R' && allele[i][4]=='>' ) { allele_type = VT_VNTR; } //ST/d+ else if (allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][len-1]=='>' ) { type = VT_VNTR; for (size_t j=3; j<len-1; ++j) { if ((allele[i][j]<'0' || allele[i][j]>'9') && allele[i][j]!='.') { type = VT_SV; } } } } if (allele_type==VT_VNTR) { allele_type = VT_VNTR; type |= allele_type; alleles.push_back(Allele(allele_type)); } else { allele_type = VT_SV; type |= allele_type; std::string sv_type(allele[i]); alleles.push_back(Allele(allele_type, sv_type)); } } //checks for chromosomal breakpoints else if (strchr(allele[i],'[')||strchr(allele[i],']')) { allele_type = VT_SV; type |= allele_type; std::string sv_type("<BND>"); alleles.push_back(Allele(allele_type, sv_type)); } //non variant record else if 
(allele[i][0]=='.' || strcmp(allele[i],allele[0])==0) { type = VT_REF; } //explicit sequence of bases else { kstring_t REF = {0,0,0}; kstring_t ALT = {0,0,0}; ref = allele[0]; char* alt = allele[i]; int32_t alen = strlen(alt); if (strchr(alt, 'N')) { contains_N = true; } if (rlen!=alen) { homogeneous_length = false; } //trimming //this is required in particular for the //characterization of multiallelics and //in general, any unnormalized variant int32_t rl = rlen; int32_t al = alen; //trim right while (rl!=1 && al!=1) { if (ref[rl-1]==alt[al-1]) { --rl; --al; } else { break; } } //trim left while (rl !=1 && al!=1) { if (ref[0]==alt[0]) { ++ref; ++alt; --rl; --al; } else { break; } } kputsn(ref, rl, &REF); kputsn(alt, al, &ALT); ref = REF.s; alt = ALT.s; int32_t mlen = std::min(rl, al); int32_t dlen = al-rl; int32_t diff = 0; int32_t ts = 0; int32_t tv = 0; if (mlen==1 && dlen) { char ls, le, ss; if (rl>al) { ls = ref[0]; le = ref[rl-1]; ss = alt[0]; } else { ls = alt[0]; le = alt[al-1]; ss = ref[0]; } if (ls!=ss && le!=ss) { ++diff; if ((ls=='G' && ss=='A') || (ls=='A' && ss=='G') || (ls=='C' && ss=='T') || (ls=='T' && ss=='C')) { ++ts; } else { ++tv; } } } else { for (int32_t j=0; j<mlen; ++j) { if (ref[j]!=alt[j]) { ++diff; if ((ref[j]=='G' && alt[j]=='A') || (ref[j]=='A' && alt[j]=='G') || (ref[j]=='C' && alt[j]=='T') || (ref[j]=='T' && alt[j]=='C')) { ++ts; } else { ++tv; } } } } //substitution variants if (mlen==diff) { allele_type |= mlen==1 ? 
VT_SNP : VT_MNP; } //indel variants if (dlen) { allele_type |= VT_INDEL; } //clumped SNPs and MNPs if (diff && diff < mlen) //internal gaps { allele_type |= VT_CLUMPED; } type |= allele_type; alleles.push_back(Allele(type, diff, alen, dlen, mlen, ts, tv)); ts += ts; tv += tv; ins = dlen>0?1:0; del = dlen<0?1:0; if (REF.m) free(REF.s); if (ALT.m) free(ALT.s); } } if (type==VT_VNTR) { update_vntr_from_info_fields(h, v); } //additionally define MNPs by length of all alleles if (!(type&(VT_VNTR|VT_SV)) && type!=VT_REF) { if (homogeneous_length && rlen>1 && n_allele>1) { type |= VT_MNP; } } return type; }
/** * Annotates VNTR characteristics. * @mode - */ void VNTRAnnotator::annotate(bcf_hdr_t* h, bcf1_t* v, Variant& variant, std::string mode) { VNTR& vntr = variant.vntr; //update chromosome and position variant.rid = bcf_get_rid(v); variant.pos1 = bcf_get_pos1(v); //this is for reannotating an VNTR record //this is more for the purpose of evaluation to //check if vt's algorithm is concordant with //VNTRs from other sources. if (variant.type==VT_VNTR) { if (debug) std::cerr << "ANNOTATING VNTR/STR \n"; //1. pick candidate region cre->pick_candidate_region(h, v, variant, REFERENCE); //2. detect candidate motifs from a reference seqeuence cmp->generate_candidate_motifs(h, v, variant); cmp->next_motif(h, v, variant); } //main purpose - annotation of Indels. else if (variant.type&VT_INDEL) { //the basic steps in annotating a TR // //1. extract a region that has a chance of containing the repeat units //2. choose a set of candidate motifs and pick motif //3. detect repeat region and evaluate //4. iterate 2 and 3 //EXACT MODE if (mode=="e") { if (debug) std::cerr << "============================================\n"; if (debug) std::cerr << "ANNOTATING INDEL EXACTLY\n"; //1. pick candidate region using exact left and right alignment cre->pick_candidate_region(h, v, variant, EXACT_LEFT_RIGHT_ALIGNMENT); //2. evaluate reference length fd->detect_flanks(h, v, variant, CLIP_ENDS); if (debug) std::cerr << "============================================\n"; return; } //FUZZY DETECTION else if (mode=="f") { if (debug) std::cerr << "============================================\n"; if (debug) std::cerr << "ANNOTATING INDEL FUZZILY\n"; //1. selects candidate region by fuzzy left and right alignment cre->pick_candidate_region(h, v, variant, EXACT_LEFT_RIGHT_ALIGNMENT); //2. detect candidate motifs from a reference sequence cmp->generate_candidate_motifs(h, v, variant); if (!cmp->next_motif(h, v, variant)) { std::cerr << "oops, no candidate motif for next step\n"; } //3. 
evaluate reference length fd->detect_flanks(h, v, variant, FRAHMM); //introduce reiteration based on concordance and exact concordance. if (debug) std::cerr << "============================================\n"; return; } } }
/**
 * Detects near by STRs.
 *
 * Convenience overload: resolves the chromosome name and the 1 based
 * position of the record and forwards to the (chrom, pos1, variant) overload.
 *
 * @h       - VCF header
 * @v       - VCF record
 * @variant - passed through to the forwarded call
 *
 * @return whatever the forwarded call returns
 */
bool VariantManip::detect_str(bcf_hdr_t *h, bcf1_t *v, Variant& variant)
{
    const char* chrom = bcf_get_chrom(h, v);

    return detect_str(chrom, bcf_get_pos1(v), variant);
}
/** * Inserts a record into pq. */ void BCFSyncedStreamReader::insert_into_pq(int32_t i, bcf1_t *v) { pq.push(new bcfptr(i, bcf_get_pos1(v), hdrs[i], v, sync_by_pos)); }
/** * Classifies variants. */ int32_t VariantManip::classify_variant(bcf_hdr_t *h, bcf1_t *v, Variant& var) { bcf_unpack(v, BCF_UN_STR); const char* chrom = bcf_get_chrom(h, v); uint32_t pos1 = bcf_get_pos1(v); char** allele = bcf_get_allele(v); int32_t n_allele = bcf_get_n_allele(v); int32_t pos0 = pos1-1; var.ts = 0; var.tv = 0; var.ins = 0; var.del = 0; var.clear(); // this sets the type to VT_REF by default. bool homogeneous_length = true; char* ref = allele[0]; int32_t rlen = strlen(ref); //if only ref allele, skip this entire for loop for (size_t i=1; i<n_allele; ++i) { int32_t type = VT_REF; //check for tags if (strchr(allele[i],'<')) { size_t len = strlen(allele[i]); if (len>=5) { //VN/d+ if (allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][len-1]=='>' ) { for (size_t j=3; j<len-1; ++j) { if (allele[i][j]<'0' || allele[i][j]>'9') { type = VT_VNTR; } } } //VNTR else if (len==6 && allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][3]=='T' && allele[i][4]=='R' && allele[i][5]=='>' ) { type = VT_VNTR; } //ST/d+ else if (allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][len-1]=='>' ) { for (size_t j=3; j<len-1; ++j) { if (allele[i][j]<'0' || allele[i][j]>'9') { type = VT_VNTR; } } } //STR else if (len==5 && allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][3]=='R' && allele[i][4]=='>' ) { type = VT_VNTR; } } if (type==VT_VNTR) { type = VT_VNTR; var.type |= type; var.alleles.push_back(Allele(type)); } else { type = VT_SV; var.type |= type; std::string sv_type(allele[i]); var.alleles.push_back(Allele(type, sv_type)); } } else if (allele[i][0]=='.' 
|| strcmp(allele[i],allele[0])==0) { type = VT_REF; } else { kstring_t REF = {0,0,0}; kstring_t ALT = {0,0,0}; ref = allele[0]; char* alt = allele[i]; int32_t alen = strlen(alt); if (rlen!=alen) { homogeneous_length = false; } //trimming //this is required in particular for the //characterization of multiallelics and //in general, any unnormalized variant int32_t rl = rlen; int32_t al = alen; //trim right while (rl!=1 && al!=1) { if (ref[rl-1]==alt[al-1]) { --rl; --al; } else { break; } } //trim left while (rl !=1 && al!=1) { if (ref[0]==alt[0]) { ++ref; ++alt; --rl; --al; } else { break; } } kputsn(ref, rl, &REF); kputsn(alt, al, &ALT); ref = REF.s; alt = ALT.s; int32_t mlen = std::min(rl, al); int32_t dlen = al-rl; int32_t diff = 0; int32_t ts = 0; int32_t tv = 0; if (mlen==1 && dlen) { char ls, le, ss; if (rl>al) { ls = ref[0]; le = ref[rl-1]; ss = alt[0]; } else { ls = alt[0]; le = alt[al-1]; ss = ref[0]; } if (ls!=ss && le!=ss) { ++diff; if ((ls=='G' && ss=='A') || (ls=='A' && ss=='G') || (ls=='C' && ss=='T') || (ls=='T' && ss=='C')) { ++ts; } else { ++tv; } } } else { for (int32_t j=0; j<mlen; ++j) { if (ref[j]!=alt[j]) { ++diff; if ((ref[j]=='G' && alt[j]=='A') || (ref[j]=='A' && alt[j]=='G') || (ref[j]=='C' && alt[j]=='T') || (ref[j]=='T' && alt[j]=='C')) { ++ts; } else { ++tv; } } } } //substitution variants if (mlen==diff) { type |= mlen==1 ? VT_SNP : VT_MNP; } //indel variants if (dlen) { type |= VT_INDEL; } //clumped SNPs and MNPs if (diff && diff < mlen) //internal gaps { type |= VT_CLUMPED; } var.type |= type; var.alleles.push_back(Allele(type, diff, alen, dlen, mlen, ts, tv)); var.ts += ts; var.tv += tv; var.ins = dlen>0?1:0; var.del = dlen<0?1:0; if (REF.m) free(REF.s); if (ALT.m) free(ALT.s); } } if (var.type==VT_VNTR) { bcf_unpack(v, BCF_UN_INFO); //populate motif, motif len etc. etc. 
// char* str = NULL; // int32_t n = 0; // int32_t ret = bcf_get_info_string(h, v, "MOTIF", &str, &n); // if (ret>0) // { // var.motif = std::string(str); // var.mlen = var.motif.size(); // } // ret = bcf_get_info_string(h, v, "RU", &str, &n); // if (ret>0) // { // var.ru = std::string(str); // var.mlen = var.ru.size(); // } // if (n) free(str); // // int32_t* no = NULL; // n = 0; // ret = bcf_get_info_int32(h, v, "RL", &no, &n); // if (ret>0) var.rlen = *no; // if (n) free(no); // // int32_t* fl = NULL; // n = 0; // ret = bcf_get_info_int32(h, v, "REF", &fl, &n); // if (ret>0) var.rcn = *fl; // if (n) free(fl); } //additionally define MNPs by length of all alleles if (!(var.type&(VT_VNTR|VT_SV)) && var.type!=VT_REF) { if (homogeneous_length && rlen>1 && n_allele>1) { var.type |= VT_MNP; } } return var.type; }
/**
 * Constructor.
 *
 * The record is cleared, bound to h and v, and rid, pos1 and vtype are set.
 * Further fields depend on vtype:
 *
 *   biallelic VT_SNP    beg1 and end1 are the position of the record.
 *   biallelic VT_INDEL  dlen, len and the inserted or deleted bases (indel)
 *                       are derived from the alleles. The flanks come from
 *                       the FLANKS and FZ_FLANKS INFO fields, or from the
 *                       bases next to the record when a field does not
 *                       provide two values. beg1 and end1 extend 2 bases
 *                       beyond the outermost flanks.
 *   VT_VNTR             beg1 and end1 are the bases next to the record and
 *                       motif is read from the MOTIF INFO field if present.
 *
 * Nothing more is set for other types or allele counts.
 *
 * @h     - VCF header
 * @v     - VCF record.
 * @vtype - variant type of the record
 */
GenotypingRecord::GenotypingRecord(bcf_hdr_t *h, bcf1_t *v, int32_t vtype)
{
    clear();
    this->h = h;
    this->v = v;
    rid = bcf_get_rid(v);
    pos1 = bcf_get_pos1(v);
    this->vtype = vtype;

    int32_t n_allele = bcf_get_n_allele(v);

    if (vtype==VT_SNP && n_allele==2)
    {
        beg1 = bcf_get_pos1(v);
        end1 = beg1;
    }
    else if (vtype==VT_INDEL && n_allele==2)
    {
        char** alleles = bcf_get_allele(v);

        //signed arithmetic: the difference is negative for deletions
        dlen = static_cast<int32_t>(strlen(alleles[1])) - static_cast<int32_t>(strlen(alleles[0]));
        len = abs(dlen);

        //both values are read, so two values must have been returned
        int32_t *flanks = NULL;
        int32_t n = 0;
        if (bcf_get_info_int32(h, v, "FLANKS", &flanks, &n)>=2)
        {
            lend1 = flanks[0];
            rbeg1 = flanks[1];
        }
        else
        {
            lend1 = bcf_get_pos1(v) - 1;
            rbeg1 = bcf_get_end_pos1(v) + 1;
        }
        //free(NULL) is a no-op; releasing unconditionally avoids a leak on the fallback path
        free(flanks);

        int32_t *fuzzy_flanks = NULL;
        n = 0;
        if (bcf_get_info_int32(h, v, "FZ_FLANKS", &fuzzy_flanks, &n)>=2)
        {
            fuzzy_lend1 = fuzzy_flanks[0];
            fuzzy_rbeg1 = fuzzy_flanks[1];
        }
        else
        {
            fuzzy_lend1 = bcf_get_pos1(v) - 1;
            fuzzy_rbeg1 = bcf_get_end_pos1(v) + 1;
        }
        free(fuzzy_flanks);

        beg1 = std::min(lend1-2, fuzzy_lend1-2);
        end1 = std::max(rbeg1+2, fuzzy_rbeg1+2);

        //construction of alleles from the reference sequence is not implemented

        //the indel is the longer allele without its first base
        if (dlen>0)
        {
            indel.append(&alleles[1][1]);
        }
        else
        {
            indel.append(&alleles[0][1]);
        }
    }
    else if (vtype==VT_VNTR)
    {
        beg1 = bcf_get_pos1(v) - 1;
        end1 = bcf_get_end_pos1(v) + 1;

        char *motif = NULL;
        int32_t n = 0;
        if (bcf_get_info_string(h, v, "MOTIF", &motif, &n)>0)
        {
            this->motif.assign(motif);
        }
        free(motif);
    }
}