예제 #1
0
/**
 * Extract reference sequence region for motif discovery in a fuzzy fashion.
 */
void CandidateRegionExtractor::extract_regions_by_fuzzy_alignment(bcf_hdr_t* h, bcf1_t* v,  Variant& variant)
{
    if (debug)
    {
        if (debug) std::cerr << "********************************************\n";
        std::cerr << "EXTRACTIING REGION BY FUZZY ALIGNMENT\n\n";
    }

    VNTR& vntr = variant.vntr;
    const char* chrom = bcf_get_chrom(h, v);

    int32_t min_beg1 = bcf_get_pos1(v);
    int32_t max_end1 = min_beg1;

    //merge candidate search region
    for (size_t i=1; i<bcf_get_n_allele(v); ++i)
    {
        std::string ref(bcf_get_alt(v, 0));
        std::string alt(bcf_get_alt(v, i));
        int32_t pos1 = bcf_get_pos1(v);

        trim(pos1, ref, alt);

        if (debug)
        {
            std::cerr << "indel fragment : " << (ref.size()<alt.size()? alt : ref) << "\n";
            std::cerr << "               : " << ref << ":" << alt << "\n";
        }

        min_beg1 = fuzzy_left_align(chrom, pos1, ref, alt, 3);
        max_end1 = fuzzy_right_align(chrom, pos1 + ref.size() - 1, ref, alt, 3);

        int32_t seq_len;
        char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len);
        if (debug)
        {
            std::cerr << "FUZZY REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") " << "\n";
            std::cerr << "             " << seq << "\n";
        }

        if (seq_len) free(seq);
    }

    int32_t seq_len;
    char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len);

    if (debug)
    {
        std::cerr << "FINAL FUZZY REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") " << "\n";
        std::cerr << "                   " << seq << "\n";
    }

    vntr.exact_repeat_tract = seq;
    vntr.exact_rbeg1 = min_beg1;

    if (seq_len) free(seq);
}
예제 #2
0
/**
 * Gets records for the most recent position and fills up the buffer from file i.
 * returns true if buffer is filled or it is not necessary to fill buffer.
 * returns false if no more records are found to fill buffer
 */
void BCFSyncedStreamReader::fill_buffer(int32_t i)
{
    //not necessary to fill buffer
    if (buffer[i].size()>=2)
        return;

    int32_t pos1 = buffer[i].size()==0 ? 0 : bcf_get_pos1(buffer[i].front());

    if (ftypes[i]==FT_BCF_GZ)
    {
        bcf1_t *v = get_bcf1_from_pool();
        bool populated = false;
        while (itrs[i] && bcf_itr_next(vcfs[i], itrs[i], v) >= 0)
        {
            populated = true;
            bcf_unpack(v, BCF_UN_STR);
            buffer[i].push_back(v);
            insert_into_pq(i, v);

            if (pos1==0)
            {
                pos1 = bcf_get_pos1(v);
            }

            if (bcf_get_pos1(v)!=pos1)
            {
                break;
            }
            v = get_bcf1_from_pool();
            populated = false;
        }
        if (!populated)
            store_bcf1_into_pool(v);
    }
    else if (ftypes[i]==FT_VCF_GZ)
    {
        while (itrs[i] && tbx_itr_next(vcfs[i], tbxs[i], itrs[i], &s) >= 0)
        {
            bcf1_t *v = get_bcf1_from_pool();
            vcf_parse(&s, hdrs[i], v);

            bcf_unpack(v, BCF_UN_STR);
            buffer[i].push_back(v);
            insert_into_pq(i, v);

            if (pos1==0)
            {
                pos1 = bcf_get_pos1(v);
            }

            if (bcf_get_pos1(v)!=pos1)
            {
                break;
            }
        }
    }
}
/**
 * Returns true if chrom:start1-end1 overlaps with a region in the file.
 */
bool OrderedBCFOverlapMatcher::overlaps_with(std::string& chrom, int32_t start1, int32_t end1)
{
    bool overlaps = false;

    //moves to new chromosome
    if (current_interval.seq!=chrom)
    {
        buffer.clear();
        current_interval.set(chrom);
        odr->jump_to_interval(current_interval);
        std::cerr << "Jumped to chromosome " << chrom << "\n";
        while (odr->read(v))
        {
            if (bcf_get_end_pos1(v)<start1) continue;
            overlaps = overlaps || (bcf_get_pos1(v)<=end1);
            buffer.push_back(v);
            if (bcf_get_pos1(v)>end1) break;
            
            v = bcf_init();
        }
    }
    else
    {
        //scythe preceding bed records
        std::list<bcf1_t*>::iterator i = buffer.begin();
        while (i!=buffer.end())
        {
            if (bcf_get_end_pos1(*i)<start1)
            {
                bcf_destroy(*i);
                i = buffer.erase(i);
                continue;
            }

            overlaps = (bcf_get_pos1(*i)<=end1);
            break;
        }
        
        if (!overlaps)        
        {
            if (buffer.empty())
            {    
                while (odr->read(v))
                {
                    if (bcf_get_end_pos1(*i)<start1) continue;
                    overlaps = overlaps || (bcf_get_pos1(*i)<=end1);
                    buffer.push_back(v);
                    if (bcf_get_pos1(v)>end1) break;
                    
                    v = bcf_init();
                }
            }
        }
    }

    return overlaps;
};
예제 #4
0
파일: hts_utils.cpp 프로젝트: pkrusche/vt
/**
 * Gets a sorted string representation of a variant.
 */
void bcf_variant2string_sorted(bcf_hdr_t *h, bcf1_t *v, kstring_t *var)
{
    bcf_print_liten(h,v);
    
    bcf_unpack(v, BCF_UN_STR);
    var->l = 0;
    kputs(bcf_get_chrom(h, v), var);
    kputc(':', var);
    kputw(bcf_get_pos1(v), var);
    kputc(':', var);
    
    if (v->n_allele==2)
    {
        kputs(bcf_get_alt(v, 0), var);
        kputc(',', var);
        kputs(bcf_get_alt(v, 1), var);
    }
    else
    {
        char** allele = bcf_get_allele(v);
        char** temp = (char**) malloc((bcf_get_n_allele(v)-1)*sizeof(char*));
        for (int32_t i=1; i<v->n_allele; ++i)
        {
            temp[i] = allele[i];
        }
        std::qsort(temp, bcf_get_n_allele(v), sizeof(char*), cmpstr);
        kputs(bcf_get_alt(v, 0), var);
        for (int32_t i=0; i<v->n_allele-1; ++i)
        {
            kputc(',', var);
            kputs(temp[i], var);
        }
        free(temp);
    }
}
예제 #5
0
/**
 * Checks if a vntr is a homopolymer.
 */
bool CandidateRegionExtractor::is_homopolymer(bcf_hdr_t* h, bcf1_t* v)
{
    bool is_homopolymer = false;
    uint32_t ref_len = strlen(bcf_get_ref(v));
    for (size_t i=1; i<bcf_get_n_allele(v); ++i)
    {
        std::string ref(bcf_get_alt(v, 0));
        std::string alt(bcf_get_alt(v, i));
        int32_t pos1 = bcf_get_pos1(v);
    }

    return is_homopolymer;
}
예제 #6
0
파일: hts_utils.cpp 프로젝트: pkrusche/vt
/**
 * Gets a string representation of a variant.
 */
void bcf_variant2string(bcf_hdr_t *h, bcf1_t *v, kstring_t *var)
{
    bcf_unpack(v, BCF_UN_STR);
    var->l = 0;
    kputs(bcf_get_chrom(h, v), var);
    kputc(':', var);
    kputw(bcf_get_pos1(v), var);
    kputc(':', var);
    for (int32_t i=0; i<v->n_allele; ++i)
    {
        if (i) kputc(',', var);
        kputs(bcf_get_alt(v, i), var);
    }
}
예제 #7
0
파일: variant.cpp 프로젝트: atks/vt
/**
 * Gets a string representation of the variant.
 */
std::string Variant::get_variant_string()
{
    kstring_t var = {0,0,0};
    bcf_unpack(v, BCF_UN_STR);
    var.l = 0;
    kputs(bcf_get_chrom(h, v), &var);
    kputc(':', &var);
    kputw(bcf_get_pos1(v), &var);
    kputc(':', &var);
    for (size_t i=0; i<bcf_get_n_allele(v); ++i)
    {
        if (i) kputc('/', &var);
        kputs(bcf_get_alt(v, i), &var);
    }

    std::string str(var.s);

    if (var.m) free(var.s);

    return str;
}
예제 #8
0
/**
 * Pick candidate region.
 *
 * @mode - REFERENCE     use refence field
 *       - ALLELE_EXACT  by exact alignment
 *       - ALLELE_FUZZY  by fuzzy alignment
 */
void CandidateRegionExtractor::pick_candidate_region(bcf_hdr_t* h, bcf1_t* v, Variant& variant, uint32_t mode)
{
    if (mode==REFERENCE)
    {
        VNTR& vntr = variant.vntr;
        vntr.exact_repeat_tract.assign(bcf_get_ref(v));
        vntr.exact_rbeg1 = bcf_get_pos1(v);
        char** alleles = bcf_get_allele(v);
        vntr.exact_rend1 = strlen(alleles[0]);
        vntr.fuzzy_rbeg1 = vntr.exact_rbeg1;
        vntr.fuzzy_rend1 = vntr.exact_rend1;
        
    }
    else if (mode==EXACT_LEFT_RIGHT_ALIGNMENT)
    {
        extract_regions_by_exact_alignment(h, v, variant);
    }
    else if (mode==FUZZY_LEFT_RIGHT_ALIGNMENT)
    {
        extract_regions_by_fuzzy_alignment(h, v, variant);
    }
}
예제 #9
0
/**
 * Extract reference sequence region for motif discovery.
 *
 * The input is a VCF record that contains an indel.
 * 
 * If the the indel has multiple alleles, it will examine all
 * alleles.
 *
 * todo: is might be a good idea to combine this step with motif detection
 *       since there seems to be a need to have an iterative process here
 *       to ensure a good candidate motif is chosen. *  
 */
void CandidateRegionExtractor::extract_regions_by_exact_alignment(bcf_hdr_t* h, bcf1_t* v, Variant& variant)
{
    if (debug)
    {
        if (debug) std::cerr << "********************************************\n";
        std::cerr << "EXTRACTIING REGION BY EXACT LEFT AND RIGHT ALIGNMENT\n\n";
    }

    VNTR& vntr = variant.vntr;
    const char* chrom = bcf_get_chrom(h, v);

    int32_t min_beg1 = bcf_get_pos1(v);
    int32_t max_end1 = min_beg1;

    if (debug)
    {
       bcf_print_liten(h, v);
    }

    //merge candidate search region
    for (size_t i=1; i<bcf_get_n_allele(v); ++i)
    {
        std::string ref(bcf_get_alt(v, 0));
        std::string alt(bcf_get_alt(v, i));
        int32_t pos1 = bcf_get_pos1(v);

        //this prevents introduction of flanks that do not harbour the repeat unit
        trim(pos1, ref, alt);

        int32_t end1 = pos1 + ref.size() - 1;
        right_align(chrom, end1, ref, alt);

        int32_t beg1 = end1 - ref.size() + 1;
        left_align(chrom, beg1, ref, alt);

        min_beg1 = beg1<min_beg1 ? beg1 : min_beg1;
        max_end1 = end1>max_end1 ? end1 : max_end1;

        int32_t seq_len;
        char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len);

        if (debug)
        {
            std::cerr << "EXACT REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") from " << pos1 << ":" << ref << ":" << alt << "\n";
            std::cerr << "             " << seq << "\n";
        }

        if (seq_len) free(seq);
    }

    int32_t seq_len;
    char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len);

    if (debug)
    {
        std::cerr << "FINAL EXACT REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") " << "\n";
        std::cerr << "                   " << seq << "\n";
    }

    vntr.exact_repeat_tract = seq;
    vntr.rid = bcf_get_rid(v);
    vntr.exact_rbeg1 = min_beg1;
    vntr.exact_rend1 = max_end1;
    
    if (seq_len) free(seq);
}
예제 #10
0
/**
 * Gets records for the most recent position and fills up the buffer from file i.
 * returns true if buffer is filled or it is not necessary to fill buffer.
 * returns false if no more records are found to fill buffer
 */
void BCFSyncedReader::fill_buffer(int32_t i)
{
    if (buffer[i].size()>=2)
        return;

    if (random_access)
    {
        int32_t pos1 = buffer[i].size()==0 ? 0 : bcf_get_pos1(buffer[i].front());

        if (ftypes[i].format==bcf)
        {
            bcf1_t *v = get_bcf1_from_pool();
            bool populated = false;

            while (itrs[i] && bcf_itr_next(files[i], itrs[i], v)>=0)
            {
                populated = true;
                bcf_unpack(v, BCF_UN_STR);
                
                //check to ensure order
                if (!buffer[i].empty())
                {
                    if (!bcf_is_in_order(buffer[i].back(), v))
                    {
                        fprintf(stderr, "[E:%s:%d %s] VCF file not in order: %s\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str());
                        exit(1);
                    }
                }
                
                buffer[i].push_back(v);
                insert_into_pq(i, v);

                if (pos1==0)
                {
                    pos1 = bcf_get_pos1(v);
                }

                if (bcf_get_pos1(v)!=pos1)
                {
                    break;
                }

                v = get_bcf1_from_pool();
                populated = false;
            }

            if (!populated)
                store_bcf1_into_pool(v);
        }
        else if (ftypes[i].format==vcf)
        {
            while (itrs[i] && tbx_itr_next(files[i], tbxs[i], itrs[i], &s)>=0)
            {
                bcf1_t *v = get_bcf1_from_pool();
                vcf_parse(&s, hdrs[i], v);

                bcf_unpack(v, BCF_UN_STR);
                
                //check to ensure order
                if (!buffer[i].empty())
                {
                    if (!bcf_is_in_order(buffer[i].back(), v))
                    {
                        fprintf(stderr, "[E:%s:%d %s] VCF file not in order: %s\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str());
                        exit(1);
                    }
                }
                
                buffer[i].push_back(v);
                insert_into_pq(i, v);

                if (pos1==0)
                {
                    pos1 = bcf_get_pos1(v);
                }

                if (bcf_get_pos1(v)!=pos1)
                {
                    break;
                }
            }
        }
    }
    else
    {
        int32_t rid = buffer[i].size()==0 ? -1 : bcf_get_rid(buffer[i].front());
        int32_t pos1 = buffer[i].size()==0 ? 0 : bcf_get_pos1(buffer[i].front());

        bcf1_t *v = get_bcf1_from_pool();
        bool populated = false;

        while (bcf_read(files[i], hdrs[i], v)>=0)
        {
            populated = true;
            bcf_unpack(v, BCF_UN_STR);
            
            //check to ensure order
            if (!buffer[i].empty())
            {
                if (!bcf_is_in_order(buffer[i].back(), v))
                {
                    fprintf(stderr, "[E:%s:%d %s] VCF file not in order: %s\n", __FILE__, __LINE__, __FUNCTION__, file_names[i].c_str());
                    exit(1);
                }
            }
            
            buffer[i].push_back(v);
            insert_into_pq(i, v);

            if (rid==-1)
            {
                rid = bcf_get_rid(v);
                pos1 = bcf_get_pos1(v);
            }

            if (bcf_get_rid(v)!=rid || bcf_get_pos1(v)!=pos1)
            {
                break;
            }

            v = get_bcf1_from_pool();
            populated = false;
        }

        if (!populated)
            store_bcf1_into_pool(v);
    }
}
예제 #11
0
파일: variant.cpp 프로젝트: atks/vt
/**
 * Constructor.
 */
Variant::Variant(bcf_hdr_t* h, bcf1_t* v)
{
    this->h = h;
    this->v = v;

    type = classify(h, v);

    chrom = bcf_get_chrom(h, v);
    rid = bcf_get_rid(v);
    pos1 = bcf_get_pos1(v);

    no_overlapping_snps = 0;
    no_overlapping_indels = 0;
    no_overlapping_vntrs = 0;

    is_new_multiallelic =  false;

    //attempts to update relevant information on variants
    if (type==VT_SNP)
    {
        beg1 = bcf_get_pos1(v);
        end1 = bcf_get_pos1(v);
    }
    else if (type==VT_INDEL)
    {
        beg1 = bcf_get_pos1(v);
        end1 = bcf_get_info_int(h, v, "END", 0);

        //annotate ends
        if (!end1) end1 = bcf_get_end1(v);
    }
    //complex variants
    else if (type & (VT_SNP|VT_MNP|VT_INDEL|VT_CLUMPED))
    {
        beg1 = bcf_get_pos1(v);
        end1 = bcf_get_info_int(h, v, "END", 0);
        if (!end1) end1 = bcf_get_end1(v);
    }
    else if (type==VT_VNTR)
    {
        beg1 = bcf_get_pos1(v);
        end1 = bcf_get_info_int(h, v, "END", 0);
        if (!end1) end1 = bcf_get_end1(v);

        update_vntr_from_info_fields(h, v);

        vs.push_back(v);
        vntr_vs.push_back(v);
    }
    else if (type==VT_SV)
    {
        beg1 = bcf_get_pos1(v);
        end1 = bcf_get_info_int(h, v, "END", 0);
        if (!end1) end1 = bcf_get_end1(v);
    }
    else
    {
        std::cerr << "unexpected type in variant construction\n";
        print();
        exit(1);
    }
}
예제 #12
0
파일: variant.cpp 프로젝트: atks/vt
/**
 * Classifies variants.
 */
int32_t Variant::classify(bcf_hdr_t *h, bcf1_t *v)
{
    clear();

    this->h = h;
    this->v = v;

    bcf_unpack(v, BCF_UN_STR);
    chrom.assign(bcf_get_chrom(h, v));
    rid = bcf_get_rid(v);
    pos1 = bcf_get_pos1(v);
    end1 = bcf_get_end1(v);
    char** allele = bcf_get_allele(v);
    int32_t n_allele = bcf_get_n_allele(v);
    int32_t pos0 = pos1-1;

    bool homogeneous_length = true;
    char* ref = allele[0];
    int32_t rlen = strlen(ref);

    if (strchr(ref, 'N'))
    {
        contains_N = true;
    }

    //if only ref allele, skip this entire for loop
    for (size_t i=1; i<n_allele; ++i)
    {
        int32_t allele_type = VT_REF;

        //check for symbolic alternative alleles
        if (strchr(allele[i],'<'))
        {
            size_t len = strlen(allele[i]);
            if (len>=5)
            {
                //VN/d+
                if (allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][len-1]=='>' )
                {
                    for (size_t j=3; j<len-1; ++j)
                    {
                        if (allele[i][j]<'0' || allele[i][j]>'9')
                        {
                            allele_type = VT_VNTR;
                        }
                    }
                }
                //VNTR
                else if (len==6 &&
                         allele[i][0]=='<' &&
                         allele[i][1]=='V' && allele[i][2]=='N' && allele[i][3]=='T' && allele[i][4]=='R' &&
                         allele[i][5]=='>' )
                {
                     allele_type = VT_VNTR;
                }
                //STR
                else if (len==5 &&
                         allele[i][0]=='<' &&
                         allele[i][1]=='S' && allele[i][2]=='T' && allele[i][3]=='R' &&
                         allele[i][4]=='>' )
                {
                     allele_type = VT_VNTR;
                }
                //ST/d+
                else if (allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][len-1]=='>' )
                {
                    type = VT_VNTR;

                    for (size_t j=3; j<len-1; ++j)
                    {
                        if ((allele[i][j]<'0' || allele[i][j]>'9') && allele[i][j]!='.')
                        {
                            type = VT_SV;
                        }
                    }
                }
            }

            if (allele_type==VT_VNTR)
            {
                allele_type = VT_VNTR;
                type |= allele_type;
                alleles.push_back(Allele(allele_type));
            }
            else
            {
                allele_type = VT_SV;
                type |= allele_type;
                std::string sv_type(allele[i]);
                alleles.push_back(Allele(allele_type, sv_type));
            }
        }
        //checks for chromosomal breakpoints
        else if (strchr(allele[i],'[')||strchr(allele[i],']'))
        {
            allele_type = VT_SV;
            type |= allele_type;
            std::string sv_type("<BND>");
            alleles.push_back(Allele(allele_type, sv_type));
        }
        //non variant record
        else if (allele[i][0]=='.' || strcmp(allele[i],allele[0])==0)
        {
            type = VT_REF;
        }
        //explicit sequence of bases
        else
        {
            kstring_t REF = {0,0,0};
            kstring_t ALT = {0,0,0};

            ref = allele[0];
            char* alt = allele[i];
            int32_t alen = strlen(alt);

            if (strchr(alt, 'N'))
            {
                contains_N = true;
            }

            if (rlen!=alen)
            {
                homogeneous_length = false;
            }

            //trimming
            //this is required in particular for the
            //characterization of multiallelics and
            //in general, any unnormalized variant
            int32_t rl = rlen;
            int32_t al = alen;
            //trim right
            while (rl!=1 && al!=1)
            {
                if (ref[rl-1]==alt[al-1])
                {
                    --rl;
                    --al;
                }
                else
                {
                    break;
                }
            }

            //trim left
            while (rl !=1 && al!=1)
            {
                if (ref[0]==alt[0])
                {
                    ++ref;
                    ++alt;
                    --rl;
                    --al;
                }
                else
                {
                    break;
                }
            }

            kputsn(ref, rl, &REF);
            kputsn(alt, al, &ALT);

            ref = REF.s;
            alt = ALT.s;

            int32_t mlen = std::min(rl, al);
            int32_t dlen = al-rl;
            int32_t diff = 0;
            int32_t ts = 0;
            int32_t tv = 0;

            if (mlen==1 && dlen)
            {
                char ls, le, ss;

                if (rl>al)
                {
                     ls = ref[0];
                     le = ref[rl-1];
                     ss = alt[0];
                }
                else
                {
                     ls = alt[0];
                     le = alt[al-1];
                     ss = ref[0];
                }

                if (ls!=ss && le!=ss)
                {
                    ++diff;

                    if ((ls=='G' && ss=='A') ||
                        (ls=='A' && ss=='G') ||
                        (ls=='C' && ss=='T') ||
                        (ls=='T' && ss=='C'))
                    {
                        ++ts;
                    }
                    else
                    {
                        ++tv;
                    }
                }
            }
            else
            {
                for (int32_t j=0; j<mlen; ++j)
                {
                    if (ref[j]!=alt[j])
                    {
                        ++diff;

                        if ((ref[j]=='G' && alt[j]=='A') ||
                            (ref[j]=='A' && alt[j]=='G') ||
                            (ref[j]=='C' && alt[j]=='T') ||
                            (ref[j]=='T' && alt[j]=='C'))
                        {
                            ++ts;
                        }
                        else
                        {
                            ++tv;
                        }
                    }
                }
            }

            //substitution variants
            if (mlen==diff)
            {
                allele_type |= mlen==1 ? VT_SNP : VT_MNP;
            }

            //indel variants
            if (dlen)
            {
                allele_type |= VT_INDEL;
            }

            //clumped SNPs and MNPs
            if (diff && diff < mlen) //internal gaps
            {
                allele_type |= VT_CLUMPED;
            }

            type |= allele_type;
            alleles.push_back(Allele(type, diff, alen, dlen, mlen, ts, tv));
            ts += ts;
            tv += tv;
            ins = dlen>0?1:0;
            del = dlen<0?1:0;

            if (REF.m) free(REF.s);
            if (ALT.m) free(ALT.s);
        }
    }

    if (type==VT_VNTR)
    {
        update_vntr_from_info_fields(h, v);
    }

    //additionally define MNPs by length of all alleles
    if (!(type&(VT_VNTR|VT_SV)) && type!=VT_REF)
    {
        if (homogeneous_length && rlen>1 && n_allele>1)
        {
            type |= VT_MNP;
        }
    }

    return type;
}
예제 #13
0
/**
 * Annotates VNTR characteristics.
 * @mode -
 */
void VNTRAnnotator::annotate(bcf_hdr_t* h, bcf1_t* v, Variant& variant, std::string mode)
{
    VNTR& vntr = variant.vntr;

    //update chromosome and position
    variant.rid = bcf_get_rid(v);
    variant.pos1 = bcf_get_pos1(v);

    //this is for reannotating an VNTR record
    //this is more for the purpose of evaluation to
    //check if vt's algorithm is concordant with
    //VNTRs from other sources.
    if (variant.type==VT_VNTR)
    {
        if (debug) std::cerr << "ANNOTATING VNTR/STR \n";

        //1. pick candidate region
        cre->pick_candidate_region(h, v, variant, REFERENCE);
        
        //2. detect candidate motifs from a reference seqeuence
        cmp->generate_candidate_motifs(h, v, variant);
        cmp->next_motif(h, v, variant);
    }
    //main purpose - annotation of Indels.
    else if (variant.type&VT_INDEL)
    {
        //the basic steps in annotating a TR
        //
        //1. extract a region that has a chance of containing the repeat units
        //2. choose a set of candidate motifs and pick motif
        //3. detect repeat region and evaluate
        //4. iterate 2 and 3
        
        //EXACT MODE
        if (mode=="e")
        {
            if (debug) std::cerr << "============================================\n";
            if (debug) std::cerr << "ANNOTATING INDEL EXACTLY\n";

            //1. pick candidate region using exact left and right alignment
            cre->pick_candidate_region(h, v, variant, EXACT_LEFT_RIGHT_ALIGNMENT);

            //2. evaluate reference length
            fd->detect_flanks(h, v, variant, CLIP_ENDS);

            if (debug) std::cerr << "============================================\n";
            return;
        }
        //FUZZY DETECTION
        else if (mode=="f")
        {
            if (debug) std::cerr << "============================================\n";
            if (debug) std::cerr << "ANNOTATING INDEL FUZZILY\n";

            //1. selects candidate region by fuzzy left and right alignment
            cre->pick_candidate_region(h, v, variant, EXACT_LEFT_RIGHT_ALIGNMENT);
            
            //2. detect candidate motifs from a reference sequence
            cmp->generate_candidate_motifs(h, v, variant);
            
            if (!cmp->next_motif(h, v, variant))
            {
                std::cerr << "oops, no candidate motif for next step\n";
                    
            }
            //3. evaluate reference length
            fd->detect_flanks(h, v, variant, FRAHMM);

            //introduce reiteration based on concordance and exact concordance.

            if (debug) std::cerr << "============================================\n";
            return;
        }
    }
}
예제 #14
0
/**
 * Detects near by STRs.
 */
bool VariantManip::detect_str(bcf_hdr_t *h, bcf1_t *v, Variant& variant)
{
    return detect_str(bcf_get_chrom(h, v), bcf_get_pos1(v), variant);
}
예제 #15
0
/**
 * Inserts a record into pq.
 */
void BCFSyncedStreamReader::insert_into_pq(int32_t i, bcf1_t *v)
{
    pq.push(new bcfptr(i, bcf_get_pos1(v), hdrs[i], v, sync_by_pos));
}
예제 #16
0
/**
 * Classifies variants.
 */
int32_t VariantManip::classify_variant(bcf_hdr_t *h, bcf1_t *v, Variant& var)
{
    bcf_unpack(v, BCF_UN_STR);
    const char* chrom = bcf_get_chrom(h, v);
    uint32_t pos1 = bcf_get_pos1(v);
    char** allele = bcf_get_allele(v);
    int32_t n_allele = bcf_get_n_allele(v);

    int32_t pos0 = pos1-1;
    var.ts = 0;
    var.tv = 0;
    var.ins = 0;
    var.del = 0;

    var.clear(); // this sets the type to VT_REF by default.

    bool homogeneous_length = true;

    char* ref = allele[0];
    int32_t rlen = strlen(ref);

    //if only ref allele, skip this entire for loop
    for (size_t i=1; i<n_allele; ++i)
    {
        int32_t type = VT_REF;

        //check for tags
        if (strchr(allele[i],'<'))
        {
            size_t len = strlen(allele[i]);
            if (len>=5)
            {
                //VN/d+
                if (allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][len-1]=='>' )
                {
                    for (size_t j=3; j<len-1; ++j)
                    {
                        if (allele[i][j]<'0' || allele[i][j]>'9')
                        {
                            type = VT_VNTR;
                        }
                    }
                }
                //VNTR
                else if (len==6 &&
                         allele[i][0]=='<' &&
                         allele[i][1]=='V' && allele[i][2]=='N' && allele[i][3]=='T' && allele[i][4]=='R' &&
                         allele[i][5]=='>' )
                {
                     type = VT_VNTR;
                }
                //ST/d+
                else if (allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][len-1]=='>' )
                {
                    for (size_t j=3; j<len-1; ++j)
                    {
                        if (allele[i][j]<'0' || allele[i][j]>'9')
                        {
                            type = VT_VNTR;
                        }
                    }
                }
                //STR
                else if (len==5 &&
                         allele[i][0]=='<' &&
                         allele[i][1]=='S' && allele[i][2]=='T' && allele[i][3]=='R' &&
                         allele[i][4]=='>' )
                {
                     type = VT_VNTR;
                }
            }
                        
            if (type==VT_VNTR)
            {
                type = VT_VNTR;
                var.type |= type;
                var.alleles.push_back(Allele(type));
            }
            else
            {
                type = VT_SV;
                var.type |= type;
                std::string sv_type(allele[i]);
                var.alleles.push_back(Allele(type, sv_type));
            }
        }
        else if (allele[i][0]=='.' || strcmp(allele[i],allele[0])==0)
        {
            type = VT_REF;
        }
        else
        {
            kstring_t REF = {0,0,0};
            kstring_t ALT = {0,0,0};

            ref = allele[0];
            char* alt = allele[i];
            int32_t alen = strlen(alt);

            if (rlen!=alen)
            {
                homogeneous_length = false;
            }

            //trimming
            //this is required in particular for the
            //characterization of multiallelics and
            //in general, any unnormalized variant
            int32_t rl = rlen;
            int32_t al = alen;
            //trim right
            while (rl!=1 && al!=1)
            {
                if (ref[rl-1]==alt[al-1])
                {
                    --rl;
                    --al;
                }
                else
                {
                    break;
                }
            }

            //trim left
            while (rl !=1 && al!=1)
            {
                if (ref[0]==alt[0])
                {
                    ++ref;
                    ++alt;
                    --rl;
                    --al;
                }
                else
                {
                    break;
                }
            }

            kputsn(ref, rl, &REF);
            kputsn(alt, al, &ALT);

            ref = REF.s;
            alt = ALT.s;

            int32_t mlen = std::min(rl, al);
            int32_t dlen = al-rl;
            int32_t diff = 0;
            int32_t ts = 0;
            int32_t tv = 0;

            if (mlen==1 && dlen)
            {
                char ls, le, ss;

                if (rl>al)
                {
                     ls = ref[0];
                     le = ref[rl-1];
                     ss = alt[0];
                }
                else
                {
                     ls = alt[0];
                     le = alt[al-1];
                     ss = ref[0];
                }

                if (ls!=ss && le!=ss)
                {
                    ++diff;

                    if ((ls=='G' && ss=='A') ||
                        (ls=='A' && ss=='G') ||
                        (ls=='C' && ss=='T') ||
                        (ls=='T' && ss=='C'))
                    {
                        ++ts;
                    }
                    else
                    {
                        ++tv;
                    }
                }
            }
            else
            {
                for (int32_t j=0; j<mlen; ++j)
                {
                    if (ref[j]!=alt[j])
                    {
                        ++diff;

                        if ((ref[j]=='G' && alt[j]=='A') ||
                            (ref[j]=='A' && alt[j]=='G') ||
                            (ref[j]=='C' && alt[j]=='T') ||
                            (ref[j]=='T' && alt[j]=='C'))
                        {
                            ++ts;
                        }
                        else
                        {
                            ++tv;
                        }
                    }
                }
            }

            //substitution variants
            if (mlen==diff)
            {
                type |= mlen==1 ? VT_SNP : VT_MNP;
            }

            //indel variants
            if (dlen)
            {
                type |= VT_INDEL;
            }

            //clumped SNPs and MNPs
            if (diff && diff < mlen) //internal gaps
            {
                type |= VT_CLUMPED;
            }

            var.type |= type;
            var.alleles.push_back(Allele(type, diff, alen, dlen, mlen, ts, tv));
            var.ts += ts;
            var.tv += tv;
            var.ins = dlen>0?1:0;
            var.del = dlen<0?1:0;

            if (REF.m) free(REF.s);
            if (ALT.m) free(ALT.s);
        }
    }

    if (var.type==VT_VNTR)
    {
        bcf_unpack(v, BCF_UN_INFO);
        
        //populate motif, motif len etc. etc.
//        char* str = NULL;
//        int32_t n = 0;
//        int32_t ret = bcf_get_info_string(h, v, "MOTIF", &str, &n);
//        if (ret>0) 
//        {
//            var.motif = std::string(str);
//            var.mlen = var.motif.size();
//        }
//        ret = bcf_get_info_string(h, v, "RU", &str, &n);
//        if (ret>0) 
//        {
//            var.ru = std::string(str);
//            var.mlen = var.ru.size();
//        }
//        if (n) free(str);
//        
//        int32_t* no = NULL;
//        n = 0;    
//        ret = bcf_get_info_int32(h, v, "RL", &no, &n);
//        if (ret>0) var.rlen = *no;
//        if (n) free(no);
//            
//        int32_t* fl = NULL;
//        n = 0;                                    
//        ret = bcf_get_info_int32(h, v, "REF", &fl, &n);
//        if (ret>0) var.rcn = *fl;
//        if (n) free(fl);                        
    }
    
    //additionally define MNPs by length of all alleles
    if (!(var.type&(VT_VNTR|VT_SV)) && var.type!=VT_REF)
    {
        if (homogeneous_length && rlen>1 && n_allele>1)
        {
            var.type |= VT_MNP;
        }
    }

    return var.type;
}
예제 #17
0
/**
 * Constructor.
 * @v - VCF record.
 */
GenotypingRecord::GenotypingRecord(bcf_hdr_t *h, bcf1_t *v, int32_t vtype)
{
    clear();
    this->h = h;
    this->v = v;
    rid = bcf_get_rid(v);
    pos1 = bcf_get_pos1(v);
    this->vtype = vtype;
    int32_t n_allele = bcf_get_n_allele(v);
    
    if (vtype==VT_SNP && n_allele==2)
    {
        rid = bcf_get_rid(v);
        beg1 = bcf_get_pos1(v);
        end1 = beg1;
    }
    else if (vtype==VT_INDEL && bcf_get_n_allele(v)==2)
    {
        rid = bcf_get_rid(v);
        char** alleles = bcf_get_allele(v);
        dlen = strlen(alleles[1])-strlen(alleles[0]);
        len = abs(dlen);

        int32_t *flanks = NULL;
        int32_t n = 0;
        if (bcf_get_info_int32(h, v, "FLANKS", &flanks, &n)>0)
        {
            lend1 = flanks[0];
            rbeg1 = flanks[1];
            free(flanks);
        }
        else
        {
            lend1 = bcf_get_pos1(v) - 1;
            rbeg1 = bcf_get_end_pos1(v) + 1;
        }

        int32_t *fuzzy_flanks = NULL;
        n = 0;
        if (bcf_get_info_int32(h, v, "FZ_FLANKS", &fuzzy_flanks, &n)>0)
        {
            fuzzy_lend1 = fuzzy_flanks[0];
            fuzzy_rbeg1 = fuzzy_flanks[1];
            free(fuzzy_flanks);
        }
        else
        {
            fuzzy_lend1 = bcf_get_pos1(v) - 1;
            fuzzy_rbeg1 = bcf_get_end_pos1(v) + 1;
        }

        beg1 = std::min(lend1-2, fuzzy_lend1-2);
        end1 = std::max(rbeg1+2, fuzzy_rbeg1+2);
    
        //construct alleles
        
        //get reference sequence
//        char* ref_seq = NULL;
//        int32_t ref_len = 0;
////        ref_seq = faidx_fetch_seq(fai, bcf_get_chrom(h,v), lend1+1-1, rbeg1-1-1, &ref_len);
//        
//        for (uint32_t i=0; i<n_allele; ++i)
//        {
//            
//        }
        
//        for ()
//        {
//        }
//    
        if (dlen>0)
        {
            indel.append(&alleles[1][1]);
        }
        else
        {
            indel.append(&alleles[0][1]);
        }
    }
    else if (vtype==VT_VNTR)
    {
        rid = bcf_get_rid(v);
        beg1 = bcf_get_pos1(v) - 1;
        end1 = bcf_get_end_pos1(v) + 1;
        
        char *motif = NULL;
        int32_t n = 0;
        
        if (bcf_get_info_string(h, v, "MOTIF", &motif, &n)>0)
        {
           this->motif.assign(motif);
           free(motif);
        }
    }
}