Example #1
0
/**
 * Writes the alleles of a record into var as a comma separated list.
 *
 * Allele 0 is always written first; the remaining alleles follow in the
 * order produced by sorting them with cmpstr.
 *
 * @h    header of the file the record belongs to (not read here)
 * @v    record to render, unpacked up to BCF_UN_STR
 * @var  output buffer, previous contents are overwritten
 */
void bcf_alleles2string_sorted(bcf_hdr_t *h, bcf1_t *v, kstring_t *var)
{
    bcf_unpack(v, BCF_UN_STR);
    var->l = 0;

    //exactly two alleles: there is nothing to sort
    if (v->n_allele==2)
    {
        kputs(bcf_get_alt(v, 0), var);
        kputc(',', var);
        kputs(bcf_get_alt(v, 1), var);
        return;
    }

    //gather pointers to every allele after the first one
    char** all = bcf_get_allele(v);
    char** sorted = (char**) malloc((bcf_get_n_allele(v)-1)*sizeof(char*));
    for (int32_t k=0; k+1<v->n_allele; ++k)
    {
        sorted[k] = all[k+1];
    }

    //only the pointers are reordered, the record itself is untouched
    std::qsort(sorted, bcf_get_n_allele(v)-1, sizeof(char*), cmpstr);

    kputs(bcf_get_alt(v, 0), var);
    for (int32_t k=0; k+1<v->n_allele; ++k)
    {
        kputc(',', var);
        kputs(sorted[k], var);
    }

    free(sorted);
}
/**
 * Extract reference sequence region for motif discovery in a fuzzy fashion.
 *
 * Each alternate allele is trimmed against allele 0 and then passed to
 * fuzzy_left_align and fuzzy_right_align to obtain the region boundaries.
 * The reference sequence of the final region is stored in
 * variant.vntr.exact_repeat_tract and its start in variant.vntr.exact_rbeg1.
 *
 * @h        header of the file the record belongs to
 * @v        record to examine
 * @variant  receives the extracted region in its vntr member
 */
void CandidateRegionExtractor::extract_regions_by_fuzzy_alignment(bcf_hdr_t* h, bcf1_t* v,  Variant& variant)
{
    if (debug)
    {
        std::cerr << "********************************************\n";
        std::cerr << "EXTRACTIING REGION BY FUZZY ALIGNMENT\n\n";
    }

    VNTR& vntr = variant.vntr;
    const char* chrom = bcf_get_chrom(h, v);

    int32_t min_beg1 = bcf_get_pos1(v);
    int32_t max_end1 = min_beg1;

    //NOTE(review): every allele overwrites min_beg1/max_end1 instead of
    //widening them with min/max as extract_regions_by_exact_alignment does,
    //so the last allele decides the region - confirm this is intended.
    const int32_t n_allele = bcf_get_n_allele(v);
    for (int32_t i=1; i<n_allele; ++i)
    {
        std::string ref(bcf_get_alt(v, 0));
        std::string alt(bcf_get_alt(v, i));
        int32_t pos1 = bcf_get_pos1(v);

        trim(pos1, ref, alt);

        if (debug)
        {
            std::cerr << "indel fragment : " << (ref.size()<alt.size()? alt : ref) << "\n";
            std::cerr << "               : " << ref << ":" << alt << "\n";
        }

        min_beg1 = fuzzy_left_align(chrom, pos1, ref, alt, 3);
        max_end1 = fuzzy_right_align(chrom, pos1 + ref.size() - 1, ref, alt, 3);

        //the intermediate sequence is only needed for the debug trace,
        //so the reference is not touched unless debugging is on
        if (debug)
        {
            int32_t seq_len = 0;
            char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len);

            std::cerr << "FUZZY REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") " << "\n";
            //faidx_fetch_seq may return NULL on failure; streaming NULL is undefined
            std::cerr << "             " << (seq ? seq : "") << "\n";

            free(seq);
        }
    }

    int32_t seq_len = 0;
    char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len);

    if (debug)
    {
        std::cerr << "FINAL FUZZY REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") " << "\n";
        std::cerr << "                   " << (seq ? seq : "") << "\n";
    }

    //a failed fetch yields an empty tract rather than an assignment from NULL
    vntr.exact_repeat_tract = seq ? seq : "";
    vntr.exact_rbeg1 = min_beg1;

    //free(NULL) is a no-op, and a zero length buffer still has to be released
    free(seq);
}
/**
 * Checks if a vntr is a homopolymer.
 *
 * Not implemented: the record is not inspected and the result is always
 * false.
 *
 * @h  header of the file the record belongs to (unused)
 * @v  record to examine (unused)
 */
bool CandidateRegionExtractor::is_homopolymer(bcf_hdr_t* h, bcf1_t* v)
{
    //the parameters are kept so that callers do not have to change
    (void) h;
    (void) v;

    return false;
}
Example #4
0
File: variant.cpp Project: atks/vt
/**
 * Gets a string representation of the variant.
 */
std::string Variant::get_variant_string()
{
    kstring_t var = {0,0,0};
    bcf_unpack(v, BCF_UN_STR);
    var.l = 0;
    kputs(bcf_get_chrom(h, v), &var);
    kputc(':', &var);
    kputw(bcf_get_pos1(v), &var);
    kputc(':', &var);
    for (size_t i=0; i<bcf_get_n_allele(v); ++i)
    {
        if (i) kputc('/', &var);
        kputs(bcf_get_alt(v, i), &var);
    }

    std::string str(var.s);

    if (var.m) free(var.s);

    return str;
}
Example #5
0
/**
 * Checks if a symbolic allele is one of the VNTR tags
 * <VN\d+>, <VNTR>, <ST\d+> or <STR>.
 */
static bool is_vntr_symbolic_allele(const char* allele)
{
    size_t len = strlen(allele);
    if (len<5 || allele[0]!='<' || allele[len-1]!='>')
    {
        return false;
    }

    if (strcmp(allele, "<VNTR>")==0 || strcmp(allele, "<STR>")==0)
    {
        return true;
    }

    bool vn = allele[1]=='V' && allele[2]=='N';
    bool st = allele[1]=='S' && allele[2]=='T';
    if (!vn && !st)
    {
        return false;
    }

    //len>=5 guarantees at least one character between the prefix and '>'
    for (size_t j=3; j<len-1; ++j)
    {
        if (allele[j]<'0' || allele[j]>'9')
        {
            return false;
        }
    }

    return true;
}

/**
 * Checks if two differing bases form a transition (A<->G or C<->T).
 */
static bool is_transition_pair(char a, char b)
{
    return (a=='G' && b=='A') ||
           (a=='A' && b=='G') ||
           (a=='C' && b=='T') ||
           (a=='T' && b=='C');
}

/**
 * Classifies variants.
 *
 * Every alternate allele is compared against allele 0 and the resulting
 * type flags are OR-ed into var.type; one Allele entry is appended to
 * var.alleles for each allele that is symbolic or differs from allele 0.
 *
 * @h    header of the file the record belongs to (unused)
 * @v    record to classify, unpacked up to BCF_UN_STR
 * @var  cleared and then filled with the classification
 *
 * Returns var.type.
 */
int32_t VariantManip::classify_variant(bcf_hdr_t *h, bcf1_t *v, Variant& var)
{
    (void) h;

    bcf_unpack(v, BCF_UN_STR);
    char** allele = bcf_get_allele(v);
    int32_t n_allele = bcf_get_n_allele(v);

    var.ts = 0;
    var.tv = 0;
    var.ins = 0;
    var.del = 0;

    var.clear(); // this sets the type to VT_REF by default.

    bool homogeneous_length = true;

    int32_t rlen = strlen(allele[0]);

    //if only ref allele, skip this entire for loop
    for (int32_t i=1; i<n_allele; ++i)
    {
        int32_t type = VT_REF;

        //check for tags
        if (strchr(allele[i],'<'))
        {
            if (is_vntr_symbolic_allele(allele[i]))
            {
                type = VT_VNTR;
                var.type |= type;
                var.alleles.push_back(Allele(type));
            }
            else
            {
                //any other symbolic allele is kept as a structural variant
                type = VT_SV;
                var.type |= type;
                std::string sv_type(allele[i]);
                var.alleles.push_back(Allele(type, sv_type));
            }
        }
        else if (allele[i][0]=='.' || strcmp(allele[i],allele[0])==0)
        {
            //starts with '.' or equals allele 0: nothing to record
            type = VT_REF;
        }
        else
        {
            const char* ref = allele[0];
            const char* alt = allele[i];
            int32_t alen = strlen(alt);

            if (rlen!=alen)
            {
                homogeneous_length = false;
            }

            //trimming
            //this is required in particular for the
            //characterization of multiallelics and
            //in general, any unnormalized variant
            //
            //the alleles are trimmed by moving pointers and lengths only,
            //at least one base is always left on each side
            int32_t rl = rlen;
            int32_t al = alen;

            //trim right
            while (rl>1 && al>1 && ref[rl-1]==alt[al-1])
            {
                --rl;
                --al;
            }

            //trim left
            while (rl>1 && al>1 && ref[0]==alt[0])
            {
                ++ref;
                ++alt;
                --rl;
                --al;
            }

            int32_t mlen = std::min(rl, al);
            int32_t dlen = al-rl;
            int32_t diff = 0;
            int32_t ts = 0;
            int32_t tv = 0;

            if (mlen==1 && dlen)
            {
                //one allele is a single base: it counts as a substitution
                //only if it matches neither end of the longer allele
                const char* longer = rl>al ? ref : alt;
                int32_t longer_len = rl>al ? rl : al;
                char ls = longer[0];
                char le = longer[longer_len-1];
                char ss = rl>al ? alt[0] : ref[0];

                if (ls!=ss && le!=ss)
                {
                    ++diff;

                    if (is_transition_pair(ls, ss))
                    {
                        ++ts;
                    }
                    else
                    {
                        ++tv;
                    }
                }
            }
            else
            {
                for (int32_t j=0; j<mlen; ++j)
                {
                    if (ref[j]!=alt[j])
                    {
                        ++diff;

                        if (is_transition_pair(ref[j], alt[j]))
                        {
                            ++ts;
                        }
                        else
                        {
                            ++tv;
                        }
                    }
                }
            }

            //substitution variants
            if (mlen==diff)
            {
                type |= mlen==1 ? VT_SNP : VT_MNP;
            }

            //indel variants
            if (dlen)
            {
                type |= VT_INDEL;
            }

            //clumped SNPs and MNPs
            if (diff && diff < mlen) //internal gaps
            {
                type |= VT_CLUMPED;
            }

            var.type |= type;
            var.alleles.push_back(Allele(type, diff, alen, dlen, mlen, ts, tv));
            var.ts += ts;
            var.tv += tv;
            //NOTE(review): ins/del are overwritten by every allele whereas
            //ts/tv accumulate - confirm that only the last allele should count
            var.ins = dlen>0?1:0;
            var.del = dlen<0?1:0;
        }
    }

    if (var.type==VT_VNTR)
    {
        bcf_unpack(v, BCF_UN_INFO);
        
        //populate motif, motif len etc. etc.
//        char* str = NULL;
//        int32_t n = 0;
//        int32_t ret = bcf_get_info_string(h, v, "MOTIF", &str, &n);
//        if (ret>0) 
//        {
//            var.motif = std::string(str);
//            var.mlen = var.motif.size();
//        }
//        ret = bcf_get_info_string(h, v, "RU", &str, &n);
//        if (ret>0) 
//        {
//            var.ru = std::string(str);
//            var.mlen = var.ru.size();
//        }
//        if (n) free(str);
//        
//        int32_t* no = NULL;
//        n = 0;    
//        ret = bcf_get_info_int32(h, v, "RL", &no, &n);
//        if (ret>0) var.rlen = *no;
//        if (n) free(no);
//            
//        int32_t* fl = NULL;
//        n = 0;                                    
//        ret = bcf_get_info_int32(h, v, "REF", &fl, &n);
//        if (ret>0) var.rcn = *fl;
//        if (n) free(fl);                        
    }
    
    //additionally define MNPs by length of all alleles
    if (!(var.type&(VT_VNTR|VT_SV)) && var.type!=VT_REF)
    {
        if (homogeneous_length && rlen>1 && n_allele>1)
        {
            var.type |= VT_MNP;
        }
    }

    return var.type;
}
Example #6
0
/**
 * Checks if a variant is normalized.
 *  Ignores if entry is not a variant.
 *
 * A record is reported as not normalized when all alleles end with the
 * same base, or when all alleles start with the same base and none of
 * them has length one. Records whose alleles are all identical to
 * allele 0 are reported as normalized.
 */
bool VariantManip::is_normalized(bcf1_t *v)
{
    char** alleles = bcf_get_allele(v);
    int32_t n_allele = bcf_get_n_allele(v);

    //fewer than two alleles: nothing to compare against
    if (n_allele<2) return true;

    if (n_allele==2)
    {
        const char* ref = alleles[0];
        const char* alt = alleles[1];
        size_t rlen = strlen(ref);
        size_t alen = strlen(alt);

        //single base against single base
        if (rlen==1 && alen==1) return true;

        //check if variant is reference.
        if (rlen==alen && strcmp(ref, alt)==0) return true;

        bool has_len_one = (rlen==1 || alen==1);
        bool same_first = (ref[0]==alt[0]);
        bool same_last = (ref[rlen-1]==alt[alen-1]);

        return !(same_last || (!has_len_one && same_first));
    }

    //three or more alleles: compare every allele with allele 0
    const char* ref = alleles[0];
    size_t rlen = strlen(ref);
    char first_base = ref[0];
    char last_base = ref[rlen-1];

    bool has_len_one = (rlen==1);
    bool same_first = true;
    bool same_last = true;
    bool all_ref = true;

    for (int32_t k=1; k<n_allele; ++k)
    {
        const char* cur = alleles[k];
        size_t len = strlen(cur);

        if (len==1) has_len_one = true;
        if (cur[0]!=first_base) same_first = false;
        if (cur[len-1]!=last_base) same_last = false;

        all_ref = all_ref && strcmp(cur, ref)==0;
    }

    //reference entry
    if (all_ref) return true;

    return !(same_last || (!has_len_one && same_first));
}
/**
 * Extract reference sequence region for motif discovery.
 *
 * The input is a VCF record that contains an indel.
 *
 * If the indel has multiple alleles, it will examine all
 * alleles and keep the widest region seen: each alternate allele is
 * trimmed, right aligned and left aligned, and the smallest start and
 * largest end over all alleles are retained.
 *
 * The reference sequence of the final region is stored in
 * variant.vntr.exact_repeat_tract together with rid, exact_rbeg1 and
 * exact_rend1.
 *
 * todo: it might be a good idea to combine this step with motif detection
 *       since there seems to be a need to have an iterative process here
 *       to ensure a good candidate motif is chosen.
 *
 * @h        header of the file the record belongs to
 * @v        record to examine
 * @variant  receives the extracted region in its vntr member
 */
void CandidateRegionExtractor::extract_regions_by_exact_alignment(bcf_hdr_t* h, bcf1_t* v, Variant& variant)
{
    if (debug)
    {
        std::cerr << "********************************************\n";
        std::cerr << "EXTRACTIING REGION BY EXACT LEFT AND RIGHT ALIGNMENT\n\n";
    }

    VNTR& vntr = variant.vntr;
    const char* chrom = bcf_get_chrom(h, v);

    int32_t min_beg1 = bcf_get_pos1(v);
    int32_t max_end1 = min_beg1;

    if (debug)
    {
       bcf_print_liten(h, v);
    }

    //merge candidate search region
    const int32_t n_allele = bcf_get_n_allele(v);
    for (int32_t i=1; i<n_allele; ++i)
    {
        std::string ref(bcf_get_alt(v, 0));
        std::string alt(bcf_get_alt(v, i));
        int32_t pos1 = bcf_get_pos1(v);

        //this prevents introduction of flanks that do not harbour the repeat unit
        trim(pos1, ref, alt);

        int32_t end1 = pos1 + ref.size() - 1;
        right_align(chrom, end1, ref, alt);

        int32_t beg1 = end1 - ref.size() + 1;
        left_align(chrom, beg1, ref, alt);

        min_beg1 = beg1<min_beg1 ? beg1 : min_beg1;
        max_end1 = end1>max_end1 ? end1 : max_end1;

        //the intermediate sequence is only needed for the debug trace,
        //so the reference is not touched unless debugging is on
        if (debug)
        {
            int32_t seq_len = 0;
            char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len);

            std::cerr << "EXACT REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") from " << pos1 << ":" << ref << ":" << alt << "\n";
            //faidx_fetch_seq may return NULL on failure; streaming NULL is undefined
            std::cerr << "             " << (seq ? seq : "") << "\n";

            free(seq);
        }
    }

    int32_t seq_len = 0;
    char* seq = faidx_fetch_seq(fai, chrom, min_beg1-1, max_end1-1, &seq_len);

    if (debug)
    {
        std::cerr << "FINAL EXACT REGION " << min_beg1 << "-" << max_end1 << " (" << max_end1-min_beg1+1 <<") " << "\n";
        std::cerr << "                   " << (seq ? seq : "") << "\n";
    }

    //a failed fetch yields an empty tract rather than an assignment from NULL
    vntr.exact_repeat_tract = seq ? seq : "";
    vntr.rid = bcf_get_rid(v);
    vntr.exact_rbeg1 = min_beg1;
    vntr.exact_rend1 = max_end1;

    //free(NULL) is a no-op, and a zero length buffer still has to be released
    free(seq);
}
Example #8
0
/**
 * Evaluates the actions for this node.
 */
void Node::evaluate(bcf_hdr_t *h, bcf1_t *v, Variant *variant, bool debug)
{
    if (debug)
        std::cerr << "evaluation  "  << type << "\n";

    if (type&VT_LOGIC_OP)
    {
        if (type==VT_NOT)
        {
            if (debug)
                std::cerr << "\tVT_NOT "   <<  left->value << " \n";
            value = !(left->value);
        }
        else if (type==VT_AND)
        {
            if (debug)
                std::cerr << "\tVT_AND "   <<  left->value << "&" << right->value    <<  " \n";
            value = (left->value && right->value);
        }
        else if (type==VT_OR)
        {
            value = (left->value || right->value);
        }
    }
    else if (type&VT_MATH_CMP)   
    {
        if (type==VT_EQ)
        {
            if ((left->type&VT_INT))
            {
                if ((right->type&VT_INT))
                {
                    if (debug)
                        std::cerr << "\tVT_EQ "   <<  left->i << "&" << right->i    <<  " \n";
                    value = (left->i==right->i);
                    return;
                }
                else if ((right->type&VT_FLT))
                {
                    if (debug)
                        std::cerr << "\tVT_EQ "   <<  left->i << "&" << right->f    <<  " \n";
                    value = (left->i==right->f);
                    return;
                }
            }
            else if ((left->type&VT_FLT))
            {
                if ((right->type&VT_INT))
                {
                    if (debug)
                        std::cerr << "\tVT_EQ "   <<  left->f << "&" << right->i    <<  " \n";
                    value = (left->f==right->i);
                    return;
                }
                else if ((right->type&VT_FLT))
                {
                    if (debug)
                        std::cerr << "\tVT_EQ "   <<  left->f << "&" << right->f    <<  " \n";
                    value = (left->f==right->f);
                    return;
                }
            }
            else if ((left->type&VT_STR) && (right->type&VT_STR))
            {
                if (debug)
                        std::cerr << "\tVT_EQ "   <<  left->tag.s << "&" << right->tag.s    <<  " \n";
                value = strcmp(left->tag.s, right->tag.s)==0 ? true : false;
                return;
            }
    
            fprintf(stderr, "[%s:%d %s] evaluation not supported : == %d %d\n", __FILE__, __LINE__, __FUNCTION__, left->type, right->type);
            exit(1);
        }    
        else if (type==VT_NE)
        {
            if ((left->type&VT_INT))
            {
                if ((right->type&VT_INT))
                {
                    value = (left->i!=right->i);
                    return;
                }
                else if ((right->type&VT_FLT))
                {
                    value = (left->i!=right->f);
                    return;
                }
            }
            else if ((left->type&VT_FLT))
            {
                if ((right->type&VT_INT))
                {
                    value = (left->f!=right->i);
                    return;
                }
                else if ((right->type&VT_FLT))
                {
                    value = (left->f!=right->f);
                    return;
                }
            }
            else if ((left->type&VT_STR) && (right->type&VT_STR))
            {
                value = strcmp(left->tag.s, right->tag.s)==0 ? false : true;
                return;
            }
    
            fprintf(stderr, "[%s:%d %s] evaluation not supported: %d %d: !=\n", __FILE__, __LINE__, __FUNCTION__, left->type, right->type);
            exit(1);
        }
        else if (type==VT_LE)
        {
            if ((left->type&VT_INT))
            {
                if ((right->type&VT_INT))
                {
                    value = (left->i<=right->i);
                    return;
                }
                else if ((right->type&VT_FLT))
                {
                    value = (left->i<=right->f);
                    return;
                }
            }
            else if ((left->type&VT_FLT))
            {
                if ((right->type&VT_INT))
                {
                    type |= VT_INT;
                    value = (left->f<=right->i);
                    return;
                }
                else if ((right->type&VT_FLT))
                {
                    value = (left->f<=right->f);
                    return;
                }
            }
            else if ((left->type&VT_STR) && (right->type&VT_STR))
            {
                value = strcmp(left->tag.s, right->tag.s)<=0 ? true : false;
                return;
            }
    
            fprintf(stderr, "[%s:%d %s] evaluation not supported: %d %d: <=\n", __FILE__, __LINE__, __FUNCTION__, left->type, right->type);
            exit(1);
        }
        else if (type==VT_GE)
        {
            if ((left->type&VT_INT))
            {
                if ((right->type&VT_INT))
                {
                    value = (left->i>=right->i);
                    return;
                }
                else if ((right->type&VT_FLT))
                {
                    value = (left->i>=right->f);
                    return;
                }
            }
            else if ((left->type&VT_FLT))
            {
                if ((right->type&VT_INT))
                {
                    value = (left->f>=right->i);
                    return;
                }
                else if ((right->type&VT_FLT))
                {
                    value = (left->f>=right->f);
                    return;
                }
            }
            else if ((left->type&VT_STR) && (right->type&VT_STR))
            {
                value = strcmp(left->tag.s, right->tag.s)>=0 ? true : false;
                return;
            }
    
            fprintf(stderr, "[%s:%d %s] evaluation not supported: %d %d: >=\n", __FILE__, __LINE__, __FUNCTION__, left->type, right->type);
            exit(1);
        }
        else if (type==VT_GT)
        {
            if ((left->type&VT_INT))
            {
                if ((right->type&VT_INT))
                {
                    value = (left->i>right->i);
                    return;
                }
                else if ((right->type&VT_FLT))
                {
                    value = (left->i>right->f);
                    return;
                }
            }
            else if ((left->type&VT_FLT))
            {
                if ((right->type&VT_INT))
                {
                    value = (left->f>right->i);
                    return;
                }
                else if ((right->type&VT_FLT))
                {
                    value = (left->f>right->f);
                    return;
                }
            }
            else if ((left->type&VT_STR) && (right->type&VT_STR))
            {
                value = strcmp(left->tag.s, right->tag.s)>0 ? true : false;
                return;
            }
    
            fprintf(stderr, "[%s:%d %s] evaluation not supported: %d %d: >\n", __FILE__, __LINE__, __FUNCTION__, left->type, right->type);
            exit(1);
        }
        else if (type==VT_LT)
        {
            if ((left->type&VT_INT))
            {
                if ((right->type&VT_INT))
                {
                    value = (left->i<right->i);
                    return;
                }
                else if ((right->type&VT_FLT))
                {
                    value = (left->i<right->f);
                    return;
                }
            }
            else if ((left->type&VT_FLT))
            {
                if ((right->type&VT_INT))
                {
                    value = (left->f<right->i);
                    return;
                }
                else if ((right->type&VT_FLT))
                {
                    value = (left->f<right->f);
                    return;
                }
            }
            else if ((left->type&VT_STR) && (right->type&VT_STR))
            {
                value = strcmp(left->tag.s, right->tag.s)<0 ? true : false;
                return;
            }
    
            fprintf(stderr, "[%s:%d %s] evaluation not supported: %d %d: <\n", __FILE__, __LINE__, __FUNCTION__, left->type, right->type);
            exit(1);
        }
    }
    else if (type&VT_BCF_OP)   
    {
        if (type==VT_FILTER)
        {
            if (bcf_has_filter(h, v, tag.s)!=1)
            {
                value = false;
            }
            else
            {
                value = true;
            }
        }
        else if (type==VT_INFO)
        {
            int32_t *data = NULL;
            int32_t n=0;
    
            if (bcf_get_info_int32(h, v, tag.s, &data, &n)>0)
            {
                type |= VT_INT;
                i = *data;
                f = (float)i;
            }
            else if (bcf_get_info_float(h, v, tag.s, &data, &n)>0)
            {
                type |= VT_FLT;
                f = (float)(*data);
            }
            else if (bcf_get_info_string(h, v, tag.s, &data, &n)>0)
            {
                type |= VT_STR;
                s.l=0;
                for (int32_t i=0; i<n; ++i)
                {
                    kputc(data[i], &s);
                }
            }
            else if (bcf_get_info_flag(h, v, tag.s, 0, 0)>0)
            {
                type |= VT_FLG;
                i = 1;
                f = 1;
                b = true;
                value = true;
                s.l=0; 
            }
            else
            {
                i = 0;
                f = 0;
                b = false;
                value = false;
                s.l=0;
            }
    
            if (n) free(data);
        }
        else if (type==(VT_INFO|VT_INT))
        {
            int32_t *data = NULL;
            int32_t n=0;
    
            if (bcf_get_info_int32(h, v, tag.s, &data, &n)>0)
            {
                i = *((int*)data);
            }
    
            if (n) free(data);
        }
        else if (type==(VT_INFO|VT_FLT))
        {
            int32_t *data = NULL;
            int32_t n=0;
    
            if (bcf_get_info_float(h, v, tag.s, &data, &n)>0)
            {
                f = *((float*)data);
            }
    
            if (n) free(data);
        }
        else if (type==(VT_INFO|VT_STR))
        {
            int32_t *data = NULL;
            int32_t n=0;
    
            if (bcf_get_info_string(h, v, tag.s, &data, &n)>0)
            {
                s.l=0;
                for (int32_t i=0; i<n; ++i)
                {
                    kputc(data[i], &s);
                }
            }
    
            if (n) free(data);
        }
        else if (type==(VT_INFO|VT_FLG))
        {
            if (bcf_get_info_flag(h, v, tag.s, 0, 0)>0)
            {
                i = 1;
                f = 1;
                b = true;
                value = true;
                //s.l=0; kputc('1', &s);
            }
            else
            {
                i = 0;
                f = 0;
                b = false;
                value = false;
                s.l=0;
            }
            
            if (debug)
                std::cerr << "\tVT_INFO|VT_FLG "   << i << " " << f << " " << b << " " << value << " " << s.s <<  " \n";
        }
        else if (type==VT_VARIANT_TYPE)
        {
            if (debug)
                std::cerr << "\tVTYPE "   <<  variant->vtype2string(variant->type) <<  " \n";
            i = variant->type;
            value = i;
        }
        else if (type==VT_VARIANT_DLEN)
        {
            if (debug)
                std::cerr << "\tDLEN "   <<  variant->alleles[0].dlen <<  " \n";
            i = variant->alleles[0].dlen;
            value = i;
        }
        else if (type==VT_VARIANT_LEN)
        {
            if (debug)
                std::cerr << "\tLEN "   <<  abs(variant->alleles[0].dlen) <<  " \n";
            i = abs(variant->alleles[0].dlen);
            value = i;
        }
        else if (type==VT_N_ALLELE)
        {
            if (debug)
                std::cerr << "\tN_ALLELE "   <<  bcf_get_n_allele(v) <<  " \n";
            i = bcf_get_n_allele(v);
        }
    }
    else if (type&VT_MATH_OP)
    {   
        if ((type&8207)==VT_ADD)
        {
            if ((left->type&VT_INT))
            {
                if ((right->type&VT_INT))
                {
                    type |= VT_INT;
                    i = (left->i+right->i);
                    return;
                }
                else if ((right->type&VT_FLT))
                {
                    type |= VT_FLT;
                    f = (left->i+right->f);
                    return;
                }
            }
            else if ((left->type&VT_FLT))
            {
                if ((right->type&VT_INT))
                {
                    type |= VT_FLT;
                    f = (left->f+right->i);
                    return;
                }
                else if ((right->type&VT_FLT))
                {
                    type |= VT_FLT;
                    f = (left->f+right->f);
                    return;
                }
            }
    
            fprintf(stderr, "[%s:%d %s] evaluation not supported : +\n", __FILE__, __LINE__, __FUNCTION__);
            exit(1);
        }
        else if ((type&8207)==VT_SUB)
        {
            if ((left->type&VT_INT))
            {
                if ((right->type&VT_INT))
                {
                    type |= VT_INT;
                    i = (left->i-right->i);
                    return;
                }
                else if ((right->type&VT_FLT))
                {
                    type |= VT_FLT;
                    f = (left->i-right->f);
                    return;
                }
            }
            else if ((left->type&VT_FLT))
            {
                if ((right->type&VT_INT))
                {
                    type |= VT_FLT;
                    f = (left->f-right->i);
                    return;
                }
                else if ((right->type&VT_FLT))
                {
                    type |= VT_FLT;
                    f = (left->f-right->f);
                    return;
                }
            }
    
            fprintf(stderr, "[%s:%d %s] evaluation not supported : -\n", __FILE__, __LINE__, __FUNCTION__);
            exit(1);
        }
        else if ((type&8207)==VT_MUL)
        {
            if ((left->type&VT_INT))
            {
                if ((right->type&VT_INT))
                {
                    type |= VT_INT;
                    i = (left->i*right->i);
                    return;
                }
                else if ((right->type&VT_FLT))
                {
                    type |= VT_FLT;
                    f = (left->i*right->f);
                    return;
                }
            }
            else if ((left->type&VT_FLT))
            {
                if ((right->type&VT_INT))
                {
                    type |= VT_FLT;
                    f = (left->f*right->i);
                    return;
                }
                else if ((right->type&VT_FLT))
                {
                    type |= VT_FLT;
                    f = (left->f*right->f);
                    return;
                }
            }
    
            fprintf(stderr, "[%s:%d %s] evaluation not supported : *\n", __FILE__, __LINE__, __FUNCTION__);
            exit(1);
        }
        else if ((type&8207)==VT_DIV)
        {
            if (left->type&VT_INT)
            {
                if (right->type&VT_INT)
                {
                    type |= VT_FLT;
                    f = ((float)left->i/right->i);
                    return;
                }
                else if (right->type&VT_FLT)
                {
                    type |= VT_FLT;
                    f = (left->i/right->f);
                    return;
                }
            }
            else if (left->type&VT_FLT)
            {
                if (right->type&VT_INT)
                {
                    type |= VT_FLT;
                    f = (left->f/right->i);
                    return;
                }
                else if (right->type&VT_FLT)
                {
                    type |= VT_FLT;
                    f = (left->f/right->f);
                    return;
                }
            }
    
            fprintf(stderr, "[%s:%d %s] evaluation not supported : /\n", __FILE__, __LINE__, __FUNCTION__);
            exit(1);
        }
        else if (type==VT_BIT_AND)
        {
            if ((left->type&VT_INT) && (right->type&VT_INT))
            {
                i = (left->i & right->i);
                value = i;
                return;
            }
            
            fprintf(stderr, "[%s:%d %s] evaluation not supported for & :  %d %d\n", __FILE__, __LINE__, __FUNCTION__, left->type, right->type);
            exit(1);
        }
        else if (type==VT_BIT_OR)
        {
            if ((left->type&VT_INT) && (right->type&VT_INT))
            {
                i = (left->i | right->i);
                value = i;
                return;
            }
            
            fprintf(stderr, "[%s:%d %s] evaluation not supported for | : %d %d\n", __FILE__, __LINE__, __FUNCTION__, left->type, right->type);
            exit(1);
        }    
        else
        {
            fprintf(stderr, "[%s:%d %s] math op not supported : %d\n", __FILE__, __LINE__, __FUNCTION__, (type&15));
            exit(1);
        }
    }
}
Example #9
0
File: variant.cpp Project: atks/vt
/**
 * Classifies variants.
 *
 * Clears this object, records the header and record, unpacks the record up to
 * its string fields and inspects every alternative allele in turn.  Each
 * alternative allele falls into exactly one of these cases:
 *
 *   - contains '<'        : symbolic allele, recorded as VT_VNTR or VT_SV
 *   - contains '[' or ']' : breakpoint, recorded as VT_SV with label "<BND>"
 *   - starts with '.' or equals the reference allele : non variant
 *   - anything else       : explicit bases; the allele is trimmed against the
 *                           reference and flagged as SNP/MNP/INDEL/CLUMPED
 *
 * The per allele flags are OR-ed into the type member and one Allele is
 * appended to alleles for every case except the non variant one.
 *
 * @h - VCF header.
 * @v - VCF record.
 *
 * @return the aggregated type, which is also left in the type member.
 */
int32_t Variant::classify(bcf_hdr_t *h, bcf1_t *v)
{
    clear();

    this->h = h;
    this->v = v;

    bcf_unpack(v, BCF_UN_STR);
    chrom.assign(bcf_get_chrom(h, v));
    rid = bcf_get_rid(v);
    pos1 = bcf_get_pos1(v);
    end1 = bcf_get_end1(v);
    char** allele = bcf_get_allele(v);
    int32_t n_allele = bcf_get_n_allele(v);

    //stays true only while every explicit alternative allele
    //has the same length as the reference allele
    bool homogeneous_length = true;
    char* ref = allele[0];
    int32_t rlen = strlen(ref);

    if (strchr(ref, 'N'))
    {
        contains_N = true;
    }

    //if only ref allele, skip this entire for loop
    //signed index so that the comparison against n_allele is not mixed sign
    for (int32_t i=1; i<n_allele; ++i)
    {
        int32_t allele_type = VT_REF;

        //check for symbolic alternative alleles
        if (strchr(allele[i],'<'))
        {
            size_t len = strlen(allele[i]);
            if (len>=5)
            {
                //VN/d+
                //NOTE(review): VT_VNTR is set when a NON digit follows "<VN",
                //which also means "<VNTR>" is caught here and the dedicated
                //VNTR branch below is never reached - confirm intent
                if (allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][len-1]=='>' )
                {
                    for (size_t j=3; j<len-1; ++j)
                    {
                        if (allele[i][j]<'0' || allele[i][j]>'9')
                        {
                            allele_type = VT_VNTR;
                        }
                    }
                }
                //VNTR
                else if (len==6 &&
                         allele[i][0]=='<' &&
                         allele[i][1]=='V' && allele[i][2]=='N' && allele[i][3]=='T' && allele[i][4]=='R' &&
                         allele[i][5]=='>' )
                {
                     allele_type = VT_VNTR;
                }
                //STR
                else if (len==5 &&
                         allele[i][0]=='<' &&
                         allele[i][1]=='S' && allele[i][2]=='T' && allele[i][3]=='R' &&
                         allele[i][4]=='>' )
                {
                     allele_type = VT_VNTR;
                }
                //ST/d+
                //NOTE(review): this branch assigns the aggregated type directly
                //(discarding flags from earlier alleles) and leaves allele_type
                //untouched, so the allele is still recorded as VT_SV below -
                //confirm whether allele_type was meant here
                else if (allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][len-1]=='>' )
                {
                    type = VT_VNTR;

                    for (size_t j=3; j<len-1; ++j)
                    {
                        if ((allele[i][j]<'0' || allele[i][j]>'9') && allele[i][j]!='.')
                        {
                            type = VT_SV;
                        }
                    }
                }
            }

            if (allele_type==VT_VNTR)
            {
                type |= allele_type;
                alleles.push_back(Allele(allele_type));
            }
            else
            {
                //every other symbolic allele is kept as a structural variant
                //labelled with the allele text itself
                allele_type = VT_SV;
                type |= allele_type;
                std::string sv_type(allele[i]);
                alleles.push_back(Allele(allele_type, sv_type));
            }
        }
        //checks for chromosomal breakpoints
        else if (strchr(allele[i],'[')||strchr(allele[i],']'))
        {
            allele_type = VT_SV;
            type |= allele_type;
            std::string sv_type("<BND>");
            alleles.push_back(Allele(allele_type, sv_type));
        }
        //non variant record
        //NOTE(review): this overwrites rather than ORs the aggregated type
        else if (allele[i][0]=='.' || strcmp(allele[i],allele[0])==0)
        {
            type = VT_REF;
        }
        //explicit sequence of bases
        else
        {
            kstring_t REF = {0,0,0};
            kstring_t ALT = {0,0,0};

            //ref is advanced and redirected below, so restart from the
            //untrimmed reference allele for every alternative allele
            ref = allele[0];
            char* alt = allele[i];
            int32_t alen = strlen(alt);

            if (strchr(alt, 'N'))
            {
                contains_N = true;
            }

            if (rlen!=alen)
            {
                homogeneous_length = false;
            }

            //trimming
            //this is required in particular for the
            //characterization of multiallelics and
            //in general, any unnormalized variant
            int32_t rl = rlen;
            int32_t al = alen;
            //trim right
            //">1" rather than "!=1" so that an empty allele
            //can never be indexed at position -1
            while (rl>1 && al>1)
            {
                if (ref[rl-1]==alt[al-1])
                {
                    --rl;
                    --al;
                }
                else
                {
                    break;
                }
            }

            //trim left
            while (rl>1 && al>1)
            {
                if (ref[0]==alt[0])
                {
                    ++ref;
                    ++alt;
                    --rl;
                    --al;
                }
                else
                {
                    break;
                }
            }

            //copy the trimmed alleles into null terminated buffers
            kputsn(ref, rl, &REF);
            kputsn(alt, al, &ALT);

            ref = REF.s;
            alt = ALT.s;

            int32_t mlen = std::min(rl, al);    //length shared by both trimmed alleles
            int32_t dlen = al-rl;               //>0 alt is longer, <0 alt is shorter
            int32_t diff = 0;                   //no. of mismatching bases
            int32_t ts = 0;                     //no. of transitions in this allele
            int32_t tv = 0;                     //no. of transversions in this allele

            if (mlen==1 && dlen)
            {
                //one allele is a single base and the other is longer:
                //the single base counts as a mismatch only if it differs from
                //both the first and the last base of the longer allele
                char ls, le, ss;

                if (rl>al)
                {
                     ls = ref[0];
                     le = ref[rl-1];
                     ss = alt[0];
                }
                else
                {
                     ls = alt[0];
                     le = alt[al-1];
                     ss = ref[0];
                }

                if (ls!=ss && le!=ss)
                {
                    ++diff;

                    if ((ls=='G' && ss=='A') ||
                        (ls=='A' && ss=='G') ||
                        (ls=='C' && ss=='T') ||
                        (ls=='T' && ss=='C'))
                    {
                        ++ts;
                    }
                    else
                    {
                        ++tv;
                    }
                }
            }
            else
            {
                //compare the alleles base by base over their shared length
                for (int32_t j=0; j<mlen; ++j)
                {
                    if (ref[j]!=alt[j])
                    {
                        ++diff;

                        if ((ref[j]=='G' && alt[j]=='A') ||
                            (ref[j]=='A' && alt[j]=='G') ||
                            (ref[j]=='C' && alt[j]=='T') ||
                            (ref[j]=='T' && alt[j]=='C'))
                        {
                            ++ts;
                        }
                        else
                        {
                            ++tv;
                        }
                    }
                }
            }

            //substitution variants
            if (mlen==diff)
            {
                allele_type |= mlen==1 ? VT_SNP : VT_MNP;
            }

            //indel variants
            if (dlen)
            {
                allele_type |= VT_INDEL;
            }

            //clumped SNPs and MNPs
            if (diff && diff < mlen) //internal gaps
            {
                allele_type |= VT_CLUMPED;
            }

            type |= allele_type;
            //NOTE(review): the aggregated type, not allele_type, is stored in
            //the Allele here, unlike the symbolic branches - confirm intent
            alleles.push_back(Allele(type, diff, alen, dlen, mlen, ts, tv));
            //add this allele's counts to the variant level totals; the local
            //ts/tv hide the members, so the plain "ts += ts" only doubled a
            //local that was about to go out of scope
            this->ts += ts;
            this->tv += tv;
            //NOTE(review): ins/del reflect the last explicit allele only
            ins = dlen>0?1:0;
            del = dlen<0?1:0;

            if (REF.m) free(REF.s);
            if (ALT.m) free(ALT.s);
        }
    }

    if (type==VT_VNTR)
    {
        update_vntr_from_info_fields(h, v);
    }

    //additionally define MNPs by length of all alleles
    if (!(type&(VT_VNTR|VT_SV)) && type!=VT_REF)
    {
        if (homogeneous_length && rlen>1 && n_allele>1)
        {
            type |= VT_MNP;
        }
    }

    return type;
}
Example #10
0
/**
 * Constructor.
 *
 * Clears the record, stores the header, record, rid, pos1 and vtype, and then
 * fills in the fields that depend on the variant type:
 *
 *   - biallelic VT_SNP   : beg1 and end1 are both the record position
 *   - biallelic VT_INDEL : dlen, len, the exact and fuzzy flanks, the region
 *                          [beg1,end1] spanning both flank pairs padded by 2,
 *                          and the inserted or deleted sequence in indel
 *   - VT_VNTR            : beg1/end1 one base either side of the record and
 *                          the motif read from the MOTIF INFO field
 *
 * Any other combination leaves the type specific fields as clear() set them.
 *
 * @h     - VCF header.
 * @v     - VCF record.
 * @vtype - variant type of the record.
 */
GenotypingRecord::GenotypingRecord(bcf_hdr_t *h, bcf1_t *v, int32_t vtype)
{
    clear();
    this->h = h;
    this->v = v;
    rid = bcf_get_rid(v);
    pos1 = bcf_get_pos1(v);
    this->vtype = vtype;
    int32_t n_allele = bcf_get_n_allele(v);

    if (vtype==VT_SNP && n_allele==2)
    {
        beg1 = bcf_get_pos1(v);
        end1 = beg1;
    }
    else if (vtype==VT_INDEL && n_allele==2)
    {
        char** alleles = bcf_get_allele(v);
        dlen = strlen(alleles[1])-strlen(alleles[0]);
        len = abs(dlen);

        //exact flanks: both values must be present, otherwise
        //fall back to the bases adjacent to the record
        int32_t *flanks = NULL;
        int32_t n = 0;
        if (bcf_get_info_int32(h, v, "FLANKS", &flanks, &n)>=2)
        {
            lend1 = flanks[0];
            rbeg1 = flanks[1];
        }
        else
        {
            lend1 = bcf_get_pos1(v) - 1;
            rbeg1 = bcf_get_end_pos1(v) + 1;
        }
        //released on every path; free(NULL) is a no-op
        free(flanks);

        //fuzzy flanks: same rules as the exact flanks
        int32_t *fuzzy_flanks = NULL;
        n = 0;
        if (bcf_get_info_int32(h, v, "FZ_FLANKS", &fuzzy_flanks, &n)>=2)
        {
            fuzzy_lend1 = fuzzy_flanks[0];
            fuzzy_rbeg1 = fuzzy_flanks[1];
        }
        else
        {
            fuzzy_lend1 = bcf_get_pos1(v) - 1;
            fuzzy_rbeg1 = bcf_get_end_pos1(v) + 1;
        }
        free(fuzzy_flanks);

        //region covering both flank pairs, padded by 2 on each side
        beg1 = std::min(lend1-2, fuzzy_lend1-2);
        end1 = std::max(rbeg1+2, fuzzy_rbeg1+2);

        //TODO: construct alleles from the reference sequence (not implemented)

        //the indel sequence is the longer allele without its first base
        if (dlen>0)
        {
            indel.append(&alleles[1][1]);
        }
        else
        {
            indel.append(&alleles[0][1]);
        }
    }
    else if (vtype==VT_VNTR)
    {
        beg1 = bcf_get_pos1(v) - 1;
        end1 = bcf_get_end_pos1(v) + 1;

        char *motif = NULL;
        int32_t n = 0;

        if (bcf_get_info_string(h, v, "MOTIF", &motif, &n)>0)
        {
           this->motif.assign(motif);
        }
        //released on every path; free(NULL) is a no-op
        free(motif);
    }
}