Example #1
0
/**
 * Classifies variants.
 *
 * Unpacks the record up to its allele strings, resets var and derives a type
 * for every alternative allele:
 *   - symbolic alleles (containing '<'): <VNTR>, <STR>, <VN[0-9]+> and
 *     <ST[0-9]+> are VT_VNTR, every other tag is VT_SV and keeps the tag text;
 *   - "." or a copy of the reference allele: VT_REF, no Allele is recorded;
 *   - explicit sequences: trimmed against the reference and classified as a
 *     combination of VT_SNP, VT_MNP, VT_INDEL and VT_CLUMPED.
 *
 * @param h   VCF header, not consulted by the classification itself
 * @param v   VCF record to classify
 * @param var receives the per allele details and the aggregated type
 * @return the bitwise OR of all allele types, identical to var.type
 */
int32_t VariantManip::classify_variant(bcf_hdr_t *h, bcf1_t *v, Variant& var)
{
    (void) h; //only required once VNTR details are read from the INFO fields

    bcf_unpack(v, BCF_UN_STR);
    char** allele = bcf_get_allele(v);
    int32_t n_allele = bcf_get_n_allele(v);

    var.ts = 0;
    var.tv = 0;
    var.ins = 0;
    var.del = 0;

    var.clear(); // this sets the type to VT_REF by default.

    bool homogeneous_length = true;
    int32_t rlen = static_cast<int32_t>(strlen(allele[0]));

    //if only ref allele, skip this entire for loop
    for (int32_t i=1; i<n_allele; ++i)
    {
        int32_t type = VT_REF;
        const char* alt_allele = allele[i];

        //check for tags
        if (strchr(alt_allele,'<'))
        {
            size_t len = strlen(alt_allele);

            //exact tags are tested first so that the prefix test below
            //only has to deal with the numbered forms
            bool is_vntr = strcmp(alt_allele, "<VNTR>")==0 || strcmp(alt_allele, "<STR>")==0;

            //VN/d+ and ST/d+, len>=5 guarantees at least one character between the prefix and '>'
            if (!is_vntr && len>=5 &&
                alt_allele[0]=='<' && alt_allele[len-1]=='>' &&
                ((alt_allele[1]=='V' && alt_allele[2]=='N') ||
                 (alt_allele[1]=='S' && alt_allele[2]=='T')))
            {
                is_vntr = true;
                for (size_t j=3; j<len-1; ++j)
                {
                    if (alt_allele[j]<'0' || alt_allele[j]>'9')
                    {
                        //not a copy number, treat as a generic symbolic allele
                        is_vntr = false;
                        break;
                    }
                }
            }

            if (is_vntr)
            {
                type = VT_VNTR;
                var.type |= type;
                var.alleles.push_back(Allele(type));
            }
            else
            {
                type = VT_SV;
                var.type |= type;
                std::string sv_type(alt_allele);
                var.alleles.push_back(Allele(type, sv_type));
            }
        }
        else if (alt_allele[0]=='.' || strcmp(alt_allele,allele[0])==0)
        {
            //non variant allele, contributes nothing to the aggregated type
            type = VT_REF;
        }
        else
        {
            const char* ref = allele[0];
            const char* alt = alt_allele;
            int32_t alen = static_cast<int32_t>(strlen(alt));

            if (rlen!=alen)
            {
                homogeneous_length = false;
            }

            //trimming
            //this is required in particular for the
            //characterization of multiallelics and
            //in general, any unnormalized variant
            //the >1 guards also keep empty alleles from being indexed at -1
            int32_t rl = rlen;
            int32_t al = alen;

            //trim right
            while (rl>1 && al>1 && ref[rl-1]==alt[al-1])
            {
                --rl;
                --al;
            }

            //trim left
            while (rl>1 && al>1 && ref[0]==alt[0])
            {
                ++ref;
                ++alt;
                --rl;
                --al;
            }

            //from here on ref[0..rl) and alt[0..al) are the trimmed alleles,
            //they are read in place so no copies are required

            int32_t mlen = std::min(rl, al);
            int32_t dlen = al-rl;
            int32_t diff = 0;
            int32_t ts = 0;
            int32_t tv = 0;

            if (mlen==1 && dlen)
            {
                //one allele is a single base: compare it against both ends of
                //the longer allele, a mismatch on both sides is a substitution
                char ls, le, ss;

                if (rl>al)
                {
                     ls = ref[0];
                     le = ref[rl-1];
                     ss = alt[0];
                }
                else
                {
                     ls = alt[0];
                     le = alt[al-1];
                     ss = ref[0];
                }

                if (ls!=ss && le!=ss)
                {
                    ++diff;

                    if ((ls=='G' && ss=='A') ||
                        (ls=='A' && ss=='G') ||
                        (ls=='C' && ss=='T') ||
                        (ls=='T' && ss=='C'))
                    {
                        ++ts;
                    }
                    else
                    {
                        ++tv;
                    }
                }
            }
            else
            {
                //position wise comparison over the shared length
                for (int32_t j=0; j<mlen; ++j)
                {
                    if (ref[j]!=alt[j])
                    {
                        ++diff;

                        if ((ref[j]=='G' && alt[j]=='A') ||
                            (ref[j]=='A' && alt[j]=='G') ||
                            (ref[j]=='C' && alt[j]=='T') ||
                            (ref[j]=='T' && alt[j]=='C'))
                        {
                            ++ts;
                        }
                        else
                        {
                            ++tv;
                        }
                    }
                }
            }

            //substitution variants
            if (mlen==diff)
            {
                type |= mlen==1 ? VT_SNP : VT_MNP;
            }

            //indel variants
            if (dlen)
            {
                type |= VT_INDEL;
            }

            //clumped SNPs and MNPs
            if (diff && diff < mlen) //internal gaps
            {
                type |= VT_CLUMPED;
            }

            var.type |= type;
            var.alleles.push_back(Allele(type, diff, alen, dlen, mlen, ts, tv));
            var.ts += ts;
            var.tv += tv;
            //NOTE(review): ts/tv accumulate over the alleles while ins/del only
            //reflect the last sequence allele - confirm that this is intended
            var.ins = dlen>0?1:0;
            var.del = dlen<0?1:0;
        }
    }

    if (var.type==VT_VNTR)
    {
        bcf_unpack(v, BCF_UN_INFO);

        //TODO: populate motif, motif length, repeat unit, reference length and
        //reference copy number from the MOTIF, RU, RL and REF INFO fields
    }

    //additionally define MNPs by length of all alleles
    if (!(var.type&(VT_VNTR|VT_SV)) && var.type!=VT_REF &&
        homogeneous_length && rlen>1 && n_allele>1)
    {
        var.type |= VT_MNP;
    }

    return var.type;
}
Example #2
0
// Builds an Allele from its individual fields.
// Note the argument order expected by the Allele constructor: the reference
// length is passed before the cigar string.
Allele genotypeAllele(AlleleType type, string alt, unsigned int len, string cigar, unsigned int reflen, long int pos, long int rrbound) {
    Allele built(type, alt, len, reflen, cigar, pos, rrbound);
    return built;
}
Example #3
0
File: variant.cpp Project: atks/vt
/**
 * Classifies variants.
 *
 * Clears this object, links it to the given header and record, copies the
 * location fields and derives a type for every alternative allele:
 *   - symbolic alleles (containing '<'): <VNTR>, <STR>, <VN[0-9]+> and
 *     <ST[0-9.]+> are VT_VNTR, every other tag is VT_SV and keeps the tag text;
 *   - alleles containing '[' or ']' are chromosomal breakpoints, recorded as
 *     VT_SV with the tag "<BND>";
 *   - "." or a copy of the reference allele: non variant, no Allele recorded;
 *   - explicit sequences: trimmed against the reference and classified as a
 *     combination of VT_SNP, VT_MNP, VT_INDEL and VT_CLUMPED.
 *
 * @param h VCF header
 * @param v VCF record to classify
 * @return the aggregated type, i.e. the bitwise OR of all allele types
 */
int32_t Variant::classify(bcf_hdr_t *h, bcf1_t *v)
{
    clear();

    this->h = h;
    this->v = v;

    bcf_unpack(v, BCF_UN_STR);
    chrom.assign(bcf_get_chrom(h, v));
    rid = bcf_get_rid(v);
    pos1 = bcf_get_pos1(v);
    end1 = bcf_get_end1(v);
    char** allele = bcf_get_allele(v);
    int32_t n_allele = bcf_get_n_allele(v);

    bool homogeneous_length = true;
    int32_t rlen = static_cast<int32_t>(strlen(allele[0]));

    if (strchr(allele[0], 'N'))
    {
        contains_N = true;
    }

    //if only ref allele, skip this entire for loop
    for (int32_t i=1; i<n_allele; ++i)
    {
        int32_t allele_type = VT_REF;
        const char* alt_allele = allele[i];

        //check for symbolic alternative alleles
        if (strchr(alt_allele,'<'))
        {
            size_t len = strlen(alt_allele);

            //exact tags are tested first so that the prefix tests below
            //only have to deal with the numbered forms
            bool is_vntr = strcmp(alt_allele, "<VNTR>")==0 || strcmp(alt_allele, "<STR>")==0;

            //len>=5 guarantees at least one character between the prefix and '>'
            if (!is_vntr && len>=5 && alt_allele[0]=='<' && alt_allele[len-1]=='>')
            {
                bool vn_prefix = alt_allele[1]=='V' && alt_allele[2]=='N';
                bool st_prefix = alt_allele[1]=='S' && alt_allele[2]=='T';

                if (vn_prefix || st_prefix)
                {
                    is_vntr = true;
                    for (size_t j=3; j<len-1; ++j)
                    {
                        bool is_digit = alt_allele[j]>='0' && alt_allele[j]<='9';
                        //ST/d+ additionally accepts '.' for fractional copy numbers
                        bool is_dot = st_prefix && alt_allele[j]=='.';

                        if (!is_digit && !is_dot)
                        {
                            //not a copy number, treat as a generic symbolic allele
                            is_vntr = false;
                            break;
                        }
                    }
                }
            }

            if (is_vntr)
            {
                allele_type = VT_VNTR;
                type |= allele_type;
                alleles.push_back(Allele(allele_type));
            }
            else
            {
                allele_type = VT_SV;
                type |= allele_type;
                std::string sv_type(alt_allele);
                alleles.push_back(Allele(allele_type, sv_type));
            }
        }
        //checks for chromosomal breakpoints
        else if (strchr(alt_allele,'[')||strchr(alt_allele,']'))
        {
            allele_type = VT_SV;
            type |= allele_type;
            std::string sv_type("<BND>");
            alleles.push_back(Allele(allele_type, sv_type));
        }
        //non variant record
        else if (alt_allele[0]=='.' || strcmp(alt_allele,allele[0])==0)
        {
            //only the allele is non variant, the type aggregated from
            //the other alleles must not be reset here
            allele_type = VT_REF;
        }
        //explicit sequence of bases
        else
        {
            const char* ref = allele[0];
            const char* alt = alt_allele;
            int32_t alen = static_cast<int32_t>(strlen(alt));

            if (strchr(alt, 'N'))
            {
                contains_N = true;
            }

            if (rlen!=alen)
            {
                homogeneous_length = false;
            }

            //trimming
            //this is required in particular for the
            //characterization of multiallelics and
            //in general, any unnormalized variant
            //the >1 guards also keep empty alleles from being indexed at -1
            int32_t rl = rlen;
            int32_t al = alen;

            //trim right
            while (rl>1 && al>1 && ref[rl-1]==alt[al-1])
            {
                --rl;
                --al;
            }

            //trim left
            while (rl>1 && al>1 && ref[0]==alt[0])
            {
                ++ref;
                ++alt;
                --rl;
                --al;
            }

            //from here on ref[0..rl) and alt[0..al) are the trimmed alleles,
            //they are read in place so no copies are required

            int32_t mlen = std::min(rl, al);
            int32_t dlen = al-rl;
            int32_t diff = 0;
            //named apart from the members ts and tv so that the
            //aggregation below updates the members
            int32_t allele_ts = 0;
            int32_t allele_tv = 0;

            if (mlen==1 && dlen)
            {
                //one allele is a single base: compare it against both ends of
                //the longer allele, a mismatch on both sides is a substitution
                char ls, le, ss;

                if (rl>al)
                {
                     ls = ref[0];
                     le = ref[rl-1];
                     ss = alt[0];
                }
                else
                {
                     ls = alt[0];
                     le = alt[al-1];
                     ss = ref[0];
                }

                if (ls!=ss && le!=ss)
                {
                    ++diff;

                    if ((ls=='G' && ss=='A') ||
                        (ls=='A' && ss=='G') ||
                        (ls=='C' && ss=='T') ||
                        (ls=='T' && ss=='C'))
                    {
                        ++allele_ts;
                    }
                    else
                    {
                        ++allele_tv;
                    }
                }
            }
            else
            {
                //position wise comparison over the shared length
                for (int32_t j=0; j<mlen; ++j)
                {
                    if (ref[j]!=alt[j])
                    {
                        ++diff;

                        if ((ref[j]=='G' && alt[j]=='A') ||
                            (ref[j]=='A' && alt[j]=='G') ||
                            (ref[j]=='C' && alt[j]=='T') ||
                            (ref[j]=='T' && alt[j]=='C'))
                        {
                            ++allele_ts;
                        }
                        else
                        {
                            ++allele_tv;
                        }
                    }
                }
            }

            //substitution variants
            if (mlen==diff)
            {
                allele_type |= mlen==1 ? VT_SNP : VT_MNP;
            }

            //indel variants
            if (dlen)
            {
                allele_type |= VT_INDEL;
            }

            //clumped SNPs and MNPs
            if (diff && diff < mlen) //internal gaps
            {
                allele_type |= VT_CLUMPED;
            }

            type |= allele_type;
            //each allele records its own type, not the running aggregate
            alleles.push_back(Allele(allele_type, diff, alen, dlen, mlen, allele_ts, allele_tv));
            ts += allele_ts;
            tv += allele_tv;
            //NOTE(review): ts/tv accumulate over the alleles while ins/del only
            //reflect the last sequence allele - confirm that this is intended
            ins = dlen>0?1:0;
            del = dlen<0?1:0;
        }
    }

    if (type==VT_VNTR)
    {
        update_vntr_from_info_fields(h, v);
    }

    //additionally define MNPs by length of all alleles
    if (!(type&(VT_VNTR|VT_SV)) && type!=VT_REF &&
        homogeneous_length && rlen>1 && n_allele>1)
    {
        type |= VT_MNP;
    }

    return type;
}
Example #4
0
// Builds a new Allele from the type, alternate sequence, length, reference
// length, cigar, position and right repeat boundary of the given allele.
Allele genotypeAllele(Allele &a) {
    Allele built(a.type,
                 a.alternateSequence,
                 a.length,
                 a.referenceLength,
                 a.cigar,
                 a.position,
                 a.repeatRightBoundary);
    return built;
}
Example #5
0
int main (int argc, char** argv) {

    double snp_mutation_rate = 0.001;
    double indel_mutation_rate = 0.0001;
    double het_rate = 0.5;
    double afs_alpha = 1;
    double indel_alpha = 3;
    double microsatellite_afs_alpha = 1;
    double microsatellite_len_alpha = 1.7;
    double microsatellite_mutation_rate = 0.0001;
    double mnp_ratio = 0.01;
    double tstv_ratio = 2.5;
    double deamination_ratio = 1.8;
    int microsatellite_min_length = 1;
    int indel_max = 1000;
    int ploidy = 1;
    int population_size = 1;
    int sample_id_max_digits = 1;
    int seed = time(NULL);
    string fastaFileName;
    string file_prefix = "";
    string sample_prefix = "";
    bool dry_run = false;
    int repeat_size_max = 20;
    bool uniform_indel_distribution = false;

    double p, lambda, shape, mu, sigma;

    string command_line = argv[0];
    for (int i = 1; i < argc; ++i) {
        command_line += " ";
        command_line += argv[i];
    }

    int c;

    while (true) {
        static struct option long_options[] =
            {
                /* These options set a flag. */
                //{"verbose", no_argument,       &verbose_flag, 1},
                //{"brief",   no_argument,       &verbose_flag, 0},
                {"help", no_argument, 0, 'h'},
                {"snp-rate",  required_argument, 0, 's'},
                {"mnp-ratio", required_argument, 0, 'M'},
                {"indel-rate",  required_argument, 0, 'i'},
                {"indel-alpha", required_argument, 0, 'z'},
                {"indel-max", required_argument, 0, 'X'},
                {"repeat-size-max", required_argument, 0, 'q'},
                {"microsat-rate",  required_argument, 0, 'm'},
                {"microsat-afs-alpha", required_argument, 0, 't'},
                {"microsat-len-alpha", required_argument, 0, 'j'},
                {"microsat-min-len", required_argument, 0, 'l'},
                {"afs-alpha",  required_argument, 0, 'a'},
                {"ploidy", required_argument, 0, 'p'},
                {"population-size", required_argument, 0, 'n'},
                {"file-prefix", required_argument, 0, 'P'},
                {"sample-prefix", required_argument, 0, 'S'},
                {"random-seed", required_argument, 0, 'g'},
                {"dry-run", no_argument, 0, 'd'},
                {"uniform-indels", no_argument, 0, 'U'},
                {"ts-tv-ratio", required_argument, 0, 'T'},
                {"deamination-ratio", required_argument, 0, 'D'},
                {0, 0, 0, 0}
            };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hdUa:z:s:i:q:p:n:M:X:t:m:P:S:g:l:j:T:", long_options, &option_index);

        /* Detect the end of the options. */
        if (c == -1)
            break;
 
        switch (c)
        {
        case 0:
            /* If this option set a flag, do nothing else now. */
            if (long_options[option_index].flag != 0)
                break;
            printf ("option %s", long_options[option_index].name);
            if (optarg)
                printf (" with arg %s", optarg);
            printf ("\n");
            break;

        case 'd':
            dry_run = true;
            break;

        case 'U':
            uniform_indel_distribution = true;
            break;

        case 'q':
            if (!convert(optarg, repeat_size_max)) {
                cerr << "could not read -q, --repeat-size-max" << endl;
                exit(1);
            }
            break;

        case 's':
            if (!convert(optarg, snp_mutation_rate)) {
                cerr << "could not read -s, --snp-rate" << endl;
                exit(1);
            }
            break;

        case 'i':
            if (!convert(optarg, indel_mutation_rate)) {
                cerr << "could not read -i, --indel-rate" << endl;
                exit(1);
            }
            break;

        case 'a':
            if (!convert(optarg, afs_alpha)) {
                cerr << "could not read -a, --afs-alpha" << endl;
                exit(1);
            }
            break;
 
        case 'z':
            if (!convert(optarg, indel_alpha)) {
                cerr << "could not read -z, --indel-alpha" << endl;
                exit(1);
            }
            break;

        case 'X':
            if (!convert(optarg, indel_max)) {
                cerr << "could not read -M, --indel-max" << endl;
                exit(1);
            }
            break;
 
        case 'M':
            if (!convert(optarg, mnp_ratio)) {
                cerr << "could not read -m, --mnp-ratio" << endl;
                exit(1);
            }
            break;
 
        case 'm':
            if (!convert(optarg, microsatellite_mutation_rate)) {
                cerr << "could not read -m, --microsat-rate" << endl;
                exit(1);
            }
            break;

        case 'T':
            if (!convert(optarg, tstv_ratio)) {
                cerr << "could not read -T, --ts-tv-ratio" << endl;
                exit(1);
            }
            break;
 
        case 't':
            if (!convert(optarg, microsatellite_afs_alpha)) {
                cerr << "could not read -m, --microsatellite-afs-alpha" << endl;
                exit(1);
            }
            break;
 
        case 'j':
            if (!convert(optarg, microsatellite_len_alpha)) {
                cerr << "could not read -m, --microsatellite-len-alpha" << endl;
                exit(1);
            }
            break;
 
        case 'l':
            if (!convert(optarg, microsatellite_min_length)) {
                cerr << "could not read -l, --microsat-min-len" << endl;
                exit(1);
            }
            break;
 
        case 'p':
            if (!convert(optarg, ploidy)) {
                cerr << "could not read -p, --ploidy" << endl;
                exit(1);
            }
            break;

        case 'P':
            file_prefix = optarg;
            break;

        case 'S':
            sample_prefix = optarg;
            break;
 
        case 'n':
            if (!convert(optarg, population_size)) {
                cerr << "could not read -n, --population-size" << endl;
                exit(1);
            }
            sample_id_max_digits = strlen(optarg);
            break;

        case 'g':
            if (!convert(optarg, seed)) {
                cerr << "could not read -g, --random-seed" << endl;
                exit(1);
            }
            break;

        case 'h':
            printSummary();
            exit(0);
            break;
 
        case '?':
            /* getopt_long already printed an error message. */
            printSummary();
            exit(1);
            break;
 
        default:
            abort ();
        }
    }

    /* Print any remaining command line arguments (not options). */
    if (optind < argc) {
        //cerr << "fasta file: " << argv[optind] << endl;
        fastaFileName = argv[optind];
    } else {
        cerr << "please specify a fasta file" << endl;
        printSummary();
        exit(1);
    }

    init_genrand(seed); // seed mt with current time

    //mt19937 eng(seed);

    int bpPerHaplotypeMean = 1000;
    double bpPerHaplotypeSigma = 200;
    normal_distribution<double> normal(mu, sigma);
     
    //lambda = 7.0;
    //poisson_distribution<int> poisson(lambda);
    //poisson(eng);

    string seqname;
    string sequence;  // holds sequence so we can process it

    FastaReference fr;
    fr.open(fastaFileName);

    string bases = "ATGC";

    vcf::VariantCallFile vcfFile;

    // write the VCF header
    stringstream headerss;
    headerss 
        << "##fileformat=VCFv4.1" << endl
        << "##fileDate=" << dateStr() << endl
        << "##source=mutatrix population genome simulator" << endl
        << "##seed=" << seed << endl
        << "##reference=" << fastaFileName << endl
        << "##phasing=true" << endl
        << "##commandline=" << command_line << endl
        << "##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Alternate allele count\">" << endl
        << "##INFO=<ID=TYPE,Number=A,Type=String,Description=\"Type of each allele (snp, ins, del, mnp, complex)\">" << endl
        << "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples at the site\">" << endl
        << "##INFO=<ID=NA,Number=1,Type=Integer,Description=\"Number of alternate alleles\">" << endl
        << "##INFO=<ID=LEN,Number=A,Type=Integer,Description=\"Length of each alternate allele\">" << endl
        << "##INFO=<ID=MICROSAT,Number=0,Type=Flag,Description=\"Generated at a sequence repeat loci\">" << endl
        << "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">" << endl
        << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT";

    vector<string> samples;
    for (int i = 0; i < population_size; ++i) {
        stringstream sampless;
        sampless << sample_prefix << setfill('0') << setw(sample_id_max_digits) << i + 1; // one-based sample names
        samples.push_back(sampless.str());
        headerss << "\t" << sampless.str();
    }

    // and set up our VCF output file
    string header = headerss.str();
    vcfFile.openForOutput(header);
    cout << vcfFile.header << endl;

    int copies = ploidy * population_size;

    map<string, vector<SampleFastaFile*> > sequencesByRefseq;

    if (!dry_run) {
        for (FastaIndex::iterator s = fr.index->begin(); s != fr.index->end(); ++s) {

            FastaIndexEntry& indexEntry = s->second;
            seqname = indexEntry.name;

            vector<SampleFastaFile*>& sequences = sequencesByRefseq[seqname];
            for (int i = 0; i < population_size; ++i) {
                stringstream sname;
                sname << sample_prefix << setfill('0') << setw(sample_id_max_digits) << i + 1;
                string samplename = sname.str();
                for (int j = 0; j < ploidy; ++j) {
                    stringstream cname;
                    cname << j;
                    string chromname = cname.str();
                    string fullname = samplename + ":" + seqname + ":" + chromname;
                    string filename = file_prefix + fullname + ".fa";
                    //sequences.push_back(SampleFastaFile(filename, seqname));
                    sequences.push_back(new SampleFastaFile(filename, seqname));
                }
            }
        }
    }



    for (FastaIndex::iterator s = fr.index->begin(); s != fr.index->end(); ++s) {

        FastaIndexEntry& indexEntry = s->second;
        seqname = indexEntry.name;
        sequence = fr.getSequence(s->first);

        vector<SampleFastaFile*>& sequences = sequencesByRefseq[seqname];
        //sequences.resize(copies);
        
        long int pos = 0;
        long int microsatellite_end_pos = 0;
        while (pos < sequence.size()) {

            //cout << pos + 1 << " microsat end pos " << microsatellite_end_pos << endl;

            string ref = sequence.substr(pos, 1); // by default, ref is just the current base

            // skip non-DNA sequence information
            if (!(ref == "A" || ref == "T" || ref == "C" || ref == "G")) {
                pos += ref.size();
                for (vector<SampleFastaFile*>::iterator s = sequences.begin(); s != sequences.end(); ++s) {
                    (*s)->write(ref);
                }
                continue;
            }

            vector<Allele> alleles;

            // establish if we are in a repeat
            // and what motif is being repeated, how many times

            int len = 1;

            // get reference repeats
            // if we have a repeat, adjust the mutation rate
            // using length and direction-dependent
            // formula from "Likelihood-Based Estimation of Microsatellite Mutation Rates"
            // http://www.genetics.org/cgi/content/full/164/2/781#T1

            if (pos > microsatellite_end_pos) {

                map<string, int> repeats = repeatCounts(pos + 1, (const string&) sequence, repeat_size_max);

                string seq;
                int repeat_count = 0;
                // get the "biggest" repeat, the most likely ms allele at this site
                for (map<string, int>::iterator r = repeats.begin(); r != repeats.end(); ++r) {
                    if (repeat_count < r->second) {
                        repeat_count = r->second;
                        seq = r->first;
                    }
                }
                //cout << pos + 1 << " " << sequence.substr(pos + 1, seq.size() * repeat_count) << " ?= " << seq * repeat_count << endl;

                // guard ensures that we are in a pure repeat situoation, tandem-tandem repeats are not handled presently
                if (repeats.size() > 0 && sequence.substr(pos + 1, seq.size() * repeat_count) == seq * repeat_count) {

                    int microsatellite_length = repeat_count * seq.size();

                    // record end of microsatellite so we don't generate more mutations until we pass it
                    microsatellite_end_pos = pos + microsatellite_length - 1;

                    if (microsatellite_length > microsatellite_min_length
                        //&& genrand_real1() / copies 
                        //    < microsatellite_mutation_rate * repeat_count) {
                        && genrand_real1() > pow(1 - (microsatellite_mutation_rate * repeat_count), log(copies) * 2)) {

                        // establish the relative rate of ins and del events
                        /*
                          long double repeatMutationDelProbability = microsatelliteDelProb(repeat_count);
                          long double repeatMutationInsProbability = microsatelliteInsProb(repeat_count);
                          long double indel_balance = 1;
                          if (repeatMutationInsProbability > repeatMutationDelProbability) {
                          indel_balance = repeatMutationInsProbability / repeatMutationDelProbability;
                          } else {
                          indel_balance = 1 - (repeatMutationInsProbability / repeatMutationDelProbability);
                          }
                        */
                        double indel_balance = 0.5;

                        // how many alleles at the site?

                        //int numalleles = min((int) floor(zetarandom(microsatellite_afs_alpha)), (int) ((double) repeat_count * indel_balance));
                        int numalleles = random_allele_frequency(repeat_count, microsatellite_afs_alpha);
                        //cout << "repeat_count: " << repeat_count << " numalleles: " << numalleles << endl;

                        map<int, bool> allele_lengths;
                        // lengths of the alleles
                        while (allele_lengths.size() < numalleles) {
                            int allele_length;
                            // TODO adjust length so that shorter events are more likely...
                            if (genrand_real1() > indel_balance) {
                                allele_length = -1 * min((int) floor(zetarandom(microsatellite_len_alpha)), repeat_count);
                            } else {
                                allele_length = min((int) floor(zetarandom(microsatellite_len_alpha)), repeat_count);
                            }
                            //cout << allele_length << endl;
                            map<int, bool>::iterator f = allele_lengths.find(allele_length);
                            if (f == allele_lengths.end()) {
                                allele_lengths[allele_length] = true;
                            }
                        }

                        // generate alleles
                        for (map<int, bool>::iterator f = allele_lengths.begin();
                             f != allele_lengths.end(); ++f) {

                            int allele_length = f->first;
                            int c = abs(f->first);
                            string alt = seq;

                            for (int i = 1; i < c; ++i)
                                alt += seq;

                            if (allele_length > 0) {
                                alleles.push_back(Allele(ref, ref + alt, "MICROSAT"));
                            } else {
                                alleles.push_back(Allele(ref + alt, ref, "MICROSAT"));
                            }
                            //cout << pos + 1 << " "  << microsatellite_length << " " << alleles.back() << endl;
                        }
                        //cout << "alleles.size() == " << alleles.size() << endl;
                    }
                }
            }

            // snp case
            if (genrand_real1() > pow(1 - snp_mutation_rate, log(max(copies, 2)) * 2)) {

                // make an alternate allele
                /*
                  string alt = ref;
                  while (alt == ref) {
                  alt = string(1, bases.at(genrand_int32() % 4));
                  }
                */
                string alt = ref;
                if (genrand_real1() > 1 / (1 + tstv_ratio)) {
                    if (ref == "A") {
                        alt = "G";
                    } else if (ref == "G") {
                        alt = "A";
                    } else if (ref == "C") {
                        alt = "T";
                    } else if (ref == "T") {
                        alt = "C";
                    }
                } else {
                    while (alt == ref || isTransition(ref, alt)) {
                        alt = string(1, bases.at(genrand_int32() % 4));
                    }
                }

                if (genrand_real1() < mnp_ratio) {
                    int i = 1;
                    do {
                        ref += sequence.substr(pos + i, 1);
                        alt += sequence.substr(pos + i, 1);
                        ++i;
                        while (alt.at(alt.size() - 1) == ref.at(ref.size() - 1)) {
                            alt.at(alt.size() - 1) = bases.at(genrand_int32() % 4);
                        }
                    } while (genrand_real1() < mnp_ratio);
                    len = alt.size();
                }
                alleles.push_back(Allele(ref, alt));
            }

            // indel case
            if (genrand_real1() > pow(1 - indel_mutation_rate, log(max(copies, 2)) * 2)) {
                // how many bp?
                if (uniform_indel_distribution) {
                    len = (int) floor(genrand_real1() * indel_max);
                } else {
                    len = (int) floor(zetarandom(indel_alpha));
                }
                // guard against out-of-sequence indels
                if (pos + len < sequence.size() && len <= indel_max) {
                    if (genrand_int32() % 2 == 0) {
                        // deletion
                        alleles.push_back(Allele(sequence.substr(pos, 1 + len), sequence.substr(pos, 1)));
                    } else {
                        string alt = ref;
                        // insertion?
                        // insert some random de novo bases
                        while (alt.length() < len + 1) {
                            alt += string(1, bases.at(genrand_int32() % 4));
                        }
                        alleles.push_back(Allele(ref, alt));
                    }
                } else {
                    // fall through
                }
            }

            // no mutation generated
            if (alleles.empty()) {
                for (int i = 0; i < copies; ++i) {
                    if (!dry_run) {
                        sequences.at(i)->write(ref);
                    }
                }
                pos += ref.size();
            } else {

                // TODO randomly distribute all the alleles throughout the population
                // generate allele frequencies for each
                // fun times...

                string genotype;

                vector<bool> alts;
                random_shuffle(alleles.begin(), alleles.end());

                vector<Allele*> population_alleles;
                list<Allele> present_alleles; // filtered for AFS > 0 in the sample
                
                // AFS simulation
                int remaining_copies = copies;
                while (remaining_copies > 0 && !alleles.empty()) {
                    Allele allele = alleles.back();
                    alleles.pop_back();
                    int allele_freq = random_allele_frequency(remaining_copies, afs_alpha);
                    if (allele_freq > 0) {
                        present_alleles.push_back(allele);
                        Allele* allelePtr = &present_alleles.back();
                        for (int i = 0; i < allele_freq; ++i) {
                            population_alleles.push_back(allelePtr);
                        }
                        remaining_copies -= allele_freq;
                    }
                }

                if (present_alleles.empty()) {
                    for (int i = 0; i < copies; ++i) {
                        if (!dry_run) {
                            sequences.at(i)->write(ref);
                        }
                    }
                    pos += ref.size();
                    continue;
                }

                reverse(present_alleles.begin(), present_alleles.end());

                // establish the correct reference sequence and alternate allele set
                for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a) {
                    Allele& allele = *a;
                    //cout << allele << endl;
                    if (allele.ref.size() > ref.size()) {
                        ref = allele.ref;
                    }
                }

                // reference alleles take up the rest
                Allele reference_allele = Allele(ref, ref);
                for (int i = 0; i < remaining_copies; ++i) {
                    population_alleles.push_back(&reference_allele);
                }

                vector<string> altstrs;
                // now the reference allele is the largest possible, adjust the alt allele strings to reflect this
                // if we have indels, add the base before, set the position back one
                for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a) {
                    Allele& allele = *a;
                    string alleleStr = ref;
                    if (allele.ref.size() == allele.alt.size()) {
                        alleleStr.replace(0, allele.alt.size(), allele.alt);
                    } else {
                        alleleStr.replace(0, allele.ref.size(), allele.alt);
                    }
                    allele.ref = ref;
                    allele.alt = alleleStr;
                    altstrs.push_back(alleleStr);
                }

                assert(population_alleles.size() == copies);

                // shuffle the alleles around the population
                random_shuffle(population_alleles.begin(), population_alleles.end());

                vcf::Variant var(vcfFile);
                var.sequenceName = seqname;
                var.position = pos + 1;
                var.quality = 99;
                var.id = ".";
                var.filter = ".";
                var.info["NS"].push_back(convert(population_size));
                var.info["NA"].push_back(convert(present_alleles.size()));
                var.format.push_back("GT");
                var.ref = ref;
                var.alt = altstrs;

                // debugging, uncomment to see sequence context
                //cout << sequence.substr(pos - 10, 10) << "*" << ref << "*" << sequence.substr(pos + 1, 9) << endl;

                map<string, int> alleleIndexes;
                alleleIndexes[convert(reference_allele)] = 0; // XXX should we handle this differently, by adding the reference allele to present_alleles?
                int i = 1;
                for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a, ++i) {
                    Allele& allele = *a;
                    //cout << allele << " " << i << endl;
                    alleleIndexes[convert(allele)] = i;
                    //cout << allele << " " << i << endl;
                }

                //for (map<string, int>::iterator a = alleleIndexes.begin(); a != alleleIndexes.end(); ++a) {
                //    cout << a->first << " = " << a->second << endl;
                //}

                int j = 0;
                for (vector<string>::iterator s = samples.begin(); s != samples.end(); ++s, ++j) {
                    string& sample = *s;
                    vector<string> genotype;
                    // XXX hack, maybe this should get stored in another map for easier access?
                    for (int i = 0; i < ploidy; ++i) {
                        int l = (j * ploidy) + i;
                        //cout << l << " " << population_alleles.at(l) << " " << alleleIndexes[convert(population_alleles.at(l))] << endl;
                        genotype.push_back(convert(alleleIndexes[convert(*population_alleles.at(l))]));
                    }
                    var.samples[sample]["GT"].push_back(join(genotype, "|"));
                    //cout << var.samples[sample]["GT"].front() << endl;
                }

                // XXX THIS IS BROKEN BECAUSE YOUR REFERENCE ALLELE CHANGES
                // LENGTH WITH DELETIONS.
                //
                // IT'S POSSIBLE TO GET COMPLEX ALLELES AT THE INTERSECTIONS
                // BETWEEN ONE ALLELIC VARIANT AND ANOTHER.  THIS IS BROKEN!
                //
                // TO FIX--- BUILD HAPLOTYPES, THEN DISTRIBUTE THEM WITHIN THE POPULATION
                //
                // now write out our sequence data (FASTA files)
                for (int j = 0; j < population_size; ++j) {
                    for (int i = 0; i < ploidy; ++i) {
                        int l = (j * ploidy) + i;
                        Allele* allele = population_alleles.at(l);
                        if (!dry_run) {
                            sequences.at(l)->write(allele->alt);
                        }
                    }
                }

                // tabulate allele frequency, and write some details to the VCF
                for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a) {

                    Allele& allele = *a;
                    Allele* allelePtr = &*a;

                    vector<string> genotypes;
                    genotypes.resize(population_size);

                    int allele_freq = 0;

                    // count how many chromosome copies in the population carry this
                    // allele (the FASTA sequence data was already written above)
                    for (int j = 0; j < population_size; ++j) {
                        for (int i = 0; i < ploidy; ++i) {
                            int l = (j * ploidy) + i;
                            if (population_alleles.at(l) == allelePtr) {
                                ++allele_freq;
                            }
                        }
                    }

                    // set up the allele-specific INFO fields in the VCF record
                    var.info["AC"].push_back(convert(allele_freq));

                    int delta = allele.alt.size() - allele.ref.size();
                    if (delta == 0) {
                        if (allele.ref.size() == 1) {
                            var.info["TYPE"].push_back("snp");
                            var.info["LEN"].push_back(convert(allele.ref.size()));
                        } else {
                            var.info["TYPE"].push_back("mnp");;
                            var.info["LEN"].push_back(convert(allele.ref.size()));
                        }
                    } else if (delta > 0) {
                        var.info["TYPE"].push_back("ins");;
                        var.info["LEN"].push_back(convert(abs(delta)));
                    } else {
                        var.info["TYPE"].push_back("del");;
                        var.info["LEN"].push_back(convert(abs(delta)));
                    }
                    if (!allele.type.empty()) {
                        var.infoFlags[allele.type] = true;
                    }

                }

                // write the VCF record to stdout
                cout << var << endl;

                int largest_ref = 1; // enforce one pos
                for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a) {
                    if (a->ref.size() > largest_ref) {
                        largest_ref = a->ref.size();
                    }
                }

                pos += largest_ref; // step by the size of the last event
            }
        }
    }

    // close, clean up files
    for (map<string, vector<SampleFastaFile*> >::iterator s = sequencesByRefseq.begin(); s != sequencesByRefseq.end(); ++s) {
        vector<SampleFastaFile*>& files = s->second;
        for (vector<SampleFastaFile*>::iterator f = files.begin(); f != files.end(); ++f) {
            delete *f;
        }
        files.clear();
    }

    return 0;

}