// Asks `view` whether it is a transition: builds an SkEvent from
// gIsTransitionQuery, hands it to the view's doQuery(), and returns
// whatever doQuery() reports.
bool is_transition(SkView* view) {
    SkEvent query(gIsTransitionQuery);
    const bool answered = view->doQuery(&query);
    return answered;
}
Пример #2
0
int main (int argc, char** argv) {

    double snp_mutation_rate = 0.001;
    double indel_mutation_rate = 0.0001;
    double het_rate = 0.5;
    double afs_alpha = 1;
    double indel_alpha = 3;
    double microsatellite_afs_alpha = 1;
    double microsatellite_len_alpha = 1.7;
    double microsatellite_mutation_rate = 0.0001;
    double mnp_ratio = 0.01;
    double tstv_ratio = 2.5;
    double deamination_ratio = 1.8;
    int microsatellite_min_length = 1;
    int indel_max = 1000;
    int ploidy = 1;
    int population_size = 1;
    int sample_id_max_digits = 1;
    int seed = time(NULL);
    string fastaFileName;
    string file_prefix = "";
    string sample_prefix = "";
    bool dry_run = false;
    int repeat_size_max = 20;
    bool uniform_indel_distribution = false;

    double p, lambda, shape, mu, sigma;

    string command_line = argv[0];
    for (int i = 1; i < argc; ++i) {
        command_line += " ";
        command_line += argv[i];
    }

    int c;

    while (true) {
        static struct option long_options[] =
            {
                /* These options set a flag. */
                //{"verbose", no_argument,       &verbose_flag, 1},
                //{"brief",   no_argument,       &verbose_flag, 0},
                {"help", no_argument, 0, 'h'},
                {"snp-rate",  required_argument, 0, 's'},
                {"mnp-ratio", required_argument, 0, 'M'},
                {"indel-rate",  required_argument, 0, 'i'},
                {"indel-alpha", required_argument, 0, 'z'},
                {"indel-max", required_argument, 0, 'X'},
                {"repeat-size-max", required_argument, 0, 'q'},
                {"microsat-rate",  required_argument, 0, 'm'},
                {"microsat-afs-alpha", required_argument, 0, 't'},
                {"microsat-len-alpha", required_argument, 0, 'j'},
                {"microsat-min-len", required_argument, 0, 'l'},
                {"afs-alpha",  required_argument, 0, 'a'},
                {"ploidy", required_argument, 0, 'p'},
                {"population-size", required_argument, 0, 'n'},
                {"file-prefix", required_argument, 0, 'P'},
                {"sample-prefix", required_argument, 0, 'S'},
                {"random-seed", required_argument, 0, 'g'},
                {"dry-run", no_argument, 0, 'd'},
                {"uniform-indels", no_argument, 0, 'U'},
                {"ts-tv-ratio", required_argument, 0, 'T'},
                {"deamination-ratio", required_argument, 0, 'D'},
                {0, 0, 0, 0}
            };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hdUa:z:s:i:q:p:n:M:X:t:m:P:S:g:l:j:T:", long_options, &option_index);

        /* Detect the end of the options. */
        if (c == -1)
            break;
 
        switch (c)
        {
        case 0:
            /* If this option set a flag, do nothing else now. */
            if (long_options[option_index].flag != 0)
                break;
            printf ("option %s", long_options[option_index].name);
            if (optarg)
                printf (" with arg %s", optarg);
            printf ("\n");
            break;

        case 'd':
            dry_run = true;
            break;

        case 'U':
            uniform_indel_distribution = true;
            break;

        case 'q':
            if (!convert(optarg, repeat_size_max)) {
                cerr << "could not read -q, --repeat-size-max" << endl;
                exit(1);
            }
            break;

        case 's':
            if (!convert(optarg, snp_mutation_rate)) {
                cerr << "could not read -s, --snp-rate" << endl;
                exit(1);
            }
            break;

        case 'i':
            if (!convert(optarg, indel_mutation_rate)) {
                cerr << "could not read -i, --indel-rate" << endl;
                exit(1);
            }
            break;

        case 'a':
            if (!convert(optarg, afs_alpha)) {
                cerr << "could not read -a, --afs-alpha" << endl;
                exit(1);
            }
            break;
 
        case 'z':
            if (!convert(optarg, indel_alpha)) {
                cerr << "could not read -z, --indel-alpha" << endl;
                exit(1);
            }
            break;

        case 'X':
            if (!convert(optarg, indel_max)) {
                cerr << "could not read -M, --indel-max" << endl;
                exit(1);
            }
            break;
 
        case 'M':
            if (!convert(optarg, mnp_ratio)) {
                cerr << "could not read -m, --mnp-ratio" << endl;
                exit(1);
            }
            break;
 
        case 'm':
            if (!convert(optarg, microsatellite_mutation_rate)) {
                cerr << "could not read -m, --microsat-rate" << endl;
                exit(1);
            }
            break;

        case 'T':
            if (!convert(optarg, tstv_ratio)) {
                cerr << "could not read -T, --ts-tv-ratio" << endl;
                exit(1);
            }
            break;
 
        case 't':
            if (!convert(optarg, microsatellite_afs_alpha)) {
                cerr << "could not read -m, --microsatellite-afs-alpha" << endl;
                exit(1);
            }
            break;
 
        case 'j':
            if (!convert(optarg, microsatellite_len_alpha)) {
                cerr << "could not read -m, --microsatellite-len-alpha" << endl;
                exit(1);
            }
            break;
 
        case 'l':
            if (!convert(optarg, microsatellite_min_length)) {
                cerr << "could not read -l, --microsat-min-len" << endl;
                exit(1);
            }
            break;
 
        case 'p':
            if (!convert(optarg, ploidy)) {
                cerr << "could not read -p, --ploidy" << endl;
                exit(1);
            }
            break;

        case 'P':
            file_prefix = optarg;
            break;

        case 'S':
            sample_prefix = optarg;
            break;
 
        case 'n':
            if (!convert(optarg, population_size)) {
                cerr << "could not read -n, --population-size" << endl;
                exit(1);
            }
            sample_id_max_digits = strlen(optarg);
            break;

        case 'g':
            if (!convert(optarg, seed)) {
                cerr << "could not read -g, --random-seed" << endl;
                exit(1);
            }
            break;

        case 'h':
            printSummary();
            exit(0);
            break;
 
        case '?':
            /* getopt_long already printed an error message. */
            printSummary();
            exit(1);
            break;
 
        default:
            abort ();
        }
    }

    /* Print any remaining command line arguments (not options). */
    if (optind < argc) {
        //cerr << "fasta file: " << argv[optind] << endl;
        fastaFileName = argv[optind];
    } else {
        cerr << "please specify a fasta file" << endl;
        printSummary();
        exit(1);
    }

    init_genrand(seed); // seed mt with current time

    //mt19937 eng(seed);

    int bpPerHaplotypeMean = 1000;
    double bpPerHaplotypeSigma = 200;
    normal_distribution<double> normal(mu, sigma);
     
    //lambda = 7.0;
    //poisson_distribution<int> poisson(lambda);
    //poisson(eng);

    string seqname;
    string sequence;  // holds sequence so we can process it

    FastaReference fr;
    fr.open(fastaFileName);

    string bases = "ATGC";

    vcf::VariantCallFile vcfFile;

    // write the VCF header
    stringstream headerss;
    headerss 
        << "##fileformat=VCFv4.1" << endl
        << "##fileDate=" << dateStr() << endl
        << "##source=mutatrix population genome simulator" << endl
        << "##seed=" << seed << endl
        << "##reference=" << fastaFileName << endl
        << "##phasing=true" << endl
        << "##commandline=" << command_line << endl
        << "##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Alternate allele count\">" << endl
        << "##INFO=<ID=TYPE,Number=A,Type=String,Description=\"Type of each allele (snp, ins, del, mnp, complex)\">" << endl
        << "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples at the site\">" << endl
        << "##INFO=<ID=NA,Number=1,Type=Integer,Description=\"Number of alternate alleles\">" << endl
        << "##INFO=<ID=LEN,Number=A,Type=Integer,Description=\"Length of each alternate allele\">" << endl
        << "##INFO=<ID=MICROSAT,Number=0,Type=Flag,Description=\"Generated at a sequence repeat loci\">" << endl
        << "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">" << endl
        << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT";

    vector<string> samples;
    for (int i = 0; i < population_size; ++i) {
        stringstream sampless;
        sampless << sample_prefix << setfill('0') << setw(sample_id_max_digits) << i + 1; // one-based sample names
        samples.push_back(sampless.str());
        headerss << "\t" << sampless.str();
    }

    // and set up our VCF output file
    string header = headerss.str();
    vcfFile.openForOutput(header);
    cout << vcfFile.header << endl;

    int copies = ploidy * population_size;

    map<string, vector<SampleFastaFile*> > sequencesByRefseq;

    if (!dry_run) {
        for (FastaIndex::iterator s = fr.index->begin(); s != fr.index->end(); ++s) {

            FastaIndexEntry& indexEntry = s->second;
            seqname = indexEntry.name;

            vector<SampleFastaFile*>& sequences = sequencesByRefseq[seqname];
            for (int i = 0; i < population_size; ++i) {
                stringstream sname;
                sname << sample_prefix << setfill('0') << setw(sample_id_max_digits) << i + 1;
                string samplename = sname.str();
                for (int j = 0; j < ploidy; ++j) {
                    stringstream cname;
                    cname << j;
                    string chromname = cname.str();
                    string fullname = samplename + ":" + seqname + ":" + chromname;
                    string filename = file_prefix + fullname + ".fa";
                    //sequences.push_back(SampleFastaFile(filename, seqname));
                    sequences.push_back(new SampleFastaFile(filename, seqname));
                }
            }
        }
    }



    for (FastaIndex::iterator s = fr.index->begin(); s != fr.index->end(); ++s) {

        FastaIndexEntry& indexEntry = s->second;
        seqname = indexEntry.name;
        sequence = fr.getSequence(s->first);

        vector<SampleFastaFile*>& sequences = sequencesByRefseq[seqname];
        //sequences.resize(copies);
        
        long int pos = 0;
        long int microsatellite_end_pos = 0;
        while (pos < sequence.size()) {

            //cout << pos + 1 << " microsat end pos " << microsatellite_end_pos << endl;

            string ref = sequence.substr(pos, 1); // by default, ref is just the current base

            // skip non-DNA sequence information
            if (!(ref == "A" || ref == "T" || ref == "C" || ref == "G")) {
                pos += ref.size();
                for (vector<SampleFastaFile*>::iterator s = sequences.begin(); s != sequences.end(); ++s) {
                    (*s)->write(ref);
                }
                continue;
            }

            vector<Allele> alleles;

            // establish if we are in a repeat
            // and what motif is being repeated, how many times

            int len = 1;

            // get reference repeats
            // if we have a repeat, adjust the mutation rate
            // using length and direction-dependent
            // formula from "Likelihood-Based Estimation of Microsatellite Mutation Rates"
            // http://www.genetics.org/cgi/content/full/164/2/781#T1

            if (pos > microsatellite_end_pos) {

                map<string, int> repeats = repeatCounts(pos + 1, (const string&) sequence, repeat_size_max);

                string seq;
                int repeat_count = 0;
                // get the "biggest" repeat, the most likely ms allele at this site
                for (map<string, int>::iterator r = repeats.begin(); r != repeats.end(); ++r) {
                    if (repeat_count < r->second) {
                        repeat_count = r->second;
                        seq = r->first;
                    }
                }
                //cout << pos + 1 << " " << sequence.substr(pos + 1, seq.size() * repeat_count) << " ?= " << seq * repeat_count << endl;

                // guard ensures that we are in a pure repeat situoation, tandem-tandem repeats are not handled presently
                if (repeats.size() > 0 && sequence.substr(pos + 1, seq.size() * repeat_count) == seq * repeat_count) {

                    int microsatellite_length = repeat_count * seq.size();

                    // record end of microsatellite so we don't generate more mutations until we pass it
                    microsatellite_end_pos = pos + microsatellite_length - 1;

                    if (microsatellite_length > microsatellite_min_length
                        //&& genrand_real1() / copies 
                        //    < microsatellite_mutation_rate * repeat_count) {
                        && genrand_real1() > pow(1 - (microsatellite_mutation_rate * repeat_count), log(copies) * 2)) {

                        // establish the relative rate of ins and del events
                        /*
                          long double repeatMutationDelProbability = microsatelliteDelProb(repeat_count);
                          long double repeatMutationInsProbability = microsatelliteInsProb(repeat_count);
                          long double indel_balance = 1;
                          if (repeatMutationInsProbability > repeatMutationDelProbability) {
                          indel_balance = repeatMutationInsProbability / repeatMutationDelProbability;
                          } else {
                          indel_balance = 1 - (repeatMutationInsProbability / repeatMutationDelProbability);
                          }
                        */
                        double indel_balance = 0.5;

                        // how many alleles at the site?

                        //int numalleles = min((int) floor(zetarandom(microsatellite_afs_alpha)), (int) ((double) repeat_count * indel_balance));
                        int numalleles = random_allele_frequency(repeat_count, microsatellite_afs_alpha);
                        //cout << "repeat_count: " << repeat_count << " numalleles: " << numalleles << endl;

                        map<int, bool> allele_lengths;
                        // lengths of the alleles
                        while (allele_lengths.size() < numalleles) {
                            int allele_length;
                            // TODO adjust length so that shorter events are more likely...
                            if (genrand_real1() > indel_balance) {
                                allele_length = -1 * min((int) floor(zetarandom(microsatellite_len_alpha)), repeat_count);
                            } else {
                                allele_length = min((int) floor(zetarandom(microsatellite_len_alpha)), repeat_count);
                            }
                            //cout << allele_length << endl;
                            map<int, bool>::iterator f = allele_lengths.find(allele_length);
                            if (f == allele_lengths.end()) {
                                allele_lengths[allele_length] = true;
                            }
                        }

                        // generate alleles
                        for (map<int, bool>::iterator f = allele_lengths.begin();
                             f != allele_lengths.end(); ++f) {

                            int allele_length = f->first;
                            int c = abs(f->first);
                            string alt = seq;

                            for (int i = 1; i < c; ++i)
                                alt += seq;

                            if (allele_length > 0) {
                                alleles.push_back(Allele(ref, ref + alt, "MICROSAT"));
                            } else {
                                alleles.push_back(Allele(ref + alt, ref, "MICROSAT"));
                            }
                            //cout << pos + 1 << " "  << microsatellite_length << " " << alleles.back() << endl;
                        }
                        //cout << "alleles.size() == " << alleles.size() << endl;
                    }
                }
            }

            // snp case
            if (genrand_real1() > pow(1 - snp_mutation_rate, log(max(copies, 2)) * 2)) {

                // make an alternate allele
                /*
                  string alt = ref;
                  while (alt == ref) {
                  alt = string(1, bases.at(genrand_int32() % 4));
                  }
                */
                string alt = ref;
                if (genrand_real1() > 1 / (1 + tstv_ratio)) {
                    if (ref == "A") {
                        alt = "G";
                    } else if (ref == "G") {
                        alt = "A";
                    } else if (ref == "C") {
                        alt = "T";
                    } else if (ref == "T") {
                        alt = "C";
                    }
                } else {
                    while (alt == ref || isTransition(ref, alt)) {
                        alt = string(1, bases.at(genrand_int32() % 4));
                    }
                }

                if (genrand_real1() < mnp_ratio) {
                    int i = 1;
                    do {
                        ref += sequence.substr(pos + i, 1);
                        alt += sequence.substr(pos + i, 1);
                        ++i;
                        while (alt.at(alt.size() - 1) == ref.at(ref.size() - 1)) {
                            alt.at(alt.size() - 1) = bases.at(genrand_int32() % 4);
                        }
                    } while (genrand_real1() < mnp_ratio);
                    len = alt.size();
                }
                alleles.push_back(Allele(ref, alt));
            }

            // indel case
            if (genrand_real1() > pow(1 - indel_mutation_rate, log(max(copies, 2)) * 2)) {
                // how many bp?
                if (uniform_indel_distribution) {
                    len = (int) floor(genrand_real1() * indel_max);
                } else {
                    len = (int) floor(zetarandom(indel_alpha));
                }
                // guard against out-of-sequence indels
                if (pos + len < sequence.size() && len <= indel_max) {
                    if (genrand_int32() % 2 == 0) {
                        // deletion
                        alleles.push_back(Allele(sequence.substr(pos, 1 + len), sequence.substr(pos, 1)));
                    } else {
                        string alt = ref;
                        // insertion?
                        // insert some random de novo bases
                        while (alt.length() < len + 1) {
                            alt += string(1, bases.at(genrand_int32() % 4));
                        }
                        alleles.push_back(Allele(ref, alt));
                    }
                } else {
                    // fall through
                }
            }

            // no mutation generated
            if (alleles.empty()) {
                for (int i = 0; i < copies; ++i) {
                    if (!dry_run) {
                        sequences.at(i)->write(ref);
                    }
                }
                pos += ref.size();
            } else {

                // TODO randomly distribute all the alleles throughout the population
                // generate allele frequencies for each
                // fun times...

                string genotype;

                vector<bool> alts;
                random_shuffle(alleles.begin(), alleles.end());

                vector<Allele*> population_alleles;
                list<Allele> present_alleles; // filtered for AFS > 0 in the sample
                
                // AFS simulation
                int remaining_copies = copies;
                while (remaining_copies > 0 && !alleles.empty()) {
                    Allele allele = alleles.back();
                    alleles.pop_back();
                    int allele_freq = random_allele_frequency(remaining_copies, afs_alpha);
                    if (allele_freq > 0) {
                        present_alleles.push_back(allele);
                        Allele* allelePtr = &present_alleles.back();
                        for (int i = 0; i < allele_freq; ++i) {
                            population_alleles.push_back(allelePtr);
                        }
                        remaining_copies -= allele_freq;
                    }
                }

                if (present_alleles.empty()) {
                    for (int i = 0; i < copies; ++i) {
                        if (!dry_run) {
                            sequences.at(i)->write(ref);
                        }
                    }
                    pos += ref.size();
                    continue;
                }

                reverse(present_alleles.begin(), present_alleles.end());

                // establish the correct reference sequence and alternate allele set
                for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a) {
                    Allele& allele = *a;
                    //cout << allele << endl;
                    if (allele.ref.size() > ref.size()) {
                        ref = allele.ref;
                    }
                }

                // reference alleles take up the rest
                Allele reference_allele = Allele(ref, ref);
                for (int i = 0; i < remaining_copies; ++i) {
                    population_alleles.push_back(&reference_allele);
                }

                vector<string> altstrs;
                // now the reference allele is the largest possible, adjust the alt allele strings to reflect this
                // if we have indels, add the base before, set the position back one
                for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a) {
                    Allele& allele = *a;
                    string alleleStr = ref;
                    if (allele.ref.size() == allele.alt.size()) {
                        alleleStr.replace(0, allele.alt.size(), allele.alt);
                    } else {
                        alleleStr.replace(0, allele.ref.size(), allele.alt);
                    }
                    allele.ref = ref;
                    allele.alt = alleleStr;
                    altstrs.push_back(alleleStr);
                }

                assert(population_alleles.size() == copies);

                // shuffle the alleles around the population
                random_shuffle(population_alleles.begin(), population_alleles.end());

                vcf::Variant var(vcfFile);
                var.sequenceName = seqname;
                var.position = pos + 1;
                var.quality = 99;
                var.id = ".";
                var.filter = ".";
                var.info["NS"].push_back(convert(population_size));
                var.info["NA"].push_back(convert(present_alleles.size()));
                var.format.push_back("GT");
                var.ref = ref;
                var.alt = altstrs;

                // debugging, uncomment to see sequence context
                //cout << sequence.substr(pos - 10, 10) << "*" << ref << "*" << sequence.substr(pos + 1, 9) << endl;

                map<string, int> alleleIndexes;
                alleleIndexes[convert(reference_allele)] = 0; // XXX should we handle this differently, by adding the reference allele to present_alleles?
                int i = 1;
                for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a, ++i) {
                    Allele& allele = *a;
                    //cout << allele << " " << i << endl;
                    alleleIndexes[convert(allele)] = i;
                    //cout << allele << " " << i << endl;
                }

                //for (map<string, int>::iterator a = alleleIndexes.begin(); a != alleleIndexes.end(); ++a) {
                //    cout << a->first << " = " << a->second << endl;
                //}

                int j = 0;
                for (vector<string>::iterator s = samples.begin(); s != samples.end(); ++s, ++j) {
                    string& sample = *s;
                    vector<string> genotype;
                    // XXX hack, maybe this should get stored in another map for easier access?
                    for (int i = 0; i < ploidy; ++i) {
                        int l = (j * ploidy) + i;
                        //cout << l << " " << population_alleles.at(l) << " " << alleleIndexes[convert(population_alleles.at(l))] << endl;
                        genotype.push_back(convert(alleleIndexes[convert(*population_alleles.at(l))]));
                    }
                    var.samples[sample]["GT"].push_back(join(genotype, "|"));
                    //cout << var.samples[sample]["GT"].front() << endl;
                }

                // XXX THIS IS BROKEN BECAUSE YOUR REFERENCE ALLELE CHANGES
                // LENGTH WITH DELETIONS.
                //
                // IT'S POSSIBLE TO GET COMPLEX ALLELES AT THE INTERSECTIONS
                // BETWEEN ONE ALLELIC VARIANT AND ANOTHER.  THIS IS BROKEN!
                //
                // TO FIX--- BUILD HAPLOTYPES, THEN DISTRIBUTE THEM WITHIN THE POPULATION
                //
                // now write out our sequence data (FASTA files)
                for (int j = 0; j < population_size; ++j) {
                    for (int i = 0; i < ploidy; ++i) {
                        int l = (j * ploidy) + i;
                        Allele* allele = population_alleles.at(l);
                        if (!dry_run) {
                            sequences.at(l)->write(allele->alt);
                        }
                    }
                }

                // tabulate allele frequency, and write some details to the VCF
                for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a) {

                    Allele& allele = *a;
                    Allele* allelePtr = &*a;

                    vector<string> genotypes;
                    genotypes.resize(population_size);

                    int allele_freq = 0;

                    // obtain allele frequencies and output FASTA sequence data
                    // for each simulated sample
                    for (int j = 0; j < population_size; ++j) {
                        for (int i = 0; i < ploidy; ++i) {
                            int l = (j * ploidy) + i;
                            if (population_alleles.at(l) == allelePtr) {
                                ++allele_freq;
                            }
                        }
                    }

                    // set up the allele-specific INFO fields in the VCF record
                    var.info["AC"].push_back(convert(allele_freq));

                    int delta = allele.alt.size() - allele.ref.size();
                    if (delta == 0) {
                        if (allele.ref.size() == 1) {
                            var.info["TYPE"].push_back("snp");
                            var.info["LEN"].push_back(convert(allele.ref.size()));
                        } else {
                            var.info["TYPE"].push_back("mnp");;
                            var.info["LEN"].push_back(convert(allele.ref.size()));
                        }
                    } else if (delta > 0) {
                        var.info["TYPE"].push_back("ins");;
                        var.info["LEN"].push_back(convert(abs(delta)));
                    } else {
                        var.info["TYPE"].push_back("del");;
                        var.info["LEN"].push_back(convert(abs(delta)));
                    }
                    if (!allele.type.empty()) {
                        var.infoFlags[allele.type] = true;
                    }

                }

                // write the VCF record to stdout
                cout << var << endl;

                int largest_ref = 1; // enforce one pos
                for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a) {
                    if (a->ref.size() > largest_ref) {
                        largest_ref = a->ref.size();
                    }
                }

                pos += largest_ref; // step by the size of the last event
            }
        }
    }

    // close, clean up files
    for (map<string, vector<SampleFastaFile*> >::iterator s = sequencesByRefseq.begin(); s != sequencesByRefseq.end(); ++s) {
        vector<SampleFastaFile*>& files = s->second;
        for (vector<SampleFastaFile*>::iterator f = files.begin(); f != files.end(); ++f) {
            delete *f;
        }
        files.clear();
    }

    return 0;

}