/** * Classifies variants. */ int32_t VariantManip::classify_variant(bcf_hdr_t *h, bcf1_t *v, Variant& var) { bcf_unpack(v, BCF_UN_STR); const char* chrom = bcf_get_chrom(h, v); uint32_t pos1 = bcf_get_pos1(v); char** allele = bcf_get_allele(v); int32_t n_allele = bcf_get_n_allele(v); int32_t pos0 = pos1-1; var.ts = 0; var.tv = 0; var.ins = 0; var.del = 0; var.clear(); // this sets the type to VT_REF by default. bool homogeneous_length = true; char* ref = allele[0]; int32_t rlen = strlen(ref); //if only ref allele, skip this entire for loop for (size_t i=1; i<n_allele; ++i) { int32_t type = VT_REF; //check for tags if (strchr(allele[i],'<')) { size_t len = strlen(allele[i]); if (len>=5) { //VN/d+ if (allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][len-1]=='>' ) { for (size_t j=3; j<len-1; ++j) { if (allele[i][j]<'0' || allele[i][j]>'9') { type = VT_VNTR; } } } //VNTR else if (len==6 && allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][3]=='T' && allele[i][4]=='R' && allele[i][5]=='>' ) { type = VT_VNTR; } //ST/d+ else if (allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][len-1]=='>' ) { for (size_t j=3; j<len-1; ++j) { if (allele[i][j]<'0' || allele[i][j]>'9') { type = VT_VNTR; } } } //STR else if (len==5 && allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][3]=='R' && allele[i][4]=='>' ) { type = VT_VNTR; } } if (type==VT_VNTR) { type = VT_VNTR; var.type |= type; var.alleles.push_back(Allele(type)); } else { type = VT_SV; var.type |= type; std::string sv_type(allele[i]); var.alleles.push_back(Allele(type, sv_type)); } } else if (allele[i][0]=='.' 
|| strcmp(allele[i],allele[0])==0) { type = VT_REF; } else { kstring_t REF = {0,0,0}; kstring_t ALT = {0,0,0}; ref = allele[0]; char* alt = allele[i]; int32_t alen = strlen(alt); if (rlen!=alen) { homogeneous_length = false; } //trimming //this is required in particular for the //characterization of multiallelics and //in general, any unnormalized variant int32_t rl = rlen; int32_t al = alen; //trim right while (rl!=1 && al!=1) { if (ref[rl-1]==alt[al-1]) { --rl; --al; } else { break; } } //trim left while (rl !=1 && al!=1) { if (ref[0]==alt[0]) { ++ref; ++alt; --rl; --al; } else { break; } } kputsn(ref, rl, &REF); kputsn(alt, al, &ALT); ref = REF.s; alt = ALT.s; int32_t mlen = std::min(rl, al); int32_t dlen = al-rl; int32_t diff = 0; int32_t ts = 0; int32_t tv = 0; if (mlen==1 && dlen) { char ls, le, ss; if (rl>al) { ls = ref[0]; le = ref[rl-1]; ss = alt[0]; } else { ls = alt[0]; le = alt[al-1]; ss = ref[0]; } if (ls!=ss && le!=ss) { ++diff; if ((ls=='G' && ss=='A') || (ls=='A' && ss=='G') || (ls=='C' && ss=='T') || (ls=='T' && ss=='C')) { ++ts; } else { ++tv; } } } else { for (int32_t j=0; j<mlen; ++j) { if (ref[j]!=alt[j]) { ++diff; if ((ref[j]=='G' && alt[j]=='A') || (ref[j]=='A' && alt[j]=='G') || (ref[j]=='C' && alt[j]=='T') || (ref[j]=='T' && alt[j]=='C')) { ++ts; } else { ++tv; } } } } //substitution variants if (mlen==diff) { type |= mlen==1 ? VT_SNP : VT_MNP; } //indel variants if (dlen) { type |= VT_INDEL; } //clumped SNPs and MNPs if (diff && diff < mlen) //internal gaps { type |= VT_CLUMPED; } var.type |= type; var.alleles.push_back(Allele(type, diff, alen, dlen, mlen, ts, tv)); var.ts += ts; var.tv += tv; var.ins = dlen>0?1:0; var.del = dlen<0?1:0; if (REF.m) free(REF.s); if (ALT.m) free(ALT.s); } } if (var.type==VT_VNTR) { bcf_unpack(v, BCF_UN_INFO); //populate motif, motif len etc. etc. 
// char* str = NULL; // int32_t n = 0; // int32_t ret = bcf_get_info_string(h, v, "MOTIF", &str, &n); // if (ret>0) // { // var.motif = std::string(str); // var.mlen = var.motif.size(); // } // ret = bcf_get_info_string(h, v, "RU", &str, &n); // if (ret>0) // { // var.ru = std::string(str); // var.mlen = var.ru.size(); // } // if (n) free(str); // // int32_t* no = NULL; // n = 0; // ret = bcf_get_info_int32(h, v, "RL", &no, &n); // if (ret>0) var.rlen = *no; // if (n) free(no); // // int32_t* fl = NULL; // n = 0; // ret = bcf_get_info_int32(h, v, "REF", &fl, &n); // if (ret>0) var.rcn = *fl; // if (n) free(fl); } //additionally define MNPs by length of all alleles if (!(var.type&(VT_VNTR|VT_SV)) && var.type!=VT_REF) { if (homogeneous_length && rlen>1 && n_allele>1) { var.type |= VT_MNP; } } return var.type; }
/**
 * Builds an Allele from its individual fields.
 *
 * Note that the parameter order here (cigar before reflen) differs from the
 * order in which the values are handed to the Allele constructor
 * (reflen before cigar).
 */
Allele genotypeAllele(AlleleType type,
        string alt,
        unsigned int len,
        string cigar,
        unsigned int reflen,
        long int pos,
        long int rrbound) {
    Allele genotype_allele(type, alt, len, reflen, cigar, pos, rrbound);
    return genotype_allele;
}
/** * Classifies variants. */ int32_t Variant::classify(bcf_hdr_t *h, bcf1_t *v) { clear(); this->h = h; this->v = v; bcf_unpack(v, BCF_UN_STR); chrom.assign(bcf_get_chrom(h, v)); rid = bcf_get_rid(v); pos1 = bcf_get_pos1(v); end1 = bcf_get_end1(v); char** allele = bcf_get_allele(v); int32_t n_allele = bcf_get_n_allele(v); int32_t pos0 = pos1-1; bool homogeneous_length = true; char* ref = allele[0]; int32_t rlen = strlen(ref); if (strchr(ref, 'N')) { contains_N = true; } //if only ref allele, skip this entire for loop for (size_t i=1; i<n_allele; ++i) { int32_t allele_type = VT_REF; //check for symbolic alternative alleles if (strchr(allele[i],'<')) { size_t len = strlen(allele[i]); if (len>=5) { //VN/d+ if (allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][len-1]=='>' ) { for (size_t j=3; j<len-1; ++j) { if (allele[i][j]<'0' || allele[i][j]>'9') { allele_type = VT_VNTR; } } } //VNTR else if (len==6 && allele[i][0]=='<' && allele[i][1]=='V' && allele[i][2]=='N' && allele[i][3]=='T' && allele[i][4]=='R' && allele[i][5]=='>' ) { allele_type = VT_VNTR; } //STR else if (len==5 && allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][3]=='R' && allele[i][4]=='>' ) { allele_type = VT_VNTR; } //ST/d+ else if (allele[i][0]=='<' && allele[i][1]=='S' && allele[i][2]=='T' && allele[i][len-1]=='>' ) { type = VT_VNTR; for (size_t j=3; j<len-1; ++j) { if ((allele[i][j]<'0' || allele[i][j]>'9') && allele[i][j]!='.') { type = VT_SV; } } } } if (allele_type==VT_VNTR) { allele_type = VT_VNTR; type |= allele_type; alleles.push_back(Allele(allele_type)); } else { allele_type = VT_SV; type |= allele_type; std::string sv_type(allele[i]); alleles.push_back(Allele(allele_type, sv_type)); } } //checks for chromosomal breakpoints else if (strchr(allele[i],'[')||strchr(allele[i],']')) { allele_type = VT_SV; type |= allele_type; std::string sv_type("<BND>"); alleles.push_back(Allele(allele_type, sv_type)); } //non variant record else if 
(allele[i][0]=='.' || strcmp(allele[i],allele[0])==0) { type = VT_REF; } //explicit sequence of bases else { kstring_t REF = {0,0,0}; kstring_t ALT = {0,0,0}; ref = allele[0]; char* alt = allele[i]; int32_t alen = strlen(alt); if (strchr(alt, 'N')) { contains_N = true; } if (rlen!=alen) { homogeneous_length = false; } //trimming //this is required in particular for the //characterization of multiallelics and //in general, any unnormalized variant int32_t rl = rlen; int32_t al = alen; //trim right while (rl!=1 && al!=1) { if (ref[rl-1]==alt[al-1]) { --rl; --al; } else { break; } } //trim left while (rl !=1 && al!=1) { if (ref[0]==alt[0]) { ++ref; ++alt; --rl; --al; } else { break; } } kputsn(ref, rl, &REF); kputsn(alt, al, &ALT); ref = REF.s; alt = ALT.s; int32_t mlen = std::min(rl, al); int32_t dlen = al-rl; int32_t diff = 0; int32_t ts = 0; int32_t tv = 0; if (mlen==1 && dlen) { char ls, le, ss; if (rl>al) { ls = ref[0]; le = ref[rl-1]; ss = alt[0]; } else { ls = alt[0]; le = alt[al-1]; ss = ref[0]; } if (ls!=ss && le!=ss) { ++diff; if ((ls=='G' && ss=='A') || (ls=='A' && ss=='G') || (ls=='C' && ss=='T') || (ls=='T' && ss=='C')) { ++ts; } else { ++tv; } } } else { for (int32_t j=0; j<mlen; ++j) { if (ref[j]!=alt[j]) { ++diff; if ((ref[j]=='G' && alt[j]=='A') || (ref[j]=='A' && alt[j]=='G') || (ref[j]=='C' && alt[j]=='T') || (ref[j]=='T' && alt[j]=='C')) { ++ts; } else { ++tv; } } } } //substitution variants if (mlen==diff) { allele_type |= mlen==1 ? 
VT_SNP : VT_MNP; } //indel variants if (dlen) { allele_type |= VT_INDEL; } //clumped SNPs and MNPs if (diff && diff < mlen) //internal gaps { allele_type |= VT_CLUMPED; } type |= allele_type; alleles.push_back(Allele(type, diff, alen, dlen, mlen, ts, tv)); ts += ts; tv += tv; ins = dlen>0?1:0; del = dlen<0?1:0; if (REF.m) free(REF.s); if (ALT.m) free(ALT.s); } } if (type==VT_VNTR) { update_vntr_from_info_fields(h, v); } //additionally define MNPs by length of all alleles if (!(type&(VT_VNTR|VT_SV)) && type!=VT_REF) { if (homogeneous_length && rlen>1 && n_allele>1) { type |= VT_MNP; } } return type; }
/**
 * Builds a new Allele from the fields of an existing one: type, alternate
 * sequence, length, reference length, cigar, position and repeat right
 * boundary are read from `a` and passed to the Allele constructor.
 */
Allele genotypeAllele(Allele &a) {
    Allele genotype_allele(a.type,
            a.alternateSequence,
            a.length,
            a.referenceLength,
            a.cigar,
            a.position,
            a.repeatRightBoundary);
    return genotype_allele;
}
int main (int argc, char** argv) { double snp_mutation_rate = 0.001; double indel_mutation_rate = 0.0001; double het_rate = 0.5; double afs_alpha = 1; double indel_alpha = 3; double microsatellite_afs_alpha = 1; double microsatellite_len_alpha = 1.7; double microsatellite_mutation_rate = 0.0001; double mnp_ratio = 0.01; double tstv_ratio = 2.5; double deamination_ratio = 1.8; int microsatellite_min_length = 1; int indel_max = 1000; int ploidy = 1; int population_size = 1; int sample_id_max_digits = 1; int seed = time(NULL); string fastaFileName; string file_prefix = ""; string sample_prefix = ""; bool dry_run = false; int repeat_size_max = 20; bool uniform_indel_distribution = false; double p, lambda, shape, mu, sigma; string command_line = argv[0]; for (int i = 1; i < argc; ++i) { command_line += " "; command_line += argv[i]; } int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, //{"brief", no_argument, &verbose_flag, 0}, {"help", no_argument, 0, 'h'}, {"snp-rate", required_argument, 0, 's'}, {"mnp-ratio", required_argument, 0, 'M'}, {"indel-rate", required_argument, 0, 'i'}, {"indel-alpha", required_argument, 0, 'z'}, {"indel-max", required_argument, 0, 'X'}, {"repeat-size-max", required_argument, 0, 'q'}, {"microsat-rate", required_argument, 0, 'm'}, {"microsat-afs-alpha", required_argument, 0, 't'}, {"microsat-len-alpha", required_argument, 0, 'j'}, {"microsat-min-len", required_argument, 0, 'l'}, {"afs-alpha", required_argument, 0, 'a'}, {"ploidy", required_argument, 0, 'p'}, {"population-size", required_argument, 0, 'n'}, {"file-prefix", required_argument, 0, 'P'}, {"sample-prefix", required_argument, 0, 'S'}, {"random-seed", required_argument, 0, 'g'}, {"dry-run", no_argument, 0, 'd'}, {"uniform-indels", no_argument, 0, 'U'}, {"ts-tv-ratio", required_argument, 0, 'T'}, {"deamination-ratio", required_argument, 0, 'D'}, {0, 0, 0, 0} }; /* getopt_long stores the option index 
here. */ int option_index = 0; c = getopt_long (argc, argv, "hdUa:z:s:i:q:p:n:M:X:t:m:P:S:g:l:j:T:", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) break; switch (c) { case 0: /* If this option set a flag, do nothing else now. */ if (long_options[option_index].flag != 0) break; printf ("option %s", long_options[option_index].name); if (optarg) printf (" with arg %s", optarg); printf ("\n"); break; case 'd': dry_run = true; break; case 'U': uniform_indel_distribution = true; break; case 'q': if (!convert(optarg, repeat_size_max)) { cerr << "could not read -q, --repeat-size-max" << endl; exit(1); } break; case 's': if (!convert(optarg, snp_mutation_rate)) { cerr << "could not read -s, --snp-rate" << endl; exit(1); } break; case 'i': if (!convert(optarg, indel_mutation_rate)) { cerr << "could not read -i, --indel-rate" << endl; exit(1); } break; case 'a': if (!convert(optarg, afs_alpha)) { cerr << "could not read -a, --afs-alpha" << endl; exit(1); } break; case 'z': if (!convert(optarg, indel_alpha)) { cerr << "could not read -z, --indel-alpha" << endl; exit(1); } break; case 'X': if (!convert(optarg, indel_max)) { cerr << "could not read -M, --indel-max" << endl; exit(1); } break; case 'M': if (!convert(optarg, mnp_ratio)) { cerr << "could not read -m, --mnp-ratio" << endl; exit(1); } break; case 'm': if (!convert(optarg, microsatellite_mutation_rate)) { cerr << "could not read -m, --microsat-rate" << endl; exit(1); } break; case 'T': if (!convert(optarg, tstv_ratio)) { cerr << "could not read -T, --ts-tv-ratio" << endl; exit(1); } break; case 't': if (!convert(optarg, microsatellite_afs_alpha)) { cerr << "could not read -m, --microsatellite-afs-alpha" << endl; exit(1); } break; case 'j': if (!convert(optarg, microsatellite_len_alpha)) { cerr << "could not read -m, --microsatellite-len-alpha" << endl; exit(1); } break; case 'l': if (!convert(optarg, microsatellite_min_length)) { cerr << "could not read -l, --microsat-min-len" << endl; 
exit(1); } break; case 'p': if (!convert(optarg, ploidy)) { cerr << "could not read -p, --ploidy" << endl; exit(1); } break; case 'P': file_prefix = optarg; break; case 'S': sample_prefix = optarg; break; case 'n': if (!convert(optarg, population_size)) { cerr << "could not read -n, --population-size" << endl; exit(1); } sample_id_max_digits = strlen(optarg); break; case 'g': if (!convert(optarg, seed)) { cerr << "could not read -g, --random-seed" << endl; exit(1); } break; case 'h': printSummary(); exit(0); break; case '?': /* getopt_long already printed an error message. */ printSummary(); exit(1); break; default: abort (); } } /* Print any remaining command line arguments (not options). */ if (optind < argc) { //cerr << "fasta file: " << argv[optind] << endl; fastaFileName = argv[optind]; } else { cerr << "please specify a fasta file" << endl; printSummary(); exit(1); } init_genrand(seed); // seed mt with current time //mt19937 eng(seed); int bpPerHaplotypeMean = 1000; double bpPerHaplotypeSigma = 200; normal_distribution<double> normal(mu, sigma); //lambda = 7.0; //poisson_distribution<int> poisson(lambda); //poisson(eng); string seqname; string sequence; // holds sequence so we can process it FastaReference fr; fr.open(fastaFileName); string bases = "ATGC"; vcf::VariantCallFile vcfFile; // write the VCF header stringstream headerss; headerss << "##fileformat=VCFv4.1" << endl << "##fileDate=" << dateStr() << endl << "##source=mutatrix population genome simulator" << endl << "##seed=" << seed << endl << "##reference=" << fastaFileName << endl << "##phasing=true" << endl << "##commandline=" << command_line << endl << "##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Alternate allele count\">" << endl << "##INFO=<ID=TYPE,Number=A,Type=String,Description=\"Type of each allele (snp, ins, del, mnp, complex)\">" << endl << "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples at the site\">" << endl << 
"##INFO=<ID=NA,Number=1,Type=Integer,Description=\"Number of alternate alleles\">" << endl << "##INFO=<ID=LEN,Number=A,Type=Integer,Description=\"Length of each alternate allele\">" << endl << "##INFO=<ID=MICROSAT,Number=0,Type=Flag,Description=\"Generated at a sequence repeat loci\">" << endl << "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">" << endl << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"; vector<string> samples; for (int i = 0; i < population_size; ++i) { stringstream sampless; sampless << sample_prefix << setfill('0') << setw(sample_id_max_digits) << i + 1; // one-based sample names samples.push_back(sampless.str()); headerss << "\t" << sampless.str(); } // and set up our VCF output file string header = headerss.str(); vcfFile.openForOutput(header); cout << vcfFile.header << endl; int copies = ploidy * population_size; map<string, vector<SampleFastaFile*> > sequencesByRefseq; if (!dry_run) { for (FastaIndex::iterator s = fr.index->begin(); s != fr.index->end(); ++s) { FastaIndexEntry& indexEntry = s->second; seqname = indexEntry.name; vector<SampleFastaFile*>& sequences = sequencesByRefseq[seqname]; for (int i = 0; i < population_size; ++i) { stringstream sname; sname << sample_prefix << setfill('0') << setw(sample_id_max_digits) << i + 1; string samplename = sname.str(); for (int j = 0; j < ploidy; ++j) { stringstream cname; cname << j; string chromname = cname.str(); string fullname = samplename + ":" + seqname + ":" + chromname; string filename = file_prefix + fullname + ".fa"; //sequences.push_back(SampleFastaFile(filename, seqname)); sequences.push_back(new SampleFastaFile(filename, seqname)); } } } } for (FastaIndex::iterator s = fr.index->begin(); s != fr.index->end(); ++s) { FastaIndexEntry& indexEntry = s->second; seqname = indexEntry.name; sequence = fr.getSequence(s->first); vector<SampleFastaFile*>& sequences = sequencesByRefseq[seqname]; //sequences.resize(copies); long int pos = 0; long int 
microsatellite_end_pos = 0; while (pos < sequence.size()) { //cout << pos + 1 << " microsat end pos " << microsatellite_end_pos << endl; string ref = sequence.substr(pos, 1); // by default, ref is just the current base // skip non-DNA sequence information if (!(ref == "A" || ref == "T" || ref == "C" || ref == "G")) { pos += ref.size(); for (vector<SampleFastaFile*>::iterator s = sequences.begin(); s != sequences.end(); ++s) { (*s)->write(ref); } continue; } vector<Allele> alleles; // establish if we are in a repeat // and what motif is being repeated, how many times int len = 1; // get reference repeats // if we have a repeat, adjust the mutation rate // using length and direction-dependent // formula from "Likelihood-Based Estimation of Microsatellite Mutation Rates" // http://www.genetics.org/cgi/content/full/164/2/781#T1 if (pos > microsatellite_end_pos) { map<string, int> repeats = repeatCounts(pos + 1, (const string&) sequence, repeat_size_max); string seq; int repeat_count = 0; // get the "biggest" repeat, the most likely ms allele at this site for (map<string, int>::iterator r = repeats.begin(); r != repeats.end(); ++r) { if (repeat_count < r->second) { repeat_count = r->second; seq = r->first; } } //cout << pos + 1 << " " << sequence.substr(pos + 1, seq.size() * repeat_count) << " ?= " << seq * repeat_count << endl; // guard ensures that we are in a pure repeat situoation, tandem-tandem repeats are not handled presently if (repeats.size() > 0 && sequence.substr(pos + 1, seq.size() * repeat_count) == seq * repeat_count) { int microsatellite_length = repeat_count * seq.size(); // record end of microsatellite so we don't generate more mutations until we pass it microsatellite_end_pos = pos + microsatellite_length - 1; if (microsatellite_length > microsatellite_min_length //&& genrand_real1() / copies // < microsatellite_mutation_rate * repeat_count) { && genrand_real1() > pow(1 - (microsatellite_mutation_rate * repeat_count), log(copies) * 2)) { // establish 
the relative rate of ins and del events /* long double repeatMutationDelProbability = microsatelliteDelProb(repeat_count); long double repeatMutationInsProbability = microsatelliteInsProb(repeat_count); long double indel_balance = 1; if (repeatMutationInsProbability > repeatMutationDelProbability) { indel_balance = repeatMutationInsProbability / repeatMutationDelProbability; } else { indel_balance = 1 - (repeatMutationInsProbability / repeatMutationDelProbability); } */ double indel_balance = 0.5; // how many alleles at the site? //int numalleles = min((int) floor(zetarandom(microsatellite_afs_alpha)), (int) ((double) repeat_count * indel_balance)); int numalleles = random_allele_frequency(repeat_count, microsatellite_afs_alpha); //cout << "repeat_count: " << repeat_count << " numalleles: " << numalleles << endl; map<int, bool> allele_lengths; // lengths of the alleles while (allele_lengths.size() < numalleles) { int allele_length; // TODO adjust length so that shorter events are more likely... 
if (genrand_real1() > indel_balance) { allele_length = -1 * min((int) floor(zetarandom(microsatellite_len_alpha)), repeat_count); } else { allele_length = min((int) floor(zetarandom(microsatellite_len_alpha)), repeat_count); } //cout << allele_length << endl; map<int, bool>::iterator f = allele_lengths.find(allele_length); if (f == allele_lengths.end()) { allele_lengths[allele_length] = true; } } // generate alleles for (map<int, bool>::iterator f = allele_lengths.begin(); f != allele_lengths.end(); ++f) { int allele_length = f->first; int c = abs(f->first); string alt = seq; for (int i = 1; i < c; ++i) alt += seq; if (allele_length > 0) { alleles.push_back(Allele(ref, ref + alt, "MICROSAT")); } else { alleles.push_back(Allele(ref + alt, ref, "MICROSAT")); } //cout << pos + 1 << " " << microsatellite_length << " " << alleles.back() << endl; } //cout << "alleles.size() == " << alleles.size() << endl; } } } // snp case if (genrand_real1() > pow(1 - snp_mutation_rate, log(max(copies, 2)) * 2)) { // make an alternate allele /* string alt = ref; while (alt == ref) { alt = string(1, bases.at(genrand_int32() % 4)); } */ string alt = ref; if (genrand_real1() > 1 / (1 + tstv_ratio)) { if (ref == "A") { alt = "G"; } else if (ref == "G") { alt = "A"; } else if (ref == "C") { alt = "T"; } else if (ref == "T") { alt = "C"; } } else { while (alt == ref || isTransition(ref, alt)) { alt = string(1, bases.at(genrand_int32() % 4)); } } if (genrand_real1() < mnp_ratio) { int i = 1; do { ref += sequence.substr(pos + i, 1); alt += sequence.substr(pos + i, 1); ++i; while (alt.at(alt.size() - 1) == ref.at(ref.size() - 1)) { alt.at(alt.size() - 1) = bases.at(genrand_int32() % 4); } } while (genrand_real1() < mnp_ratio); len = alt.size(); } alleles.push_back(Allele(ref, alt)); } // indel case if (genrand_real1() > pow(1 - indel_mutation_rate, log(max(copies, 2)) * 2)) { // how many bp? 
if (uniform_indel_distribution) { len = (int) floor(genrand_real1() * indel_max); } else { len = (int) floor(zetarandom(indel_alpha)); } // guard against out-of-sequence indels if (pos + len < sequence.size() && len <= indel_max) { if (genrand_int32() % 2 == 0) { // deletion alleles.push_back(Allele(sequence.substr(pos, 1 + len), sequence.substr(pos, 1))); } else { string alt = ref; // insertion? // insert some random de novo bases while (alt.length() < len + 1) { alt += string(1, bases.at(genrand_int32() % 4)); } alleles.push_back(Allele(ref, alt)); } } else { // fall through } } // no mutation generated if (alleles.empty()) { for (int i = 0; i < copies; ++i) { if (!dry_run) { sequences.at(i)->write(ref); } } pos += ref.size(); } else { // TODO randomly distribute all the alleles throughout the population // generate allele frequencies for each // fun times... string genotype; vector<bool> alts; random_shuffle(alleles.begin(), alleles.end()); vector<Allele*> population_alleles; list<Allele> present_alleles; // filtered for AFS > 0 in the sample // AFS simulation int remaining_copies = copies; while (remaining_copies > 0 && !alleles.empty()) { Allele allele = alleles.back(); alleles.pop_back(); int allele_freq = random_allele_frequency(remaining_copies, afs_alpha); if (allele_freq > 0) { present_alleles.push_back(allele); Allele* allelePtr = &present_alleles.back(); for (int i = 0; i < allele_freq; ++i) { population_alleles.push_back(allelePtr); } remaining_copies -= allele_freq; } } if (present_alleles.empty()) { for (int i = 0; i < copies; ++i) { if (!dry_run) { sequences.at(i)->write(ref); } } pos += ref.size(); continue; } reverse(present_alleles.begin(), present_alleles.end()); // establish the correct reference sequence and alternate allele set for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a) { Allele& allele = *a; //cout << allele << endl; if (allele.ref.size() > ref.size()) { ref = allele.ref; } } // reference 
alleles take up the rest Allele reference_allele = Allele(ref, ref); for (int i = 0; i < remaining_copies; ++i) { population_alleles.push_back(&reference_allele); } vector<string> altstrs; // now the reference allele is the largest possible, adjust the alt allele strings to reflect this // if we have indels, add the base before, set the position back one for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a) { Allele& allele = *a; string alleleStr = ref; if (allele.ref.size() == allele.alt.size()) { alleleStr.replace(0, allele.alt.size(), allele.alt); } else { alleleStr.replace(0, allele.ref.size(), allele.alt); } allele.ref = ref; allele.alt = alleleStr; altstrs.push_back(alleleStr); } assert(population_alleles.size() == copies); // shuffle the alleles around the population random_shuffle(population_alleles.begin(), population_alleles.end()); vcf::Variant var(vcfFile); var.sequenceName = seqname; var.position = pos + 1; var.quality = 99; var.id = "."; var.filter = "."; var.info["NS"].push_back(convert(population_size)); var.info["NA"].push_back(convert(present_alleles.size())); var.format.push_back("GT"); var.ref = ref; var.alt = altstrs; // debugging, uncomment to see sequence context //cout << sequence.substr(pos - 10, 10) << "*" << ref << "*" << sequence.substr(pos + 1, 9) << endl; map<string, int> alleleIndexes; alleleIndexes[convert(reference_allele)] = 0; // XXX should we handle this differently, by adding the reference allele to present_alleles? 
int i = 1; for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a, ++i) { Allele& allele = *a; //cout << allele << " " << i << endl; alleleIndexes[convert(allele)] = i; //cout << allele << " " << i << endl; } //for (map<string, int>::iterator a = alleleIndexes.begin(); a != alleleIndexes.end(); ++a) { // cout << a->first << " = " << a->second << endl; //} int j = 0; for (vector<string>::iterator s = samples.begin(); s != samples.end(); ++s, ++j) { string& sample = *s; vector<string> genotype; // XXX hack, maybe this should get stored in another map for easier access? for (int i = 0; i < ploidy; ++i) { int l = (j * ploidy) + i; //cout << l << " " << population_alleles.at(l) << " " << alleleIndexes[convert(population_alleles.at(l))] << endl; genotype.push_back(convert(alleleIndexes[convert(*population_alleles.at(l))])); } var.samples[sample]["GT"].push_back(join(genotype, "|")); //cout << var.samples[sample]["GT"].front() << endl; } // XXX THIS IS BROKEN BECAUSE YOUR REFERENCE ALLELE CHANGES // LENGTH WITH DELETIONS. // // IT'S POSSIBLE TO GET COMPLEX ALLELES AT THE INTERSECTIONS // BETWEEN ONE ALLELIC VARIANT AND ANOTHER. THIS IS BROKEN! 
// // TO FIX--- BUILD HAPLOTYPES, THEN DISTRIBUTE THEM WITHIN THE POPULATION // // now write out our sequence data (FASTA files) for (int j = 0; j < population_size; ++j) { for (int i = 0; i < ploidy; ++i) { int l = (j * ploidy) + i; Allele* allele = population_alleles.at(l); if (!dry_run) { sequences.at(l)->write(allele->alt); } } } // tabulate allele frequency, and write some details to the VCF for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a) { Allele& allele = *a; Allele* allelePtr = &*a; vector<string> genotypes; genotypes.resize(population_size); int allele_freq = 0; // obtain allele frequencies and output FASTA sequence data // for each simulated sample for (int j = 0; j < population_size; ++j) { for (int i = 0; i < ploidy; ++i) { int l = (j * ploidy) + i; if (population_alleles.at(l) == allelePtr) { ++allele_freq; } } } // set up the allele-specific INFO fields in the VCF record var.info["AC"].push_back(convert(allele_freq)); int delta = allele.alt.size() - allele.ref.size(); if (delta == 0) { if (allele.ref.size() == 1) { var.info["TYPE"].push_back("snp"); var.info["LEN"].push_back(convert(allele.ref.size())); } else { var.info["TYPE"].push_back("mnp");; var.info["LEN"].push_back(convert(allele.ref.size())); } } else if (delta > 0) { var.info["TYPE"].push_back("ins");; var.info["LEN"].push_back(convert(abs(delta))); } else { var.info["TYPE"].push_back("del");; var.info["LEN"].push_back(convert(abs(delta))); } if (!allele.type.empty()) { var.infoFlags[allele.type] = true; } } // write the VCF record to stdout cout << var << endl; int largest_ref = 1; // enforce one pos for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a) { if (a->ref.size() > largest_ref) { largest_ref = a->ref.size(); } } pos += largest_ref; // step by the size of the last event } } } // close, clean up files for (map<string, vector<SampleFastaFile*> >::iterator s = sequencesByRefseq.begin(); s != 
sequencesByRefseq.end(); ++s) { vector<SampleFastaFile*>& files = s->second; for (vector<SampleFastaFile*>::iterator f = files.begin(); f != files.end(); ++f) { delete *f; } files.clear(); } return 0; }