// aligns the query sequence to the reference using the Smith Waterman Gotoh algorithm void CSmithWatermanGotoh::Align(unsigned int& referenceAl, string& cigarAl, const string& s1, const string& s2) { if((s1.length() == 0) || (s2.length() == 0)) { cout << "ERROR: Found a read with a zero length." << endl; exit(1); } unsigned int referenceLen = s1.length() + 1; unsigned int queryLen = s2.length() + 1; unsigned int sequenceSumLength = s1.length() + s2.length(); // reinitialize our matrices if((referenceLen * queryLen) > mCurrentMatrixSize) { // calculate the new matrix size mCurrentMatrixSize = referenceLen * queryLen; // delete the old arrays if(mPointers) delete [] mPointers; if(mSizesOfVerticalGaps) delete [] mSizesOfVerticalGaps; if(mSizesOfHorizontalGaps) delete [] mSizesOfHorizontalGaps; try { // initialize the arrays mPointers = new char[mCurrentMatrixSize]; mSizesOfVerticalGaps = new short[mCurrentMatrixSize]; mSizesOfHorizontalGaps = new short[mCurrentMatrixSize]; } catch(bad_alloc) { cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl; exit(1); } } // initialize the traceback matrix to STOP memset((char*)mPointers, 0, SIZEOF_CHAR * queryLen); for(unsigned int i = 1; i < referenceLen; i++) mPointers[i * queryLen] = 0; // initialize the gap matrices to 1 uninitialized_fill(mSizesOfVerticalGaps, mSizesOfVerticalGaps + mCurrentMatrixSize, 1); uninitialized_fill(mSizesOfHorizontalGaps, mSizesOfHorizontalGaps + mCurrentMatrixSize, 1); // initialize our repeat counts if they are needed vector<map<string, int> > referenceRepeats; vector<map<string, int> > queryRepeats; if (mUseRepeatGapExtensionPenalty) { for (unsigned int i = 0; i < queryLen; ++i) queryRepeats.push_back(repeatCounts(i, s2, repeat_size_max)); for (unsigned int i = 0; i < referenceLen; ++i) referenceRepeats.push_back(repeatCounts(i, s1, repeat_size_max)); // keep only the biggest repeat vector<map<string, int> >::iterator q = queryRepeats.begin(); for (; q != queryRepeats.end(); ++q) { map<string, int>::iterator biggest = q->begin(); map<string, int>::iterator z = q->begin(); for (; z != q->end(); ++z) if (z->first.size() > biggest->first.size()) biggest = z; z = q->begin(); while (z != q->end()) { if (z != biggest) q->erase(z++); else ++z; } } q = referenceRepeats.begin(); for (; q != referenceRepeats.end(); ++q) { map<string, int>::iterator biggest = q->begin(); map<string, int>::iterator z = q->begin(); for (; z != q->end(); ++z) if (z->first.size() > biggest->first.size()) biggest = z; z = q->begin(); while (z != q->end()) { if (z != biggest) q->erase(z++); else ++z; } } // remove repeat information from ends of queries // this results in the addition of spurious flanking deletions in repeats map<string, int>& qrend = queryRepeats.at(queryRepeats.size() - 2); if (!qrend.empty()) { int queryEndRepeatBases = qrend.begin()->first.size() * qrend.begin()->second; for (int i = 0; i < queryEndRepeatBases; ++i) queryRepeats.at(queryRepeats.size() - 2 - i).clear(); } map<string, int>& qrbegin = queryRepeats.front(); if (!qrbegin.empty()) { int queryBeginRepeatBases = qrbegin.begin()->first.size() * qrbegin.begin()->second; for (int i = 0; i < queryBeginRepeatBases; ++i) queryRepeats.at(i).clear(); } } int entropyWindowSize = 8; vector<float> referenceEntropies; vector<float> queryEntropies; if (mUseEntropyGapOpenPenalty) { for (unsigned int i = 0; i < queryLen; ++i) queryEntropies.push_back( shannon_H((char*) &s2[max(0, min((int) i - entropyWindowSize / 2, (int) queryLen - entropyWindowSize - 1))], entropyWindowSize)); for (unsigned int i = 0; i < referenceLen; ++i) referenceEntropies.push_back( shannon_H((char*) &s1[max(0, min((int) i - entropyWindowSize / 2, (int) referenceLen - entropyWindowSize - 1))], entropyWindowSize)); } // normalize entropies /* float qsum = 0; float qnorm = 0; float qmax = 0; for (vector<float>::iterator q = queryEntropies.begin(); q != queryEntropies.end(); ++q) { qsum += *q; if (*q > qmax) qmax = *q; } qnorm = qsum / queryEntropies.size(); for (vector<float>::iterator q = queryEntropies.begin(); q != queryEntropies.end(); ++q) *q = *q / qsum + qmax; float rsum = 0; float rnorm = 0; float rmax = 0; for (vector<float>::iterator r = referenceEntropies.begin(); r != referenceEntropies.end(); ++r) { rsum += *r; if (*r > rmax) rmax = *r; } rnorm = rsum / referenceEntropies.size(); for (vector<float>::iterator r = referenceEntropies.begin(); r != referenceEntropies.end(); ++r) *r = *r / rsum + rmax; */ // // construct // // reinitialize our query-dependent arrays if(s2.length() > mCurrentQuerySize) { // calculate the new query array size mCurrentQuerySize = s2.length(); // delete the old arrays if(mQueryGapScores) delete [] mQueryGapScores; if(mBestScores) delete [] mBestScores; // initialize the arrays try { mQueryGapScores = new float[mCurrentQuerySize + 1]; mBestScores = new float[mCurrentQuerySize + 1]; } catch(bad_alloc) { cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl; exit(1); } } // reinitialize our reference+query-dependent arrays if(sequenceSumLength > mCurrentAQSumSize) { // calculate the new reference array size mCurrentAQSumSize = sequenceSumLength; // delete the old arrays if(mReversedAnchor) delete [] mReversedAnchor; if(mReversedQuery) delete [] mReversedQuery; // initialize the arrays try { mReversedAnchor = new char[mCurrentAQSumSize + 1]; // reversed sequence #1 mReversedQuery = new char[mCurrentAQSumSize + 1]; // reversed sequence #2 } catch(bad_alloc) { cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl; exit(1); } } // initialize the gap score and score vectors uninitialized_fill(mQueryGapScores, mQueryGapScores + queryLen, FLOAT_NEGATIVE_INFINITY); memset((char*)mBestScores, 0, SIZEOF_FLOAT * queryLen); float similarityScore, totalSimilarityScore, bestScoreDiagonal; float queryGapExtendScore, queryGapOpenScore; float referenceGapExtendScore, referenceGapOpenScore, currentAnchorGapScore; unsigned int BestColumn = 0; unsigned int BestRow = 0; BestScore = FLOAT_NEGATIVE_INFINITY; for(unsigned int i = 1, k = queryLen; i < referenceLen; i++, k += queryLen) { currentAnchorGapScore = FLOAT_NEGATIVE_INFINITY; bestScoreDiagonal = mBestScores[0]; for(unsigned int j = 1, l = k + 1; j < queryLen; j++, l++) { // calculate our similarity score similarityScore = mScoringMatrix[s1[i - 1] - 'A'][s2[j - 1] - 'A']; // fill the matrices totalSimilarityScore = bestScoreDiagonal + similarityScore; //cerr << "i: " << i << ", j: " << j << ", totalSimilarityScore: " << totalSimilarityScore << endl; queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty; queryGapOpenScore = mBestScores[j] - mGapOpenPenalty; // compute the h**o-polymer gap score if enabled if(mUseHomoPolymerGapOpenPenalty) if((j > 1) && (s2[j - 1] == s2[j - 2])) queryGapOpenScore = mBestScores[j] - mHomoPolymerGapOpenPenalty; // compute the entropy gap score if enabled if (mUseEntropyGapOpenPenalty) { queryGapOpenScore = mBestScores[j] - mGapOpenPenalty * max(queryEntropies.at(j), referenceEntropies.at(i)) * mEntropyGapOpenPenalty; } int gaplen = mSizesOfVerticalGaps[l - queryLen] + 1; if (mUseRepeatGapExtensionPenalty) { map<string, int>& repeats = queryRepeats[j]; // does the sequence which would be inserted or deleted in this gap match the repeat structure which it is embedded in? if (!repeats.empty()) { const pair<string, int>& repeat = *repeats.begin(); int repeatsize = repeat.first.size(); if (gaplen != repeatsize && gaplen % repeatsize != 0) { gaplen = gaplen / repeatsize + repeatsize; } if ((repeat.first.size() * repeat.second) > 3 && gaplen + i < s1.length()) { string gapseq = string(&s1[i], gaplen); if (gapseq == repeat.first || isRepeatUnit(gapseq, repeat.first)) { queryGapExtendScore = mQueryGapScores[j] + mRepeatGapExtensionPenalty / (float) gaplen; // mMaxRepeatGapExtensionPenalty) } else { queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty; } } } else { queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty; } } if(queryGapExtendScore > queryGapOpenScore) { mQueryGapScores[j] = queryGapExtendScore; mSizesOfVerticalGaps[l] = gaplen; } else mQueryGapScores[j] = queryGapOpenScore; referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty; referenceGapOpenScore = mBestScores[j - 1] - mGapOpenPenalty; // compute the h**o-polymer gap score if enabled if(mUseHomoPolymerGapOpenPenalty) if((i > 1) && (s1[i - 1] == s1[i - 2])) referenceGapOpenScore = mBestScores[j - 1] - mHomoPolymerGapOpenPenalty; // compute the entropy gap score if enabled if (mUseEntropyGapOpenPenalty) { referenceGapOpenScore = mBestScores[j - 1] - mGapOpenPenalty * max(queryEntropies.at(j), referenceEntropies.at(i)) * mEntropyGapOpenPenalty; } gaplen = mSizesOfHorizontalGaps[l - 1] + 1; if (mUseRepeatGapExtensionPenalty) { map<string, int>& repeats = referenceRepeats[i]; // does the sequence which would be inserted or deleted in this gap match the repeat structure which it is embedded in? if (!repeats.empty()) { const pair<string, int>& repeat = *repeats.begin(); int repeatsize = repeat.first.size(); if (gaplen != repeatsize && gaplen % repeatsize != 0) { gaplen = gaplen / repeatsize + repeatsize; } if ((repeat.first.size() * repeat.second) > 3 && gaplen + j < s2.length()) { string gapseq = string(&s2[j], gaplen); if (gapseq == repeat.first || isRepeatUnit(gapseq, repeat.first)) { referenceGapExtendScore = currentAnchorGapScore + mRepeatGapExtensionPenalty / (float) gaplen; //mMaxRepeatGapExtensionPenalty) } else { referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty; } } } else { referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty; } } if(referenceGapExtendScore > referenceGapOpenScore) { currentAnchorGapScore = referenceGapExtendScore; mSizesOfHorizontalGaps[l] = gaplen; } else currentAnchorGapScore = referenceGapOpenScore; bestScoreDiagonal = mBestScores[j]; mBestScores[j] = MaxFloats(totalSimilarityScore, mQueryGapScores[j], currentAnchorGapScore); // determine the traceback direction // diagonal (445364713) > stop (238960195) > up (214378647) > left (166504495) if(mBestScores[j] == 0) mPointers[l] = Directions_STOP; else if(mBestScores[j] == totalSimilarityScore) mPointers[l] = Directions_DIAGONAL; else if(mBestScores[j] == mQueryGapScores[j]) mPointers[l] = Directions_UP; else mPointers[l] = Directions_LEFT; // set the traceback start at the current cell i, j and score if(mBestScores[j] > BestScore) { BestRow = i; BestColumn = j; BestScore = mBestScores[j]; } } } // // traceback // // aligned sequences int gappedAnchorLen = 0; // length of sequence #1 after alignment int gappedQueryLen = 0; // length of sequence #2 after alignment int numMismatches = 0; // the mismatched nucleotide count char c1, c2; int ci = BestRow; int cj = BestColumn; int ck = ci * queryLen; // traceback flag bool keepProcessing = true; while(keepProcessing) { //cerr << ci << " " << cj << " " << ck << " ... " << gappedAnchorLen << " " << gappedQueryLen << endl; // diagonal (445364713) > stop (238960195) > up (214378647) > left (166504495) switch(mPointers[ck + cj]) { case Directions_DIAGONAL: c1 = s1[--ci]; c2 = s2[--cj]; ck -= queryLen; mReversedAnchor[gappedAnchorLen++] = c1; mReversedQuery[gappedQueryLen++] = c2; // increment our mismatch counter if(mScoringMatrix[c1 - 'A'][c2 - 'A'] == mMismatchScore) numMismatches++; break; case Directions_STOP: keepProcessing = false; break; case Directions_UP: for(unsigned int l = 0, len = mSizesOfVerticalGaps[ck + cj]; l < len; l++) { if (ci <= 0) { keepProcessing = false; break; } mReversedAnchor[gappedAnchorLen++] = s1[--ci]; mReversedQuery[gappedQueryLen++] = GAP; ck -= queryLen; numMismatches++; } break; case Directions_LEFT: for(unsigned int l = 0, len = mSizesOfHorizontalGaps[ck + cj]; l < len; l++) { if (cj <= 0) { keepProcessing = false; break; } mReversedAnchor[gappedAnchorLen++] = GAP; mReversedQuery[gappedQueryLen++] = s2[--cj]; numMismatches++; } break; } } // define the reference and query sequences mReversedAnchor[gappedAnchorLen] = 0; mReversedQuery[gappedQueryLen] = 0; // catch sequences with different lengths if(gappedAnchorLen != gappedQueryLen) { cout << "ERROR: The aligned sequences have different lengths after Smith-Waterman-Gotoh algorithm." << endl; exit(1); } // reverse the strings and assign them to our alignment structure reverse(mReversedAnchor, mReversedAnchor + gappedAnchorLen); reverse(mReversedQuery, mReversedQuery + gappedQueryLen); //alignment.Reference = mReversedAnchor; //alignment.Query = mReversedQuery; // set the reference endpoints //alignment.ReferenceBegin = ci; //alignment.ReferenceEnd = BestRow - 1; referenceAl = ci; // set the query endpoints /* if(alignment.IsReverseComplement) { alignment.QueryBegin = s2Length - BestColumn; alignment.QueryEnd = s2Length - cj - 1; // alignment.QueryLength= alignment.QueryBegin - alignment.QueryEnd + 1; } else { alignment.QueryBegin = cj; alignment.QueryEnd = BestColumn - 1; // alignment.QueryLength= alignment.QueryEnd - alignment.QueryBegin + 1; } */ // set the query length and number of mismatches //alignment.QueryLength = alignment.QueryEnd - alignment.QueryBegin + 1; //alignment.NumMismatches = numMismatches; unsigned int alLength = strlen(mReversedAnchor); unsigned int m = 0, d = 0, i = 0; bool dashRegion = false; ostringstream oCigar (ostringstream::out); int insertedBases = 0; if ( cj != 0 ) { if ( cj > 0 ) { oCigar << cj << 'S'; } else { // how do we get negative cj's? referenceAl -= cj; alLength += cj; } } for ( unsigned int j = 0; j < alLength; j++ ) { // m if ( ( mReversedAnchor[j] != GAP ) && ( mReversedQuery[j] != GAP ) ) { if ( dashRegion ) { if ( d != 0 ) oCigar << d << 'D'; else { oCigar << i << 'I'; insertedBases += i; } } dashRegion = false; m++; d = 0; i = 0; } else { if ( !dashRegion && m ) oCigar << m << 'M'; dashRegion = true; m = 0; if ( mReversedAnchor[j] == GAP ) { if ( d != 0 ) oCigar << d << 'D'; i++; d = 0; } else { if ( i != 0) { oCigar << i << 'I'; insertedBases += i; } d++; i = 0; } } } if ( m != 0 ) oCigar << m << 'M'; else if ( d != 0 ) oCigar << d << 'D'; else if ( i != 0 ) oCigar << i << 'I'; if ( BestColumn != s2.length() ) oCigar << s2.length() - BestColumn << 'S'; cigarAl = oCigar.str(); // fix the gap order CorrectHomopolymerGapOrder(alLength, numMismatches); if (mUseEntropyGapOpenPenalty || mUseRepeatGapExtensionPenalty) { int offset = 0; string oldCigar; try { oldCigar = cigarAl; stablyLeftAlign(s2, cigarAl, s1.substr(referenceAl, alLength - insertedBases), offset); } catch (...) { cerr << "an exception occurred when left-aligning " << s1 << " " << s2 << endl; cigarAl = oldCigar; // undo the failed left-realignment attempt offset = 0; } referenceAl += offset; } }
int main (int argc, char** argv) { double snp_mutation_rate = 0.001; double indel_mutation_rate = 0.0001; double het_rate = 0.5; double afs_alpha = 1; double indel_alpha = 3; double microsatellite_afs_alpha = 1; double microsatellite_len_alpha = 1.7; double microsatellite_mutation_rate = 0.0001; double mnp_ratio = 0.01; double tstv_ratio = 2.5; double deamination_ratio = 1.8; int microsatellite_min_length = 1; int indel_max = 1000; int ploidy = 1; int population_size = 1; int sample_id_max_digits = 1; int seed = time(NULL); string fastaFileName; string file_prefix = ""; string sample_prefix = ""; bool dry_run = false; int repeat_size_max = 20; bool uniform_indel_distribution = false; double p, lambda, shape, mu, sigma; string command_line = argv[0]; for (int i = 1; i < argc; ++i) { command_line += " "; command_line += argv[i]; } int c; while (true) { static struct option long_options[] = { /* These options set a flag. */ //{"verbose", no_argument, &verbose_flag, 1}, //{"brief", no_argument, &verbose_flag, 0}, {"help", no_argument, 0, 'h'}, {"snp-rate", required_argument, 0, 's'}, {"mnp-ratio", required_argument, 0, 'M'}, {"indel-rate", required_argument, 0, 'i'}, {"indel-alpha", required_argument, 0, 'z'}, {"indel-max", required_argument, 0, 'X'}, {"repeat-size-max", required_argument, 0, 'q'}, {"microsat-rate", required_argument, 0, 'm'}, {"microsat-afs-alpha", required_argument, 0, 't'}, {"microsat-len-alpha", required_argument, 0, 'j'}, {"microsat-min-len", required_argument, 0, 'l'}, {"afs-alpha", required_argument, 0, 'a'}, {"ploidy", required_argument, 0, 'p'}, {"population-size", required_argument, 0, 'n'}, {"file-prefix", required_argument, 0, 'P'}, {"sample-prefix", required_argument, 0, 'S'}, {"random-seed", required_argument, 0, 'g'}, {"dry-run", no_argument, 0, 'd'}, {"uniform-indels", no_argument, 0, 'U'}, {"ts-tv-ratio", required_argument, 0, 'T'}, {"deamination-ratio", required_argument, 0, 'D'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; c = getopt_long (argc, argv, "hdUa:z:s:i:q:p:n:M:X:t:m:P:S:g:l:j:T:", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) break; switch (c) { case 0: /* If this option set a flag, do nothing else now. */ if (long_options[option_index].flag != 0) break; printf ("option %s", long_options[option_index].name); if (optarg) printf (" with arg %s", optarg); printf ("\n"); break; case 'd': dry_run = true; break; case 'U': uniform_indel_distribution = true; break; case 'q': if (!convert(optarg, repeat_size_max)) { cerr << "could not read -q, --repeat-size-max" << endl; exit(1); } break; case 's': if (!convert(optarg, snp_mutation_rate)) { cerr << "could not read -s, --snp-rate" << endl; exit(1); } break; case 'i': if (!convert(optarg, indel_mutation_rate)) { cerr << "could not read -i, --indel-rate" << endl; exit(1); } break; case 'a': if (!convert(optarg, afs_alpha)) { cerr << "could not read -a, --afs-alpha" << endl; exit(1); } break; case 'z': if (!convert(optarg, indel_alpha)) { cerr << "could not read -z, --indel-alpha" << endl; exit(1); } break; case 'X': if (!convert(optarg, indel_max)) { cerr << "could not read -M, --indel-max" << endl; exit(1); } break; case 'M': if (!convert(optarg, mnp_ratio)) { cerr << "could not read -m, --mnp-ratio" << endl; exit(1); } break; case 'm': if (!convert(optarg, microsatellite_mutation_rate)) { cerr << "could not read -m, --microsat-rate" << endl; exit(1); } break; case 'T': if (!convert(optarg, tstv_ratio)) { cerr << "could not read -T, --ts-tv-ratio" << endl; exit(1); } break; case 't': if (!convert(optarg, microsatellite_afs_alpha)) { cerr << "could not read -m, --microsatellite-afs-alpha" << endl; exit(1); } break; case 'j': if (!convert(optarg, microsatellite_len_alpha)) { cerr << "could not read -m, --microsatellite-len-alpha" << endl; exit(1); } break; case 'l': if (!convert(optarg, microsatellite_min_length)) { cerr << "could not read -l, --microsat-min-len" << endl; exit(1); } break; case 'p': if (!convert(optarg, ploidy)) { cerr << "could not read -p, --ploidy" << endl; exit(1); } break; case 'P': file_prefix = optarg; break; case 'S': sample_prefix = optarg; break; case 'n': if (!convert(optarg, population_size)) { cerr << "could not read -n, --population-size" << endl; exit(1); } sample_id_max_digits = strlen(optarg); break; case 'g': if (!convert(optarg, seed)) { cerr << "could not read -g, --random-seed" << endl; exit(1); } break; case 'h': printSummary(); exit(0); break; case '?': /* getopt_long already printed an error message. */ printSummary(); exit(1); break; default: abort (); } } /* Print any remaining command line arguments (not options). */ if (optind < argc) { //cerr << "fasta file: " << argv[optind] << endl; fastaFileName = argv[optind]; } else { cerr << "please specify a fasta file" << endl; printSummary(); exit(1); } init_genrand(seed); // seed mt with current time //mt19937 eng(seed); int bpPerHaplotypeMean = 1000; double bpPerHaplotypeSigma = 200; normal_distribution<double> normal(mu, sigma); //lambda = 7.0; //poisson_distribution<int> poisson(lambda); //poisson(eng); string seqname; string sequence; // holds sequence so we can process it FastaReference fr; fr.open(fastaFileName); string bases = "ATGC"; vcf::VariantCallFile vcfFile; // write the VCF header stringstream headerss; headerss << "##fileformat=VCFv4.1" << endl << "##fileDate=" << dateStr() << endl << "##source=mutatrix population genome simulator" << endl << "##seed=" << seed << endl << "##reference=" << fastaFileName << endl << "##phasing=true" << endl << "##commandline=" << command_line << endl << "##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Alternate allele count\">" << endl << "##INFO=<ID=TYPE,Number=A,Type=String,Description=\"Type of each allele (snp, ins, del, mnp, complex)\">" << endl << "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples at the site\">" << endl << "##INFO=<ID=NA,Number=1,Type=Integer,Description=\"Number of alternate alleles\">" << endl << "##INFO=<ID=LEN,Number=A,Type=Integer,Description=\"Length of each alternate allele\">" << endl << "##INFO=<ID=MICROSAT,Number=0,Type=Flag,Description=\"Generated at a sequence repeat loci\">" << endl << "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">" << endl << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"; vector<string> samples; for (int i = 0; i < population_size; ++i) { stringstream sampless; sampless << sample_prefix << setfill('0') << setw(sample_id_max_digits) << i + 1; // one-based sample names samples.push_back(sampless.str()); headerss << "\t" << sampless.str(); } // and set up our VCF output file string header = headerss.str(); vcfFile.openForOutput(header); cout << vcfFile.header << endl; int copies = ploidy * population_size; map<string, vector<SampleFastaFile*> > sequencesByRefseq; if (!dry_run) { for (FastaIndex::iterator s = fr.index->begin(); s != fr.index->end(); ++s) { FastaIndexEntry& indexEntry = s->second; seqname = indexEntry.name; vector<SampleFastaFile*>& sequences = sequencesByRefseq[seqname]; for (int i = 0; i < population_size; ++i) { stringstream sname; sname << sample_prefix << setfill('0') << setw(sample_id_max_digits) << i + 1; string samplename = sname.str(); for (int j = 0; j < ploidy; ++j) { stringstream cname; cname << j; string chromname = cname.str(); string fullname = samplename + ":" + seqname + ":" + chromname; string filename = file_prefix + fullname + ".fa"; //sequences.push_back(SampleFastaFile(filename, seqname)); sequences.push_back(new SampleFastaFile(filename, seqname)); } } } } for (FastaIndex::iterator s = fr.index->begin(); s != fr.index->end(); ++s) { FastaIndexEntry& indexEntry = s->second; seqname = indexEntry.name; sequence = fr.getSequence(s->first); vector<SampleFastaFile*>& sequences = sequencesByRefseq[seqname]; //sequences.resize(copies); long int pos = 0; long int microsatellite_end_pos = 0; while (pos < sequence.size()) { //cout << pos + 1 << " microsat end pos " << microsatellite_end_pos << endl; string ref = sequence.substr(pos, 1); // by default, ref is just the current base // skip non-DNA sequence information if (!(ref == "A" || ref == "T" || ref == "C" || ref == "G")) { pos += ref.size(); for (vector<SampleFastaFile*>::iterator s = sequences.begin(); s != sequences.end(); ++s) { (*s)->write(ref); } continue; } vector<Allele> alleles; // establish if we are in a repeat // and what motif is being repeated, how many times int len = 1; // get reference repeats // if we have a repeat, adjust the mutation rate // using length and direction-dependent // formula from "Likelihood-Based Estimation of Microsatellite Mutation Rates" // http://www.genetics.org/cgi/content/full/164/2/781#T1 if (pos > microsatellite_end_pos) { map<string, int> repeats = repeatCounts(pos + 1, (const string&) sequence, repeat_size_max); string seq; int repeat_count = 0; // get the "biggest" repeat, the most likely ms allele at this site for (map<string, int>::iterator r = repeats.begin(); r != repeats.end(); ++r) { if (repeat_count < r->second) { repeat_count = r->second; seq = r->first; } } //cout << pos + 1 << " " << sequence.substr(pos + 1, seq.size() * repeat_count) << " ?= " << seq * repeat_count << endl; // guard ensures that we are in a pure repeat situoation, tandem-tandem repeats are not handled presently if (repeats.size() > 0 && sequence.substr(pos + 1, seq.size() * repeat_count) == seq * repeat_count) { int microsatellite_length = repeat_count * seq.size(); // record end of microsatellite so we don't generate more mutations until we pass it microsatellite_end_pos = pos + microsatellite_length - 1; if (microsatellite_length > microsatellite_min_length //&& genrand_real1() / copies // < microsatellite_mutation_rate * repeat_count) { && genrand_real1() > pow(1 - (microsatellite_mutation_rate * repeat_count), log(copies) * 2)) { // establish the relative rate of ins and del events /* long double repeatMutationDelProbability = microsatelliteDelProb(repeat_count); long double repeatMutationInsProbability = microsatelliteInsProb(repeat_count); long double indel_balance = 1; if (repeatMutationInsProbability > repeatMutationDelProbability) { indel_balance = repeatMutationInsProbability / repeatMutationDelProbability; } else { indel_balance = 1 - (repeatMutationInsProbability / repeatMutationDelProbability); } */ double indel_balance = 0.5; // how many alleles at the site? //int numalleles = min((int) floor(zetarandom(microsatellite_afs_alpha)), (int) ((double) repeat_count * indel_balance)); int numalleles = random_allele_frequency(repeat_count, microsatellite_afs_alpha); //cout << "repeat_count: " << repeat_count << " numalleles: " << numalleles << endl; map<int, bool> allele_lengths; // lengths of the alleles while (allele_lengths.size() < numalleles) { int allele_length; // TODO adjust length so that shorter events are more likely... if (genrand_real1() > indel_balance) { allele_length = -1 * min((int) floor(zetarandom(microsatellite_len_alpha)), repeat_count); } else { allele_length = min((int) floor(zetarandom(microsatellite_len_alpha)), repeat_count); } //cout << allele_length << endl; map<int, bool>::iterator f = allele_lengths.find(allele_length); if (f == allele_lengths.end()) { allele_lengths[allele_length] = true; } } // generate alleles for (map<int, bool>::iterator f = allele_lengths.begin(); f != allele_lengths.end(); ++f) { int allele_length = f->first; int c = abs(f->first); string alt = seq; for (int i = 1; i < c; ++i) alt += seq; if (allele_length > 0) { alleles.push_back(Allele(ref, ref + alt, "MICROSAT")); } else { alleles.push_back(Allele(ref + alt, ref, "MICROSAT")); } //cout << pos + 1 << " " << microsatellite_length << " " << alleles.back() << endl; } //cout << "alleles.size() == " << alleles.size() << endl; } } } // snp case if (genrand_real1() > pow(1 - snp_mutation_rate, log(max(copies, 2)) * 2)) { // make an alternate allele /* string alt = ref; while (alt == ref) { alt = string(1, bases.at(genrand_int32() % 4)); } */ string alt = ref; if (genrand_real1() > 1 / (1 + tstv_ratio)) { if (ref == "A") { alt = "G"; } else if (ref == "G") { alt = "A"; } else if (ref == "C") { alt = "T"; } else if (ref == "T") { alt = "C"; } } else { while (alt == ref || isTransition(ref, alt)) { alt = string(1, bases.at(genrand_int32() % 4)); } } if (genrand_real1() < mnp_ratio) { int i = 1; do { ref += sequence.substr(pos + i, 1); alt += sequence.substr(pos + i, 1); ++i; while (alt.at(alt.size() - 1) == ref.at(ref.size() - 1)) { alt.at(alt.size() - 1) = bases.at(genrand_int32() % 4); } } while (genrand_real1() < mnp_ratio); len = alt.size(); } alleles.push_back(Allele(ref, alt)); } // indel case if (genrand_real1() > pow(1 - indel_mutation_rate, log(max(copies, 2)) * 2)) { // how many bp? if (uniform_indel_distribution) { len = (int) floor(genrand_real1() * indel_max); } else { len = (int) floor(zetarandom(indel_alpha)); } // guard against out-of-sequence indels if (pos + len < sequence.size() && len <= indel_max) { if (genrand_int32() % 2 == 0) { // deletion alleles.push_back(Allele(sequence.substr(pos, 1 + len), sequence.substr(pos, 1))); } else { string alt = ref; // insertion? // insert some random de novo bases while (alt.length() < len + 1) { alt += string(1, bases.at(genrand_int32() % 4)); } alleles.push_back(Allele(ref, alt)); } } else { // fall through } } // no mutation generated if (alleles.empty()) { for (int i = 0; i < copies; ++i) { if (!dry_run) { sequences.at(i)->write(ref); } } pos += ref.size(); } else { // TODO randomly distribute all the alleles throughout the population // generate allele frequencies for each // fun times... string genotype; vector<bool> alts; random_shuffle(alleles.begin(), alleles.end()); vector<Allele*> population_alleles; list<Allele> present_alleles; // filtered for AFS > 0 in the sample // AFS simulation int remaining_copies = copies; while (remaining_copies > 0 && !alleles.empty()) { Allele allele = alleles.back(); alleles.pop_back(); int allele_freq = random_allele_frequency(remaining_copies, afs_alpha); if (allele_freq > 0) { present_alleles.push_back(allele); Allele* allelePtr = &present_alleles.back(); for (int i = 0; i < allele_freq; ++i) { population_alleles.push_back(allelePtr); } remaining_copies -= allele_freq; } } if (present_alleles.empty()) { for (int i = 0; i < copies; ++i) { if (!dry_run) { sequences.at(i)->write(ref); } } pos += ref.size(); continue; } reverse(present_alleles.begin(), present_alleles.end()); // establish the correct reference sequence and alternate allele set for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a) { Allele& allele = *a; //cout << allele << endl; if (allele.ref.size() > ref.size()) { ref = allele.ref; } } // reference alleles take up the rest Allele reference_allele = Allele(ref, ref); for (int i = 0; i < remaining_copies; ++i) { population_alleles.push_back(&reference_allele); } vector<string> altstrs; // now the reference allele is the largest possible, adjust the alt allele strings to reflect this // if we have indels, add the base before, set the position back one for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a) { Allele& allele = *a; string alleleStr = ref; if (allele.ref.size() == allele.alt.size()) { alleleStr.replace(0, allele.alt.size(), allele.alt); } else { alleleStr.replace(0, allele.ref.size(), allele.alt); } allele.ref = ref; allele.alt = alleleStr; altstrs.push_back(alleleStr); } assert(population_alleles.size() == copies); // shuffle the alleles around the population random_shuffle(population_alleles.begin(), population_alleles.end()); vcf::Variant var(vcfFile); var.sequenceName = seqname; var.position = pos + 1; var.quality = 99; var.id = "."; var.filter = "."; var.info["NS"].push_back(convert(population_size)); var.info["NA"].push_back(convert(present_alleles.size())); var.format.push_back("GT"); var.ref = ref; var.alt = altstrs; // debugging, uncomment to see sequence context //cout << sequence.substr(pos - 10, 10) << "*" << ref << "*" << sequence.substr(pos + 1, 9) << endl; map<string, int> alleleIndexes; alleleIndexes[convert(reference_allele)] = 0; // XXX should we handle this differently, by adding the reference allele to present_alleles? int i = 1; for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a, ++i) { Allele& allele = *a; //cout << allele << " " << i << endl; alleleIndexes[convert(allele)] = i; //cout << allele << " " << i << endl; } //for (map<string, int>::iterator a = alleleIndexes.begin(); a != alleleIndexes.end(); ++a) { // cout << a->first << " = " << a->second << endl; //} int j = 0; for (vector<string>::iterator s = samples.begin(); s != samples.end(); ++s, ++j) { string& sample = *s; vector<string> genotype; // XXX hack, maybe this should get stored in another map for easier access? for (int i = 0; i < ploidy; ++i) { int l = (j * ploidy) + i; //cout << l << " " << population_alleles.at(l) << " " << alleleIndexes[convert(population_alleles.at(l))] << endl; genotype.push_back(convert(alleleIndexes[convert(*population_alleles.at(l))])); } var.samples[sample]["GT"].push_back(join(genotype, "|")); //cout << var.samples[sample]["GT"].front() << endl; } // XXX THIS IS BROKEN BECAUSE YOUR REFERENCE ALLELE CHANGES // LENGTH WITH DELETIONS. // // IT'S POSSIBLE TO GET COMPLEX ALLELES AT THE INTERSECTIONS // BETWEEN ONE ALLELIC VARIANT AND ANOTHER. THIS IS BROKEN! // // TO FIX--- BUILD HAPLOTYPES, THEN DISTRIBUTE THEM WITHIN THE POPULATION // // now write out our sequence data (FASTA files) for (int j = 0; j < population_size; ++j) { for (int i = 0; i < ploidy; ++i) { int l = (j * ploidy) + i; Allele* allele = population_alleles.at(l); if (!dry_run) { sequences.at(l)->write(allele->alt); } } } // tabulate allele frequency, and write some details to the VCF for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a) { Allele& allele = *a; Allele* allelePtr = &*a; vector<string> genotypes; genotypes.resize(population_size); int allele_freq = 0; // obtain allele frequencies and output FASTA sequence data // for each simulated sample for (int j = 0; j < population_size; ++j) { for (int i = 0; i < ploidy; ++i) { int l = (j * ploidy) + i; if (population_alleles.at(l) == allelePtr) { ++allele_freq; } } } // set up the allele-specific INFO fields in the VCF record var.info["AC"].push_back(convert(allele_freq)); int delta = allele.alt.size() - allele.ref.size(); if (delta == 0) { if (allele.ref.size() == 1) { var.info["TYPE"].push_back("snp"); var.info["LEN"].push_back(convert(allele.ref.size())); } else { var.info["TYPE"].push_back("mnp");; var.info["LEN"].push_back(convert(allele.ref.size())); } } else if (delta > 0) { var.info["TYPE"].push_back("ins");; var.info["LEN"].push_back(convert(abs(delta))); } else { var.info["TYPE"].push_back("del");; var.info["LEN"].push_back(convert(abs(delta))); } if (!allele.type.empty()) { var.infoFlags[allele.type] = true; } } // write the VCF record to stdout cout << var << endl; int largest_ref = 1; // enforce one pos for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a) { if (a->ref.size() > largest_ref) { largest_ref = a->ref.size(); } } pos += largest_ref; // step by the size of the last event } } } // close, clean up files for (map<string, vector<SampleFastaFile*> >::iterator s = sequencesByRefseq.begin(); s != sequencesByRefseq.end(); ++s) { vector<SampleFastaFile*>& files = s->second; for (vector<SampleFastaFile*>::iterator f = files.begin(); f != files.end(); ++f) { delete *f; } files.clear(); } return 0; }