// aligns the query sequence to the reference using the Smith Waterman Gotoh algorithm
void CSmithWatermanGotoh::Align(unsigned int& referenceAl, string& cigarAl, const string& s1, const string& s2) {

  if((s1.length() == 0) || (s2.length() == 0)) {
    cout << "ERROR: Found a read with a zero length." << endl;
    exit(1);
  }

  unsigned int referenceLen      = s1.length() + 1;
  unsigned int queryLen          = s2.length() + 1;
  unsigned int sequenceSumLength = s1.length() + s2.length();

  // reinitialize our matrices

  if((referenceLen * queryLen) > mCurrentMatrixSize) {

    // calculate the new matrix size
    mCurrentMatrixSize = referenceLen * queryLen;

    // delete the old arrays
    if(mPointers)              delete [] mPointers;
    if(mSizesOfVerticalGaps)   delete [] mSizesOfVerticalGaps;
    if(mSizesOfHorizontalGaps) delete [] mSizesOfHorizontalGaps;

    try {

      // initialize the arrays
      mPointers              = new char[mCurrentMatrixSize];
      mSizesOfVerticalGaps   = new short[mCurrentMatrixSize];
      mSizesOfHorizontalGaps = new short[mCurrentMatrixSize];

    } catch(bad_alloc) {
      cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl;
      exit(1);
    }
  }

  // initialize the traceback matrix to STOP
  memset((char*)mPointers, 0, SIZEOF_CHAR * queryLen);
  for(unsigned int i = 1; i < referenceLen; i++) mPointers[i * queryLen] = 0;

  // initialize the gap matrices to 1
  uninitialized_fill(mSizesOfVerticalGaps, mSizesOfVerticalGaps + mCurrentMatrixSize, 1);
  uninitialized_fill(mSizesOfHorizontalGaps, mSizesOfHorizontalGaps + mCurrentMatrixSize, 1);


  // initialize our repeat counts if they are needed
  vector<map<string, int> > referenceRepeats;
  vector<map<string, int> > queryRepeats;

  if (mUseRepeatGapExtensionPenalty) {
    for (unsigned int i = 0; i < queryLen; ++i)
      queryRepeats.push_back(repeatCounts(i, s2, repeat_size_max));
    for (unsigned int i = 0; i < referenceLen; ++i)
      referenceRepeats.push_back(repeatCounts(i, s1, repeat_size_max));

    // keep only the biggest repeat
    vector<map<string, int> >::iterator q = queryRepeats.begin();
    for (; q != queryRepeats.end(); ++q) {
      map<string, int>::iterator biggest = q->begin();
      map<string, int>::iterator z = q->begin();
      for (; z != q->end(); ++z)
        if (z->first.size() > biggest->first.size()) biggest = z;
      z = q->begin();
      while (z != q->end()) {
        if (z != biggest)
          q->erase(z++);
        else ++z;
      }
    }

    q = referenceRepeats.begin();
    for (; q != referenceRepeats.end(); ++q) {
      map<string, int>::iterator biggest = q->begin();
      map<string, int>::iterator z = q->begin();
      for (; z != q->end(); ++z)
        if (z->first.size() > biggest->first.size()) biggest = z;
      z = q->begin();
      while (z != q->end()) {
        if (z != biggest)
          q->erase(z++);
        else ++z;
      }
    }

    // remove repeat information from ends of queries
    // this results in the addition of spurious flanking deletions in repeats
    map<string, int>& qrend = queryRepeats.at(queryRepeats.size() - 2);
    if (!qrend.empty()) {
      int queryEndRepeatBases = qrend.begin()->first.size() * qrend.begin()->second;
      for (int i = 0; i < queryEndRepeatBases; ++i)
        queryRepeats.at(queryRepeats.size() - 2 - i).clear();
    }

    map<string, int>& qrbegin = queryRepeats.front();
    if (!qrbegin.empty()) {
      int queryBeginRepeatBases = qrbegin.begin()->first.size() * qrbegin.begin()->second;
      for (int i = 0; i < queryBeginRepeatBases; ++i)
        queryRepeats.at(i).clear();
    }

  }

  int entropyWindowSize = 8;
  vector<float> referenceEntropies;
  vector<float> queryEntropies;
  if (mUseEntropyGapOpenPenalty) {
    for (unsigned int i = 0; i < queryLen; ++i)
      queryEntropies.push_back(
        shannon_H((char*) &s2[max(0, min((int) i - entropyWindowSize / 2, (int) queryLen - entropyWindowSize - 1))],
              entropyWindowSize));
    for (unsigned int i = 0; i < referenceLen; ++i)
      referenceEntropies.push_back(
        shannon_H((char*) &s1[max(0, min((int) i - entropyWindowSize / 2, (int) referenceLen - entropyWindowSize - 1))],
              entropyWindowSize));
  }

  // normalize entropies
  /*
  float qsum = 0;
  float qnorm = 0;
  float qmax = 0;
  for (vector<float>::iterator q = queryEntropies.begin(); q != queryEntropies.end(); ++q) {
    qsum += *q;
    if (*q > qmax) qmax = *q;
  }
  qnorm = qsum / queryEntropies.size();
  for (vector<float>::iterator q = queryEntropies.begin(); q != queryEntropies.end(); ++q)
    *q = *q / qsum + qmax;

  float rsum = 0;
  float rnorm = 0;
  float rmax = 0;
  for (vector<float>::iterator r = referenceEntropies.begin(); r != referenceEntropies.end(); ++r) {
    rsum += *r;
    if (*r > rmax) rmax = *r;
  }
  rnorm = rsum / referenceEntropies.size();
  for (vector<float>::iterator r = referenceEntropies.begin(); r != referenceEntropies.end(); ++r)
    *r = *r / rsum + rmax;
  */

  //
  // construct
  //

  // reinitialize our query-dependent arrays
  if(s2.length() > mCurrentQuerySize) {

    // calculate the new query array size
    mCurrentQuerySize = s2.length();

    // delete the old arrays
    if(mQueryGapScores) delete [] mQueryGapScores;
    if(mBestScores)     delete [] mBestScores;

    // initialize the arrays
    try {

      mQueryGapScores = new float[mCurrentQuerySize + 1];
      mBestScores     = new float[mCurrentQuerySize + 1];

    } catch(bad_alloc) {
      cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl;
      exit(1);
    }
  }

  // reinitialize our reference+query-dependent arrays
  if(sequenceSumLength > mCurrentAQSumSize) {

    // calculate the new reference array size
    mCurrentAQSumSize = sequenceSumLength;

    // delete the old arrays
    if(mReversedAnchor) delete [] mReversedAnchor;
    if(mReversedQuery)  delete [] mReversedQuery;

    // initialize the arrays
    try {

      mReversedAnchor = new char[mCurrentAQSumSize + 1];  // reversed sequence #1
      mReversedQuery  = new char[mCurrentAQSumSize + 1];  // reversed sequence #2

    } catch(bad_alloc) {
      cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl;
      exit(1);
    }
  }

  // initialize the gap score and score vectors
  uninitialized_fill(mQueryGapScores, mQueryGapScores + queryLen, FLOAT_NEGATIVE_INFINITY);
  memset((char*)mBestScores, 0, SIZEOF_FLOAT * queryLen);

  float similarityScore, totalSimilarityScore, bestScoreDiagonal;
  float queryGapExtendScore, queryGapOpenScore;
  float referenceGapExtendScore, referenceGapOpenScore, currentAnchorGapScore;

  unsigned int BestColumn = 0;
  unsigned int BestRow    = 0;
  BestScore               = FLOAT_NEGATIVE_INFINITY;

  for(unsigned int i = 1, k = queryLen; i < referenceLen; i++, k += queryLen) {

    currentAnchorGapScore = FLOAT_NEGATIVE_INFINITY;
    bestScoreDiagonal = mBestScores[0];

    for(unsigned int j = 1, l = k + 1; j < queryLen; j++, l++) {

      // calculate our similarity score
      similarityScore = mScoringMatrix[s1[i - 1] - 'A'][s2[j - 1] - 'A'];

      // fill the matrices
      totalSimilarityScore = bestScoreDiagonal + similarityScore;
      
      //cerr << "i: " << i << ", j: " << j << ", totalSimilarityScore: " << totalSimilarityScore << endl;

      queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty;
      queryGapOpenScore   = mBestScores[j] - mGapOpenPenalty;
      
      // compute the h**o-polymer gap score if enabled
      if(mUseHomoPolymerGapOpenPenalty)
        if((j > 1) && (s2[j - 1] == s2[j - 2]))
          queryGapOpenScore = mBestScores[j] - mHomoPolymerGapOpenPenalty;
      
      // compute the entropy gap score if enabled
      if (mUseEntropyGapOpenPenalty) {
        queryGapOpenScore = 
          mBestScores[j] - mGapOpenPenalty 
          * max(queryEntropies.at(j), referenceEntropies.at(i))
          * mEntropyGapOpenPenalty;
      }

      int gaplen = mSizesOfVerticalGaps[l - queryLen] + 1;

      if (mUseRepeatGapExtensionPenalty) {
        map<string, int>& repeats = queryRepeats[j];
        // does the sequence which would be inserted or deleted in this gap match the repeat structure which it is embedded in?
        if (!repeats.empty()) {

          const pair<string, int>& repeat = *repeats.begin();
          int repeatsize = repeat.first.size();
          if (gaplen != repeatsize && gaplen % repeatsize != 0) {
            gaplen = gaplen / repeatsize + repeatsize;
          }

          if ((repeat.first.size() * repeat.second) > 3 && gaplen + i < s1.length()) {
            string gapseq = string(&s1[i], gaplen);
            if (gapseq == repeat.first || isRepeatUnit(gapseq, repeat.first)) {
              queryGapExtendScore = mQueryGapScores[j]
                + mRepeatGapExtensionPenalty / (float) gaplen;
                //    mMaxRepeatGapExtensionPenalty)
            } else {
              queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty;
            }
          }
        } else {
          queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty;
        }
      }
          
      if(queryGapExtendScore > queryGapOpenScore) {
        mQueryGapScores[j] = queryGapExtendScore;
        mSizesOfVerticalGaps[l] = gaplen;
      } else mQueryGapScores[j] = queryGapOpenScore;
      
      referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty;
      referenceGapOpenScore   = mBestScores[j - 1] - mGapOpenPenalty;
          
      // compute the h**o-polymer gap score if enabled
      if(mUseHomoPolymerGapOpenPenalty)
        if((i > 1) && (s1[i - 1] == s1[i - 2]))
          referenceGapOpenScore = mBestScores[j - 1] - mHomoPolymerGapOpenPenalty;
          
      // compute the entropy gap score if enabled
      if (mUseEntropyGapOpenPenalty) {
        referenceGapOpenScore = 
          mBestScores[j - 1] - mGapOpenPenalty 
          * max(queryEntropies.at(j), referenceEntropies.at(i))
          * mEntropyGapOpenPenalty;
      }

      gaplen = mSizesOfHorizontalGaps[l - 1] + 1;

      if (mUseRepeatGapExtensionPenalty) {
        map<string, int>& repeats = referenceRepeats[i];
        // does the sequence which would be inserted or deleted in this gap match the repeat structure which it is embedded in?
        if (!repeats.empty()) {

          const pair<string, int>& repeat = *repeats.begin();
          int repeatsize = repeat.first.size();
          if (gaplen != repeatsize && gaplen % repeatsize != 0) {
            gaplen = gaplen / repeatsize + repeatsize;
          }

          if ((repeat.first.size() * repeat.second) > 3 && gaplen + j < s2.length()) {
            string gapseq = string(&s2[j], gaplen);
            if (gapseq == repeat.first || isRepeatUnit(gapseq, repeat.first)) {
              referenceGapExtendScore = currentAnchorGapScore
                + mRepeatGapExtensionPenalty / (float) gaplen;
                //mMaxRepeatGapExtensionPenalty)
            } else {
              referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty;
            }
          }
        } else {
          referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty;
        }
      }

      if(referenceGapExtendScore > referenceGapOpenScore) {
        currentAnchorGapScore = referenceGapExtendScore;
        mSizesOfHorizontalGaps[l] = gaplen;
      } else currentAnchorGapScore = referenceGapOpenScore;
          
      bestScoreDiagonal = mBestScores[j];
      mBestScores[j] = MaxFloats(totalSimilarityScore, mQueryGapScores[j], currentAnchorGapScore);
          
          
      // determine the traceback direction
      // diagonal (445364713) > stop (238960195) > up (214378647) > left (166504495)
      if(mBestScores[j] == 0)                         mPointers[l] = Directions_STOP;
      else if(mBestScores[j] == totalSimilarityScore) mPointers[l] = Directions_DIAGONAL;
      else if(mBestScores[j] == mQueryGapScores[j])   mPointers[l] = Directions_UP;
      else                                            mPointers[l] = Directions_LEFT;
          
      // set the traceback start at the current cell i, j and score
      if(mBestScores[j] > BestScore) {
        BestRow    = i;
        BestColumn = j;
        BestScore  = mBestScores[j];
      }
    }
  }

  //
  // traceback
  //

  // aligned sequences
  int gappedAnchorLen  = 0;   // length of sequence #1 after alignment
  int gappedQueryLen   = 0;   // length of sequence #2 after alignment
  int numMismatches    = 0;   // the mismatched nucleotide count

  char c1, c2;

  int ci = BestRow;
  int cj = BestColumn;
  int ck = ci * queryLen;

  // traceback flag
  bool keepProcessing = true;

  while(keepProcessing) {
    //cerr << ci << " " << cj << " " << ck << "  ... " << gappedAnchorLen << " " << gappedQueryLen <<  endl;

    // diagonal (445364713) > stop (238960195) > up (214378647) > left (166504495)
    switch(mPointers[ck + cj]) {

    case Directions_DIAGONAL:
      c1 = s1[--ci];
      c2 = s2[--cj];
      ck -= queryLen;

      mReversedAnchor[gappedAnchorLen++] = c1;
      mReversedQuery[gappedQueryLen++]   = c2;

      // increment our mismatch counter
      if(mScoringMatrix[c1 - 'A'][c2 - 'A'] == mMismatchScore) numMismatches++;   
      break;

    case Directions_STOP:
      keepProcessing = false;
      break;

    case Directions_UP:
      for(unsigned int l = 0, len = mSizesOfVerticalGaps[ck + cj]; l < len; l++) {
        if (ci <= 0) {
          keepProcessing = false;
          break;
        }
        mReversedAnchor[gappedAnchorLen++] = s1[--ci];
        mReversedQuery[gappedQueryLen++]   = GAP;
        ck -= queryLen;
        numMismatches++;
      }
      break;

    case Directions_LEFT:
      for(unsigned int l = 0, len = mSizesOfHorizontalGaps[ck + cj]; l < len; l++) {
        if (cj <= 0) {
          keepProcessing = false;
          break;
        }
        mReversedAnchor[gappedAnchorLen++] = GAP;
        mReversedQuery[gappedQueryLen++]   = s2[--cj];
        numMismatches++;
      }
      break;
    }
  }

  // define the reference and query sequences
  mReversedAnchor[gappedAnchorLen] = 0;
  mReversedQuery[gappedQueryLen]   = 0;

  // catch sequences with different lengths
  if(gappedAnchorLen != gappedQueryLen) {
    cout << "ERROR: The aligned sequences have different lengths after Smith-Waterman-Gotoh algorithm." << endl;
    exit(1);
  }

  // reverse the strings and assign them to our alignment structure
  reverse(mReversedAnchor, mReversedAnchor + gappedAnchorLen);
  reverse(mReversedQuery,  mReversedQuery  + gappedQueryLen);

  //alignment.Reference = mReversedAnchor;
  //alignment.Query     = mReversedQuery;

  // set the reference endpoints
  //alignment.ReferenceBegin = ci;
  //alignment.ReferenceEnd   = BestRow - 1;
  referenceAl = ci;

  // set the query endpoints
  /*  
    if(alignment.IsReverseComplement) {
    alignment.QueryBegin = s2Length - BestColumn;
    alignment.QueryEnd   = s2Length - cj - 1;
    // alignment.QueryLength= alignment.QueryBegin - alignment.QueryEnd + 1;
    } else {
    alignment.QueryBegin = cj;
    alignment.QueryEnd   = BestColumn - 1;
    // alignment.QueryLength= alignment.QueryEnd - alignment.QueryBegin + 1;
    }
  */

  // set the query length and number of mismatches
  //alignment.QueryLength = alignment.QueryEnd - alignment.QueryBegin + 1;
  //alignment.NumMismatches  = numMismatches;

  unsigned int alLength = strlen(mReversedAnchor);
  unsigned int m = 0, d = 0, i = 0;
  bool dashRegion = false;
  ostringstream oCigar (ostringstream::out);
  int insertedBases = 0;

  if ( cj != 0 ) {
    if ( cj > 0 ) {
      oCigar << cj << 'S';
    } else { // how do we get negative cj's?
      referenceAl -= cj;
      alLength += cj;
    }
  }
    
  for ( unsigned int j = 0; j < alLength; j++ ) {
    // m
    if ( ( mReversedAnchor[j] != GAP ) && ( mReversedQuery[j] != GAP ) ) {
      if ( dashRegion ) {
        if ( d != 0 ) oCigar << d << 'D';
        else          { oCigar << i << 'I'; insertedBases += i; }
      }
      dashRegion = false;
      m++;
      d = 0;
      i = 0;
    }
    else {
      if ( !dashRegion && m )
        oCigar << m << 'M';
      dashRegion = true;
      m = 0;
      if ( mReversedAnchor[j] == GAP ) {
        if ( d != 0 ) oCigar << d << 'D';
        i++;
        d = 0;
      }
      else {
        if ( i != 0) { oCigar << i << 'I'; insertedBases += i; }
        d++;
        i = 0;
      }
    }
  }
  if      ( m != 0 ) oCigar << m << 'M';
  else if ( d != 0 ) oCigar << d << 'D';
  else if ( i != 0 ) oCigar << i << 'I';

  if ( BestColumn != s2.length() )
    oCigar << s2.length() - BestColumn << 'S';

  cigarAl = oCigar.str();

  // fix the gap order
  CorrectHomopolymerGapOrder(alLength, numMismatches);

  if (mUseEntropyGapOpenPenalty || mUseRepeatGapExtensionPenalty) {
    int offset = 0;
    string oldCigar;
    try {
      oldCigar = cigarAl;
      stablyLeftAlign(s2, cigarAl, s1.substr(referenceAl, alLength - insertedBases), offset);
    } catch (...) {
      cerr << "an exception occurred when left-aligning " << s1 << " " << s2 << endl;
      cigarAl = oldCigar; // undo the failed left-realignment attempt
      offset = 0;
    }
    referenceAl += offset;
  }

}
// Example #2
// 0
int main (int argc, char** argv) {

    double snp_mutation_rate = 0.001;
    double indel_mutation_rate = 0.0001;
    double het_rate = 0.5;
    double afs_alpha = 1;
    double indel_alpha = 3;
    double microsatellite_afs_alpha = 1;
    double microsatellite_len_alpha = 1.7;
    double microsatellite_mutation_rate = 0.0001;
    double mnp_ratio = 0.01;
    double tstv_ratio = 2.5;
    double deamination_ratio = 1.8;
    int microsatellite_min_length = 1;
    int indel_max = 1000;
    int ploidy = 1;
    int population_size = 1;
    int sample_id_max_digits = 1;
    int seed = time(NULL);
    string fastaFileName;
    string file_prefix = "";
    string sample_prefix = "";
    bool dry_run = false;
    int repeat_size_max = 20;
    bool uniform_indel_distribution = false;

    double p, lambda, shape, mu, sigma;

    string command_line = argv[0];
    for (int i = 1; i < argc; ++i) {
        command_line += " ";
        command_line += argv[i];
    }

    int c;

    while (true) {
        static struct option long_options[] =
            {
                /* These options set a flag. */
                //{"verbose", no_argument,       &verbose_flag, 1},
                //{"brief",   no_argument,       &verbose_flag, 0},
                {"help", no_argument, 0, 'h'},
                {"snp-rate",  required_argument, 0, 's'},
                {"mnp-ratio", required_argument, 0, 'M'},
                {"indel-rate",  required_argument, 0, 'i'},
                {"indel-alpha", required_argument, 0, 'z'},
                {"indel-max", required_argument, 0, 'X'},
                {"repeat-size-max", required_argument, 0, 'q'},
                {"microsat-rate",  required_argument, 0, 'm'},
                {"microsat-afs-alpha", required_argument, 0, 't'},
                {"microsat-len-alpha", required_argument, 0, 'j'},
                {"microsat-min-len", required_argument, 0, 'l'},
                {"afs-alpha",  required_argument, 0, 'a'},
                {"ploidy", required_argument, 0, 'p'},
                {"population-size", required_argument, 0, 'n'},
                {"file-prefix", required_argument, 0, 'P'},
                {"sample-prefix", required_argument, 0, 'S'},
                {"random-seed", required_argument, 0, 'g'},
                {"dry-run", no_argument, 0, 'd'},
                {"uniform-indels", no_argument, 0, 'U'},
                {"ts-tv-ratio", required_argument, 0, 'T'},
                {"deamination-ratio", required_argument, 0, 'D'},
                {0, 0, 0, 0}
            };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hdUa:z:s:i:q:p:n:M:X:t:m:P:S:g:l:j:T:", long_options, &option_index);

        /* Detect the end of the options. */
        if (c == -1)
            break;
 
        switch (c)
        {
        case 0:
            /* If this option set a flag, do nothing else now. */
            if (long_options[option_index].flag != 0)
                break;
            printf ("option %s", long_options[option_index].name);
            if (optarg)
                printf (" with arg %s", optarg);
            printf ("\n");
            break;

        case 'd':
            dry_run = true;
            break;

        case 'U':
            uniform_indel_distribution = true;
            break;

        case 'q':
            if (!convert(optarg, repeat_size_max)) {
                cerr << "could not read -q, --repeat-size-max" << endl;
                exit(1);
            }
            break;

        case 's':
            if (!convert(optarg, snp_mutation_rate)) {
                cerr << "could not read -s, --snp-rate" << endl;
                exit(1);
            }
            break;

        case 'i':
            if (!convert(optarg, indel_mutation_rate)) {
                cerr << "could not read -i, --indel-rate" << endl;
                exit(1);
            }
            break;

        case 'a':
            if (!convert(optarg, afs_alpha)) {
                cerr << "could not read -a, --afs-alpha" << endl;
                exit(1);
            }
            break;
 
        case 'z':
            if (!convert(optarg, indel_alpha)) {
                cerr << "could not read -z, --indel-alpha" << endl;
                exit(1);
            }
            break;

        case 'X':
            if (!convert(optarg, indel_max)) {
                cerr << "could not read -M, --indel-max" << endl;
                exit(1);
            }
            break;
 
        case 'M':
            if (!convert(optarg, mnp_ratio)) {
                cerr << "could not read -m, --mnp-ratio" << endl;
                exit(1);
            }
            break;
 
        case 'm':
            if (!convert(optarg, microsatellite_mutation_rate)) {
                cerr << "could not read -m, --microsat-rate" << endl;
                exit(1);
            }
            break;

        case 'T':
            if (!convert(optarg, tstv_ratio)) {
                cerr << "could not read -T, --ts-tv-ratio" << endl;
                exit(1);
            }
            break;
 
        case 't':
            if (!convert(optarg, microsatellite_afs_alpha)) {
                cerr << "could not read -m, --microsatellite-afs-alpha" << endl;
                exit(1);
            }
            break;
 
        case 'j':
            if (!convert(optarg, microsatellite_len_alpha)) {
                cerr << "could not read -m, --microsatellite-len-alpha" << endl;
                exit(1);
            }
            break;
 
        case 'l':
            if (!convert(optarg, microsatellite_min_length)) {
                cerr << "could not read -l, --microsat-min-len" << endl;
                exit(1);
            }
            break;
 
        case 'p':
            if (!convert(optarg, ploidy)) {
                cerr << "could not read -p, --ploidy" << endl;
                exit(1);
            }
            break;

        case 'P':
            file_prefix = optarg;
            break;

        case 'S':
            sample_prefix = optarg;
            break;
 
        case 'n':
            if (!convert(optarg, population_size)) {
                cerr << "could not read -n, --population-size" << endl;
                exit(1);
            }
            sample_id_max_digits = strlen(optarg);
            break;

        case 'g':
            if (!convert(optarg, seed)) {
                cerr << "could not read -g, --random-seed" << endl;
                exit(1);
            }
            break;

        case 'h':
            printSummary();
            exit(0);
            break;
 
        case '?':
            /* getopt_long already printed an error message. */
            printSummary();
            exit(1);
            break;
 
        default:
            abort ();
        }
    }

    /* Print any remaining command line arguments (not options). */
    if (optind < argc) {
        //cerr << "fasta file: " << argv[optind] << endl;
        fastaFileName = argv[optind];
    } else {
        cerr << "please specify a fasta file" << endl;
        printSummary();
        exit(1);
    }

    init_genrand(seed); // seed mt with current time

    //mt19937 eng(seed);

    int bpPerHaplotypeMean = 1000;
    double bpPerHaplotypeSigma = 200;
    normal_distribution<double> normal(mu, sigma);
     
    //lambda = 7.0;
    //poisson_distribution<int> poisson(lambda);
    //poisson(eng);

    string seqname;
    string sequence;  // holds sequence so we can process it

    FastaReference fr;
    fr.open(fastaFileName);

    string bases = "ATGC";

    vcf::VariantCallFile vcfFile;

    // write the VCF header
    stringstream headerss;
    headerss 
        << "##fileformat=VCFv4.1" << endl
        << "##fileDate=" << dateStr() << endl
        << "##source=mutatrix population genome simulator" << endl
        << "##seed=" << seed << endl
        << "##reference=" << fastaFileName << endl
        << "##phasing=true" << endl
        << "##commandline=" << command_line << endl
        << "##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Alternate allele count\">" << endl
        << "##INFO=<ID=TYPE,Number=A,Type=String,Description=\"Type of each allele (snp, ins, del, mnp, complex)\">" << endl
        << "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples at the site\">" << endl
        << "##INFO=<ID=NA,Number=1,Type=Integer,Description=\"Number of alternate alleles\">" << endl
        << "##INFO=<ID=LEN,Number=A,Type=Integer,Description=\"Length of each alternate allele\">" << endl
        << "##INFO=<ID=MICROSAT,Number=0,Type=Flag,Description=\"Generated at a sequence repeat loci\">" << endl
        << "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">" << endl
        << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT";

    vector<string> samples;
    for (int i = 0; i < population_size; ++i) {
        stringstream sampless;
        sampless << sample_prefix << setfill('0') << setw(sample_id_max_digits) << i + 1; // one-based sample names
        samples.push_back(sampless.str());
        headerss << "\t" << sampless.str();
    }

    // and set up our VCF output file
    string header = headerss.str();
    vcfFile.openForOutput(header);
    cout << vcfFile.header << endl;

    int copies = ploidy * population_size;

    map<string, vector<SampleFastaFile*> > sequencesByRefseq;

    if (!dry_run) {
        for (FastaIndex::iterator s = fr.index->begin(); s != fr.index->end(); ++s) {

            FastaIndexEntry& indexEntry = s->second;
            seqname = indexEntry.name;

            vector<SampleFastaFile*>& sequences = sequencesByRefseq[seqname];
            for (int i = 0; i < population_size; ++i) {
                stringstream sname;
                sname << sample_prefix << setfill('0') << setw(sample_id_max_digits) << i + 1;
                string samplename = sname.str();
                for (int j = 0; j < ploidy; ++j) {
                    stringstream cname;
                    cname << j;
                    string chromname = cname.str();
                    string fullname = samplename + ":" + seqname + ":" + chromname;
                    string filename = file_prefix + fullname + ".fa";
                    //sequences.push_back(SampleFastaFile(filename, seqname));
                    sequences.push_back(new SampleFastaFile(filename, seqname));
                }
            }
        }
    }



    for (FastaIndex::iterator s = fr.index->begin(); s != fr.index->end(); ++s) {

        FastaIndexEntry& indexEntry = s->second;
        seqname = indexEntry.name;
        sequence = fr.getSequence(s->first);

        vector<SampleFastaFile*>& sequences = sequencesByRefseq[seqname];
        //sequences.resize(copies);
        
        long int pos = 0;
        long int microsatellite_end_pos = 0;
        while (pos < sequence.size()) {

            //cout << pos + 1 << " microsat end pos " << microsatellite_end_pos << endl;

            string ref = sequence.substr(pos, 1); // by default, ref is just the current base

            // skip non-DNA sequence information
            if (!(ref == "A" || ref == "T" || ref == "C" || ref == "G")) {
                pos += ref.size();
                for (vector<SampleFastaFile*>::iterator s = sequences.begin(); s != sequences.end(); ++s) {
                    (*s)->write(ref);
                }
                continue;
            }

            vector<Allele> alleles;

            // establish if we are in a repeat
            // and what motif is being repeated, how many times

            int len = 1;

            // get reference repeats
            // if we have a repeat, adjust the mutation rate
            // using length and direction-dependent
            // formula from "Likelihood-Based Estimation of Microsatellite Mutation Rates"
            // http://www.genetics.org/cgi/content/full/164/2/781#T1

            if (pos > microsatellite_end_pos) {

                map<string, int> repeats = repeatCounts(pos + 1, (const string&) sequence, repeat_size_max);

                string seq;
                int repeat_count = 0;
                // get the "biggest" repeat, the most likely ms allele at this site
                for (map<string, int>::iterator r = repeats.begin(); r != repeats.end(); ++r) {
                    if (repeat_count < r->second) {
                        repeat_count = r->second;
                        seq = r->first;
                    }
                }
                //cout << pos + 1 << " " << sequence.substr(pos + 1, seq.size() * repeat_count) << " ?= " << seq * repeat_count << endl;

                // guard ensures that we are in a pure repeat situoation, tandem-tandem repeats are not handled presently
                if (repeats.size() > 0 && sequence.substr(pos + 1, seq.size() * repeat_count) == seq * repeat_count) {

                    int microsatellite_length = repeat_count * seq.size();

                    // record end of microsatellite so we don't generate more mutations until we pass it
                    microsatellite_end_pos = pos + microsatellite_length - 1;

                    if (microsatellite_length > microsatellite_min_length
                        //&& genrand_real1() / copies 
                        //    < microsatellite_mutation_rate * repeat_count) {
                        && genrand_real1() > pow(1 - (microsatellite_mutation_rate * repeat_count), log(copies) * 2)) {

                        // establish the relative rate of ins and del events
                        /*
                          long double repeatMutationDelProbability = microsatelliteDelProb(repeat_count);
                          long double repeatMutationInsProbability = microsatelliteInsProb(repeat_count);
                          long double indel_balance = 1;
                          if (repeatMutationInsProbability > repeatMutationDelProbability) {
                          indel_balance = repeatMutationInsProbability / repeatMutationDelProbability;
                          } else {
                          indel_balance = 1 - (repeatMutationInsProbability / repeatMutationDelProbability);
                          }
                        */
                        double indel_balance = 0.5;

                        // how many alleles at the site?

                        //int numalleles = min((int) floor(zetarandom(microsatellite_afs_alpha)), (int) ((double) repeat_count * indel_balance));
                        int numalleles = random_allele_frequency(repeat_count, microsatellite_afs_alpha);
                        //cout << "repeat_count: " << repeat_count << " numalleles: " << numalleles << endl;

                        map<int, bool> allele_lengths;
                        // lengths of the alleles
                        while (allele_lengths.size() < numalleles) {
                            int allele_length;
                            // TODO adjust length so that shorter events are more likely...
                            if (genrand_real1() > indel_balance) {
                                allele_length = -1 * min((int) floor(zetarandom(microsatellite_len_alpha)), repeat_count);
                            } else {
                                allele_length = min((int) floor(zetarandom(microsatellite_len_alpha)), repeat_count);
                            }
                            //cout << allele_length << endl;
                            map<int, bool>::iterator f = allele_lengths.find(allele_length);
                            if (f == allele_lengths.end()) {
                                allele_lengths[allele_length] = true;
                            }
                        }

                        // generate alleles
                        for (map<int, bool>::iterator f = allele_lengths.begin();
                             f != allele_lengths.end(); ++f) {

                            int allele_length = f->first;
                            int c = abs(f->first);
                            string alt = seq;

                            for (int i = 1; i < c; ++i)
                                alt += seq;

                            if (allele_length > 0) {
                                alleles.push_back(Allele(ref, ref + alt, "MICROSAT"));
                            } else {
                                alleles.push_back(Allele(ref + alt, ref, "MICROSAT"));
                            }
                            //cout << pos + 1 << " "  << microsatellite_length << " " << alleles.back() << endl;
                        }
                        //cout << "alleles.size() == " << alleles.size() << endl;
                    }
                }
            }

            // snp case
            if (genrand_real1() > pow(1 - snp_mutation_rate, log(max(copies, 2)) * 2)) {

                // make an alternate allele
                /*
                  string alt = ref;
                  while (alt == ref) {
                  alt = string(1, bases.at(genrand_int32() % 4));
                  }
                */
                string alt = ref;
                if (genrand_real1() > 1 / (1 + tstv_ratio)) {
                    if (ref == "A") {
                        alt = "G";
                    } else if (ref == "G") {
                        alt = "A";
                    } else if (ref == "C") {
                        alt = "T";
                    } else if (ref == "T") {
                        alt = "C";
                    }
                } else {
                    while (alt == ref || isTransition(ref, alt)) {
                        alt = string(1, bases.at(genrand_int32() % 4));
                    }
                }

                if (genrand_real1() < mnp_ratio) {
                    int i = 1;
                    do {
                        ref += sequence.substr(pos + i, 1);
                        alt += sequence.substr(pos + i, 1);
                        ++i;
                        while (alt.at(alt.size() - 1) == ref.at(ref.size() - 1)) {
                            alt.at(alt.size() - 1) = bases.at(genrand_int32() % 4);
                        }
                    } while (genrand_real1() < mnp_ratio);
                    len = alt.size();
                }
                alleles.push_back(Allele(ref, alt));
            }

            // indel case
            if (genrand_real1() > pow(1 - indel_mutation_rate, log(max(copies, 2)) * 2)) {
                // how many bp?
                if (uniform_indel_distribution) {
                    len = (int) floor(genrand_real1() * indel_max);
                } else {
                    len = (int) floor(zetarandom(indel_alpha));
                }
                // guard against out-of-sequence indels
                if (pos + len < sequence.size() && len <= indel_max) {
                    if (genrand_int32() % 2 == 0) {
                        // deletion
                        alleles.push_back(Allele(sequence.substr(pos, 1 + len), sequence.substr(pos, 1)));
                    } else {
                        string alt = ref;
                        // insertion?
                        // insert some random de novo bases
                        while (alt.length() < len + 1) {
                            alt += string(1, bases.at(genrand_int32() % 4));
                        }
                        alleles.push_back(Allele(ref, alt));
                    }
                } else {
                    // fall through
                }
            }

            // no mutation generated
            if (alleles.empty()) {
                for (int i = 0; i < copies; ++i) {
                    if (!dry_run) {
                        sequences.at(i)->write(ref);
                    }
                }
                pos += ref.size();
            } else {

                // TODO randomly distribute all the alleles throughout the population
                // generate allele frequencies for each
                // fun times...

                string genotype;

                vector<bool> alts;
                random_shuffle(alleles.begin(), alleles.end());

                vector<Allele*> population_alleles;
                list<Allele> present_alleles; // filtered for AFS > 0 in the sample
                
                // AFS simulation
                int remaining_copies = copies;
                while (remaining_copies > 0 && !alleles.empty()) {
                    Allele allele = alleles.back();
                    alleles.pop_back();
                    int allele_freq = random_allele_frequency(remaining_copies, afs_alpha);
                    if (allele_freq > 0) {
                        present_alleles.push_back(allele);
                        Allele* allelePtr = &present_alleles.back();
                        for (int i = 0; i < allele_freq; ++i) {
                            population_alleles.push_back(allelePtr);
                        }
                        remaining_copies -= allele_freq;
                    }
                }

                if (present_alleles.empty()) {
                    for (int i = 0; i < copies; ++i) {
                        if (!dry_run) {
                            sequences.at(i)->write(ref);
                        }
                    }
                    pos += ref.size();
                    continue;
                }

                reverse(present_alleles.begin(), present_alleles.end());

                // establish the correct reference sequence and alternate allele set
                for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a) {
                    Allele& allele = *a;
                    //cout << allele << endl;
                    if (allele.ref.size() > ref.size()) {
                        ref = allele.ref;
                    }
                }

                // reference alleles take up the rest
                Allele reference_allele = Allele(ref, ref);
                for (int i = 0; i < remaining_copies; ++i) {
                    population_alleles.push_back(&reference_allele);
                }

                vector<string> altstrs;
                // now the reference allele is the largest possible, adjust the alt allele strings to reflect this
                // if we have indels, add the base before, set the position back one
                for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a) {
                    Allele& allele = *a;
                    string alleleStr = ref;
                    if (allele.ref.size() == allele.alt.size()) {
                        alleleStr.replace(0, allele.alt.size(), allele.alt);
                    } else {
                        alleleStr.replace(0, allele.ref.size(), allele.alt);
                    }
                    allele.ref = ref;
                    allele.alt = alleleStr;
                    altstrs.push_back(alleleStr);
                }

                assert(population_alleles.size() == copies);

                // shuffle the alleles around the population
                random_shuffle(population_alleles.begin(), population_alleles.end());

                vcf::Variant var(vcfFile);
                var.sequenceName = seqname;
                var.position = pos + 1;
                var.quality = 99;
                var.id = ".";
                var.filter = ".";
                var.info["NS"].push_back(convert(population_size));
                var.info["NA"].push_back(convert(present_alleles.size()));
                var.format.push_back("GT");
                var.ref = ref;
                var.alt = altstrs;

                // debugging, uncomment to see sequence context
                //cout << sequence.substr(pos - 10, 10) << "*" << ref << "*" << sequence.substr(pos + 1, 9) << endl;

                map<string, int> alleleIndexes;
                alleleIndexes[convert(reference_allele)] = 0; // XXX should we handle this differently, by adding the reference allele to present_alleles?
                int i = 1;
                for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a, ++i) {
                    Allele& allele = *a;
                    //cout << allele << " " << i << endl;
                    alleleIndexes[convert(allele)] = i;
                    //cout << allele << " " << i << endl;
                }

                //for (map<string, int>::iterator a = alleleIndexes.begin(); a != alleleIndexes.end(); ++a) {
                //    cout << a->first << " = " << a->second << endl;
                //}

                int j = 0;
                for (vector<string>::iterator s = samples.begin(); s != samples.end(); ++s, ++j) {
                    string& sample = *s;
                    vector<string> genotype;
                    // XXX hack, maybe this should get stored in another map for easier access?
                    for (int i = 0; i < ploidy; ++i) {
                        int l = (j * ploidy) + i;
                        //cout << l << " " << population_alleles.at(l) << " " << alleleIndexes[convert(population_alleles.at(l))] << endl;
                        genotype.push_back(convert(alleleIndexes[convert(*population_alleles.at(l))]));
                    }
                    var.samples[sample]["GT"].push_back(join(genotype, "|"));
                    //cout << var.samples[sample]["GT"].front() << endl;
                }

                // XXX THIS IS BROKEN BECAUSE YOUR REFERENCE ALLELE CHANGES
                // LENGTH WITH DELETIONS.
                //
                // IT'S POSSIBLE TO GET COMPLEX ALLELES AT THE INTERSECTIONS
                // BETWEEN ONE ALLELIC VARIANT AND ANOTHER.  THIS IS BROKEN!
                //
                // TO FIX--- BUILD HAPLOTYPES, THEN DISTRIBUTE THEM WITHIN THE POPULATION
                //
                // now write out our sequence data (FASTA files)
                for (int j = 0; j < population_size; ++j) {
                    for (int i = 0; i < ploidy; ++i) {
                        int l = (j * ploidy) + i;
                        Allele* allele = population_alleles.at(l);
                        if (!dry_run) {
                            sequences.at(l)->write(allele->alt);
                        }
                    }
                }

                // tabulate allele frequency, and write some details to the VCF
                for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a) {

                    Allele& allele = *a;
                    Allele* allelePtr = &*a;

                    vector<string> genotypes;
                    genotypes.resize(population_size);

                    int allele_freq = 0;

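                    // obtain the allele frequency by counting, over every sample and ploidy slot,
                    // the entries of population_alleles that point at this allele
                    // (the FASTA sequence data has already been written by the loop above)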
                    for (int j = 0; j < population_size; ++j) {
                        for (int i = 0; i < ploidy; ++i) {
                            int l = (j * ploidy) + i;
                            if (population_alleles.at(l) == allelePtr) {
                                ++allele_freq;
                            }
                        }
                    }

                    // set up the allele-specific INFO fields in the VCF record
                    var.info["AC"].push_back(convert(allele_freq));

                    int delta = allele.alt.size() - allele.ref.size();
                    if (delta == 0) {
                        if (allele.ref.size() == 1) {
                            var.info["TYPE"].push_back("snp");
                            var.info["LEN"].push_back(convert(allele.ref.size()));
                        } else {
                            var.info["TYPE"].push_back("mnp");;
                            var.info["LEN"].push_back(convert(allele.ref.size()));
                        }
                    } else if (delta > 0) {
                        var.info["TYPE"].push_back("ins");;
                        var.info["LEN"].push_back(convert(abs(delta)));
                    } else {
                        var.info["TYPE"].push_back("del");;
                        var.info["LEN"].push_back(convert(abs(delta)));
                    }
                    if (!allele.type.empty()) {
                        var.infoFlags[allele.type] = true;
                    }

                }

                // write the VCF record to stdout
                cout << var << endl;

                int largest_ref = 1; // enforce one pos
                for (list<Allele>::iterator a = present_alleles.begin(); a != present_alleles.end(); ++a) {
                    if (a->ref.size() > largest_ref) {
                        largest_ref = a->ref.size();
                    }
                }

                pos += largest_ref; // step by the size of the last event
            }
        }
    }

    // close, clean up files
    for (map<string, vector<SampleFastaFile*> >::iterator s = sequencesByRefseq.begin(); s != sequencesByRefseq.end(); ++s) {
        vector<SampleFastaFile*>& files = s->second;
        for (vector<SampleFastaFile*>::iterator f = files.begin(); f != files.end(); ++f) {
            delete *f;
        }
        files.clear();
    }

    return 0;

}