Beispiel #1
0
//  Overlap-based read trimming driver.
//
//  Parses the command line, opens the read store (-G) and the overlap store (-O), then for every
//  read ID in [idMin, idMax] loads that read's overlaps and computes a final clear range using the
//  trimming method requested by the read's library (largestCovered() or bestEdge()).  Results are
//  stored in the output clear range file (-Co), a per-read line is written to '<prefix>.log', and
//  the parameters used are written to '<prefix>.summary'.
//
//  Exits with status 1 on a usage error or if a log file cannot be opened, 0 otherwise.
//
int
main(int argc, char **argv) {
  char             *gkpName = NULL;
  char             *ovsName = NULL;

  char             *iniClrName = NULL;
  char             *maxClrName = NULL;
  char             *outClrName = NULL;

  uint32            errorValue     = AS_OVS_encodeEvalue(0.015);
  uint32            minAlignLength = 40;   //  Parsed from -l, but not used below.
  uint32            minReadLength  = 64;

  char             *outputPrefix  = NULL;
  char              logName[FILENAME_MAX] = {0};
  char              sumName[FILENAME_MAX] = {0};
  FILE             *logFile = NULL;
  FILE             *sumFile = NULL;

  uint32            idMin = 1;
  uint32            idMax = UINT32_MAX;

  uint32            minEvidenceOverlap  = 40;
  uint32            minEvidenceCoverage = 1;

  argc = AS_configure(argc, argv);

  //  Every option recognized here consumes exactly one following value.

  int arg=1;
  int err=0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-G") == 0) {
      gkpName = argv[++arg];

    } else if (strcmp(argv[arg], "-O") == 0) {
      ovsName = argv[++arg];

    } else if (strcmp(argv[arg], "-Ci") == 0) {
      iniClrName = argv[++arg];
    } else if (strcmp(argv[arg], "-Cm") == 0) {
      maxClrName = argv[++arg];
    } else if (strcmp(argv[arg], "-Co") == 0) {
      outClrName = argv[++arg];

    } else if (strcmp(argv[arg], "-e") == 0) {
      double erate = atof(argv[++arg]);
      errorValue = AS_OVS_encodeEvalue(erate);

    } else if (strcmp(argv[arg], "-l") == 0) {
      minAlignLength = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-minlength") == 0) {
      minReadLength = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-ol") == 0) {
      minEvidenceOverlap = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-oc") == 0) {
      minEvidenceCoverage = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-o") == 0) {
      outputPrefix = argv[++arg];

    } else if (strcmp(argv[arg], "-t") == 0) {
      AS_UTL_decodeRange(argv[++arg], idMin, idMax);

    } else {
      fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]);
      err++;
    }

    arg++;
  }

  //  The output clear range file is dereferenced unconditionally in the main loop, so -Co is
  //  required, just as the usage line states.

  if ((gkpName       == NULL) ||
      (ovsName       == NULL) ||
      (outClrName    == NULL) ||
      (outputPrefix  == NULL) ||
      (err)) {
    fprintf(stderr, "usage: %s -G gkpStore -O ovlStore -Co output.clearFile -o outputPrefix\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "  -G gkpStore    path to read store\n");
    fprintf(stderr, "  -O ovlStore    path to overlap store\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -o name        output prefix, for logging\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -t bgn-end     limit processing to only reads from bgn to end (inclusive)\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -Ci clearFile  path to input clear ranges (NOT SUPPORTED)\n");
    //fprintf(stderr, "  -Cm clearFile  path to maximal clear ranges\n");
    fprintf(stderr, "  -Co clearFile  path to ouput clear ranges\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -e erate       ignore overlaps with more than 'erate' percent error\n");
    //fprintf(stderr, "  -l length      ignore overlaps shorter than 'l' aligned bases (NOT SUPPORTED)\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -ol l          the minimum evidence overlap length\n");
    fprintf(stderr, "  -oc c          the minimum evidence overlap coverage\n");
    fprintf(stderr, "                   evidence overlaps must overlap by 'l' bases to be joined, and\n");
    fprintf(stderr, "                   must be at least 'c' deep to be retained\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -minlength l   reads trimmed below this many bases are deleted\n");
    fprintf(stderr, "\n");
    exit(1);
  }

  gkStore          *gkp = gkStore::gkStore_open(gkpName);
  ovStore          *ovs = new ovStore(ovsName, gkp);

  clearRangeFile   *iniClr = (iniClrName == NULL) ? NULL : new clearRangeFile(iniClrName, gkp);
  clearRangeFile   *maxClr = (maxClrName == NULL) ? NULL : new clearRangeFile(maxClrName, gkp);
  clearRangeFile   *outClr = (outClrName == NULL) ? NULL : new clearRangeFile(outClrName, gkp);

  if (outClr)
    //  If the outClr file exists, those clear ranges are loaded.  We need to reset them
    //  back to 'untrimmed' for now.
    outClr->reset(gkp);

  if (iniClr && outClr)
    //  An iniClr file was supplied, so use those as the initial clear ranges.
    outClr->copy(iniClr);


  if (outputPrefix) {
    snprintf(logName, FILENAME_MAX, "%s.log",     outputPrefix);
    snprintf(sumName, FILENAME_MAX, "%s.summary", outputPrefix);

    //  Failure is detected from the returned pointer; errno is only used for the message.

    errno = 0;
    logFile = fopen(logName, "w");
    if (logFile == NULL)
      fprintf(stderr, "Failed to open log file '%s' for writing: %s\n", logName, strerror(errno)), exit(1);

    errno = 0;
    sumFile = fopen(sumName, "w");
    if (sumFile == NULL)
      fprintf(stderr, "Failed to open summary file '%s' for writing: %s\n", sumName, strerror(errno)), exit(1);

    fprintf(logFile, "id\tinitL\tinitR\tfinalL\tfinalR\tmessage (DEL=deleted NOC=no change MOD=modified)\n");

    fprintf(sumFile, "Overlap error rate     <= %.4f fraction error\n", AS_OVS_decodeEvalue(errorValue));
    fprintf(sumFile, "Overlap min overlap    >= %u base%s (for 'largest covered')\n", minEvidenceOverlap,  (minEvidenceOverlap  == 1) ? "" : "s");
    fprintf(sumFile, "Overlap min coverage   >= %u read%s (for 'largest covered')\n", minEvidenceCoverage, (minEvidenceCoverage == 1) ? "" : "s");
  }


  //  Overlap buffer, reused for every read.

  uint32      ovlLen       = 0;
  uint32      ovlMax       = 64 * 1024;
  ovOverlap  *ovl          = ovOverlap::allocateOverlaps(gkp, ovlMax);

  memset(ovl, 0, sizeof(ovOverlap) * ovlMax);

  char        logMsg[1024] = {0};

  //  Clamp the requested ID range to the reads that exist.

  if (idMin < 1)
    idMin = 1;
  if (idMax > gkp->gkStore_getNumReads())
    idMax = gkp->gkStore_getNumReads();

  fprintf(stderr, "Processing from ID " F_U32 " to " F_U32 " out of " F_U32 " reads.\n",
          idMin,
          idMax,
          gkp->gkStore_getNumReads());

  for (uint32 id=idMin; id<=idMax; id++) {
    gkRead     *read = gkp->gkStore_getRead(id);
    gkLibrary  *libr = gkp->gkStore_getLibrary(read->gkRead_libraryID());

    logMsg[0] = 0;

    //  If the fragment is deleted, do nothing.  If the fragment was deleted AFTER overlaps were
    //  generated, then the overlaps will be out of sync -- we'll get overlaps for these fragments
    //  we skip.
    //
    if ((iniClr) && (iniClr->isDeleted(id) == true))
      continue;

    //  If it did not request trimming, do nothing.  Similar to the above, we'll get overlaps to
    //  fragments we skip.  The read is skipped only when its library asked for NEITHER of the
    //  two supported trimming methods.
    //
    if ((libr->gkLibrary_finalTrim() != FINALTRIM_LARGEST_COVERED) &&
        (libr->gkLibrary_finalTrim() != FINALTRIM_BEST_EDGE))
      continue;

    //  Decide on the initial trimming.  We copied any iniClr into outClr above, and if there wasn't
    //  an iniClr, then outClr is the full read.

    uint32      ibgn   = outClr->bgn(id);
    uint32      iend   = outClr->end(id);

    //  Set the, ahem, initial final trimming.

    bool        isGood = false;
    uint32      fbgn   = ibgn;
    uint32      fend   = iend;

    //  Load overlaps.

    uint32      nLoaded = ovs->readOverlaps(id, ovl, ovlLen, ovlMax);

    //  Trim!

    if (nLoaded == 0) {
      //  No overlaps, so mark it as junk.
      isGood = false;
    }

    else if (libr->gkLibrary_finalTrim() == FINALTRIM_LARGEST_COVERED) {
      //  Use the largest region covered by overlaps as the trim

      assert(ovlLen > 0);
      assert(id == ovl[0].a_iid);

      isGood = largestCovered(ovl, ovlLen,
                              read,
                              ibgn, iend, fbgn, fend,
                              logMsg,
                              errorValue,
                              minEvidenceOverlap,
                              minEvidenceCoverage,
                              minReadLength);
      assert(fbgn <= fend);

    }

    else if (libr->gkLibrary_finalTrim() == FINALTRIM_BEST_EDGE) {
      //  Trim using the bestEdge() method.

      assert(ovlLen > 0);
      assert(id == ovl[0].a_iid);

      isGood = bestEdge(ovl, ovlLen,
                        read,
                        ibgn, iend, fbgn, fend,
                        logMsg,
                        errorValue,
                        minEvidenceOverlap,
                        minEvidenceCoverage,
                        minReadLength);
      assert(fbgn <= fend);

    }

    else {
      //  Do nothing.  Unreachable: reads from other libraries were skipped above.
      assert(0);
      continue;
    }

    //  Enforce the maximum clear range

    if ((isGood) && (maxClr)) {
      isGood = enforceMaximumClearRange(ovl, ovlLen,
                                        read,
                                        ibgn, iend, fbgn, fend,
                                        logMsg,
                                        maxClr);
      assert(fbgn <= fend);
    }

    //
    //  Trimmed.  Make sense of the result, write some logs, and update the output.
    //


    //  If bad trimming or too small, write the log and keep going.
    //
    if ((isGood == false) || (fend - fbgn < minReadLength)) {
      outClr->setbgn(id) = fbgn;
      outClr->setend(id) = fend;
      outClr->setDeleted(id);  //  Gah, just obliterates the clear range.

      fprintf(logFile, F_U32 "\t" F_U32 "\t" F_U32 "\t" F_U32 "\t" F_U32 "\tDEL%s\n",
              id,
              ibgn, iend,
              fbgn, fend,
              (logMsg[0] == 0) ? "" : logMsg);
    }

    //  If we didn't change anything, also write a log.
    //
    else if ((ibgn == fbgn) &&
             (iend == fend)) {
      fprintf(logFile, F_U32 "\t" F_U32 "\t" F_U32 "\t" F_U32 "\t" F_U32 "\tNOC%s\n",
              id,
              ibgn, iend,
              fbgn, fend,
              (logMsg[0] == 0) ? "" : logMsg);
      continue;
    }

    //  Otherwise, we actually did something.

    else {
      outClr->setbgn(id) = fbgn;
      outClr->setend(id) = fend;

      fprintf(logFile, F_U32 "\t" F_U32 "\t" F_U32 "\t" F_U32 "\t" F_U32 "\tMOD%s\n",
              id,
              ibgn, iend,
              fbgn, fend,
              (logMsg[0] == 0) ? "" : logMsg);
    }
  }

  gkp->gkStore_close();

  delete ovs;

  delete iniClr;
  delete maxClr;
  delete outClr;

  fclose(logFile);
  fclose(sumFile);

  exit(0);
}
//  Read old fragments in  gkpStore  and choose the ones that
//  have overlaps with fragments in  Frag. Recompute the
//  overlaps, using fragment corrections and output the revised error.
//
//  For each B read referenced by an overlap in G->olaps, the read is loaded from gkpStore,
//  corrected with correctRead() using the corrections mapped from G->correctionsName, and
//  realigned against the (already corrected) A read in G->reads with Prefix_Edit_Dist().  When
//  the alignment succeeds, the overlap's 'evalue' is replaced with the encoded errors/length.
//  Overlaps whose realignment fails keep their existing evalue and are only counted.
//
//  The loops assume G->olaps is ordered by b_iid: all overlaps for one B read are consumed
//  consecutively, and read IDs below the next overlap's b_iid are skipped.
//
//  On return, G->bases, G->adjusts and G->reads have been released and set to NULL.  Counts of
//  alignments and failures are written to stderr.
//
void
Redo_Olaps(coParameters *G, gkStore *gkpStore) {

  //  Figure out the range of B reads we care about.  We probably could just loop over every read in
  //  the store with minimal penalty.
  //
  //  With no overlaps at all, 'olapsLen - 1' would wrap around and index far outside G->olaps.
  //  Use an empty ID range (lo > hi) instead, so the main loop does nothing while the usual
  //  cleanup and statistics at the end still run.

  bool       haveOlaps = (G->olapsLen > 0);

  uint64     thisOvl = 0;
  uint64     lastOvl = (haveOlaps) ? (G->olapsLen - 1) : 0;

  uint32     loBid   = (haveOlaps) ? G->olaps[thisOvl].b_iid : 1;
  uint32     hiBid   = (haveOlaps) ? G->olaps[lastOvl].b_iid : 0;

  //  Open all the corrections.

  memoryMappedFile     *Cfile = new memoryMappedFile(G->correctionsName);
  Correction_Output_t  *C     = (Correction_Output_t *)Cfile->get();
  uint64                Cpos  = 0;
  uint64                Clen  = Cfile->length() / sizeof(Correction_Output_t);

  //  Allocate some temporary work space for the forward and reverse corrected B reads.

  fprintf(stderr, "--Allocate " F_U64 " MB for fseq and rseq.\n", (2 * sizeof(char) * 2 * (AS_MAX_READLEN + 1)) >> 20);
  char          *fseq    = new char     [AS_MAX_READLEN + 1 + AS_MAX_READLEN + 1];
  uint32         fseqLen = 0;

  char          *rseq    = new char     [AS_MAX_READLEN + 1 + AS_MAX_READLEN + 1];
  uint32         rseqLen = 0;

  fprintf(stderr, "--Allocate " F_U64 " MB for fadj and radj.\n", (2 * sizeof(Adjust_t) * (AS_MAX_READLEN + 1)) >> 20);
  Adjust_t      *fadj    = new Adjust_t [AS_MAX_READLEN + 1];
  Adjust_t      *radj    = new Adjust_t [AS_MAX_READLEN + 1];
  uint32         fadjLen  = 0;  //  radj is the same length

  fprintf(stderr, "--Allocate " F_U64 " MB for pedWorkArea_t.\n", sizeof(pedWorkArea_t) >> 20);
  gkReadData    *readData = new gkReadData;
  pedWorkArea_t *ped      = new pedWorkArea_t;

  //  Statistics, reported on stderr at the end.

  uint64         Total_Alignments_Ct           = 0;

  uint64         Failed_Alignments_Ct          = 0;
  uint64         Failed_Alignments_Both_Ct     = 0;
  uint64         Failed_Alignments_End_Ct      = 0;
  uint64         Failed_Alignments_Length_Ct   = 0;

  uint32         rhaFail = 0;   //  Failed / passed alignments where the B read was offset
  uint32         rhaPass = 0;   //  because of a negative a_hang.

  uint64         olapsFwd = 0;
  uint64         olapsRev = 0;



  ped->initialize(G, G->errorRate);

  //  Process overlaps.  Loop over the B reads, and recompute each overlap.

  for (uint32 curID=loBid; curID<=hiBid; curID++) {
    if (((curID - loBid) % 1024) == 0)
      fprintf(stderr, "Recomputing overlaps - %9u - %9u - %9u\r", loBid, curID, hiBid);

    //  No overlaps use this read as the B read; nothing to do.

    if (curID < G->olaps[thisOvl].b_iid)
      continue;

    gkRead *read = gkpStore->gkStore_getRead(curID);

    gkpStore->gkStore_loadReadData(read, readData);

    //  Apply corrections to the B read (also converts to lower case, reverses it, etc)

    //fprintf(stderr, "Correcting B read %u at Cpos=%u\n", curID, Cpos);

    fseqLen = 0;
    rseqLen = 0;

    fadjLen = 0;

    correctRead(curID,
                fseq, fseqLen, fadj, fadjLen,
                readData->gkReadData_getSequence(),
                read->gkRead_sequenceLength(),
                C, Cpos, Clen);

    //  Create copies of the sequence for forward and reverse.  There isn't a need for the forward copy (except that
    //  we mutate it with corrections), and the reverse copy could be deferred until it is needed.
    //
    //  The copy is fseqLen + 1 bytes so that the byte following the sequence comes along too
    //  (presumably the NUL terminator, since strlen() is used on these buffers below).

    memcpy(rseq, fseq, sizeof(char) * (fseqLen + 1));

    reverseComplementSequence(rseq, fseqLen);

    Make_Rev_Adjust(radj, fadj, fadjLen, fseqLen);

    //  Recompute alignments for all overlaps involving the B read.

    for (; ((thisOvl <= lastOvl) &&
            (G->olaps[thisOvl].b_iid == curID)); thisOvl++) {
      Olap_Info_t  *olap = G->olaps + thisOvl;

      //fprintf(stderr, "processing overlap %u - %u\n", olap->a_iid, olap->b_iid);

      //  Find the A segment.  It's always forward.  It's already been corrected.

      char *a_part = G->reads[olap->a_iid - G->bgnID].bases;

      if (olap->a_hang > 0) {
        int32 ha = Hang_Adjust(olap->a_hang,
                               G->reads[olap->a_iid - G->bgnID].adjusts,
                               G->reads[olap->a_iid - G->bgnID].adjustsLen);
        a_part += ha;
        //fprintf(stderr, "offset a_part by ha=%d\n", ha);
      }

      //  Find the B segment.

      char *b_part = (olap->normal == true) ? fseq : rseq;

      //if (olap->normal == true)
      //  fprintf(stderr, "b_part = fseq %40.40s\n", fseq);
      //else
      //  fprintf(stderr, "b_part = rseq %40.40s\n", rseq);

      if (olap->normal == true)
        olapsFwd++;
      else
        olapsRev++;

      //  A negative a_hang means the overlap starts inside the B read; offset into B instead.

      bool rha=false;
      if (olap->a_hang < 0) {
        int32 ha = (olap->normal == true) ? Hang_Adjust(-olap->a_hang, fadj, fadjLen) :
                                            Hang_Adjust(-olap->a_hang, radj, fadjLen);
        b_part += ha;
        //fprintf(stderr, "offset b_part by ha=%d normal=%d\n", ha, olap->normal);
        rha=true;
      }

      //  Compute the alignment.

      int32   a_part_len  = strlen(a_part);
      int32   b_part_len  = strlen(b_part);
      int32   olap_len    = min(a_part_len, b_part_len);

      int32   a_end        = 0;
      int32   b_end        = 0;
      bool    match_to_end = false;

      //fprintf(stderr, ">A\n%s\n", a_part);
      //fprintf(stderr, ">B\n%s\n", b_part);

      int32 errors = Prefix_Edit_Dist(a_part, a_part_len,
                                      b_part, b_part_len,
                                      G->Error_Bound[olap_len],
                                      a_end,
                                      b_end,
                                      match_to_end,
                                      ped);

      //  ped->delta isn't used.

      //  ??  These both occur, but the first is much much more common.
      //
      //  If the alignment begins with a run of identical delta values (+1 in the first case, -1
      //  in the second), that run is removed from the front of the delta list, the corresponding
      //  sequence start is advanced past it, and the run is not counted as errors.  The run
      //  removed is never longer than the size of the hang.

      if ((ped->deltaLen > 0) && (ped->delta[0] == 1) && (0 < G->olaps[thisOvl].a_hang)) {
        int32  stop = min(ped->deltaLen, (int32)G->olaps[thisOvl].a_hang);  //  a_hang is int32:31!
        int32  i = 0;

        for  (i=0; (i < stop) && (ped->delta[i] == 1); i++)
          ;

        //fprintf(stderr, "RESET 1 i=%d delta=%d\n", i, ped->delta[i]);
        assert((i == stop) || (ped->delta[i] != -1));

        ped->deltaLen -= i;

        memmove(ped->delta, ped->delta + i, ped->deltaLen * sizeof (int));

        a_part     += i;
        a_end      -= i;
        a_part_len -= i;
        errors     -= i;

      } else if ((ped->deltaLen > 0) && (ped->delta[0] == -1) && (G->olaps[thisOvl].a_hang < 0)) {
        int32  stop = min(ped->deltaLen, - G->olaps[thisOvl].a_hang);
        int32  i = 0;

        for  (i=0; (i < stop) && (ped->delta[i] == -1); i++)
          ;

        //fprintf(stderr, "RESET 2 i=%d delta=%d\n", i, ped->delta[i]);
        assert((i == stop) || (ped->delta[i] != 1));

        ped->deltaLen -= i;

        memmove(ped->delta, ped->delta + i, ped->deltaLen * sizeof (int));

        b_part     += i;
        b_end      -= i;
        b_part_len -= i;
        errors     -= i;
      }


      Total_Alignments_Ct++;


      int32  olapLen = min(a_end, b_end);

      //  Classify failures.  An alignment fails if it did not reach the end of a sequence, or if
      //  it has no length.

      if ((match_to_end == false) && (olapLen <= 0))
        Failed_Alignments_Both_Ct++;

      if (match_to_end == false)
        Failed_Alignments_End_Ct++;

      if (olapLen <= 0)
        Failed_Alignments_Length_Ct++;

      if ((match_to_end == false) || (olapLen <= 0)) {
        Failed_Alignments_Ct++;

#if 0
        //  I can't find any patterns in these errors.  I thought that it was caused by the corrections, but I
        //  found a case where no corrections were made and the alignment still failed.  Perhaps it is differences
        //  in the alignment code (the forward vs reverse prefix distance in overlapper vs only the forward here)?

        fprintf(stderr, "Redo_Olaps()--\n");
        fprintf(stderr, "Redo_Olaps()--\n");
        fprintf(stderr, "Redo_Olaps()--  Bad alignment  errors %d  a_end %d  b_end %d  match_to_end %d  olapLen %d\n",
                errors, a_end, b_end, match_to_end, olapLen);
        fprintf(stderr, "Redo_Olaps()--  Overlap        a_hang %d b_hang %d innie %d\n",
                olap->a_hang, olap->b_hang, olap->innie);
        fprintf(stderr, "Redo_Olaps()--  Reads          a_id %u a_length %d b_id %u b_length %d\n",
                G->olaps[thisOvl].a_iid,
                G->reads[ G->olaps[thisOvl].a_iid ].basesLen,
                G->olaps[thisOvl].b_iid,
                G->reads[ G->olaps[thisOvl].b_iid ].basesLen);
        fprintf(stderr, "Redo_Olaps()--  A %s\n", a_part);
        fprintf(stderr, "Redo_Olaps()--  B %s\n", b_part);

        Display_Alignment(a_part, a_part_len, b_part, b_part_len, ped->delta, ped->deltaLen);

        fprintf(stderr, "\n");
#endif

        if (rha)
          rhaFail++;

        //  The overlap keeps whatever evalue it already had.

        continue;
      }

      if (rha)
        rhaPass++;

      G->olaps[thisOvl].evalue = AS_OVS_encodeEvalue((double)errors / olapLen);

      //fprintf(stderr, "REDO - errors = %u / olapLep = %u -- %f\n", errors, olapLen, AS_OVS_decodeEvalue(G->olaps[thisOvl].evalue));
    }
  }

  fprintf(stderr, "\n");

  delete    ped;
  delete    readData;
  delete [] radj;
  delete [] fadj;
  delete [] rseq;
  delete [] fseq;
  delete    Cfile;

  fprintf(stderr, "--  Release bases, adjusts and reads.\n");

  delete [] G->bases;     G->bases   = NULL;
  delete [] G->adjusts;   G->adjusts = NULL;
  delete [] G->reads;     G->reads   = NULL;

  fprintf(stderr, "Olaps Fwd " F_U64 "\n", olapsFwd);
  fprintf(stderr, "Olaps Rev " F_U64 "\n", olapsRev);

  fprintf(stderr, "Total:  " F_U64 "\n", Total_Alignments_Ct);
  fprintf(stderr, "Failed: " F_U64 " (both)\n", Failed_Alignments_Both_Ct);
  fprintf(stderr, "Failed: " F_U64 " (either)\n", Failed_Alignments_Ct);
  fprintf(stderr, "Failed: " F_U64 " (match to end)\n", Failed_Alignments_End_Ct);
  fprintf(stderr, "Failed: " F_U64 " (negative length)\n", Failed_Alignments_Length_Ct);

  fprintf(stderr, "rhaFail %u rhaPass %u\n", rhaFail, rhaPass);
}
Beispiel #3
0
int
main(int argc, char **argv) {
  char           *gkpName        = NULL;
  char           *ovlName        = NULL;
  char           *outPrefix      = NULL;

  uint32          bgnID          = 0;
  uint32          endID          = UINT32_MAX;

  uint32          ovlSelect      = 0;
  double          ovlAtMost      = AS_OVS_encodeEvalue(1.0);
  double          ovlAtLeast     = AS_OVS_encodeEvalue(0.0);

  double          expectedMean   = 30.0;
  double          expectedStdDev =  7.0;

  bool            toFile         = true;

  argc = AS_configure(argc, argv);

  int arg=1;
  int err=0;
  while (arg < argc) {

    if      (strcmp(argv[arg], "-G") == 0)
      gkpName = argv[++arg];

    else if (strcmp(argv[arg], "-O") == 0)
      ovlName = argv[++arg];


    else if (strcmp(argv[arg], "-o") == 0)
      outPrefix = argv[++arg];


    else if (strcmp(argv[arg], "-C") == 0) {
      expectedMean   = atof(argv[++arg]);
      expectedStdDev = atof(argv[++arg]);
    }

    else if (strcmp(argv[arg], "-c") == 0)
      toFile = false;


    else if (strcmp(argv[arg], "-b") == 0)
      bgnID = atoi(argv[++arg]);

    else if (strcmp(argv[arg], "-e") == 0)
      endID = atoi(argv[++arg]);


    else if (strcmp(argv[arg], "-overlap") == 0) {
      arg++;

      if      (strcmp(argv[arg], "5") == 0)
        ovlSelect |= OVL_5;

      else if (strcmp(argv[arg], "3") == 0)
        ovlSelect |= OVL_3;

      else if (strcmp(argv[arg], "contained") == 0)
        ovlSelect |= OVL_CONTAINED;

      else if (strcmp(argv[arg], "container") == 0)
        ovlSelect |= OVL_CONTAINER;

      else if (strcmp(argv[arg], "partial") == 0)
        ovlSelect |= OVL_PARTIAL;

      else if (strcmp(argv[arg], "atmost") == 0)
        ovlAtMost = atof(argv[++arg]);

      else if (strcmp(argv[arg], "atleast") == 0)
        ovlAtLeast = atof(argv[++arg]);

      else {
        fprintf(stderr, "ERROR: unknown -overlap '%s'\n", argv[arg]);
        exit(1);
      }
    }


    else {
      fprintf(stderr, "%s: unknown option '%s'.\n", argv[0], argv[arg]);
      err++;
    }

    arg++;
  }

  if (gkpName == NULL)
    err++;
  if (ovlName == NULL)
    err++;
  if (outPrefix == NULL)
    err++;

  if (err) {
    fprintf(stderr, "usage: %s -G gkpStore -O ovlStore -o outPrefix [-b bgnID] [-e endID] ...\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "Generates statistics for an overlap store.  By default all possible classes\n");
    fprintf(stderr, "are generated, options can disable specific classes.\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -C mean stddev           Expect coverage at mean +- stddev\n");
    fprintf(stderr, "  -c                       Write stats to stdout, not to a file\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "Outputs:\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  outPrefix.per-read.log   One line per read, giving readID, read length and classification.\n");
    fprintf(stderr, "  outPrefix.summary        The primary statistical output.\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "Overlap Selection:\n");
    fprintf(stderr, "  -overlap 5               5' overlaps only\n");
    fprintf(stderr, "  -overlap 3               3' overlaps only\n");
    fprintf(stderr, "  -overlap contained       contained overlaps only\n");
    fprintf(stderr, "  -overlap container       container overlaps only\n");
    fprintf(stderr, "  -overlap partial         overlap is not valid for assembly\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  An overlap is classified as exactly one of 5', 3', contained or container.\n");
    fprintf(stderr, "  By default, all overlaps are selected.  Specifying any of these options will\n");
    fprintf(stderr, "  restrict overlaps to just those classifications.  E.g., '-overlap 5 -overlap 3'\n");
    fprintf(stderr, "  will select dovetail overlaps off either end of the read.\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -overlap atmost x        at most fraction x error  (overlap-erate <= x)\n");
    fprintf(stderr, "  -overlap atleast x       at least fraction x error (x <= overlap-erate)\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  Overlaps can be further filtered by fraction error.  Usually, this will be an\n");
    fprintf(stderr, "  'atmost' filtering to use only the higher qualtiy overlaps.\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  A contained read has at least one container overlap.  Container read    -> ---------------\n");
    fprintf(stderr, "  A container read has at least one contained overlap.  Contained overlap ->      -----\n");
    fprintf(stderr, "\n");

    exit(1);
  }

  //  Set the default to 'all' if nothing set.

  if (ovlSelect == 0)
    ovlSelect = 0xff;

  //  Open inputs, find limits.

  gkStore    *gkpStore = gkStore::gkStore_open(gkpName);
  ovStore    *ovlStore = new ovStore(ovlName, gkpStore);

  if (endID > gkpStore->gkStore_getNumReads())
    endID = gkpStore->gkStore_getNumReads();

  if (endID < bgnID)
    fprintf(stderr, "ERROR: invalid bgn/end range bgn=%u end=%u; only %u reads in the store\n", bgnID, endID, gkpStore->gkStore_getNumReads()), exit(1);

  ovlStore->setRange(bgnID, endID);

  //  Allocate output histograms.

  histogramStatistics   *readNoOlaps         = new histogramStatistics;  //  Bad reads!  (read length)
  histogramStatistics   *readHole            = new histogramStatistics;
  histogramStatistics   *readHump            = new histogramStatistics;
  histogramStatistics   *readNo5             = new histogramStatistics;
  histogramStatistics   *readNo3             = new histogramStatistics;

  histogramStatistics   *olapHole            = new histogramStatistics;  //  Hole size (sum of holes if more than one)
  histogramStatistics   *olapHump            = new histogramStatistics;  //  Hump size (sum of humps if more than one)
  histogramStatistics   *olapNo5             = new histogramStatistics;  //  5' uncovered size
  histogramStatistics   *olapNo3             = new histogramStatistics;  //  3' uncovered size

  histogramStatistics   *readLowCov          = new histogramStatistics;  //  Good reads!  (read length)
  histogramStatistics   *readUnique          = new histogramStatistics;
  histogramStatistics   *readRepeatCont      = new histogramStatistics;
  histogramStatistics   *readRepeatDove      = new histogramStatistics;
  histogramStatistics   *readSpanRepeat      = new histogramStatistics;
  histogramStatistics   *readUniqRepeatCont  = new histogramStatistics;
  histogramStatistics   *readUniqRepeatDove  = new histogramStatistics;
  histogramStatistics   *readUniqAnchor      = new histogramStatistics;

  histogramStatistics   *covrLowCov          = new histogramStatistics;  //  Good reads!  (overlap length)
  histogramStatistics   *covrUnique          = new histogramStatistics;
  histogramStatistics   *covrRepeatCont      = new histogramStatistics;
  histogramStatistics   *covrRepeatDove      = new histogramStatistics;
  histogramStatistics   *covrSpanRepeat      = new histogramStatistics;
  histogramStatistics   *covrUniqRepeatCont  = new histogramStatistics;
  histogramStatistics   *covrUniqRepeatDove  = new histogramStatistics;
  histogramStatistics   *covrUniqAnchor      = new histogramStatistics;

  histogramStatistics   *olapLowCov          = new histogramStatistics;  //  Good reads!  (overlap length)
  histogramStatistics   *olapUnique          = new histogramStatistics;
  histogramStatistics   *olapRepeatCont      = new histogramStatistics;
  histogramStatistics   *olapRepeatDove      = new histogramStatistics;
  histogramStatistics   *olapSpanRepeat      = new histogramStatistics;
  histogramStatistics   *olapUniqRepeatCont  = new histogramStatistics;
  histogramStatistics   *olapUniqRepeatDove  = new histogramStatistics;
  histogramStatistics   *olapUniqAnchor      = new histogramStatistics;

  //  Coverage interval lists, of all overlaps selected.

  //  Open outputs.

  char N[FILENAME_MAX];
  sprintf(N, "%s.per-read.log", outPrefix);

  FILE  *LOG = fopen(N, "w");
  if (errno)
    fprintf(stderr, "Failed to open '%s' for writing: %s\n", N, strerror(errno)), exit(1);

  //  Compute!

  uint32                 overlapsMax = 1024 * 1024;

  uint32                 overlapsLen = 0;
  ovOverlap             *overlaps    = ovOverlap::allocateOverlaps(gkpStore, overlapsMax);

  overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax);

  while (overlapsLen > 0) {
    uint32  readID  = overlaps[0].a_iid;
    uint32  readLen = gkpStore->gkStore_getRead(readID)->gkRead_sequenceLength();

    intervalList<uint32>   cov;
    uint32                 covID = 0;

    bool    readCoverage5     = false;
    bool    readCoverage3     = false;
    bool    readContained     = false;
    bool    readContainer     = false;
    bool    readPartial       = false;

    for (uint32 oo=0; oo<overlapsLen; oo++) {
      bool  is5prime    = (overlaps[oo].overlapAEndIs5prime()  == true) && (ovlSelect & OVL_5)         && (overlaps[oo].overlap5primeIsPartial() == false);
      bool  is3prime    = (overlaps[oo].overlapAEndIs3prime()  == true) && (ovlSelect & OVL_3)         && (overlaps[oo].overlap3primeIsPartial() == false);
      bool  isContained = (overlaps[oo].overlapAIsContained()  == true) && (ovlSelect & OVL_CONTAINED);
      bool  isContainer = (overlaps[oo].overlapAIsContainer()  == true) && (ovlSelect & OVL_CONTAINER);
      bool  isPartial   = (overlaps[oo].overlapIsPartial()     == true) && (ovlSelect & OVL_PARTIAL);

      //  Ignore the overlap?

      if ((is5prime    == false) &&
          (is3prime    == false) &&
          (isContained == false) &&
          (isContainer == false) &&
          (isPartial   == false))
        continue;

      if (overlaps[oo].evalue() < ovlAtLeast)
        continue;

      if (overlaps[oo].evalue() > ovlAtMost)
        continue;

      readCoverage5    |= is5prime;     //  If there is a 5' overlap, the read isn't missing 5' coverage
      readCoverage3    |= is3prime;
      readContained    |= isContained;  //  Read is contained in something else
      readContainer    |= isContainer;  //  Read is a container of somethign else
      readPartial      |= isPartial;

      cov.add(overlaps[oo].a_bgn(), overlaps[oo].a_end() - overlaps[oo].a_bgn());
    }

    //  If we filtered all the overlaps, just get out of here.  Yeah, some code duplication,
    //  but cleaner than sticking an if block around the rest of the loop.

    if (cov.numberOfIntervals() == 0) {
      readNoOlaps->add(readLen);

      overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax);
      continue;
    }

    //  Generate a depth-of-coverage map, then merge intervals

    intervalList<uint32>  depth(cov);

    cov.merge();

    //  Analyze the intervals, save per-read information to the log.

    uint32  lastInt           = cov.numberOfIntervals() - 1;
    uint32  bgn               = cov.lo(0);
    uint32  end               = cov.hi(lastInt);
    bool    contiguous        = (lastInt == 0) ? true : false;

    bool    readFullCoverage  = (lastInt == 0) && (bgn == 0) && (end == readLen);
    bool    readMissingMiddle = (lastInt != 0);

    uint32  holeSize          = 0;
    uint32  no5Size           = bgn;
    uint32  no3Size           = readLen - end;

    for (uint32 ii=1; ii<cov.numberOfIntervals(); ii++)
      holeSize += cov.lo(ii) - cov.hi(ii-1);

    //  Handle bad cases.  If it's a partial overlap, ignore the is5prime and is3prime markings.


    if (readMissingMiddle == true) {
      fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "middle-missing");
      readHole->add(readLen);
      olapHole->add(holeSize);

      overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax);
      continue;
    }

    if ((readCoverage5 == false) && (readCoverage3 == false) && (readContained == false) && (readPartial == false)) {
      fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "middle-only");
      readHump->add(readLen);
      olapHump->add(no5Size + no3Size);

      overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax);
      continue;
    }

    if ((readCoverage5 == false) && (readContained == false) && (readPartial == false)) {
      fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "no-5-prime");
      readNo5->add(readLen);
      olapNo5->add(no5Size);

      overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax);
      continue;
    }

    if ((readCoverage3 == false) && (readContained == false) && (readPartial == false)) {
      fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "no-3-prime");
      readNo3->add(readLen);
      olapNo3->add(no3Size);

      overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax);
      continue;
    }

    //  Handle good cases.  For partial overlaps, bgn and end are not the extent of the read.

    if (readPartial == false) {
      assert(bgn == 0);
      assert(end == readLen);
      assert(contiguous == true);
      assert(readFullCoverage == true);
    }

    //  Compute mean and std.dev of coverage.  From this, we decide if the read is 'unique',
    //  'repeat' or 'mixed'.  If 'mixed', we then need to decide if the read spans a repeat, or
    //  joins unique and repeat.

    double  covMean   = 0;
    double  covStdDev = 0;

    for (uint32 ii=0; ii<depth.numberOfIntervals(); ii++)
      covMean += (depth.hi(ii) - depth.lo(ii)) * depth.depth(ii);

    covMean /= readLen;

    for (uint32 ii=0; ii<depth.numberOfIntervals(); ii++)
      covStdDev += (depth.hi(ii) - depth.lo(ii)) * (depth.depth(ii) - covMean) * (depth.depth(ii) - covMean);

    covStdDev = sqrt(covStdDev / (readLen - 1));

    //  Classify each interval as either 'l'owcoverage, 'u'nique or 'r'epeat.

    char *classification = new char [depth.numberOfIntervals()];

    for (uint32 ii=0; ii<depth.numberOfIntervals(); ii++) {
      if        (depth.depth(ii) < expectedMean - 3 * expectedStdDev) {
        classification[ii] = 'l';

      } else if (depth.depth(ii) < expectedMean + 3 * expectedStdDev) {
        classification[ii] = 'u';

      } else {
        classification[ii] = 'r';
      }
    }

    //  Try to detect if a read is part unique and part repeat.

    bool   isLowCov     = false;
    bool   isUnique     = false;
    bool   isRepeat     = false;
    bool   isSpanRepeat = false;
    bool   isUniqRepeat = false;
    bool   isUniqAnchor = false;

    int32  bgni = 0;
    int32  endi = depth.numberOfIntervals() - 1;

    char   type5 = classification[bgni];
    char   typem = 0;
    char   type3 = classification[endi];

    while ((bgni <= endi) && (type5 == classification[bgni]))
      bgni++;
    bgni--;

    while ((bgni <= endi) && (type3 == classification[endi]))
      endi--;
    endi++;

    //  All the same classification?

    if (bgni == endi) {
      isLowCov = (type5 == 'l');
      isUnique = (type5 == 'u');
      isRepeat = (type5 == 'r');
    }

    //  Nope, if we aren't the same, assume it is uniqRepeat.

    else if (type5 != type3) {
      isUniqRepeat = true;
    }

    //  Nope, the same on both ends.  Assume we're just flipped.

    else {
      if (type5 == 'r')
        isUniqAnchor = true;
      else
        isSpanRepeat = true;
    }

    //  Now, do something with it.

    //  LOG - readID readLen classification

    if (isLowCov) {
      fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "low-cov");
      readLowCov->add(readLen);

      for (uint32 ii=0; ii<depth.numberOfIntervals(); ii++)
        covrLowCov->add(depth.depth(ii), depth.hi(ii) - depth.lo(ii));
    }

    if (isUnique) {
      fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "unique");
      readUnique->add(readLen);

      for (uint32 ii=0; ii<depth.numberOfIntervals(); ii++)
        covrUnique->add(depth.depth(ii), depth.hi(ii) - depth.lo(ii));
    }

    if ((isRepeat) && (readContained == true)) {
      fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "contained-repeat");
      readRepeatCont->add(readLen);

      for (uint32 ii=0; ii<depth.numberOfIntervals(); ii++)
        covrRepeatCont->add(depth.depth(ii), depth.hi(ii) - depth.lo(ii));
    }

    if ((isRepeat) && (readContained == false)) {
      fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "dovetail-repeat");
      readRepeatDove->add(readLen);

      for (uint32 ii=0; ii<depth.numberOfIntervals(); ii++)
        covrRepeatDove->add(depth.depth(ii), depth.hi(ii) - depth.lo(ii));
    }

    if (isSpanRepeat) {
      fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "span-repeat");
      readSpanRepeat->add(readLen);
      olapSpanRepeat->add(depth.lo(endi) - depth.hi(bgni));
    }

    if ((isUniqRepeat) && (readContained == true)) {
      fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "uniq-repeat-cont");
      readUniqRepeatCont->add(readLen);
    }

    if ((isUniqRepeat) && (readContained == false)) {
      fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "uniq-repeat-dove");
      readUniqRepeatDove->add(readLen);
    }

    if (isUniqAnchor) {
      fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "uniq-anchor");
      readUniqAnchor->add(readLen);
      olapUniqAnchor->add(depth.lo(endi) - depth.hi(bgni));
    }

    //  Done.  Read more data.

    overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax);
  }

  fclose(LOG);  //  Done with logging.

  readHole->finalizeData();
  olapHole->finalizeData();

  readHump->finalizeData();
  olapHump->finalizeData();

  readNo5->finalizeData();
  olapNo5->finalizeData();

  readNo3->finalizeData();
  olapNo3->finalizeData();


  readLowCov->finalizeData();
  olapLowCov->finalizeData();
  covrLowCov->finalizeData();

  readUnique->finalizeData();
  olapUnique->finalizeData();
  covrUnique->finalizeData();

  readRepeatCont->finalizeData();
  olapRepeatCont->finalizeData();
  covrRepeatCont->finalizeData();

  readRepeatDove->finalizeData();
  olapRepeatDove->finalizeData();
  covrRepeatDove->finalizeData();


  readSpanRepeat->finalizeData();
  olapSpanRepeat->finalizeData();

  readUniqRepeatCont->finalizeData();
  olapUniqRepeatCont->finalizeData();

  readUniqRepeatDove->finalizeData();
  olapUniqRepeatDove->finalizeData();

  readUniqAnchor->finalizeData();
  olapUniqAnchor->finalizeData();


  LOG = stdout;

  if (toFile == true) {
    sprintf(N, "%s.summary", outPrefix);

    LOG = fopen(N, "w");
    if (errno)
      fprintf(stderr, "Failed to open '%s' for writing: %s\n", N, strerror(errno)), exit(1);
  }

  fprintf(LOG, "category            reads       read length        feature size or coverage  analysis\n");
  fprintf(LOG, "----------------  -------  ----------------------  ------------------------  --------------------\n");
  fprintf(LOG, "middle-missing    %7"F_U64P"  %10.2f +- %-8.2f   %10.2f +- %-8.2f   (bad trimming)\n", readHole->numberOfObjects(), readHole->mean(), readHole->stddev(), olapHole->mean(), olapHole->stddev());
  fprintf(LOG, "middle-hump       %7"F_U64P"  %10.2f +- %-8.2f   %10.2f +- %-8.2f   (bad trimming)\n", readHump->numberOfObjects(), readHump->mean(), readHump->stddev(), olapHump->mean(), olapHump->stddev());
  fprintf(LOG, "no-5-prime        %7"F_U64P"  %10.2f +- %-8.2f   %10.2f +- %-8.2f   (bad trimming)\n", readNo5->numberOfObjects(), readNo5->mean(), readNo5->stddev(), olapNo5->mean(), olapNo5->stddev());
  fprintf(LOG, "no-3-prime        %7"F_U64P"  %10.2f +- %-8.2f   %10.2f +- %-8.2f   (bad trimming)\n", readNo3->numberOfObjects(), readNo3->mean(), readNo3->stddev(), olapNo3->mean(), olapNo3->stddev());
  fprintf(LOG, "\n");
  fprintf(LOG, "low-coverage      %7"F_U64P"  %10.2f +- %-8.2f   %10.2f +- %-8.2f   (easy to assemble, potential for lower quality consensus)\n", readLowCov->numberOfObjects(), readLowCov->mean(), readLowCov->stddev(), covrLowCov->mean(), covrLowCov->stddev());
  fprintf(LOG, "unique            %7"F_U64P"  %10.2f +- %-8.2f   %10.2f +- %-8.2f   (easy to assemble, perfect, yay)\n", readUnique->numberOfObjects(), readUnique->mean(), readUnique->stddev(), covrUnique->mean(), covrUnique->stddev());
  fprintf(LOG, "repeat-cont       %7"F_U64P"  %10.2f +- %-8.2f   %10.2f +- %-8.2f   (potential for consensus errors, no impact on assembly)\n", readRepeatCont->numberOfObjects(), readRepeatCont->mean(), readRepeatCont->stddev(), covrRepeatCont->mean(), covrRepeatCont->stddev());
  fprintf(LOG, "repeat-dove       %7"F_U64P"  %10.2f +- %-8.2f   %10.2f +- %-8.2f   (hard to assemble, likely won't assemble correctly or even at all)\n", readRepeatDove->numberOfObjects(), readRepeatDove->mean(), readRepeatDove->stddev(), covrRepeatDove->mean(), covrRepeatDove->stddev());
  fprintf(LOG, "\n");
  fprintf(LOG, "span-repeat       %7"F_U64P"  %10.2f +- %-8.2f   %10.2f +- %-8.2f   (read spans a large repeat, usually easy to assemble)\n", readSpanRepeat->numberOfObjects(), readSpanRepeat->mean(), readSpanRepeat->stddev(), olapSpanRepeat->mean(), olapSpanRepeat->stddev());
  fprintf(LOG, "uniq-repeat-cont  %7"F_U64P"  %10.2f +- %-8.2f                            (should be uniquely placed, low potential for consensus errors, no impact on assembly)\n", readUniqRepeatCont->numberOfObjects(), readUniqRepeatCont->mean(), readUniqRepeatCont->stddev());
  fprintf(LOG, "uniq-repeat-dove  %7"F_U64P"  %10.2f +- %-8.2f                            (will end contigs, potential to misassemble)\n", readUniqRepeatDove->numberOfObjects(), readUniqRepeatDove->mean(), readUniqRepeatDove->stddev());
  fprintf(LOG, "uniq-anchor       %7"F_U64P"  %10.2f +- %-8.2f   %10.2f +- %-8.2f   (repeat read, with unique section, probable bad read)\n", readUniqAnchor->numberOfObjects(), readUniqAnchor->mean(), readUniqAnchor->stddev(), olapUniqAnchor->mean(), olapUniqAnchor->stddev());

  if (toFile == true)
    fclose(LOG);

  delete ovlStore;

  gkpStore->gkStore_close();

  exit(0);
}