Example #1
0
int
main(int argc, char **argv) {
  int  illegal;

  argc = AS_configure(argc, argv);

  G.initialize();

  int err=0;
  int arg=1;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-G") == 0) {
      G.Doing_Partial_Overlaps = TRUE;

    } else if (strcmp(argv[arg], "-h") == 0) {
      AS_UTL_decodeRange(argv[++arg], G.bgnHashID, G.endHashID);

    } else if (strcmp(argv[arg], "-H") == 0) {
      AS_UTL_decodeRange(argv[++arg], G.minLibToHash, G.maxLibToHash);

    } else if (strcmp(argv[arg], "-r") == 0) {
      AS_UTL_decodeRange(argv[++arg], G.bgnRefID, G.endRefID);

    } else if (strcmp(argv[arg], "-R") == 0) {
      AS_UTL_decodeRange(argv[++arg], G.minLibToRef, G.maxLibToRef);

    } else if (strcmp(argv[arg], "-k") == 0) {
      arg++;
      if ((isdigit(argv[arg][0]) && (argv[arg][1] == 0)) ||
          (isdigit(argv[arg][0]) && isdigit(argv[arg][1]) && (argv[arg][2] == 0))) {
        G.Kmer_Len = strtoull(argv[arg], NULL, 10);
      } else {
        errno = 0;
        G.Kmer_Skip_File = fopen(argv[arg], "r");
        if (errno)
          fprintf(stderr, "ERROR: Failed to open -k '%s': %s\n", argv[arg], strerror(errno)), exit(1);
      }

    } else if (strcmp(argv[arg], "-l") == 0) {
      G.Frag_Olap_Limit = strtol(argv[++arg], NULL, 10);
      if  (G.Frag_Olap_Limit < 1)
        G.Frag_Olap_Limit = UINT64_MAX;

    } else if (strcmp(argv[arg], "-m") == 0) {
      G.Unique_Olap_Per_Pair = FALSE;
    } else if (strcmp(argv[arg], "-u") == 0) {
      G.Unique_Olap_Per_Pair = TRUE;

    } else if (strcmp(argv[arg], "--hashbits") == 0) {
      G.Hash_Mask_Bits = strtoull(argv[++arg], NULL, 10);

    } else if (strcmp(argv[arg], "--hashstrings") == 0) {
      G.Max_Hash_Strings = strtoull(argv[++arg], NULL, 10);

    } else if (strcmp(argv[arg], "--hashdatalen") == 0) {
      G.Max_Hash_Data_Len = strtoull(argv[++arg], NULL, 10);

    } else if (strcmp(argv[arg], "--hashload") == 0) {
      G.Max_Hash_Load = atof(argv[++arg]);

    } else if (strcmp(argv[arg], "--maxreadlen") == 0) {
      //  Quite the gross way to do this, but simple.
      uint32 desired = strtoul(argv[++arg], NULL, 10);
      OFFSET_BITS = 1;
      while (((uint32)1 << OFFSET_BITS) < desired)
        OFFSET_BITS++;

      STRING_NUM_BITS       = 30 - OFFSET_BITS;

      STRING_NUM_MASK       = (1 << STRING_NUM_BITS) - 1;
      OFFSET_MASK           = (1 << OFFSET_BITS) - 1;

      MAX_STRING_NUM        = STRING_NUM_MASK;

    } else if (strcmp(argv[arg], "-o") == 0) {
      G.Outfile_Name = argv[++arg];

    } else if (strcmp(argv[arg], "-s") == 0) {
      G.Outstat_Name = argv[++arg];

    } else if (strcmp(argv[arg], "-t") == 0) {
      G.Num_PThreads = strtoull(argv[++arg], NULL, 10);


    } else if (strcmp(argv[arg], "--minlength") == 0) {
      G.Min_Olap_Len = strtol (argv[++arg], NULL, 10);
    } else if (strcmp(argv[arg], "--maxerate") == 0) {
      G.maxErate = ceil(strtof(argv[++arg], NULL) * 100) / 100;

    } else if (strcmp(argv[arg], "-w") == 0) {
      G.Use_Window_Filter = TRUE;

    } else if (strcmp(argv[arg], "-z") == 0) {
      G.Use_Hopeless_Check = FALSE;

    } else {
      if (G.Frag_Store_Path == NULL) {
        G.Frag_Store_Path = argv[arg];
      } else {
        fprintf(stderr, "Unknown option '%s'\n", argv[arg]);
        err++;
      }
    }
    arg++;
  }

  //  Fix up some flags if we're allowing high error rates.
  //
  if (G.maxErate > 0.06) {
    if (G.Use_Window_Filter)
      fprintf(stderr, "High error rates requested -- window-filter turned off despite -w flag!\n");
    G.Use_Window_Filter  = FALSE;
    G.Use_Hopeless_Check = FALSE;
  }

  if (G.Max_Hash_Strings == 0)
    fprintf(stderr, "* No memory model supplied; -M needed!\n"), err++;

  if (G.Kmer_Len == 0)
    fprintf(stderr, "* No kmer length supplied; -k needed!\n"), err++;

  if (G.Max_Hash_Strings > MAX_STRING_NUM)
    fprintf(stderr, "Too many strings (--hashstrings), must be less than "F_U64"\n", MAX_STRING_NUM), err++;

  if (G.Outfile_Name == NULL)
    fprintf (stderr, "ERROR:  No output file name specified\n"), err++;

  if ((err) || (G.Frag_Store_Path == NULL)) {
    fprintf(stderr, "USAGE:  %s [options] <gkpStorePath>\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "-b <fn>     in contig mode, specify the output file\n");
    fprintf(stderr, "-c          contig mode.  Use 2 frag stores.  First is\n");
    fprintf(stderr, "            for reads; second is for contigs\n");
    fprintf(stderr, "-G          do partial overlaps\n");
    fprintf(stderr, "-h <range>  to specify fragments to put in hash table\n");
    fprintf(stderr, "            Implies LSF mode (no changes to frag store)\n");
    fprintf(stderr, "-I          designate a file of frag iids to limit olaps to\n");
    fprintf(stderr, "            (Contig mode only)\n");
    fprintf(stderr, "-k          if one or two digits, the length of a kmer, otherwise\n");
    fprintf(stderr, "            the filename containing a list of kmers to ignore in\n");
    fprintf(stderr, "            the hash table\n");
    fprintf(stderr, "-l          specify the maximum number of overlaps per\n");
    fprintf(stderr, "            fragment-end per batch of fragments.\n");
    fprintf(stderr, "-m          allow multiple overlaps per oriented fragment pair\n");
    fprintf(stderr, "-M          specify memory size.  Valid values are '8GB', '4GB',\n");
    fprintf(stderr, "            '2GB', '1GB', '256MB'.  (Not for Contig mode)\n");
    fprintf(stderr, "-o          specify output file name\n");
    fprintf(stderr, "-P          write protoIO output (if not -G)\n");
    fprintf(stderr, "-r <range>  specify old fragments to overlap\n");
    fprintf(stderr, "-t <n>      use <n> parallel threads\n");
    fprintf(stderr, "-u          allow only 1 overlap per oriented fragment pair\n");
    fprintf(stderr, "-w          filter out overlaps with too many errors in a window\n");
    fprintf(stderr, "-z          skip the hopeless check\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "--maxerate <n>     only output overlaps with fraction <n> or less error (e.g., 0.06 == 6%%)\n");
    fprintf(stderr, "--minlength <n>    only output overlaps of <n> or more bases\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "--hashbits n       Use n bits for the hash mask.\n");
    fprintf(stderr, "--hashstrings n    Load at most n strings into the hash table at one time.\n");
    fprintf(stderr, "--hashdatalen n    Load at most n bytes into the hash table at one time.\n");
    fprintf(stderr, "--hashload f       Load to at most 0.0 < f < 1.0 capacity (default 0.7).\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "--maxreadlen n     For batches with all short reads, pack bits differently to\n");
    fprintf(stderr, "                   process more reads per batch.\n");
    fprintf(stderr, "                     all reads must be shorter than n\n");
    fprintf(stderr, "                     --hashstrings limited to 2^(30-m)\n");
    fprintf(stderr, "                   Common values:\n");
    fprintf(stderr, "                     maxreadlen 2048->hashstrings  524288 (default)\n");
    fprintf(stderr, "                     maxreadlen  512->hashstrings 2097152\n");
    fprintf(stderr, "                     maxreadlen  128->hashstrings 8388608\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "--readsperbatch n  Force batch size to n.\n");
    fprintf(stderr, "--readsperthread n Force each thread to process n reads.\n");
    fprintf(stderr, "\n");
    exit(1);
  }

  Out_BOF = new ovFile(G.Outfile_Name, ovFileFullWrite);

  //  We know enough now to set the hash function variables, and some other random variables.

  HSF1 = G.Kmer_Len - (G.Hash_Mask_Bits / 2);
  HSF2 = 2 * G.Kmer_Len - G.Hash_Mask_Bits;
  SV1  = HSF1 + 2;
  SV2  = (HSF1 + HSF2) / 2;
  SV3  = HSF2 - 2;

  //  Log parameters.

  fprintf(stderr, "\n");
  fprintf(stderr, "STRING_NUM_BITS       "F_U32"\n", STRING_NUM_BITS);
  fprintf(stderr, "OFFSET_BITS           "F_U32"\n", OFFSET_BITS);
  fprintf(stderr, "STRING_NUM_MASK       "F_U64"\n", STRING_NUM_MASK);
  fprintf(stderr, "OFFSET_MASK           "F_U64"\n", OFFSET_MASK);
  fprintf(stderr, "MAX_STRING_NUM        "F_U64"\n", MAX_STRING_NUM);
  fprintf(stderr, "\n");
  fprintf(stderr, "Hash_Mask_Bits        "F_U32"\n", G.Hash_Mask_Bits);
  fprintf(stderr, "Max_Hash_Strings      "F_U32"\n", G.Max_Hash_Strings);
  fprintf(stderr, "Max_Hash_Data_Len     "F_U64"\n", G.Max_Hash_Data_Len);
  fprintf(stderr, "Max_Hash_Load         %f\n", G.Max_Hash_Load);
  fprintf(stderr, "Kmer Length           "F_U64"\n", G.Kmer_Len);
  fprintf(stderr, "Min Overlap Length    %d\n", G.Min_Olap_Len);
  fprintf(stderr, "Max Error Rate        %f\n", G.maxErate);
  fprintf(stderr, "\n");
  fprintf(stderr, "Num_PThreads          "F_U32"\n", G.Num_PThreads);

  assert (8 * sizeof (uint64) > 2 * G.Kmer_Len);

  Bit_Equivalent['a'] = Bit_Equivalent['A'] = 0;
  Bit_Equivalent['c'] = Bit_Equivalent['C'] = 1;
  Bit_Equivalent['g'] = Bit_Equivalent['G'] = 2;
  Bit_Equivalent['t'] = Bit_Equivalent['T'] = 3;

  for  (int i = 0;  i < 256;  i ++) {
    char  ch = tolower ((char) i);

    if  (ch == 'a' || ch == 'c' || ch == 'g' || ch == 't')
      Char_Is_Bad[i] = 0;
    else
      Char_Is_Bad[i] = 1;
  }

  fprintf(stderr, "\n");
  fprintf(stderr, "HASH_TABLE_SIZE         "F_U32"\n",     HASH_TABLE_SIZE);
  fprintf(stderr, "sizeof(Hash_Bucket_t)   "F_SIZE_T"\n",  sizeof(Hash_Bucket_t));
  fprintf(stderr, "hash table size:        "F_SIZE_T" MB\n",  (HASH_TABLE_SIZE * sizeof(Hash_Bucket_t)) >> 20);
  fprintf(stderr, "\n");

  Hash_Table       = new Hash_Bucket_t [HASH_TABLE_SIZE];

  fprintf(stderr, "check  "F_SIZE_T" MB\n", (HASH_TABLE_SIZE    * sizeof (Check_Vector_t) >> 20));
  fprintf(stderr, "info   "F_SIZE_T" MB\n", (G.Max_Hash_Strings * sizeof (Hash_Frag_Info_t) >> 20));
  fprintf(stderr, "start  "F_SIZE_T" MB\n", (G.Max_Hash_Strings * sizeof (int64) >> 20));
  fprintf(stderr, "\n");

  Hash_Check_Array = new Check_Vector_t [HASH_TABLE_SIZE];
  String_Info      = new Hash_Frag_Info_t [G.Max_Hash_Strings];
  String_Start     = new int64 [G.Max_Hash_Strings];

  String_Start_Size = G.Max_Hash_Strings;

  memset(Hash_Check_Array, 0, sizeof(Check_Vector_t)   * HASH_TABLE_SIZE);
  memset(String_Info,      0, sizeof(Hash_Frag_Info_t) * G.Max_Hash_Strings);
  memset(String_Start,     0, sizeof(int64)            * G.Max_Hash_Strings);



  OverlapDriver();



  delete [] basesData;
  delete [] qualsData;
  delete [] nextRef;

  delete [] String_Start;
  delete [] String_Info;
  delete [] Hash_Check_Array;
  delete [] Hash_Table;

  delete Out_BOF;

  FILE *stats = stderr;

  if (G.Outstat_Name != NULL) {
    errno = 0;
    stats = fopen(G.Outstat_Name, "w");
    if (errno) {
      fprintf(stderr, "WARNING: failed to open '%s' for writing: %s\n", G.Outstat_Name, strerror(errno));
      stats = stderr;
    }
  }

  fprintf(stats, " Kmer hits without olaps = "F_S64"\n", Kmer_Hits_Without_Olap_Ct);
  fprintf(stats, "    Kmer hits with olaps = "F_S64"\n", Kmer_Hits_With_Olap_Ct);
  fprintf(stats, "  Multiple overlaps/pair = "F_S64"\n", Multi_Overlap_Ct);
  fprintf(stats, " Total overlaps produced = "F_S64"\n", Total_Overlaps);
  fprintf(stats, "      Contained overlaps = "F_S64"\n", Contained_Overlap_Ct);
  fprintf(stats, "       Dovetail overlaps = "F_S64"\n", Dovetail_Overlap_Ct);
  fprintf(stats, "Rejected by short window = "F_S64"\n", Bad_Short_Window_Ct);
  fprintf(stats, " Rejected by long window = "F_S64"\n", Bad_Long_Window_Ct);

  if (stats != stderr)
    fclose(stats);

  return(0);
}
Example #2
0
//  Entry point for overlap-based read trimming.
//
//  For every read in the requested ID range whose library asks for
//  'largest covered' or 'best edge' final trimming, loads the read's overlaps,
//  computes a new clear range, records it in the output clear-range file (-Co),
//  and writes one line per read to <prefix>.log.  Parameters are summarized in
//  <prefix>.summary.
//
//  Requires -G, -O, -Co and -o; prints usage and exits with status 1 otherwise.
//  Exits with status 0 on success.
//
int
main(int argc, char **argv) {
  char             *gkpName = 0L;
  char             *ovsName = 0L;

  char             *iniClrName = NULL;
  char             *maxClrName = NULL;
  char             *outClrName = NULL;

  uint32            errorValue     = AS_OVS_encodeEvalue(0.015);
  uint32            minAlignLength = 40;    //  Parsed from -l, but not used below.
  uint32            minReadLength  = 64;

  char             *outputPrefix  = NULL;
  char              logName[FILENAME_MAX] = {0};
  char              sumName[FILENAME_MAX] = {0};
  FILE             *logFile = 0L;
  FILE             *sumFile = 0L;

  uint32            idMin = 1;
  uint32            idMax = UINT32_MAX;

  uint32            minEvidenceOverlap  = 40;
  uint32            minEvidenceCoverage = 1;

  argc = AS_configure(argc, argv);

  int arg=1;
  int err=0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-G") == 0) {
      gkpName = argv[++arg];

    } else if (strcmp(argv[arg], "-O") == 0) {
      ovsName = argv[++arg];

    } else if (strcmp(argv[arg], "-Ci") == 0) {
      iniClrName = argv[++arg];
    } else if (strcmp(argv[arg], "-Cm") == 0) {
      maxClrName = argv[++arg];
    } else if (strcmp(argv[arg], "-Co") == 0) {
      outClrName = argv[++arg];

    } else if (strcmp(argv[arg], "-e") == 0) {
      double erate = atof(argv[++arg]);
      errorValue = AS_OVS_encodeEvalue(erate);

    } else if (strcmp(argv[arg], "-l") == 0) {
      minAlignLength = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-minlength") == 0) {
      minReadLength = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-ol") == 0) {
      minEvidenceOverlap = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-oc") == 0) {
      minEvidenceCoverage = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-o") == 0) {
      outputPrefix = argv[++arg];

    } else if (strcmp(argv[arg], "-t") == 0) {
      AS_UTL_decodeRange(argv[++arg], idMin, idMax);

    } else {
      fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]);
      err++;
    }

    arg++;
  }

  //  The output clear range file is read and written for every processed read below, so it is
  //  just as mandatory as the two stores and the output prefix.
  if ((gkpName       == NULL) ||
      (ovsName       == NULL) ||
      (outClrName    == NULL) ||
      (outputPrefix  == NULL) ||
      (err)) {
    fprintf(stderr, "usage: %s -G gkpStore -O ovlStore -Co output.clearFile -o outputPrefix\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "  -G gkpStore    path to read store\n");
    fprintf(stderr, "  -O ovlStore    path to overlap store\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -o name        output prefix, for logging\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -t bgn-end     limit processing to only reads from bgn to end (inclusive)\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -Ci clearFile  path to input clear ranges (NOT SUPPORTED)\n");
    //fprintf(stderr, "  -Cm clearFile  path to maximal clear ranges\n");
    fprintf(stderr, "  -Co clearFile  path to ouput clear ranges\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -e erate       ignore overlaps with more than 'erate' percent error\n");
    //fprintf(stderr, "  -l length      ignore overlaps shorter than 'l' aligned bases (NOT SUPPORTED)\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -ol l          the minimum evidence overlap length\n");
    fprintf(stderr, "  -oc c          the minimum evidence overlap coverage\n");
    fprintf(stderr, "                   evidence overlaps must overlap by 'l' bases to be joined, and\n");
    fprintf(stderr, "                   must be at least 'c' deep to be retained\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -minlength l   reads trimmed below this many bases are deleted\n");
    fprintf(stderr, "\n");

    if (outClrName == NULL)
      fprintf(stderr, "ERROR: no output clear range file (-Co) supplied.\n");

    exit(1);
  }

  gkStore          *gkp = gkStore::gkStore_open(gkpName);
  ovStore          *ovs = new ovStore(ovsName, gkp);

  clearRangeFile   *iniClr = (iniClrName == NULL) ? NULL : new clearRangeFile(iniClrName, gkp);
  clearRangeFile   *maxClr = (maxClrName == NULL) ? NULL : new clearRangeFile(maxClrName, gkp);
  clearRangeFile   *outClr = new clearRangeFile(outClrName, gkp);

  //  If the outClr file exists, those clear ranges are loaded.  We need to reset them
  //  back to 'untrimmed' for now.
  outClr->reset(gkp);

  //  An iniClr file was supplied, so use those as the initial clear ranges.
  if (iniClr)
    outClr->copy(iniClr);


  if (outputPrefix) {
    //  Build both names with a bound, and refuse to continue with a truncated name.
    int  logLen = snprintf(logName, FILENAME_MAX, "%s.log",     outputPrefix);
    int  sumLen = snprintf(sumName, FILENAME_MAX, "%s.summary", outputPrefix);

    if ((logLen < 0) || (logLen >= FILENAME_MAX) ||
        (sumLen < 0) || (sumLen >= FILENAME_MAX))
      fprintf(stderr, "Output prefix '%s' is too long.\n", outputPrefix), exit(1);

    //  fopen() reports failure through its return value; errno only describes the failure.
    errno = 0;
    logFile = fopen(logName, "w");
    if (logFile == NULL)
      fprintf(stderr, "Failed to open log file '%s' for writing: %s\n", logName, strerror(errno)), exit(1);

    errno = 0;
    sumFile = fopen(sumName, "w");
    if (sumFile == NULL)
      fprintf(stderr, "Failed to open summary file '%s' for writing: %s\n", sumName, strerror(errno)), exit(1);

    fprintf(logFile, "id\tinitL\tinitR\tfinalL\tfinalR\tmessage (DEL=deleted NOC=no change MOD=modified)\n");

    fprintf(sumFile, "Overlap error rate     <= %.4f fraction error\n", AS_OVS_decodeEvalue(errorValue));
    fprintf(sumFile, "Overlap min overlap    >= %u base%s (for 'largest covered')\n", minEvidenceOverlap,  (minEvidenceOverlap  == 1) ? "" : "s");
    fprintf(sumFile, "Overlap min coverage   >= %u read%s (for 'largest covered')\n", minEvidenceCoverage, (minEvidenceCoverage == 1) ? "" : "s");
  }


  uint32      ovlLen       = 0;
  uint32      ovlMax       = 64 * 1024;
  ovOverlap  *ovl          = ovOverlap::allocateOverlaps(gkp, ovlMax);

  memset(ovl, 0, sizeof(ovOverlap) * ovlMax);

  char        logMsg[1024] = {0};

  //  Clamp the requested range to the reads that actually exist.
  if (idMin < 1)
    idMin = 1;
  if (idMax > gkp->gkStore_getNumReads())
    idMax = gkp->gkStore_getNumReads();

  fprintf(stderr, "Processing from ID "F_U32" to "F_U32" out of "F_U32" reads.\n",
          idMin,
          idMax,
          gkp->gkStore_getNumReads());

  for (uint32 id=idMin; id<=idMax; id++) {
    gkRead     *read = gkp->gkStore_getRead(id);
    gkLibrary  *libr = gkp->gkStore_getLibrary(read->gkRead_libraryID());

    logMsg[0] = 0;

    //  If the fragment is deleted, do nothing.  If the fragment was deleted AFTER overlaps were
    //  generated, then the overlaps will be out of sync -- we'll get overlaps for these fragments
    //  we skip.
    //
    if ((iniClr) && (iniClr->isDeleted(id) == true))
      continue;

    //  If it did not request trimming, do nothing.  Similar to the above, we'll get overlaps to
    //  fragments we skip.
    //
    //  The read is skipped when its library asks for NEITHER supported method.  (Testing for
    //  equality to both at once can never be true, which let untrimmed libraries fall through
    //  to be deleted or to trip the assert below.)
    //
    if ((libr->gkLibrary_finalTrim() != FINALTRIM_LARGEST_COVERED) &&
        (libr->gkLibrary_finalTrim() != FINALTRIM_BEST_EDGE))
      continue;

    //  Decide on the initial trimming.  We copied any iniClr into outClr above, and if there wasn't
    //  an iniClr, then outClr is the full read.

    uint32      ibgn   = outClr->bgn(id);
    uint32      iend   = outClr->end(id);

    //  Set the, ahem, initial final trimming.

    bool        isGood = false;
    uint32      fbgn   = ibgn;
    uint32      fend   = iend;

    //  Load overlaps.

    uint32      nLoaded = ovs->readOverlaps(id, ovl, ovlLen, ovlMax);

    //  Trim!

    if (nLoaded == 0) {
      //  No overlaps, so mark it as junk.
      isGood = false;
    }

    else if (libr->gkLibrary_finalTrim() == FINALTRIM_LARGEST_COVERED) {
      //  Use the largest region covered by overlaps as the trim

      assert(ovlLen > 0);
      assert(id == ovl[0].a_iid);

      isGood = largestCovered(ovl, ovlLen,
                              read,
                              ibgn, iend, fbgn, fend,
                              logMsg,
                              errorValue,
                              minEvidenceOverlap,
                              minEvidenceCoverage,
                              minReadLength);
      assert(fbgn <= fend);

    }

    else if (libr->gkLibrary_finalTrim() == FINALTRIM_BEST_EDGE) {
      //  Trim using the bestEdge() method.

      assert(ovlLen > 0);
      assert(id == ovl[0].a_iid);

      isGood = bestEdge(ovl, ovlLen,
                        read,
                        ibgn, iend, fbgn, fend,
                        logMsg,
                        errorValue,
                        minEvidenceOverlap,
                        minEvidenceCoverage,
                        minReadLength);
      assert(fbgn <= fend);

    }

    else {
      //  Do nothing.  Really shouldn't get here; other methods were skipped above.
      assert(0);
      continue;
    }

    //  Enforce the maximum clear range

    if ((isGood) && (maxClr)) {
      isGood = enforceMaximumClearRange(ovl, ovlLen,
                                        read,
                                        ibgn, iend, fbgn, fend,
                                        logMsg,
                                        maxClr);
      assert(fbgn <= fend);
    }

    //
    //  Trimmed.  Make sense of the result, write some logs, and update the output.
    //


    //  If bad trimming or too small, write the log and keep going.
    //
    if ((isGood == false) || (fend - fbgn < minReadLength)) {
      outClr->setbgn(id) = fbgn;
      outClr->setend(id) = fend;
      outClr->setDeleted(id);  //  Gah, just obliterates the clear range.

      fprintf(logFile, F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\tDEL%s\n",
              id,
              ibgn, iend,
              fbgn, fend,
              (logMsg[0] == 0) ? "" : logMsg);
    }

    //  If we didn't change anything, also write a log.
    //
    else if ((ibgn == fbgn) &&
        (iend == fend)) {
      fprintf(logFile, F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\tNOC%s\n",
              id,
              ibgn, iend,
              fbgn, fend,
              (logMsg[0] == 0) ? "" : logMsg);
      continue;
    }

    //  Otherwise, we actually did something.

    else {
      outClr->setbgn(id) = fbgn;
      outClr->setend(id) = fend;

      fprintf(logFile, F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\tMOD%s\n",
              id,
              ibgn, iend,
              fbgn, fend,
              (logMsg[0] == 0) ? "" : logMsg);
    }
  }

  //  Release the overlap buffer before closing the store it was allocated against.
  delete [] ovl;

  gkp->gkStore_close();

  delete ovs;

  delete iniClr;
  delete maxClr;
  delete outClr;

  fclose(logFile);
  fclose(sumFile);

  exit(0);
}
Example #3
0
int
main(int argc, char **argv) {
  char     *gkpName = NULL;
  char     *ovsName = NULL;

  char     *finClrName = NULL;
  char     *outClrName = NULL;

  double    errorRate       = 0.06;
  //uint32    minAlignLength  = 40;
  uint32    minReadLength   = 64;

  uint32    idMin = 1;
  uint32    idMax = UINT32_MAX;

  char     *outputPrefix = NULL;
  char      outputName[FILENAME_MAX];

  FILE     *staFile      = NULL;
  FILE     *reportFile   = NULL;
  FILE     *subreadFile  = NULL;

  bool      doSubreadLogging        = true;
  bool      doSubreadLoggingVerbose = false;

  //  Statistics on the trimming - the second set are from the old logging, and don't really apply anymore.

  trimStat  readsIn;                  //  Read is eligible for trimming
  trimStat  deletedIn;                //  Read was deleted already
  trimStat  noTrimIn;                 //  Read not requesting trimming

  trimStat  noOverlaps;               //  no overlaps in store
  trimStat  noCoverage;               //  no coverage after adjusting for trimming done

  trimStat  readsProcChimera;         //  Read was processed for chimera signal
  trimStat  readsProcSpur;            //  Read was processed for spur signal
  trimStat  readsProcSubRead;         //  Read was processed for subread signal

#if 0
  trimStat  badSpur5;
  trimStat  badSpur3;
  trimStat  badChimera;
  trimStat  badSubread;
#endif

  trimStat  readsNoChange;

  trimStat  readsBadSpur5,   basesBadSpur5;
  trimStat  readsBadSpur3,   basesBadSpur3;
  trimStat  readsBadChimera, basesBadChimera;
  trimStat  readsBadSubread, basesBadSubread;

  trimStat  readsTrimmed5;
  trimStat  readsTrimmed3;

#if 0
  trimStat  fullCoverage;             //  fully covered by overlaps
  trimStat  noSignalNoGap;            //  no signal, no gaps
  trimStat  noSignalButGap;           //  no signal, with gaps

  trimStat  bothFixed;                //  both chimera and spur signal trimmed
  trimStat  chimeraFixed;             //  only chimera signal trimmed
  trimStat  spurFixed;                //  only spur signal trimmed

  trimStat  bothDeletedSmall;         //  deleted because of both cimera and spur signals
  trimStat  chimeraDeletedSmall;      //  deleted because of chimera signal
  trimStat  spurDeletedSmall;         //  deleted because of spur signal

  trimStat  spurDetectedNormal;       //  normal spur detected
  trimStat  spurDetectedLinker;       //  linker spur detected

  trimStat  chimeraDetectedInnie;     //  innpue-pair chimera detected
  trimStat  chimeraDetectedOverhang;  //  overhanging chimera detected
  trimStat  chimeraDetectedGap;       //  gap chimera detected
  trimStat  chimeraDetectedLinker;    //  linker chimera detected
#endif

  trimStat  deletedOut;               //  Read was deleted by trimming

  argc = AS_configure(argc, argv);

  int arg=1;
  int err=0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-G") == 0) {
      gkpName = argv[++arg];

    } else if (strcmp(argv[arg], "-O") == 0) {
      ovsName = argv[++arg];

    } else if (strcmp(argv[arg], "-o") == 0) {
      outputPrefix = argv[++arg];

    } else if (strcmp(argv[arg], "-t") == 0) {
      AS_UTL_decodeRange(argv[++arg], idMin, idMax);

    } else if (strcmp(argv[arg], "-Ci") == 0) {
      finClrName = argv[++arg];
    } else if (strcmp(argv[arg], "-Co") == 0) {
      outClrName = argv[++arg];

    } else if (strcmp(argv[arg], "-e") == 0) {
      errorRate = atof(argv[++arg]);

    //} else if (strcmp(argv[arg], "-l") == 0) {
    //  minAlignLength = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-minlength") == 0) {
      minReadLength = atoi(argv[++arg]);

    } else {
      fprintf(stderr, "%s: unknown option '%s'\n", argv[0], argv[arg]);
      err++;
    }
    arg++;
  }

  if (errorRate < 0.0)
    err++;

  if ((gkpName == 0L) || (ovsName == 0L) || (outputPrefix == NULL) || (err)) {
    fprintf(stderr, "usage: %s -G gkpStore -O ovlStore -Ci input.clearFile -Co output.clearFile -o outputPrefix]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "  -G gkpStore    path to read store\n");
    fprintf(stderr, "  -O ovlStore    path to overlap store\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -o name        output prefix, for logging\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -t bgn-end     limit processing to only reads from bgn to end (inclusive)\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -Ci clearFile  path to input clear ranges (NOT SUPPORTED)\n");
    fprintf(stderr, "  -Co clearFile  path to ouput clear ranges\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -e erate       ignore overlaps with more than 'erate' percent error\n");
    //fprintf(stderr, "  -l length      ignore overlaps shorter than 'l' aligned bases (NOT SUPPORTED)\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -minlength l   reads trimmed below this many bases are deleted\n");
    fprintf(stderr, "\n");

    if (errorRate < 0.0)
      fprintf(stderr, "ERROR: Error rate (-e) value %f too small; must be 'fraction error' and above 0.0\n", errorRate);

    exit(1);
  }

  gkStore         *gkp = gkStore::gkStore_open(gkpName);
  ovStore         *ovs = new ovStore(ovsName, gkp);

  clearRangeFile  *finClr = new clearRangeFile(finClrName, gkp);
  clearRangeFile  *outClr = new clearRangeFile(outClrName, gkp);

  if (outClr)
    //  If the outClr file exists, those clear ranges are loaded.  We need to reset them
    //  back to 'untrimmed' for now.
    outClr->reset(gkp);

  if (finClr && outClr)
    //  A finClr file was supplied, so use those as the clear ranges.
    outClr->copy(finClr);


  sprintf(outputName, "%s.log",         outputPrefix);
  errno = 0;
  reportFile  = fopen(outputName, "w");
  if (errno)
    fprintf(stderr, "Failed to open '%s' for writing: %s\n", outputName, strerror(errno)), exit(1);

  sprintf(outputName, "%s.subread.log", outputPrefix);
  errno = 0;
  subreadFile = fopen(outputName, "w");
  if (errno)
    fprintf(stderr, "Failed to open '%s' for writing: %s\n", outputName, strerror(errno)), exit(1);


  uint32      ovlLen = 0;
  uint32      ovlMax = 64 * 1024;
  ovOverlap  *ovl    = ovOverlap::allocateOverlaps(gkp, ovlMax);

  memset(ovl, 0, sizeof(ovOverlap) * ovlMax);

  workUnit *w = new workUnit;


  if (idMin < 1)
    idMin = 1;
  if (idMax > gkp->gkStore_getNumReads())
    idMax = gkp->gkStore_getNumReads();

  fprintf(stderr, "Processing from ID "F_U32" to "F_U32" out of "F_U32" reads, using errorRate = %.2f\n",
          idMin,
          idMax,
          gkp->gkStore_getNumReads(),
          errorRate);

  for (uint32 id=idMin; id<=idMax; id++) {
    gkRead     *read = gkp->gkStore_getRead(id);
    gkLibrary  *libr = gkp->gkStore_getLibrary(read->gkRead_libraryID());

    if (finClr->isDeleted(id)) {
      //  Read already trashed.
      deletedIn += read->gkRead_sequenceLength();
      continue;
    }

    if ((libr->gkLibrary_removeSpurReads()     == false) &&
        (libr->gkLibrary_removeChimericReads() == false) &&
        (libr->gkLibrary_checkForSubReads()    == false)) {
      //  Nothing to do.
      noTrimIn += read->gkRead_sequenceLength();
      continue;
    }

    readsIn += read->gkRead_sequenceLength();


    uint32   nLoaded = ovs->readOverlaps(id, ovl, ovlLen, ovlMax);

    //fprintf(stderr, "read %7u with %7u overlaps\r", id, nLoaded);

    if (nLoaded == 0) {
      //  No overlaps, nothing to check!
      noOverlaps += read->gkRead_sequenceLength();
      continue;
    }

    w->clear(id, finClr->bgn(id), finClr->end(id));
    w->addAndFilterOverlaps(gkp, finClr, errorRate, ovl, ovlLen);

    if (w->adjLen == 0) {
      //  All overlaps trimmed out!
      noCoverage += read->gkRead_sequenceLength();
      continue;
    }

    //  Find bad regions.

    //if (libr->gkLibrary_markBad() == true)
    //  //  From an external file, a list of known bad regions.  If no overlaps span
    //  //  the region with sufficient coverage, mark the region as bad.  This was
    //  //  motivated by the old 454 linker detection.
    //  markBad(gkp, w, subreadFile, doSubreadLoggingVerbose);

    //if (libr->gkLibrary_removeSpurReads() == true) {
    //  readsProcSpur += read->gkRead_sequenceLength();
    //  detectSpur(gkp, w, subreadFile, doSubreadLoggingVerbose);
    //  Get stats on spur region detected - save the length of each region to the trimStats object.
    //}

    //if (libr->gkLibrary_removeChimericReads() == true) {
    //  readsProcChimera += read->gkRead_sequenceLength();
    //  detectChimer(gkp, w, subreadFile, doSubreadLoggingVerbose);
    //  Get stats on chimera region detected - save the length of each region to the trimStats object.
    //}

    if (libr->gkLibrary_checkForSubReads() == true) {
      readsProcSubRead += read->gkRead_sequenceLength();
      detectSubReads(gkp, w, subreadFile, doSubreadLoggingVerbose);
    }

    //  Get stats on the bad regions found.  This kind of duplicates code in trimBadInterval(), but
    //  I don't want to pass all the stats objects into there.

    if (w->blist.size() == 0) {
      readsNoChange += read->gkRead_sequenceLength();
    }

    else {
      uint32  nSpur5   = 0, bSpur5   = 0;
      uint32  nSpur3   = 0, bSpur3   = 0;
      uint32  nChimera = 0, bChimera = 0;
      uint32  nSubread = 0, bSubread = 0;

      for (uint32 bb=0; bb<w->blist.size(); bb++) {
        switch (w->blist[bb].type) {
          case badType_5spur:
            nSpur5        += 1;
            basesBadSpur5 += w->blist[bb].end - w->blist[bb].bgn;
            break;
          case badType_3spur:
            nSpur3        += 1;
            basesBadSpur3 += w->blist[bb].end - w->blist[bb].bgn;
            break;
          case badType_chimera:
            nChimera        += 1;
            basesBadChimera += w->blist[bb].end - w->blist[bb].bgn;
            break;
          case badType_subread:
            nSubread        += 1;
            basesBadSubread += w->blist[bb].end - w->blist[bb].bgn;
            break;
          default:
            break;
        }
      }

      if (nSpur5   > 0)   readsBadSpur5   += nSpur5;
      if (nSpur3   > 0)   readsBadSpur3   += nSpur3;
      if (nChimera > 0)   readsBadChimera += nChimera;
      if (nSubread > 0)   readsBadSubread += nSubread;
    }

    //  Find solution.  This coalesces the list (in 'w') of all the bad regions found, picks out the
    //  largest good region, generates a log of the bad regions that support this decision, and sets
    //  the trim points.

    trimBadInterval(gkp, w, minReadLength, subreadFile, doSubreadLoggingVerbose);

    //  Log the solution.

    AS_UTL_safeWrite(reportFile, w->logMsg, "logMsg", sizeof(char), strlen(w->logMsg));

    //  Save the solution....

    outClr->setbgn(w->id) = w->clrBgn;
    outClr->setend(w->id) = w->clrEnd;

    //  And maybe delete the read.

    if (w->isOK == false) {
      deletedOut += read->gkRead_sequenceLength();

      outClr->setDeleted(w->id);
    }

    //  Update stats on what was trimmed.  The asserts say the clear range didn't expand, and the if
    //  tests if the clear range changed.

    assert(w->clrBgn >= w->iniBgn);
    assert(w->iniEnd >= w->clrEnd);

    if (w->clrBgn > w->iniBgn)
      readsTrimmed5 += w->clrBgn - w->iniBgn;

    if (w->iniEnd > w->clrEnd)
      readsTrimmed3 += w->iniEnd - w->clrEnd;
  }


  delete [] ovl;

  delete    w;

  gkp->gkStore_close();

  delete    finClr;
  delete    outClr;

  //  Close log files

  if (reportFile)
    fclose(reportFile);

  if (subreadFile)
    fclose(subreadFile);

  //  Write the summary

  if (outputPrefix) {
    sprintf(outputName, "%s.stats", outputPrefix);

    errno = 0;
    staFile = fopen(outputName, "w");
    if (errno)
      fprintf(stderr, "Failed to open '%s' for writing: %s\n", outputName, strerror(errno));
  }

  if (staFile == NULL)
    staFile = stdout;

  //  Would like to know number of subreads per read

  fprintf(staFile, "PARAMETERS:\n");
  fprintf(staFile, "----------\n");
  fprintf(staFile, "%7u    (reads trimmed below this many bases are deleted)\n", minReadLength);
  fprintf(staFile, "%7.4f    (use overlaps at or below this fraction error)\n", errorRate);
  //fprintf(staFile, "%7u    (use only overlaps longer than this)\n", minAlignLength);  //  NOT SUPPORTED!
  fprintf(staFile, "INPUT READS:\n");
  fprintf(staFile, "-----------\n");
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (reads processed)\n", readsIn.nReads, readsIn.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (reads not processed, previously deleted)\n", deletedIn.nReads, deletedIn.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (reads not processed, in a library where trimming isn't allowed)\n", noTrimIn.nReads, noTrimIn.nBases);
  fprintf(staFile, "\n");
  fprintf(staFile, "PROCESSED:\n");
  fprintf(staFile, "--------\n");
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (no overlaps)\n", noOverlaps.nReads, noOverlaps.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (no coverage after adjusting for trimming done already)\n", noCoverage.nReads, noCoverage.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (processed for chimera)\n",  readsProcChimera.nReads, readsProcChimera.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (processed for spur)\n",     readsProcSpur.nReads,    readsProcSpur.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (processed for subreads)\n", readsProcSubRead.nReads, readsProcSubRead.nBases);
  fprintf(staFile, "\n");
  fprintf(staFile, "READS WITH SIGNALS:\n");
  fprintf(staFile, "------------------\n");
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" signals (number of 5' spur signal)\n", readsBadSpur5.nReads,   readsBadSpur5.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" signals (number of 3' spur signal)\n", readsBadSpur3.nReads,   readsBadSpur3.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" signals (number of chimera signal)\n", readsBadChimera.nReads, readsBadChimera.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" signals (number of subread signal)\n", readsBadSubread.nReads, readsBadSubread.nBases);
  fprintf(staFile, "\n");
  fprintf(staFile, "SIGNALS:\n");
  fprintf(staFile, "-------\n");
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (size of 5' spur signal)\n", basesBadSpur5.nReads,   basesBadSpur5.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (size of 3' spur signal)\n", basesBadSpur3.nReads,   basesBadSpur3.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (size of chimera signal)\n", basesBadChimera.nReads, basesBadChimera.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (size of subread signal)\n", basesBadSubread.nReads, basesBadSubread.nBases);
  fprintf(staFile, "\n");
  fprintf(staFile, "TRIMMING:\n");
  fprintf(staFile, "--------\n");
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (trimmed from the 5' end of the read)\n", readsTrimmed5.nReads, readsTrimmed5.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (trimmed from the 3' end of the read)\n", readsTrimmed3.nReads, readsTrimmed3.nBases);

#if 0
  fprintf(staFile, "DELETED:\n");
  fprintf(staFile, "-------\n");
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (deleted because of both cimera and spur signals)\n", bothDeletedSmall.nReads, bothDeletedSmall.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (deleted because of chimera signal)\n", chimeraDeletedSmall.nReads, chimeraDeletedSmall.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (deleted because of spur signal)\n", spurDeletedSmall.nReads, spurDeletedSmall.nBases);
  fprintf(staFile, "\n");
  fprintf(staFile, "SPUR TYPES:\n");
  fprintf(staFile, "----------\n");
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (normal spur detected)\n", spurDetectedNormal.nReads, spurDetectedNormal.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (linker spur detected)\n", spurDetectedLinker.nReads, spurDetectedLinker.nBases);
  fprintf(staFile, "\n");
  fprintf(staFile, "CHIMERA TYPES:\n");
  fprintf(staFile, "-------------\n");
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (innie-pair chimera detected)\n", chimeraDetectedInnie.nReads, chimeraDetectedInnie.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (overhanging chimera detected)\n", chimeraDetectedOverhang.nReads, chimeraDetectedOverhang.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (gap chimera detected)\n", chimeraDetectedGap.nReads, chimeraDetectedGap.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (linker chimera detected)\n", chimeraDetectedLinker.nReads, chimeraDetectedLinker.nBases);
#endif

  //  INPUT READS  = ACCEPTED + TRIMMED + DELETED
  //  SPUR TYPE    = TRIMMED and DELETED spur and both categories
  //  CHIMERA TYPE = TRIMMED and DELETED chimera and both categories

  if (staFile != stdout)
    fclose(staFile);

  exit(0);
}
Example #4
0
int
main (int argc, char **argv) {
  char   tmpName[FILENAME_MAX] = {0};

  char  *gkpName = NULL;

  char  *tigName = NULL;
  int32  tigVers = -1;
  int32  tigPart = -1;

  int64  ctgBgn = -1;
  int64  ctgEnd = -1;

  char  *ctgName = NULL;
  char  *outName = NULL;
  char  *inName  = NULL;

  bool   forceCompute = false;

  int32  numFailures = 0;
  int32  numSkipped  = 0;

  bool   useUnitig  = false;
  bool   showResult = false;

  CNS_Options options = { CNS_OPTIONS_SPLIT_ALLELES_DEFAULT,
                          CNS_OPTIONS_MIN_ANCHOR_DEFAULT,
                          CNS_OPTIONS_DO_PHASING_DEFAULT };

  //  Comminucate to MultiAlignment_CNS.c that we are doing consensus and not cgw.
  thisIsConsensus = 1;

  argc = AS_configure(argc, argv);

  int arg=1;
  int err=0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-g") == 0) {
      gkpName = argv[++arg];

    } else if (strcmp(argv[arg], "-t") == 0) {
      tigName = argv[++arg];
      tigVers = atoi(argv[++arg]);
      tigPart = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-c") == 0) {
      AS_UTL_decodeRange(argv[++arg], ctgBgn, ctgEnd);

    } else if (strcmp(argv[arg], "-T") == 0) {
      ctgName = argv[++arg];

    } else if (strcmp(argv[arg], "-O") == 0) {
      outName = argv[++arg];
    } else if (strcmp(argv[arg], "-I") == 0) {
      inName = argv[++arg];

    } else if (strcmp(argv[arg], "-f") == 0) {
      forceCompute = true;

    } else if (strcmp(argv[arg], "-U") == 0) {
      useUnitig = true;

    } else if (strcmp(argv[arg], "-v") == 0) {
      showResult = true;

    } else if (strcmp(argv[arg], "-V") == 0) {
      VERBOSE_MULTIALIGN_OUTPUT++;

    } else if (strcmp(argv[arg], "-w") == 0) {
      options.smooth_win = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-P") == 0) {
      options.do_phasing = atoi(argv[++arg]);

    } else {
      fprintf(stderr, "%s: Unknown option '%s'\n", argv[0], argv[arg]);
      err++;
    }

    arg++;
  }
  if ((err) || (gkpName == NULL) || (tigName == NULL)) {
    fprintf(stderr, "usage: %s -g gkpStore -t tigStore version partition [opts]\n", argv[0]);
    fprintf(stderr, "    -c b         Compute only contig ID 'b' (must be in the correct partition!)\n");
    fprintf(stderr, "    -c b-e       Compute only contigs from ID 'b' to ID 'e'\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "    -T file      Test the computation of the contig layout in 'file'\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "    -f           Recompute contigs that already have a multialignment\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "    -U           Reuse the unitig consensus for contigs with only a single\n");
    fprintf(stderr, "                 unitig (EXPERIMENTAL!)\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "    -O file      Don't update tigStore, dump a binary file instead.\n");
    fprintf(stderr, "    -I file      Import binary file into tigStore\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "    -v           Show multialigns.\n");
    fprintf(stderr, "    -V           Enable debugging option 'verbosemultialign'.\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "    -w ws        Smoothing window size\n");
    fprintf(stderr, "\n");
    exit(1);
  }

  //  Open both stores for read only.
  gkpStore = new gkStore(gkpName, false, false);
  tigStore = new MultiAlignStore(tigName, tigVers, 0, tigPart, false, false, false);

  gkpStore->gkStore_loadPartition(tigPart);

  //  Decide on what to compute.  Either all contigs, or a single contig, or a special case test.
  uint32 b = 0;
  uint32 e = tigStore->numContigs();

  if (ctgBgn != -1) {
    b = ctgBgn;
    e = ctgEnd + 1;
  }

  FORCE_UNITIG_ABUT = 1;

  if (ctgName != NULL) {
    errno = 0;
    FILE         *F = fopen(ctgName, "r");
    if (errno)
      fprintf(stderr, "Failed to open input contig file '%s': %s\n", ctgName, strerror(errno)), exit(1);

    MultiAlignT  *ma       = CreateEmptyMultiAlignT();
    bool          isUnitig = false;

    while (LoadMultiAlignFromHuman(ma, isUnitig, F) == true) {
      if (ma->maID < 0)
        ma->maID = (isUnitig) ? tigStore->numUnitigs() : tigStore->numContigs();

      if (MultiAlignContig(ma, gkpStore, &options)) {
        if (showResult)
          PrintMultiAlignT(stdout, ma, gkpStore, false, false, AS_READ_CLEAR_LATEST);
      } else {
        fprintf(stderr, "MultiAlignContig()-- contig %d failed.\n", ma->maID);
        numFailures++;
      }
    }

    DeleteMultiAlignT(ma);

    b = e = 0;
  }

  //  Reopen for writing, if we have work to do.
  if (((inName) || (b < e)) && (outName == NULL)) {
    delete tigStore;
    tigStore = new MultiAlignStore(tigName, tigVers, 0, tigPart, true, false, true);
  }

  if (inName) {
    importFromFile(inName, tigPart);

    b = e = 0;
  }

  //  Now the usual case.  Iterate over all contigs, compute and update.
  for (uint32 i=b; i<e; i++) {
    MultiAlignT  *cma = tigStore->loadMultiAlign(i, false);

    if (cma == NULL) {
      //  Not in our partition, or deleted.
      continue;
    }

    bool  exists = (cma->consensus != NULL) && (GetNumchars(cma->consensus) > 1);

    if ((forceCompute == false) && (exists == true)) {
      //  Already finished contig consensus.
      fprintf(stderr, "Working on contig %d (%d unitigs and %d fragments) - already computed, skipped\n",
              cma->maID, cma->data.num_unitigs, cma->data.num_frags);

      numSkipped++;

      tigStore->unloadMultiAlign(cma->maID, false);

      continue;
    }

    int32         uID = GetIntUnitigPos(cma->u_list, 0)->ident;

    //  If this is a surrogate, we CANNOT reuse the unitig.  We need to process the contig so that
    //  the unplaced reads are stripped out.  A surrogate should have different contig and unitig
    //  IDs; we could also check the contig status.

    if ((cma->data.num_unitigs == 1) &&
        (cma->maID == uID) &&
        (useUnitig == true)) {
      fprintf(stderr, "Working on contig %d (%d unitigs and %d fragments) - reusing unitig %d consensus\n",
              cma->maID, cma->data.num_unitigs, cma->data.num_frags, uID);

      MultiAlignT  *uma = tigStore->loadMultiAlign(uID, true);

      uma->data = cma->data;

      tigStore->unloadMultiAlign(cma->maID, false);

      if (outName)
        writeToOutFile(outName, tigPart, uma);
      else
        tigStore->insertMultiAlign(uma, false, false);

      tigStore->unloadMultiAlign(uma->maID, true);

      continue;
    }

    fprintf(stderr, "Working on contig %d (%d unitigs and %d fragments)%s\n",
            cma->maID, cma->data.num_unitigs, cma->data.num_frags,
            (exists) ? " - already computed, recomputing" : "");

    if (MultiAlignContig(cma, gkpStore, &options)) {
      if (outName)
        writeToOutFile(outName, tigPart, cma);
      else
        tigStore->insertMultiAlign(cma, false, true);

      if (showResult)
        PrintMultiAlignT(stdout, cma, gkpStore, false, false, AS_READ_CLEAR_LATEST);

      tigStore->unloadMultiAlign(cma->maID, false);
    } else {
      fprintf(stderr, "MultiAlignContig()-- contig %d failed.\n", cma->maID);
      numFailures++;
    }
  }

  delete tigStore;

  fprintf(stderr, "\n");
  fprintf(stderr, "NumColumnsInUnitigs             = %d\n", NumColumnsInUnitigs);
  fprintf(stderr, "NumGapsInUnitigs                = %d\n", NumGapsInUnitigs);
  fprintf(stderr, "NumRunsOfGapsInUnitigReads      = %d\n", NumRunsOfGapsInUnitigReads);
  fprintf(stderr, "NumColumnsInContigs             = %d\n", NumColumnsInContigs);
  fprintf(stderr, "NumGapsInContigs                = %d\n", NumGapsInContigs);
  fprintf(stderr, "NumRunsOfGapsInContigReads      = %d\n", NumRunsOfGapsInContigReads);
  fprintf(stderr, "NumAAMismatches                 = %d\n", NumAAMismatches);
  fprintf(stderr, "NumVARRecords                   = %d\n", NumVARRecords);
  fprintf(stderr, "NumVARStringsWithFlankingGaps   = %d\n", NumVARStringsWithFlankingGaps);
  fprintf(stderr, "NumUnitigRetrySuccess           = %d\n", NumUnitigRetrySuccess);
  fprintf(stderr, "\n");

  if (numFailures) {
    fprintf(stderr, "WARNING:  Total number of contig failures = %d\n", numFailures);
    fprintf(stderr, "\n");
    fprintf(stderr, "Consensus did NOT finish successfully.\n");
    return(1);
  }

  fprintf(stderr, "Consensus finished successfully.  Bye.\n");
  return(0);
}
Example #5
0
//  Dump reads from a gkpStore as FASTQ (default) or FASTA.
//
//  Required options are -G (the store) and -o (the output prefix).  Optionally a clear range
//  file (-c) restricts or masks the bases written, -r restricts the read IDs, and -l restricts
//  the library.  One libOutput is allocated per library, plus one at index zero that is used
//  for every read when -nolibname is given.
//
//  Exits with status 1 on a usage error, 0 otherwise.
int
main(int argc, char **argv) {
  char            *gkpStoreName      = NULL;
  char            *outPrefix         = NULL;
  char            *outSuffix         = NULL;

  char            *clrName           = NULL;

  uint32           libToDump         = 0;   //  Zero means "all libraries".

  uint32           bgnID             = 1;
  uint32           endID             = UINT32_MAX;

  bool             dumpAllReads      = false;
  bool             dumpAllBases      = false;
  bool             dumpOnlyDeleted   = false;

  bool             dumpFASTQ         = true;
  bool             dumpFASTA         = false;

  bool             withLibName       = true;

  argc = AS_configure(argc, argv);

  int arg = 1;
  int err = 0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-G") == 0) {
      gkpStoreName = argv[++arg];

    } else if (strcmp(argv[arg], "-o") == 0) {
      outPrefix = argv[++arg];
      outSuffix = scanPrefix(outPrefix);


    } else if (strcmp(argv[arg], "-c") == 0) {
      clrName = argv[++arg];


    } else if (strcmp(argv[arg], "-l") == 0) {
      libToDump = atoi(argv[++arg]);


    } else if (strcmp(argv[arg], "-b") == 0) {   //  DEPRECATED!
      bgnID = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-e") == 0) {   //  DEPRECATED!
      endID  = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-r") == 0) {
      AS_UTL_decodeRange(argv[++arg], bgnID, endID);

    } else if (strcmp(argv[arg], "-allreads") == 0) {
      dumpAllReads    = true;

    } else if (strcmp(argv[arg], "-allbases") == 0) {
      dumpAllBases    = true;

    } else if (strcmp(argv[arg], "-onlydeleted") == 0) {
      dumpOnlyDeleted = true;
      dumpAllReads    = true;  //  Otherwise we won't report the deleted reads!


    } else if (strcmp(argv[arg], "-fastq") == 0) {
      dumpFASTQ       = true;
      dumpFASTA       = false;

    } else if (strcmp(argv[arg], "-fasta") == 0) {
      dumpFASTQ       = false;
      dumpFASTA       = true;

    } else if (strcmp(argv[arg], "-nolibname") == 0) {
      withLibName     = false;


    } else {
      err++;
      fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]);
    }
    arg++;
  }

  if (gkpStoreName == NULL)
    err++;
  if (outPrefix == NULL)
    err++;
  if (err) {
    fprintf(stderr, "usage: %s [...] -o fastq-prefix -G gkpStore\n", argv[0]);
    fprintf(stderr, "  -G gkpStore\n");
    fprintf(stderr, "  -o fastq-prefix     write files fastq-prefix.(libname).fastq, ...\n");
    fprintf(stderr, "                      if fastq-prefix is '-', all sequences output to stdout\n");
    fprintf(stderr, "                      if fastq-prefix ends in .gz, .bz2 or .xz, output is compressed\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -l libToDump        output only reads in library number libToDump\n");
    fprintf(stderr, "  -r id[-id]          output only the single read 'id', or the specified range of ids\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -c clearFile        clear range file from OBT modules\n");
    fprintf(stderr, "  -allreads           if a clear range file, lower case mask the deleted reads\n");
    fprintf(stderr, "  -allbases           if a clear range file, lower case mask the non-clear bases\n");
    fprintf(stderr, "  -onlydeleted        if a clear range file, only output deleted reads (the entire read)\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -fastq              output is FASTQ format (with extension .fastq, default)\n");
    fprintf(stderr, "  -fasta              output is FASTA format (with extension .fasta)\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -nolibname          don't include the library name in the output file name\n");
    fprintf(stderr, "\n");

    if (gkpStoreName == NULL)
      fprintf(stderr, "ERROR: no gkpStore (-G) supplied.\n");
    if (outPrefix == NULL)
      fprintf(stderr, "ERROR: no output prefix (-o) supplied.\n");

    exit(1);
  }

  gkStore        *gkpStore  = gkStore::gkStore_open(gkpStoreName);
  uint32          numReads  = gkpStore->gkStore_getNumReads();
  uint32          numLibs   = gkpStore->gkStore_getNumLibraries();

  clearRangeFile *clrRange  = (clrName == NULL) ? NULL : new clearRangeFile(clrName, gkpStore);

  //  Clamp the requested range to the reads that exist; IDs start at one.

  if (bgnID < 1)
    bgnID = 1;

  if (numReads < endID)
    endID = numReads;

  if (endID < bgnID)
    fprintf(stderr, "No reads to dump; reversed ranges make no sense: bgn="F_U32" end="F_U32"??\n", bgnID, endID);




  fprintf(stderr, "Dumping reads from %u to %u (inclusive).\n", bgnID, endID);

  libOutput   **out = new libOutput * [numLibs + 1];

  //  Allocate outputs.  If withLibName == false, all reads will artificially be in lib zero, the
  //  other files won't ever be created.  Otherwise, the zeroth file won't ever be created.

  out[0] = new libOutput(outPrefix, outSuffix, NULL);

  for (uint32 i=1; i<=numLibs; i++)
    out[i] = new libOutput(outPrefix, outSuffix, gkpStore->gkStore_getLibrary(i)->gkLibrary_libraryName());

  //  Grab a new readData, and iterate through reads to dump.

  gkReadData   *readData = new gkReadData;

  for (uint32 rid=bgnID; rid<=endID; rid++) {
    gkRead      *read   = gkpStore->gkStore_getRead(rid);

    //  libID selects the output file; with -nolibname everything goes to output zero.
    uint32       libID  = (withLibName == false) ? 0 : read->gkRead_libraryID();

    uint32       flen   = read->gkRead_sequenceLength();
    uint32       lclr   = 0;
    uint32       rclr   = flen;
    bool         ignore = false;

    //fprintf(stderr, "READ %u claims id %u length %u in lib %u\n", rid, read->gkRead_readID(), read->gkRead_sequenceLength(), libID);

    //  If a clear range file is supplied, grab the clear range.  If it hasn't been set, the default
    //  is the entire read.

    if (clrRange) {
      lclr   = clrRange->bgn(rid);
      rclr   = clrRange->end(rid);
      ignore = clrRange->isDeleted(rid);
    }

    //  Abort if we're not dumping anything from this read
    //   - not in a library we care about
    //   - deleted, and not dumping all reads
    //   - not deleted, but only reporting deleted reads
    //
    //  The library test uses the read's real library, not libID, which is forced to zero by
    //  -nolibname.

    if (((libToDump != 0) && (read->gkRead_libraryID() != libToDump)) ||
        ((dumpAllReads == false) && (ignore == true)) ||
        ((dumpOnlyDeleted == true) && (ignore == false)))
      continue;

    //  And if we're told to ignore the read, and here, then the read was deleted and we're printing
    //  all reads.  Reset the clear range to the whole read, the clear range is invalid.

    if (ignore) {
      lclr = 0;
      rclr = flen;
    }

    //  Grab the sequence and quality.

    gkpStore->gkStore_loadReadData(read, readData);

    char   *seq  = readData->gkReadData_getSequence();
    char   *qlt  = readData->gkReadData_getQualities();

    //  Soft mask not-clear bases: lower case outside the clear range, upper case inside.  Only
    //  ASCII letters are touched, so bases already in the target case are left alone.

    if (dumpAllBases == true) {
      for (uint32 i=0; i<lclr; i++)
        if ((seq[i] >= 'A') && (seq[i] <= 'Z'))
          seq[i] += 'a' - 'A';

      for (uint32 i=lclr; i<rclr; i++)
        if ((seq[i] >= 'a') && (seq[i] <= 'z'))
          seq[i] -= 'a' - 'A';

      for (uint32 i=rclr; i<flen; i++)
        if ((seq[i] >= 'A') && (seq[i] <= 'Z'))
          seq[i] += 'a' - 'A';

      //  The whole read is now written, not just the clear range.
      lclr = 0;
      rclr = flen;
    }

    //  Length of what is printed.  Computed after masking so -allbases emits the full read.

    uint32  clen = rclr - lclr;

    //  Chop off the ends we're not printing.

    seq += lclr;
    qlt += lclr;

    seq[clen] = 0;
    qlt[clen] = 0;

    //  Print the read.

    if (dumpFASTA)
      AS_UTL_writeFastA(out[libID]->getFASTA(), seq, clen, 100,
                        ">"F_U32" clr="F_U32","F_U32"\n",
                        rid, lclr, rclr);

    if (dumpFASTQ)
      AS_UTL_writeFastQ(out[libID]->getFASTQ(), seq, clen, qlt, clen,
                        "@"F_U32" clr="F_U32","F_U32"\n",
                        rid, lclr, rclr);
  }

  delete clrRange;

  delete readData;

  for (uint32 i=0; i<=numLibs; i++)
    delete out[i];
  delete [] out;

  gkpStore->gkStore_close();

  exit(0);
}
Example #6
0
File: utgcns.C Project: swang8/canu
int
main (int argc, char **argv) {
  char    *gkpName         = NULL;

  char    *tigName         = NULL;
  uint32   tigVers         = UINT32_MAX;
  uint32   tigPart         = UINT32_MAX;

  char    *tigFileName     = NULL;

  uint32   utgBgn          = UINT32_MAX;
  uint32   utgEnd          = UINT32_MAX;

  char    *outResultsName  = NULL;
  char    *outLayoutsName  = NULL;
  char    *outSeqNameA     = NULL;
  char    *outSeqNameQ     = NULL;
  char    *outPackageName  = NULL;

  FILE     *outResultsFile = NULL;
  FILE     *outLayoutsFile = NULL;
  FILE     *outSeqFileA    = NULL;
  FILE     *outSeqFileQ    = NULL;
  FILE     *outPackageFile = NULL;

  char    *inPackageName   = NULL;

  char      algorithm      = 'P';
  uint32    numThreads	   = 0;

  bool      forceCompute   = false;

  double    errorRate      = 0.12;
  double    errorRateMax   = 0.40;
  uint32    minOverlap     = 40;

  int32     numFailures    = 0;

  bool      showResult     = false;

  double    maxCov         = 0.0;
  uint32    maxLen         = UINT32_MAX;

  uint32    verbosity      = 0;

  argc = AS_configure(argc, argv);

  int arg=1;
  int err=0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-G") == 0) {
      gkpName = argv[++arg];

    } else if (strcmp(argv[arg], "-T") == 0) {
      tigName = argv[++arg];
      tigVers = atoi(argv[++arg]);
      tigPart = atoi(argv[++arg]);

      if (argv[arg][0] == '.')
        tigPart = UINT32_MAX;

      if (tigVers == 0)
        fprintf(stderr, "invalid tigStore version (-T store version partition) '-t %s %s %s'.\n", argv[arg-2], argv[arg-1], argv[arg]), exit(1);
      if (tigPart == 0)
        fprintf(stderr, "invalid tigStore partition (-T store version partition) '-t %s %s %s'.\n", argv[arg-2], argv[arg-1], argv[arg]), exit(1);

    } else if (strcmp(argv[arg], "-u") == 0) {
      AS_UTL_decodeRange(argv[++arg], utgBgn, utgEnd);

    } else if (strcmp(argv[arg], "-t") == 0) {
      tigFileName = argv[++arg];

    } else if (strcmp(argv[arg], "-O") == 0) {
      outResultsName = argv[++arg];

    } else if (strcmp(argv[arg], "-L") == 0) {
      outLayoutsName = argv[++arg];

    } else if (strcmp(argv[arg], "-A") == 0) {
      outSeqNameA = argv[++arg];

    } else if (strcmp(argv[arg], "-Q") == 0) {
      outSeqNameQ = argv[++arg];

    } else if (strcmp(argv[arg], "-quick") == 0) {
      algorithm = 'Q';
    } else if (strcmp(argv[arg], "-pbdagcon") == 0) {
      algorithm = 'P';
    } else if (strcmp(argv[arg], "-utgcns") == 0) {
      algorithm = 'U';

    } else if (strcmp(argv[arg], "-threads") == 0) {
      numThreads = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-p") == 0) {
      inPackageName = argv[++arg];

    } else if (strcmp(argv[arg], "-P") == 0) {
      outPackageName = argv[++arg];

    } else if (strcmp(argv[arg], "-e") == 0) {
      errorRate = atof(argv[++arg]);

    } else if (strcmp(argv[arg], "-em") == 0) {
      errorRateMax = atof(argv[++arg]);

    } else if (strcmp(argv[arg], "-l") == 0) {
      minOverlap = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-f") == 0) {
      forceCompute = true;

    } else if (strcmp(argv[arg], "-v") == 0) {
      showResult = true;

    } else if (strcmp(argv[arg], "-V") == 0) {
      verbosity++;

    } else if (strcmp(argv[arg], "-maxcoverage") == 0) {
      maxCov   = atof(argv[++arg]);

    } else if (strcmp(argv[arg], "-maxlength") == 0) {
      maxLen   = atof(argv[++arg]);

    } else {
      fprintf(stderr, "%s: Unknown option '%s'\n", argv[0], argv[arg]);
      err++;
    }

    arg++;
  }

  if ((gkpName == NULL) && (inPackageName == NULL))
    err++;

  if ((tigFileName == NULL) && (tigName == NULL) && (inPackageName == NULL))
    err++;

  if (err) {
    fprintf(stderr, "usage: %s [opts]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "  INPUT\n");
    fprintf(stderr, "    -G g            Load reads from gkStore 'g'\n");
    fprintf(stderr, "    -T t v p        Load unitigs from tgStore 't', version 'v', partition 'p'.\n");
    fprintf(stderr, "                      Expects reads will be in gkStore partition 'p' as well\n");
    fprintf(stderr, "                      Use p='.' to specify no partition\n");
    fprintf(stderr, "    -t file         Test the computation of the unitig layout in 'file'\n");
    fprintf(stderr, "                      'file' can be from:\n");
    fprintf(stderr, "                        'tgStoreDump -d layout' (human readable layout format)\n");
    fprintf(stderr, "                        'utgcns -L'             (human readable layout format)\n");
    fprintf(stderr, "                        'utgcns -O'             (binary multialignment format)\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "    -p package      Load unitig and read from 'package' created with -P.  This\n");
    fprintf(stderr, "                    is usually used by developers.\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  ALGORITHM\n");
    fprintf(stderr, "    -quick          No alignments, just paste read sequence into the unitig positions.\n");
    fprintf(stderr, "                    This is very fast, but the consensus sequence is formed from a mosaic\n");
    fprintf(stderr, "                    of read sequences, and there can be large indel.  This is useful for\n");
    fprintf(stderr, "                    checking intermediate assembly structure by mapping to reference, or\n");
    fprintf(stderr, "                    possibly for use as input to a polishing step.\n");
    fprintf(stderr, "    -pbdagcon       Use pbdagcon (https://github.com/PacificBiosciences/pbdagcon).\n");
    fprintf(stderr, "                    This is fast and robust.  It is the default algorithm.  It does not\n");
    fprintf(stderr, "                    generate a final multialignment output (the -v option will not show\n");
    fprintf(stderr, "                    anything useful).\n");
    fprintf(stderr, "    -utgcns         Use utgcns (the original Celera Assembler consensus algorithm)\n");
    fprintf(stderr, "                    This isn't as fast, isn't as robust, but does generate a final multialign\n");
    fprintf(stderr, "                    output.\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  OUTPUT\n");
    fprintf(stderr, "    -O results      Write computed tigs to binary output file 'results'\n");
    fprintf(stderr, "    -L layouts      Write computed tigs to layout output file 'layouts'\n");
    fprintf(stderr, "    -A fasta        Write computed tigs to fasta  output file 'fasta'\n");
    fprintf(stderr, "    -Q fastq        Write computed tigs to fastq  output file 'fastq'\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "    -P package      Create a copy of the inputs needed to compute the unitigs.  This\n");
    fprintf(stderr, "                    file can then be sent to the developers for debugging.  The unitig(s)\n");
    fprintf(stderr, "                    are not processed and no other outputs are created.  Ideally,\n");
    fprintf(stderr, "                    only one unitig is selected (-u, below).\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  TIG SELECTION (if -T input is used)\n");
    fprintf(stderr, "    -u b            Compute only unitig ID 'b' (must be in the correct partition!)\n");
    fprintf(stderr, "    -u b-e          Compute only unitigs from ID 'b' to ID 'e'\n");
    fprintf(stderr, "    -f              Recompute unitigs that already have a multialignment\n");
    fprintf(stderr, "    -maxlength l    Do not compute consensus for unitigs longer than l bases.\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  PARAMETERS\n");
    fprintf(stderr, "    -e e            Expect alignments at up to fraction e error\n");
    fprintf(stderr, "    -em m           Don't ever allow alignments more than fraction m error\n");
    fprintf(stderr, "    -l l            Expect alignments of at least l bases\n");
    fprintf(stderr, "    -maxcoverage c  Use non-contained reads and the longest contained reads, up to\n");
    fprintf(stderr, "                    C coverage, for consensus generation.  The default is 0, and will\n");
    fprintf(stderr, "                    use all reads.\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  LOGGING\n");
    fprintf(stderr, "    -v              Show multialigns.\n");
    fprintf(stderr, "    -V              Enable debugging option 'verbosemultialign'.\n");
    fprintf(stderr, "\n");


    if ((gkpName == NULL) && (inPackageName == NULL))
      fprintf(stderr, "ERROR:  No gkpStore (-G) and no package (-p) supplied.\n");

    if ((tigFileName == NULL) && (tigName == NULL)  && (inPackageName == NULL))
      fprintf(stderr, "ERROR:  No tigStore (-T) OR no test unitig (-t) OR no package (-p)  supplied.\n");

    exit(1);
  }

  errno = 0;

  //  Open output files.  If we're creating a package, the usual output files are not opened.

  if (outPackageName)
    outPackageFile = fopen(outPackageName, "w");
  if (errno)
    fprintf(stderr, "Failed to open output package file '%s': %s\n", outPackageName, strerror(errno)), exit(1);

  if ((outResultsName) && (outPackageName == NULL))
    outResultsFile = fopen(outResultsName, "w");
  if (errno)
    fprintf(stderr, "Failed to open output results file '%s': %s\n", outResultsName, strerror(errno)), exit(1);

  if ((outLayoutsName) && (outPackageName == NULL))
    outLayoutsFile = fopen(outLayoutsName, "w");
  if (errno)
    fprintf(stderr, "Failed to open output layout file '%s': %s\n", outLayoutsName, strerror(errno)), exit(1);

  if ((outSeqNameA) && (outPackageName == NULL))
    outSeqFileA = fopen(outSeqNameA, "w");
  if (errno)
    fprintf(stderr, "Failed to open output FASTA file '%s': %s\n", outSeqNameA, strerror(errno)), exit(1);

  if ((outSeqNameQ) && (outPackageName == NULL))
    outSeqFileQ = fopen(outSeqNameQ, "w");
  if (errno)
    fprintf(stderr, "Failed to open output FASTQ file '%s': %s\n", outSeqNameQ, strerror(errno)), exit(1);

  if (numThreads > 0) {
    omp_set_num_threads(numThreads);
    fprintf(stderr, "number of threads     = %d (command line)\n", numThreads);
    fprintf(stderr, "\n");
  } else {
    fprintf(stderr, "number of threads     = %d (OpenMP default)\n", omp_get_max_threads());
    fprintf(stderr, "\n");
  }

  //  Open gatekeeper for read only, and load the partitioned data if tigPart > 0.

  gkStore                   *gkpStore          = NULL;
  tgStore                   *tigStore          = NULL;
  FILE                      *tigFile           = NULL;
  FILE                      *inPackageFile     = NULL;
  map<uint32, gkRead *>     *inPackageRead     = NULL;
  map<uint32, gkReadData *> *inPackageReadData = NULL;

  if (gkpName) {
    fprintf(stderr, "-- Opening gkpStore '%s' partition %u.\n", gkpName, tigPart);
    gkpStore = gkStore::gkStore_open(gkpName, gkStore_readOnly, tigPart);
  }

  if (tigName) {
    fprintf(stderr, "-- Opening tigStore '%s' version %u.\n", tigName, tigVers);
    tigStore = new tgStore(tigName, tigVers);
  }

  if (tigFileName) {
    fprintf(stderr, "-- Opening tigFile '%s'.\n", tigFileName);

    errno = 0;
    tigFile = fopen(tigFileName, "r");
    if (errno)
      fprintf(stderr, "Failed to open input tig file '%s': %s\n", tigFileName, strerror(errno)), exit(1);
  }

  if (inPackageName) {
    fprintf(stderr, "-- Opening package file '%s'.\n", inPackageName);

    errno = 0;
    inPackageFile = fopen(inPackageName, "r");
    if (errno)
      fprintf(stderr, "Failed to open input package file '%s': %s\n", inPackageName, strerror(errno)), exit(1);
  }

  //  Report some sizes.

  fprintf(stderr, "sizeof(abBead)     "F_SIZE_T"\n", sizeof(abBead));
  fprintf(stderr, "sizeof(abColumn)   "F_SIZE_T"\n", sizeof(abColumn));
  fprintf(stderr, "sizeof(abAbacus)   "F_SIZE_T"\n", sizeof(abAbacus));
  fprintf(stderr, "sizeof(abSequence) "F_SIZE_T"\n", sizeof(abSequence));

  //  Decide on what to compute.  Either all unitigs, or a single unitig, or a special case test.

  uint32  b = 0;
  uint32  e = UINT32_MAX;

  if (tigStore) {
    if (utgEnd > tigStore->numTigs() - 1)
      utgEnd = tigStore->numTigs() - 1;

    if (utgBgn != UINT32_MAX) {
      b = utgBgn;
      e = utgEnd;

    } else {
      b = 0;
      e = utgEnd;
    }

    fprintf(stderr, "-- Computing unitig consensus for b="F_U32" to e="F_U32" with errorRate %0.4f (max %0.4f) and minimum overlap "F_U32"\n",
            b, e, errorRate, errorRateMax, minOverlap);
  }

  else {
    fprintf(stderr, "-- Computing unitig consensus with errorRate %0.4f (max %0.4f) and minimum overlap "F_U32"\n",
            errorRate, errorRateMax, minOverlap);
  }

  fprintf(stderr, "\n");

  //  I don't like this loop control.

  for (uint32 ti=b; (e == UINT32_MAX) || (ti <= e); ti++) {
    tgTig  *tig = NULL;

    //  If a tigStore, load the tig.  The tig is the owner; it cannot be deleted by us.
    if (tigStore)
      tig = tigStore->loadTig(ti);

    //  If a tigFile or a package, create a new tig and fill it.  Obviously, we own it.
    if (tigFile || inPackageFile) {
      tig = new tgTig();

      if (tig->loadFromStreamOrLayout((tigFile != NULL) ? tigFile : inPackageFile) == false) {
        delete tig;
        break;
      }
    }

    //  No tig loaded, keep going.

    if (tig == NULL)
      continue;

    //  If a package, populate the read and readData maps with data from the package.

    if (inPackageFile) {
      inPackageRead      = new map<uint32, gkRead *>;
      inPackageReadData  = new map<uint32, gkReadData *>;

      for (int32 ii=0; ii<tig->numberOfChildren(); ii++) {
        uint32       readID = tig->getChild(ii)->ident();
        gkRead      *read   = (*inPackageRead)[readID]     = new gkRead;
        gkReadData  *data   = (*inPackageReadData)[readID] = new gkReadData;

        gkStore::gkStore_loadReadFromStream(inPackageFile, read, data);

        if (read->gkRead_readID() != readID)
          fprintf(stderr, "ERROR: package not in sync with tig.  package readID = %u  tig readID = %u\n",
                  read->gkRead_readID(), readID);
        assert(read->gkRead_readID() == readID);
      }
    }

    //  More 'not liking' - set the verbosity level for logging.

    tig->_utgcns_verboseLevel = verbosity;

    //  Are we partitioned?  Is this tig in our partition?

    if (tigPart != UINT32_MAX) {
      uint32  missingReads = 0;

      for (uint32 ii=0; ii<tig->numberOfChildren(); ii++)
        if (gkpStore->gkStore_getReadInPartition(tig->getChild(ii)->ident()) == NULL)
          missingReads++;

      if (missingReads) {
        //fprintf(stderr, "SKIP unitig %u with %u reads found only %u reads in partition, skipped\n",
        //        tig->tigID(), tig->numberOfChildren(), tig->numberOfChildren() - missingReads);
        continue;
      }
    }

    if (tig->length(true) > maxLen) {
      fprintf(stderr, "SKIP unitig %d of length %d (%d children) - too long, skipped\n",
              tig->tigID(), tig->length(true), tig->numberOfChildren());
      continue;
    }

    if (tig->numberOfChildren() == 0) {
      fprintf(stderr, "SKIP unitig %d of length %d (%d children) - no children, skipped\n",
              tig->tigID(), tig->length(true), tig->numberOfChildren());
      continue;
    }

    bool exists   = tig->consensusExists();

    if (tig->numberOfChildren() > 1)
      fprintf(stderr, "Working on unitig %d of length %d (%d children)%s%s\n",
              tig->tigID(), tig->length(true), tig->numberOfChildren(),
              ((exists == true)  && (forceCompute == false)) ? " - already computed"              : "",
              ((exists == true)  && (forceCompute == true))  ? " - already computed, recomputing" : "");

    //  Process the tig.  Remove deep coverage, create a consensus object, process it, and report the results.
    //  before we add it to the store.

    unitigConsensus  *utgcns       = new unitigConsensus(gkpStore, errorRate, errorRateMax, minOverlap);
    savedChildren    *origChildren = NULL;
    bool              success      = exists;

    //  Save the tig in the package?
    //
    //  The original idea was to dump the tig and all the reads, then load the tig and process as normal.
    //  Sadly, stashContains() rearranges the order of the reads even if it doesn't remove any.  The rearranged
    //  tig couldn't be saved (otherwise it would be rearranged again).  So, we were in the position of
    //  needing to save the original tig and the rearranged reads.  Impossible.
    //
    //  Instead, we save the original tig and original reads -- including any that get stashed -- then
    //  load them all back into a map for use in consensus proper.  It's a bit of a pain, and could
    //  have way more reads saved than necessary.

    if (outPackageFile) {
      utgcns->savePackage(outPackageFile, tig);
      fprintf(stderr, "  Packaged unitig %u into '%s'\n", tig->tigID(), outPackageName);
    }

    //  Compute consensus if it doesn't exist, or if we're forcing a recompute.  But only if we
    //  didn't just package it.

    if ((outPackageFile == NULL) &&
        ((exists == false) || (forceCompute == true))) {
      origChildren = stashContains(tig, maxCov, true);

      switch (algorithm) {
        case 'Q':
          success = utgcns->generateQuick(tig, inPackageRead, inPackageReadData);
          break;
        case 'P':
        default:
          success = utgcns->generatePBDAG(tig, inPackageRead, inPackageReadData);
          break;
        case 'U':
          success = utgcns->generate(tig, inPackageRead, inPackageReadData);
          break;
      }
    }

    //  If it was successful (or existed already), output.  Success is always false if the unitig
    //  was packaged, regardless of if it existed already.

    if (success == true) {
      if ((showResult) && (gkpStore))  //  No gkpStore if we're from a package.  Dang.
        tig->display(stdout, gkpStore, 200, 3);

      unstashContains(tig, origChildren);

      if (outResultsFile)
        tig->saveToStream(outResultsFile);

      if (outLayoutsFile)
        tig->dumpLayout(outLayoutsFile);

      if (outSeqFileA)
        tig->dumpFASTA(outSeqFileA, true);

      if (outSeqFileQ)
        tig->dumpFASTQ(outSeqFileQ, true);
    }

    //  Report failures.

    if ((success == false) && (outPackageFile == NULL)) {
      fprintf(stderr, "unitigConsensus()-- unitig %d failed.\n", tig->tigID());
      numFailures++;
    }

    //  Clean up, unloading or deleting the tig.

    delete utgcns;        //  No real reason to keep this until here.
    delete origChildren;  //  Need to keep it until after we display() above.

    if (tigStore)
      tigStore->unloadTig(tig->tigID(), true);  //  Tell the store we're done with it

    if (tigFile)
      delete tig;
  }

 finish:
  delete tigStore;

  gkpStore->gkStore_close();

  if (tigFile)         fclose(tigFile);
  if (outResultsFile)  fclose(outResultsFile);
  if (outLayoutsFile)  fclose(outLayoutsFile);
  if (outPackageFile)  fclose(outPackageFile);
  if (inPackageFile)   fclose(inPackageFile);

  if (numFailures) {
    fprintf(stderr, "WARNING:  Total number of unitig failures = %d\n", numFailures);
    fprintf(stderr, "\n");
    fprintf(stderr, "Consensus did NOT finish successfully.\n");

  } else {
    fprintf(stderr, "Consensus finished successfully.  Bye.\n");
  }

  return(numFailures != 0);
}
int
main(int argc, char **argv) {
  char  bolfile_name[FILENAME_MAX] = {0};
  char  Outfile_Name[FILENAME_MAX] = {0};
  int  illegal;
  char  * p;

  argc = AS_configure(argc, argv);
  Min_Olap_Len = AS_OVERLAP_MIN_LEN; // set after configure

  int err=0;
  int arg=1;
  while (arg < argc) {
    if (strcmp(argv[arg], "-G") == 0) {
      Doing_Partial_Overlaps = TRUE;
    } else if (strcmp(argv[arg], "-h") == 0) {
      AS_UTL_decodeRange(argv[++arg], Lo_Hash_Frag, Hi_Hash_Frag);

    } else if (strcmp(argv[arg], "-H") == 0) {
      AS_UTL_decodeRange(argv[++arg], minLibToHash, maxLibToHash);

    } else if (strcmp(argv[arg], "-R") == 0) {
      AS_UTL_decodeRange(argv[++arg], minLibToRef, maxLibToRef);

    } else if (strcmp(argv[arg], "-k") == 0) {
      arg++;
      if ((isdigit(argv[arg][0]) && (argv[arg][1] == 0)) ||
          (isdigit(argv[arg][0]) && isdigit(argv[arg][1]) && (argv[arg][2] == 0))) {
        Kmer_Len = strtoull(argv[arg], NULL, 10);
      } else {
        errno = 0;
        Kmer_Skip_File = fopen(argv[arg], "r");
        if (errno)
          fprintf(stderr, "ERROR: Failed to open -k '%s': %s\n", argv[arg], strerror(errno)), exit(1);
      }

    } else if (strcmp(argv[arg], "-l") == 0) {
      Frag_Olap_Limit = strtol(argv[++arg], NULL, 10);
      if  (Frag_Olap_Limit < 1)
        Frag_Olap_Limit = INT_MAX;

    } else if (strcmp(argv[arg], "-m") == 0) {
      Unique_Olap_Per_Pair = FALSE;

    } else if (strcmp(argv[arg], "--hashbits") == 0) {
      Hash_Mask_Bits = strtoull(argv[++arg], NULL, 10);

    } else if (strcmp(argv[arg], "--hashstrings") == 0) {
      Max_Hash_Strings = strtoull(argv[++arg], NULL, 10);

    } else if (strcmp(argv[arg], "--hashdatalen") == 0) {
      Max_Hash_Data_Len = strtoull(argv[++arg], NULL, 10);

    } else if (strcmp(argv[arg], "--hashload") == 0) {
      Max_Hash_Load = atof(argv[++arg]);

    } else if (strcmp(argv[arg], "--maxreadlen") == 0) {
      //  Quite the gross way to do this, but simple.
      uint32 desired = strtoul(argv[++arg], NULL, 10);
      OFFSET_BITS = 1;
      while (((uint32)1 << OFFSET_BITS) < desired)
        OFFSET_BITS++;

      STRING_NUM_BITS       = 30 - OFFSET_BITS;

      STRING_NUM_MASK       = (1 << STRING_NUM_BITS) - 1;
      OFFSET_MASK           = (1 << OFFSET_BITS) - 1;

      MAX_STRING_NUM        = STRING_NUM_MASK;

    } else if (strcmp(argv[arg], "--readsperbatch") == 0) {
      Max_Reads_Per_Batch = strtoul(argv[++arg], NULL, 10);

    } else if (strcmp(argv[arg], "--readsperthread") == 0) {
      Max_Reads_Per_Thread = strtoul(argv[++arg], NULL, 10);

    } else if (strcmp(argv[arg], "-o") == 0) {
      strcpy(Outfile_Name, argv[++arg]);

    } else if (strcmp(argv[arg], "-r") == 0) {
      AS_UTL_decodeRange(argv[++arg], Lo_Old_Frag, Hi_Old_Frag);

    } else if (strcmp(argv[arg], "-t") == 0) {
      Num_PThreads = strtoull(argv[++arg], NULL, 10);

    } else if (strcmp(argv[arg], "-u") == 0) {
      Unique_Olap_Per_Pair = TRUE;

    } else if (strcmp(argv[arg], "-v") == 0) {
      Min_Olap_Len = (int) strtol (argv[++arg], & p, 10);

    } else if (strcmp(argv[arg], "-w") == 0) {
      Use_Window_Filter = TRUE;

    } else if (strcmp(argv[arg], "-x") == 0) {
      Ignore_Clear_Range = TRUE;

    } else if (strcmp(argv[arg], "-z") == 0) {
      Use_Hopeless_Check = FALSE;

    } else {
      if (Frag_Store_Path == NULL) {
        Frag_Store_Path = argv[arg];
      } else {
        fprintf(stderr, "Unknown option '%s'\n", argv[arg]);
        err++;
      }
    }
    arg++;
  }

  //  Fix up some flags if we're allowing high error rates.
  //
  if (AS_OVL_ERROR_RATE > 0.06) {
    if (Use_Window_Filter)
      fprintf(stderr, "High error rates requested -- window-filter turned off despite -w flag!\n");
    Use_Window_Filter  = FALSE;
    Use_Hopeless_Check = FALSE;
  }

  if (Max_Hash_Strings == 0)
    fprintf(stderr, "* No memory model supplied; -M needed!\n"), err++;

  if (Kmer_Len == 0)
    fprintf(stderr, "* No kmer length supplied; -k needed!\n"), err++;

  if (Max_Hash_Strings > MAX_STRING_NUM)
    fprintf(stderr, "Too many strings (--hashstrings), must be less than "F_U64"\n", MAX_STRING_NUM), err++;

  if (Outfile_Name[0] == 0)
    fprintf (stderr, "ERROR:  No output file name specified\n"), err++;

  if ((err) || (Frag_Store_Path == NULL)) {
    fprintf(stderr, "USAGE:  %s [options] <gkpStorePath>\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "-b <fn>     in contig mode, specify the output file\n");
    fprintf(stderr, "-c          contig mode.  Use 2 frag stores.  First is\n");
    fprintf(stderr, "            for reads; second is for contigs\n");
    fprintf(stderr, "-G          do partial overlaps\n");
    fprintf(stderr, "-h <range>  to specify fragments to put in hash table\n");
    fprintf(stderr, "            Implies LSF mode (no changes to frag store)\n");
    fprintf(stderr, "-I          designate a file of frag iids to limit olaps to\n");
    fprintf(stderr, "            (Contig mode only)\n");
    fprintf(stderr, "-k          if one or two digits, the length of a kmer, otherwise\n");
    fprintf(stderr, "            the filename containing a list of kmers to ignore in\n");
    fprintf(stderr, "            the hash table\n");
    fprintf(stderr, "-l          specify the maximum number of overlaps per\n");
    fprintf(stderr, "            fragment-end per batch of fragments.\n");
    fprintf(stderr, "-m          allow multiple overlaps per oriented fragment pair\n");
    fprintf(stderr, "-M          specify memory size.  Valid values are '8GB', '4GB',\n");
    fprintf(stderr, "            '2GB', '1GB', '256MB'.  (Not for Contig mode)\n");
    fprintf(stderr, "-o          specify output file name\n");
    fprintf(stderr, "-P          write protoIO output (if not -G)\n");
    fprintf(stderr, "-r <range>  specify old fragments to overlap\n");
    fprintf(stderr, "-s          ignore screen information with fragments\n");
    fprintf(stderr, "-t <n>      use <n> parallel threads\n");
    fprintf(stderr, "-u          allow only 1 overlap per oriented fragment pair\n");
    fprintf(stderr, "-v <n>      only output overlaps of <n> or more bases\n");
    fprintf(stderr, "-w          filter out overlaps with too many errors in a window\n");
    fprintf(stderr, "-x          ignore the clear ranges on reads and use the \n");
    fprintf(stderr, "            full sequence\n");
    fprintf(stderr, "-z          skip the hopeless check\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "--hashbits n       Use n bits for the hash mask.\n");
    fprintf(stderr, "--hashstrings n    Load at most n strings into the hash table at one time.\n");
    fprintf(stderr, "--hashdatalen n    Load at most n bytes into the hash table at one time.\n");
    fprintf(stderr, "--hashload f       Load to at most 0.0 < f < 1.0 capacity (default 0.7).\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "--maxreadlen n     For batches with all short reads, pack bits differently to\n");
    fprintf(stderr, "                   process more reads per batch.\n");
    fprintf(stderr, "                     all reads must be shorter than n\n");
    fprintf(stderr, "                     --hashstrings limited to 2^(30-m)\n");
    fprintf(stderr, "                   Common values:\n");
    fprintf(stderr, "                     maxreadlen 2048 -> hashstrings  524288 (default)\n");
    fprintf(stderr, "                     maxreadlen  512 -> hashstrings 2097152\n");
    fprintf(stderr, "                     maxreadlen  128 -> hashstrings 8388608\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "--readsperbatch n  Force batch size to n.\n");
    fprintf(stderr, "--readsperthread n Force each thread to process n reads.\n");
    fprintf(stderr, "\n");
    exit(1);
  }

  assert(NULL == Out_BOF);

  Out_BOF    = AS_OVS_createBinaryOverlapFile(Outfile_Name, FALSE);

  //  Adjust the number of reads to load into memory at once (for processing, not the hash table),

  if (Max_Reads_Per_Batch == 0)
    Max_Reads_Per_Batch = (Max_Hash_Strings < 100000) ? Max_Hash_Strings : 100000;

  //if (Max_Hash_Strings < Max_Reads_Per_Batch)
  //  Max_Reads_Per_Batch = Max_Hash_Strings;

  //  Adjust the number of reads processed per thread.  Default to having four blocks per thread,
  //  but make sure that (a) all threads have work to do, and (b) batches are not minuscule.

  if (Max_Reads_Per_Thread == 0)
    Max_Reads_Per_Thread = Max_Reads_Per_Batch / (4 * Num_PThreads);

  if (Max_Reads_Per_Thread * Num_PThreads > Max_Reads_Per_Batch)
    Max_Reads_Per_Thread = Max_Reads_Per_Batch / Num_PThreads + 1;

  if (Max_Reads_Per_Thread < 10)
    Max_Reads_Per_Thread = 10;

  //  We know enough now to set the hash function variables, and some other random variables.

  HSF1 = Kmer_Len - (Hash_Mask_Bits / 2);
  HSF2 = 2 * Kmer_Len - Hash_Mask_Bits;
  SV1  = HSF1 + 2;
  SV2  = (HSF1 + HSF2) / 2;
  SV3  = HSF2 - 2;

  Branch_Match_Value = (Doing_Partial_Overlaps) ? PARTIAL_BRANCH_MATCH_VAL : DEFAULT_BRANCH_MATCH_VAL;
  Branch_Error_Value = Branch_Match_Value - 1.0;

  fprintf(stderr, "\n");
  fprintf(stderr, "STRING_NUM_BITS       "F_U32"\n", STRING_NUM_BITS);
  fprintf(stderr, "OFFSET_BITS           "F_U32"\n", OFFSET_BITS);
  fprintf(stderr, "STRING_NUM_MASK       "F_U64"\n", STRING_NUM_MASK);
  fprintf(stderr, "OFFSET_MASK           "F_U64"\n", OFFSET_MASK);
  fprintf(stderr, "MAX_STRING_NUM        "F_U64"\n", MAX_STRING_NUM);
  fprintf(stderr, "\n");
  fprintf(stderr, "Hash_Mask_Bits        "F_U32"\n", Hash_Mask_Bits);
  fprintf(stderr, "Max_Hash_Strings      "F_U32"\n", Max_Hash_Strings);
  fprintf(stderr, "Max_Hash_Data_Len     "F_U64"\n", Max_Hash_Data_Len);
  fprintf(stderr, "Max_Hash_Load         %f\n", Max_Hash_Load);
  fprintf(stderr, "Kmer Length           %d\n", (int)Kmer_Len);
  fprintf(stderr, "Min Overlap Length    %d\n", Min_Olap_Len);
  fprintf(stderr, "MAX_ERRORS            %d\n", MAX_ERRORS);
  fprintf(stderr, "ERRORS_FOR_FREE       %d\n", ERRORS_FOR_FREE);
  fprintf(stderr, "\n");
  fprintf(stderr, "Num_PThreads          "F_U32"\n", Num_PThreads);
  fprintf(stderr, "Max_Reads_Per_Batch   "F_U32"\n", Max_Reads_Per_Batch);
  fprintf(stderr, "Max_Reads_Per_Thread  "F_U32"\n", Max_Reads_Per_Thread);

  assert (8 * sizeof (uint64) > 2 * Kmer_Len);

  Initialize_Globals ();

  OldFragStore = new gkStore(Frag_Store_Path, FALSE, FALSE);

  /****************************************/
  OverlapDriver();
  /****************************************/

  fprintf (stderr, " Kmer hits without olaps = "F_S64"\n", Kmer_Hits_Without_Olap_Ct);
  fprintf (stderr, "    Kmer hits with olaps = "F_S64"\n", Kmer_Hits_With_Olap_Ct);
  fprintf (stderr, "  Multiple overlaps/pair = "F_S64"\n", Multi_Overlap_Ct);
  fprintf (stderr, " Total overlaps produced = "F_S64"\n", Total_Overlaps);
  fprintf (stderr, "      Contained overlaps = "F_S64"\n", Contained_Overlap_Ct);
  fprintf (stderr, "       Dovetail overlaps = "F_S64"\n", Dovetail_Overlap_Ct);
  fprintf (stderr, "Rejected by short window = "F_S64"\n", Bad_Short_Window_Ct);
  fprintf (stderr, " Rejected by long window = "F_S64"\n", Bad_Long_Window_Ct);

  delete OldFragStore;

  AS_OVS_closeBinaryOverlapFile(Out_BOF);

  return(0);
}