Пример #1
//  Entry point.  Opens the overlap store named by -O, then reads protocol
//  messages from stdin and writes each one to stdout; messages of type
//  MESG_IUM are handed to fixUnitig() together with the open store first.
//
//  NOTE(review): the braces in this listing do not balance, so some lines
//  appear to be missing.  Comments describe only the statements visible here.
main(int argc, char **argv) {
    OverlapStore  *ovs        = NULL;

    argc = AS_configure(argc, argv);

    int arg=1;
    int err=0;
    while (arg < argc) {
        //  -O <ovlStore>: the next argument is passed to AS_OVS_openOverlapStore().
        if        (strcmp(argv[arg], "-O") == 0) {
            ovs = AS_OVS_openOverlapStore(argv[++arg]);
        } else {

    //  No store was opened, or err is set: print the usage line.
    if ((ovs == NULL) || (err)) {
        fprintf(stderr, "usage: %s -O ovlStore < unitigs.cgb > fixedUnitigs.cgb\n", argv[0]);

    GenericMesg   *pmesg = NULL;
    //  Loop until ReadProtoMesg_AS() reports EOF.
    while ((ReadProtoMesg_AS(stdin, &pmesg) != EOF)) {
        //  Only IUM messages are passed to fixUnitig(); every message is written out.
        if (pmesg->t == MESG_IUM)
            fixUnitig((IntUnitigMesg *)(pmesg->m), ovs);

        WriteProtoMesg_AS(stdout, pmesg);

Пример #2
//  Entry point.  Reads the -G (gatekeeper store path) and -T (tig store path
//  plus version) options, prints usage and ERROR lines when something required
//  is missing, and finally calls operationCompress(tigName, tigVers).
//
//  NOTE(review): the braces in this listing do not balance, so some lines
//  appear to be missing.  Comments describe only the statements visible here.
main (int argc, char **argv) {
  char            *gkpName   = NULL;
  char            *tigName   = NULL;
  int32            tigVers   = -1;
  vector<char *>   tigInputs;
  tgStoreType      tigType   = tgStoreModify;

  argc = AS_configure(argc, argv);

  int arg=1;
  int err=0;
  while (arg < argc) {
    //  -G <gkpStore>: one argument.
    if        (strcmp(argv[arg], "-G") == 0) {
      gkpName = argv[++arg];

    //  -T <tigStore> <v>: two arguments, the path and then the version (parsed with atoi).
    } else if (strcmp(argv[arg], "-T") == 0) {
      tigName = argv[++arg];
      tigVers = atoi(argv[++arg]);

    } else {
      fprintf(stderr, "%s: unknown option '%s'\n", argv[0], argv[arg]);

  //  NOTE(review): nothing visible in this listing ever adds to tigInputs, yet
  //  an empty tigInputs sends us into the usage branch -- confirm against the
  //  full source whether that term is still wanted.
  if ((err) || (gkpName == NULL) || (tigName == NULL) || (tigInputs.size() == 0)) {
    fprintf(stderr, "usage: %s -G <gkpStore> -T <tigStore> <v>\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "  -G <gkpStore>         Path to the gatekeeper store\n");
    fprintf(stderr, "  -T <tigStore> <v>     Path to the tigStore and version to add tigs to\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  Remove store versions before <v>.  Data present in versions before <v>\n");
    fprintf(stderr, "  are copied to version <v>.  Files for the earlier versions are removed.\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  WARNING!  This code HAS NOT been tested with canu.\n");
    fprintf(stderr, "\n");

    //  Name the specific required option(s) that were not supplied.
    if (gkpName == NULL)
      fprintf(stderr, "ERROR:  no gatekeeper store (-G) supplied.\n");
    if (tigName == NULL)
      fprintf(stderr, "ERROR:  no tig store (-T) supplied.\n");


  //  The actual work: gkpName and tigType are not passed along here.
  operationCompress(tigName, tigVers);

Пример #3
Файл: mercy.C Проект: lhon/canu
//  Entry point.  Opens up to five merylStreamReader inputs (two "frag" sets:
//  AF, TF; three "contig" sets: AC, DC, CO), checks that their mer sizes agree,
//  then walks all streams in step: each pass picks the smallest current mer,
//  runs compare() for every frag/contig pair that is present, and advances the
//  streams that were sitting on that mer.  Finishes by calling output() for
//  each pair and deleting the readers.
//
//  dumpFlag, the dump FILE handles (dumpSCZF, ...), NUMCATEGORIES and the
//  AFvsAC/.../TFvsCO tables are not declared in this function.
//
//  NOTE(review): the braces in this listing do not balance (and the #if 0 has
//  no visible #endif), so some lines appear to be missing.  Comments describe
//  only the statements visible here.
main(int argc, char **argv) {
  merylStreamReader  *AF = 0L;
  merylStreamReader  *TF = 0L;
  merylStreamReader  *AC = 0L;
  merylStreamReader  *DC = 0L;
  merylStreamReader  *CO = 0L;

  uint32              AFmode = 0;
  uint32              TFmode = 0;

  char                dumpSCZFname[1024] = {0};  //  single contig, zero frags
  char                dumpMCZFname[1024] = {0};  //  low contig, zero frags
  char                dumpMCSFname[1024] = {0};  //  medium contig, low frags
  char                dumpMCMFname[1024] = {0};  //  everything else, contig > frags

  bool                beVerbose = false;

  argc = AS_configure(argc, argv);

  int arg=1;
  while (arg < argc) {
    //  NOTE(review): as shown, -af, -tf and -dump use argv[arg] -- the option
    //  string itself -- where -ac/-dc/-co use argv[++arg].  Presumably an
    //  increment line is missing from this listing; confirm.
    if        (strcmp(argv[arg], "-af") == 0) {  //  All frags
      AFmode = findMode(argv[arg]);
      AF = new merylStreamReader(argv[arg]);
    } else if (strcmp(argv[arg], "-tf") == 0) {  //  Trimmed frags
      TFmode = findMode(argv[arg]);
      TF = new merylStreamReader(argv[arg]);
    } else if (strcmp(argv[arg], "-ac") == 0) {  //  All contigs
      AC = new merylStreamReader(argv[++arg]);
    } else if (strcmp(argv[arg], "-dc") == 0) {  //  Degenerate contigs
      DC = new merylStreamReader(argv[++arg]);
    } else if (strcmp(argv[arg], "-co") == 0) {  //  Contigs
      CO = new merylStreamReader(argv[++arg]);
    } else if (strcmp(argv[arg], "-dump") == 0) {
      //  Build the four dump file names from one prefix.
      //  NOTE(review): sprintf into 1024-byte buffers is unbounded for a long prefix.
      dumpFlag = true;
      sprintf(dumpSCZFname, "%s.0.singlecontig.zerofrag.fasta",       argv[arg]);
      sprintf(dumpMCZFname, "%s.1.multiplecontig.zerofrag.fasta",     argv[arg]);
      sprintf(dumpMCSFname, "%s.2.multiplecontig.lowfrag.fasta",      argv[arg]);
      sprintf(dumpMCMFname, "%s.3.multiplecontig.multiplefrag.fasta", argv[arg]);
    } else if (strcmp(argv[arg], "-v") == 0) {
      beVerbose = true;
    } else {
      fprintf(stderr, "unknown option '%s'\n", argv[arg]);

  //  No inputs at all: print the full usage text.
  if ((AF == 0L) && (TF == 0L) && (AC == 0L) && (DC == 0L) && (CO == 0L)) {
    fprintf(stderr, "usage: %s [opts] [-v] [-dump prefix]\n", argv[0]);
    fprintf(stderr, "At least one fragcounts and one contigcounts are needed.\n");
    fprintf(stderr, "          -af | -tf        fragcounts\n");
    fprintf(stderr, "          -ac | -dc | -co  contigcounts \n");
    fprintf(stderr, "Dumping is probably only useful with exactly one frag and\n");
    fprintf(stderr, "one contig, but I'll let you do it with any number.\n");
  //  Need at least one input from each of the two groups.
  if ((AF == 0L) && (TF == 0L)) {
    fprintf(stderr, "ERROR - need at least one of -af, -tf\n");
  if ((AC == 0L) && (DC == 0L) && (CO == 0L)) {
    fprintf(stderr, "ERROR - need at least one of -ac, -dc, -co\n");

  //  Check mersizes.
  //  ms[] keeps the size of each open reader (0 if absent); merSize ends up as
  //  the size of the last open reader and is the value the others are compared to.
  uint32  merSize = 0;
  uint32  ms[5] = { 0 };

  if (AF)  merSize = ms[0] = AF->merSize();
  if (TF)  merSize = ms[1] = TF->merSize();
  if (AC)  merSize = ms[2] = AC->merSize();
  if (DC)  merSize = ms[3] = DC->merSize();
  if (CO)  merSize = ms[4] = CO->merSize();

  bool  differ = false;

  if ((ms[0] > 0) && (ms[0] != merSize))  differ = true;
  if ((ms[1] > 0) && (ms[1] != merSize))  differ = true;
  if ((ms[2] > 0) && (ms[2] != merSize))  differ = true;
  if ((ms[3] > 0) && (ms[3] != merSize))  differ = true;
  if ((ms[4] > 0) && (ms[4] != merSize))  differ = true;

  if (differ) {
    fprintf(stderr, "error:  mer size differ.\n");
    fprintf(stderr, "        AF - "F_U32"\n", ms[0]);
    fprintf(stderr, "        TF - "F_U32"\n", ms[1]);
    fprintf(stderr, "        AC - "F_U32"\n", ms[2]);
    fprintf(stderr, "        DC - "F_U32"\n", ms[3]);
    fprintf(stderr, "        CO - "F_U32"\n", ms[4]);

  //  Open the four dump files.
  //  NOTE(review): failure is detected through errno after all four fopen()
  //  calls rather than by testing each returned pointer for NULL; errno is not
  //  guaranteed to stay zero across successful calls.
  if (dumpFlag) {
    errno = 0;
    dumpSCZF = fopen(dumpSCZFname, "w");
    dumpMCZF = fopen(dumpMCZFname, "w");
    dumpMCSF = fopen(dumpMCSFname, "w");
    dumpMCMF = fopen(dumpMCMFname, "w");
    if (errno)
      fprintf(stderr, "Failed to open the dump files: %s\n", strerror(errno)), exit(1);

  //  Zero all six frag-vs-contig tables.
  for (uint32 i=0; i<NUMCATEGORIES; i++)
    for (uint32 j=0; j<NUMCATEGORIES; j++) {
      AFvsAC[i][j] = 0;
      AFvsDC[i][j] = 0;
      AFvsCO[i][j] = 0;
      TFvsAC[i][j] = 0;
      TFvsDC[i][j] = 0;
      TFvsCO[i][j] = 0;

  //  The default constructor for kMer sets the mer to size 0, all A.
  //  We need it to be the proper size, and all T.
  kMer   minmer(merSize);

  //  Don't care what we pick, as long as it's a mer in the set.
  if (AF && AF->validMer())  minmer = AF->theFMer();
  if (TF && TF->validMer())  minmer = TF->theFMer();
  if (AC && AC->validMer())  minmer = AC->theFMer();
  if (DC && DC->validMer())  minmer = DC->theFMer();
  if (CO && CO->validMer())  minmer = CO->theFMer();

  speedCounter *C = new speedCounter(" Examining: %7.2f Mmers -- %5.2f Mmers/second\r", 1000000.0, 0x1fffff, beVerbose);

  //  Main merge loop: runs until no stream that was advanced reports more data.
  bool  morestuff = true;
  while (morestuff) {

    //  Find any mer in our set
    if (AF && AF->validMer())  minmer = AF->theFMer();
    if (TF && TF->validMer())  minmer = TF->theFMer();
    if (AC && AC->validMer())  minmer = AC->theFMer();
    if (DC && DC->validMer())  minmer = DC->theFMer();
    if (CO && CO->validMer())  minmer = CO->theFMer();

    //  Find the smallest mer in our set
    if (AF && AF->validMer() && (AF->theFMer() < minmer))  minmer = AF->theFMer();
    if (TF && TF->validMer() && (TF->theFMer() < minmer))  minmer = TF->theFMer();
    if (AC && AC->validMer() && (AC->theFMer() < minmer))  minmer = AC->theFMer();
    if (DC && DC->validMer() && (DC->theFMer() < minmer))  minmer = DC->theFMer();
    if (CO && CO->validMer() && (CO->theFMer() < minmer))  minmer = CO->theFMer();

    //  We need to do up to six comparisons here.
    if (AF && AC)   compare(AF, AC, minmer, AFmode, AFvsAC);
    if (AF && DC)   compare(AF, DC, minmer, AFmode, AFvsDC);
    if (AF && CO)   compare(AF, CO, minmer, AFmode, AFvsCO);
    if (TF && AC)   compare(TF, AC, minmer, TFmode, TFvsAC);
    if (TF && DC)   compare(TF, DC, minmer, TFmode, TFvsDC);
    if (TF && CO)   compare(TF, CO, minmer, TFmode, TFvsCO);

    //  Disabled debugging dump of the current mer in every stream.
#if 0
    if (C->tick()) {
      char stringjunk[256];
      fprintf(stderr, "\nMM %s\n", minmer.merToString(stringjunk));
      if (AF) fprintf(stderr, "AF %s\n", AF->theFMer().merToString(stringjunk));
      if (TF) fprintf(stderr, "TF %s\n", TF->theFMer().merToString(stringjunk));
      if (AC) fprintf(stderr, "AC %s\n", AC->theFMer().merToString(stringjunk));
      if (DC) fprintf(stderr, "DC %s\n", DC->theFMer().merToString(stringjunk));
      if (CO) fprintf(stderr, "CO %s\n", CO->theFMer().merToString(stringjunk));

    //  Advance to the next mer, if we were just used
    //  (morestuff becomes true if any advanced stream returns true from nextMer()).
    morestuff = false;
    if ((AF) && (AF->theFMer() == minmer))   morestuff |= AF->nextMer();
    if ((TF) && (TF->theFMer() == minmer))   morestuff |= TF->nextMer();
    if ((AC) && (AC->theFMer() == minmer))   morestuff |= AC->nextMer();
    if ((DC) && (DC->theFMer() == minmer))   morestuff |= DC->nextMer();
    if ((CO) && (CO->theFMer() == minmer))   morestuff |= CO->nextMer();

  delete C;

  //  output

  if ((AF) && (AC))   output("all frags vs all contigs",          AFmode, AFvsAC);
  if ((AF) && (DC))   output("all frags vs deg. contigs",         AFmode, AFvsDC);
  if ((AF) && (CO))   output("all frags vs non-deg. contigs",     AFmode, AFvsCO);
  if ((TF) && (AC))   output("trimmed frags vs all contigs",      TFmode, TFvsAC);
  if ((TF) && (DC))   output("trimmed frags vs deg. contigs",     TFmode, TFvsDC);
  if ((TF) && (CO))   output("trimmed frags vs non-deg. contigs", TFmode, TFvsCO);

  //  Deleting a null reader pointer is harmless, so no guards are needed.
  delete AF;
  delete TF;
  delete AC;
  delete DC;
  delete CO;

Пример #4
/* Entry point.  From what is visible: parses options with getopt, reports how
   many query (K+1) and database (KB+1) sequences were read, builds a k-mer
   index over the database sequences with calc_kmer_members(), and then, for
   each query sequence, accumulates per-position hit counts against every
   database sequence, refines candidate matches with DP_Compare(), and prints
   either a "best split" or a "best match" line.

   Map, ksize, kmax, GRANULARITY, minim and maxim are not declared in this
   function.

   NOTE(review): this listing is heavily elided -- braces do not balance, the
   switch cases have no visible bodies or breaks, and several calls are cut off
   mid-argument-list.  Comments describe only the statements visible here. */
int main(int argc, char *argv[])
{ int    K=-1,KB=-1;
  char **Seqs;
  char **Names;
  char **SeqsB;
  char **NamesB;
  char *seqfilename=NULL,*dbfilename=NULL;
  int internalCompare=0; /* whether query and database sequences are the same */
  int *Profiles;
  int *ProfilesB=NULL;
  int ori;
  int first=1;
  FILE *seqfile=NULL, *dbfile=NULL;
  int *kmerCounts,*kmerIndex;
  int *len,*lenB;
  int *frontDiscount;
  int i,j;
  int maxlen=0;
  int minlen=40;
  int doTrimming=0;

  argc = AS_configure(argc, argv);

  { /* Parse the argument list using "man 3 getopt". */
    int ch,errflg=0;
    optarg = NULL;
    /* Options g, k, q, d and m take an argument; f, F, h and t are flags. */
    while (!errflg && ((ch = getopt(argc, argv, "fFg:hk:q:d:m:t")) != EOF))
	switch(ch) {
	case 'f':
	case 'F':
	case 'g':
	  GRANULARITY = atoi(optarg);
	case 'h':
	case 'q':
	case 'd':
	case 'k':
	case 'm':
	case 't':
	  fprintf(stderr,"Will trim to common range\n");
	case '?':
	default :
	  fprintf( stderr, "Unrecognized option -%c\n", optopt);

  /* Same pointer (including both NULL) or same file name.
     NOTE(review): if exactly one of the two names is still NULL here, strcmp
     receives a NULL pointer -- confirm an earlier check exists in the full source. */
  if(seqfilename==dbfilename || strcmp(seqfilename,dbfilename)==0){
  } else {

  fprintf(stderr,"Read in %d query sequences\n",K+1);
  len = (int *)safe_malloc(sizeof(int)*(K+1));


  fprintf(stderr,"Read in %d database sequences\n",KB+1);
  lenB = (int *)safe_malloc(sizeof(int)*(KB+1));
  frontDiscount = (int *)safe_malloc(sizeof(int)*(KB+1));

  /* Character-to-code table: everything is -1 except a/c/g/t (either case),
     which map to 0..3. */
  for (i = 0; i < 128; i++){
    Map[i] = -1;
  Map['a'] = Map['A'] = 0;
  Map['c'] = Map['C'] = 1;
  Map['g'] = Map['G'] = 2;
  Map['t'] = Map['T'] = 3;

  calc_kmer_members((const char **)SeqsB,ksize,&kmerCounts,&kmerIndex,KB+1);

  fprintf(stderr,"Built index\n");

    /* hitCounts is allocated as maxlen rows of KB+1 ints: indexed first by
       query position, then by database sequence. */
    int **hitCounts=NULL;

    hitCounts = (int **) safe_malloc(maxlen*sizeof(int*));

    for(j= 0; j<maxlen; j++){
      hitCounts[j] = (int *) safe_malloc((KB+1)*sizeof(int));

    /* One pass per query sequence. */
    for (i = 0; i <= K; i++){
      int k;
      int kword = 0;
      int h = -ksize;
      int bestfront=-1, bestback=-1;
      int bestscore=-1,bestloc=-1;
      int bestsimple=-1,simplescore=-1;
      int ilen,ilenlessone;
      int startbestmatch=0;
      int endbestmatch = len[i]-1;


	for(j= 0; j<=KB; j++){
	  hitCounts[k][j] = 0;

	/* Translate the base at position j; x < 0 means it is not a/c/g/t. */
	int x = Map[(int) (Seqs[i][j])];
	if (x >= 0){
	  kword = (kword << 2) | x;
	  kword <<= 2;
	  h = j-(ksize-1);

	int x = Map[(int) (Seqs[i][j])];
	if (x >= 0){
	  kword = ((kword << 2) | x) & kmax;
	  kword <<= 2;
	  h = j-(ksize-1);
	if (j >= h+ksize){
	    /* When comparing a set against itself, skip hits to the query itself. */
	    if(internalCompare && kmerIndex[k]==i)continue;

      for( k=0;k<=KB;k++){
	  simplescore = hitCounts[ilenlessone][k];

	ALNoverlap *ovl=NULL;
	double erate=0.02;
	// below, .9 fudge factor may be necessary to handle cases where some matching kmers are random out of order matches
	int minovl=simplescore *.9;
	  ovl = DP_Compare(Seqs[i], SeqsB[bestsimple],
                           -lenB[bestsimple]+minovl, len[i]-minovl,
                           strlen(Seqs[i]), strlen(SeqsB[bestsimple]),

      //      printf("startbestmatch init at %d\n",startbestmatch);
      //      printf("endbestmatch init at %d\n",endbestmatch);


	int maxfront=-1;
	int maxback=-1;
	int whichfront=-1,whichback=-1;
	  fprintf(stderr,"New best %d: loc %d seqs %s / %s bestfront %d bestback %d\n",

	/* Both the front and back ranges start out as the current best-match range. */
	int frontstart=startbestmatch;
	int frontend=endbestmatch;

	int backstart=startbestmatch;
	int backend=endbestmatch;

	ALNoverlap *ovl=NULL;
	double erate=0.02;
	// below, .9 fudge factor may be necessary to handle cases where some matching kmers are random out of order matches
	int minovl=hitCounts[bestloc][bestfront]*.9;
	    ovl = DP_Compare(Seqs[i], SeqsB[bestfront],
                             -lenB[bestfront]+minovl, len[i]-minovl,
                             strlen(Seqs[i]), strlen(SeqsB[bestfront]),
	    /* The overlap would leave an empty (end <= start) front range. */
	    if( minim(frontend,len[i]+ovl->endpos) <= maxim(frontstart,ovl->begpos) ){
	      // complain
		      "trouble with overlap found at erate %f: ahang %d bhang %d => frontend %d <= frontstart %d\n"
		      ovl->begpos, ovl->endpos, frontend, frontstart,Seqs[i],SeqsB[bestfront]);
	      // and do nothing!
	    } else {
	      // update
	      frontstart = maxim(frontstart,ovl->begpos);
	      frontend = minim(frontend,len[i]+ovl->endpos);
	// below, .9 fudge factor may be necessary to handle cases where some matching kmers are random out of order matches
	//	fprintf(stderr,"DEBUG: minovl = %d\n",hitCounts[ilenlessone][bestback]);
	  //	  fprintf(stderr,"initial settings: backstart, backend to %d %d\n",backstart,backend);
	    ovl = DP_Compare(Seqs[i], SeqsB[bestback],
                             -lenB[bestback], len[i],
                             strlen(Seqs[i]), strlen(SeqsB[bestback]),
	    //ovl = DP_Compare(Seqs[i],SeqsB[bestback],-lenB[bestback],len[i],0,erate,1e-6,40,AS_FIND_LOCAL_ALIGN_NO_TRACE);

	    /* Same emptiness test, for the back range. */
	    if( minim(backend,len[i]+ovl->endpos) <= maxim(backstart,ovl->begpos) ){
	      // complain
		      "trouble with overlap found at erate %f: ahang %d bhang %d => backend %d <= backstart %d\n"
		      ovl->begpos, ovl->endpos, backend, backstart,Seqs[i],SeqsB[bestback]);
	      // and do nothing
	    } else {
	      // update
	      //	      fprintf(stderr,"Updating backstart, backend to %d %d\n",backstart,backend);

	// things are problematic if the overlap is to a region that doesn't come close to the implied breakpoint (bestloc);
	// however, partial sequence issues can make an absolute test fail, hence the constant 100 below:
	  /* Narrow the best-match range to the intersection with the front and back ranges. */
	  startbestmatch= (startbestmatch > frontstart ? startbestmatch : frontstart);
	  endbestmatch= (endbestmatch < frontend ? endbestmatch : frontend);
	  startbestmatch= (startbestmatch > backstart ? startbestmatch : backstart);
	  endbestmatch= (endbestmatch < backend ? endbestmatch : backend);

	//	fprintf(stderr,"Final bestmatch start %d end %d\n",startbestmatch,endbestmatch);



      //      printf("startbestmatch final at %d\n",startbestmatch);
      //      printf("endbestmatch final at %d\n",endbestmatch);


      /* Compare the single-sequence score with the hits bestfront / bestback
         collect inside the (possibly trimmed) best-match range. */
      if(simplescore <= hitCounts[endbestmatch][bestfront]-hitCounts[startbestmatch+ksize-2][bestfront]){
	//	fprintf(stderr,"Resetting best simple, presumably due to trimming (now bestfront)\n");

      if( simplescore <= hitCounts[endbestmatch][bestback]-hitCounts[startbestmatch+ksize-2][bestback]){
	//	fprintf(stderr,"Resetting best simple, presumably due to trimming (now bestback)\n");

	printf("%s (len=%d): best split = %d %s : %s (score = %d from %d to %d ; separately, scores = %d and %d; best single %s scores %d )\n",
      } else {
	printf("%s (len=%d): best match = %s (from %d to %d, score = %d )\n",



  return (0);
Пример #5
//  Entry point.  Fills a feParameters object from the command line, sets up
//  the error-limit tables, opens the gkStore, clamps the requested read range,
//  loads reads and overlaps (Read_Frags / Read_Olaps), sorts the overlaps, and
//  runs Threaded_Stream_Old_Frags().
//
//  NOTE(review): the braces in this listing do not balance, so some lines
//  appear to be missing.  Comments describe only the statements visible here.
main(int argc, char **argv) {
  feParameters  *G = new feParameters();

  argc = AS_configure(argc, argv);

  int arg = 1;
  int err = 0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-G") == 0) {
      G->gkpStorePath = argv[++arg];

    //  -R takes two arguments: the first and last read ID.
    } else if (strcmp(argv[arg], "-R") == 0) {
      G->bgnID = atoi(argv[++arg]);
      G->endID = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-O") == 0) {
      G->ovlStorePath = argv[++arg];

    } else if (strcmp(argv[arg], "-e") == 0) {
      G->errorRate = atof(argv[++arg]);

    } else if (strcmp(argv[arg], "-l") == 0) {
      G->minOverlap = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-o") == 0) {  //  For 'corrections' file output
      G->outputFileName = argv[++arg];

    } else if (strcmp(argv[arg], "-t") == 0) {
      G->numThreads = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-d") == 0) {
      G->Degree_Threshold = strtol(argv[++arg], NULL, 10);

    } else if (strcmp(argv[arg], "-k") == 0) {
      G->Kmer_Len = strtol(argv[++arg], NULL, 10);

    //  -p is a flag; it takes no argument.
    } else if (strcmp(argv[arg], "-p") == 0) {
      G->Use_Haplo_Ct = FALSE;

    } else if (strcmp(argv[arg], "-V") == 0) {
      G->Vote_Qualify_Len = strtol(argv[++arg], NULL, 10);

    } else if (strcmp(argv[arg], "-x") == 0) {
      G->End_Exclude_Len = strtol(argv[++arg], NULL, 10);

    } else {
      fprintf(stderr, "Unknown option '%s'\n", argv[arg]);


  if (G->numThreads == 0)

  //  NOTE(review): the usage text below mentions -F, -S, -h, -v and positional
  //  <FragStore> <lo> <hi>, none of which the visible option chain handles,
  //  and omits -G, -R, -O and -l, which it does handle.  Looks stale -- confirm.
  if (err > 0) {
    fprintf(stderr, "usage: %s[-ehp][-d DegrThresh][-k KmerLen][-x ExcludeLen]\n", argv[0]);
    fprintf(stderr, "        [-F OlapFile][-S OlapStore][-o CorrectFile]\n");
    fprintf(stderr, "        [-t NumPThreads][-v VerboseLevel]\n");
    fprintf(stderr, "        [-V Vote_Qualify_Len]\n");
    fprintf(stderr, "          <FragStore> <lo> <hi>\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "Makes corrections to fragment sequence based on overlaps\n");
    fprintf(stderr, "and recomputes overlaps on corrected fragments\n");
    fprintf(stderr, "Fragments come from <FragStore> <lo> and <hi> specify\n");
    fprintf(stderr, "the range of fragments to modify\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "Options:\n");
    fprintf(stderr, "-d   set keep flag on end of frags with less than this many olaps\n");
    fprintf(stderr, "-F   specify file of sorted overlaps to use (in the format produced\n");
    fprintf(stderr, "     by  get-olaps\n");
    fprintf(stderr, "-h   print this message\n");
    fprintf(stderr, "-k   minimum exact-match region to prevent change\n");
    fprintf(stderr, "-o   specify output file to hold correction info\n");
    fprintf(stderr, "-p   don't use haplotype counts to correct\n");
    fprintf(stderr, "-S   specify the binary overlap store containing overlaps to use\n");
    fprintf(stderr, "-t   set number of p-threads to use\n");
    fprintf(stderr, "-v   specify level of verbose outputs, higher is more\n");
    fprintf(stderr, "-V   specify number of exact match bases around an error to vote to change\n");
    fprintf(stderr, "-x   length of end of exact match to exclude in preventing change\n");

    if (G->numThreads == 0)
      fprintf(stderr, "ERROR: number of compute threads (-t) must be larger than zero.\n");


  //  Initialize Globals

  //  The product is truncated to uint32 before 1 is added; the result is then
  //  stored in a double.
  double MAX_ERRORS = 1 + (uint32)(G->errorRate * AS_MAX_READLEN);

  Initialize_Match_Limit(G->Edit_Match_Limit, G->errorRate, MAX_ERRORS);

  //  Error_Bound[i] = ceil(i * errorRate) for every i from 0 through
  //  AS_MAX_READLEN inclusive (so the table needs AS_MAX_READLEN+1 entries).
  for  (uint32 i = 0;  i <= AS_MAX_READLEN;  i++)
    G->Error_Bound[i] = (int)ceil(i * G->errorRate);


  gkStore *gkpStore = gkStore::gkStore_open(G->gkpStorePath);

  //  Clamp the requested range to [1, number of reads in the store].
  if (G->bgnID < 1)
    G->bgnID = 1;

  if (gkpStore->gkStore_getNumReads() < G->endID)
    G->endID = gkpStore->gkStore_getNumReads();

  Read_Frags(G, gkpStore);
  Read_Olaps(G, gkpStore);

  //  Now sort them!

  sort(G->olaps, G->olaps + G->olapsLen);

  //fprintf (stderr, "Before Stream_Old_Frags  Num_Olaps = "F_S64"\n", Num_Olaps);

  Threaded_Stream_Old_Frags(G, gkpStore);

  //fprintf (stderr, "                   Failed overlaps = %d\n", Failed_Olaps);



  delete G;

Пример #6
//  Entry point.  Reads the command line, opens the read store (-G), overlap
//  store (-O) and any clear-range files (-Ci/-Cm/-Co), opens the log and
//  summary files derived from -o, and then, for each read ID in [idMin, idMax],
//  loads its overlaps, computes a trimmed range with largestCovered() or
//  bestEdge() depending on the library's final-trim setting, optionally applies
//  enforceMaximumClearRange(), and records the outcome (DEL / NOC / MOD) in the
//  log and in outClr.
//
//  Changes relative to the previous version:
//   - The "library did not request trimming" test compared one value for
//     equality with two different constants joined by &&, which can never
//     hold.  It now tests that the value is neither constant.
//   - fopen() success is now judged by the returned pointer, not by errno.
//
//  NOTE(review): the braces in this listing do not balance and several calls
//  are cut off mid-argument-list, so some lines appear to be missing.  Those
//  gaps are preserved as they were.
main(int argc, char **argv) {
  char             *gkpName = 0L;
  char             *ovsName = 0L;

  char             *iniClrName = NULL;
  char             *maxClrName = NULL;
  char             *outClrName = NULL;

  uint32            errorValue     = AS_OVS_encodeEvalue(0.015);
  uint32            minAlignLength = 40;
  uint32            minReadLength  = 64;

  char             *outputPrefix  = NULL;
  char              logName[FILENAME_MAX] = {0};
  char              sumName[FILENAME_MAX] = {0};
  FILE             *logFile = 0L;
  FILE             *sumFile = 0L;

  uint32            idMin = 1;
  uint32            idMax = UINT32_MAX;

  uint32            minEvidenceOverlap  = 40;
  uint32            minEvidenceCoverage = 1;

  argc = AS_configure(argc, argv);

  int arg=1;
  int err=0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-G") == 0) {
      gkpName = argv[++arg];

    } else if (strcmp(argv[arg], "-O") == 0) {
      ovsName = argv[++arg];

    } else if (strcmp(argv[arg], "-Ci") == 0) {
      iniClrName = argv[++arg];
    } else if (strcmp(argv[arg], "-Cm") == 0) {
      maxClrName = argv[++arg];
    } else if (strcmp(argv[arg], "-Co") == 0) {
      outClrName = argv[++arg];

    //  -e is given as a fraction and stored in encoded form.
    } else if (strcmp(argv[arg], "-e") == 0) {
      double erate = atof(argv[++arg]);
      errorValue = AS_OVS_encodeEvalue(erate);

    } else if (strcmp(argv[arg], "-l") == 0) {
      minAlignLength = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-minlength") == 0) {
      minReadLength = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-ol") == 0) {
      minEvidenceOverlap = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-oc") == 0) {
      minEvidenceCoverage = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-o") == 0) {
      outputPrefix = argv[++arg];

    //  -t bgn-end is decoded into idMin and idMax.
    } else if (strcmp(argv[arg], "-t") == 0) {
      AS_UTL_decodeRange(argv[++arg], idMin, idMax);

    } else {
      fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]);

  //  -G, -O and -o are required.
  if ((gkpName       == NULL) ||
      (ovsName       == NULL) ||
      (outputPrefix  == NULL) ||
      (err)) {
    fprintf(stderr, "usage: %s -G gkpStore -O ovlStore -Co output.clearFile -o outputPrefix\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "  -G gkpStore    path to read store\n");
    fprintf(stderr, "  -O ovlStore    path to overlap store\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -o name        output prefix, for logging\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -t bgn-end     limit processing to only reads from bgn to end (inclusive)\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -Ci clearFile  path to input clear ranges (NOT SUPPORTED)\n");
    //fprintf(stderr, "  -Cm clearFile  path to maximal clear ranges\n");
    fprintf(stderr, "  -Co clearFile  path to ouput clear ranges\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -e erate       ignore overlaps with more than 'erate' percent error\n");
    //fprintf(stderr, "  -l length      ignore overlaps shorter than 'l' aligned bases (NOT SUPPORTED)\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -ol l          the minimum evidence overlap length\n");
    fprintf(stderr, "  -oc c          the minimum evidence overlap coverage\n");
    fprintf(stderr, "                   evidence overlaps must overlap by 'l' bases to be joined, and\n");
    fprintf(stderr, "                   must be at least 'c' deep to be retained\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -minlength l   reads trimmed below this many bases are deleted\n");
    fprintf(stderr, "\n");

  gkStore          *gkp = gkStore::gkStore_open(gkpName);
  ovStore          *ovs = new ovStore(ovsName, gkp);

  //  Each clear-range file is optional; a missing name leaves the pointer NULL.
  clearRangeFile   *iniClr = (iniClrName == NULL) ? NULL : new clearRangeFile(iniClrName, gkp);
  clearRangeFile   *maxClr = (maxClrName == NULL) ? NULL : new clearRangeFile(maxClrName, gkp);
  clearRangeFile   *outClr = (outClrName == NULL) ? NULL : new clearRangeFile(outClrName, gkp);

  if (outClr)
    //  If the outClr file exists, those clear ranges are loaded.  We need to reset them
    //  back to 'untrimmed' for now.

  if (iniClr && outClr)
    //  An iniClr file was supplied, so use those as the initial clear ranges.

  if (outputPrefix) {
    sprintf(logName, "%s.log",     outputPrefix);
    sprintf(sumName, "%s.summary", outputPrefix);

    //  fopen() reports failure by returning NULL; errno is only meaningful
    //  after such a failure, so test the pointer and use errno for the message.
    errno = 0;
    logFile = fopen(logName, "w");
    if (logFile == NULL)
      fprintf(stderr, "Failed to open log file '%s' for writing: %s\n", logName, strerror(errno)), exit(1);

    errno = 0;
    sumFile = fopen(sumName, "w");
    if (sumFile == NULL)
      fprintf(stderr, "Failed to open summary file '%s' for writing: %s\n", sumName, strerror(errno)), exit(1);

    fprintf(logFile, "id\tinitL\tinitR\tfinalL\tfinalR\tmessage (DEL=deleted NOC=no change MOD=modified)\n");

    fprintf(sumFile, "Overlap error rate     <= %.4f fraction error\n", AS_OVS_decodeEvalue(errorValue));
    fprintf(sumFile, "Overlap min overlap    >= %u base%s (for 'largest covered')\n", minEvidenceOverlap,  (minEvidenceOverlap  == 1) ? "" : "s");
    fprintf(sumFile, "Overlap min coverage   >= %u read%s (for 'largest covered')\n", minEvidenceCoverage, (minEvidenceCoverage == 1) ? "" : "s");

  //  Overlap buffer, reused for every read; starts with room for 64K overlaps.
  uint32      ovlLen       = 0;
  uint32      ovlMax       = 64 * 1024;
  ovOverlap  *ovl          = ovOverlap::allocateOverlaps(gkp, ovlMax);

  memset(ovl, 0, sizeof(ovOverlap) * ovlMax);

  char        logMsg[1024] = {0};

  //  Clamp the requested range to [1, number of reads in the store].
  if (idMin < 1)
    idMin = 1;
  if (idMax > gkp->gkStore_getNumReads())
    idMax = gkp->gkStore_getNumReads();

  fprintf(stderr, "Processing from ID "F_U32" to "F_U32" out of "F_U32" reads.\n",

  for (uint32 id=idMin; id<=idMax; id++) {
    gkRead     *read = gkp->gkStore_getRead(id);
    gkLibrary  *libr = gkp->gkStore_getLibrary(read->gkRead_libraryID());

    logMsg[0] = 0;

    //  If the fragment is deleted, do nothing.  If the fragment was deleted AFTER overlaps were
    //  generated, then the overlaps will be out of sync -- we'll get overlaps for these fragments
    //  we skip.
    if ((iniClr) && (iniClr->isDeleted(id) == true))

    //  If it did not request trimming, do nothing.  Similar to the above, we'll get overlaps to
    //  fragments we skip.
    //
    //  "Did not request trimming" means the library's setting is neither of the two modes
    //  handled below, so both comparisons must be '!='.
    if ((libr->gkLibrary_finalTrim() != FINALTRIM_LARGEST_COVERED) &&
        (libr->gkLibrary_finalTrim() != FINALTRIM_BEST_EDGE))

    //  Decide on the initial trimming.  We copied any iniClr into outClr above, and if there wasn't
    //  an iniClr, then outClr is the full read.

    uint32      ibgn   = outClr->bgn(id);
    uint32      iend   = outClr->end(id);

    //  Set the, ahem, initial final trimming.

    bool        isGood = false;
    uint32      fbgn   = ibgn;
    uint32      fend   = iend;

    //  Load overlaps.

    uint32      nLoaded = ovs->readOverlaps(id, ovl, ovlLen, ovlMax);

    //  Trim!

    if (nLoaded == 0) {
      //  No overlaps, so mark it as junk.
      isGood = false;

    else if (libr->gkLibrary_finalTrim() == FINALTRIM_LARGEST_COVERED) {
      //  Use the largest region covered by overlaps as the trim

      assert(ovlLen > 0);
      assert(id == ovl[0].a_iid);

      isGood = largestCovered(ovl, ovlLen,
                              ibgn, iend, fbgn, fend,
      assert(fbgn <= fend);


    else if (libr->gkLibrary_finalTrim() == FINALTRIM_BEST_EDGE) {
      //  Use the largest region covered by overlaps as the trim

      assert(ovlLen > 0);
      assert(id == ovl[0].a_iid);

      isGood = bestEdge(ovl, ovlLen,
                        ibgn, iend, fbgn, fend,
      assert(fbgn <= fend);


    else {
      //  Do nothing.  Really shouldn't get here.

    //  Enforce the maximum clear range

    if ((isGood) && (maxClr)) {
      isGood = enforceMaximumClearRange(ovl, ovlLen,
                                        ibgn, iend, fbgn, fend,
      assert(fbgn <= fend);

    //  Trimmed.  Make sense of the result, write some logs, and update the output.

    //  If bad trimming or too small, write the log and keep going.
    if ((isGood == false) || (fend - fbgn < minReadLength)) {
      outClr->setbgn(id) = fbgn;
      outClr->setend(id) = fend;
      outClr->setDeleted(id);  //  Gah, just obliterates the clear range.

      fprintf(logFile, F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\tDEL%s\n",
              ibgn, iend,
              fbgn, fend,
              (logMsg[0] == 0) ? "" : logMsg);

    //  If we didn't change anything, also write a log.
    else if ((ibgn == fbgn) &&
        (iend == fend)) {
      fprintf(logFile, F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\tNOC%s\n",
              ibgn, iend,
              fbgn, fend,
              (logMsg[0] == 0) ? "" : logMsg);

    //  Otherwise, we actually did something.

    else {
      outClr->setbgn(id) = fbgn;
      outClr->setend(id) = fend;

      fprintf(logFile, F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\tMOD%s\n",
              ibgn, iend,
              fbgn, fend,
              (logMsg[0] == 0) ? "" : logMsg);


  delete ovs;

  //  Deleting a NULL clearRangeFile pointer is harmless.
  delete iniClr;
  delete maxClr;
  delete outClr;


Пример #7
main(int argc, char **argv) {
  char           *gkpName        = NULL;
  char           *ovlName        = NULL;
  char           *outPrefix      = NULL;

  uint32          bgnID          = 0;
  uint32          endID          = UINT32_MAX;

  uint32          ovlSelect      = 0;
  double          ovlAtMost      = AS_OVS_encodeEvalue(1.0);
  double          ovlAtLeast     = AS_OVS_encodeEvalue(0.0);

  double          expectedMean   = 30.0;
  double          expectedStdDev =  7.0;

  bool            toFile         = true;

  argc = AS_configure(argc, argv);

  int arg=1;
  int err=0;
  while (arg < argc) {

    if      (strcmp(argv[arg], "-G") == 0)
      gkpName = argv[++arg];

    else if (strcmp(argv[arg], "-O") == 0)
      ovlName = argv[++arg];

    else if (strcmp(argv[arg], "-o") == 0)
      outPrefix = argv[++arg];

    else if (strcmp(argv[arg], "-C") == 0) {
      expectedMean   = atof(argv[++arg]);
      expectedStdDev = atof(argv[++arg]);

    else if (strcmp(argv[arg], "-c") == 0)
      toFile = false;

    else if (strcmp(argv[arg], "-b") == 0)
      bgnID = atoi(argv[++arg]);

    else if (strcmp(argv[arg], "-e") == 0)
      endID = atoi(argv[++arg]);

    else if (strcmp(argv[arg], "-overlap") == 0) {

      if      (strcmp(argv[arg], "5") == 0)
        ovlSelect |= OVL_5;

      else if (strcmp(argv[arg], "3") == 0)
        ovlSelect |= OVL_3;

      else if (strcmp(argv[arg], "contained") == 0)
        ovlSelect |= OVL_CONTAINED;

      else if (strcmp(argv[arg], "container") == 0)
        ovlSelect |= OVL_CONTAINER;

      else if (strcmp(argv[arg], "partial") == 0)
        ovlSelect |= OVL_PARTIAL;

      else if (strcmp(argv[arg], "atmost") == 0)
        ovlAtMost = atof(argv[++arg]);

      else if (strcmp(argv[arg], "atleast") == 0)
        ovlAtLeast = atof(argv[++arg]);

      else {
        fprintf(stderr, "ERROR: unknown -overlap '%s'\n", argv[arg]);

    else {
      fprintf(stderr, "%s: unknown option '%s'.\n", argv[0], argv[arg]);


  if (gkpName == NULL)
  if (ovlName == NULL)
  if (outPrefix == NULL)

  if (err) {
    fprintf(stderr, "usage: %s -G gkpStore -O ovlStore -o outPrefix [-b bgnID] [-e endID] ...\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "Generates statistics for an overlap store.  By default all possible classes\n");
    fprintf(stderr, "are generated, options can disable specific classes.\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -C mean stddev           Expect coverage at mean +- stddev\n");
    fprintf(stderr, "  -c                       Write stats to stdout, not to a file\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "Outputs:\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  outPrefix.per-read.log   One line per read, giving readID, read length and classification.\n");
    fprintf(stderr, "  outPrefix.summary        The primary statistical output.\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "Overlap Selection:\n");
    fprintf(stderr, "  -overlap 5               5' overlaps only\n");
    fprintf(stderr, "  -overlap 3               3' overlaps only\n");
    fprintf(stderr, "  -overlap contained       contained overlaps only\n");
    fprintf(stderr, "  -overlap container       container overlaps only\n");
    fprintf(stderr, "  -overlap partial         overlap is not valid for assembly\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  An overlap is classified as exactly one of 5', 3', contained or container.\n");
    fprintf(stderr, "  By default, all overlaps are selected.  Specifying any of these options will\n");
    fprintf(stderr, "  restrict overlaps to just those classifications.  E.g., '-overlap 5 -overlap 3'\n");
    fprintf(stderr, "  will select dovetail overlaps off either end of the read.\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -overlap atmost x        at most fraction x error  (overlap-erate <= x)\n");
    fprintf(stderr, "  -overlap atleast x       at least fraction x error (x <= overlap-erate)\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  Overlaps can be further filtered by fraction error.  Usually, this will be an\n");
    fprintf(stderr, "  'atmost' filtering to use only the higher qualtiy overlaps.\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  A contained read has at least one container overlap.  Container read    -> ---------------\n");
    fprintf(stderr, "  A container read has at least one contained overlap.  Contained overlap ->      -----\n");
    fprintf(stderr, "\n");


  //  Set the default to 'all' if nothing set.

  if (ovlSelect == 0)
    ovlSelect = 0xff;

  //  Open inputs, find limits.

  gkStore    *gkpStore = gkStore::gkStore_open(gkpName);
  ovStore    *ovlStore = new ovStore(ovlName, gkpStore);

  if (endID > gkpStore->gkStore_getNumReads())
    endID = gkpStore->gkStore_getNumReads();

  if (endID < bgnID)
    fprintf(stderr, "ERROR: invalid bgn/end range bgn=%u end=%u; only %u reads in the store\n", bgnID, endID, gkpStore->gkStore_getNumReads()), exit(1);

  ovlStore->setRange(bgnID, endID);

  //  Allocate output histograms.

  histogramStatistics   *readNoOlaps         = new histogramStatistics;  //  Bad reads!  (read length)
  histogramStatistics   *readHole            = new histogramStatistics;
  histogramStatistics   *readHump            = new histogramStatistics;
  histogramStatistics   *readNo5             = new histogramStatistics;
  histogramStatistics   *readNo3             = new histogramStatistics;

  histogramStatistics   *olapHole            = new histogramStatistics;  //  Hole size (sum of holes if more than one)
  histogramStatistics   *olapHump            = new histogramStatistics;  //  Hump size (sum of humps if more than one)
  histogramStatistics   *olapNo5             = new histogramStatistics;  //  5' uncovered size
  histogramStatistics   *olapNo3             = new histogramStatistics;  //  3' uncovered size

  histogramStatistics   *readLowCov          = new histogramStatistics;  //  Good reads!  (read length)
  histogramStatistics   *readUnique          = new histogramStatistics;
  histogramStatistics   *readRepeatCont      = new histogramStatistics;
  histogramStatistics   *readRepeatDove      = new histogramStatistics;
  histogramStatistics   *readSpanRepeat      = new histogramStatistics;
  histogramStatistics   *readUniqRepeatCont  = new histogramStatistics;
  histogramStatistics   *readUniqRepeatDove  = new histogramStatistics;
  histogramStatistics   *readUniqAnchor      = new histogramStatistics;

  histogramStatistics   *covrLowCov          = new histogramStatistics;  //  Good reads!  (overlap length)
  histogramStatistics   *covrUnique          = new histogramStatistics;
  histogramStatistics   *covrRepeatCont      = new histogramStatistics;
  histogramStatistics   *covrRepeatDove      = new histogramStatistics;
  histogramStatistics   *covrSpanRepeat      = new histogramStatistics;
  histogramStatistics   *covrUniqRepeatCont  = new histogramStatistics;
  histogramStatistics   *covrUniqRepeatDove  = new histogramStatistics;
  histogramStatistics   *covrUniqAnchor      = new histogramStatistics;

  histogramStatistics   *olapLowCov          = new histogramStatistics;  //  Good reads!  (overlap length)
  histogramStatistics   *olapUnique          = new histogramStatistics;
  histogramStatistics   *olapRepeatCont      = new histogramStatistics;
  histogramStatistics   *olapRepeatDove      = new histogramStatistics;
  histogramStatistics   *olapSpanRepeat      = new histogramStatistics;
  histogramStatistics   *olapUniqRepeatCont  = new histogramStatistics;
  histogramStatistics   *olapUniqRepeatDove  = new histogramStatistics;
  histogramStatistics   *olapUniqAnchor      = new histogramStatistics;

  //  Coverage interval lists, of all overlaps selected.

  //  Open outputs.

  sprintf(N, "%s.per-read.log", outPrefix);

  FILE  *LOG = fopen(N, "w");
  if (errno)
    fprintf(stderr, "Failed to open '%s' for writing: %s\n", N, strerror(errno)), exit(1);

  //  Compute!

  uint32                 overlapsMax = 1024 * 1024;

  uint32                 overlapsLen = 0;
  ovOverlap             *overlaps    = ovOverlap::allocateOverlaps(gkpStore, overlapsMax);

  overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax);

  while (overlapsLen > 0) {
    uint32  readID  = overlaps[0].a_iid;
    uint32  readLen = gkpStore->gkStore_getRead(readID)->gkRead_sequenceLength();

    intervalList<uint32>   cov;
    uint32                 covID = 0;

    bool    readCoverage5     = false;
    bool    readCoverage3     = false;
    bool    readContained     = false;
    bool    readContainer     = false;
    bool    readPartial       = false;

    for (uint32 oo=0; oo<overlapsLen; oo++) {
      bool  is5prime    = (overlaps[oo].overlapAEndIs5prime()  == true) && (ovlSelect & OVL_5)         && (overlaps[oo].overlap5primeIsPartial() == false);
      bool  is3prime    = (overlaps[oo].overlapAEndIs3prime()  == true) && (ovlSelect & OVL_3)         && (overlaps[oo].overlap3primeIsPartial() == false);
      bool  isContained = (overlaps[oo].overlapAIsContained()  == true) && (ovlSelect & OVL_CONTAINED);
      bool  isContainer = (overlaps[oo].overlapAIsContainer()  == true) && (ovlSelect & OVL_CONTAINER);
      bool  isPartial   = (overlaps[oo].overlapIsPartial()     == true) && (ovlSelect & OVL_PARTIAL);

      //  Ignore the overlap?

      if ((is5prime    == false) &&
          (is3prime    == false) &&
          (isContained == false) &&
          (isContainer == false) &&
          (isPartial   == false))

      if (overlaps[oo].evalue() < ovlAtLeast)

      if (overlaps[oo].evalue() > ovlAtMost)

      readCoverage5    |= is5prime;     //  If there is a 5' overlap, the read isn't missing 5' coverage
      readCoverage3    |= is3prime;
      readContained    |= isContained;  //  Read is contained in something else
      readContainer    |= isContainer;  //  Read is a container of somethign else
      readPartial      |= isPartial;

      cov.add(overlaps[oo].a_bgn(), overlaps[oo].a_end() - overlaps[oo].a_bgn());

    //  If we filtered all the overlaps, just get out of here.  Yeah, some code duplication,
    //  but cleaner than sticking an if block around the rest of the loop.

    if (cov.numberOfIntervals() == 0) {

      overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax);

    //  Generate a depth-of-coverage map, then merge intervals

    intervalList<uint32>  depth(cov);


    //  Analyze the intervals, save per-read information to the log.

    uint32  lastInt           = cov.numberOfIntervals() - 1;
    uint32  bgn               = cov.lo(0);
    uint32  end               = cov.hi(lastInt);
    bool    contiguous        = (lastInt == 0) ? true : false;

    bool    readFullCoverage  = (lastInt == 0) && (bgn == 0) && (end == readLen);
    bool    readMissingMiddle = (lastInt != 0);

    uint32  holeSize          = 0;
    uint32  no5Size           = bgn;
    uint32  no3Size           = readLen - end;

    for (uint32 ii=1; ii<cov.numberOfIntervals(); ii++)
      holeSize += cov.lo(ii) - cov.hi(ii-1);

    //  Handle bad cases.  If it's a partial overlap, ignore the is5prime and is3prime markings.

    if (readMissingMiddle == true) {
      fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "middle-missing");

      overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax);

    if ((readCoverage5 == false) && (readCoverage3 == false) && (readContained == false) && (readPartial == false)) {
      fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "middle-only");
      olapHump->add(no5Size + no3Size);

      overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax);

    if ((readCoverage5 == false) && (readContained == false) && (readPartial == false)) {
      fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "no-5-prime");

      overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax);

    if ((readCoverage3 == false) && (readContained == false) && (readPartial == false)) {
      fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "no-3-prime");

      overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax);

    //  Handle good cases.  For partial overlaps, bgn and end are not the extent of the read.

    if (readPartial == false) {
      assert(bgn == 0);
      assert(end == readLen);
      assert(contiguous == true);
      assert(readFullCoverage == true);

    //  Compute mean and std.dev of coverage.  From this, we decide if the read is 'unique',
    //  'repeat' or 'mixed'.  If 'mixed', we then need to decide if the read spans a repeat, or
    //  joins unique and repeat.

    double  covMean   = 0;
    double  covStdDev = 0;

    for (uint32 ii=0; ii<depth.numberOfIntervals(); ii++)
      covMean += (depth.hi(ii) - depth.lo(ii)) * depth.depth(ii);

    covMean /= readLen;

    for (uint32 ii=0; ii<depth.numberOfIntervals(); ii++)
      covStdDev += (depth.hi(ii) - depth.lo(ii)) * (depth.depth(ii) - covMean) * (depth.depth(ii) - covMean);

    covStdDev = sqrt(covStdDev / (readLen - 1));

    //  Classify each interval as either 'l'owcoverage, 'u'nique or 'r'epeat.

    char *classification = new char [depth.numberOfIntervals()];

    for (uint32 ii=0; ii<depth.numberOfIntervals(); ii++) {
      if        (depth.depth(ii) < expectedMean - 3 * expectedStdDev) {
        classification[ii] = 'l';

      } else if (depth.depth(ii) < expectedMean + 3 * expectedStdDev) {
        classification[ii] = 'u';

      } else {
        classification[ii] = 'r';

    //  Try to detect if a read is part unique and part repeat.

    bool   isLowCov     = false;
    bool   isUnique     = false;
    bool   isRepeat     = false;
    bool   isSpanRepeat = false;
    bool   isUniqRepeat = false;
    bool   isUniqAnchor = false;

    int32  bgni = 0;
    int32  endi = depth.numberOfIntervals() - 1;

    char   type5 = classification[bgni];
    char   typem = 0;
    char   type3 = classification[endi];

    while ((bgni <= endi) && (type5 == classification[bgni]))

    while ((bgni <= endi) && (type3 == classification[endi]))

    //  All the same classification?

    if (bgni == endi) {
      isLowCov = (type5 == 'l');
      isUnique = (type5 == 'u');
      isRepeat = (type5 == 'r');

    //  Nope, if we aren't the same, assume it is uniqRepeat.

    else if (type5 != type3) {
      isUniqRepeat = true;

    //  Nope, the same on both ends.  Assume we're just flipped.

    else {
      if (type5 == 'r')
        isUniqAnchor = true;
        isSpanRepeat = true;

    //  Now, do something with it.

    //  LOG - readID readLen classification

    if (isLowCov) {
      fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "low-cov");

      for (uint32 ii=0; ii<depth.numberOfIntervals(); ii++)
        covrLowCov->add(depth.depth(ii), depth.hi(ii) - depth.lo(ii));

    if (isUnique) {
      fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "unique");

      for (uint32 ii=0; ii<depth.numberOfIntervals(); ii++)
        covrUnique->add(depth.depth(ii), depth.hi(ii) - depth.lo(ii));

    if ((isRepeat) && (readContained == true)) {
      fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "contained-repeat");

      for (uint32 ii=0; ii<depth.numberOfIntervals(); ii++)
        covrRepeatCont->add(depth.depth(ii), depth.hi(ii) - depth.lo(ii));

    if ((isRepeat) && (readContained == false)) {
      fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "dovetail-repeat");

      for (uint32 ii=0; ii<depth.numberOfIntervals(); ii++)
        covrRepeatDove->add(depth.depth(ii), depth.hi(ii) - depth.lo(ii));

    if (isSpanRepeat) {
      fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "span-repeat");
      olapSpanRepeat->add(depth.lo(endi) - depth.hi(bgni));

    if ((isUniqRepeat) && (readContained == true)) {
      fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "uniq-repeat-cont");

    if ((isUniqRepeat) && (readContained == false)) {
      fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "uniq-repeat-dove");

    if (isUniqAnchor) {
      fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "uniq-anchor");
      olapUniqAnchor->add(depth.lo(endi) - depth.hi(bgni));

    //  Done.  Read more data.

    overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax);

  fclose(LOG);  //  Done with logging.













  LOG = stdout;

  if (toFile == true) {
    sprintf(N, "%s.summary", outPrefix);

    LOG = fopen(N, "w");
    if (errno)
      fprintf(stderr, "Failed to open '%s' for writing: %s\n", N, strerror(errno)), exit(1);

  fprintf(LOG, "category            reads       read length        feature size or coverage  analysis\n");
  fprintf(LOG, "----------------  -------  ----------------------  ------------------------  --------------------\n");
  fprintf(LOG, "middle-missing    %7"F_U64P"  %10.2f +- %-8.2f   %10.2f +- %-8.2f   (bad trimming)\n", readHole->numberOfObjects(), readHole->mean(), readHole->stddev(), olapHole->mean(), olapHole->stddev());
  fprintf(LOG, "middle-hump       %7"F_U64P"  %10.2f +- %-8.2f   %10.2f +- %-8.2f   (bad trimming)\n", readHump->numberOfObjects(), readHump->mean(), readHump->stddev(), olapHump->mean(), olapHump->stddev());
  fprintf(LOG, "no-5-prime        %7"F_U64P"  %10.2f +- %-8.2f   %10.2f +- %-8.2f   (bad trimming)\n", readNo5->numberOfObjects(), readNo5->mean(), readNo5->stddev(), olapNo5->mean(), olapNo5->stddev());
  fprintf(LOG, "no-3-prime        %7"F_U64P"  %10.2f +- %-8.2f   %10.2f +- %-8.2f   (bad trimming)\n", readNo3->numberOfObjects(), readNo3->mean(), readNo3->stddev(), olapNo3->mean(), olapNo3->stddev());
  fprintf(LOG, "\n");
  fprintf(LOG, "low-coverage      %7"F_U64P"  %10.2f +- %-8.2f   %10.2f +- %-8.2f   (easy to assemble, potential for lower quality consensus)\n", readLowCov->numberOfObjects(), readLowCov->mean(), readLowCov->stddev(), covrLowCov->mean(), covrLowCov->stddev());
  fprintf(LOG, "unique            %7"F_U64P"  %10.2f +- %-8.2f   %10.2f +- %-8.2f   (easy to assemble, perfect, yay)\n", readUnique->numberOfObjects(), readUnique->mean(), readUnique->stddev(), covrUnique->mean(), covrUnique->stddev());
  fprintf(LOG, "repeat-cont       %7"F_U64P"  %10.2f +- %-8.2f   %10.2f +- %-8.2f   (potential for consensus errors, no impact on assembly)\n", readRepeatCont->numberOfObjects(), readRepeatCont->mean(), readRepeatCont->stddev(), covrRepeatCont->mean(), covrRepeatCont->stddev());
  fprintf(LOG, "repeat-dove       %7"F_U64P"  %10.2f +- %-8.2f   %10.2f +- %-8.2f   (hard to assemble, likely won't assemble correctly or even at all)\n", readRepeatDove->numberOfObjects(), readRepeatDove->mean(), readRepeatDove->stddev(), covrRepeatDove->mean(), covrRepeatDove->stddev());
  fprintf(LOG, "\n");
  fprintf(LOG, "span-repeat       %7"F_U64P"  %10.2f +- %-8.2f   %10.2f +- %-8.2f   (read spans a large repeat, usually easy to assemble)\n", readSpanRepeat->numberOfObjects(), readSpanRepeat->mean(), readSpanRepeat->stddev(), olapSpanRepeat->mean(), olapSpanRepeat->stddev());
  fprintf(LOG, "uniq-repeat-cont  %7"F_U64P"  %10.2f +- %-8.2f                            (should be uniquely placed, low potential for consensus errors, no impact on assembly)\n", readUniqRepeatCont->numberOfObjects(), readUniqRepeatCont->mean(), readUniqRepeatCont->stddev());
  fprintf(LOG, "uniq-repeat-dove  %7"F_U64P"  %10.2f +- %-8.2f                            (will end contigs, potential to misassemble)\n", readUniqRepeatDove->numberOfObjects(), readUniqRepeatDove->mean(), readUniqRepeatDove->stddev());
  fprintf(LOG, "uniq-anchor       %7"F_U64P"  %10.2f +- %-8.2f   %10.2f +- %-8.2f   (repeat read, with unique section, probable bad read)\n", readUniqAnchor->numberOfObjects(), readUniqAnchor->mean(), readUniqAnchor->stddev(), olapUniqAnchor->mean(), olapUniqAnchor->stddev());

  if (toFile == true)

  delete ovlStore;


Пример #8
//  Routes protocol messages read from stdin to output streams chosen by message type.
//
//  Options handled in the visible code:
//    -i        switch to 'include' mode (inc = 1)
//    -x        switch to 'exclude' mode (inc = 0)
//    -o file   open 'file' for writing and assign it to every message type that has no output
//              yet and is listed (include mode) or not listed (exclude mode) in msglist[];
//              msglist[] is then cleared
//    -m type   name a message type (validated with GetMessageType)
//    -h        request the usage message
//  Any other argument is also looked up as a message type.
//
//  Message types still without an output after option parsing are sent to stdout under the
//  same include/exclude rule.  A per-type summary is printed to stderr at the end.
//
//  NOTE(review): this listing looks truncated -- several blocks are never closed, 'arg' is
//  never advanced inside the option loop, and msglist[] / count[] are never updated in the
//  visible code.  Confirm against the complete source before relying on these comments.
main(int argc, char **argv) {
    //  All four tables are indexed by message type, 0..NUM_OF_REC_TYPES inclusive.
    int            msglist[NUM_OF_REC_TYPES + 1];
    FILE          *outfile[NUM_OF_REC_TYPES + 1];
    off_t          count[NUM_OF_REC_TYPES + 1];
    off_t          size[NUM_OF_REC_TYPES + 1];
    int            i;

    //  Start with nothing listed, no outputs assigned and zeroed statistics.
    for (i=0; i<=NUM_OF_REC_TYPES; i++) {
        msglist[i] = 0;
        outfile[i] = 0L;
        count[i]   = 0;
        size[i]    = 0;

    //  NOTE(review): 'msg' is not used anywhere in the visible code.
    int arg = 1;
    int inc = 0;
    int err = 0;
    int msg = 0;

    argc = AS_configure(argc, argv);

    while (arg < argc) {
        if        (strcmp(argv[arg], "-i") == 0) {
            inc = 1;
        } else if (strcmp(argv[arg], "-x") == 0) {
            inc = 0;
        } else if (strcmp(argv[arg], "-o") == 0) {
            //  errno is cleared first so that a non-zero value after fopen() signals failure.
            errno = 0;
            FILE *F = fopen(argv[++arg], "w");
            if (errno)
                fprintf(stderr, "%s: failed to open output file '%s': %s\n", argv[0], argv[arg], strerror(errno)), exit(1);

            //  Depending on the include flag, we either write all messages
            //  listed in our msglist (or write all message not in the
            //  msglist) to the freshly opened file.
            if (inc) {
                //  Include message i in the output if it was listed
                for (i=1; i<=NUM_OF_REC_TYPES; i++)
                    if ((outfile[i] == NULL) && (msglist[i] > 0))
                        outfile[i] = F;
            } else {
                //  Include message i in the output if it was not listed
                for (i=1; i<=NUM_OF_REC_TYPES; i++)
                    if ((outfile[i] == NULL) && (msglist[i] == 0))
                        outfile[i] = F;

            //  Forget the listed types so the next -o starts from an empty list.
            for (i=0; i<=NUM_OF_REC_TYPES; i++)
                msglist[i] = 0;
        } else if (strcmp(argv[arg], "-m") == 0) {
            int type = GetMessageType(argv[++arg]);
            //  NOTE(review): the accepted branch is empty here; presumably it should record
            //  the type in msglist[] -- confirm against the complete source.
            if ((type >= 1) && (type <= NUM_OF_REC_TYPES)) {
            } else {
                fprintf(stderr, "%s: invalid message type '%s'.\n", argv[0], argv[arg]);
                err = 1;
        } else if (strcmp(argv[arg], "-h") == 0) {
            err = 1;
        } else {
            //  A bare argument is tried as a message type name.
            //  NOTE(review): as with -m, the accepted branch is empty in this listing.
            int type = GetMessageType(argv[arg]);
            if ((type >= 1) && (type <= NUM_OF_REC_TYPES)) {
            } else {
                fprintf(stderr, "%s: invalid option '%s'.\n", argv[0], argv[arg]);
                err = 1;

    if (err)
        usage(argv[0]), exit(1);

    //  Assume everything else goes to stdout.  We need to obey the inc
    //  flag, still, though.
    if (inc) {
        //  Include message i in the output if it was listed
        for (i=1; i<=NUM_OF_REC_TYPES; i++)
            if ((outfile[i] == NULL) && (msglist[i] > 0))
                outfile[i] = stdout;
    } else {
        //  Include message i in the output if it was not listed
        for (i=1; i<=NUM_OF_REC_TYPES; i++)
            if ((outfile[i] == NULL) && (msglist[i] == 0))
                outfile[i] = stdout;

    //  currPos / prevPos hold the AS_UTL_ftell(stdin) value after the current and the previous
    //  message; their difference is what gets added to size[] for the message's type.
    GenericMesg   *pmesg;
    off_t          currPos = 0;
    off_t          prevPos = 0;

    while (ReadProtoMesg_AS(stdin, &pmesg) != EOF) {
        assert(pmesg->t <= NUM_OF_REC_TYPES);

        currPos = AS_UTL_ftell(stdin);

        //  Only message types with an assigned output are counted and written.
        if (outfile[pmesg->t] != NULL) {

            size[pmesg->t] += currPos - prevPos;

            WriteProtoMesg_AS(outfile[pmesg->t], pmesg);

        prevPos = currPos;

    //  Report, for each type seen, the number of messages, total size and average size.
    //  NOTE(review): count[] is never incremented in the visible code, so as shown nothing
    //  would be printed here.
    for (i=0; i<=NUM_OF_REC_TYPES; i++)
        if (count[i] > 0)
            fprintf(stderr, "%s num "F_OFF_T" size "F_OFF_T" avg %f\n",
                    MessageTypeName[i], count[i], size[i], (double)size[i] / count[i]);

Пример #9
//  Finalizes the index of an overlap store, or tests an existing one.
//
//  With -t, testIndex() is run on the named store (doFixes, set by -f, is passed through) and
//  the process exits with status 0 if it passed, 1 otherwise.  Without -t, the per-segment
//  files for segments 1..maxJob (-F) are checked, the info files are merged with
//  mergeInfoFiles(), the result is tested, and -- unless -nodelete was given -- the code
//  proceeds to the intermediate-file cleanup section.
//
//  NOTE(review): this listing looks truncated -- closing braces, the 'arg' increment, every
//  assignment to 'err', the 'complete' / 'cntJob' increments and the actual file-removal calls
//  are all absent from the visible code.  Confirm against the complete source.
main(int argc, char **argv) {
    char           *ovlName      = NULL;
    uint32          maxJob       = 0;

    bool            deleteIntermediates = true;

    bool            doExplicitTest = false;
    bool            doFixes        = false;

    //  Scratch buffer reused for every path built below.
    char            name[FILENAME_MAX];

    argc = AS_configure(argc, argv);

    int err=0;
    int arg=1;
    while (arg < argc) {
        if        (strcmp(argv[arg], "-O") == 0) {
            ovlName = argv[++arg];

        } else if (strcmp(argv[arg], "-F") == 0) {
            maxJob = atoi(argv[++arg]);

        } else if (strcmp(argv[arg], "-f") == 0) {
            doFixes = true;

        } else if (strcmp(argv[arg], "-t") == 0) {
            //  -t both selects test mode and names the store to test.
            doExplicitTest = true;
            ovlName = argv[++arg];

        } else if (strcmp(argv[arg], "-nodelete") == 0) {
            deleteIntermediates = false;

        } else {
            fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]);

    //  NOTE(review): the bodies of these two checks are missing; the same conditions are
    //  reported as errors inside the usage block below, so presumably each one sets 'err'.
    if (ovlName == NULL)
    if ((maxJob == 0) && (doExplicitTest == false))

    if (err) {
        fprintf(stderr, "usage: %s ...\n", argv[0]);
        fprintf(stderr, "  -O x.ovlStore    path to overlap store to build the final index for\n");
        fprintf(stderr, "  -F s             number of slices used in bucketizing/sorting\n");
        fprintf(stderr, "\n");
        fprintf(stderr, "  -t x.ovlStore    explicitly test a previously constructed index\n");
        fprintf(stderr, "  -f               when testing, also create a new 'idx.fixed' which might\n");
        fprintf(stderr, "                   resolve rare problems\n");
        fprintf(stderr, "\n");
        fprintf(stderr, "  -nodelete        do not remove intermediate files when the index is\n");
        fprintf(stderr, "                   successfully created\n");
        fprintf(stderr, "\n");
        fprintf(stderr, "    DANGER    DO NOT USE     DO NOT USE     DO NOT USE    DANGER\n");
        fprintf(stderr, "    DANGER                                                DANGER\n");
        fprintf(stderr, "    DANGER   This command is difficult to run by hand.    DANGER\n");
        fprintf(stderr, "    DANGER          Use ovStoreCreate instead.            DANGER\n");
        fprintf(stderr, "    DANGER                                                DANGER\n");
        fprintf(stderr, "    DANGER    DO NOT USE     DO NOT USE     DO NOT USE    DANGER\n");
        fprintf(stderr, "\n");

        if (ovlName == NULL)
            fprintf(stderr, "ERROR: No overlap store (-O) supplied.\n");
        if ((maxJob == 0) && (doExplicitTest == false))
            fprintf(stderr, "ERROR: One of -F (number of slices) or -t (test a store) must be supplied.\n");


    //  Do the test, and maybe fix things up.

    if (doExplicitTest == true) {
        bool passed = testIndex(ovlName, doFixes);

        exit((passed == true) ? 0 : 1);

    //  Check that all segments are present.  Every segment should have an info file.

    //  cntJob is compared against maxJob after the loop; 'complete' is compared against 3,
    //  matching the three per-segment files (data, .info, .index) checked below.
    uint32  cntJob = 0;

    for (uint32 i=1; i<=maxJob; i++) {
        uint32  complete = 0;

        //  NOTE(review): as shown, each "not present" message is printed when the file DOES
        //  exist; the success branch appears to have been dropped from this listing.  Also,
        //  'i' is a uint32 but is formatted with %04d here (the removal loop below uses %04u).
        sprintf(name, "%s/%04d", ovlName, i);
        if (AS_UTL_fileExists(name, FALSE, FALSE) == true)
            fprintf(stderr, "ERROR: Segment "F_U32" data not present  (%s)\n", i, name);

        sprintf(name, "%s/%04d.info", ovlName, i);
        if (AS_UTL_fileExists(name, FALSE, FALSE) == true)
            fprintf(stderr, "ERROR: Segment "F_U32" info not present (%s)\n", i, name);

        sprintf(name, "%s/%04d.index", ovlName, i);
        if (AS_UTL_fileExists(name, FALSE, FALSE) == true)
            fprintf(stderr, "ERROR: Segment "F_U32" index not present (%s)\n", i, name);

        //  NOTE(review): body missing; presumably counts the segment in cntJob -- confirm.
        if (complete == 3)

    if (cntJob != maxJob) {
        fprintf(stderr, "ERROR: Expected "F_U32" segments, only found "F_U32".\n", maxJob, cntJob);

    //  Merge the stuff.

    mergeInfoFiles(ovlName, maxJob);

    //  Diagnostics.

    if (testIndex(ovlName, false) == false) {
        fprintf(stderr, "ERROR: index failed tests.\n");

    //  Remove intermediates.  For the buckets, we keep going until there are 10 in a row not present.
    //  During testing, on a microbe using 2850 buckets, some buckets were empty.

    if (deleteIntermediates == false) {
        fprintf(stderr, "\n");
        fprintf(stderr, "Not removing intermediate files.  Finished.\n");

    fprintf(stderr, "\n");
    fprintf(stderr, "Removing intermediate files.\n");

    //  Removing indices is easy, beacuse we know how many there are.

    //  NOTE(review): only the path construction is visible in the loops below; the calls that
    //  would actually remove each path are not shown.
    for (uint32 i=1; i<=maxJob; i++) {
        sprintf(name, "%s/%04u.index", ovlName, i);
        sprintf(name, "%s/%04u.info",  ovlName, i);

    //  We don't know how many buckets there are, so we remove until we fail to find ten
    //  buckets in a row.

    for (uint32 missing=0, i=1; missing<10; i++) {
        sprintf(name, "%s/bucket%04d", ovlName, i);

        //  NOTE(review): body missing; presumably bumps 'missing' and skips to the next bucket.
        if (AS_UTL_fileExists(name, TRUE, FALSE) == FALSE) {

        //  The bucket exists, so the run of consecutive misses starts over.
        missing = 0;

        sprintf(name, "%s/bucket%04d/sliceSizes", ovlName, i);

        sprintf(name, "%s/bucket%04d", ovlName, i);

    fprintf(stderr, "Finished.\n");

Пример #10
//  Reads protocol messages from the -c file and writes IUM and ICM messages to the -o file.
//
//  For an IUM (unitig) or ICM (contig) message whose consensus string is empty, a temporary
//  reads file is opened, the fragments listed in the message (and, for ICM, its unitigs) are
//  fetched, and updateRecord() / updateICMRecord() is called with the -s executable path and
//  the -w working directory.  Those helpers are defined elsewhere; presumably they run the
//  SeqAn executable to fill in the consensus -- confirm.
//
//  NOTE(review): this listing looks truncated -- closing braces and the 'arg' increment are
//  missing, 'fileName' is used but never declared here, and the argument lists on the lines
//  beginning with "seq," have lost the call they belong to.  Confirm against the complete
//  source before relying on these comments.
main(int32 argc, char **argv) {
   int32 arg = 1;
   int32 err = 0;
   int32 hlp = 0;

   char * gkpStoreName  = NULL;
   int32  gkpStorePart  = 0;
   char * msgFile       = NULL;
   char * outputFileName= NULL;
   char * seqAn         = NULL;
   char * wrkDir        = NULL;
   char * seqStoreName  = NULL;
   int32  seqStoreVer   = 0;
   int32  seqStorePart  = 0;     

   argc = AS_configure(argc, argv);

   //  Each recognized option consumes the following argument as its value.
   while (arg < argc) {
      if (strcmp(argv[arg], "-c") == 0) {
         msgFile = argv[++arg];
      } else if (strcmp(argv[arg], "-G") == 0) {
         gkpStoreName = argv[++arg];
      } else if (strcmp(argv[arg], "-S") == 0) {
         gkpStorePart = atoi(argv[++arg]);
      } else if (strcmp(argv[arg], "-o") == 0) {
         outputFileName = argv[++arg];
      } else if (strcmp(argv[arg], "-s") == 0) {
         seqAn = argv[++arg];
      } else if (strcmp(argv[arg], "-w") == 0) {
         wrkDir = argv[++arg];
      } else if (strcmp(argv[arg], "-u") == 0) {
         seqStoreName = argv[++arg];
      } else if (strcmp(argv[arg], "-V") == 0) {
         seqStoreVer = atoi(argv[++arg]);
      } else if (strcmp(argv[arg], "-p") == 0) {
         seqStorePart = atoi(argv[++arg]);
      } else {

   //  -G, -c, -o and -s are all mandatory.
   if ((err) || (gkpStoreName == NULL) || (msgFile == NULL) || (outputFileName == NULL) || seqAn == NULL) {
      fprintf(stderr, "USAGE: SeqAn_CNS -G <gkpStore> -c <input.cgb> -o <output.cgi> -s <seqan_executable> [-u seqstore, required for contig consensus] [-w working directory]\n");      

   gkStore        *gkpStore = new gkStore(gkpStoreName, FALSE, FALSE);

   gkFragment      fr;
   GenericMesg    *pmesg;
   tSequenceDB    *sequenceDB = NULL;   

   //  NOTE(review): neither fopen() result is checked before use in the visible code.
   FILE *infp = fopen(msgFile,"r");
   FILE *tempReads;
   FILE *outfp = fopen(outputFileName, "w");
   //  The output file name doubles as the prefix handed to getFileName() and the update helpers.
   char *prefix = outputFileName;
   getFileName(prefix, wrkDir, AS_SEQAN_INPUT_NAME, fileName);

   int32 i = 0;
   while ((EOF != ReadProtoMesg_AS(infp, &pmesg))) {
      int32 freeMem = 0;
      if (pmesg->t == MESG_IUM) {
         IntUnitigMesg *ium_mesg = (IntUnitigMesg *)pmesg->m;         
         //  Only unitigs that arrive without a consensus are processed.
         if (strlen(ium_mesg->consensus) == 0) {
            tempReads = fopen(fileName,"w");

            for (i =0; i < ium_mesg->num_frags; i++) {
               // get the fragment sequence
               gkpStore->gkStore_getFragment(ium_mesg->f_list[i].ident, &fr, GKFRAGMENT_QLT);
               uint32   clrBeg = fr.gkFragment_getClearRegionBegin();
               uint32   clrEnd = fr.gkFragment_getClearRegionEnd  ();
               char    *seqStart = fr.gkFragment_getSequence();
               char     *seq      = seqStart+clrBeg;

               //  NOTE(review): 'seq' is already offset by clrBeg, so this writes the
               //  terminator at seqStart[clrBeg + clrEnd], while the length passed below is
               //  clrEnd-clrBeg -- confirm which end position is intended.
               seq[clrEnd] = 0;
                  seq, clrEnd-clrBeg,
                   ">"F_IID","F_IID"\n", ium_mesg->f_list[i].position.bgn, ium_mesg->f_list[i].position.end);
            updateRecord(ium_mesg, fileName, seqAn, prefix, wrkDir);
            freeMem = 1;
         WriteProtoMesg_AS(outfp, pmesg);

         //  NOTE(review): the body of this cleanup branch is missing from the listing.
         if (freeMem) {
      else if (pmesg->t == MESG_ICM) {         
         IntConConMesg *icm_mesg = (IntConConMesg *)pmesg->m;

         if (seqStoreName == NULL) {
            fprintf(stderr, "USAGE: The -u option is required for contig consensus\n");
         //  The sequence DB is opened once, on the first ICM message, and then reused.
         if (sequenceDB == NULL) {
            sequenceDB = openSequenceDB(seqStoreName, FALSE, seqStoreVer);
            openSequenceDBPartition(sequenceDB, seqStorePart);
         //  Only contigs that arrive without a consensus are processed.
         if (strlen(icm_mesg->consensus) == 0) {
            tempReads = fopen(fileName,"w");

            for (i =0; i < icm_mesg->num_pieces; i++) {
               // get the fragment sequence
               gkpStore->gkStore_getFragment(icm_mesg->pieces[i].ident, &fr, GKFRAGMENT_QLT);
               uint32   clrBeg   = fr.gkFragment_getClearRegionBegin();
               uint32   clrEnd   = fr.gkFragment_getClearRegionEnd  ();
               char    *seqStart = fr.gkFragment_getSequence();
               char    *seq      = seqStart+clrBeg;

               //  NOTE(review): same offset concern as in the IUM branch above.
               seq[clrEnd] = 0;
                  seq, clrEnd-clrBeg,
                   ">"F_IID","F_IID"\n", icm_mesg->pieces[i].position.bgn, icm_mesg->pieces[i].position.end);
            // now handle the unitig messages
            for (i =0; i < icm_mesg->num_unitigs; i++) {
               //  The ungapped consensus of each unitig is fetched from the sequence DB.
               VA_TYPE(char) *ungappedSequence = CreateVA_char(0);
               VA_TYPE(char) *ungappedQuality  = CreateVA_char(0);
               MultiAlignT *uma = loadMultiAlignTFromSequenceDB(sequenceDB, icm_mesg->unitigs[i].ident, 1);
               assert(uma != NULL);
               GetMultiAlignUngappedConsensus(uma, ungappedSequence, ungappedQuality);
               char * seq = Getchar(ungappedSequence,0);

                  seq, strlen(seq),
                   ">"F_IID","F_IID"\n", icm_mesg->unitigs[i].position.bgn, icm_mesg->unitigs[i].position.end);

            updateICMRecord(icm_mesg, fileName, seqAn, prefix, wrkDir);
            freeMem = 1;
         WriteProtoMesg_AS(outfp, pmesg);

         //  NOTE(review): the body of this cleanup branch is missing from the listing.
         if (freeMem) {

   return 0;
Пример #11
//  Driver that loads a scaffold graph from a checkpoint, builds one instrumentLIB
//  per distance record in the graph, and then walks the scaffolds, constructing an
//  instrumentSCF for each one that is processed.
//
//  Command line (as parsed below):
//    -g gkpStore            copied into GlobalData->gkpStoreName
//    -t tigStore version    copied into GlobalData->tigStoreName, version parsed with atoi
//    -c checkpoint version  copied into GlobalData->outputPrefix, version parsed with atoi
//
//  NOTE(review): this listing is an excerpt.  Closing braces and several statements
//  (advancing 'arg', setting 'err', exiting after the usage text, the body of the
//  REAL_SCAFFOLD test) are not visible here; the comments describe only what is shown.
int main (int argc, char *argv[]) {
  int32       checkpointVers           = 0;
  int32       tigStoreVers             = 0;

  GlobalData = new Globals_CGW();

  argc = AS_configure(argc, argv);

  //  Option parsing.  Each option consumes its value(s) with ++arg.
  //  NOTE(review): strcpy() is unbounded; the sizes of the GlobalData name buffers
  //  are not visible here, so over-long arguments may overflow them -- confirm.
  int arg=1;
  int err=0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-g") == 0) {
      strcpy(GlobalData->gkpStoreName, argv[++arg]);

    } else if (strcmp(argv[arg], "-t") == 0) {
      strcpy(GlobalData->tigStoreName, argv[++arg]);
      tigStoreVers = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-c") == 0) {
      strcpy(GlobalData->outputPrefix, argv[++arg]);
      checkpointVers = atoi(argv[++arg]);

    } else {
      fprintf(stderr, "%s: unknown option '%s'\n", argv[0], argv[arg]);
  //  Both store names are mandatory; an empty name or any parse error prints usage.
  //  NOTE(review): the usage line advertises -o, -s, -n, -E and -h, none of which
  //  is handled by the visible parser above.
  if ((GlobalData->gkpStoreName[0] == 0) ||
      (GlobalData->tigStoreName[0] == 0) ||
      (err)) {
    fprintf(stderr, "usage: %s -g gkpStore [-o prefix] [-s firstUID] [-n namespace] [-E server] [-h]\n", argv[0]);
    fprintf(stderr, "  -g gkpStore             mandatory path to the gkpStore\n");
    fprintf(stderr, "  -t tigStore version     mandatory path to the tigStore and version\n");
    fprintf(stderr, "  -c checkpoint version   optional path to a checkpoint and version\n");
    fprintf(stderr, "\n");

  //  The checkpoint is located by the -c prefix and version.
  //  NOTE(review): tigStoreVers is parsed but not used in the visible lines.
  LoadScaffoldGraphFromCheckpoint(GlobalData->outputPrefix, checkpointVers, FALSE);

  //  One instrumentLIB per distance record, built from its index, mu and sigma.
  vector<instrumentLIB>   lib;

  for (int32 i=0; i<GetNumDistTs(ScaffoldGraph->Dists), i++) {
    DistT *dptr = GetDistT(ScaffoldGraph->Dists, i);

    lib.push_back(instrumentLIB(i, dptr->mu, dptr->sigma, true));

  GraphNodeIterator   scaffolds;
  CIScaffoldT        *scaffold;

  //  Visit every node of the scaffold graph.
  InitGraphNodeIterator(&scaffolds, ScaffoldGraph->ScaffoldGraph, GRAPH_NODE_DEFAULT);
  while ((scaffold = NextGraphNodeIterator(&scaffolds)) != NULL) {
    //  NOTE(review): the statement guarded by this test is not visible; presumably
    //  it skips scaffolds that are not REAL_SCAFFOLD -- confirm.
    if(scaffold->type != REAL_SCAFFOLD)

    //  Disabled debugging filter that restricted the run to a single scaffold id.
    //if (scaffold->id != 14)
    //  continue;

    instrumentSCF  scf(scaffold);



Пример #12
//  Command-line driver that computes consensus sequence for unitigs ("tigs").
//
//  Tigs come from one of three inputs: a tigStore (-T), a layout/multialign file (-t),
//  or a package file (-p).  For each tig the selected algorithm ('Q'uick, 'P'bdagcon --
//  the default -- or 'U'tgcns) is run, and successful results are written to whichever
//  outputs were requested.  With -P the inputs are packaged instead of being computed.
//
//  Returns non-zero when numFailures is non-zero, zero otherwise.
//
//  NOTE(review): this listing is an excerpt.  Closing braces and several statements
//  (advancing 'arg', err++, break/continue, exit after usage, counting failures,
//  the bodies of some 'if's) are not visible here; the comments describe only what
//  is shown and hedge anything that depends on the missing lines.
main (int argc, char **argv) {
  //  Inputs: read store, tig store (with version and partition), or a tig file.
  char    *gkpName         = NULL;

  char    *tigName         = NULL;
  uint32   tigVers         = UINT32_MAX;
  uint32   tigPart         = UINT32_MAX;

  char    *tigFileName     = NULL;

  //  Optional range of tig IDs to process (-u); UINT32_MAX means "not set".
  uint32   utgBgn          = UINT32_MAX;
  uint32   utgEnd          = UINT32_MAX;

  //  Output file names and their open handles.
  char    *outResultsName  = NULL;
  char    *outLayoutsName  = NULL;
  char    *outSeqNameA     = NULL;
  char    *outSeqNameQ     = NULL;
  char    *outPackageName  = NULL;

  FILE     *outResultsFile = NULL;
  FILE     *outLayoutsFile = NULL;
  FILE     *outSeqFileA    = NULL;
  FILE     *outSeqFileQ    = NULL;
  FILE     *outPackageFile = NULL;

  char    *inPackageName   = NULL;

  //  Algorithm selector: 'Q' (-quick), 'P' (-pbdagcon, default), 'U' (-utgcns).
  char      algorithm      = 'P';
  uint32    numThreads	   = 0;

  bool      forceCompute   = false;

  double    errorRate      = 0.12;
  double    errorRateMax   = 0.40;
  uint32    minOverlap     = 40;

  int32     numFailures    = 0;

  bool      showResult     = false;

  double    maxCov         = 0.0;
  uint32    maxLen         = UINT32_MAX;

  uint32    verbosity      = 0;

  argc = AS_configure(argc, argv);

  //  Option parsing.  Options that take values consume them with ++arg.
  int arg=1;
  int err=0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-G") == 0) {
      gkpName = argv[++arg];

    } else if (strcmp(argv[arg], "-T") == 0) {
      //  -T takes three values: store path, version, partition.
      tigName = argv[++arg];
      tigVers = atoi(argv[++arg]);
      tigPart = atoi(argv[++arg]);

      //  A partition argument starting with '.' means "no partition".
      if (argv[arg][0] == '.')
        tigPart = UINT32_MAX;

      //  atoi() yields 0 for text it cannot convert, so 0 is treated as invalid.
      //  NOTE(review): the messages print '-t' although the option being parsed is -T.
      if (tigVers == 0)
        fprintf(stderr, "invalid tigStore version (-T store version partition) '-t %s %s %s'.\n", argv[arg-2], argv[arg-1], argv[arg]), exit(1);
      if (tigPart == 0)
        fprintf(stderr, "invalid tigStore partition (-T store version partition) '-t %s %s %s'.\n", argv[arg-2], argv[arg-1], argv[arg]), exit(1);

    } else if (strcmp(argv[arg], "-u") == 0) {
      AS_UTL_decodeRange(argv[++arg], utgBgn, utgEnd);

    } else if (strcmp(argv[arg], "-t") == 0) {
      tigFileName = argv[++arg];

    } else if (strcmp(argv[arg], "-O") == 0) {
      outResultsName = argv[++arg];

    } else if (strcmp(argv[arg], "-L") == 0) {
      outLayoutsName = argv[++arg];

    } else if (strcmp(argv[arg], "-A") == 0) {
      outSeqNameA = argv[++arg];

    } else if (strcmp(argv[arg], "-Q") == 0) {
      outSeqNameQ = argv[++arg];

    } else if (strcmp(argv[arg], "-quick") == 0) {
      algorithm = 'Q';
    } else if (strcmp(argv[arg], "-pbdagcon") == 0) {
      algorithm = 'P';
    } else if (strcmp(argv[arg], "-utgcns") == 0) {
      algorithm = 'U';

    } else if (strcmp(argv[arg], "-threads") == 0) {
      numThreads = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-p") == 0) {
      inPackageName = argv[++arg];

    } else if (strcmp(argv[arg], "-P") == 0) {
      outPackageName = argv[++arg];

    } else if (strcmp(argv[arg], "-e") == 0) {
      errorRate = atof(argv[++arg]);

    } else if (strcmp(argv[arg], "-em") == 0) {
      errorRateMax = atof(argv[++arg]);

    } else if (strcmp(argv[arg], "-l") == 0) {
      minOverlap = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-f") == 0) {
      forceCompute = true;

    } else if (strcmp(argv[arg], "-v") == 0) {
      showResult = true;

    } else if (strcmp(argv[arg], "-V") == 0) {
      //  NOTE(review): no body is visible for -V; presumably it raises 'verbosity' -- confirm.

    } else if (strcmp(argv[arg], "-maxcoverage") == 0) {
      maxCov   = atof(argv[++arg]);

    } else if (strcmp(argv[arg], "-maxlength") == 0) {
      //  The value is parsed as a double and then stored into the uint32 maxLen.
      maxLen   = atof(argv[++arg]);

    } else {
      fprintf(stderr, "%s: Unknown option '%s'\n", argv[0], argv[arg]);


  //  Required-input checks: a read source (-G or -p) and a tig source (-T, -t or -p).
  //  NOTE(review): the statements guarded by these two tests are not visible;
  //  presumably each flags an error -- confirm.
  if ((gkpName == NULL) && (inPackageName == NULL))

  if ((tigFileName == NULL) && (tigName == NULL) && (inPackageName == NULL))

  //  On any error, print the full usage text followed by the specific complaints.
  if (err) {
    fprintf(stderr, "usage: %s [opts]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "  INPUT\n");
    fprintf(stderr, "    -G g            Load reads from gkStore 'g'\n");
    fprintf(stderr, "    -T t v p        Load unitigs from tgStore 't', version 'v', partition 'p'.\n");
    fprintf(stderr, "                      Expects reads will be in gkStore partition 'p' as well\n");
    fprintf(stderr, "                      Use p='.' to specify no partition\n");
    fprintf(stderr, "    -t file         Test the computation of the unitig layout in 'file'\n");
    fprintf(stderr, "                      'file' can be from:\n");
    fprintf(stderr, "                        'tgStoreDump -d layout' (human readable layout format)\n");
    fprintf(stderr, "                        'utgcns -L'             (human readable layout format)\n");
    fprintf(stderr, "                        'utgcns -O'             (binary multialignment format)\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "    -p package      Load unitig and read from 'package' created with -P.  This\n");
    fprintf(stderr, "                    is usually used by developers.\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  ALGORITHM\n");
    fprintf(stderr, "    -quick          No alignments, just paste read sequence into the unitig positions.\n");
    fprintf(stderr, "                    This is very fast, but the consensus sequence is formed from a mosaic\n");
    fprintf(stderr, "                    of read sequences, and there can be large indel.  This is useful for\n");
    fprintf(stderr, "                    checking intermediate assembly structure by mapping to reference, or\n");
    fprintf(stderr, "                    possibly for use as input to a polishing step.\n");
    fprintf(stderr, "    -pbdagcon       Use pbdagcon (https://github.com/PacificBiosciences/pbdagcon).\n");
    fprintf(stderr, "                    This is fast and robust.  It is the default algorithm.  It does not\n");
    fprintf(stderr, "                    generate a final multialignment output (the -v option will not show\n");
    fprintf(stderr, "                    anything useful).\n");
    fprintf(stderr, "    -utgcns         Use utgcns (the original Celera Assembler consensus algorithm)\n");
    fprintf(stderr, "                    This isn't as fast, isn't as robust, but does generate a final multialign\n");
    fprintf(stderr, "                    output.\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  OUTPUT\n");
    fprintf(stderr, "    -O results      Write computed tigs to binary output file 'results'\n");
    fprintf(stderr, "    -L layouts      Write computed tigs to layout output file 'layouts'\n");
    fprintf(stderr, "    -A fasta        Write computed tigs to fasta  output file 'fasta'\n");
    fprintf(stderr, "    -Q fastq        Write computed tigs to fastq  output file 'fastq'\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "    -P package      Create a copy of the inputs needed to compute the unitigs.  This\n");
    fprintf(stderr, "                    file can then be sent to the developers for debugging.  The unitig(s)\n");
    fprintf(stderr, "                    are not processed and no other outputs are created.  Ideally,\n");
    fprintf(stderr, "                    only one unitig is selected (-u, below).\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  TIG SELECTION (if -T input is used)\n");
    fprintf(stderr, "    -u b            Compute only unitig ID 'b' (must be in the correct partition!)\n");
    fprintf(stderr, "    -u b-e          Compute only unitigs from ID 'b' to ID 'e'\n");
    fprintf(stderr, "    -f              Recompute unitigs that already have a multialignment\n");
    fprintf(stderr, "    -maxlength l    Do not compute consensus for unitigs longer than l bases.\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  PARAMETERS\n");
    fprintf(stderr, "    -e e            Expect alignments at up to fraction e error\n");
    fprintf(stderr, "    -em m           Don't ever allow alignments more than fraction m error\n");
    fprintf(stderr, "    -l l            Expect alignments of at least l bases\n");
    fprintf(stderr, "    -maxcoverage c  Use non-contained reads and the longest contained reads, up to\n");
    fprintf(stderr, "                    C coverage, for consensus generation.  The default is 0, and will\n");
    fprintf(stderr, "                    use all reads.\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  LOGGING\n");
    fprintf(stderr, "    -v              Show multialigns.\n");
    fprintf(stderr, "    -V              Enable debugging option 'verbosemultialign'.\n");
    fprintf(stderr, "\n");

    if ((gkpName == NULL) && (inPackageName == NULL))
      fprintf(stderr, "ERROR:  No gkpStore (-G) and no package (-p) supplied.\n");

    if ((tigFileName == NULL) && (tigName == NULL)  && (inPackageName == NULL))
      fprintf(stderr, "ERROR:  No tigStore (-T) OR no test unitig (-t) OR no package (-p)  supplied.\n");


  //  errno is cleared once here; every 'if (errno)' below relies on it staying zero
  //  until an fopen() fails.
  //  NOTE(review): each errno test runs even when its fopen() was skipped, and fopen()
  //  success is judged by errno rather than by a NULL return -- confirm this is intended.
  errno = 0;

  //  Open output files.  If we're creating a package, the usual output files are not opened.

  if (outPackageName)
    outPackageFile = fopen(outPackageName, "w");
  if (errno)
    fprintf(stderr, "Failed to open output package file '%s': %s\n", outPackageName, strerror(errno)), exit(1);

  if ((outResultsName) && (outPackageName == NULL))
    outResultsFile = fopen(outResultsName, "w");
  if (errno)
    fprintf(stderr, "Failed to open output results file '%s': %s\n", outResultsName, strerror(errno)), exit(1);

  if ((outLayoutsName) && (outPackageName == NULL))
    outLayoutsFile = fopen(outLayoutsName, "w");
  if (errno)
    fprintf(stderr, "Failed to open output layout file '%s': %s\n", outLayoutsName, strerror(errno)), exit(1);

  if ((outSeqNameA) && (outPackageName == NULL))
    outSeqFileA = fopen(outSeqNameA, "w");
  if (errno)
    fprintf(stderr, "Failed to open output FASTA file '%s': %s\n", outSeqNameA, strerror(errno)), exit(1);

  if ((outSeqNameQ) && (outPackageName == NULL))
    outSeqFileQ = fopen(outSeqNameQ, "w");
  if (errno)
    fprintf(stderr, "Failed to open output FASTQ file '%s': %s\n", outSeqNameQ, strerror(errno)), exit(1);

  //  Report the thread count: the -threads value if given, else the OpenMP maximum.
  //  NOTE(review): no call applying numThreads to OpenMP is visible in this excerpt.
  if (numThreads > 0) {
    fprintf(stderr, "number of threads     = %d (command line)\n", numThreads);
    fprintf(stderr, "\n");
  } else {
    fprintf(stderr, "number of threads     = %d (OpenMP default)\n", omp_get_max_threads());
    fprintf(stderr, "\n");

  //  Open gatekeeper for read only, and load the partitioned data if tigPart > 0.

  gkStore                   *gkpStore          = NULL;
  tgStore                   *tigStore          = NULL;
  FILE                      *tigFile           = NULL;
  FILE                      *inPackageFile     = NULL;
  map<uint32, gkRead *>     *inPackageRead     = NULL;
  map<uint32, gkReadData *> *inPackageReadData = NULL;

  if (gkpName) {
    fprintf(stderr, "-- Opening gkpStore '%s' partition %u.\n", gkpName, tigPart);
    gkpStore = gkStore::gkStore_open(gkpName, gkStore_readOnly, tigPart);

  if (tigName) {
    fprintf(stderr, "-- Opening tigStore '%s' version %u.\n", tigName, tigVers);
    tigStore = new tgStore(tigName, tigVers);

  if (tigFileName) {
    fprintf(stderr, "-- Opening tigFile '%s'.\n", tigFileName);

    errno = 0;
    tigFile = fopen(tigFileName, "r");
    if (errno)
      fprintf(stderr, "Failed to open input tig file '%s': %s\n", tigFileName, strerror(errno)), exit(1);

  if (inPackageName) {
    fprintf(stderr, "-- Opening package file '%s'.\n", inPackageName);

    errno = 0;
    inPackageFile = fopen(inPackageName, "r");
    if (errno)
      fprintf(stderr, "Failed to open input package file '%s': %s\n", inPackageName, strerror(errno)), exit(1);

  //  Report some sizes.

  fprintf(stderr, "sizeof(abBead)     "F_SIZE_T"\n", sizeof(abBead));
  fprintf(stderr, "sizeof(abColumn)   "F_SIZE_T"\n", sizeof(abColumn));
  fprintf(stderr, "sizeof(abAbacus)   "F_SIZE_T"\n", sizeof(abAbacus));
  fprintf(stderr, "sizeof(abSequence) "F_SIZE_T"\n", sizeof(abSequence));

  //  Decide on what to compute.  Either all unitigs, or a single unitig, or a special case test.

  //  [b, e] is the inclusive range of tig IDs to visit.  e == UINT32_MAX means
  //  "no upper bound" (used when tigs are streamed from a file or package).
  uint32  b = 0;
  uint32  e = UINT32_MAX;

  if (tigStore) {
    //  Clamp the requested end to the last tig in the store.
    if (utgEnd > tigStore->numTigs() - 1)
      utgEnd = tigStore->numTigs() - 1;

    if (utgBgn != UINT32_MAX) {
      b = utgBgn;
      e = utgEnd;

    } else {
      b = 0;
      e = utgEnd;

    fprintf(stderr, "-- Computing unitig consensus for b="F_U32" to e="F_U32" with errorRate %0.4f (max %0.4f) and minimum overlap "F_U32"\n",
            b, e, errorRate, errorRateMax, minOverlap);

  else {
    fprintf(stderr, "-- Computing unitig consensus with errorRate %0.4f (max %0.4f) and minimum overlap "F_U32"\n",
            errorRate, errorRateMax, minOverlap);

  fprintf(stderr, "\n");

  //  I don't like this loop control.
  //  With e == UINT32_MAX the condition is always true, so the loop must be left from
  //  inside the body.  NOTE(review): that exit is not visible here; presumably it is a
  //  break when loading from the stream fails -- confirm.

  for (uint32 ti=b; (e == UINT32_MAX) || (ti <= e); ti++) {
    tgTig  *tig = NULL;

    //  If a tigStore, load the tig.  The tig is the owner; it cannot be deleted by us.
    if (tigStore)
      tig = tigStore->loadTig(ti);

    //  If a tigFile or a package, create a new tig and fill it.  Obviously, we own it.
    if (tigFile || inPackageFile) {
      tig = new tgTig();

      if (tig->loadFromStreamOrLayout((tigFile != NULL) ? tigFile : inPackageFile) == false) {
        delete tig;

    //  No tig loaded, keep going.

    if (tig == NULL)

    //  If a package, populate the read and readData maps with data from the package.
    //  NOTE(review): fresh maps (and their gkRead/gkReadData entries) are allocated for
    //  every tig; no matching delete is visible in this excerpt -- confirm.

    if (inPackageFile) {
      inPackageRead      = new map<uint32, gkRead *>;
      inPackageReadData  = new map<uint32, gkReadData *>;

      for (int32 ii=0; ii<tig->numberOfChildren(); ii++) {
        uint32       readID = tig->getChild(ii)->ident();
        gkRead      *read   = (*inPackageRead)[readID]     = new gkRead;
        gkReadData  *data   = (*inPackageReadData)[readID] = new gkReadData;

        gkStore::gkStore_loadReadFromStream(inPackageFile, read, data);

        //  The package must list reads in the same order as the tig's children.
        if (read->gkRead_readID() != readID)
          fprintf(stderr, "ERROR: package not in sync with tig.  package readID = %u  tig readID = %u\n",
                  read->gkRead_readID(), readID);
        assert(read->gkRead_readID() == readID);

    //  More 'not liking' - set the verbosity level for logging.

    tig->_utgcns_verboseLevel = verbosity;

    //  Are we partitioned?  Is this tig in our partition?

    if (tigPart != UINT32_MAX) {
      uint32  missingReads = 0;

      //  NOTE(review): the statement guarded by the inner test is not visible;
      //  presumably it counts reads absent from this partition -- confirm.
      for (uint32 ii=0; ii<tig->numberOfChildren(); ii++)
        if (gkpStore->gkStore_getReadInPartition(tig->getChild(ii)->ident()) == NULL)

      if (missingReads) {
        //fprintf(stderr, "SKIP unitig %u with %u reads found only %u reads in partition, skipped\n",
        //        tig->tigID(), tig->numberOfChildren(), tig->numberOfChildren() - missingReads);

    //  Tigs longer than -maxlength, and tigs with no children, are reported as skipped.
    if (tig->length(true) > maxLen) {
      fprintf(stderr, "SKIP unitig %d of length %d (%d children) - too long, skipped\n",
              tig->tigID(), tig->length(true), tig->numberOfChildren());

    if (tig->numberOfChildren() == 0) {
      fprintf(stderr, "SKIP unitig %d of length %d (%d children) - no children, skipped\n",
              tig->tigID(), tig->length(true), tig->numberOfChildren());

    bool exists   = tig->consensusExists();

    //  Progress is only logged for tigs with more than one child.
    if (tig->numberOfChildren() > 1)
      fprintf(stderr, "Working on unitig %d of length %d (%d children)%s%s\n",
              tig->tigID(), tig->length(true), tig->numberOfChildren(),
              ((exists == true)  && (forceCompute == false)) ? " - already computed"              : "",
              ((exists == true)  && (forceCompute == true))  ? " - already computed, recomputing" : "");

    //  Process the tig.  Remove deep coverage, create a consensus object, process it, and report the results.
    //  before we add it to the store.

    //  'success' starts out true when a consensus already exists.
    unitigConsensus  *utgcns       = new unitigConsensus(gkpStore, errorRate, errorRateMax, minOverlap);
    savedChildren    *origChildren = NULL;
    bool              success      = exists;

    //  Save the tig in the package?
    //  The original idea was to dump the tig and all the reads, then load the tig and process as normal.
    //  Sadly, stashContains() rearranges the order of the reads even if it doesn't remove any.  The rearranged
    //  tig couldn't be saved (otherwise it would be rearranged again).  So, we were in the position of
    //  needing to save the original tig and the rearranged reads.  Impossible.
    //  Instead, we save the original tig and original reads -- including any that get stashed -- then
    //  load them all back into a map for use in consensus proper.  It's a bit of a pain, and could
    //  have way more reads saved than necessary.

    if (outPackageFile) {
      utgcns->savePackage(outPackageFile, tig);
      fprintf(stderr, "  Packaged unitig %u into '%s'\n", tig->tigID(), outPackageName);

    //  Compute consensus if it doesn't exist, or if we're forcing a recompute.  But only if we
    //  didn't just package it.

    if ((outPackageFile == NULL) &&
        ((exists == false) || (forceCompute == true))) {
      origChildren = stashContains(tig, maxCov, true);

      //  NOTE(review): no 'break' is visible between the cases.  If they are truly
      //  absent, 'Q' and 'P' fall through and the result of generate() wins; more
      //  likely the breaks were elided from this excerpt -- confirm.
      switch (algorithm) {
        case 'Q':
          success = utgcns->generateQuick(tig, inPackageRead, inPackageReadData);
        case 'P':
          success = utgcns->generatePBDAG(tig, inPackageRead, inPackageReadData);
        case 'U':
          success = utgcns->generate(tig, inPackageRead, inPackageReadData);

    //  If it was successful (or existed already), output.  Success is always false if the unitig
    //  was packaged, regardless of if it existed already.
    //  NOTE(review): the visible package branch does not reset 'success'; presumably an
    //  elided line does -- confirm.

    if (success == true) {
      if ((showResult) && (gkpStore))  //  No gkpStore if we're from a package.  Dang.
        tig->display(stdout, gkpStore, 200, 3);

      unstashContains(tig, origChildren);

      //  NOTE(review): the write calls for the results and layouts files are not visible.
      if (outResultsFile)

      if (outLayoutsFile)

      if (outSeqFileA)
        tig->dumpFASTA(outSeqFileA, true);

      if (outSeqFileQ)
        tig->dumpFASTQ(outSeqFileQ, true);

    //  Report failures.

    if ((success == false) && (outPackageFile == NULL)) {
      fprintf(stderr, "unitigConsensus()-- unitig %d failed.\n", tig->tigID());

    //  Clean up, unloading or deleting the tig.

    delete utgcns;        //  No real reason to keep this until here.
    delete origChildren;  //  Need to keep it until after we display() above.

    if (tigStore)
      tigStore->unloadTig(tig->tigID(), true);  //  Tell the store we're done with it

    //  NOTE(review): a tig is also allocated with 'new' when reading from a package
    //  (inPackageFile), but only the tigFile case deletes it here -- confirm.
    if (tigFile)
      delete tig;

  delete tigStore;


  //  NOTE(review): outSeqFileA and outSeqFileQ are not closed in the visible lines,
  //  and no release of gkpStore is visible either -- confirm.
  if (tigFile)         fclose(tigFile);
  if (outResultsFile)  fclose(outResultsFile);
  if (outLayoutsFile)  fclose(outLayoutsFile);
  if (outPackageFile)  fclose(outPackageFile);
  if (inPackageFile)   fclose(inPackageFile);

  if (numFailures) {
    fprintf(stderr, "WARNING:  Total number of unitig failures = %d\n", numFailures);
    fprintf(stderr, "\n");
    fprintf(stderr, "Consensus did NOT finish successfully.\n");

  } else {
    fprintf(stderr, "Consensus finished successfully.  Bye.\n");

  //  Exit status is 1 when any unitig failed, 0 otherwise.
  return(numFailures != 0);
Пример #13
//  Command-line driver for filtering raw binary overlap files (not the overlap store)
//  by length, error, dovetail or containment, as described by its usage text.
//
//  The filter settings (minLength, maxError, noContainment, noDovetail, gkpStoreName,
//  numReads, readLength) are not declared in this function; they are defined outside
//  the visible block.
//
//  NOTE(review): this listing is an excerpt.  Closing braces and several statements
//  (advancing 'arg', err++, exit after usage, the bodies of the format switch) are
//  not visible here; the comments describe only what is shown.
main(int argc, char **argv) {
  //  NOTE(review): toBinary and toASCII are not used in the visible lines.
  int    toBinary      = 0;
  int    toASCII       = 0;
  int    format        = FORMAT_NONE;

  argc = AS_configure(argc, argv);

  //  Default error threshold is the encoded form of 1.0.
  maxError = AS_OVS_encodeQuality(1.0);

  int arg=1;
  int err=0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-ovl") == 0) {
      format = FORMAT_OVL;
    } else if (strcmp(argv[arg], "-obt") == 0) {
      format = FORMAT_OBT;

    //  The following options are matched on a prefix only: strncmp() compares the
    //  first 5 characters ("-minl", "-maxe") or the first 4 ("-noc", "-nod"), so any
    //  argument starting with that prefix selects the option.
    } else if (strncmp(argv[arg], "-minlength", 5) == 0) {
      minLength = atoi(argv[++arg]);
    } else if (strncmp(argv[arg], "-maxerror", 5) == 0) {
      double e = atof(argv[++arg]);
      maxError = AS_OVS_encodeQuality(e);
    } else if (strncmp(argv[arg], "-nocontainment", 4) == 0) {
      //  aka, only dovetail
      noContainment = 1;
    } else if (strncmp(argv[arg], "-nodovetail", 4) == 0) {
      //  aka, only containment
      noDovetail = 1;

    } else if (strcmp(argv[arg], "-gkp") == 0) {
      gkpStoreName = argv[++arg];

    } else {
      fprintf(stderr, "%s: unknown option '%s'\n", argv[0], argv[arg]);
  //  A format (-ovl or -obt) is mandatory; without one, or on any parse error,
  //  the usage text is printed.
  if ((format == FORMAT_NONE) ||
      (err)) {
    fprintf(stderr, "usage: %s [-ovl | -obt] < input > output\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "Filters overlaps (raw binary files, not the store) based on\n");
    fprintf(stderr, "length, error, dovetail or containment.\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -ovl             -- overlaps are OVL\n");
    fprintf(stderr, "  -obt             -- overlaps are OBT\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -minlength l     -- throw out overlaps shorter than l\n");
    fprintf(stderr, "  -maxerror e      -- throw out overlaps with more than fraction e error\n");
    fprintf(stderr, "  -nocontainment   -- throw out containment overlaps\n");
    fprintf(stderr, "  -nodovetail      -- throw out dovetail overlaps\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -gkp gkpstore     Needed for -ovl or -nocontainment or -nodovetail\n");

  //  Read lengths and clear ranges are only loaded when a filter needs them.
  //  NOTE(review): gkpStoreName is used here without a visible check that -gkp was given.
  if (noDovetail || noContainment || (format == FORMAT_OVL)) {
    gkStore       *gkp = new gkStore(gkpStoreName, FALSE, FALSE);
    gkStream      *fs  = new gkStream(gkp, 0, 0, GKFRAGMENT_INF);
    gkFragment     fr;

    numReads   = gkp->gkStore_getNumFragments();
    readLength = (fragInfo *)safe_malloc(sizeof(fragInfo) * numReads);

    fprintf(stderr, "Reading gkpStore to get clear ranges for "F_U32" reads.\n", numReads);

    //  Record length and clear range for every fragment, indexed by its IID.
    //  NOTE(review): readLength holds numReads entries; confirm every IID is
    //  strictly less than numReads, otherwise the last write is out of bounds.
    while (fs->next(&fr)) {
      AS_IID iid = fr.gkFragment_getReadIID();
      readLength[iid].len = fr.gkFragment_getSequenceLength();
      readLength[iid].beg = fr.gkFragment_getClearRegionBegin();
      readLength[iid].end = fr.gkFragment_getClearRegionEnd  ();

    delete fs;
    delete gkp;

  fprintf(stderr, "WARNING:\n");
  fprintf(stderr, "WARNING:  This has not been fully tested.  Only -obt -minlength\n");
  fprintf(stderr, "WARNING:  is guaranteed (unless it doesn't work).  Though,\n");
  fprintf(stderr, "WARNING:  -maxerror is pretty trivial.\n");
  fprintf(stderr, "WARNING:\n");

  //  Dispatch on the overlap format.
  //  NOTE(review): the per-format bodies and the label for the final message are not
  //  visible; presumably the message belongs to a default case -- confirm.
  switch (format) {
    case FORMAT_OVL:
    case FORMAT_OBT:
      fprintf(stderr, "%s: unknown format (%d)?!?!\n", argv[0], format);

Пример #14
main(int argc, char **argv) {
  char  bolfile_name[FILENAME_MAX] = {0};
  char  Outfile_Name[FILENAME_MAX] = {0};
  int  illegal;
  char  * p;

  argc = AS_configure(argc, argv);
  Min_Olap_Len = AS_OVERLAP_MIN_LEN; // set after configure

  int err=0;
  int arg=1;
  while (arg < argc) {
    if (strcmp(argv[arg], "-G") == 0) {
      Doing_Partial_Overlaps = TRUE;
    } else if (strcmp(argv[arg], "-h") == 0) {
      AS_UTL_decodeRange(argv[++arg], Lo_Hash_Frag, Hi_Hash_Frag);

    } else if (strcmp(argv[arg], "-H") == 0) {
      AS_UTL_decodeRange(argv[++arg], minLibToHash, maxLibToHash);

    } else if (strcmp(argv[arg], "-R") == 0) {
      AS_UTL_decodeRange(argv[++arg], minLibToRef, maxLibToRef);

    } else if (strcmp(argv[arg], "-k") == 0) {
      if ((isdigit(argv[arg][0]) && (argv[arg][1] == 0)) ||
          (isdigit(argv[arg][0]) && isdigit(argv[arg][1]) && (argv[arg][2] == 0))) {
        Kmer_Len = strtoull(argv[arg], NULL, 10);
      } else {
        errno = 0;
        Kmer_Skip_File = fopen(argv[arg], "r");
        if (errno)
          fprintf(stderr, "ERROR: Failed to open -k '%s': %s\n", argv[arg], strerror(errno)), exit(1);

    } else if (strcmp(argv[arg], "-l") == 0) {
      Frag_Olap_Limit = strtol(argv[++arg], NULL, 10);
      if  (Frag_Olap_Limit < 1)
        Frag_Olap_Limit = INT_MAX;

    } else if (strcmp(argv[arg], "-m") == 0) {
      Unique_Olap_Per_Pair = FALSE;

    } else if (strcmp(argv[arg], "--hashbits") == 0) {
      Hash_Mask_Bits = strtoull(argv[++arg], NULL, 10);

    } else if (strcmp(argv[arg], "--hashstrings") == 0) {
      Max_Hash_Strings = strtoull(argv[++arg], NULL, 10);

    } else if (strcmp(argv[arg], "--hashdatalen") == 0) {
      Max_Hash_Data_Len = strtoull(argv[++arg], NULL, 10);

    } else if (strcmp(argv[arg], "--hashload") == 0) {
      Max_Hash_Load = atof(argv[++arg]);

    } else if (strcmp(argv[arg], "--maxreadlen") == 0) {
      //  Quite the gross way to do this, but simple.
      uint32 desired = strtoul(argv[++arg], NULL, 10);
      OFFSET_BITS = 1;
      while (((uint32)1 << OFFSET_BITS) < desired)

      STRING_NUM_BITS       = 30 - OFFSET_BITS;

      STRING_NUM_MASK       = (1 << STRING_NUM_BITS) - 1;
      OFFSET_MASK           = (1 << OFFSET_BITS) - 1;


    } else if (strcmp(argv[arg], "--readsperbatch") == 0) {
      Max_Reads_Per_Batch = strtoul(argv[++arg], NULL, 10);

    } else if (strcmp(argv[arg], "--readsperthread") == 0) {
      Max_Reads_Per_Thread = strtoul(argv[++arg], NULL, 10);

    } else if (strcmp(argv[arg], "-o") == 0) {
      strcpy(Outfile_Name, argv[++arg]);

    } else if (strcmp(argv[arg], "-r") == 0) {
      AS_UTL_decodeRange(argv[++arg], Lo_Old_Frag, Hi_Old_Frag);

    } else if (strcmp(argv[arg], "-t") == 0) {
      Num_PThreads = strtoull(argv[++arg], NULL, 10);

    } else if (strcmp(argv[arg], "-u") == 0) {
      Unique_Olap_Per_Pair = TRUE;

    } else if (strcmp(argv[arg], "-v") == 0) {
      Min_Olap_Len = (int) strtol (argv[++arg], & p, 10);

    } else if (strcmp(argv[arg], "-w") == 0) {
      Use_Window_Filter = TRUE;

    } else if (strcmp(argv[arg], "-x") == 0) {
      Ignore_Clear_Range = TRUE;

    } else if (strcmp(argv[arg], "-z") == 0) {
      Use_Hopeless_Check = FALSE;

    } else {
      if (Frag_Store_Path == NULL) {
        Frag_Store_Path = argv[arg];
      } else {
        fprintf(stderr, "Unknown option '%s'\n", argv[arg]);

  //  Fix up some flags if we're allowing high error rates.
  if (AS_OVL_ERROR_RATE > 0.06) {
    if (Use_Window_Filter)
      fprintf(stderr, "High error rates requested -- window-filter turned off despite -w flag!\n");
    Use_Window_Filter  = FALSE;
    Use_Hopeless_Check = FALSE;

  if (Max_Hash_Strings == 0)
    fprintf(stderr, "* No memory model supplied; -M needed!\n"), err++;

  if (Kmer_Len == 0)
    fprintf(stderr, "* No kmer length supplied; -k needed!\n"), err++;

  if (Max_Hash_Strings > MAX_STRING_NUM)
    fprintf(stderr, "Too many strings (--hashstrings), must be less than "F_U64"\n", MAX_STRING_NUM), err++;

  if (Outfile_Name[0] == 0)
    fprintf (stderr, "ERROR:  No output file name specified\n"), err++;

  if ((err) || (Frag_Store_Path == NULL)) {
    fprintf(stderr, "USAGE:  %s [options] <gkpStorePath>\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "-b <fn>     in contig mode, specify the output file\n");
    fprintf(stderr, "-c          contig mode.  Use 2 frag stores.  First is\n");
    fprintf(stderr, "            for reads; second is for contigs\n");
    fprintf(stderr, "-G          do partial overlaps\n");
    fprintf(stderr, "-h <range>  to specify fragments to put in hash table\n");
    fprintf(stderr, "            Implies LSF mode (no changes to frag store)\n");
    fprintf(stderr, "-I          designate a file of frag iids to limit olaps to\n");
    fprintf(stderr, "            (Contig mode only)\n");
    fprintf(stderr, "-k          if one or two digits, the length of a kmer, otherwise\n");
    fprintf(stderr, "            the filename containing a list of kmers to ignore in\n");
    fprintf(stderr, "            the hash table\n");
    fprintf(stderr, "-l          specify the maximum number of overlaps per\n");
    fprintf(stderr, "            fragment-end per batch of fragments.\n");
    fprintf(stderr, "-m          allow multiple overlaps per oriented fragment pair\n");
    fprintf(stderr, "-M          specify memory size.  Valid values are '8GB', '4GB',\n");
    fprintf(stderr, "            '2GB', '1GB', '256MB'.  (Not for Contig mode)\n");
    fprintf(stderr, "-o          specify output file name\n");
    fprintf(stderr, "-P          write protoIO output (if not -G)\n");
    fprintf(stderr, "-r <range>  specify old fragments to overlap\n");
    fprintf(stderr, "-s          ignore screen information with fragments\n");
    fprintf(stderr, "-t <n>      use <n> parallel threads\n");
    fprintf(stderr, "-u          allow only 1 overlap per oriented fragment pair\n");
    fprintf(stderr, "-v <n>      only output overlaps of <n> or more bases\n");
    fprintf(stderr, "-w          filter out overlaps with too many errors in a window\n");
    fprintf(stderr, "-x          ignore the clear ranges on reads and use the \n");
    fprintf(stderr, "            full sequence\n");
    fprintf(stderr, "-z          skip the hopeless check\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "--hashbits n       Use n bits for the hash mask.\n");
    fprintf(stderr, "--hashstrings n    Load at most n strings into the hash table at one time.\n");
    fprintf(stderr, "--hashdatalen n    Load at most n bytes into the hash table at one time.\n");
    fprintf(stderr, "--hashload f       Load to at most 0.0 < f < 1.0 capacity (default 0.7).\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "--maxreadlen n     For batches with all short reads, pack bits differently to\n");
    fprintf(stderr, "                   process more reads per batch.\n");
    fprintf(stderr, "                     all reads must be shorter than n\n");
    fprintf(stderr, "                     --hashstrings limited to 2^(30-m)\n");
    fprintf(stderr, "                   Common values:\n");
    fprintf(stderr, "                     maxreadlen 2048 -> hashstrings  524288 (default)\n");
    fprintf(stderr, "                     maxreadlen  512 -> hashstrings 2097152\n");
    fprintf(stderr, "                     maxreadlen  128 -> hashstrings 8388608\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "--readsperbatch n  Force batch size to n.\n");
    fprintf(stderr, "--readsperthread n Force each thread to process n reads.\n");
    fprintf(stderr, "\n");

  assert(NULL == Out_BOF);

  Out_BOF    = AS_OVS_createBinaryOverlapFile(Outfile_Name, FALSE);

  //  Adjust the number of reads to load into memory at once (for processing, not the hash table),

  if (Max_Reads_Per_Batch == 0)
    Max_Reads_Per_Batch = (Max_Hash_Strings < 100000) ? Max_Hash_Strings : 100000;

  //if (Max_Hash_Strings < Max_Reads_Per_Batch)
  //  Max_Reads_Per_Batch = Max_Hash_Strings;

  //  Adjust the number of reads processed per thread.  Default to having four blocks per thread,
  //  but make sure that (a) all threads have work to do, and (b) batches are not minuscule.

  if (Max_Reads_Per_Thread == 0)
    Max_Reads_Per_Thread = Max_Reads_Per_Batch / (4 * Num_PThreads);

  if (Max_Reads_Per_Thread * Num_PThreads > Max_Reads_Per_Batch)
    Max_Reads_Per_Thread = Max_Reads_Per_Batch / Num_PThreads + 1;

  if (Max_Reads_Per_Thread < 10)
    Max_Reads_Per_Thread = 10;

  //  We know enough now to set the hash function variables, and some other random variables.

  HSF1 = Kmer_Len - (Hash_Mask_Bits / 2);
  HSF2 = 2 * Kmer_Len - Hash_Mask_Bits;
  SV1  = HSF1 + 2;
  SV2  = (HSF1 + HSF2) / 2;
  SV3  = HSF2 - 2;

  Branch_Match_Value = (Doing_Partial_Overlaps) ? PARTIAL_BRANCH_MATCH_VAL : DEFAULT_BRANCH_MATCH_VAL;
  Branch_Error_Value = Branch_Match_Value - 1.0;

  fprintf(stderr, "\n");
  fprintf(stderr, "STRING_NUM_BITS       "F_U32"\n", STRING_NUM_BITS);
  fprintf(stderr, "OFFSET_BITS           "F_U32"\n", OFFSET_BITS);
  fprintf(stderr, "STRING_NUM_MASK       "F_U64"\n", STRING_NUM_MASK);
  fprintf(stderr, "OFFSET_MASK           "F_U64"\n", OFFSET_MASK);
  fprintf(stderr, "MAX_STRING_NUM        "F_U64"\n", MAX_STRING_NUM);
  fprintf(stderr, "\n");
  fprintf(stderr, "Hash_Mask_Bits        "F_U32"\n", Hash_Mask_Bits);
  fprintf(stderr, "Max_Hash_Strings      "F_U32"\n", Max_Hash_Strings);
  fprintf(stderr, "Max_Hash_Data_Len     "F_U64"\n", Max_Hash_Data_Len);
  fprintf(stderr, "Max_Hash_Load         %f\n", Max_Hash_Load);
  fprintf(stderr, "Kmer Length           %d\n", (int)Kmer_Len);
  fprintf(stderr, "Min Overlap Length    %d\n", Min_Olap_Len);
  fprintf(stderr, "MAX_ERRORS            %d\n", MAX_ERRORS);
  fprintf(stderr, "ERRORS_FOR_FREE       %d\n", ERRORS_FOR_FREE);
  fprintf(stderr, "\n");
  fprintf(stderr, "Num_PThreads          "F_U32"\n", Num_PThreads);
  fprintf(stderr, "Max_Reads_Per_Batch   "F_U32"\n", Max_Reads_Per_Batch);
  fprintf(stderr, "Max_Reads_Per_Thread  "F_U32"\n", Max_Reads_Per_Thread);

  assert (8 * sizeof (uint64) > 2 * Kmer_Len);

  Initialize_Globals ();

  OldFragStore = new gkStore(Frag_Store_Path, FALSE, FALSE);


  fprintf (stderr, " Kmer hits without olaps = "F_S64"\n", Kmer_Hits_Without_Olap_Ct);
  fprintf (stderr, "    Kmer hits with olaps = "F_S64"\n", Kmer_Hits_With_Olap_Ct);
  fprintf (stderr, "  Multiple overlaps/pair = "F_S64"\n", Multi_Overlap_Ct);
  fprintf (stderr, " Total overlaps produced = "F_S64"\n", Total_Overlaps);
  fprintf (stderr, "      Contained overlaps = "F_S64"\n", Contained_Overlap_Ct);
  fprintf (stderr, "       Dovetail overlaps = "F_S64"\n", Dovetail_Overlap_Ct);
  fprintf (stderr, "Rejected by short window = "F_S64"\n", Bad_Short_Window_Ct);
  fprintf (stderr, " Rejected by long window = "F_S64"\n", Bad_Long_Window_Ct);

  delete OldFragStore;


Пример #15
//  Driver for overlap-based read trimming.  The only detector visibly invoked below is
//  detectSubReads(); the markBad, detectSpur and detectChimer calls are commented out.
//
//  Visible flow:
//    1. Parse options: -G gkpStore, -O ovlStore, -o outputPrefix, -t bgn-end,
//       -Ci / -Co clear range files, -e errorRate, -minlength minReadLength.
//    2. Open the read store, the overlap store and both clear range files, then open
//       '<outputPrefix>.log' and '<outputPrefix>.subread.log' for writing.
//    3. For each read id in [idMin, idMax]: load its overlaps, filter them into the
//       workUnit, look for bad regions, call trimBadInterval(), and store the resulting
//       clear range in outClr.
//    4. Write a summary to '<outputPrefix>.stats', falling back to stdout when that file
//       could not be opened.
//
//  NOTE(review): this listing looks abridged -- closing braces, the bodies of several 'if'
//  statements, and the #endif lines for the two '#if 0' regions are not visible.  The
//  comments added here describe only what the visible lines establish.
main(int argc, char **argv) {
  //  Store locations, set by -G and -O.
  char     *gkpName = NULL;
  char     *ovsName = NULL;

  //  Input and output clear range files, set by -Ci and -Co.
  char     *finClrName = NULL;
  char     *outClrName = NULL;

  double    errorRate       = 0.06;
  //uint32    minAlignLength  = 40;
  uint32    minReadLength   = 64;

  //  Inclusive range of read IDs to process; idMax is clamped to the store size later.
  uint32    idMin = 1;
  uint32    idMax = UINT32_MAX;

  //  outputName is a scratch buffer reused for each '<outputPrefix>.<suffix>' file name.
  char     *outputPrefix = NULL;
  char      outputName[FILENAME_MAX];

  FILE     *staFile      = NULL;
  FILE     *reportFile   = NULL;
  FILE     *subreadFile  = NULL;

  //  NOTE(review): doSubreadLogging is not referenced again in the visible lines; only
  //  doSubreadLoggingVerbose is passed on to the detection and trimming calls.
  bool      doSubreadLogging        = true;
  bool      doSubreadLoggingVerbose = false;

  //  Statistics on the trimming - the second set are from the old logging, and don't really apply anymore.
  //  Each trimStat is updated below with '+= <count>' and reported through its nReads and
  //  nBases fields.

  trimStat  readsIn;                  //  Read is eligible for trimming
  trimStat  deletedIn;                //  Read was deleted already
  trimStat  noTrimIn;                 //  Read not requesting trimming

  trimStat  noOverlaps;               //  no overlaps in store
  trimStat  noCoverage;               //  no coverage after adjusting for trimming done

  trimStat  readsProcChimera;         //  Read was processed for chimera signal
  trimStat  readsProcSpur;            //  Read was processed for spur signal
  trimStat  readsProcSubRead;         //  Read was processed for subread signal

  //  NOTE(review): no #endif is visible for this '#if 0' or for the nested one below, yet
  //  counters declared inside them (readsNoChange, basesBadSpur5, readsTrimmed5,
  //  deletedOut, ...) are used later in main.  Confirm where the disabled regions really end.
#if 0
  trimStat  badSpur5;
  trimStat  badSpur3;
  trimStat  badChimera;
  trimStat  badSubread;

  trimStat  readsNoChange;

  trimStat  readsBadSpur5,   basesBadSpur5;
  trimStat  readsBadSpur3,   basesBadSpur3;
  trimStat  readsBadChimera, basesBadChimera;
  trimStat  readsBadSubread, basesBadSubread;

  trimStat  readsTrimmed5;
  trimStat  readsTrimmed3;

#if 0
  trimStat  fullCoverage;             //  fully covered by overlaps
  trimStat  noSignalNoGap;            //  no signal, no gaps
  trimStat  noSignalButGap;           //  no signal, with gaps

  trimStat  bothFixed;                //  both chimera and spur signal trimmed
  trimStat  chimeraFixed;             //  only chimera signal trimmed
  trimStat  spurFixed;                //  only spur signal trimmed

  trimStat  bothDeletedSmall;         //  deleted because of both chimera and spur signals
  trimStat  chimeraDeletedSmall;      //  deleted because of chimera signal
  trimStat  spurDeletedSmall;         //  deleted because of spur signal

  trimStat  spurDetectedNormal;       //  normal spur detected
  trimStat  spurDetectedLinker;       //  linker spur detected

  trimStat  chimeraDetectedInnie;     //  innie-pair chimera detected
  trimStat  chimeraDetectedOverhang;  //  overhanging chimera detected
  trimStat  chimeraDetectedGap;       //  gap chimera detected
  trimStat  chimeraDetectedLinker;    //  linker chimera detected

  trimStat  deletedOut;               //  Read was deleted by trimming

  argc = AS_configure(argc, argv);

  //  Command line parsing.  Every option that takes a value reads it with argv[++arg].
  //  NOTE(review): no check that a value actually follows the option is visible, and the
  //  'arg++' / 'err++' statements are not present in this listing.
  int arg=1;
  int err=0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-G") == 0) {
      gkpName = argv[++arg];

    } else if (strcmp(argv[arg], "-O") == 0) {
      ovsName = argv[++arg];

    } else if (strcmp(argv[arg], "-o") == 0) {
      outputPrefix = argv[++arg];

    } else if (strcmp(argv[arg], "-t") == 0) {
      //  Presumably fills idMin and idMax from a 'bgn-end' string, as the usage text
      //  describes -t; the helper itself is not visible here.
      AS_UTL_decodeRange(argv[++arg], idMin, idMax);

    } else if (strcmp(argv[arg], "-Ci") == 0) {
      finClrName = argv[++arg];
    } else if (strcmp(argv[arg], "-Co") == 0) {
      outClrName = argv[++arg];

    } else if (strcmp(argv[arg], "-e") == 0) {
      errorRate = atof(argv[++arg]);

    //} else if (strcmp(argv[arg], "-l") == 0) {
    //  minAlignLength = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-minlength") == 0) {
      minReadLength = atoi(argv[++arg]);

    } else {
      fprintf(stderr, "%s: unknown option '%s'\n", argv[0], argv[arg]);

  //  NOTE(review): the statement guarded by this test is not visible; presumably it flags
  //  an error, since the same condition is reported in the usage block below.
  if (errorRate < 0.0)

  //  The read store, overlap store and output prefix are required; print usage otherwise.
  if ((gkpName == 0L) || (ovsName == 0L) || (outputPrefix == NULL) || (err)) {
    fprintf(stderr, "usage: %s -G gkpStore -O ovlStore -Ci input.clearFile -Co output.clearFile -o outputPrefix]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "  -G gkpStore    path to read store\n");
    fprintf(stderr, "  -O ovlStore    path to overlap store\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -o name        output prefix, for logging\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -t bgn-end     limit processing to only reads from bgn to end (inclusive)\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -Ci clearFile  path to input clear ranges (NOT SUPPORTED)\n");
    fprintf(stderr, "  -Co clearFile  path to ouput clear ranges\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -e erate       ignore overlaps with more than 'erate' percent error\n");
    //fprintf(stderr, "  -l length      ignore overlaps shorter than 'l' aligned bases (NOT SUPPORTED)\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -minlength l   reads trimmed below this many bases are deleted\n");
    fprintf(stderr, "\n");

    if (errorRate < 0.0)
      fprintf(stderr, "ERROR: Error rate (-e) value %f too small; must be 'fraction error' and above 0.0\n", errorRate);


  //  Open the stores and the clear range files.  finClrName and outClrName are passed
  //  through unchanged, so they are NULL when -Ci / -Co were not given.
  gkStore         *gkp = gkStore::gkStore_open(gkpName);
  ovStore         *ovs = new ovStore(ovsName, gkp);

  clearRangeFile  *finClr = new clearRangeFile(finClrName, gkp);
  clearRangeFile  *outClr = new clearRangeFile(outClrName, gkp);

  if (outClr)
    //  If the outClr file exists, those clear ranges are loaded.  We need to reset them
    //  back to 'untrimmed' for now.

  if (finClr && outClr)
    //  A finClr file was supplied, so use those as the clear ranges.

  //  Open the two log files.  Failure is detected by clearing errno before fopen() and
  //  testing it afterwards, not by testing the returned FILE pointer.
  //  NOTE(review): sprintf is unbounded; outputName holds FILENAME_MAX bytes.
  sprintf(outputName, "%s.log",         outputPrefix);
  errno = 0;
  reportFile  = fopen(outputName, "w");
  if (errno)
    fprintf(stderr, "Failed to open '%s' for writing: %s\n", outputName, strerror(errno)), exit(1);

  sprintf(outputName, "%s.subread.log", outputPrefix);
  errno = 0;
  subreadFile = fopen(outputName, "w");
  if (errno)
    fprintf(stderr, "Failed to open '%s' for writing: %s\n", outputName, strerror(errno)), exit(1);

  //  Overlap buffer, reused for every read and passed to readOverlaps() together with
  //  its current length and capacity.
  uint32      ovlLen = 0;
  uint32      ovlMax = 64 * 1024;
  ovOverlap  *ovl    = ovOverlap::allocateOverlaps(gkp, ovlMax);

  memset(ovl, 0, sizeof(ovOverlap) * ovlMax);

  //  A single workUnit is reused for every read; it is reset with clear() in the loop.
  workUnit *w = new workUnit;

  //  Clamp the requested ID range to [1, number of reads in the store].
  if (idMin < 1)
    idMin = 1;
  if (idMax > gkp->gkStore_getNumReads())
    idMax = gkp->gkStore_getNumReads();

  //  NOTE(review): the argument list of this fprintf is not visible in this listing.
  fprintf(stderr, "Processing from ID "F_U32" to "F_U32" out of "F_U32" reads, using errorRate = %.2f\n",

  //  Main loop, one iteration per read ID.
  //  NOTE(review): the early-exit tests below have no visible 'continue' or closing brace;
  //  presumably each one skips the rest of the iteration.
  for (uint32 id=idMin; id<=idMax; id++) {
    gkRead     *read = gkp->gkStore_getRead(id);
    gkLibrary  *libr = gkp->gkStore_getLibrary(read->gkRead_libraryID());

    if (finClr->isDeleted(id)) {
      //  Read already trashed.
      deletedIn += read->gkRead_sequenceLength();

    if ((libr->gkLibrary_removeSpurReads()     == false) &&
        (libr->gkLibrary_removeChimericReads() == false) &&
        (libr->gkLibrary_checkForSubReads()    == false)) {
      //  Nothing to do.
      noTrimIn += read->gkRead_sequenceLength();

    readsIn += read->gkRead_sequenceLength();

    uint32   nLoaded = ovs->readOverlaps(id, ovl, ovlLen, ovlMax);

    //fprintf(stderr, "read %7u with %7u overlaps\r", id, nLoaded);

    if (nLoaded == 0) {
      //  No overlaps, nothing to check!
      noOverlaps += read->gkRead_sequenceLength();

    //  Reset the work unit to this read's input clear range, then load the overlaps
    //  into it, passing errorRate as the filter threshold.
    w->clear(id, finClr->bgn(id), finClr->end(id));
    w->addAndFilterOverlaps(gkp, finClr, errorRate, ovl, ovlLen);

    if (w->adjLen == 0) {
      //  All overlaps trimmed out!
      noCoverage += read->gkRead_sequenceLength();

    //  Find bad regions.

    //if (libr->gkLibrary_markBad() == true)
    //  //  From an external file, a list of known bad regions.  If no overlaps span
    //  //  the region with sufficient coverage, mark the region as bad.  This was
    //  //  motivated by the old 454 linker detection.
    //  markBad(gkp, w, subreadFile, doSubreadLoggingVerbose);

    //if (libr->gkLibrary_removeSpurReads() == true) {
    //  readsProcSpur += read->gkRead_sequenceLength();
    //  detectSpur(gkp, w, subreadFile, doSubreadLoggingVerbose);
    //  Get stats on spur region detected - save the length of each region to the trimStats object.

    //if (libr->gkLibrary_removeChimericReads() == true) {
    //  readsProcChimera += read->gkRead_sequenceLength();
    //  detectChimer(gkp, w, subreadFile, doSubreadLoggingVerbose);
    //  Get stats on chimera region detected - save the length of each region to the trimStats object.

    if (libr->gkLibrary_checkForSubReads() == true) {
      readsProcSubRead += read->gkRead_sequenceLength();
      detectSubReads(gkp, w, subreadFile, doSubreadLoggingVerbose);

    //  Get stats on the bad regions found.  This kind of duplicates code in trimBadInterval(), but
    //  I don't want to pass all the stats objects into there.

    if (w->blist.size() == 0) {
      readsNoChange += read->gkRead_sequenceLength();

    else {
      //  Per-read signal counts, one per bad-region type.
      //  NOTE(review): the b* locals are initialized but never updated in the visible
      //  lines; region sizes are added to the basesBad* trimStats instead.
      uint32  nSpur5   = 0, bSpur5   = 0;
      uint32  nSpur3   = 0, bSpur3   = 0;
      uint32  nChimera = 0, bChimera = 0;
      uint32  nSubread = 0, bSubread = 0;

      for (uint32 bb=0; bb<w->blist.size(); bb++) {
        //  NOTE(review): no 'break' is visible between the cases; as written each case
        //  would fall through into the next.  Confirm against the full source.
        switch (w->blist[bb].type) {
          case badType_5spur:
            nSpur5        += 1;
            basesBadSpur5 += w->blist[bb].end - w->blist[bb].bgn;
          case badType_3spur:
            nSpur3        += 1;
            basesBadSpur3 += w->blist[bb].end - w->blist[bb].bgn;
          case badType_chimera:
            nChimera        += 1;
            basesBadChimera += w->blist[bb].end - w->blist[bb].bgn;
          case badType_subread:
            nSubread        += 1;
            basesBadSubread += w->blist[bb].end - w->blist[bb].bgn;

      //  A read is recorded in readsBad* only when it has at least one signal of that
      //  type; the value added is the number of signals found on the read.
      if (nSpur5   > 0)   readsBadSpur5   += nSpur5;
      if (nSpur3   > 0)   readsBadSpur3   += nSpur3;
      if (nChimera > 0)   readsBadChimera += nChimera;
      if (nSubread > 0)   readsBadSubread += nSubread;

    //  Find solution.  This coalesces the list (in 'w') of all the bad regions found, picks out the
    //  largest good region, generates a log of the bad regions that support this decision, and sets
    //  the trim points.

    trimBadInterval(gkp, w, minReadLength, subreadFile, doSubreadLoggingVerbose);

    //  Log the solution.

    AS_UTL_safeWrite(reportFile, w->logMsg, "logMsg", sizeof(char), strlen(w->logMsg));

    //  Save the solution....

    outClr->setbgn(w->id) = w->clrBgn;
    outClr->setend(w->id) = w->clrEnd;

    //  And maybe delete the read.

    if (w->isOK == false) {
      deletedOut += read->gkRead_sequenceLength();


    //  Update stats on what was trimmed.  The asserts say the clear range didn't expand, and the if
    //  tests if the clear range changed.

    assert(w->clrBgn >= w->iniBgn);
    assert(w->iniEnd >= w->clrEnd);

    if (w->clrBgn > w->iniBgn)
      readsTrimmed5 += w->clrBgn - w->iniBgn;

    if (w->iniEnd > w->clrEnd)
      readsTrimmed3 += w->iniEnd - w->clrEnd;

  //  Release the per-run working storage.
  //  NOTE(review): no release of gkp or ovs is visible in this listing.
  delete [] ovl;

  delete    w;


  delete    finClr;
  delete    outClr;

  //  Close log files

  if (reportFile)

  if (subreadFile)

  //  Write the summary

  if (outputPrefix) {
    sprintf(outputName, "%s.stats", outputPrefix);

    //  Unlike the log files above, a failure here is only reported; there is no exit().
    errno = 0;
    staFile = fopen(outputName, "w");
    if (errno)
      fprintf(stderr, "Failed to open '%s' for writing: %s\n", outputName, strerror(errno));

  //  Fall back to stdout so the summary is still produced.
  if (staFile == NULL)
    staFile = stdout;

  //  Would like to know number of subreads per read

  fprintf(staFile, "PARAMETERS:\n");
  fprintf(staFile, "----------\n");
  fprintf(staFile, "%7u    (reads trimmed below this many bases are deleted)\n", minReadLength);
  fprintf(staFile, "%7.4f    (use overlaps at or below this fraction error)\n", errorRate);
  //fprintf(staFile, "%7u    (use only overlaps longer than this)\n", minAlignLength);  //  NOT SUPPORTED!
  fprintf(staFile, "INPUT READS:\n");
  fprintf(staFile, "-----------\n");
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (reads processed)\n", readsIn.nReads, readsIn.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (reads not processed, previously deleted)\n", deletedIn.nReads, deletedIn.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (reads not processed, in a library where trimming isn't allowed)\n", noTrimIn.nReads, noTrimIn.nBases);
  fprintf(staFile, "\n");
  fprintf(staFile, "PROCESSED:\n");
  fprintf(staFile, "--------\n");
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (no overlaps)\n", noOverlaps.nReads, noOverlaps.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (no coverage after adjusting for trimming done already)\n", noCoverage.nReads, noCoverage.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (processed for chimera)\n",  readsProcChimera.nReads, readsProcChimera.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (processed for spur)\n",     readsProcSpur.nReads,    readsProcSpur.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (processed for subreads)\n", readsProcSubRead.nReads, readsProcSubRead.nBases);
  fprintf(staFile, "\n");
  fprintf(staFile, "READS WITH SIGNALS:\n");
  fprintf(staFile, "------------------\n");
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" signals (number of 5' spur signal)\n", readsBadSpur5.nReads,   readsBadSpur5.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" signals (number of 3' spur signal)\n", readsBadSpur3.nReads,   readsBadSpur3.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" signals (number of chimera signal)\n", readsBadChimera.nReads, readsBadChimera.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" signals (number of subread signal)\n", readsBadSubread.nReads, readsBadSubread.nBases);
  fprintf(staFile, "\n");
  fprintf(staFile, "SIGNALS:\n");
  fprintf(staFile, "-------\n");
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (size of 5' spur signal)\n", basesBadSpur5.nReads,   basesBadSpur5.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (size of 3' spur signal)\n", basesBadSpur3.nReads,   basesBadSpur3.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (size of chimera signal)\n", basesBadChimera.nReads, basesBadChimera.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (size of subread signal)\n", basesBadSubread.nReads, basesBadSubread.nBases);
  fprintf(staFile, "\n");
  fprintf(staFile, "TRIMMING:\n");
  fprintf(staFile, "--------\n");
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (trimmed from the 5' end of the read)\n", readsTrimmed5.nReads, readsTrimmed5.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (trimmed from the 3' end of the read)\n", readsTrimmed3.nReads, readsTrimmed3.nBases);

  //  Disabled report sections; they print the counters declared in the nested '#if 0'
  //  region near the top of main.  No matching #endif is visible in this listing.
#if 0
  fprintf(staFile, "DELETED:\n");
  fprintf(staFile, "-------\n");
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (deleted because of both cimera and spur signals)\n", bothDeletedSmall.nReads, bothDeletedSmall.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (deleted because of chimera signal)\n", chimeraDeletedSmall.nReads, chimeraDeletedSmall.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (deleted because of spur signal)\n", spurDeletedSmall.nReads, spurDeletedSmall.nBases);
  fprintf(staFile, "\n");
  fprintf(staFile, "SPUR TYPES:\n");
  fprintf(staFile, "----------\n");
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (normal spur detected)\n", spurDetectedNormal.nReads, spurDetectedNormal.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (linker spur detected)\n", spurDetectedLinker.nReads, spurDetectedLinker.nBases);
  fprintf(staFile, "\n");
  fprintf(staFile, "CHIMERA TYPES:\n");
  fprintf(staFile, "-------------\n");
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (innie-pair chimera detected)\n", chimeraDetectedInnie.nReads, chimeraDetectedInnie.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (overhanging chimera detected)\n", chimeraDetectedOverhang.nReads, chimeraDetectedOverhang.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (gap chimera detected)\n", chimeraDetectedGap.nReads, chimeraDetectedGap.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (linker chimera detected)\n", chimeraDetectedLinker.nReads, chimeraDetectedLinker.nBases);

  //  SPUR TYPE    = TRIMMED and DELETED spur and both categories
  //  CHIMERA TYPE = TRIMMED and DELETED chimera and both categories

  //  NOTE(review): the guarded statement is not visible; presumably it closes staFile
  //  when the summary went to a real file rather than stdout.
  if (staFile != stdout)

Пример #16
//  Dumps fragments from a gatekeeper store as FASTQ, one set of output files per library.
//
//  Visible flow:
//    1. Parse options: -g gkpStore, -o output prefix, -l library, -b / -e fragment IID
//       range, -c clear range label.
//    2. Open the store and create one libInfo per library, with slot 0 named "legacy".
//    3. Stream fragments bgnIID..endIID.  An unmated fragment is written to its library's
//       'u' output.  For a mated pair the first fragment goes to 'a' and 'p', then the mate
//       is fetched and written to 'b' and 'p'.
//
//  NOTE(review): this listing looks abridged -- closing braces, the bodies of several 'if'
//  statements and some call arguments are not visible.  The comments added here describe
//  only what the visible lines establish.
main(int argc, char **argv) {
  char            *gkpStoreName      = NULL;
  char            *outPrefix         = NULL;

  //  libToDump == 0 is treated below as "no library filter".
  AS_IID           libToDump         = 0;
  uint32           clrToDump         = AS_READ_CLEAR_LATEST;

  AS_IID           bgnIID            = 1;
  AS_IID           endIID            = AS_IID_MAX;

  //  Neither flag is changed by any visible option, so with these defaults the whole
  //  sequence is dumped and deleted fragments are skipped.
  bool             dumpAllBases      = true;
  bool             dumpAllReads      = false;

  argc = AS_configure(argc, argv);

  //  Command line parsing.  Every option reads its value with argv[++arg].
  //  NOTE(review): no check that a value follows the option is visible, and the 'arg++' /
  //  'err++' statements are not present in this listing.
  int arg = 1;
  int err = 0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-l") == 0) {
      libToDump = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-b") == 0) {
      bgnIID = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-e") == 0) {
      endIID  = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-c") == 0) {
      clrToDump = gkStore_decodeClearRegionLabel(argv[++arg]);

    } else if (strcmp(argv[arg], "-g") == 0) {
      gkpStoreName = argv[++arg];

    } else if (strcmp(argv[arg], "-o") == 0) {
      outPrefix = argv[++arg];

    } else {
      fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]);

  //  NOTE(review): the statements guarded by the next three tests are not visible;
  //  presumably each one flags an error, since the same conditions are reported in the
  //  usage block below.
  if (gkpStoreName == NULL)
  if (outPrefix == NULL)
  if (clrToDump == AS_READ_CLEAR_ERROR)
  if (err) {
    fprintf(stderr, "usage: %s [...] -o fastq-prefix -g gkpStore\n", argv[0]);
    fprintf(stderr, "  -g gkpStore\n");
    fprintf(stderr, "  -o fastq-prefix     write files fastq-prefix.1.fastq, fastq-prefix.2.fastq, fastq-prefix.paired.fastq, fastq-prefix.unmated.fastq\n");
    fprintf(stderr, "  \n");
    fprintf(stderr, "  -l libToDump        output only fragments in library number libToDump (NOT IMPLEMENTED)\n");
    fprintf(stderr, "  -b iid              output starting at fragment iid\n");
    fprintf(stderr, "  -e iid              output stopping after fragment iid\n");
    fprintf(stderr, "  -c clrName          output clear range 'clrName'\n");
    fprintf(stderr, "  \n");

    if (gkpStoreName == NULL)
      fprintf(stderr, "ERROR: no gkpStore (-g) supplied.\n");
    if (outPrefix == NULL)
      fprintf(stderr, "ERROR: no output prefix (-o) supplied.\n");
    if (clrToDump == AS_READ_CLEAR_ERROR)
      fprintf(stderr, "ERROR: clear range (-c) is not a valid clear range.\n");

  gkStore    *gkp       = new gkStore(gkpStoreName, FALSE, FALSE);

  AS_IID    numFrags    = gkp->gkStore_getNumFragments();
  AS_IID    numLibs     = gkp->gkStore_getNumLibraries();

  //  One libInfo per library; it is indexed below by the fragment's library IID.
  //  NOTE(review): the array has numLibs slots and slot 0 is taken by "legacy".  If library
  //  IIDs run 1..numLibs inclusive, lib[numLibs] would be out of range -- confirm against
  //  gkStore.  No delete of lib or its entries is visible in this listing.
  libInfo **lib         = new libInfo * [numLibs];

  lib[0] = new libInfo(outPrefix, "legacy");

  for (uint32 i=1; i<numLibs; i++)
    lib[i] = new libInfo(outPrefix, gkp->gkStore_getLibrary(i)->libraryName);

  //  Clamp the requested range to [1, number of fragments in the store].
  if (bgnIID < 1)
    bgnIID = 1;
  if (numFrags < endIID)
    endIID = numFrags;

  //AS_IID    streamBgn = AS_IID_MIN;
  //AS_IID    streamEnd = AS_IID_MAX;

  gkStream   *fs        = new gkStream(gkp, bgnIID, endIID, GKFRAGMENT_QLT);
  gkFragment  fr;

  //  Main loop, one iteration per fragment in the stream.
  //  NOTE(review): the four skip tests below have no visible body; presumably each one is
  //  followed by 'continue'.
  while (fs->next(&fr)) {
    int32   lclr   = fr.gkFragment_getClearRegionBegin(clrToDump);
    int32   rclr   = fr.gkFragment_getClearRegionEnd  (clrToDump);

    AS_IID  id1    = fr.gkFragment_getReadIID();
    AS_IID  id2    = fr.gkFragment_getMateIID();

    AS_IID  libIID = fr.gkFragment_getLibraryIID();

    if ((dumpAllReads == false) && (fr.gkFragment_getIsDeleted() == true))
      //  Fragment is deleted, don't dump.

    //  NOTE(review): this tests '== libToDump', which matches fragments that ARE in the
    //  requested library, while the comment says the fragment is not marked for dumping.
    //  The usage text labels -l as NOT IMPLEMENTED -- confirm the intended comparison.
    if ((libToDump != 0) && (fr.gkFragment_getLibraryIID() == libToDump))
      //  Fragment isn't marked for dumping, don't dump.

    if ((dumpAllBases == false) && (lclr >= rclr))
      //  Fragment has null or invalid clear range, don't dump.

    if ((id2 != 0) && (id2 < id1))
      //  Mated, and the mate is the first frag.  We've already reported this one.

    //  Point at the bases to emit: the whole read when dumpAllBases is set, otherwise the
    //  clear region only.  The buffers returned by the fragment are then terminated in
    //  place at 'len'.
    char *seq = fr.gkFragment_getSequence() + ((dumpAllBases == false) ? fr.gkFragment_getClearRegionBegin(clrToDump) : 0);
    char *qlt = fr.gkFragment_getQuality()  + ((dumpAllBases == false) ? fr.gkFragment_getClearRegionBegin(clrToDump) : 0);

    int32 len = (dumpAllBases == false) ? fr.gkFragment_getClearRegionLength(clrToDump) : fr.gkFragment_getSequenceLength();

    seq[len] = 0;
    qlt[len] = 0;

    //  Case-adjust the sequence around the clear range.  The first and third loops add
    //  ('a' - 'A') to every character >= 'A' outside [lclr, rclr).
    //  NOTE(review): presumably this lowercases the trimmed bases and assumes uppercase
    //  input; an already-lowercase letter would also be shifted.  The middle loop adds 0 to
    //  characters >= 'A' and ('A' - 'a') to characters below 'A', so letters inside the
    //  clear range are left unchanged -- confirm that this is intended.
    if (dumpAllBases == true) {
      for (uint32 i=0; i<lclr; i++)
        seq[i] += (seq[i] >= 'A') ? 'a' - 'A' : 0;

      for (uint32 i=lclr; i<rclr; i++)
        seq[i] += (seq[i] >= 'A') ? 0 : 'A' - 'a';

      for (uint32 i=rclr; seq[i]; i++)
        seq[i] += (seq[i] >= 'A') ? 'a' - 'A' : 0;

    //  NOTE(review): in every AS_UTL_writeFastQ call below the format string names ten
    //  conversions (%s, eight F_U32, %c) but only five arguments are visible; the read name
    //  and the clear-region-begin arguments are presumably missing from this listing.
    if (id2 == 0) {
      //  Unmated read, dump to the unmated reads file.
      AS_UTL_writeFastQ(lib[libIID]->u, seq, len, qlt, len,
                        "@%s clr="F_U32","F_U32" clv="F_U32","F_U32" max="F_U32","F_U32" tnt="F_U32","F_U32" rnd=%c\n",
                        fr.gkFragment_getClearRegionEnd  (clrToDump),
                        fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_VEC),
                        fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_MAX),
                        fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_TNT),
                        fr.gkFragment_getIsNonRandom() ? 'f' : 't');

    //  Write the first fragment (twice).
    AS_UTL_writeFastQ(lib[libIID]->a, seq, len, qlt, len,
                      "@%s clr="F_U32","F_U32" clv="F_U32","F_U32" max="F_U32","F_U32" tnt="F_U32","F_U32" rnd=%c\n",
                      fr.gkFragment_getClearRegionEnd  (clrToDump),
                      fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_VEC),
                      fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_MAX),
                      fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_TNT),
                      fr.gkFragment_getIsNonRandom() ? 'f' : 't');

    AS_UTL_writeFastQ(lib[libIID]->p, seq, len, qlt, len,
                      "@%s clr="F_U32","F_U32" clv="F_U32","F_U32" max="F_U32","F_U32" tnt="F_U32","F_U32" rnd=%c\n",
                      fr.gkFragment_getClearRegionEnd  (clrToDump),
                      fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_VEC),
                      fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_MAX),
                      fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_TNT),
                      fr.gkFragment_getIsNonRandom() ? 'f' : 't');

    //  Grab the second fragment.
    //  'fr' is overwritten with the mate, so seq, qlt and len are recomputed from it below.

    gkp->gkStore_getFragment(id2, &fr, GKFRAGMENT_QLT);

    //  NOTE(review): lclr gets '+ 1' here but not for the first fragment, and neither lclr
    //  nor rclr is read again in the visible lines -- the case-adjust step is not repeated
    //  for the mate.  Confirm against the full source.
    lclr = fr.gkFragment_getClearRegionBegin(clrToDump) + 1;
    rclr = fr.gkFragment_getClearRegionEnd  (clrToDump);

    seq = fr.gkFragment_getSequence() + ((dumpAllBases == false) ? fr.gkFragment_getClearRegionBegin(clrToDump) : 0);
    qlt = fr.gkFragment_getQuality()  + ((dumpAllBases == false) ? fr.gkFragment_getClearRegionBegin(clrToDump) : 0);
    len = (dumpAllBases == false) ? fr.gkFragment_getClearRegionLength(clrToDump) : fr.gkFragment_getSequenceLength();

    seq[len] = 0;
    qlt[len] = 0;

    //  Write the second fragment (twice).
    //  libIID still holds the first fragment's library IID.
    AS_UTL_writeFastQ(lib[libIID]->b, seq, len, qlt, len,
                      "@%s clr="F_U32","F_U32" clv="F_U32","F_U32" max="F_U32","F_U32" tnt="F_U32","F_U32" rnd=%c\n",
                      fr.gkFragment_getClearRegionEnd  (clrToDump),
                      fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_VEC),
                      fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_MAX),
                      fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_TNT),
                      fr.gkFragment_getIsNonRandom() ? 'f' : 't');

    AS_UTL_writeFastQ(lib[libIID]->p, seq, len, qlt, len,
                      "@%s clr="F_U32","F_U32" clv="F_U32","F_U32" max="F_U32","F_U32" tnt="F_U32","F_U32" rnd=%c\n",
                      fr.gkFragment_getClearRegionEnd  (clrToDump),
                      fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_VEC),
                      fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_MAX),
                      fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_TNT),
                      fr.gkFragment_getIsNonRandom() ? 'f' : 't');

  //  Release the stream and the store.
  delete fs;
  delete gkp;

Пример #17
main(int argc, char **argv) {

  //  Options controlling main

  int    generateOutput = 1;
  int    preserveConsensus = 0;

  int    preMergeRezLevel = -1;
  int    repeatRezLevel   = 0;

  int    restartFromCheckpoint = -1;
  char  *restartFromLogical    = "ckp00-NUL";

  bool   recomputeLeastSquaresOnLoad = false;
  bool   reloadMates                 = false;

  int    doResolveSurrogates               = 1;      //  resolveSurrogates
  int    placeAllFragsInSinglePlacedSurros = 0;      //  resolveSurrogates
  double cutoffToInferSingleCopyStatus     = 0.666;  //  resolveSurrogates

  int    firstFileArg = 0;

  int32  outputFragsPerPartition = 0;

  ContigOrientChecker * coc;
  coc = CreateContigOrientChecker();
  assert(coc != NULL);

  //  temporary!
  fprintf(stderr, "Using up to %d OpenMP threads.\n", omp_get_max_threads());

  GlobalData = new Globals_CGW();

  argc = AS_configure(argc, argv);

  int arg     = 1;
  int err     = 0;
  int unk[64] = {0};
  int unl     = 0;

  while (arg < argc) {
    if        (strcmp(argv[arg], "-C") == 0) {
      GlobalData->performCleanupScaffolds = 0;

    } else if (strcmp(argv[arg], "-D") == 0) {
      GlobalData->debugLevel = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-E") == 0) {
      GlobalData->outputOverlapOnlyContigEdges = 1;

    } else if (strcmp(argv[arg], "-F") == 0) {
      GlobalData->allowDemoteMarkedUnitigs = FALSE;

    } else if (strcmp(argv[arg], "-G") == 0) {
      generateOutput = 0;

    } else if (strcmp(argv[arg], "-GG") == 0) {
      preserveConsensus = 1;

    } else if (strcmp(argv[arg], "-g") == 0) {
      strcpy(GlobalData->gkpStoreName, argv[++arg]);

    } else if (strcmp(argv[arg], "-t") == 0) {
      strcpy(GlobalData->tigStoreName, argv[++arg]);

    } else if (strcmp(argv[arg], "-I") == 0) {
      GlobalData->ignoreChaffUnitigs = 1;

    } else if (strcmp(argv[arg], "-j") == 0) {
      GlobalData->cgbUniqueCutoff = atof(argv[++arg]);

    } else if (strcmp(argv[arg], "-K") == 0) {
      GlobalData->removeNonOverlapingContigsFromScaffold = 1;

    } else if (strcmp(argv[arg], "-k") == 0) {
      GlobalData->cgbDefinitelyUniqueCutoff = atof(argv[++arg]);

    } else if (strcmp(argv[arg], "-m") == 0) {
      GlobalData->minSamplesForOverride = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-N") == 0) {
      restartFromLogical = argv[++arg];

    } else if (strcmp(argv[arg], "-o") == 0) {
      strcpy(GlobalData->outputPrefix, argv[++arg]);

    } else if (strcmp(argv[arg], "-B") == 0) {
      outputFragsPerPartition = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-P") == 0) {
      GlobalData->closurePlacement = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-p") == 0) {
      preMergeRezLevel = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-R") == 0) {
      restartFromCheckpoint = atoi(argv[++arg]);
    } else if (strcmp(argv[arg], "-r") == 0) {
      repeatRezLevel = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-S") == 0) {
      doResolveSurrogates               = 1;
      cutoffToInferSingleCopyStatus     = atof(argv[++arg]);
      placeAllFragsInSinglePlacedSurros = 0;

      if (cutoffToInferSingleCopyStatus == 0.0)
        doResolveSurrogates               = 0;

      if (cutoffToInferSingleCopyStatus < 0) {
        cutoffToInferSingleCopyStatus     = 0.0;
        placeAllFragsInSinglePlacedSurros = 1;

    } else if (strcmp(argv[arg], "-s") == 0) {
      GlobalData->stoneLevel = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-filter") == 0) {
      GlobalData->mergeFilterLevel = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-shatter") == 0) {
      GlobalData->shatterLevel = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-missingMate") == 0) {
      GlobalData->mergeScaffoldMissingMates = atof(argv[++arg]);

      // the value is a percentage between 0 and 1 so make sure it never goes out of those bounds
      if (GlobalData->mergeScaffoldMissingMates < 0) {
    	  GlobalData->mergeScaffoldMissingMates = -1;
      } else if (GlobalData->mergeScaffoldMissingMates > 1) {
    	  GlobalData->mergeScaffoldMissingMates = 1;

    } else if (strcmp(argv[arg], "-U") == 0) {
      GlobalData->doUnjiggleWhenMerging = 1;

    } else if (strcmp(argv[arg], "-u") == 0) {
      fprintf(stderr, "Option -u is broken.\n");
      strcpy(GlobalData->unitigOverlaps, argv[++arg]);

    } else if (strcmp(argv[arg], "-Z") == 0) {
      GlobalData->demoteSingletonScaffolds = FALSE;

    } else if (strcmp(argv[arg], "-z") == 0) {
      GlobalData->checkRepeatBranchPattern = TRUE;

    } else if (strcmp(argv[arg], "-minmergeweight") == 0) {
      GlobalData->minWeightToMerge = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-recomputegaps") == 0) {
      recomputeLeastSquaresOnLoad = true;

    } else if (strcmp(argv[arg], "-reloadmates") == 0) {
      reloadMates = true;

    } else if ((argv[arg][0] != '-') && (firstFileArg == 0)) {
      firstFileArg = arg;
      arg = argc;

    } else {
      unk[unl++] = arg;


  if (GlobalData->gkpStoreName[0] == 0)

  if (GlobalData->outputPrefix[0] == 0)

  if (cutoffToInferSingleCopyStatus > 1.0)

  if (err) {
    fprintf(stderr, "usage: %s [options] -g <GatekeeperStoreName> -o <OutputPath> <unitigs*.cgb>\n", argv[0]);
    fprintf(stderr, "   -C                     Don't cleanup scaffolds\n");    
    fprintf(stderr, "   -D <lvl>               Debug\n");
    fprintf(stderr, "   -E                     output overlap only contig edges\n");
    fprintf(stderr, "   -e <thresh>            Microhet score probability cutoff\n");
    fprintf(stderr, "   -F                     strongly enforce unique/repeat flag set in unitig, default if not set is to still\n");
    fprintf(stderr, "                              allow those marked unique to be demoted due to Repeat Branch Pattern or being\n");
    fprintf(stderr, "                              too small\n");
    fprintf(stderr, "   -g                     gkp Store path (required)\n");
    fprintf(stderr, "   -G                     Don't generate output (cgw or cam)\n");
    fprintf(stderr, "   -GG                    Don't destroy consensus on output (ctgcns will do nothing)\n");
    fprintf(stderr, "   -I                     ignore chaff unitigs\n");
    fprintf(stderr, "   -i <thresh>            Set max coverage stat for microhet determination of non-uniqueness (default -1)\n");
    fprintf(stderr, "   -j <thresh>            Set min coverage stat for definite uniqueness\n");
    fprintf(stderr, "   -K                     Allow kicking out a contig placed in a scaffold by mate pairs that has no overlaps\n");
    fprintf(stderr, "                            to both its left and right neighbor contigs.\n");
    fprintf(stderr, "   -k <thresh>            Set max coverage stat for possible uniqueness\n");
    fprintf(stderr, "   -M                     don't do interleaved scaffold merging\n");
    fprintf(stderr, "   -m <min>               Number of mate samples to recompute an insert size, default is 100\n");
    fprintf(stderr, "   -N <ckp>               restart from checkpoint location 'ckp' (see the timing file)\n");
    fprintf(stderr, "   -o                     Output Name (required)\n");
    fprintf(stderr, "   -P <int>               how to place closure reads.\n");
    fprintf(stderr, "                              0 - place at first location found\n");
    fprintf(stderr, "                              1 - place at best gap\n");
    fprintf(stderr, "                              2 - allow to be placed in multiple gaps\n");
    fprintf(stderr, "   -R <ckp>               restart from checkpoint file number 'ckp'\n");
    fprintf(stderr, "   -r <lvl>               repeat resolution level\n");
    fprintf(stderr, "   -S <t>                 place all frags in singly-placed surrogates if at least fraction <x> can be placed\n");
    fprintf(stderr, "                          two special cases:\n");
    fprintf(stderr, "                              if <t> = -1, place all frags in singly-placed surrogates aggressively\n");
    fprintf(stderr, "                                           (which really mean t = 0.0, but triggers a better algorithm)\n");
    fprintf(stderr, "                              if <t> =  0, do not resolve surrogate fragments\n");
    fprintf(stderr, "   -s <lvl>               stone throwing level\n");
    fprintf(stderr, "   -shatter <thresh>      Set threshold for shattering scaffolds when loading from checkpoint. Any contigs\n");
    fprintf(stderr, "                            connected to a scaffold only by edges with less weight than the threshold will be\n");
    fprintf(stderr, "                            split into a new scaffold (default OFF)\n");
    fprintf(stderr, "   -missingMate <thresh>  Set threshold (0-1) for the percentage of mates (out of total) that are allowed to be\n");
    fprintf(stderr, "                            missing when attempting a scaffold merge (default 0). A value of -1 will ignore all\n");
    fprintf(stderr, "                            missing mates\n");
    fprintf(stderr, "   -minmergeweight <w>    Only use weight w or better edges for merging scaffolds.\n");
    fprintf(stderr, "   -recomputegaps         if loading a checkpoint, recompute gaps, merging contigs and splitting low weight scaffolds.\n");
    fprintf(stderr, "   -reloadmates           If loading a checkpoint, also load any new mates from gkpStore.\n");
    fprintf(stderr, "   -U                     after inserting rocks/stones try shifting contig positions back to their original location\n");
    fprintf(stderr, "                            when computing overlaps to see if they overlap with the rock/stone and allow them to merge\n");
    fprintf(stderr, "                            if they do\n");
    fprintf(stderr, "   -u <file>              load these overlaps (from BOG) into the scaffold graph\n");
    fprintf(stderr, "   -v                     verbose\n");
    fprintf(stderr, "   -Z                     Don't demote singleton scaffolds\n");
    fprintf(stderr, "   -z                     Turn on Check for Repeat Branch Pattern (demotes some unique unitigs to repeat)\n");

    fprintf(stderr, "\n");

    if (GlobalData->gkpStoreName[0] == 0)
      fprintf(stderr, "ERROR:  No gatekeeper (-g) supplied.\n");

    if (GlobalData->outputPrefix[0] == 0)
      fprintf(stderr, "ERROR:  No output prefix (-o) supplied.\n");

    if (cutoffToInferSingleCopyStatus > 1.0)
      fprintf(stderr, "ERROR:  surrogate fraction cutoff (-S) must be between 0.0 and 1.0.\n");

    if (unl) {
      for (arg=0; arg<unl; arg++)
        fprintf(stderr, "ERROR:  Unknown option '%s'\n", argv[unk[arg]]);



  if(GlobalData->cgbDefinitelyUniqueCutoff < GlobalData->cgbUniqueCutoff)
    GlobalData->cgbDefinitelyUniqueCutoff = GlobalData->cgbUniqueCutoff;

  if (preMergeRezLevel >= 0)
    GlobalData->repeatRezLevel = preMergeRezLevel;
    GlobalData->repeatRezLevel = repeatRezLevel;

  if (runThisCheckpoint(restartFromLogical, CHECKPOINT_AFTER_LOADING) == true) {
    int ctme     = time(0);

    //  Create the checkpoint from scratch
    ScaffoldGraph = CreateScaffoldGraph(GlobalData->outputPrefix);

    ProcessInput(firstFileArg, argc, argv);

    //  Insert sizes are set already, but we'll estimate again anyway.
    ComputeMatePairStatisticsRestricted(UNITIG_OPERATIONS, GlobalData->minSamplesForOverride, "unitig_initial");

    if (time(0) - ctme > 60 * 60)
      CheckpointScaffoldGraph(ckpNames[CHECKPOINT_AFTER_LOADING], "after loading");

  } else if (isThisCheckpoint(restartFromLogical, CHECKPOINT_AFTER_LOADING) == true) {
    //  Load the checkpoint if we are exactly after loading, otherwise, fall through to the
    //  real load.
    LoadScaffoldGraphFromCheckpoint(GlobalData->outputPrefix,restartFromCheckpoint, TRUE);

  if (runThisCheckpoint(restartFromLogical, CHECKPOINT_AFTER_EDGE_BUILDING) == true) {
    vector<CDS_CID_t>  rawEdges;

    BuildGraphEdgesDirectly(ScaffoldGraph->CIGraph, rawEdges);

    //  Broken, see comments in ChunkOverlap_CGW.c
    //if (GlobalData->unitigOverlaps[0])
    //  AddUnitigOverlaps(ScaffoldGraph->CIGraph, GlobalData->unitigOverlaps, rawEdges);

    // Compute all overlaps implied by mate links between pairs of unique unitigs
    ComputeOverlaps(ScaffoldGraph->CIGraph, rawEdges);

    MergeAllGraphEdges(ScaffoldGraph->CIGraph, rawEdges, FALSE, FALSE);


    //  Mark some Unitigs/Chunks/CIs as repeats based on overlaps GRANGER 2/2/07
    if (GlobalData->checkRepeatBranchPattern)
      DemoteUnitigsWithRBP(stderr, ScaffoldGraph->CIGraph);

    //  At this Point we've constructed the CIGraph


    if(GlobalData->debugLevel > 0){

    CheckpointScaffoldGraph(ckpNames[CHECKPOINT_AFTER_EDGE_BUILDING], "after building edges");
  } else {
    LoadScaffoldGraphFromCheckpoint(GlobalData->outputPrefix,restartFromCheckpoint, TRUE);

    if (reloadMates)

    //  Dump stats on the loaded checkpoint

    // shatter scaffolds if requested
    if (GlobalData->shatterLevel > 0) {
    	ShatterScaffoldsConnectedByLowWeight(stderr, ScaffoldGraph, GlobalData->shatterLevel, TRUE);

    //  Useful for checking mate happiness on loading.  Currently only checks one scaffold.
    if (0) {
      vector<instrumentLIB>   libs;

      for (int32 i=0; i<GetNumDistTs(ScaffoldGraph->Dists); i++) {
        DistT *dptr = GetDistT(ScaffoldGraph->Dists, i);

        libs.push_back(instrumentLIB(i, dptr->mu, dptr->sigma, true));

      for (int32 sID=287340; sID < GetNumCIScaffoldTs(ScaffoldGraph->CIScaffolds); sID++) {
        CIScaffoldT *scaffold = GetCIScaffoldT(ScaffoldGraph->CIScaffolds, sID);

        fprintf(stderr, "ANALYZING SCAFFOLD %d\n", sID);

        if (scaffold->flags.bits.isDead == true)

        instrumentSCF   A(scaffold);


    if (recomputeLeastSquaresOnLoad) {
      for (int32 sID=0; sID < GetNumCIScaffoldTs(ScaffoldGraph->CIScaffolds); sID++) {
        CIScaffoldT *scaffold = GetCIScaffoldT(ScaffoldGraph->CIScaffolds, sID);

        if (scaffold->flags.bits.isDead == true)

        if (true == LeastSquaresGapEstimates(ScaffoldGraph, GetCIScaffoldT(ScaffoldGraph->CIScaffolds, sID), LeastSquares_Cleanup | LeastSquares_Split))
          ScaffoldSanity(ScaffoldGraph, scaffold);

  //  We DO want to flush unused unitigs/contigs at this point.  They're not in
  //  a scaffold, and possibly will never be used again (except as rocks/stones).

  if ((runThisCheckpoint(restartFromLogical, CHECKPOINT_DURING_INITIAL_SCAFFOLDING) == true) &&
      (GlobalData->repeatRezLevel > 0)) {
    int ctme     = time(0);

    if(GlobalData->debugLevel > 0)
      DumpContigs(stderr,ScaffoldGraph, FALSE);

    // Transitive reduction of ContigGraph followed by construction of SEdges

    //  With markShakyBifurcations enabled.
    BuildUniqueCIScaffolds(ScaffoldGraph, TRUE, FALSE);


    //  Equivalent to TidyUpScaffolds().
    for (int32 sID=0; sID < GetNumCIScaffoldTs(ScaffoldGraph->CIScaffolds); sID++) {
      CIScaffoldT *scaffold = GetCIScaffoldT(ScaffoldGraph->CIScaffolds, sID);

      if (true == LeastSquaresGapEstimates(ScaffoldGraph, scaffold, LeastSquares_Cleanup | LeastSquares_Split))
        ScaffoldSanity(ScaffoldGraph, scaffold);

    if (time(0) - ctme > 60 * 60)
      CheckpointScaffoldGraph(ckpNames[CHECKPOINT_DURING_INITIAL_SCAFFOLDING], "during initial scaffolding");

  if ((runThisCheckpoint(restartFromLogical, CHECKPOINT_AFTER_INITIAL_SCAFFOLDING) == true) &&
      (GlobalData->repeatRezLevel > 0)) {


      vector<CDS_CID_t>  rawEdges;

      BuildSEdges(rawEdges, FALSE);
      MergeAllGraphEdges(ScaffoldGraph->ScaffoldGraph, rawEdges, TRUE, FALSE);


    //  rocks is called inside of here
    //  checkpoints are written inside of here

    int iter     = 0;
    int iterMax  = 10;  //  MAX_OUTPUT_REZ_ITERATIONS
    int ctme     = time(0);
    int changed  = TRUE;

    fprintf(stderr,"** Running Level 1 Repeat Rez **\n");

    while ((changed) && (iter < iterMax)) {

      changed = RepeatRez(GlobalData->repeatRezLevel, GlobalData->outputPrefix);

      if (changed){
        CleanupScaffolds(ScaffoldGraph, FALSE, NULLINDEX, FALSE);

        //  With markShakyBifurcations disabled.
        BuildUniqueCIScaffolds(ScaffoldGraph, FALSE, FALSE);


        for (int32 sID=0; sID < GetNumCIScaffoldTs(ScaffoldGraph->CIScaffolds); sID++) {
          CIScaffoldT *scaffold = GetCIScaffoldT(ScaffoldGraph->CIScaffolds, sID);

          if (true == LeastSquaresGapEstimates(ScaffoldGraph, scaffold, LeastSquares_Cleanup | LeastSquares_Split))
            ScaffoldSanity(ScaffoldGraph, scaffold);


        //  This shouldn't be necessary (RepeatRez() calling TidyUpScaffolds() should be doing it),
        //  but it is infrequent (at most iterMax=10 times).
          vector<CDS_CID_t>  rawEdges;

          BuildSEdges(rawEdges, FALSE);
          MergeAllGraphEdges(ScaffoldGraph->ScaffoldGraph, rawEdges, TRUE, FALSE);

        //  If we've been running for 2 hours, AND we've not just
        //  completed the last iteration, checkpoint.
        if ((time(0) - ctme > 120 * 60) && (changed) && (iter+1 < iterMax)) {
          ctme = time(0);
          CheckpointScaffoldGraph(ckpNames[CHECKPOINT_DURING_INITIAL_SCAFFOLDING], "during initial scaffolding");


    AddAllScaffoldsToContigOrientChecker(ScaffoldGraph, coc);

    if(GlobalData->debugLevel > 0)
      DumpCIScaffolds(stderr,ScaffoldGraph, FALSE);

    CheckpointScaffoldGraph(ckpNames[CHECKPOINT_AFTER_INITIAL_SCAFFOLDING], "after initial scaffolding");
  //  else TidyUpScaffolds (ScaffoldGraph);

  //  We DO want to flush unused unitigs/contigs at this point.  They're not in
  //  a scaffold, and possibly will never be used again (except as rocks/stones).

  if (runThisCheckpoint(restartFromLogical, CHECKPOINT_AFTER_1ST_SCAFF_MERGE) == true) {
    CleanupScaffolds(ScaffoldGraph,FALSE, NULLINDEX, FALSE);


    /* First we try to merge Scaffolds aggressively */
    MergeScaffoldsAggressive(ScaffoldGraph, ckpNames[CHECKPOINT_DURING_1ST_SCAFF_MERGE], FALSE);
    CleanupScaffolds(ScaffoldGraph, FALSE, NULLINDEX, FALSE);

    fprintf(stderr, "---Checking contig orders after MergeScaffoldsAggressive (1)\n\n");
    CheckAllContigOrientationsInAllScaffolds(ScaffoldGraph, coc, POPULATE_COC_HASHTABLE);

    AddAllScaffoldsToContigOrientChecker(ScaffoldGraph, coc);

    CheckpointScaffoldGraph(ckpNames[CHECKPOINT_AFTER_1ST_SCAFF_MERGE], "after 1st scaffold merge");

  //  We DO want to flush unused unitigs/contigs at this point.  They're not in
  //  a scaffold, and possibly will never be used again (except as rocks/stones).

  //  Now that we are done with the initial scaffold merge, we want to use the
  //  standard/default repeatRezLevel.  Up to now, the value of preMergeRezLevel
  //  was in use if set on the command line.
  GlobalData->repeatRezLevel = repeatRezLevel;

  /* Now we throw stones */
  if ((runThisCheckpoint(restartFromLogical, CHECKPOINT_AFTER_STONES) == true) &&
      (GlobalData->stoneLevel > 0)) {

    // Convert single-contig scaffolds that are marginally unique back
    // to unplaced contigs so they might be placed as stones
    //  If we removed any scaffolds, rebuild all the edges.
    if ((GlobalData->demoteSingletonScaffolds == true) &&
        (DemoteSmallSingletonScaffolds() == true)) {
      vector<CDS_CID_t>  rawEdges;

      BuildSEdges(rawEdges, TRUE);
      MergeAllGraphEdges(ScaffoldGraph->ScaffoldGraph, rawEdges, TRUE, TRUE);

    Throw_Stones(GlobalData->outputPrefix, GlobalData->stoneLevel, FALSE);

    //  Cleanup and split scaffolds.  The cleanup shouldn't do anything, but it's cheap.
    CleanupScaffolds(ScaffoldGraph, FALSE, NULLINDEX, FALSE);

    for (int32 sID=0; sID < GetNumCIScaffoldTs(ScaffoldGraph->CIScaffolds); sID++) {
      CIScaffoldT *scaffold = GetCIScaffoldT(ScaffoldGraph->CIScaffolds, sID);

      if (true == LeastSquaresGapEstimates(ScaffoldGraph, scaffold, LeastSquares_Cleanup | LeastSquares_Split))
        ScaffoldSanity(ScaffoldGraph, scaffold);

    vector<CDS_CID_t>  rawEdges;

    BuildSEdges(rawEdges, TRUE);
    MergeAllGraphEdges(ScaffoldGraph->ScaffoldGraph, rawEdges, TRUE, TRUE);


    fprintf(stderr, "---Checking contig orders after Throw_Stones\n\n");
    CheckAllContigOrientationsInAllScaffolds(ScaffoldGraph, coc, POPULATE_COC_HASHTABLE);

    AddAllScaffoldsToContigOrientChecker(ScaffoldGraph, coc);

    CheckpointScaffoldGraph(ckpNames[CHECKPOINT_AFTER_STONES], "after stone throwing");

    //GenerateLinkStats(ScaffoldGraph->CIGraph, "Stones", 0);
    //GeneratePlacedContigGraphStats("Stones", 0);
    //GenerateLinkStats(ScaffoldGraph->ContigGraph, "Stones", 0);
    //GenerateScaffoldGraphStats("Stones", 0);

  if ((runThisCheckpoint(restartFromLogical, CHECKPOINT_AFTER_2ND_SCAFF_MERGE) == true) &&
      (GlobalData->stoneLevel > 0)) {


    MergeScaffoldsAggressive(ScaffoldGraph, ckpNames[CHECKPOINT_DURING_2ND_SCAFF_MERGE], FALSE);

    CleanupScaffolds(ScaffoldGraph, FALSE, NULLINDEX, FALSE);

    fprintf(stderr, "---Checking contig orders after MergeScaffoldsAggressive (2)\n\n");
    CheckAllContigOrientationsInAllScaffolds(ScaffoldGraph, coc, POPULATE_COC_HASHTABLE);

    AddAllScaffoldsToContigOrientChecker(ScaffoldGraph, coc);

    CheckpointScaffoldGraph(ckpNames[CHECKPOINT_AFTER_2ND_SCAFF_MERGE], "after 2nd scaffold merge");

  //  We DO want to flush unused unitigs/contigs at this point.  They're not in
  //  a scaffold, and possibly will never be used again (except as rocks/stones).

  //  The original rock throwing (above, RepeatRez()) calls TidyUpScaffolds() after each call to
  //  Fill_Gaps().  This does CleanupAScaffold() and LeastSquaresGapEstimates().  Then it rebuilds
  //  scaffold edges (but not contig edges).  It's not been tested here, so we don't do it yet.

  if ((runThisCheckpoint(restartFromLogical, CHECKPOINT_AFTER_FINAL_ROCKS) == true) &&
      (GlobalData->repeatRezLevel > 0)) {
    int32  extra_rocks = 0;
    int32  iter        = 0;
    do {

      //  Zero means to rebuild the hopeless scaffold array - e.g., try all scaffolds again.
      //  Before this, it was using iter, but iter was never changed from zero.
      extra_rocks = Fill_Gaps(GlobalData->outputPrefix, GlobalData->repeatRezLevel, 0);
      fprintf(stderr, "Threw additional %d rocks on iter %d\n", extra_rocks, iter++);

#if 0
      CleanupScaffolds(ScaffoldGraph, FALSE, NULLINDEX, FALSE);

      for (int32 sID=0; sID < GetNumCIScaffoldTs(ScaffoldGraph->CIScaffolds); sID++) {
        CIScaffoldT *scaffold = GetCIScaffoldT(ScaffoldGraph->CIScaffolds, sID);

        if (true == LeastSquaresGapEstimates(ScaffoldGraph, scaffold, LeastSquares_Cleanup | LeastSquares_Split))
          ScaffoldSanity(ScaffoldGraph, scaffold);

      vector<CDS_CID_t>  rawEdges;

      BuildSEdges(rawEdges, FALSE);
      MergeAllGraphEdges(ScaffoldGraph->ScaffoldGraph, rawEdges, TRUE, FALSE);

    } while (extra_rocks > 1);

    //  XXX do we need least squares here?
#if 1
    fprintf(stderr, "Beta - LeastSquaresGapEstimates #1 after final rocks\n");
    for (int32 sID=0; sID < GetNumCIScaffoldTs(ScaffoldGraph->CIScaffolds); sID++) {
      CIScaffoldT *scaffold = GetCIScaffoldT(ScaffoldGraph->CIScaffolds, sID);

      if (true == LeastSquaresGapEstimates(ScaffoldGraph, scaffold, LeastSquares_Cleanup | LeastSquares_Split))
        ScaffoldSanity(ScaffoldGraph, scaffold);

    CheckpointScaffoldGraph(ckpNames[CHECKPOINT_AFTER_FINAL_ROCKS], "after final rocks");

  if ((runThisCheckpoint(restartFromLogical, CHECKPOINT_AFTER_PARTIAL_STONES) == true) &&
      (GlobalData->stoneLevel > 0)) {

    ScaffoldSanity (ScaffoldGraph);

    int partial_stones = Throw_Stones(GlobalData->outputPrefix, GlobalData->stoneLevel, TRUE);

    //  XXX do we need least squares here?
#if 1
    fprintf(stderr, "Beta - LeastSquaresGapEstimates #2 after partial stones\n");
    for (int32 sID=0; sID < GetNumCIScaffoldTs(ScaffoldGraph->CIScaffolds); sID++) {
      CIScaffoldT *scaffold = GetCIScaffoldT(ScaffoldGraph->CIScaffolds, sID);

      if (true == LeastSquaresGapEstimates(ScaffoldGraph, scaffold, LeastSquares_Cleanup | LeastSquares_Split))
        ScaffoldSanity(ScaffoldGraph, scaffold);

    //  If throw_stones splits scaffolds, rebuild edges
      vector<CDS_CID_t>  rawEdges;

      BuildSEdges(rawEdges, TRUE);
      MergeAllGraphEdges(ScaffoldGraph->ScaffoldGraph, rawEdges, TRUE, TRUE);

    ScaffoldSanity (ScaffoldGraph);


    fprintf (stderr, "Threw %d partial stones\n", partial_stones);
            "---Checking contig orders after partial_stones\n\n");
    CheckAllContigOrientationsInAllScaffolds(ScaffoldGraph, coc, POPULATE_COC_HASHTABLE);
    AddAllScaffoldsToContigOrientChecker(ScaffoldGraph, coc);

    CheckpointScaffoldGraph(ckpNames[CHECKPOINT_AFTER_PARTIAL_STONES], "after partial stones");

    //GenerateLinkStats (ScaffoldGraph->CIGraph, "PStones", 0);
    //GeneratePlacedContigGraphStats ("PStones", 0);
    //GenerateLinkStats(ScaffoldGraph->ContigGraph, "PStones", 0);
    //GenerateScaffoldGraphStats ("PStones", 0);

  if ((runThisCheckpoint(restartFromLogical, CHECKPOINT_AFTER_FINAL_CONTAINED_STONES) == true) &&
      (GlobalData->stoneLevel > 0)) {

    ScaffoldSanity (ScaffoldGraph);

    int contained_stones = Toss_Contained_Stones (GlobalData->outputPrefix, GlobalData->stoneLevel, 0);
    fprintf(stderr, "Threw %d contained stones\n", contained_stones);
    fprintf (stderr, "**** Finished Final Contained Stones level %d ****\n", GlobalData->stoneLevel);

    //  Merge contigs before fiddling with gap sizes.
    CleanupScaffolds (ScaffoldGraph, FALSE, NULLINDEX, FALSE);

    //  XXX do we need least squares here?
#if 1
    fprintf(stderr, "Beta - LeastSquaresGapEstimates #3 after contained stones\n");
    for (int32 sID=0; sID < GetNumCIScaffoldTs(ScaffoldGraph->CIScaffolds); sID++) {
      CIScaffoldT *scaffold = GetCIScaffoldT(ScaffoldGraph->CIScaffolds, sID);

      if (true == LeastSquaresGapEstimates(ScaffoldGraph, scaffold, LeastSquares_Cleanup | LeastSquares_Split))
        ScaffoldSanity(ScaffoldGraph, scaffold);

    ScaffoldSanity (ScaffoldGraph);

    // Remove copies of surrogates which are placed multiple times in the same place in a contig


    fprintf(stderr, "---Checking contig orders after contained_stones\n\n");
    CheckAllContigOrientationsInAllScaffolds(ScaffoldGraph, coc, POPULATE_COC_HASHTABLE);
    AddAllScaffoldsToContigOrientChecker(ScaffoldGraph, coc);

    CheckpointScaffoldGraph(ckpNames[CHECKPOINT_AFTER_FINAL_CONTAINED_STONES], "after final contained stones");

    //GenerateLinkStats (ScaffoldGraph->CIGraph, "CStones", 0);
    //GeneratePlacedContigGraphStats ("CStones", 0);
    //GenerateLinkStats(ScaffoldGraph->ContigGraph, "CStones", 0);
    //GenerateScaffoldGraphStats ("CStones", 0);

  //  We DO want to flush unused unitigs/contigs at this point.  They're not in
  //  a scaffold, and possibly will never be used again (except as rocks/stones).

  if (runThisCheckpoint(restartFromLogical, CHECKPOINT_AFTER_FINAL_CLEANUP) == true) {

    // Try to cleanup failed merges, and if we do, generate a checkpoint
      // This call deletes surrogate-only contigs that failed to merge
      if(CleanupScaffolds(ScaffoldGraph, FALSE, NULLINDEX, TRUE)){

        fprintf(stderr, "---Checking contig orders after final cleanup\n\n");
        CheckAllContigOrientationsInAllScaffolds(ScaffoldGraph, coc, POPULATE_COC_HASHTABLE);

      //  XXX do we need least squares here?
#if 1
      fprintf(stderr, "Beta - LeastSquaresGapEstimates #4 after final cleanup\n");
      for (int32 sID=0; sID < GetNumCIScaffoldTs(ScaffoldGraph->CIScaffolds); sID++) {
        CIScaffoldT *scaffold = GetCIScaffoldT(ScaffoldGraph->CIScaffolds, sID);

        if (true == LeastSquaresGapEstimates(ScaffoldGraph, scaffold, LeastSquares_Cleanup | LeastSquares_Split))
          ScaffoldSanity(ScaffoldGraph, scaffold);

      CheckpointScaffoldGraph(ckpNames[CHECKPOINT_AFTER_FINAL_CLEANUP], "after final cleanup");

  if ((runThisCheckpoint(restartFromLogical, CHECKPOINT_AFTER_RESOLVE_SURROGATES) == true) &&
      (doResolveSurrogates > 0)) {

    resolveSurrogates(placeAllFragsInSinglePlacedSurros, cutoffToInferSingleCopyStatus);
    // Call resolve surrogate twice, this is necessary for finishing (closure) reads.
    // Consider a closure read and its two bounding reads, named left and right:
    //    If one (right) is placed in a unique region while the other (left) is in a surrogate itself, the closure read cannot be placed
    //    However, once the surrogate bounding read is placed (and fully incorporated which happens at the very end of resolveSurrogates)
    //    the closure read can be placed. 
    //    Therefore, we run resolve surrogates twice. 
    // Note that if closure reads are themselves mated, it may be necessary to do a third round of placement.
    resolveSurrogates(placeAllFragsInSinglePlacedSurros, cutoffToInferSingleCopyStatus);
    //  XXX do we need least squares here?
#if 1
    fprintf(stderr, "Beta - LeastSquaresGapEstimates #5 after resolve surrogates\n");
    for (int32 sID=0; sID < GetNumCIScaffoldTs(ScaffoldGraph->CIScaffolds); sID++) {
      CIScaffoldT *scaffold = GetCIScaffoldT(ScaffoldGraph->CIScaffolds, sID);

      if (true == LeastSquaresGapEstimates(ScaffoldGraph, scaffold, LeastSquares_Cleanup | LeastSquares_Split))
        ScaffoldSanity(ScaffoldGraph, scaffold);

    CheckpointScaffoldGraph(ckpNames[CHECKPOINT_AFTER_RESOLVE_SURROGATES], "after resolve surrogates");

  //  This generates the 'rezlog/gapreads' file.  It's hugely
  //  expensive, usually dies on a negative variance assert, and as
  //  far as BPW knows, unused.
  //Show_Reads_In_Gaps (GlobalData->outputPrefix);

  ComputeMatePairStatisticsRestricted(SCAFFOLD_OPERATIONS, GlobalData->minSamplesForOverride, "scaffold_final");
  ComputeMatePairStatisticsRestricted(CONTIG_OPERATIONS, GlobalData->minSamplesForOverride, "contig_final");

#ifdef DEBUG
  int j = 0;
  for (j = 0; j < GetNumVA_CIFragT(ScaffoldGraph->CIFrags); j++) {
    CIFragT * frag = GetCIFragT(ScaffoldGraph->CIFrags, j);
    if (ScaffoldGraph->gkpStore->gkStore_getFRGtoPLC(frag->read_iid) != 0) {
      AS_UID uid = getGatekeeperIIDtoUID(ScaffoldGraph->gkpStore, frag->read_iid, AS_IID_FRG);
      if (frag->contigID != -1) {
        ChunkInstanceT * ctg = GetGraphNode(ScaffoldGraph->ContigGraph, frag->contigID);            
        fprintf(stderr, "CLOSURE_READS: CLOSURE READ %s PLACED=%d CHAFF=%d SINGLETON=%d IN ASM type %c in SCF %d\n", AS_UID_toString(uid), frag->flags.bits.isPlaced, frag->flags.bits.isChaff, frag->flags.bits.isSingleton, frag->type, ctg->scaffoldID);

  //  We DO want to flush unused unitigs/contigs at this point.  They're not in
  //  a scaffold, and possibly will never be used again (except as rocks/stones).
  //  (This assumes that output doesn't load unitigs/contigs again)




    //  Note that OutputContigs partitions the tigStore, and closes ScaffoldGraph->tigStore.  The
    //  only operation valid after this function is CheckpointScaffoldGraph().

    OutputContigsFromMultiAligns(outputFragsPerPartition, preserveConsensus);

    CheckpointScaffoldGraph(ckpNames[CHECKPOINT_AFTER_OUTPUT], "after output");


  delete GlobalData;

  fprintf(stderr,"* Bye *\n");

Пример #18
//  Entry point of the assembly-output writer, as excerpted here.  Visible steps:
//    1. parse the command line into GlobalData and a few locals;
//    2. open '<outputPrefix>.asm' for writing;
//    3. load the ScaffoldGraph from a checkpoint (when -c was given) or create a new one;
//    4. reopen ScaffoldGraph->tigStore at the requested tigStore version;
//    5. call writeMDI() ... writeSLK() on the .asm file;
//    6. dump FRGmap/UTGmap/CCOmap/SCFmap into '<outputPrefix>.iidtouid'.
//
//  NOTE(review): this listing looks abridged -- closing braces and some statements
//  (advancing 'arg', setting 'err', exiting after the usage text, the function's return)
//  are not visible.  The notes below describe only what is visible.
int main (int argc, char *argv[]) {
  FILE       *asmFile                  = NULL;
  char       *outputPrefix             = NULL;
  char        outputName[FILENAME_MAX] = {0};
  int32       checkpointVers           = 0;
  int32       tigStoreVers             = 0;
  uint64      uidStart                 = 0;

  //  Set to TRUE only by -c; selects checkpoint loading below and is passed to
  //  writeAFG() and writeUTG().
  int32       outputScaffolds          = FALSE;

  GlobalData = new Globals_CGW();

  argc = AS_configure(argc, argv);

  //  Option parsing.  -t and -c each consume two arguments (a path, then a version).
  //  NOTE(review): the strcpy() calls copy argv strings into GlobalData buffers without a
  //  length check; the buffer sizes are not visible here -- confirm they cannot overflow.
  int arg=1;
  int err=0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-g") == 0) {
      strcpy(GlobalData->gkpStoreName, argv[++arg]);

    } else if (strcmp(argv[arg], "-t") == 0) {
      strcpy(GlobalData->tigStoreName, argv[++arg]);
      tigStoreVers = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-c") == 0) {
      strcpy(GlobalData->outputPrefix, argv[++arg]);
      checkpointVers = atoi(argv[++arg]);
      outputScaffolds = TRUE;

    } else if (strcmp(argv[arg], "-o") == 0) {
      outputPrefix = argv[++arg];

    } else if (strcmp(argv[arg], "-s") == 0) {
      uidStart = strtoul(argv[++arg], NULL, 10);

    //  The -n, -E and -h branches have no visible body in this excerpt.
    } else if (strcmp(argv[arg], "-n") == 0) {

    } else if (strcmp(argv[arg], "-E") == 0) {

    } else if (strcmp(argv[arg], "-h") == 0) {

    } else {
      fprintf(stderr, "%s: unknown option '%s'\n", argv[0], argv[arg]);
  //  Usage is printed when either store name is empty or 'err' is set.
  //  NOTE(review): the usage synopsis line omits -t and -c even though they are described
  //  below it, and 'outputPrefix' (-o) is not part of this check although it is used
  //  unconditionally for the output file names.
  if ((GlobalData->gkpStoreName[0] == 0) ||
      (GlobalData->tigStoreName[0] == 0) ||
      (err)) {
    fprintf(stderr, "usage: %s -g gkpStore [-o prefix] [-s firstUID] [-n namespace] [-E server] [-h]\n", argv[0]);
    fprintf(stderr, "  -g gkpStore             mandatory path to the gkpStore\n");
    fprintf(stderr, "  -t tigStore version     mandatory path to the tigStore and version\n");
    fprintf(stderr, "  -c checkpoint version   optional path to a checkpoint and version\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -o prefix               write the output here\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -s firstUID      don't use real UIDs, but start counting from here\n");
    fprintf(stderr, "  -n namespace     use this UID namespace\n");
    fprintf(stderr, "  -E server        use this UID server\n");

  //  Open '<outputPrefix>.asm'.
  //  NOTE(review): 'outputPrefix' is still NULL here if -o was not supplied; passing NULL to
  //  "%s" is undefined.  Failure is detected through errno rather than the fopen() result.
  sprintf(outputName, "%s.asm", outputPrefix);
  errno = 0;
  asmFile = fopen(outputName, "w");
  if (errno)
    fprintf(stderr, "%s: Couldn't open '%s' for write: %s\n", argv[0], outputName, strerror(errno)), exit(1);

  // if we have contigs
  if (outputScaffolds) {
    LoadScaffoldGraphFromCheckpoint(GlobalData->outputPrefix, checkpointVers, FALSE);
  } else {
    ScaffoldGraph = CreateScaffoldGraph(outputPrefix);

  //  Reopen the tigStore used for consensus.
  delete ScaffoldGraph->tigStore;
  ScaffoldGraph->tigStore = new MultiAlignStore(GlobalData->tigStoreName, tigStoreVers, 0, 0, FALSE, FALSE);

  fprintf(stderr, "Writing assembly file\n");

  //  MDI and AFG are written before the UID server is initialized; the remaining writers
  //  run after it.
  writeMDI(asmFile, true);
  writeAFG(asmFile, true, outputScaffolds);

  //  If uidStart is zero, use the UID server; otherwise, initialize the 'fake uid server' to start
  //  there, or after the last UID used by a fragment.
  //  ('uidMin' and 'uidServer' are not declared in this function; presumably file-level
  //  variables, with uidMin updated by the writers above -- confirm.)
  uidServer = UIDserverInitialize(256, (uidStart == 0) ? 0 : MAX(uidMin, uidStart));

  writeAMP(asmFile, true);
  writeUTG(asmFile, true, outputScaffolds);
  writeULK(asmFile, true);
  writeCCO(asmFile, true);
  writeCLK(asmFile, true);
  writeSCF(asmFile, true);
  writeSLK(asmFile, true);


  fprintf(stderr, "Assembly file complete.\n");
  fprintf(stderr, "Writing IID to UID mapping files.\n");

  //  'outputName' and 'asmFile' are reused for '<outputPrefix>.iidtouid'.
  //  NOTE(review): no fclose() of the .asm stream is visible before asmFile is reassigned,
  //  nor of the .iidtouid stream afterwards -- confirm against the full source.
  sprintf(outputName, "%s.iidtouid", outputPrefix);
  errno = 0;
  asmFile = fopen(outputName, "w");
  if (errno)
    fprintf(stderr, "%s: Couldn't open '%s' for write: %s\n", argv[0], outputName, strerror(errno)), exit(1);

  //  Each map is dumped with its own record label.
  FRGmap.dump("FRG", asmFile);
  UTGmap.dump("UTG", asmFile);
  CCOmap.dump("CTG", asmFile);
  SCFmap.dump("SCF", asmFile);


  fprintf(stderr, "IID to UID mapping files complete.\n");


Пример #19
//  Entry point of a tool that walks every fragment in a checkpointed ScaffoldGraph and, for
//  fragments whose chunk instance is flagged as chaff, prepares either a "singleton" record
//  or a "mini_scaffold" record that joins a fragment with its chaff mate (see the two format
//  strings below).
//
//  NOTE(review): this listing looks abridged -- closing braces, the bodies of some 'if'
//  statements, and the first line of each output call are not visible.  The notes below
//  describe only what is visible.
main( int argc, char **argv) {
  int          ckptNum           = NULLINDEX;
  int          makeMiniScaffolds = 1;
  uint64       uidStart          = 1230000;
  UIDserver   *uids              = NULL;

  GlobalData = new Globals_CGW();

  argc = AS_configure(argc, argv);

  //  Option parsing.  -p sets ckptNum from GlobalData->setPrefix(); -n sets it explicitly.
  //  -U resets uidStart to 0; -S disables mini scaffolds.
  //  NOTE(review): the strcpy() calls copy argv strings into GlobalData buffers without a
  //  length check; buffer sizes are not visible here.
  int err=0;
  int arg=1;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-p") == 0) {
      ckptNum = GlobalData->setPrefix(argv[++arg]);
    } else if (strcmp(argv[arg], "-c") == 0) {
      strcpy(GlobalData->outputPrefix, argv[++arg]);
    } else if (strcmp(argv[arg], "-g") == 0) {
      strcpy(GlobalData->gkpStoreName, argv[++arg]);
    } else if (strcmp(argv[arg], "-t") == 0) {
      strcpy(GlobalData->tigStoreName, argv[++arg]);
    } else if (strcmp(argv[arg], "-n") == 0) {
      ckptNum = atoi(argv[++arg]);
    } else if (strcmp(argv[arg], "-U") == 0) {
      uidStart = 0;
    } else if (strcmp(argv[arg], "-S") == 0) {
      makeMiniScaffolds = 0;
    } else {
      fprintf(stderr, "unknown option '%s'\n", argv[arg]);
      err = 1;

  //  NOTE(review): 'err' is set above for unknown options but is not tested in this
  //  condition, so (as visible here) an unknown option alone does not trigger the usage
  //  text.  -t is accepted by the parser but not listed in the usage text.
  if ((GlobalData->outputPrefix[0] == 0) ||
      (GlobalData->gkpStoreName[0] == 0)) {
    fprintf(stderr, "usage: %s [[-p prefix] | [-c name -g gkpstore -n ckptNum]] [-U] [-S]\n", argv[0]);
    fprintf(stderr, "  -p      Attempt to locate the last checkpoint in directory 7-CGW.\n");
    fprintf(stderr, "  -c      Look for checkpoints in 'name'\n");
    fprintf(stderr, "  -g      Path to gkpStore\n");
    fprintf(stderr, "  -n      Checkpoint number to load\n");
    fprintf(stderr, "  -U      Use real UIDs for miniscaffolds, otherwise, UIDs start at 1230000\n");
    fprintf(stderr, "  -S      Do NOT make mini scaffolds.\n");

  uids = UIDserverInitialize(256, uidStart);

  //  Scratch buffer sized for two maximum-length reads plus 51 + 2 extra bytes.
  //  NOTE(review): no matching free of 'toprint' is visible in this excerpt.
  char *toprint = (char *)safe_malloc(sizeof(char) * (AS_READ_MAX_NORMAL_LEN + 51 + AS_READ_MAX_NORMAL_LEN + 2));

  LoadScaffoldGraphFromCheckpoint(GlobalData->outputPrefix, ckptNum, FALSE);

  int ifrag;
  for (ifrag=0; ifrag < GetNumVA_CIFragT(ScaffoldGraph->CIFrags); ifrag++) {
    CIFragT *frag = GetCIFragT(ScaffoldGraph->CIFrags, ifrag);
    CIFragT *mate = NULL;

    //  NOTE(review): the statement guarded by this test is not visible; presumably deleted
    //  fragments are skipped -- confirm.
    if (frag->flags.bits.isDeleted)

    assert(frag->cid != NULLINDEX);
    assert((frag->flags.bits.hasMate == 0) || (frag->mate_iid != 0));

    //  Fix for missing mates -- OBT used to not delete mate links, leaving
    //  dangling mates.  Somebody else seems to be doing this too.
    //  (mate_iid is used directly as an index into CIFrags; a NULL lookup clears hasMate.)
    if (frag->flags.bits.hasMate) {
      mate = GetCIFragT(ScaffoldGraph->CIFrags, frag->mate_iid);
      if (mate == NULL)
        frag->flags.bits.hasMate = 0;

    //  If this fragment is not chaff, we have nothing to do here.
    //  NOTE(review): the guarded statement is not visible in this excerpt.
    if (GetGraphNode(ScaffoldGraph->CIGraph,frag->cid)->flags.bits.isChaff == 0)

    //  Print a singleton if there is no mate, the mate isn't chaff,
    //  or we were told to not make miniscaffolds.
    if ((mate == NULL) ||
        (mate->flags.bits.isChaff == 0) ||
        (makeMiniScaffolds == 0)) {
      //  getFragmentClear() is called with 'toprint' as its output buffer and returns the
      //  fragment UID.  The lines below are the tail of an output call whose first line is
      //  not visible in this excerpt.
      AS_UID  fUID = getFragmentClear(frag->read_iid, 0, toprint);

                        toprint, strlen(toprint), 0,
                        ">%s /type=singleton\n", AS_UID_toString(fUID));

    //  The first three tests repeat the negation of the branch above (assuming the flag and
    //  makeMiniScaffolds only take the values 0 and 1); the read_iid comparison presumably
    //  makes each mated pair get emitted once, from its lower-iid fragment -- confirm.
    } else if ((mate != NULL) &&
               (mate->flags.bits.isChaff == 1) &&
               (makeMiniScaffolds == 1) &&
               (frag->read_iid < mate->read_iid)) {

      //  make sure the following chain of Ns is divisible by three;
      //  the exact length is arbitrary but Doug Rusch points out that
      //  by making it divisible by 3, we can get lucky and maintain
      //  the phase of a protein ...  which helps in the
      //  auto-annotation of environmental samples

      AS_UID  fUID = getFragmentClear(frag->read_iid, 0, toprint);


      //  The mate is written starting at the current end of 'toprint'.  The second argument
      //  differs from the call above (1 vs 0); its meaning is not visible here -- presumably
      //  an orientation flag, confirm.  The code that appends the Ns described above is not
      //  visible in this excerpt.
      AS_UID  mUID = getFragmentClear(mate->read_iid, 1, toprint + strlen(toprint));

                        toprint, strlen(toprint), 0,
                        ">"F_U64" /type=mini_scaffold /frgs=(%s,%s)\n",

  delete GlobalData;

Пример #20
//  Entry point of a tool that reads a meryl mer-count histogram (-m) and picks 'maxCount',
//  an upper limit on mer count, which it prints on stdout.  Progress and the intermediate
//  guesses are logged to stderr.  The choice is made in three passes over the histogram:
//    Pass 0 guesses the X coverage, Pass 1 picks an initial limit, Pass 2 optionally
//    extends it.
//
//  NOTE(review): this listing looks abridged -- closing braces and some statements
//  (advancing 'arg', setting 'err', exiting after usage) are not visible.  The notes below
//  describe only what is visible.
main(int argc, char **argv) {
  //  NOTE(review): 'gkpPath' is not referenced anywhere in the visible code.
  char              *gkpPath = 0L;
  char              *merCountsFile = 0L;

  merylStreamReader *MF  = 0L;

  uint32             maxCount = 0;

  argc = AS_configure(argc, argv);

  int arg=1;
  int err=0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-m") == 0) {
      merCountsFile = argv[++arg];

    } else {
      fprintf(stderr, "unknown option '%s'\n", argv[arg]);
  if ((merCountsFile == 0L) || (err)) {
    fprintf(stderr, "usage: %s -m mercounts\n", argv[0]);
    fprintf(stderr, "  -m mercounts    file of mercounts\n");

  MF = new merylStreamReader(merCountsFile);

  //  Examine the counts, pick a reasonable upper limit.

  //  The "useful" totals exclude the unique mers; 'distinct' and 'total' are running sums
  //  accumulated by Pass 1 and Pass 2.  Xcoverage starts at the default guess of 8.
  uint64  totalUsefulDistinct = MF->numberOfDistinctMers() - MF->numberOfUniqueMers();
  uint64  totalUsefulAll      = MF->numberOfTotalMers()    - MF->numberOfUniqueMers();
  uint64  distinct            = 0;
  uint64  total               = 0;
  uint32  Xcoverage           = 8;

  fprintf(stderr, "distinct: "F_U64"\n", MF->numberOfDistinctMers());
  fprintf(stderr, "unique:   "F_U64"\n", MF->numberOfUniqueMers());
  fprintf(stderr, "total:    "F_U64"\n", MF->numberOfTotalMers());

  //  Pass 0: try to deduce the X coverage we have.  The
  //  pattern we should see in mer counts is an initial spike
  //  for unique mers (these contain errors), then a drop into
  //  a valley, and a bump at the X coverage.
  //  .
  //  .      ...
  //  ..  ..........
  //  .................
  //  If this pattern is not found, we fallback to the default
  //  guess of 8x coverage.

  uint32  i  = 0;
  uint32  iX = 0;

  //  NOTE(review): these three lines repeat the log output already printed above.
  fprintf(stderr, "distinct: "F_U64"\n", MF->numberOfDistinctMers());
  fprintf(stderr, "unique:   "F_U64"\n", MF->numberOfUniqueMers());
  fprintf(stderr, "total:    "F_U64"\n", MF->numberOfTotalMers());

  fprintf(stderr, "Xcoverage zero 1 0 "F_U64"\n", MF->histogram(1));

  //  Walk forward while the histogram is strictly decreasing; the loop stops at the first
  //  index whose value is not below its predecessor's.
  for (i=2; (i < MF->histogramLength()) && (MF->histogram(i-1) > MF->histogram(i)); i++)
    fprintf(stderr, "Xcoverage drop "F_U32" "F_U64" "F_U64"\n", i, MF->histogram(i-1), MF->histogram(i));

  //  iX is the last index of that decreasing run (the bottom of the valley).
  iX = i - 1;

  //  From there, move iX to each index whose histogram value exceeds the value at iX.
  for (; i < MF->histogramLength(); i++) {
    if (MF->histogram(iX) < MF->histogram(i)) {
      fprintf(stderr, "Xcoverage incr "F_U32" "F_U64" "F_U64"\n", i, MF->histogram(iX), MF->histogram(i));
      iX = i;
    } else {
      //fprintf(stderr, "Xcoverage drop "F_U32" "F_U64" "F_U64"\n", i, MF->histogram(iX), MF->histogram(i));

  fprintf(stderr, "Guessed X coverage is "F_U32"\n", iX);

  //  NOTE(review): as visible here the assignment is unconditional, so the default of 8
  //  mentioned in the Pass 0 comment is always overwritten -- confirm against full source.
  Xcoverage = iX;

  //  Pass 1: look for a reasonable limit, using %distinct and %total.
  //  The loop ends as soon as maxCount becomes non-zero, or the histogram is exhausted.
  for (i=2; (i < MF->histogramLength()) && (maxCount == 0); i++) {
    distinct += MF->histogram(i);
    total    += MF->histogram(i) * i;

    //  If we cover 99% of all the distinct mers, that's reasonable.
    if ((distinct / (double)totalUsefulDistinct) > 0.99)
      maxCount = i;

    //  If we're a somewhat high count, and we're covering 2/3
    //  of the total mers, assume that there are lots of
    //  errors (or polymorphism) that are preventing us from
    //  covering many distinct mers.
    if ((i > 25 * Xcoverage) && ((total / (double)totalUsefulAll) > (2.0 / 3.0)))
      maxCount = i;

  //  NOTE(review): this message prints 'i', not 'maxCount'.  If the loop above ended
  //  because maxCount was set, the loop increment has already run, so 'i' would be one
  //  larger than the value stored in maxCount -- confirm whether 'maxCount' was intended.
  fprintf(stderr, "Set maxCount to "F_U32", which will cover %.2f%% of distinct mers and %.2f%% of all mers.\n",
          i, 100.0 * distinct / totalUsefulDistinct, 100.0 * total / totalUsefulAll);

  //  Pass 2: if the limit is relatively small compared to our
  //  guessed Xcoverage, and %total is high, keep going to
  //  close 75% of the gap in total coverage.  So if the TC is
  //  90%, we'd keep going until TC is 97.5%.
  //  If we're WAY low compared to X coverage, close the gap
  //  too, but not as much.  This only happens if we're
  //  covering 99% of the distinct, so we're already in good
  //  shape.  The genome doesn't appear to be very repetitive.
  if (((maxCount <  5 * Xcoverage)) ||
      ((maxCount < 50 * Xcoverage) && (total / (double)totalUsefulAll > 0.90))) {
    double  closeAmount = 0.75;

    //  Close only half of the gap when total coverage is at or below 90%.
    if (total / (double)totalUsefulAll <= 0.90)
      closeAmount = 0.5;

    //  No, really.  This is just 0.75 * (1-TC) + TC
    double  desiredTC = closeAmount + (1 - closeAmount) * total / (double)totalUsefulAll;

    //  Continue from the index where Pass 1 stopped, accumulating until the desired total
    //  coverage is reached or the histogram ends.
    for (; (i < MF->histogramLength()) && (total / (double)totalUsefulAll < desiredTC); i++) {
      distinct += MF->histogram(i);
      total    += MF->histogram(i) * i;

    maxCount = i;

    fprintf(stderr, "Reset maxCount to "F_U32", which will cover %.2f%% of distinct mers and %.2f%% of all mers.\n",
            maxCount, 100.0 * distinct / totalUsefulDistinct, 100.0 * total / totalUsefulAll);

  //  The final answer is the only thing written to stdout.
  fprintf(stdout, F_U32"\n", maxCount);

Пример #21
//  Entry point of the overlapper, as excerpted here.  Visible steps:
//    1. parse the command line into the global parameter object 'G' and a few file-level
//       variables (OFFSET_BITS, STRING_NUM_BITS, ...);
//    2. validate the parameters and print the usage text on error;
//    3. open the output overlap file, derive the hash-function variables, log parameters;
//    4. initialize the Bit_Equivalent / Char_Is_Bad lookup tables;
//    5. allocate and zero the hash-table arrays;
//    6. release the arrays and write summary counters to the stats stream.
//
//  NOTE(review): this listing looks abridged -- closing braces, 'arg' increments, and the
//  actual overlap computation between allocation and release are not visible.  The notes
//  below describe only what is visible.
main(int argc, char **argv) {
  //  NOTE(review): 'illegal' is not referenced anywhere in the visible code.
  int  illegal;

  argc = AS_configure(argc, argv);


  int err=0;
  int arg=1;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-G") == 0) {
      G.Doing_Partial_Overlaps = TRUE;

    //  -h / -H / -r / -R each take one range argument decoded into a begin/end (or
    //  min/max) pair by AS_UTL_decodeRange().
    } else if (strcmp(argv[arg], "-h") == 0) {
      AS_UTL_decodeRange(argv[++arg], G.bgnHashID, G.endHashID);

    } else if (strcmp(argv[arg], "-H") == 0) {
      AS_UTL_decodeRange(argv[++arg], G.minLibToHash, G.maxLibToHash);

    } else if (strcmp(argv[arg], "-r") == 0) {
      AS_UTL_decodeRange(argv[++arg], G.bgnRefID, G.endRefID);

    } else if (strcmp(argv[arg], "-R") == 0) {
      AS_UTL_decodeRange(argv[++arg], G.minLibToRef, G.maxLibToRef);

    //  -k: a one- or two-digit value is taken as the kmer length; anything else is opened
    //  as a file (stored in G.Kmer_Skip_File).
    //  NOTE(review): no increment of 'arg' is visible before these tests, so as written
    //  argv[arg] is still the "-k" string itself; presumably an 'arg++' was dropped from
    //  this excerpt -- confirm against the full source.
    } else if (strcmp(argv[arg], "-k") == 0) {
      if ((isdigit(argv[arg][0]) && (argv[arg][1] == 0)) ||
          (isdigit(argv[arg][0]) && isdigit(argv[arg][1]) && (argv[arg][2] == 0))) {
        G.Kmer_Len = strtoull(argv[arg], NULL, 10);
      } else {
        errno = 0;
        G.Kmer_Skip_File = fopen(argv[arg], "r");
        if (errno)
          fprintf(stderr, "ERROR: Failed to open -k '%s': %s\n", argv[arg], strerror(errno)), exit(1);

    //  -l: values below 1 are replaced by UINT64_MAX.
    } else if (strcmp(argv[arg], "-l") == 0) {
      G.Frag_Olap_Limit = strtol(argv[++arg], NULL, 10);
      if  (G.Frag_Olap_Limit < 1)
        G.Frag_Olap_Limit = UINT64_MAX;

    //  -m and -u are opposite settings of the same flag.
    } else if (strcmp(argv[arg], "-m") == 0) {
      G.Unique_Olap_Per_Pair = FALSE;
    } else if (strcmp(argv[arg], "-u") == 0) {
      G.Unique_Olap_Per_Pair = TRUE;

    } else if (strcmp(argv[arg], "--hashbits") == 0) {
      G.Hash_Mask_Bits = strtoull(argv[++arg], NULL, 10);

    } else if (strcmp(argv[arg], "--hashstrings") == 0) {
      G.Max_Hash_Strings = strtoull(argv[++arg], NULL, 10);

    } else if (strcmp(argv[arg], "--hashdatalen") == 0) {
      G.Max_Hash_Data_Len = strtoull(argv[++arg], NULL, 10);

    } else if (strcmp(argv[arg], "--hashload") == 0) {
      G.Max_Hash_Load = atof(argv[++arg]);

    } else if (strcmp(argv[arg], "--maxreadlen") == 0) {
      //  Quite the gross way to do this, but simple.
      //  The loop condition looks for the smallest OFFSET_BITS with 2^OFFSET_BITS >= desired;
      //  STRING_NUM_BITS then gets the remainder of a 30-bit budget.
      //  NOTE(review): the loop body is not visible in this excerpt (presumably it
      //  increments OFFSET_BITS); as written the following assignment would become the
      //  loop body -- confirm against the full source.
      uint32 desired = strtoul(argv[++arg], NULL, 10);
      OFFSET_BITS = 1;
      while (((uint32)1 << OFFSET_BITS) < desired)

      STRING_NUM_BITS       = 30 - OFFSET_BITS;

      STRING_NUM_MASK       = (1 << STRING_NUM_BITS) - 1;
      OFFSET_MASK           = (1 << OFFSET_BITS) - 1;


    } else if (strcmp(argv[arg], "-o") == 0) {
      G.Outfile_Name = argv[++arg];

    } else if (strcmp(argv[arg], "-s") == 0) {
      G.Outstat_Name = argv[++arg];

    } else if (strcmp(argv[arg], "-t") == 0) {
      G.Num_PThreads = strtoull(argv[++arg], NULL, 10);

    } else if (strcmp(argv[arg], "--minlength") == 0) {
      G.Min_Olap_Len = strtol (argv[++arg], NULL, 10);
    //  The error rate is rounded up to the next multiple of 0.01.
    } else if (strcmp(argv[arg], "--maxerate") == 0) {
      G.maxErate = ceil(strtof(argv[++arg], NULL) * 100) / 100;

    } else if (strcmp(argv[arg], "-w") == 0) {
      G.Use_Window_Filter = TRUE;

    } else if (strcmp(argv[arg], "-z") == 0) {
      G.Use_Hopeless_Check = FALSE;

    //  The first unrecognized argument is taken as the store path; any later one is
    //  reported as an unknown option.
    } else {
      if (G.Frag_Store_Path == NULL) {
        G.Frag_Store_Path = argv[arg];
      } else {
        fprintf(stderr, "Unknown option '%s'\n", argv[arg]);

  //  Fix up some flags if we're allowing high error rates.
  if (G.maxErate > 0.06) {
    if (G.Use_Window_Filter)
      fprintf(stderr, "High error rates requested -- window-filter turned off despite -w flag!\n");
    G.Use_Window_Filter  = FALSE;
    G.Use_Hopeless_Check = FALSE;

  //  Parameter validation; each failure bumps 'err'.
  //  NOTE(review): the first message asks for -M, but the visible parser has no -M branch;
  //  Max_Hash_Strings is set by --hashstrings.
  if (G.Max_Hash_Strings == 0)
    fprintf(stderr, "* No memory model supplied; -M needed!\n"), err++;

  if (G.Kmer_Len == 0)
    fprintf(stderr, "* No kmer length supplied; -k needed!\n"), err++;

  if (G.Max_Hash_Strings > MAX_STRING_NUM)
    fprintf(stderr, "Too many strings (--hashstrings), must be less than "F_U64"\n", MAX_STRING_NUM), err++;

  if (G.Outfile_Name == NULL)
    fprintf (stderr, "ERROR:  No output file name specified\n"), err++;

  //  NOTE(review): the usage text below lists -b, -c, -I, -M, -P, --readsperbatch and
  //  --readsperthread, none of which is handled by the visible parser, and it omits -H, -R
  //  and -s, which are handled -- confirm which side is out of date.
  if ((err) || (G.Frag_Store_Path == NULL)) {
    fprintf(stderr, "USAGE:  %s [options] <gkpStorePath>\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "-b <fn>     in contig mode, specify the output file\n");
    fprintf(stderr, "-c          contig mode.  Use 2 frag stores.  First is\n");
    fprintf(stderr, "            for reads; second is for contigs\n");
    fprintf(stderr, "-G          do partial overlaps\n");
    fprintf(stderr, "-h <range>  to specify fragments to put in hash table\n");
    fprintf(stderr, "            Implies LSF mode (no changes to frag store)\n");
    fprintf(stderr, "-I          designate a file of frag iids to limit olaps to\n");
    fprintf(stderr, "            (Contig mode only)\n");
    fprintf(stderr, "-k          if one or two digits, the length of a kmer, otherwise\n");
    fprintf(stderr, "            the filename containing a list of kmers to ignore in\n");
    fprintf(stderr, "            the hash table\n");
    fprintf(stderr, "-l          specify the maximum number of overlaps per\n");
    fprintf(stderr, "            fragment-end per batch of fragments.\n");
    fprintf(stderr, "-m          allow multiple overlaps per oriented fragment pair\n");
    fprintf(stderr, "-M          specify memory size.  Valid values are '8GB', '4GB',\n");
    fprintf(stderr, "            '2GB', '1GB', '256MB'.  (Not for Contig mode)\n");
    fprintf(stderr, "-o          specify output file name\n");
    fprintf(stderr, "-P          write protoIO output (if not -G)\n");
    fprintf(stderr, "-r <range>  specify old fragments to overlap\n");
    fprintf(stderr, "-t <n>      use <n> parallel threads\n");
    fprintf(stderr, "-u          allow only 1 overlap per oriented fragment pair\n");
    fprintf(stderr, "-w          filter out overlaps with too many errors in a window\n");
    fprintf(stderr, "-z          skip the hopeless check\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "--maxerate <n>     only output overlaps with fraction <n> or less error (e.g., 0.06 == 6%%)\n");
    fprintf(stderr, "--minlength <n>    only output overlaps of <n> or more bases\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "--hashbits n       Use n bits for the hash mask.\n");
    fprintf(stderr, "--hashstrings n    Load at most n strings into the hash table at one time.\n");
    fprintf(stderr, "--hashdatalen n    Load at most n bytes into the hash table at one time.\n");
    fprintf(stderr, "--hashload f       Load to at most 0.0 < f < 1.0 capacity (default 0.7).\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "--maxreadlen n     For batches with all short reads, pack bits differently to\n");
    fprintf(stderr, "                   process more reads per batch.\n");
    fprintf(stderr, "                     all reads must be shorter than n\n");
    fprintf(stderr, "                     --hashstrings limited to 2^(30-m)\n");
    fprintf(stderr, "                   Common values:\n");
    fprintf(stderr, "                     maxreadlen 2048->hashstrings  524288 (default)\n");
    fprintf(stderr, "                     maxreadlen  512->hashstrings 2097152\n");
    fprintf(stderr, "                     maxreadlen  128->hashstrings 8388608\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "--readsperbatch n  Force batch size to n.\n");
    fprintf(stderr, "--readsperthread n Force each thread to process n reads.\n");
    fprintf(stderr, "\n");

  Out_BOF = new ovFile(G.Outfile_Name, ovFileFullWrite);

  //  We know enough now to set the hash function variables, and some other random variables.

  //  All five values are derived from the kmer length and the hash mask width only.
  HSF1 = G.Kmer_Len - (G.Hash_Mask_Bits / 2);
  HSF2 = 2 * G.Kmer_Len - G.Hash_Mask_Bits;
  SV1  = HSF1 + 2;
  SV2  = (HSF1 + HSF2) / 2;
  SV3  = HSF2 - 2;

  //  Log parameters.

  fprintf(stderr, "\n");
  fprintf(stderr, "STRING_NUM_BITS       "F_U32"\n", STRING_NUM_BITS);
  fprintf(stderr, "OFFSET_BITS           "F_U32"\n", OFFSET_BITS);
  fprintf(stderr, "STRING_NUM_MASK       "F_U64"\n", STRING_NUM_MASK);
  fprintf(stderr, "OFFSET_MASK           "F_U64"\n", OFFSET_MASK);
  fprintf(stderr, "MAX_STRING_NUM        "F_U64"\n", MAX_STRING_NUM);
  fprintf(stderr, "\n");
  fprintf(stderr, "Hash_Mask_Bits        "F_U32"\n", G.Hash_Mask_Bits);
  fprintf(stderr, "Max_Hash_Strings      "F_U32"\n", G.Max_Hash_Strings);
  fprintf(stderr, "Max_Hash_Data_Len     "F_U64"\n", G.Max_Hash_Data_Len);
  fprintf(stderr, "Max_Hash_Load         %f\n", G.Max_Hash_Load);
  fprintf(stderr, "Kmer Length           "F_U64"\n", G.Kmer_Len);
  fprintf(stderr, "Min Overlap Length    %d\n", G.Min_Olap_Len);
  fprintf(stderr, "Max Error Rate        %f\n", G.maxErate);
  fprintf(stderr, "\n");
  fprintf(stderr, "Num_PThreads          "F_U32"\n", G.Num_PThreads);

  //  Twice the kmer length must be smaller than the bit width of a uint64 (the table
  //  below maps each base to a value in 0..3, i.e. two bits).
  assert (8 * sizeof (uint64) > 2 * G.Kmer_Len);

  Bit_Equivalent['a'] = Bit_Equivalent['A'] = 0;
  Bit_Equivalent['c'] = Bit_Equivalent['C'] = 1;
  Bit_Equivalent['g'] = Bit_Equivalent['G'] = 2;
  Bit_Equivalent['t'] = Bit_Equivalent['T'] = 3;

  //  Mark every byte value other than a/c/g/t (either case) as bad.
  //  NOTE(review): no 'else' is visible between the two assignments, so as written every
  //  entry ends up 1; presumably an 'else' line was dropped from this excerpt -- confirm.
  //  NOTE(review): tolower() is given a plain 'char'; for i >= 128 that may be negative
  //  where char is signed, which <ctype.h> does not allow.
  for  (int i = 0;  i < 256;  i ++) {
    char  ch = tolower ((char) i);

    if  (ch == 'a' || ch == 'c' || ch == 'g' || ch == 't')
      Char_Is_Bad[i] = 0;
      Char_Is_Bad[i] = 1;

  fprintf(stderr, "\n");
  fprintf(stderr, "HASH_TABLE_SIZE         "F_U32"\n",     HASH_TABLE_SIZE);
  fprintf(stderr, "sizeof(Hash_Bucket_t)   "F_SIZE_T"\n",  sizeof(Hash_Bucket_t));
  fprintf(stderr, "hash table size:        "F_SIZE_T" MB\n",  (HASH_TABLE_SIZE * sizeof(Hash_Bucket_t)) >> 20);
  fprintf(stderr, "\n");

  Hash_Table       = new Hash_Bucket_t [HASH_TABLE_SIZE];

  //  Sizes are reported in MB (bytes >> 20) before the matching allocations below.
  fprintf(stderr, "check  "F_SIZE_T" MB\n", (HASH_TABLE_SIZE    * sizeof (Check_Vector_t) >> 20));
  fprintf(stderr, "info   "F_SIZE_T" MB\n", (G.Max_Hash_Strings * sizeof (Hash_Frag_Info_t) >> 20));
  fprintf(stderr, "start  "F_SIZE_T" MB\n", (G.Max_Hash_Strings * sizeof (int64) >> 20));
  fprintf(stderr, "\n");

  Hash_Check_Array = new Check_Vector_t [HASH_TABLE_SIZE];
  String_Info      = new Hash_Frag_Info_t [G.Max_Hash_Strings];
  String_Start     = new int64 [G.Max_Hash_Strings];

  String_Start_Size = G.Max_Hash_Strings;

  //  The three arrays above are zeroed; Hash_Table is not zeroed in the visible code.
  memset(Hash_Check_Array, 0, sizeof(Check_Vector_t)   * HASH_TABLE_SIZE);
  memset(String_Info,      0, sizeof(Hash_Frag_Info_t) * G.Max_Hash_Strings);
  memset(String_Start,     0, sizeof(int64)            * G.Max_Hash_Strings);


  //  Cleanup.  basesData, qualsData and nextRef are released here but are not allocated
  //  in the visible code (presumably allocated by the work that is not shown).
  delete [] basesData;
  delete [] qualsData;
  delete [] nextRef;

  delete [] String_Start;
  delete [] String_Info;
  delete [] Hash_Check_Array;
  delete [] Hash_Table;

  delete Out_BOF;

  //  Summary counters go to the -s file when it can be opened, otherwise to stderr.
  FILE *stats = stderr;

  if (G.Outstat_Name != NULL) {
    errno = 0;
    stats = fopen(G.Outstat_Name, "w");
    if (errno) {
      fprintf(stderr, "WARNING: failed to open '%s' for writing: %s\n", G.Outstat_Name, strerror(errno));
      stats = stderr;

  fprintf(stats, " Kmer hits without olaps = "F_S64"\n", Kmer_Hits_Without_Olap_Ct);
  fprintf(stats, "    Kmer hits with olaps = "F_S64"\n", Kmer_Hits_With_Olap_Ct);
  fprintf(stats, "  Multiple overlaps/pair = "F_S64"\n", Multi_Overlap_Ct);
  fprintf(stats, " Total overlaps produced = "F_S64"\n", Total_Overlaps);
  fprintf(stats, "      Contained overlaps = "F_S64"\n", Contained_Overlap_Ct);
  fprintf(stats, "       Dovetail overlaps = "F_S64"\n", Dovetail_Overlap_Ct);
  fprintf(stats, "Rejected by short window = "F_S64"\n", Bad_Short_Window_Ct);
  fprintf(stats, " Rejected by long window = "F_S64"\n", Bad_Long_Window_Ct);

  //  NOTE(review): the guarded statement is not visible; presumably it closes 'stats'.
  if (stats != stderr)

Пример #22
//  Entry point of the contig consensus driver, as excerpted here.  Visible modes:
//    - -T file : run MultiAlignContig() on layouts read from a text file, then do no
//                store iteration (b = e = 0);
//    - -I file : import a binary file into the tigStore, then do no store iteration;
//    - default : iterate over contigs b..e-1 in the tigStore, compute consensus for each,
//                and either insert the result into the store or write it to the -O file.
//
//  NOTE(review): this listing looks abridged -- closing braces, 'continue'/'else'
//  statements, counter updates and exits are not visible.  The notes below describe only
//  what is visible.
main (int argc, char **argv) {
  char   tmpName[FILENAME_MAX] = {0};

  char  *gkpName = NULL;

  char  *tigName = NULL;
  int32  tigVers = -1;
  int32  tigPart = -1;

  //  Contig range from -c; -1 means "not supplied".
  int64  ctgBgn = -1;
  int64  ctgEnd = -1;

  char  *ctgName = NULL;
  char  *outName = NULL;
  char  *inName  = NULL;

  bool   forceCompute = false;

  //  NOTE(review): neither counter is modified anywhere in the visible code.
  int32  numFailures = 0;
  int32  numSkipped  = 0;

  bool   useUnitig  = false;
  bool   showResult = false;

  //  NOTE(review): the line below is the tail of an initializer whose beginning is not
  //  visible; presumably it declares the 'options' object used further down -- confirm.
                          CNS_OPTIONS_DO_PHASING_DEFAULT };

  //  Communicate to MultiAlignment_CNS.c that we are doing consensus and not cgw.
  thisIsConsensus = 1;

  argc = AS_configure(argc, argv);

  int arg=1;
  int err=0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-g") == 0) {
      gkpName = argv[++arg];

    //  -t consumes three arguments: store path, version, partition.
    } else if (strcmp(argv[arg], "-t") == 0) {
      tigName = argv[++arg];
      tigVers = atoi(argv[++arg]);
      tigPart = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-c") == 0) {
      AS_UTL_decodeRange(argv[++arg], ctgBgn, ctgEnd);

    } else if (strcmp(argv[arg], "-T") == 0) {
      ctgName = argv[++arg];

    } else if (strcmp(argv[arg], "-O") == 0) {
      outName = argv[++arg];
    } else if (strcmp(argv[arg], "-I") == 0) {
      inName = argv[++arg];

    } else if (strcmp(argv[arg], "-f") == 0) {
      forceCompute = true;

    } else if (strcmp(argv[arg], "-U") == 0) {
      useUnitig = true;

    } else if (strcmp(argv[arg], "-v") == 0) {
      showResult = true;

    //  The -V branch has no visible body in this excerpt.
    } else if (strcmp(argv[arg], "-V") == 0) {

    } else if (strcmp(argv[arg], "-w") == 0) {
      options.smooth_win = atoi(argv[++arg]);

    //  NOTE(review): -P is accepted here but not described in the usage text below.
    } else if (strcmp(argv[arg], "-P") == 0) {
      options.do_phasing = atoi(argv[++arg]);

    } else {
      fprintf(stderr, "%s: Unknown option '%s'\n", argv[0], argv[arg]);

  if ((err) || (gkpName == NULL) || (tigName == NULL)) {
    fprintf(stderr, "usage: %s -g gkpStore -t tigStore version partition [opts]\n", argv[0]);
    fprintf(stderr, "    -c b         Compute only contig ID 'b' (must be in the correct partition!)\n");
    fprintf(stderr, "    -c b-e       Compute only contigs from ID 'b' to ID 'e'\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "    -T file      Test the computation of the contig layout in 'file'\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "    -f           Recompute contigs that already have a multialignment\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "    -U           Reuse the unitig consensus for contigs with only a single\n");
    fprintf(stderr, "                 unitig (EXPERIMENTAL!)\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "    -O file      Don't update tigStore, dump a binary file instead.\n");
    fprintf(stderr, "    -I file      Import binary file into tigStore\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "    -v           Show multialigns.\n");
    fprintf(stderr, "    -V           Enable debugging option 'verbosemultialign'.\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "    -w ws        Smoothing window size\n");
    fprintf(stderr, "\n");

  //  Open both stores for read only.
  gkpStore = new gkStore(gkpName, false, false);
  tigStore = new MultiAlignStore(tigName, tigVers, 0, tigPart, false, false, false);


  //  Decide on what to compute.  Either all contigs, or a single contig, or a special case test.
  //  [b, e) is a half-open range, hence the +1 on the inclusive end given with -c.
  uint32 b = 0;
  uint32 e = tigStore->numContigs();

  if (ctgBgn != -1) {
    b = ctgBgn;
    e = ctgEnd + 1;


  //  Test mode (-T): read layouts from a text file and compute each one directly.
  //  NOTE(review): no fclose() of F is visible in this excerpt.
  if (ctgName != NULL) {
    errno = 0;
    FILE         *F = fopen(ctgName, "r");
    if (errno)
      fprintf(stderr, "Failed to open input contig file '%s': %s\n", ctgName, strerror(errno)), exit(1);

    MultiAlignT  *ma       = CreateEmptyMultiAlignT();
    bool          isUnitig = false;

    while (LoadMultiAlignFromHuman(ma, isUnitig, F) == true) {
      //  A negative ID is replaced by the current unitig or contig count of the store.
      if (ma->maID < 0)
        ma->maID = (isUnitig) ? tigStore->numUnitigs() : tigStore->numContigs();

      if (MultiAlignContig(ma, gkpStore, &options)) {
        if (showResult)
          PrintMultiAlignT(stdout, ma, gkpStore, false, false, AS_READ_CLEAR_LATEST);
      } else {
        fprintf(stderr, "MultiAlignContig()-- contig %d failed.\n", ma->maID);


    //  Empty range: the store iteration below does nothing after a -T run.
    b = e = 0;

  //  Reopen for writing, if we have work to do.
  //  (Skipped when -O is given, since results then go to the output file instead.)
  if (((inName) || (b < e)) && (outName == NULL)) {
    delete tigStore;
    tigStore = new MultiAlignStore(tigName, tigVers, 0, tigPart, true, false, true);

  if (inName) {
    importFromFile(inName, tigPart);

    b = e = 0;

  //  Now the usual case.  Iterate over all contigs, compute and update.
  for (uint32 i=b; i<e; i++) {
    MultiAlignT  *cma = tigStore->loadMultiAlign(i, false);

    //  NOTE(review): the statement that skips a NULL 'cma' is not visible; as written the
    //  code below would dereference it -- presumably a 'continue' was dropped.
    if (cma == NULL) {
      //  Not in our partition, or deleted.

    //  "exists" means a consensus is present and holds more than one character.
    bool  exists = (cma->consensus != NULL) && (GetNumchars(cma->consensus) > 1);

    if ((forceCompute == false) && (exists == true)) {
      //  Already finished contig consensus.
      fprintf(stderr, "Working on contig %d (%d unitigs and %d fragments) - already computed, skipped\n",
              cma->maID, cma->data.num_unitigs, cma->data.num_frags);


      tigStore->unloadMultiAlign(cma->maID, false);


    //  ID of the first unitig listed in this contig.
    int32         uID = GetIntUnitigPos(cma->u_list, 0)->ident;

    //  If this is a surrogate, we CANNOT reuse the unitig.  We need to process the contig so that
    //  the unplaced reads are stripped out.  A surrogate should have different contig and unitig
    //  IDs; we could also check the contig status.

    if ((cma->data.num_unitigs == 1) &&
        (cma->maID == uID) &&
        (useUnitig == true)) {
      fprintf(stderr, "Working on contig %d (%d unitigs and %d fragments) - reusing unitig %d consensus\n",
              cma->maID, cma->data.num_unitigs, cma->data.num_frags, uID);

      //  Load the unitig's multialign (second argument 'true', versus 'false' for the
      //  contig loads above; presumably an is-unitig flag -- confirm).
      MultiAlignT  *uma = tigStore->loadMultiAlign(uID, true);

      //  Copy the contig's 'data' member into the unitig multialign before releasing the
      //  contig, so the record written below carries the contig's data.
      uma->data = cma->data;

      tigStore->unloadMultiAlign(cma->maID, false);

      //  NOTE(review): no 'else' is visible, so as written insertMultiAlign() runs even
      //  when -O was given; presumably an 'else' line was dropped -- confirm.
      if (outName)
        writeToOutFile(outName, tigPart, uma);
        tigStore->insertMultiAlign(uma, false, false);

      tigStore->unloadMultiAlign(uma->maID, true);


    fprintf(stderr, "Working on contig %d (%d unitigs and %d fragments)%s\n",
            cma->maID, cma->data.num_unitigs, cma->data.num_frags,
            (exists) ? " - already computed, recomputing" : "");

    if (MultiAlignContig(cma, gkpStore, &options)) {
      //  NOTE(review): same missing-'else' pattern as in the unitig-reuse branch above.
      if (outName)
        writeToOutFile(outName, tigPart, cma);
        tigStore->insertMultiAlign(cma, false, true);

      if (showResult)
        PrintMultiAlignT(stdout, cma, gkpStore, false, false, AS_READ_CLEAR_LATEST);

      tigStore->unloadMultiAlign(cma->maID, false);
    } else {
      fprintf(stderr, "MultiAlignContig()-- contig %d failed.\n", cma->maID);

  delete tigStore;

  //  Report counters that are not declared in this function (presumably maintained by the
  //  consensus code elsewhere).
  fprintf(stderr, "\n");
  fprintf(stderr, "NumColumnsInUnitigs             = %d\n", NumColumnsInUnitigs);
  fprintf(stderr, "NumGapsInUnitigs                = %d\n", NumGapsInUnitigs);
  fprintf(stderr, "NumRunsOfGapsInUnitigReads      = %d\n", NumRunsOfGapsInUnitigReads);
  fprintf(stderr, "NumColumnsInContigs             = %d\n", NumColumnsInContigs);
  fprintf(stderr, "NumGapsInContigs                = %d\n", NumGapsInContigs);
  fprintf(stderr, "NumRunsOfGapsInContigReads      = %d\n", NumRunsOfGapsInContigReads);
  fprintf(stderr, "NumAAMismatches                 = %d\n", NumAAMismatches);
  fprintf(stderr, "NumVARRecords                   = %d\n", NumVARRecords);
  fprintf(stderr, "NumVARStringsWithFlankingGaps   = %d\n", NumVARStringsWithFlankingGaps);
  fprintf(stderr, "NumUnitigRetrySuccess           = %d\n", NumUnitigRetrySuccess);
  fprintf(stderr, "\n");

  //  NOTE(review): no return/exit is visible inside this block, so as written the success
  //  message below would also be printed after a failure report -- confirm.
  if (numFailures) {
    fprintf(stderr, "WARNING:  Total number of contig failures = %d\n", numFailures);
    fprintf(stderr, "\n");
    fprintf(stderr, "Consensus did NOT finish successfully.\n");

  fprintf(stderr, "Consensus finished successfully.  Bye.\n");
Пример #23
//  Entry point for a tool that, per its own usage text, replaces tigs in a tig store
//  with tigs read from a set of input files.
//
//  NOTE(review): this listing appears to be an excerpt with lines elided -- several
//  braces are never closed and some branches have no visible body.  The comments
//  below describe only what is visible here.
main (int argc, char **argv) {
  char            *gkpName   = NULL;
  char            *tigName   = NULL;
  int32            tigVers   = -1;
  vector<char *>   tigInputs;
  tgStoreType      tigType   = tgStoreModify;

  argc = AS_configure(argc, argv);

  //  Parse the command line.
  //  NOTE(review): no visible statement advances 'arg' or increments 'err', and
  //  argv[++arg] is read without checking it against argc.
  int arg=1;
  int err=0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-G") == 0) {
      gkpName = argv[++arg];

    } else if (strcmp(argv[arg], "-T") == 0) {
      //  -T consumes two values: the store path, then the version number.
      tigName = argv[++arg];
      tigVers = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-L") == 0) {
      //  -L names a file whose contents are loaded into the tigInputs list.
      AS_UTL_loadFileList(argv[++arg], tigInputs);

    } else if (strcmp(argv[arg], "-n") == 0) {
      //  -n switches the store mode from tgStoreModify to tgStoreReadOnly.
      tigType = tgStoreReadOnly;

    } else if (AS_UTL_fileExists(argv[arg])) {
      //  NOTE(review): branch body is not visible; presumably the existing file is
      //  appended to tigInputs -- confirm against the full source.

    } else {
      fprintf(stderr, "%s: unknown option '%s'\n", argv[0], argv[arg]);

  //  Print usage when parsing failed or a required argument is missing.
  //  NOTE(review): the help text and the last error message refer to '-R', but no
  //  '-R' option is parsed above; inputs come from -L or bare file arguments.
  if ((err) || (gkpName == NULL) || (tigName == NULL) || (tigInputs.size() == 0)) {
    fprintf(stderr, "usage: %s -G <gkpStore> -T <tigStore> <v> [input.cns]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "  -G <gkpStore>         Path to the gatekeeper store\n");
    fprintf(stderr, "  -T <tigStore> <v>     Path to the tigStore and version to add tigs to\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -L <file-of-files>    Load the tig(s) from files listed in 'file-of-files'\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -n                    Don't replace, just report what would have happened\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  The primary operation is to replace tigs in the store with ones in a set of input files.\n");
    fprintf(stderr, "  The input files can be either supplied directly on the command line or listed in\n");
    fprintf(stderr, "  a text file (-L).\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  A new store is created if one doesn't exist, otherwise, whatever tigs are there are\n");
    fprintf(stderr, "  replaced with those in the -R file.  If version 'v' doesn't exist, it is created.\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  Even if -n is supplied, a new store is created if one doesn't exist.\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  To add a new tig, give it a tig id of -1.  New tigs must be added to the latest version.\n");
    fprintf(stderr, "  To delete a tig, remove all children, and set the number of them to zero.\n");
    fprintf(stderr, "\n");

    if (gkpName == NULL)
      fprintf(stderr, "ERROR:  no gatekeeper store (-G) supplied.\n");
    if (tigName == NULL)
      fprintf(stderr, "ERROR:  no tig store (-T) supplied.\n");
    if (tigInputs.size() == 0)
      fprintf(stderr, "ERROR:  no input tigs (-R) supplied.\n");


  //  If the store doesn't exist, create one, and make a bunch of versions
  if (AS_UTL_fileExists(tigName, true, false) == false) {
    fprintf(stderr, "Creating tig store '%s' version %d\n", tigName, tigVers);

    tgStore *tigStore = new tgStore(tigName);

    //  NOTE(review): the loop body is not visible.  As written, the 'delete' below
    //  would become the loop body and run once per iteration -- confirm the
    //  per-version call that belongs here.
    for (int32 vv=1; vv<tigVers; vv++)

    delete tigStore;

  //  Open the gatekeeper store and the tig store at the requested version and mode,
  //  and allocate one tgTig that is reused for every tig loaded below.
  gkStore *gkpStore = gkStore::gkStore_open(gkpName);
  tgStore *tigStore = new tgStore(tigName, tigVers, tigType);
  tgTig   *tig      = new tgTig;

  for (uint32 ff=0; ff<tigInputs.size(); ff++) {
    //  NOTE(review): failure is detected through errno rather than by testing the
    //  returned FILE pointer for NULL.
    errno = 0;
    FILE *TI = fopen(tigInputs[ff], "r");
    if (errno)
      fprintf(stderr, "Failed to open '%s': %s\n", tigInputs[ff], strerror(errno)), exit(1);

    fprintf(stderr, "Reading layouts from '%s'.\n", tigInputs[ff]);

    //  Process tigs from the file until the loader stops returning true.
    while (tig->loadFromStreamOrLayout(TI) == true) {

      //  Handle insertion.

      //  A tig with at least one child is inserted into the store.
      if (tig->numberOfChildren() > 0) {
        //fprintf(stderr, "INSERTING tig %d\n", tig->tigID());
        tigStore->insertTig(tig, false);

      //  Deleted already?

      if (tigStore->isDeleted(tig->tigID()) == true) {
        //fprintf(stderr, "DELETING tig %d -- ALREADY DELETED\n", tig->tigID());

      //  Really delete it then.
      //  NOTE(review): the statement that performs the delete is not visible here.

      //fprintf(stderr, "DELETING tig %d\n", tig->tigID());


    //  NOTE(review): no fclose(TI) is visible for the file opened above.
    fprintf(stderr, "Reading layouts from '%s' completed.\n", tigInputs[ff]);

  //  NOTE(review): gkpStore is opened above but no release of it is visible.
  delete tig;
  delete tigStore;


Пример #24
//  Entry point for a tool that dumps reads from a gatekeeper store as FASTQ or FASTA,
//  optionally restricted to an ID range and optionally using a clear-range file.
//
//  NOTE(review): this listing appears to be an excerpt with lines elided -- several
//  braces are never closed and some 'if' statements have no visible body.  The
//  comments below describe only what is visible here.
main(int argc, char **argv) {
  char            *gkpStoreName      = NULL;
  char            *outPrefix         = NULL;

  char            *clrName           = NULL;

  uint32           libToDump         = 0;

  uint32           bgnID             = 1;
  uint32           endID             = UINT32_MAX;

  bool             dumpAllReads      = false;
  bool             dumpAllBases      = false;
  bool             dumpOnlyDeleted   = false;

  bool             dumpFASTQ         = true;
  bool             dumpFASTA         = false;

  bool             withLibName       = true;

  argc = AS_configure(argc, argv);

  //  Parse the command line.
  //  NOTE(review): no visible statement advances 'arg' or increments 'err', and
  //  argv[++arg] is read without checking it against argc.
  int arg = 1;
  int err = 0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-G") == 0) {
      gkpStoreName = argv[++arg];

    } else if (strcmp(argv[arg], "-o") == 0) {
      outPrefix = argv[++arg];

    } else if (strcmp(argv[arg], "-c") == 0) {
      clrName = argv[++arg];

    } else if (strcmp(argv[arg], "-l") == 0) {
      libToDump = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-b") == 0) {
      bgnID = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-e") == 0) {
      endID  = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-r") == 0) {
      //  -r selects a single read: begin and end are set to the same ID.
      bgnID  = atoi(argv[++arg]);
      endID  = bgnID;

    } else if (strcmp(argv[arg], "-allreads") == 0) {
      dumpAllReads    = true;

    } else if (strcmp(argv[arg], "-allbases") == 0) {
      dumpAllBases    = true;

    } else if (strcmp(argv[arg], "-onlydeleted") == 0) {
      dumpOnlyDeleted = true;
      dumpAllReads    = true;  //  Otherwise we won't report the deleted reads!

    } else if (strcmp(argv[arg], "-fastq") == 0) {
      //  The two format flags are mutually exclusive; the last one given wins.
      dumpFASTQ       = true;
      dumpFASTA       = false;

    } else if (strcmp(argv[arg], "-fasta") == 0) {
      dumpFASTQ       = false;
      dumpFASTA       = true;

    } else if (strcmp(argv[arg], "-nolibname") == 0) {
      withLibName     = false;

    } else {
      fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]);

  //  NOTE(review): the bodies of the next two 'if' statements are not visible
  //  (presumably each increments 'err').  As written they nest, so the usage block
  //  is reached only when both names are NULL -- confirm against the full source.
  if (gkpStoreName == NULL)
  if (outPrefix == NULL)
  if (err) {
    //  NOTE(review): the first usage line says '-g gkpStore' while the parser above
    //  accepts '-G'.
    fprintf(stderr, "usage: %s [...] -o fastq-prefix -g gkpStore\n", argv[0]);
    fprintf(stderr, "  -G gkpStore\n");
    fprintf(stderr, "  -o fastq-prefix     write files fastq-prefix.(libname).fastq, ...\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -l libToDump        output only read in library number libToDump (NOT IMPLEMENTED)\n");
    fprintf(stderr, "  -b id               output starting at read 'id'\n");
    fprintf(stderr, "  -e id               output stopping after read 'id'\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -c clearFile        clear range file from OBT modules\n");
    fprintf(stderr, "  -allreads           if a clear range file, lower case mask the deleted reads\n");
    fprintf(stderr, "  -allbases           if a clear range file, lower case mask the non-clear bases\n");
    fprintf(stderr, "  -onlydeleted        if a clear range file, only output deleted reads (the entire read)\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -r id               output only the single read 'id'\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -fastq              output is FASTQ format (with extension .fastq, default)\n");
    fprintf(stderr, "  -fasta              output is FASTA format (with extension .fasta)\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -nolibname          don't include the library name in the output file name\n");
    fprintf(stderr, "\n");

    if (gkpStoreName == NULL)
      fprintf(stderr, "ERROR: no gkpStore (-G) supplied.\n");
    if (outPrefix == NULL)
      fprintf(stderr, "ERROR: no output prefix (-o) supplied.\n");


  //  Open the store and query how many reads and libraries it holds.
  gkStore        *gkpStore  = new gkStore(gkpStoreName);
  uint32          numReads  = gkpStore->gkStore_getNumReads();
  uint32          numLibs   = gkpStore->gkStore_getNumLibraries();

  //  The clear-range file is optional; clrRange stays NULL when -c was not given.
  clearRangeFile *clrRange  = (clrName == NULL) ? NULL : new clearRangeFile(clrName, gkpStore);

  //  Clamp the requested ID range to [1, numReads].
  if (bgnID < 1)
    bgnID = 1;

  if (numReads < endID)
    endID = numReads;

  //  NOTE(review): only a message is printed for a reversed range; no exit is
  //  visible, so execution continues (the read loop below then runs zero times).
  if (endID < bgnID)
    fprintf(stderr, "No reads to dump; reversed ranges make no sense: bgn="F_U32" end="F_U32"??\n", bgnID, endID);

  fprintf(stderr, "Dumping reads from %u to %u (inclusive).\n", bgnID, endID);

  //  One output slot per library, plus slot zero.
  libOutput   **out = new libOutput * [numLibs + 1];

  //  Allocate outputs.  If withLibName == false, all reads will artificially be in lib zero, the
  //  other files won't ever be created.  Otherwise, the zeroth file won't ever be created.

  out[0] = new libOutput(outPrefix, NULL);

  for (uint32 i=1; i<=numLibs; i++)
    out[i] = new libOutput(outPrefix, gkpStore->gkStore_getLibrary(i)->gkLibrary_libraryName());

  //  Grab a new readData, and iterate through reads to dump.

  gkReadData   *readData = new gkReadData;

  for (uint32 rid=bgnID; rid<=endID; rid++) {
    gkRead      *read   = gkpStore->gkStore_getRead(rid);

    //  Without library names every read is routed to output slot zero.
    uint32       libID  = (withLibName == false) ? 0 : read->gkRead_libraryID();

    uint32       lclr   = 0;
    uint32       rclr   = read->gkRead_sequenceLength();
    bool         ignore = false;

    //fprintf(stderr, "READ %u claims id %u length %u in lib %u\n", rid, read->gkRead_readID(), read->gkRead_sequenceLength(), libID);

    //  If a clear range file is supplied, grab the clear range.  If it hasn't been set, the default
    //  is the entire read.

    if (clrRange) {
      lclr   = clrRange->bgn(rid);
      rclr   = clrRange->end(rid);
      ignore = clrRange->isDeleted(rid);

    //  Abort if we're not dumping anything from this read
    //   - not in a library we care about
    //   - deleted, and not dumping all reads
    //   - not deleted, but only reporting deleted reads

    //  NOTE(review): the first clause tests 'libID == libToDump', which matches reads
    //  that ARE in the requested library, while the comment above says "not in a
    //  library we care about" -- possibly should be '!='.  The statement this
    //  condition guards (presumably 'continue') is not visible.
    if (((libToDump != 0) && (libID == libToDump)) ||
        ((dumpAllReads == false) && (ignore == true)) ||
        ((dumpOnlyDeleted == true) && (ignore == false)))

    //  And if we're told to ignore the read, and here, then the read was deleted and we're printing
    //  all reads.  Reset the clear range to the whole read, the clear range is invalid.

    if (ignore) {
      rclr = read->gkRead_sequenceLength();
      lclr = 0;

    //  Grab the sequence and quality.

    gkpStore->gkStore_loadReadData(read, readData);

    //  'len' is the clear-range length as of this point.
    //  NOTE(review): 'len' is not recomputed after the dumpAllBases block below
    //  resets lclr/rclr to the whole read, yet it is used to terminate the strings.
    char   *seq = readData->gkReadData_getSequence();
    char   *qlt = readData->gkReadData_getQualities();
    uint32  len = rclr - lclr;

    //  Soft mask not-clear bases

    //  NOTE(review): the case changes are done by adding the 'a' - 'A' offset to any
    //  byte >= 'A'.  In the two outer loops this also shifts bytes that are already
    //  lower case; in the middle loop the offset is applied only to bytes < 'A' and
    //  letters are left unchanged.  Both look unintended -- confirm.
    if (dumpAllBases == true) {
      for (uint32 i=0; i<lclr; i++)
        seq[i] += (seq[i] >= 'A') ? 'a' - 'A' : 0;

      for (uint32 i=lclr; i<rclr; i++)
        seq[i] += (seq[i] >= 'A') ? 0 : 'A' - 'a';

      for (uint32 i=rclr; seq[i]; i++)
        seq[i] += (seq[i] >= 'A') ? 'a' - 'A' : 0;

      rclr = read->gkRead_sequenceLength();
      lclr = 0;

    //  Chop off the ends we're not printing.

    //  NOTE(review): only 'seq' is advanced by lclr here; 'qlt' is not advanced in
    //  this listing, so the two strings may be misaligned when lclr > 0.
    seq += lclr;

    seq[len] = 0;
    qlt[len] = 0;

    //  And print the read.
    //  The header line carries the read ID and the clear range.
    if (dumpFASTQ)
      AS_UTL_writeFastQ(out[libID]->getFASTQ(), seq, len, qlt, len,
                        "@"F_U32" clr="F_U32","F_U32"\n",
                        rid, lclr, rclr);

    if (dumpFASTA)
      AS_UTL_writeFastA(out[libID]->getFASTA(), seq, len, 0,
                        ">"F_U32" clr="F_U32","F_U32"\n",
                        rid, lclr, rclr);

  delete   readData;

  //  NOTE(review): the loop starts at 1, so out[0] allocated above is not deleted
  //  here; clrRange is not deleted in the visible code either.
  for (uint32 i=1; i<=numLibs; i++)
    delete out[i];

  delete [] out;

  delete    gkpStore;

Пример #25
//  Entry point that builds a merylArgs object from the command line and dispatches on
//  its 'personality' character.
//
//  NOTE(review): this listing appears to be an excerpt with lines elided -- the body
//  of every case is missing and the switch and function are never closed.
main(int argc, char **argv) {
  argc = AS_configure(argc, argv);

  merylArgs   *args = new merylArgs(argc, argv);


  //  NOTE(review): as written, every label below falls through to the
  //  "unknown personality" message; the per-case calls and the 'default:' label
  //  that presumably precede that message are not visible.
  switch (args->personality) {
    case 'P':

    case 'B':

    case 'd':
    case 't':
    case 'p':
    case 'c':
    case 'h':




      fprintf(stderr, "%s: unknown personality.  Specify -P, -B, -S or -M!\n", args->execName);

  delete args;

Пример #26
//  Entry point for a tool that applies read corrections, recomputes overlaps for a
//  range of reads, and writes the resulting error values to a binary output file.
//
//  NOTE(review): this listing appears to be an excerpt with lines elided -- several
//  braces are never closed and some statements look like both arms of a removed
//  conditional-compilation block.  The comments below describe only what is visible.
main(int argc, char **argv) {
  coParameters  *G = new coParameters();

  argc = AS_configure(argc, argv);

  //  Parse the command line into the parameter object G.
  //  NOTE(review): no visible statement advances 'arg', the final 'else' has no
  //  visible body, and argv[++arg] is read without checking it against argc.
  int arg = 1;
  int err = 0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-G") == 0) {
      G->gkpStorePath = argv[++arg];

    } else if (strcmp(argv[arg], "-R") == 0) {
      //  -R consumes two values: the first and last read ID to process.
      G->bgnID = atoi(argv[++arg]);
      G->endID = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-O") == 0) {  //  -F?  -S Olap_Path
      G->ovlStorePath = argv[++arg];

    } else if (strcmp(argv[arg], "-e") == 0) {
      G->errorRate = atof(argv[++arg]);

    } else if (strcmp(argv[arg], "-l") == 0) {
      G->minOverlap = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-c") == 0) {  //  For 'corrections' file input
      G->correctionsName = argv[++arg];

    } else if (strcmp(argv[arg], "-o") == 0) {  //  For 'erates' output
      G->eratesName = argv[++arg];

    } else if (strcmp(argv[arg], "-t") == 0) {  //  But we're not threaded!
      G->numThreads = atoi(argv[++arg]);

    } else {


  //  All four of these paths are required; each missing one is reported and counted.
  if (G->gkpStorePath == NULL)
    fprintf(stderr, "ERROR: no input gatekeeper store (-G) supplied.\n"), err++;
  if (G->ovlStorePath == NULL)
    fprintf(stderr, "ERROR: no input overlap store (-O) supplied.\n"), err++;
  if (G->correctionsName == NULL)
    fprintf(stderr, "ERROR: no input read corrections file (-c) supplied.\n"), err++;
  if (G->eratesName == NULL)
    fprintf(stderr, "ERROR: no output erates file (-o) supplied.\n"), err++;

  //  NOTE(review): the usage text below does not match the parser above.  It lists
  //  -d, -q, -x, -F, -S and positional arguments, none of which are parsed, and
  //  describes -e as the erate file while the parser treats -e as the error rate
  //  and -o as the erates output.
  if (err) {
    fprintf(stderr, "USAGE:  %s [-d <dna-file>] [-o <ovl_file>] [-q <quality>]\n", argv[0]);
    fprintf(stderr, "            [-x <del_file>] [-F OlapFile] [-S OlapStore]\n");
    fprintf(stderr, "            [-c <cgb_file>] [-e <erate_file>\n");
    fprintf(stderr, "           <gkpStore> <CorrectFile> <lo> <hi>\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "Recalculates overlaps for frags  <lo> .. <hi>  in\n");
    fprintf(stderr, " <gkpStore>  using corrections in  <CorrectFile> \n");
    fprintf(stderr, "\n");
    fprintf(stderr, "Options:\n");
    fprintf(stderr, "-e <erate-file>  specifies binary file to dump corrected erates to\n");
    fprintf(stderr, "                 for later updating of olap store by  update-erates \n");
    fprintf(stderr, "-F             specify file of sorted overlaps to use (in the format\n");
    fprintf(stderr, "               produced by  get-olaps\n");
    fprintf(stderr, "-o <ovl_file>  specifies name of file to which OVL messages go\n");
    fprintf(stderr, "-q <quality>   overlaps less than this error rate are\n");
    fprintf(stderr, "               automatically output\n");
    fprintf(stderr, "-S             specify the binary overlap store containing overlaps to use\n");

  //fprintf (stderr, "Quality Threshold = %.2f%%\n", 100.0 * Quality_Threshold);

  //  Initialize Globals

  fprintf(stderr, "Initializing.\n");

  //  The product is truncated to uint32 before 1 is added; the result is stored in
  //  a double.
  double MAX_ERRORS = 1 + (uint32)(G->errorRate * AS_MAX_READLEN);

  Initialize_Match_Limit(G->Edit_Match_Limit, G->errorRate, MAX_ERRORS);

  //  Fill Error_Bound[i] with ceil(i * errorRate) for every index up to and
  //  including AS_MAX_READLEN.
  for (int32 i=0;  i <= AS_MAX_READLEN;  i++)
    G->Error_Bound[i] = (int)ceil(i * G->errorRate);


  fprintf(stderr, "Opening gkpStore '%s'.\n", G->gkpStorePath);

  gkStore *gkpStore = gkStore::gkStore_open(G->gkpStorePath);

  //  Clamp the requested ID range to [1, number of reads in the store].
  if (G->bgnID < 1)
    G->bgnID = 1;

  if (gkpStore->gkStore_getNumReads() < G->endID)
    G->endID = gkpStore->gkStore_getNumReads();

  //  Load the reads for the overlaps we are going to be correcting, and apply corrections to them

  fprintf(stderr, "Correcting reads "F_U32" to "F_U32".\n", G->bgnID, G->endID);

  Correct_Frags(G, gkpStore);

  //  Load overlaps we're going to correct

  fprintf(stderr, "Loading overlaps.\n");

  Read_Olaps(G, gkpStore);

  //  Now sort them on the B iid.

  fprintf(stderr, "Sorting overlaps.\n");

  //  NOTE(review): the same range is sorted twice with the same comparator.  These
  //  look like the two arms of a conditional-compilation block whose directives
  //  were lost -- confirm against the full source.
  __gnu_sequential::sort(G->olaps, G->olaps + G->olapsLen, Olap_Info_t_by_bID());
  sort(G->olaps, G->olaps + G->olapsLen, Olap_Info_t_by_bID());

  //  Recompute overlaps

  fprintf(stderr, "Recomputing overlaps.\n");

  Redo_Olaps(G, gkpStore);

  //  NOTE(review): the pointer is cleared without any visible close or delete of
  //  the store it referred to.
  gkpStore = NULL;

  //  Sort the overlaps back into the original order

  fprintf(stderr, "Sorting overlaps.\n");

  //  NOTE(review): duplicated sort again, same remark as for the sort by B iid.
  __gnu_sequential::sort(G->olaps, G->olaps + G->olapsLen, Olap_Info_t_by_Order());
  sort(G->olaps, G->olaps + G->olapsLen, Olap_Info_t_by_Order());

  //  Dump the new erates

  fprintf (stderr, "Saving corrected error rates to file %s\n", G->eratesName);

    //  NOTE(review): failure is detected through errno rather than by testing the
    //  returned FILE pointer for NULL, and no fclose(fp) is visible below.
    errno = 0;
    FILE *fp = fopen(G->eratesName, "w");
    if (errno)
      fprintf(stderr, "Failed to open '%s': %s\n", G->eratesName, strerror(errno)), exit(1);

    //  Header: first ID, last ID, then the number of overlaps.
    //  NOTE(review): the IDs are written with sizeof(int32) but printed above with
    //  F_U32; confirm the declared type of bgnID/endID.
    AS_UTL_safeWrite(fp, &G->bgnID,    "loid", sizeof(int32),  1);
    AS_UTL_safeWrite(fp, &G->endID,    "hiid", sizeof(int32),  1);
    AS_UTL_safeWrite(fp, &G->olapsLen, "num",  sizeof(uint64), 1);

    fprintf(stderr, "--Allocate "F_U64" MB for output error rates.\n",
            (sizeof(uint16) * G->olapsLen) >> 20);

    //  Copy each overlap's evalue into a contiguous array so it can be written in
    //  one call.
    uint16 *evalue = new uint16 [G->olapsLen];

    //  NOTE(review): the index is int32 while olapsLen is written above as a 64-bit
    //  value; the comparison mixes widths if olapsLen exceeds the int32 range.
    for (int32 i=0; i<G->olapsLen; i++)
      evalue[i] = G->olaps[i].evalue;

    AS_UTL_safeWrite(fp, evalue, "evalue", sizeof(uint16), G->olapsLen);

    delete [] evalue;


  //  Finished.

  //fprintf (stderr, "%d/%d failed/total alignments (%.1f%%)\n",
  //         Failed_Alignments_Ct, Total_Alignments_Ct,
  //         Total_Alignments_Ct == 0 ? 0.0 : (100.0 * Failed_Alignments_Ct) / Total_Alignments_Ct);

  delete G;

  fprintf(stderr, "DONE.\n");

Пример #27
//  Entry point for a tool that creates or extends a gatekeeper store from a list of
//  input description files, then reports load statistics and applies sanity
//  thresholds to them.
//
//  NOTE(review): this listing appears to be an excerpt with lines elided -- several
//  braces are never closed, each '#if 0' has no visible '#endif', and many branches
//  have no visible body.  The comments below describe only what is visible here.
main(int argc, char **argv) {
  char            *gkpStoreName      = NULL;
  char            *outPrefix         = NULL;

  uint32           minReadLength     = 0;

  uint32           firstFileArg      = 0;

  char             errorLogName[FILENAME_MAX];
  char             htmlLogName[FILENAME_MAX];
  char             nameMapName[FILENAME_MAX];

  argc = AS_configure(argc, argv);

  //  Parse options up to the first input file.
  //  NOTE(review): no visible statement advances 'arg', increments 'err', or leaves
  //  the loop once firstFileArg is set.
  int arg = 1;
  int err = 0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-o") == 0) {
      gkpStoreName = argv[++arg];

    } else if (strcmp(argv[arg], "-minlength") == 0) {
      minReadLength = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "--") == 0) {
      //  NOTE(review): the post-increment stores the index of "--" itself in
      //  firstFileArg, not the index of the argument after it -- confirm intent.
      firstFileArg = arg++;

    } else if (argv[arg][0] == '-') {
      fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]);

    } else {
      //  The first argument that does not start with '-' begins the input files.
      firstFileArg = arg;

  //  NOTE(review): the bodies of the next two 'if' statements are not visible
  //  (presumably each increments 'err'); as written they nest into the usage check.
  if (gkpStoreName == NULL)
  if (firstFileArg == 0)

  if (err) {
    fprintf(stderr, "usage: %s [...] -o gkpStore\n", argv[0]);
    fprintf(stderr, "  -o gkpStore         create this gkpStore\n");
    fprintf(stderr, "  \n");
    fprintf(stderr, "  -minlength L        discard reads shorter than L\n");
    fprintf(stderr, "  \n");
    fprintf(stderr, "  \n");

    //  NOTE(review): the message names '-g' while the parser above accepts '-o'.
    if (gkpStoreName == NULL)
      fprintf(stderr, "ERROR: no gkpStore (-g) supplied.\n");
    if (firstFileArg == 0)
      fprintf(stderr, "ERROR: no input files supplied.\n");


  //  Open the store with the gkStore_extend mode flag.
  gkStore     *gkpStore     = gkStore::gkStore_open(gkpStoreName, gkStore_extend);
  gkRead      *gkpRead      = NULL;
  gkLibrary   *gkpLibrary   = NULL;
  uint32       gkpFileID    = 0;      //  Used for HTML output, an ID for each file loaded.

  uint32       inLineLen    = 1024;
  char         inLine[1024] = { 0 };

  //  Mark the accepted sequence letters (acgtn, both cases) in the validSeq table,
  //  which is declared outside this block.
  validSeq['a'] = validSeq['c'] = validSeq['g'] = validSeq['t'] = validSeq['n'] = 1;
  validSeq['A'] = validSeq['C'] = validSeq['G'] = validSeq['T'] = validSeq['N'] = 1;

  //  Open three log files inside the store directory.
  //  NOTE(review): errno is cleared once and then tested after each fopen rather
  //  than testing the returned FILE pointers, and the sprintf calls write into
  //  FILENAME_MAX buffers without a length limit.
  errno = 0;

  sprintf(errorLogName, "%s/errorLog",    gkpStoreName);
  FILE    *errorLog = fopen(errorLogName, "w");
  if (errno)
    fprintf(stderr, "ERROR:  cannot open error file '%s': %s\n", errorLogName, strerror(errno)), exit(1);

  //  NOTE(review): this message says 'uid map file' although the file being opened
  //  is load.dat; the same text is used for readNames.txt below.
  sprintf(htmlLogName,   "%s/load.dat", gkpStoreName);
  FILE    *htmlLog   = fopen(htmlLogName,   "w");
  if (errno)
    fprintf(stderr, "ERROR:  cannot open uid map file '%s': %s\n", htmlLogName, strerror(errno)), exit(1);

  sprintf(nameMapName,   "%s/readNames.txt", gkpStoreName);
  FILE    *nameMap   = fopen(nameMapName,   "w");
  if (errno)
    fprintf(stderr, "ERROR:  cannot open uid map file '%s': %s\n", nameMapName, strerror(errno)), exit(1);

  uint32  nERROR   = 0;  //  There aren't any errors, we just exit fatally if encountered.
  uint32  nWARNS   = 0;

  uint32  nLOADED  = 0;  //  Reads loaded
  uint64  bLOADED  = 0;  //  Bases loaded

  uint32  nSKIPPED = 0;
  uint64  bSKIPPED = 0;  //  Bases not loaded, too short

  //  NOTE(review): no matching '#endif' is visible for this '#if 0'; presumably the
  //  HTML preamble below is the disabled region.
#if 0
  fprintf(htmlLog, "<!DOCTYPE html>\n");
  fprintf(htmlLog, "<html>\n");
  fprintf(htmlLog, "<head>\n");
  fprintf(htmlLog, "<title>gatekeeper load statistics</title>\n");
  fprintf(htmlLog, "<style type='text/css'>\n");
  fprintf(htmlLog, "body       { font-family: Helvetica, Verdana, sans-serif; }\n");
  fprintf(htmlLog, "h1, h2     { color: #ee3e80; }\n");
  fprintf(htmlLog, "p          { color: #665544; }\n");
  fprintf(htmlLog, "th, td     { border: 1px solid #111111; padding: 2px 2px 2px 2px; }\n");
  fprintf(htmlLog, "td:hover   { background-color: #e4e4e4; }\n");
  fprintf(htmlLog, "th:hover   { background-color: #d4d4d4; }\n");
  fprintf(htmlLog, "tr.details { visibility: collapse; }\n");
  fprintf(htmlLog, "</style>\n");
  fprintf(htmlLog, "</head>\n");
  fprintf(htmlLog, "<body>\n");
  fprintf(htmlLog, "<h2>Input Files</h2>\n");
  fprintf(htmlLog, "<table>\n");

  //  Process each remaining command-line argument as an input description file.
  for (; firstFileArg < argc; firstFileArg++) {
    fprintf(stderr, "\n");
    fprintf(stderr, "Starting file '%s'.\n", argv[firstFileArg]);

    compressedFileReader *inFile = new compressedFileReader(argv[firstFileArg]);
    char                 *line   = new char [10240];
    KeyAndValue           keyval;

    //  Read the file one line at a time (at most 10239 characters per fgets call).
    while (fgets(line, 10240, inFile->file()) != NULL) {

      //  NOTE(review): no visible statement feeds 'line' into 'keyval' before its
      //  key is tested; presumably that call was elided.
      if (keyval.key() == NULL) {
        //  No key, so must be a comment or blank line

      //  A 'name' key starts a new library named by the key's value.
      if (strcasecmp(keyval.key(), "name") == 0) {
        gkpLibrary = gkpStore->gkStore_addEmptyLibrary(keyval.value());

      //  We'd better have a gkpLibrary defined, if not, the .gkp input file is incorrect.
      //  NOTE(review): the warning says the library is named 'DEFAULT', but the call
      //  passes keyval.value() -- confirm which name is intended.
      if (gkpLibrary == NULL) {
        fprintf(stderr, "WARNING: no 'name' tag in gkp input; creating library with name 'DEFAULT'.\n");
        gkpLibrary = gkpStore->gkStore_addEmptyLibrary(keyval.value());

      //  Dispatch on the recognized library parameter keys (case-insensitive).
      //  NOTE(review): none of the branch bodies are visible in this listing.
      if        (strcasecmp(keyval.key(), "preset") == 0) {

      } else if (strcasecmp(keyval.key(), "qv") == 0) {

      } else if (strcasecmp(keyval.key(), "isNonRandom") == 0) {

      } else if (strcasecmp(keyval.key(), "trustHomopolymerRuns") == 0) {

      } else if (strcasecmp(keyval.key(), "removeDuplicateReads") == 0) {

      } else if (strcasecmp(keyval.key(), "finalTrim") == 0) {

      } else if (strcasecmp(keyval.key(), "removeSpurReads") == 0) {

      } else if (strcasecmp(keyval.key(), "removeChimericReads") == 0) {

      } else if (strcasecmp(keyval.key(), "checkForSubReads") == 0) {

      } else if (AS_UTL_fileExists(keyval.key(), false, false)) {
        //  NOTE(review): the next line is the tail of a call whose beginning was
        //  elided; presumably the read-loading call that updates these counters.
                  nWARNS, nLOADED, bLOADED, nSKIPPED, bSKIPPED);

      } else {
        fprintf(stderr, "ERROR:  option '%s' not recognized, and not a file of reads.\n", line);

    delete    inFile;
    delete [] line;

  //  NOTE(review): another '#if 0' without a visible '#endif'.
#if 0
  fprintf(htmlLog, "</table>\n");



  //  Summary report on stderr.
  //  NOTE(review): the "Read from inputs" and "Loaded into store" sections print the
  //  same two counters (bLOADED, nLOADED).
  fprintf(stderr, "\n");
  fprintf(stderr, "Finished with:\n");
  fprintf(stderr, "  "F_U32" warnings (bad base or qv)\n", nWARNS);
  fprintf(stderr, "\n");
  fprintf(stderr, "Read from inputs:\n");
  fprintf(stderr, "  "F_U64" bp.\n",    bLOADED);
  fprintf(stderr, "  "F_U32" reads.\n", nLOADED);
  fprintf(stderr, "\n");
  fprintf(stderr, "Loaded into store:\n");
  fprintf(stderr, "  "F_U64" bp.\n",    bLOADED);
  fprintf(stderr, "  "F_U32" reads.\n", nLOADED);
  fprintf(stderr, "\n");
  //  NOTE(review): both percentages divide by a sum that is zero when nothing was
  //  loaded or skipped.
  fprintf(stderr, "Skipped (too short):\n");
  fprintf(stderr, "  "F_U64" bp (%.4f%%).\n",    bSKIPPED, 100.0 * bSKIPPED / (bSKIPPED + bLOADED));
  fprintf(stderr, "  "F_U32" reads (%.4f%%).\n", nSKIPPED, 100.0 * nSKIPPED / (nSKIPPED + nLOADED));
  fprintf(stderr, "\n");
  fprintf(stderr, "\n");

  //  NOTE(review): another '#if 0' without a visible '#endif'; presumably the HTML
  //  epilogue is disabled and only the final "sum" line is live.
#if 0
  fprintf(htmlLog, "\n");
  fprintf(htmlLog, "<h2>Final Store</h2>\n");
  fprintf(htmlLog, "<table>\n");
  fprintf(htmlLog, "<tr><td colspan='2'>%s</td></tr>\n", gkpStoreName);
  fprintf(htmlLog, "<tr><td>readsLoaded</td><td>"F_U32" reads ("F_U64" bp)</td></tr>\n", nLOADED, bLOADED);
  fprintf(htmlLog, "<tr><td>readsSkipped</td><td>"F_U32" reads ("F_U64" bp) (read was too short)</td></tr>\n", nSKIPPED, bSKIPPED);
  fprintf(htmlLog, "<tr><td>warnings</td><td>"F_U32" warnings (invalid base or quality value)</td></tr>\n", nWARNS);
  fprintf(htmlLog, "</table>\n");
  fprintf(htmlLog, "\n");

  fprintf(htmlLog, "<script type='text/javascript'>\n");
  fprintf(htmlLog, "var toggleOne = function() {\n");
  fprintf(htmlLog, "  var table = this.closest('table');\n");
  fprintf(htmlLog, "  var elts  = table.querySelectorAll('.details');\n");
  fprintf(htmlLog, "\n");
  fprintf(htmlLog, "  for (var i=0; i<elts.length; i++) {\n");
  fprintf(htmlLog, "    if (!elts[i].enabled) {\n");
  fprintf(htmlLog, "      elts[i].enabled = true;\n");
  fprintf(htmlLog, "      elts[i].style.visibility = 'visible';\n");
  fprintf(htmlLog, "    } else {\n");
  fprintf(htmlLog, "      elts[i].enabled = false;\n");
  fprintf(htmlLog, "      elts[i].style.visibility = 'collapse';\n");
  fprintf(htmlLog, "    }\n");
  fprintf(htmlLog, "  }\n");
  fprintf(htmlLog, "}\n");
  fprintf(htmlLog, "\n");
  //  Emit one pair of script lines per loaded file ID.
  for (uint32 ii=0; ii<gkpFileID; ii++) {
    fprintf(htmlLog, "document.getElementById('gkpload%u').onclick = toggleOne;\n", ii);
    fprintf(htmlLog, "document.getElementById('gkpload%u').style   = 'cursor: pointer;';\n", ii);
  fprintf(htmlLog, "</script>\n");
  fprintf(htmlLog, "\n");
  fprintf(htmlLog, "</body>\n");
  fprintf(htmlLog, "</html>\n");
  //  One-line machine-readable summary of the five counters.
  fprintf(htmlLog, "sum "F_U32" "F_U64" "F_U32" "F_U64" "F_U32"\n", nLOADED, bLOADED, nSKIPPED, bSKIPPED, nWARNS);


  //  Report each failed sanity threshold individually.
  //  NOTE(review): the warning and short-read messages compare against nLOADED
  //  alone, while the combined check further down compares against
  //  nSKIPPED + nLOADED, so the two can disagree.
  if (nERROR > 0)
    fprintf(stderr, "gatekeeperCreate did NOT finish successfully; too many errors.\n");

  if (bSKIPPED > 0.25 * (bSKIPPED + bLOADED))
    fprintf(stderr, "gatekeeperCreate did NOT finish successfully; too many bases skipped.  Check your reads.\n");

  if (nWARNS > 0.25 * (nLOADED))
    fprintf(stderr, "gatekeeperCreate did NOT finish successfully; too many warnings.  Check your reads.\n");

  if (nSKIPPED > 0.50 * (nLOADED))
    fprintf(stderr, "gatekeeperCreate did NOT finish successfully; too many short reads.  Check your reads!\n");

  //  NOTE(review): the body of this combined check is not visible (presumably a
  //  failure exit).  As written, the success message below becomes its body and
  //  would print only when a threshold is exceeded.
  if ((nERROR > 0) ||
      (bSKIPPED > 0.25 * (bSKIPPED + bLOADED)) ||
      (nWARNS   > 0.25 * (nSKIPPED + nLOADED)) ||
      (nSKIPPED > 0.50 * (nSKIPPED + nLOADED)))

  fprintf(stderr, "gatekeeperCreate finished successfully.\n");

Пример #28
// Entry point of the surrogate-toggling tool.
//
// Parses the command line (-a asmFile, -t tigStore version, -l minLength, -n numInstances,
// -d distanceToEnd), scans the messages of the .asm file to collect per-unitig data
// (length, surrogate instance counter, scaffold-end marker), and then marks qualifying
// unitigs in the tigStore as AS_FORCED_UNIQUE via setUnitigFUR().
//
// Prints "Toggled N" to stderr and returns 0.
//
// NOTE(review): in this excerpt many closing braces (and any break / exit / increment
// statements that may belong between them) are not visible, so the exact nesting and any
// switch fall-through cannot be confirmed from here.
int main (int argc, char *argv[]) {
   char    *asmFileName    = NULL;
   char    *tigStoreName   = NULL;
   uint32   tigStoreVers   = 2;

   // Thresholds, overridable with -l, -n and -d respectively.
   int      minLength      = DEFAULT_UNITIG_LENGTH;
   int      numInstances   = DEFAULT_NUM_INSTANCES;
   int      distanceToEnds = DEFAULT_DISTANCE_TO_ENDS;

   uint32   numToggled     = 0;

   // The value returned by AS_configure replaces argc for the parsing loop below
   // (presumably it strips options it handles itself -- TODO confirm).
   argc = AS_configure(argc, argv);
   int arg=1;
   int err=0;
   // NOTE(review): every option reads argv[++arg] without checking arg against argc, and
   // atoi() gives no error indication for non-numeric input.
   while (arg < argc) {
      if        (strcmp(argv[arg], "-a") == 0) {
         asmFileName = argv[++arg];

      // -t consumes two arguments: the store path and the store version.
      } else if (strcmp(argv[arg], "-t") == 0) {
        tigStoreName = argv[++arg];
        tigStoreVers = atoi(argv[++arg]);

      } else if (strcmp(argv[arg], "-l") == 0) {
         minLength = atoi(argv[++arg]);

      } else if (strcmp(argv[arg], "-n") == 0) {
         numInstances = atoi(argv[++arg]);

      } else if (strcmp(argv[arg], "-d") == 0) {
         distanceToEnds = atoi(argv[++arg]);

      } else {
         fprintf(stderr, "%s: unknown option '%s'\n", argv[0], argv[arg]);


   // Range checks: minLength and distanceToEnds must be positive; numInstances may be 0
   // (0 selects the special case handled in the toggling loop) but not negative.
   if (minLength <= 0) err++;
   if (numInstances < 0) err++;
   if (distanceToEnds <= 0) err++;

   // Print the usage text when a required argument is missing or a check above failed.
   if ((asmFileName == NULL) || (tigStoreName == NULL) || (err > 0)) {
      fprintf(stderr, "usage: %s -a asmFile -t tigStore version [-l minLength] [-n numInstances] [-d distanceToEnd]\n", argv[0]);
      fprintf(stderr, "\n");
      fprintf(stderr, "  -a asmFile            path to the assembly .asm file\n");
      fprintf(stderr, "  -t tigStore version   path to the tigStore and version to modify\n");

      fprintf(stderr, "  -l minLength          minimum size of a unitig to be toggled, default=%d)\n", DEFAULT_UNITIG_LENGTH);
      fprintf(stderr, "  -n numInstances       number of instances of a surrogate that is toggled, default = %d\n", DEFAULT_NUM_INSTANCES);
      fprintf(stderr, "  -d distanceToEnd      max number of bases the surrogate can be from the end of a scaffold for toggling, default = %d\n", DEFAULT_DISTANCE_TO_ENDS);
      fprintf(stderr, "\n");
      fprintf(stderr, "  Labels surrogate unitigs as non-repeat if they match any of the following conditions:\n");
      fprintf(stderr, "    1. the unitig meets all the -l, -n and -d conditions\n");
      fprintf(stderr, "    2. When -n = 0, all surrogate unitigs with more than one read\n");
      fprintf(stderr, "    3. the unitig appears exactly twice, within '-d' bases from the end of a scaffold\n");
   // Lookup tables filled while scanning the .asm file:
   //   UIDtoIID                 unitig UID -> unitig IID, for unitigs registered in the MESG_UTG case
   //   CTGtoFirstUTG            contig UID -> IID of the first registered unitig found within
   //                            distanceToEnds of the contig start
   //   CTGtoLastUTG             contig UID -> IID of the most recent registered unitig found within
   //                            distanceToEnds of the contig end
   //   unitigLength             unitig IID -> length
   //   surrogateCount           unitig IID -> instance counter
   //   surrogateAtScaffoldEnds  unitig IID -> scaffold-end marker (see the MESG_SCF case)
   HashTable_AS      *UIDtoIID         = CreateScalarHashTable_AS();
   HashTable_AS      *CTGtoFirstUTG    = CreateScalarHashTable_AS();
   HashTable_AS      *CTGtoLastUTG     = CreateScalarHashTable_AS();
   VA_TYPE(int32)    *unitigLength	   = CreateVA_int32(8192);
   VA_TYPE(uint32)   *surrogateCount   = CreateVA_uint32(8192);
   VA_TYPE(uint32)   *surrogateAtScaffoldEnds   = CreateVA_uint32(8192);
   GenericMesg    *pmesg;
   // NOTE(review): the fopen() result is not checked in the visible code before it is
   // handed to ReadProtoMesg_AS.
   FILE           *infp = fopen(asmFileName, "r");   

   // Process every message in the .asm file until ReadProtoMesg_AS reports EOF.
   while ((EOF != ReadProtoMesg_AS(infp, &pmesg))) {
      SnapUnitigMesg    *utg     = NULL;
      SnapConConMesg    *ctg     = NULL;
      SnapScaffoldMesg  *scf     = NULL;
      uint32             count   = 0;
      uint32             forward = TRUE;
      uint32             lastCtg = 0;

      switch(pmesg->t) {
         // Unitig message: record its length; register it as a candidate when it is long
         // enough and its status is AS_NOTREZ or AS_SEP.
         case MESG_UTG:
            utg = (SnapUnitigMesg*)(pmesg->m);
            Setint32(unitigLength, utg->iaccession, &utg->length);

            if (utg->length >= minLength && (utg->status == AS_NOTREZ || utg->status == AS_SEP)) {               
               // store the mapping for this unitig's UID to IID and initialize its instance counter at 0
               count = 0;
               InsertInHashTable_AS(UIDtoIID, AS_UID_toInteger(utg->eaccession), 0, (uint64)utg->iaccession, 0);               
               Setuint32(surrogateCount, utg->iaccession, &count);

         // Contig message: visit each unitig placed in the contig; only unitigs registered
         // in UIDtoIID are considered.
         case MESG_CCO:
            ctg = (SnapConConMesg *)(pmesg->m);
            for (int32 i = 0; i < ctg->num_unitigs; i++) {
               // increment the surrogate unitigs instance counter
               // NOTE(review): only the counter lookup is visible here; the increment itself
               // is not present in this excerpt.
               if (ExistsInHashTable_AS(UIDtoIID, AS_UID_toInteger(ctg->unitigs[i].eident), 0)) {
                  uint32 *ret = Getuint32(surrogateCount, (uint32) LookupValueInHashTable_AS(UIDtoIID, AS_UID_toInteger(ctg->unitigs[i].eident), 0));
                  assert(ret != NULL);

                  // store first surrogate in a contig
                  // (only when none is stored yet and the unitig's lower coordinate is
                  // within distanceToEnds of the contig start)
                  if (!ExistsInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(ctg->eaccession), 0) && 
                        MIN(ctg->unitigs[i].position.bgn, ctg->unitigs[i].position.end) < distanceToEnds) {
                     InsertInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(ctg->eaccession), 0, LookupValueInHashTable_AS(UIDtoIID, AS_UID_toInteger(ctg->unitigs[i].eident), 0), 0); 

                  // also store the last
                  // (Replace is used, so a later unitig near the contig end overwrites an earlier one)
                  if ((ctg->length - MAX(ctg->unitigs[i].position.bgn, ctg->unitigs[i].position.end)) < distanceToEnds) {
                     ReplaceInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(ctg->eaccession), 0, LookupValueInHashTable_AS(UIDtoIID, AS_UID_toInteger(ctg->unitigs[i].eident), 0), 0);

         // Scaffold message: update the scaffold-end marker of surrogates sitting at either
         // end of the scaffold.  'count' holds the value to store and defaults to this
         // scaffold's iaccession.
         case MESG_SCF:
            scf = (SnapScaffoldMesg *)(pmesg->m);
            count = scf->iaccession;
            // The scaffold is treated as reversed when the first contig pair's orientation
            // is anti or outtie.
            if (scf->contig_pairs[0].orient.isAnti() || scf->contig_pairs[0].orient.isOuttie()) {
               forward = FALSE;
            // Index of the last contig pair, clamped to 0.
            // NOTE(review): if num_contig_pairs is unsigned, the subtraction wraps before
            // MAX is applied -- confirm its type.
            lastCtg = MAX(scf->num_contig_pairs - 1, 0);
            // All four cases below follow the same pattern
            // The first time a surrogate is found at the end of a scaffold, we record the scaffold ID
            // When the surrogate is seen at the end of a second scaffold, we record that it has been found at the ends of two scaffolds (UINT32_MAX)
            // If the surrogate is seen more than once in a single scaffold, it is eliminated (it can't connect two scaffolds)
            // If the surrogate is only seen once at the end of a scaffold (and again in the middle), it is eliminated
            //
            // In code terms: a marker equal to this scaffold's iaccession is reset to 0; a
            // non-zero marker from a different scaffold becomes UINT32_MAX; otherwise the
            // marker is set to this scaffold's iaccession.  'count' is restored to
            // scf->iaccession after each store.
            // NOTE(review): the closing brace of each if / else-if chain is not visible, so
            // the placement of Setuint32 relative to it is assumed, not confirmed.
            // NOTE(review): the first (== scf->iaccession) and third (store iaccession) outcomes
            // coincide when scf->iaccession is 0, since 0 is also the "unset" marker value.
            // 1. Contig is first in scaffold and is forward, take the surrogate from the beginning of contig, if it exists
            if (ExistsInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0) && forward) {
               uint32 *myval = Getuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0));
               if (myval != NULL && (*myval) == scf->iaccession) {
                  count = 0;
               } else if (myval != NULL && (*myval) != 0 && (*myval) != scf->iaccession) {
                  count = UINT32_MAX;
               Setuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0), &count);
               count = scf->iaccession;
            // 2. Contig is last in scaffold and is reversed, take the surrogate from the beginning of the contig, if it exists
            if (ExistsInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0) && !forward) {
               uint32 *myval = Getuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0));
               if (myval != NULL && (*myval) == scf->iaccession) {
                  count = 0;
               } else if (myval != NULL && (*myval) != 0 && (*myval) != scf->iaccession) {
                  count = UINT32_MAX;
               Setuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0), &count);
               count = scf->iaccession;
            // 3. Contig is first in scaffold and is reversed, take the surrogate from the end of the contig, if it exists
            if (ExistsInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0) && !forward) {
               uint32 *myval = Getuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0));
               if (myval != NULL && (*myval) == scf->iaccession) {
                  count = 0;
               } else if (myval != NULL && (*myval) != 0 && (*myval) != scf->iaccession) {
                  count = UINT32_MAX;
               Setuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0), &count);
               count = scf->iaccession;
            // 4. Contig is last in scaffold and is forward, take the surrogate from the end of the contig, if it exists
            if (ExistsInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0) && forward) {
               uint32 *myval = Getuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0));
               if (myval != NULL && (*myval) == scf->iaccession) {
                  count = 0;
               } else if (myval != NULL && (*myval) != 0 && (*myval) != scf->iaccession) {
                  count = UINT32_MAX;
               Setuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0), &count);
               count = scf->iaccession;

   // NOTE(review): these two pointers are shadowed by the loop-local declarations of the
   // same names below and are not otherwise used in the visible code.
   uint32 *ret       = NULL;
   uint32 *atScfEnd  = NULL;

   // open the tig store for in-place writing (we don't increment the version since CGW always reads a fixed version initially)
   // this also removes any partitioning

   MultiAlignStore *tigStore = new MultiAlignStore(tigStoreName, tigStoreVers, 0, 0, TRUE, TRUE);

   // Decide, unitig by unitig, whether it should be forced unique.
   for (uint32 i = 0; i < tigStore->numUnitigs(); i++) {
      uint32 *ret      = Getuint32(surrogateCount, i);
      uint32 *atScfEnd = Getuint32(surrogateAtScaffoldEnds, i);
      // NOTE(review): unitigLength was created with CreateVA_int32 but is read here through
      // Getuint32, and the value is later compared against the signed minLength -- confirm
      // this is intended.
      uint32 *length   = Getuint32(unitigLength, i);

      bool toggled = false;
      // the instance counter matches the requested (non-zero) number of instances
      if (ret != NULL && (*ret) == (uint32)numInstances && numInstances != 0) {
         toggled = TRUE;

      // if we find a surrogate that has two instances and it is at scaffold ends mark toggle it as well
      // (the comparison is against NUM_INSTANCES_AT_SCAFFOLD_ENDS, and the marker must be UINT32_MAX)
      else if (ret != NULL && (*ret) == NUM_INSTANCES_AT_SCAFFOLD_ENDS && atScfEnd != NULL && (*atScfEnd) == UINT32_MAX) {
         toggled = TRUE;

      // special case, mark non-singleton unitigs as unique if we are given no instances
      // (requires a recorded length of at least minLength and more than one fragment)
      else if (numInstances == 0 && (length != NULL && (*length) >= minLength) && tigStore->getNumFrags(i, TRUE) > 1) {
         toggled = TRUE;
      if (toggled) {
         tigStore->setUnitigFUR(i, AS_FORCED_UNIQUE);

   delete tigStore;
   // NOTE(review): numToggled is a uint32 printed with %d, and no increment of it is
   // visible in this excerpt.
   fprintf(stderr, "Toggled %d\n", numToggled);
   return 0;