Example #1
0
int
main(int argc, char **argv) {
  char            *gkpStoreName      = NULL;
  char            *outPrefix         = NULL;

  char            *clrName           = NULL;

  uint32           libToDump         = 0;

  uint32           bgnID             = 1;
  uint32           endID             = UINT32_MAX;

  bool             dumpAllReads      = false;
  bool             dumpAllBases      = false;
  bool             dumpOnlyDeleted   = false;

  bool             dumpFASTQ         = true;
  bool             dumpFASTA         = false;

  bool             withLibName       = true;

  argc = AS_configure(argc, argv);

  int arg = 1;
  int err = 0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-G") == 0) {
      gkpStoreName = argv[++arg];

    } else if (strcmp(argv[arg], "-o") == 0) {
      outPrefix = argv[++arg];


    } else if (strcmp(argv[arg], "-c") == 0) {
      clrName = argv[++arg];


    } else if (strcmp(argv[arg], "-l") == 0) {
      libToDump = atoi(argv[++arg]);


    } else if (strcmp(argv[arg], "-b") == 0) {
      bgnID = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-e") == 0) {
      endID  = atoi(argv[++arg]);


    } else if (strcmp(argv[arg], "-r") == 0) {
      bgnID  = atoi(argv[++arg]);
      endID  = bgnID;


    } else if (strcmp(argv[arg], "-allreads") == 0) {
      dumpAllReads    = true;

    } else if (strcmp(argv[arg], "-allbases") == 0) {
      dumpAllBases    = true;

    } else if (strcmp(argv[arg], "-onlydeleted") == 0) {
      dumpOnlyDeleted = true;
      dumpAllReads    = true;  //  Otherwise we won't report the deleted reads!


    } else if (strcmp(argv[arg], "-fastq") == 0) {
      dumpFASTQ       = true;
      dumpFASTA       = false;

    } else if (strcmp(argv[arg], "-fasta") == 0) {
      dumpFASTQ       = false;
      dumpFASTA       = true;

    } else if (strcmp(argv[arg], "-nolibname") == 0) {
      withLibName     = false;


    } else {
      err++;
      fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]);
    }
    arg++;
  }

  if (gkpStoreName == NULL)
    err++;
  if (outPrefix == NULL)
    err++;
  if (err) {
    fprintf(stderr, "usage: %s [...] -o fastq-prefix -g gkpStore\n", argv[0]);
    fprintf(stderr, "  -G gkpStore\n");
    fprintf(stderr, "  -o fastq-prefix     write files fastq-prefix.(libname).fastq, ...\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -l libToDump        output only read in library number libToDump (NOT IMPLEMENTED)\n");
    fprintf(stderr, "  -b id               output starting at read 'id'\n");
    fprintf(stderr, "  -e id               output stopping after read 'id'\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -c clearFile        clear range file from OBT modules\n");
    fprintf(stderr, "  -allreads           if a clear range file, lower case mask the deleted reads\n");
    fprintf(stderr, "  -allbases           if a clear range file, lower case mask the non-clear bases\n");
    fprintf(stderr, "  -onlydeleted        if a clear range file, only output deleted reads (the entire read)\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -r id               output only the single read 'id'\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -fastq              output is FASTQ format (with extension .fastq, default)\n");
    fprintf(stderr, "  -fasta              output is FASTA format (with extension .fasta)\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -nolibname          don't include the library name in the output file name\n");
    fprintf(stderr, "\n");

    if (gkpStoreName == NULL)
      fprintf(stderr, "ERROR: no gkpStore (-G) supplied.\n");
    if (outPrefix == NULL)
      fprintf(stderr, "ERROR: no output prefix (-o) supplied.\n");

    exit(1);
  }

  gkStore        *gkpStore  = new gkStore(gkpStoreName);
  uint32          numReads  = gkpStore->gkStore_getNumReads();
  uint32          numLibs   = gkpStore->gkStore_getNumLibraries();

  clearRangeFile *clrRange  = (clrName == NULL) ? NULL : new clearRangeFile(clrName, gkpStore);

  if (bgnID < 1)
    bgnID = 1;

  if (numReads < endID)
    endID = numReads;

  if (endID < bgnID)
    fprintf(stderr, "No reads to dump; reversed ranges make no sense: bgn="F_U32" end="F_U32"??\n", bgnID, endID);




  fprintf(stderr, "Dumping reads from %u to %u (inclusive).\n", bgnID, endID);

  libOutput   **out = new libOutput * [numLibs + 1];

  //  Allocate outputs.  If withLibName == false, all reads will artificially be in lib zero, the
  //  other files won't ever be created.  Otherwise, the zeroth file won't ever be created.

  out[0] = new libOutput(outPrefix, NULL);

  for (uint32 i=1; i<=numLibs; i++)
    out[i] = new libOutput(outPrefix, gkpStore->gkStore_getLibrary(i)->gkLibrary_libraryName());

  //  Grab a new readData, and iterate through reads to dump.

  gkReadData   *readData = new gkReadData;

  for (uint32 rid=bgnID; rid<=endID; rid++) {
    gkRead      *read   = gkpStore->gkStore_getRead(rid);

    uint32       libID  = (withLibName == false) ? 0 : read->gkRead_libraryID();

    uint32       lclr   = 0;
    uint32       rclr   = read->gkRead_sequenceLength();
    bool         ignore = false;

    //fprintf(stderr, "READ %u claims id %u length %u in lib %u\n", rid, read->gkRead_readID(), read->gkRead_sequenceLength(), libID);

    //  If a clear range file is supplied, grab the clear range.  If it hasn't been set, the default
    //  is the entire read.

    if (clrRange) {
      lclr   = clrRange->bgn(rid);
      rclr   = clrRange->end(rid);
      ignore = clrRange->isDeleted(rid);
    }

    //  Abort if we're not dumping anything from this read
    //   - not in a library we care about
    //   - deleted, and not dumping all reads
    //   - not deleted, but only reporting deleted reads

    if (((libToDump != 0) && (libID == libToDump)) ||
        ((dumpAllReads == false) && (ignore == true)) ||
        ((dumpOnlyDeleted == true) && (ignore == false)))
      continue;

    //  And if we're told to ignore the read, and here, then the read was deleted and we're printing
    //  all reads.  Reset the clear range to the whole read, the clear range is invalid.

    if (ignore) {
      rclr = read->gkRead_sequenceLength();
      lclr = 0;
    }

    //  Grab the sequence and quality.

    gkpStore->gkStore_loadReadData(read, readData);

    char   *seq = readData->gkReadData_getSequence();
    char   *qlt = readData->gkReadData_getQualities();
    uint32  len = rclr - lclr;

    //  Soft mask not-clear bases

    if (dumpAllBases == true) {
      for (uint32 i=0; i<lclr; i++)
        seq[i] += (seq[i] >= 'A') ? 'a' - 'A' : 0;

      for (uint32 i=lclr; i<rclr; i++)
        seq[i] += (seq[i] >= 'A') ? 0 : 'A' - 'a';

      for (uint32 i=rclr; seq[i]; i++)
        seq[i] += (seq[i] >= 'A') ? 'a' - 'A' : 0;

      rclr = read->gkRead_sequenceLength();
      lclr = 0;
    }

    //  Chop off the ends we're not printing.

    seq += lclr;

    seq[len] = 0;
    qlt[len] = 0;

    //  And print the read.
    if (dumpFASTQ)
      AS_UTL_writeFastQ(out[libID]->getFASTQ(), seq, len, qlt, len,
                        "@"F_U32" clr="F_U32","F_U32"\n",
                        rid, lclr, rclr);

    if (dumpFASTA)
      AS_UTL_writeFastA(out[libID]->getFASTA(), seq, len, 0,
                        ">"F_U32" clr="F_U32","F_U32"\n",
                        rid, lclr, rclr);
  }

  delete   readData;

  for (uint32 i=1; i<=numLibs; i++)
    delete out[i];

  delete [] out;

  delete    gkpStore;

  exit(0);
}
Example #2
0
int
main( int argc, char **argv) {
  int          ckptNum           = NULLINDEX;
  int          makeMiniScaffolds = 1;
  uint64       uidStart          = 1230000;
  UIDserver   *uids              = NULL;

  GlobalData = new Globals_CGW();

  argc = AS_configure(argc, argv);

  int err=0;
  int arg=1;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-p") == 0) {
      ckptNum = GlobalData->setPrefix(argv[++arg]);
    } else if (strcmp(argv[arg], "-c") == 0) {
      strcpy(GlobalData->outputPrefix, argv[++arg]);
    } else if (strcmp(argv[arg], "-g") == 0) {
      strcpy(GlobalData->gkpStoreName, argv[++arg]);
    } else if (strcmp(argv[arg], "-t") == 0) {
      strcpy(GlobalData->tigStoreName, argv[++arg]);
    } else if (strcmp(argv[arg], "-n") == 0) {
      ckptNum = atoi(argv[++arg]);
    } else if (strcmp(argv[arg], "-U") == 0) {
      uidStart = 0;
    } else if (strcmp(argv[arg], "-S") == 0) {
      makeMiniScaffolds = 0;
    } else {
      fprintf(stderr, "unknown option '%s'\n", argv[arg]);
      err = 1;
    }
    arg++;
  }

  if ((GlobalData->outputPrefix[0] == 0) ||
      (GlobalData->gkpStoreName[0] == 0)) {
    fprintf(stderr, "usage: %s [[-p prefix] | [-c name -g gkpstore -n ckptNum]] [-U] [-S]\n", argv[0]);
    fprintf(stderr, "  -p      Attempt to locate the last checkpoint in directory 7-CGW.\n");
    fprintf(stderr, "  -c      Look for checkpoints in 'name'\n");
    fprintf(stderr, "  -g      Path to gkpStore\n");
    fprintf(stderr, "  -n      Checkpoint number to load\n");
    fprintf(stderr, "  -U      Use real UIDs for miniscaffolds, otherwise, UIDs start at 1230000\n");
    fprintf(stderr, "  -S      Do NOT make mini scaffolds.\n");
    exit(1);
  }

  uids = UIDserverInitialize(256, uidStart);

  char *toprint = (char *)safe_malloc(sizeof(char) * (AS_READ_MAX_NORMAL_LEN + 51 + AS_READ_MAX_NORMAL_LEN + 2));

  LoadScaffoldGraphFromCheckpoint(GlobalData->outputPrefix, ckptNum, FALSE);

  int ifrag;
  for (ifrag=0; ifrag < GetNumVA_CIFragT(ScaffoldGraph->CIFrags); ifrag++) {
    CIFragT *frag = GetCIFragT(ScaffoldGraph->CIFrags, ifrag);
    CIFragT *mate = NULL;

    if (frag->flags.bits.isDeleted)
      continue;

    assert(frag->cid != NULLINDEX);
    assert((frag->flags.bits.hasMate == 0) || (frag->mate_iid != 0));

    //  Fix for missing mates -- OBT used to not delete mate links, leaving
    //  dangling mates.  Somebody else seems to be doing this too.
    //
    if (frag->flags.bits.hasMate) {
      mate = GetCIFragT(ScaffoldGraph->CIFrags, frag->mate_iid);
      if (mate == NULL)
        frag->flags.bits.hasMate = 0;
    }

    //  If this fragment is not chaff, we have nothing to do here.
    //
    if (GetGraphNode(ScaffoldGraph->CIGraph,frag->cid)->flags.bits.isChaff == 0)
      continue;

    //  Print a singleton if there is no mate, the mate isn't chaff,
    //  or we were told to not make miniscaffolds.
    //
    if ((mate == NULL) ||
        (mate->flags.bits.isChaff == 0) ||
        (makeMiniScaffolds == 0)) {
      AS_UID  fUID = getFragmentClear(frag->read_iid, 0, toprint);

      AS_UTL_writeFastA(stdout,
                        toprint, strlen(toprint), 0,
                        ">%s /type=singleton\n", AS_UID_toString(fUID));

    } else if ((mate != NULL) &&
               (mate->flags.bits.isChaff == 1) &&
               (makeMiniScaffolds == 1) &&
               (frag->read_iid < mate->read_iid)) {

      //  make sure the following chain of Ns is divisible by three;
      //  the exact length is arbitrary but Doug Rusch points out that
      //  by making it divisible by 3, we can get lucky and maintain
      //  the phase of a protein ...  which helps in the
      //  auto-annotation of environmental samples

      AS_UID  fUID = getFragmentClear(frag->read_iid, 0, toprint);

      strcat(toprint, "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN");

      AS_UID  mUID = getFragmentClear(mate->read_iid, 1, toprint + strlen(toprint));

      AS_UTL_writeFastA(stdout,
                        toprint, strlen(toprint), 0,
                        ">"F_U64" /type=mini_scaffold /frgs=(%s,%s)\n",
                        getUID(uids),
                        AS_UID_toString(fUID),
                        AS_UID_toString(mUID));
    }
  }

  delete GlobalData;

  exit(0);
}
Example #3
0
int32
main(int32 argc, char **argv) {
   int32 arg = 1;
   int32 err = 0;
   int32 hlp = 0;

   char * gkpStoreName  = NULL;
   int32  gkpStorePart  = 0;
   char * msgFile       = NULL;
   char * outputFileName= NULL;
   char * seqAn         = NULL;
   char * wrkDir        = NULL;
   char * seqStoreName  = NULL;
   int32  seqStoreVer   = 0;
   int32  seqStorePart  = 0;     

   argc = AS_configure(argc, argv);

   while (arg < argc) {
      if (strcmp(argv[arg], "-c") == 0) {
         msgFile = argv[++arg];
      } else if (strcmp(argv[arg], "-G") == 0) {
         gkpStoreName = argv[++arg];
      } else if (strcmp(argv[arg], "-S") == 0) {
         gkpStorePart = atoi(argv[++arg]);
      } else if (strcmp(argv[arg], "-o") == 0) {
         outputFileName = argv[++arg];
      } else if (strcmp(argv[arg], "-s") == 0) {
         seqAn = argv[++arg];
      } else if (strcmp(argv[arg], "-w") == 0) {
         wrkDir = argv[++arg];
      } else if (strcmp(argv[arg], "-u") == 0) {
         seqStoreName = argv[++arg];
      } else if (strcmp(argv[arg], "-V") == 0) {
         seqStoreVer = atoi(argv[++arg]);
      } else if (strcmp(argv[arg], "-p") == 0) {
         seqStorePart = atoi(argv[++arg]);
      } else {
         err++;
      }
      arg++;
   }

   if ((err) || (gkpStoreName == NULL) || (msgFile == NULL) || (outputFileName == NULL) || seqAn == NULL) {
      fprintf(stderr, "USAGE: SeqAn_CNS -G <gkpStore> -c <input.cgb> -o <output.cgi> -s <seqan_executable> [-u seqstore, required for contig consensus] [-w working directory]\n");      
      exit(1);
   }

   gkStore        *gkpStore = new gkStore(gkpStoreName, FALSE, FALSE);

   gkpStore->gkStore_loadPartition(gkpStorePart);
   
   gkFragment      fr;
   GenericMesg    *pmesg;
   tSequenceDB    *sequenceDB = NULL;   

   FILE *infp = fopen(msgFile,"r");
   FILE *tempReads;
   FILE *outfp = fopen(outputFileName, "w");
   char fileName[AS_SEQAN_MAX_BUFFER_LENGTH];
   char *prefix = outputFileName;
   getFileName(prefix, wrkDir, AS_SEQAN_INPUT_NAME, fileName);

   int32 i = 0;
   
   while ((EOF != ReadProtoMesg_AS(infp, &pmesg))) {
      int32 freeMem = 0;
     
      if (pmesg->t == MESG_IUM) {
         IntUnitigMesg *ium_mesg = (IntUnitigMesg *)pmesg->m;         
         
         if (strlen(ium_mesg->consensus) == 0) {
            tempReads = fopen(fileName,"w");

            for (i =0; i < ium_mesg->num_frags; i++) {
               // get the fragment sequence
               gkpStore->gkStore_getFragment(ium_mesg->f_list[i].ident, &fr, GKFRAGMENT_QLT);
               uint32   clrBeg = fr.gkFragment_getClearRegionBegin();
               uint32   clrEnd = fr.gkFragment_getClearRegionEnd  ();
               char    *seqStart = fr.gkFragment_getSequence();
               char     *seq      = seqStart+clrBeg;

               seq[clrEnd] = 0;
               AS_UTL_writeFastA(tempReads,
                  seq, clrEnd-clrBeg,
                   ">"F_IID","F_IID"\n", ium_mesg->f_list[i].position.bgn, ium_mesg->f_list[i].position.end);
            }
            fclose(tempReads);
            updateRecord(ium_mesg, fileName, seqAn, prefix, wrkDir);
            freeMem = 1;
         }
         WriteProtoMesg_AS(outfp, pmesg);

         if (freeMem) {
            safe_free(ium_mesg->consensus);
            safe_free(ium_mesg->quality);
         }
      }
      else if (pmesg->t == MESG_ICM) {         
         IntConConMesg *icm_mesg = (IntConConMesg *)pmesg->m;

         if (seqStoreName == NULL) {
            fprintf(stderr, "USAGE: The -u option is required for contig consensus\n");
            exit(1);
         }
         if (sequenceDB == NULL) {
            sequenceDB = openSequenceDB(seqStoreName, FALSE, seqStoreVer);
            openSequenceDBPartition(sequenceDB, seqStorePart);
         }
         
         if (strlen(icm_mesg->consensus) == 0) {
            tempReads = fopen(fileName,"w");

            for (i =0; i < icm_mesg->num_pieces; i++) {
               // get the fragment sequence
               gkpStore->gkStore_getFragment(icm_mesg->pieces[i].ident, &fr, GKFRAGMENT_QLT);
               uint32   clrBeg   = fr.gkFragment_getClearRegionBegin();
               uint32   clrEnd   = fr.gkFragment_getClearRegionEnd  ();
               char    *seqStart = fr.gkFragment_getSequence();
               char    *seq      = seqStart+clrBeg;

               seq[clrEnd] = 0;
               AS_UTL_writeFastA(tempReads,
                  seq, clrEnd-clrBeg,
                   ">"F_IID","F_IID"\n", icm_mesg->pieces[i].position.bgn, icm_mesg->pieces[i].position.end);
            }
            
            // now handle the unitig messages
            for (i =0; i < icm_mesg->num_unitigs; i++) {
               VA_TYPE(char) *ungappedSequence = CreateVA_char(0);
               VA_TYPE(char) *ungappedQuality  = CreateVA_char(0);
               MultiAlignT *uma = loadMultiAlignTFromSequenceDB(sequenceDB, icm_mesg->unitigs[i].ident, 1);
               assert(uma != NULL);
               
               GetMultiAlignUngappedConsensus(uma, ungappedSequence, ungappedQuality);
               char * seq = Getchar(ungappedSequence,0);

               AS_UTL_writeFastA(tempReads,
                  seq, strlen(seq),
                   ">"F_IID","F_IID"\n", icm_mesg->unitigs[i].position.bgn, icm_mesg->unitigs[i].position.end);
            }
            fclose(tempReads);

            updateICMRecord(icm_mesg, fileName, seqAn, prefix, wrkDir);
            freeMem = 1;
         }
         WriteProtoMesg_AS(outfp, pmesg);

         if (freeMem) {
            safe_free(icm_mesg->consensus);
            safe_free(icm_mesg->quality);
         }
      }
   }
      
   fclose(infp);
   fclose(outfp);

   return 0;
}