Ejemplo n.º 1
0
int
main(int argc, char **argv) {
  char            *gkpStoreName      = NULL;
  char            *outPrefix         = NULL;

  char            *clrName           = NULL;

  uint32           libToDump         = 0;

  uint32           bgnID             = 1;
  uint32           endID             = UINT32_MAX;

  bool             dumpAllReads      = false;
  bool             dumpAllBases      = false;
  bool             dumpOnlyDeleted   = false;

  bool             dumpFASTQ         = true;
  bool             dumpFASTA         = false;

  bool             withLibName       = true;

  argc = AS_configure(argc, argv);

  int arg = 1;
  int err = 0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-G") == 0) {
      gkpStoreName = argv[++arg];

    } else if (strcmp(argv[arg], "-o") == 0) {
      outPrefix = argv[++arg];


    } else if (strcmp(argv[arg], "-c") == 0) {
      clrName = argv[++arg];


    } else if (strcmp(argv[arg], "-l") == 0) {
      libToDump = atoi(argv[++arg]);


    } else if (strcmp(argv[arg], "-b") == 0) {
      bgnID = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-e") == 0) {
      endID  = atoi(argv[++arg]);


    } else if (strcmp(argv[arg], "-r") == 0) {
      bgnID  = atoi(argv[++arg]);
      endID  = bgnID;


    } else if (strcmp(argv[arg], "-allreads") == 0) {
      dumpAllReads    = true;

    } else if (strcmp(argv[arg], "-allbases") == 0) {
      dumpAllBases    = true;

    } else if (strcmp(argv[arg], "-onlydeleted") == 0) {
      dumpOnlyDeleted = true;
      dumpAllReads    = true;  //  Otherwise we won't report the deleted reads!


    } else if (strcmp(argv[arg], "-fastq") == 0) {
      dumpFASTQ       = true;
      dumpFASTA       = false;

    } else if (strcmp(argv[arg], "-fasta") == 0) {
      dumpFASTQ       = false;
      dumpFASTA       = true;

    } else if (strcmp(argv[arg], "-nolibname") == 0) {
      withLibName     = false;


    } else {
      err++;
      fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]);
    }
    arg++;
  }

  if (gkpStoreName == NULL)
    err++;
  if (outPrefix == NULL)
    err++;
  if (err) {
    fprintf(stderr, "usage: %s [...] -o fastq-prefix -g gkpStore\n", argv[0]);
    fprintf(stderr, "  -G gkpStore\n");
    fprintf(stderr, "  -o fastq-prefix     write files fastq-prefix.(libname).fastq, ...\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -l libToDump        output only read in library number libToDump (NOT IMPLEMENTED)\n");
    fprintf(stderr, "  -b id               output starting at read 'id'\n");
    fprintf(stderr, "  -e id               output stopping after read 'id'\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -c clearFile        clear range file from OBT modules\n");
    fprintf(stderr, "  -allreads           if a clear range file, lower case mask the deleted reads\n");
    fprintf(stderr, "  -allbases           if a clear range file, lower case mask the non-clear bases\n");
    fprintf(stderr, "  -onlydeleted        if a clear range file, only output deleted reads (the entire read)\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -r id               output only the single read 'id'\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -fastq              output is FASTQ format (with extension .fastq, default)\n");
    fprintf(stderr, "  -fasta              output is FASTA format (with extension .fasta)\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -nolibname          don't include the library name in the output file name\n");
    fprintf(stderr, "\n");

    if (gkpStoreName == NULL)
      fprintf(stderr, "ERROR: no gkpStore (-G) supplied.\n");
    if (outPrefix == NULL)
      fprintf(stderr, "ERROR: no output prefix (-o) supplied.\n");

    exit(1);
  }

  gkStore        *gkpStore  = new gkStore(gkpStoreName);
  uint32          numReads  = gkpStore->gkStore_getNumReads();
  uint32          numLibs   = gkpStore->gkStore_getNumLibraries();

  clearRangeFile *clrRange  = (clrName == NULL) ? NULL : new clearRangeFile(clrName, gkpStore);

  if (bgnID < 1)
    bgnID = 1;

  if (numReads < endID)
    endID = numReads;

  if (endID < bgnID)
    fprintf(stderr, "No reads to dump; reversed ranges make no sense: bgn="F_U32" end="F_U32"??\n", bgnID, endID);




  fprintf(stderr, "Dumping reads from %u to %u (inclusive).\n", bgnID, endID);

  libOutput   **out = new libOutput * [numLibs + 1];

  //  Allocate outputs.  If withLibName == false, all reads will artificially be in lib zero, the
  //  other files won't ever be created.  Otherwise, the zeroth file won't ever be created.

  out[0] = new libOutput(outPrefix, NULL);

  for (uint32 i=1; i<=numLibs; i++)
    out[i] = new libOutput(outPrefix, gkpStore->gkStore_getLibrary(i)->gkLibrary_libraryName());

  //  Grab a new readData, and iterate through reads to dump.

  gkReadData   *readData = new gkReadData;

  for (uint32 rid=bgnID; rid<=endID; rid++) {
    gkRead      *read   = gkpStore->gkStore_getRead(rid);

    uint32       libID  = (withLibName == false) ? 0 : read->gkRead_libraryID();

    uint32       lclr   = 0;
    uint32       rclr   = read->gkRead_sequenceLength();
    bool         ignore = false;

    //fprintf(stderr, "READ %u claims id %u length %u in lib %u\n", rid, read->gkRead_readID(), read->gkRead_sequenceLength(), libID);

    //  If a clear range file is supplied, grab the clear range.  If it hasn't been set, the default
    //  is the entire read.

    if (clrRange) {
      lclr   = clrRange->bgn(rid);
      rclr   = clrRange->end(rid);
      ignore = clrRange->isDeleted(rid);
    }

    //  Abort if we're not dumping anything from this read
    //   - not in a library we care about
    //   - deleted, and not dumping all reads
    //   - not deleted, but only reporting deleted reads

    if (((libToDump != 0) && (libID == libToDump)) ||
        ((dumpAllReads == false) && (ignore == true)) ||
        ((dumpOnlyDeleted == true) && (ignore == false)))
      continue;

    //  And if we're told to ignore the read, and here, then the read was deleted and we're printing
    //  all reads.  Reset the clear range to the whole read, the clear range is invalid.

    if (ignore) {
      rclr = read->gkRead_sequenceLength();
      lclr = 0;
    }

    //  Grab the sequence and quality.

    gkpStore->gkStore_loadReadData(read, readData);

    char   *seq = readData->gkReadData_getSequence();
    char   *qlt = readData->gkReadData_getQualities();
    uint32  len = rclr - lclr;

    //  Soft mask not-clear bases

    if (dumpAllBases == true) {
      for (uint32 i=0; i<lclr; i++)
        seq[i] += (seq[i] >= 'A') ? 'a' - 'A' : 0;

      for (uint32 i=lclr; i<rclr; i++)
        seq[i] += (seq[i] >= 'A') ? 0 : 'A' - 'a';

      for (uint32 i=rclr; seq[i]; i++)
        seq[i] += (seq[i] >= 'A') ? 'a' - 'A' : 0;

      rclr = read->gkRead_sequenceLength();
      lclr = 0;
    }

    //  Chop off the ends we're not printing.

    seq += lclr;

    seq[len] = 0;
    qlt[len] = 0;

    //  And print the read.
    if (dumpFASTQ)
      AS_UTL_writeFastQ(out[libID]->getFASTQ(), seq, len, qlt, len,
                        "@"F_U32" clr="F_U32","F_U32"\n",
                        rid, lclr, rclr);

    if (dumpFASTA)
      AS_UTL_writeFastA(out[libID]->getFASTA(), seq, len, 0,
                        ">"F_U32" clr="F_U32","F_U32"\n",
                        rid, lclr, rclr);
  }

  delete   readData;

  for (uint32 i=1; i<=numLibs; i++)
    delete out[i];

  delete [] out;

  delete    gkpStore;

  exit(0);
}
Ejemplo n.º 2
0
int
main(int argc, char **argv) {
  char            *gkpStoreName      = NULL;
  char            *outPrefix         = NULL;

  AS_IID           libToDump         = 0;
  uint32           clrToDump         = AS_READ_CLEAR_LATEST;

  AS_IID           bgnIID            = 1;
  AS_IID           endIID            = AS_IID_MAX;

  bool             dumpAllBases      = true;
  bool             dumpAllReads      = false;

  argc = AS_configure(argc, argv);

  int arg = 1;
  int err = 0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-l") == 0) {
      libToDump = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-b") == 0) {
      bgnIID = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-e") == 0) {
      endIID  = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-c") == 0) {
      clrToDump = gkStore_decodeClearRegionLabel(argv[++arg]);

    } else if (strcmp(argv[arg], "-g") == 0) {
      gkpStoreName = argv[++arg];

    } else if (strcmp(argv[arg], "-o") == 0) {
      outPrefix = argv[++arg];

    } else {
      err++;
      fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]);
    }
    arg++;
  }

  if (gkpStoreName == NULL)
    err++;
  if (outPrefix == NULL)
    err++;
  if (clrToDump == AS_READ_CLEAR_ERROR)
    err++;
  if (err) {
    fprintf(stderr, "usage: %s [...] -o fastq-prefix -g gkpStore\n", argv[0]);
    fprintf(stderr, "  -g gkpStore\n");
    fprintf(stderr, "  -o fastq-prefix     write files fastq-prefix.1.fastq, fastq-prefix.2.fastq, fastq-prefix.paired.fastq, fastq-prefix.unmated.fastq\n");
    fprintf(stderr, "  \n");
    fprintf(stderr, "  -l libToDump        output only fragments in library number libToDump (NOT IMPLEMENTED)\n");
    fprintf(stderr, "  -b iid              output starting at fragment iid\n");
    fprintf(stderr, "  -e iid              output stopping after fragment iid\n");
    fprintf(stderr, "  -c clrName          output clear range 'clrName'\n");
    fprintf(stderr, "  \n");

    if (gkpStoreName == NULL)
      fprintf(stderr, "ERROR: no gkpStore (-g) supplied.\n");
    if (outPrefix == NULL)
      fprintf(stderr, "ERROR: no output prefix (-o) supplied.\n");
    if (clrToDump == AS_READ_CLEAR_ERROR)
      fprintf(stderr, "ERROR: clear range (-c) is not a valid clear range.\n");
    exit(1);
  }

  gkStore    *gkp       = new gkStore(gkpStoreName, FALSE, FALSE);

  AS_IID    numFrags    = gkp->gkStore_getNumFragments();
  AS_IID    numLibs     = gkp->gkStore_getNumLibraries();

  libInfo **lib         = new libInfo * [numLibs];

  lib[0] = new libInfo(outPrefix, "legacy");

  for (uint32 i=1; i<numLibs; i++)
    lib[i] = new libInfo(outPrefix, gkp->gkStore_getLibrary(i)->libraryName);

  if (bgnIID < 1)
    bgnIID = 1;
  if (numFrags < endIID)
    endIID = numFrags;

  //AS_IID    streamBgn = AS_IID_MIN;
  //AS_IID    streamEnd = AS_IID_MAX;

  gkStream   *fs        = new gkStream(gkp, bgnIID, endIID, GKFRAGMENT_QLT);
  gkFragment  fr;



  while (fs->next(&fr)) {
    int32   lclr   = fr.gkFragment_getClearRegionBegin(clrToDump);
    int32   rclr   = fr.gkFragment_getClearRegionEnd  (clrToDump);

    AS_IID  id1    = fr.gkFragment_getReadIID();
    AS_IID  id2    = fr.gkFragment_getMateIID();

    AS_IID  libIID = fr.gkFragment_getLibraryIID();

    if ((dumpAllReads == false) && (fr.gkFragment_getIsDeleted() == true))
      //  Fragment is deleted, don't dump.
      continue;

    if ((libToDump != 0) && (fr.gkFragment_getLibraryIID() == libToDump))
      //  Fragment isn't marked for dumping, don't dump.
      continue;

    if ((dumpAllBases == false) && (lclr >= rclr))
      //  Fragment has null or invalid clear range, don't dump.
      continue;

    if ((id2 != 0) && (id2 < id1))
      //  Mated, and the mate is the first frag.  We've already reported this one.
      continue;

    char *seq = fr.gkFragment_getSequence() + ((dumpAllBases == false) ? fr.gkFragment_getClearRegionBegin(clrToDump) : 0);
    char *qlt = fr.gkFragment_getQuality()  + ((dumpAllBases == false) ? fr.gkFragment_getClearRegionBegin(clrToDump) : 0);

    int32 len = (dumpAllBases == false) ? fr.gkFragment_getClearRegionLength(clrToDump) : fr.gkFragment_getSequenceLength();

    seq[len] = 0;
    qlt[len] = 0;

    if (dumpAllBases == true) {
      for (uint32 i=0; i<lclr; i++)
        seq[i] += (seq[i] >= 'A') ? 'a' - 'A' : 0;

      for (uint32 i=lclr; i<rclr; i++)
        seq[i] += (seq[i] >= 'A') ? 0 : 'A' - 'a';

      for (uint32 i=rclr; seq[i]; i++)
        seq[i] += (seq[i] >= 'A') ? 'a' - 'A' : 0;
    }

    if (id2 == 0) {
      //  Unmated read, dump to the unmated reads file.
      AS_UTL_writeFastQ(lib[libIID]->u, seq, len, qlt, len,
                        "@%s clr="F_U32","F_U32" clv="F_U32","F_U32" max="F_U32","F_U32" tnt="F_U32","F_U32" rnd=%c\n",
                        AS_UID_toString(fr.gkFragment_getReadUID()),
                        fr.gkFragment_getClearRegionBegin(clrToDump),
                        fr.gkFragment_getClearRegionEnd  (clrToDump),
                        fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_VEC),
                        fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_VEC),
                        fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_MAX),
                        fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_MAX),
                        fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_TNT),
                        fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_TNT),
                        fr.gkFragment_getIsNonRandom() ? 'f' : 't');
      continue;
    }

    //  Write the first fragment (twice).
    AS_UTL_writeFastQ(lib[libIID]->a, seq, len, qlt, len,
                      "@%s clr="F_U32","F_U32" clv="F_U32","F_U32" max="F_U32","F_U32" tnt="F_U32","F_U32" rnd=%c\n",
                      AS_UID_toString(fr.gkFragment_getReadUID()),
                      fr.gkFragment_getClearRegionBegin(clrToDump),
                      fr.gkFragment_getClearRegionEnd  (clrToDump),
                      fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_VEC),
                      fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_VEC),
                      fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_MAX),
                      fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_MAX),
                      fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_TNT),
                      fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_TNT),
                      fr.gkFragment_getIsNonRandom() ? 'f' : 't');

    AS_UTL_writeFastQ(lib[libIID]->p, seq, len, qlt, len,
                      "@%s clr="F_U32","F_U32" clv="F_U32","F_U32" max="F_U32","F_U32" tnt="F_U32","F_U32" rnd=%c\n",
                      AS_UID_toString(fr.gkFragment_getReadUID()),
                      fr.gkFragment_getClearRegionBegin(clrToDump),
                      fr.gkFragment_getClearRegionEnd  (clrToDump),
                      fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_VEC),
                      fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_VEC),
                      fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_MAX),
                      fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_MAX),
                      fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_TNT),
                      fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_TNT),
                      fr.gkFragment_getIsNonRandom() ? 'f' : 't');

    //  Grab the second fragment.

    gkp->gkStore_getFragment(id2, &fr, GKFRAGMENT_QLT);

    lclr = fr.gkFragment_getClearRegionBegin(clrToDump) + 1;
    rclr = fr.gkFragment_getClearRegionEnd  (clrToDump);

    seq = fr.gkFragment_getSequence() + ((dumpAllBases == false) ? fr.gkFragment_getClearRegionBegin(clrToDump) : 0);
    qlt = fr.gkFragment_getQuality()  + ((dumpAllBases == false) ? fr.gkFragment_getClearRegionBegin(clrToDump) : 0);
    len = (dumpAllBases == false) ? fr.gkFragment_getClearRegionLength(clrToDump) : fr.gkFragment_getSequenceLength();

    seq[len] = 0;
    qlt[len] = 0;

    //  Write the second fragment (twice).
    AS_UTL_writeFastQ(lib[libIID]->b, seq, len, qlt, len,
                      "@%s clr="F_U32","F_U32" clv="F_U32","F_U32" max="F_U32","F_U32" tnt="F_U32","F_U32" rnd=%c\n",
                      AS_UID_toString(fr.gkFragment_getReadUID()),
                      fr.gkFragment_getClearRegionBegin(clrToDump),
                      fr.gkFragment_getClearRegionEnd  (clrToDump),
                      fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_VEC),
                      fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_VEC),
                      fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_MAX),
                      fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_MAX),
                      fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_TNT),
                      fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_TNT),
                      fr.gkFragment_getIsNonRandom() ? 'f' : 't');

    AS_UTL_writeFastQ(lib[libIID]->p, seq, len, qlt, len,
                      "@%s clr="F_U32","F_U32" clv="F_U32","F_U32" max="F_U32","F_U32" tnt="F_U32","F_U32" rnd=%c\n",
                      AS_UID_toString(fr.gkFragment_getReadUID()),
                      fr.gkFragment_getClearRegionBegin(clrToDump),
                      fr.gkFragment_getClearRegionEnd  (clrToDump),
                      fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_VEC),
                      fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_VEC),
                      fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_MAX),
                      fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_MAX),
                      fr.gkFragment_getClearRegionBegin(AS_READ_CLEAR_TNT),
                      fr.gkFragment_getClearRegionEnd  (AS_READ_CLEAR_TNT),
                      fr.gkFragment_getIsNonRandom() ? 'f' : 't');
  }

  delete fs;
  delete gkp;

  exit(0);
}