Exemplo n.º 1
0
//  Remove the partition files (map, reads.NNNN, blobs.NNNN) from the store's
//  'partitions' directory.  Does nothing if the partition map does not exist.
//
//  The partition count is read from the first uint32 of the map file into
//  _numberOfPartitions, and is left set when the function returns.  Exits the
//  process if the map exists but cannot be opened.
void
gkStore::gkStore_deletePartitions(void) {
  char path[FILENAME_MAX];

  snprintf(path, FILENAME_MAX, "%s/partitions/map", gkStore_path());

  if (AS_UTL_fileExists(path, false, false) == false)
    return;

  //  How many partitions?
  //
  //  Test the returned FILE pointer, not errno: errno is only meaningful after a failure, and
  //  could still hold a stale value from some earlier, unrelated call.

  FILE *F = fopen(path, "r");
  if (F == NULL)
    fprintf(stderr, "ERROR: failed to open partition meta data '%s': %s\n", path, strerror(errno)), exit(1);

  size_t  nRead = fread(&_numberOfPartitions, sizeof(uint32), 1, F);

  fclose(F);

  //  A truncated map gives us no count.  Remove the map itself, but do not trust whatever
  //  happens to be in _numberOfPartitions to decide which other files to delete.

  if (nRead != 1) {
    fprintf(stderr, "WARNING: failed to read partition count from '%s'; removing only the map.\n", path);
    _numberOfPartitions = 0;
  }

  //  Yay!  Delete!

  AS_UTL_unlink(path);

  //  Partition files are numbered from 1.

  for (uint32 ii=0; ii<_numberOfPartitions; ii++) {
    snprintf(path, FILENAME_MAX, "%s/partitions/reads.%04u", gkStore_path(), ii+1);
    AS_UTL_unlink(path);

    snprintf(path, FILENAME_MAX, "%s/partitions/blobs.%04u", gkStore_path(), ii+1);
    AS_UTL_unlink(path);
  }
}
Exemplo n.º 2
0
//  Dump the per-position mate happiness counters for this tig to
//
//    <prefix>.<logFileOrder>.<name>.mateHappiness/utg<tig id>.mateHappiness
//
//  creating the directory if it does not exist.  One tab-separated line is
//  written per position 0.._tigLen-1: the position followed by the ten counters
//  (good, badFwd, badRev, badExternalFwd, badExternalRev, badCompressed,
//  badStretched, badNormal, badAnti, badOuttie).
//
//  If the output file cannot be created a message is printed and nothing is
//  written.
void
MateLocation::dumpHappiness(const char *prefix, const char *name) {
  char  dirname[FILENAME_MAX] = {0};
  char  outname[FILENAME_MAX] = {0};

  snprintf(dirname, FILENAME_MAX, "%s.%03u.%s.mateHappiness",
           prefix, logFileOrder, name);
  snprintf(outname, FILENAME_MAX, "%s.%03u.%s.mateHappiness/utg%09u.mateHappiness",
           prefix, logFileOrder, name, _tig->id());

  if (AS_UTL_fileExists(dirname, TRUE, TRUE) == 0)
    AS_UTL_mkdir(dirname);

  FILE *F = fopen(outname, "w");

  //  This is a diagnostic dump; failing to create it should not take the process down, but
  //  we must not hand a NULL stream to fprintf()/fclose() either.

  if (F == NULL) {
    fprintf(stderr, "MateLocation::dumpHappiness()-- failed to open '%s' for writing; mate happiness not dumped.\n", outname);
    return;
  }

  for (int32 i=0; i<_tigLen; i++)
    fprintf(F, "%u\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n",
            i,
            good[i],
            badFwd[i],
            badRev[i],
            badExternalFwd[i],
            badExternalRev[i],
            badCompressed[i],
            badStretched[i],
            badNormal[i],
            badAnti[i],
            badOuttie[i]);

  fclose(F);
}
Exemplo n.º 3
0
void
ovStore::ovStore_write(void) {
  AS_UTL_mkdir(_storePath);

  char name[FILENAME_MAX];

  sprintf(name, "%s/info", _storePath);

  //  If the ovs file exists, AND has a valid magic number, then the store is complete and we should
  //  abort before the valid store is destroyed.

  if (AS_UTL_fileExists(name, false, false)) {
    errno = 0;
    FILE *ovsinfo = fopen(name, "r");
    if (errno) {
      fprintf(stderr, "ERROR: failed to read store metadata from '%s': %s\n", name, strerror(errno));
      exit(1);
    }

    AS_UTL_safeRead(ovsinfo, &_info, "ovStore::ovStore::testinfo", sizeof(ovStoreInfo), 1);

    fclose(ovsinfo);

    if (_info._ovsMagic == ovStoreMagic)
      fprintf(stderr, "ERROR:  overlapStore '%s' is a valid overlap store, will not overwrite.\n",
              _storePath), exit(1);
  }

  //  Create a new incomplete info file.

  errno = 0;
  FILE *ovsinfo = fopen(name, "w");

  if (errno)
    fprintf(stderr, "failed to create overlap store '%s': %s\n", _storePath, strerror(errno)), exit(1);

  AS_UTL_safeWrite(ovsinfo, &_info, "ovStore::ovStore::saveinfo", sizeof(ovStoreInfo), 1);

  fclose(ovsinfo);

  sprintf(name, "%s/index", _storePath);

  errno = 0;
  _offtFile = fopen(name, "w");
  if (errno)
    fprintf(stderr, "AS_OVS_createOverlapStore()-- failed to open offset file '%s': %s\n", name, strerror(errno)), exit(1);

  _overlapsThisFile = 0;
  _currentFileIndex = 0;
  _bof              = NULL;
}
Exemplo n.º 4
0
//  Load a previously saved index for the fastq file '_filename' from the file
//  'indexname'.
//
//  The index is used only if its header carries BOTH expected magic numbers and
//  the size, st_mtime and st_ctime recorded in the header match what stat()
//  reports for the fastq file now.  When accepted, _index and _names are
//  allocated and filled from the index file.  On any failure (no index file,
//  stat or open failure, short header, mismatch) the function returns without
//  allocating them.
void
fastqFile::loadIndex(char *indexname) {
  struct stat  fastqstat;

  if (AS_UTL_fileExists(indexname) == false)
    return;

  if (stat(_filename, &fastqstat) != 0) {
    fprintf(stderr, "fastqFile::constructIndex()-- stat of file '%s' failed: %s\n",
            _filename, strerror(errno));
    return;
  }

  //  Test the returned pointer; errno says nothing reliable after a successful call.

  FILE *I = fopen(indexname, "r");
  if (I == NULL) {
    fprintf(stderr, "fastqFile::constructIndex()-- open of file '%s' failed: %s\n",
            indexname, strerror(errno));
    return;
  }

  if (fread(&_header, sizeof(fastqFileHeader), 1, I) != 1) {
    fprintf(stderr, "fastqFile::constructIndex()-- short read of header from '%s'.\n", indexname);
    fclose(I);
    return;
  }

  //  Either magic word being wrong makes this a foreign or damaged file.

  if ((_header._magic[0] != FASTQ_MAGICNUMBER1) ||
      (_header._magic[1] != FASTQ_MAGICNUMBER2)) {
    fprintf(stderr, "fastqFile::constructIndex()-- magic mismatch.\n");
    fclose(I);
    return;
  }

  //  The index is stale if the fastq file changed since the index was written.

  if ((_header._fastqFileSize         != (uint64)fastqstat.st_size) ||
      (_header._fastqModificationTime != (uint64)fastqstat.st_mtime) ||
      (_header._fastqCreationTime     != (uint64)fastqstat.st_ctime)) {
    fprintf(stderr, "fastqFile::constructIndex()-- stat mismatch.\n");
    fclose(I);
    return;
  }

  _index = new fastqFileIndex [_header._numberOfSequences];
  _names = new char           [_header._namesLength];

  size_t  indexRead = fread(_index, sizeof(fastqFileIndex), _header._numberOfSequences, I);
  size_t  namesRead = fread(_names, sizeof(char),           _header._namesLength,       I);

  if ((indexRead != (size_t)_header._numberOfSequences) ||
      (namesRead != (size_t)_header._namesLength))
    fprintf(stderr, "fastqFile::constructIndex()-- WARNING: index file '%s' is truncated.\n", indexname);

#ifdef DEBUG
  fprintf(stderr, "fastqFile::constructIndex()-- '%s' LOADED\n", _filename);
#endif

  fclose(I);
  return;
}
Exemplo n.º 5
0
//  Delete 'filename' if it exists.
//
//  Returns 1 when the file was present and has been removed, 0 when there was
//  no such file to begin with.  A file that exists but cannot be removed is a
//  fatal error: a message is printed and the process exits.
int
AS_UTL_unlink(const char *filename) {
  int  removed = 0;

  if (AS_UTL_fileExists(filename, FALSE, FALSE) != 0) {
    errno = 0;
    unlink(filename);

    if (errno != 0) {
      fprintf(stderr, "AS_UTL_unlink()--  Failed to remove file '%s': %s\n", filename, strerror(errno));
      exit(1);
    }

    removed = 1;
  }

  return(removed);
}
Exemplo n.º 6
0
//  Open 'filename' for reading, transparently decompressing it.
//
//    NULL, "" or "-"     read from stdin                (_stdi set)
//    *.gz                read from 'gzip -dc  <file>'   (_pipe set)
//    *.bz2               read from 'bzip2 -dc <file>'   (_pipe set)
//    *.xz                read from 'xz -dc    <file>'   (_pipe set)
//    anything else       plain fopen()
//
//  Suffix matching ignores case.  A named file that does not exist, or any
//  failure to open the file or start the decompressor, is fatal.
compressedFileReader::compressedFileReader(const char *filename) {
  char    cmd[FILENAME_MAX * 2];
  int32   len = 0;

  _file = NULL;
  _pipe = false;
  _stdi = false;

  if (filename != NULL)
    len = strlen(filename);

  bool  useStdin = (len == 0) || (strcmp(filename, "-") == 0);

  if ((useStdin == false) && (AS_UTL_fileExists(filename, FALSE, FALSE) == FALSE))
    fprintf(stderr, "ERROR:  Failed to open input file '%s': %s\n", filename, strerror(errno)), exit(1);

  //  Pick a decompressor based on the file name suffix.

  const char *decompressor = NULL;

  if      ((len > 3) && (strcasecmp(filename + len - 3, ".gz")  == 0))
    decompressor = "gzip -dc";
  else if ((len > 4) && (strcasecmp(filename + len - 4, ".bz2") == 0))
    decompressor = "bzip2 -dc";
  else if ((len > 3) && (strcasecmp(filename + len - 3, ".xz")  == 0))
    decompressor = "xz -dc";

  if (decompressor != NULL) {
    //  NOTE(review): the file name is handed to the shell unquoted, so names containing
    //  spaces or shell metacharacters will not behave as expected.

    int  cmdLen = snprintf(cmd, sizeof(cmd), "%s %s", decompressor, filename);

    if ((cmdLen < 0) || ((size_t)cmdLen >= sizeof(cmd)))
      fprintf(stderr, "ERROR:  Failed to open input file '%s': file name too long\n", filename), exit(1);

    _file = popen(cmd, "r");
    _pipe = true;

    //  popen() returns NULL on error.  It does not reliably set errno, so test the pointer,
    //  and do so for every decompressor, not just one of them.

    if (_file == NULL)
      fprintf(stderr, "ERROR:  Failed to open input file '%s': popen() returned NULL\n", filename), exit(1);

  } else if (useStdin) {
    _file = stdin;
    _stdi = true;

  } else {
    errno = 0;
    _file = fopen(filename, "r");
    _pipe = false;

    if (_file == NULL)
      fprintf(stderr, "ERROR:  Failed to open input file '%s': %s\n", filename, strerror(errno)), exit(1);
  }
}
Exemplo n.º 7
0
void
operationBuild(char   *buildName,
               char   *tigName,
               uint32  tigVers) {

  errno = 0;
  FILE *F = fopen(buildName, "r");
  if (errno)
    fprintf(stderr, "Failed to open '%s' for reading: %s\n", buildName, strerror(errno)), exit(1);

  if (AS_UTL_fileExists(tigName, TRUE, TRUE)) {
    fprintf(stderr, "ERROR: '%s' exists, and I will not clobber an existing store.\n", tigName);
    exit(1);
  }

  tgStore *tigStore = new tgStore(tigName);
  tgTig    *tig      = new tgTig();

  for (int32 v=1; v<tigVers; v++)
    tigStore->nextVersion();

  while (tig->loadLayout(F) == true) {
    if (tig->numberOfChildren() == 0)
      continue;

    //  The log isn't correct.  For new tigs (all of these are) we don't know the
    //  id until after it is added.  Further, if these come with id's already set,
    //  they can't be added to a new store -- they don't exist.

#if 0
    fprintf(stderr, "INSERTING tig %d (%d children) (originally ID %d)\n",
            tig->tigID(), tig->numberOfChildren(), oID);
#endif

    tigStore->insertTig(tig, false);
  }

  fclose(F);

  delete tig;
  delete tigStore;
}
Exemplo n.º 8
0
//  Load tig layouts from one or more input files into a tig store.
//
//  For every tig read from the inputs: a tig with children is inserted into the
//  store; a tig with no children is deleted from the store, unless the store
//  already reports it as deleted.  If the tig store does not exist it is first
//  created and advanced to the requested version.
//
//  Exits with status 1 on a usage error or when an input file cannot be opened,
//  and with status 0 otherwise.
int
main (int argc, char **argv) {
  char            *gkpName   = NULL;
  char            *tigName   = NULL;
  int32            tigVers   = -1;
  vector<char *>   tigInputs;
  tgStoreType      tigType   = tgStoreModify;

  argc = AS_configure(argc, argv);

  //  NOTE(review): the options below take argv[++arg] without checking that
  //  another argument is present; an option given last on the line reads past
  //  the supplied arguments.

  int arg=1;
  int err=0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-G") == 0) {
      gkpName = argv[++arg];

    } else if (strcmp(argv[arg], "-T") == 0) {
      //  -T consumes two arguments: the store path, then the version.
      tigName = argv[++arg];
      tigVers = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-L") == 0) {
      AS_UTL_loadFileList(argv[++arg], tigInputs);

    } else if (strcmp(argv[arg], "-n") == 0) {
      tigType = tgStoreReadOnly;

    } else if (AS_UTL_fileExists(argv[arg])) {
      //  Any other word that names an existing file is taken as an input file.
      tigInputs.push_back(argv[arg]);

    } else {
      fprintf(stderr, "%s: unknown option '%s'\n", argv[0], argv[arg]);
      err++;
    }

    arg++;
  }

  //  NOTE(review): the messages below mention a '-R' option, but the parser
  //  above accepts no such option; inputs come from -L or bare file names.

  if ((err) || (gkpName == NULL) || (tigName == NULL) || (tigInputs.size() == 0)) {
    fprintf(stderr, "usage: %s -G <gkpStore> -T <tigStore> <v> [input.cns]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "  -G <gkpStore>         Path to the gatekeeper store\n");
    fprintf(stderr, "  -T <tigStore> <v>     Path to the tigStore and version to add tigs to\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -L <file-of-files>    Load the tig(s) from files listed in 'file-of-files'\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -n                    Don't replace, just report what would have happened\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  The primary operation is to replace tigs in the store with ones in a set of input files.\n");
    fprintf(stderr, "  The input files can be either supplied directly on the command line or listed in\n");
    fprintf(stderr, "  a text file (-L).\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  A new store is created if one doesn't exist, otherwise, whatever tigs are there are\n");
    fprintf(stderr, "  replaced with those in the -R file.  If version 'v' doesn't exist, it is created.\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  Even if -n is supplied, a new store is created if one doesn't exist.\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  To add a new tig, give it a tig id of -1.  New tigs must be added to the latest version.\n");
    fprintf(stderr, "  To delete a tig, remove all children, and set the number of them to zero.\n");
    fprintf(stderr, "\n");

    if (gkpName == NULL)
      fprintf(stderr, "ERROR:  no gatekeeper store (-G) supplied.\n");
    if (tigName == NULL)
      fprintf(stderr, "ERROR:  no tig store (-T) supplied.\n");
    if (tigInputs.size() == 0)
      fprintf(stderr, "ERROR:  no input tigs (-R) supplied.\n");

    exit(1);
  }

  //  If the store doesn't exist, create one, and make a bunch of versions
  //  (nextVersion() is called tigVers-1 times).  The temporary store object is
  //  deleted before the store is opened again below.
  if (AS_UTL_fileExists(tigName, true, false) == false) {
    fprintf(stderr, "Creating tig store '%s' version %d\n", tigName, tigVers);

    tgStore *tigStore = new tgStore(tigName);

    for (int32 vv=1; vv<tigVers; vv++)
      tigStore->nextVersion();

    delete tigStore;
  }

  //  The gatekeeper store is opened here and closed at the end; nothing else in
  //  this function calls it directly.

  gkStore *gkpStore = gkStore::gkStore_open(gkpName);
  tgStore *tigStore = new tgStore(tigName, tigVers, tigType);
  tgTig   *tig      = new tgTig;

  //  Process each input file in turn, reusing the one tgTig object for every
  //  tig loaded.

  for (uint32 ff=0; ff<tigInputs.size(); ff++) {
    errno = 0;
    FILE *TI = fopen(tigInputs[ff], "r");
    if (errno)
      fprintf(stderr, "Failed to open '%s': %s\n", tigInputs[ff], strerror(errno)), exit(1);

    fprintf(stderr, "Reading layouts from '%s'.\n", tigInputs[ff]);

    while (tig->loadFromStreamOrLayout(TI) == true) {

      //  Handle insertion.

      if (tig->numberOfChildren() > 0) {
        //fprintf(stderr, "INSERTING tig %d\n", tig->tigID());
        tigStore->insertTig(tig, false);
        continue;
      }

      //  From here on the tig has no children, which means "delete this tig".

      //  Deleted already?

      if (tigStore->isDeleted(tig->tigID()) == true) {
        //fprintf(stderr, "DELETING tig %d -- ALREADY DELETED\n", tig->tigID());
        continue;
      }

      //  Really delete it then.

      //fprintf(stderr, "DELETING tig %d\n", tig->tigID());
      tigStore->deleteTig(tig->tigID());
    }

    fclose(TI);

    fprintf(stderr, "Reading layouts from '%s' completed.\n", tigInputs[ff]);
  }

  delete tig;
  delete tigStore;

  gkpStore->gkStore_close();

  exit(0);
}
Exemplo n.º 9
0
int
main(int argc, char **argv) {
    char           *ovlName      = NULL;
    uint32          maxJob       = 0;

    bool            deleteIntermediates = true;

    bool            doExplicitTest = false;
    bool            doFixes        = false;

    char            name[FILENAME_MAX];

    argc = AS_configure(argc, argv);

    int err=0;
    int arg=1;
    while (arg < argc) {
        if        (strcmp(argv[arg], "-O") == 0) {
            ovlName = argv[++arg];

        } else if (strcmp(argv[arg], "-F") == 0) {
            maxJob = atoi(argv[++arg]);

        } else if (strcmp(argv[arg], "-f") == 0) {
            doFixes = true;

        } else if (strcmp(argv[arg], "-t") == 0) {
            doExplicitTest = true;
            ovlName = argv[++arg];

        } else if (strcmp(argv[arg], "-nodelete") == 0) {
            deleteIntermediates = false;

        } else {
            fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]);
        }

        arg++;
    }
    if (ovlName == NULL)
        err++;
    if ((maxJob == 0) && (doExplicitTest == false))
        err++;

    if (err) {
        fprintf(stderr, "usage: %s ...\n", argv[0]);
        fprintf(stderr, "  -O x.ovlStore    path to overlap store to build the final index for\n");
        fprintf(stderr, "  -F s             number of slices used in bucketizing/sorting\n");
        fprintf(stderr, "\n");
        fprintf(stderr, "  -t x.ovlStore    explicitly test a previously constructed index\n");
        fprintf(stderr, "  -f               when testing, also create a new 'idx.fixed' which might\n");
        fprintf(stderr, "                   resolve rare problems\n");
        fprintf(stderr, "\n");
        fprintf(stderr, "  -nodelete        do not remove intermediate files when the index is\n");
        fprintf(stderr, "                   successfully created\n");
        fprintf(stderr, "\n");
        fprintf(stderr, "    DANGER    DO NOT USE     DO NOT USE     DO NOT USE    DANGER\n");
        fprintf(stderr, "    DANGER                                                DANGER\n");
        fprintf(stderr, "    DANGER   This command is difficult to run by hand.    DANGER\n");
        fprintf(stderr, "    DANGER          Use ovStoreCreate instead.            DANGER\n");
        fprintf(stderr, "    DANGER                                                DANGER\n");
        fprintf(stderr, "    DANGER    DO NOT USE     DO NOT USE     DO NOT USE    DANGER\n");
        fprintf(stderr, "\n");

        if (ovlName == NULL)
            fprintf(stderr, "ERROR: No overlap store (-O) supplied.\n");
        if ((maxJob == 0) && (doExplicitTest == false))
            fprintf(stderr, "ERROR: One of -F (number of slices) or -t (test a store) must be supplied.\n");

        exit(1);
    }

    //  Do the test, and maybe fix things up.

    if (doExplicitTest == true) {
        bool passed = testIndex(ovlName, doFixes);

        exit((passed == true) ? 0 : 1);
    }

    //  Check that all segments are present.  Every segment should have an info file.

    uint32  cntJob = 0;

    for (uint32 i=1; i<=maxJob; i++) {
        uint32  complete = 0;

        sprintf(name, "%s/%04d", ovlName, i);
        if (AS_UTL_fileExists(name, FALSE, FALSE) == true)
            complete++;
        else
            fprintf(stderr, "ERROR: Segment "F_U32" data not present  (%s)\n", i, name);

        sprintf(name, "%s/%04d.info", ovlName, i);
        if (AS_UTL_fileExists(name, FALSE, FALSE) == true)
            complete++;
        else
            fprintf(stderr, "ERROR: Segment "F_U32" info not present (%s)\n", i, name);

        sprintf(name, "%s/%04d.index", ovlName, i);
        if (AS_UTL_fileExists(name, FALSE, FALSE) == true)
            complete++;
        else
            fprintf(stderr, "ERROR: Segment "F_U32" index not present (%s)\n", i, name);

        if (complete == 3)
            cntJob++;
    }

    if (cntJob != maxJob) {
        fprintf(stderr, "ERROR: Expected "F_U32" segments, only found "F_U32".\n", maxJob, cntJob);
        exit(1);
    }

    //  Merge the stuff.

    mergeInfoFiles(ovlName, maxJob);

    //  Diagnostics.

    if (testIndex(ovlName, false) == false) {
        fprintf(stderr, "ERROR: index failed tests.\n");
        exit(1);
    }

    //  Remove intermediates.  For the buckets, we keep going until there are 10 in a row not present.
    //  During testing, on a microbe using 2850 buckets, some buckets were empty.

    if (deleteIntermediates == false) {
        fprintf(stderr, "\n");
        fprintf(stderr, "Not removing intermediate files.  Finished.\n");
        exit(0);
    }

    fprintf(stderr, "\n");
    fprintf(stderr, "Removing intermediate files.\n");

    //  Removing indices is easy, beacuse we know how many there are.

    for (uint32 i=1; i<=maxJob; i++) {
        sprintf(name, "%s/%04u.index", ovlName, i);
        AS_UTL_unlink(name);
        sprintf(name, "%s/%04u.info",  ovlName, i);
        AS_UTL_unlink(name);
    }

    //  We don't know how many buckets there are, so we remove until we fail to find ten
    //  buckets in a row.

    for (uint32 missing=0, i=1; missing<10; i++) {
        sprintf(name, "%s/bucket%04d", ovlName, i);

        if (AS_UTL_fileExists(name, TRUE, FALSE) == FALSE) {
            missing++;
            continue;
        }

        missing = 0;

        sprintf(name, "%s/bucket%04d/sliceSizes", ovlName, i);
        AS_UTL_unlink(name);

        sprintf(name, "%s/bucket%04d", ovlName, i);
        rmdir(name);
    }

    fprintf(stderr, "Finished.\n");

    exit(0);
}
Exemplo n.º 10
0
//  Store 'evaluesLen' evalues, taken from 'evalues', for the overlaps of reads
//  bgnID..endID into the store's 'evalues' file.
//
//  The file holds one uint16 per overlap in the store.  An existing file of the
//  wrong size is removed, a missing file is created filled with zeros, and the
//  file is then memory mapped read/write (replacing any read-only mapping).
//  The values are written starting at the overlap ID that setRange() leaves in
//  _offt.  The mapping is left open for further updates.
void
ovStore::addEvalues(uint32 bgnID, uint32 endID, uint16 *evalues, uint64 evaluesLen) {

  char  name[FILENAME_MAX];
  snprintf(name, FILENAME_MAX, "%s/evalues", _storePath);

  //  If we have an opened memory mapped file, and it isn't open for writing, close it.

  if ((_evaluesMap) && (_evaluesMap->type() == memoryMappedFile_readOnly)) {
    fprintf(stderr, "WARNING: closing read-only evalues file.\n");
    delete _evaluesMap;

    _evaluesMap = NULL;
    _evalues    = NULL;
  }

  //  Remove a bogus evalues file if one exists.

  if ((AS_UTL_fileExists(name) == true) &&
      (AS_UTL_sizeOfFile(name) != (sizeof(uint16) * _info._numOverlapsTotal))) {
    fprintf(stderr, "WARNING: existing evalues file is incorrect size: should be " F_U64 " bytes, is " F_U64 " bytes.  Removing.\n",
            (sizeof(uint16) * _info._numOverlapsTotal), AS_UTL_sizeOfFile(name));
    AS_UTL_unlink(name);
  }

  //  Make a new evalues file if one doesn't exist.

  if (AS_UTL_fileExists(name) == false) {
    fprintf(stderr, "Creating evalues file for " F_U64 " overlaps.\r", _info._numOverlapsTotal);

    FILE *F = fopen(name, "w");
    if (F == NULL)
      fprintf(stderr, "Failed to make evalues file '%s': %s\n", name, strerror(errno)), exit(1);

    //  Write zeros in blocks of at most 'blockLen' values.

    const uint64   blockLen = 1048576;
    uint16        *Z        = new uint16 [blockLen];
    uint64         Zn       = 0;

    memset(Z, 0, sizeof(uint16) * blockLen);

    while (Zn < _info._numOverlapsTotal) {
      uint64  S = (Zn + blockLen < _info._numOverlapsTotal) ? blockLen : _info._numOverlapsTotal - Zn;

      AS_UTL_safeWrite(F, Z, "zero evalues", sizeof(uint16), S);

      Zn += S;

      fprintf(stderr, "Creating evalues file for " F_U64 " overlaps....%07.3f%%\r",
              _info._numOverlapsTotal, 100.0 * Zn / _info._numOverlapsTotal);
    }

    fprintf(stderr, "Creating evalues file for " F_U64 " overlaps....%07.3f%%\n",
            _info._numOverlapsTotal, 100.0 * Zn / _info._numOverlapsTotal);

    //  The zero block is only needed while creating the file.

    delete [] Z;

    fclose(F);
  }

  //  Open the evalues file if it isn't already opened

  if (_evalues == NULL) {
    _evaluesMap = new memoryMappedFile(name, memoryMappedFile_readWrite);
    _evalues    = (uint16 *)_evaluesMap->get(0);
  }

  //  Figure out the overlap ID for the first overlap associated with bgnID

  setRange(bgnID, endID);

  //  Load the evalues from 'evalues'

  for (uint64 ii=0; ii<evaluesLen; ii++)
    _evalues[_offt._overlapID + ii] = evalues[ii];

  //  That's it.  Deleting the ovStore object will close the memoryMappedFile.  It's left open
  //  for more updates.
}
Exemplo n.º 11
0
void
ovStore::ovStore_read(void) {
  char  name[FILENAME_MAX];

  sprintf(name, "%s/info", _storePath);
  errno = 0;
  FILE *ovsinfo = fopen(name, "r");
  if (errno)
    fprintf(stderr, "ERROR: directory '%s' is not an ovelrapStore; failed to open info file '%s': %s\n",
            _storePath, name, strerror(errno)), exit(1);

  AS_UTL_safeRead(ovsinfo, &_info, "ovStore::ovStore::info", sizeof(ovStoreInfo), 1);

  fclose(ovsinfo);

  if ((_info._ovsMagic != ovStoreMagic) && (_info._ovsMagic != ovStoreMagicIncomplete))
    fprintf(stderr, "ERROR:  directory '%s' is not an overlapStore; magic number 0x%016"F_X64P" incorrect.\n",
            _storePath, _info._ovsMagic), exit(1);

  if ((_info._ovsMagic != ovStoreMagic) && (_info._ovsMagic != ovStoreMagicIncomplete))
    fprintf(stderr, "ERROR:  overlapStore '%s' is incomplate; creation crashed?\n",
            _storePath), exit(1);

  if (_info._ovsVersion != ovStoreVersion)
    fprintf(stderr, "ERROR:  overlapStore '%s' is version "F_U64"; this code supports only version "F_U64".\n",
            _storePath, _info._ovsVersion, ovStoreVersion), exit(1);

  if (_info._maxReadLenInBits != AS_MAX_READLEN_BITS)
    fprintf(stderr, "ERROR:  overlapStore '%s' is for AS_MAX_READLEN_BITS="F_U64"; this code supports only %d bits.\n",
            _storePath, _info._maxReadLenInBits, AS_MAX_READLEN_BITS), exit(1);

  //  Load stats

#if 0
  sprintf(name, "%s/statistics", _storePath);
  errno = 0;
  FILE *ost = fopen(name, "r");
  if (errno)
    fprintf(stderr, "failed to open the stats file '%s': %s\n", name, strerror(errno)), exit(1);
  AS_UTL_safeRead(ost, &_stats, "ovStore::ovStore::stats", sizeof(OverlapStoreStats), 1);
  fclose(ost);
#endif

  //  Open the index

  sprintf(name, "%s/index", _storePath);

  errno = 0;
  _offtFile = fopen(name, "r");
  if (errno)
    fprintf(stderr, "ERROR:  failed to open offset file '%s': %s\n", name, strerror(errno)), exit(1);

  //  Open erates

  sprintf(name, "%s/evalues", _storePath);

  if (AS_UTL_fileExists(name)) {
    _evaluesMap  = new memoryMappedFile(name, memoryMappedFile_readOnly);
    _evalues     = (uint16 *)_evaluesMap->get(0);
  }

  //_offtMMap   = new memoryMappedFile(name, memoryMappedFile_readOnly);
  //_offts      = (ovStoreOfft *)_offtMMap->get(0);
  //_offtLength = _offtMap->length() / sizeof(ovStoreOfft);
}
Exemplo n.º 12
0
//  Split the store's reads into partitions, writing the result under
//  '<_storePath>/partitions'.
//
//  partitionMap is indexed by read ID (1..gkStore_getNumReads()); each entry is
//  the partition the read belongs to, or UINT32_MAX for a read that is not put
//  in any partition.  Entry 0 must be UINT32_MAX, and no read may be assigned
//  partition 0.
//
//  For each partition i in 1..maxPartition two files are written, 'blobs.NNNN'
//  and 'reads.NNNN'.  A 'map' file is also written containing, in order:
//  maxPartition, the number of reads in each partition, partitionMap, and the
//  index of each read within its partition's reads file.
//
//  The store must be open read-only and must not already be partitioned (both
//  asserted).  Failure to open any output file is fatal.
void
gkStore::gkStore_buildPartitions(uint32 *partitionMap) {
  char              name[FILENAME_MAX];

  //  Store cannot be partitioned already, and it must be readOnly (for safety) as we don't need to
  //  be changing any of the normal store data.

  assert(_numberOfPartitions == 0);
  assert(_mode               == gkStore_readOnly);

  //  Figure out what the last partition is

  uint32  maxPartition = 0;
  uint32  unPartitioned = 0;

  assert(partitionMap[0] == UINT32_MAX);

  for (uint32 fi=1; fi<=gkStore_getNumReads(); fi++) {
    if (partitionMap[fi] == UINT32_MAX)
      unPartitioned++;

    else if (maxPartition < partitionMap[fi])
      maxPartition = partitionMap[fi];
  }

  fprintf(stderr, "Found "F_U32" unpartitioned reads and maximum partition of "F_U32"\n",
          unPartitioned, maxPartition);

  //  Create the partitions by opening N copies of the data stores,
  //  and writing data to each.
  //
  //  The per-partition arrays are indexed by partition number; slot 0 is never
  //  a real partition and is filled with placeholder values below.

  FILE         **blobfiles    = new FILE * [maxPartition + 1];
  uint64        *blobfileslen = new uint64 [maxPartition + 1];            //  Offset, in bytes, into the blobs file
  FILE         **readfiles    = new FILE * [maxPartition + 1];
  uint32        *readfileslen = new uint32 [maxPartition + 1];            //  aka _readsPerPartition
  uint32        *readIDmap    = new uint32 [gkStore_getNumReads() + 1];   //  aka _readIDtoPartitionIdx

  //  Be nice and put all the partitions in a subdirectory.

  sprintf(name,"%s/partitions", _storePath);

  if (AS_UTL_fileExists(name, true, true) == false)
    AS_UTL_mkdir(name);

  //  Open all the output files -- fail early if we can't open that many files.

  blobfiles[0]    = NULL;
  blobfileslen[0] = UINT64_MAX;
  readfiles[0]    = NULL;
  readfileslen[0] = UINT32_MAX;

  for (uint32 i=1; i<=maxPartition; i++) {
    sprintf(name,"%s/partitions/blobs.%04d", _storePath, i);

    errno = 0;
    blobfiles[i]    = fopen(name, "w");
    blobfileslen[i] = 0;

    if (errno)
      fprintf(stderr, "gkStore::gkStore_buildPartitions()-- ERROR: failed to open partition %u file '%s' for write: %s\n",
              i, name, strerror(errno)), exit(1);

    sprintf(name,"%s/partitions/reads.%04d", _storePath, i);

    errno = 0;
    readfiles[i]    = fopen(name, "w");
    readfileslen[i] = 0;

    if (errno)
      fprintf(stderr, "gkStore::gkStore_buildPartitions()-- ERROR: failed to open partition %u file '%s' for write: %s\n",
              i, name, strerror(errno)), exit(1);
  }

  //  Open the output partition map file -- we might as well fail early if we can't make it also.

  sprintf(name,"%s/partitions/map", _storePath);

  errno = 0;
  FILE *rIDmF = fopen(name, "w");
  if (errno)
    fprintf(stderr, "gkStore::gkStore_buildPartitions()-- ERROR: failed to open partition map file '%s': %s\n",
            name, strerror(errno)), exit(1);

  //  Copy the blob from the master file to the partitioned file, update pointers.

  readIDmap[0] = UINT32_MAX;    //  There isn't a zeroth read, make it bogus.

  //  NOTE(review): readIDmap entries for reads skipped below (partition
  //  UINT32_MAX) are never assigned, yet the whole array is written to the map
  //  file further down -- confirm readers never look at those entries.

  for (uint32 fi=1; fi<=gkStore_getNumReads(); fi++) {
    uint32  pi = partitionMap[fi];

    assert(pi != 0);  //  No zeroth partition, right?

    if (pi == UINT32_MAX)
      //  Deleted reads are not assigned a partition; skip them
      continue;

    //  Make a copy of the read, then modify it for the partition, then write it to the partition.
    //  Without the copy, we'd need to update the master record too.

    gkRead  partRead = _reads[fi];  //*gkStore_getRead(fi);

    partRead.gkRead_copyDataToPartition(_blobs, blobfiles, blobfileslen, pi);

    //  Per-read log line; always enabled.
#if 1
    fprintf(stderr, "read "F_U32"="F_U32" len "F_U32" -- blob master "F_U64" -- to part "F_U32" new read id "F_U32" blob "F_U64"/"F_U64" -- at readIdx "F_U32"\n",
            fi, _reads[fi].gkRead_readID(), _reads[fi].gkRead_sequenceLength(),
            _reads[fi]._mPtr,
            pi,
            partRead.gkRead_readID(), partRead._pID, partRead._mPtr,
            readfileslen[pi]);
#endif

    AS_UTL_safeWrite(readfiles[pi], &partRead, "gkStore::gkStore_buildPartitions::read", sizeof(gkRead), 1);

    //  Record where in the partition's reads file this read landed, then count it.
    readIDmap[fi] = readfileslen[pi]++;
  }

  //  There isn't a zeroth read.
  //
  //  Write the map file.  Each array includes its unused slot 0.

  AS_UTL_safeWrite(rIDmF, &maxPartition,  "gkStore::gkStore_buildPartitions::maxPartition", sizeof(uint32), 1);
  AS_UTL_safeWrite(rIDmF,  readfileslen,  "gkStore::gkStore_buildPartitions::readfileslen", sizeof(uint32), maxPartition + 1);
  AS_UTL_safeWrite(rIDmF,  partitionMap,  "gkStore::gkStore_buildPartitions::partitionMap", sizeof(uint32), gkStore_getNumReads() + 1);
  AS_UTL_safeWrite(rIDmF,  readIDmap,     "gkStore::gkStore_buildPartitions::readIDmap",    sizeof(uint32), gkStore_getNumReads() + 1);

  //  cleanup -- close all the files, delete storage

  fclose(rIDmF);

  for (uint32 i=1; i<=maxPartition; i++) {
    fprintf(stderr, "partition "F_U32" has "F_U32" reads\n", i, readfileslen[i]);

    //  A failure while closing is only reported as a warning.

    errno = 0;

    fclose(blobfiles[i]);
    fclose(readfiles[i]);

    if (errno)
      fprintf(stderr, "  warning: %s\n", strerror(errno));
  }

  delete [] readIDmap;
  delete [] readfileslen;
  delete [] readfiles;
  delete [] blobfileslen;
  delete [] blobfiles;
}
Exemplo n.º 13
0
//  Open the gatekeeper store rooted at 'path'.
//
//  The combination of 'mode' and 'partID' selects how the store data is made available:
//
//    gkStore_readOnly, partID == UINT32_MAX  -- libraries, reads and blobs are mapped read-only.
//    gkStore_modify,   partID == UINT32_MAX  -- libraries, reads and blobs are mapped read-write;
//                                               the store must already exist.
//    gkStore_extend,   partID == UINT32_MAX  -- the store directory is created if missing, existing
//                                               libraries and reads are copied into heap arrays
//                                               with room to grow, and the blobs file is opened
//                                               with "a+".
//    gkStore_readOnly, partID != UINT32_MAX  -- the partition map is loaded, and the reads and
//                                               blobs files of partition 'partID' are mapped
//                                               read-only.
//    gkStore_infoOnly                        -- only the info block is loaded.
//
//  Any other combination is reported and is fatal.  Failing to open a file that is needed, or
//  finding a store built with different compile time sizes, prints a message and exits.
//
gkStore::gkStore(char const *path, gkStore_mode mode, uint32 partID) {
  char    name[FILENAME_MAX];

  //  Every file name built below is 'path' plus a short suffix (the longest is
  //  "/partitions/reads." followed by up to ten digits).  Reject paths that would overflow the
  //  FILENAME_MAX sized buffers rather than writing past their end; the strcpy() and sprintf()
  //  calls in this function rely on this check.

  if (strlen(path) + 32 > (size_t)FILENAME_MAX)
    fprintf(stderr, "gkStore()--  store path '%s' is too long.\n", path), exit(1);

  memset(_storePath, 0, sizeof(char) * FILENAME_MAX);
  memset(_storeName, 0, sizeof(char) * FILENAME_MAX);

  strcpy(_storePath, path);
  strcpy(_storeName, path);  //  Flagged 'Broken.' by the original author; the name is simply the full path.

  sprintf(name, "%s/info", _storePath);

  //  If the info file exists, load it.  Otherwise _info keeps whatever values it was constructed
  //  with, and those are what the checks below compare against.

  if (AS_UTL_fileExists(name, false, false) == true) {
    errno = 0;
    FILE *I = fopen(name, "r");
    if (I == NULL)
      fprintf(stderr, "gkStore()--  failed to open '%s' for reading: %s\n",
              name, strerror(errno)), exit(1);

    AS_UTL_safeRead(I, &_info, "gkStore::_info", sizeof(gkStoreInfo), 1);
    fclose(I);
  }

  //  Check sizes are correct.  sizeof() yields a size_t, which does not match %u on LP64
  //  platforms, so it is narrowed explicitly.

  uint32  failed = 0;

  if (_info.gkLibrarySize      != sizeof(gkLibrary))
    failed += fprintf(stderr, "ERROR:  gkLibrary size in store = %u, differs from executable = %u\n",
                      _info.gkLibrarySize, (uint32)sizeof(gkLibrary));

  if (_info.gkReadSize         != sizeof(gkRead))
    failed += fprintf(stderr, "ERROR:  gkRead size in store = %u, differs from executable = %u\n",
                      _info.gkReadSize, (uint32)sizeof(gkRead));

  if (_info.gkMaxLibrariesBits != AS_MAX_LIBRARIES_BITS)
    failed += fprintf(stderr, "ERROR:  AS_MAX_LIBRARIES_BITS in store = %u, differs from executable = %u\n",
                      _info.gkMaxLibrariesBits, AS_MAX_LIBRARIES_BITS);

  if (_info.gkLibraryNameSize  != LIBRARY_NAME_SIZE)
    failed += fprintf(stderr, "ERROR:  LIBRARY_NAME_SIZE in store = %u, differs from executable = %u\n",
                      _info.gkLibraryNameSize, LIBRARY_NAME_SIZE);

  if (_info.gkMaxReadBits      != AS_MAX_READS_BITS)
    failed += fprintf(stderr, "ERROR:  AS_MAX_READS_BITS in store = %u, differs from executable = %u\n",
                      _info.gkMaxReadBits, AS_MAX_READS_BITS);

  if (_info.gkMaxReadLenBits   != AS_MAX_READLEN_BITS)
    failed += fprintf(stderr, "ERROR:  AS_MAX_READLEN_BITS in store = %u, differs from executable = %u\n",
                      _info.gkMaxReadLenBits, AS_MAX_READLEN_BITS);

  if (failed)
    fprintf(stderr, "ERROR:\nERROR:  Can't open store '%s': parameters in src/AS_global.H are incompatible with the store.\n", _storePath), exit(1);

  //  Backstop for the (unlikely) case where a mismatch was found but nothing could be printed.

  assert(_info.gkLibrarySize      == sizeof(gkLibrary));
  assert(_info.gkReadSize         == sizeof(gkRead));

  assert(_info.gkMaxLibrariesBits == AS_MAX_LIBRARIES_BITS);
  assert(_info.gkLibraryNameSize  == LIBRARY_NAME_SIZE);
  assert(_info.gkMaxReadBits      == AS_MAX_READS_BITS);
  assert(_info.gkMaxReadLenBits   == AS_MAX_READLEN_BITS);

  //  Clear ourself, to make valgrind happier.

  _librariesMMap          = NULL;
  _librariesAlloc         = 0;
  _libraries              = NULL;

  _readsMMap              = NULL;
  _readsAlloc             = 0;
  _reads                  = NULL;

  _blobsMMap              = NULL;
  _blobs                  = NULL;
  _blobsFile              = NULL;

  _mode                   = mode;

  _numberOfPartitions     = 0;
  _partitionID            = 0;
  _readIDtoPartitionIdx   = NULL;
  _readIDtoPartitionID    = NULL;
  _readsPerPartition      = NULL;
  //_readsInThisPartition   = NULL;

  //
  //  READ ONLY
  //

  if ((mode == gkStore_readOnly) &&
      (partID == UINT32_MAX)) {
    //fprintf(stderr, "gkStore()--  opening '%s' for read-only access.\n", _storePath);

    if (AS_UTL_fileExists(_storePath, true, false) == false) {
      fprintf(stderr, "gkStore()--  failed to open '%s' for read-only access: store doesn't exist.\n", _storePath);
      exit(1);
    }

    sprintf(name, "%s/libraries", _storePath);
    _librariesMMap = new memoryMappedFile (name, memoryMappedFile_readOnly);
    _libraries     = (gkLibrary *)_librariesMMap->get(0);

    sprintf(name, "%s/reads", _storePath);
    _readsMMap     = new memoryMappedFile (name, memoryMappedFile_readOnly);
    _reads         = (gkRead *)_readsMMap->get(0);

    sprintf(name, "%s/blobs", _storePath);
    _blobsMMap     = new memoryMappedFile (name, memoryMappedFile_readOnly);
    _blobs         = (void *)_blobsMMap->get(0);
  }

  //
  //  MODIFY, NO APPEND (also for building a partitioned store)
  //

  else if ((mode == gkStore_modify) &&
           (partID == UINT32_MAX)) {
    //fprintf(stderr, "gkStore()--  opening '%s' for read-write access.\n", _storePath);

    if (AS_UTL_fileExists(_storePath, true, false) == false) {
      fprintf(stderr, "gkStore()--  failed to open '%s' for read-write access: store doesn't exist.\n", _storePath);
      exit(1);
    }

    sprintf(name, "%s/libraries", _storePath);
    _librariesMMap = new memoryMappedFile (name, memoryMappedFile_readWrite);
    _libraries     = (gkLibrary *)_librariesMMap->get(0);

    sprintf(name, "%s/reads", _storePath);
    _readsMMap     = new memoryMappedFile (name, memoryMappedFile_readWrite);
    _reads         = (gkRead *)_readsMMap->get(0);

    sprintf(name, "%s/blobs", _storePath);
    _blobsMMap     = new memoryMappedFile (name, memoryMappedFile_readWrite);
    _blobs         = (void *)_blobsMMap->get(0);
  }

  //
  //  MODIFY, APPEND, open mmap'd files, but copy them entirely to local memory
  //

  else if ((mode == gkStore_extend) &&
           (partID == UINT32_MAX)) {
    //fprintf(stderr, "gkStore()--  opening '%s' for read-write and append access.\n", _storePath);

    if (AS_UTL_fileExists(_storePath, true, true) == false)
      AS_UTL_mkdir(_storePath);

    //  Allocate at least twice the current count, so the (count + 1) elements copied below
    //  (element zero is included) always fit.

    _librariesAlloc = MAX(64, 2 * _info.numLibraries);
    _libraries      = new gkLibrary [_librariesAlloc];

    sprintf(name, "%s/libraries", _storePath);
    if (AS_UTL_fileExists(name, false, false) == true) {
      _librariesMMap  = new memoryMappedFile (name, memoryMappedFile_readOnly);

      memcpy(_libraries, _librariesMMap->get(0), sizeof(gkLibrary) * (_info.numLibraries + 1));

      delete _librariesMMap;
      _librariesMMap = NULL;
    }

    _readsAlloc     = MAX(128, 2 * _info.numReads);
    _reads          = new gkRead [_readsAlloc];

    sprintf(name, "%s/reads", _storePath);
    if (AS_UTL_fileExists(name, false, false) == true) {
      _readsMMap      = new memoryMappedFile (name, memoryMappedFile_readOnly);

      memcpy(_reads, _readsMMap->get(0), sizeof(gkRead) * (_info.numReads + 1));

      delete _readsMMap;
      _readsMMap = NULL;
    }

    sprintf(name, "%s/blobs", _storePath);

    _blobsMMap     = NULL;
    _blobs         = NULL;

    //  Test the returned pointer; errno is only meaningful once fopen() has failed.

    errno = 0;
    _blobsFile     = fopen(name, "a+");
    if (_blobsFile == NULL)
      fprintf(stderr, "gkStore()--  Failed to open blobs file '%s' for appending: %s\n",
              name, strerror(errno)), exit(1);
  }

  //
  //  PARTITIONED, no modifications, no appends
  //
  //  BIG QUESTION: do we want to partition the read metadata too, or is it small enough
  //  to load in every job?  For now, we load all the metadata.

  else if ((mode == gkStore_readOnly) &&
           (partID != UINT32_MAX)) {
    //fprintf(stderr, "gkStore()--  opening '%s' partition '%u' for read-only access.\n", _storePath, partID);

    //  For partitioned reads, we need to have a uint32 map of readID to partitionReadID so we can
    //  lookup the metadata in the partitoned _reads data.  This is 4 bytes per read, compared to 24
    //  bytes for the full meta data.  Assuming 100x of 3kb read coverage on human, that's 100
    //  million reads, so 0.400 GB vs 2.4 GB.

    sprintf(name, "%s/partitions/map", _storePath);

    errno = 0;
    FILE *F = fopen(name, "r");
    if (F == NULL)
      fprintf(stderr, "gkStore::gkStore()-- failed to open '%s' for reading: %s\n",
              name, strerror(errno)), exit(1);

    //  The map holds, in order: the number of partitions, the number of reads in each partition,
    //  the partition each read is in, and the index of each read within its partition.

    AS_UTL_safeRead(F, &_numberOfPartitions, "gkStore::_numberOfPartitions", sizeof(uint32), 1);

    _partitionID            = partID;
    _readsPerPartition      = new uint32 [_numberOfPartitions   + 1];  //  No zeroth element in any of these
    _readIDtoPartitionID    = new uint32 [gkStore_getNumReads() + 1];
    _readIDtoPartitionIdx   = new uint32 [gkStore_getNumReads() + 1];

    AS_UTL_safeRead(F, _readsPerPartition,    "gkStore::_readsPerPartition",    sizeof(uint32), _numberOfPartitions   + 1);
    AS_UTL_safeRead(F, _readIDtoPartitionID,  "gkStore::_readIDtoPartitionID",  sizeof(uint32), gkStore_getNumReads() + 1);
    AS_UTL_safeRead(F, _readIDtoPartitionIdx, "gkStore::_readIDtoPartitionIdx", sizeof(uint32), gkStore_getNumReads() + 1);

    fclose(F);

    sprintf(name, "%s/libraries", _storePath);
    _librariesMMap = new memoryMappedFile (name, memoryMappedFile_readOnly);
    _libraries     = (gkLibrary *)_librariesMMap->get(0);
    //fprintf(stderr, " -- openend '%s' at "F_X64"\n", name, _libraries);

    sprintf(name, "%s/partitions/reads.%04"F_U32P"", _storePath, partID);
    _readsMMap     = new memoryMappedFile (name, memoryMappedFile_readOnly);
    _reads         = (gkRead *)_readsMMap->get(0);
    //fprintf(stderr, " -- openend '%s' at "F_X64"\n", name, _reads);

    sprintf(name, "%s/partitions/blobs.%04"F_U32P"", _storePath, partID);
    _blobsMMap     = new memoryMappedFile (name, memoryMappedFile_readOnly);
    _blobs         = (void *)_blobsMMap->get(0);
    //fprintf(stderr, " -- openend '%s' at "F_X64"\n", name, _blobs);
  }

  //  Info only, no access to reads or libraries.

  else if (mode == gkStore_infoOnly) {
    //fprintf(stderr, "gkStore()--  opening '%s' for info-only access.\n", _storePath);
  }

  else {
    fprintf(stderr, "gkStore::gkStore()-- invalid mode '%s' with partition ID %u.\n",
            toString(mode), partID);
    assert(0);
    exit(1);  //  Still fatal when assert() is compiled out.
  }
}
Exemplo n.º 14
0
int
main(int argc, char **argv) {
  char            *gkpStoreName      = NULL;
  char            *outPrefix         = NULL;

  uint32           minReadLength     = 0;

  uint32           firstFileArg      = 0;

  char             errorLogName[FILENAME_MAX];
  char             htmlLogName[FILENAME_MAX];
  char             nameMapName[FILENAME_MAX];

  argc = AS_configure(argc, argv);

  int arg = 1;
  int err = 0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-o") == 0) {
      gkpStoreName = argv[++arg];

    } else if (strcmp(argv[arg], "-minlength") == 0) {
      minReadLength = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "--") == 0) {
      firstFileArg = arg++;
      break;

    } else if (argv[arg][0] == '-') {
      fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]);
      err++;

    } else {
      firstFileArg = arg;
      break;
    }
    arg++;
  }

  if (gkpStoreName == NULL)
    err++;
  if (firstFileArg == 0)
    err++;

  if (err) {
    fprintf(stderr, "usage: %s [...] -o gkpStore\n", argv[0]);
    fprintf(stderr, "  -o gkpStore         create this gkpStore\n");
    fprintf(stderr, "  \n");
    fprintf(stderr, "  -minlength L        discard reads shorter than L\n");
    fprintf(stderr, "  \n");
    fprintf(stderr, "  \n");

    if (gkpStoreName == NULL)
      fprintf(stderr, "ERROR: no gkpStore (-g) supplied.\n");
    if (firstFileArg == 0)
      fprintf(stderr, "ERROR: no input files supplied.\n");

    exit(1);
  }


  gkStore     *gkpStore     = gkStore::gkStore_open(gkpStoreName, gkStore_extend);
  gkRead      *gkpRead      = NULL;
  gkLibrary   *gkpLibrary   = NULL;
  uint32       gkpFileID    = 0;      //  Used for HTML output, an ID for each file loaded.

  uint32       inLineLen    = 1024;
  char         inLine[1024] = { 0 };

  validSeq['a'] = validSeq['c'] = validSeq['g'] = validSeq['t'] = validSeq['n'] = 1;
  validSeq['A'] = validSeq['C'] = validSeq['G'] = validSeq['T'] = validSeq['N'] = 1;

  errno = 0;

  sprintf(errorLogName, "%s/errorLog",    gkpStoreName);
  FILE    *errorLog = fopen(errorLogName, "w");
  if (errno)
    fprintf(stderr, "ERROR:  cannot open error file '%s': %s\n", errorLogName, strerror(errno)), exit(1);

  sprintf(htmlLogName,   "%s/load.dat", gkpStoreName);
  FILE    *htmlLog   = fopen(htmlLogName,   "w");
  if (errno)
    fprintf(stderr, "ERROR:  cannot open uid map file '%s': %s\n", htmlLogName, strerror(errno)), exit(1);

  sprintf(nameMapName,   "%s/readNames.txt", gkpStoreName);
  FILE    *nameMap   = fopen(nameMapName,   "w");
  if (errno)
    fprintf(stderr, "ERROR:  cannot open uid map file '%s': %s\n", nameMapName, strerror(errno)), exit(1);

  uint32  nERROR   = 0;  //  There aren't any errors, we just exit fatally if encountered.
  uint32  nWARNS   = 0;

  uint32  nLOADED  = 0;  //  Reads loaded
  uint64  bLOADED  = 0;  //  Bases loaded

  uint32  nSKIPPED = 0;
  uint64  bSKIPPED = 0;  //  Bases not loaded, too short

#if 0
  fprintf(htmlLog, "<!DOCTYPE html>\n");
  fprintf(htmlLog, "<html>\n");
  fprintf(htmlLog, "<head>\n");
  fprintf(htmlLog, "<title>gatekeeper load statistics</title>\n");
  fprintf(htmlLog, "<style type='text/css'>\n");
  fprintf(htmlLog, "body       { font-family: Helvetica, Verdana, sans-serif; }\n");
  fprintf(htmlLog, "h1, h2     { color: #ee3e80; }\n");
  fprintf(htmlLog, "p          { color: #665544; }\n");
  fprintf(htmlLog, "th, td     { border: 1px solid #111111; padding: 2px 2px 2px 2px; }\n");
  fprintf(htmlLog, "td:hover   { background-color: #e4e4e4; }\n");
  fprintf(htmlLog, "th:hover   { background-color: #d4d4d4; }\n");
  fprintf(htmlLog, "tr.details { visibility: collapse; }\n");
  fprintf(htmlLog, "</style>\n");
  fprintf(htmlLog, "</head>\n");
  fprintf(htmlLog, "<body>\n");
  fprintf(htmlLog, "<h2>Input Files</h2>\n");
  fprintf(htmlLog, "<table>\n");
#endif

  for (; firstFileArg < argc; firstFileArg++) {
    fprintf(stderr, "\n");
    fprintf(stderr, "Starting file '%s'.\n", argv[firstFileArg]);

    compressedFileReader *inFile = new compressedFileReader(argv[firstFileArg]);
    char                 *line   = new char [10240];
    KeyAndValue           keyval;

    while (fgets(line, 10240, inFile->file()) != NULL) {
      chomp(line);
      keyval.find(line);

      if (keyval.key() == NULL) {
        //  No key, so must be a comment or blank line
        continue;
      }

      if (strcasecmp(keyval.key(), "name") == 0) {
        gkpLibrary = gkpStore->gkStore_addEmptyLibrary(keyval.value());
        continue;
      }

      //  We'd better have a gkpLibrary defined, if not, the .gkp input file is incorrect.
      if (gkpLibrary == NULL) {
        fprintf(stderr, "WARNING: no 'name' tag in gkp input; creating library with name 'DEFAULT'.\n");
        gkpLibrary = gkpStore->gkStore_addEmptyLibrary(keyval.value());
        nWARNS++;
      }

      if        (strcasecmp(keyval.key(), "preset") == 0) {
        gkpLibrary->gkLibrary_parsePreset(keyval.value());

      } else if (strcasecmp(keyval.key(), "qv") == 0) {
        gkpLibrary->gkLibrary_setDefaultQV(keyval.value_double());

      } else if (strcasecmp(keyval.key(), "isNonRandom") == 0) {
        gkpLibrary->gkLibrary_setIsNonRandom(keyval.value_bool());

      } else if (strcasecmp(keyval.key(), "trustHomopolymerRuns") == 0) {
        gkpLibrary->gkLibrary_setTrustHomopolymerRuns(keyval.value_bool());

      } else if (strcasecmp(keyval.key(), "removeDuplicateReads") == 0) {
        gkpLibrary->gkLibrary_setRemoveDuplicateReads(keyval.value_bool());

      } else if (strcasecmp(keyval.key(), "finalTrim") == 0) {
        gkpLibrary->gkLibrary_setFinalTrim(keyval.value());

      } else if (strcasecmp(keyval.key(), "removeSpurReads") == 0) {
        gkpLibrary->gkLibrary_setRemoveSpurReads(keyval.value_bool());

      } else if (strcasecmp(keyval.key(), "removeChimericReads") == 0) {
        gkpLibrary->gkLibrary_setRemoveChimericReads(keyval.value_bool());

      } else if (strcasecmp(keyval.key(), "checkForSubReads") == 0) {
        gkpLibrary->gkLibrary_setCheckForSubReads(keyval.value_bool());

      } else if (AS_UTL_fileExists(keyval.key(), false, false)) {
        loadReads(gkpStore,
                  gkpLibrary,
                  gkpFileID++,
                  minReadLength,
                  nameMap,
                  htmlLog,
                  errorLog,
                  keyval.key(),
                  nWARNS, nLOADED, bLOADED, nSKIPPED, bSKIPPED);

      } else {
        fprintf(stderr, "ERROR:  option '%s' not recognized, and not a file of reads.\n", line);
        exit(1);
      }
    }

    delete    inFile;
    delete [] line;
  }

#if 0
  fprintf(htmlLog, "</table>\n");
#endif

  gkpStore->gkStore_close();

  fclose(nameMap);
  fclose(errorLog);

  fprintf(stderr, "\n");
  fprintf(stderr, "Finished with:\n");
  fprintf(stderr, "  "F_U32" warnings (bad base or qv)\n", nWARNS);
  fprintf(stderr, "\n");
  fprintf(stderr, "Read from inputs:\n");
  fprintf(stderr, "  "F_U64" bp.\n",    bLOADED);
  fprintf(stderr, "  "F_U32" reads.\n", nLOADED);
  fprintf(stderr, "\n");
  fprintf(stderr, "Loaded into store:\n");
  fprintf(stderr, "  "F_U64" bp.\n",    bLOADED);
  fprintf(stderr, "  "F_U32" reads.\n", nLOADED);
  fprintf(stderr, "\n");
  fprintf(stderr, "Skipped (too short):\n");
  fprintf(stderr, "  "F_U64" bp (%.4f%%).\n",    bSKIPPED, 100.0 * bSKIPPED / (bSKIPPED + bLOADED));
  fprintf(stderr, "  "F_U32" reads (%.4f%%).\n", nSKIPPED, 100.0 * nSKIPPED / (nSKIPPED + nLOADED));
  fprintf(stderr, "\n");
  fprintf(stderr, "\n");

#if 0
  fprintf(htmlLog, "\n");
  fprintf(htmlLog, "<h2>Final Store</h2>\n");
  fprintf(htmlLog, "<table>\n");
  fprintf(htmlLog, "<tr><td colspan='2'>%s</td></tr>\n", gkpStoreName);
  fprintf(htmlLog, "<tr><td>readsLoaded</td><td>"F_U32" reads ("F_U64" bp)</td></tr>\n", nLOADED, bLOADED);
  fprintf(htmlLog, "<tr><td>readsSkipped</td><td>"F_U32" reads ("F_U64" bp) (read was too short)</td></tr>\n", nSKIPPED, bSKIPPED);
  fprintf(htmlLog, "<tr><td>warnings</td><td>"F_U32" warnings (invalid base or quality value)</td></tr>\n", nWARNS);
  fprintf(htmlLog, "</table>\n");
  fprintf(htmlLog, "\n");

  fprintf(htmlLog, "<script type='text/javascript'>\n");
  fprintf(htmlLog, "var toggleOne = function() {\n");
  fprintf(htmlLog, "  var table = this.closest('table');\n");
  fprintf(htmlLog, "  var elts  = table.querySelectorAll('.details');\n");
  fprintf(htmlLog, "\n");
  fprintf(htmlLog, "  for (var i=0; i<elts.length; i++) {\n");
  fprintf(htmlLog, "    if (!elts[i].enabled) {\n");
  fprintf(htmlLog, "      elts[i].enabled = true;\n");
  fprintf(htmlLog, "      elts[i].style.visibility = 'visible';\n");
  fprintf(htmlLog, "    } else {\n");
  fprintf(htmlLog, "      elts[i].enabled = false;\n");
  fprintf(htmlLog, "      elts[i].style.visibility = 'collapse';\n");
  fprintf(htmlLog, "    }\n");
  fprintf(htmlLog, "  }\n");
  fprintf(htmlLog, "}\n");
  fprintf(htmlLog, "\n");
  for (uint32 ii=0; ii<gkpFileID; ii++) {
    fprintf(htmlLog, "document.getElementById('gkpload%u').onclick = toggleOne;\n", ii);
    fprintf(htmlLog, "document.getElementById('gkpload%u').style   = 'cursor: pointer;';\n", ii);
  }
  fprintf(htmlLog, "</script>\n");
  fprintf(htmlLog, "\n");
  fprintf(htmlLog, "</body>\n");
  fprintf(htmlLog, "</html>\n");
#else
  fprintf(htmlLog, "sum "F_U32" "F_U64" "F_U32" "F_U64" "F_U32"\n", nLOADED, bLOADED, nSKIPPED, bSKIPPED, nWARNS);
#endif

  fclose(htmlLog);



  if (nERROR > 0)
    fprintf(stderr, "gatekeeperCreate did NOT finish successfully; too many errors.\n");

  if (bSKIPPED > 0.25 * (bSKIPPED + bLOADED))
    fprintf(stderr, "gatekeeperCreate did NOT finish successfully; too many bases skipped.  Check your reads.\n");

  if (nWARNS > 0.25 * (nLOADED))
    fprintf(stderr, "gatekeeperCreate did NOT finish successfully; too many warnings.  Check your reads.\n");

  if (nSKIPPED > 0.50 * (nLOADED))
    fprintf(stderr, "gatekeeperCreate did NOT finish successfully; too many short reads.  Check your reads!\n");

  if ((nERROR > 0) ||
      (bSKIPPED > 0.25 * (bSKIPPED + bLOADED)) ||
      (nWARNS   > 0.25 * (nSKIPPED + nLOADED)) ||
      (nSKIPPED > 0.50 * (nSKIPPED + nLOADED)))
    exit(1);

  fprintf(stderr, "gatekeeperCreate finished successfully.\n");

  exit(0);
}
Exemplo n.º 15
0
//  Merge the per-piece '%04u.info' and '%04u.index' files in 'storePath' into the master 'index'
//  and 'info' files of an overlap store.
//
//    storePath -- directory holding the pieces; the merged files are written here too.
//    nPieces   -- number of pieces; they are processed in order 1 .. nPieces.
//
//  The master index starts with an empty record for the zeroth fragment.  For every piece that
//  has overlaps, empty records are written for any fragments between the last fragment seen and
//  the first fragment of the piece, then the index records of the piece are appended.  Pieces
//  with no overlaps are skipped.
//
//  A missing or unopenable file, or a piece whose index does not continue where the previous
//  one stopped, is fatal.
//
void
mergeInfoFiles(char       *storePath,
               uint32      nPieces) {
  ovStoreInfo    infopiece;
  ovStoreInfo    info;

  info._ovsMagic              = ovStoreMagic;
  info._ovsVersion            = ovStoreVersion;
  info._smallestIID           = UINT64_MAX;
  info._largestIID            = 0;
  info._numOverlapsTotal      = 0;
  info._highestFileIndex      = nPieces;
  info._maxReadLenInBits      = AS_MAX_READLEN_BITS;

  //  Template for the empty records.  _fileno and _offset are refreshed from the last real
  //  record copied; _numOlaps stays zero.

  ovStoreOfft offm;

  offm._a_iid     = 0;
  offm._fileno    = 1;
  offm._offset    = 0;
  offm._numOlaps  = 0;

  //  Open the new master index output file.  Failure is detected from the returned pointer;
  //  errno is only consulted for the message.

  char            name[FILENAME_MAX];

  sprintf(name, "%s/index", storePath);

  errno = 0;
  FILE  *idx = fopen(name, "w");
  if (idx == NULL)
    fprintf(stderr, "ERROR: Failed to open '%s': %s\n", name, strerror(errno)), exit(1);

  //  Special case, we need an empty index for the zeroth fragment.

  AS_UTL_safeWrite(idx, &offm, "ovStore::mergeInfoFiles::offsetZero", sizeof(ovStoreOfft), 1);

  //  Process each

  for (uint32 i=1; i<=nPieces; i++) {
    sprintf(name, "%s/%04u.info", storePath, i);

    fprintf(stderr, "Processing '%s'\n", name);

    if (AS_UTL_fileExists(name, FALSE, FALSE) == false) {
      fprintf(stderr, "ERROR: file '%s' not found.\n", name);
      exit(1);
    }

    {
      errno = 0;
      FILE *F = fopen(name, "r");
      if (F == NULL)
        fprintf(stderr, "ERROR: Failed to open '%s': %s\n", name, strerror(errno)), exit(1);
      AS_UTL_safeRead(F, &infopiece, "ovStore::mergeInfoFiles::infopiece", sizeof(ovStoreInfo), 1);
      fclose(F);
    }

    //  Add empty index elements for missing overlaps

    if (infopiece._numOverlapsTotal == 0) {
      fprintf(stderr, "  No overlaps found.\n");
      continue;
    }

    assert(infopiece._smallestIID <= infopiece._largestIID);

    if (info._largestIID + 1 < infopiece._smallestIID)
      fprintf(stderr, "  Adding empty records for fragments "F_U64" to "F_U64"\n",
              info._largestIID + 1, infopiece._smallestIID - 1);

    while (info._largestIID + 1 < infopiece._smallestIID) {
      offm._a_iid     = info._largestIID + 1;
      //offm._fileno    = set elsewhere
      //offm._offset    = set elsewhere
      //offm._numOlaps  = 0;

      AS_UTL_safeWrite(idx, &offm, "ovStore::mergeInfoFiles::offsets", sizeof(ovStoreOfft), 1);

      info._largestIID++;
    }

    //  Copy index elements for existing overlaps.  While copying, update the supposed position
    //  of any fragments with no overlaps.  Without doing this, accessing the store beginning
    //  or ending at such a fragment will fail.

    {
      sprintf(name, "%s/%04u.index", storePath, i);

      errno = 0;
      FILE  *F = fopen(name, "r");
      if (F == NULL)
        fprintf(stderr, "ERROR: Failed to open '%s': %s\n", name, strerror(errno)), exit(1);

      //  The index is copied in chunks of at most recsMax records.

      uint32          recsLen = 0;
      uint32          recsMax = 1024 * 1024;
      ovStoreOfft    *recs    = new ovStoreOfft [recsMax];

      recsLen = AS_UTL_safeRead(F, recs, "ovStore::mergeInfoFiles::offsetsLoad", sizeof(ovStoreOfft), recsMax);

      //  The first record of this piece must directly follow the last fragment already in the
      //  master index; anything else would corrupt it.

      if ((recsLen > 0) &&
          (info._largestIID + 1 != recs[0]._a_iid)) {
        fprintf(stderr, "ERROR: '%s' starts with iid "F_U32", but store only up to "F_U64"\n",
                name, recs[0]._a_iid, info._largestIID);
        assert(info._largestIID + 1 == recs[0]._a_iid);
        exit(1);  //  Still fatal when assert() is compiled out.
      }

      while (recsLen > 0) {
        offm._fileno = recs[recsLen-1]._fileno;  //  Update location of missing stuff.
        offm._offset = recs[recsLen-1]._offset;

        AS_UTL_safeWrite(idx, recs, "ovStore::mergeInfoFiles::offsetsWrite", sizeof(ovStoreOfft), recsLen);

        recsLen = AS_UTL_safeRead(F, recs, "ovStore::mergeInfoFiles::offsetsReLoad", sizeof(ovStoreOfft), recsMax);
      }

      delete [] recs;

      fclose(F);
    }

    //  Update

    info._smallestIID = MIN(info._smallestIID, infopiece._smallestIID);
    info._largestIID  = MAX(info._largestIID,  infopiece._largestIID);

    info._numOverlapsTotal += infopiece._numOverlapsTotal;

    fprintf(stderr, "  Now finished with fragments "F_U64" to "F_U64" -- "F_U64" overlaps.\n",
            info._smallestIID, info._largestIID, info._numOverlapsTotal);
  }

  fclose(idx);


  //  Dump the new store info file

  {
    sprintf(name, "%s/info", storePath);

    errno = 0;
    FILE  *F = fopen(name, "w");
    if (F == NULL)
      fprintf(stderr, "ERROR: Failed to open '%s': %s\n", name, strerror(errno)), exit(1);

    AS_UTL_safeWrite(F, &info, "ovStore::mergeInfoFiles::finalInfo", sizeof(ovStoreInfo), 1);

    fclose(F);
  }

  fprintf(stderr, "\n");
  fprintf(stderr, "Index finalized for reads "F_U64" to "F_U64" with "F_U64" overlaps.\n",
          info._smallestIID,
          info._largestIID,
          info._numOverlapsTotal);
}
int
main(int argc, char **argv) {
  int32 minEvalue = 0;
  int32 maxEvalue = 0;
  int32 step      = 1;

  char D[FILENAME_MAX];
  char O[FILENAME_MAX];

  if        (argc == 2) {
    minEvalue  = atoi(argv[1]);
    maxEvalue  = minEvalue;

  } else if (argc == 3) {
    minEvalue  = atoi(argv[1]);
    maxEvalue  = atoi(argv[2]);

  } else if (argc == 4) {
    minEvalue  = atoi(argv[1]);
    maxEvalue  = atoi(argv[2]);
    step       = atoi(argv[3]);

  } else {
    fprintf(stderr, "usage: %s minEvalue [maxEvalue [step]]\n", argv[0]);
    fprintf(stderr, "  computes overlapper probabilities for minEvalue <= eValue <= maxEvalue'\n");
    fprintf(stderr, "    eValue 100 == 0.01 fraction error == 1%% error\n");
    exit(1);
  }

  fprintf(stderr, "Computing Edit_Match_Limit data for reads of length %ubp (bits = %u).\n", AS_MAX_READLEN, AS_MAX_READLEN_BITS);

  sprintf(D, "prefixEditDistance-matchLimitData-BITS=%01d", AS_MAX_READLEN_BITS);
  AS_UTL_mkdir(D);

#pragma omp parallel for schedule(dynamic, 1)
  for (int32 evalue=maxEvalue; evalue>=minEvalue; evalue -= step) {
    char    N[FILENAME_MAX];  //  Local to this thread!

    double  erate             = evalue / 10000.0;
    int32   start             = 1;

    int32   MAX_ERRORS        = (1 + (int) (erate * AS_MAX_READLEN));
    int32   ERRORS_FOR_FREE   = 1;

    int32  *starts            = new int32 [MAX_ERRORS + 1];

    memset(starts, 0, sizeof(int32) * (MAX_ERRORS + 1));

    sprintf(N, "%s/prefixEditDistance-matchLimit-%04d.bin", D, evalue);

    if (AS_UTL_fileExists(N)) {
      fprintf(stderr, "eValue %04d -- eRate %6.4f -- %7.4f%% error -- %8d values -- thread %2d - LOAD\n",
              evalue, erate, erate * 100.0, MAX_ERRORS, omp_get_thread_num());

      errno = 0;
      FILE *F = fopen(N, "r");
      if (errno)
        fprintf(stderr, "Failed to open '%s' for reading: %s\n", N, strerror(errno)), exit(1);

      int32  me = 0;
      double er = 0.0;

      fread(&me,     sizeof(int32),  1,          F);
      fread(&er,     sizeof(double), 1,          F);
      fread( starts, sizeof(int32),  MAX_ERRORS, F);

      assert(me == MAX_ERRORS);
      assert(er == erate);

      fclose(F);

    } else {
      fprintf(stderr, "eValue %04d -- eRate %6.4f -- %7.4f%% error -- %8d values -- thread %2d - COMPUTE\n",
              evalue, erate, erate * 100.0, MAX_ERRORS, omp_get_thread_num());

      for (int32 e=ERRORS_FOR_FREE + 1; e<MAX_ERRORS; e++) {
        start = Binomial_Bound(e - ERRORS_FOR_FREE, erate, start);
        starts[e] = start - 1;
      }
    }



    {
      sprintf(O, "%s/prefixEditDistance-matchLimit-%04d.bin", D, evalue);

      errno = 0;
      FILE *F = fopen(O, "w");
      if (errno)
        fprintf(stderr, "Failed to open '%s' for writing: %s\n", N, strerror(errno)), exit(1);

      fwrite(&MAX_ERRORS, sizeof(int32),  1,          F);
      fwrite(&erate,      sizeof(double), 1,          F);
      fwrite( starts,     sizeof(int32),  MAX_ERRORS, F);

      fclose(F);
    }



    {
      sprintf(O, "%s/prefixEditDistance-matchLimit-%04d.dat", D, evalue);

      errno = 0;
      FILE *F = fopen(O, "w");
      if (errno)
        fprintf(stderr, "Failed to open '%s' for writing: %s\n", N, strerror(errno)), exit(1);

      fprintf(F, "#length     limit   slope0toX slopeXtoMAX for erate=%0.4f MAX_ERRORS=%d\n", erate, MAX_ERRORS);

      for (uint32 mm=MAX_ERRORS-1, ii=1; ii<MAX_ERRORS; ii++)
        fprintf(F, "%-8d %8d %11.6f %11.6f\n",
                ii,
                starts[ii],
                (double)(starts[ii] - starts[1])  / (ii -  1 + 1),
                (double)(starts[mm] - starts[ii]) / (mm - ii + 1));

      fclose(F);
    }



    {
      sprintf(O, "%s/prefixEditDistance-matchLimit-%04d.C", D, evalue);

      errno = 0;
      FILE *F = fopen(O, "w");
      if (errno)
        fprintf(stderr, "Failed to open '%s' for writing: %s\n", N, strerror(errno)), exit(1);

      fprintf(F, "//\n");
      fprintf(F, "//  Automagically generated.  Do not edit.\n");
      fprintf(F, "//\n");
      fprintf(F, "\n");
      fprintf(F, "#include \"gkStore.H\"\n");
      fprintf(F, "\n");
      fprintf(F, "#if (AS_MAX_READLEN_BITS == %d)\n", AS_MAX_READLEN_BITS);
      fprintf(F, "\n");
      fprintf(F, "extern\n");
      fprintf(F, "const\n");
      fprintf(F, "int32\n");
      fprintf(F, "Edit_Match_Limit_%04d[%d] = {\n", evalue, MAX_ERRORS + 1);

      uint32  i=0;

      while (i < MAX_ERRORS) {
        uint32  j=0;

        fprintf(F, "  ");

        while ((j < 16) && (i < MAX_ERRORS)) {
          if (i < MAX_ERRORS-1)
            fprintf(F, "0x%08x,", starts[i]);
          else
            fprintf(F, "0x%08x", starts[i]);

          i++;
          j++;
        }

        fprintf(F, "\n");
      }

      fprintf(F, "};\n");
      fprintf(F, "\n");
      fprintf(F, "#endif\n");

      fclose(F);
    }
  }
}
int
main(int argc, char **argv) {
  int32 minEvalue = 0;
  int32 maxEvalue = 0;
  int32 step      = 0;

  if        (argc == 2) {
    minEvalue  = atoi(argv[1]);
    maxEvalue  = minEvalue;

  } else if (argc == 3) {
    minEvalue  = atoi(argv[1]);
    maxEvalue  = atoi(argv[2]);

  } else if (argc == 4) {
    minEvalue  = atoi(argv[1]);
    maxEvalue  = atoi(argv[2]);
    step       = atoi(argv[3]);

  } else {
    fprintf(stderr, "usage: %s minEvalue [maxEvalue [step]]\n", argv[0]);
    fprintf(stderr, "  computes overlapper probabilities for minEvalue <= eValue <= maxEvalue'\n");
    fprintf(stderr, "    eValue 100 == 0.01 fraction error == 1%% error\n");
    exit(1);
  }

#pragma omp parallel for schedule(dynamic, 1)
  for (uint32 evalue=minEvalue; evalue<=maxEvalue; evalue += step) {
    double  erate             = evalue / 10000.0;
    int32   start             = 1;

    int32   MAX_ERRORS        = (1 + (int) (erate * AS_MAX_READLEN));
    int32   ERRORS_FOR_FREE   = 1;

    int32  *starts            = new int32 [MAX_ERRORS + 1];

    memset(starts, 0, sizeof(int32) * (MAX_ERRORS + 1));

    char N[FILENAME_MAX];

    sprintf(N, "prefixEditDistance-matchLimitData/prefixEditDistance-matchLimit-%04d.dat", evalue);


    if (AS_UTL_fileExists(N)) {
      fprintf(stderr, "eValue %04d -- eRate %6.4f -- %7.4f%% error -- %8d values -- thread %2d - LOAD\n",
              evalue, erate, erate * 100.0, MAX_ERRORS, omp_get_thread_num());

      errno = 0;
      FILE *F = fopen(N, "r");
      if (errno)
        fprintf(stderr, "Failed to open '%s' for reading: %s\n", N, strerror(errno)), exit(1);

      int32  me = 0;
      double er = 0.0;

      fread(&me,     sizeof(int32),  1,          F);
      fread(&er,     sizeof(double), 1,          F);
      fread( starts, sizeof(int32),  MAX_ERRORS, F);

      assert(me == MAX_ERRORS);
      assert(er == erate);

      fclose(F);

    } else {
      fprintf(stderr, "eValue %04d -- eRate %6.4f -- %7.4f%% error -- %8d values -- thread %2d - COMPUTE\n",
              evalue, erate, erate * 100.0, MAX_ERRORS, omp_get_thread_num());

      for (int32 e=ERRORS_FOR_FREE + 1; e<MAX_ERRORS; e++) {
        start = Binomial_Bound(e - ERRORS_FOR_FREE, erate, start);
        starts[e] = start - 1;
      }
    }



    {
      sprintf(N, "prefixEditDistance-matchLimitData/prefixEditDistance-matchLimit-%04d.dat", evalue);

      errno = 0;
      FILE *F = fopen(N, "w");
      if (errno)
        fprintf(stderr, "Failed to open '%s' for writing: %s\n", N, strerror(errno)), exit(1);

      fwrite(&MAX_ERRORS, sizeof(int32),  1,          F);
      fwrite(&erate,      sizeof(double), 1,          F);
      fwrite( starts,     sizeof(int32),  MAX_ERRORS, F);

      fclose(F);
    }



    {
      sprintf(N, "prefixEditDistance-matchLimit-%04d.C", evalue);

      errno = 0;
      FILE *F = fopen(N, "w");
      if (errno)
        fprintf(stderr, "Failed to open '%s' for writing: %s\n", N, strerror(errno)), exit(1);

      fprintf(F, "//\n");
      fprintf(F, "//  Automagically generated.  Do not edit.\n");
      fprintf(F, "//\n");
      fprintf(F, "\n");
      fprintf(F, "#include \"AS_global.H\"\n");
      fprintf(F, "\n");
      fprintf(F, "extern\n");
      fprintf(F, "const\n");
      fprintf(F, "int32\n");
      fprintf(F, "Edit_Match_Limit_%04d[%d] = {\n", evalue, MAX_ERRORS + 1);

      uint32  i=0;

      while (i < MAX_ERRORS) {
        uint32  j=0;

        fprintf(F, "  ");

        while ((j < 16) && (i < MAX_ERRORS)) {
          if (i < MAX_ERRORS-1)
            fprintf(F, "0x%08x,", starts[i]);
          else
            fprintf(F, "0x%08x", starts[i]);

          i++;
          j++;
        }

        fprintf(F, "\n");
      }

      fprintf(F, "};\n");

      fclose(F);
    }
  }
}