Exemple #1
0
void
gkRead::gkRead_copyDataToPartition(void *blobs, FILE **partfiles, uint64 *partfileslen, uint32 partID) {

  //  Stash away the location of the partitioned data

  assert(partfileslen[partID] == AS_UTL_ftell(partfiles[partID]));

  //  Figure out where the blob actually is, and make sure that it really is a blob

  uint8  *blob    = (uint8 *)blobs + _mPtr;
  uint32  blobLen = 8 + *((uint32 *)blob + 1);

  assert(blob[0] == 'B');
  assert(blob[1] == 'L');
  assert(blob[2] == 'O');
  assert(blob[3] == 'B');

  //  Write the blob to the partition, update the length of the partition

  AS_UTL_safeWrite(partfiles[partID], blob, "gkRead::gkRead_copyDataToPartition::blob", sizeof(char), blobLen);

  //  Update the read to the new location of the blob in the partitioned data.

  _mPtr = partfileslen[partID];
  _pID  = partID;

  //  And finalize by remembering the length.

  partfileslen[partID] += blobLen;

  assert(partfileslen[partID] == AS_UTL_ftell(partfiles[partID]));

}
Exemple #2
0
//  Save the fragment information to the cache file '<prefix>.fragmentInfo'.
//
//  The file holds, in order: the magic number, the version number, the number
//  of fragments, the number of libraries, then the _fragLength and _libIID
//  arrays (_numFragments + 1 entries each) and the _numFragsInLib array
//  (_numLibraries + 1 entries).
//
//  If the file cannot be opened, a message is logged and nothing is saved.
//
void
FragmentInfo::save(const char *prefix) {
  char  name[FILENAME_MAX];

  snprintf(name, FILENAME_MAX, "%s.fragmentInfo", prefix);

  //  fopen() signals failure with a NULL return; errno is only meaningful
  //  after a failure and may be nonzero even when the open succeeded.

  FILE *file = fopen(name, "w");
  if (file == NULL) {
    writeLog("FragmentInfo()-- Failed to open '%s' for writing: %s\n", name, strerror(errno));
    writeLog("FragmentInfo()-- Will not save fragment information to cache.\n");
    return;
  }

  writeLog("FragmentInfo()-- Saving fragment information to cache '%s'\n", name);

  //  Header.

  AS_UTL_safeWrite(file, &fiMagicNumber,   "fragmentInformationMagicNumber",   sizeof(uint64), 1);
  AS_UTL_safeWrite(file, &fiVersionNumber, "fragmentInformationVersionNumber", sizeof(uint64), 1);
  AS_UTL_safeWrite(file, &_numFragments,   "fragmentInformationNumFrgs",       sizeof(uint32), 1);
  AS_UTL_safeWrite(file, &_numLibraries,   "fragmentInformationNumLibs",       sizeof(uint32), 1);

  //  Per-fragment arrays.

  AS_UTL_safeWrite(file,  _fragLength,     "fragmentInformationFragLen",       sizeof(uint32), _numFragments + 1);
  AS_UTL_safeWrite(file,  _libIID,         "fragmentInformationLibIID",        sizeof(uint32), _numFragments + 1);

  //  Per-library array.

  AS_UTL_safeWrite(file,  _numFragsInLib,  "fragmentInformationNumFrgsInLib",  sizeof(uint32), _numLibraries + 1);

  fclose(file);
}
Exemple #3
0
//  Prepare the store directory for writing.
//
//  Creates the directory, refuses to continue if '<store>/info' already holds
//  the final magic number (so a complete store is never overwritten), writes
//  the current _info to '<store>/info', opens '<store>/index' for writing as
//  _offtFile, and resets the per-file counters.  Any failure to open a file
//  is fatal (exit(1)).
//
void
ovStore::ovStore_write(void) {
  AS_UTL_mkdir(_storePath);

  char name[FILENAME_MAX];

  snprintf(name, FILENAME_MAX, "%s/info", _storePath);

  //  If the ovs file exists, AND has a valid magic number, then the store is complete and we should
  //  abort before the valid store is destroyed.

  if (AS_UTL_fileExists(name, false, false)) {
    //  Test the returned pointer, not errno: errno is only defined after a failure.
    FILE *ovsinfo = fopen(name, "r");
    if (ovsinfo == NULL) {
      fprintf(stderr, "ERROR: failed to read store metadata from '%s': %s\n", name, strerror(errno));
      exit(1);
    }

    AS_UTL_safeRead(ovsinfo, &_info, "ovStore::ovStore::testinfo", sizeof(ovStoreInfo), 1);

    fclose(ovsinfo);

    if (_info._ovsMagic == ovStoreMagic) {
      fprintf(stderr, "ERROR:  overlapStore '%s' is a valid overlap store, will not overwrite.\n",
              _storePath);
      exit(1);
    }
  }

  //  Create a new incomplete info file.

  FILE *ovsinfo = fopen(name, "w");
  if (ovsinfo == NULL) {
    fprintf(stderr, "failed to create overlap store '%s': %s\n", _storePath, strerror(errno));
    exit(1);
  }

  AS_UTL_safeWrite(ovsinfo, &_info, "ovStore::ovStore::saveinfo", sizeof(ovStoreInfo), 1);

  fclose(ovsinfo);

  //  Open the index (offset) file; it stays open for later writes.

  snprintf(name, FILENAME_MAX, "%s/index", _storePath);

  _offtFile = fopen(name, "w");
  if (_offtFile == NULL) {
    fprintf(stderr, "AS_OVS_createOverlapStore()-- failed to open offset file '%s': %s\n", name, strerror(errno));
    exit(1);
  }

  _overlapsThisFile = 0;
  _currentFileIndex = 0;
  _bof              = NULL;
}
Exemple #4
0
//  Append the encoded blob in 'data' to the blobs file, and record in 'read'
//  the file position it was written at and the partition it belongs to.
//
void
gkStore::gkStore_stashReadData(gkRead *read, gkReadData *data) {

  assert(_blobsFile != NULL);

  //  The blob will land at the current position of the blobs file, so capture
  //  that position before writing anything.

  read->_pID  = _partitionID;                //  0 if not partitioned
  read->_mPtr = AS_UTL_ftell(_blobsFile);

  //fprintf(stderr, "STASH read %u at position "F_SIZE_T"\n", read->gkRead_readID(), AS_UTL_ftell(_blobsFile));

  AS_UTL_safeWrite(_blobsFile, data->_blob, "gkStore_stashReadData::blob", sizeof(char), data->_blobLen);
}
void
BestOverlapGraph::save(const char *prefix, double AS_UTG_ERROR_RATE, double AS_UTG_ERROR_LIMIT) {
  char name[FILENAME_MAX];

  sprintf(name, "%s.bog", prefix);

  assert(_best5score == NULL);
  assert(_best3score == NULL);
  assert(_bestCscore == NULL);

  errno = 0;
  FILE *file = fopen(name, "w");
  if (errno) {
    fprintf(logFile, "BestOverlapGraph-- Failed to open '%s' for writing: %s\n", name, strerror(errno));
    fprintf(logFile, "BestOverlapGraph-- Will not save best overlap graph to cache: "F_STR"\n", strerror(errno));
    return;
  }

  fprintf(logFile, "BestOverlapGraph()-- Saving overlap graph to '%s'.\n",
          name);

  AS_UTL_safeWrite(file, &ogMagicNumber,      "magicnumber",   sizeof(uint64),              1);
  AS_UTL_safeWrite(file, &ogVersionNumber,    "versionnumber", sizeof(uint64),              1);

  AS_UTL_safeWrite(file, &AS_UTG_ERROR_RATE,  "errorRate",     sizeof(double),              1);
  AS_UTL_safeWrite(file, &AS_UTG_ERROR_LIMIT, "errorLimit",    sizeof(double),              1);

  AS_UTL_safeWrite(file, _best5, "best overlaps 5", sizeof(BestEdgeOverlap), FI->numFragments() + 1);
  AS_UTL_safeWrite(file, _best3, "best overlaps 3", sizeof(BestEdgeOverlap), FI->numFragments() + 1);
  AS_UTL_safeWrite(file, _bestC, "best contains C", sizeof(BestContainment), FI->numFragments() + 1);

  for (uint32 i=0; i<FI->numFragments() + 1; i++)
    if (_bestC[i].olaps != NULL)
      AS_UTL_safeWrite(file, _bestC[i].olaps, "best contains olaps", sizeof(uint32), _bestC[i].olapsLen);

  fclose(file);
}
Exemple #6
0
void
writeOverlaps(char       *storePath,
              ovOverlap *ovls,
              uint64      ovlsLen,
              uint32      fileID) {

	char                        name[FILENAME_MAX];

  uint32                      currentFileIndex = fileID;
  uint64                      overlapsThisFile = 0;

	ovStoreInfo    info;

	info._ovsMagic              = 1;
	info._ovsVersion            = ovStoreVersion;
  info._smallestIID           = UINT64_MAX;
  info._largestIID            = 0;
  info._numOverlapsTotal      = 0;
  info._highestFileIndex      = 0;
	info._maxReadLenInBits      = AS_MAX_READLEN_BITS;

	ovStoreOfft    offt;
  ovStoreOfft    offm;

  offt._a_iid     = offm._a_iid    = ovls[0].a_iid;
	offt._fileno    = offm._fileno   = fileID;
  offt._offset    = offm._offset   = 0;
  offt._numOlaps  = offm._numOlaps = 0;

  //  Create the output file

  sprintf(name, "%s/%04d", storePath, fileID);
  ovFile *bof = new ovFile(name, ovFileNormalWrite);

  //  Create the index file

	sprintf(name,"%s/%04d.index", storePath, fileID);

  errno = 0;
  FILE *offtFile=fopen(name,"w");
  if (errno)
    fprintf(stderr, "ERROR: Failed to open '%s' for writing: %s\n", name, strerror(errno)), exit(1);

  //  Dump the overlaps

  fprintf(stderr, "Writing "F_U64" overlaps.\n", ovlsLen);

	for (uint64 i=0; i<ovlsLen; i++ ) {
    bof->writeOverlap(ovls + i);

    if (offt._a_iid > ovls[i].a_iid) {
			fprintf(stderr, "LAST:  a:"F_U32"\n", offt._a_iid);
			fprintf(stderr, "THIS:  a:"F_U32" b:"F_U32"\n", ovls[i].a_iid, ovls[i].b_iid);
		}
    assert(offt._a_iid <= ovls[i].a_iid);

    info._smallestIID = MIN(info._smallestIID, ovls[i].a_iid);
    info._largestIID  = MAX(info._largestIID,  ovls[i].a_iid);

		//  Put the index to disk, filling any gaps

		if ((offt._numOlaps != 0) && (offt._a_iid != ovls[i].a_iid)) {
			while (offm._a_iid < offt._a_iid) {
				offm._fileno   = offt._fileno;
				offm._offset   = offt._offset;
				offm._numOlaps = 0;

				AS_UTL_safeWrite(offtFile, &offm, "AS_OVS_writeOverlapToStore offt", sizeof(ovStoreOfft), 1);
				offm._a_iid++;
			}

			//  One more, since this iid is not offm -- we write it next!
			offm._a_iid++;

			AS_UTL_safeWrite(offtFile, &offt, "AS_OVS_writeOverlapToStore offt", sizeof(ovStoreOfft), 1);
			offt._numOlaps  = 0;
		}

		//  Update the index if this is the first overlap for this a_iid

		if (offt._numOlaps == 0) {
			offt._a_iid   = ovls[i].a_iid;
			offt._fileno  = currentFileIndex;
			offt._offset  = overlapsThisFile;
		}

		offt._numOlaps++;

		info._numOverlapsTotal++;

		overlapsThisFile++;
	}

  //  Close the output file.

  delete bof;

  //  Write the final index entries.

	while (offm._a_iid < offt._a_iid) {
		offm._fileno    = offt._fileno;
		offm._offset    = offt._offset;
		offm._numOlaps  = 0;

		AS_UTL_safeWrite(offtFile, &offm, "AS_OVS_writeOverlapToStore offt", sizeof(ovStoreOfft), 1);
		offm._a_iid++;
	}

	AS_UTL_safeWrite(offtFile, &offt, "AS_OVS_writeOverlapToStore offt", sizeof(ovStoreOfft), 1);

	fclose(offtFile);

  //  In the nasty case that there were no overlaps in this slice, set meaningful smallest and
  //  largest.  Well, at least, set non-nonsense smallest and largest.

  if (overlapsThisFile == 0) {
    info._smallestIID = 0;
    info._largestIID  = 0;
  }

  //  Write the info, and some stats for the user.

  sprintf(name,"%s/%04d.info", storePath, fileID);

  errno = 0;
  FILE *F = fopen(name, "w");
  if (errno)
    fprintf(stderr, "ERROR: Failed to open '%s' for writing: %s\n", name, strerror(errno)), exit(1);

  AS_UTL_safeWrite(F, &info, "Partition ovs file", sizeof(ovStoreInfo), 1);

  fclose(F);

  fprintf(stderr, "Wrote "F_U64" overlaps into '%s'\n", info._numOverlapsTotal, name);
  fprintf(stderr, "  Smallest "F_U64"\n", info._smallestIID);
  fprintf(stderr, "  Largest  "F_U64"\n", info._largestIID);
}
Exemple #7
0
//  Store 'evaluesLen' evalues for the overlaps of reads bgnID..endID into the
//  store's '<store>/evalues' file.
//
//  The evalues file holds one uint16 per overlap in the store.  A file of the
//  wrong size is removed; a missing file is created, filled with zeros.  The
//  file is then memory mapped read-write (replacing any read-only mapping)
//  and the supplied values are copied in starting at _offt._overlapID, after
//  setRange(bgnID, endID) has been called (presumably that call is what
//  positions _offt on the first overlap of bgnID -- confirm in setRange()).
//
//  The mapping is left open for further updates.
//
void
ovStore::addEvalues(uint32 bgnID, uint32 endID, uint16 *evalues, uint64 evaluesLen) {

  char  name[FILENAME_MAX];
  snprintf(name, FILENAME_MAX, "%s/evalues", _storePath);

  //  If we have an opened memory mapped file, and it isn't open for writing, close it.

  if ((_evaluesMap) && (_evaluesMap->type() == memoryMappedFile_readOnly)) {
    fprintf(stderr, "WARNING: closing read-only evalues file.\n");
    delete _evaluesMap;

    _evaluesMap = NULL;
    _evalues    = NULL;
  }

  //  Remove a bogus evalues file if one exists.

  if ((AS_UTL_fileExists(name) == true) &&
      (AS_UTL_sizeOfFile(name) != (sizeof(uint16) * _info._numOverlapsTotal))) {
    fprintf(stderr, "WARNING: existing evalues file is incorrect size: should be " F_U64 " bytes, is " F_U64 " bytes.  Removing.\n",
            (sizeof(uint16) * _info._numOverlapsTotal), AS_UTL_sizeOfFile(name));
    AS_UTL_unlink(name);
  }

  //  Make a new evalues file if one doesn't exist.

  if (AS_UTL_fileExists(name) == false) {
    fprintf(stderr, "Creating evalues file for " F_U64 " overlaps.\r", _info._numOverlapsTotal);

    //  fopen() reports failure by returning NULL; errno is only meaningful
    //  after a failure.

    FILE *F = fopen(name, "w");
    if (F == NULL) {
      fprintf(stderr, "Failed to make evalues file '%s': %s\n", name, strerror(errno));
      exit(1);
    }

    //  Write zeros in chunks of at most 1048576 values.

    uint16  *Z  = new uint16 [1048576];
    uint64   Zn = 0;

    memset(Z, 0, sizeof(uint16) * 1048576);

    while (Zn < _info._numOverlapsTotal) {
      uint64  S = (Zn + 1048576 < _info._numOverlapsTotal) ? 1048576 : _info._numOverlapsTotal - Zn;

      AS_UTL_safeWrite(F, Z, "zero evalues", sizeof(uint16), S);

      Zn += S;

      fprintf(stderr, "Creating evalues file for " F_U64 " overlaps....%07.3f%%\r",
              _info._numOverlapsTotal, 100.0 * Zn / _info._numOverlapsTotal);
    }

    fprintf(stderr, "Creating evalues file for " F_U64 " overlaps....%07.3f%%\n",
            _info._numOverlapsTotal, 100.0 * Zn / _info._numOverlapsTotal);

    fclose(F);

    //  The zero buffer is only needed while creating the file.

    delete [] Z;
  }

  //  Open the evalues file if it isn't already opened

  if (_evalues == NULL) {
    _evaluesMap = new memoryMappedFile(name, memoryMappedFile_readWrite);
    _evalues    = (uint16 *)_evaluesMap->get(0);
  }

  //  Figure out the overlap ID for the first overlap associated with bgnID

  setRange(bgnID, endID);

  //  Load the evalues from 'evalues'

  for (uint64 ii=0; ii<evaluesLen; ii++)
    _evalues[_offt._overlapID + ii] = evalues[ii];

  //  That's it.  Deleting the ovStore object will close the memoryMappedFile.  It's left open
  //  for more updates.
}
Exemple #8
0
//  Build index records for a batch of 'maxOverlapsThisFile' overlaps, all of
//  which belong to the next data file of the store.
//
//  This variant only maintains the index (_offtFile) and the store totals in
//  _info; it does not write the overlaps themselves.  The overlaps must be
//  sorted by a_iid; an out-of-order a_iid is reported on stderr and then
//  trips an assert.
//
void
ovStore::writeOverlap(ovOverlap *overlap, uint32 maxOverlapsThisFile) {

  assert(_isOutput == TRUE);

  //  Every call starts a new data file.

  _currentFileIndex++;
  _overlapsThisFile = 0;

  for (uint64 oo=0; oo < maxOverlapsThisFile; oo++) {
    ovOverlap  *ovl = overlap + oo;

    //  All overlaps will be sorted by a_iid

    if (_offt._a_iid > ovl->a_iid) {
      fprintf(stderr, "LAST:  a:" F_U32 "\n", _offt._a_iid);
      fprintf(stderr, "THIS:  a:" F_U32 " b:" F_U32 "\n", ovl->a_iid, ovl->b_iid);
    }

    assert(_offt._a_iid <= ovl->a_iid);

    //  Track the range of a_iid seen by the store.

    if (_info._smallestIID > ovl->a_iid)
      _info._smallestIID = ovl->a_iid;

    if (_info._largestIID < ovl->a_iid)
      _info._largestIID = ovl->a_iid;

    //  A new a_iid finishes the record being accumulated: put it on disk,
    //  preceded by empty records for any a_iid that were skipped.

    if ((_offt._numOlaps != 0) && (_offt._a_iid != ovl->a_iid)) {
      for (; _offm._a_iid < _offt._a_iid; _offm._a_iid++) {
        _offm._fileno    = _offt._fileno;
        _offm._offset    = _offt._offset;
        _offm._numOlaps  = 0;

        AS_UTL_safeWrite(_offtFile, &_offm, "AS_OVS_writeOverlapToStore offset", sizeof(ovStoreOfft), 1);
      }

      //  One more, since this iid is not missing -- we write it next!
      _offm._a_iid++;

      AS_UTL_safeWrite(_offtFile, &_offt, "AS_OVS_writeOverlapToStore offset", sizeof(ovStoreOfft), 1);

      _offt._numOlaps  = 0;
    }

    //  The first overlap for an a_iid fixes where its record points.

    if (_offt._numOlaps == 0) {
      _offt._a_iid     = ovl->a_iid;
      _offt._fileno    = _currentFileIndex;
      _offt._offset    = _overlapsThisFile;
      _offt._overlapID = _info._numOverlapsTotal;
    }

    _offt._numOlaps++;
    _info._numOverlapsTotal++;
    _overlapsThisFile++;
  }

  fprintf(stderr,"Done building index for dumpfile %d.\n",_currentFileIndex);
}
Exemple #9
0
//  Add a single overlap to a store opened for output.
//
//  Overlaps must arrive sorted by a_iid; an out-of-order a_iid is reported on
//  stderr and then trips an assert.  The overlap is appended to the current
//  data file (a new one is started when there is none, or when the current one
//  has reached its record limit), and the index record for its a_iid is
//  updated.  Finished index records are written to _offtFile, with empty
//  records for any a_iid that were skipped.
//
void
ovStore::writeOverlap(ovOverlap *overlap) {

  assert(_isOutput == TRUE);

  if (_offt._a_iid > overlap->a_iid) {
    //  Woah!  The last overlap we saw is bigger than the one we have now?!
    fprintf(stderr, "LAST:  a:" F_U32 "\n", _offt._a_iid);
    fprintf(stderr, "THIS:  a:" F_U32 " b:" F_U32 "\n", overlap->a_iid, overlap->b_iid);
  }
  assert(_offt._a_iid <= overlap->a_iid);

  //  Track the range of a_iid seen by the store.

  if (_info._smallestIID > overlap->a_iid)
    _info._smallestIID = overlap->a_iid;

  if (_info._largestIID < overlap->a_iid)
    _info._largestIID = overlap->a_iid;

  //  Retire the current data file once it holds as many records as fit in
  //  1024 * 1024 * 1024 bytes.

  if ((_bof) && (_overlapsThisFile >= 1024 * 1024 * 1024 / _bof->recordSize())) {
    delete _bof;

    _bof              = NULL;
    _overlapsThisFile = 0;
  }

  //  No data file (first overlap, or we just retired one): open the next.

  if (_bof == NULL) {
    char  name[FILENAME_MAX];

    _currentFileIndex++;

    sprintf(name, "%s/%04d", _storePath, _currentFileIndex);
    _bof = new ovFile(name, ovFileNormalWrite);
  }

  //  A new a_iid finishes the record being accumulated: put it on disk,
  //  preceded by empty records for any a_iid that were skipped.

  if ((_offt._numOlaps != 0) &&
      (_offt._a_iid != overlap->a_iid)) {

    for (; _offm._a_iid < _offt._a_iid; _offm._a_iid++) {
      _offm._fileno    = _offt._fileno;
      _offm._offset    = _offt._offset;
      _offm._numOlaps  = 0;

      AS_UTL_safeWrite(_offtFile, &_offm, "ovStore::writeOverlap::offset", sizeof(ovStoreOfft), 1);
    }

    //  One more, since this iid is not missing -- we write it next!
    _offm._a_iid++;

    AS_UTL_safeWrite(_offtFile, &_offt, "AS_OVS_writeOverlapToStore offset", sizeof(ovStoreOfft), 1);

    _offt._numOlaps  = 0;
  }

  //  The first overlap for an a_iid fixes where its record points.

  if (_offt._numOlaps == 0) {
    _offt._a_iid     = overlap->a_iid;
    _offt._fileno    = _currentFileIndex;
    _offt._offset    = _overlapsThisFile;
    _offt._overlapID = _info._numOverlapsTotal;
  }

  //AS_OVS_accumulateStats(ovs, overlap);
  _bof->writeOverlap(overlap);

  _offt._numOlaps++;
  _info._numOverlapsTotal++;
  _overlapsThisFile++;
}
Exemple #10
0
//  Close the store.
//
//  For a store opened for output, the last pending index record is flushed
//  (with empty records for any skipped a_iid), and '<store>/info' is
//  rewritten with the final magic number, the version and the highest file
//  index, which marks the store as complete.  Failing to open the info file
//  is fatal (exit(1)).
//
//  In all cases the current data file object is deleted and the index file is
//  closed if it is open.
//
ovStore::~ovStore() {

  //  If output, write the last index element (don't forget to fill in gaps);
  //             update the info, using the final magic number

  if (_isOutput) {
    if (_offt._numOlaps > 0) {
      for (; _offm._a_iid < _offt._a_iid; _offm._a_iid++) {
        _offm._fileno   = _offt._fileno;
        _offm._offset   = _offt._offset;
        _offm._numOlaps = 0;

        AS_UTL_safeWrite(_offtFile, &_offm, "ovStore::~ovStore::offm", sizeof(ovStoreOfft), 1);
      }

      AS_UTL_safeWrite(_offtFile, &_offt, "ovStore::~ovStore::offt", sizeof(ovStoreOfft), 1);
    }

    _info._ovsMagic         = ovStoreMagic;
    _info._ovsVersion       = ovStoreVersion;
    _info._highestFileIndex = _currentFileIndex;

    char name[FILENAME_MAX];

    snprintf(name, FILENAME_MAX, "%s/info", _storePath);

    //  fopen() reports failure by returning NULL; errno is only meaningful
    //  after a failure.

    FILE *ovsinfo = fopen(name, "w");
    if (ovsinfo == NULL) {
      fprintf(stderr, "failed to create overlap store '%s': %s\n", _storePath, strerror(errno));
      exit(1);
    }

    AS_UTL_safeWrite(ovsinfo, &_info, "ovStore::~ovStore::info", sizeof(ovStoreInfo), 1);

    fclose(ovsinfo);

    fprintf(stderr, "Closing the new store:\n");
    fprintf(stderr, "  info._ovsMagic           = 0x%016" F_X64P "\n", _info._ovsMagic);
    fprintf(stderr, "  info._ovsVersion         = " F_U64 "\n", _info._ovsVersion);
    fprintf(stderr, "  info._smallestIID        = " F_U64 "\n", _info._smallestIID);
    fprintf(stderr, "  info._largestIID         = " F_U64 "\n", _info._largestIID);
    fprintf(stderr, "  info._numOverlapsTotal   = " F_U64 "\n", _info._numOverlapsTotal);
    fprintf(stderr, "  info._highestFileIndex   = " F_U64 "\n", _info._highestFileIndex);
    fprintf(stderr, "  info._maxReadLenInBits   = " F_U64 "\n", _info._maxReadLenInBits);
  }

#if 0
  if (_statsUpdated) {
    fprintf(stderr, "Writing new stats.\n");

    char name [FILENAME_MAX];

    sprintf(name, "%s/ost", _storePath);
    errno = 0;
    FILE *ost = fopen(name, "w");
    if (errno)
      fprintf(stderr, "failed to write overlap stats '%s': %s\n", name, strerror(errno)), exit(1);

    AS_UTL_safeWrite(ost, &_stats, "AS_OVS_closeOverlapStore", sizeof(OverlapStoreStats), 1);

    fclose(ost);
  }
#endif

  delete _bof;

  //  fclose(NULL) is undefined, so only close an index file that is open.

  if (_offtFile != NULL)
    fclose(_offtFile);
}
Exemple #11
0
void
mergeInfoFiles(char       *storePath,
               uint32      nPieces) {
  ovStoreInfo    infopiece;
  ovStoreInfo    info;

  info._ovsMagic              = ovStoreMagic;
	info._ovsVersion            = ovStoreVersion;
  info._smallestIID           = UINT64_MAX;
  info._largestIID            = 0;
  info._numOverlapsTotal      = 0;
  info._highestFileIndex      = nPieces;
	info._maxReadLenInBits      = AS_MAX_READLEN_BITS;

  ovStoreOfft offm;

  offm._a_iid     = 0;
  offm._fileno    = 1;
  offm._offset    = 0;
  offm._numOlaps  = 0;

  //  Open the new master index output file

  char            name[FILENAME_MAX];

  sprintf(name, "%s/index", storePath);

  errno = 0;
  FILE  *idx = fopen(name, "w");
  if (errno)
    fprintf(stderr, "ERROR: Failed to open '%s': %s\n", name, strerror(errno)), exit(1);

  //  Special case, we need an empty index for the zeroth fragment.

  AS_UTL_safeWrite(idx, &offm, "ovStore::mergeInfoFiles::offsetZero", sizeof(ovStoreOfft), 1);

  //  Process each

  for (uint32 i=1; i<=nPieces; i++) {
    sprintf(name, "%s/%04d.info", storePath, i);

    fprintf(stderr, "Processing '%s'\n", name);

    if (AS_UTL_fileExists(name, FALSE, FALSE) == false) {
      fprintf(stderr, "ERROR: file '%s' not found.\n", name);
      exit(1);
    }

    {
      errno = 0;
      FILE *F = fopen(name, "r");
      if (errno)
        fprintf(stderr, "ERROR: Failed to open '%s': %s\n", name, strerror(errno)), exit(1);
      AS_UTL_safeRead(F, &infopiece, "ovStore::mergeInfoFiles::infopiece", sizeof(ovStoreInfo), 1);
      fclose(F);
    }

    //  Add empty index elements for missing overlaps

    if (infopiece._numOverlapsTotal == 0) {
      fprintf(stderr, "  No overlaps found.\n");
      continue;
    }

    assert(infopiece._smallestIID <= infopiece._largestIID);

    if (info._largestIID + 1 < infopiece._smallestIID)
      fprintf(stderr, "  Adding empty records for fragments "F_U64" to "F_U64"\n",
              info._largestIID + 1, infopiece._smallestIID - 1);

    while (info._largestIID + 1 < infopiece._smallestIID) {
      offm._a_iid     = info._largestIID + 1;
      //offm._fileno    = set elsewhere
      //offm._offset    = set elsewhere
      //offm._numOlaps  = 0;

      AS_UTL_safeWrite(idx, &offm, "ovStore::mergeInfoFiles::offsets", sizeof(ovStoreOfft), 1);

      info._largestIID++;
    }

    //  Copy index elements for existing overlaps.  While copying, update the supposed position
    //  of any fragments with no overlaps.  Without doing this, accessing the store beginning
    //  or ending at such a fragment will fail.

    {
      sprintf(name, "%s/%04d.index", storePath, i);

      errno = 0;
      FILE  *F = fopen(name, "r");
      if (errno)
        fprintf(stderr, "ERROR: Failed to open '%s': %s\n", name, strerror(errno)), exit(1);

      uint32          recsLen = 0;
      uint32          recsMax = 1024 * 1024;
      ovStoreOfft    *recs    = new ovStoreOfft [recsMax];

      recsLen = AS_UTL_safeRead(F, recs, "ovStore::mergeInfoFiles::offsetsLoad", sizeof(ovStoreOfft), recsMax);

      if (recsLen > 0) {
        if (info._largestIID + 1 != recs[0]._a_iid)
          fprintf(stderr, "ERROR: '%s' starts with iid "F_U32", but store only up to "F_U64"\n",
                  name, recs[0]._a_iid, info._largestIID);
        assert(info._largestIID + 1 == recs[0]._a_iid);
      }

      while (recsLen > 0) {
        offm._fileno = recs[recsLen-1]._fileno;  //  Update location of missing stuff.
        offm._offset = recs[recsLen-1]._offset;

				AS_UTL_safeWrite(idx, recs, "ovStore::mergeInfoFiles::offsetsWrite", sizeof(ovStoreOfft), recsLen);

        recsLen = AS_UTL_safeRead(F, recs, "ovStore::mergeInfoFiles::offsetsReLoad", sizeof(ovStoreOfft), recsMax);
      }

      delete [] recs;

      fclose(F);
    }

    //  Update

    info._smallestIID = MIN(info._smallestIID, infopiece._smallestIID);
    info._largestIID  = MAX(info._largestIID,  infopiece._largestIID);

    info._numOverlapsTotal += infopiece._numOverlapsTotal;

    fprintf(stderr, "  Now finished with fragments "F_U64" to "F_U64" -- "F_U64" overlaps.\n",
            info._smallestIID, info._largestIID, info._numOverlapsTotal);
  }

  fclose(idx);


  //  Dump the new store info file

  {
    sprintf(name, "%s/info", storePath);

    errno = 0;
    FILE  *F = fopen(name, "w");
    if (errno)
      fprintf(stderr, "ERROR: Failed to open '%s': %s\n", name, strerror(errno)), exit(1);

    AS_UTL_safeWrite(F, &info, "ovStore::mergeInfoFiles::finalInfo", sizeof(ovStoreInfo), 1);

    fclose(F);
  }

  fprintf(stderr, "\n");
  fprintf(stderr, "Index finalized for reads "F_U64" to "F_U64" with "F_U64" overlaps.\n",
          info._smallestIID,
          info._largestIID,
          info._numOverlapsTotal);
}
Exemple #12
0
bool
testIndex(char *ovlName,
          bool  doFixes) {
  char name[FILENAME_MAX];
  FILE *I = NULL;
  FILE *F = NULL;

  sprintf(name, "%s/index", ovlName);

  errno = 0;
  I = fopen(name, "r");
  if (errno)
    fprintf(stderr, "ERROR: Failed to open '%s' for reading: %s\n", name, strerror(errno)), exit(1);

  //fprintf(stderr, "TESTING '%s'\n", name);

  if (doFixes) {
    sprintf(name, "%s/index.fixed", ovlName);

    errno = 0;
    F = fopen(name, "w");
    if (errno)
      fprintf(stderr, "ERROR: Failed to open '%s' for writing: %s\n", name, strerror(errno)), exit(1);

    //fprintf(stderr, "WITH FIXES TO '%s'\n", name);
  }

  ovStoreOfft  O;

  uint32  curIID = 0;
  uint32  minIID = UINT32_MAX;
  uint32  maxIID = 0;

  uint32  nErrs = 0;

  while (1 == AS_UTL_safeRead(I, &O, "offset", sizeof(ovStoreOfft), 1)) {
    bool  maxIncreases   = (maxIID < O._a_iid);
    bool  errorDecreased = ((O._a_iid < curIID));
    bool  errorGap       = ((O._a_iid > 0) && (curIID + 1 != O._a_iid));

    if (O._a_iid < minIID)
      minIID = O._a_iid;

    if (maxIncreases)
      maxIID = O._a_iid;

    if (errorDecreased)
      fprintf(stderr, "ERROR: index decreased from "F_U32" to "F_U32"\n", curIID, O._a_iid), nErrs++;
    else if (errorGap)
      fprintf(stderr, "ERROR: gap between "F_U32" and "F_U32"\n", curIID, O._a_iid), nErrs++;

    if ((maxIncreases == true) && (errorGap == false)) {
      if (doFixes)
        AS_UTL_safeWrite(F, &O, "offset", sizeof(ovStoreOfft), 1);

    } else if (O._numOlaps > 0) {
      fprintf(stderr, "ERROR: lost overlaps a_iid "F_U32" fileno "F_U32" offset "F_U32" numOlaps "F_U32"\n",
              O._a_iid, O._fileno, O._offset, O._numOlaps);
    }

    curIID = O._a_iid;
  }

  fclose(I);

  if (F)
    fclose(F);

  return(nErrs == 0);
}
Exemple #13
0
int
main(int argc, char **argv) {
  char     *gkpName = NULL;
  char     *ovsName = NULL;

  char     *finClrName = NULL;
  char     *outClrName = NULL;

  double    errorRate       = 0.06;
  //uint32    minAlignLength  = 40;
  uint32    minReadLength   = 64;

  uint32    idMin = 1;
  uint32    idMax = UINT32_MAX;

  char     *outputPrefix = NULL;
  char      outputName[FILENAME_MAX];

  FILE     *staFile      = NULL;
  FILE     *reportFile   = NULL;
  FILE     *subreadFile  = NULL;

  bool      doSubreadLogging        = true;
  bool      doSubreadLoggingVerbose = false;

  //  Statistics on the trimming - the second set are from the old logging, and don't really apply anymore.

  trimStat  readsIn;                  //  Read is eligible for trimming
  trimStat  deletedIn;                //  Read was deleted already
  trimStat  noTrimIn;                 //  Read not requesting trimming

  trimStat  noOverlaps;               //  no overlaps in store
  trimStat  noCoverage;               //  no coverage after adjusting for trimming done

  trimStat  readsProcChimera;         //  Read was processed for chimera signal
  trimStat  readsProcSpur;            //  Read was processed for spur signal
  trimStat  readsProcSubRead;         //  Read was processed for subread signal

#if 0
  trimStat  badSpur5;
  trimStat  badSpur3;
  trimStat  badChimera;
  trimStat  badSubread;
#endif

  trimStat  readsNoChange;

  trimStat  readsBadSpur5,   basesBadSpur5;
  trimStat  readsBadSpur3,   basesBadSpur3;
  trimStat  readsBadChimera, basesBadChimera;
  trimStat  readsBadSubread, basesBadSubread;

  trimStat  readsTrimmed5;
  trimStat  readsTrimmed3;

#if 0
  trimStat  fullCoverage;             //  fully covered by overlaps
  trimStat  noSignalNoGap;            //  no signal, no gaps
  trimStat  noSignalButGap;           //  no signal, with gaps

  trimStat  bothFixed;                //  both chimera and spur signal trimmed
  trimStat  chimeraFixed;             //  only chimera signal trimmed
  trimStat  spurFixed;                //  only spur signal trimmed

  trimStat  bothDeletedSmall;         //  deleted because of both cimera and spur signals
  trimStat  chimeraDeletedSmall;      //  deleted because of chimera signal
  trimStat  spurDeletedSmall;         //  deleted because of spur signal

  trimStat  spurDetectedNormal;       //  normal spur detected
  trimStat  spurDetectedLinker;       //  linker spur detected

  trimStat  chimeraDetectedInnie;     //  innpue-pair chimera detected
  trimStat  chimeraDetectedOverhang;  //  overhanging chimera detected
  trimStat  chimeraDetectedGap;       //  gap chimera detected
  trimStat  chimeraDetectedLinker;    //  linker chimera detected
#endif

  trimStat  deletedOut;               //  Read was deleted by trimming

  argc = AS_configure(argc, argv);

  int arg=1;
  int err=0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-G") == 0) {
      gkpName = argv[++arg];

    } else if (strcmp(argv[arg], "-O") == 0) {
      ovsName = argv[++arg];

    } else if (strcmp(argv[arg], "-o") == 0) {
      outputPrefix = argv[++arg];

    } else if (strcmp(argv[arg], "-t") == 0) {
      AS_UTL_decodeRange(argv[++arg], idMin, idMax);

    } else if (strcmp(argv[arg], "-Ci") == 0) {
      finClrName = argv[++arg];
    } else if (strcmp(argv[arg], "-Co") == 0) {
      outClrName = argv[++arg];

    } else if (strcmp(argv[arg], "-e") == 0) {
      errorRate = atof(argv[++arg]);

    //} else if (strcmp(argv[arg], "-l") == 0) {
    //  minAlignLength = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-minlength") == 0) {
      minReadLength = atoi(argv[++arg]);

    } else {
      fprintf(stderr, "%s: unknown option '%s'\n", argv[0], argv[arg]);
      err++;
    }
    arg++;
  }

  if (errorRate < 0.0)
    err++;

  if ((gkpName == 0L) || (ovsName == 0L) || (outputPrefix == NULL) || (err)) {
    fprintf(stderr, "usage: %s -G gkpStore -O ovlStore -Ci input.clearFile -Co output.clearFile -o outputPrefix]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "  -G gkpStore    path to read store\n");
    fprintf(stderr, "  -O ovlStore    path to overlap store\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -o name        output prefix, for logging\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -t bgn-end     limit processing to only reads from bgn to end (inclusive)\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -Ci clearFile  path to input clear ranges (NOT SUPPORTED)\n");
    fprintf(stderr, "  -Co clearFile  path to ouput clear ranges\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -e erate       ignore overlaps with more than 'erate' percent error\n");
    //fprintf(stderr, "  -l length      ignore overlaps shorter than 'l' aligned bases (NOT SUPPORTED)\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -minlength l   reads trimmed below this many bases are deleted\n");
    fprintf(stderr, "\n");

    if (errorRate < 0.0)
      fprintf(stderr, "ERROR: Error rate (-e) value %f too small; must be 'fraction error' and above 0.0\n", errorRate);

    exit(1);
  }

  gkStore         *gkp = gkStore::gkStore_open(gkpName);
  ovStore         *ovs = new ovStore(ovsName, gkp);

  clearRangeFile  *finClr = new clearRangeFile(finClrName, gkp);
  clearRangeFile  *outClr = new clearRangeFile(outClrName, gkp);

  if (outClr)
    //  If the outClr file exists, those clear ranges are loaded.  We need to reset them
    //  back to 'untrimmed' for now.
    outClr->reset(gkp);

  if (finClr && outClr)
    //  A finClr file was supplied, so use those as the clear ranges.
    outClr->copy(finClr);


  sprintf(outputName, "%s.log",         outputPrefix);
  errno = 0;
  reportFile  = fopen(outputName, "w");
  if (errno)
    fprintf(stderr, "Failed to open '%s' for writing: %s\n", outputName, strerror(errno)), exit(1);

  sprintf(outputName, "%s.subread.log", outputPrefix);
  errno = 0;
  subreadFile = fopen(outputName, "w");
  if (errno)
    fprintf(stderr, "Failed to open '%s' for writing: %s\n", outputName, strerror(errno)), exit(1);


  uint32      ovlLen = 0;
  uint32      ovlMax = 64 * 1024;
  ovOverlap  *ovl    = ovOverlap::allocateOverlaps(gkp, ovlMax);

  memset(ovl, 0, sizeof(ovOverlap) * ovlMax);

  workUnit *w = new workUnit;


  if (idMin < 1)
    idMin = 1;
  if (idMax > gkp->gkStore_getNumReads())
    idMax = gkp->gkStore_getNumReads();

  fprintf(stderr, "Processing from ID "F_U32" to "F_U32" out of "F_U32" reads, using errorRate = %.2f\n",
          idMin,
          idMax,
          gkp->gkStore_getNumReads(),
          errorRate);

  for (uint32 id=idMin; id<=idMax; id++) {
    gkRead     *read = gkp->gkStore_getRead(id);
    gkLibrary  *libr = gkp->gkStore_getLibrary(read->gkRead_libraryID());

    if (finClr->isDeleted(id)) {
      //  Read already trashed.
      deletedIn += read->gkRead_sequenceLength();
      continue;
    }

    if ((libr->gkLibrary_removeSpurReads()     == false) &&
        (libr->gkLibrary_removeChimericReads() == false) &&
        (libr->gkLibrary_checkForSubReads()    == false)) {
      //  Nothing to do.
      noTrimIn += read->gkRead_sequenceLength();
      continue;
    }

    readsIn += read->gkRead_sequenceLength();


    uint32   nLoaded = ovs->readOverlaps(id, ovl, ovlLen, ovlMax);

    //fprintf(stderr, "read %7u with %7u overlaps\r", id, nLoaded);

    if (nLoaded == 0) {
      //  No overlaps, nothing to check!
      noOverlaps += read->gkRead_sequenceLength();
      continue;
    }

    w->clear(id, finClr->bgn(id), finClr->end(id));
    w->addAndFilterOverlaps(gkp, finClr, errorRate, ovl, ovlLen);

    if (w->adjLen == 0) {
      //  All overlaps trimmed out!
      noCoverage += read->gkRead_sequenceLength();
      continue;
    }

    //  Find bad regions.

    //if (libr->gkLibrary_markBad() == true)
    //  //  From an external file, a list of known bad regions.  If no overlaps span
    //  //  the region with sufficient coverage, mark the region as bad.  This was
    //  //  motivated by the old 454 linker detection.
    //  markBad(gkp, w, subreadFile, doSubreadLoggingVerbose);

    //if (libr->gkLibrary_removeSpurReads() == true) {
    //  readsProcSpur += read->gkRead_sequenceLength();
    //  detectSpur(gkp, w, subreadFile, doSubreadLoggingVerbose);
    //  Get stats on spur region detected - save the length of each region to the trimStats object.
    //}

    //if (libr->gkLibrary_removeChimericReads() == true) {
    //  readsProcChimera += read->gkRead_sequenceLength();
    //  detectChimer(gkp, w, subreadFile, doSubreadLoggingVerbose);
    //  Get stats on chimera region detected - save the length of each region to the trimStats object.
    //}

    if (libr->gkLibrary_checkForSubReads() == true) {
      readsProcSubRead += read->gkRead_sequenceLength();
      detectSubReads(gkp, w, subreadFile, doSubreadLoggingVerbose);
    }

    //  Get stats on the bad regions found.  This kind of duplicates code in trimBadInterval(), but
    //  I don't want to pass all the stats objects into there.

    if (w->blist.size() == 0) {
      readsNoChange += read->gkRead_sequenceLength();
    }

    else {
      uint32  nSpur5   = 0, bSpur5   = 0;
      uint32  nSpur3   = 0, bSpur3   = 0;
      uint32  nChimera = 0, bChimera = 0;
      uint32  nSubread = 0, bSubread = 0;

      for (uint32 bb=0; bb<w->blist.size(); bb++) {
        switch (w->blist[bb].type) {
          case badType_5spur:
            nSpur5        += 1;
            basesBadSpur5 += w->blist[bb].end - w->blist[bb].bgn;
            break;
          case badType_3spur:
            nSpur3        += 1;
            basesBadSpur3 += w->blist[bb].end - w->blist[bb].bgn;
            break;
          case badType_chimera:
            nChimera        += 1;
            basesBadChimera += w->blist[bb].end - w->blist[bb].bgn;
            break;
          case badType_subread:
            nSubread        += 1;
            basesBadSubread += w->blist[bb].end - w->blist[bb].bgn;
            break;
          default:
            break;
        }
      }

      if (nSpur5   > 0)   readsBadSpur5   += nSpur5;
      if (nSpur3   > 0)   readsBadSpur3   += nSpur3;
      if (nChimera > 0)   readsBadChimera += nChimera;
      if (nSubread > 0)   readsBadSubread += nSubread;
    }

    //  Find solution.  This coalesces the list (in 'w') of all the bad regions found, picks out the
    //  largest good region, generates a log of the bad regions that support this decision, and sets
    //  the trim points.

    trimBadInterval(gkp, w, minReadLength, subreadFile, doSubreadLoggingVerbose);

    //  Log the solution.

    AS_UTL_safeWrite(reportFile, w->logMsg, "logMsg", sizeof(char), strlen(w->logMsg));

    //  Save the solution....

    outClr->setbgn(w->id) = w->clrBgn;
    outClr->setend(w->id) = w->clrEnd;

    //  And maybe delete the read.

    if (w->isOK == false) {
      deletedOut += read->gkRead_sequenceLength();

      outClr->setDeleted(w->id);
    }

    //  Update stats on what was trimmed.  The asserts say the clear range didn't expand, and the if
    //  tests if the clear range changed.

    assert(w->clrBgn >= w->iniBgn);
    assert(w->iniEnd >= w->clrEnd);

    if (w->clrBgn > w->iniBgn)
      readsTrimmed5 += w->clrBgn - w->iniBgn;

    if (w->iniEnd > w->clrEnd)
      readsTrimmed3 += w->iniEnd - w->clrEnd;
  }


  delete [] ovl;

  delete    w;

  gkp->gkStore_close();

  delete    finClr;
  delete    outClr;

  //  Close log files

  if (reportFile)
    fclose(reportFile);

  if (subreadFile)
    fclose(subreadFile);

  //  Write the summary

  if (outputPrefix) {
    sprintf(outputName, "%s.stats", outputPrefix);

    errno = 0;
    staFile = fopen(outputName, "w");
    if (errno)
      fprintf(stderr, "Failed to open '%s' for writing: %s\n", outputName, strerror(errno));
  }

  if (staFile == NULL)
    staFile = stdout;

  //  Would like to know number of subreads per read

  fprintf(staFile, "PARAMETERS:\n");
  fprintf(staFile, "----------\n");
  fprintf(staFile, "%7u    (reads trimmed below this many bases are deleted)\n", minReadLength);
  fprintf(staFile, "%7.4f    (use overlaps at or below this fraction error)\n", errorRate);
  //fprintf(staFile, "%7u    (use only overlaps longer than this)\n", minAlignLength);  //  NOT SUPPORTED!
  fprintf(staFile, "INPUT READS:\n");
  fprintf(staFile, "-----------\n");
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (reads processed)\n", readsIn.nReads, readsIn.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (reads not processed, previously deleted)\n", deletedIn.nReads, deletedIn.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (reads not processed, in a library where trimming isn't allowed)\n", noTrimIn.nReads, noTrimIn.nBases);
  fprintf(staFile, "\n");
  fprintf(staFile, "PROCESSED:\n");
  fprintf(staFile, "--------\n");
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (no overlaps)\n", noOverlaps.nReads, noOverlaps.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (no coverage after adjusting for trimming done already)\n", noCoverage.nReads, noCoverage.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (processed for chimera)\n",  readsProcChimera.nReads, readsProcChimera.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (processed for spur)\n",     readsProcSpur.nReads,    readsProcSpur.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (processed for subreads)\n", readsProcSubRead.nReads, readsProcSubRead.nBases);
  fprintf(staFile, "\n");
  fprintf(staFile, "READS WITH SIGNALS:\n");
  fprintf(staFile, "------------------\n");
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" signals (number of 5' spur signal)\n", readsBadSpur5.nReads,   readsBadSpur5.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" signals (number of 3' spur signal)\n", readsBadSpur3.nReads,   readsBadSpur3.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" signals (number of chimera signal)\n", readsBadChimera.nReads, readsBadChimera.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" signals (number of subread signal)\n", readsBadSubread.nReads, readsBadSubread.nBases);
  fprintf(staFile, "\n");
  fprintf(staFile, "SIGNALS:\n");
  fprintf(staFile, "-------\n");
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (size of 5' spur signal)\n", basesBadSpur5.nReads,   basesBadSpur5.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (size of 3' spur signal)\n", basesBadSpur3.nReads,   basesBadSpur3.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (size of chimera signal)\n", basesBadChimera.nReads, basesBadChimera.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (size of subread signal)\n", basesBadSubread.nReads, basesBadSubread.nBases);
  fprintf(staFile, "\n");
  fprintf(staFile, "TRIMMING:\n");
  fprintf(staFile, "--------\n");
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (trimmed from the 5' end of the read)\n", readsTrimmed5.nReads, readsTrimmed5.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (trimmed from the 3' end of the read)\n", readsTrimmed3.nReads, readsTrimmed3.nBases);

#if 0
  fprintf(staFile, "DELETED:\n");
  fprintf(staFile, "-------\n");
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (deleted because of both cimera and spur signals)\n", bothDeletedSmall.nReads, bothDeletedSmall.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (deleted because of chimera signal)\n", chimeraDeletedSmall.nReads, chimeraDeletedSmall.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (deleted because of spur signal)\n", spurDeletedSmall.nReads, spurDeletedSmall.nBases);
  fprintf(staFile, "\n");
  fprintf(staFile, "SPUR TYPES:\n");
  fprintf(staFile, "----------\n");
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (normal spur detected)\n", spurDetectedNormal.nReads, spurDetectedNormal.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (linker spur detected)\n", spurDetectedLinker.nReads, spurDetectedLinker.nBases);
  fprintf(staFile, "\n");
  fprintf(staFile, "CHIMERA TYPES:\n");
  fprintf(staFile, "-------------\n");
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (innie-pair chimera detected)\n", chimeraDetectedInnie.nReads, chimeraDetectedInnie.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (overhanging chimera detected)\n", chimeraDetectedOverhang.nReads, chimeraDetectedOverhang.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (gap chimera detected)\n", chimeraDetectedGap.nReads, chimeraDetectedGap.nBases);
  fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (linker chimera detected)\n", chimeraDetectedLinker.nReads, chimeraDetectedLinker.nBases);
#endif

  //  INPUT READS  = ACCEPTED + TRIMMED + DELETED
  //  SPUR TYPE    = TRIMMED and DELETED spur and both categories
  //  CHIMERA TYPE = TRIMMED and DELETED chimera and both categories

  if (staFile != stdout)
    fclose(staFile);

  exit(0);
}
Exemple #14
0
int
main(int argc, char **argv) {
  coParameters  *G = new coParameters();

  argc = AS_configure(argc, argv);

  int arg = 1;
  int err = 0;
  while (arg < argc) {
    if        (strcmp(argv[arg], "-G") == 0) {
      G->gkpStorePath = argv[++arg];

    } else if (strcmp(argv[arg], "-R") == 0) {
      G->bgnID = atoi(argv[++arg]);
      G->endID = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-O") == 0) {  //  -F?  -S Olap_Path
      G->ovlStorePath = argv[++arg];

    } else if (strcmp(argv[arg], "-e") == 0) {
      G->errorRate = atof(argv[++arg]);

    } else if (strcmp(argv[arg], "-l") == 0) {
      G->minOverlap = atoi(argv[++arg]);

    } else if (strcmp(argv[arg], "-c") == 0) {  //  For 'corrections' file input
      G->correctionsName = argv[++arg];

    } else if (strcmp(argv[arg], "-o") == 0) {  //  For 'erates' output
      G->eratesName = argv[++arg];

    } else if (strcmp(argv[arg], "-t") == 0) {  //  But we're not threaded!
      G->numThreads = atoi(argv[++arg]);

    } else {
      err++;
    }

    arg++;
  }

  if (G->gkpStorePath == NULL)
    fprintf(stderr, "ERROR: no input gatekeeper store (-G) supplied.\n"), err++;
  if (G->ovlStorePath == NULL)
    fprintf(stderr, "ERROR: no input overlap store (-O) supplied.\n"), err++;
  if (G->correctionsName == NULL)
    fprintf(stderr, "ERROR: no input read corrections file (-c) supplied.\n"), err++;
  if (G->eratesName == NULL)
    fprintf(stderr, "ERROR: no output erates file (-o) supplied.\n"), err++;


  if (err) {
    fprintf(stderr, "USAGE:  %s [-d <dna-file>] [-o <ovl_file>] [-q <quality>]\n", argv[0]);
    fprintf(stderr, "            [-x <del_file>] [-F OlapFile] [-S OlapStore]\n");
    fprintf(stderr, "            [-c <cgb_file>] [-e <erate_file>\n");
    fprintf(stderr, "           <gkpStore> <CorrectFile> <lo> <hi>\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "Recalculates overlaps for frags  <lo> .. <hi>  in\n");
    fprintf(stderr, " <gkpStore>  using corrections in  <CorrectFile> \n");
    fprintf(stderr, "\n");
    fprintf(stderr, "Options:\n");
    fprintf(stderr, "-e <erate-file>  specifies binary file to dump corrected erates to\n");
    fprintf(stderr, "                 for later updating of olap store by  update-erates \n");
    fprintf(stderr, "-F             specify file of sorted overlaps to use (in the format\n");
    fprintf(stderr, "               produced by  get-olaps\n");
    fprintf(stderr, "-o <ovl_file>  specifies name of file to which OVL messages go\n");
    fprintf(stderr, "-q <quality>   overlaps less than this error rate are\n");
    fprintf(stderr, "               automatically output\n");
    fprintf(stderr, "-S             specify the binary overlap store containing overlaps to use\n");
    exit(1);
  }

  //fprintf (stderr, "Quality Threshold = %.2f%%\n", 100.0 * Quality_Threshold);

  //
  //  Initialize Globals
  //

  fprintf(stderr, "Initializing.\n");

  double MAX_ERRORS = 1 + (uint32)(G->errorRate * AS_MAX_READLEN);

  Initialize_Match_Limit(G->Edit_Match_Limit, G->errorRate, MAX_ERRORS);

  for (int32 i=0;  i <= AS_MAX_READLEN;  i++)
    G->Error_Bound[i] = (int)ceil(i * G->errorRate);

  //
  //
  //

  fprintf(stderr, "Opening gkpStore '%s'.\n", G->gkpStorePath);

  gkStore *gkpStore = gkStore::gkStore_open(G->gkpStorePath);

  if (G->bgnID < 1)
    G->bgnID = 1;

  if (gkpStore->gkStore_getNumReads() < G->endID)
    G->endID = gkpStore->gkStore_getNumReads();

  //  Load the reads for the overlaps we are going to be correcting, and apply corrections to them

  fprintf(stderr, "Correcting reads "F_U32" to "F_U32".\n", G->bgnID, G->endID);

  Correct_Frags(G, gkpStore);

  //  Load overlaps we're going to correct

  fprintf(stderr, "Loading overlaps.\n");

  Read_Olaps(G, gkpStore);

  //  Now sort them on the B iid.

  fprintf(stderr, "Sorting overlaps.\n");

#ifdef _GLIBCXX_PARALLEL
  __gnu_sequential::sort(G->olaps, G->olaps + G->olapsLen, Olap_Info_t_by_bID());
#else
  sort(G->olaps, G->olaps + G->olapsLen, Olap_Info_t_by_bID());
#endif

  //  Recompute overlaps

  fprintf(stderr, "Recomputing overlaps.\n");

  Redo_Olaps(G, gkpStore);

  gkpStore->gkStore_close();
  gkpStore = NULL;

  //  Sort the overlaps back into the original order

  fprintf(stderr, "Sorting overlaps.\n");

#ifdef _GLIBCXX_PARALLEL
  __gnu_sequential::sort(G->olaps, G->olaps + G->olapsLen, Olap_Info_t_by_Order());
#else
  sort(G->olaps, G->olaps + G->olapsLen, Olap_Info_t_by_Order());
#endif

  //  Dump the new erates

  fprintf (stderr, "Saving corrected error rates to file %s\n", G->eratesName);

  {
    errno = 0;
    FILE *fp = fopen(G->eratesName, "w");
    if (errno)
      fprintf(stderr, "Failed to open '%s': %s\n", G->eratesName, strerror(errno)), exit(1);

    AS_UTL_safeWrite(fp, &G->bgnID,    "loid", sizeof(int32),  1);
    AS_UTL_safeWrite(fp, &G->endID,    "hiid", sizeof(int32),  1);
    AS_UTL_safeWrite(fp, &G->olapsLen, "num",  sizeof(uint64), 1);

    fprintf(stderr, "--Allocate "F_U64" MB for output error rates.\n",
            (sizeof(uint16) * G->olapsLen) >> 20);

    uint16 *evalue = new uint16 [G->olapsLen];

    for (int32 i=0; i<G->olapsLen; i++)
      evalue[i] = G->olaps[i].evalue;

    AS_UTL_safeWrite(fp, evalue, "evalue", sizeof(uint16), G->olapsLen);

    delete [] evalue;

    fclose(fp);
  }

  //  Finished.

  //fprintf (stderr, "%d/%d failed/total alignments (%.1f%%)\n",
  //         Failed_Alignments_Ct, Total_Alignments_Ct,
  //         Total_Alignments_Ct == 0 ? 0.0 : (100.0 * Failed_Alignments_Ct) / Total_Alignments_Ct);

  delete G;

  fprintf(stderr, "DONE.\n");

  exit(0);
}
Exemple #15
0
//  Splits the read and blob data of this store into per-partition files under
//  '<storePath>/partitions/'.
//
//  partitionMap is indexed by read ID, [0] through [numReads]:
//    - partitionMap[0] must be UINT32_MAX (there is no zeroth read).
//    - UINT32_MAX means the read is not assigned to any partition; it is skipped.
//    - Partition 0 is not allowed.
//
//  For each partition p in 1..maxPartition this writes 'blobs.NNNN' and 'reads.NNNN', and then a
//  'map' file holding, in order: maxPartition, the number of reads in each partition,
//  partitionMap itself, and the index of each read within its partition file.
//
//  The store must be opened read-only and must not already be partitioned.  Any failure to open
//  an output file is fatal (exit(1)).
void
gkStore::gkStore_buildPartitions(uint32 *partitionMap) {
  char              name[FILENAME_MAX];

  //  Store cannot be partitioned already, and it must be readOnly (for safety) as we don't need to
  //  be changing any of the normal store data.

  assert(_numberOfPartitions == 0);
  assert(_mode               == gkStore_readOnly);

  //  Figure out what the last partition is

  uint32  maxPartition = 0;
  uint32  unPartitioned = 0;

  assert(partitionMap[0] == UINT32_MAX);

  for (uint32 fi=1; fi<=gkStore_getNumReads(); fi++) {
    if (partitionMap[fi] == UINT32_MAX)
      unPartitioned++;

    else if (maxPartition < partitionMap[fi])
      maxPartition = partitionMap[fi];
  }

  fprintf(stderr, "Found " F_U32 " unpartitioned reads and maximum partition of " F_U32 "\n",
          unPartitioned, maxPartition);

  //  Create the partitions by opening N copies of the data stores,
  //  and writing data to each.

  FILE         **blobfiles    = new FILE * [maxPartition + 1];
  uint64        *blobfileslen = new uint64 [maxPartition + 1];            //  Offset, in bytes, into the blobs file
  FILE         **readfiles    = new FILE * [maxPartition + 1];
  uint32        *readfileslen = new uint32 [maxPartition + 1];            //  aka _readsPerPartition
  uint32        *readIDmap    = new uint32 [gkStore_getNumReads() + 1];   //  aka _readIDtoPartitionIdx

  //  Reads that are not in any partition never get an index assigned below, but the whole array
  //  is written to disk.  Mark every entry as bogus up front so no uninitialized memory is saved.
  //  This also covers the zeroth read, which does not exist.

  for (uint32 fi=0; fi<=gkStore_getNumReads(); fi++)
    readIDmap[fi] = UINT32_MAX;

  //  Be nice and put all the partitions in a subdirectory.

  snprintf(name, sizeof(name), "%s/partitions", _storePath);

  if (AS_UTL_fileExists(name, true, true) == false)
    AS_UTL_mkdir(name);

  //  Open all the output files -- fail early if we can't open that many files.
  //  fopen() reports failure by returning NULL; errno is only meaningful after that.

  blobfiles[0]    = NULL;
  blobfileslen[0] = UINT64_MAX;
  readfiles[0]    = NULL;
  readfileslen[0] = UINT32_MAX;

  for (uint32 i=1; i<=maxPartition; i++) {
    snprintf(name, sizeof(name), "%s/partitions/blobs.%04u", _storePath, i);

    blobfiles[i]    = fopen(name, "w");
    blobfileslen[i] = 0;

    if (blobfiles[i] == NULL) {
      fprintf(stderr, "gkStore::gkStore_buildPartitions()-- ERROR: failed to open partition %u file '%s' for write: %s\n",
              i, name, strerror(errno));
      exit(1);
    }

    snprintf(name, sizeof(name), "%s/partitions/reads.%04u", _storePath, i);

    readfiles[i]    = fopen(name, "w");
    readfileslen[i] = 0;

    if (readfiles[i] == NULL) {
      fprintf(stderr, "gkStore::gkStore_buildPartitions()-- ERROR: failed to open partition %u file '%s' for write: %s\n",
              i, name, strerror(errno));
      exit(1);
    }
  }

  //  Open the output partition map file -- we might as well fail early if we can't make it also.

  snprintf(name, sizeof(name), "%s/partitions/map", _storePath);

  FILE *rIDmF = fopen(name, "w");
  if (rIDmF == NULL) {
    fprintf(stderr, "gkStore::gkStore_buildPartitions()-- ERROR: failed to open partition map file '%s': %s\n",
            name, strerror(errno));
    exit(1);
  }

  //  Copy the blob from the master file to the partitioned file, update pointers.

  for (uint32 fi=1; fi<=gkStore_getNumReads(); fi++) {
    uint32  pi = partitionMap[fi];

    assert(pi != 0);  //  No zeroth partition, right?

    if (pi == UINT32_MAX)
      //  Deleted reads are not assigned a partition; skip them
      continue;

    //  Make a copy of the read, then modify it for the partition, then write it to the partition.
    //  Without the copy, we'd need to update the master record too.

    gkRead  partRead = _reads[fi];  //*gkStore_getRead(fi);

    partRead.gkRead_copyDataToPartition(_blobs, blobfiles, blobfileslen, pi);

#if 1
    fprintf(stderr, "read " F_U32 "=" F_U32 " len " F_U32 " -- blob master " F_U64 " -- to part " F_U32 " new read id " F_U32 " blob " F_U64 "/" F_U64 " -- at readIdx " F_U32 "\n",
            fi, _reads[fi].gkRead_readID(), _reads[fi].gkRead_sequenceLength(),
            _reads[fi]._mPtr,
            pi,
            partRead.gkRead_readID(), partRead._pID, partRead._mPtr,
            readfileslen[pi]);
#endif

    AS_UTL_safeWrite(readfiles[pi], &partRead, "gkStore::gkStore_buildPartitions::read", sizeof(gkRead), 1);

    //  The read's index within its partition is the number of reads written there before it.

    readIDmap[fi] = readfileslen[pi]++;
  }

  //  Save the map.  All arrays include the (bogus) zeroth element.

  AS_UTL_safeWrite(rIDmF, &maxPartition,  "gkStore::gkStore_buildPartitions::maxPartition", sizeof(uint32), 1);
  AS_UTL_safeWrite(rIDmF,  readfileslen,  "gkStore::gkStore_buildPartitions::readfileslen", sizeof(uint32), maxPartition + 1);
  AS_UTL_safeWrite(rIDmF,  partitionMap,  "gkStore::gkStore_buildPartitions::partitionMap", sizeof(uint32), gkStore_getNumReads() + 1);
  AS_UTL_safeWrite(rIDmF,  readIDmap,     "gkStore::gkStore_buildPartitions::readIDmap",    sizeof(uint32), gkStore_getNumReads() + 1);

  //  cleanup -- close all the files, delete storage.  A failed close is only warned about; it is
  //  detected from the return value of fclose(), not from errno alone.

  if (fclose(rIDmF) != 0)
    fprintf(stderr, "  warning: %s\n", strerror(errno));

  for (uint32 i=1; i<=maxPartition; i++) {
    fprintf(stderr, "partition " F_U32 " has " F_U32 " reads\n", i, readfileslen[i]);

    if (fclose(blobfiles[i]) != 0)
      fprintf(stderr, "  warning: %s\n", strerror(errno));

    if (fclose(readfiles[i]) != 0)
      fprintf(stderr, "  warning: %s\n", strerror(errno));
  }

  delete [] readIDmap;
  delete [] readfileslen;
  delete [] readfiles;
  delete [] blobfileslen;
  delete [] blobfiles;
}
Exemple #16
0
//  Opens 'path' for writing on behalf of the gkStore destructor.  On failure, reports the reason
//  and exits with status 1.  fopen() signals failure by returning NULL; errno is only meaningful
//  after that, so the return value is what gets tested.
static
FILE *
gkStore_openForWriteOrDie(const char *path) {
  FILE *F = fopen(path, "w");

  if (F == NULL) {
    fprintf(stderr, "gkStore::~gkStore()-- failed to open '%s' for writing: %s\n",
            path, strerror(errno));
    exit(1);
  }

  return(F);
}


//  Releases the store.
//
//  For both the libraries and the reads: if the data is memory mapped, the mapping is released;
//  otherwise, if an in-core array exists, it is written to '<path>/libraries' or '<path>/reads'
//  and then freed.  If either array was written, '<path>/info' and '<path>/info.txt' are
//  rewritten too.  Finally the blob mapping, blob file and partition lookup arrays are released.
//
//  Failing to open any of the output files is fatal (exit(1)).
gkStore::~gkStore() {
  char   N[FILENAME_MAX];
  FILE  *F;

  //  Should check that inf on disk is the same as inf in memory, and update if needed.

  bool   needsInfoUpdate = false;

  //  Write N+1 because we write, but don't count, the [0] element.

  if        (_librariesMMap) {
    delete _librariesMMap;

  } else if (_libraries) {
    snprintf(N, sizeof(N), "%s/libraries", gkStore_path());

    F = gkStore_openForWriteOrDie(N);
    AS_UTL_safeWrite(F, _libraries, "libraries", sizeof(gkLibrary), gkStore_getNumLibraries() + 1);
    fclose(F);

    delete [] _libraries;

    needsInfoUpdate = true;
  }


  if        (_readsMMap) {
    delete _readsMMap;

  } else if (_reads) {
    snprintf(N, sizeof(N), "%s/reads", gkStore_path());

    F = gkStore_openForWriteOrDie(N);
    AS_UTL_safeWrite(F, _reads, "reads", sizeof(gkRead), gkStore_getNumReads() + 1);
    fclose(F);

    delete [] _reads;

    needsInfoUpdate = true;
  }


  //  The info is saved twice: once as the binary struct, once as text.

  if (needsInfoUpdate) {
    snprintf(N, sizeof(N), "%s/info", gkStore_path());

    F = gkStore_openForWriteOrDie(N);
    AS_UTL_safeWrite(F, &_info, "info", sizeof(gkStoreInfo), 1);
    fclose(F);


    snprintf(N, sizeof(N), "%s/info.txt", gkStore_path());

    F = gkStore_openForWriteOrDie(N);
    _info.writeInfoAsText(F);
    fclose(F);
  }


  if (_blobsMMap)
    delete _blobsMMap;

  if (_blobsFile)
    fclose(_blobsFile);

  delete [] _readIDtoPartitionIdx;
  delete [] _readIDtoPartitionID;
  delete [] _readsPerPartition;
}