void gkRead::gkRead_copyDataToPartition(void *blobs, FILE **partfiles, uint64 *partfileslen, uint32 partID) { // Stash away the location of the partitioned data assert(partfileslen[partID] == AS_UTL_ftell(partfiles[partID])); // Figure out where the blob actually is, and make sure that it really is a blob uint8 *blob = (uint8 *)blobs + _mPtr; uint32 blobLen = 8 + *((uint32 *)blob + 1); assert(blob[0] == 'B'); assert(blob[1] == 'L'); assert(blob[2] == 'O'); assert(blob[3] == 'B'); // Write the blob to the partition, update the length of the partition AS_UTL_safeWrite(partfiles[partID], blob, "gkRead::gkRead_copyDataToPartition::blob", sizeof(char), blobLen); // Update the read to the new location of the blob in the partitioned data. _mPtr = partfileslen[partID]; _pID = partID; // And finalize by remembering the length. partfileslen[partID] += blobLen; assert(partfileslen[partID] == AS_UTL_ftell(partfiles[partID])); }
void FragmentInfo::save(const char *prefix) { char name[FILENAME_MAX]; sprintf(name, "%s.fragmentInfo", prefix); errno = 0; FILE *file = fopen(name, "w"); if (errno) { writeLog("FragmentInfo()-- Failed to open '%s' for writing: %s\n", name, strerror(errno)); writeLog("FragmentInfo()-- Will not save fragment information to cache.\n"); return; } writeLog("FragmentInfo()-- Saving fragment information to cache '%s'\n", name); AS_UTL_safeWrite(file, &fiMagicNumber, "fragmentInformationMagicNumber", sizeof(uint64), 1); AS_UTL_safeWrite(file, &fiVersionNumber, "fragmentInformationMagicNumber", sizeof(uint64), 1); AS_UTL_safeWrite(file, &_numFragments, "fragmentInformationNumFrgs", sizeof(uint32), 1); AS_UTL_safeWrite(file, &_numLibraries, "fragmentInformationNumLibs", sizeof(uint32), 1); AS_UTL_safeWrite(file, _fragLength, "fragmentInformationFragLen", sizeof(uint32), _numFragments + 1); AS_UTL_safeWrite(file, _libIID, "fragmentInformationLibIID", sizeof(uint32), _numFragments + 1); AS_UTL_safeWrite(file, _numFragsInLib, "fragmentInformationNumFrgsInLib", sizeof(uint32), _numLibraries + 1); fclose(file); }
void ovStore::ovStore_write(void) { AS_UTL_mkdir(_storePath); char name[FILENAME_MAX]; sprintf(name, "%s/info", _storePath); // If the ovs file exists, AND has a valid magic number, then the store is complete and we should // abort before the valid store is destroyed. if (AS_UTL_fileExists(name, false, false)) { errno = 0; FILE *ovsinfo = fopen(name, "r"); if (errno) { fprintf(stderr, "ERROR: failed to read store metadata from '%s': %s\n", name, strerror(errno)); exit(1); } AS_UTL_safeRead(ovsinfo, &_info, "ovStore::ovStore::testinfo", sizeof(ovStoreInfo), 1); fclose(ovsinfo); if (_info._ovsMagic == ovStoreMagic) fprintf(stderr, "ERROR: overlapStore '%s' is a valid overlap store, will not overwrite.\n", _storePath), exit(1); } // Create a new incomplete info file. errno = 0; FILE *ovsinfo = fopen(name, "w"); if (errno) fprintf(stderr, "failed to create overlap store '%s': %s\n", _storePath, strerror(errno)), exit(1); AS_UTL_safeWrite(ovsinfo, &_info, "ovStore::ovStore::saveinfo", sizeof(ovStoreInfo), 1); fclose(ovsinfo); sprintf(name, "%s/index", _storePath); errno = 0; _offtFile = fopen(name, "w"); if (errno) fprintf(stderr, "AS_OVS_createOverlapStore()-- failed to open offset file '%s': %s\n", name, strerror(errno)), exit(1); _overlapsThisFile = 0; _currentFileIndex = 0; _bof = NULL; }
// Dump a block of encoded data to disk, then update the gkRead to point to it. // void gkStore::gkStore_stashReadData(gkRead *read, gkReadData *data) { assert(_blobsFile != NULL); read->_mPtr = AS_UTL_ftell(_blobsFile); read->_pID = _partitionID; // 0 if not partitioned //fprintf(stderr, "STASH read %u at position "F_SIZE_T"\n", read->gkRead_readID(), AS_UTL_ftell(_blobsFile)); AS_UTL_safeWrite(_blobsFile, data->_blob, "gkStore_stashReadData::blob", sizeof(char), data->_blobLen); }
void BestOverlapGraph::save(const char *prefix, double AS_UTG_ERROR_RATE, double AS_UTG_ERROR_LIMIT) { char name[FILENAME_MAX]; sprintf(name, "%s.bog", prefix); assert(_best5score == NULL); assert(_best3score == NULL); assert(_bestCscore == NULL); errno = 0; FILE *file = fopen(name, "w"); if (errno) { fprintf(logFile, "BestOverlapGraph-- Failed to open '%s' for writing: %s\n", name, strerror(errno)); fprintf(logFile, "BestOverlapGraph-- Will not save best overlap graph to cache: "F_STR"\n", strerror(errno)); return; } fprintf(logFile, "BestOverlapGraph()-- Saving overlap graph to '%s'.\n", name); AS_UTL_safeWrite(file, &ogMagicNumber, "magicnumber", sizeof(uint64), 1); AS_UTL_safeWrite(file, &ogVersionNumber, "versionnumber", sizeof(uint64), 1); AS_UTL_safeWrite(file, &AS_UTG_ERROR_RATE, "errorRate", sizeof(double), 1); AS_UTL_safeWrite(file, &AS_UTG_ERROR_LIMIT, "errorLimit", sizeof(double), 1); AS_UTL_safeWrite(file, _best5, "best overlaps 5", sizeof(BestEdgeOverlap), FI->numFragments() + 1); AS_UTL_safeWrite(file, _best3, "best overlaps 3", sizeof(BestEdgeOverlap), FI->numFragments() + 1); AS_UTL_safeWrite(file, _bestC, "best contains C", sizeof(BestContainment), FI->numFragments() + 1); for (uint32 i=0; i<FI->numFragments() + 1; i++) if (_bestC[i].olaps != NULL) AS_UTL_safeWrite(file, _bestC[i].olaps, "best contains olaps", sizeof(uint32), _bestC[i].olapsLen); fclose(file); }
void writeOverlaps(char *storePath, ovOverlap *ovls, uint64 ovlsLen, uint32 fileID) { char name[FILENAME_MAX]; uint32 currentFileIndex = fileID; uint64 overlapsThisFile = 0; ovStoreInfo info; info._ovsMagic = 1; info._ovsVersion = ovStoreVersion; info._smallestIID = UINT64_MAX; info._largestIID = 0; info._numOverlapsTotal = 0; info._highestFileIndex = 0; info._maxReadLenInBits = AS_MAX_READLEN_BITS; ovStoreOfft offt; ovStoreOfft offm; offt._a_iid = offm._a_iid = ovls[0].a_iid; offt._fileno = offm._fileno = fileID; offt._offset = offm._offset = 0; offt._numOlaps = offm._numOlaps = 0; // Create the output file sprintf(name, "%s/%04d", storePath, fileID); ovFile *bof = new ovFile(name, ovFileNormalWrite); // Create the index file sprintf(name,"%s/%04d.index", storePath, fileID); errno = 0; FILE *offtFile=fopen(name,"w"); if (errno) fprintf(stderr, "ERROR: Failed to open '%s' for writing: %s\n", name, strerror(errno)), exit(1); // Dump the overlaps fprintf(stderr, "Writing "F_U64" overlaps.\n", ovlsLen); for (uint64 i=0; i<ovlsLen; i++ ) { bof->writeOverlap(ovls + i); if (offt._a_iid > ovls[i].a_iid) { fprintf(stderr, "LAST: a:"F_U32"\n", offt._a_iid); fprintf(stderr, "THIS: a:"F_U32" b:"F_U32"\n", ovls[i].a_iid, ovls[i].b_iid); } assert(offt._a_iid <= ovls[i].a_iid); info._smallestIID = MIN(info._smallestIID, ovls[i].a_iid); info._largestIID = MAX(info._largestIID, ovls[i].a_iid); // Put the index to disk, filling any gaps if ((offt._numOlaps != 0) && (offt._a_iid != ovls[i].a_iid)) { while (offm._a_iid < offt._a_iid) { offm._fileno = offt._fileno; offm._offset = offt._offset; offm._numOlaps = 0; AS_UTL_safeWrite(offtFile, &offm, "AS_OVS_writeOverlapToStore offt", sizeof(ovStoreOfft), 1); offm._a_iid++; } // One more, since this iid is not offm -- we write it next! 
offm._a_iid++; AS_UTL_safeWrite(offtFile, &offt, "AS_OVS_writeOverlapToStore offt", sizeof(ovStoreOfft), 1); offt._numOlaps = 0; } // Update the index if this is the first overlap for this a_iid if (offt._numOlaps == 0) { offt._a_iid = ovls[i].a_iid; offt._fileno = currentFileIndex; offt._offset = overlapsThisFile; } offt._numOlaps++; info._numOverlapsTotal++; overlapsThisFile++; } // Close the output file. delete bof; // Write the final index entries. while (offm._a_iid < offt._a_iid) { offm._fileno = offt._fileno; offm._offset = offt._offset; offm._numOlaps = 0; AS_UTL_safeWrite(offtFile, &offm, "AS_OVS_writeOverlapToStore offt", sizeof(ovStoreOfft), 1); offm._a_iid++; } AS_UTL_safeWrite(offtFile, &offt, "AS_OVS_writeOverlapToStore offt", sizeof(ovStoreOfft), 1); fclose(offtFile); // In the nasty case that there were no overlaps in this slice, set meaningful smallest and // largest. Well, at least, set non-nonsense smallest and largest. if (overlapsThisFile == 0) { info._smallestIID = 0; info._largestIID = 0; } // Write the info, and some stats for the user. sprintf(name,"%s/%04d.info", storePath, fileID); errno = 0; FILE *F = fopen(name, "w"); if (errno) fprintf(stderr, "ERROR: Failed to open '%s' for writing: %s\n", name, strerror(errno)), exit(1); AS_UTL_safeWrite(F, &info, "Partition ovs file", sizeof(ovStoreInfo), 1); fclose(F); fprintf(stderr, "Wrote "F_U64" overlaps into '%s'\n", info._numOverlapsTotal, name); fprintf(stderr, " Smallest "F_U64"\n", info._smallestIID); fprintf(stderr, " Largest "F_U64"\n", info._largestIID); }
//  Store new evalues for a range of reads in '<store>/evalues'.
//
//    bgnID, endID - range of reads the evalues belong to; passed to setRange()
//    evalues      - the new values, one uint16 per overlap
//    evaluesLen   - number of values in 'evalues'
//
//  The evalues file holds one uint16 per overlap in the store.  A file of the
//  wrong size is removed, a missing file is created filled with zeros, and the
//  file is then memory mapped read-write and left mapped for later calls.
//  The values are copied in starting at _offt._overlapID, as set by setRange().
//
//  NOTE(review): nothing here checks that _offt._overlapID + evaluesLen stays
//  within _info._numOverlapsTotal; the caller is presumably responsible.
//
void
ovStore::addEvalues(uint32 bgnID, uint32 endID, uint16 *evalues, uint64 evaluesLen) {
  char  name[FILENAME_MAX];

  snprintf(name, FILENAME_MAX, "%s/evalues", _storePath);

  //  If we have an opened memory mapped file, and it isn't open for writing, close it.

  if ((_evaluesMap) && (_evaluesMap->type() == memoryMappedFile_readOnly)) {
    fprintf(stderr, "WARNING: closing read-only evalues file.\n");
    delete _evaluesMap;

    _evaluesMap = NULL;
    _evalues    = NULL;
  }

  //  Remove a bogus evalues file if one exists.

  if ((AS_UTL_fileExists(name) == true) &&
      (AS_UTL_sizeOfFile(name) != (sizeof(uint16) * _info._numOverlapsTotal))) {
    fprintf(stderr, "WARNING: existing evalues file is incorrect size: should be " F_U64 " bytes, is " F_U64 " bytes. Removing.\n",
            (sizeof(uint16) * _info._numOverlapsTotal), AS_UTL_sizeOfFile(name));
    AS_UTL_unlink(name);
  }

  //  Make a new evalues file if one doesn't exist.

  if (AS_UTL_fileExists(name) == false) {
    fprintf(stderr, "Creating evalues file for " F_U64 " overlaps.\r", _info._numOverlapsTotal);

    errno = 0;
    FILE *F = fopen(name, "w");

    //  Test the handle, not errno; fopen() may leave errno set even on success.
    if (F == NULL) {
      fprintf(stderr, "Failed to make evalues file '%s': %s\n", name, strerror(errno));
      exit(1);
    }

    //  Write zeros, a block at a time, until every overlap has a value.

    const uint64   blockLen = 1048576;
    uint16        *Z        = new uint16 [blockLen];
    uint64         Zn       = 0;

    memset(Z, 0, sizeof(uint16) * blockLen);

    while (Zn < _info._numOverlapsTotal) {
      uint64  S = (Zn + blockLen < _info._numOverlapsTotal) ? blockLen : _info._numOverlapsTotal - Zn;

      AS_UTL_safeWrite(F, Z, "zero evalues", sizeof(uint16), S);

      Zn += S;

      fprintf(stderr, "Creating evalues file for " F_U64 " overlaps....%07.3f%%\r",
              _info._numOverlapsTotal, 100.0 * Zn / _info._numOverlapsTotal);
    }

    //  An empty store is trivially complete; don't report 0/0.
    double  percentDone = (_info._numOverlapsTotal > 0) ? (100.0 * Zn / _info._numOverlapsTotal) : 100.0;

    fprintf(stderr, "Creating evalues file for " F_U64 " overlaps....%07.3f%%\n",
            _info._numOverlapsTotal, percentDone);

    delete [] Z;   //  Was leaked on every file creation.

    fclose(F);
  }

  //  Open the evalues file if it isn't already opened

  if (_evalues == NULL) {
    _evaluesMap = new memoryMappedFile(name, memoryMappedFile_readWrite);
    _evalues    = (uint16 *)_evaluesMap->get(0);
  }

  //  Figure out the overlap ID for the first overlap associated with bgnID

  setRange(bgnID, endID);

  //  Load the evalues from 'evalues'

  for (uint64 ii=0; ii<evaluesLen; ii++)
    _evalues[_offt._overlapID + ii] = evalues[ii];

  //  That's it.  Deleting the ovStore object will close the memoryMappedFile.  It's left open
  //  for more updates.
}
//  Build the index entries for a block of overlaps that make up one data file.
//
//    overlap             - the overlaps, in non-decreasing a_iid order (asserted)
//    maxOverlapsThisFile - number of overlaps in 'overlap'
//
//  Advances to the next file index, then updates the store's smallest and
//  largest IID, the overlap count, and the on-disk index.  Only the index is
//  written here; no overlap records are written by this function.
//
void
ovStore::writeOverlap(ovOverlap *overlap, uint32 maxOverlapsThisFile) {

  assert(_isOutput == TRUE);

  _currentFileIndex++;
  _overlapsThisFile = 0;

  for (uint64 i=0; i < maxOverlapsThisFile; i++) {

    //  All overlaps will be sorted by a_iid
    if (_offt._a_iid > overlap[i].a_iid) {
      fprintf(stderr, "LAST:  a:" F_U32 "\n", _offt._a_iid);
      fprintf(stderr, "THIS:  a:" F_U32 " b:" F_U32 "\n", overlap[i].a_iid, overlap[i].b_iid);
    }

    assert(_offt._a_iid <= overlap[i].a_iid);

    if (_info._smallestIID > overlap[i].a_iid)
      _info._smallestIID = overlap[i].a_iid;
    if (_info._largestIID < overlap[i].a_iid)
      _info._largestIID = overlap[i].a_iid;

    //  Put the index to disk, filling any gaps

    if ((_offt._numOlaps != 0) && (_offt._a_iid != overlap[i].a_iid)) {
      while (_offm._a_iid < _offt._a_iid) {
        _offm._fileno   = _offt._fileno;
        _offm._offset   = _offt._offset;
        _offm._numOlaps = 0;

        AS_UTL_safeWrite(_offtFile, &_offm, "AS_OVS_writeOverlapToStore offset", sizeof(ovStoreOfft), 1);

        _offm._a_iid++;
      }

      //  One more, since this iid is not missing -- we write it next!
      _offm._a_iid++;

      AS_UTL_safeWrite(_offtFile, &_offt, "AS_OVS_writeOverlapToStore offset", sizeof(ovStoreOfft), 1);

      _offt._numOlaps = 0;
    }

    //  Update the index if this is the first overlap for this a_iid

    if (_offt._numOlaps == 0) {
      _offt._a_iid     = overlap[i].a_iid;
      _offt._fileno    = _currentFileIndex;
      _offt._offset    = _overlapsThisFile;
      _offt._overlapID = _info._numOverlapsTotal;
    }

    _offt._numOlaps++;

    _info._numOverlapsTotal++;

    _overlapsThisFile++;
  }

  fprintf(stderr,"Done building index for dumpfile %d.\n",_currentFileIndex);
}
//  Add a single overlap to the store.
//
//  Overlaps must be added in non-decreasing a_iid order (asserted).  The
//  overlap is appended to the current data file; a new data file is started
//  when none is open or the current one has reached its size limit.  The
//  index record of the previous a_iid, and empty records for any reads skipped
//  over, are written to the offset file when the a_iid changes.
//
void
ovStore::writeOverlap(ovOverlap *overlap) {

  assert(_isOutput == TRUE);

  if (_offt._a_iid > overlap->a_iid) {
    //  Woah!  The last overlap we saw is bigger than the one we have now?!
    fprintf(stderr, "LAST:  a:" F_U32 "\n", _offt._a_iid);
    fprintf(stderr, "THIS:  a:" F_U32 " b:" F_U32 "\n", overlap->a_iid, overlap->b_iid);
  }

  assert(_offt._a_iid <= overlap->a_iid);

  if (_info._smallestIID > overlap->a_iid)
    _info._smallestIID = overlap->a_iid;
  if (_info._largestIID < overlap->a_iid)
    _info._largestIID = overlap->a_iid;

  //  If we don't have an output file yet, or the current file is
  //  too big, open a new file.
  //
  if ((_bof) && (_overlapsThisFile >= 1024 * 1024 * 1024 / _bof->recordSize())) {
    delete _bof;

    _bof              = NULL;
    _overlapsThisFile = 0;
  }

  if (_bof == NULL) {
    char  name[FILENAME_MAX];

    _currentFileIndex++;

    snprintf(name, FILENAME_MAX, "%s/%04d", _storePath, _currentFileIndex);
    _bof = new ovFile(name, ovFileNormalWrite);
  }

  //  Put the index to disk, filling any gaps
  //
  if ((_offt._numOlaps != 0) && (_offt._a_iid != overlap->a_iid)) {
    while (_offm._a_iid < _offt._a_iid) {
      _offm._fileno   = _offt._fileno;
      _offm._offset   = _offt._offset;
      _offm._numOlaps = 0;

      AS_UTL_safeWrite(_offtFile, &_offm, "ovStore::writeOverlap::offset", sizeof(ovStoreOfft), 1);

      _offm._a_iid++;
    }

    //  One more, since this iid is not missing -- we write it next!
    _offm._a_iid++;

    AS_UTL_safeWrite(_offtFile, &_offt, "AS_OVS_writeOverlapToStore offset", sizeof(ovStoreOfft), 1);

    _offt._numOlaps = 0;
  }

  //  Update the index if this is the first overlap for this a_iid
  //
  if (_offt._numOlaps == 0) {
    _offt._a_iid     = overlap->a_iid;
    _offt._fileno    = _currentFileIndex;
    _offt._offset    = _overlapsThisFile;
    _offt._overlapID = _info._numOverlapsTotal;
  }

  //AS_OVS_accumulateStats(ovs, overlap);

  _bof->writeOverlap(overlap);

  _offt._numOlaps++;

  _info._numOverlapsTotal++;

  _overlapsThisFile++;
}
//  Close the store.
//
//  For a store opened for output, the pending index records are flushed
//  (including empty records for any reads skipped over), the info file is
//  rewritten with the final magic number, version and highest file index, and
//  a summary is printed.  In all cases the current data file and the offset
//  file are released.
//
ovStore::~ovStore() {

  //  If output, write the last index element (don't forget to fill in gaps);
  //  update the info, using the final magic number

  if (_isOutput) {
    if (_offt._numOlaps > 0) {
      for (; _offm._a_iid < _offt._a_iid; _offm._a_iid++) {
        _offm._fileno   = _offt._fileno;
        _offm._offset   = _offt._offset;
        _offm._numOlaps = 0;

        AS_UTL_safeWrite(_offtFile, &_offm, "ovStore::~ovStore::offm", sizeof(ovStoreOfft), 1);
      }

      AS_UTL_safeWrite(_offtFile, &_offt, "ovStore::~ovStore::offt", sizeof(ovStoreOfft), 1);
    }

    _info._ovsMagic         = ovStoreMagic;
    _info._ovsVersion       = ovStoreVersion;
    _info._highestFileIndex = _currentFileIndex;

    char  name[FILENAME_MAX];

    snprintf(name, FILENAME_MAX, "%s/info", _storePath);

    errno = 0;
    FILE *ovsinfo = fopen(name, "w");

    //  Test the handle, not errno; fopen() may leave errno set even on success.
    if (ovsinfo == NULL) {
      fprintf(stderr, "failed to create overlap store '%s': %s\n", _storePath, strerror(errno));
      exit(1);
    }

    AS_UTL_safeWrite(ovsinfo, &_info, "ovStore::~ovStore::info", sizeof(ovStoreInfo), 1);

    fclose(ovsinfo);

    fprintf(stderr, "Closing the new store:\n");
    fprintf(stderr, " info._ovsMagic = 0x%016" F_X64P "\n", _info._ovsMagic);
    fprintf(stderr, " info._ovsVersion = " F_U64 "\n", _info._ovsVersion);
    fprintf(stderr, " info._smallestIID = " F_U64 "\n", _info._smallestIID);
    fprintf(stderr, " info._largestIID = " F_U64 "\n", _info._largestIID);
    fprintf(stderr, " info._numOverlapsTotal = " F_U64 "\n", _info._numOverlapsTotal);
    fprintf(stderr, " info._highestFileIndex = " F_U64 "\n", _info._highestFileIndex);
    fprintf(stderr, " info._maxReadLenInBits = " F_U64 "\n", _info._maxReadLenInBits);
  }

#if 0
  if (_statsUpdated) {
    fprintf(stderr, "Writing new stats.\n");

    char name [FILENAME_MAX];

    sprintf(name, "%s/ost", _storePath);
    errno = 0;
    FILE *ost = fopen(name, "w");
    if (errno)
      fprintf(stderr, "failed to write overlap stats '%s': %s\n", name, strerror(errno)), exit(1);

    AS_UTL_safeWrite(ost, &_stats, "AS_OVS_closeOverlapStore", sizeof(OverlapStoreStats), 1);
    fclose(ost);
  }
#endif

  delete _bof;

  //  fclose(NULL) is undefined; only close the offset file if one was opened.
  if (_offtFile != NULL)
    fclose(_offtFile);
}
void mergeInfoFiles(char *storePath, uint32 nPieces) { ovStoreInfo infopiece; ovStoreInfo info; info._ovsMagic = ovStoreMagic; info._ovsVersion = ovStoreVersion; info._smallestIID = UINT64_MAX; info._largestIID = 0; info._numOverlapsTotal = 0; info._highestFileIndex = nPieces; info._maxReadLenInBits = AS_MAX_READLEN_BITS; ovStoreOfft offm; offm._a_iid = 0; offm._fileno = 1; offm._offset = 0; offm._numOlaps = 0; // Open the new master index output file char name[FILENAME_MAX]; sprintf(name, "%s/index", storePath); errno = 0; FILE *idx = fopen(name, "w"); if (errno) fprintf(stderr, "ERROR: Failed to open '%s': %s\n", name, strerror(errno)), exit(1); // Special case, we need an empty index for the zeroth fragment. AS_UTL_safeWrite(idx, &offm, "ovStore::mergeInfoFiles::offsetZero", sizeof(ovStoreOfft), 1); // Process each for (uint32 i=1; i<=nPieces; i++) { sprintf(name, "%s/%04d.info", storePath, i); fprintf(stderr, "Processing '%s'\n", name); if (AS_UTL_fileExists(name, FALSE, FALSE) == false) { fprintf(stderr, "ERROR: file '%s' not found.\n", name); exit(1); } { errno = 0; FILE *F = fopen(name, "r"); if (errno) fprintf(stderr, "ERROR: Failed to open '%s': %s\n", name, strerror(errno)), exit(1); AS_UTL_safeRead(F, &infopiece, "ovStore::mergeInfoFiles::infopiece", sizeof(ovStoreInfo), 1); fclose(F); } // Add empty index elements for missing overlaps if (infopiece._numOverlapsTotal == 0) { fprintf(stderr, " No overlaps found.\n"); continue; } assert(infopiece._smallestIID <= infopiece._largestIID); if (info._largestIID + 1 < infopiece._smallestIID) fprintf(stderr, " Adding empty records for fragments "F_U64" to "F_U64"\n", info._largestIID + 1, infopiece._smallestIID - 1); while (info._largestIID + 1 < infopiece._smallestIID) { offm._a_iid = info._largestIID + 1; //offm._fileno = set elsewhere //offm._offset = set elsewhere //offm._numOlaps = 0; AS_UTL_safeWrite(idx, &offm, "ovStore::mergeInfoFiles::offsets", sizeof(ovStoreOfft), 1); info._largestIID++; } // Copy index 
elements for existing overlaps. While copying, update the supposed position // of any fragments with no overlaps. Without doing this, accessing the store beginning // or ending at such a fragment will fail. { sprintf(name, "%s/%04d.index", storePath, i); errno = 0; FILE *F = fopen(name, "r"); if (errno) fprintf(stderr, "ERROR: Failed to open '%s': %s\n", name, strerror(errno)), exit(1); uint32 recsLen = 0; uint32 recsMax = 1024 * 1024; ovStoreOfft *recs = new ovStoreOfft [recsMax]; recsLen = AS_UTL_safeRead(F, recs, "ovStore::mergeInfoFiles::offsetsLoad", sizeof(ovStoreOfft), recsMax); if (recsLen > 0) { if (info._largestIID + 1 != recs[0]._a_iid) fprintf(stderr, "ERROR: '%s' starts with iid "F_U32", but store only up to "F_U64"\n", name, recs[0]._a_iid, info._largestIID); assert(info._largestIID + 1 == recs[0]._a_iid); } while (recsLen > 0) { offm._fileno = recs[recsLen-1]._fileno; // Update location of missing stuff. offm._offset = recs[recsLen-1]._offset; AS_UTL_safeWrite(idx, recs, "ovStore::mergeInfoFiles::offsetsWrite", sizeof(ovStoreOfft), recsLen); recsLen = AS_UTL_safeRead(F, recs, "ovStore::mergeInfoFiles::offsetsReLoad", sizeof(ovStoreOfft), recsMax); } delete [] recs; fclose(F); } // Update info._smallestIID = MIN(info._smallestIID, infopiece._smallestIID); info._largestIID = MAX(info._largestIID, infopiece._largestIID); info._numOverlapsTotal += infopiece._numOverlapsTotal; fprintf(stderr, " Now finished with fragments "F_U64" to "F_U64" -- "F_U64" overlaps.\n", info._smallestIID, info._largestIID, info._numOverlapsTotal); } fclose(idx); // Dump the new store info file { sprintf(name, "%s/info", storePath); errno = 0; FILE *F = fopen(name, "w"); if (errno) fprintf(stderr, "ERROR: Failed to open '%s': %s\n", name, strerror(errno)), exit(1); AS_UTL_safeWrite(F, &info, "ovStore::mergeInfoFiles::finalInfo", sizeof(ovStoreInfo), 1); fclose(F); } fprintf(stderr, "\n"); fprintf(stderr, "Index finalized for reads "F_U64" to "F_U64" with "F_U64" overlaps.\n", 
info._smallestIID, info._largestIID, info._numOverlapsTotal); }
bool testIndex(char *ovlName, bool doFixes) { char name[FILENAME_MAX]; FILE *I = NULL; FILE *F = NULL; sprintf(name, "%s/index", ovlName); errno = 0; I = fopen(name, "r"); if (errno) fprintf(stderr, "ERROR: Failed to open '%s' for reading: %s\n", name, strerror(errno)), exit(1); //fprintf(stderr, "TESTING '%s'\n", name); if (doFixes) { sprintf(name, "%s/index.fixed", ovlName); errno = 0; F = fopen(name, "w"); if (errno) fprintf(stderr, "ERROR: Failed to open '%s' for writing: %s\n", name, strerror(errno)), exit(1); //fprintf(stderr, "WITH FIXES TO '%s'\n", name); } ovStoreOfft O; uint32 curIID = 0; uint32 minIID = UINT32_MAX; uint32 maxIID = 0; uint32 nErrs = 0; while (1 == AS_UTL_safeRead(I, &O, "offset", sizeof(ovStoreOfft), 1)) { bool maxIncreases = (maxIID < O._a_iid); bool errorDecreased = ((O._a_iid < curIID)); bool errorGap = ((O._a_iid > 0) && (curIID + 1 != O._a_iid)); if (O._a_iid < minIID) minIID = O._a_iid; if (maxIncreases) maxIID = O._a_iid; if (errorDecreased) fprintf(stderr, "ERROR: index decreased from "F_U32" to "F_U32"\n", curIID, O._a_iid), nErrs++; else if (errorGap) fprintf(stderr, "ERROR: gap between "F_U32" and "F_U32"\n", curIID, O._a_iid), nErrs++; if ((maxIncreases == true) && (errorGap == false)) { if (doFixes) AS_UTL_safeWrite(F, &O, "offset", sizeof(ovStoreOfft), 1); } else if (O._numOlaps > 0) { fprintf(stderr, "ERROR: lost overlaps a_iid "F_U32" fileno "F_U32" offset "F_U32" numOlaps "F_U32"\n", O._a_iid, O._fileno, O._offset, O._numOlaps); } curIID = O._a_iid; } fclose(I); if (F) fclose(F); return(nErrs == 0); }
int main(int argc, char **argv) { char *gkpName = NULL; char *ovsName = NULL; char *finClrName = NULL; char *outClrName = NULL; double errorRate = 0.06; //uint32 minAlignLength = 40; uint32 minReadLength = 64; uint32 idMin = 1; uint32 idMax = UINT32_MAX; char *outputPrefix = NULL; char outputName[FILENAME_MAX]; FILE *staFile = NULL; FILE *reportFile = NULL; FILE *subreadFile = NULL; bool doSubreadLogging = true; bool doSubreadLoggingVerbose = false; // Statistics on the trimming - the second set are from the old logging, and don't really apply anymore. trimStat readsIn; // Read is eligible for trimming trimStat deletedIn; // Read was deleted already trimStat noTrimIn; // Read not requesting trimming trimStat noOverlaps; // no overlaps in store trimStat noCoverage; // no coverage after adjusting for trimming done trimStat readsProcChimera; // Read was processed for chimera signal trimStat readsProcSpur; // Read was processed for spur signal trimStat readsProcSubRead; // Read was processed for subread signal #if 0 trimStat badSpur5; trimStat badSpur3; trimStat badChimera; trimStat badSubread; #endif trimStat readsNoChange; trimStat readsBadSpur5, basesBadSpur5; trimStat readsBadSpur3, basesBadSpur3; trimStat readsBadChimera, basesBadChimera; trimStat readsBadSubread, basesBadSubread; trimStat readsTrimmed5; trimStat readsTrimmed3; #if 0 trimStat fullCoverage; // fully covered by overlaps trimStat noSignalNoGap; // no signal, no gaps trimStat noSignalButGap; // no signal, with gaps trimStat bothFixed; // both chimera and spur signal trimmed trimStat chimeraFixed; // only chimera signal trimmed trimStat spurFixed; // only spur signal trimmed trimStat bothDeletedSmall; // deleted because of both cimera and spur signals trimStat chimeraDeletedSmall; // deleted because of chimera signal trimStat spurDeletedSmall; // deleted because of spur signal trimStat spurDetectedNormal; // normal spur detected trimStat spurDetectedLinker; // linker spur detected trimStat 
chimeraDetectedInnie; // innpue-pair chimera detected trimStat chimeraDetectedOverhang; // overhanging chimera detected trimStat chimeraDetectedGap; // gap chimera detected trimStat chimeraDetectedLinker; // linker chimera detected #endif trimStat deletedOut; // Read was deleted by trimming argc = AS_configure(argc, argv); int arg=1; int err=0; while (arg < argc) { if (strcmp(argv[arg], "-G") == 0) { gkpName = argv[++arg]; } else if (strcmp(argv[arg], "-O") == 0) { ovsName = argv[++arg]; } else if (strcmp(argv[arg], "-o") == 0) { outputPrefix = argv[++arg]; } else if (strcmp(argv[arg], "-t") == 0) { AS_UTL_decodeRange(argv[++arg], idMin, idMax); } else if (strcmp(argv[arg], "-Ci") == 0) { finClrName = argv[++arg]; } else if (strcmp(argv[arg], "-Co") == 0) { outClrName = argv[++arg]; } else if (strcmp(argv[arg], "-e") == 0) { errorRate = atof(argv[++arg]); //} else if (strcmp(argv[arg], "-l") == 0) { // minAlignLength = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-minlength") == 0) { minReadLength = atoi(argv[++arg]); } else { fprintf(stderr, "%s: unknown option '%s'\n", argv[0], argv[arg]); err++; } arg++; } if (errorRate < 0.0) err++; if ((gkpName == 0L) || (ovsName == 0L) || (outputPrefix == NULL) || (err)) { fprintf(stderr, "usage: %s -G gkpStore -O ovlStore -Ci input.clearFile -Co output.clearFile -o outputPrefix]\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, " -G gkpStore path to read store\n"); fprintf(stderr, " -O ovlStore path to overlap store\n"); fprintf(stderr, "\n"); fprintf(stderr, " -o name output prefix, for logging\n"); fprintf(stderr, "\n"); fprintf(stderr, " -t bgn-end limit processing to only reads from bgn to end (inclusive)\n"); fprintf(stderr, "\n"); fprintf(stderr, " -Ci clearFile path to input clear ranges (NOT SUPPORTED)\n"); fprintf(stderr, " -Co clearFile path to ouput clear ranges\n"); fprintf(stderr, "\n"); fprintf(stderr, " -e erate ignore overlaps with more than 'erate' percent error\n"); //fprintf(stderr, " -l length 
ignore overlaps shorter than 'l' aligned bases (NOT SUPPORTED)\n"); fprintf(stderr, "\n"); fprintf(stderr, " -minlength l reads trimmed below this many bases are deleted\n"); fprintf(stderr, "\n"); if (errorRate < 0.0) fprintf(stderr, "ERROR: Error rate (-e) value %f too small; must be 'fraction error' and above 0.0\n", errorRate); exit(1); } gkStore *gkp = gkStore::gkStore_open(gkpName); ovStore *ovs = new ovStore(ovsName, gkp); clearRangeFile *finClr = new clearRangeFile(finClrName, gkp); clearRangeFile *outClr = new clearRangeFile(outClrName, gkp); if (outClr) // If the outClr file exists, those clear ranges are loaded. We need to reset them // back to 'untrimmed' for now. outClr->reset(gkp); if (finClr && outClr) // A finClr file was supplied, so use those as the clear ranges. outClr->copy(finClr); sprintf(outputName, "%s.log", outputPrefix); errno = 0; reportFile = fopen(outputName, "w"); if (errno) fprintf(stderr, "Failed to open '%s' for writing: %s\n", outputName, strerror(errno)), exit(1); sprintf(outputName, "%s.subread.log", outputPrefix); errno = 0; subreadFile = fopen(outputName, "w"); if (errno) fprintf(stderr, "Failed to open '%s' for writing: %s\n", outputName, strerror(errno)), exit(1); uint32 ovlLen = 0; uint32 ovlMax = 64 * 1024; ovOverlap *ovl = ovOverlap::allocateOverlaps(gkp, ovlMax); memset(ovl, 0, sizeof(ovOverlap) * ovlMax); workUnit *w = new workUnit; if (idMin < 1) idMin = 1; if (idMax > gkp->gkStore_getNumReads()) idMax = gkp->gkStore_getNumReads(); fprintf(stderr, "Processing from ID "F_U32" to "F_U32" out of "F_U32" reads, using errorRate = %.2f\n", idMin, idMax, gkp->gkStore_getNumReads(), errorRate); for (uint32 id=idMin; id<=idMax; id++) { gkRead *read = gkp->gkStore_getRead(id); gkLibrary *libr = gkp->gkStore_getLibrary(read->gkRead_libraryID()); if (finClr->isDeleted(id)) { // Read already trashed. 
deletedIn += read->gkRead_sequenceLength(); continue; } if ((libr->gkLibrary_removeSpurReads() == false) && (libr->gkLibrary_removeChimericReads() == false) && (libr->gkLibrary_checkForSubReads() == false)) { // Nothing to do. noTrimIn += read->gkRead_sequenceLength(); continue; } readsIn += read->gkRead_sequenceLength(); uint32 nLoaded = ovs->readOverlaps(id, ovl, ovlLen, ovlMax); //fprintf(stderr, "read %7u with %7u overlaps\r", id, nLoaded); if (nLoaded == 0) { // No overlaps, nothing to check! noOverlaps += read->gkRead_sequenceLength(); continue; } w->clear(id, finClr->bgn(id), finClr->end(id)); w->addAndFilterOverlaps(gkp, finClr, errorRate, ovl, ovlLen); if (w->adjLen == 0) { // All overlaps trimmed out! noCoverage += read->gkRead_sequenceLength(); continue; } // Find bad regions. //if (libr->gkLibrary_markBad() == true) // // From an external file, a list of known bad regions. If no overlaps span // // the region with sufficient coverage, mark the region as bad. This was // // motivated by the old 454 linker detection. // markBad(gkp, w, subreadFile, doSubreadLoggingVerbose); //if (libr->gkLibrary_removeSpurReads() == true) { // readsProcSpur += read->gkRead_sequenceLength(); // detectSpur(gkp, w, subreadFile, doSubreadLoggingVerbose); // Get stats on spur region detected - save the length of each region to the trimStats object. //} //if (libr->gkLibrary_removeChimericReads() == true) { // readsProcChimera += read->gkRead_sequenceLength(); // detectChimer(gkp, w, subreadFile, doSubreadLoggingVerbose); // Get stats on chimera region detected - save the length of each region to the trimStats object. //} if (libr->gkLibrary_checkForSubReads() == true) { readsProcSubRead += read->gkRead_sequenceLength(); detectSubReads(gkp, w, subreadFile, doSubreadLoggingVerbose); } // Get stats on the bad regions found. This kind of duplicates code in trimBadInterval(), but // I don't want to pass all the stats objects into there. 
if (w->blist.size() == 0) { readsNoChange += read->gkRead_sequenceLength(); } else { uint32 nSpur5 = 0, bSpur5 = 0; uint32 nSpur3 = 0, bSpur3 = 0; uint32 nChimera = 0, bChimera = 0; uint32 nSubread = 0, bSubread = 0; for (uint32 bb=0; bb<w->blist.size(); bb++) { switch (w->blist[bb].type) { case badType_5spur: nSpur5 += 1; basesBadSpur5 += w->blist[bb].end - w->blist[bb].bgn; break; case badType_3spur: nSpur3 += 1; basesBadSpur3 += w->blist[bb].end - w->blist[bb].bgn; break; case badType_chimera: nChimera += 1; basesBadChimera += w->blist[bb].end - w->blist[bb].bgn; break; case badType_subread: nSubread += 1; basesBadSubread += w->blist[bb].end - w->blist[bb].bgn; break; default: break; } } if (nSpur5 > 0) readsBadSpur5 += nSpur5; if (nSpur3 > 0) readsBadSpur3 += nSpur3; if (nChimera > 0) readsBadChimera += nChimera; if (nSubread > 0) readsBadSubread += nSubread; } // Find solution. This coalesces the list (in 'w') of all the bad regions found, picks out the // largest good region, generates a log of the bad regions that support this decision, and sets // the trim points. trimBadInterval(gkp, w, minReadLength, subreadFile, doSubreadLoggingVerbose); // Log the solution. AS_UTL_safeWrite(reportFile, w->logMsg, "logMsg", sizeof(char), strlen(w->logMsg)); // Save the solution.... outClr->setbgn(w->id) = w->clrBgn; outClr->setend(w->id) = w->clrEnd; // And maybe delete the read. if (w->isOK == false) { deletedOut += read->gkRead_sequenceLength(); outClr->setDeleted(w->id); } // Update stats on what was trimmed. The asserts say the clear range didn't expand, and the if // tests if the clear range changed. 
assert(w->clrBgn >= w->iniBgn); assert(w->iniEnd >= w->clrEnd); if (w->clrBgn > w->iniBgn) readsTrimmed5 += w->clrBgn - w->iniBgn; if (w->iniEnd > w->clrEnd) readsTrimmed3 += w->iniEnd - w->clrEnd; } delete [] ovl; delete w; gkp->gkStore_close(); delete finClr; delete outClr; // Close log files if (reportFile) fclose(reportFile); if (subreadFile) fclose(subreadFile); // Write the summary if (outputPrefix) { sprintf(outputName, "%s.stats", outputPrefix); errno = 0; staFile = fopen(outputName, "w"); if (errno) fprintf(stderr, "Failed to open '%s' for writing: %s\n", outputName, strerror(errno)); } if (staFile == NULL) staFile = stdout; // Would like to know number of subreads per read fprintf(staFile, "PARAMETERS:\n"); fprintf(staFile, "----------\n"); fprintf(staFile, "%7u (reads trimmed below this many bases are deleted)\n", minReadLength); fprintf(staFile, "%7.4f (use overlaps at or below this fraction error)\n", errorRate); //fprintf(staFile, "%7u (use only overlaps longer than this)\n", minAlignLength); // NOT SUPPORTED! 
fprintf(staFile, "INPUT READS:\n"); fprintf(staFile, "-----------\n"); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (reads processed)\n", readsIn.nReads, readsIn.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (reads not processed, previously deleted)\n", deletedIn.nReads, deletedIn.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (reads not processed, in a library where trimming isn't allowed)\n", noTrimIn.nReads, noTrimIn.nBases); fprintf(staFile, "\n"); fprintf(staFile, "PROCESSED:\n"); fprintf(staFile, "--------\n"); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (no overlaps)\n", noOverlaps.nReads, noOverlaps.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (no coverage after adjusting for trimming done already)\n", noCoverage.nReads, noCoverage.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (processed for chimera)\n", readsProcChimera.nReads, readsProcChimera.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (processed for spur)\n", readsProcSpur.nReads, readsProcSpur.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (processed for subreads)\n", readsProcSubRead.nReads, readsProcSubRead.nBases); fprintf(staFile, "\n"); fprintf(staFile, "READS WITH SIGNALS:\n"); fprintf(staFile, "------------------\n"); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" signals (number of 5' spur signal)\n", readsBadSpur5.nReads, readsBadSpur5.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" signals (number of 3' spur signal)\n", readsBadSpur3.nReads, readsBadSpur3.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" signals (number of chimera signal)\n", readsBadChimera.nReads, readsBadChimera.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" signals (number of subread signal)\n", readsBadSubread.nReads, readsBadSubread.nBases); fprintf(staFile, "\n"); fprintf(staFile, "SIGNALS:\n"); fprintf(staFile, "-------\n"); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (size 
of 5' spur signal)\n", basesBadSpur5.nReads, basesBadSpur5.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (size of 3' spur signal)\n", basesBadSpur3.nReads, basesBadSpur3.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (size of chimera signal)\n", basesBadChimera.nReads, basesBadChimera.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (size of subread signal)\n", basesBadSubread.nReads, basesBadSubread.nBases); fprintf(staFile, "\n"); fprintf(staFile, "TRIMMING:\n"); fprintf(staFile, "--------\n"); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (trimmed from the 5' end of the read)\n", readsTrimmed5.nReads, readsTrimmed5.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (trimmed from the 3' end of the read)\n", readsTrimmed3.nReads, readsTrimmed3.nBases); #if 0 fprintf(staFile, "DELETED:\n"); fprintf(staFile, "-------\n"); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (deleted because of both cimera and spur signals)\n", bothDeletedSmall.nReads, bothDeletedSmall.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (deleted because of chimera signal)\n", chimeraDeletedSmall.nReads, chimeraDeletedSmall.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (deleted because of spur signal)\n", spurDeletedSmall.nReads, spurDeletedSmall.nBases); fprintf(staFile, "\n"); fprintf(staFile, "SPUR TYPES:\n"); fprintf(staFile, "----------\n"); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (normal spur detected)\n", spurDetectedNormal.nReads, spurDetectedNormal.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (linker spur detected)\n", spurDetectedLinker.nReads, spurDetectedLinker.nBases); fprintf(staFile, "\n"); fprintf(staFile, "CHIMERA TYPES:\n"); fprintf(staFile, "-------------\n"); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (innie-pair chimera detected)\n", chimeraDetectedInnie.nReads, chimeraDetectedInnie.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" 
bases (overhanging chimera detected)\n", chimeraDetectedOverhang.nReads, chimeraDetectedOverhang.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (gap chimera detected)\n", chimeraDetectedGap.nReads, chimeraDetectedGap.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (linker chimera detected)\n", chimeraDetectedLinker.nReads, chimeraDetectedLinker.nBases); #endif // INPUT READS = ACCEPTED + TRIMMED + DELETED // SPUR TYPE = TRIMMED and DELETED spur and both categories // CHIMERA TYPE = TRIMMED and DELETED chimera and both categories if (staFile != stdout) fclose(staFile); exit(0); }
int main(int argc, char **argv) { coParameters *G = new coParameters(); argc = AS_configure(argc, argv); int arg = 1; int err = 0; while (arg < argc) { if (strcmp(argv[arg], "-G") == 0) { G->gkpStorePath = argv[++arg]; } else if (strcmp(argv[arg], "-R") == 0) { G->bgnID = atoi(argv[++arg]); G->endID = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-O") == 0) { // -F? -S Olap_Path G->ovlStorePath = argv[++arg]; } else if (strcmp(argv[arg], "-e") == 0) { G->errorRate = atof(argv[++arg]); } else if (strcmp(argv[arg], "-l") == 0) { G->minOverlap = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-c") == 0) { // For 'corrections' file input G->correctionsName = argv[++arg]; } else if (strcmp(argv[arg], "-o") == 0) { // For 'erates' output G->eratesName = argv[++arg]; } else if (strcmp(argv[arg], "-t") == 0) { // But we're not threaded! G->numThreads = atoi(argv[++arg]); } else { err++; } arg++; } if (G->gkpStorePath == NULL) fprintf(stderr, "ERROR: no input gatekeeper store (-G) supplied.\n"), err++; if (G->ovlStorePath == NULL) fprintf(stderr, "ERROR: no input overlap store (-O) supplied.\n"), err++; if (G->correctionsName == NULL) fprintf(stderr, "ERROR: no input read corrections file (-c) supplied.\n"), err++; if (G->eratesName == NULL) fprintf(stderr, "ERROR: no output erates file (-o) supplied.\n"), err++; if (err) { fprintf(stderr, "USAGE: %s [-d <dna-file>] [-o <ovl_file>] [-q <quality>]\n", argv[0]); fprintf(stderr, " [-x <del_file>] [-F OlapFile] [-S OlapStore]\n"); fprintf(stderr, " [-c <cgb_file>] [-e <erate_file>\n"); fprintf(stderr, " <gkpStore> <CorrectFile> <lo> <hi>\n"); fprintf(stderr, "\n"); fprintf(stderr, "Recalculates overlaps for frags <lo> .. 
<hi> in\n"); fprintf(stderr, " <gkpStore> using corrections in <CorrectFile> \n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, "-e <erate-file> specifies binary file to dump corrected erates to\n"); fprintf(stderr, " for later updating of olap store by update-erates \n"); fprintf(stderr, "-F specify file of sorted overlaps to use (in the format\n"); fprintf(stderr, " produced by get-olaps\n"); fprintf(stderr, "-o <ovl_file> specifies name of file to which OVL messages go\n"); fprintf(stderr, "-q <quality> overlaps less than this error rate are\n"); fprintf(stderr, " automatically output\n"); fprintf(stderr, "-S specify the binary overlap store containing overlaps to use\n"); exit(1); } //fprintf (stderr, "Quality Threshold = %.2f%%\n", 100.0 * Quality_Threshold); // // Initialize Globals // fprintf(stderr, "Initializing.\n"); double MAX_ERRORS = 1 + (uint32)(G->errorRate * AS_MAX_READLEN); Initialize_Match_Limit(G->Edit_Match_Limit, G->errorRate, MAX_ERRORS); for (int32 i=0; i <= AS_MAX_READLEN; i++) G->Error_Bound[i] = (int)ceil(i * G->errorRate); // // // fprintf(stderr, "Opening gkpStore '%s'.\n", G->gkpStorePath); gkStore *gkpStore = gkStore::gkStore_open(G->gkpStorePath); if (G->bgnID < 1) G->bgnID = 1; if (gkpStore->gkStore_getNumReads() < G->endID) G->endID = gkpStore->gkStore_getNumReads(); // Load the reads for the overlaps we are going to be correcting, and apply corrections to them fprintf(stderr, "Correcting reads "F_U32" to "F_U32".\n", G->bgnID, G->endID); Correct_Frags(G, gkpStore); // Load overlaps we're going to correct fprintf(stderr, "Loading overlaps.\n"); Read_Olaps(G, gkpStore); // Now sort them on the B iid. 
fprintf(stderr, "Sorting overlaps.\n"); #ifdef _GLIBCXX_PARALLEL __gnu_sequential::sort(G->olaps, G->olaps + G->olapsLen, Olap_Info_t_by_bID()); #else sort(G->olaps, G->olaps + G->olapsLen, Olap_Info_t_by_bID()); #endif // Recompute overlaps fprintf(stderr, "Recomputing overlaps.\n"); Redo_Olaps(G, gkpStore); gkpStore->gkStore_close(); gkpStore = NULL; // Sort the overlaps back into the original order fprintf(stderr, "Sorting overlaps.\n"); #ifdef _GLIBCXX_PARALLEL __gnu_sequential::sort(G->olaps, G->olaps + G->olapsLen, Olap_Info_t_by_Order()); #else sort(G->olaps, G->olaps + G->olapsLen, Olap_Info_t_by_Order()); #endif // Dump the new erates fprintf (stderr, "Saving corrected error rates to file %s\n", G->eratesName); { errno = 0; FILE *fp = fopen(G->eratesName, "w"); if (errno) fprintf(stderr, "Failed to open '%s': %s\n", G->eratesName, strerror(errno)), exit(1); AS_UTL_safeWrite(fp, &G->bgnID, "loid", sizeof(int32), 1); AS_UTL_safeWrite(fp, &G->endID, "hiid", sizeof(int32), 1); AS_UTL_safeWrite(fp, &G->olapsLen, "num", sizeof(uint64), 1); fprintf(stderr, "--Allocate "F_U64" MB for output error rates.\n", (sizeof(uint16) * G->olapsLen) >> 20); uint16 *evalue = new uint16 [G->olapsLen]; for (int32 i=0; i<G->olapsLen; i++) evalue[i] = G->olaps[i].evalue; AS_UTL_safeWrite(fp, evalue, "evalue", sizeof(uint16), G->olapsLen); delete [] evalue; fclose(fp); } // Finished. //fprintf (stderr, "%d/%d failed/total alignments (%.1f%%)\n", // Failed_Alignments_Ct, Total_Alignments_Ct, // Total_Alignments_Ct == 0 ? 0.0 : (100.0 * Failed_Alignments_Ct) / Total_Alignments_Ct); delete G; fprintf(stderr, "DONE.\n"); exit(0); }
//  Write a partitioned copy of the store's read data.
//
//  partitionMap[readID] is the partition a read is assigned to, or UINT32_MAX if the read is not
//  assigned to any partition.  Entry [0] must be UINT32_MAX (there is no zeroth read), and no read
//  may be assigned to partition zero.
//
//  In <storePath>/partitions this creates, for each partition p in 1..maxPartition:
//    blobs.<p>  - the blobs of the reads in the partition
//    reads.<p>  - the gkRead records of those reads, modified to refer to blobs.<p>
//  and a 'map' file holding maxPartition, the number of reads in each partition, partitionMap, and
//  for each read its index in the reads file of its partition (UINT32_MAX if unpartitioned).
//
//  The store must be open read-only and not already partitioned.  The master read records are not
//  changed; each read is copied before it is modified.  Failing to open any output file is fatal.
void
gkStore::gkStore_buildPartitions(uint32 *partitionMap) {
  char   name[FILENAME_MAX];

  //  Store cannot be partitioned already, and it must be readOnly (for safety) as we don't need to
  //  be changing any of the normal store data.

  assert(_numberOfPartitions == 0);
  assert(_mode               == gkStore_readOnly);

  //  Figure out what the last partition is

  uint32  maxPartition  = 0;
  uint32  unPartitioned = 0;

  assert(partitionMap[0] == UINT32_MAX);

  for (uint32 fi=1; fi<=gkStore_getNumReads(); fi++) {
    if      (partitionMap[fi] == UINT32_MAX)
      unPartitioned++;
    else if (maxPartition < partitionMap[fi])
      maxPartition = partitionMap[fi];
  }

  fprintf(stderr, "Found " F_U32 " unpartitioned reads and maximum partition of " F_U32 "\n",
          unPartitioned, maxPartition);

  //  Create the partitions by opening N copies of the data stores,
  //  and writing data to each.

  FILE   **blobfiles    = new FILE * [maxPartition + 1];
  uint64  *blobfileslen = new uint64 [maxPartition + 1];            //  Offset, in bytes, into the blobs file
  FILE   **readfiles    = new FILE * [maxPartition + 1];
  uint32  *readfileslen = new uint32 [maxPartition + 1];            //  aka _readsPerPartition
  uint32  *readIDmap    = new uint32 [gkStore_getNumReads() + 1];   //  aka _readIDtoPartitionIdx

  //  Be nice and put all the partitions in a subdirectory.

  sprintf(name,"%s/partitions", _storePath);

  if (AS_UTL_fileExists(name, true, true) == false)
    AS_UTL_mkdir(name);

  //  Open all the output files -- fail early if we can't open that many files.  Failure is
  //  detected from the returned pointer; errno is only consulted for the message.

  blobfiles[0]    = NULL;
  blobfileslen[0] = UINT64_MAX;
  readfiles[0]    = NULL;
  readfileslen[0] = UINT32_MAX;

  for (uint32 i=1; i<=maxPartition; i++) {
    sprintf(name,"%s/partitions/blobs.%04u", _storePath, i);

    errno = 0;
    blobfiles[i]    = fopen(name, "w");
    blobfileslen[i] = 0;

    if (blobfiles[i] == NULL)
      fprintf(stderr, "gkStore::gkStore_buildPartitions()-- ERROR: failed to open partition %u file '%s' for write: %s\n",
              i, name, strerror(errno)), exit(1);

    sprintf(name,"%s/partitions/reads.%04u", _storePath, i);

    errno = 0;
    readfiles[i]    = fopen(name, "w");
    readfileslen[i] = 0;

    if (readfiles[i] == NULL)
      fprintf(stderr, "gkStore::gkStore_buildPartitions()-- ERROR: failed to open partition %u file '%s' for write: %s\n",
              i, name, strerror(errno)), exit(1);
  }

  //  Open the output partition map file -- we might as well fail early if we can't make it also.

  sprintf(name,"%s/partitions/map", _storePath);

  errno = 0;
  FILE *rIDmF = fopen(name, "w");
  if (rIDmF == NULL)
    fprintf(stderr, "gkStore::gkStore_buildPartitions()-- ERROR: failed to open partition map file '%s': %s\n",
            name, strerror(errno)), exit(1);

  //  Copy the blob from the master file to the partitioned file, update pointers.

  readIDmap[0] = UINT32_MAX;    //  There isn't a zeroth read, make it bogus.

  for (uint32 fi=1; fi<=gkStore_getNumReads(); fi++) {
    uint32  pi = partitionMap[fi];

    assert(pi != 0);  //  No zeroth partition, right?

    //  Reads without a partition are skipped, but the whole of readIDmap is written to the map
    //  file below, so give them the same bogus index as read zero instead of leaving the entry
    //  uninitialized.
    if (pi == UINT32_MAX) {
      readIDmap[fi] = UINT32_MAX;
      continue;
    }

    //  Make a copy of the read, then modify it for the partition, then write it to the partition.
    //  Without the copy, we'd need to update the master record too.

    gkRead  partRead = _reads[fi];  //*gkStore_getRead(fi);

    partRead.gkRead_copyDataToPartition(_blobs, blobfiles, blobfileslen, pi);

#if 1
    fprintf(stderr, "read " F_U32 "=" F_U32 " len " F_U32 " -- blob master " F_U64 " -- to part " F_U32 " new read id " F_U32 " blob " F_U64 "/" F_U64 " -- at readIdx " F_U32 "\n",
            fi, _reads[fi].gkRead_readID(), _reads[fi].gkRead_sequenceLength(),
            _reads[fi]._mPtr,
            pi,
            partRead.gkRead_readID(), partRead._pID, partRead._mPtr,
            readfileslen[pi]);
#endif

    AS_UTL_safeWrite(readfiles[pi], &partRead, "gkStore::gkStore_buildPartitions::read", sizeof(gkRead), 1);

    //  The read's index within its partition is the number of reads written there before it.
    readIDmap[fi] = readfileslen[pi]++;
  }

  //  There isn't a zeroth read.

  AS_UTL_safeWrite(rIDmF, &maxPartition,  "gkStore::gkStore_buildPartitions::maxPartition", sizeof(uint32), 1);
  AS_UTL_safeWrite(rIDmF,  readfileslen,  "gkStore::gkStore_buildPartitions::readfileslen", sizeof(uint32), maxPartition + 1);
  AS_UTL_safeWrite(rIDmF,  partitionMap,  "gkStore::gkStore_buildPartitions::partitionMap", sizeof(uint32), gkStore_getNumReads() + 1);
  AS_UTL_safeWrite(rIDmF,  readIDmap,     "gkStore::gkStore_buildPartitions::readIDmap",    sizeof(uint32), gkStore_getNumReads() + 1);

  //  cleanup -- close all the files, delete storage.  fclose() flushes buffered data, so a
  //  failure is reported (from its return value), but is not fatal.

  errno = 0;
  if (fclose(rIDmF) != 0)
    fprintf(stderr, " warning: %s\n", strerror(errno));

  for (uint32 i=1; i<=maxPartition; i++) {
    fprintf(stderr, "partition " F_U32 " has " F_U32 " reads\n", i, readfileslen[i]);

    errno = 0;
    if (fclose(blobfiles[i]) != 0)
      fprintf(stderr, " warning: %s\n", strerror(errno));

    errno = 0;
    if (fclose(readfiles[i]) != 0)
      fprintf(stderr, " warning: %s\n", strerror(errno));
  }

  delete [] readIDmap;
  delete [] readfileslen;
  delete [] readfiles;
  delete [] blobfileslen;
  delete [] blobfiles;
}
//  Destroy the store, saving metadata that is held in heap arrays.
//
//  Libraries and reads are each either memory mapped (the mapping is just released) or held in an
//  array.  An array is written to <path>/libraries or <path>/reads and then freed.  If either
//  array was written, the info block is rewritten too, both as binary (<path>/info) and as text
//  (<path>/info.txt).
//
//  Failing to open any of those files for writing is fatal: a message is printed and the process
//  exits with status 1.
gkStore::~gkStore() {
  char    N[FILENAME_MAX];
  FILE   *F;

  //  Should check that inf on disk is the same as inf in memory, and update if needed.

  bool    needsInfoUpdate = false;

  //  In every fopen() below, failure is detected from the NULL return; errno is only meaningful
  //  after a failure has been reported, and is used just for the message.

  //  Write N+1 because we write, but don't count, the [0] element.

  if        (_librariesMMap) {
    delete _librariesMMap;

  } else if (_libraries) {
    sprintf(N, "%s/libraries", gkStore_path());

    errno = 0;
    F = fopen(N, "w");
    if (F == NULL)
      fprintf(stderr, "gkStore::~gkStore()-- failed to open '%s' for writing: %s\n",
              N, strerror(errno)), exit(1);

    AS_UTL_safeWrite(F, _libraries, "libraries", sizeof(gkLibrary), gkStore_getNumLibraries() + 1);

    fclose(F);

    delete [] _libraries;

    needsInfoUpdate = true;
  }

  if        (_readsMMap) {
    delete _readsMMap;

  } else if (_reads) {
    sprintf(N, "%s/reads", gkStore_path());

    errno = 0;
    F = fopen(N, "w");
    if (F == NULL)
      fprintf(stderr, "gkStore::~gkStore()-- failed to open '%s' for writing: %s\n",
              N, strerror(errno)), exit(1);

    AS_UTL_safeWrite(F, _reads, "reads", sizeof(gkRead), gkStore_getNumReads() + 1);

    fclose(F);

    delete [] _reads;

    needsInfoUpdate = true;
  }

  if (needsInfoUpdate) {
    sprintf(N, "%s/info", gkStore_path());

    errno = 0;
    F = fopen(N, "w");
    if (F == NULL)
      fprintf(stderr, "gkStore::~gkStore()-- failed to open '%s' for writing: %s\n",
              N, strerror(errno)), exit(1);

    AS_UTL_safeWrite(F, &_info, "info", sizeof(gkStoreInfo), 1);

    fclose(F);

    sprintf(N, "%s/info.txt", gkStore_path());

    errno = 0;
    F = fopen(N, "w");
    if (F == NULL)
      fprintf(stderr, "gkStore::~gkStore()-- failed to open '%s' for writing: %s\n",
              N, strerror(errno)), exit(1);

    _info.writeInfoAsText(F);

    fclose(F);
  }

  //  Deleting a null pointer is a no-op, so only fclose() needs a guard.

  delete _blobsMMap;

  if (_blobsFile)
    fclose(_blobsFile);

  delete [] _readIDtoPartitionIdx;
  delete [] _readIDtoPartitionID;
  delete [] _readsPerPartition;
}