void gkStore::gkStore_deletePartitions(void) { char path[FILENAME_MAX]; sprintf(path, "%s/partitions/map", gkStore_path()); if (AS_UTL_fileExists(path, false, false) == false) return; // How many partitions? FILE *F = fopen(path, "r"); if (errno) fprintf(stderr, "ERROR: failed to open partition meta data '%s': %s\n", path, strerror(errno)), exit(1); fread(&_numberOfPartitions, sizeof(uint32), 1, F); fclose(F); // Yay! Delete! AS_UTL_unlink(path); for (uint32 ii=0; ii<_numberOfPartitions; ii++) { sprintf(path, "%s/partitions/reads.%04u", gkStore_path(), ii+1); AS_UTL_unlink(path); sprintf(path, "%s/partitions/blobs.%04u", gkStore_path(), ii+1); AS_UTL_unlink(path); } }
void MateLocation::dumpHappiness(const char *prefix, const char *name) { char dirname[FILENAME_MAX] = {0}; char outname[FILENAME_MAX] = {0}; sprintf(dirname, "%s.%03u.%s.mateHappiness", prefix, logFileOrder, name); sprintf(outname, "%s.%03u.%s.mateHappiness/utg%09u.mateHappiness", prefix, logFileOrder, name, _tig->id()); if (AS_UTL_fileExists(dirname, TRUE, TRUE) == 0) AS_UTL_mkdir(dirname); FILE *F = fopen(outname, "w"); for (int32 i=0; i<_tigLen; i++) fprintf(F, "%u\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", i, good[i], badFwd[i], badRev[i], badExternalFwd[i], badExternalRev[i], badCompressed[i], badStretched[i], badNormal[i], badAnti[i], badOuttie[i]); fclose(F); }
void ovStore::ovStore_write(void) { AS_UTL_mkdir(_storePath); char name[FILENAME_MAX]; sprintf(name, "%s/info", _storePath); // If the ovs file exists, AND has a valid magic number, then the store is complete and we should // abort before the valid store is destroyed. if (AS_UTL_fileExists(name, false, false)) { errno = 0; FILE *ovsinfo = fopen(name, "r"); if (errno) { fprintf(stderr, "ERROR: failed to read store metadata from '%s': %s\n", name, strerror(errno)); exit(1); } AS_UTL_safeRead(ovsinfo, &_info, "ovStore::ovStore::testinfo", sizeof(ovStoreInfo), 1); fclose(ovsinfo); if (_info._ovsMagic == ovStoreMagic) fprintf(stderr, "ERROR: overlapStore '%s' is a valid overlap store, will not overwrite.\n", _storePath), exit(1); } // Create a new incomplete info file. errno = 0; FILE *ovsinfo = fopen(name, "w"); if (errno) fprintf(stderr, "failed to create overlap store '%s': %s\n", _storePath, strerror(errno)), exit(1); AS_UTL_safeWrite(ovsinfo, &_info, "ovStore::ovStore::saveinfo", sizeof(ovStoreInfo), 1); fclose(ovsinfo); sprintf(name, "%s/index", _storePath); errno = 0; _offtFile = fopen(name, "w"); if (errno) fprintf(stderr, "AS_OVS_createOverlapStore()-- failed to open offset file '%s': %s\n", name, strerror(errno)), exit(1); _overlapsThisFile = 0; _currentFileIndex = 0; _bof = NULL; }
void fastqFile::loadIndex(char *indexname) { struct stat fastqstat; if (AS_UTL_fileExists(indexname) == false) return; errno = 0; if (stat(_filename, &fastqstat)) { fprintf(stderr, "fastqFile::constructIndex()-- stat of file '%s' failed: %s\n", _filename, strerror(errno)); return; } FILE *I = fopen(indexname, "r"); if (errno) { fprintf(stderr, "fastqFile::constructIndex()-- open of file '%s' failed: %s\n", indexname, strerror(errno)); return; } fread(&_header, sizeof(fastqFileHeader), 1, I); if ((_header._magic[0] != FASTQ_MAGICNUMBER1) && (_header._magic[1] != FASTQ_MAGICNUMBER2)) { fprintf(stderr, "fastqFile::constructIndex()-- magic mismatch.\n"); fclose(I); return; } if ((_header._fastqFileSize != (uint64)fastqstat.st_size) || (_header._fastqModificationTime != (uint64)fastqstat.st_mtime) || (_header._fastqCreationTime != (uint64)fastqstat.st_ctime)) { fprintf(stderr, "fastqFile::constructIndex()-- stat mismatch.\n"); fclose(I); return; } _index = new fastqFileIndex [_header._numberOfSequences]; _names = new char [_header._namesLength]; fread(_index, sizeof(fastqFileIndex), _header._numberOfSequences, I); fread(_names, sizeof(char), _header._namesLength, I); #ifdef DEBUG fprintf(stderr, "fastqFile::constructIndex()-- '%s' LOADED\n", _filename); #endif fclose(I); return; }
// Remove a file, or do nothing if the file doesn't exist. Returns true if the file // was deleted, false if the file never existsed. int AS_UTL_unlink(const char *filename) { if (AS_UTL_fileExists(filename, FALSE, FALSE) == 0) return(0); errno = 0; unlink(filename); if (errno) { fprintf(stderr, "AS_UTL_unlink()-- Failed to remove file '%s': %s\n", filename, strerror(errno)); exit(1); } return(1); }
//  Open 'filename' for reading, transparently decompressing by file name suffix.
//
//    *.gz   -> read from 'gzip -dc filename'
//    *.bz2  -> read from 'bzip2 -dc filename'
//    *.xz   -> read from 'xz -dc filename'
//    NULL, "" or "-" -> read from stdin
//    anything else   -> plain fopen()
//
//  _pipe is set when reading from a decompressor pipe, _stdi when reading from stdin.
//  A missing file or a failure to open is fatal.
//
//  NOTE(review): the file name is passed to the shell unquoted, so names containing spaces or
//  shell metacharacters will not behave as expected.  Callers must not pass untrusted names.
compressedFileReader::compressedFileReader(const char *filename) {
  char    cmd[FILENAME_MAX * 2];
  int32   len = 0;

  _file = NULL;
  _pipe = false;
  _stdi = false;

  if (filename != NULL)
    len = strlen(filename);

  bool  fromStdin = (len == 0) || (strcmp(filename, "-") == 0);

  if ((fromStdin == false) &&
      (AS_UTL_fileExists(filename, FALSE, FALSE) == FALSE))
    fprintf(stderr, "ERROR: Failed to open input file '%s': %s\n", filename, strerror(errno)), exit(1);

  //  Pick a decompressor, if any, from the suffix.

  const char *decompressor = NULL;

  if      ((len > 3) && (strcasecmp(filename + len - 3, ".gz")  == 0))
    decompressor = "gzip -dc";
  else if ((len > 4) && (strcasecmp(filename + len - 4, ".bz2") == 0))
    decompressor = "bzip2 -dc";
  else if ((len > 3) && (strcasecmp(filename + len - 3, ".xz")  == 0))
    decompressor = "xz -dc";

  errno = 0;

  if (decompressor != NULL) {
    int  cmdLen = snprintf(cmd, sizeof(cmd), "%s %s", decompressor, filename);

    if ((cmdLen < 0) || ((size_t)cmdLen >= sizeof(cmd)))
      fprintf(stderr, "ERROR: Failed to open input file '%s': file name too long\n", filename), exit(1);

    _file = popen(cmd, "r");
    _pipe = true;

    //  popen() returns NULL on error.  It does not reliably set errno.  This applies to every
    //  decompressor, not only xz.
    if (_file == NULL)
      fprintf(stderr, "ERROR: Failed to open input file '%s': popen() returned NULL\n", filename), exit(1);
  }

  else if (fromStdin) {
    _file = stdin;
    _stdi = true;
  }

  else {
    _file = fopen(filename, "r");
    _pipe = false;

    if (_file == NULL)
      fprintf(stderr, "ERROR: Failed to open input file '%s': %s\n", filename, strerror(errno)), exit(1);
  }
}
void operationBuild(char *buildName, char *tigName, uint32 tigVers) { errno = 0; FILE *F = fopen(buildName, "r"); if (errno) fprintf(stderr, "Failed to open '%s' for reading: %s\n", buildName, strerror(errno)), exit(1); if (AS_UTL_fileExists(tigName, TRUE, TRUE)) { fprintf(stderr, "ERROR: '%s' exists, and I will not clobber an existing store.\n", tigName); exit(1); } tgStore *tigStore = new tgStore(tigName); tgTig *tig = new tgTig(); for (int32 v=1; v<tigVers; v++) tigStore->nextVersion(); while (tig->loadLayout(F) == true) { if (tig->numberOfChildren() == 0) continue; // The log isn't correct. For new tigs (all of these are) we don't know the // id until after it is added. Further, if these come with id's already set, // they can't be added to a new store -- they don't exist. #if 0 fprintf(stderr, "INSERTING tig %d (%d children) (originally ID %d)\n", tig->tigID(), tig->numberOfChildren(), oID); #endif tigStore->insertTig(tig, false); } fclose(F); delete tig; delete tigStore; }
int main (int argc, char **argv) { char *gkpName = NULL; char *tigName = NULL; int32 tigVers = -1; vector<char *> tigInputs; tgStoreType tigType = tgStoreModify; argc = AS_configure(argc, argv); int arg=1; int err=0; while (arg < argc) { if (strcmp(argv[arg], "-G") == 0) { gkpName = argv[++arg]; } else if (strcmp(argv[arg], "-T") == 0) { tigName = argv[++arg]; tigVers = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-L") == 0) { AS_UTL_loadFileList(argv[++arg], tigInputs); } else if (strcmp(argv[arg], "-n") == 0) { tigType = tgStoreReadOnly; } else if (AS_UTL_fileExists(argv[arg])) { tigInputs.push_back(argv[arg]); } else { fprintf(stderr, "%s: unknown option '%s'\n", argv[0], argv[arg]); err++; } arg++; } if ((err) || (gkpName == NULL) || (tigName == NULL) || (tigInputs.size() == 0)) { fprintf(stderr, "usage: %s -G <gkpStore> -T <tigStore> <v> [input.cns]\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, " -G <gkpStore> Path to the gatekeeper store\n"); fprintf(stderr, " -T <tigStore> <v> Path to the tigStore and version to add tigs to\n"); fprintf(stderr, "\n"); fprintf(stderr, " -L <file-of-files> Load the tig(s) from files listed in 'file-of-files'\n"); fprintf(stderr, "\n"); fprintf(stderr, " -n Don't replace, just report what would have happened\n"); fprintf(stderr, "\n"); fprintf(stderr, " The primary operation is to replace tigs in the store with ones in a set of input files.\n"); fprintf(stderr, " The input files can be either supplied directly on the command line or listed in\n"); fprintf(stderr, " a text file (-L).\n"); fprintf(stderr, "\n"); fprintf(stderr, " A new store is created if one doesn't exist, otherwise, whatever tigs are there are\n"); fprintf(stderr, " replaced with those in the -R file. If version 'v' doesn't exist, it is created.\n"); fprintf(stderr, "\n"); fprintf(stderr, " Even if -n is supplied, a new store is created if one doesn't exist.\n"); fprintf(stderr, "\n"); fprintf(stderr, " To add a new tig, give it a tig id of -1. 
New tigs must be added to the latest version.\n"); fprintf(stderr, " To delete a tig, remove all children, and set the number of them to zero.\n"); fprintf(stderr, "\n"); if (gkpName == NULL) fprintf(stderr, "ERROR: no gatekeeper store (-G) supplied.\n"); if (tigName == NULL) fprintf(stderr, "ERROR: no tig store (-T) supplied.\n"); if (tigInputs.size() == 0) fprintf(stderr, "ERROR: no input tigs (-R) supplied.\n"); exit(1); } // If the store doesn't exist, create one, and make a bunch of versions if (AS_UTL_fileExists(tigName, true, false) == false) { fprintf(stderr, "Creating tig store '%s' version %d\n", tigName, tigVers); tgStore *tigStore = new tgStore(tigName); for (int32 vv=1; vv<tigVers; vv++) tigStore->nextVersion(); delete tigStore; } gkStore *gkpStore = gkStore::gkStore_open(gkpName); tgStore *tigStore = new tgStore(tigName, tigVers, tigType); tgTig *tig = new tgTig; for (uint32 ff=0; ff<tigInputs.size(); ff++) { errno = 0; FILE *TI = fopen(tigInputs[ff], "r"); if (errno) fprintf(stderr, "Failed to open '%s': %s\n", tigInputs[ff], strerror(errno)), exit(1); fprintf(stderr, "Reading layouts from '%s'.\n", tigInputs[ff]); while (tig->loadFromStreamOrLayout(TI) == true) { // Handle insertion. if (tig->numberOfChildren() > 0) { //fprintf(stderr, "INSERTING tig %d\n", tig->tigID()); tigStore->insertTig(tig, false); continue; } // Deleted already? if (tigStore->isDeleted(tig->tigID()) == true) { //fprintf(stderr, "DELETING tig %d -- ALREADY DELETED\n", tig->tigID()); continue; } // Really delete it then. //fprintf(stderr, "DELETING tig %d\n", tig->tigID()); tigStore->deleteTig(tig->tigID()); } fclose(TI); fprintf(stderr, "Reading layouts from '%s' completed.\n", tigInputs[ff]); } delete tig; delete tigStore; gkpStore->gkStore_close(); exit(0); }
int main(int argc, char **argv) { char *ovlName = NULL; uint32 maxJob = 0; bool deleteIntermediates = true; bool doExplicitTest = false; bool doFixes = false; char name[FILENAME_MAX]; argc = AS_configure(argc, argv); int err=0; int arg=1; while (arg < argc) { if (strcmp(argv[arg], "-O") == 0) { ovlName = argv[++arg]; } else if (strcmp(argv[arg], "-F") == 0) { maxJob = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-f") == 0) { doFixes = true; } else if (strcmp(argv[arg], "-t") == 0) { doExplicitTest = true; ovlName = argv[++arg]; } else if (strcmp(argv[arg], "-nodelete") == 0) { deleteIntermediates = false; } else { fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]); } arg++; } if (ovlName == NULL) err++; if ((maxJob == 0) && (doExplicitTest == false)) err++; if (err) { fprintf(stderr, "usage: %s ...\n", argv[0]); fprintf(stderr, " -O x.ovlStore path to overlap store to build the final index for\n"); fprintf(stderr, " -F s number of slices used in bucketizing/sorting\n"); fprintf(stderr, "\n"); fprintf(stderr, " -t x.ovlStore explicitly test a previously constructed index\n"); fprintf(stderr, " -f when testing, also create a new 'idx.fixed' which might\n"); fprintf(stderr, " resolve rare problems\n"); fprintf(stderr, "\n"); fprintf(stderr, " -nodelete do not remove intermediate files when the index is\n"); fprintf(stderr, " successfully created\n"); fprintf(stderr, "\n"); fprintf(stderr, " DANGER DO NOT USE DO NOT USE DO NOT USE DANGER\n"); fprintf(stderr, " DANGER DANGER\n"); fprintf(stderr, " DANGER This command is difficult to run by hand. DANGER\n"); fprintf(stderr, " DANGER Use ovStoreCreate instead. 
DANGER\n"); fprintf(stderr, " DANGER DANGER\n"); fprintf(stderr, " DANGER DO NOT USE DO NOT USE DO NOT USE DANGER\n"); fprintf(stderr, "\n"); if (ovlName == NULL) fprintf(stderr, "ERROR: No overlap store (-O) supplied.\n"); if ((maxJob == 0) && (doExplicitTest == false)) fprintf(stderr, "ERROR: One of -F (number of slices) or -t (test a store) must be supplied.\n"); exit(1); } // Do the test, and maybe fix things up. if (doExplicitTest == true) { bool passed = testIndex(ovlName, doFixes); exit((passed == true) ? 0 : 1); } // Check that all segments are present. Every segment should have an info file. uint32 cntJob = 0; for (uint32 i=1; i<=maxJob; i++) { uint32 complete = 0; sprintf(name, "%s/%04d", ovlName, i); if (AS_UTL_fileExists(name, FALSE, FALSE) == true) complete++; else fprintf(stderr, "ERROR: Segment "F_U32" data not present (%s)\n", i, name); sprintf(name, "%s/%04d.info", ovlName, i); if (AS_UTL_fileExists(name, FALSE, FALSE) == true) complete++; else fprintf(stderr, "ERROR: Segment "F_U32" info not present (%s)\n", i, name); sprintf(name, "%s/%04d.index", ovlName, i); if (AS_UTL_fileExists(name, FALSE, FALSE) == true) complete++; else fprintf(stderr, "ERROR: Segment "F_U32" index not present (%s)\n", i, name); if (complete == 3) cntJob++; } if (cntJob != maxJob) { fprintf(stderr, "ERROR: Expected "F_U32" segments, only found "F_U32".\n", maxJob, cntJob); exit(1); } // Merge the stuff. mergeInfoFiles(ovlName, maxJob); // Diagnostics. if (testIndex(ovlName, false) == false) { fprintf(stderr, "ERROR: index failed tests.\n"); exit(1); } // Remove intermediates. For the buckets, we keep going until there are 10 in a row not present. // During testing, on a microbe using 2850 buckets, some buckets were empty. if (deleteIntermediates == false) { fprintf(stderr, "\n"); fprintf(stderr, "Not removing intermediate files. 
Finished.\n"); exit(0); } fprintf(stderr, "\n"); fprintf(stderr, "Removing intermediate files.\n"); // Removing indices is easy, beacuse we know how many there are. for (uint32 i=1; i<=maxJob; i++) { sprintf(name, "%s/%04u.index", ovlName, i); AS_UTL_unlink(name); sprintf(name, "%s/%04u.info", ovlName, i); AS_UTL_unlink(name); } // We don't know how many buckets there are, so we remove until we fail to find ten // buckets in a row. for (uint32 missing=0, i=1; missing<10; i++) { sprintf(name, "%s/bucket%04d", ovlName, i); if (AS_UTL_fileExists(name, TRUE, FALSE) == FALSE) { missing++; continue; } missing = 0; sprintf(name, "%s/bucket%04d/sliceSizes", ovlName, i); AS_UTL_unlink(name); sprintf(name, "%s/bucket%04d", ovlName, i); rmdir(name); } fprintf(stderr, "Finished.\n"); exit(0); }
//  Store 'evaluesLen' evalues for the overlaps of reads bgnID..endID into '<store>/evalues'.
//
//  The evalues file holds one uint16 per overlap in the store.  If it exists with the wrong size
//  it is removed; if it does not exist it is created and zero filled.  The file is then memory
//  mapped read-write (closing any read-only mapping first), setRange(bgnID, endID) is called, and
//  evalues[] is copied starting at the overlap ID left in _offt._overlapID.
//
//  The mapping is left open for further updates.
void
ovStore::addEvalues(uint32 bgnID, uint32 endID, uint16 *evalues, uint64 evaluesLen) {
  char  name[FILENAME_MAX];

  snprintf(name, FILENAME_MAX, "%s/evalues", _storePath);

  //  If we have an opened memory mapped file, and it isn't open for writing, close it.

  if ((_evaluesMap) && (_evaluesMap->type() == memoryMappedFile_readOnly)) {
    fprintf(stderr, "WARNING: closing read-only evalues file.\n");
    delete _evaluesMap;

    _evaluesMap = NULL;
    _evalues    = NULL;
  }

  //  Remove a bogus evalues file if one exists.

  if ((AS_UTL_fileExists(name) == true) &&
      (AS_UTL_sizeOfFile(name) != (sizeof(uint16) * _info._numOverlapsTotal))) {
    fprintf(stderr, "WARNING: existing evalues file is incorrect size: should be " F_U64 " bytes, is " F_U64 " bytes. Removing.\n",
            (sizeof(uint16) * _info._numOverlapsTotal), AS_UTL_sizeOfFile(name));
    AS_UTL_unlink(name);
  }

  //  Make a new evalues file if one doesn't exist.

  if (AS_UTL_fileExists(name) == false) {
    fprintf(stderr, "Creating evalues file for " F_U64 " overlaps.\r", _info._numOverlapsTotal);

    errno = 0;
    FILE *F = fopen(name, "w");

    //  Test the FILE pointer; errno is only meaningful after a failure.
    if (F == NULL)
      fprintf(stderr, "Failed to make evalues file '%s': %s\n", name, strerror(errno)), exit(1);

    //  Write zeros in chunks of 1048576 evalues.

    uint16  *Z  = new uint16 [1048576];
    uint64   Zn = 0;

    memset(Z, 0, sizeof(uint16) * 1048576);

    while (Zn < _info._numOverlapsTotal) {
      uint64  S = (Zn + 1048576 < _info._numOverlapsTotal) ? 1048576 : _info._numOverlapsTotal - Zn;

      AS_UTL_safeWrite(F, Z, "zero evalues", sizeof(uint16), S);

      Zn += S;

      fprintf(stderr, "Creating evalues file for " F_U64 " overlaps....%07.3f%%\r",
              _info._numOverlapsTotal, 100.0 * Zn / _info._numOverlapsTotal);
    }

    fprintf(stderr, "Creating evalues file for " F_U64 " overlaps....%07.3f%%\n",
            _info._numOverlapsTotal, 100.0 * Zn / _info._numOverlapsTotal);

    //  The zero buffer was previously leaked on every file creation.
    delete [] Z;

    fclose(F);
  }

  //  Open the evalues file if it isn't already opened

  if (_evalues == NULL) {
    _evaluesMap = new memoryMappedFile(name, memoryMappedFile_readWrite);
    _evalues    = (uint16 *)_evaluesMap->get(0);
  }

  //  Figure out the overlap ID for the first overlap associated with bgnID

  setRange(bgnID, endID);

  //  Load the evalues from 'evalues'

  for (uint64 ii=0; ii<evaluesLen; ii++)
    _evalues[_offt._overlapID + ii] = evalues[ii];

  //  That's it.  Deleting the ovStore object will close the memoryMappedFile.  It's left open
  //  for more updates.
}
void ovStore::ovStore_read(void) { char name[FILENAME_MAX]; sprintf(name, "%s/info", _storePath); errno = 0; FILE *ovsinfo = fopen(name, "r"); if (errno) fprintf(stderr, "ERROR: directory '%s' is not an ovelrapStore; failed to open info file '%s': %s\n", _storePath, name, strerror(errno)), exit(1); AS_UTL_safeRead(ovsinfo, &_info, "ovStore::ovStore::info", sizeof(ovStoreInfo), 1); fclose(ovsinfo); if ((_info._ovsMagic != ovStoreMagic) && (_info._ovsMagic != ovStoreMagicIncomplete)) fprintf(stderr, "ERROR: directory '%s' is not an overlapStore; magic number 0x%016"F_X64P" incorrect.\n", _storePath, _info._ovsMagic), exit(1); if ((_info._ovsMagic != ovStoreMagic) && (_info._ovsMagic != ovStoreMagicIncomplete)) fprintf(stderr, "ERROR: overlapStore '%s' is incomplate; creation crashed?\n", _storePath), exit(1); if (_info._ovsVersion != ovStoreVersion) fprintf(stderr, "ERROR: overlapStore '%s' is version "F_U64"; this code supports only version "F_U64".\n", _storePath, _info._ovsVersion, ovStoreVersion), exit(1); if (_info._maxReadLenInBits != AS_MAX_READLEN_BITS) fprintf(stderr, "ERROR: overlapStore '%s' is for AS_MAX_READLEN_BITS="F_U64"; this code supports only %d bits.\n", _storePath, _info._maxReadLenInBits, AS_MAX_READLEN_BITS), exit(1); // Load stats #if 0 sprintf(name, "%s/statistics", _storePath); errno = 0; FILE *ost = fopen(name, "r"); if (errno) fprintf(stderr, "failed to open the stats file '%s': %s\n", name, strerror(errno)), exit(1); AS_UTL_safeRead(ost, &_stats, "ovStore::ovStore::stats", sizeof(OverlapStoreStats), 1); fclose(ost); #endif // Open the index sprintf(name, "%s/index", _storePath); errno = 0; _offtFile = fopen(name, "r"); if (errno) fprintf(stderr, "ERROR: failed to open offset file '%s': %s\n", name, strerror(errno)), exit(1); // Open erates sprintf(name, "%s/evalues", _storePath); if (AS_UTL_fileExists(name)) { _evaluesMap = new memoryMappedFile(name, memoryMappedFile_readOnly); _evalues = (uint16 *)_evaluesMap->get(0); } //_offtMMap = new 
memoryMappedFile(name, memoryMappedFile_readOnly); //_offts = (ovStoreOfft *)_offtMMap->get(0); //_offtLength = _offtMap->length() / sizeof(ovStoreOfft); }
//  Split the store into partitions as described by partitionMap[].
//
//  partitionMap[readID] is the partition (1..N) a read belongs to, or UINT32_MAX for reads not
//  assigned to any partition; partitionMap[0] must be UINT32_MAX.  The store must be opened
//  read-only and must not already be partitioned.
//
//  For each partition p, '<store>/partitions/blobs.<p>' and '<store>/partitions/reads.<p>' are
//  written.  '<store>/partitions/map' receives, in order: the maximum partition number, the
//  number of reads in each partition, partitionMap[] itself, and the index of each read within
//  its partition.  Failure to open any output file is fatal.
void
gkStore::gkStore_buildPartitions(uint32 *partitionMap) {
  char  name[FILENAME_MAX];

  //  Store cannot be partitioned already, and it must be readOnly (for safety) as we don't need to
  //  be changing any of the normal store data.

  assert(_numberOfPartitions == 0);
  assert(_mode               == gkStore_readOnly);

  //  Figure out what the last partition is

  uint32  maxPartition  = 0;
  uint32  unPartitioned = 0;

  assert(partitionMap[0] == UINT32_MAX);

  for (uint32 fi=1; fi<=gkStore_getNumReads(); fi++) {
    if (partitionMap[fi] == UINT32_MAX)
      unPartitioned++;

    else if (maxPartition < partitionMap[fi])
      maxPartition = partitionMap[fi];
  }

  fprintf(stderr, "Found " F_U32 " unpartitioned reads and maximum partition of " F_U32 "\n",
          unPartitioned, maxPartition);

  //  Create the partitions by opening N copies of the data stores,
  //  and writing data to each.

  FILE         **blobfiles    = new FILE * [maxPartition + 1];
  uint64        *blobfileslen = new uint64 [maxPartition + 1];            //  Offset, in bytes, into the blobs file
  FILE         **readfiles    = new FILE * [maxPartition + 1];
  uint32        *readfileslen = new uint32 [maxPartition + 1];            //  aka _readsPerPartition
  uint32        *readIDmap    = new uint32 [gkStore_getNumReads() + 1];   //  aka _readIDtoPartitionIdx

  //  Be nice and put all the partitions in a subdirectory.

  snprintf(name, FILENAME_MAX, "%s/partitions", _storePath);

  if (AS_UTL_fileExists(name, true, true) == false)
    AS_UTL_mkdir(name);

  //  Open all the output files -- fail early if we can't open that many files.
  //  Success is judged by the FILE pointer; errno is only meaningful after a failure.

  blobfiles[0]    = NULL;
  blobfileslen[0] = UINT64_MAX;
  readfiles[0]    = NULL;
  readfileslen[0] = UINT32_MAX;

  for (uint32 i=1; i<=maxPartition; i++) {
    snprintf(name, FILENAME_MAX, "%s/partitions/blobs.%04u", _storePath, i);

    errno = 0;
    blobfiles[i]    = fopen(name, "w");
    blobfileslen[i] = 0;

    if (blobfiles[i] == NULL)
      fprintf(stderr, "gkStore::gkStore_buildPartitions()-- ERROR: failed to open partition %u file '%s' for write: %s\n",
              i, name, strerror(errno)), exit(1);

    snprintf(name, FILENAME_MAX, "%s/partitions/reads.%04u", _storePath, i);

    errno = 0;
    readfiles[i]    = fopen(name, "w");
    readfileslen[i] = 0;

    if (readfiles[i] == NULL)
      fprintf(stderr, "gkStore::gkStore_buildPartitions()-- ERROR: failed to open partition %u file '%s' for write: %s\n",
              i, name, strerror(errno)), exit(1);
  }

  //  Open the output partition map file -- we might as well fail early if we can't make it also.

  snprintf(name, FILENAME_MAX, "%s/partitions/map", _storePath);

  errno = 0;
  FILE *rIDmF = fopen(name, "w");
  if (rIDmF == NULL)
    fprintf(stderr, "gkStore::gkStore_buildPartitions()-- ERROR: failed to open partition map file '%s': %s\n",
            name, strerror(errno)), exit(1);

  //  Copy the blob from the master file to the partitioned file, update pointers.

  readIDmap[0] = UINT32_MAX;    //  There isn't a zeroth read, make it bogus.

  for (uint32 fi=1; fi<=gkStore_getNumReads(); fi++) {
    uint32  pi = partitionMap[fi];

    assert(pi != 0);  //  No zeroth partition, right?

    if (pi == UINT32_MAX)
      //  Deleted reads are not assigned a partition; skip them
      continue;

    //  Make a copy of the read, then modify it for the partition, then write it to the partition.
    //  Without the copy, we'd need to update the master record too.

    gkRead  partRead = _reads[fi];  //*gkStore_getRead(fi);

    partRead.gkRead_copyDataToPartition(_blobs, blobfiles, blobfileslen, pi);

#if 1
    fprintf(stderr, "read " F_U32 "=" F_U32 " len " F_U32 " -- blob master " F_U64 " -- to part " F_U32 " new read id " F_U32 " blob " F_U64 "/" F_U64 " -- at readIdx " F_U32 "\n",
            fi, _reads[fi].gkRead_readID(), _reads[fi].gkRead_sequenceLength(),
            _reads[fi]._mPtr,
            pi,
            partRead.gkRead_readID(), partRead._pID, partRead._mPtr,
            readfileslen[pi]);
#endif

    AS_UTL_safeWrite(readfiles[pi], &partRead, "gkStore::gkStore_buildPartitions::read", sizeof(gkRead), 1);

    //  The read's index within its partition is the count of reads written there before it.
    readIDmap[fi] = readfileslen[pi]++;
  }

  //  There isn't a zeroth read.

  AS_UTL_safeWrite(rIDmF, &maxPartition,  "gkStore::gkStore_buildPartitions::maxPartition", sizeof(uint32), 1);
  AS_UTL_safeWrite(rIDmF,  readfileslen,  "gkStore::gkStore_buildPartitions::readfileslen", sizeof(uint32), maxPartition + 1);
  AS_UTL_safeWrite(rIDmF,  partitionMap,  "gkStore::gkStore_buildPartitions::partitionMap", sizeof(uint32), gkStore_getNumReads() + 1);
  AS_UTL_safeWrite(rIDmF,  readIDmap,     "gkStore::gkStore_buildPartitions::readIDmap",    sizeof(uint32), gkStore_getNumReads() + 1);

  //  cleanup -- close all the files, delete storage

  fclose(rIDmF);

  for (uint32 i=1; i<=maxPartition; i++) {
    fprintf(stderr, "partition " F_U32 " has " F_U32 " reads\n", i, readfileslen[i]);

    //  Close both files even if the first close fails; a failed close can mean lost data.
    errno = 0;

    bool  closeFailed = false;

    if (fclose(blobfiles[i]) != 0)
      closeFailed = true;
    if (fclose(readfiles[i]) != 0)
      closeFailed = true;

    if (closeFailed)
      fprintf(stderr, " warning: %s\n", strerror(errno));
  }

  delete [] readIDmap;
  delete [] readfileslen;
  delete [] readfiles;
  delete [] blobfileslen;
  delete [] blobfiles;
}
// The N valid modes for a 'new gkpStore' call: // // 1) Add new reads/libraries, modify old ones. gkStore(path, true, true) // 2) No addition, but can modify old ones. gkStore(path, true) // 3) No addition, no modification. gkStore(path); // gkStore::gkStore(char const *path, gkStore_mode mode, uint32 partID) { char name[FILENAME_MAX]; memset(_storePath, 0, sizeof(char) * FILENAME_MAX); memset(_storeName, 0, sizeof(char) * FILENAME_MAX); strcpy(_storePath, path); strcpy(_storeName, path); // Broken. sprintf(name, "%s/info", _storePath); // If the info file exists, load it. if (AS_UTL_fileExists(name, false, false) == true) { errno = 0; FILE *I = fopen(name, "r"); AS_UTL_safeRead(I, &_info, "gkStore::_info", sizeof(gkStoreInfo), 1); fclose(I); } // Check sizes are correct. uint32 failed = 0; if (_info.gkLibrarySize != sizeof(gkLibrary)) failed += fprintf(stderr, "ERROR: gkLibrary size in store = %u, differs from executable = %u\n", _info.gkLibrarySize, sizeof(gkLibrary)); if (_info.gkReadSize != sizeof(gkRead)) failed += fprintf(stderr, "ERROR: gkRead size in store = %u, differs from executable = %u\n", _info.gkReadSize, sizeof(gkRead)); if (_info.gkMaxLibrariesBits != AS_MAX_LIBRARIES_BITS) failed += fprintf(stderr, "ERROR: AS_MAX_LIBRARIES_BITS in store = %u, differs from executable = %u\n", _info.gkMaxLibrariesBits, AS_MAX_LIBRARIES_BITS); if (_info.gkLibraryNameSize != LIBRARY_NAME_SIZE) failed += fprintf(stderr, "ERROR: LIBRARY_NAME_SIZE in store = %u, differs from executable = %u\n", _info.gkLibraryNameSize, LIBRARY_NAME_SIZE); if (_info.gkMaxReadBits != AS_MAX_READS_BITS) failed += fprintf(stderr, "ERROR: AS_MAX_READS_BITS in store = %u, differs from executable = %u\n", _info.gkMaxReadBits, AS_MAX_READS_BITS); if (_info.gkMaxReadLenBits != AS_MAX_READLEN_BITS) failed += fprintf(stderr, "ERROR: AS_MAX_READLEN_BITS in store = %u, differs from executable = %u\n", _info.gkMaxReadLenBits, AS_MAX_READLEN_BITS); if (failed) fprintf(stderr, "ERROR:\nERROR: Can't 
open store '%s': parameters in src/AS_global.H are incompatible with the store.\n", _storePath), exit(1); assert(_info.gkLibrarySize == sizeof(gkLibrary)); assert(_info.gkReadSize == sizeof(gkRead)); assert(_info.gkMaxLibrariesBits == AS_MAX_LIBRARIES_BITS); assert(_info.gkLibraryNameSize == LIBRARY_NAME_SIZE); assert(_info.gkMaxReadBits == AS_MAX_READS_BITS); assert(_info.gkMaxReadLenBits == AS_MAX_READLEN_BITS); // Clear ourself, to make valgrind happier. _librariesMMap = NULL; _librariesAlloc = 0; _libraries = NULL; _readsMMap = NULL; _readsAlloc = 0; _reads = NULL; _blobsMMap = NULL; _blobs = NULL; _blobsFile = NULL; _mode = mode; _numberOfPartitions = 0; _partitionID = 0; _readIDtoPartitionIdx = NULL; _readIDtoPartitionID = NULL; _readsPerPartition = NULL; //_readsInThisPartition = NULL; // // READ ONLY // if ((mode == gkStore_readOnly) && (partID == UINT32_MAX)) { //fprintf(stderr, "gkStore()-- opening '%s' for read-only access.\n", _storePath); if (AS_UTL_fileExists(_storePath, true, false) == false) { fprintf(stderr, "gkStore()-- failed to open '%s' for read-only access: store doesn't exist.\n", _storePath); exit(1); } sprintf(name, "%s/libraries", _storePath); _librariesMMap = new memoryMappedFile (name, memoryMappedFile_readOnly); _libraries = (gkLibrary *)_librariesMMap->get(0); sprintf(name, "%s/reads", _storePath); _readsMMap = new memoryMappedFile (name, memoryMappedFile_readOnly); _reads = (gkRead *)_readsMMap->get(0); sprintf(name, "%s/blobs", _storePath); _blobsMMap = new memoryMappedFile (name, memoryMappedFile_readOnly); _blobs = (void *)_blobsMMap->get(0); } // // MODIFY, NO APPEND (also for building a partitioned store) // else if ((mode == gkStore_modify) && (partID == UINT32_MAX)) { //fprintf(stderr, "gkStore()-- opening '%s' for read-write access.\n", _storePath); if (AS_UTL_fileExists(_storePath, true, false) == false) { fprintf(stderr, "gkStore()-- failed to open '%s' for read-write access: store doesn't exist.\n", _storePath); exit(1); } 
sprintf(name, "%s/libraries", _storePath); _librariesMMap = new memoryMappedFile (name, memoryMappedFile_readWrite); _libraries = (gkLibrary *)_librariesMMap->get(0); sprintf(name, "%s/reads", _storePath); _readsMMap = new memoryMappedFile (name, memoryMappedFile_readWrite); _reads = (gkRead *)_readsMMap->get(0); sprintf(name, "%s/blobs", _storePath); _blobsMMap = new memoryMappedFile (name, memoryMappedFile_readWrite); _blobs = (void *)_blobsMMap->get(0); } // // MODIFY, APPEND, open mmap'd files, but copy them entirely to local memory // else if ((mode == gkStore_extend) && (partID == UINT32_MAX)) { //fprintf(stderr, "gkStore()-- opening '%s' for read-write and append access.\n", _storePath); if (AS_UTL_fileExists(_storePath, true, true) == false) AS_UTL_mkdir(_storePath); _librariesAlloc = MAX(64, 2 * _info.numLibraries); _libraries = new gkLibrary [_librariesAlloc]; sprintf(name, "%s/libraries", _storePath); if (AS_UTL_fileExists(name, false, false) == true) { _librariesMMap = new memoryMappedFile (name, memoryMappedFile_readOnly); memcpy(_libraries, _librariesMMap->get(0), sizeof(gkLibrary) * (_info.numLibraries + 1)); delete _librariesMMap; _librariesMMap = NULL;; } _readsAlloc = MAX(128, 2 * _info.numReads); _reads = new gkRead [_readsAlloc]; sprintf(name, "%s/reads", _storePath); if (AS_UTL_fileExists(name, false, false) == true) { _readsMMap = new memoryMappedFile (name, memoryMappedFile_readOnly); memcpy(_reads, _readsMMap->get(0), sizeof(gkRead) * (_info.numReads + 1)); delete _readsMMap; _readsMMap = NULL; } sprintf(name, "%s/blobs", _storePath); _blobsMMap = NULL; _blobs = NULL; errno = 0; _blobsFile = fopen(name, "a+"); if (errno) fprintf(stderr, "gkStore()-- Failed to open blobs file '%s' for appending: %s\n", name, strerror(errno)), exit(1); } // // PARTITIONED, no modifications, no appends // // BIG QUESTION: do we want to partition the read metadata too, or is it small enough // to load in every job? For now, we load all the metadata. 
else if ((mode == gkStore_readOnly) && (partID != UINT32_MAX)) { //fprintf(stderr, "gkStore()-- opening '%s' partition '%u' for read-only access.\n", _storePath, partID); // For partitioned reads, we need to have a uint32 map of readID to partitionReadID so we can // lookup the metadata in the partitoned _reads data. This is 4 bytes per read, compared to 24 // bytes for the full meta data. Assuming 100x of 3kb read coverage on human, that's 100 // million reads, so 0.400 GB vs 2.4 GB. sprintf(name, "%s/partitions/map", _storePath); errno = 0; FILE *F = fopen(name, "r"); if (errno) fprintf(stderr, "gkStore::gkStore()-- failed to open '%s' for reading: %s\n", name, strerror(errno)), exit(1); AS_UTL_safeRead(F, &_numberOfPartitions, "gkStore::_numberOfPartitions", sizeof(uint32), 1); _partitionID = partID; _readsPerPartition = new uint32 [_numberOfPartitions + 1]; // No zeroth element in any of these _readIDtoPartitionID = new uint32 [gkStore_getNumReads() + 1]; _readIDtoPartitionIdx = new uint32 [gkStore_getNumReads() + 1]; AS_UTL_safeRead(F, _readsPerPartition, "gkStore::_readsPerPartition", sizeof(uint32), _numberOfPartitions + 1); AS_UTL_safeRead(F, _readIDtoPartitionID, "gkStore::_readIDtoPartitionID", sizeof(uint32), gkStore_getNumReads() + 1); AS_UTL_safeRead(F, _readIDtoPartitionIdx, "gkStore::_readIDtoPartitionIdx", sizeof(uint32), gkStore_getNumReads() + 1); fclose(F); sprintf(name, "%s/libraries", _storePath); _librariesMMap = new memoryMappedFile (name, memoryMappedFile_readOnly); _libraries = (gkLibrary *)_librariesMMap->get(0); //fprintf(stderr, " -- openend '%s' at "F_X64"\n", name, _libraries); sprintf(name, "%s/partitions/reads.%04"F_U32P"", _storePath, partID); _readsMMap = new memoryMappedFile (name, memoryMappedFile_readOnly); _reads = (gkRead *)_readsMMap->get(0); //fprintf(stderr, " -- openend '%s' at "F_X64"\n", name, _reads); sprintf(name, "%s/partitions/blobs.%04"F_U32P"", _storePath, partID); _blobsMMap = new memoryMappedFile (name, 
memoryMappedFile_readOnly); _blobs = (void *)_blobsMMap->get(0); //fprintf(stderr, " -- openend '%s' at "F_X64"\n", name, _blobs); } // Info only, no access to reads or libraries. else if (mode == gkStore_infoOnly) { //fprintf(stderr, "gkStore()-- opening '%s' for info-only access.\n", _storePath); } else { fprintf(stderr, "gkStore::gkStore()-- invalid mode '%s' with partition ID %u.\n", toString(mode), partID); assert(0); } }
int main(int argc, char **argv) { char *gkpStoreName = NULL; char *outPrefix = NULL; uint32 minReadLength = 0; uint32 firstFileArg = 0; char errorLogName[FILENAME_MAX]; char htmlLogName[FILENAME_MAX]; char nameMapName[FILENAME_MAX]; argc = AS_configure(argc, argv); int arg = 1; int err = 0; while (arg < argc) { if (strcmp(argv[arg], "-o") == 0) { gkpStoreName = argv[++arg]; } else if (strcmp(argv[arg], "-minlength") == 0) { minReadLength = atoi(argv[++arg]); } else if (strcmp(argv[arg], "--") == 0) { firstFileArg = arg++; break; } else if (argv[arg][0] == '-') { fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]); err++; } else { firstFileArg = arg; break; } arg++; } if (gkpStoreName == NULL) err++; if (firstFileArg == 0) err++; if (err) { fprintf(stderr, "usage: %s [...] -o gkpStore\n", argv[0]); fprintf(stderr, " -o gkpStore create this gkpStore\n"); fprintf(stderr, " \n"); fprintf(stderr, " -minlength L discard reads shorter than L\n"); fprintf(stderr, " \n"); fprintf(stderr, " \n"); if (gkpStoreName == NULL) fprintf(stderr, "ERROR: no gkpStore (-g) supplied.\n"); if (firstFileArg == 0) fprintf(stderr, "ERROR: no input files supplied.\n"); exit(1); } gkStore *gkpStore = gkStore::gkStore_open(gkpStoreName, gkStore_extend); gkRead *gkpRead = NULL; gkLibrary *gkpLibrary = NULL; uint32 gkpFileID = 0; // Used for HTML output, an ID for each file loaded. 
uint32 inLineLen = 1024; char inLine[1024] = { 0 }; validSeq['a'] = validSeq['c'] = validSeq['g'] = validSeq['t'] = validSeq['n'] = 1; validSeq['A'] = validSeq['C'] = validSeq['G'] = validSeq['T'] = validSeq['N'] = 1; errno = 0; sprintf(errorLogName, "%s/errorLog", gkpStoreName); FILE *errorLog = fopen(errorLogName, "w"); if (errno) fprintf(stderr, "ERROR: cannot open error file '%s': %s\n", errorLogName, strerror(errno)), exit(1); sprintf(htmlLogName, "%s/load.dat", gkpStoreName); FILE *htmlLog = fopen(htmlLogName, "w"); if (errno) fprintf(stderr, "ERROR: cannot open uid map file '%s': %s\n", htmlLogName, strerror(errno)), exit(1); sprintf(nameMapName, "%s/readNames.txt", gkpStoreName); FILE *nameMap = fopen(nameMapName, "w"); if (errno) fprintf(stderr, "ERROR: cannot open uid map file '%s': %s\n", nameMapName, strerror(errno)), exit(1); uint32 nERROR = 0; // There aren't any errors, we just exit fatally if encountered. uint32 nWARNS = 0; uint32 nLOADED = 0; // Reads loaded uint64 bLOADED = 0; // Bases loaded uint32 nSKIPPED = 0; uint64 bSKIPPED = 0; // Bases not loaded, too short #if 0 fprintf(htmlLog, "<!DOCTYPE html>\n"); fprintf(htmlLog, "<html>\n"); fprintf(htmlLog, "<head>\n"); fprintf(htmlLog, "<title>gatekeeper load statistics</title>\n"); fprintf(htmlLog, "<style type='text/css'>\n"); fprintf(htmlLog, "body { font-family: Helvetica, Verdana, sans-serif; }\n"); fprintf(htmlLog, "h1, h2 { color: #ee3e80; }\n"); fprintf(htmlLog, "p { color: #665544; }\n"); fprintf(htmlLog, "th, td { border: 1px solid #111111; padding: 2px 2px 2px 2px; }\n"); fprintf(htmlLog, "td:hover { background-color: #e4e4e4; }\n"); fprintf(htmlLog, "th:hover { background-color: #d4d4d4; }\n"); fprintf(htmlLog, "tr.details { visibility: collapse; }\n"); fprintf(htmlLog, "</style>\n"); fprintf(htmlLog, "</head>\n"); fprintf(htmlLog, "<body>\n"); fprintf(htmlLog, "<h2>Input Files</h2>\n"); fprintf(htmlLog, "<table>\n"); #endif for (; firstFileArg < argc; firstFileArg++) { fprintf(stderr, 
"\n"); fprintf(stderr, "Starting file '%s'.\n", argv[firstFileArg]); compressedFileReader *inFile = new compressedFileReader(argv[firstFileArg]); char *line = new char [10240]; KeyAndValue keyval; while (fgets(line, 10240, inFile->file()) != NULL) { chomp(line); keyval.find(line); if (keyval.key() == NULL) { // No key, so must be a comment or blank line continue; } if (strcasecmp(keyval.key(), "name") == 0) { gkpLibrary = gkpStore->gkStore_addEmptyLibrary(keyval.value()); continue; } // We'd better have a gkpLibrary defined, if not, the .gkp input file is incorrect. if (gkpLibrary == NULL) { fprintf(stderr, "WARNING: no 'name' tag in gkp input; creating library with name 'DEFAULT'.\n"); gkpLibrary = gkpStore->gkStore_addEmptyLibrary(keyval.value()); nWARNS++; } if (strcasecmp(keyval.key(), "preset") == 0) { gkpLibrary->gkLibrary_parsePreset(keyval.value()); } else if (strcasecmp(keyval.key(), "qv") == 0) { gkpLibrary->gkLibrary_setDefaultQV(keyval.value_double()); } else if (strcasecmp(keyval.key(), "isNonRandom") == 0) { gkpLibrary->gkLibrary_setIsNonRandom(keyval.value_bool()); } else if (strcasecmp(keyval.key(), "trustHomopolymerRuns") == 0) { gkpLibrary->gkLibrary_setTrustHomopolymerRuns(keyval.value_bool()); } else if (strcasecmp(keyval.key(), "removeDuplicateReads") == 0) { gkpLibrary->gkLibrary_setRemoveDuplicateReads(keyval.value_bool()); } else if (strcasecmp(keyval.key(), "finalTrim") == 0) { gkpLibrary->gkLibrary_setFinalTrim(keyval.value()); } else if (strcasecmp(keyval.key(), "removeSpurReads") == 0) { gkpLibrary->gkLibrary_setRemoveSpurReads(keyval.value_bool()); } else if (strcasecmp(keyval.key(), "removeChimericReads") == 0) { gkpLibrary->gkLibrary_setRemoveChimericReads(keyval.value_bool()); } else if (strcasecmp(keyval.key(), "checkForSubReads") == 0) { gkpLibrary->gkLibrary_setCheckForSubReads(keyval.value_bool()); } else if (AS_UTL_fileExists(keyval.key(), false, false)) { loadReads(gkpStore, gkpLibrary, gkpFileID++, minReadLength, nameMap, 
htmlLog, errorLog, keyval.key(), nWARNS, nLOADED, bLOADED, nSKIPPED, bSKIPPED); } else { fprintf(stderr, "ERROR: option '%s' not recognized, and not a file of reads.\n", line); exit(1); } } delete inFile; delete [] line; } #if 0 fprintf(htmlLog, "</table>\n"); #endif gkpStore->gkStore_close(); fclose(nameMap); fclose(errorLog); fprintf(stderr, "\n"); fprintf(stderr, "Finished with:\n"); fprintf(stderr, " "F_U32" warnings (bad base or qv)\n", nWARNS); fprintf(stderr, "\n"); fprintf(stderr, "Read from inputs:\n"); fprintf(stderr, " "F_U64" bp.\n", bLOADED); fprintf(stderr, " "F_U32" reads.\n", nLOADED); fprintf(stderr, "\n"); fprintf(stderr, "Loaded into store:\n"); fprintf(stderr, " "F_U64" bp.\n", bLOADED); fprintf(stderr, " "F_U32" reads.\n", nLOADED); fprintf(stderr, "\n"); fprintf(stderr, "Skipped (too short):\n"); fprintf(stderr, " "F_U64" bp (%.4f%%).\n", bSKIPPED, 100.0 * bSKIPPED / (bSKIPPED + bLOADED)); fprintf(stderr, " "F_U32" reads (%.4f%%).\n", nSKIPPED, 100.0 * nSKIPPED / (nSKIPPED + nLOADED)); fprintf(stderr, "\n"); fprintf(stderr, "\n"); #if 0 fprintf(htmlLog, "\n"); fprintf(htmlLog, "<h2>Final Store</h2>\n"); fprintf(htmlLog, "<table>\n"); fprintf(htmlLog, "<tr><td colspan='2'>%s</td></tr>\n", gkpStoreName); fprintf(htmlLog, "<tr><td>readsLoaded</td><td>"F_U32" reads ("F_U64" bp)</td></tr>\n", nLOADED, bLOADED); fprintf(htmlLog, "<tr><td>readsSkipped</td><td>"F_U32" reads ("F_U64" bp) (read was too short)</td></tr>\n", nSKIPPED, bSKIPPED); fprintf(htmlLog, "<tr><td>warnings</td><td>"F_U32" warnings (invalid base or quality value)</td></tr>\n", nWARNS); fprintf(htmlLog, "</table>\n"); fprintf(htmlLog, "\n"); fprintf(htmlLog, "<script type='text/javascript'>\n"); fprintf(htmlLog, "var toggleOne = function() {\n"); fprintf(htmlLog, " var table = this.closest('table');\n"); fprintf(htmlLog, " var elts = table.querySelectorAll('.details');\n"); fprintf(htmlLog, "\n"); fprintf(htmlLog, " for (var i=0; i<elts.length; i++) {\n"); fprintf(htmlLog, " if 
(!elts[i].enabled) {\n"); fprintf(htmlLog, " elts[i].enabled = true;\n"); fprintf(htmlLog, " elts[i].style.visibility = 'visible';\n"); fprintf(htmlLog, " } else {\n"); fprintf(htmlLog, " elts[i].enabled = false;\n"); fprintf(htmlLog, " elts[i].style.visibility = 'collapse';\n"); fprintf(htmlLog, " }\n"); fprintf(htmlLog, " }\n"); fprintf(htmlLog, "}\n"); fprintf(htmlLog, "\n"); for (uint32 ii=0; ii<gkpFileID; ii++) { fprintf(htmlLog, "document.getElementById('gkpload%u').onclick = toggleOne;\n", ii); fprintf(htmlLog, "document.getElementById('gkpload%u').style = 'cursor: pointer;';\n", ii); } fprintf(htmlLog, "</script>\n"); fprintf(htmlLog, "\n"); fprintf(htmlLog, "</body>\n"); fprintf(htmlLog, "</html>\n"); #else fprintf(htmlLog, "sum "F_U32" "F_U64" "F_U32" "F_U64" "F_U32"\n", nLOADED, bLOADED, nSKIPPED, bSKIPPED, nWARNS); #endif fclose(htmlLog); if (nERROR > 0) fprintf(stderr, "gatekeeperCreate did NOT finish successfully; too many errors.\n"); if (bSKIPPED > 0.25 * (bSKIPPED + bLOADED)) fprintf(stderr, "gatekeeperCreate did NOT finish successfully; too many bases skipped. Check your reads.\n"); if (nWARNS > 0.25 * (nLOADED)) fprintf(stderr, "gatekeeperCreate did NOT finish successfully; too many warnings. Check your reads.\n"); if (nSKIPPED > 0.50 * (nLOADED)) fprintf(stderr, "gatekeeperCreate did NOT finish successfully; too many short reads. Check your reads!\n"); if ((nERROR > 0) || (bSKIPPED > 0.25 * (bSKIPPED + bLOADED)) || (nWARNS > 0.25 * (nSKIPPED + nLOADED)) || (nSKIPPED > 0.50 * (nSKIPPED + nLOADED))) exit(1); fprintf(stderr, "gatekeeperCreate finished successfully.\n"); exit(0); }
void mergeInfoFiles(char *storePath, uint32 nPieces) { ovStoreInfo infopiece; ovStoreInfo info; info._ovsMagic = ovStoreMagic; info._ovsVersion = ovStoreVersion; info._smallestIID = UINT64_MAX; info._largestIID = 0; info._numOverlapsTotal = 0; info._highestFileIndex = nPieces; info._maxReadLenInBits = AS_MAX_READLEN_BITS; ovStoreOfft offm; offm._a_iid = 0; offm._fileno = 1; offm._offset = 0; offm._numOlaps = 0; // Open the new master index output file char name[FILENAME_MAX]; sprintf(name, "%s/index", storePath); errno = 0; FILE *idx = fopen(name, "w"); if (errno) fprintf(stderr, "ERROR: Failed to open '%s': %s\n", name, strerror(errno)), exit(1); // Special case, we need an empty index for the zeroth fragment. AS_UTL_safeWrite(idx, &offm, "ovStore::mergeInfoFiles::offsetZero", sizeof(ovStoreOfft), 1); // Process each for (uint32 i=1; i<=nPieces; i++) { sprintf(name, "%s/%04d.info", storePath, i); fprintf(stderr, "Processing '%s'\n", name); if (AS_UTL_fileExists(name, FALSE, FALSE) == false) { fprintf(stderr, "ERROR: file '%s' not found.\n", name); exit(1); } { errno = 0; FILE *F = fopen(name, "r"); if (errno) fprintf(stderr, "ERROR: Failed to open '%s': %s\n", name, strerror(errno)), exit(1); AS_UTL_safeRead(F, &infopiece, "ovStore::mergeInfoFiles::infopiece", sizeof(ovStoreInfo), 1); fclose(F); } // Add empty index elements for missing overlaps if (infopiece._numOverlapsTotal == 0) { fprintf(stderr, " No overlaps found.\n"); continue; } assert(infopiece._smallestIID <= infopiece._largestIID); if (info._largestIID + 1 < infopiece._smallestIID) fprintf(stderr, " Adding empty records for fragments "F_U64" to "F_U64"\n", info._largestIID + 1, infopiece._smallestIID - 1); while (info._largestIID + 1 < infopiece._smallestIID) { offm._a_iid = info._largestIID + 1; //offm._fileno = set elsewhere //offm._offset = set elsewhere //offm._numOlaps = 0; AS_UTL_safeWrite(idx, &offm, "ovStore::mergeInfoFiles::offsets", sizeof(ovStoreOfft), 1); info._largestIID++; } // Copy index 
elements for existing overlaps. While copying, update the supposed position // of any fragments with no overlaps. Without doing this, accessing the store beginning // or ending at such a fragment will fail. { sprintf(name, "%s/%04d.index", storePath, i); errno = 0; FILE *F = fopen(name, "r"); if (errno) fprintf(stderr, "ERROR: Failed to open '%s': %s\n", name, strerror(errno)), exit(1); uint32 recsLen = 0; uint32 recsMax = 1024 * 1024; ovStoreOfft *recs = new ovStoreOfft [recsMax]; recsLen = AS_UTL_safeRead(F, recs, "ovStore::mergeInfoFiles::offsetsLoad", sizeof(ovStoreOfft), recsMax); if (recsLen > 0) { if (info._largestIID + 1 != recs[0]._a_iid) fprintf(stderr, "ERROR: '%s' starts with iid "F_U32", but store only up to "F_U64"\n", name, recs[0]._a_iid, info._largestIID); assert(info._largestIID + 1 == recs[0]._a_iid); } while (recsLen > 0) { offm._fileno = recs[recsLen-1]._fileno; // Update location of missing stuff. offm._offset = recs[recsLen-1]._offset; AS_UTL_safeWrite(idx, recs, "ovStore::mergeInfoFiles::offsetsWrite", sizeof(ovStoreOfft), recsLen); recsLen = AS_UTL_safeRead(F, recs, "ovStore::mergeInfoFiles::offsetsReLoad", sizeof(ovStoreOfft), recsMax); } delete [] recs; fclose(F); } // Update info._smallestIID = MIN(info._smallestIID, infopiece._smallestIID); info._largestIID = MAX(info._largestIID, infopiece._largestIID); info._numOverlapsTotal += infopiece._numOverlapsTotal; fprintf(stderr, " Now finished with fragments "F_U64" to "F_U64" -- "F_U64" overlaps.\n", info._smallestIID, info._largestIID, info._numOverlapsTotal); } fclose(idx); // Dump the new store info file { sprintf(name, "%s/info", storePath); errno = 0; FILE *F = fopen(name, "w"); if (errno) fprintf(stderr, "ERROR: Failed to open '%s': %s\n", name, strerror(errno)), exit(1); AS_UTL_safeWrite(F, &info, "ovStore::mergeInfoFiles::finalInfo", sizeof(ovStoreInfo), 1); fclose(F); } fprintf(stderr, "\n"); fprintf(stderr, "Index finalized for reads "F_U64" to "F_U64" with "F_U64" overlaps.\n", 
info._smallestIID, info._largestIID, info._numOverlapsTotal); }
int main(int argc, char **argv) { int32 minEvalue = 0; int32 maxEvalue = 0; int32 step = 1; char D[FILENAME_MAX]; char O[FILENAME_MAX]; if (argc == 2) { minEvalue = atoi(argv[1]); maxEvalue = minEvalue; } else if (argc == 3) { minEvalue = atoi(argv[1]); maxEvalue = atoi(argv[2]); } else if (argc == 4) { minEvalue = atoi(argv[1]); maxEvalue = atoi(argv[2]); step = atoi(argv[3]); } else { fprintf(stderr, "usage: %s minEvalue [maxEvalue [step]]\n", argv[0]); fprintf(stderr, " computes overlapper probabilities for minEvalue <= eValue <= maxEvalue'\n"); fprintf(stderr, " eValue 100 == 0.01 fraction error == 1%% error\n"); exit(1); } fprintf(stderr, "Computing Edit_Match_Limit data for reads of length %ubp (bits = %u).\n", AS_MAX_READLEN, AS_MAX_READLEN_BITS); sprintf(D, "prefixEditDistance-matchLimitData-BITS=%01d", AS_MAX_READLEN_BITS); AS_UTL_mkdir(D); #pragma omp parallel for schedule(dynamic, 1) for (int32 evalue=maxEvalue; evalue>=minEvalue; evalue -= step) { char N[FILENAME_MAX]; // Local to this thread! 
double erate = evalue / 10000.0; int32 start = 1; int32 MAX_ERRORS = (1 + (int) (erate * AS_MAX_READLEN)); int32 ERRORS_FOR_FREE = 1; int32 *starts = new int32 [MAX_ERRORS + 1]; memset(starts, 0, sizeof(int32) * (MAX_ERRORS + 1)); sprintf(N, "%s/prefixEditDistance-matchLimit-%04d.bin", D, evalue); if (AS_UTL_fileExists(N)) { fprintf(stderr, "eValue %04d -- eRate %6.4f -- %7.4f%% error -- %8d values -- thread %2d - LOAD\n", evalue, erate, erate * 100.0, MAX_ERRORS, omp_get_thread_num()); errno = 0; FILE *F = fopen(N, "r"); if (errno) fprintf(stderr, "Failed to open '%s' for reading: %s\n", N, strerror(errno)), exit(1); int32 me = 0; double er = 0.0; fread(&me, sizeof(int32), 1, F); fread(&er, sizeof(double), 1, F); fread( starts, sizeof(int32), MAX_ERRORS, F); assert(me == MAX_ERRORS); assert(er == erate); fclose(F); } else { fprintf(stderr, "eValue %04d -- eRate %6.4f -- %7.4f%% error -- %8d values -- thread %2d - COMPUTE\n", evalue, erate, erate * 100.0, MAX_ERRORS, omp_get_thread_num()); for (int32 e=ERRORS_FOR_FREE + 1; e<MAX_ERRORS; e++) { start = Binomial_Bound(e - ERRORS_FOR_FREE, erate, start); starts[e] = start - 1; } } { sprintf(O, "%s/prefixEditDistance-matchLimit-%04d.bin", D, evalue); errno = 0; FILE *F = fopen(O, "w"); if (errno) fprintf(stderr, "Failed to open '%s' for writing: %s\n", N, strerror(errno)), exit(1); fwrite(&MAX_ERRORS, sizeof(int32), 1, F); fwrite(&erate, sizeof(double), 1, F); fwrite( starts, sizeof(int32), MAX_ERRORS, F); fclose(F); } { sprintf(O, "%s/prefixEditDistance-matchLimit-%04d.dat", D, evalue); errno = 0; FILE *F = fopen(O, "w"); if (errno) fprintf(stderr, "Failed to open '%s' for writing: %s\n", N, strerror(errno)), exit(1); fprintf(F, "#length limit slope0toX slopeXtoMAX for erate=%0.4f MAX_ERRORS=%d\n", erate, MAX_ERRORS); for (uint32 mm=MAX_ERRORS-1, ii=1; ii<MAX_ERRORS; ii++) fprintf(F, "%-8d %8d %11.6f %11.6f\n", ii, starts[ii], (double)(starts[ii] - starts[1]) / (ii - 1 + 1), (double)(starts[mm] - starts[ii]) / (mm - 
ii + 1)); fclose(F); } { sprintf(O, "%s/prefixEditDistance-matchLimit-%04d.C", D, evalue); errno = 0; FILE *F = fopen(O, "w"); if (errno) fprintf(stderr, "Failed to open '%s' for writing: %s\n", N, strerror(errno)), exit(1); fprintf(F, "//\n"); fprintf(F, "// Automagically generated. Do not edit.\n"); fprintf(F, "//\n"); fprintf(F, "\n"); fprintf(F, "#include \"gkStore.H\"\n"); fprintf(F, "\n"); fprintf(F, "#if (AS_MAX_READLEN_BITS == %d)\n", AS_MAX_READLEN_BITS); fprintf(F, "\n"); fprintf(F, "extern\n"); fprintf(F, "const\n"); fprintf(F, "int32\n"); fprintf(F, "Edit_Match_Limit_%04d[%d] = {\n", evalue, MAX_ERRORS + 1); uint32 i=0; while (i < MAX_ERRORS) { uint32 j=0; fprintf(F, " "); while ((j < 16) && (i < MAX_ERRORS)) { if (i < MAX_ERRORS-1) fprintf(F, "0x%08x,", starts[i]); else fprintf(F, "0x%08x", starts[i]); i++; j++; } fprintf(F, "\n"); } fprintf(F, "};\n"); fprintf(F, "\n"); fprintf(F, "#endif\n"); fclose(F); } } }
int main(int argc, char **argv) { int32 minEvalue = 0; int32 maxEvalue = 0; int32 step = 0; if (argc == 2) { minEvalue = atoi(argv[1]); maxEvalue = minEvalue; } else if (argc == 3) { minEvalue = atoi(argv[1]); maxEvalue = atoi(argv[2]); } else if (argc == 4) { minEvalue = atoi(argv[1]); maxEvalue = atoi(argv[2]); step = atoi(argv[3]); } else { fprintf(stderr, "usage: %s minEvalue [maxEvalue [step]]\n", argv[0]); fprintf(stderr, " computes overlapper probabilities for minEvalue <= eValue <= maxEvalue'\n"); fprintf(stderr, " eValue 100 == 0.01 fraction error == 1%% error\n"); exit(1); } #pragma omp parallel for schedule(dynamic, 1) for (uint32 evalue=minEvalue; evalue<=maxEvalue; evalue += step) { double erate = evalue / 10000.0; int32 start = 1; int32 MAX_ERRORS = (1 + (int) (erate * AS_MAX_READLEN)); int32 ERRORS_FOR_FREE = 1; int32 *starts = new int32 [MAX_ERRORS + 1]; memset(starts, 0, sizeof(int32) * (MAX_ERRORS + 1)); char N[FILENAME_MAX]; sprintf(N, "prefixEditDistance-matchLimitData/prefixEditDistance-matchLimit-%04d.dat", evalue); if (AS_UTL_fileExists(N)) { fprintf(stderr, "eValue %04d -- eRate %6.4f -- %7.4f%% error -- %8d values -- thread %2d - LOAD\n", evalue, erate, erate * 100.0, MAX_ERRORS, omp_get_thread_num()); errno = 0; FILE *F = fopen(N, "r"); if (errno) fprintf(stderr, "Failed to open '%s' for reading: %s\n", N, strerror(errno)), exit(1); int32 me = 0; double er = 0.0; fread(&me, sizeof(int32), 1, F); fread(&er, sizeof(double), 1, F); fread( starts, sizeof(int32), MAX_ERRORS, F); assert(me == MAX_ERRORS); assert(er == erate); fclose(F); } else { fprintf(stderr, "eValue %04d -- eRate %6.4f -- %7.4f%% error -- %8d values -- thread %2d - COMPUTE\n", evalue, erate, erate * 100.0, MAX_ERRORS, omp_get_thread_num()); for (int32 e=ERRORS_FOR_FREE + 1; e<MAX_ERRORS; e++) { start = Binomial_Bound(e - ERRORS_FOR_FREE, erate, start); starts[e] = start - 1; } } { sprintf(N, "prefixEditDistance-matchLimitData/prefixEditDistance-matchLimit-%04d.dat", evalue); 
errno = 0; FILE *F = fopen(N, "w"); if (errno) fprintf(stderr, "Failed to open '%s' for writing: %s\n", N, strerror(errno)), exit(1); fwrite(&MAX_ERRORS, sizeof(int32), 1, F); fwrite(&erate, sizeof(double), 1, F); fwrite( starts, sizeof(int32), MAX_ERRORS, F); fclose(F); } { sprintf(N, "prefixEditDistance-matchLimit-%04d.C", evalue); errno = 0; FILE *F = fopen(N, "w"); if (errno) fprintf(stderr, "Failed to open '%s' for writing: %s\n", N, strerror(errno)), exit(1); fprintf(F, "//\n"); fprintf(F, "// Automagically generated. Do not edit.\n"); fprintf(F, "//\n"); fprintf(F, "\n"); fprintf(F, "#include \"AS_global.H\"\n"); fprintf(F, "\n"); fprintf(F, "extern\n"); fprintf(F, "const\n"); fprintf(F, "int32\n"); fprintf(F, "Edit_Match_Limit_%04d[%d] = {\n", evalue, MAX_ERRORS + 1); uint32 i=0; while (i < MAX_ERRORS) { uint32 j=0; fprintf(F, " "); while ((j < 16) && (i < MAX_ERRORS)) { if (i < MAX_ERRORS-1) fprintf(F, "0x%08x,", starts[i]); else fprintf(F, "0x%08x", starts[i]); i++; j++; } fprintf(F, "\n"); } fprintf(F, "};\n"); fclose(F); } } }