int main(int argc, char **argv) { int illegal; argc = AS_configure(argc, argv); G.initialize(); int err=0; int arg=1; while (arg < argc) { if (strcmp(argv[arg], "-G") == 0) { G.Doing_Partial_Overlaps = TRUE; } else if (strcmp(argv[arg], "-h") == 0) { AS_UTL_decodeRange(argv[++arg], G.bgnHashID, G.endHashID); } else if (strcmp(argv[arg], "-H") == 0) { AS_UTL_decodeRange(argv[++arg], G.minLibToHash, G.maxLibToHash); } else if (strcmp(argv[arg], "-r") == 0) { AS_UTL_decodeRange(argv[++arg], G.bgnRefID, G.endRefID); } else if (strcmp(argv[arg], "-R") == 0) { AS_UTL_decodeRange(argv[++arg], G.minLibToRef, G.maxLibToRef); } else if (strcmp(argv[arg], "-k") == 0) { arg++; if ((isdigit(argv[arg][0]) && (argv[arg][1] == 0)) || (isdigit(argv[arg][0]) && isdigit(argv[arg][1]) && (argv[arg][2] == 0))) { G.Kmer_Len = strtoull(argv[arg], NULL, 10); } else { errno = 0; G.Kmer_Skip_File = fopen(argv[arg], "r"); if (errno) fprintf(stderr, "ERROR: Failed to open -k '%s': %s\n", argv[arg], strerror(errno)), exit(1); } } else if (strcmp(argv[arg], "-l") == 0) { G.Frag_Olap_Limit = strtol(argv[++arg], NULL, 10); if (G.Frag_Olap_Limit < 1) G.Frag_Olap_Limit = UINT64_MAX; } else if (strcmp(argv[arg], "-m") == 0) { G.Unique_Olap_Per_Pair = FALSE; } else if (strcmp(argv[arg], "-u") == 0) { G.Unique_Olap_Per_Pair = TRUE; } else if (strcmp(argv[arg], "--hashbits") == 0) { G.Hash_Mask_Bits = strtoull(argv[++arg], NULL, 10); } else if (strcmp(argv[arg], "--hashstrings") == 0) { G.Max_Hash_Strings = strtoull(argv[++arg], NULL, 10); } else if (strcmp(argv[arg], "--hashdatalen") == 0) { G.Max_Hash_Data_Len = strtoull(argv[++arg], NULL, 10); } else if (strcmp(argv[arg], "--hashload") == 0) { G.Max_Hash_Load = atof(argv[++arg]); } else if (strcmp(argv[arg], "--maxreadlen") == 0) { // Quite the gross way to do this, but simple. uint32 desired = strtoul(argv[++arg], NULL, 10); OFFSET_BITS = 1; while (((uint32)1 << OFFSET_BITS) < desired) OFFSET_BITS++; STRING_NUM_BITS = 30 - OFFSET_BITS; STRING_NUM_MASK = (1 << STRING_NUM_BITS) - 1; OFFSET_MASK = (1 << OFFSET_BITS) - 1; MAX_STRING_NUM = STRING_NUM_MASK; } else if (strcmp(argv[arg], "-o") == 0) { G.Outfile_Name = argv[++arg]; } else if (strcmp(argv[arg], "-s") == 0) { G.Outstat_Name = argv[++arg]; } else if (strcmp(argv[arg], "-t") == 0) { G.Num_PThreads = strtoull(argv[++arg], NULL, 10); } else if (strcmp(argv[arg], "--minlength") == 0) { G.Min_Olap_Len = strtol (argv[++arg], NULL, 10); } else if (strcmp(argv[arg], "--maxerate") == 0) { G.maxErate = ceil(strtof(argv[++arg], NULL) * 100) / 100; } else if (strcmp(argv[arg], "-w") == 0) { G.Use_Window_Filter = TRUE; } else if (strcmp(argv[arg], "-z") == 0) { G.Use_Hopeless_Check = FALSE; } else { if (G.Frag_Store_Path == NULL) { G.Frag_Store_Path = argv[arg]; } else { fprintf(stderr, "Unknown option '%s'\n", argv[arg]); err++; } } arg++; } // Fix up some flags if we're allowing high error rates. // if (G.maxErate > 0.06) { if (G.Use_Window_Filter) fprintf(stderr, "High error rates requested -- window-filter turned off despite -w flag!\n"); G.Use_Window_Filter = FALSE; G.Use_Hopeless_Check = FALSE; } if (G.Max_Hash_Strings == 0) fprintf(stderr, "* No memory model supplied; -M needed!\n"), err++; if (G.Kmer_Len == 0) fprintf(stderr, "* No kmer length supplied; -k needed!\n"), err++; if (G.Max_Hash_Strings > MAX_STRING_NUM) fprintf(stderr, "Too many strings (--hashstrings), must be less than "F_U64"\n", MAX_STRING_NUM), err++; if (G.Outfile_Name == NULL) fprintf (stderr, "ERROR: No output file name specified\n"), err++; if ((err) || (G.Frag_Store_Path == NULL)) { fprintf(stderr, "USAGE: %s [options] <gkpStorePath>\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, "-b <fn> in contig mode, specify the output file\n"); fprintf(stderr, "-c contig mode. Use 2 frag stores. First is\n"); fprintf(stderr, " for reads; second is for contigs\n"); fprintf(stderr, "-G do partial overlaps\n"); fprintf(stderr, "-h <range> to specify fragments to put in hash table\n"); fprintf(stderr, " Implies LSF mode (no changes to frag store)\n"); fprintf(stderr, "-I designate a file of frag iids to limit olaps to\n"); fprintf(stderr, " (Contig mode only)\n"); fprintf(stderr, "-k if one or two digits, the length of a kmer, otherwise\n"); fprintf(stderr, " the filename containing a list of kmers to ignore in\n"); fprintf(stderr, " the hash table\n"); fprintf(stderr, "-l specify the maximum number of overlaps per\n"); fprintf(stderr, " fragment-end per batch of fragments.\n"); fprintf(stderr, "-m allow multiple overlaps per oriented fragment pair\n"); fprintf(stderr, "-M specify memory size. Valid values are '8GB', '4GB',\n"); fprintf(stderr, " '2GB', '1GB', '256MB'. (Not for Contig mode)\n"); fprintf(stderr, "-o specify output file name\n"); fprintf(stderr, "-P write protoIO output (if not -G)\n"); fprintf(stderr, "-r <range> specify old fragments to overlap\n"); fprintf(stderr, "-t <n> use <n> parallel threads\n"); fprintf(stderr, "-u allow only 1 overlap per oriented fragment pair\n"); fprintf(stderr, "-w filter out overlaps with too many errors in a window\n"); fprintf(stderr, "-z skip the hopeless check\n"); fprintf(stderr, "\n"); fprintf(stderr, "--maxerate <n> only output overlaps with fraction <n> or less error (e.g., 0.06 == 6%%)\n"); fprintf(stderr, "--minlength <n> only output overlaps of <n> or more bases\n"); fprintf(stderr, "\n"); fprintf(stderr, "--hashbits n Use n bits for the hash mask.\n"); fprintf(stderr, "--hashstrings n Load at most n strings into the hash table at one time.\n"); fprintf(stderr, "--hashdatalen n Load at most n bytes into the hash table at one time.\n"); fprintf(stderr, "--hashload f Load to at most 0.0 < f < 1.0 capacity (default 0.7).\n"); fprintf(stderr, "\n"); fprintf(stderr, "--maxreadlen n For batches with all short reads, pack bits differently to\n"); fprintf(stderr, " process more reads per batch.\n"); fprintf(stderr, " all reads must be shorter than n\n"); fprintf(stderr, " --hashstrings limited to 2^(30-m)\n"); fprintf(stderr, " Common values:\n"); fprintf(stderr, " maxreadlen 2048->hashstrings 524288 (default)\n"); fprintf(stderr, " maxreadlen 512->hashstrings 2097152\n"); fprintf(stderr, " maxreadlen 128->hashstrings 8388608\n"); fprintf(stderr, "\n"); fprintf(stderr, "--readsperbatch n Force batch size to n.\n"); fprintf(stderr, "--readsperthread n Force each thread to process n reads.\n"); fprintf(stderr, "\n"); exit(1); } Out_BOF = new ovFile(G.Outfile_Name, ovFileFullWrite); // We know enough now to set the hash function variables, and some other random variables. HSF1 = G.Kmer_Len - (G.Hash_Mask_Bits / 2); HSF2 = 2 * G.Kmer_Len - G.Hash_Mask_Bits; SV1 = HSF1 + 2; SV2 = (HSF1 + HSF2) / 2; SV3 = HSF2 - 2; // Log parameters. fprintf(stderr, "\n"); fprintf(stderr, "STRING_NUM_BITS "F_U32"\n", STRING_NUM_BITS); fprintf(stderr, "OFFSET_BITS "F_U32"\n", OFFSET_BITS); fprintf(stderr, "STRING_NUM_MASK "F_U64"\n", STRING_NUM_MASK); fprintf(stderr, "OFFSET_MASK "F_U64"\n", OFFSET_MASK); fprintf(stderr, "MAX_STRING_NUM "F_U64"\n", MAX_STRING_NUM); fprintf(stderr, "\n"); fprintf(stderr, "Hash_Mask_Bits "F_U32"\n", G.Hash_Mask_Bits); fprintf(stderr, "Max_Hash_Strings "F_U32"\n", G.Max_Hash_Strings); fprintf(stderr, "Max_Hash_Data_Len "F_U64"\n", G.Max_Hash_Data_Len); fprintf(stderr, "Max_Hash_Load %f\n", G.Max_Hash_Load); fprintf(stderr, "Kmer Length "F_U64"\n", G.Kmer_Len); fprintf(stderr, "Min Overlap Length %d\n", G.Min_Olap_Len); fprintf(stderr, "Max Error Rate %f\n", G.maxErate); fprintf(stderr, "\n"); fprintf(stderr, "Num_PThreads "F_U32"\n", G.Num_PThreads); assert (8 * sizeof (uint64) > 2 * G.Kmer_Len); Bit_Equivalent['a'] = Bit_Equivalent['A'] = 0; Bit_Equivalent['c'] = Bit_Equivalent['C'] = 1; Bit_Equivalent['g'] = Bit_Equivalent['G'] = 2; Bit_Equivalent['t'] = Bit_Equivalent['T'] = 3; for (int i = 0; i < 256; i ++) { char ch = tolower ((char) i); if (ch == 'a' || ch == 'c' || ch == 'g' || ch == 't') Char_Is_Bad[i] = 0; else Char_Is_Bad[i] = 1; } fprintf(stderr, "\n"); fprintf(stderr, "HASH_TABLE_SIZE "F_U32"\n", HASH_TABLE_SIZE); fprintf(stderr, "sizeof(Hash_Bucket_t) "F_SIZE_T"\n", sizeof(Hash_Bucket_t)); fprintf(stderr, "hash table size: "F_SIZE_T" MB\n", (HASH_TABLE_SIZE * sizeof(Hash_Bucket_t)) >> 20); fprintf(stderr, "\n"); Hash_Table = new Hash_Bucket_t [HASH_TABLE_SIZE]; fprintf(stderr, "check "F_SIZE_T" MB\n", (HASH_TABLE_SIZE * sizeof (Check_Vector_t) >> 20)); fprintf(stderr, "info "F_SIZE_T" MB\n", (G.Max_Hash_Strings * sizeof (Hash_Frag_Info_t) >> 20)); fprintf(stderr, "start "F_SIZE_T" MB\n", (G.Max_Hash_Strings * sizeof (int64) >> 20)); fprintf(stderr, "\n"); Hash_Check_Array = new Check_Vector_t [HASH_TABLE_SIZE]; String_Info = new Hash_Frag_Info_t [G.Max_Hash_Strings]; String_Start = new int64 [G.Max_Hash_Strings]; String_Start_Size = G.Max_Hash_Strings; memset(Hash_Check_Array, 0, sizeof(Check_Vector_t) * HASH_TABLE_SIZE); memset(String_Info, 0, sizeof(Hash_Frag_Info_t) * G.Max_Hash_Strings); memset(String_Start, 0, sizeof(int64) * G.Max_Hash_Strings); OverlapDriver(); delete [] basesData; delete [] qualsData; delete [] nextRef; delete [] String_Start; delete [] String_Info; delete [] Hash_Check_Array; delete [] Hash_Table; delete Out_BOF; FILE *stats = stderr; if (G.Outstat_Name != NULL) { errno = 0; stats = fopen(G.Outstat_Name, "w"); if (errno) { fprintf(stderr, "WARNING: failed to open '%s' for writing: %s\n", G.Outstat_Name, strerror(errno)); stats = stderr; } } fprintf(stats, " Kmer hits without olaps = "F_S64"\n", Kmer_Hits_Without_Olap_Ct); fprintf(stats, " Kmer hits with olaps = "F_S64"\n", Kmer_Hits_With_Olap_Ct); fprintf(stats, " Multiple overlaps/pair = "F_S64"\n", Multi_Overlap_Ct); fprintf(stats, " Total overlaps produced = "F_S64"\n", Total_Overlaps); fprintf(stats, " Contained overlaps = "F_S64"\n", Contained_Overlap_Ct); fprintf(stats, " Dovetail overlaps = "F_S64"\n", Dovetail_Overlap_Ct); fprintf(stats, "Rejected by short window = "F_S64"\n", Bad_Short_Window_Ct); fprintf(stats, " Rejected by long window = "F_S64"\n", Bad_Long_Window_Ct); if (stats != stderr) fclose(stats); return(0); }
int main(int argc, char **argv) { char *gkpName = 0L; char *ovsName = 0L; char *iniClrName = NULL; char *maxClrName = NULL; char *outClrName = NULL; uint32 errorValue = AS_OVS_encodeEvalue(0.015); uint32 minAlignLength = 40; uint32 minReadLength = 64; char *outputPrefix = NULL; char logName[FILENAME_MAX] = {0}; char sumName[FILENAME_MAX] = {0}; FILE *logFile = 0L; FILE *sumFile = 0L; uint32 idMin = 1; uint32 idMax = UINT32_MAX; uint32 minEvidenceOverlap = 40; uint32 minEvidenceCoverage = 1; argc = AS_configure(argc, argv); int arg=1; int err=0; while (arg < argc) { if (strcmp(argv[arg], "-G") == 0) { gkpName = argv[++arg]; } else if (strcmp(argv[arg], "-O") == 0) { ovsName = argv[++arg]; } else if (strcmp(argv[arg], "-Ci") == 0) { iniClrName = argv[++arg]; } else if (strcmp(argv[arg], "-Cm") == 0) { maxClrName = argv[++arg]; } else if (strcmp(argv[arg], "-Co") == 0) { outClrName = argv[++arg]; } else if (strcmp(argv[arg], "-e") == 0) { double erate = atof(argv[++arg]); errorValue = AS_OVS_encodeEvalue(erate); } else if (strcmp(argv[arg], "-l") == 0) { minAlignLength = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-minlength") == 0) { minReadLength = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-ol") == 0) { minEvidenceOverlap = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-oc") == 0) { minEvidenceCoverage = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-o") == 0) { outputPrefix = argv[++arg]; } else if (strcmp(argv[arg], "-t") == 0) { AS_UTL_decodeRange(argv[++arg], idMin, idMax); } else { fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]); err++; } arg++; } if ((gkpName == NULL) || (ovsName == NULL) || (outputPrefix == NULL) || (err)) { fprintf(stderr, "usage: %s -G gkpStore -O ovlStore -Co output.clearFile -o outputPrefix\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, " -G gkpStore path to read store\n"); fprintf(stderr, " -O ovlStore path to overlap store\n"); fprintf(stderr, "\n"); fprintf(stderr, " -o name output prefix, for logging\n"); fprintf(stderr, "\n"); fprintf(stderr, " -t bgn-end limit processing to only reads from bgn to end (inclusive)\n"); fprintf(stderr, "\n"); fprintf(stderr, " -Ci clearFile path to input clear ranges (NOT SUPPORTED)\n"); //fprintf(stderr, " -Cm clearFile path to maximal clear ranges\n"); fprintf(stderr, " -Co clearFile path to ouput clear ranges\n"); fprintf(stderr, "\n"); fprintf(stderr, " -e erate ignore overlaps with more than 'erate' percent error\n"); //fprintf(stderr, " -l length ignore overlaps shorter than 'l' aligned bases (NOT SUPPORTED)\n"); fprintf(stderr, "\n"); fprintf(stderr, " -ol l the minimum evidence overlap length\n"); fprintf(stderr, " -oc c the minimum evidence overlap coverage\n"); fprintf(stderr, " evidence overlaps must overlap by 'l' bases to be joined, and\n"); fprintf(stderr, " must be at least 'c' deep to be retained\n"); fprintf(stderr, "\n"); fprintf(stderr, " -minlength l reads trimmed below this many bases are deleted\n"); fprintf(stderr, "\n"); exit(1); } gkStore *gkp = gkStore::gkStore_open(gkpName); ovStore *ovs = new ovStore(ovsName, gkp); clearRangeFile *iniClr = (iniClrName == NULL) ? NULL : new clearRangeFile(iniClrName, gkp); clearRangeFile *maxClr = (maxClrName == NULL) ? NULL : new clearRangeFile(maxClrName, gkp); clearRangeFile *outClr = (outClrName == NULL) ? NULL : new clearRangeFile(outClrName, gkp); if (outClr) // If the outClr file exists, those clear ranges are loaded. We need to reset them // back to 'untrimmed' for now. outClr->reset(gkp); if (iniClr && outClr) // An iniClr file was supplied, so use those as the initial clear ranges. outClr->copy(iniClr); if (outputPrefix) { sprintf(logName, "%s.log", outputPrefix); sprintf(sumName, "%s.summary", outputPrefix); errno = 0; logFile = fopen(logName, "w"); if (errno) fprintf(stderr, "Failed to open log file '%s' for writing: %s\n", logName, strerror(errno)), exit(1); sumFile = fopen(sumName, "w"); if (errno) fprintf(stderr, "Failed to open summary file '%s' for writing: %s\n", sumName, strerror(errno)), exit(1); fprintf(logFile, "id\tinitL\tinitR\tfinalL\tfinalR\tmessage (DEL=deleted NOC=no change MOD=modified)\n"); fprintf(sumFile, "Overlap error rate <= %.4f fraction error\n", AS_OVS_decodeEvalue(errorValue)); fprintf(sumFile, "Overlap min overlap >= %u base%s (for 'largest covered')\n", minEvidenceOverlap, (minEvidenceOverlap == 1) ? "" : "s"); fprintf(sumFile, "Overlap min coverage >= %u read%s (for 'largest covered')\n", minEvidenceCoverage, (minEvidenceCoverage == 1) ? "" : "s"); } uint32 ovlLen = 0; uint32 ovlMax = 64 * 1024; ovOverlap *ovl = ovOverlap::allocateOverlaps(gkp, ovlMax); memset(ovl, 0, sizeof(ovOverlap) * ovlMax); char logMsg[1024] = {0}; if (idMin < 1) idMin = 1; if (idMax > gkp->gkStore_getNumReads()) idMax = gkp->gkStore_getNumReads(); fprintf(stderr, "Processing from ID "F_U32" to "F_U32" out of "F_U32" reads.\n", idMin, idMax, gkp->gkStore_getNumReads()); for (uint32 id=idMin; id<=idMax; id++) { gkRead *read = gkp->gkStore_getRead(id); gkLibrary *libr = gkp->gkStore_getLibrary(read->gkRead_libraryID()); logMsg[0] = 0; // If the fragment is deleted, do nothing. If the fragment was deleted AFTER overlaps were // generated, then the overlaps will be out of sync -- we'll get overlaps for these fragments // we skip. // if ((iniClr) && (iniClr->isDeleted(id) == true)) continue; // If it did not request trimming, do nothing. Similar to the above, we'll get overlaps to // fragments we skip. // if ((libr->gkLibrary_finalTrim() == FINALTRIM_LARGEST_COVERED) && (libr->gkLibrary_finalTrim() == FINALTRIM_BEST_EDGE)) continue; // Decide on the initial trimming. We copied any iniClr into outClr above, and if there wasn't // an iniClr, then outClr is the full read. uint32 ibgn = outClr->bgn(id); uint32 iend = outClr->end(id); // Set the, ahem, initial final trimming. bool isGood = false; uint32 fbgn = ibgn; uint32 fend = iend; // Load overlaps. uint32 nLoaded = ovs->readOverlaps(id, ovl, ovlLen, ovlMax); // Trim! if (nLoaded == 0) { // No overlaps, so mark it as junk. isGood = false; } else if (libr->gkLibrary_finalTrim() == FINALTRIM_LARGEST_COVERED) { // Use the largest region covered by overlaps as the trim assert(ovlLen > 0); assert(id == ovl[0].a_iid); isGood = largestCovered(ovl, ovlLen, read, ibgn, iend, fbgn, fend, logMsg, errorValue, minEvidenceOverlap, minEvidenceCoverage, minReadLength); assert(fbgn <= fend); } else if (libr->gkLibrary_finalTrim() == FINALTRIM_BEST_EDGE) { // Use the largest region covered by overlaps as the trim assert(ovlLen > 0); assert(id == ovl[0].a_iid); isGood = bestEdge(ovl, ovlLen, read, ibgn, iend, fbgn, fend, logMsg, errorValue, minEvidenceOverlap, minEvidenceCoverage, minReadLength); assert(fbgn <= fend); } else { // Do nothing. Really shouldn't get here. assert(0); continue; } // Enforce the maximum clear range if ((isGood) && (maxClr)) { isGood = enforceMaximumClearRange(ovl, ovlLen, read, ibgn, iend, fbgn, fend, logMsg, maxClr); assert(fbgn <= fend); } // // Trimmed. Make sense of the result, write some logs, and update the output. // // If bad trimming or too small, write the log and keep going. // if ((isGood == false) || (fend - fbgn < minReadLength)) { outClr->setbgn(id) = fbgn; outClr->setend(id) = fend; outClr->setDeleted(id); // Gah, just obliterates the clear range. fprintf(logFile, F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\tDEL%s\n", id, ibgn, iend, fbgn, fend, (logMsg[0] == 0) ? "" : logMsg); } // If we didn't change anything, also write a log. // else if ((ibgn == fbgn) && (iend == fend)) { fprintf(logFile, F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\tNOC%s\n", id, ibgn, iend, fbgn, fend, (logMsg[0] == 0) ? "" : logMsg); continue; } // Otherwise, we actually did something. else { outClr->setbgn(id) = fbgn; outClr->setend(id) = fend; fprintf(logFile, F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\tMOD%s\n", id, ibgn, iend, fbgn, fend, (logMsg[0] == 0) ? "" : logMsg); } } gkp->gkStore_close(); delete ovs; delete iniClr; delete maxClr; delete outClr; fclose(logFile); fclose(sumFile); exit(0); }
int main(int argc, char **argv) { char *gkpName = NULL; char *ovsName = NULL; char *finClrName = NULL; char *outClrName = NULL; double errorRate = 0.06; //uint32 minAlignLength = 40; uint32 minReadLength = 64; uint32 idMin = 1; uint32 idMax = UINT32_MAX; char *outputPrefix = NULL; char outputName[FILENAME_MAX]; FILE *staFile = NULL; FILE *reportFile = NULL; FILE *subreadFile = NULL; bool doSubreadLogging = true; bool doSubreadLoggingVerbose = false; // Statistics on the trimming - the second set are from the old logging, and don't really apply anymore. trimStat readsIn; // Read is eligible for trimming trimStat deletedIn; // Read was deleted already trimStat noTrimIn; // Read not requesting trimming trimStat noOverlaps; // no overlaps in store trimStat noCoverage; // no coverage after adjusting for trimming done trimStat readsProcChimera; // Read was processed for chimera signal trimStat readsProcSpur; // Read was processed for spur signal trimStat readsProcSubRead; // Read was processed for subread signal #if 0 trimStat badSpur5; trimStat badSpur3; trimStat badChimera; trimStat badSubread; #endif trimStat readsNoChange; trimStat readsBadSpur5, basesBadSpur5; trimStat readsBadSpur3, basesBadSpur3; trimStat readsBadChimera, basesBadChimera; trimStat readsBadSubread, basesBadSubread; trimStat readsTrimmed5; trimStat readsTrimmed3; #if 0 trimStat fullCoverage; // fully covered by overlaps trimStat noSignalNoGap; // no signal, no gaps trimStat noSignalButGap; // no signal, with gaps trimStat bothFixed; // both chimera and spur signal trimmed trimStat chimeraFixed; // only chimera signal trimmed trimStat spurFixed; // only spur signal trimmed trimStat bothDeletedSmall; // deleted because of both cimera and spur signals trimStat chimeraDeletedSmall; // deleted because of chimera signal trimStat spurDeletedSmall; // deleted because of spur signal trimStat spurDetectedNormal; // normal spur detected trimStat spurDetectedLinker; // linker spur detected trimStat chimeraDetectedInnie; // innpue-pair chimera detected trimStat chimeraDetectedOverhang; // overhanging chimera detected trimStat chimeraDetectedGap; // gap chimera detected trimStat chimeraDetectedLinker; // linker chimera detected #endif trimStat deletedOut; // Read was deleted by trimming argc = AS_configure(argc, argv); int arg=1; int err=0; while (arg < argc) { if (strcmp(argv[arg], "-G") == 0) { gkpName = argv[++arg]; } else if (strcmp(argv[arg], "-O") == 0) { ovsName = argv[++arg]; } else if (strcmp(argv[arg], "-o") == 0) { outputPrefix = argv[++arg]; } else if (strcmp(argv[arg], "-t") == 0) { AS_UTL_decodeRange(argv[++arg], idMin, idMax); } else if (strcmp(argv[arg], "-Ci") == 0) { finClrName = argv[++arg]; } else if (strcmp(argv[arg], "-Co") == 0) { outClrName = argv[++arg]; } else if (strcmp(argv[arg], "-e") == 0) { errorRate = atof(argv[++arg]); //} else if (strcmp(argv[arg], "-l") == 0) { // minAlignLength = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-minlength") == 0) { minReadLength = atoi(argv[++arg]); } else { fprintf(stderr, "%s: unknown option '%s'\n", argv[0], argv[arg]); err++; } arg++; } if (errorRate < 0.0) err++; if ((gkpName == 0L) || (ovsName == 0L) || (outputPrefix == NULL) || (err)) { fprintf(stderr, "usage: %s -G gkpStore -O ovlStore -Ci input.clearFile -Co output.clearFile -o outputPrefix]\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, " -G gkpStore path to read store\n"); fprintf(stderr, " -O ovlStore path to overlap store\n"); fprintf(stderr, "\n"); fprintf(stderr, " -o name output prefix, for logging\n"); fprintf(stderr, "\n"); fprintf(stderr, " -t bgn-end limit processing to only reads from bgn to end (inclusive)\n"); fprintf(stderr, "\n"); fprintf(stderr, " -Ci clearFile path to input clear ranges (NOT SUPPORTED)\n"); fprintf(stderr, " -Co clearFile path to ouput clear ranges\n"); fprintf(stderr, "\n"); fprintf(stderr, " -e erate ignore overlaps with more than 'erate' percent error\n"); //fprintf(stderr, " -l length ignore overlaps shorter than 'l' aligned bases (NOT SUPPORTED)\n"); fprintf(stderr, "\n"); fprintf(stderr, " -minlength l reads trimmed below this many bases are deleted\n"); fprintf(stderr, "\n"); if (errorRate < 0.0) fprintf(stderr, "ERROR: Error rate (-e) value %f too small; must be 'fraction error' and above 0.0\n", errorRate); exit(1); } gkStore *gkp = gkStore::gkStore_open(gkpName); ovStore *ovs = new ovStore(ovsName, gkp); clearRangeFile *finClr = new clearRangeFile(finClrName, gkp); clearRangeFile *outClr = new clearRangeFile(outClrName, gkp); if (outClr) // If the outClr file exists, those clear ranges are loaded. We need to reset them // back to 'untrimmed' for now. outClr->reset(gkp); if (finClr && outClr) // A finClr file was supplied, so use those as the clear ranges. outClr->copy(finClr); sprintf(outputName, "%s.log", outputPrefix); errno = 0; reportFile = fopen(outputName, "w"); if (errno) fprintf(stderr, "Failed to open '%s' for writing: %s\n", outputName, strerror(errno)), exit(1); sprintf(outputName, "%s.subread.log", outputPrefix); errno = 0; subreadFile = fopen(outputName, "w"); if (errno) fprintf(stderr, "Failed to open '%s' for writing: %s\n", outputName, strerror(errno)), exit(1); uint32 ovlLen = 0; uint32 ovlMax = 64 * 1024; ovOverlap *ovl = ovOverlap::allocateOverlaps(gkp, ovlMax); memset(ovl, 0, sizeof(ovOverlap) * ovlMax); workUnit *w = new workUnit; if (idMin < 1) idMin = 1; if (idMax > gkp->gkStore_getNumReads()) idMax = gkp->gkStore_getNumReads(); fprintf(stderr, "Processing from ID "F_U32" to "F_U32" out of "F_U32" reads, using errorRate = %.2f\n", idMin, idMax, gkp->gkStore_getNumReads(), errorRate); for (uint32 id=idMin; id<=idMax; id++) { gkRead *read = gkp->gkStore_getRead(id); gkLibrary *libr = gkp->gkStore_getLibrary(read->gkRead_libraryID()); if (finClr->isDeleted(id)) { // Read already trashed. deletedIn += read->gkRead_sequenceLength(); continue; } if ((libr->gkLibrary_removeSpurReads() == false) && (libr->gkLibrary_removeChimericReads() == false) && (libr->gkLibrary_checkForSubReads() == false)) { // Nothing to do. noTrimIn += read->gkRead_sequenceLength(); continue; } readsIn += read->gkRead_sequenceLength(); uint32 nLoaded = ovs->readOverlaps(id, ovl, ovlLen, ovlMax); //fprintf(stderr, "read %7u with %7u overlaps\r", id, nLoaded); if (nLoaded == 0) { // No overlaps, nothing to check! noOverlaps += read->gkRead_sequenceLength(); continue; } w->clear(id, finClr->bgn(id), finClr->end(id)); w->addAndFilterOverlaps(gkp, finClr, errorRate, ovl, ovlLen); if (w->adjLen == 0) { // All overlaps trimmed out! noCoverage += read->gkRead_sequenceLength(); continue; } // Find bad regions. //if (libr->gkLibrary_markBad() == true) // // From an external file, a list of known bad regions. If no overlaps span // // the region with sufficient coverage, mark the region as bad. This was // // motivated by the old 454 linker detection. // markBad(gkp, w, subreadFile, doSubreadLoggingVerbose); //if (libr->gkLibrary_removeSpurReads() == true) { // readsProcSpur += read->gkRead_sequenceLength(); // detectSpur(gkp, w, subreadFile, doSubreadLoggingVerbose); // Get stats on spur region detected - save the length of each region to the trimStats object. //} //if (libr->gkLibrary_removeChimericReads() == true) { // readsProcChimera += read->gkRead_sequenceLength(); // detectChimer(gkp, w, subreadFile, doSubreadLoggingVerbose); // Get stats on chimera region detected - save the length of each region to the trimStats object. //} if (libr->gkLibrary_checkForSubReads() == true) { readsProcSubRead += read->gkRead_sequenceLength(); detectSubReads(gkp, w, subreadFile, doSubreadLoggingVerbose); } // Get stats on the bad regions found. This kind of duplicates code in trimBadInterval(), but // I don't want to pass all the stats objects into there. if (w->blist.size() == 0) { readsNoChange += read->gkRead_sequenceLength(); } else { uint32 nSpur5 = 0, bSpur5 = 0; uint32 nSpur3 = 0, bSpur3 = 0; uint32 nChimera = 0, bChimera = 0; uint32 nSubread = 0, bSubread = 0; for (uint32 bb=0; bb<w->blist.size(); bb++) { switch (w->blist[bb].type) { case badType_5spur: nSpur5 += 1; basesBadSpur5 += w->blist[bb].end - w->blist[bb].bgn; break; case badType_3spur: nSpur3 += 1; basesBadSpur3 += w->blist[bb].end - w->blist[bb].bgn; break; case badType_chimera: nChimera += 1; basesBadChimera += w->blist[bb].end - w->blist[bb].bgn; break; case badType_subread: nSubread += 1; basesBadSubread += w->blist[bb].end - w->blist[bb].bgn; break; default: break; } } if (nSpur5 > 0) readsBadSpur5 += nSpur5; if (nSpur3 > 0) readsBadSpur3 += nSpur3; if (nChimera > 0) readsBadChimera += nChimera; if (nSubread > 0) readsBadSubread += nSubread; } // Find solution. This coalesces the list (in 'w') of all the bad regions found, picks out the // largest good region, generates a log of the bad regions that support this decision, and sets // the trim points. trimBadInterval(gkp, w, minReadLength, subreadFile, doSubreadLoggingVerbose); // Log the solution. AS_UTL_safeWrite(reportFile, w->logMsg, "logMsg", sizeof(char), strlen(w->logMsg)); // Save the solution.... outClr->setbgn(w->id) = w->clrBgn; outClr->setend(w->id) = w->clrEnd; // And maybe delete the read. if (w->isOK == false) { deletedOut += read->gkRead_sequenceLength(); outClr->setDeleted(w->id); } // Update stats on what was trimmed. The asserts say the clear range didn't expand, and the if // tests if the clear range changed. assert(w->clrBgn >= w->iniBgn); assert(w->iniEnd >= w->clrEnd); if (w->clrBgn > w->iniBgn) readsTrimmed5 += w->clrBgn - w->iniBgn; if (w->iniEnd > w->clrEnd) readsTrimmed3 += w->iniEnd - w->clrEnd; } delete [] ovl; delete w; gkp->gkStore_close(); delete finClr; delete outClr; // Close log files if (reportFile) fclose(reportFile); if (subreadFile) fclose(subreadFile); // Write the summary if (outputPrefix) { sprintf(outputName, "%s.stats", outputPrefix); errno = 0; staFile = fopen(outputName, "w"); if (errno) fprintf(stderr, "Failed to open '%s' for writing: %s\n", outputName, strerror(errno)); } if (staFile == NULL) staFile = stdout; // Would like to know number of subreads per read fprintf(staFile, "PARAMETERS:\n"); fprintf(staFile, "----------\n"); fprintf(staFile, "%7u (reads trimmed below this many bases are deleted)\n", minReadLength); fprintf(staFile, "%7.4f (use overlaps at or below this fraction error)\n", errorRate); //fprintf(staFile, "%7u (use only overlaps longer than this)\n", minAlignLength); // NOT SUPPORTED! fprintf(staFile, "INPUT READS:\n"); fprintf(staFile, "-----------\n"); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (reads processed)\n", readsIn.nReads, readsIn.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (reads not processed, previously deleted)\n", deletedIn.nReads, deletedIn.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (reads not processed, in a library where trimming isn't allowed)\n", noTrimIn.nReads, noTrimIn.nBases); fprintf(staFile, "\n"); fprintf(staFile, "PROCESSED:\n"); fprintf(staFile, "--------\n"); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (no overlaps)\n", noOverlaps.nReads, noOverlaps.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (no coverage after adjusting for trimming done already)\n", noCoverage.nReads, noCoverage.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (processed for chimera)\n", readsProcChimera.nReads, readsProcChimera.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (processed for spur)\n", readsProcSpur.nReads, readsProcSpur.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (processed for subreads)\n", readsProcSubRead.nReads, readsProcSubRead.nBases); fprintf(staFile, "\n"); fprintf(staFile, "READS WITH SIGNALS:\n"); fprintf(staFile, "------------------\n"); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" signals (number of 5' spur signal)\n", readsBadSpur5.nReads, readsBadSpur5.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" signals (number of 3' spur signal)\n", readsBadSpur3.nReads, readsBadSpur3.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" signals (number of chimera signal)\n", readsBadChimera.nReads, readsBadChimera.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" signals (number of subread signal)\n", readsBadSubread.nReads, readsBadSubread.nBases); fprintf(staFile, "\n"); fprintf(staFile, "SIGNALS:\n"); fprintf(staFile, "-------\n"); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (size of 5' spur signal)\n", basesBadSpur5.nReads, basesBadSpur5.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (size of 3' spur signal)\n", basesBadSpur3.nReads, basesBadSpur3.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (size of chimera signal)\n", basesBadChimera.nReads, basesBadChimera.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (size of subread signal)\n", basesBadSubread.nReads, basesBadSubread.nBases); fprintf(staFile, "\n"); fprintf(staFile, "TRIMMING:\n"); fprintf(staFile, "--------\n"); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (trimmed from the 5' end of the read)\n", readsTrimmed5.nReads, readsTrimmed5.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (trimmed from the 3' end of the read)\n", readsTrimmed3.nReads, readsTrimmed3.nBases); #if 0 fprintf(staFile, "DELETED:\n"); fprintf(staFile, "-------\n"); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (deleted because of both cimera and spur signals)\n", bothDeletedSmall.nReads, bothDeletedSmall.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (deleted because of chimera signal)\n", chimeraDeletedSmall.nReads, chimeraDeletedSmall.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (deleted because of spur signal)\n", spurDeletedSmall.nReads, spurDeletedSmall.nBases); fprintf(staFile, "\n"); fprintf(staFile, "SPUR TYPES:\n"); fprintf(staFile, "----------\n"); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (normal spur detected)\n", spurDetectedNormal.nReads, spurDetectedNormal.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (linker spur detected)\n", spurDetectedLinker.nReads, spurDetectedLinker.nBases); fprintf(staFile, "\n"); fprintf(staFile, "CHIMERA TYPES:\n"); fprintf(staFile, "-------------\n"); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (innie-pair chimera detected)\n", chimeraDetectedInnie.nReads, chimeraDetectedInnie.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (overhanging chimera detected)\n", chimeraDetectedOverhang.nReads, chimeraDetectedOverhang.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (gap chimera detected)\n", chimeraDetectedGap.nReads, chimeraDetectedGap.nBases); fprintf(staFile, "%6"F_U32P" reads %12"F_U64P" bases (linker chimera detected)\n", chimeraDetectedLinker.nReads, chimeraDetectedLinker.nBases); #endif // INPUT READS = ACCEPTED + TRIMMED + DELETED // SPUR TYPE = TRIMMED and DELETED spur and both categories // CHIMERA TYPE = TRIMMED and DELETED chimera and both categories if (staFile != stdout) fclose(staFile); exit(0); }
int main (int argc, char **argv) { char tmpName[FILENAME_MAX] = {0}; char *gkpName = NULL; char *tigName = NULL; int32 tigVers = -1; int32 tigPart = -1; int64 ctgBgn = -1; int64 ctgEnd = -1; char *ctgName = NULL; char *outName = NULL; char *inName = NULL; bool forceCompute = false; int32 numFailures = 0; int32 numSkipped = 0; bool useUnitig = false; bool showResult = false; CNS_Options options = { CNS_OPTIONS_SPLIT_ALLELES_DEFAULT, CNS_OPTIONS_MIN_ANCHOR_DEFAULT, CNS_OPTIONS_DO_PHASING_DEFAULT }; // Comminucate to MultiAlignment_CNS.c that we are doing consensus and not cgw. thisIsConsensus = 1; argc = AS_configure(argc, argv); int arg=1; int err=0; while (arg < argc) { if (strcmp(argv[arg], "-g") == 0) { gkpName = argv[++arg]; } else if (strcmp(argv[arg], "-t") == 0) { tigName = argv[++arg]; tigVers = atoi(argv[++arg]); tigPart = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-c") == 0) { AS_UTL_decodeRange(argv[++arg], ctgBgn, ctgEnd); } else if (strcmp(argv[arg], "-T") == 0) { ctgName = argv[++arg]; } else if (strcmp(argv[arg], "-O") == 0) { outName = argv[++arg]; } else if (strcmp(argv[arg], "-I") == 0) { inName = argv[++arg]; } else if (strcmp(argv[arg], "-f") == 0) { forceCompute = true; } else if (strcmp(argv[arg], "-U") == 0) { useUnitig = true; } else if (strcmp(argv[arg], "-v") == 0) { showResult = true; } else if (strcmp(argv[arg], "-V") == 0) { VERBOSE_MULTIALIGN_OUTPUT++; } else if (strcmp(argv[arg], "-w") == 0) { options.smooth_win = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-P") == 0) { options.do_phasing = atoi(argv[++arg]); } else { fprintf(stderr, "%s: Unknown option '%s'\n", argv[0], argv[arg]); err++; } arg++; } if ((err) || (gkpName == NULL) || (tigName == NULL)) { fprintf(stderr, "usage: %s -g gkpStore -t tigStore version partition [opts]\n", argv[0]); fprintf(stderr, " -c b Compute only contig ID 'b' (must be in the correct partition!)\n"); fprintf(stderr, " -c b-e Compute only contigs from ID 'b' to ID 'e'\n"); fprintf(stderr, "\n"); fprintf(stderr, " -T file Test the computation of the contig layout in 'file'\n"); fprintf(stderr, "\n"); fprintf(stderr, " -f Recompute contigs that already have a multialignment\n"); fprintf(stderr, "\n"); fprintf(stderr, " -U Reuse the unitig consensus for contigs with only a single\n"); fprintf(stderr, " unitig (EXPERIMENTAL!)\n"); fprintf(stderr, "\n"); fprintf(stderr, " -O file Don't update tigStore, dump a binary file instead.\n"); fprintf(stderr, " -I file Import binary file into tigStore\n"); fprintf(stderr, "\n"); fprintf(stderr, " -v Show multialigns.\n"); fprintf(stderr, " -V Enable debugging option 'verbosemultialign'.\n"); fprintf(stderr, "\n"); fprintf(stderr, " -w ws Smoothing window size\n"); fprintf(stderr, "\n"); exit(1); } // Open both stores for read only. gkpStore = new gkStore(gkpName, false, false); tigStore = new MultiAlignStore(tigName, tigVers, 0, tigPart, false, false, false); gkpStore->gkStore_loadPartition(tigPart); // Decide on what to compute. Either all contigs, or a single contig, or a special case test. uint32 b = 0; uint32 e = tigStore->numContigs(); if (ctgBgn != -1) { b = ctgBgn; e = ctgEnd + 1; } FORCE_UNITIG_ABUT = 1; if (ctgName != NULL) { errno = 0; FILE *F = fopen(ctgName, "r"); if (errno) fprintf(stderr, "Failed to open input contig file '%s': %s\n", ctgName, strerror(errno)), exit(1); MultiAlignT *ma = CreateEmptyMultiAlignT(); bool isUnitig = false; while (LoadMultiAlignFromHuman(ma, isUnitig, F) == true) { if (ma->maID < 0) ma->maID = (isUnitig) ? tigStore->numUnitigs() : tigStore->numContigs(); if (MultiAlignContig(ma, gkpStore, &options)) { if (showResult) PrintMultiAlignT(stdout, ma, gkpStore, false, false, AS_READ_CLEAR_LATEST); } else { fprintf(stderr, "MultiAlignContig()-- contig %d failed.\n", ma->maID); numFailures++; } } DeleteMultiAlignT(ma); b = e = 0; } // Reopen for writing, if we have work to do. if (((inName) || (b < e)) && (outName == NULL)) { delete tigStore; tigStore = new MultiAlignStore(tigName, tigVers, 0, tigPart, true, false, true); } if (inName) { importFromFile(inName, tigPart); b = e = 0; } // Now the usual case. Iterate over all contigs, compute and update. for (uint32 i=b; i<e; i++) { MultiAlignT *cma = tigStore->loadMultiAlign(i, false); if (cma == NULL) { // Not in our partition, or deleted. continue; } bool exists = (cma->consensus != NULL) && (GetNumchars(cma->consensus) > 1); if ((forceCompute == false) && (exists == true)) { // Already finished contig consensus. fprintf(stderr, "Working on contig %d (%d unitigs and %d fragments) - already computed, skipped\n", cma->maID, cma->data.num_unitigs, cma->data.num_frags); numSkipped++; tigStore->unloadMultiAlign(cma->maID, false); continue; } int32 uID = GetIntUnitigPos(cma->u_list, 0)->ident; // If this is a surrogate, we CANNOT reuse the unitig. We need to process the contig so that // the unplaced reads are stripped out. A surrogate should have different contig and unitig // IDs; we could also check the contig status. if ((cma->data.num_unitigs == 1) && (cma->maID == uID) && (useUnitig == true)) { fprintf(stderr, "Working on contig %d (%d unitigs and %d fragments) - reusing unitig %d consensus\n", cma->maID, cma->data.num_unitigs, cma->data.num_frags, uID); MultiAlignT *uma = tigStore->loadMultiAlign(uID, true); uma->data = cma->data; tigStore->unloadMultiAlign(cma->maID, false); if (outName) writeToOutFile(outName, tigPart, uma); else tigStore->insertMultiAlign(uma, false, false); tigStore->unloadMultiAlign(uma->maID, true); continue; } fprintf(stderr, "Working on contig %d (%d unitigs and %d fragments)%s\n", cma->maID, cma->data.num_unitigs, cma->data.num_frags, (exists) ? " - already computed, recomputing" : ""); if (MultiAlignContig(cma, gkpStore, &options)) { if (outName) writeToOutFile(outName, tigPart, cma); else tigStore->insertMultiAlign(cma, false, true); if (showResult) PrintMultiAlignT(stdout, cma, gkpStore, false, false, AS_READ_CLEAR_LATEST); tigStore->unloadMultiAlign(cma->maID, false); } else { fprintf(stderr, "MultiAlignContig()-- contig %d failed.\n", cma->maID); numFailures++; } } delete tigStore; fprintf(stderr, "\n"); fprintf(stderr, "NumColumnsInUnitigs = %d\n", NumColumnsInUnitigs); fprintf(stderr, "NumGapsInUnitigs = %d\n", NumGapsInUnitigs); fprintf(stderr, "NumRunsOfGapsInUnitigReads = %d\n", NumRunsOfGapsInUnitigReads); fprintf(stderr, "NumColumnsInContigs = %d\n", NumColumnsInContigs); fprintf(stderr, "NumGapsInContigs = %d\n", NumGapsInContigs); fprintf(stderr, "NumRunsOfGapsInContigReads = %d\n", NumRunsOfGapsInContigReads); fprintf(stderr, "NumAAMismatches = %d\n", NumAAMismatches); fprintf(stderr, "NumVARRecords = %d\n", NumVARRecords); fprintf(stderr, "NumVARStringsWithFlankingGaps = %d\n", NumVARStringsWithFlankingGaps); fprintf(stderr, "NumUnitigRetrySuccess = %d\n", NumUnitigRetrySuccess); fprintf(stderr, "\n"); if (numFailures) { fprintf(stderr, "WARNING: Total number of contig failures = %d\n", numFailures); fprintf(stderr, "\n"); fprintf(stderr, "Consensus did NOT finish successfully.\n"); return(1); } fprintf(stderr, "Consensus finished successfully. Bye.\n"); return(0); }
int main(int argc, char **argv) { char *gkpStoreName = NULL; char *outPrefix = NULL; char *outSuffix = NULL; char *clrName = NULL; uint32 libToDump = 0; uint32 bgnID = 1; uint32 endID = UINT32_MAX; bool dumpAllReads = false; bool dumpAllBases = false; bool dumpOnlyDeleted = false; bool dumpFASTQ = true; bool dumpFASTA = false; bool withLibName = true; argc = AS_configure(argc, argv); int arg = 1; int err = 0; while (arg < argc) { if (strcmp(argv[arg], "-G") == 0) { gkpStoreName = argv[++arg]; } else if (strcmp(argv[arg], "-o") == 0) { outPrefix = argv[++arg]; outSuffix = scanPrefix(outPrefix); } else if (strcmp(argv[arg], "-c") == 0) { clrName = argv[++arg]; } else if (strcmp(argv[arg], "-l") == 0) { libToDump = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-b") == 0) { // DEPRECATED! bgnID = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-e") == 0) { // DEPRECATED! endID = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-r") == 0) { AS_UTL_decodeRange(argv[++arg], bgnID, endID); } else if (strcmp(argv[arg], "-allreads") == 0) { dumpAllReads = true; } else if (strcmp(argv[arg], "-allbases") == 0) { dumpAllBases = true; } else if (strcmp(argv[arg], "-onlydeleted") == 0) { dumpOnlyDeleted = true; dumpAllReads = true; // Otherwise we won't report the deleted reads! } else if (strcmp(argv[arg], "-fastq") == 0) { dumpFASTQ = true; dumpFASTA = false; } else if (strcmp(argv[arg], "-fasta") == 0) { dumpFASTQ = false; dumpFASTA = true; } else if (strcmp(argv[arg], "-nolibname") == 0) { withLibName = false; } else { err++; fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]); } arg++; } if (gkpStoreName == NULL) err++; if (outPrefix == NULL) err++; if (err) { fprintf(stderr, "usage: %s [...] -o fastq-prefix -g gkpStore\n", argv[0]); fprintf(stderr, " -G gkpStore\n"); fprintf(stderr, " -o fastq-prefix write files fastq-prefix.(libname).fastq, ...\n"); fprintf(stderr, " if fastq-prefix is '-', all sequences output to stdout\n"); fprintf(stderr, " if fastq-prefix ends in .gz, .bz2 or .xz, output is compressed\n"); fprintf(stderr, "\n"); fprintf(stderr, " -l libToDump output only read in library number libToDump (NOT IMPLEMENTED)\n"); fprintf(stderr, " -r id[-id] output only the single read 'id', or the specified range of ids\n"); fprintf(stderr, "\n"); fprintf(stderr, " -c clearFile clear range file from OBT modules\n"); fprintf(stderr, " -allreads if a clear range file, lower case mask the deleted reads\n"); fprintf(stderr, " -allbases if a clear range file, lower case mask the non-clear bases\n"); fprintf(stderr, " -onlydeleted if a clear range file, only output deleted reads (the entire read)\n"); fprintf(stderr, "\n"); fprintf(stderr, " -fastq output is FASTQ format (with extension .fastq, default)\n"); fprintf(stderr, " -fasta output is FASTA format (with extension .fasta)\n"); fprintf(stderr, "\n"); fprintf(stderr, " -nolibname don't include the library name in the output file name\n"); fprintf(stderr, "\n"); if (gkpStoreName == NULL) fprintf(stderr, "ERROR: no gkpStore (-G) supplied.\n"); if (outPrefix == NULL) fprintf(stderr, "ERROR: no output prefix (-o) supplied.\n"); exit(1); } gkStore *gkpStore = gkStore::gkStore_open(gkpStoreName); uint32 numReads = gkpStore->gkStore_getNumReads(); uint32 numLibs = gkpStore->gkStore_getNumLibraries(); clearRangeFile *clrRange = (clrName == NULL) ? NULL : new clearRangeFile(clrName, gkpStore); if (bgnID < 1) bgnID = 1; if (numReads < endID) endID = numReads; if (endID < bgnID) fprintf(stderr, "No reads to dump; reversed ranges make no sense: bgn="F_U32" end="F_U32"??\n", bgnID, endID); fprintf(stderr, "Dumping reads from %u to %u (inclusive).\n", bgnID, endID); libOutput **out = new libOutput * [numLibs + 1]; // Allocate outputs. If withLibName == false, all reads will artificially be in lib zero, the // other files won't ever be created. Otherwise, the zeroth file won't ever be created. out[0] = new libOutput(outPrefix, outSuffix, NULL); for (uint32 i=1; i<=numLibs; i++) out[i] = new libOutput(outPrefix, outSuffix, gkpStore->gkStore_getLibrary(i)->gkLibrary_libraryName()); // Grab a new readData, and iterate through reads to dump. gkReadData *readData = new gkReadData; for (uint32 rid=bgnID; rid<=endID; rid++) { gkRead *read = gkpStore->gkStore_getRead(rid); uint32 libID = (withLibName == false) ? 0 : read->gkRead_libraryID(); uint32 flen = read->gkRead_sequenceLength(); uint32 lclr = 0; uint32 rclr = flen; bool ignore = false; //fprintf(stderr, "READ %u claims id %u length %u in lib %u\n", rid, read->gkRead_readID(), read->gkRead_sequenceLength(), libID); // If a clear range file is supplied, grab the clear range. If it hasn't been set, the default // is the entire read. if (clrRange) { lclr = clrRange->bgn(rid); rclr = clrRange->end(rid); ignore = clrRange->isDeleted(rid); } // Abort if we're not dumping anything from this read // - not in a library we care about // - deleted, and not dumping all reads // - not deleted, but only reporting deleted reads if (((libToDump != 0) && (libID == libToDump)) || ((dumpAllReads == false) && (ignore == true)) || ((dumpOnlyDeleted == true) && (ignore == false))) continue; // And if we're told to ignore the read, and here, then the read was deleted and we're printing // all reads. Reset the clear range to the whole read, the clear range is invalid. if (ignore) { lclr = 0; rclr = read->gkRead_sequenceLength(); } // Grab the sequence and quality. gkpStore->gkStore_loadReadData(read, readData); char *seq = readData->gkReadData_getSequence(); char *qlt = readData->gkReadData_getQualities(); uint32 clen = rclr - lclr; // Soft mask not-clear bases if (dumpAllBases == true) { for (uint32 i=0; i<lclr; i++) seq[i] += (seq[i] >= 'A') ? 'a' - 'A' : 0; for (uint32 i=lclr; i<rclr; i++) seq[i] += (seq[i] >= 'A') ? 0 : 'A' - 'a'; for (uint32 i=rclr; flen; i++) seq[i] += (seq[i] >= 'A') ? 'a' - 'A' : 0; lclr = 0; rclr = flen; } // Chop off the ends we're not printing. seq += lclr; qlt += lclr; seq[clen] = 0; qlt[clen] = 0; // Print the read. if (dumpFASTA) AS_UTL_writeFastA(out[libID]->getFASTA(), seq, clen, 100, ">"F_U32" clr="F_U32","F_U32"\n", rid, lclr, rclr); if (dumpFASTQ) AS_UTL_writeFastQ(out[libID]->getFASTQ(), seq, clen, qlt, clen, "@"F_U32" clr="F_U32","F_U32"\n", rid, lclr, rclr); } delete clrRange; delete readData; for (uint32 i=0; i<=numLibs; i++) delete out[i]; delete [] out; gkpStore->gkStore_close(); exit(0); }
int main (int argc, char **argv) { char *gkpName = NULL; char *tigName = NULL; uint32 tigVers = UINT32_MAX; uint32 tigPart = UINT32_MAX; char *tigFileName = NULL; uint32 utgBgn = UINT32_MAX; uint32 utgEnd = UINT32_MAX; char *outResultsName = NULL; char *outLayoutsName = NULL; char *outSeqNameA = NULL; char *outSeqNameQ = NULL; char *outPackageName = NULL; FILE *outResultsFile = NULL; FILE *outLayoutsFile = NULL; FILE *outSeqFileA = NULL; FILE *outSeqFileQ = NULL; FILE *outPackageFile = NULL; char *inPackageName = NULL; char algorithm = 'P'; uint32 numThreads = 0; bool forceCompute = false; double errorRate = 0.12; double errorRateMax = 0.40; uint32 minOverlap = 40; int32 numFailures = 0; bool showResult = false; double maxCov = 0.0; uint32 maxLen = UINT32_MAX; uint32 verbosity = 0; argc = AS_configure(argc, argv); int arg=1; int err=0; while (arg < argc) { if (strcmp(argv[arg], "-G") == 0) { gkpName = argv[++arg]; } else if (strcmp(argv[arg], "-T") == 0) { tigName = argv[++arg]; tigVers = atoi(argv[++arg]); tigPart = atoi(argv[++arg]); if (argv[arg][0] == '.') tigPart = UINT32_MAX; if (tigVers == 0) fprintf(stderr, "invalid tigStore version (-T store version partition) '-t %s %s %s'.\n", argv[arg-2], argv[arg-1], argv[arg]), exit(1); if (tigPart == 0) fprintf(stderr, "invalid tigStore partition (-T store version partition) '-t %s %s %s'.\n", argv[arg-2], argv[arg-1], argv[arg]), exit(1); } else if (strcmp(argv[arg], "-u") == 0) { AS_UTL_decodeRange(argv[++arg], utgBgn, utgEnd); } else if (strcmp(argv[arg], "-t") == 0) { tigFileName = argv[++arg]; } else if (strcmp(argv[arg], "-O") == 0) { outResultsName = argv[++arg]; } else if (strcmp(argv[arg], "-L") == 0) { outLayoutsName = argv[++arg]; } else if (strcmp(argv[arg], "-A") == 0) { outSeqNameA = argv[++arg]; } else if (strcmp(argv[arg], "-Q") == 0) { outSeqNameQ = argv[++arg]; } else if (strcmp(argv[arg], "-quick") == 0) { algorithm = 'Q'; } else if (strcmp(argv[arg], "-pbdagcon") == 0) { algorithm = 'P'; } else if (strcmp(argv[arg], "-utgcns") == 0) { algorithm = 'U'; } else if (strcmp(argv[arg], "-threads") == 0) { numThreads = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-p") == 0) { inPackageName = argv[++arg]; } else if (strcmp(argv[arg], "-P") == 0) { outPackageName = argv[++arg]; } else if (strcmp(argv[arg], "-e") == 0) { errorRate = atof(argv[++arg]); } else if (strcmp(argv[arg], "-em") == 0) { errorRateMax = atof(argv[++arg]); } else if (strcmp(argv[arg], "-l") == 0) { minOverlap = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-f") == 0) { forceCompute = true; } else if (strcmp(argv[arg], "-v") == 0) { showResult = true; } else if (strcmp(argv[arg], "-V") == 0) { verbosity++; } else if (strcmp(argv[arg], "-maxcoverage") == 0) { maxCov = atof(argv[++arg]); } else if (strcmp(argv[arg], "-maxlength") == 0) { maxLen = atof(argv[++arg]); } else { fprintf(stderr, "%s: Unknown option '%s'\n", argv[0], argv[arg]); err++; } arg++; } if ((gkpName == NULL) && (inPackageName == NULL)) err++; if ((tigFileName == NULL) && (tigName == NULL) && (inPackageName == NULL)) err++; if (err) { fprintf(stderr, "usage: %s [opts]\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, " INPUT\n"); fprintf(stderr, " -G g Load reads from gkStore 'g'\n"); fprintf(stderr, " -T t v p Load unitigs from tgStore 't', version 'v', partition 'p'.\n"); fprintf(stderr, " Expects reads will be in gkStore partition 'p' as well\n"); fprintf(stderr, " Use p='.' to specify no partition\n"); fprintf(stderr, " -t file Test the computation of the unitig layout in 'file'\n"); fprintf(stderr, " 'file' can be from:\n"); fprintf(stderr, " 'tgStoreDump -d layout' (human readable layout format)\n"); fprintf(stderr, " 'utgcns -L' (human readable layout format)\n"); fprintf(stderr, " 'utgcns -O' (binary multialignment format)\n"); fprintf(stderr, "\n"); fprintf(stderr, " -p package Load unitig and read from 'package' created with -P. This\n"); fprintf(stderr, " is usually used by developers.\n"); fprintf(stderr, "\n"); fprintf(stderr, "\n"); fprintf(stderr, " ALGORITHM\n"); fprintf(stderr, " -quick No alignments, just paste read sequence into the unitig positions.\n"); fprintf(stderr, " This is very fast, but the consensus sequence is formed from a mosaic\n"); fprintf(stderr, " of read sequences, and there can be large indel. This is useful for\n"); fprintf(stderr, " checking intermediate assembly structure by mapping to reference, or\n"); fprintf(stderr, " possibly for use as input to a polishing step.\n"); fprintf(stderr, " -pbdagcon Use pbdagcon (https://github.com/PacificBiosciences/pbdagcon).\n"); fprintf(stderr, " This is fast and robust. It is the default algorithm. It does not\n"); fprintf(stderr, " generate a final multialignment output (the -v option will not show\n"); fprintf(stderr, " anything useful).\n"); fprintf(stderr, " -utgcns Use utgcns (the original Celera Assembler consensus algorithm)\n"); fprintf(stderr, " This isn't as fast, isn't as robust, but does generate a final multialign\n"); fprintf(stderr, " output.\n"); fprintf(stderr, "\n"); fprintf(stderr, "\n"); fprintf(stderr, " OUTPUT\n"); fprintf(stderr, " -O results Write computed tigs to binary output file 'results'\n"); fprintf(stderr, " -L layouts Write computed tigs to layout output file 'layouts'\n"); fprintf(stderr, " -A fasta Write computed tigs to fasta output file 'fasta'\n"); fprintf(stderr, " -Q fastq Write computed tigs to fastq output file 'fastq'\n"); fprintf(stderr, "\n"); fprintf(stderr, " -P package Create a copy of the inputs needed to compute the unitigs. This\n"); fprintf(stderr, " file can then be sent to the developers for debugging. The unitig(s)\n"); fprintf(stderr, " are not processed and no other outputs are created. Ideally,\n"); fprintf(stderr, " only one unitig is selected (-u, below).\n"); fprintf(stderr, "\n"); fprintf(stderr, " TIG SELECTION (if -T input is used)\n"); fprintf(stderr, " -u b Compute only unitig ID 'b' (must be in the correct partition!)\n"); fprintf(stderr, " -u b-e Compute only unitigs from ID 'b' to ID 'e'\n"); fprintf(stderr, " -f Recompute unitigs that already have a multialignment\n"); fprintf(stderr, " -maxlength l Do not compute consensus for unitigs longer than l bases.\n"); fprintf(stderr, "\n"); fprintf(stderr, " PARAMETERS\n"); fprintf(stderr, " -e e Expect alignments at up to fraction e error\n"); fprintf(stderr, " -em m Don't ever allow alignments more than fraction m error\n"); fprintf(stderr, " -l l Expect alignments of at least l bases\n"); fprintf(stderr, " -maxcoverage c Use non-contained reads and the longest contained reads, up to\n"); fprintf(stderr, " C coverage, for consensus generation. The default is 0, and will\n"); fprintf(stderr, " use all reads.\n"); fprintf(stderr, "\n"); fprintf(stderr, " LOGGING\n"); fprintf(stderr, " -v Show multialigns.\n"); fprintf(stderr, " -V Enable debugging option 'verbosemultialign'.\n"); fprintf(stderr, "\n"); if ((gkpName == NULL) && (inPackageName == NULL)) fprintf(stderr, "ERROR: No gkpStore (-G) and no package (-p) supplied.\n"); if ((tigFileName == NULL) && (tigName == NULL) && (inPackageName == NULL)) fprintf(stderr, "ERROR: No tigStore (-T) OR no test unitig (-t) OR no package (-p) supplied.\n"); exit(1); } errno = 0; // Open output files. If we're creating a package, the usual output files are not opened. if (outPackageName) outPackageFile = fopen(outPackageName, "w"); if (errno) fprintf(stderr, "Failed to open output package file '%s': %s\n", outPackageName, strerror(errno)), exit(1); if ((outResultsName) && (outPackageName == NULL)) outResultsFile = fopen(outResultsName, "w"); if (errno) fprintf(stderr, "Failed to open output results file '%s': %s\n", outResultsName, strerror(errno)), exit(1); if ((outLayoutsName) && (outPackageName == NULL)) outLayoutsFile = fopen(outLayoutsName, "w"); if (errno) fprintf(stderr, "Failed to open output layout file '%s': %s\n", outLayoutsName, strerror(errno)), exit(1); if ((outSeqNameA) && (outPackageName == NULL)) outSeqFileA = fopen(outSeqNameA, "w"); if (errno) fprintf(stderr, "Failed to open output FASTA file '%s': %s\n", outSeqNameA, strerror(errno)), exit(1); if ((outSeqNameQ) && (outPackageName == NULL)) outSeqFileQ = fopen(outSeqNameQ, "w"); if (errno) fprintf(stderr, "Failed to open output FASTQ file '%s': %s\n", outSeqNameQ, strerror(errno)), exit(1); if (numThreads > 0) { omp_set_num_threads(numThreads); fprintf(stderr, "number of threads = %d (command line)\n", numThreads); fprintf(stderr, "\n"); } else { fprintf(stderr, "number of threads = %d (OpenMP default)\n", omp_get_max_threads()); fprintf(stderr, "\n"); } // Open gatekeeper for read only, and load the partitioned data if tigPart > 0. gkStore *gkpStore = NULL; tgStore *tigStore = NULL; FILE *tigFile = NULL; FILE *inPackageFile = NULL; map<uint32, gkRead *> *inPackageRead = NULL; map<uint32, gkReadData *> *inPackageReadData = NULL; if (gkpName) { fprintf(stderr, "-- Opening gkpStore '%s' partition %u.\n", gkpName, tigPart); gkpStore = gkStore::gkStore_open(gkpName, gkStore_readOnly, tigPart); } if (tigName) { fprintf(stderr, "-- Opening tigStore '%s' version %u.\n", tigName, tigVers); tigStore = new tgStore(tigName, tigVers); } if (tigFileName) { fprintf(stderr, "-- Opening tigFile '%s'.\n", tigFileName); errno = 0; tigFile = fopen(tigFileName, "r"); if (errno) fprintf(stderr, "Failed to open input tig file '%s': %s\n", tigFileName, strerror(errno)), exit(1); } if (inPackageName) { fprintf(stderr, "-- Opening package file '%s'.\n", inPackageName); errno = 0; inPackageFile = fopen(inPackageName, "r"); if (errno) fprintf(stderr, "Failed to open input package file '%s': %s\n", inPackageName, strerror(errno)), exit(1); } // Report some sizes. fprintf(stderr, "sizeof(abBead) "F_SIZE_T"\n", sizeof(abBead)); fprintf(stderr, "sizeof(abColumn) "F_SIZE_T"\n", sizeof(abColumn)); fprintf(stderr, "sizeof(abAbacus) "F_SIZE_T"\n", sizeof(abAbacus)); fprintf(stderr, "sizeof(abSequence) "F_SIZE_T"\n", sizeof(abSequence)); // Decide on what to compute. Either all unitigs, or a single unitig, or a special case test. uint32 b = 0; uint32 e = UINT32_MAX; if (tigStore) { if (utgEnd > tigStore->numTigs() - 1) utgEnd = tigStore->numTigs() - 1; if (utgBgn != UINT32_MAX) { b = utgBgn; e = utgEnd; } else { b = 0; e = utgEnd; } fprintf(stderr, "-- Computing unitig consensus for b="F_U32" to e="F_U32" with errorRate %0.4f (max %0.4f) and minimum overlap "F_U32"\n", b, e, errorRate, errorRateMax, minOverlap); } else { fprintf(stderr, "-- Computing unitig consensus with errorRate %0.4f (max %0.4f) and minimum overlap "F_U32"\n", errorRate, errorRateMax, minOverlap); } fprintf(stderr, "\n"); // I don't like this loop control. for (uint32 ti=b; (e == UINT32_MAX) || (ti <= e); ti++) { tgTig *tig = NULL; // If a tigStore, load the tig. The tig is the owner; it cannot be deleted by us. if (tigStore) tig = tigStore->loadTig(ti); // If a tigFile or a package, create a new tig and fill it. Obviously, we own it. if (tigFile || inPackageFile) { tig = new tgTig(); if (tig->loadFromStreamOrLayout((tigFile != NULL) ? tigFile : inPackageFile) == false) { delete tig; break; } } // No tig loaded, keep going. if (tig == NULL) continue; // If a package, populate the read and readData maps with data from the package. if (inPackageFile) { inPackageRead = new map<uint32, gkRead *>; inPackageReadData = new map<uint32, gkReadData *>; for (int32 ii=0; ii<tig->numberOfChildren(); ii++) { uint32 readID = tig->getChild(ii)->ident(); gkRead *read = (*inPackageRead)[readID] = new gkRead; gkReadData *data = (*inPackageReadData)[readID] = new gkReadData; gkStore::gkStore_loadReadFromStream(inPackageFile, read, data); if (read->gkRead_readID() != readID) fprintf(stderr, "ERROR: package not in sync with tig. package readID = %u tig readID = %u\n", read->gkRead_readID(), readID); assert(read->gkRead_readID() == readID); } } // More 'not liking' - set the verbosity level for logging. tig->_utgcns_verboseLevel = verbosity; // Are we parittioned? Is this tig in our partition? if (tigPart != UINT32_MAX) { uint32 missingReads = 0; for (uint32 ii=0; ii<tig->numberOfChildren(); ii++) if (gkpStore->gkStore_getReadInPartition(tig->getChild(ii)->ident()) == NULL) missingReads++; if (missingReads) { //fprintf(stderr, "SKIP unitig %u with %u reads found only %u reads in partition, skipped\n", // tig->tigID(), tig->numberOfChildren(), tig->numberOfChildren() - missingReads); continue; } } if (tig->length(true) > maxLen) { fprintf(stderr, "SKIP unitig %d of length %d (%d children) - too long, skipped\n", tig->tigID(), tig->length(true), tig->numberOfChildren()); continue; } if (tig->numberOfChildren() == 0) { fprintf(stderr, "SKIP unitig %d of length %d (%d children) - no children, skipped\n", tig->tigID(), tig->length(true), tig->numberOfChildren()); continue; } bool exists = tig->consensusExists(); if (tig->numberOfChildren() > 1) fprintf(stderr, "Working on unitig %d of length %d (%d children)%s%s\n", tig->tigID(), tig->length(true), tig->numberOfChildren(), ((exists == true) && (forceCompute == false)) ? " - already computed" : "", ((exists == true) && (forceCompute == true)) ? " - already computed, recomputing" : ""); // Process the tig. Remove deep coverage, create a consensus object, process it, and report the results. // before we add it to the store. unitigConsensus *utgcns = new unitigConsensus(gkpStore, errorRate, errorRateMax, minOverlap); savedChildren *origChildren = NULL; bool success = exists; // Save the tig in the package? // // The original idea was to dump the tig and all the reads, then load the tig and process as normal. // Sadly, stashContains() rearranges the order of the reads even if it doesn't remove any. The rearranged // tig couldn't be saved (otherwise it would be rearranged again). So, we were in the position of // needing to save the original tig and the rearranged reads. Impossible. // // Instead, we save the origianl tig and original reads -- including any that get stashed -- then // load them all back into a map for use in consensus proper. It's a bit of a pain, and could // have way more reads saved than necessary. if (outPackageFile) { utgcns->savePackage(outPackageFile, tig); fprintf(stderr, " Packaged unitig %u into '%s'\n", tig->tigID(), outPackageName); } // Compute consensus if it doesn't exist, or if we're forcing a recompute. But only if we // didn't just package it. if ((outPackageFile == NULL) && ((exists == false) || (forceCompute == true))) { origChildren = stashContains(tig, maxCov, true); switch (algorithm) { case 'Q': success = utgcns->generateQuick(tig, inPackageRead, inPackageReadData); break; case 'P': default: success = utgcns->generatePBDAG(tig, inPackageRead, inPackageReadData); break; case 'U': success = utgcns->generate(tig, inPackageRead, inPackageReadData); break; } } // If it was successful (or existed already), output. Success is always false if the unitig // was packaged, regardless of if it existed already. if (success == true) { if ((showResult) && (gkpStore)) // No gkpStore if we're from a package. Dang. tig->display(stdout, gkpStore, 200, 3); unstashContains(tig, origChildren); if (outResultsFile) tig->saveToStream(outResultsFile); if (outLayoutsFile) tig->dumpLayout(outLayoutsFile); if (outSeqFileA) tig->dumpFASTA(outSeqFileA, true); if (outSeqFileQ) tig->dumpFASTQ(outSeqFileQ, true); } // Report failures. if ((success == false) && (outPackageFile == NULL)) { fprintf(stderr, "unitigConsensus()-- unitig %d failed.\n", tig->tigID()); numFailures++; } // Clean up, unloading or deleting the tig. delete utgcns; // No real reason to keep this until here. delete origChildren; // Need to keep it until after we display() above. if (tigStore) tigStore->unloadTig(tig->tigID(), true); // Tell the store we're done with it if (tigFile) delete tig; } finish: delete tigStore; gkpStore->gkStore_close(); if (tigFile) fclose(tigFile); if (outResultsFile) fclose(outResultsFile); if (outLayoutsFile) fclose(outLayoutsFile); if (outPackageFile) fclose(outPackageFile); if (inPackageFile) fclose(inPackageFile); if (numFailures) { fprintf(stderr, "WARNING: Total number of unitig failures = %d\n", numFailures); fprintf(stderr, "\n"); fprintf(stderr, "Consensus did NOT finish successfully.\n"); } else { fprintf(stderr, "Consensus finished successfully. Bye.\n"); } return(numFailures != 0); }
int main(int argc, char **argv) { char bolfile_name[FILENAME_MAX] = {0}; char Outfile_Name[FILENAME_MAX] = {0}; int illegal; char * p; argc = AS_configure(argc, argv); Min_Olap_Len = AS_OVERLAP_MIN_LEN; // set after configure int err=0; int arg=1; while (arg < argc) { if (strcmp(argv[arg], "-G") == 0) { Doing_Partial_Overlaps = TRUE; } else if (strcmp(argv[arg], "-h") == 0) { AS_UTL_decodeRange(argv[++arg], Lo_Hash_Frag, Hi_Hash_Frag); } else if (strcmp(argv[arg], "-H") == 0) { AS_UTL_decodeRange(argv[++arg], minLibToHash, maxLibToHash); } else if (strcmp(argv[arg], "-R") == 0) { AS_UTL_decodeRange(argv[++arg], minLibToRef, maxLibToRef); } else if (strcmp(argv[arg], "-k") == 0) { arg++; if ((isdigit(argv[arg][0]) && (argv[arg][1] == 0)) || (isdigit(argv[arg][0]) && isdigit(argv[arg][1]) && (argv[arg][2] == 0))) { Kmer_Len = strtoull(argv[arg], NULL, 10); } else { errno = 0; Kmer_Skip_File = fopen(argv[arg], "r"); if (errno) fprintf(stderr, "ERROR: Failed to open -k '%s': %s\n", argv[arg], strerror(errno)), exit(1); } } else if (strcmp(argv[arg], "-l") == 0) { Frag_Olap_Limit = strtol(argv[++arg], NULL, 10); if (Frag_Olap_Limit < 1) Frag_Olap_Limit = INT_MAX; } else if (strcmp(argv[arg], "-m") == 0) { Unique_Olap_Per_Pair = FALSE; } else if (strcmp(argv[arg], "--hashbits") == 0) { Hash_Mask_Bits = strtoull(argv[++arg], NULL, 10); } else if (strcmp(argv[arg], "--hashstrings") == 0) { Max_Hash_Strings = strtoull(argv[++arg], NULL, 10); } else if (strcmp(argv[arg], "--hashdatalen") == 0) { Max_Hash_Data_Len = strtoull(argv[++arg], NULL, 10); } else if (strcmp(argv[arg], "--hashload") == 0) { Max_Hash_Load = atof(argv[++arg]); } else if (strcmp(argv[arg], "--maxreadlen") == 0) { // Quite the gross way to do this, but simple. uint32 desired = strtoul(argv[++arg], NULL, 10); OFFSET_BITS = 1; while (((uint32)1 << OFFSET_BITS) < desired) OFFSET_BITS++; STRING_NUM_BITS = 30 - OFFSET_BITS; STRING_NUM_MASK = (1 << STRING_NUM_BITS) - 1; OFFSET_MASK = (1 << OFFSET_BITS) - 1; MAX_STRING_NUM = STRING_NUM_MASK; } else if (strcmp(argv[arg], "--readsperbatch") == 0) { Max_Reads_Per_Batch = strtoul(argv[++arg], NULL, 10); } else if (strcmp(argv[arg], "--readsperthread") == 0) { Max_Reads_Per_Thread = strtoul(argv[++arg], NULL, 10); } else if (strcmp(argv[arg], "-o") == 0) { strcpy(Outfile_Name, argv[++arg]); } else if (strcmp(argv[arg], "-r") == 0) { AS_UTL_decodeRange(argv[++arg], Lo_Old_Frag, Hi_Old_Frag); } else if (strcmp(argv[arg], "-t") == 0) { Num_PThreads = strtoull(argv[++arg], NULL, 10); } else if (strcmp(argv[arg], "-u") == 0) { Unique_Olap_Per_Pair = TRUE; } else if (strcmp(argv[arg], "-v") == 0) { Min_Olap_Len = (int) strtol (argv[++arg], & p, 10); } else if (strcmp(argv[arg], "-w") == 0) { Use_Window_Filter = TRUE; } else if (strcmp(argv[arg], "-x") == 0) { Ignore_Clear_Range = TRUE; } else if (strcmp(argv[arg], "-z") == 0) { Use_Hopeless_Check = FALSE; } else { if (Frag_Store_Path == NULL) { Frag_Store_Path = argv[arg]; } else { fprintf(stderr, "Unknown option '%s'\n", argv[arg]); err++; } } arg++; } // Fix up some flags if we're allowing high error rates. // if (AS_OVL_ERROR_RATE > 0.06) { if (Use_Window_Filter) fprintf(stderr, "High error rates requested -- window-filter turned off despite -w flag!\n"); Use_Window_Filter = FALSE; Use_Hopeless_Check = FALSE; } if (Max_Hash_Strings == 0) fprintf(stderr, "* No memory model supplied; -M needed!\n"), err++; if (Kmer_Len == 0) fprintf(stderr, "* No kmer length supplied; -k needed!\n"), err++; if (Max_Hash_Strings > MAX_STRING_NUM) fprintf(stderr, "Too many strings (--hashstrings), must be less than "F_U64"\n", MAX_STRING_NUM), err++; if (Outfile_Name[0] == 0) fprintf (stderr, "ERROR: No output file name specified\n"), err++; if ((err) || (Frag_Store_Path == NULL)) { fprintf(stderr, "USAGE: %s [options] <gkpStorePath>\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, "-b <fn> in contig mode, specify the output file\n"); fprintf(stderr, "-c contig mode. Use 2 frag stores. First is\n"); fprintf(stderr, " for reads; second is for contigs\n"); fprintf(stderr, "-G do partial overlaps\n"); fprintf(stderr, "-h <range> to specify fragments to put in hash table\n"); fprintf(stderr, " Implies LSF mode (no changes to frag store)\n"); fprintf(stderr, "-I designate a file of frag iids to limit olaps to\n"); fprintf(stderr, " (Contig mode only)\n"); fprintf(stderr, "-k if one or two digits, the length of a kmer, otherwise\n"); fprintf(stderr, " the filename containing a list of kmers to ignore in\n"); fprintf(stderr, " the hash table\n"); fprintf(stderr, "-l specify the maximum number of overlaps per\n"); fprintf(stderr, " fragment-end per batch of fragments.\n"); fprintf(stderr, "-m allow multiple overlaps per oriented fragment pair\n"); fprintf(stderr, "-M specify memory size. Valid values are '8GB', '4GB',\n"); fprintf(stderr, " '2GB', '1GB', '256MB'. (Not for Contig mode)\n"); fprintf(stderr, "-o specify output file name\n"); fprintf(stderr, "-P write protoIO output (if not -G)\n"); fprintf(stderr, "-r <range> specify old fragments to overlap\n"); fprintf(stderr, "-s ignore screen information with fragments\n"); fprintf(stderr, "-t <n> use <n> parallel threads\n"); fprintf(stderr, "-u allow only 1 overlap per oriented fragment pair\n"); fprintf(stderr, "-v <n> only output overlaps of <n> or more bases\n"); fprintf(stderr, "-w filter out overlaps with too many errors in a window\n"); fprintf(stderr, "-x ignore the clear ranges on reads and use the \n"); fprintf(stderr, " full sequence\n"); fprintf(stderr, "-z skip the hopeless check\n"); fprintf(stderr, "\n"); fprintf(stderr, "--hashbits n Use n bits for the hash mask.\n"); fprintf(stderr, "--hashstrings n Load at most n strings into the hash table at one time.\n"); fprintf(stderr, "--hashdatalen n Load at most n bytes into the hash table at one time.\n"); fprintf(stderr, "--hashload f Load to at most 0.0 < f < 1.0 capacity (default 0.7).\n"); fprintf(stderr, "\n"); fprintf(stderr, "--maxreadlen n For batches with all short reads, pack bits differently to\n"); fprintf(stderr, " process more reads per batch.\n"); fprintf(stderr, " all reads must be shorter than n\n"); fprintf(stderr, " --hashstrings limited to 2^(30-m)\n"); fprintf(stderr, " Common values:\n"); fprintf(stderr, " maxreadlen 2048 -> hashstrings 524288 (default)\n"); fprintf(stderr, " maxreadlen 512 -> hashstrings 2097152\n"); fprintf(stderr, " maxreadlen 128 -> hashstrings 8388608\n"); fprintf(stderr, "\n"); fprintf(stderr, "--readsperbatch n Force batch size to n.\n"); fprintf(stderr, "--readsperthread n Force each thread to process n reads.\n"); fprintf(stderr, "\n"); exit(1); } assert(NULL == Out_BOF); Out_BOF = AS_OVS_createBinaryOverlapFile(Outfile_Name, FALSE); // Adjust the number of reads to load into memory at once (for processing, not the hash table), if (Max_Reads_Per_Batch == 0) Max_Reads_Per_Batch = (Max_Hash_Strings < 100000) ? Max_Hash_Strings : 100000; //if (Max_Hash_Strings < Max_Reads_Per_Batch) // Max_Reads_Per_Batch = Max_Hash_Strings; // Adjust the number of reads processed per thread. Default to having four blocks per thread, // but make sure that (a) all threads have work to do, and (b) batches are not minuscule. if (Max_Reads_Per_Thread == 0) Max_Reads_Per_Thread = Max_Reads_Per_Batch / (4 * Num_PThreads); if (Max_Reads_Per_Thread * Num_PThreads > Max_Reads_Per_Batch) Max_Reads_Per_Thread = Max_Reads_Per_Batch / Num_PThreads + 1; if (Max_Reads_Per_Thread < 10) Max_Reads_Per_Thread = 10; // We know enough now to set the hash function variables, and some other random variables. HSF1 = Kmer_Len - (Hash_Mask_Bits / 2); HSF2 = 2 * Kmer_Len - Hash_Mask_Bits; SV1 = HSF1 + 2; SV2 = (HSF1 + HSF2) / 2; SV3 = HSF2 - 2; Branch_Match_Value = (Doing_Partial_Overlaps) ? PARTIAL_BRANCH_MATCH_VAL : DEFAULT_BRANCH_MATCH_VAL; Branch_Error_Value = Branch_Match_Value - 1.0; fprintf(stderr, "\n"); fprintf(stderr, "STRING_NUM_BITS "F_U32"\n", STRING_NUM_BITS); fprintf(stderr, "OFFSET_BITS "F_U32"\n", OFFSET_BITS); fprintf(stderr, "STRING_NUM_MASK "F_U64"\n", STRING_NUM_MASK); fprintf(stderr, "OFFSET_MASK "F_U64"\n", OFFSET_MASK); fprintf(stderr, "MAX_STRING_NUM "F_U64"\n", MAX_STRING_NUM); fprintf(stderr, "\n"); fprintf(stderr, "Hash_Mask_Bits "F_U32"\n", Hash_Mask_Bits); fprintf(stderr, "Max_Hash_Strings "F_U32"\n", Max_Hash_Strings); fprintf(stderr, "Max_Hash_Data_Len "F_U64"\n", Max_Hash_Data_Len); fprintf(stderr, "Max_Hash_Load %f\n", Max_Hash_Load); fprintf(stderr, "Kmer Length %d\n", (int)Kmer_Len); fprintf(stderr, "Min Overlap Length %d\n", Min_Olap_Len); fprintf(stderr, "MAX_ERRORS %d\n", MAX_ERRORS); fprintf(stderr, "ERRORS_FOR_FREE %d\n", ERRORS_FOR_FREE); fprintf(stderr, "\n"); fprintf(stderr, "Num_PThreads "F_U32"\n", Num_PThreads); fprintf(stderr, "Max_Reads_Per_Batch "F_U32"\n", Max_Reads_Per_Batch); fprintf(stderr, "Max_Reads_Per_Thread "F_U32"\n", Max_Reads_Per_Thread); assert (8 * sizeof (uint64) > 2 * Kmer_Len); Initialize_Globals (); OldFragStore = new gkStore(Frag_Store_Path, FALSE, FALSE); /****************************************/ OverlapDriver(); /****************************************/ fprintf (stderr, " Kmer hits without olaps = "F_S64"\n", Kmer_Hits_Without_Olap_Ct); fprintf (stderr, " Kmer hits with olaps = "F_S64"\n", Kmer_Hits_With_Olap_Ct); fprintf (stderr, " Multiple overlaps/pair = "F_S64"\n", Multi_Overlap_Ct); fprintf (stderr, " Total overlaps produced = "F_S64"\n", Total_Overlaps); fprintf (stderr, " Contained overlaps = "F_S64"\n", Contained_Overlap_Ct); fprintf (stderr, " Dovetail overlaps = "F_S64"\n", Dovetail_Overlap_Ct); fprintf (stderr, "Rejected by short window = "F_S64"\n", Bad_Short_Window_Ct); fprintf (stderr, " Rejected by long window = "F_S64"\n", Bad_Long_Window_Ct); delete OldFragStore; AS_OVS_closeBinaryOverlapFile(Out_BOF); return(0); }