int main(int argc, char **argv) { char *gkpName = 0L; char *ovsName = 0L; char *iniClrName = NULL; char *maxClrName = NULL; char *outClrName = NULL; uint32 errorValue = AS_OVS_encodeEvalue(0.015); uint32 minAlignLength = 40; uint32 minReadLength = 64; char *outputPrefix = NULL; char logName[FILENAME_MAX] = {0}; char sumName[FILENAME_MAX] = {0}; FILE *logFile = 0L; FILE *sumFile = 0L; uint32 idMin = 1; uint32 idMax = UINT32_MAX; uint32 minEvidenceOverlap = 40; uint32 minEvidenceCoverage = 1; argc = AS_configure(argc, argv); int arg=1; int err=0; while (arg < argc) { if (strcmp(argv[arg], "-G") == 0) { gkpName = argv[++arg]; } else if (strcmp(argv[arg], "-O") == 0) { ovsName = argv[++arg]; } else if (strcmp(argv[arg], "-Ci") == 0) { iniClrName = argv[++arg]; } else if (strcmp(argv[arg], "-Cm") == 0) { maxClrName = argv[++arg]; } else if (strcmp(argv[arg], "-Co") == 0) { outClrName = argv[++arg]; } else if (strcmp(argv[arg], "-e") == 0) { double erate = atof(argv[++arg]); errorValue = AS_OVS_encodeEvalue(erate); } else if (strcmp(argv[arg], "-l") == 0) { minAlignLength = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-minlength") == 0) { minReadLength = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-ol") == 0) { minEvidenceOverlap = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-oc") == 0) { minEvidenceCoverage = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-o") == 0) { outputPrefix = argv[++arg]; } else if (strcmp(argv[arg], "-t") == 0) { AS_UTL_decodeRange(argv[++arg], idMin, idMax); } else { fprintf(stderr, "ERROR: unknown option '%s'\n", argv[arg]); err++; } arg++; } if ((gkpName == NULL) || (ovsName == NULL) || (outputPrefix == NULL) || (err)) { fprintf(stderr, "usage: %s -G gkpStore -O ovlStore -Co output.clearFile -o outputPrefix\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, " -G gkpStore path to read store\n"); fprintf(stderr, " -O ovlStore path to overlap store\n"); fprintf(stderr, "\n"); fprintf(stderr, " -o name output prefix, for logging\n"); fprintf(stderr, "\n"); fprintf(stderr, " -t bgn-end limit processing to only reads from bgn to end (inclusive)\n"); fprintf(stderr, "\n"); fprintf(stderr, " -Ci clearFile path to input clear ranges (NOT SUPPORTED)\n"); //fprintf(stderr, " -Cm clearFile path to maximal clear ranges\n"); fprintf(stderr, " -Co clearFile path to ouput clear ranges\n"); fprintf(stderr, "\n"); fprintf(stderr, " -e erate ignore overlaps with more than 'erate' percent error\n"); //fprintf(stderr, " -l length ignore overlaps shorter than 'l' aligned bases (NOT SUPPORTED)\n"); fprintf(stderr, "\n"); fprintf(stderr, " -ol l the minimum evidence overlap length\n"); fprintf(stderr, " -oc c the minimum evidence overlap coverage\n"); fprintf(stderr, " evidence overlaps must overlap by 'l' bases to be joined, and\n"); fprintf(stderr, " must be at least 'c' deep to be retained\n"); fprintf(stderr, "\n"); fprintf(stderr, " -minlength l reads trimmed below this many bases are deleted\n"); fprintf(stderr, "\n"); exit(1); } gkStore *gkp = gkStore::gkStore_open(gkpName); ovStore *ovs = new ovStore(ovsName, gkp); clearRangeFile *iniClr = (iniClrName == NULL) ? NULL : new clearRangeFile(iniClrName, gkp); clearRangeFile *maxClr = (maxClrName == NULL) ? NULL : new clearRangeFile(maxClrName, gkp); clearRangeFile *outClr = (outClrName == NULL) ? NULL : new clearRangeFile(outClrName, gkp); if (outClr) // If the outClr file exists, those clear ranges are loaded. We need to reset them // back to 'untrimmed' for now. outClr->reset(gkp); if (iniClr && outClr) // An iniClr file was supplied, so use those as the initial clear ranges. outClr->copy(iniClr); if (outputPrefix) { sprintf(logName, "%s.log", outputPrefix); sprintf(sumName, "%s.summary", outputPrefix); errno = 0; logFile = fopen(logName, "w"); if (errno) fprintf(stderr, "Failed to open log file '%s' for writing: %s\n", logName, strerror(errno)), exit(1); sumFile = fopen(sumName, "w"); if (errno) fprintf(stderr, "Failed to open summary file '%s' for writing: %s\n", sumName, strerror(errno)), exit(1); fprintf(logFile, "id\tinitL\tinitR\tfinalL\tfinalR\tmessage (DEL=deleted NOC=no change MOD=modified)\n"); fprintf(sumFile, "Overlap error rate <= %.4f fraction error\n", AS_OVS_decodeEvalue(errorValue)); fprintf(sumFile, "Overlap min overlap >= %u base%s (for 'largest covered')\n", minEvidenceOverlap, (minEvidenceOverlap == 1) ? "" : "s"); fprintf(sumFile, "Overlap min coverage >= %u read%s (for 'largest covered')\n", minEvidenceCoverage, (minEvidenceCoverage == 1) ? "" : "s"); } uint32 ovlLen = 0; uint32 ovlMax = 64 * 1024; ovOverlap *ovl = ovOverlap::allocateOverlaps(gkp, ovlMax); memset(ovl, 0, sizeof(ovOverlap) * ovlMax); char logMsg[1024] = {0}; if (idMin < 1) idMin = 1; if (idMax > gkp->gkStore_getNumReads()) idMax = gkp->gkStore_getNumReads(); fprintf(stderr, "Processing from ID "F_U32" to "F_U32" out of "F_U32" reads.\n", idMin, idMax, gkp->gkStore_getNumReads()); for (uint32 id=idMin; id<=idMax; id++) { gkRead *read = gkp->gkStore_getRead(id); gkLibrary *libr = gkp->gkStore_getLibrary(read->gkRead_libraryID()); logMsg[0] = 0; // If the fragment is deleted, do nothing. If the fragment was deleted AFTER overlaps were // generated, then the overlaps will be out of sync -- we'll get overlaps for these fragments // we skip. // if ((iniClr) && (iniClr->isDeleted(id) == true)) continue; // If it did not request trimming, do nothing. Similar to the above, we'll get overlaps to // fragments we skip. // if ((libr->gkLibrary_finalTrim() == FINALTRIM_LARGEST_COVERED) && (libr->gkLibrary_finalTrim() == FINALTRIM_BEST_EDGE)) continue; // Decide on the initial trimming. We copied any iniClr into outClr above, and if there wasn't // an iniClr, then outClr is the full read. uint32 ibgn = outClr->bgn(id); uint32 iend = outClr->end(id); // Set the, ahem, initial final trimming. bool isGood = false; uint32 fbgn = ibgn; uint32 fend = iend; // Load overlaps. uint32 nLoaded = ovs->readOverlaps(id, ovl, ovlLen, ovlMax); // Trim! if (nLoaded == 0) { // No overlaps, so mark it as junk. isGood = false; } else if (libr->gkLibrary_finalTrim() == FINALTRIM_LARGEST_COVERED) { // Use the largest region covered by overlaps as the trim assert(ovlLen > 0); assert(id == ovl[0].a_iid); isGood = largestCovered(ovl, ovlLen, read, ibgn, iend, fbgn, fend, logMsg, errorValue, minEvidenceOverlap, minEvidenceCoverage, minReadLength); assert(fbgn <= fend); } else if (libr->gkLibrary_finalTrim() == FINALTRIM_BEST_EDGE) { // Use the largest region covered by overlaps as the trim assert(ovlLen > 0); assert(id == ovl[0].a_iid); isGood = bestEdge(ovl, ovlLen, read, ibgn, iend, fbgn, fend, logMsg, errorValue, minEvidenceOverlap, minEvidenceCoverage, minReadLength); assert(fbgn <= fend); } else { // Do nothing. Really shouldn't get here. assert(0); continue; } // Enforce the maximum clear range if ((isGood) && (maxClr)) { isGood = enforceMaximumClearRange(ovl, ovlLen, read, ibgn, iend, fbgn, fend, logMsg, maxClr); assert(fbgn <= fend); } // // Trimmed. Make sense of the result, write some logs, and update the output. // // If bad trimming or too small, write the log and keep going. // if ((isGood == false) || (fend - fbgn < minReadLength)) { outClr->setbgn(id) = fbgn; outClr->setend(id) = fend; outClr->setDeleted(id); // Gah, just obliterates the clear range. fprintf(logFile, F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\tDEL%s\n", id, ibgn, iend, fbgn, fend, (logMsg[0] == 0) ? "" : logMsg); } // If we didn't change anything, also write a log. // else if ((ibgn == fbgn) && (iend == fend)) { fprintf(logFile, F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\tNOC%s\n", id, ibgn, iend, fbgn, fend, (logMsg[0] == 0) ? "" : logMsg); continue; } // Otherwise, we actually did something. else { outClr->setbgn(id) = fbgn; outClr->setend(id) = fend; fprintf(logFile, F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\t"F_U32"\tMOD%s\n", id, ibgn, iend, fbgn, fend, (logMsg[0] == 0) ? "" : logMsg); } } gkp->gkStore_close(); delete ovs; delete iniClr; delete maxClr; delete outClr; fclose(logFile); fclose(sumFile); exit(0); }
// Read old fragments in gkpStore and choose the ones that // have overlaps with fragments in Frag. Recompute the // overlaps, using fragment corrections and output the revised error. void Redo_Olaps(coParameters *G, gkStore *gkpStore) { // Figure out the range of B reads we care about. We probably could just loop over every read in // the store with minimal penalty. uint64 thisOvl = 0; uint64 lastOvl = G->olapsLen - 1; uint32 loBid = G->olaps[thisOvl].b_iid; uint32 hiBid = G->olaps[lastOvl].b_iid; // Open all the corrections. memoryMappedFile *Cfile = new memoryMappedFile(G->correctionsName); Correction_Output_t *C = (Correction_Output_t *)Cfile->get(); uint64 Cpos = 0; uint64 Clen = Cfile->length() / sizeof(Correction_Output_t); // Allocate some temporary work space for the forward and reverse corrected B reads. fprintf(stderr, "--Allocate "F_U64" MB for fseq and rseq.\n", (2 * sizeof(char) * 2 * (AS_MAX_READLEN + 1)) >> 20); char *fseq = new char [AS_MAX_READLEN + 1 + AS_MAX_READLEN + 1]; uint32 fseqLen = 0; char *rseq = new char [AS_MAX_READLEN + 1 + AS_MAX_READLEN + 1]; uint32 rseqLen = 0; fprintf(stderr, "--Allocate "F_U64" MB for fadj and radj.\n", (2 * sizeof(Adjust_t) * (AS_MAX_READLEN + 1)) >> 20); Adjust_t *fadj = new Adjust_t [AS_MAX_READLEN + 1]; Adjust_t *radj = new Adjust_t [AS_MAX_READLEN + 1]; uint32 fadjLen = 0; // radj is the same length fprintf(stderr, "--Allocate "F_U64" MB for pedWorkArea_t.\n", sizeof(pedWorkArea_t) >> 20); gkReadData *readData = new gkReadData; pedWorkArea_t *ped = new pedWorkArea_t; uint64 Total_Alignments_Ct = 0; uint64 Failed_Alignments_Ct = 0; uint64 Failed_Alignments_Both_Ct = 0; uint64 Failed_Alignments_End_Ct = 0; uint64 Failed_Alignments_Length_Ct = 0; uint32 rhaFail = 0; uint32 rhaPass = 0; uint64 olapsFwd = 0; uint64 olapsRev = 0; ped->initialize(G, G->errorRate); // Process overlaps. Loop over the B reads, and recompute each overlap. for (uint32 curID=loBid; curID<=hiBid; curID++) { if (((curID - loBid) % 1024) == 0) fprintf(stderr, "Recomputing overlaps - %9u - %9u - %9u\r", loBid, curID, hiBid); if (curID < G->olaps[thisOvl].b_iid) continue; gkRead *read = gkpStore->gkStore_getRead(curID); gkpStore->gkStore_loadReadData(read, readData); // Apply corrections to the B read (also converts to lower case, reverses it, etc) //fprintf(stderr, "Correcting B read %u at Cpos=%u\n", curID, Cpos); fseqLen = 0; rseqLen = 0; fadjLen = 0; correctRead(curID, fseq, fseqLen, fadj, fadjLen, readData->gkReadData_getSequence(), read->gkRead_sequenceLength(), C, Cpos, Clen); // Create copies of the sequence for forward and reverse. There isn't a need for the forward copy (except that // we mutate it with corrections), and the reverse copy could be deferred until it is needed. memcpy(rseq, fseq, sizeof(char) * (fseqLen + 1)); reverseComplementSequence(rseq, fseqLen); Make_Rev_Adjust(radj, fadj, fadjLen, fseqLen); // Recompute alignments for all overlaps involving the B read. for (; ((thisOvl <= lastOvl) && (G->olaps[thisOvl].b_iid == curID)); thisOvl++) { Olap_Info_t *olap = G->olaps + thisOvl; //fprintf(stderr, "processing overlap %u - %u\n", olap->a_iid, olap->b_iid); // Find the A segment. It's always forward. It's already been corrected. char *a_part = G->reads[olap->a_iid - G->bgnID].bases; if (olap->a_hang > 0) { int32 ha = Hang_Adjust(olap->a_hang, G->reads[olap->a_iid - G->bgnID].adjusts, G->reads[olap->a_iid - G->bgnID].adjustsLen); a_part += ha; //fprintf(stderr, "offset a_part by ha=%d\n", ha); } // Find the B segment. char *b_part = (olap->normal == true) ? fseq : rseq; //if (olap->normal == true) // fprintf(stderr, "b_part = fseq %40.40s\n", fseq); //else // fprintf(stderr, "b_part = rseq %40.40s\n", rseq); if (olap->normal == true) olapsFwd++; else olapsRev++; bool rha=false; if (olap->a_hang < 0) { int32 ha = (olap->normal == true) ? Hang_Adjust(-olap->a_hang, fadj, fadjLen) : Hang_Adjust(-olap->a_hang, radj, fadjLen); b_part += ha; //fprintf(stderr, "offset b_part by ha=%d normal=%d\n", ha, olap->normal); rha=true; } // Compute the alignment. int32 a_part_len = strlen(a_part); int32 b_part_len = strlen(b_part); int32 olap_len = min(a_part_len, b_part_len); int32 a_end = 0; int32 b_end = 0; bool match_to_end = false; //fprintf(stderr, ">A\n%s\n", a_part); //fprintf(stderr, ">B\n%s\n", b_part); int32 errors = Prefix_Edit_Dist(a_part, a_part_len, b_part, b_part_len, G->Error_Bound[olap_len], a_end, b_end, match_to_end, ped); // ped->delta isn't used. // ?? These both occur, but the first is much much more common. if ((ped->deltaLen > 0) && (ped->delta[0] == 1) && (0 < G->olaps[thisOvl].a_hang)) { int32 stop = min(ped->deltaLen, (int32)G->olaps[thisOvl].a_hang); // a_hang is int32:31! int32 i = 0; for (i=0; (i < stop) && (ped->delta[i] == 1); i++) ; //fprintf(stderr, "RESET 1 i=%d delta=%d\n", i, ped->delta[i]); assert((i == stop) || (ped->delta[i] != -1)); ped->deltaLen -= i; memmove(ped->delta, ped->delta + i, ped->deltaLen * sizeof (int)); a_part += i; a_end -= i; a_part_len -= i; errors -= i; } else if ((ped->deltaLen > 0) && (ped->delta[0] == -1) && (G->olaps[thisOvl].a_hang < 0)) { int32 stop = min(ped->deltaLen, - G->olaps[thisOvl].a_hang); int32 i = 0; for (i=0; (i < stop) && (ped->delta[i] == -1); i++) ; //fprintf(stderr, "RESET 2 i=%d delta=%d\n", i, ped->delta[i]); assert((i == stop) || (ped->delta[i] != 1)); ped->deltaLen -= i; memmove(ped->delta, ped->delta + i, ped->deltaLen * sizeof (int)); b_part += i; b_end -= i; b_part_len -= i; errors -= i; } Total_Alignments_Ct++; int32 olapLen = min(a_end, b_end); if ((match_to_end == false) && (olapLen <= 0)) Failed_Alignments_Both_Ct++; if (match_to_end == false) Failed_Alignments_End_Ct++; if (olapLen <= 0) Failed_Alignments_Length_Ct++; if ((match_to_end == false) || (olapLen <= 0)) { Failed_Alignments_Ct++; #if 0 // I can't find any patterns in these errors. I thought that it was caused by the corrections, but I // found a case where no corrections were made and the alignment still failed. Perhaps it is differences // in the alignment code (the forward vs reverse prefix distance in overlapper vs only the forward here)? fprintf(stderr, "Redo_Olaps()--\n"); fprintf(stderr, "Redo_Olaps()--\n"); fprintf(stderr, "Redo_Olaps()-- Bad alignment errors %d a_end %d b_end %d match_to_end %d olapLen %d\n", errors, a_end, b_end, match_to_end, olapLen); fprintf(stderr, "Redo_Olaps()-- Overlap a_hang %d b_hang %d innie %d\n", olap->a_hang, olap->b_hang, olap->innie); fprintf(stderr, "Redo_Olaps()-- Reads a_id %u a_length %d b_id %u b_length %d\n", G->olaps[thisOvl].a_iid, G->reads[ G->olaps[thisOvl].a_iid ].basesLen, G->olaps[thisOvl].b_iid, G->reads[ G->olaps[thisOvl].b_iid ].basesLen); fprintf(stderr, "Redo_Olaps()-- A %s\n", a_part); fprintf(stderr, "Redo_Olaps()-- B %s\n", b_part); Display_Alignment(a_part, a_part_len, b_part, b_part_len, ped->delta, ped->deltaLen); fprintf(stderr, "\n"); #endif if (rha) rhaFail++; continue; } if (rha) rhaPass++; G->olaps[thisOvl].evalue = AS_OVS_encodeEvalue((double)errors / olapLen); //fprintf(stderr, "REDO - errors = %u / olapLep = %u -- %f\n", errors, olapLen, AS_OVS_decodeEvalue(G->olaps[thisOvl].evalue)); } } fprintf(stderr, "\n"); delete ped; delete readData; delete [] radj; delete [] fadj; delete [] rseq; delete [] fseq; delete Cfile; fprintf(stderr, "-- Release bases, adjusts and reads.\n"); delete [] G->bases; G->bases = NULL; delete [] G->adjusts; G->adjusts = NULL; delete [] G->reads; G->reads = NULL; fprintf(stderr, "Olaps Fwd "F_U64"\n", olapsFwd); fprintf(stderr, "Olaps Rev "F_U64"\n", olapsRev); fprintf(stderr, "Total: "F_U64"\n", Total_Alignments_Ct); fprintf(stderr, "Failed: "F_U64" (both)\n", Failed_Alignments_Both_Ct); fprintf(stderr, "Failed: "F_U64" (either)\n", Failed_Alignments_Ct); fprintf(stderr, "Failed: "F_U64" (match to end)\n", Failed_Alignments_End_Ct); fprintf(stderr, "Failed: "F_U64" (negative length)\n", Failed_Alignments_Length_Ct); fprintf(stderr, "rhaFail %u rhaPass %u\n", rhaFail, rhaPass); }
int main(int argc, char **argv) { char *gkpName = NULL; char *ovlName = NULL; char *outPrefix = NULL; uint32 bgnID = 0; uint32 endID = UINT32_MAX; uint32 ovlSelect = 0; double ovlAtMost = AS_OVS_encodeEvalue(1.0); double ovlAtLeast = AS_OVS_encodeEvalue(0.0); double expectedMean = 30.0; double expectedStdDev = 7.0; bool toFile = true; argc = AS_configure(argc, argv); int arg=1; int err=0; while (arg < argc) { if (strcmp(argv[arg], "-G") == 0) gkpName = argv[++arg]; else if (strcmp(argv[arg], "-O") == 0) ovlName = argv[++arg]; else if (strcmp(argv[arg], "-o") == 0) outPrefix = argv[++arg]; else if (strcmp(argv[arg], "-C") == 0) { expectedMean = atof(argv[++arg]); expectedStdDev = atof(argv[++arg]); } else if (strcmp(argv[arg], "-c") == 0) toFile = false; else if (strcmp(argv[arg], "-b") == 0) bgnID = atoi(argv[++arg]); else if (strcmp(argv[arg], "-e") == 0) endID = atoi(argv[++arg]); else if (strcmp(argv[arg], "-overlap") == 0) { arg++; if (strcmp(argv[arg], "5") == 0) ovlSelect |= OVL_5; else if (strcmp(argv[arg], "3") == 0) ovlSelect |= OVL_3; else if (strcmp(argv[arg], "contained") == 0) ovlSelect |= OVL_CONTAINED; else if (strcmp(argv[arg], "container") == 0) ovlSelect |= OVL_CONTAINER; else if (strcmp(argv[arg], "partial") == 0) ovlSelect |= OVL_PARTIAL; else if (strcmp(argv[arg], "atmost") == 0) ovlAtMost = atof(argv[++arg]); else if (strcmp(argv[arg], "atleast") == 0) ovlAtLeast = atof(argv[++arg]); else { fprintf(stderr, "ERROR: unknown -overlap '%s'\n", argv[arg]); exit(1); } } else { fprintf(stderr, "%s: unknown option '%s'.\n", argv[0], argv[arg]); err++; } arg++; } if (gkpName == NULL) err++; if (ovlName == NULL) err++; if (outPrefix == NULL) err++; if (err) { fprintf(stderr, "usage: %s -G gkpStore -O ovlStore -o outPrefix [-b bgnID] [-e endID] ...\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, "Generates statistics for an overlap store. By default all possible classes\n"); fprintf(stderr, "are generated, options can disable specific classes.\n"); fprintf(stderr, "\n"); fprintf(stderr, " -C mean stddev Expect coverage at mean +- stddev\n"); fprintf(stderr, " -c Write stats to stdout, not to a file\n"); fprintf(stderr, "\n"); fprintf(stderr, "Outputs:\n"); fprintf(stderr, "\n"); fprintf(stderr, " outPrefix.per-read.log One line per read, giving readID, read length and classification.\n"); fprintf(stderr, " outPrefix.summary The primary statistical output.\n"); fprintf(stderr, "\n"); fprintf(stderr, "Overlap Selection:\n"); fprintf(stderr, " -overlap 5 5' overlaps only\n"); fprintf(stderr, " -overlap 3 3' overlaps only\n"); fprintf(stderr, " -overlap contained contained overlaps only\n"); fprintf(stderr, " -overlap container container overlaps only\n"); fprintf(stderr, " -overlap partial overlap is not valid for assembly\n"); fprintf(stderr, "\n"); fprintf(stderr, " An overlap is classified as exactly one of 5', 3', contained or container.\n"); fprintf(stderr, " By default, all overlaps are selected. Specifying any of these options will\n"); fprintf(stderr, " restrict overlaps to just those classifications. E.g., '-overlap 5 -overlap 3'\n"); fprintf(stderr, " will select dovetail overlaps off either end of the read.\n"); fprintf(stderr, "\n"); fprintf(stderr, " -overlap atmost x at most fraction x error (overlap-erate <= x)\n"); fprintf(stderr, " -overlap atleast x at least fraction x error (x <= overlap-erate)\n"); fprintf(stderr, "\n"); fprintf(stderr, " Overlaps can be further filtered by fraction error. Usually, this will be an\n"); fprintf(stderr, " 'atmost' filtering to use only the higher qualtiy overlaps.\n"); fprintf(stderr, "\n"); fprintf(stderr, " A contained read has at least one container overlap. Container read -> ---------------\n"); fprintf(stderr, " A container read has at least one contained overlap. Contained overlap -> -----\n"); fprintf(stderr, "\n"); exit(1); } // Set the default to 'all' if nothing set. if (ovlSelect == 0) ovlSelect = 0xff; // Open inputs, find limits. gkStore *gkpStore = gkStore::gkStore_open(gkpName); ovStore *ovlStore = new ovStore(ovlName, gkpStore); if (endID > gkpStore->gkStore_getNumReads()) endID = gkpStore->gkStore_getNumReads(); if (endID < bgnID) fprintf(stderr, "ERROR: invalid bgn/end range bgn=%u end=%u; only %u reads in the store\n", bgnID, endID, gkpStore->gkStore_getNumReads()), exit(1); ovlStore->setRange(bgnID, endID); // Allocate output histograms. histogramStatistics *readNoOlaps = new histogramStatistics; // Bad reads! (read length) histogramStatistics *readHole = new histogramStatistics; histogramStatistics *readHump = new histogramStatistics; histogramStatistics *readNo5 = new histogramStatistics; histogramStatistics *readNo3 = new histogramStatistics; histogramStatistics *olapHole = new histogramStatistics; // Hole size (sum of holes if more than one) histogramStatistics *olapHump = new histogramStatistics; // Hump size (sum of humps if more than one) histogramStatistics *olapNo5 = new histogramStatistics; // 5' uncovered size histogramStatistics *olapNo3 = new histogramStatistics; // 3' uncovered size histogramStatistics *readLowCov = new histogramStatistics; // Good reads! (read length) histogramStatistics *readUnique = new histogramStatistics; histogramStatistics *readRepeatCont = new histogramStatistics; histogramStatistics *readRepeatDove = new histogramStatistics; histogramStatistics *readSpanRepeat = new histogramStatistics; histogramStatistics *readUniqRepeatCont = new histogramStatistics; histogramStatistics *readUniqRepeatDove = new histogramStatistics; histogramStatistics *readUniqAnchor = new histogramStatistics; histogramStatistics *covrLowCov = new histogramStatistics; // Good reads! (overlap length) histogramStatistics *covrUnique = new histogramStatistics; histogramStatistics *covrRepeatCont = new histogramStatistics; histogramStatistics *covrRepeatDove = new histogramStatistics; histogramStatistics *covrSpanRepeat = new histogramStatistics; histogramStatistics *covrUniqRepeatCont = new histogramStatistics; histogramStatistics *covrUniqRepeatDove = new histogramStatistics; histogramStatistics *covrUniqAnchor = new histogramStatistics; histogramStatistics *olapLowCov = new histogramStatistics; // Good reads! (overlap length) histogramStatistics *olapUnique = new histogramStatistics; histogramStatistics *olapRepeatCont = new histogramStatistics; histogramStatistics *olapRepeatDove = new histogramStatistics; histogramStatistics *olapSpanRepeat = new histogramStatistics; histogramStatistics *olapUniqRepeatCont = new histogramStatistics; histogramStatistics *olapUniqRepeatDove = new histogramStatistics; histogramStatistics *olapUniqAnchor = new histogramStatistics; // Coverage interval lists, of all overlaps selected. // Open outputs. char N[FILENAME_MAX]; sprintf(N, "%s.per-read.log", outPrefix); FILE *LOG = fopen(N, "w"); if (errno) fprintf(stderr, "Failed to open '%s' for writing: %s\n", N, strerror(errno)), exit(1); // Compute! uint32 overlapsMax = 1024 * 1024; uint32 overlapsLen = 0; ovOverlap *overlaps = ovOverlap::allocateOverlaps(gkpStore, overlapsMax); overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax); while (overlapsLen > 0) { uint32 readID = overlaps[0].a_iid; uint32 readLen = gkpStore->gkStore_getRead(readID)->gkRead_sequenceLength(); intervalList<uint32> cov; uint32 covID = 0; bool readCoverage5 = false; bool readCoverage3 = false; bool readContained = false; bool readContainer = false; bool readPartial = false; for (uint32 oo=0; oo<overlapsLen; oo++) { bool is5prime = (overlaps[oo].overlapAEndIs5prime() == true) && (ovlSelect & OVL_5) && (overlaps[oo].overlap5primeIsPartial() == false); bool is3prime = (overlaps[oo].overlapAEndIs3prime() == true) && (ovlSelect & OVL_3) && (overlaps[oo].overlap3primeIsPartial() == false); bool isContained = (overlaps[oo].overlapAIsContained() == true) && (ovlSelect & OVL_CONTAINED); bool isContainer = (overlaps[oo].overlapAIsContainer() == true) && (ovlSelect & OVL_CONTAINER); bool isPartial = (overlaps[oo].overlapIsPartial() == true) && (ovlSelect & OVL_PARTIAL); // Ignore the overlap? if ((is5prime == false) && (is3prime == false) && (isContained == false) && (isContainer == false) && (isPartial == false)) continue; if (overlaps[oo].evalue() < ovlAtLeast) continue; if (overlaps[oo].evalue() > ovlAtMost) continue; readCoverage5 |= is5prime; // If there is a 5' overlap, the read isn't missing 5' coverage readCoverage3 |= is3prime; readContained |= isContained; // Read is contained in something else readContainer |= isContainer; // Read is a container of somethign else readPartial |= isPartial; cov.add(overlaps[oo].a_bgn(), overlaps[oo].a_end() - overlaps[oo].a_bgn()); } // If we filtered all the overlaps, just get out of here. Yeah, some code duplication, // but cleaner than sticking an if block around the rest of the loop. if (cov.numberOfIntervals() == 0) { readNoOlaps->add(readLen); overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax); continue; } // Generate a depth-of-coverage map, then merge intervals intervalList<uint32> depth(cov); cov.merge(); // Analyze the intervals, save per-read information to the log. uint32 lastInt = cov.numberOfIntervals() - 1; uint32 bgn = cov.lo(0); uint32 end = cov.hi(lastInt); bool contiguous = (lastInt == 0) ? true : false; bool readFullCoverage = (lastInt == 0) && (bgn == 0) && (end == readLen); bool readMissingMiddle = (lastInt != 0); uint32 holeSize = 0; uint32 no5Size = bgn; uint32 no3Size = readLen - end; for (uint32 ii=1; ii<cov.numberOfIntervals(); ii++) holeSize += cov.lo(ii) - cov.hi(ii-1); // Handle bad cases. If it's a partial overlap, ignore the is5prime and is3prime markings. if (readMissingMiddle == true) { fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "middle-missing"); readHole->add(readLen); olapHole->add(holeSize); overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax); continue; } if ((readCoverage5 == false) && (readCoverage3 == false) && (readContained == false) && (readPartial == false)) { fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "middle-only"); readHump->add(readLen); olapHump->add(no5Size + no3Size); overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax); continue; } if ((readCoverage5 == false) && (readContained == false) && (readPartial == false)) { fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "no-5-prime"); readNo5->add(readLen); olapNo5->add(no5Size); overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax); continue; } if ((readCoverage3 == false) && (readContained == false) && (readPartial == false)) { fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "no-3-prime"); readNo3->add(readLen); olapNo3->add(no3Size); overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax); continue; } // Handle good cases. For partial overlaps, bgn and end are not the extent of the read. if (readPartial == false) { assert(bgn == 0); assert(end == readLen); assert(contiguous == true); assert(readFullCoverage == true); } // Compute mean and std.dev of coverage. From this, we decide if the read is 'unique', // 'repeat' or 'mixed'. If 'mixed', we then need to decide if the read spans a repeat, or // joins unique and repeat. double covMean = 0; double covStdDev = 0; for (uint32 ii=0; ii<depth.numberOfIntervals(); ii++) covMean += (depth.hi(ii) - depth.lo(ii)) * depth.depth(ii); covMean /= readLen; for (uint32 ii=0; ii<depth.numberOfIntervals(); ii++) covStdDev += (depth.hi(ii) - depth.lo(ii)) * (depth.depth(ii) - covMean) * (depth.depth(ii) - covMean); covStdDev = sqrt(covStdDev / (readLen - 1)); // Classify each interval as either 'l'owcoverage, 'u'nique or 'r'epeat. char *classification = new char [depth.numberOfIntervals()]; for (uint32 ii=0; ii<depth.numberOfIntervals(); ii++) { if (depth.depth(ii) < expectedMean - 3 * expectedStdDev) { classification[ii] = 'l'; } else if (depth.depth(ii) < expectedMean + 3 * expectedStdDev) { classification[ii] = 'u'; } else { classification[ii] = 'r'; } } // Try to detect if a read is part unique and part repeat. bool isLowCov = false; bool isUnique = false; bool isRepeat = false; bool isSpanRepeat = false; bool isUniqRepeat = false; bool isUniqAnchor = false; int32 bgni = 0; int32 endi = depth.numberOfIntervals() - 1; char type5 = classification[bgni]; char typem = 0; char type3 = classification[endi]; while ((bgni <= endi) && (type5 == classification[bgni])) bgni++; bgni--; while ((bgni <= endi) && (type3 == classification[endi])) endi--; endi++; // All the same classification? if (bgni == endi) { isLowCov = (type5 == 'l'); isUnique = (type5 == 'u'); isRepeat = (type5 == 'r'); } // Nope, if we aren't the same, assume it is uniqRepeat. else if (type5 != type3) { isUniqRepeat = true; } // Nope, the same on both ends. Assume we're just flipped. else { if (type5 == 'r') isUniqAnchor = true; else isSpanRepeat = true; } // Now, do something with it. // LOG - readID readLen classification if (isLowCov) { fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "low-cov"); readLowCov->add(readLen); for (uint32 ii=0; ii<depth.numberOfIntervals(); ii++) covrLowCov->add(depth.depth(ii), depth.hi(ii) - depth.lo(ii)); } if (isUnique) { fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "unique"); readUnique->add(readLen); for (uint32 ii=0; ii<depth.numberOfIntervals(); ii++) covrUnique->add(depth.depth(ii), depth.hi(ii) - depth.lo(ii)); } if ((isRepeat) && (readContained == true)) { fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "contained-repeat"); readRepeatCont->add(readLen); for (uint32 ii=0; ii<depth.numberOfIntervals(); ii++) covrRepeatCont->add(depth.depth(ii), depth.hi(ii) - depth.lo(ii)); } if ((isRepeat) && (readContained == false)) { fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "dovetail-repeat"); readRepeatDove->add(readLen); for (uint32 ii=0; ii<depth.numberOfIntervals(); ii++) covrRepeatDove->add(depth.depth(ii), depth.hi(ii) - depth.lo(ii)); } if (isSpanRepeat) { fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "span-repeat"); readSpanRepeat->add(readLen); olapSpanRepeat->add(depth.lo(endi) - depth.hi(bgni)); } if ((isUniqRepeat) && (readContained == true)) { fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "uniq-repeat-cont"); readUniqRepeatCont->add(readLen); } if ((isUniqRepeat) && (readContained == false)) { fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "uniq-repeat-dove"); readUniqRepeatDove->add(readLen); } if (isUniqAnchor) { fprintf(LOG, "%u\t%u\t%s\n", readID, readLen, "uniq-anchor"); readUniqAnchor->add(readLen); olapUniqAnchor->add(depth.lo(endi) - depth.hi(bgni)); } // Done. Read more data. overlapsLen = ovlStore->readOverlaps(overlaps, overlapsMax); } fclose(LOG); // Done with logging. readHole->finalizeData(); olapHole->finalizeData(); readHump->finalizeData(); olapHump->finalizeData(); readNo5->finalizeData(); olapNo5->finalizeData(); readNo3->finalizeData(); olapNo3->finalizeData(); readLowCov->finalizeData(); olapLowCov->finalizeData(); covrLowCov->finalizeData(); readUnique->finalizeData(); olapUnique->finalizeData(); covrUnique->finalizeData(); readRepeatCont->finalizeData(); olapRepeatCont->finalizeData(); covrRepeatCont->finalizeData(); readRepeatDove->finalizeData(); olapRepeatDove->finalizeData(); covrRepeatDove->finalizeData(); readSpanRepeat->finalizeData(); olapSpanRepeat->finalizeData(); readUniqRepeatCont->finalizeData(); olapUniqRepeatCont->finalizeData(); readUniqRepeatDove->finalizeData(); olapUniqRepeatDove->finalizeData(); readUniqAnchor->finalizeData(); olapUniqAnchor->finalizeData(); LOG = stdout; if (toFile == true) { sprintf(N, "%s.summary", outPrefix); LOG = fopen(N, "w"); if (errno) fprintf(stderr, "Failed to open '%s' for writing: %s\n", N, strerror(errno)), exit(1); } fprintf(LOG, "category reads read length feature size or coverage analysis\n"); fprintf(LOG, "---------------- ------- ---------------------- ------------------------ --------------------\n"); fprintf(LOG, "middle-missing %7"F_U64P" %10.2f +- %-8.2f %10.2f +- %-8.2f (bad trimming)\n", readHole->numberOfObjects(), readHole->mean(), readHole->stddev(), olapHole->mean(), olapHole->stddev()); fprintf(LOG, "middle-hump %7"F_U64P" %10.2f +- %-8.2f %10.2f +- %-8.2f (bad trimming)\n", readHump->numberOfObjects(), readHump->mean(), readHump->stddev(), olapHump->mean(), olapHump->stddev()); fprintf(LOG, "no-5-prime %7"F_U64P" %10.2f +- %-8.2f %10.2f +- %-8.2f (bad trimming)\n", readNo5->numberOfObjects(), readNo5->mean(), readNo5->stddev(), olapNo5->mean(), olapNo5->stddev()); fprintf(LOG, "no-3-prime %7"F_U64P" %10.2f +- %-8.2f %10.2f +- %-8.2f (bad trimming)\n", readNo3->numberOfObjects(), readNo3->mean(), readNo3->stddev(), olapNo3->mean(), olapNo3->stddev()); fprintf(LOG, "\n"); fprintf(LOG, "low-coverage %7"F_U64P" %10.2f +- %-8.2f %10.2f +- %-8.2f (easy to assemble, potential for lower quality consensus)\n", readLowCov->numberOfObjects(), readLowCov->mean(), readLowCov->stddev(), covrLowCov->mean(), covrLowCov->stddev()); fprintf(LOG, "unique %7"F_U64P" %10.2f +- %-8.2f %10.2f +- %-8.2f (easy to assemble, perfect, yay)\n", readUnique->numberOfObjects(), readUnique->mean(), readUnique->stddev(), covrUnique->mean(), covrUnique->stddev()); fprintf(LOG, "repeat-cont %7"F_U64P" %10.2f +- %-8.2f %10.2f +- %-8.2f (potential for consensus errors, no impact on assembly)\n", readRepeatCont->numberOfObjects(), readRepeatCont->mean(), readRepeatCont->stddev(), covrRepeatCont->mean(), covrRepeatCont->stddev()); fprintf(LOG, "repeat-dove %7"F_U64P" %10.2f +- %-8.2f %10.2f +- %-8.2f (hard to assemble, likely won't assemble correctly or even at all)\n", readRepeatDove->numberOfObjects(), readRepeatDove->mean(), readRepeatDove->stddev(), covrRepeatDove->mean(), covrRepeatDove->stddev()); fprintf(LOG, "\n"); fprintf(LOG, "span-repeat %7"F_U64P" %10.2f +- %-8.2f %10.2f +- %-8.2f (read spans a large repeat, usually easy to assemble)\n", readSpanRepeat->numberOfObjects(), readSpanRepeat->mean(), readSpanRepeat->stddev(), olapSpanRepeat->mean(), olapSpanRepeat->stddev()); fprintf(LOG, "uniq-repeat-cont %7"F_U64P" %10.2f +- %-8.2f (should be uniquely placed, low potential for consensus errors, no impact on assembly)\n", readUniqRepeatCont->numberOfObjects(), readUniqRepeatCont->mean(), readUniqRepeatCont->stddev()); fprintf(LOG, "uniq-repeat-dove %7"F_U64P" %10.2f +- %-8.2f (will end contigs, potential to misassemble)\n", readUniqRepeatDove->numberOfObjects(), readUniqRepeatDove->mean(), readUniqRepeatDove->stddev()); fprintf(LOG, "uniq-anchor %7"F_U64P" %10.2f +- %-8.2f %10.2f +- %-8.2f (repeat read, with unique section, probable bad read)\n", readUniqAnchor->numberOfObjects(), readUniqAnchor->mean(), readUniqAnchor->stddev(), olapUniqAnchor->mean(), olapUniqAnchor->stddev()); if (toFile == true) fclose(LOG); delete ovlStore; gkpStore->gkStore_close(); exit(0); }