void Correct_Frags(coParameters *G, gkStore *gkpStore) { // The original converted to lowercase, and made non-acgt be 'a'. for (uint32 i=0; i<256; i++) filter[i] = 'a'; filter['A'] = filter['a'] = 'a'; filter['C'] = filter['c'] = 'c'; filter['G'] = filter['g'] = 'g'; filter['T'] = filter['t'] = 't'; // Open the corrections, as an array. memoryMappedFile *Cfile = new memoryMappedFile(G->correctionsName); Correction_Output_t *C = (Correction_Output_t *)Cfile->get(); uint64 Cpos = 0; uint64 Clen = Cfile->length() / sizeof(Correction_Output_t); uint64 firstRecord = 0; uint64 currentRecord = 0; fprintf(stderr, "Reading "F_U64" corrections from '%s'.\n", Clen, G->correctionsName); // Count the number of bases, so we can do two gigantic allocations for bases and adjustments. // Adjustments are always less than the number of corrections; we could also count exactly. G->basesLen = 0; G->adjustsLen = 0; for (uint32 curID=G->bgnID; curID<=G->endID; curID++) { gkRead *read = gkpStore->gkStore_getRead(curID); G->basesLen += read->gkRead_sequenceLength() + 1; } for (uint64 c=0; c<Clen; c++) { switch (C[c].type) { case DELETE: case A_INSERT: case C_INSERT: case G_INSERT: case T_INSERT: G->adjustsLen++; break; } } fprintf(stderr, "Correcting "F_U64" bases with "F_U64" indel adjustments.\n", G->basesLen, G->adjustsLen); G->bases = new char [G->basesLen]; G->adjusts = new Adjust_t [G->adjustsLen]; G->reads = new Frag_Info_t [G->endID - G->bgnID + 1]; G->readsLen = 0; G->basesLen = 0; G->adjustsLen = 0; uint64 changes[12] = {0}; // Load reads and apply corrections for each one. gkReadData *readData = new gkReadData; for (uint32 curID=G->bgnID; curID<=G->endID; curID++) { gkRead *read = gkpStore->gkStore_getRead(curID); gkpStore->gkStore_loadReadData(read, readData); uint32 readLength = read->gkRead_sequenceLength(); char *readBases = readData->gkReadData_getSequence(); // Save pointers to the bases and adjustments. G->reads[G->readsLen].bases = G->bases + G->basesLen; G->reads[G->readsLen].basesLen = 0; G->reads[G->readsLen].adjusts = G->adjusts + G->adjustsLen; G->reads[G->readsLen].adjustsLen = 0; // Find the correct corrections. while ((Cpos < Clen) && (C[Cpos].readID < curID)) Cpos++; // We should be at the IDENT message. if (C[Cpos].type != IDENT) { fprintf(stderr, "ERROR: didn't find IDENT at Cpos=%u for read %u\n", Cpos, curID); fprintf(stderr, " C[Cpos] = keep_left=%u keep_right=%u type=%u pos=%u readID=%u\n", C[Cpos].keep_left, C[Cpos].keep_right, C[Cpos].type, C[Cpos].pos, C[Cpos].readID); } assert(C[Cpos].type == IDENT); G->reads[G->readsLen].keep_left = C[Cpos].keep_left; G->reads[G->readsLen].keep_right = C[Cpos].keep_right; //Cpos++; // Now do the corrections. correctRead(curID, G->reads[G->readsLen].bases, G->reads[G->readsLen].basesLen, G->reads[G->readsLen].adjusts, G->reads[G->readsLen].adjustsLen, readData->gkReadData_getSequence(), read->gkRead_sequenceLength(), C, Cpos, Clen, changes); // Update the lengths in the globals. G->basesLen += G->reads[G->readsLen].basesLen + 1; G->adjustsLen += G->reads[G->readsLen].adjustsLen; G->readsLen += 1; } delete readData; delete Cfile; fprintf(stderr, "Corrected "F_U64" bases with "F_U64" substitutions, "F_U64" deletions and "F_U64" insertions.\n", G->basesLen, changes[A_SUBST] + changes[C_SUBST] + changes[G_SUBST] + changes[T_SUBST], changes[DELETE], changes[A_INSERT] + changes[C_INSERT] + changes[G_INSERT] + changes[T_INSERT]); }
// Read old fragments in gkpStore and choose the ones that // have overlaps with fragments in Frag. Recompute the // overlaps, using fragment corrections and output the revised error. void Redo_Olaps(coParameters *G, gkStore *gkpStore) { // Figure out the range of B reads we care about. We probably could just loop over every read in // the store with minimal penalty. uint64 thisOvl = 0; uint64 lastOvl = G->olapsLen - 1; uint32 loBid = G->olaps[thisOvl].b_iid; uint32 hiBid = G->olaps[lastOvl].b_iid; // Open all the corrections. memoryMappedFile *Cfile = new memoryMappedFile(G->correctionsName); Correction_Output_t *C = (Correction_Output_t *)Cfile->get(); uint64 Cpos = 0; uint64 Clen = Cfile->length() / sizeof(Correction_Output_t); // Allocate some temporary work space for the forward and reverse corrected B reads. fprintf(stderr, "--Allocate "F_U64" MB for fseq and rseq.\n", (2 * sizeof(char) * 2 * (AS_MAX_READLEN + 1)) >> 20); char *fseq = new char [AS_MAX_READLEN + 1 + AS_MAX_READLEN + 1]; uint32 fseqLen = 0; char *rseq = new char [AS_MAX_READLEN + 1 + AS_MAX_READLEN + 1]; uint32 rseqLen = 0; fprintf(stderr, "--Allocate "F_U64" MB for fadj and radj.\n", (2 * sizeof(Adjust_t) * (AS_MAX_READLEN + 1)) >> 20); Adjust_t *fadj = new Adjust_t [AS_MAX_READLEN + 1]; Adjust_t *radj = new Adjust_t [AS_MAX_READLEN + 1]; uint32 fadjLen = 0; // radj is the same length fprintf(stderr, "--Allocate "F_U64" MB for pedWorkArea_t.\n", sizeof(pedWorkArea_t) >> 20); gkReadData *readData = new gkReadData; pedWorkArea_t *ped = new pedWorkArea_t; uint64 Total_Alignments_Ct = 0; uint64 Failed_Alignments_Ct = 0; uint64 Failed_Alignments_Both_Ct = 0; uint64 Failed_Alignments_End_Ct = 0; uint64 Failed_Alignments_Length_Ct = 0; uint32 rhaFail = 0; uint32 rhaPass = 0; uint64 olapsFwd = 0; uint64 olapsRev = 0; ped->initialize(G, G->errorRate); // Process overlaps. Loop over the B reads, and recompute each overlap. for (uint32 curID=loBid; curID<=hiBid; curID++) { if (((curID - loBid) % 1024) == 0) fprintf(stderr, "Recomputing overlaps - %9u - %9u - %9u\r", loBid, curID, hiBid); if (curID < G->olaps[thisOvl].b_iid) continue; gkRead *read = gkpStore->gkStore_getRead(curID); gkpStore->gkStore_loadReadData(read, readData); // Apply corrections to the B read (also converts to lower case, reverses it, etc) //fprintf(stderr, "Correcting B read %u at Cpos=%u\n", curID, Cpos); fseqLen = 0; rseqLen = 0; fadjLen = 0; correctRead(curID, fseq, fseqLen, fadj, fadjLen, readData->gkReadData_getSequence(), read->gkRead_sequenceLength(), C, Cpos, Clen); // Create copies of the sequence for forward and reverse. There isn't a need for the forward copy (except that // we mutate it with corrections), and the reverse copy could be deferred until it is needed. memcpy(rseq, fseq, sizeof(char) * (fseqLen + 1)); reverseComplementSequence(rseq, fseqLen); Make_Rev_Adjust(radj, fadj, fadjLen, fseqLen); // Recompute alignments for all overlaps involving the B read. for (; ((thisOvl <= lastOvl) && (G->olaps[thisOvl].b_iid == curID)); thisOvl++) { Olap_Info_t *olap = G->olaps + thisOvl; //fprintf(stderr, "processing overlap %u - %u\n", olap->a_iid, olap->b_iid); // Find the A segment. It's always forward. It's already been corrected. char *a_part = G->reads[olap->a_iid - G->bgnID].bases; if (olap->a_hang > 0) { int32 ha = Hang_Adjust(olap->a_hang, G->reads[olap->a_iid - G->bgnID].adjusts, G->reads[olap->a_iid - G->bgnID].adjustsLen); a_part += ha; //fprintf(stderr, "offset a_part by ha=%d\n", ha); } // Find the B segment. char *b_part = (olap->normal == true) ? fseq : rseq; //if (olap->normal == true) // fprintf(stderr, "b_part = fseq %40.40s\n", fseq); //else // fprintf(stderr, "b_part = rseq %40.40s\n", rseq); if (olap->normal == true) olapsFwd++; else olapsRev++; bool rha=false; if (olap->a_hang < 0) { int32 ha = (olap->normal == true) ? Hang_Adjust(-olap->a_hang, fadj, fadjLen) : Hang_Adjust(-olap->a_hang, radj, fadjLen); b_part += ha; //fprintf(stderr, "offset b_part by ha=%d normal=%d\n", ha, olap->normal); rha=true; } // Compute the alignment. int32 a_part_len = strlen(a_part); int32 b_part_len = strlen(b_part); int32 olap_len = min(a_part_len, b_part_len); int32 a_end = 0; int32 b_end = 0; bool match_to_end = false; //fprintf(stderr, ">A\n%s\n", a_part); //fprintf(stderr, ">B\n%s\n", b_part); int32 errors = Prefix_Edit_Dist(a_part, a_part_len, b_part, b_part_len, G->Error_Bound[olap_len], a_end, b_end, match_to_end, ped); // ped->delta isn't used. // ?? These both occur, but the first is much much more common. if ((ped->deltaLen > 0) && (ped->delta[0] == 1) && (0 < G->olaps[thisOvl].a_hang)) { int32 stop = min(ped->deltaLen, (int32)G->olaps[thisOvl].a_hang); // a_hang is int32:31! int32 i = 0; for (i=0; (i < stop) && (ped->delta[i] == 1); i++) ; //fprintf(stderr, "RESET 1 i=%d delta=%d\n", i, ped->delta[i]); assert((i == stop) || (ped->delta[i] != -1)); ped->deltaLen -= i; memmove(ped->delta, ped->delta + i, ped->deltaLen * sizeof (int)); a_part += i; a_end -= i; a_part_len -= i; errors -= i; } else if ((ped->deltaLen > 0) && (ped->delta[0] == -1) && (G->olaps[thisOvl].a_hang < 0)) { int32 stop = min(ped->deltaLen, - G->olaps[thisOvl].a_hang); int32 i = 0; for (i=0; (i < stop) && (ped->delta[i] == -1); i++) ; //fprintf(stderr, "RESET 2 i=%d delta=%d\n", i, ped->delta[i]); assert((i == stop) || (ped->delta[i] != 1)); ped->deltaLen -= i; memmove(ped->delta, ped->delta + i, ped->deltaLen * sizeof (int)); b_part += i; b_end -= i; b_part_len -= i; errors -= i; } Total_Alignments_Ct++; int32 olapLen = min(a_end, b_end); if ((match_to_end == false) && (olapLen <= 0)) Failed_Alignments_Both_Ct++; if (match_to_end == false) Failed_Alignments_End_Ct++; if (olapLen <= 0) Failed_Alignments_Length_Ct++; if ((match_to_end == false) || (olapLen <= 0)) { Failed_Alignments_Ct++; #if 0 // I can't find any patterns in these errors. I thought that it was caused by the corrections, but I // found a case where no corrections were made and the alignment still failed. Perhaps it is differences // in the alignment code (the forward vs reverse prefix distance in overlapper vs only the forward here)? fprintf(stderr, "Redo_Olaps()--\n"); fprintf(stderr, "Redo_Olaps()--\n"); fprintf(stderr, "Redo_Olaps()-- Bad alignment errors %d a_end %d b_end %d match_to_end %d olapLen %d\n", errors, a_end, b_end, match_to_end, olapLen); fprintf(stderr, "Redo_Olaps()-- Overlap a_hang %d b_hang %d innie %d\n", olap->a_hang, olap->b_hang, olap->innie); fprintf(stderr, "Redo_Olaps()-- Reads a_id %u a_length %d b_id %u b_length %d\n", G->olaps[thisOvl].a_iid, G->reads[ G->olaps[thisOvl].a_iid ].basesLen, G->olaps[thisOvl].b_iid, G->reads[ G->olaps[thisOvl].b_iid ].basesLen); fprintf(stderr, "Redo_Olaps()-- A %s\n", a_part); fprintf(stderr, "Redo_Olaps()-- B %s\n", b_part); Display_Alignment(a_part, a_part_len, b_part, b_part_len, ped->delta, ped->deltaLen); fprintf(stderr, "\n"); #endif if (rha) rhaFail++; continue; } if (rha) rhaPass++; G->olaps[thisOvl].evalue = AS_OVS_encodeEvalue((double)errors / olapLen); //fprintf(stderr, "REDO - errors = %u / olapLep = %u -- %f\n", errors, olapLen, AS_OVS_decodeEvalue(G->olaps[thisOvl].evalue)); } } fprintf(stderr, "\n"); delete ped; delete readData; delete [] radj; delete [] fadj; delete [] rseq; delete [] fseq; delete Cfile; fprintf(stderr, "-- Release bases, adjusts and reads.\n"); delete [] G->bases; G->bases = NULL; delete [] G->adjusts; G->adjusts = NULL; delete [] G->reads; G->reads = NULL; fprintf(stderr, "Olaps Fwd "F_U64"\n", olapsFwd); fprintf(stderr, "Olaps Rev "F_U64"\n", olapsRev); fprintf(stderr, "Total: "F_U64"\n", Total_Alignments_Ct); fprintf(stderr, "Failed: "F_U64" (both)\n", Failed_Alignments_Both_Ct); fprintf(stderr, "Failed: "F_U64" (either)\n", Failed_Alignments_Ct); fprintf(stderr, "Failed: "F_U64" (match to end)\n", Failed_Alignments_End_Ct); fprintf(stderr, "Failed: "F_U64" (negative length)\n", Failed_Alignments_Length_Ct); fprintf(stderr, "rhaFail %u rhaPass %u\n", rhaFail, rhaPass); }