// Read old fragments in gkpStore and choose the ones that // have overlaps with fragments in Frag. Recompute the // overlaps, using fragment corrections and output the revised error. void Redo_Olaps(coParameters *G, gkStore *gkpStore) { // Figure out the range of B reads we care about. We probably could just loop over every read in // the store with minimal penalty. uint64 thisOvl = 0; uint64 lastOvl = G->olapsLen - 1; uint32 loBid = G->olaps[thisOvl].b_iid; uint32 hiBid = G->olaps[lastOvl].b_iid; // Open all the corrections. memoryMappedFile *Cfile = new memoryMappedFile(G->correctionsName); Correction_Output_t *C = (Correction_Output_t *)Cfile->get(); uint64 Cpos = 0; uint64 Clen = Cfile->length() / sizeof(Correction_Output_t); // Allocate some temporary work space for the forward and reverse corrected B reads. fprintf(stderr, "--Allocate "F_U64" MB for fseq and rseq.\n", (2 * sizeof(char) * 2 * (AS_MAX_READLEN + 1)) >> 20); char *fseq = new char [AS_MAX_READLEN + 1 + AS_MAX_READLEN + 1]; uint32 fseqLen = 0; char *rseq = new char [AS_MAX_READLEN + 1 + AS_MAX_READLEN + 1]; uint32 rseqLen = 0; fprintf(stderr, "--Allocate "F_U64" MB for fadj and radj.\n", (2 * sizeof(Adjust_t) * (AS_MAX_READLEN + 1)) >> 20); Adjust_t *fadj = new Adjust_t [AS_MAX_READLEN + 1]; Adjust_t *radj = new Adjust_t [AS_MAX_READLEN + 1]; uint32 fadjLen = 0; // radj is the same length fprintf(stderr, "--Allocate "F_U64" MB for pedWorkArea_t.\n", sizeof(pedWorkArea_t) >> 20); gkReadData *readData = new gkReadData; pedWorkArea_t *ped = new pedWorkArea_t; uint64 Total_Alignments_Ct = 0; uint64 Failed_Alignments_Ct = 0; uint64 Failed_Alignments_Both_Ct = 0; uint64 Failed_Alignments_End_Ct = 0; uint64 Failed_Alignments_Length_Ct = 0; uint32 rhaFail = 0; uint32 rhaPass = 0; uint64 olapsFwd = 0; uint64 olapsRev = 0; ped->initialize(G, G->errorRate); // Process overlaps. Loop over the B reads, and recompute each overlap. 
for (uint32 curID=loBid; curID<=hiBid; curID++) { if (((curID - loBid) % 1024) == 0) fprintf(stderr, "Recomputing overlaps - %9u - %9u - %9u\r", loBid, curID, hiBid); if (curID < G->olaps[thisOvl].b_iid) continue; gkRead *read = gkpStore->gkStore_getRead(curID); gkpStore->gkStore_loadReadData(read, readData); // Apply corrections to the B read (also converts to lower case, reverses it, etc) //fprintf(stderr, "Correcting B read %u at Cpos=%u\n", curID, Cpos); fseqLen = 0; rseqLen = 0; fadjLen = 0; correctRead(curID, fseq, fseqLen, fadj, fadjLen, readData->gkReadData_getSequence(), read->gkRead_sequenceLength(), C, Cpos, Clen); // Create copies of the sequence for forward and reverse. There isn't a need for the forward copy (except that // we mutate it with corrections), and the reverse copy could be deferred until it is needed. memcpy(rseq, fseq, sizeof(char) * (fseqLen + 1)); reverseComplementSequence(rseq, fseqLen); Make_Rev_Adjust(radj, fadj, fadjLen, fseqLen); // Recompute alignments for all overlaps involving the B read. for (; ((thisOvl <= lastOvl) && (G->olaps[thisOvl].b_iid == curID)); thisOvl++) { Olap_Info_t *olap = G->olaps + thisOvl; //fprintf(stderr, "processing overlap %u - %u\n", olap->a_iid, olap->b_iid); // Find the A segment. It's always forward. It's already been corrected. char *a_part = G->reads[olap->a_iid - G->bgnID].bases; if (olap->a_hang > 0) { int32 ha = Hang_Adjust(olap->a_hang, G->reads[olap->a_iid - G->bgnID].adjusts, G->reads[olap->a_iid - G->bgnID].adjustsLen); a_part += ha; //fprintf(stderr, "offset a_part by ha=%d\n", ha); } // Find the B segment. char *b_part = (olap->normal == true) ? fseq : rseq; //if (olap->normal == true) // fprintf(stderr, "b_part = fseq %40.40s\n", fseq); //else // fprintf(stderr, "b_part = rseq %40.40s\n", rseq); if (olap->normal == true) olapsFwd++; else olapsRev++; bool rha=false; if (olap->a_hang < 0) { int32 ha = (olap->normal == true) ? 
Hang_Adjust(-olap->a_hang, fadj, fadjLen) : Hang_Adjust(-olap->a_hang, radj, fadjLen); b_part += ha; //fprintf(stderr, "offset b_part by ha=%d normal=%d\n", ha, olap->normal); rha=true; } // Compute the alignment. int32 a_part_len = strlen(a_part); int32 b_part_len = strlen(b_part); int32 olap_len = min(a_part_len, b_part_len); int32 a_end = 0; int32 b_end = 0; bool match_to_end = false; //fprintf(stderr, ">A\n%s\n", a_part); //fprintf(stderr, ">B\n%s\n", b_part); int32 errors = Prefix_Edit_Dist(a_part, a_part_len, b_part, b_part_len, G->Error_Bound[olap_len], a_end, b_end, match_to_end, ped); // ped->delta isn't used. // ?? These both occur, but the first is much much more common. if ((ped->deltaLen > 0) && (ped->delta[0] == 1) && (0 < G->olaps[thisOvl].a_hang)) { int32 stop = min(ped->deltaLen, (int32)G->olaps[thisOvl].a_hang); // a_hang is int32:31! int32 i = 0; for (i=0; (i < stop) && (ped->delta[i] == 1); i++) ; //fprintf(stderr, "RESET 1 i=%d delta=%d\n", i, ped->delta[i]); assert((i == stop) || (ped->delta[i] != -1)); ped->deltaLen -= i; memmove(ped->delta, ped->delta + i, ped->deltaLen * sizeof (int)); a_part += i; a_end -= i; a_part_len -= i; errors -= i; } else if ((ped->deltaLen > 0) && (ped->delta[0] == -1) && (G->olaps[thisOvl].a_hang < 0)) { int32 stop = min(ped->deltaLen, - G->olaps[thisOvl].a_hang); int32 i = 0; for (i=0; (i < stop) && (ped->delta[i] == -1); i++) ; //fprintf(stderr, "RESET 2 i=%d delta=%d\n", i, ped->delta[i]); assert((i == stop) || (ped->delta[i] != 1)); ped->deltaLen -= i; memmove(ped->delta, ped->delta + i, ped->deltaLen * sizeof (int)); b_part += i; b_end -= i; b_part_len -= i; errors -= i; } Total_Alignments_Ct++; int32 olapLen = min(a_end, b_end); if ((match_to_end == false) && (olapLen <= 0)) Failed_Alignments_Both_Ct++; if (match_to_end == false) Failed_Alignments_End_Ct++; if (olapLen <= 0) Failed_Alignments_Length_Ct++; if ((match_to_end == false) || (olapLen <= 0)) { Failed_Alignments_Ct++; #if 0 // I can't find any 
patterns in these errors. I thought that it was caused by the corrections, but I // found a case where no corrections were made and the alignment still failed. Perhaps it is differences // in the alignment code (the forward vs reverse prefix distance in overlapper vs only the forward here)? fprintf(stderr, "Redo_Olaps()--\n"); fprintf(stderr, "Redo_Olaps()--\n"); fprintf(stderr, "Redo_Olaps()-- Bad alignment errors %d a_end %d b_end %d match_to_end %d olapLen %d\n", errors, a_end, b_end, match_to_end, olapLen); fprintf(stderr, "Redo_Olaps()-- Overlap a_hang %d b_hang %d innie %d\n", olap->a_hang, olap->b_hang, olap->innie); fprintf(stderr, "Redo_Olaps()-- Reads a_id %u a_length %d b_id %u b_length %d\n", G->olaps[thisOvl].a_iid, G->reads[ G->olaps[thisOvl].a_iid ].basesLen, G->olaps[thisOvl].b_iid, G->reads[ G->olaps[thisOvl].b_iid ].basesLen); fprintf(stderr, "Redo_Olaps()-- A %s\n", a_part); fprintf(stderr, "Redo_Olaps()-- B %s\n", b_part); Display_Alignment(a_part, a_part_len, b_part, b_part_len, ped->delta, ped->deltaLen); fprintf(stderr, "\n"); #endif if (rha) rhaFail++; continue; } if (rha) rhaPass++; G->olaps[thisOvl].evalue = AS_OVS_encodeEvalue((double)errors / olapLen); //fprintf(stderr, "REDO - errors = %u / olapLep = %u -- %f\n", errors, olapLen, AS_OVS_decodeEvalue(G->olaps[thisOvl].evalue)); } } fprintf(stderr, "\n"); delete ped; delete readData; delete [] radj; delete [] fadj; delete [] rseq; delete [] fseq; delete Cfile; fprintf(stderr, "-- Release bases, adjusts and reads.\n"); delete [] G->bases; G->bases = NULL; delete [] G->adjusts; G->adjusts = NULL; delete [] G->reads; G->reads = NULL; fprintf(stderr, "Olaps Fwd "F_U64"\n", olapsFwd); fprintf(stderr, "Olaps Rev "F_U64"\n", olapsRev); fprintf(stderr, "Total: "F_U64"\n", Total_Alignments_Ct); fprintf(stderr, "Failed: "F_U64" (both)\n", Failed_Alignments_Both_Ct); fprintf(stderr, "Failed: "F_U64" (either)\n", Failed_Alignments_Ct); fprintf(stderr, "Failed: "F_U64" (match to end)\n", 
Failed_Alignments_End_Ct); fprintf(stderr, "Failed: "F_U64" (negative length)\n", Failed_Alignments_Length_Ct); fprintf(stderr, "rhaFail %u rhaPass %u\n", rhaFail, rhaPass); }
//  Extend the exact match described by Match to the right and to the left
//  between sequences S and T, and classify the result.
//
//  The region after the match is aligned with Prefix_Edit_Dist() and the
//  region before it with Rev_Prefix_Edit_Dist().  Each call passes S and T
//  in one of two argument orders, chosen by comparing S_Right_Len with
//  T_Right_Len (right side) or S_Right_Begin with T_Right_Begin (left side).
//  When T is passed first the resulting delta is negated so that it is
//  expressed relative to S again; when S is passed first the delta is left
//  untouched.
//  NOTE(review): this presumes the delta sign encodes which sequence carries
//  the indel -- confirm against Prefix_Edit_Dist / Rev_Prefix_Edit_Dist.
//
//  Parameters:
//    Match        - the exact match (Start in S, Offset in T, Len).
//    S, S_Len     - first sequence and its length.
//    T, T_Len     - second sequence and its length.
//    S_Lo, S_Hi   - out: ends of the extended region in S.
//    T_Lo, T_Hi   - out: ends of the extended region in T.
//    Errors       - out: Left_Errors + Right_Errors; written only when the
//                   result is DOVETAIL or Doing_Partial_Overlaps is set.
//    WA           - work area; supplies Error_Bound and receives the deltas.
//                   When Errors is written, WA->Left_Delta holds the left
//                   delta followed by the right delta, with the first right
//                   entry offset by Leftover + Match->Len.
//
//  Returns DOVETAIL when both sides matched to the end, LEFT_BRANCH_PT or
//  RIGHT_BRANCH_PT when only that side failed to, and NONE when both failed.
//
Overlap_t
Extend_Alignment(Match_Node_t * Match, char * S, int S_Len, char * T, int T_Len,
                 int * S_Lo, int * S_Hi, int * T_Lo, int * T_Hi, int * Errors,
                 Work_Area_t * WA) {
  Overlap_t  return_type;
  int  S_Left_Begin, S_Right_Begin, S_Right_Len;
  int  T_Left_Begin, T_Right_Begin, T_Right_Len;
  int  Error_Limit, Left_Errors, Right_Errors, Total_Olap;
  int  i, Leftover, Right_Match_To_End, Left_Match_To_End;

  S_Left_Begin  = Match->Start - 1;
  S_Right_Begin = Match->Start + Match->Len;
  S_Right_Len   = S_Len - S_Right_Begin;

  T_Left_Begin  = Match->Offset - 1;
  T_Right_Begin = Match->Offset + Match->Len;
  T_Right_Len   = T_Len - T_Right_Begin;

  //  Longest overlap this match could be part of; it selects the error budget.
  Total_Olap = (MIN(Match->Start, Match->Offset)
                + Match->Len
                + MIN(S_Right_Len, T_Right_Len));

  Error_Limit = WA->Error_Bound[Total_Olap];

  //  Right extension.

  if (S_Right_Len == 0 || T_Right_Len == 0) {
    //  Nothing follows the match in one of the sequences.
    Right_Errors = 0;
    WA->Right_Delta_Len = 0;
    (* S_Hi) = (* T_Hi) = 0;
    Right_Match_To_End = TRUE;
  } else if (S_Right_Len <= T_Right_Len) {
    Right_Errors = Prefix_Edit_Dist (S + S_Right_Begin, S_Right_Len,
                                     T + T_Right_Begin, T_Right_Len,
                                     Error_Limit,
                                     S_Hi, T_Hi, & Right_Match_To_End, WA);
  } else {
    Right_Errors = Prefix_Edit_Dist (T + T_Right_Begin, T_Right_Len,
                                     S + S_Right_Begin, S_Right_Len,
                                     Error_Limit,
                                     T_Hi, S_Hi, & Right_Match_To_End, WA);

    //  S and T were swapped in the call above; flip the delta back.  This
    //  must happen only here, not for the un-swapped call.
    for (i = 0; i < WA->Right_Delta_Len; i++)
      WA->Right_Delta[i] *= -1;
  }

  (* S_Hi) += S_Right_Begin - 1;
  (* T_Hi) += T_Right_Begin - 1;

  assert (Right_Errors <= Error_Limit);

  //  Left extension, with whatever error budget the right side left over.

  if (S_Left_Begin < 0 || T_Left_Begin < 0) {
    //  The match starts at the beginning of one of the sequences.
    Left_Errors = 0;
    WA->Left_Delta_Len = 0;
    (* S_Lo) = (* T_Lo) = 0;
    Leftover = 0;
    Left_Match_To_End = TRUE;
  } else if (S_Right_Begin <= T_Right_Begin) {
    Left_Errors = Rev_Prefix_Edit_Dist (S + S_Left_Begin, S_Left_Begin + 1,
                                        T + T_Left_Begin, T_Left_Begin + 1,
                                        Error_Limit - Right_Errors,
                                        S_Lo, T_Lo, & Leftover, & Left_Match_To_End, WA);
  } else {
    Left_Errors = Rev_Prefix_Edit_Dist (T + T_Left_Begin, T_Left_Begin + 1,
                                        S + S_Left_Begin, S_Left_Begin + 1,
                                        Error_Limit - Right_Errors,
                                        T_Lo, S_Lo, & Leftover, & Left_Match_To_End, WA);

    //  Swapped call; flip the delta back (see the right extension).
    for (i = 0; i < WA->Left_Delta_Len; i++)
      WA->Left_Delta[i] *= -1;
  }

  (* S_Lo) += S_Left_Begin + 1;        // Check later for branch points
  (* T_Lo) += T_Left_Begin + 1;        // Check later for branch points

  //  Classify by which sides matched to the end.

  if (! Right_Match_To_End) {
    if (! Doing_Partial_Overlaps)
      WA->Left_Delta_Len = 0;

    if (! Left_Match_To_End)
      return_type = NONE;
    else
      return_type = RIGHT_BRANCH_PT;
  } else {
    if (! Left_Match_To_End)
      return_type = LEFT_BRANCH_PT;
    else
      return_type = DOVETAIL;
  }

  if (return_type == DOVETAIL || Doing_Partial_Overlaps) {
    (* Errors) = Left_Errors + Right_Errors;
    assert ((* Errors) <= Error_Limit);

    //  Append the right delta to the left delta.  The first right entry is
    //  pushed away from zero by Leftover + Match->Len, keeping its sign; the
    //  remaining entries are copied unchanged.
    if (WA->Right_Delta_Len > 0) {
      if (WA->Right_Delta[0] > 0)
        WA->Left_Delta[WA->Left_Delta_Len++] = WA->Right_Delta[0] + Leftover + Match->Len;
      else
        WA->Left_Delta[WA->Left_Delta_Len++] = WA->Right_Delta[0] - Leftover - Match->Len;
    }

    for (i = 1; i < WA->Right_Delta_Len; i++)
      WA->Left_Delta[WA->Left_Delta_Len++] = WA->Right_Delta[i];
  }

  return return_type;
}