/*
 * Debug function that doesn't use curses - handy for valgrind testing.
 *
 * Prints every sequence overlapping the 80 column window starting at 'xpos',
 * once as stored and once complemented. It then closes the database, lists
 * any g_iotest processes via ps and exits; this function never returns.
 */
static void test_mode(GapIO *io, contig_t **c, int xpos) {
    rangec_t *r;
    int nr = 0, i;

    r = contig_seqs_in_range(io, c, xpos, xpos+79, CSIR_SORT_BY_X, &nr);
    if (!r)
        nr = 0;

    for (i = 0; i < nr; i++) {
        seq_t *s = get_seq(io, r[i].rec);
        seq_t *rc;

        if (!s)
            continue;

        printf("%.*s: range %d..%d seq %d+%d st=%d en=%d %.*s\n",
               s->name_len, s->name, r[i].start, r[i].end,
               s->pos, s->len, s->left, s->right,
               ABS(s->len), s->seq);

        /*
         * Complement a private copy and print it again. The copy is
         * released afterwards so that it does not show up as a leak when
         * running under valgrind.
         */
        if (NULL == (rc = dup_seq(s)))
            continue;
        complement_seq_t(rc);
        printf("%.*s: range %d..%d seq %d+%d st=%d en=%d %.*s\n",
               rc->name_len, rc->name, r[i].start, r[i].end,
               rc->pos, rc->len, rc->left, rc->right,
               ABS(rc->len), rc->seq);
        free(rc);
    }

    free(r);

    gio_close(io);
    system("ps lx | grep g_iotest | grep -v grep");
    exit(0);
}
/*
 * Attempt to find edits. It's not 100% reliable, but works for most cases.
 * We look for lowercase bases and confidence 100 and 0 (if not N, '-' or
 * '*'). We cannot find deleted bases though.
 *
 * xx      Editor view; the search starts next to xx->cursor_apos.
 * dir     Non-zero searches towards the contig end, zero towards the start.
 * strand  Unused by this search.
 * value   Unused by this search.
 *
 * On success the editor cursor is moved to the edit found.
 *
 * Returns 0 if an edit was found,
 *        -1 if not (or if the iterator could not be created).
 */
int edview_search_edit(edview *xx, int dir, int strand, char *value) {
    int start, end;
    contig_iterator *iter;
    rangec_t *(*ifunc)(GapIO *io, contig_iterator *ci);
    rangec_t *r;
    int best_pos, found = 0;
    int fpos = 0;
    tg_rec fseq = 0;

    if (dir) {
        start = xx->cursor_apos + 1;
        end = CITER_CEND;
        iter = contig_iter_new(xx->io, xx->cnum, 1,
                               CITER_FIRST | CITER_ISTART,
                               start, end);
        ifunc = contig_iter_next;
        best_pos = INT_MAX;
    } else {
        start = CITER_CSTART;
        end = xx->cursor_apos - 1;
        iter = contig_iter_new(xx->io, xx->cnum, 1,
                               CITER_LAST | CITER_IEND,
                               start, end);
        ifunc = contig_iter_prev;
        best_pos = INT_MIN;
    }

    if (!iter)
        return -1;

    while ((r = ifunc(xx->io, iter))) {
        seq_t *s, *sorig;
        char *seq, *qual;
        int seq_len, off = 0, i;

        /* Once past the best hit so far no later item can improve on it */
        if (found && dir && r->start > best_pos)
            break;
        if (found && !dir && r->end < best_pos)
            break;

        if (NULL == (s = sorig = cache_search(xx->io, GT_Seq, r->rec)))
            break;

        /* Work on a complemented copy when the stored orientation differs */
        if (r->comp ^ (s->len < 0)) {
            if (NULL == (s = dup_seq(sorig)))
                break;
            complement_seq_t(s);
        }

        seq = s->seq;
        qual = s->conf;
        seq_len = ABS(s->len);

        /* Skip the part of the sequence before the search start */
        if (r->start < start) {
            off = start - r->start;
            seq += off;
            qual += off;
            seq_len -= off;
        }

        for (i = 0; i < seq_len; i++) {
            /* Cast: ctype functions are undefined for negative char values */
            if (islower((unsigned char)seq[i]) ||
                qual[i] == 100 ||
                (qual[i] == 0 &&
                 seq[i] != 'N' && seq[i] != '-' && seq[i] != '*')) {
                int pos = r->start + i + off;

                if (dir) {
                    if (best_pos > pos && pos > xx->cursor_apos) {
                        found = 1;
                        best_pos = pos;
                        fpos = i + off;
                        fseq = r->rec;
                    }
                    /* Forward: the first edit in this sequence is enough */
                    break;
                } else {
                    /* Backward: keep scanning for the right-most edit */
                    if (best_pos < pos && pos < xx->cursor_apos) {
                        found = 1;
                        best_pos = pos;
                        fpos = i + off;
                        fseq = r->rec;
                    }
                }
            }
        }

        if (s != sorig)
            free(s);
    }

    if (found) {
        edSetCursorPos(xx, fseq == xx->cnum ? GT_Contig : GT_Seq,
                       fseq, fpos, 1);
    }

    contig_iter_del(iter);

    return found ? 0 : -1;
}
/*
 * Searches the sequences of the contig being edited for a (possibly
 * inexact) match to a sequence pattern, starting next to the cursor.
 *
 * xx      Editor view; the search starts next to xx->cursor_apos.
 * dir     Non-zero searches towards the contig end, zero towards the start.
 * strand  '+' for the pattern as given, '-' for its complement,
 *         '=' for both.
 * value   Search string, optionally followed by two extra parameters
 *         separated by '#': <string>#<N.mismatches>#<where>.
 *         <where> is 1 for readings, 2 for consensus, 3 for both; it is
 *         parsed but not otherwise used here.
 *         NOTE: value is modified in place (truncated at the first '#' and
 *         depadded).
 *
 * On success the editor cursor is moved to the start of the match.
 *
 * Returns 0 if a match was found,
 *        -1 if not, or on failure (iterator creation or out of memory).
 */
int edview_search_sequence(edview *xx, int dir, int strand, char *value) {
    int mismatches = 0; /* exact match */
    int where = 2;      /* consensus */
    char *p;
    int start, end;
    int patlen;
    char *uppert, *upperb;
    int found = 0;
    tg_rec fseq = 0;
    int fpos = 0, i;
    contig_iterator *iter;
    rangec_t *(*ifunc)(GapIO *io, contig_iterator *ci);
    rangec_t *r;
    int best_pos;

    if (dir) {
        start = xx->cursor_apos + 1;
        end = CITER_CEND;
        iter = contig_iter_new(xx->io, xx->cnum, 1,
                               CITER_FIRST | CITER_ISTART,
                               start, end);
        ifunc = contig_iter_next;
        best_pos = INT_MAX;
    } else {
        start = CITER_CSTART;
        end = xx->cursor_apos - 1;
        iter = contig_iter_new(xx->io, xx->cnum, 1,
                               CITER_LAST | CITER_IEND,
                               start, end);
        ifunc = contig_iter_prev;
        best_pos = INT_MIN;
    }

    if (!iter)
        return -1;

    /*
     * Parse value search string. It optionally includes two extra params
     * separated by #. Ie:
     *     <string>#<N.mismatches>#<where>.
     * <where> is 1 for readings, 2 for consensus, 3 for both.
     */
    if ((p = strchr(value, '#'))) {
        mismatches = atoi(p+1);
        *p = 0;
        if ((p = strchr(p+1, '#')))
            where = atoi(p+1);
    }
    (void)where; /* parsed for completeness; this search ignores it */

    /* uppercase search string, remove pads, and store fwd/rev copies */
    patlen = strlen(value);
    depad_seq(value, &patlen, NULL);

    uppert = (char *)xmalloc(patlen + 1);
    upperb = (char *)xmalloc(patlen + 1);
    if (!uppert || !upperb) {
        /* Don't report "found" (0) and don't leak the iterator/buffers */
        free(uppert);
        free(upperb);
        contig_iter_del(iter);
        return -1;
    }

    uppert[patlen] = upperb[patlen] = 0;
    for (i = patlen-1; i >= 0; i--) {
        /* Cast: ctype functions are undefined for negative char values */
        upperb[i] = uppert[i] = toupper((unsigned char)value[i]);
    }
    complement_seq(upperb, patlen);

    while ((r = ifunc(xx->io, iter))) {
        seq_t *s, *sorig;
        char *ind, *indt = NULL, *indb = NULL, *seq;
        int seq_len, off = 0;

        /* Once past the best hit so far no later item can improve on it */
        if (found && dir && r->start > best_pos)
            break;
        if (found && !dir && r->end < best_pos)
            break;

        if (NULL == (s = sorig = cache_search(xx->io, GT_Seq, r->rec)))
            break;

        /* Work on a complemented copy when the stored orientation differs */
        if (r->comp ^ (s->len < 0)) {
            if (NULL == (s = dup_seq(sorig)))
                break;
            complement_seq_t(s);
        }

        seq = s->seq;
        seq_len = ABS(s->len);

        /* Restrict the searched region to start..end */
        if (r->start < start) {
            off = start - r->start;
            seq += off;
            seq_len -= off;
        }
        if (r->end - (patlen-1) > end)
            seq_len -= r->end - (patlen-1) - end;

        if (dir) {
            if (strand == '+' || strand == '=')
                indt = pstrnstr_inexact(seq, seq_len, uppert, patlen,
                                        mismatches, NULL);
            if (strand == '-' || strand == '=')
                indb = pstrnstr_inexact(seq, seq_len, upperb, patlen,
                                        mismatches, NULL);
        } else {
            if (strand == '+' || strand == '=')
                indt = prstrnstr_inexact(seq, seq_len, uppert, patlen,
                                         mismatches, NULL);
            if (strand == '-' || strand == '=')
                indb = prstrnstr_inexact(seq, seq_len, upperb, patlen,
                                         mismatches, NULL);
        }

        /*
         * With hits on both strands keep the one nearest the cursor. The
         * position tests below prefer the smallest position when searching
         * forwards and the largest when searching backwards, so the strand
         * choice has to follow the same rule.
         */
        if (indt && indb) {
            if (dir)
                ind = MIN(indt, indb);
            else
                ind = (indt > indb) ? indt : indb;
        } else if (indt) {
            ind = indt;
        } else if (indb) {
            ind = indb;
        } else {
            ind = NULL;
        }

        if (ind) {
            int pos = r->start + ind - seq + off;

            if (dir) {
                if (best_pos > pos) {
                    found = 1;
                    best_pos = pos;
                    fpos = ind - s->seq;
                    fseq = r->rec;
                }
            } else {
                if (best_pos < pos) {
                    found = 1;
                    best_pos = pos;
                    fpos = ind - s->seq;
                    fseq = r->rec;
                }
            }
        }

        if (s != sorig)
            free(s);
    }

    if (found) {
        edSetCursorPos(xx, fseq == xx->cnum ? GT_Contig : GT_Seq,
                       fseq, fpos, 1);
    }

    free(uppert);
    free(upperb);

    contig_iter_del(iter);

    return found ? 0 : -1;
}
/*
 * Checks a single reading for correct assembly by analysing the used
 * (unclipped) data: a window of win_len bases is slid along the used part
 * of the reading and the number of disagreements with the consensus is
 * counted for each window position.
 *
 * io        Database handle.
 * cons      Consensus; indexed here by the contig position held in 'r'.
 * contig    Not used by this function.
 * r         Range entry (record, start, orientation) of the reading.
 * maxperc   Threshold: a window is reported once it holds at least
 *           0.5 + maxperc * win_len (truncated to int) mismatches.
 * win_len   Window length; reduced for readings with little used data.
 * ignore_N  If set, bases that are not A, C, G, T, U or a pad character
 *           never count as mismatches.
 *
 * The worst window, if any, is reported via vmessage().
 *
 * Returns -1 for system error,
 *          0 if no window reached the threshold (or the used part of the
 *            reading is too short to hold a window),
 *          otherwise 10000 * mismatches / win_len for the worst window
 *            (the mismatch percentage multiplied by 100).
 */
int check_uassembly_single(GapIO *io, char *cons, int contig, rangec_t *r,
                           float maxperc, int win_len, int ignore_N) {
    int start, end;
    unsigned char *seq = NULL;
    unsigned char *con = (unsigned char *) cons;
    int i, j, mism = 0;
    int worst, worst_pos = -1;
    seq_t *s, *sorig;
    static int lookup[256];
    static int lookup_done = 0;

    /* Base classes: 0 = unknown, 1-4 = A,C,G,T/U, 5 = pad */
    if (!lookup_done) {
        for (i = 0; i < 256; i++)
            lookup[i] = 0;
        lookup['A'] = lookup['a'] = 1;
        lookup['C'] = lookup['c'] = 2;
        lookup['G'] = lookup['g'] = 3;
        lookup['T'] = lookup['t'] = 4;
        lookup['U'] = lookup['u'] = 4;
        lookup['*'] = lookup[','] = lookup['-'] = 5;
        lookup_done = 1;
    }

    /* Get sequence */
    if (!(sorig = s = cache_search(io, GT_Seq, r->rec)))
        return -1;

    /* Complement data on-the-fly */
    if ((s->len < 0) ^ r->comp) {
        if (NULL == (s = dup_seq(sorig)))
            return -1;
        complement_seq_t(s);
    }

    seq = (unsigned char *) s->seq;
    start = s->left;
    end = s->right;

    /* Initialise scoring for divergence checks */
    if (end - start - 1 < win_len) {
        win_len = end - start - 1;
    }

    /*
     * A non-positive window cannot be scored: the loops below would index
     * outside the used data and the score would divide by zero.
     */
    if (win_len <= 0) {
        if (sorig != s)
            xfree(s);
        return 0;
    }

    worst = 0.5 + maxperc * win_len;

    /* Count the mismatches in the first window */
    for (i = start-1, j = r->start + start-1;
         i < start-1 + win_len;
         i++, j++) {
        if (ignore_N) {
            if (lookup[seq[i]] && lookup[seq[i]] != lookup[con[j]])
                mism++;
        } else {
            if (lookup[seq[i]] != lookup[con[j]])
                mism++;
        }
    }

    /*
     * Loop through sequence looking for problems. The window is moved one
     * base at a time: the base leaving on the left is subtracted and the
     * base entering on the right is added.
     */
    if (ignore_N) {
        do {
            if (mism >= worst) {
                worst_pos = i;
                worst = mism;
            }

            mism -= lookup[seq[i-win_len]] &&
                    lookup[seq[i-win_len]] != lookup[con[j-win_len]];
            i++;
            j++;
            if (i < end-1)
                mism += lookup[seq[i]] &&
                        lookup[seq[i]] != lookup[con[j]];
        } while (i < end);
    } else {
        do {
            if (mism >= worst) {
                worst_pos = i;
                worst = mism;
            }

            mism -= lookup[seq[i-win_len]] != lookup[con[j-win_len]];
            i++;
            j++;
            if (i < end-1)
                mism += lookup[seq[i]] != lookup[con[j]];
        } while (i < end);
    }

    /* Display problem, listing worst score */
    if (worst_pos != -1) {
        vmessage("\nReading #%"PRIrec"(%s) has a local percentage "
                 "mismatch of %2.1f\n",
                 s->rec, s->name, 100 * (float)worst / win_len);

        vmessage("SEQ: %.*s\n", end-start+1, &seq[start-1]);
        vmessage("CON: %.*s\n", end-start+1, &con[r->start + start-1]);

        if (sorig != s)
            xfree(s);

        return 10000 * (float)worst / win_len;
    }

    if (sorig != s)
        xfree(s);

    return 0;
}
/**
 * Builds and returns MALIGN from a Gap5 IO handle for the contig 'cnum'.
 *
 * The region start..end is first widened using the sequences found at
 * 'start' and at 'end'. Every sequence in the widened region then
 * contributes its clipped (used) bases, oriented as in the contig, with
 * any '.' replaced by 'N' because '.' is the pad symbol used here.
 *
 * Side effect: reads whose right clip point lies left of their left clip
 * point are corrected in the database (via cache_rw).
 *
 * Returns the result of contigl_to_malign() on success,
 *         NULL on failure (iterator, cache or memory errors).
 *         NOTE(review): CONTIGL entries built before a failure are not
 *         released here; no destructor for them is visible in this block.
 */
MALIGN *build_malign(GapIO *io, tg_rec cnum, int start, int end) {
    CONTIGL *contig, *first_contig = NULL, *last_contig = NULL;
    int i, j;
    contig_iterator *citer;
    rangec_t *r;

    /* Expand start and end to the range covered by seqs overlapping
     * start .. end
     */
    citer = contig_iter_new(io, cnum, 0,
                            CITER_FIRST | CITER_ICLIPPEDSTART,
                            start, start);
    if (citer) {
        seq_t *s;

        r = contig_iter_next(io, citer);
        if (r && (s = cache_search(io, GT_Seq, r->rec))) {
            start = ((s->len < 0) ^ r->comp)
                ? r->end - s->right - 2
                : r->start + s->left - 2;
        }
        contig_iter_del(citer);
    }

    citer = contig_iter_new(io, cnum, 0,
                            CITER_LAST | CITER_ICLIPPEDEND,
                            end, end);
    if (citer) {
        seq_t *s;

        r = contig_iter_next(io, citer);
        if (r && (s = cache_search(io, GT_Seq, r->rec))) {
            end = ((s->len < 0) ^ r->comp)
                ? r->end - s->left + 2
                : r->start + s->right + 2;
        }
        contig_iter_del(citer);
    }

    /* Generate contigl linked list */
    citer = contig_iter_new(io, cnum, 0, CITER_FIRST, start, end);
    if (!citer)
        return NULL;

    while ((r = contig_iter_next(io, citer))) {
        seq_t *s, *sorig;
        char *seq;
        int len;

        assert((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISSEQ);

        if (NULL == (sorig = s = cache_search(io, GT_Seq, r->rec))) {
            contig_iter_del(citer);
            return NULL;
        }

        contig = create_contig_link();
        contig->id = r->rec;
        contig->mseg = create_mseg();

        /* Check for out-of-bounds clip points.  It shouldn't happen, but
         * gap5 databases have been seen with this problem, and we don't
         * want to crash if there are any.
         */
        if (s->left < 1)
            s->left = 1;

        if (s->right > ABS(s->len))
            s->right = ABS(s->len);

        /* Fix reads of zero length */
        if (s->right < s->left) {
            sorig = s = cache_rw(io, s);
            s->right = s->left;
            if (s->right > ABS(s->len))
                s->left = s->right = ABS(s->len);
        }

        /* Use a complemented private copy for reverse strand reads */
        if ((s->len < 0) ^ r->comp) {
            if (NULL == (s = dup_seq(sorig))) {
                contig_iter_del(citer);
                return NULL;
            }
            complement_seq_t(s);
        }

        len = s->right - s->left + 1;
        if (NULL == (seq = malloc(len+1))) {
            /* Release what this function owns before giving up */
            if (s != sorig)
                free(s);
            contig_iter_del(citer);
            return NULL;
        }

        for (j = 0, i = s->left-1; i < s->right; i++, j++) {
            /* Protect against the sequence containing "."; our pad sym */
            if (s->seq[i] == '.')
                seq[j] = 'N';
            else
                seq[j] = s->seq[i];
        }
        seq[j] = 0;

        init_mseg(contig->mseg, seq, len, r->start-1 + s->left-1);
        contig->mseg->comp = (s != sorig);

        /* Append to the linked list */
        if (last_contig) {
            last_contig->next = contig;
        } else {
            first_contig = contig;
        }
        last_contig = contig;

        if (s != sorig)
            free(s);
    }
    contig_iter_del(citer);

    /* for 454 data -6 to -10 seem to work fine */
    return contigl_to_malign(first_contig, -7, -7);
}
/*
 * Takes a multiple alignment and updates the on-disk data structures to
 * match. This needs to correct confidence values, original positions and
 * tags too.
 *
 * io      Database handle.
 * cnum    Record number of the contig being realigned.
 * malign  The multiple alignment; one CONTIGL entry per sequence.
 * indels  Array of con_indel_t: consensus pad columns to insert
 *         (size > 0) or delete (size <= 0) before editing the reads.
 *
 * Failures to look up or copy a sequence, or to allocate memory, are
 * reported with verror() and the sequence concerned is skipped. An
 * alignment that differs from the original by anything other than pads
 * aborts the program.
 */
void update_io(GapIO *io, tg_rec cnum, MALIGN *malign, Array indels) {
    CONTIGL *cl;
    tg_rec rnum;
    range_t r, *r_out;
    bin_index_t *bin;
    contig_t *c = cache_search(io, GT_Contig, cnum);
    size_t i, nindel;

    if (!c) {
        verror(ERR_WARN, "update_io", "failed to load contig");
        return;
    }

    cache_incr(io, c);

    /*
     * To minimise number of data modifications we use a three step approach.
     *
     * Step 1: insert columns of pads, shifting reads as appropriate.
     * Step 2: edit sequence alignments as required, possibly involving
     *         moving sequences and/or adding and removing pads.
     * Step 3: remove columns of entire pads.
     *
     * This means that when we introduce a column of pads we don't have
     * to make edits to every single read position down stream, and can
     * instead make use of the optimised recursive bin functions to do this
     * for us.
     */

    /* Step 1: make indels */
    nindel = ArrayMax(indels);
    for (i = 0; i < nindel; i++) {
        con_indel_t *id = arrp(con_indel_t, indels, i);
        int j;

        if (id->size > 0) {
            contig_insert_bases(io, &c, id->pos+1, '*', -1, id->size);
        } else {
            for (j = 0; j < -id->size; j++) {
                contig_delete_pad(io, &c, id->pos+1);
            }
        }
    }

    /* Step 2: edit alignments */
    for (cl = malign->contigl; cl; cl = cl->next) {
        seq_t *s, *sorig;
        int len, update_range = 0;
        int shift;

        rnum = cl->id;

        sorig = cache_search(io, GT_Seq, rnum);
        if (!sorig) {
            verror(ERR_WARN, "update_io", "failed to load sequence");
            continue;
        }
        cache_incr(io, sorig);

        /* All edits are made on a private copy in alignment orientation */
        s = dup_seq(sorig);
        if (!s) {
            verror(ERR_WARN, "update_io", "failed to duplicate sequence");
            cache_decr(io, sorig);
            continue;
        }
        if (cl->mseg->comp)
            complement_seq_t(s);

        len = s->right - s->left + 1;

        /* Check if sequence has changed. If so assign a new one */
        if (cl->mseg->length != len ||
            memcmp(s->seq + s->left-1,
                   cl->mseg->seq,
                   cl->mseg->length) != 0) {
            /* left cutoff + right cutoff + new used region */
            int newlen = s->left-1 + ABS(s->len) - s->right
                       + cl->mseg->length;
            int i, j, np;
            char   *newseq  = malloc(newlen+1);
            int8_t *newconf = malloc(newlen+1);

            if (!newseq || !newconf) {
                verror(ERR_WARN, "update_io", "out of memory");
                free(newseq);
                free(newconf);
                free(s);
                cache_decr(io, sorig);
                continue;
            }

            /* Build new seq/conf arrays */
            memcpy(newseq,  s->seq,  s->left-1);
            memcpy(newconf, s->conf, s->left-1);

            memcpy(&newseq[s->left-1], cl->mseg->seq, cl->mseg->length);

            /*
             * Step through both old and new sequences working out how
             * they differ. This will (*should*) be entire pad movements.
             * i  = index to old seq
             * j  = index to new seq
             * np = number of pads added minus removed from old seq.
             */
            np = 0;
            for (i = j = s->left-1;
                 i < ABS(s->len) && j < s->left-1 + cl->mseg->length;
                 ) {
                /* Bases match: keep the case and confidence of the old */
                if (toupper((unsigned char)newseq[j]) ==
                    toupper((unsigned char)s->seq[i]) ||
                    (s->seq[i] == '.' && newseq[j] == 'N')) {
                    if (isupper((unsigned char)s->seq[i]))
                        newseq[j] = toupper((unsigned char)newseq[j]);
                    else
                        newseq[j] = tolower((unsigned char)newseq[j]);
                    newconf[j] = s->conf[i];
                    i++, j++;
                    continue;
                }

                /* Pad removed */
                if (s->seq[i] == '*') {
                    i++;
                    tag_shift_for_delete(io, cnum, rnum, cl->mseg->offset,
                                         cl->mseg->length, i+np--, s->bin);
                    continue;
                }

                /* Pad created */
                if (newseq[j] == '*') {
                    int k;
                    int ql = 0, qr = 0;

                    /* Confidence of nearest non-pad to the left... */
                    for (k = i-1; k >= 0; k--) {
                        if (s->seq[k] != '*') {
                            ql = s->conf[k];
                            break;
                        }
                    }
                    /* ...and to the right */
                    for (k = i+1; k < s->right; k++) {
                        if (s->seq[k] != '*') {
                            qr = s->conf[k];
                            break;
                        }
                    }
                    newconf[j] = MIN(ql, qr); /* min conf of neighbours */
                    j++;
                    tag_shift_for_insert(io, cnum, rnum, cl->mseg->offset,
                                         cl->mseg->length, i+ ++np, s->bin);
                    continue;
                }

                fprintf(stderr, "Alignment introduced non-pad character");
                abort();
            }

            /* Pads previously at the end of the reading & now removed */
            while (i < s->right) {
                if (s->seq[i] == '*') {
                    i++;
                    tag_shift_for_delete(io, cnum, rnum, cl->mseg->offset,
                                         cl->mseg->length, i+np--, s->bin);
                } else {
                    /* Error: clipped data that wasn't a pad */
                    abort();
                }
            }

            /* Should only be pads remaining in newseq, if anything */
            s->right = j;
            for (; j < s->left-1 + cl->mseg->length; j++) {
                if (newseq[j] != '*') {
                    fprintf(stderr, "Alignment introduced non-pad character");
                    abort();
                }
                newconf[j] = 0;
            }

            /* Append on the right hand cutoff data */
            for (; i < ABS(s->len); i++, j++) {
                newseq[j]  = s->seq[i];
                newconf[j] = s->conf[i];
            }
            if (j != newlen) {
                abort();
            }

            /* Write it back out */

            /* Copy newseq/newconf into seq_t */
            s->seq  = newseq;
            s->conf = newconf;

            update_range = 0;
            if (ABS(s->len) != j) {
                /* Length change implies updating the range array too */
                s->len = s->len >= 0 ? j : -j;
                update_range = 1;
            }

            /* Back to the stored orientation */
            if (cl->mseg->comp)
                complement_seq_t(s);

            /* The memcpy trashes the block pointer, so special care needed */
            {
                sorig = cache_rw(io, sorig);
                void *blk = sorig->block;
                memcpy(sorig, s, sizeof(seq_t));
                sorig->block = blk;
            }

            if (update_range)
                sorig = cache_item_resize(sorig,
                                          sizeof(*sorig) +
                                          sequence_extra_len(sorig));

            sequence_reset_ptr(sorig);

            if (s->name)
                memcpy(sorig->name, s->name, s->name_len+1);
            if (s->trace_name)
                memcpy(sorig->trace_name, s->trace_name,
                       s->trace_name_len+1);
            if (s->alignment)
                memcpy(sorig->alignment, s->alignment, s->alignment_len+1);
            memcpy(sorig->seq,  s->seq,  ABS(s->len));
            memcpy(sorig->conf, s->conf, ABS(s->len));

            /* Allocated with malloc() above, so release with free() */
            free(newconf);
            free(newseq);
        }

        /* Has the first used base moved relative to the alignment? */
        {
            int st, en, or;

            sequence_get_position(io, s->rec, NULL, &st, &en, &or);
            if (or ^ (sorig->len < 0)) {
                shift = ABS(sorig->len) - sorig->right;
            } else {
                shift = sorig->left-1;
            }
            st += shift;

            if (st != cl->mseg->offset+1) {
                update_range = 1;
            }
        }

        free(s);

        if (update_range) {
            int bin_changed = 0;

            /* Get old range and pair data */
            s = sorig;
            bin = cache_search(io, GT_Bin, s->bin);
            r = *arrp(range_t, bin->rng, s->bin_index);
            assert(r.rec == s->rec);

            /* Update range, tedious and slow way */
            bin_remove_item(io, &c, GT_Seq, s->rec);
            r.start = cl->mseg->offset + 1 - shift;
            r.end   = r.start + ABS(s->len) - 1;
            bin = bin_add_range(io, &c, &r, &r_out, NULL, 0);

            /* Check if the new bin has a different complemented status too */
            if (s->bin != bin->rec) {
                int old_comp = bin_get_orient(io, s->bin);
                int new_comp = bin_get_orient(io, bin->rec);

                if (new_comp != old_comp) {
                    s = cache_rw(io, s);
                    s->len *= -1;
                    s->flags ^= SEQ_COMPLEMENTED;
                }

                bin_changed = 1;
            }

            /* Update seq bin & bin_index fields */
            s = cache_rw(io, s);
            s->bin = bin->rec;
            s->bin_index = r_out - ArrayBase(range_t, bin->rng);

            if (bin_changed) {
                if (-1 == sequence_fix_anno_bins(io, &s)) {
                    verror(ERR_WARN, "update_io",
                           "sequence_fix_anno_bins() failure");
                }
            }
        }

        cache_decr(io, sorig);
    }

    /* Step 3 (remove pad columns) done in calling function. */

    cache_decr(io, c);
}
/*
 * Compute a basic non-weighted consensus. We simply pick the basecall
 * most frequently used.
 *
 * io    Database handle.
 * r     Sequences overlapping the region, 'nr' entries.
 * xpos  Contig position of the first consensus column.
 * wid   Number of columns to compute.
 * cons  Output buffer of at least 'wid' bytes. It is NOT nul terminated.
 *       Columns with no data are set to 'N'.
 *
 * Only the clipped (used) part of each sequence is counted.
 *
 * FIXME: use a weighted sum based on confidence values instead?
 *
 * Returns 0 on success (also when wid <= 0, in which case nothing is
 *           written),
 *        -1 on memory allocation failure.
 */
int calc_cons(GapIO *io, rangec_t *r, int nr, int xpos, int wid, char *cons) {
    int i, j;
    int (*cvec)[6];

    if (wid <= 0)
        return 0;

    /* Per column counts for A, C, G, T, pad and "other" */
    cvec = (int (*)[6])calloc(wid, 6 * sizeof(int));
    if (!cvec)
        return -1;

    if (!lookup_done) {
        /*
         * Assign element by element rather than with memset() so that the
         * default of 5 ("other") is correct whatever the element type of
         * the table is.
         */
        for (i = 0; i < 256; i++)
            lookup[i] = 5;
        lookup['A'] = lookup['a'] = 0;
        lookup['C'] = lookup['c'] = 1;
        lookup['G'] = lookup['g'] = 2;
        lookup['T'] = lookup['t'] = 3;
        lookup['*'] = lookup[','] = 4;
        lookup_done = 1;
    }

    /* Accumulate */
    for (i = 0; i < nr; i++) {
        int sp = r[i].start;
        seq_t *s = get_seq(io, r[i].rec);
        seq_t *sorig = s;
        unsigned char *seq;
        int left, right;

        if (!s)
            continue;

        /* Complement data on-the-fly */
        if ((s->len < 0) ^ r[i].comp) {
            if (NULL == (s = dup_seq(sorig)))
                continue;
            complement_seq_t(s);
        }

        seq = (unsigned char *)s->seq;
        left = s->left;
        right = s->right;

        /* Drop the part of the sequence left of the region */
        if (sp < xpos) {
            seq   += xpos - sp;
            left  -= xpos - sp;
            right -= xpos - sp;
            sp = xpos;
        }
        if (left < 1)
            left = 1;

        for (j = left-1; j < right; j++) {
            if (sp-xpos+j < wid)
                cvec[sp-xpos+j][lookup[seq[j]]]++;
        }

        if (s != sorig)
            free(s);
    }

    memset(cons, ' ', wid);

    /* and speculate :-) */
    for (i = 0; i < wid; i++) {
        int max, max_base = 5;

        for (max = j = 0; j < 6; j++) {
            if (max < cvec[i][j]) {
                max = cvec[i][j];
                max_base = j;
            }
        }
        cons[i] = "ACGT*N"[max_base];
    }

    free(cvec);

    return 0;
}
/*
 * Draws one screenful of a contig: the contig name and consensus, a
 * position ruler and then one line per sequence.
 *
 * io           Database handle.
 * c            The contig to display.
 * xpos         Contig position shown in the centre of the display.
 * ypos         Number of sequences to skip before the first one drawn.
 * nlines       Total number of display lines, including the two header
 *              lines.
 * wid          Total display width, including the name column.
 * mode         Bit field of DISPLAY_COLOURS, DISPLAY_DIFFS, DISPLAY_CUTOFFS
 *              and DISPLAY_QUAL.
 * qual_cutoff  With DISPLAY_QUAL, bases with at least this confidence are
 *              drawn bold.
 * in_curses    Non-zero draws with curses, zero prints to stdout.
 *
 * The sequence width is limited so that the fixed size consensus and ruler
 * buffers cannot overflow.
 */
static void display_gap(GapIO *io, contig_t **c, int xpos, int ypos,
                        int nlines, int wid, int mode, int qual_cutoff,
                        int in_curses) {
    rangec_t *r;
    int i, nr = 0, lno, y;
    char line[1024], *lp;
    char cons[1024];
    int attr;
    static int lookup_1conf[256];
    static int lookup_4conf[256];
    static int lookup_init = 0;

    /* Index of the called base within a 4 confidence values per base set */
    if (!lookup_init) {
        for (i = 0; i < 256; i++)
            lookup_1conf[i] = lookup_4conf[i] = 0;

        lookup_4conf['a'] = lookup_4conf['A'] = 0;
        lookup_4conf['c'] = lookup_4conf['C'] = 1;
        lookup_4conf['g'] = lookup_4conf['G'] = 2;
        lookup_4conf['t'] = lookup_4conf['T'] = 3;
        lookup_init = 1;
    }

    wid -= MAX_NAME_LEN+2;

    /*
     * The ruler needs up to wid+29 bytes of line[] and the consensus wid
     * bytes of cons[], so keep the width within both.
     */
    if (wid < 0)
        wid = 0;
    if (wid > (int)sizeof(line) - 30)
        wid = (int)sizeof(line) - 30;

    xpos -= wid/2;

    /* Query visible objects */
    r = contig_seqs_in_range(io, c, xpos, xpos+wid-1, CSIR_SORT_BY_X, &nr);
    if (!r)
        nr = 0;

    /* Consensus */
    calc_cons(io, r, nr, xpos, wid, cons);
    if (in_curses) {
        clear();
        mvaddnstr(0, 1, contig_get_name(c), strlen(contig_get_name(c)));
        mvaddnstr(0, MAX_NAME_LEN+2, cons, wid);
    } else {
        printf(" %-*s %.*s\n", MAX_NAME_LEN, contig_get_name(c), wid, cons);
    }

    /* Position */
    for (lp = line, i = xpos; i < xpos+wid+19; i++) {
        if (i % 10 == 0) {
            sprintf(lp, "%10d", i-10);
            lp += 10;
        }
    }
    if (in_curses) {
        int m = (xpos-1)%10;
        if (m < 0) m += 10;
        mvaddnstr(1, MAX_NAME_LEN+2, line+10+m, wid);
    } else {
        printf("%*s%.*s\n", MAX_NAME_LEN+2, "", wid,
               line+9+((xpos-1)%10));
    }

    /* Sequences: skip the first ypos of them */
    for (i = y = 0; i < nr && y < ypos; i++, y++)
        ;

    for (lno = 2; i < nr && lno < nlines; i++, lno++) {
        seq_t *s = get_seq(io, r[i].rec);
        seq_t *sorig = s;
        int sp = r[i].start;
        int l = s->len > 0 ? s->len : -s->len;
        unsigned char seq_a[MAX_SEQ_LEN], *seq = seq_a;
        int j, dir = '+';
        int left, right;
        char *conf;
        int nc = s->format == SEQ_FORMAT_CNF4 ? 4 : 1;
        int *L = s->format == SEQ_FORMAT_CNF4 ? lookup_4conf : lookup_1conf;

        /* Complement data on-the-fly */
        if ((s->len < 0) ^ r[i].comp) {
            dir = '-';
            s = dup_seq(s);
            complement_seq_t(s);
        }

        left = s->left;
        right = s->right;

        /* Never copy more than the local buffer can hold */
        if (l > (int)MAX_SEQ_LEN)
            l = (int)MAX_SEQ_LEN;
        memcpy(seq, s->seq, l);
        conf = s->conf;

        /* Clip to the visible region */
        if (sp < xpos) {
            seq   += xpos - sp;
            conf  += nc * (xpos - sp);
            l     -= xpos - sp;
            left  -= xpos - sp;
            right -= xpos - sp;
            sp = xpos;
        }
        if (l > wid - (sp-xpos))
            l = wid - (sp-xpos);

        if (in_curses) {
            mvaddch(lno, 0, dir);
            addnstr(s->name, MIN(MAX_NAME_LEN, s->name_len));
            move(lno, MAX_NAME_LEN+2+sp-xpos);
        } else {
            printf("%c%.*s%*s",
                   dir,
                   MIN(MAX_NAME_LEN, s->name_len), s->name,
                   MAX_NAME_LEN+1-MIN(MAX_NAME_LEN, s->name_len) +sp-xpos,
                   "");
        }

        for (j = 0; j < l; j++) {
            attr = (mode & DISPLAY_COLOURS) ? COLOR_PAIR(lookup[seq[j]]) : 0;

            /* Bases agreeing with the consensus are shown as '.' */
            if (mode & DISPLAY_DIFFS
                && sp-xpos+j < wid && seq[j] == cons[sp-xpos+j])
                seq[j] = '.';

            /* Cutoff data is shown in lower case, or blanked */
            if (j < left-1 || j > right-1)
                seq[j] = (mode & DISPLAY_CUTOFFS) ? tolower(seq[j]) : ' ';

            if (conf[j*nc+L[seq[j]]] >= qual_cutoff && mode & DISPLAY_QUAL) {
                attr |= A_BOLD;
            }

            if (in_curses) {
                addch(seq[j] | attr);
            } else {
                putchar(seq[j]);
            }
        }

        if (!in_curses)
            putchar('\n');

        if (s != sorig)
            free(s);
    }

    /* Useful debugging code to show bin locations. */
#if 0
    free(r);
    r = contig_bins_in_range(io, c, xpos, xpos+wid-1, &nr);
    /* Bins */
    for (i=0; i < nr && lno < nlines; i++, lno++) {
        bin_index_t *bin = (bin_index_t *)cache_search(io, GT_Bin, r[i].rec);
        unsigned char *seq, *seqm;
        int j, dir = "+-"[r[i].comp];
        int sp = r[i].start;
        int l = ABS(r[i].end - r[i].start + 1);
        char name[100];

        sprintf(name, "bin-%d", bin->rec);
        seqm = seq = malloc(l+1);
        memset(seq, '-', l);

        if (!(bin->start_used == 0 && bin->end_used == 0)) {
            if (r[i].comp) {
                memset(&seq[bin->size - bin->end_used - 1], '=',
                       bin->end_used - bin->start_used + 1);
            } else {
                memset(&seq[bin->start_used], '=',
                       bin->end_used - bin->start_used + 1);
            }
        }

        if (sp < xpos) {
            seq += xpos - sp;
            l   -= xpos - sp;
            sp = xpos;
        }
        if (l > wid - (sp-xpos))
            l = wid - (sp-xpos);

        if (in_curses) {
            mvaddch(lno, 0, dir);
            addnstr(name, strlen(name));
            move(lno, MAX_NAME_LEN+2+sp-xpos);
        } else {
            printf("%c%.*s%*s",
                   dir,
                   (int)MIN(MAX_NAME_LEN, strlen(name)), name,
                   (int)(MAX_NAME_LEN+1-MIN(MAX_NAME_LEN, strlen(name)) +sp-xpos),
                   "");
        }

        for (j = 0; j < l; j++) {
            if (in_curses) {
                addch(seq[j]);
            } else {
                putchar(seq[j]);
            }
        }

        if (!in_curses)
            putchar('\n');

        free(seqm);
    }
#endif

    if (in_curses)
        refresh();

    free(r);
}
/*
 * Extends the right hand end of a single contig.
 *
 * The cutoff (clipped) data of the sequences reaching the contig end is
 * used to build a majority consensus for up to ESZ bases past the end.
 * The extension is cut at the point where an accumulated agreement score
 * is highest, and the clip points of the contributing sequences are then
 * moved out as far as each sequence agrees with that new consensus.
 *
 * Min_depth is the minimum depth for extension. If lower then even if the
 * data matches we'll not extend further.
 *
 * Match_score (+ve) and mismatch_score (-ve) are accumulated during
 * extension to ensure that we don't extend into junk mismatching DNA.
 *
 * 'dir' only selects the wording of the progress message; extending the
 * left hand end is not implemented here.
 *
 * Returns 0 on success (including when no extension was possible),
 *        -1 on failure.
 */
static int contig_extend_single(GapIO *io, tg_rec crec, int dir,
                                int min_depth, int match_score,
                                int mismatch_score) {
    int end;
    rangec_t *r;
    int nr, i;
    contig_t *c;
    char cons[CSZ], new_cons[ESZ];
    int freqs[ESZ][4], depth[ESZ];
    double score, best_score;
    int best_pos, nseq;

    vmessage("Processing contig #%"PRIrec", %s end\n",
             crec, dir ? "left" : "right");

    for (i = 0; i < ESZ; i++) {
        freqs[i][0] = freqs[i][1] = freqs[i][2] = freqs[i][3] = 0;
        depth[i] = 0;
    }

    c = cache_search(io, GT_Contig, crec);
    if (NULL == c)
        return -1;
    cache_incr(io, c);

    if (consensus_valid_range(io, crec, NULL, &end) != 0) {
        cache_decr(io, c);
        return -1;
    }

    /* Consensus for the last CSZ bases; cons[CSZ-1] is position 'end' */
    calculate_consensus_simple(io, crec, end-(CSZ-1), end, cons, NULL);

    /* Start */
    /* Not implemented for now: rev complement and go again! */

    /* End */
    r = contig_seqs_in_range(io, &c, end, end, 0, &nr);
    if (!r) {
        cache_decr(io, c);
        return -1;
    }

    for (i = 0; i < nr; i++) {
        seq_t *s = cache_search(io, GT_Seq, r[i].rec);
        seq_t *sorig = s;
        int cstart, cend;
        int j, k, slen;

        if (!s) {
            r[i].rec = 0; /* Mark for removal */
            continue;
        }

        if ((s->len < 0) ^ r[i].comp) {
            if (NULL == (s = dup_seq(sorig))) {
                r[i].rec = 0;
                continue;
            }
            complement_seq_t(s);
        }

        cstart = r[i].start + s->left-1;
        cend   = r[i].start + s->right-1;
        (void)cstart;

        /* Does cutoff extend to contig end, if so does it match cons? */
        if (cend < end) {
            int mis = 0, len = 0;

            if (end - cend >= CSZ) {
                /* Cutoff is longer than the consensus we have; skip */
                if (sorig != s)
                    free(s);
                r[i].rec = 0; /* Mark for removal */
                continue;
            }

            for (k = s->right, j = cend+1; j <= end; j++, k++) {
                if (s->seq[k] != cons[j-(end-(CSZ-1))])
                    mis++;
            }
            len = end - cend;
            if (100*mis/len > 5) {
                /* Too much disagreement with the consensus; skip */
                if (sorig != s)
                    free(s);
                r[i].rec = 0;
                continue;
            }
        }

        /* So we got here, let's accumulate extension stats */
        slen = ABS(s->len);
        for (k = 0, j = end+1 - r[i].start; j < slen && k < ESZ; j++, k++) {
            int b;

            if (s->seq[j] == 'N')
                continue;

            /*
             * Only four counters exist per column, so anything the lookup
             * table does not map to 0..3 is skipped like 'N' instead of
             * being written outside freqs[k].
             */
            b = dna_lookup[(uint8_t) s->seq[j]];
            if (b < 0 || b > 3)
                continue;

            freqs[k][b]++;
            depth[k]++;
        }

        if (sorig != s)
            free(s);
    }

    /* Call the new consensus and find the best scoring extension length */
    score = best_score = 0;
    best_pos = 0;

    for (i = 0; i < ESZ; i++) {
        int call = 0, best = 0, j, others;
        double dd;

        /* No data at all means no call can be made (and dd would be 0) */
        if (depth[i] < min_depth || depth[i] == 0)
            break;

        for (j = 0; j < 4; j++) {
            if (best < freqs[i][j]) {
                best = freqs[i][j];
                call = j;
            }
        }

        new_cons[i] = "ACGT"[call];

        /* Reward the agreeing fraction, penalise the disagreeing one */
        dd = (double)depth[i];
        others = freqs[i][0] + freqs[i][1] + freqs[i][2] + freqs[i][3]
               - freqs[i][call];
        score += freqs[i][call] / dd;
        score -= others / dd;

        if (best_score <= score) {
            best_score = score;
            best_pos = i+1;
        }
    }

    /* Extend */
    nseq = 0;
    if (best_pos > 0) {
        int furthest_left = end;

        for (i = 0; i < nr; i++) {
            seq_t *s;
            int r_pos;
            int score;

            if (r[i].rec == 0)
                continue;

            s = cache_search(io, GT_Seq, r[i].rec);
            if (!s)
                continue;
            s = cache_rw(io, s);

            if (furthest_left > r[i].start)
                furthest_left = r[i].start;

            /*
             * end + best_pos is the furthest right we can go, but this
             * specific read may not be justified in reaching that far
             * if it has too many disagreements.
             */
            if ((s->len > 0) ^ r[i].comp) {
                /* Forward read: move the right clip point */
                int best_r = 0, j, k;
                int len = ABS(s->len);

                r_pos = 0;
                score = 0;
                for (k = end - r[i].start + 1, j = 0;
                     j < best_pos && k < len;
                     j++, k++) {
                    if (new_cons[j] == toupper((unsigned char)s->seq[k])) {
                        score += match_score;
                        if (best_r <= score) {
                            best_r = score;
                            r_pos = k+1;
                        }
                    } else {
                        score += mismatch_score;
                    }
                }

                if (s->right != r_pos) {
                    s->right = r_pos;
                    nseq++;
                }
            } else {
                /* Reverse read: stored complemented, move the left clip */
                int best_r = 0, j, k;

                r_pos = 0;
                score = 0;
                for (k = r[i].end - end - 1, j = 0;
                     j < best_pos && k >= 0;
                     j++, k--) {
                    char b = complement_base(s->seq[k]);
                    if (new_cons[j] == b) {
                        score += match_score;
                        if (best_r <= score) {
                            best_r = score;
                            r_pos = k-1;
                        }
                    } else {
                        score += mismatch_score;
                    }
                }

                if (s->left != r_pos+2) {
                    s->left = r_pos+2;
                    nseq++;
                }
            }
        }

        vmessage(" Extended by %d, adjusting %d sequence clip%s\n",
                 best_pos, nseq, nseq == 1 ? "" : "s");

        bin_invalidate_consensus(io, crec, furthest_left, end + best_pos);
    } else {
        vmessage(" Unable to extend contig\n");
    }

    free(r);

    cache_decr(io, c);
    cache_flush(io);
    return 0;
}