/*
 * Fetches 'length' consensus bases of contig 'c_num' starting at 'position'
 * and returns them as a NUL-terminated string.
 *
 * Storage: requests shorter than 1024 bases are written into a function
 * local static buffer, so the result is overwritten by the next such call
 * and must not be freed. Longer requests are allocated with xmalloc() and
 * are never released by this function.
 *
 * Returns NULL if 'length' is negative or if the allocation fails.
 */
static char *GetTagSequence(GapIO *io,     /* in */
                            int c_num,     /* in */
                            int position,  /* in */
                            int length)    /* in */
{
    static char seq[1024];
    char *sequence;

    /*
     * A negative length would pick the static buffer and then store the
     * terminator at seq[length], i.e. before the start of the array.
     */
    if (length < 0)
        return NULL;

    if (length < 1024) {
        sequence = seq;
    } else {
        sequence = (char *)xmalloc((length + 1) * sizeof(char));
        if (NULL == sequence)
            return NULL;
    }

    calculate_consensus_simple(io, c_num, position, position + length - 1,
                               sequence, NULL);
    sequence[length] = '\0';

    return sequence;
}
/*
 * Searches the consensus of the contig being edited for a sequence pattern
 * and, when a hit is found, moves the editing cursor to it.
 *
 * xx      Editor view; supplies the io handle, contig number and cursor.
 * dir     Non-zero searches forwards from the base after the cursor, zero
 *         searches backwards from the base before it.
 * strand  '+' searches for the pattern as given, '-' for its complement
 *         (as produced by complement_seq()), '=' for both.
 * value   Search string, optionally "<string>#<N.mismatches>#<where>".
 *         NOTE: the string is modified in place (the first '#' is replaced
 *         by NUL and depad_seq() is run over it).
 *
 * The consensus is scanned one WIN_WIDTH window at a time; consecutive
 * windows overlap by enough columns to hold 'patlen' non-pad bases so that a
 * hit straddling a window boundary is not missed.
 *
 * Returns 0 if a match was found, -1 if not (including on allocation or
 * contig lookup failure).
 */
int edview_search_consensus(edview *xx, int dir, int strand, char *value) {
    int mismatches = 0; /* exact match */
    int where = 2;      /* consensus */
    char *p;
    int start, end;
    char cons[WIN_WIDTH+1];
    int patlen;
    char *uppert, *upperb;
    int found = 0, at_end = 0;
    tg_rec fseq = xx->cnum;
    int fpos, i, j;
    contig_t *c;

    /*
     * Parse value search string. It optionally includes two extra params
     * separated by #. Ie:
     *     <string>#<N.mismatches>#<where>.
     * <where> is 1 for readings, 2 for consensus, 3 for both.
     */
    if (NULL != (p = strchr(value, '#'))) {
        mismatches = atoi(p+1);
        *p = 0;
        if (NULL != (p = strchr(p+1, '#')))
            where = atoi(p+1);
    }
    (void)where; /* parsed for format compatibility; not used below */

    /* Uppercase search string, remove pads, and store fwd/rev copies */
    patlen = strlen(value);
    depad_seq(value, &patlen, NULL);

    uppert = (char *)xmalloc(patlen + 1);
    upperb = (char *)xmalloc(patlen + 1);
    if (NULL == uppert || NULL == upperb) {
        /*
         * Report failure as "not found"; returning 0 here would claim a
         * successful match. Also release whichever buffer was obtained.
         */
        free(uppert);
        free(upperb);
        return -1;
    }

    uppert[patlen] = upperb[patlen] = 0;
    for (i = patlen-1; i >= 0; i--) {
        /* <ctype.h> functions need an unsigned char value */
        upperb[i] = uppert[i] = toupper((unsigned char)value[i]);
    }
    complement_seq(upperb, patlen);

    /* First window: immediately after (fwd) or before (rev) the cursor */
    if (dir) {
        start = xx->cursor_apos + 1;
        end   = start + (WIN_WIDTH-1);
    } else {
        end   = xx->cursor_apos - 1;
        start = end - (WIN_WIDTH-1);
    }
    fpos = xx->cursor_apos;

    c = cache_search(xx->io, GT_Contig, xx->cnum);
    if (NULL == c) {
        free(uppert);
        free(upperb);
        return -1;
    }
    cache_incr(xx->io, c);

    do {
        char *ind, *indt = NULL, *indb = NULL;

        calculate_consensus_simple(xx->io, xx->cnum, start, end, cons, NULL);
        cons[WIN_WIDTH] = 0;

        if (dir) {
            if (strand == '+' || strand == '=')
                indt = pstrstr_inexact(cons, uppert, mismatches, NULL);
            if (strand == '-' || strand == '=')
                indb = pstrstr_inexact(cons, upperb, mismatches, NULL);
        } else {
            if (strand == '+' || strand == '=')
                indt = prstrstr_inexact(cons, uppert, mismatches, NULL);
            if (strand == '-' || strand == '=')
                indb = prstrstr_inexact(cons, upperb, mismatches, NULL);
        }

        /*
         * NOTE(review): when both strands hit, the lower address is taken
         * for both directions. For a backward search the hit nearest the
         * cursor would be the higher one - confirm whether MIN is intended.
         */
        if (indt && indb)
            ind = MIN(indt, indb);
        else if (indt)
            ind = indt;
        else if (indb)
            ind = indb;
        else
            ind = NULL;

        if (ind != NULL) {
            int hit = start + ind-cons;

            if (dir ? (fpos <= hit) : (fpos >= hit)) {
                found = 1;
                fpos  = hit;
                fseq  = xx->cnum;
            }
            break;
        }

        /* Next search region - overlapping by patlen+pads */
        if (dir) {
            for (i = WIN_WIDTH-1, j = patlen; j && i; i--) {
                if (cons[i] != '*')
                    j--;
            }
            if (i == 0)
                break; /* pattern as long as the window: cannot advance */
            start += i;
            end   += i;
            if (start > c->end)
                at_end = 1;
        } else {
            for (i = 0, j = patlen; j && i < WIN_WIDTH; i++) {
                if (cons[i] != '*')
                    j--;
            }
            if (i == WIN_WIDTH)
                break; /* pattern as long as the window: cannot advance */
            start -= WIN_WIDTH-i;
            end   -= WIN_WIDTH-i;
            if (end < c->start)
                at_end = 1;
        }
    } while (!at_end);

    cache_decr(xx->io, c);

    if (found) {
        edSetCursorPos(xx, fseq == xx->cnum ? GT_Contig : GT_Seq,
                       fseq, fpos, 1);
    }

    free(uppert);
    free(upperb);

    return found ? 0 : -1;
}
/*
 * Searches the consensus of the contig being edited for the nearest position
 * whose consensus quality is below a threshold and moves the cursor there.
 *
 * xx      Editor view; supplies the io handle, contig number and cursor.
 * dir     Non-zero searches forwards from the base after the cursor, zero
 *         searches backwards from the base before it.
 * strand  Unused.
 * value   Threshold as a decimal string (converted with atoi()).
 *
 * Returns 0 if such a position was found, -1 otherwise.
 */
int edview_search_consquality(edview *xx, int dir, int strand, char *value) {
    int start, end;
    float qual[WIN_WIDTH+1];
    int found = 0, at_end = 0;
    int fpos, i, qval = atoi(value);
    contig_t *c;

    (void)strand;

    /* Set initial start positions */
    if (dir) {
        start = xx->cursor_apos + 1;
        end   = start + (WIN_WIDTH-1);
    } else {
        end   = xx->cursor_apos - 1;
        start = end - (WIN_WIDTH-1);
    }
    fpos = xx->cursor_apos;

    c = cache_search(xx->io, GT_Contig, xx->cnum);
    if (NULL == c)
        return -1;
    cache_incr(xx->io, c);

    /* Loop WIN_WIDTH block at a time */
    do {
        calculate_consensus_simple(xx->io, xx->cnum, start, end, NULL, qual);

        if (dir) {
            for (i = 0; i < WIN_WIDTH; i++) {
                if (qual[i] < qval) {
                    found = 1;
                    break;
                }
            }
        } else {
            /*
             * Scan right to left down to and including index 0. Windows do
             * not overlap, so stopping at index 1 would leave one position
             * per window untested.
             */
            for (i = WIN_WIDTH-1; i >= 0; i--) {
                if (qual[i] < qval) {
                    found = 1;
                    break;
                }
            }
        }

        if (found) {
            fpos = start + i;
            break;
        }

        /* Next search region - adjacent, non-overlapping window */
        if (dir) {
            start += WIN_WIDTH;
            end   += WIN_WIDTH;
            if (start > c->end)
                at_end = 1;
        } else {
            start -= WIN_WIDTH;
            end   -= WIN_WIDTH;
            if (end < c->start)
                at_end = 1;
        }
    } while (!at_end);

    cache_decr(xx->io, c);

    if (found) {
        edSetCursorPos(xx, GT_Contig, xx->cnum, fpos, 1);
        return 0;
    }

    return -1;
}
/*
 * Scans through one or more contigs checking each reading for correct
 * assembly. This is simply a check for misaligned data, not looking into
 * cutoff data. (The gap4 method did this, but it hasn't yet been implemented
 * in gap5).
 *
 * For every contig range in 'contigs' the consensus is computed and each
 * reading returned by the contig iterator is scored with
 * check_uassembly_single(). Readings with a positive score are collected and
 * handed to check_assembly_plot().
 *
 * All temporary storage (consensus buffer, iterator and result arrays) is
 * released on every exit path.
 *
 * Returns -1 for failure, 0 for success.
 */
int check_assembly(GapIO *io, int num_contigs, contig_list_t *contigs,
                   int winsize, float maxperc, int ignore_N) {
    int i, sc, count = 0, allocated = 0;
    int ret = -1;
    char *con = NULL;
    contig_iterator *ci = NULL;
    tg_rec *reads = NULL, *conts = NULL;
    int *score = NULL, *length = NULL, *pos = NULL;

    for (i = 0; i < num_contigs; i++) {
        tg_rec crec = contigs[i].contig;
        rangec_t *r;
        int start = contigs[i].start, end = contigs[i].end;

        if (NULL == (con = (char *)xmalloc(end-start+1)))
            goto cleanup;

        calculate_consensus_simple(io, crec, start, end, con, NULL);

        ci = contig_iter_new(io, crec, 0, CITER_FIRST, start, end);
        if (NULL == ci)
            goto cleanup;

        while (NULL != (r = contig_iter_next(io, ci))) {
            UpdateTextOutput();

            /*
             * The helper receives 'con - start'; presumably so that it can
             * index the consensus by absolute contig position.
             */
            sc = check_uassembly_single(io, con - start, crec, r,
                                        maxperc, winsize, ignore_N);

            if (count >= allocated) {
                int new_alloc = allocated ? allocated * 2 : 256;
                void *tmp;

                /*
                 * Grow through a temporary so that a failed xrealloc()
                 * leaves the old block reachable for the cleanup code.
                 */
                if (NULL == (tmp = xrealloc(reads, new_alloc * sizeof(*reads))))
                    goto cleanup;
                reads = tmp;

                if (NULL == (tmp = xrealloc(conts, new_alloc * sizeof(*conts))))
                    goto cleanup;
                conts = tmp;

                if (NULL == (tmp = xrealloc(score, new_alloc * sizeof(*score))))
                    goto cleanup;
                score = tmp;

                if (NULL == (tmp = xrealloc(length, new_alloc * sizeof(*length))))
                    goto cleanup;
                length = tmp;

                if (NULL == (tmp = xrealloc(pos, new_alloc * sizeof(*pos))))
                    goto cleanup;
                pos = tmp;

                allocated = new_alloc;
            }

            if (sc > 0) {
                reads[count]   = r->rec;
                score[count]   = sc * 100;
                pos[count]     = r->start;
                length[count]  = r->end - r->start+1;
                conts[count++] = crec;
            }
        }

        contig_iter_del(ci);
        ci = NULL;
        xfree(con);
        con = NULL;
    }

    if (-1 != check_assembly_plot(io, reads, conts, score, pos, length, count))
        ret = 0;

 cleanup:
    /* ci and con are only non-NULL here when we bailed out mid-contig */
    if (ci)
        contig_iter_del(ci);
    if (con)
        xfree(con);
    if (reads)
        xfree(reads);
    if (conts)
        xfree(conts);
    if (pos)
        xfree(pos);
    if (length)
        xfree(length);
    if (score)
        xfree(score);

    return ret;
}
/*
 * Searches a set of contigs for occurrences of 'string' and registers the
 * hits via RegFindOligo().
 *
 * io              Database handle.
 * num_contigs     Number of entries in contig_array.
 * contig_array    Contigs (and start/end ranges) to search.
 * mis_match       Passed through to StringMatch().
 * string          Sequence to look for. Must be non-NULL and non-empty: the
 *                 alternative tag-based search is disabled, so an empty
 *                 string is reported as failure.
 * consensus_only  Passed through to StringMatch().
 * in_cutoff       Passed through to StringMatch().
 *
 * The number of matches stored is capped at twice the summed contig length
 * (both strands) or the FINDOLIGO.MAX_MATCHES default, whichever is smaller.
 *
 * Returns 0 on success, -1 on failure. All working storage, including the
 * per-contig consensus strings, is released on both paths.
 */
int find_oligos(GapIO *io, int num_contigs, contig_list_t *contig_array,
                float mis_match, char *string, int consensus_only,
                int in_cutoff) {
    int i;
    int ret = -1;
    int *pos1 = NULL;
    int *pos2 = NULL;
    int *score = NULL;
    int *length = NULL;
    tg_rec *c1 = NULL;
    tg_rec *c2 = NULL;
    int max_matches, abs_max;
    int seq_len;
    int n_matches;
    int max_clen;
    char **cons_array = NULL;

    /* Calculate maximum contig length and total contig length */
    for (max_matches = 0, max_clen = 0, i = 0; i < num_contigs; i++) {
        int clen = io_clength(io, contig_array[i].contig);

        if (clen > max_clen)
            max_clen = clen;
        max_matches += clen;
    }
    (void)max_clen; /* only needed by the disabled TagMatch() path below */

    max_matches *= 2; /* both strands */

    abs_max = get_default_int(GetInterp(), gap5_defs, "FINDOLIGO.MAX_MATCHES");
    if (max_matches > abs_max)
        max_matches = abs_max;

    if (NULL == (pos1 = (int *)xmalloc((max_matches + 1) * sizeof(int))))
        goto cleanup;
    if (NULL == (pos2 = (int *)xmalloc((max_matches + 1) * sizeof(int))))
        goto cleanup;
    if (NULL == (score = (int *)xmalloc((max_matches + 1) * sizeof(int))))
        goto cleanup;
    if (NULL == (length = (int *)xmalloc((max_matches + 1) * sizeof(int))))
        goto cleanup;
    if (NULL == (c1 = (tg_rec *)xmalloc((max_matches + 1) * sizeof(tg_rec))))
        goto cleanup;
    if (NULL == (c2 = (tg_rec *)xmalloc((max_matches + 1) * sizeof(tg_rec))))
        goto cleanup;

    /* save consensus for each contig */
    if (NULL == (cons_array = (char **)xmalloc(num_contigs * sizeof(char *))))
        goto cleanup;

    /*
     * Clear every slot first so the cleanup code can tell which entries
     * were actually allocated if we fail part way through.
     */
    for (i = 0; i < num_contigs; i++)
        cons_array[i] = NULL;

    for (i = 0; i < num_contigs; i++) {
        seq_len = contig_array[i].end - contig_array[i].start + 1;
        if (NULL == (cons_array[i] = (char *)xmalloc(seq_len + 1)))
            goto cleanup;

        calculate_consensus_simple(io, contig_array[i].contig,
                                   contig_array[i].start,
                                   contig_array[i].end,
                                   cons_array[i], NULL);
        cons_array[i][seq_len] = '\0';
    }

    /* do match on either tag(s) or string */
    if (string && *string) {
        n_matches = StringMatch(io, num_contigs, contig_array, cons_array,
                                string, mis_match, pos1, pos2, score, length,
                                c1, c2, max_matches, consensus_only,
                                in_cutoff);
        if (-1 == RegFindOligo(io, SEQUENCE, pos1, pos2, score, length,
                               c1, c2, n_matches))
            goto cleanup;
    } else {
        /*
         * Tag based matching is currently disabled:
         *
         * if (-1 == (n_matches = TagMatch(io, max_clen, num_contigs,
         *                                 contig_array, cons_array,
         *                                 mis_match, pos1, pos2, score,
         *                                 length, c1, c2, max_matches)))
         *     goto error;
         * if (-1 == RegFindOligo(io, TAG, pos1, pos2, score, length,
         *                        c1, c2, n_matches))
         */
        goto cleanup;
    }

    ret = 0;

 cleanup:
    if (cons_array) {
        for (i = 0; i < num_contigs; i++) {
            if (cons_array[i])
                xfree(cons_array[i]);
        }
        xfree(cons_array);
    }
    if (c1)
        xfree(c1);
    if (c2)
        xfree(c2);
    if (pos1)
        xfree(pos1);
    if (pos2)
        xfree(pos2);
    if (score)
        xfree(score);
    if (length)
        xfree(length);

    return ret;
}
/*
 * Extends the right hand end of a single contig.
 *
 * Min_depth is the minimum depth for extension. If lower then even if the
 * data matches we'll not extend further.
 *
 * Match_score (+ve) and mismatch_score (-ve) are accumulated during
 * extension to ensure that we don't extend into junk mismatching DNA.
 *
 * Outline:
 *  1. Compute the last CSZ consensus bases up to the valid contig end.
 *  2. For every read covering that end, reject it if its right cutoff
 *     starts CSZ or more bases before the end or disagrees with the
 *     consensus at more than 5% of positions; otherwise add the bases past
 *     the end to per-column base counts (up to ESZ columns).
 *  3. Call a base per column while depth >= min_depth and remember the
 *     column count giving the best running score.
 *  4. Re-clip each accepted read against the new consensus and invalidate
 *     the cached consensus over the affected range.
 *
 * 'dir' is only used in the progress message; the left end is not handled.
 *
 * Returns 0 on success (including "unable to extend"), -1 on failure.
 */
static int contig_extend_single(GapIO *io, tg_rec crec, int dir, int min_depth,
                                int match_score, int mismatch_score) {
    int end;
    rangec_t *r;
    int nr, i;
    contig_t *c;
    char cons[CSZ], new_cons[ESZ];
    int freqs[ESZ][4], depth[ESZ];
    double score, best_score;
    int best_pos, nseq;
    int ret = 0;

    vmessage("Processing contig #%"PRIrec", %s end\n",
             crec, dir ? "left" : "right");

    for (i = 0; i < ESZ; i++) {
        freqs[i][0] = freqs[i][1] = freqs[i][2] = freqs[i][3] = 0;
        depth[i] = 0;
    }

    c = cache_search(io, GT_Contig, crec);
    if (NULL == c)
        return -1;
    cache_incr(io, c);

    if (consensus_valid_range(io, crec, NULL, &end) != 0) {
        cache_decr(io, c);
        return -1;
    }

    /* cons[x] holds the consensus at contig position end-(CSZ-1)+x */
    calculate_consensus_simple(io, crec, end-(CSZ-1), end, cons, NULL);

    /* Start */
    /* Not implemented for now: rev complement and go again! */

    /* End */
    r = contig_seqs_in_range(io, &c, end, end, 0, &nr);
    if (!r) {
        cache_decr(io, c);
        return -1;
    }

    for (i = 0; i < nr; i++) {
        seq_t *s = cache_search(io, GT_Seq, r[i].rec);
        seq_t *sorig = s;
        int cend;
        int j, k, slen;

        if (NULL == s) {
            free(r);
            cache_decr(io, c);
            return -1;
        }

        /* Work on a complemented private copy when orientations differ */
        if ((s->len < 0) ^ r[i].comp) {
            s = dup_seq(s);
            if (NULL == s) {
                free(r);
                cache_decr(io, c);
                return -1;
            }
            complement_seq_t(s);
        }

        cend = r[i].start + s->right-1;

        /* Does cutoff extend to contig end, if so does it match cons? */
        if (cend < end) {
            int mis = 0, len = 0;

            if (end - cend >= CSZ) {
                /*
                fprintf(stderr,"Skipping #%"PRIrec" due to length of cutoff\n",
                        r[i].rec);
                */
                if (sorig != s)
                    free(s);
                r[i].rec = 0; /* Mark for removal */
                continue;
            }

            for (k = s->right, j = cend+1; j <= end; j++, k++) {
                if (s->seq[k] != cons[j-(end-(CSZ-1))])
                    mis++;
            }

            len = end - cend; /* > 0 because cend < end */
            if (100*mis/len > 5) {
                /*
                fprintf(stderr, "Skipping #%"PRIrec" due to high disagreement "
                        "with consensus.\n", r[i].rec);
                */
                if (sorig != s)
                    free(s);
                r[i].rec = 0;
                continue;
            }
        }

        /* So we got here, let's accumulate extension stats */
        slen = ABS(s->len);
        for (k = 0, j = end+1 - r[i].start; j < slen && k < ESZ; j++, k++) {
            int b;

            if (s->seq[j] == 'N')
                continue;

            /*
             * The value range of dna_lookup[] is not visible here; ignore
             * anything outside 0..3 rather than index past the 4-wide row.
             */
            b = dna_lookup[(uint8_t) s->seq[j]];
            if (b < 0 || b > 3)
                continue;

            freqs[k][b]++;
            depth[k]++;
        }

        if (sorig != s)
            free(s);
    }

    /* Call the new consensus and find the best scoring extension length */
    score = best_score = 0;
    best_pos = 0;
    for (i = 0; i < ESZ; i++) {
        int call = 0, best = 0, others = 0, j;
        double dd;

        if (depth[i] < min_depth)
            break;

        /*
         * With min_depth <= 0 an empty column would otherwise leave 'call'
         * unset and divide by zero; no data means no further extension.
         */
        if (depth[i] == 0)
            break;

        for (j = 0; j < 4; j++) {
            if (best < freqs[i][j]) {
                best = freqs[i][j];
                call = j;
            }
        }
        new_cons[i] = "ACGT"[call];

        for (j = 0; j < 4; j++) {
            if (j != call)
                others += freqs[i][j];
        }

        /* Reward agreement with the call, penalise everything else */
        dd = (double)depth[i];
        score += freqs[i][call] / dd;
        score -= others / dd;

        if (best_score <= score) {
            best_score = score;
            best_pos = i+1;
        }
    }

    /* Extend */
    nseq = 0;
    if (best_pos > 0) {
        int furthest_left = end;

        for (i = 0; i < nr; i++) {
            seq_t *s;
            int r_pos;
            int rscore;

            if (r[i].rec == 0)
                continue; /* rejected above */

            s = cache_search(io, GT_Seq, r[i].rec);
            if (s)
                s = cache_rw(io, s);
            if (NULL == s) {
                /*
                 * Cannot edit this read. Carry on so that reads already
                 * re-clipped still get their consensus invalidated, but
                 * report the failure to the caller.
                 */
                ret = -1;
                continue;
            }

            if (furthest_left > r[i].start)
                furthest_left = r[i].start;

            /*
             * end + best_pos is the furthest right we can go, but this
             * specific read may not be justified in reaching that far
             * if it has too many disagreements.
             *
             * NOTE(review): in both branches r_pos starts at 0, so a read
             * whose running score never reaches 0 or above is clipped to
             * right = 0 / left = 2. Confirm that is the intended fallback.
             */
            if ((s->len > 0) ^ r[i].comp) {
                int best_r = 0, j, k;
                int len = ABS(s->len);

                r_pos = 0;
                rscore = 0;
                for (k = end - r[i].start + 1, j = 0;
                     j < best_pos && k < len;
                     j++, k++) {
                    if (new_cons[j] == toupper((unsigned char)s->seq[k])) {
                        rscore += match_score;
                        if (best_r <= rscore) {
                            best_r = rscore;
                            r_pos = k+1;
                        }
                    } else {
                        rscore += mismatch_score;
                    }
                }

                if (s->right != r_pos) {
                    s->right = r_pos;
                    nseq++;
                }
            } else {
                int best_r = 0, j, k;

                r_pos = 0;
                rscore = 0;
                /* Stored sequence is reversed: walk it backwards */
                for (k = r[i].end - end - 1, j = 0;
                     j < best_pos && k >= 0;
                     j++, k--) {
                    char b = complement_base(s->seq[k]);

                    if (new_cons[j] == b) {
                        rscore += match_score;
                        if (best_r <= rscore) {
                            best_r = rscore;
                            r_pos = k-1;
                        }
                    } else {
                        rscore += mismatch_score;
                    }
                }

                if (s->left != r_pos+2) {
                    s->left = r_pos+2;
                    nseq++;
                }
            }
        }

        vmessage(" Extended by %d, adjusting %d sequence clip%s\n",
                 best_pos, nseq, nseq == 1 ? "" : "s");

        bin_invalidate_consensus(io, crec, furthest_left, end + best_pos);
    } else {
        vmessage(" Unable to extend contig\n");
    }

    free(r);

    cache_decr(io, c);
    cache_flush(io);

    return ret;
}