/*
 * Debugging aid: iterates over the whole of contig 'cnum' and prints one
 * line per record returned (strand, name, start..end and the bases), then
 * terminates the process.
 *
 * io    Database handle passed to the iterator and to get_seq().
 * cnum  Contig to iterate over.
 * xpos  Unused.
 *
 * Never returns: calls exit(0) once the dump is complete, or exit(1) if
 * the contig iterator could not be created.
 */
static void test_mode3(GapIO *io, int cnum, int xpos) {
    rangec_t *r;
    contig_iterator *ci;

    (void)xpos;

    ci = contig_iter_new(io, cnum, 0, CITER_FIRST, CITER_CSTART, CITER_CEND);
    if (!ci) {
        fprintf(stderr, "test_mode3: failed to create contig iterator\n");
        exit(1);
    }

    while ((r = contig_iter_next(io, ci)) != NULL) {
        seq_t *s = get_seq(io, r->rec);
        char name[256];

        /* Skip records whose sequence could not be fetched */
        if (!s)
            continue;

        /* Bounded copy: name_len may exceed the local buffer */
        snprintf(name, sizeof(name), "%.*s", s->name_len, s->name);

        printf("%c%-22s\t%8d..%-8d\t%.*s\n",
               "+-"[s->len<0], name, r->start, r->end,
               ABS(s->len), s->seq);
    }

    contig_iter_del(ci);
    exit(0);
}
/*
 * Attempt to find edits. It's not 100% reliable, but works for most cases.
 * We look for lowercase bases and confidence 100 and 0 (if the base is not
 * 'N', '-' or '*').
 * We cannot find deleted bases though.
 *
 * xx      Editor view; supplies the io handle, contig and cursor position.
 * dir     Non-zero searches forwards from cursor_apos+1, zero searches
 *         backwards from cursor_apos-1.
 * strand  Unused.
 * value   Unused.
 *
 * On success the editor cursor is moved to the nearest edit found.
 *
 * Returns 0 if an edit was found,
 *        -1 if not (or if the iterator could not be created).
 */
int edview_search_edit(edview *xx, int dir, int strand, char *value) {
    int start, end;
    contig_iterator *iter;
    rangec_t *(*ifunc)(GapIO *io, contig_iterator *ci);
    rangec_t *r;
    int best_pos, found = 0;
    int fpos = 0;
    tg_rec fseq = 0;

    (void)strand;
    (void)value;

    if (dir) {
        start = xx->cursor_apos + 1;
        end   = CITER_CEND;
        iter  = contig_iter_new(xx->io, xx->cnum, 1,
                                CITER_FIRST | CITER_ISTART,
                                start, end);
        ifunc = contig_iter_next;
        best_pos = INT_MAX;
    } else {
        start = CITER_CSTART;
        end   = xx->cursor_apos -1;
        iter  = contig_iter_new(xx->io, xx->cnum, 1,
                                CITER_LAST | CITER_IEND,
                                start, end);
        ifunc = contig_iter_prev;
        best_pos = INT_MIN;
    }

    if (!iter)
        return -1;

    while ((r = ifunc(xx->io, iter))) {
        seq_t *s, *sorig;
        char *seq, *qual;
        int seq_len, off = 0, i;

        /* Once a hit exists, stop when this range lies beyond it */
        if (found && dir  && r->start > best_pos)
            break;
        if (found && !dir && r->end < best_pos)
            break;

        if (NULL == (s = sorig = cache_search(xx->io, GT_Seq, r->rec)))
            break;

        /* Work on a complemented private copy when orientations differ */
        if (r->comp ^ (s->len < 0)) {
            s = dup_seq(s);
            if (!s)
                break;
            complement_seq_t(s);
        }

        seq     = s->seq;
        qual    = s->conf;
        seq_len = ABS(s->len);

        /* Skip the portion of the sequence before the search start */
        if (r->start < start) {
            off      = start - r->start;
            seq     += off;
            qual    += off;
            seq_len -= off;
        }

        for (i = 0; i < seq_len; i++) {
            /* Cast: <ctype.h> functions need an unsigned char value */
            if (islower((unsigned char)seq[i]) ||
                qual[i] == 100 ||
                (qual[i] == 0 &&
                 seq[i] != 'N' && seq[i] != '-' && seq[i] != '*')) {
                int pos = r->start + i + off;

                if (dir) {
                    if (best_pos > pos && pos > xx->cursor_apos) {
                        found = 1;
                        best_pos = pos;
                        fpos = i + off;
                        fseq = r->rec;
                    }
                    /* Forwards: the first edit in this read is the nearest */
                    break;
                } else {
                    /* Backwards: keep scanning for the right-most edit */
                    if (best_pos < pos && pos < xx->cursor_apos) {
                        found = 1;
                        best_pos = pos;
                        fpos = i + off;
                        fseq = r->rec;
                    }
                }
            }
        }

        if (s != sorig)
            free(s);
    }

    if (found) {
        edSetCursorPos(xx, fseq == xx->cnum ? GT_Contig : GT_Seq,
                       fseq, fpos, 1);
    }

    contig_iter_del(iter);

    return found ? 0 : -1;
}
/*
 * Searches the sequences of the edited contig for a (possibly inexact)
 * match to a pattern and moves the editor cursor to the nearest hit.
 *
 * xx      Editor view; supplies the io handle, contig and cursor position.
 * dir     Non-zero searches forwards from cursor_apos+1, zero searches
 *         backwards from cursor_apos-1.
 * strand  '+' to search with the pattern as given, '-' to search with its
 *         complement, '=' for both.
 * value   Search string, modified in place (truncated at the first '#'
 *         and depadded). It optionally includes two extra params
 *         separated by #. Ie:
 *             <string>#<N.mismatches>#<where>
 *         <where> is 1 for readings, 2 for consensus, 3 for both; it is
 *         parsed but not otherwise used by this function.
 *
 * Returns 0 if a match was found,
 *        -1 if not, or on failure (iterator creation, memory allocation).
 */
int edview_search_sequence(edview *xx, int dir, int strand, char *value) {
    int mismatches = 0; /* exact match */
    int where = 2;      /* consensus */
    char *p;
    int start, end;
    int patlen;
    char *uppert = NULL, *upperb = NULL;
    int found = 0;
    tg_rec fseq = 0;
    int fpos = 0, i;
    contig_iterator *iter;
    rangec_t *(*ifunc)(GapIO *io, contig_iterator *ci);
    rangec_t *r;
    int best_pos;

    if (dir) {
        start = xx->cursor_apos + 1;
        end   = CITER_CEND;
        iter  = contig_iter_new(xx->io, xx->cnum, 1,
                                CITER_FIRST | CITER_ISTART,
                                start, end);
        ifunc = contig_iter_next;
        best_pos = INT_MAX;
    } else {
        start = CITER_CSTART;
        end   = xx->cursor_apos -1;
        iter  = contig_iter_new(xx->io, xx->cnum, 1,
                                CITER_LAST | CITER_IEND,
                                start, end);
        ifunc = contig_iter_prev;
        best_pos = INT_MIN;
    }

    if (!iter)
        return -1;

    /* Split off the optional #<N.mismatches>#<where> suffix */
    if ((p = strchr(value, '#')) != NULL) {
        mismatches = atoi(p+1);
        *p = 0;
        if ((p = strchr(p+1, '#')) != NULL)
            where = atoi(p+1);
    }
    (void)where;

    /* uppercase search string, remove pads, and store fwd/rev copies */
    patlen = strlen(value);
    depad_seq(value, &patlen, NULL);

    uppert = (char *)xmalloc(patlen + 1);
    upperb = (char *)xmalloc(patlen + 1);
    if (!uppert || !upperb)
        goto done; /* found == 0, so this reports failure (-1) */

    uppert[patlen] = upperb[patlen] = 0;
    for (i = patlen-1; i >= 0; i--) {
        /* Cast: <ctype.h> functions need an unsigned char value */
        upperb[i] = uppert[i] = toupper((unsigned char)value[i]);
    }
    complement_seq(upperb, patlen);

    while ((r = ifunc(xx->io, iter))) {
        seq_t *s, *sorig;
        char *ind, *indt = NULL, *indb = NULL, *seq;
        int seq_len, off = 0;

        /* Once a hit exists, stop when this range lies beyond it */
        if (found && dir  && r->start > best_pos)
            break;
        if (found && !dir && r->end < best_pos)
            break;

        if (NULL == (s = sorig = cache_search(xx->io, GT_Seq, r->rec)))
            break;

        /* Work on a complemented private copy when orientations differ */
        if (r->comp ^ (s->len < 0)) {
            s = dup_seq(s);
            if (!s)
                break;
            complement_seq_t(s);
        }

        seq     = s->seq;
        seq_len = ABS(s->len);

        /* Skip the portion of the sequence before the search start */
        if (r->start < start) {
            off      = start - r->start;
            seq     += off;
            seq_len -= off;
        }

        /* Trim the tail so that matches cannot start beyond 'end' */
        if (r->end - (patlen-1) > end)
            seq_len -= r->end - (patlen-1) - end;

        if (dir) {
            if (strand == '+' || strand == '=')
                indt = pstrnstr_inexact(seq, seq_len, uppert, patlen,
                                        mismatches, NULL);
            if (strand == '-' || strand == '=')
                indb = pstrnstr_inexact(seq, seq_len, upperb, patlen,
                                        mismatches, NULL);
        } else {
            if (strand == '+' || strand == '=')
                indt = prstrnstr_inexact(seq, seq_len, uppert, patlen,
                                         mismatches, NULL);
            if (strand == '-' || strand == '=')
                indb = prstrnstr_inexact(seq, seq_len, upperb, patlen,
                                         mismatches, NULL);
        }

        /*
         * With hits on both strands keep the one nearest the cursor:
         * the left-most going forwards, the right-most going backwards.
         */
        if (indt && indb) {
            if (dir)
                ind = indt < indb ? indt : indb;
            else
                ind = indt > indb ? indt : indb;
        } else if (indt) {
            ind = indt;
        } else if (indb) {
            ind = indb;
        } else {
            ind = NULL;
        }

        if (ind) {
            int pos = r->start + ind - seq + off;

            if (dir) {
                if (best_pos > pos) {
                    found = 1;
                    best_pos = pos;
                    fpos = ind - s->seq;
                    fseq = r->rec;
                }
            } else {
                if (best_pos < pos) {
                    found = 1;
                    best_pos = pos;
                    fpos = ind - s->seq;
                    fseq = r->rec;
                }
            }
        }

        if (s != sorig)
            free(s);
    }

    if (found) {
        edSetCursorPos(xx, fseq == xx->cnum ? GT_Contig : GT_Seq,
                       fseq, fpos, 1);
    }

 done:
    free(uppert);
    free(upperb);
    contig_iter_del(iter);

    return found ? 0 : -1;
}
/*
 * Scans through one or more contigs checking each reading for correct
 * assembly. This is simply a check for misaligned data, not looking into
 * cutoff data. (The gap4 method did this, but it hasn't yet been implemented
 * in gap5).
 *
 * io           Database handle.
 * num_contigs  Number of entries in 'contigs'.
 * contigs      Contig record plus start/end range for each contig to check.
 * winsize      Passed through to check_uassembly_single().
 * maxperc      Passed through to check_uassembly_single().
 * ignore_N     Passed through to check_uassembly_single().
 *
 * Every reading for which check_uassembly_single() returns a positive
 * value is recorded (score stored as that value * 100) and the collected
 * results are handed to check_assembly_plot().
 *
 * Returns -1 for failure, 0 for success.
 */
int check_assembly(GapIO *io, int num_contigs, contig_list_t *contigs,
                   int winsize, float maxperc, int ignore_N) {
    int i, sc, count = 0, allocated = 0;
    int ret = -1;
    char *con = NULL;
    contig_iterator *ci = NULL;
    tg_rec *reads = NULL, *conts = NULL;
    int *score = NULL, *length = NULL, *pos = NULL;

    for (i = 0; i < num_contigs; i++) {
        tg_rec crec = contigs[i].contig;
        rangec_t *r;
        int start = contigs[i].start, end = contigs[i].end;

        if (NULL == (con = (char *)xmalloc(end-start+1)))
            goto cleanup;

        calculate_consensus_simple(io, crec, start, end, con, NULL);

        ci = contig_iter_new(io, crec, 0, CITER_FIRST, start, end);
        if (!ci)
            goto cleanup;

        while (NULL != (r = contig_iter_next(io, ci))) {
            UpdateTextOutput();

            /* 'con - start' lets the callee index by contig position */
            sc = check_uassembly_single(io, con - start, crec, r,
                                        maxperc, winsize, ignore_N);

            if (count >= allocated) {
                int new_alloc = allocated ? allocated * 2 : 256;
                void *tmp;

                /*
                 * Grow via a temporary so that a failed xrealloc does not
                 * lose the pointer to the existing block.
                 */
                if (!(tmp = xrealloc(reads, new_alloc * sizeof(*reads))))
                    goto cleanup;
                reads = tmp;

                if (!(tmp = xrealloc(conts, new_alloc * sizeof(*conts))))
                    goto cleanup;
                conts = tmp;

                if (!(tmp = xrealloc(score, new_alloc * sizeof(*score))))
                    goto cleanup;
                score = tmp;

                if (!(tmp = xrealloc(length, new_alloc * sizeof(*length))))
                    goto cleanup;
                length = tmp;

                if (!(tmp = xrealloc(pos, new_alloc * sizeof(*pos))))
                    goto cleanup;
                pos = tmp;

                allocated = new_alloc;
            }

            if (sc > 0) {
                reads[count]   = r->rec;
                score[count]   = sc * 100;
                pos[count]     = r->start;
                length[count]  = r->end - r->start+1;
                conts[count++] = crec;
            }
        }

        contig_iter_del(ci);
        ci = NULL;

        xfree(con);
        con = NULL;
    }

    if (-1 == check_assembly_plot(io, reads, conts, score, pos, length,
                                  count))
        goto cleanup;

    ret = 0;

 cleanup:
    /* Shared exit: releases whatever is still held on any path */
    if (ci)
        contig_iter_del(ci);
    if (con)
        xfree(con);
    if (reads)
        xfree(reads);
    if (conts)
        xfree(conts);
    if (pos)
        xfree(pos);
    if (length)
        xfree(length);
    if (score)
        xfree(score);

    return ret;
}
/**
 * Builds and returns MALIGN from a Gap5 IO handle for the contig 'cnum'.
 *
 * The requested start..end range is first widened using the sequence
 * found at each end of the range, then every sequence in the widened
 * range contributes its clipped bases ('.' replaced by 'N') to a CONTIGL
 * list which is converted with contigl_to_malign().
 *
 * io     Database handle.
 * cnum   Contig record.
 * start  First contig position of interest.
 * end    Last contig position of interest.
 *
 * Returns the result of contigl_to_malign() on success,
 *         NULL on failure (iterator, cache lookup or memory allocation).
 */
MALIGN *build_malign(GapIO *io, tg_rec cnum, int start, int end) {
    CONTIGL *contig, *first_contig = NULL, *last_contig = NULL;
    int i, j;
    contig_iterator *citer;
    rangec_t *r;

    /* Expand start and end to the range covered by seqs overlapping
     * start .. end
     */
    {
        seq_t *s;

        citer = contig_iter_new(io, cnum, 0,
                                CITER_FIRST | CITER_ICLIPPEDSTART,
                                start, start);
        if (!citer)
            return NULL;

        r = contig_iter_next(io, citer);
        if (r) {
            s = cache_search(io, GT_Seq, r->rec);
            if (!s) {
                contig_iter_del(citer);
                return NULL;
            }

            start = ((s->len < 0) ^ r->comp)
                ? r->end - s->right - 2
                : r->start + s->left - 2;
        }

        contig_iter_del(citer);
    }

    {
        seq_t *s;

        citer = contig_iter_new(io, cnum, 0,
                                CITER_LAST | CITER_ICLIPPEDEND,
                                end, end);
        if (!citer)
            return NULL;

        r = contig_iter_next(io, citer);
        if (r) {
            s = cache_search(io, GT_Seq, r->rec);
            if (!s) {
                contig_iter_del(citer);
                return NULL;
            }

            end = ((s->len < 0) ^ r->comp)
                ? r->end - s->left + 2
                : r->start + s->right + 2;
        }

        contig_iter_del(citer);
    }

    /* Generate contigl linked list */
    citer = contig_iter_new(io, cnum, 0, CITER_FIRST, start, end);
    if (!citer)
        return NULL;

    while ((r = contig_iter_next(io, citer))) {
        seq_t *s, *sorig;
        char *seq;
        int len;

        assert((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISSEQ);

        sorig = s = cache_search(io, GT_Seq, r->rec);
        if (!s)
            goto fail;

        /* Check for out-of-bounds clip points. It shouldn't happen, but
         * gap5 databases have been seen with this problem, and we don't
         * want to crash if there are any.
         *
         * NOTE(review): these two writes go to the object returned by
         * cache_search() without a cache_rw() call, unlike the zero
         * length fix below - confirm that this is intentional.
         */
        if (s->left < 1)
            s->left = 1;
        if (s->right > ABS(s->len))
            s->right = ABS(s->len);

        /* Fix reads of zero length */
        if (s->right < s->left) {
            sorig = s = cache_rw(io, s);
            if (!s)
                goto fail;

            s->right = s->left;
            if (s->right > ABS(s->len))
                s->left = s->right = ABS(s->len);
        }

        /* Work on a complemented private copy when orientations differ */
        if ((s->len < 0) ^ r->comp) {
            s = dup_seq(s);
            if (!s)
                goto fail;
            complement_seq_t(s);
        }

        len = s->right - s->left + 1;
        if (NULL == (seq = malloc(len+1))) {
            if (s != sorig)
                free(s);
            goto fail;
        }

        for (j = 0, i = s->left-1; i < s->right; i++, j++) {
            /* Protect against the sequence containing "."; our pad sym */
            if (s->seq[i] == '.')
                seq[j] = 'N';
            else
                seq[j] = s->seq[i];
        }
        seq[j] = 0;

        /* Only create the list node once the sequence data is ready */
        contig = create_contig_link();
        if (!contig) {
            free(seq);
            if (s != sorig)
                free(s);
            goto fail;
        }
        contig->id   = r->rec;
        contig->mseg = create_mseg();
        if (!contig->mseg) {
            /* NOTE(review): 'contig' itself is not released here as no
             * destructor for it is visible in this file. */
            free(seq);
            if (s != sorig)
                free(s);
            goto fail;
        }

        init_mseg(contig->mseg, seq, len, r->start-1 + s->left-1);
        contig->mseg->comp = (s != sorig);

        if (last_contig) {
            last_contig->next = contig;
        } else {
            first_contig = contig;
        }
        last_contig = contig;

        if (s != sorig)
            free(s);
    }
    contig_iter_del(citer);

    /* for 454 data -6 to -10 seem to work fine */
    return contigl_to_malign(first_contig, -7, -7);

 fail:
    /*
     * NOTE(review): nodes already linked from first_contig are not freed
     * here because no CONTIGL destructor is visible in this file.
     */
    contig_iter_del(citer);
    return NULL;
}
/*
 * find matches between user entered sequence string and contig list with
 * a minimum match of mis_fmatch percent.
 *
 * Each contig is searched first in its consensus (cons_array[i]) and then,
 * unless consensus_only is set, in every sequence overlapping the
 * requested range. The whole search is performed twice: once with 'string'
 * as given and once after complementing it in place, so 'string' is left
 * complemented when the second pass has been started.
 *
 * Hits are written to the parallel output arrays, and each accepted hit is
 * also reported via list_alignment(). For hits on the complemented string
 * the contig number stored in c2[] is negated.
 *
 * cutoff_data non-zero searches the full sequence, otherwise only the
 * portion between the left and right clip points is searched.
 *
 * Returns the number of matches found, or -1 on failure.
 */
int StringMatch(GapIO *io,                   /* in  */
                int num_contigs,             /* in  */
                contig_list_t *contig_array, /* in  */
                char **cons_array,           /* in  */
                char *string,                /* in  */
                float mis_fmatch,            /* in  */
                int *pos1,                   /* out */
                int *pos2,                   /* out */
                int *score,                  /* out */
                int *length,                 /* out */
                tg_rec *c1,                  /* out */
                tg_rec *c2,                  /* out */
                int max_matches,             /* in  */
                int consensus_only,          /* in  */
                int cutoff_data)             /* in  */
{
    int n_matches = 0;
    int i, j, k, c;
    int mis_match;
    int seq_len;
    int orig;
    int res, too_many = 0;
    char *cons_match;
    char title[1024];
    char name1[32]; /* large enough for any 64-bit record number */
    int max_imatches = max_matches;
    size_t stringlen = strlen(string);

    if (NULL == (cons_match = (char *)xmalloc(stringlen + 1)))
        return -1;

    /* convert percentage mis-matches into number of mis matches */
    mis_match = stringlen - (ceil(stringlen * mis_fmatch / 100.));

    /* complement string */
    for (c = 0; c < 2; c++) {
        if (c == 1)
            complement_seq(string, stringlen);

        for (i = 0; i < num_contigs; i++) {
            rangec_t *r;
            contig_iterator *ci = NULL;

            /*
             * Consensus first time through loop.
             * Sequences in that contig on subsequent loops.
             */
            for (r = (rangec_t *)1; r; r = contig_iter_next(io, ci)) {
                char *seq;
                seq_t *s = NULL;

                if (ci == 0) {
                    /* First time through is consensus */
                    seq = cons_array[i];
                    seq_len = strlen(cons_array[i]);
                } else {
                    /* Subsequent times r is valid (not 1) and a sequence */
                    if ((r->flags & GRANGE_FLAG_ISMASK) != GRANGE_FLAG_ISSEQ)
                        continue;

                    s = cache_search(io, GT_Seq, r->rec);
                    if (!s) {
                        contig_iter_del(ci);
                        xfree(cons_match);
                        return -1;
                    }

                    if (cutoff_data) {
                        seq = s->seq;
                        seq_len = ABS(s->len);
                    } else {
                        seq = &s->seq[s->left-1];
                        seq_len = s->right - s->left+1;
                    }
                }

                orig = n_matches;
                res = inexact_pad_match(seq, seq_len, string, stringlen,
                                        mis_match,
                                        &pos1[n_matches],
                                        &score[n_matches],
                                        max_imatches);
                if (res == -2) {
                    /* Release everything we hold before failing */
                    if (ci)
                        contig_iter_del(ci);
                    xfree(cons_match);
                    return -1;
                }

                if (res < 0) {
                    verror(ERR_WARN, "find_oligos", "Too many matches");
                    too_many = 1;
                    res = max_imatches;
                }

                n_matches += res;
                max_imatches -= res;

                for (j = k = orig; j < n_matches; j++) {
                    int padded_len;

                    c1[j] = contig_array[i].contig;
                    if (c == 0) {
                        c2[j] = contig_array[i].contig;
                    } else {
                        c2[j] = -contig_array[i].contig;
                    }

                    /*
                     * remove pads such that the final length of cons_match
                     * is of length length[j]
                     */
                    padded_len = depad_seq_len(cons_match,
                                               &seq[pos1[j]-1],
                                               stringlen);

                    if (ci) {
                        if (cutoff_data) {
                            pos1[j] += r->start-1;
                        } else {
                            pos1[j] += r->start-1 + s->left-1;
                        }
                    }

                    length[j] = padded_len;

                    /* Adjust for searching in a sub-range of the contig */
                    if (!ci)
                        pos1[j] += contig_array[i].start-1;

                    pos2[j] = pos1[j];

                    /*
                     * The searching above may find hits outside of
                     * contig_array[i].start and contig_array[i].end.
                     *
                     * This happens if we search sequences and the
                     * sequence overlaps the desired range, but has a
                     * hit outside of the desired range.
                     *
                     * Rather than complicate the above code, we post
                     * filter these false hits here.
                     */
                    if (pos1[j] >= contig_array[i].start &&
                        pos1[j] <= contig_array[i].end) {
                        snprintf(name1, sizeof(name1), "%"PRIrec"",
                                 io_clnbr(io, ABS(c1[j])));
                        snprintf(title, sizeof(title),
                                 "Match found with contig #%"PRIrec
                                 " read #%"PRIrec
                                 " in the %c sense",
                                 contig_array[i].contig,
                                 ci ? r->rec : 0,
                                 c2[j] > 0 ? '+' : '-');

                        list_alignment(string, cons_match, "oligo", name1,
                                       1, pos1[j], title);

                        /*
                         * Copy it from *[j] to *[k].
                         * This code REALLY needs to be using structs!
                         * This is foul.
                         */
                        pos1  [k] = pos1  [j];
                        pos2  [k] = pos2  [j];
                        c1    [k] = c1    [j];
                        c2    [k] = c2    [j];
                        length[k] = length[j];
                        score [k] = score [j];
                        k++;
                    }
                }

                /* Give back the slots used by the filtered-out hits */
                n_matches    -= j-k;
                max_imatches += j-k;

                if (too_many)
                    break;

                if (consensus_only)
                    break;

                if (!ci) {
                    ci = contig_iter_new(io, contig_array[i].contig,
                                         0 /*autoextend */,
                                         CITER_FIRST,
                                         contig_array[i].start,
                                         contig_array[i].end);
                    if (!ci)
                        break;
                }
            }

            /* One iterator per contig: release it before moving on */
            if (ci)
                contig_iter_del(ci);

            if (too_many)
                break;
        }

        if (too_many)
            break;
    }

    xfree(cons_match);

    vmessage("Number of matches found %d \n", n_matches);
    return n_matches;
}