/*
 * Compute the visible end position of a contig. This isn't just the extents
 * of start_used / end_used in the bins as this can include invisible
 * data such as cached consensus sequences.
 *
 * The contig is walked backwards from its last item and the end coordinate
 * of the first item that is not flagged GRANGE_FLAG_ISCONS is returned.
 *
 * Returns the end coordinate of the last visible item;
 *         the contig's recorded end if no iterator could be created;
 *         0 if no visible item exists or the contig could not be loaded.
 */
int contig_visible_end(GapIO *io, tg_rec crec) {
    rangec_t *r;
    contig_iterator *ci;
    int v = 0;

    ci = contig_iter_new_by_type(io, crec, 1, CITER_LAST | CITER_IEND,
                                 CITER_CSTART, CITER_CEND,
                                 GRANGE_FLAG_ISANY);
    if (!ci) {
        /* No iterator: fall back on the extent stored in the contig. */
        contig_t *c = cache_search(io, GT_Contig, crec);
        return c ? c->end : 0;
    }

    while ((r = contig_iter_prev(io, ci))) {
        /* Cached consensus ranges are not visible data; skip them. */
        if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISCONS)
            continue;

        v = r->end;
        break;
    }

    contig_iter_del(ci);
    return v;
}
/*
 * Moves the editor cursor to the next (dir != 0) or previous (dir == 0)
 * annotation whose type, as held in the range's mqual field, equals
 * str2type(value).
 *
 * Tags flagged GRANGE_FLAG_TAG_SEQ place the cursor within the owning
 * sequence (pair_rec); all others place it on the contig.
 *
 * 'strand' is accepted for interface compatibility but is not used.
 *
 * Returns 0 if a tag was found and the cursor moved;
 *        -1 if nothing was found or the contig could not be scanned.
 */
int edview_search_tag_type(edview *xx, int dir, int strand, char *value) {
    contig_iterator *iter;
    int start, end;
    rangec_t *r;
    rangec_t *(*ifunc)(GapIO *io, contig_iterator *ci);
    int type = str2type(value);
    contig_t *c = cache_search(xx->io, GT_Contig, xx->cnum);

    if (!c)
        return -1;

    if (dir) {
        start = xx->cursor_apos + 1;
        end   = c->end;
        ifunc = contig_iter_next;
    } else {
        start = c->start;
        end   = xx->cursor_apos - 1;
        ifunc = contig_iter_prev;
    }

    /* Pick the iterator origin with the same test that picked ifunc. */
    iter = contig_iter_new_by_type(xx->io, xx->cnum, 1,
                                   dir ? CITER_FIRST : CITER_LAST,
                                   start, end, GRANGE_FLAG_ISANNO);
    if (!iter)
        /* Can happen legitimately when we're already at the end of contig */
        return -1;

    while ((r = ifunc(xx->io, iter))) {
        /* Skip tags that merely overlap the range but start outside it. */
        if ((dir && r->start < start) || (!dir && r->start > end))
            continue;

        if (r->mqual == type)
            break;
    }

    if (!r) {
        contig_iter_del(iter);
        return -1;
    }

    if (r->flags & GRANGE_FLAG_TAG_SEQ) {
        int pos;

        /* Convert the contig coordinate to an offset within the sequence. */
        sequence_get_position(xx->io, r->pair_rec, NULL, &pos, NULL, NULL);
        pos = r->start - pos;
        edSetCursorPos(xx, GT_Seq, r->pair_rec, pos, 1);
    } else {
        edSetCursorPos(xx, GT_Contig, xx->cnum, r->start, 1);
    }

    contig_iter_del(iter);
    return 0;
}
/*
 * Adjusts annotations after a base has been removed.
 *
 * Every annotation in contig 'crec' overlapping start+pos..end whose
 * pair_rec equals 'srec' is removed from its bin and shortened by one base:
 * the end always moves left by one, and the start moves left by one only
 * when it lies to the right of start+pos. An annotation whose range becomes
 * empty is deallocated; otherwise it is re-added via bin_add_to_range() on
 * 'brec' and its bin record is updated if it landed in a different bin.
 *
 * NOTE(review): start/end look like the sequence's extent in contig
 * coordinates and pos the deleted offset within it - confirm with callers.
 *
 * Records that cannot be loaded are skipped rather than dereferenced.
 */
static void tag_shift_for_delete(GapIO *io, tg_rec crec, tg_rec srec,
                                 int start, int end, int pos, tg_rec brec) {
    contig_iterator *ci;
    rangec_t *r;
    contig_t *c = cache_search(io, GT_Contig, crec);

    if (!c)
        return;

    /* Hold a reference on the contig while its bins are modified. */
    cache_incr(io, c);

    ci = contig_iter_new_by_type(io, crec, 0, CITER_FIRST | CITER_ISTART,
                                 start + pos, end, GRANGE_FLAG_ISANNO);
    if (!ci) {
        cache_decr(io, c);
        return;
    }

    while ((r = contig_iter_next(io, ci))) {
        range_t r2, *r_out;
        anno_ele_t *a;
        bin_index_t *bin;

        /* Only tags attached to the edited sequence are affected. */
        if (r->pair_rec != srec)
            continue;

        bin_remove_item(io, &c, GT_AnnoEle, r->rec);

        r2.start    = (r->start > start + pos) ? r->start - 1 : r->start;
        r2.end      = r->end - 1;
        r2.mqual    = r->mqual;
        r2.rec      = r->rec;
        r2.pair_rec = r->pair_rec;
        r2.flags    = r->flags;

        if (r2.end < r2.start) {
            /* Tag entirely removed now, it must have been on a pad */
            a = cache_search(io, GT_AnnoEle, r->rec);
            if (a && (a = cache_rw(io, a)))
                cache_deallocate(io, a);
            continue;
        }

        bin = bin_add_to_range(io, &c, brec, &r2, &r_out, NULL, 0);
        if (!bin)
            continue;

        a = cache_search(io, GT_AnnoEle, r->rec);
        if (a && a->bin != bin->rec) {
            /* Annotation moved bins */
            if ((a = cache_rw(io, a)))
                a->bin = bin->rec;
        }
    }

    cache_decr(io, c);
    contig_iter_del(ci);
}
/*
 * Moves the editor cursor to the next (dir != 0) or previous (dir == 0)
 * item of type GRANGE_FLAG_ISREFPOS in the current contig.
 *
 * 'strand' and 'value' are accepted for interface compatibility but are
 * not used.
 *
 * Returns 0 if an item was found and the cursor moved;
 *        -1 if nothing was found or the contig could not be scanned.
 */
int edview_search_tag_indel(edview *xx, int dir, int strand, char *value) {
    contig_iterator *iter;
    int start, end;
    rangec_t *r;
    rangec_t *(*ifunc)(GapIO *io, contig_iterator *ci);
    contig_t *c = cache_search(xx->io, GT_Contig, xx->cnum);

    if (!c)
        return -1;

    if (dir) {
        start = xx->cursor_apos + 1;
        end   = c->end;
        ifunc = contig_iter_next;
    } else {
        start = c->start;
        end   = xx->cursor_apos - 1;
        ifunc = contig_iter_prev;
    }

    /* Pick the iterator origin with the same test that picked ifunc. */
    iter = contig_iter_new_by_type(xx->io, xx->cnum, 1,
                                   dir ? CITER_FIRST : CITER_LAST,
                                   start, end, GRANGE_FLAG_ISREFPOS);
    if (!iter)
        return -1;

    /* Take the first item that starts inside the search range. */
    while ((r = ifunc(xx->io, iter))) {
        if ((dir && r->start < start) || (!dir && r->start > end))
            continue;
        break;
    }

    if (!r) {
        contig_iter_del(iter);
        return -1;
    }

    edSetCursorPos(xx, GT_Contig, xx->cnum, r->start, 1);
    contig_iter_del(iter);
    return 0;
}
/*
 * Strips annotations from one contig.
 *
 * Each annotation's type is rendered as a 4 character string and looked up
 * in 'h'; matching annotations are removed from their bin. With h == NULL
 * every annotation is removed. When 'verbose' is set a line is reported
 * for each annotation removed.
 *
 * Returns 0 on success
 *        -1 on failure
 */
static int delete_tag_single_contig(GapIO *io, tg_rec crec,
                                    HashTable *h, int verbose) {
    contig_iterator *iter;
    rangec_t *tag;
    contig_t *contig;
    int status = 0;

    iter = contig_iter_new_by_type(io, crec, 1, CITER_FIRST,
                                   CITER_CSTART, CITER_CEND,
                                   GRANGE_FLAG_ISANNO);
    if (NULL == iter)
        return -1;

    contig = cache_search(io, GT_Contig, crec);
    if (NULL == contig) {
        contig_iter_del(iter);
        return -1;
    }
    cache_incr(io, contig);

    for (tag = contig_iter_next(io, iter);
         tag != NULL;
         tag = contig_iter_next(io, iter)) {
        char type_str[5];

        (void)type2str(tag->mqual, type_str);

        /* With a filter table, leave unlisted types alone. */
        if (h && !HashTableSearch(h, type_str, 4))
            continue;

        if (verbose)
            vmessage("Removing anno %s #%"PRIrec"\tContig %s\t%d..%d\n",
                     type_str, tag->rec, contig->name,
                     tag->start, tag->end);

        if (bin_remove_item(io, &contig, GT_AnnoEle, tag->rec)) {
            status = -1;
            break;
        }
        /* FIXME: Need to reclaim the GT_AnnoEle record itself */
    }

    contig_iter_del(iter);
    cache_decr(io, contig);

    return status;
}
/*
 * Debugging aid: prints one line per item returned by an iterator over the
 * whole of contig 'cnum' (strand, name, start..end and bases), then
 * terminates the process with exit(0).
 *
 * 'xpos' is not used.
 */
static void test_mode3(GapIO *io, int cnum, int xpos) {
    rangec_t *r;
    contig_iterator *ci;

    ci = contig_iter_new(io, cnum, 0, CITER_FIRST, CITER_CSTART, CITER_CEND);
    if (ci) {
        while ((r = contig_iter_next(io, ci))) {
            seq_t *s = get_seq(io, r->rec);
            char name[256];

            if (!s)
                continue;

            /* Bounded copy: name_len may exceed the local buffer. */
            snprintf(name, sizeof(name), "%.*s", s->name_len, s->name);
            printf("%c%-22s\t%8d..%-8d\t%.*s\n",
                   "+-"[s->len < 0], name, r->start, r->end,
                   ABS(s->len), s->seq);
        }
        contig_iter_del(ci);
    }

    exit(0);
}
/*
 * Attempt to find edits. It's not 100% reliable, but works for most cases.
 * We look for lowercase bases and confidence 100 and 0 (if not N, '-' or
 * '*'). We cannot find deleted bases though.
 *
 * Searches forwards from the cursor when dir != 0 and backwards when
 * dir == 0, moving the cursor to the closest edited base found.
 *
 * 'strand' and 'value' are accepted for interface compatibility but are
 * not used.
 *
 * Returns 0 if an edit was found and the cursor moved;
 *        -1 otherwise.
 */
int edview_search_edit(edview *xx, int dir, int strand, char *value) {
    int start, end;
    contig_iterator *iter;
    rangec_t *(*ifunc)(GapIO *io, contig_iterator *ci);
    rangec_t *r;
    int best_pos, found = 0;
    int fpos = 0;
    tg_rec fseq = 0;

    if (dir) {
        start = xx->cursor_apos + 1;
        end   = CITER_CEND;
        iter  = contig_iter_new(xx->io, xx->cnum, 1,
                                CITER_FIRST | CITER_ISTART, start, end);
        ifunc = contig_iter_next;
        best_pos = INT_MAX;
    } else {
        start = CITER_CSTART;
        end   = xx->cursor_apos - 1;
        iter  = contig_iter_new(xx->io, xx->cnum, 1,
                                CITER_LAST | CITER_IEND, start, end);
        ifunc = contig_iter_prev;
        best_pos = INT_MIN;
    }

    if (!iter)
        return -1;

    while ((r = ifunc(xx->io, iter))) {
        seq_t *s, *sorig;
        char *seq, *qual;
        int seq_len, off = 0, i;

        /* Stop once no later item can beat the best hit so far. */
        if (found && dir && r->start > best_pos)
            break;
        if (found && !dir && r->end < best_pos)
            break;

        if (NULL == (s = sorig = cache_search(xx->io, GT_Seq, r->rec)))
            break;

        /* Work on a complemented copy when the stored orientation differs. */
        if (r->comp ^ (s->len < 0)) {
            s = dup_seq(s);
            if (!s)
                break;
            complement_seq_t(s);
        }

        seq     = s->seq;
        qual    = s->conf;
        seq_len = ABS(s->len);

        /* Ignore the part of the read lying before the search start. */
        if (r->start < start) {
            off = start - r->start;
            seq     += off;
            qual    += off;
            seq_len -= off;
        }

        for (i = 0; i < seq_len; i++) {
            int pos;
            /* Cast: <ctype.h> functions need an unsigned char value. */
            int edited = islower((unsigned char)seq[i])
                || qual[i] == 100
                || (qual[i] == 0 && seq[i] != 'N' &&
                    seq[i] != '-' && seq[i] != '*');

            if (!edited)
                continue;

            pos = r->start + i + off;
            if (dir) {
                if (best_pos > pos && pos > xx->cursor_apos) {
                    found    = 1;
                    best_pos = pos;
                    fpos     = i + off;
                    fseq     = r->rec;
                }
                /* Forwards: the first edit in this read is the nearest. */
                break;
            } else {
                /* Backwards: keep scanning for the right-most candidate. */
                if (best_pos < pos && pos < xx->cursor_apos) {
                    found    = 1;
                    best_pos = pos;
                    fpos     = i + off;
                    fseq     = r->rec;
                }
            }
        }

        if (s != sorig)
            free(s);
    }

    if (found) {
        edSetCursorPos(xx, fseq == xx->cnum ? GT_Contig : GT_Seq,
                       fseq, fpos, 1);
    }

    contig_iter_del(iter);

    return found ? 0 : -1;
}
/*
 * Moves the editor cursor to the next (dir != 0) or previous (dir == 0)
 * annotation whose comment matches the regular expression 'value'.
 * If 'value' is NULL every annotation matches. Annotations without a
 * comment never match a non-NULL expression.
 *
 * Tags flagged GRANGE_FLAG_TAG_SEQ place the cursor within the owning
 * sequence (pair_rec); all others place it on the contig.
 *
 * 'strand' is accepted for interface compatibility but is not used.
 *
 * Returns 0 if an annotation was found and the cursor moved;
 *        -1 if nothing was found, the expression is invalid, or the contig
 *           could not be scanned.
 */
int edview_search_tag_anno(edview *xx, int dir, int strand, char *value) {
    contig_iterator *iter;
    int start, end;
    rangec_t *r;
    rangec_t *(*ifunc)(GapIO *io, contig_iterator *ci);
    char *r_exp = NULL;
    contig_t *c = cache_search(xx->io, GT_Contig, xx->cnum);

    /* Bail out before compiling anything that would need releasing. */
    if (!c)
        return -1;

    if (value) {
        if (NULL == (r_exp = REGCMP(xx->interp, value))) {
            verror(ERR_WARN, "Search by anno", "invalid regular expression");
            return -1;
        }
    }

    if (dir) {
        start = xx->cursor_apos + 1;
        end   = c->end;
        ifunc = contig_iter_next;
    } else {
        start = c->start;
        end   = xx->cursor_apos - 1;
        ifunc = contig_iter_prev;
    }

    /* Pick the iterator origin with the same test that picked ifunc. */
    iter = contig_iter_new_by_type(xx->io, xx->cnum, 1,
                                   dir ? CITER_FIRST : CITER_LAST,
                                   start, end, GRANGE_FLAG_ISANNO);
    if (!iter) {
        /* Release the compiled expression on this exit path too. */
        REGFREE(xx->interp, r_exp);
        return -1;
    }

    while ((r = ifunc(xx->io, iter))) {
        anno_ele_t *ae;

        /* Skip tags that merely overlap the range but start outside it. */
        if ((dir && r->start < start) || (!dir && r->start > end))
            continue;

        if (!r_exp)
            break; /* blank expr => match all */

        ae = cache_search(xx->io, GT_AnnoEle, r->rec);
        if (!ae || !ae->comment)
            continue;

        if (REGEX(xx->interp, ae->comment, r_exp))
            break;
    }

    REGFREE(xx->interp, r_exp);

    if (!r) {
        contig_iter_del(iter);
        return -1;
    }

    if (r->flags & GRANGE_FLAG_TAG_SEQ) {
        int pos;

        /* Convert the contig coordinate to an offset within the sequence. */
        sequence_get_position(xx->io, r->pair_rec, NULL, &pos, NULL, NULL);
        pos = r->start - pos;
        edSetCursorPos(xx, GT_Seq, r->pair_rec, pos, 1);
    } else {
        edSetCursorPos(xx, GT_Contig, xx->cnum, r->start, 1);
    }

    contig_iter_del(iter);
    return 0;
}
/*
 * Moves the editor cursor to a sequence selected by name.
 *
 * 'value' is either "#<num>", naming a sequence record directly, or a
 * sequence name. For a name, the hits returned by the name index are
 * examined in step with a positional scan of the contig in direction 'dir'
 * (forwards when non-zero); the positional scan accepts the first sequence
 * whose name starts with 'value'.
 *
 * 'strand' is accepted for interface compatibility but is not used.
 *
 * Returns 0 if a sequence was found and the cursor moved;
 *        -1 otherwise.
 */
int edview_search_name(edview *xx, int dir, int strand, char *value) {
    tg_rec rec, *rp, cnum = -1, best_rec;
    int best_pos, best_off;
    int nr = 0, i;
    rangec_t *(*ifunc)(GapIO *io, contig_iterator *ci);
    int start, end, cstart;
    contig_iterator *iter;
    contig_t *c;

    /* Check for #num where num is a sequence record in this contig */
    if (*value == '#') {
        char *endp;
        int64_t v = strtol64(value+1, &endp, 10);

        rec = v;
        if (*endp == '\0' && cache_exists(xx->io, GT_Seq, rec)) {
            sequence_get_clipped_position(xx->io, rec, &cnum, &start, NULL,
                                          &cstart, NULL, NULL);
            if (cnum == xx->cnum) {
                edSetCursorPos(xx, GT_Seq, rec, cstart - start, 1);
                return 0;
            }
        }
    }

    /* Find all hits matching this name */
    rp = sequence_index_query_all(xx->io, value, 1, &nr);

    /* Also get a position-based iterator */
    c = cache_search(xx->io, GT_Contig, xx->cnum);
    if (!c) {
        free(rp);
        return -1;
    }

    if (dir) {
        start = xx->cursor_apos + 1;
        end   = c->end;
        ifunc = contig_iter_next;
        best_pos = end + 1;
        best_off = 0;
    } else {
        start = c->start;
        end   = xx->cursor_apos - 1;
        ifunc = contig_iter_prev;
        best_pos = start - 1;
        best_off = 0;
    }

    iter = contig_iter_new_by_type(xx->io, xx->cnum, 1,
                                   dir == 1 ? CITER_FIRST : CITER_LAST,
                                   start-1, end+1, GRANGE_FLAG_ISSEQ);
    if (!iter) {
        /* Don't leak the name-index hit list on this exit path. */
        free(rp);
        return -1;
    }

    /*
     * The iterator also finds overlapping objects, not just ones beyond this
     * point. That's fine if we're on the consensus as we probably want to
     * jump to the first seq-name overlapping this point.
     *
     * However if we're on a sequence already, we want the first one
     * after or before that sequence. So we skip along iterator until we're
     * at the current record.
     */
    if (xx->cursor_type == GT_Seq) {
        rangec_t *r;

        while ((r = ifunc(xx->io, iter))) {
            if (r->rec == xx->cursor_rec)
                break;
        }
    }

    /* Alternate between the by-name and by-position scan */
    best_rec = -1;
    for (i = 0; rp && i < nr; i++) {
        int s_start, s_end;
        rangec_t *r;

        /*
         * From name index. The loop header advances i, so index with plain
         * rp[i]; a second increment here would skip every other hit.
         */
        rec = rp[i];
        sequence_get_clipped_position(xx->io, rec, &cnum, &s_start, &s_end,
                                      &cstart, NULL, NULL);
        if (cnum == xx->cnum) {
            if ((dir  && best_pos > cstart && cstart > xx->cursor_apos) ||
                (!dir && best_pos < cstart && cstart < xx->cursor_apos)) {
                best_pos = cstart;
                best_off = cstart - s_start;
                best_rec = rec;
            }
        }

        /* From iterator */
        if ((r = ifunc(xx->io, iter))) {
            seq_t *s;

            if (NULL == (s = cache_search(xx->io, GT_Seq, r->rec))) {
                /* No match */
                best_rec = -1;
                break;
            }

            if (strncmp(s->name, value, strlen(value)) == 0) {
                /* prefix match */
                puts("Found by pos iterator");
                best_rec = r->rec;
                break;
            }
        } else {
            /* End of contig - bail out early */
            best_rec = -1;
            break;
        }
    }

    contig_iter_del(iter);
    free(rp);

    if (best_rec != -1) {
        edSetCursorPos(xx, GT_Seq, best_rec, best_off, 1);
        return 0;
    }

    return -1;
}
/*
 * Searches sequences in the current contig for a base pattern and moves the
 * cursor to the closest match in direction 'dir' (forwards when non-zero).
 *
 * 'value' is "<string>[#<N.mismatches>[#<where>]]" and is modified in
 * place: it is cut at the first '#' and de-padded. <where> (1 for readings,
 * 2 for consensus, 3 for both) is parsed but not acted upon in this
 * function.
 *
 * 'strand' selects which orientation of the pattern is tried: '+' for the
 * pattern as given, '-' for its complement, '=' for both.
 *
 * Returns 0 if a match was found and the cursor moved;
 *        -1 if nothing matched or on failure (including out of memory).
 */
int edview_search_sequence(edview *xx, int dir, int strand, char *value) {
    int mismatches = 0; /* exact match */
    int where = 2;      /* consensus */
    char *p;
    int start, end;
    int patlen;
    char *uppert = NULL, *upperb = NULL;
    int found = 0;
    tg_rec fseq = 0;
    int fpos = 0, i;
    contig_iterator *iter;
    rangec_t *(*ifunc)(GapIO *io, contig_iterator *ci);
    rangec_t *r;
    int best_pos;

    if (dir) {
        start = xx->cursor_apos + 1;
        end   = CITER_CEND;
        iter  = contig_iter_new(xx->io, xx->cnum, 1,
                                CITER_FIRST | CITER_ISTART, start, end);
        ifunc = contig_iter_next;
        best_pos = INT_MAX;
    } else {
        start = CITER_CSTART;
        end   = xx->cursor_apos - 1;
        iter  = contig_iter_new(xx->io, xx->cnum, 1,
                                CITER_LAST | CITER_IEND, start, end);
        ifunc = contig_iter_prev;
        best_pos = INT_MIN;
    }

    if (!iter)
        return -1;

    /*
     * Parse value search string. It optionally includes two extra params
     * separated by #. Ie:
     *     <string>#<N.mismatches>#<where>.
     * <where> is 1 for readings, 2 for consensus, 3 for both.
     */
    if ((p = strchr(value, '#'))) {
        mismatches = atoi(p+1);
        *p = 0;
        if ((p = strchr(p+1, '#')))
            where = atoi(p+1);
    }
    (void)where; /* parsed for completeness; not used below */

    /* uppercase search string, remove pads, and store fwd/rev copies */
    patlen = strlen(value);
    depad_seq(value, &patlen, NULL);

    /* On allocation failure report an error, not 0 (which means "found"). */
    if (NULL == (uppert = (char *)xmalloc(patlen + 1)))
        goto done;
    if (NULL == (upperb = (char *)xmalloc(patlen + 1)))
        goto done;

    uppert[patlen] = upperb[patlen] = 0;
    for (i = patlen-1; i >= 0; i--) {
        /* Cast: <ctype.h> functions need an unsigned char value. */
        upperb[i] = uppert[i] = toupper((unsigned char)value[i]);
    }
    complement_seq(upperb, patlen);

    while ((r = ifunc(xx->io, iter))) {
        seq_t *s, *sorig;
        char *ind, *indt = NULL, *indb = NULL, *seq;
        int seq_len, off = 0;

        /* Stop once no later item can beat the best hit so far. */
        if (found && dir && r->start > best_pos)
            break;
        if (found && !dir && r->end < best_pos)
            break;

        if (NULL == (s = sorig = cache_search(xx->io, GT_Seq, r->rec)))
            break;

        /* Work on a complemented copy when the stored orientation differs. */
        if (r->comp ^ (s->len < 0)) {
            s = dup_seq(s);
            if (!s)
                break;
            complement_seq_t(s);
        }

        seq     = s->seq;
        seq_len = ABS(s->len);

        /* Trim the read to the part lying inside the search range. */
        if (r->start < start) {
            off = start - r->start;
            seq     += off;
            seq_len -= off;
        }
        if (r->end - (patlen-1) > end)
            seq_len -= r->end - (patlen-1) - end;

        if (dir) {
            if (strand == '+' || strand == '=')
                indt = pstrnstr_inexact(seq, seq_len, uppert, patlen,
                                        mismatches, NULL);
            if (strand == '-' || strand == '=')
                indb = pstrnstr_inexact(seq, seq_len, upperb, patlen,
                                        mismatches, NULL);
        } else {
            if (strand == '+' || strand == '=')
                indt = prstrnstr_inexact(seq, seq_len, uppert, patlen,
                                         mismatches, NULL);
            if (strand == '-' || strand == '=')
                indb = prstrnstr_inexact(seq, seq_len, upperb, patlen,
                                         mismatches, NULL);
        }

        /*
         * NOTE(review): the left-most of the two strand hits is taken in
         * both directions - confirm this is what a backwards search wants.
         */
        if (indt && indb)
            ind = MIN(indt, indb);
        else if (indt)
            ind = indt;
        else if (indb)
            ind = indb;
        else
            ind = NULL;

        if (ind) {
            /* Subtract the pointers first so no intermediate pointer
             * leaves the buffer. */
            int pos = r->start + (int)(ind - seq) + off;

            if (dir) {
                if (best_pos > pos) {
                    found    = 1;
                    best_pos = pos;
                    fpos     = (int)(ind - s->seq);
                    fseq     = r->rec;
                }
            } else {
                if (best_pos < pos) {
                    found    = 1;
                    best_pos = pos;
                    fpos     = (int)(ind - s->seq);
                    fseq     = r->rec;
                }
            }
        }

        if (s != sorig)
            free(s);
    }

    if (found) {
        edSetCursorPos(xx, fseq == xx->cnum ? GT_Contig : GT_Seq,
                       fseq, fpos, 1);
    }

 done:
    free(uppert);
    free(upperb);
    contig_iter_del(iter);

    return found ? 0 : -1;
}
/*
 * Scans through one or more contigs checking each reading for correct
 * assembly. This is simply a check for misaligned data, not looking into
 * cutoff data. (The gap4 method did this, but it hasn't yet been implemented
 * in gap5).
 *
 * For every contig range a consensus is computed and each item returned by
 * the iterator is scored against it with check_uassembly_single(). Items
 * with a positive score are collected and passed to check_assembly_plot().
 * A contig range for which no iterator can be created contributes no items.
 *
 * All temporary storage, including the per-contig consensus buffer and
 * iterator, is released on every exit path.
 *
 * Returns -1 for failure, 0 for success.
 */
int check_assembly(GapIO *io, int num_contigs, contig_list_t *contigs,
                   int winsize, float maxperc, int ignore_N) {
    int i, sc, count = 0, allocated = 0;
    int ret = -1;
    char *con = NULL;
    contig_iterator *ci = NULL;
    tg_rec *reads = NULL, *conts = NULL;
    int *score = NULL, *length = NULL, *pos = NULL;

    for (i = 0; i < num_contigs; i++) {
        tg_rec crec = contigs[i].contig;
        rangec_t *r;
        int start = contigs[i].start, end = contigs[i].end;

        if (NULL == (con = (char *)xmalloc(end-start+1)))
            goto cleanup;

        calculate_consensus_simple(io, crec, start, end, con, NULL);

        ci = contig_iter_new(io, crec, 0, CITER_FIRST, start, end);
        if (ci) {
            while (NULL != (r = contig_iter_next(io, ci))) {
                UpdateTextOutput();

                /* con - start lets the callee index by contig coordinate. */
                sc = check_uassembly_single(io, con - start, crec, r,
                                            maxperc, winsize, ignore_N);

                if (count >= allocated) {
                    int grown = allocated ? allocated * 2 : 256;
                    void *tmp;

                    /*
                     * Grow through a temporary so that a failed call does
                     * not lose the block still owned by the array pointer.
                     */
                    if (!(tmp = xrealloc(reads, grown * sizeof(*reads))))
                        goto cleanup;
                    reads = tmp;
                    if (!(tmp = xrealloc(conts, grown * sizeof(*conts))))
                        goto cleanup;
                    conts = tmp;
                    if (!(tmp = xrealloc(score, grown * sizeof(*score))))
                        goto cleanup;
                    score = tmp;
                    if (!(tmp = xrealloc(length, grown * sizeof(*length))))
                        goto cleanup;
                    length = tmp;
                    if (!(tmp = xrealloc(pos, grown * sizeof(*pos))))
                        goto cleanup;
                    pos = tmp;

                    allocated = grown;
                }

                if (sc > 0) {
                    reads[count]   = r->rec;
                    score[count]   = sc * 100;
                    pos[count]     = r->start;
                    length[count]  = r->end - r->start+1;
                    conts[count++] = crec;
                }
            }

            contig_iter_del(ci);
            ci = NULL;
        }

        xfree(con);
        con = NULL;
    }

    if (-1 == check_assembly_plot(io, reads, conts, score, pos, length,
                                  count))
        goto cleanup;

    ret = 0;

 cleanup:
    if (ci)
        contig_iter_del(ci);
    if (con)
        xfree(con);
    if (reads)
        xfree(reads);
    if (conts)
        xfree(conts);
    if (pos)
        xfree(pos);
    if (length)
        xfree(length);
    if (score)
        xfree(score);

    return ret;
}
/**
 * Builds and returns MALIGN from a Gap5 IO handle for the contig 'cnum'.
 *
 * 'start' and 'end' are first widened using the clip points of the
 * sequences found at those positions. Every sequence in the widened range
 * then contributes one CONTIGL entry holding a copy of its clipped bases,
 * complemented where needed, with '.' replaced by 'N'.
 *
 * Returns the MALIGN produced by contigl_to_malign();
 *         NULL if the range could not be iterated, a sequence could not be
 *         loaded, or memory ran out.
 *
 * NOTE(review): on failure the CONTIGL entries built so far are not
 * released here, as no destructor for them is visible in this part of the
 * file.
 */
MALIGN *build_malign(GapIO *io, tg_rec cnum, int start, int end) {
    CONTIGL *contig, *first_contig = NULL, *last_contig = NULL;
    int i, j;
    contig_iterator *citer;
    rangec_t *r;
    seq_t *s;

    /* Expand start and end to the range covered by seqs overlapping
     * start .. end
     */
    citer = contig_iter_new(io, cnum, 0,
                            CITER_FIRST | CITER_ICLIPPEDSTART,
                            start, start);
    if (citer) {
        r = contig_iter_next(io, citer);
        if (r && (s = cache_search(io, GT_Seq, r->rec))) {
            start = ((s->len < 0) ^ r->comp)
                ? r->end - s->right - 2
                : r->start + s->left - 2;
        }
        contig_iter_del(citer);
    }

    citer = contig_iter_new(io, cnum, 0,
                            CITER_LAST | CITER_ICLIPPEDEND,
                            end, end);
    if (citer) {
        r = contig_iter_next(io, citer);
        if (r && (s = cache_search(io, GT_Seq, r->rec))) {
            end = ((s->len < 0) ^ r->comp)
                ? r->end - s->left + 2
                : r->start + s->right + 2;
        }
        contig_iter_del(citer);
    }

    /* Generate contigl linked list */
    citer = contig_iter_new(io, cnum, 0, CITER_FIRST, start, end);
    if (!citer)
        return NULL;

    while ((r = contig_iter_next(io, citer))) {
        seq_t *sorig;
        char *seq;
        int len;

        assert((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISSEQ);

        contig = create_contig_link();
        contig->id = r->rec;
        contig->mseg = create_mseg();

        sorig = s = cache_search(io, GT_Seq, r->rec);
        if (!s)
            goto fail;

        /* Check for out-of-bounds clip points.  It shouldn't happen, but
         * gap5 databases have been seen with this problem, and we
         * don't want to crash if there are any.
         */
        if (s->left < 1)
            s->left = 1;

        if (s->right > ABS(s->len))
            s->right = ABS(s->len);

        /* Fix reads of zero length */
        if (s->right < s->left) {
            sorig = s = cache_rw(io, s);
            if (!s)
                goto fail;
            s->right = s->left;
            if (s->right > ABS(s->len))
                s->left = s->right = ABS(s->len);
        }

        /* Work on a complemented copy when the stored orientation differs. */
        if ((s->len < 0) ^ r->comp) {
            s = dup_seq(s);
            if (!s)
                goto fail;
            complement_seq_t(s);
        }

        len = s->right - s->left + 1;
        if (NULL == (seq = malloc(len+1))) {
            /* Release the temporary copy before bailing out. */
            if (s != sorig)
                free(s);
            goto fail;
        }

        for (j = 0, i = s->left-1; i < s->right; i++, j++) {
            /* Protect against the sequence containing "."; our pad sym */
            if (s->seq[i] == '.')
                seq[j] = 'N';
            else
                seq[j] = s->seq[i];
        }
        seq[j] = 0;

        init_mseg(contig->mseg, seq, len, r->start-1 + s->left-1);
        contig->mseg->comp = (s != sorig);

        if (last_contig) {
            last_contig->next = contig;
        } else {
            first_contig = contig;
        }
        last_contig = contig;

        if (s != sorig)
            free(s);
    }
    contig_iter_del(citer);

    /* for 454 data -6 to -10 seem to work fine */
    return contigl_to_malign(first_contig, -7, -7);

 fail:
    /* Always release the iterator on the error path. */
    contig_iter_del(citer);
    return NULL;
}