/*
 * Compute the visible start position of a contig. This isn't just the extents
 * of start_used / end_used in the bins as this can include invisible
 * data such as cached consensus sequences.
 *
 * Returns the start coordinate of the first item produced by the iterator
 * whose type is not GRANGE_FLAG_ISCONS, or 0 when no such item exists.
 * If the iterator cannot be created the start stored in the contig record
 * is returned instead (0 if that record cannot be fetched either).
 */
int contig_visible_start(GapIO *io, tg_rec crec) {
    rangec_t *r;
    contig_iterator *ci;
    int start = 0;

    ci = contig_iter_new_by_type(io, crec, 1, CITER_FIRST | CITER_ISTART,
                                 CITER_CSTART, CITER_CEND,
                                 GRANGE_FLAG_ISANY);
    if (!ci) {
        /* No iterator available: fall back to the recorded contig start. */
        contig_t *c = cache_search(io, GT_Contig, crec);
        return c ? c->start : 0;
    }

    while ((r = contig_iter_next(io, ci))) {
        /* Consensus items are not visible data; keep looking. */
        if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISCONS)
            continue;

        start = r->start;
        break;
    }

    contig_iter_del(ci);
    return start;
}
/*
 * Adjusts the annotations belonging to sequence 'srec' in contig 'crec'
 * after a single position has been removed.
 *
 * Annotations are located by iterating over contig coordinates
 * start+pos .. end; only those whose pair_rec equals 'srec' are touched.
 * Each one is removed from its bin, shortened by one base (the start also
 * moves left when it lies beyond start+pos) and then re-inserted via
 * bin_add_to_range() using 'brec'. An annotation that becomes empty is
 * deallocated instead of being re-inserted.
 *
 * NOTE(review): 'start'/'end' look like the sequence extents in the contig
 * and 'pos' an offset within the sequence - confirm against the caller.
 */
static void tag_shift_for_delete(GapIO *io, tg_rec crec, tg_rec srec,
                                 int start, int end, int pos, tg_rec brec) {
    contig_iterator *ci;
    rangec_t *r;
    contig_t *c = cache_search(io, GT_Contig, crec);

    if (!c)
        return;

    cache_incr(io, c);

    ci = contig_iter_new_by_type(io, crec, 0, CITER_FIRST | CITER_ISTART,
                                 start+pos, end, GRANGE_FLAG_ISANNO);
    if (!ci) {
        cache_decr(io, c);
        return;
    }

    while ((r = contig_iter_next(io, ci))) {
        range_t r2, *r_out;
        anno_ele_t *a;
        bin_index_t *bin;

        /* Only annotations attached to this sequence are of interest. */
        if (r->pair_rec != srec)
            continue;

        bin_remove_item(io, &c, GT_AnnoEle, r->rec);

        r2.start    = (r->start > start+pos) ? r->start-1 : r->start;
        r2.end      = r->end-1;
        r2.mqual    = r->mqual;
        r2.rec      = r->rec;
        r2.pair_rec = r->pair_rec;
        r2.flags    = r->flags;

        if (r2.end < r2.start) {
            /* Tag entirely removed now, it must have been on a pad */
            a = cache_search(io, GT_AnnoEle, r->rec);
            if (a && (a = cache_rw(io, a)))
                cache_deallocate(io, a);
            continue;
        }

        bin = bin_add_to_range(io, &c, brec, &r2, &r_out, NULL, 0);
        if (!bin)
            continue; /* nothing to link the annotation to */

        a = cache_search(io, GT_AnnoEle, r->rec);
        if (a && a->bin != bin->rec) {
            /* Annotation moved bins */
            if ((a = cache_rw(io, a)))
                a->bin = bin->rec;
        }
    }

    cache_decr(io, c);
    contig_iter_del(ci);
}
/*
 * Debugging aid: prints one line per item returned by a whole-contig
 * iterator over 'cnum' (orientation sign, name, start..end and the
 * sequence itself) and then terminates the process with exit(0).
 *
 * 'xpos' is not used.
 */
static void test_mode3(GapIO *io, int cnum, int xpos) {
    rangec_t *r;
    contig_iterator *ci;

    (void)xpos;

    ci = contig_iter_new(io, cnum, 0, CITER_FIRST, CITER_CSTART, CITER_CEND);
    if (ci) {
        while ((r = contig_iter_next(io, ci))) {
            seq_t *s = get_seq(io, r->rec);
            char name[256];

            if (!s)
                continue;

            /* Bounded copy: name_len is not guaranteed to fit in name[]. */
            snprintf(name, sizeof(name), "%.*s", s->name_len, s->name);
            printf("%c%-22s\t%8d..%-8d\t%.*s\n",
                   "+-"[s->len<0], name, r->start, r->end,
                   ABS(s->len), s->seq);
        }

        contig_iter_del(ci);
    }

    exit(0);
}
/*
 * Deletes annotations from a single contig. When 'h' is non-NULL only tags
 * whose four character type string is present in the hash are removed;
 * with h == NULL every tag is removed. If 'verbose' is set a message is
 * emitted for each tag being removed.
 *
 * Returns 0 on success
 *        -1 on failure
 */
static int delete_tag_single_contig(GapIO *io, tg_rec crec,
                                    HashTable *h, int verbose) {
    contig_iterator *iter;
    rangec_t *item;
    contig_t *contig;
    int status = 0;

    iter = contig_iter_new_by_type(io, crec, 1, CITER_FIRST,
                                   CITER_CSTART, CITER_CEND,
                                   GRANGE_FLAG_ISANNO);
    if (NULL == iter)
        return -1;

    contig = cache_search(io, GT_Contig, crec);
    if (NULL == contig) {
        contig_iter_del(iter);
        return -1;
    }
    cache_incr(io, contig);

    for (item = contig_iter_next(io, iter);
         item != NULL;
         item = contig_iter_next(io, iter)) {
        char tag_type[5];

        (void)type2str(item->mqual, tag_type);

        /* With a filter hash present, leave unlisted types alone. */
        if (h && !HashTableSearch(h, tag_type, 4))
            continue;

        if (verbose) {
            vmessage("Removing anno %s #%"PRIrec"\tContig %s\t%d..%d\n",
                     tag_type, item->rec, contig->name,
                     item->start, item->end);
        }

        if (bin_remove_item(io, &contig, GT_AnnoEle, item->rec)) {
            status = -1;
            break;
        }
        /* FIXME: Need to reclaim the GT_AnnoEle record itself */
    }

    contig_iter_del(iter);
    cache_decr(io, contig);

    return status;
}
/*
 * Scans through one or more contigs checking each reading for correct
 * assembly. This is simply a check for misaligned data, not looking into
 * cutoff data. (The gap4 method did this, but it hasn't yet been implemented
 * in gap5).
 *
 * For every entry in 'contigs' a consensus is computed over start..end and
 * each item returned by the contig iterator is passed to
 * check_uassembly_single() together with 'maxperc', 'winsize' and
 * 'ignore_N'. Items yielding a positive score are collected and finally
 * handed to check_assembly_plot().
 *
 * All temporary storage (consensus buffer, iterator and result arrays) is
 * released on every exit path.
 *
 * Returns -1 for failure, 0 for success.
 */
int check_assembly(GapIO *io, int num_contigs, contig_list_t *contigs,
                   int winsize, float maxperc, int ignore_N) {
    int i, sc, count = 0, allocated = 0;
    int ret = -1;
    char *con = NULL;
    contig_iterator *ci = NULL;
    tg_rec *reads = NULL, *conts = NULL;
    int *score = NULL, *length = NULL, *pos = NULL;

    for (i = 0; i < num_contigs; i++) {
        tg_rec crec = contigs[i].contig;
        rangec_t *r;
        int start = contigs[i].start, end = contigs[i].end;

        if (NULL == (con = (char *)xmalloc(end-start+1)))
            goto cleanup;

        calculate_consensus_simple(io, crec, start, end, con, NULL);

        ci = contig_iter_new(io, crec, 0, CITER_FIRST, start, end);
        if (!ci)
            goto cleanup;

        while (NULL != (r = contig_iter_next(io, ci))) {
            UpdateTextOutput();

            /* con - start lets the callee index by contig coordinate. */
            sc = check_uassembly_single(io, con - start, crec, r,
                                        maxperc, winsize, ignore_N);

            if (count >= allocated) {
                int new_alloc = allocated ? allocated * 2 : 256;
                void *tmp;

                /*
                 * Grow via a temporary so that a failed reallocation does
                 * not lose the only pointer to the existing array.
                 */
                if (!(tmp = xrealloc(reads, new_alloc * sizeof(*reads))))
                    goto cleanup;
                reads = tmp;

                if (!(tmp = xrealloc(conts, new_alloc * sizeof(*conts))))
                    goto cleanup;
                conts = tmp;

                if (!(tmp = xrealloc(score, new_alloc * sizeof(*score))))
                    goto cleanup;
                score = tmp;

                if (!(tmp = xrealloc(length, new_alloc * sizeof(*length))))
                    goto cleanup;
                length = tmp;

                if (!(tmp = xrealloc(pos, new_alloc * sizeof(*pos))))
                    goto cleanup;
                pos = tmp;

                allocated = new_alloc;
            }

            if (sc > 0) {
                reads[count]   = r->rec;
                score[count]   = sc * 100;
                pos[count]     = r->start;
                length[count]  = r->end - r->start+1;
                conts[count++] = crec;
            }
        }

        contig_iter_del(ci);
        ci = NULL;
        xfree(con);
        con = NULL;
    }

    if (-1 == check_assembly_plot(io, reads, conts, score, pos, length, count))
        goto cleanup;

    ret = 0;

 cleanup:
    /* ci / con are only non-NULL here when we bailed out mid-contig. */
    if (ci)
        contig_iter_del(ci);
    if (con)
        xfree(con);
    if (reads)
        xfree(reads);
    if (conts)
        xfree(conts);
    if (pos)
        xfree(pos);
    if (length)
        xfree(length);
    if (score)
        xfree(score);

    return ret;
}
/**
 * Builds and returns MALIGN from a Gap5 IO handle for the contig 'cnum'.
 *
 * The requested start..end range is first widened using the clip points of
 * the sequences found at either end. Every sequence overlapping the widened
 * range is then copied (clipped portion only, complemented when needed,
 * with '.' replaced by 'N') into a CONTIGL list which is handed to
 * contigl_to_malign().
 *
 * Returns the result of contigl_to_malign() on success,
 *         NULL if an iterator, cache lookup or memory allocation fails.
 *
 * NOTE(review): on failure the CONTIGL entries built so far are not
 * released as no destructor for them is visible here.
 */
MALIGN *build_malign(GapIO *io, tg_rec cnum, int start, int end) {
    CONTIGL *contig, *first_contig = NULL, *last_contig = NULL;
    int i, j;
    contig_iterator *citer;
    rangec_t *r;

    /* Expand start and end to the range covered by seqs overlapping
     * start .. end
     */
    citer = contig_iter_new(io, cnum, 0,
                            CITER_FIRST | CITER_ICLIPPEDSTART,
                            start, start);
    if (citer) {
        r = contig_iter_next(io, citer);
        if (r) {
            seq_t *s = cache_search(io, GT_Seq, r->rec);

            if (s) {
                start = ((s->len < 0) ^ r->comp)
                    ? r->end - s->right - 2
                    : r->start + s->left - 2;
            }
        }
        contig_iter_del(citer);
    }

    citer = contig_iter_new(io, cnum, 0,
                            CITER_LAST | CITER_ICLIPPEDEND,
                            end, end);
    if (citer) {
        r = contig_iter_next(io, citer);
        if (r) {
            seq_t *s = cache_search(io, GT_Seq, r->rec);

            if (s) {
                end = ((s->len < 0) ^ r->comp)
                    ? r->end - s->left + 2
                    : r->start + s->right + 2;
            }
        }
        contig_iter_del(citer);
    }

    /* Generate contigl linked list */
    citer = contig_iter_new(io, cnum, 0, CITER_FIRST, start, end);
    if (!citer)
        return NULL;

    while ((r = contig_iter_next(io, citer))) {
        seq_t *s, *sorig;
        char *seq;
        int len;

        assert((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISSEQ);

        sorig = s = cache_search(io, GT_Seq, r->rec);
        if (!s)
            goto fail;

        /* Check for out-of-bounds clip points.  It shouldn't happen, but
         * gap5 databases have been seen with this problem, and we don't
         * want to crash if there are any.
         *
         * NOTE(review): these two assignments write to the cached object
         * without cache_rw(), unlike the zero-length fix below - confirm
         * this is intentional.
         */
        if (s->left < 1)
            s->left = 1;

        if (s->right > ABS(s->len))
            s->right = ABS(s->len);

        /* Fix reads of zero length */
        if (s->right < s->left) {
            sorig = s = cache_rw(io, s);
            if (!s)
                goto fail;

            s->right = s->left;
            if (s->right > ABS(s->len))
                s->left = s->right = ABS(s->len);
        }

        /* Work on a complemented private copy when orientation differs. */
        if ((s->len < 0) ^ r->comp) {
            s = dup_seq(s);
            if (!s)
                goto fail;
            complement_seq_t(s);
        }

        len = s->right - s->left + 1;
        if (NULL == (seq = malloc(len+1))) {
            if (s != sorig)
                free(s);
            goto fail;
        }

        for (j = 0, i = s->left-1; i < s->right; i++, j++) {
            /* Protect against the sequence containing "."; our pad sym */
            if (s->seq[i] == '.')
                seq[j] = 'N';
            else
                seq[j] = s->seq[i];
        }
        seq[j] = 0;

        /*
         * The list entry is created only once everything that can fail
         * above has succeeded, so an early exit does not strand it.
         */
        contig = create_contig_link();
        contig->id = r->rec;
        contig->mseg = create_mseg();

        init_mseg(contig->mseg, seq, len, r->start-1 + s->left-1);
        contig->mseg->comp = (s != sorig);

        if (last_contig) {
            last_contig->next = contig;
        } else {
            first_contig = contig;
        }
        last_contig = contig;

        if (s != sorig)
            free(s);
    }
    contig_iter_del(citer);

    /* for 454 data -6 to -10 seem to work fine */
    return contigl_to_malign(first_contig, -7, -7);

 fail:
    contig_iter_del(citer);
    return NULL;
}
/*
 * Finds matches between a user entered sequence string and a list of
 * contigs, allowing a percentage of mismatches derived from mis_fmatch.
 *
 * Both strands are searched: first 'string' as given, then after it has
 * been passed to complement_seq(). For each contig the consensus
 * (cons_array[i]) is searched first and, unless 'consensus_only' is set,
 * every sequence overlapping contig_array[i].start..end afterwards. With
 * 'cutoff_data' set the whole of each sequence is searched, otherwise only
 * the portion between its left and right clip points.
 *
 * Hits falling outside contig_array[i].start..end are discarded. Accepted
 * hits are reported through list_alignment() and stored in the parallel
 * output arrays pos1/pos2/score/length/c1/c2 (c2 is negated for hits on
 * the complemented string). At most 'max_matches' hits are stored; when
 * inexact_pad_match() reports more, a warning is issued and the search
 * stops.
 *
 * NOTE(review): 'string' is not complemented back before returning, so the
 * caller presumably sees it modified whenever the second pass ran -
 * confirm callers do not rely on its contents afterwards.
 *
 * Returns the number of matches stored,
 *         -1 on failure.
 */
int StringMatch(GapIO *io,                                           /* in */
                int num_contigs,                                     /* in */
                contig_list_t *contig_array,                         /* in */
                char **cons_array,                                   /* in */
                char *string,                                        /* in */
                float mis_fmatch,                                    /* in */
                int *pos1,                                           /* out */
                int *pos2,                                           /* out */
                int *score,                                          /* out */
                int *length,                                         /* out */
                tg_rec *c1,                                          /* out */
                tg_rec *c2,                                          /* out */
                int max_matches,                                     /* in */
                int consensus_only,                                  /* in */
                int cutoff_data)                                     /* in */
{
    int n_matches = 0;
    int i, j, k, c;
    int mis_match;
    int seq_len;
    int orig;
    int res, too_many = 0;
    char *cons_match;
    char title[1024];
    char name1[32];     /* large enough for any 64-bit record number */
    int max_imatches = max_matches;
    size_t stringlen = strlen(string);

    if (NULL == (cons_match = (char *)xmalloc(stringlen + 1)))
        return -1;

    /* convert percentage mis-matches into number of mis matches */
    mis_match = stringlen - (ceil(stringlen * mis_fmatch / 100.));

    /* complement string */
    for (c = 0; c < 2; c++) {
        if (c == 1)
            complement_seq(string, stringlen);

        for (i = 0; i < num_contigs; i++) {
            rangec_t *r;
            contig_iterator *ci = NULL;

            /*
             * Consensus first time through loop.
             * Sequences in that contig on subsequent loops.
             *
             * r starts as a non-NULL dummy so the body runs once before
             * the iterator exists; ci == NULL identifies that first pass.
             */
            for (r = (rangec_t *)1; r; r = contig_iter_next(io, ci)) {
                char *seq;
                seq_t *s = NULL;

                if (ci == 0) {
                    /* First time through is consensus */
                    seq = cons_array[i];
                    seq_len = strlen(cons_array[i]);
                } else {
                    /* Subsequent times r is valid (not 1) and a sequence */
                    if ((r->flags & GRANGE_FLAG_ISMASK) != GRANGE_FLAG_ISSEQ)
                        continue;

                    s = cache_search(io, GT_Seq, r->rec);
                    if (cutoff_data) {
                        seq = s->seq;
                        seq_len = ABS(s->len);
                    } else {
                        seq = &s->seq[s->left-1];
                        seq_len = s->right - s->left+1;
                    }
                }

                orig = n_matches;
                res = inexact_pad_match(seq, seq_len, string, stringlen,
                                        mis_match, &pos1[n_matches],
                                        &score[n_matches], max_imatches);
                if (res == -2) {
                    /* Hard failure: release everything we own. */
                    if (ci)
                        contig_iter_del(ci);
                    xfree(cons_match);
                    return -1;
                }

                if (res < 0) {
                    verror(ERR_WARN, "find_oligos", "Too many matches");
                    too_many = 1;
                    res = max_imatches;
                }

                n_matches += res;
                max_imatches -= res;

                for (j = k = orig; j < n_matches; j++) {
                    int padded_len;

                    c1[j] = contig_array[i].contig;
                    if (c == 0) {
                        c2[j] = contig_array[i].contig;
                    } else {
                        c2[j] = -contig_array[i].contig;
                    }

                    /*
                     * remove pads such that the final length of cons_match
                     * is of length length[j]
                     */
                    padded_len = depad_seq_len(cons_match, &seq[pos1[j]-1],
                                               stringlen);

                    /* Convert sequence-relative hits to contig coords. */
                    if (ci) {
                        if (cutoff_data) {
                            pos1[j] += r->start-1;
                        } else {
                            pos1[j] += r->start-1 + s->left-1;
                        }
                    }

                    length[j] = padded_len;

                    /* Adjust for searching in a sub-range of the contig */
                    if (!ci)
                        pos1[j] += contig_array[i].start-1;
                    pos2[j] = pos1[j];

                    /*
                     * The searching above may find hits outside of
                     * contig_array[i].start and contig_array[i].end.
                     *
                     * This happens if we search sequences and the
                     * sequence overlaps the desired range, but has a
                     * hit outside of the desired range.
                     *
                     * Rather than complicate the above code, we post
                     * filter these false hits here.
                     */
                    if (pos1[j] >= contig_array[i].start &&
                        pos1[j] <= contig_array[i].end) {
                        snprintf(name1, sizeof(name1), "%"PRIrec"",
                                 io_clnbr(io, ABS(c1[j])));
                        snprintf(title, sizeof(title),
                                 "Match found with contig #%"PRIrec
                                 " read #%"PRIrec
                                 " in the %c sense",
                                 contig_array[i].contig,
                                 ci ? r->rec : 0,
                                 c2[j] > 0 ? '+' : '-');

                        list_alignment(string, cons_match, "oligo", name1, 1,
                                       pos1[j], title);

                        /*
                         * Copy it from *[j] to *[k].
                         * This code REALLY needs to be using structs!
                         * This is foul.
                         */
                        pos1  [k] = pos1  [j];
                        pos2  [k] = pos2  [j];
                        c1    [k] = c1    [j];
                        c2    [k] = c2    [j];
                        length[k] = length[j];
                        score [k] = score [j];
                        k++;
                    }
                }

                /* Give back the slots of hits that were filtered out. */
                n_matches -= j-k;
                max_imatches += j-k;

                if (too_many)
                    break;

                if (consensus_only)
                    break;

                if (!ci) {
                    ci = contig_iter_new(io, contig_array[i].contig,
                                         0 /*autoextend */, CITER_FIRST,
                                         contig_array[i].start,
                                         contig_array[i].end);
                    if (!ci)
                        break;
                }
            }

            /* Done with this contig: release its iterator, if any. */
            if (ci)
                contig_iter_del(ci);

            if (too_many)
                break;
        }

        if (too_many)
            break;
    }

    xfree(cons_match);
    vmessage("Number of matches found %d \n", n_matches);
    return n_matches;
}