Example #1
0
/*
 * Compute the visible start position of a contig. This isn't just the extents
 * of start_used / end_used in the bins as this can include invisible
 * data such as cached consensus sequences.
 *
 * Returns the start of the first non-consensus item produced by the
 * iterator. If the iterator cannot be created the contig's recorded
 * start is returned instead. Returns 0 when no non-consensus item is
 * found, or when the contig itself cannot be fetched from the cache.
 */
int contig_visible_start(GapIO *io, tg_rec crec) {
    rangec_t *r;
    contig_iterator *ci;

    ci = contig_iter_new_by_type(io, crec, 1, CITER_FIRST | CITER_ISTART,
				 CITER_CSTART, CITER_CEND,
				 GRANGE_FLAG_ISANY);
    if (!ci) {
	contig_t *c = cache_search(io, GT_Contig, crec);

	/* A failed cache lookup must not be dereferenced */
	return c ? c->start : 0;
    }

    while ((r = contig_iter_next(io, ci))) {
	int v;

	/* Consensus items are not visible data; skip over them */
	if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISCONS)
	    continue;

	/* Copy the value out before the iterator (which owns r) goes away */
	v = r->start;
	contig_iter_del(ci);
	return v;
    }

    contig_iter_del(ci);
    return 0;
}
Example #2
0
/*
 * Adjusts the annotations belonging to sequence 'srec' after a deletion.
 *
 * Iterates over the annotation items of contig 'crec' in the range
 * start+pos .. end. Each item whose pair_rec equals 'srec' is removed from
 * its bin and re-added, via bin 'brec', with its end moved left by one and
 * its start moved left by one if it lay beyond start+pos. An annotation
 * whose range becomes empty is deallocated instead of being re-added.
 *
 * Failures to fetch or update individual records cause that item to be
 * skipped; the function has no way to report an error to the caller.
 */
static void tag_shift_for_delete(GapIO *io, tg_rec crec, tg_rec srec,
				 int start, int end, int pos, tg_rec brec) {
    contig_iterator *ci;
    rangec_t *r;
    contig_t *c = cache_search(io, GT_Contig, crec);

    if (!c)
	return;

    /* Hold a reference to the contig for the duration of the edits */
    cache_incr(io, c);

    ci = contig_iter_new_by_type(io, crec, 0, CITER_FIRST | CITER_ISTART,
				 start+pos, end, GRANGE_FLAG_ISANNO);
    if (!ci) {
	cache_decr(io, c);
	return;
    }

    while ((r = contig_iter_next(io, ci))) {
	range_t r2, *r_out;
	anno_ele_t *a;
	bin_index_t *bin;

	/* Only annotations attached to the edited sequence are affected */
	if (r->pair_rec != srec)
	    continue;

	bin_remove_item(io, &c, GT_AnnoEle, r->rec);
	r2.start    = (r->start > start+pos) ? r->start-1 : r->start;
	r2.end      = r->end-1;
	r2.mqual    = r->mqual;
	r2.rec      = r->rec;
	r2.pair_rec = r->pair_rec;
	r2.flags    = r->flags;

	if (r2.end < r2.start) {
	    /* Tag entirely removed now, it must have been on a pad */
	    a = cache_search(io, GT_AnnoEle, r->rec);
	    if (a && (a = cache_rw(io, a)))
		cache_deallocate(io, a);
	    continue;
	}

	bin = bin_add_to_range(io, &c, brec, &r2, &r_out, NULL, 0);
	if (!bin)
	    continue;

	a = cache_search(io, GT_AnnoEle, r->rec);
	if (a && a->bin != bin->rec) {
	    /*
	     * Annotation moved bins. Only the bin record is updated here;
	     * the item's index within the bin is not tracked.
	     */
	    a = cache_rw(io, a);
	    if (a)
		a->bin = bin->rec;
	}
    }

    cache_decr(io, c);
    contig_iter_del(ci);
}
Example #3
0
/*
 * Test mode: dumps every item the iterator returns for contig 'cnum'
 * (CITER_CSTART to CITER_CEND) to stdout, one line per item, holding the
 * strand character ('-' when the stored length is negative), the name,
 * the start..end range and the sequence. The process then exits.
 *
 * 'xpos' is not used.
 *
 * Exits with status 0 after the dump, or status 1 if the iterator could
 * not be created.
 */
static void test_mode3(GapIO *io, int cnum, int xpos) {
    rangec_t *r;
    contig_iterator *ci;

    ci = contig_iter_new(io, cnum, 0, CITER_FIRST, CITER_CSTART, CITER_CEND);
    if (!ci) {
        fprintf(stderr, "test_mode3: unable to iterate over contig %d\n",
                cnum);
        exit(1);
    }

    while ((r = contig_iter_next(io, ci))) {
        seq_t *s = get_seq(io, r->rec);

        if (!s)
            continue;

        /*
         * The name is printed straight from the record with a precision of
         * name_len and a field width of 22. This gives the same output as
         * copying it into a temporary first, without a fixed-size buffer
         * that a long name could overflow.
         */
        printf("%c%-22.*s\t%8d..%-8d\t%.*s\n",
               "+-"[s->len<0], s->name_len, s->name,
               r->start, r->end, ABS(s->len), s->seq);
    }
    contig_iter_del(ci);
    exit(0);
}
Example #4
0
/*
 * Strips annotations from one contig.
 *
 * When 'h' is NULL every annotation is removed; otherwise only those whose
 * 4-character type string is present as a key in 'h'. With 'verbose' set,
 * each removal is reported through vmessage().
 *
 * Returns 0 on success
 *        -1 on failure
 */
static int delete_tag_single_contig(GapIO *io, tg_rec crec,
				    HashTable *h, int verbose) {
    contig_iterator *iter;
    rangec_t *item;
    contig_t *contig;
    int status = 0;

    iter = contig_iter_new_by_type(io, crec, 1, CITER_FIRST,
				   CITER_CSTART, CITER_CEND,
				   GRANGE_FLAG_ISANNO);
    if (NULL == iter)
	return -1;

    contig = cache_search(io, GT_Contig, crec);
    if (NULL == contig) {
	contig_iter_del(iter);
	return -1;
    }
    cache_incr(io, contig);

    for (item = contig_iter_next(io, iter);
	 item != NULL;
	 item = contig_iter_next(io, iter)) {
	char type_str[5];

	(void)type2str(item->mqual, type_str);

	/* With a filter table present, leave unlisted types alone */
	if (h && !HashTableSearch(h, type_str, 4))
	    continue;

	if (verbose) {
	    vmessage("Removing anno %s #%"PRIrec"\tContig %s\t%d..%d\n",
		     type_str, item->rec, contig->name,
		     item->start, item->end);
	}

	/* FIXME: Need to reclaim the GT_AnnoEle record itself */
	if (bin_remove_item(io, &contig, GT_AnnoEle, item->rec)) {
	    status = -1;
	    break;
	}
    }

    contig_iter_del(iter);
    cache_decr(io, contig);

    return status;
}
Example #5
0
/*
 * Scans through one or more contigs checking each reading for correct
 * assembly. This is simply a check for misaligned data, not looking into
 * cutoff data. (The gap4 method did this, but it hasn't yet been implemented
 * in gap5).
 *
 * For each entry in 'contigs' a consensus is computed for start..end and
 * every item returned by the iterator over that range is scored with
 * check_uassembly_single(). Items with a positive score are collected and
 * finally handed to check_assembly_plot().
 *
 * All temporary storage (consensus buffer, iterator and result arrays) is
 * released on every exit path.
 *
 * Returns -1 for failure, 0 for success.
 */
int check_assembly(GapIO *io, int num_contigs, contig_list_t *contigs,
		   int winsize, float maxperc, int ignore_N) {
    int i, sc, count = 0, allocated = 0;
    int ret = -1;
    char *con = NULL;
    contig_iterator *ci = NULL;
    tg_rec *reads = NULL, *conts = NULL;
    int *score = NULL, *length = NULL, *pos = NULL;

    for (i = 0; i < num_contigs; i++) {
	tg_rec crec = contigs[i].contig;
	rangec_t *r;
	int start = contigs[i].start, end = contigs[i].end;

	if (NULL == (con = (char *)xmalloc(end-start+1)))
	    goto cleanup;

	calculate_consensus_simple(io, crec, start, end, con, NULL);

	ci = contig_iter_new(io, crec, 0, CITER_FIRST, start, end);
	if (!ci)
	    goto cleanup;

	while (NULL != (r = contig_iter_next(io, ci))) {
	    UpdateTextOutput();
	    /* con is offset so that it can be indexed by contig position */
	    sc = check_uassembly_single(io, con - start, crec, r,
					maxperc, winsize, ignore_N);
	    if (count >= allocated) {
		int new_alloc = allocated ? allocated * 2 : 256;
		void *tmp;

		/*
		 * Grow through a temporary so that the old block is still
		 * owned (and freed below) if a reallocation fails.
		 */
		if (!(tmp = xrealloc(reads, new_alloc * sizeof(*reads))))
		    goto cleanup;
		reads = tmp;

		if (!(tmp = xrealloc(conts, new_alloc * sizeof(*conts))))
		    goto cleanup;
		conts = tmp;

		if (!(tmp = xrealloc(score, new_alloc * sizeof(*score))))
		    goto cleanup;
		score = tmp;

		if (!(tmp = xrealloc(length, new_alloc * sizeof(*length))))
		    goto cleanup;
		length = tmp;

		if (!(tmp = xrealloc(pos, new_alloc * sizeof(*pos))))
		    goto cleanup;
		pos = tmp;

		allocated = new_alloc;
	    }

	    if (sc > 0) {
		reads[count]   = r->rec;
		score[count]   = sc * 100;
		pos[count]     = r->start;
		length[count]  = r->end - r->start+1;
		conts[count++] = crec;
	    }
	}

	contig_iter_del(ci);
	ci = NULL;
	xfree(con);
	con = NULL;
    }

    if (-1 == check_assembly_plot(io, reads, conts, score, pos, length, count))
	goto cleanup;

    ret = 0;

 cleanup:
    /* ci and con are only non-NULL here when a contig was abandoned midway */
    if (ci)
	contig_iter_del(ci);
    if (con)
	xfree(con);
    if (reads)
	xfree(reads);
    if (conts)
	xfree(conts);
    if (pos)
	xfree(pos);
    if (length)
	xfree(length);
    if (score)
	xfree(score);

    return ret;
}
Example #6
0
/**
 * Builds and returns MALIGN from a Gap5 IO handle for the contig 'cnum'.
 *
 * The requested start..end is first widened using the clip points of the
 * first item found at 'start' and the last item found at 'end'. Every item
 * in the widened range is then converted into a CONTIGL entry holding the
 * clipped part of its sequence (complemented when the stored orientation
 * and the range orientation differ, and with '.' replaced by 'N'). The
 * resulting list is passed to contigl_to_malign().
 *
 * Returns the value of contigl_to_malign(), or NULL if an iterator, a
 * sequence record or memory could not be obtained.
 */
MALIGN *build_malign(GapIO *io, tg_rec cnum, int start, int end) {
    CONTIGL *contig, *first_contig = NULL, *last_contig = NULL;
    int i, j;
    contig_iterator *citer;
    rangec_t *r;

    /* Expand start and end to the range covered by seqs overlapping
     * start .. end
     */

    {
	seq_t *s;
	citer = contig_iter_new(io, cnum, 0,
				CITER_FIRST | CITER_ICLIPPEDSTART,
				start, start);
	if (citer) {
	    r = contig_iter_next(io, citer);
	    if (r && (s = cache_search(io, GT_Seq, r->rec))) {
		start = ((s->len < 0) ^ r->comp)
		    ? r->end - s->right - 2
		    : r->start + s->left - 2;
	    }

	    contig_iter_del(citer);
	}
    }

    {
	seq_t *s;
	citer = contig_iter_new(io, cnum, 0,
				CITER_LAST | CITER_ICLIPPEDEND,
				end, end);
	if (citer) {
	    r = contig_iter_next(io, citer);
	    if (r && (s = cache_search(io, GT_Seq, r->rec))) {
		end = ((s->len < 0) ^ r->comp)
		    ? r->end - s->left + 2
		    : r->start + s->right + 2;
	    }

	    contig_iter_del(citer);
	}
    }

    /* Generate contigl linked list */
    citer = contig_iter_new(io, cnum, 0, CITER_FIRST, start, end);
    if (!citer)
	return NULL;

    while ((r = contig_iter_next(io, citer))) {
	seq_t *s, *sorig;
	char *seq;
	int len;

	assert((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISSEQ);

	sorig = s = cache_search(io, GT_Seq, r->rec);
	if (!s)
	    goto fail;

	/* Check for out-of-bounds clip points.  It shouldn't happen, but
	   gap5 databases have been seen with this problem, and we
	   don't want to crash if there are any. */
	if (s->left < 1)            s->left = 1;
	if (s->right > ABS(s->len)) s->right = ABS(s->len);

	/* Fix reads of zero length */
	if (s->right < s->left) {
	    sorig = s = cache_rw(io, s);
	    if (!s)
		goto fail;
	    s->right = s->left;
	    if (s->right > ABS(s->len))
		s->left = s->right = ABS(s->len);
	}

	/* Work on a complemented private copy; s != sorig marks this case */
	if ((s->len < 0) ^ r->comp) {
	    s = dup_seq(s);
	    if (!s)
		goto fail;
	    complement_seq_t(s);
	}

	len = s->right - s->left + 1;
	if (NULL == (seq = malloc(len+1))) {
	    if (s != sorig)
		free(s);
	    goto fail;
	}

	for (j = 0, i = s->left-1; i < s->right; i++, j++) {
	    /* Protect against the sequence containing "."; our pad sym */
	    if (s->seq[i] == '.')
		seq[j] = 'N';
	    else
		seq[j] = s->seq[i];
	}
	seq[j] = 0;

	/*
	 * The list entry is created only once the sequence text exists, so
	 * a failure above leaves no half-initialised entry behind.
	 */
	contig = create_contig_link();
	contig->id = r->rec;
	contig->mseg = create_mseg();

	init_mseg(contig->mseg, seq, len, r->start-1 + s->left-1);
	contig->mseg->comp = (s != sorig);

	if (last_contig) {
	    last_contig->next = contig;
	} else {
	    first_contig = contig;
	}
	last_contig = contig;

	if (s != sorig)
	    free(s);
    }
    contig_iter_del(citer);

    /* for 454 data -6 to -10 seem to work fine */
    return contigl_to_malign(first_contig, -7, -7);

 fail:
    /*
     * NOTE(review): entries already linked into first_contig are not
     * released here; no list destructor is visible in this part of the
     * file -- confirm and free them if one exists.
     */
    contig_iter_del(citer);
    return NULL;
}
Example #7
0
/*
 * find matches between user entered sequence string and contig list with
 * a minimum match of mis_match
 *
 * Two passes are made: the first with 'string' as supplied, the second
 * after passing it to complement_seq(). The string is modified in place
 * and is not restored before returning. Within each pass every contig's
 * consensus (cons_array[i]) is searched first and then, unless
 * 'consensus_only' is set, each sequence item in the contig's range.
 * With 'cutoff_data' set the whole sequence is searched, otherwise only
 * the part between its left and right clip points.
 *
 * Hits whose position falls outside contig_array[i].start..end are
 * discarded. Accepted hits are reported via list_alignment() and stored
 * in pos1/pos2/score/length/c1/c2 (c2 is negated for the second pass).
 * Searching stops once max_matches hits have been found.
 *
 * Returns the number of matches stored, or -1 on memory allocation
 * failure or when inexact_pad_match() returns -2.
 */
int
StringMatch(GapIO *io,                                                 /* in */
	    int num_contigs,                                           /* in */
	    contig_list_t *contig_array,                               /* in */
	    char **cons_array,                                         /* in */
	    char *string,                                              /* in */
	    float mis_fmatch,                                          /* in */
	    int *pos1,                                                /* out */
	    int *pos2,                                                /* out */
	    int *score,                                               /* out */
	    int *length,                                              /* out */
	    tg_rec *c1,                                               /* out */
	    tg_rec *c2,                                               /* out */
	    int max_matches,                                           /* in */
	    int consensus_only,                                        /* in */
	    int cutoff_data)					       /* in */
{
    int n_matches = 0;
    int i, j, k, c;
    int mis_match;
    int seq_len;
    int orig;
    int res, too_many = 0;
    char *cons_match;
    char title[1024];
    char name1[32]; /* large enough for any 64-bit record number */
    int max_imatches = max_matches;
    size_t stringlen = strlen(string);

    if (NULL == (cons_match = (char *)xmalloc(stringlen + 1)))
	return -1;

    /* convert percentage mis-matches into number of mis matches */
    mis_match = stringlen - (ceil(stringlen * mis_fmatch / 100.));

    /* complement string */
    for (c = 0; c < 2; c++) {
	if (c == 1)
	    complement_seq(string, stringlen);

	for (i = 0; i < num_contigs; i++) {
	    rangec_t *r;
	    contig_iterator *ci = NULL;

	    /*
	     * Consensus first time through loop.
	     * Sequences in that contig on subsequent loops.
	     */
	    for (r = (rangec_t *)1; r; r = contig_iter_next(io, ci)) {
		char *seq;
		seq_t *s = NULL;

		if (ci == 0) {
		    /* First time through is consensus */
		    seq = cons_array[i];
		    seq_len = strlen(cons_array[i]);

		} else {
		    /* Subsequent times r is valid (not 1) and a sequence */
		    if ((r->flags & GRANGE_FLAG_ISMASK) !=
			GRANGE_FLAG_ISSEQ)
			continue;

		    s = cache_search(io, GT_Seq, r->rec);

		    if (cutoff_data) {
			seq = s->seq;
			seq_len = ABS(s->len);
		    } else {
			seq = &s->seq[s->left-1];
			seq_len = s->right - s->left+1;
		    }
		}

		orig = n_matches;
		res = inexact_pad_match(seq, seq_len, string,
					stringlen, mis_match,
					&pos1[n_matches], &score[n_matches],
					max_imatches);
		if (res == -2) {
		    /* Release everything acquired so far before bailing out */
		    if (ci)
			contig_iter_del(ci);
		    xfree(cons_match);
		    return -1;
		}

		if (res < 0) {
		    verror(ERR_WARN, "find_oligos", "Too many matches");
		    too_many = 1;
		    res = max_imatches;
		}
		n_matches += res;
		max_imatches -= res;

		for (j = k = orig; j < n_matches; j++) {
		    int padded_len;

		    c1[j] = contig_array[i].contig;
		    if (c == 0) {
			c2[j] = contig_array[i].contig;
		    } else {
			c2[j] = -contig_array[i].contig;
		    }

		    /*
		     * remove pads such that the final length of cons_match is
		     * of length length[j]
		     */
		    padded_len =
			depad_seq_len(cons_match, &seq[pos1[j]-1], stringlen);

		    if (ci) {
			if (cutoff_data) {
			    pos1[j] += r->start-1;
			} else {
			    pos1[j] += r->start-1 + s->left-1;
			}
		    }

		    length[j] = padded_len;

		    /* Adjust for searching in a sub-range of the contig */
		    if (!ci)
			pos1[j] += contig_array[i].start-1;
		    pos2[j] = pos1[j];

		    /*
		     * The searching above may find hits outside of
		     * contig_array[i].start and contig_array[i].end.
		     *
		     * This happens if we search sequences and the
		     * sequence overlaps the desired range, but has a
		     * hit outside of the desired range.
		     *
		     * Rather than complicate the above code, we post
		     * filter these false hits here.
		     */
		    if (pos1[j] >= contig_array[i].start &&
			pos1[j] <= contig_array[i].end) {
			snprintf(name1, sizeof(name1), "%"PRIrec"",
				 io_clnbr(io, ABS(c1[j])));
			snprintf(title, sizeof(title),
				 "Match found with contig #%"PRIrec
				 " read #%"PRIrec
				 " in the %c sense",
				 contig_array[i].contig,
				 ci ? r->rec : 0,
				 c2[j] > 0 ? '+' : '-');

			list_alignment(string, cons_match, "oligo", name1, 1,
				       pos1[j], title);
			
			/*
			 * Copy it from *[j] to *[k].
			 * This code REALLY needs to be using structs!
			 * This is foul.
			 */
			pos1  [k] = pos1  [j];
			pos2  [k] = pos2  [j];
			c1    [k] = c1    [j];
			c2    [k] = c2    [j];
			length[k] = length[j];
			score [k] = score [j];
			k++;
		    }
		}

		/* Give back the slots of the hits that were filtered out */
		n_matches -= j-k;
		max_imatches += j-k;

		if (too_many)
		    break;

		if (consensus_only)
		    break;

		if (!ci) {
		    ci = contig_iter_new(io,
					 contig_array[i].contig,
					 0 /*autoextend */,
					 CITER_FIRST,
					 contig_array[i].start,
					 contig_array[i].end);
		    if (!ci)
			break;
		}
	    }

	    /* One iterator is created per contig; release it here */
	    if (ci)
		contig_iter_del(ci);

	    if (too_many)
		break;
	}

	if (too_many)
	    break;
    }

    xfree(cons_match);
    vmessage("Number of matches found %d \n", n_matches);
    return n_matches;
}