コード例 #1
0
ファイル: tg_view.c プロジェクト: nathanhaigh/staden-trunk
/*
 * Debug helper: walks every item of contig 'cnum' and prints one line per
 * item to stdout (strand character, name, start..end range and the
 * sequence bases), then terminates the process.
 *
 * io    Gap5 database handle
 * cnum  contig record to dump
 * xpos  unused; kept so the signature matches the other test modes
 *
 * Never returns: exits with status 0 after the dump, or with status 1 if
 * the contig iterator could not be created.
 */
static void test_mode3(GapIO *io, int cnum, int xpos) {
    rangec_t *r;
    contig_iterator *ci;

    (void)xpos;

    ci = contig_iter_new(io, cnum, 0, CITER_FIRST, CITER_CSTART, CITER_CEND);
    if (!ci) {
        fprintf(stderr, "test_mode3: failed to create contig iterator\n");
        exit(1);
    }

    while ((r = contig_iter_next(io, ci)) != NULL) {
        seq_t *s = get_seq(io, r->rec);

        if (!s)
            continue;

        /*
         * The name is printed straight from the sequence record using a
         * precision (at most name_len characters) padded to 22 columns.
         * This yields the same output as formatting via a temporary
         * buffer, but cannot overflow for long names.
         */
        printf("%c%-22.*s\t%8d..%-8d\t%.*s\n",
               "+-"[s->len<0], s->name_len, s->name,
               r->start, r->end, ABS(s->len), s->seq);
    }
    contig_iter_del(ci);
    exit(0);
}
コード例 #2
0
/*
 * Attempt to find edits. It's not 100% reliable, but works for most cases.
 * We look for lowercase bases and confidence 100 and 0 (if not N).
 * We cannot find deleted bases though.
 *
 * The search starts one base beyond the editor cursor (xx->cursor_apos).
 * When 'dir' is non-zero it runs towards the contig end, otherwise towards
 * the contig start. The candidate position closest to the cursor wins and
 * the editor cursor is moved onto it.
 *
 * 'strand' and 'value' are not used by this search.
 *
 * Returns 0 if an edit was found (and the cursor moved)
 *        -1 if none was found or the contig iterator could not be created.
 */
int edview_search_edit(edview *xx, int dir, int strand, char *value) {
    int start, end;
    contig_iterator *iter;
    rangec_t *(*ifunc)(GapIO *io, contig_iterator *ci);
    rangec_t *r;
    int best_pos, found = 0;
    int fpos = 0;
    tg_rec fseq = 0;

    (void)strand;
    (void)value;

    if (dir) {
	start = xx->cursor_apos + 1;
	end = CITER_CEND;
	iter = contig_iter_new(xx->io, xx->cnum, 1,
			       CITER_FIRST | CITER_ISTART,
			       start, end);
	ifunc = contig_iter_next;
	best_pos = INT_MAX;
    } else {
	start = CITER_CSTART;
	end = xx->cursor_apos -1;
	iter = contig_iter_new(xx->io, xx->cnum, 1,
			       CITER_LAST | CITER_IEND,
			       start, end);
	ifunc = contig_iter_prev;
	best_pos = INT_MIN;
    }

    if (!iter)
	return -1;


    while ((r = ifunc(xx->io, iter))) {
	seq_t *s, *sorig;
	char *seq, *qual;
	int seq_len, off = 0, i;

	/* Once something is found, stop when items can no longer beat it */
	if (found && dir  && r->start > best_pos)
	    break;
	if (found && !dir && r->end < best_pos)
	    break;

	if (NULL == (s = sorig = cache_search(xx->io, GT_Seq, r->rec)))
	    break;

	/* Stored and displayed orientation differ: use a complemented copy */
	if (r->comp ^ (s->len < 0)) {
	    if (NULL == (s = dup_seq(s)))
		break;
	    complement_seq_t(s);
	}

	seq  = s->seq;
	qual = s->conf;
	seq_len = ABS(s->len);

	/* Skip the leading part of the item that lies before 'start' */
	if (r->start < start) {
	    off      = start - r->start;
	    seq     += off;
	    qual    += off;
	    seq_len -= off;
	}

	for (i = 0; i < seq_len; i++) {
	    /* Cast: <ctype.h> functions are undefined for negative char */
	    if (islower((unsigned char)seq[i]) ||
		qual[i] == 100 ||
		(qual[i] == 0 && seq[i] != 'N' && seq[i] != '-'
		 && seq[i] != '*')) {
		int pos = r->start + i + off;
		if (dir) {
		    if (best_pos > pos && pos > xx->cursor_apos) {
			found = 1;
			best_pos = pos;
			fpos = i + off;
			fseq = r->rec;
		    }
		    /* Scanning left to right: the first hit is the nearest */
		    break;
		} else {
		    /* Keep scanning: the last qualifying hit is the nearest */
		    if (best_pos < pos && pos < xx->cursor_apos) {
			found = 1;
			best_pos = pos;
			fpos = i + off;
			fseq = r->rec;
		    }
		}
	    }
	}

	if (s != sorig)
	    free(s);
    }

    if (found) {
	edSetCursorPos(xx, fseq == xx->cnum ? GT_Contig : GT_Seq,
		       fseq, fpos, 1);
    }

    contig_iter_del(iter);

    return found ? 0 : -1;
}
コード例 #3
0
/*
 * Searches the sequences of the edited contig for the pattern in 'value',
 * starting one base beyond the editor cursor. When 'dir' is non-zero the
 * search runs towards the contig end, otherwise towards the contig start.
 * The hit closest to the cursor wins and the cursor is moved onto it.
 *
 * 'strand' selects what is matched: '+' the pattern as given, '-' its
 * complement (as produced by complement_seq), '=' both.
 *
 * 'value' is modified in place: it is cut at the first '#' and depadded.
 *
 * Returns 0 if a match was found (and the cursor moved)
 *        -1 if not, or on failure (no iterator, out of memory).
 */
int edview_search_sequence(edview *xx, int dir, int strand, char *value) {
    int mismatches = 0; /* exact match */
    int where = 2;      /* consensus */
    char *p;
    int start, end;
    int patlen;
    char *uppert, *upperb;
    int found = 0;
    tg_rec fseq = 0;
    int fpos = 0, i;
    contig_iterator *iter;
    rangec_t *(*ifunc)(GapIO *io, contig_iterator *ci);
    rangec_t *r;
    int best_pos;

    if (dir) {
	start = xx->cursor_apos + 1;
	end = CITER_CEND;
	iter = contig_iter_new(xx->io, xx->cnum, 1,
			       CITER_FIRST | CITER_ISTART,
			       start, end);
	ifunc = contig_iter_next;
	best_pos = INT_MAX;
    } else {
	start = CITER_CSTART;
	end = xx->cursor_apos -1;
	iter = contig_iter_new(xx->io, xx->cnum, 1,
			       CITER_LAST | CITER_IEND,
			       start, end);
	ifunc = contig_iter_prev;
	best_pos = INT_MIN;
    }

    if (!iter)
	return -1;


    /*
     * Parse value search string. It optionally includes two extra params
     * separated by #. Ie:
     *     <string>#<N.mismatches>#<where>.
     * <where> is 1 for readings, 2 for consensus, 3 for both.
     */
    if ((p = strchr(value, '#')) != NULL) {
	mismatches = atoi(p+1);
	*p = 0;
	if ((p = strchr(p+1, '#')) != NULL)
	    where = atoi(p+1);
    }
    /* <where> is parsed but nothing below acts on it */
    (void)where;


    /* uppercase search string, remove pads, and store fwd/rev copies */
    patlen = strlen(value);
    depad_seq(value, &patlen, NULL);
    uppert = (char *)xmalloc(patlen + 1);
    upperb = (char *)xmalloc(patlen + 1);
    if (!uppert || !upperb) {
	/* Report failure (not "found") and release what we hold */
	free(uppert);
	free(upperb);
	contig_iter_del(iter);
	return -1;
    }

    uppert[patlen] = upperb[patlen] = 0;
    for (i = patlen-1; i >= 0; i--) {
	/* Cast: <ctype.h> functions are undefined for negative char */
	upperb[i] = uppert[i] = toupper((unsigned char)value[i]);
    }
    complement_seq(upperb, patlen);

    while ((r = ifunc(xx->io, iter))) {
	seq_t *s, *sorig;
	char *ind, *indt = NULL, *indb = NULL, *seq;
	int seq_len, off = 0;

	/* Once something is found, stop when items can no longer beat it */
	if (found && dir  && r->start > best_pos)
	    break;
	if (found && !dir && r->end < best_pos)
	    break;

	if (NULL == (s = sorig = cache_search(xx->io, GT_Seq, r->rec)))
	    break;

	/* Stored and displayed orientation differ: use a complemented copy */
	if (r->comp ^ (s->len < 0)) {
	    if (NULL == (s = dup_seq(s)))
		break;
	    complement_seq_t(s);
	}

	seq = s->seq;
	seq_len = ABS(s->len);

	/* Clip the searched window to the start..end range */
	if (r->start < start) {
	    off      = start - r->start;
	    seq     += off;
	    seq_len -= off;
	}
	if (r->end - (patlen-1) > end)
	    seq_len -= r->end - (patlen-1) - end;

	if (dir) {
	    if (strand == '+' || strand == '=')
		indt = pstrnstr_inexact(seq, seq_len, uppert, patlen,
					mismatches, NULL);
	    if (strand == '-' || strand == '=')
		indb = pstrnstr_inexact(seq, seq_len, upperb, patlen,
					mismatches, NULL);
	} else {
	    if (strand == '+' || strand == '=')
		indt = prstrnstr_inexact(seq, seq_len, uppert, patlen,
					 mismatches, NULL);
	    if (strand == '-' || strand == '=')
		indb = prstrnstr_inexact(seq, seq_len, upperb, patlen,
					 mismatches, NULL);
	}

	if (indt && indb) {
	    /*
	     * Both strands hit: keep the one nearest the cursor, i.e. the
	     * lower address going forwards and the higher going backwards
	     * (matching the best_pos comparisons below).
	     */
	    if (dir)
		ind = indt < indb ? indt : indb;
	    else
		ind = indt > indb ? indt : indb;
	} else if (indt) {
	    ind = indt;
	} else if (indb) {
	    ind = indb;
	} else {
	    ind = NULL;
	}

	if (ind) {
	    int pos =  r->start + ind - seq + off;
	    if (dir) {
		if (best_pos > pos) {
		    found = 1;
		    best_pos = pos;
		    fpos = ind - s->seq;
		    fseq = r->rec;
		}
	    } else {
		if (best_pos < pos) {
		    found = 1;
		    best_pos = pos;
		    fpos = ind - s->seq;
		    fseq = r->rec;
		}
	    }
	}

	if (s != sorig)
	    free(s);
    }

    if (found) {
	edSetCursorPos(xx, fseq == xx->cnum ? GT_Contig : GT_Seq,
		       fseq, fpos, 1);
    }

    free(uppert);
    free(upperb);

    contig_iter_del(iter);

    return found ? 0 : -1;
}
コード例 #4
0
/*
 * Scans through one or more contigs checking each reading for correct
 * assembly. This is simply a check for misaligned data, not looking into
 * cutoff data. (The gap4 method did this, but it hasn't yet been implemented
 * in gap5).
 *
 * For each contig range a consensus is computed and every item returned by
 * the contig iterator is scored with check_uassembly_single(). Items with a
 * positive score are collected and handed to check_assembly_plot().
 *
 * io           Gap5 database handle
 * num_contigs  number of entries in 'contigs'
 * contigs      contig records with the start/end range to check
 * winsize, maxperc, ignore_N
 *              passed through unchanged to check_uassembly_single()
 *
 * All temporary storage (result arrays, consensus buffer, iterator) is
 * released on every exit path.
 *
 * Returns -1 for failure, 0 for success.
 */
int check_assembly(GapIO *io, int num_contigs, contig_list_t *contigs,
		   int winsize, float maxperc, int ignore_N) {
    int i, sc, count = 0, allocated = 0, ret = -1;
    char *con = NULL;
    contig_iterator *ci = NULL;
    tg_rec *reads = NULL, *conts = NULL;
    int *score = NULL, *length = NULL, *pos = NULL;

    for (i = 0; i < num_contigs; i++) {
	tg_rec crec = contigs[i].contig;
	rangec_t *r;
	int start = contigs[i].start, end = contigs[i].end;

	if (NULL == (con = (char *)xmalloc(end-start+1)))
	    goto cleanup;

	calculate_consensus_simple(io, crec, start, end, con, NULL);

	ci = contig_iter_new(io, crec, 0, CITER_FIRST, start, end);
	if (!ci)
	    goto cleanup;

	while (NULL != (r = contig_iter_next(io, ci))) {
	    UpdateTextOutput();
	    /* 'con - start' lets the callee index by contig position */
	    sc = check_uassembly_single(io, con - start, crec, r,
					maxperc, winsize, ignore_N);
	    if (count >= allocated) {
		int grown = allocated ? allocated * 2 : 256;
		tg_rec *new_reads, *new_conts;
		int *new_score, *new_length, *new_pos;

		/*
		 * Grow via temporaries so that a failed xrealloc does not
		 * lose the only pointer to the old block.
		 */
		new_reads = xrealloc(reads, grown * sizeof(*reads));
		if (!new_reads)
		    goto cleanup;
		reads = new_reads;

		new_conts = xrealloc(conts, grown * sizeof(*conts));
		if (!new_conts)
		    goto cleanup;
		conts = new_conts;

		new_score = xrealloc(score, grown * sizeof(*score));
		if (!new_score)
		    goto cleanup;
		score = new_score;

		new_length = xrealloc(length, grown * sizeof(*length));
		if (!new_length)
		    goto cleanup;
		length = new_length;

		new_pos = xrealloc(pos, grown * sizeof(*pos));
		if (!new_pos)
		    goto cleanup;
		pos = new_pos;

		allocated = grown;
	    }

	    if (sc > 0) {
		reads[count]   = r->rec;
		score[count]   = sc * 100;
		pos[count]     = r->start;
		length[count]  = r->end - r->start+1;
		conts[count++] = crec;
	    }
	}

	contig_iter_del(ci);
	ci = NULL;
	xfree(con);
	con = NULL;
    }

    if (-1 == check_assembly_plot(io, reads, conts, score, pos, length, count))
	goto cleanup;

    ret = 0;

 cleanup:
    /* ci and con are only non-NULL here when a contig was interrupted */
    if (ci)
	contig_iter_del(ci);
    if (con)
	xfree(con);
    if (reads)
	xfree(reads);
    if (conts)
	xfree(conts);
    if (pos)
	xfree(pos);
    if (length)
	xfree(length);
    if (score)
	xfree(score);

    return ret;
}
コード例 #5
0
/**
 * Builds and returns MALIGN from a Gap5 IO handle for the contig 'cnum'.
 *
 * The requested start..end range is first widened using the clip points of
 * the item found at 'start' and of the item found at 'end'. Every sequence
 * in the widened range is then copied (clipped portion only, '.' replaced
 * by 'N') into a CONTIGL list, which is converted by contigl_to_malign().
 *
 * Returns the result of contigl_to_malign() on success,
 *         NULL if an iterator, cache lookup or memory allocation fails.
 *
 * NOTE(review): on failure the partially built CONTIGL list is not
 * released, as no list destructor is visible from this function.
 */
MALIGN *build_malign(GapIO *io, tg_rec cnum, int start, int end) {
    CONTIGL *contig, *first_contig = NULL, *last_contig = NULL;
    int i, j;
    contig_iterator *citer;
    rangec_t *r;
    seq_t *s = NULL, *sorig = NULL;

    /* Expand start and end to the range covered by seqs overlapping
     * start .. end
     */

    citer = contig_iter_new(io, cnum, 0,
			    CITER_FIRST | CITER_ICLIPPEDSTART,
			    start, start);
    if (citer) {
	r = contig_iter_next(io, citer);
	if (r && NULL != (s = cache_search(io, GT_Seq, r->rec))) {
	    start = ((s->len < 0) ^ r->comp)
		? r->end - s->right - 2
		: r->start + s->left - 2;
	}

	contig_iter_del(citer);
    }

    citer = contig_iter_new(io, cnum, 0,
			    CITER_LAST | CITER_ICLIPPEDEND,
			    end, end);
    if (citer) {
	r = contig_iter_next(io, citer);
	if (r && NULL != (s = cache_search(io, GT_Seq, r->rec))) {
	    end = ((s->len < 0) ^ r->comp)
		? r->end - s->left + 2
		: r->start + s->right + 2;
	}

	contig_iter_del(citer);
    }

    /* Generate contigl linked list */
    citer = contig_iter_new(io, cnum, 0, CITER_FIRST, start, end);
    if (!citer)
	return NULL;

    while ((r = contig_iter_next(io, citer))) {
	char *seq;
	int len;

	assert((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISSEQ);

	contig = create_contig_link();
	contig->id = r->rec;
	contig->mseg = create_mseg();

	sorig = s = cache_search(io, GT_Seq, r->rec);
	if (!s)
	    goto fail;

	/* Check for out-of-bounds clip points.  It shouldn't happen, but
	   gap5 databases have been seen with this problem, and we
	   don't want to crash if there are any. */
	/* NOTE(review): these two writes happen before any cache_rw()
	   call on the record - confirm that is intended. */
	if (s->left < 1)            s->left = 1;
	if (s->right > ABS(s->len)) s->right = ABS(s->len);

	/* Fix reads of zero length */
	if (s->right < s->left) {
	    sorig = s = cache_rw(io, s);
	    if (!s)
		goto fail;
	    s->right = s->left;
	    if (s->right > ABS(s->len))
		s->left = s->right = ABS(s->len);
	}

	/* Stored and displayed orientation differ: use a complemented copy */
	if ((s->len < 0) ^ r->comp) {
	    if (NULL == (s = dup_seq(s)))
		goto fail;
	    complement_seq_t(s);
	}

	len = s->right - s->left + 1;
	if (NULL == (seq = malloc(len+1)))
	    goto fail;

	for (j = 0, i = s->left-1; i < s->right; i++, j++) {
	    /* Protect against the sequence containing "."; our pad sym */
	    if (s->seq[i] == '.')
		seq[j] = 'N';
	    else
		seq[j] = s->seq[i];
	}
	seq[j] = 0;

	init_mseg(contig->mseg, seq, len, r->start-1 + s->left-1);
	contig->mseg->comp = (s != sorig);

	/* Append to the tail of the list */
	if (last_contig) {
	    last_contig->next = contig;
	} else {
	    first_contig = contig;
	}
	last_contig = contig;

	if (s != sorig)
	    free(s);
    }
    contig_iter_del(citer);

    /* for 454 data -6 to -10 seem to work fine */
    return contigl_to_malign(first_contig, -7, -7);

 fail:
    /* s differs from sorig only when it is our own duplicated copy */
    if (s && s != sorig)
	free(s);
    contig_iter_del(citer);
    return NULL;
}
コード例 #6
0
ファイル: find_oligo.c プロジェクト: svn2github/staden
/*
 * find matches between user entered sequence string and contig list with
 * a minimum match of mis_match
 *
 * Two passes are made: first with 'string' as given, then with 'string'
 * complemented in place. In each pass, for every contig, the consensus
 * (cons_array[i]) is searched first and then, unless consensus_only is
 * set, each sequence overlapping the contig's start..end range. With
 * cutoff_data set the whole sequence is searched, otherwise only the part
 * between its left and right clip points. Hits whose position falls
 * outside the contig's start..end range are discarded.
 *
 * Outputs, one entry per reported hit (arrays must hold max_matches):
 *   pos1, pos2  hit position (both are set to the same value)
 *   score       as filled in by inexact_pad_match()
 *   length      value returned by depad_seq_len() for the hit
 *   c1          contig record
 *   c2          contig record, negated for hits from the complement pass
 *
 * Side effect: 'string' is left complemented once the second pass has
 * started; it is not restored.
 *
 * Returns the number of matches stored, or -1 on failure.
 */
int
StringMatch(GapIO *io,                                                 /* in */
	    int num_contigs,                                           /* in */
	    contig_list_t *contig_array,                               /* in */
	    char **cons_array,                                         /* in */
	    char *string,                                              /* in */
	    float mis_fmatch,                                          /* in */
	    int *pos1,                                                /* out */
	    int *pos2,                                                /* out */
	    int *score,                                               /* out */
	    int *length,                                              /* out */
	    tg_rec *c1,                                               /* out */
	    tg_rec *c2,                                               /* out */
	    int max_matches,                                           /* in */
	    int consensus_only,                                        /* in */
	    int cutoff_data)					       /* in */
{
    int n_matches = 0;
    int i, j, k, c;
    int mis_match;
    int seq_len;
    int orig;
    int res, too_many = 0;
    char *cons_match;
    char title[1024];
    char name1[32];  /* large enough for any 64-bit record number */
    int max_imatches = max_matches;
    size_t stringlen = strlen(string);

    if (NULL == (cons_match = (char *)xmalloc(stringlen + 1)))
	return -1;

    /* convert percentage mis-matches into number of mis matches */
    mis_match = stringlen - (ceil(stringlen * mis_fmatch / 100.));

    /* complement string */
    for (c = 0; c < 2; c++) {
	if (c == 1)
	    complement_seq(string, stringlen);

	for (i = 0; i < num_contigs; i++) {
	    rangec_t *r;
	    contig_iterator *ci = NULL;

	    /*
	     * Consensus first time through loop.
	     * Sequences in that contig on subsequent loops.
	     */
	    for (r = (rangec_t *)1; r; r = contig_iter_next(io, ci)) {
		char *seq;
		seq_t *s = NULL;

		if (ci == 0) {
		    /* First time through is consensus */
		    seq = cons_array[i];
		    seq_len = strlen(cons_array[i]);

		} else {
		    /* Subsequent times r is valid (not 1) and a sequence */
		    if ((r->flags & GRANGE_FLAG_ISMASK) !=
			GRANGE_FLAG_ISSEQ)
			continue;

		    s = cache_search(io, GT_Seq, r->rec);
		    if (!s) {
			verror(ERR_WARN, "find_oligos",
			       "Failed to load sequence; skipping");
			continue;
		    }

		    if (cutoff_data) {
			seq = s->seq;
			seq_len = ABS(s->len);
		    } else {
			seq = &s->seq[s->left-1];
			seq_len = s->right - s->left+1;
		    }
		}

		orig = n_matches;
		res = inexact_pad_match(seq, seq_len, string,
					stringlen, mis_match,
					&pos1[n_matches], &score[n_matches],
					max_imatches);
		if (res == -2) {
		    /* Hard failure: release everything we hold */
		    if (ci)
			contig_iter_del(ci);
		    xfree(cons_match);
		    return -1;
		}

		if (res < 0) {
		    verror(ERR_WARN, "find_oligos", "Too many matches");
		    too_many = 1;
		    res = max_imatches;
		}
		n_matches += res;
		max_imatches -= res;

		for (j = k = orig; j < n_matches; j++) {
		    int padded_len;

		    c1[j] = contig_array[i].contig;
		    if (c == 0) {
			c2[j] = contig_array[i].contig;
		    } else {
			c2[j] = -contig_array[i].contig;
		    }

		    /*
		     * remove pads such that the final length of cons_match is
		     * of length length[j]
		     */
		    padded_len =
			depad_seq_len(cons_match, &seq[pos1[j]-1], stringlen);

		    if (ci) {
			if (cutoff_data) {
			    pos1[j] += r->start-1;
			} else {
			    pos1[j] += r->start-1 + s->left-1;
			}
		    }

		    length[j] = padded_len;

		    /* Adjust for searching in a sub-range of the contig */
		    if (!ci)
			pos1[j] += contig_array[i].start-1;
		    pos2[j] = pos1[j];

		    /*
		     * The searching above may find hits outside of
		     * contig_array[i].start and contig_array[i].end.
		     *
		     * This happens if we search sequences and the
		     * sequence overlaps the desired range, but has a
		     * hit outside of the desired range.
		     *
		     * Rather than complicate the above code, we post
		     * filter these false hits here.
		     */
		    if (pos1[j] >= contig_array[i].start &&
			pos1[j] <= contig_array[i].end) {
			snprintf(name1, sizeof(name1), "%"PRIrec"",
				 io_clnbr(io, ABS(c1[j])));
			snprintf(title, sizeof(title),
				 "Match found with contig #%"PRIrec
				 " read #%"PRIrec
				 " in the %c sense",
				 contig_array[i].contig,
				 ci ? r->rec : 0,
				 c2[j] > 0 ? '+' : '-');

			list_alignment(string, cons_match, "oligo", name1, 1,
				       pos1[j], title);
			
			/*
			 * Copy it from *[j] to *[k].
			 * This code REALLY needs to be using structs!
			 * This is foul.
			 */
			pos1  [k] = pos1  [j];
			pos2  [k] = pos2  [j];
			c1    [k] = c1    [j];
			c2    [k] = c2    [j];
			length[k] = length[j];
			score [k] = score [j];
			k++;
		    }
		}

		/* Give back the slots of the hits that were filtered out */
		n_matches -= j-k;
		max_imatches += j-k;

		if (too_many)
		    break;

		if (consensus_only)
		    break;

		if (!ci) {
		    ci = contig_iter_new(io,
					 contig_array[i].contig,
					 0 /*autoextend */,
					 CITER_FIRST,
					 contig_array[i].start,
					 contig_array[i].end);
		    if (!ci)
			break;
		}
	    }

	    /* One iterator per contig per pass: release it here */
	    if (ci)
		contig_iter_del(ci);

	    if (too_many)
		break;
	}

	if (too_many)
	    break;
    }

    xfree(cons_match);
    vmessage("Number of matches found %d \n", n_matches);
    return n_matches;
}