Exemplo n.º 1
0
/*
 * Returns the consensus of contig 'c_num' for the 'length' bases starting at
 * 'position', as a NUL-terminated string.
 *
 * Requests of fewer than 1024 bases are written into a function-local static
 * buffer, so the result is overwritten by the next such call. Longer requests
 * are written into a buffer of length+1 bytes obtained from xmalloc().
 * NOTE(review): the caller has to know which of the two it received before
 * deciding whether to free the result - confirm against the call sites.
 *
 * Returns NULL if the xmalloc() call fails.
 */
static char *
GetTagSequence(GapIO *io,                                             /* in */
	       int c_num,                                             /* in */
	       int position,                                          /* in */
	       int length)                                            /* in */
{
    static char short_buf[1024];
    char *out = short_buf;

    /* Only fall back to the heap when the static buffer is too small */
    if (length >= 1024) {
	out = (char *)xmalloc((length + 1) * sizeof(char ));
	if (out == NULL)
	    return NULL;
    }

    calculate_consensus_simple(io, c_num, position, position + length - 1,
			       out, NULL);
    out[length] = '\0';

    return out;
}
Exemplo n.º 2
0
/*
 * Searches the consensus of the contig shown in editor 'xx' for a sequence
 * pattern and, when a hit is found, moves the editor cursor to it.
 *
 * xx      Editor view; supplies the io handle, contig record (cnum) and the
 *         cursor position the search starts from.
 * dir     Non-zero searches forwards from the cursor, zero searches
 *         backwards.
 * strand  '+' searches for the pattern as given, '-' for the pattern after
 *         complement_seq(), '=' for both.
 * value   Search string, optionally "<string>#<N.mismatches>#<where>".
 *         The buffer is modified in place: it is cut at the first '#' and
 *         passed through depad_seq(). <where> is parsed but not otherwise
 *         used by this function.
 *
 * Returns 0 if a match was found (and the cursor moved), or -1 if no match
 * was found, memory could not be allocated or the contig could not be
 * fetched.
 */
int edview_search_consensus(edview *xx, int dir, int strand, char *value) {
    int mismatches = 0; /* exact match */
    int where = 2;      /* consensus */
    char *p;
    int start, end;
    char cons[WIN_WIDTH+1];
    int patlen;
    char *uppert, *upperb;
    int found = 0, at_end = 0;
    tg_rec fseq;
    int fpos, i, j;
    contig_t *c;

    /*
     * Parse value search string. It optionally includes two extra params
     * separated by #. Ie:
     *     <string>#<N.mismatches>#<where>.
     * <where> is 1 for readings, 2 for consensus, 3 for both.
     */
    if ((p = strchr(value, '#')) != NULL) {
	mismatches = atoi(p+1);
	*p = 0;
	if ((p = strchr(p+1, '#')) != NULL)
	    where = atoi(p+1);
    }


    /* uppercase search string, remove pads, and store fwd/rev copies */
    patlen = strlen(value);
    depad_seq(value, &patlen, NULL);
    uppert = (char *)xmalloc(patlen + 1);
    upperb = (char *)xmalloc(patlen + 1);
    if (NULL == uppert || NULL == upperb) {
	/*
	 * Release whichever copy was obtained (free(NULL) is a no-op) and
	 * report failure; 0 would be read by callers as "match found".
	 */
	free(uppert);
	free(upperb);
	return -1;
    }

    uppert[patlen] = upperb[patlen] = 0;
    for (i = patlen-1; i >= 0; i--) {
	/* <ctype.h> functions require a value representable as unsigned char */
	upperb[i] = uppert[i] = toupper((unsigned char)value[i]);
    }
    complement_seq(upperb, patlen);


    /* First window: WIN_WIDTH bases starting just past the cursor */
    if (dir) {
	start = xx->cursor_apos + 1;
	end   = start + (WIN_WIDTH-1);
    } else {
	end   = xx->cursor_apos - 1;
	start = end - (WIN_WIDTH-1);
    }
    fpos = xx->cursor_apos;
    fseq = xx->cnum;

    c = cache_search(xx->io, GT_Contig, xx->cnum);
    if (NULL == c) {
	free(uppert);
	free(upperb);
	return -1;
    }
    cache_incr(xx->io, c);

    /* Loop, one WIN_WIDTH window of consensus at a time */
    do {
	char *ind, *indt = NULL, *indb = NULL;

	calculate_consensus_simple(xx->io, xx->cnum, start, end, cons, NULL);
	cons[WIN_WIDTH] = 0;

	if (dir) {
	    if (strand == '+' || strand == '=')
		indt = pstrstr_inexact(cons, uppert, mismatches, NULL);
	    if (strand == '-' || strand == '=')
		indb = pstrstr_inexact(cons, upperb, mismatches, NULL);
	} else {
	    if (strand == '+' || strand == '=')
		indt = prstrstr_inexact(cons, uppert, mismatches, NULL);
	    if (strand == '-' || strand == '=')
		indb = prstrstr_inexact(cons, upperb, mismatches, NULL);
	}

	/*
	 * NOTE(review): when both strands hit, the lower address is taken
	 * for both search directions; for a backwards search that looks
	 * like the hit further from the cursor - confirm this is intended.
	 */
	if (indt && indb)
	    ind = MIN(indt, indb);
	else if (indt)
	    ind = indt;
	else if (indb)
	    ind = indb;
	else
	    ind = NULL;

	if (ind != NULL) {
	    /* Only accept a hit lying on the search side of the cursor */
	    if (dir) {
		if (fpos <= start + ind-cons) {
		    found = 1;
		    fpos = start + ind-cons;
		    fseq = xx->cnum;
		}
	    } else {
		if (fpos >= start + ind-cons) {
		    found = 1;
		    fpos = start + ind-cons;
		    fseq = xx->cnum;
		}
	    }
	    break;
	}

	/*
	 * Next search region - overlapping by patlen+pads, i.e. by enough
	 * characters of this window to contain patlen non-'*' bases.
	 */
	if (dir) {
	    for (i = WIN_WIDTH-1, j = patlen; j && i; i--) {
		if (cons[i] != '*')
		    j--;
	    }
	    if (i == 0)
		break; /* window would not advance */
	    start += i;
	    end   += i;

	    if (start > c->end)
		at_end = 1;
	} else {
	    for (i = 0, j = patlen; j && i < WIN_WIDTH; i++) {
		if (cons[i] != '*')
		    j--;
	    }
	    if (i == WIN_WIDTH)
		break; /* window would not advance */

	    start -= WIN_WIDTH-i;
	    end   -= WIN_WIDTH-i;

	    if (end < c->start)
		at_end = 1;
	}
    } while (!at_end);
    cache_decr(xx->io, c);

    if (found) {
	edSetCursorPos(xx, fseq == xx->cnum ? GT_Contig : GT_Seq,
		       fseq, fpos, 1);
    }

    free(uppert);
    free(upperb);

    return found ? 0 : -1;
}
Exemplo n.º 3
0
/*
 * Searches the consensus of the contig shown in editor 'xx' for the nearest
 * position whose consensus quality is below a threshold and, when one is
 * found, moves the editor cursor to it.
 *
 * xx      Editor view; supplies the io handle, contig record (cnum) and the
 *         cursor position the search starts from.
 * dir     Non-zero searches forwards from the cursor, zero searches
 *         backwards.
 * strand  Not used by this search.
 * value   Quality threshold as a decimal string (converted with atoi()).
 *
 * Returns 0 if such a position was found (and the cursor moved), or -1 if
 * none was found or the contig could not be fetched.
 */
int edview_search_consquality(edview *xx, int dir, int strand, char *value) {
    int start, end;
    float qual[WIN_WIDTH+1];
    int found = 0, at_end = 0;
    int fpos, i, qval = atoi(value);
    contig_t *c;

    /* Set initial start positions: the window begins just past the cursor */
    if (dir) {
	start = xx->cursor_apos + 1;
	end   = start + (WIN_WIDTH-1);
    } else {
	end   = xx->cursor_apos - 1;
	start = end - (WIN_WIDTH-1);
    }
    fpos = xx->cursor_apos;

    /* Loop WIN_WIDTH block at a time */
    c = cache_search(xx->io, GT_Contig, xx->cnum);
    if (NULL == c)
	return -1;
    cache_incr(xx->io, c);
    do {
	calculate_consensus_simple(xx->io, xx->cnum, start, end, NULL, qual);

	if (dir) {
	    for (i = 0; i < WIN_WIDTH; i++) {
		if (qual[i] < qval) {
		    found = 1;
		    break;
		}
	    }
	} else {
	    /*
	     * Scan right-to-left down to and including index 0; successive
	     * windows do not overlap, so skipping index 0 would leave one
	     * position per window unchecked.
	     */
	    for (i = WIN_WIDTH-1; i >= 0; i--) {
		if (qual[i] < qval) {
		    found = 1;
		    break;
		}
	    }
	}

	if (found) {
	    fpos = start + i;
	    break;
	}

	/* Next search region - the adjacent, non-overlapping window */
	if (dir) {
	    start += WIN_WIDTH;
	    end   += WIN_WIDTH;

	    if (start > c->end)
		at_end = 1;
	} else {
	    start -= WIN_WIDTH;
	    end   -= WIN_WIDTH;

	    if (end < c->start)
		at_end = 1;
	}
    } while (!at_end);
    cache_decr(xx->io, c);

    if (found) {
	edSetCursorPos(xx, GT_Contig, xx->cnum, fpos, 1);
	return 0;
    }

    return -1;
}
Exemplo n.º 4
0
/*
 * Scans through one or more contigs checking each reading for correct
 * assembly. This is simply a check for misaligned data, not looking into
 * cutoff data. (The gap4 method did this, but it hasn't yet been implemented
 * in gap5).
 *
 * For every contig in 'contigs' the consensus over [start, end] is computed
 * and each item returned by the contig iterator is passed to
 * check_uassembly_single() together with maxperc, winsize and ignore_N.
 * Items with a positive result are collected (record, contig, result * 100,
 * start position and length) and the collected arrays are handed to
 * check_assembly_plot().
 *
 * All temporary memory, including the per-contig consensus and iterator, is
 * released on every exit path.
 *
 * Returns -1 for failure, 0 for success.
 */
int check_assembly(GapIO *io, int num_contigs, contig_list_t *contigs,
		   int winsize, float maxperc, int ignore_N) {
    int i, sc, count = 0, allocated = 0;
    int ret = -1;
    char *con = NULL;
    contig_iterator *ci = NULL;
    tg_rec *reads = NULL, *conts = NULL;
    int *score = NULL, *length = NULL, *pos = NULL;

    for (i = 0; i < num_contigs; i++) {
	tg_rec crec = contigs[i].contig;
	rangec_t *r;
	int start = contigs[i].start, end = contigs[i].end;

	if (NULL == (con = (char *)xmalloc(end-start+1)))
	    goto cleanup;

	calculate_consensus_simple(io, crec, start, end, con, NULL);

	ci = contig_iter_new(io, crec, 0, CITER_FIRST, start, end);
	while (NULL != (r = contig_iter_next(io, ci))) {
	    UpdateTextOutput();
	    /* con - start lets the callee index the consensus by position */
	    sc = check_uassembly_single(io, con - start, crec, r,
					maxperc, winsize, ignore_N);
	    if (count >= allocated) {
		void *tmp;

		/*
		 * Grow all five result arrays together. Each one goes via
		 * a temporary so that the old block is still owned (and
		 * freed below) if the reallocation fails.
		 */
		allocated = allocated ? allocated * 2 : 256;
		if (NULL == (tmp = xrealloc(reads, allocated * sizeof(*reads))))
		    goto cleanup;
		reads = tmp;
		if (NULL == (tmp = xrealloc(conts, allocated * sizeof(*conts))))
		    goto cleanup;
		conts = tmp;
		if (NULL == (tmp = xrealloc(score, allocated * sizeof(*score))))
		    goto cleanup;
		score = tmp;
		if (NULL == (tmp = xrealloc(length, allocated * sizeof(*length))))
		    goto cleanup;
		length = tmp;
		if (NULL == (tmp = xrealloc(pos, allocated * sizeof(*pos))))
		    goto cleanup;
		pos = tmp;
	    }

	    if (sc > 0) {
		reads[count]   = r->rec;
		score[count]   = sc * 100;
		pos[count]     = r->start;
		length[count]  = r->end - r->start+1;
		conts[count++] = crec;
	    }
	}

	contig_iter_del(ci);
	ci = NULL;
	xfree(con);
	con = NULL;
    }

    if (-1 == check_assembly_plot(io, reads, conts, score, pos, length, count))
	goto cleanup;

    ret = 0;

 cleanup:
    /* ci and con are only non-NULL here when we left the loop early */
    if (ci)
	contig_iter_del(ci);
    if (con)
	xfree(con);
    if (reads)
	xfree(reads);
    if (conts)
	xfree(conts);
    if (pos)
	xfree(pos);
    if (length)
	xfree(length);
    if (score)
	xfree(score);

    return ret;
}
Exemplo n.º 5
0
/*
 * Searches the consensus of a set of contigs for matches to an
 * oligonucleotide string and registers the results.
 *
 * io              Database handle.
 * num_contigs     Number of entries in contig_array.
 * contig_array    Contigs, with the start/end range to search in each.
 * mis_match       Mismatch parameter, passed through to StringMatch().
 * string          Sequence to search for. When NULL or empty the function
 *                 fails, as the tag-based search is currently disabled.
 * consensus_only  Passed through to StringMatch().
 * in_cutoff       Passed through to StringMatch().
 *
 * The number of match slots allocated is twice the summed contig lengths
 * (both strands), capped by the FINDOLIGO.MAX_MATCHES default.
 *
 * All temporary memory, including the per-contig consensus strings, is
 * released on every exit path.
 *
 * Returns 0 for success, -1 for failure.
 */
int
find_oligos(GapIO *io,
	    int num_contigs,
	    contig_list_t *contig_array,
	    float mis_match,
	    char *string,
	    int consensus_only,
	    int in_cutoff)
{
    int i;
    int ret = -1;
    int *pos1 = NULL;
    int *pos2 = NULL;
    int *score = NULL;
    int *length = NULL;
    tg_rec *c1 = NULL;
    tg_rec *c2 = NULL;
    int max_matches, abs_max;
    int seq_len;
    int n_matches;
    int max_clen; /* only needed by the disabled TagMatch() call below */
    char **cons_array = NULL;

    /* Calculate maximum contig length and total contig length */
    for (max_matches = 0, max_clen = 0, i=0; i<num_contigs; i++) {
	if (io_clength(io, contig_array[i].contig) > max_clen)
	    max_clen = io_clength(io, contig_array[i].contig);
	max_matches += io_clength(io, contig_array[i].contig);
    }
    max_matches *= 2; /* both strands */

    abs_max = get_default_int(GetInterp(), gap5_defs, "FINDOLIGO.MAX_MATCHES");

    if (max_matches > abs_max)
	max_matches = abs_max;

    if (NULL == (pos1 = (int *)xmalloc((max_matches + 1) * sizeof(int))))
	goto cleanup;
    if (NULL == (pos2 = (int *)xmalloc((max_matches + 1) * sizeof(int))))
	goto cleanup;
    if (NULL == (score = (int *)xmalloc((max_matches + 1) * sizeof(int))))
	goto cleanup;
    if (NULL == (length = (int *)xmalloc((max_matches + 1) * sizeof(int))))
	goto cleanup;
    if (NULL == (c1 = (tg_rec *)xmalloc((max_matches + 1) * sizeof(tg_rec))))
	goto cleanup;
    if (NULL == (c2 = (tg_rec *)xmalloc((max_matches + 1) * sizeof(tg_rec))))
	goto cleanup;

    /* save consensus for each contig */
    if (NULL == (cons_array = (char **)xmalloc(num_contigs * sizeof(char *))))
	goto cleanup;

    /*
     * Clear every slot first so that the cleanup code can tell which
     * consensus strings were actually allocated if we fail part way.
     */
    for (i = 0; i < num_contigs; i++)
	cons_array[i] = NULL;

    for (i = 0; i < num_contigs; i++) {
	seq_len = contig_array[i].end - contig_array[i].start + 1;
	if (NULL == (cons_array[i] = (char *)xmalloc(seq_len + 1)))
	    goto cleanup;

	calculate_consensus_simple(io, contig_array[i].contig,
				   contig_array[i].start, contig_array[i].end,
				   cons_array[i], NULL);

	cons_array[i][seq_len] = '\0';
    }

    /* do match on either tag(s) or string */
    if (string && *string) {
	n_matches = StringMatch(io, num_contigs, contig_array,
				cons_array, string, mis_match, pos1, pos2,
				score, length, c1, c2, max_matches,
				consensus_only, in_cutoff);
	if (-1 == RegFindOligo(io, SEQUENCE, pos1, pos2, score, length, c1,
			       c2, n_matches))
	    goto cleanup;
    } else {
	/*
	 * Tag based matching is disabled, so this branch always fails.
	 *
	if (-1 == (n_matches = TagMatch(io, max_clen, num_contigs,
					contig_array, cons_array,
					mis_match, pos1, pos2,
					score, length, c1, c2, max_matches)))
	    goto cleanup;
	if (-1 == RegFindOligo(io, TAG, pos1, pos2, score, length, c1, c2,
			       n_matches))
	*/
	    goto cleanup;
    }

    ret = 0;

 cleanup:
    if (cons_array) {
	for (i = 0; i < num_contigs; i++) {
	    if (cons_array[i])
		xfree(cons_array[i]);
	}
	xfree(cons_array);
    }
    if (c1)
	xfree(c1);
    if (c2)
	xfree(c2);
    if (pos1)
	xfree(pos1);
    if (pos2)
	xfree(pos2);
    if (score)
	xfree(score);
    if (length)
	xfree(length);

    return ret;
}
Exemplo n.º 6
0
/*
 * Extends the right hand end of a single contig.
 *
 * The sequences overlapping the last valid consensus base are examined.
 * Those whose data between their right clip point and the contig end either
 * spans CSZ or more bases, or disagrees with the consensus at more than 5%
 * of those bases, are excluded. For the rest, base frequencies are gathered
 * for up to ESZ positions beyond the contig end and the extension length
 * with the best cumulative agreement score is chosen. The clip points of the
 * participating sequences are then moved and the consensus invalidated over
 * the affected range.
 *
 * 'dir' currently only selects the wording of the progress message; the
 * left-end case is marked below as not implemented.
 *
 * Min_depth is the minimum depth for extension. If lower then even if the
 * data matches we'll not extend further.
 *
 * Match_score (+ve) and mismatch_score (-ve) are accumulated during
 * extension to ensure that we don't extend into junk mismatching DNA.
 *
 * Returns 0 on success (including when no extension was possible), or -1 if
 * the contig, its valid range, the overlapping sequences or one of the
 * sequence records could not be obtained.
 */
static int contig_extend_single(GapIO *io, tg_rec crec, int dir, int min_depth,
				int match_score, int mismatch_score) {
    int end;
    rangec_t *r;
    int nr, i;
    contig_t *c;
    char cons[CSZ], new_cons[ESZ];
    int freqs[ESZ][4], depth[ESZ];
    double score, best_score;
    int best_pos, nseq;

    vmessage("Processing contig #%"PRIrec", %s end\n",
	     crec, dir ? "left" : "right");

    for (i = 0; i < ESZ; i++) {
	freqs[i][0] = freqs[i][1] = freqs[i][2] = freqs[i][3] = 0;
	depth[i] = 0;
    }

    c = cache_search(io, GT_Contig, crec);
    if (NULL == c) return -1;
    cache_incr(io, c);

    if (consensus_valid_range(io, crec, NULL, &end) != 0) {
	cache_decr(io, c);
	return -1;
    }

    /* The last CSZ consensus bases, so cons[CSZ-1] is position 'end' */
    calculate_consensus_simple(io, crec, end-(CSZ-1), end, cons, NULL);

    /* Start */
    /* Not implemented for now: rev complement and go again! */

    /* End */
    r = contig_seqs_in_range(io, &c, end, end, 0, &nr);
    if (!r) {
	cache_decr(io, c);
	return -1;
    }

    for (i = 0; i < nr; i++) {
	seq_t *s = cache_search(io, GT_Seq, r[i].rec);
	seq_t *sorig = s;
	int cstart, cend;
	int j, k, slen;

	if (NULL == s)
	    goto fail;

	/* Work on a complemented private copy when the read is reversed */
	if ((s->len < 0) ^ r[i].comp) {
	    s = dup_seq(s);
	    if (NULL == s)
		goto fail;
	    complement_seq_t(s);
	}

	cstart = r[i].start + s->left-1;
	cend   = r[i].start + s->right-1;

	/* Does cutoff extend to contig end, if so does it match cons? */
	if (cend < end) {
	    int mis = 0, len = 0;
	    if (end - cend >= CSZ) {
		/*
		fprintf(stderr,"Skipping #%"PRIrec" due to length of cutoff\n",
			r[i].rec);
		*/
		if (sorig != s)
		    free(s);
		r[i].rec = 0; /* Mark for removal */
		continue;
	    }

	    for (k = s->right, j = cend+1; j <= end; j++, k++) {
		//printf("%d: %c %c\n", j, s->seq[k], cons[j-(end-(CSZ-1))]);
		if (s->seq[k] != cons[j-(end-(CSZ-1))])
		    mis++;
	    }
	    len = end - cend;
	    if (100*mis/len > 5) {
		/*
		fprintf(stderr, "Skipping #%"PRIrec" due to high disagreement "
			"with consensus.\n", r[i].rec);
		*/
		if (sorig != s)
		    free(s);
		r[i].rec = 0; /* Mark for removal */
		continue;
	    }
	}

	/* So we got here, let's accumulate extension stats */
	slen = ABS(s->len);
	for (k = 0, j = end+1 - r[i].start; j < slen && k < ESZ; j++, k++) {
	    //printf("%d: %c\n", j + r[i].start, s->seq[j]);
	    if(s->seq[j] == 'N')
		continue;

	    /*
	     * NOTE(review): assumes dna_lookup[] yields 0..3 for every
	     * character other than 'N'; a larger code would index past
	     * freqs[k] - confirm against the table's definition.
	     */
	    freqs[k][dna_lookup[(uint8_t) s->seq[j]]]++;
	    depth[k]++;
	}

	if (sorig != s)
	    free(s);
    }

    score = best_score = 0;
    best_pos = 0;
    
    for (i = 0; i < ESZ; i++) {
	/*
	 * call starts at 0 so that it is well defined even when none of
	 * the four frequencies is positive at this position.
	 */
	int call = 0, best = 0, j;
	double dd;

	if (depth[i] < min_depth)
	    break;

	for (j = 0; j < 4; j++) {
	    if (best < freqs[i][j]) {
		best = freqs[i][j];
		call = j;
	    }
	}
	new_cons[i] = "ACGT"[call];

	/* Reward the fraction agreeing with the call, penalise the rest */
	dd = (double)depth[i];
	switch (call) {
	case 0:
	    score +=  freqs[i][0] / dd;
	    score -= (freqs[i][1] + freqs[i][2] + freqs[i][3]) / dd;
	    break;
	case 1:
	    score +=  freqs[i][1] / dd;
	    score -= (freqs[i][0] + freqs[i][2] + freqs[i][3]) / dd;
	    break;
	case 2:
	    score +=  freqs[i][2] / dd;
	    score -= (freqs[i][0] + freqs[i][1] + freqs[i][3]) / dd;
	    break;
	case 3:
	    score +=  freqs[i][3] / dd;
	    score -= (freqs[i][0] + freqs[i][1] + freqs[i][2]) / dd;
	    break;
	}

	if (best_score <= score) {
	    best_score = score;
	    best_pos = i+1;
	}
	/*
	printf("%3d %3d\t%c\t%3d %3d %3d %3d %7.1f\n",
	       i, depth[i], "ACGT"[call],
	       freqs[i][0], freqs[i][1], freqs[i][2], freqs[i][3],
	       score);
	*/
    }
    /* printf("Best score is %f at %d\n", best_score, best_pos); */

    /* Extend */
    nseq = 0;
    if (best_pos > 0) {
	int furthest_left = end;

	for (i = 0; i < nr; i++) {
	    seq_t *s;
	    int r_pos;
	    int rscore; /* per-read running score */

	    if (r[i].rec == 0)
		continue;

	    s = cache_search(io, GT_Seq, r[i].rec);
	    if (NULL == s)
		goto fail;
	    s = cache_rw(io, s);
	    if (NULL == s)
		goto fail;

	    if (furthest_left > r[i].start)
		furthest_left = r[i].start;

	    /*
	     * end + best_pos is the furthest right we can go, but this
	     * specific read may not be justified in reaching that far
	     * if it has too many disagreements.
	     */
	    if ((s->len > 0) ^ r[i].comp) {
		int best_r = 0, j, k;
		int len = ABS(s->len);

		//printf(">%s\t", s->name);

		r_pos = 0;
		rscore = 0;
		//for (k = s->right, j = 0; j < best_pos && k < len; j++, k++) {
		for (k = end - r[i].start + 1, j = 0; j < best_pos && k < len; j++, k++) {
		    if (new_cons[j] == toupper(s->seq[k])) {
			rscore += match_score;
			if (best_r <= rscore) {
			    best_r  = rscore;
			    r_pos = k+1;
			}
		    } else {
			rscore += mismatch_score;
		    }

		    //putchar(new_cons[j] == toupper(s->seq[k])
		    //	    ? toupper(s->seq[k])
		    //        : tolower(s->seq[k]));
		}
		//putchar('\n');

		if (s->right != r_pos) {
		    s->right  = r_pos;
		    nseq++;
		}
	    } else {
		int best_r = 0, j, k;

		//printf("<%s\t", s->name);

		r_pos = 0;
		rscore = 0;
		//for (k = s->left-2, j = 0; j < best_pos && k >= 0; j++, k--) {
		for (k = r[i].end - end - 1, j = 0; j < best_pos && k >= 0; j++, k--) {
		    char b = complement_base(s->seq[k]);
		    if (new_cons[j] == b) {
			rscore += match_score;
			if (best_r <= rscore) {
			    best_r  = rscore;
			    r_pos = k-1;
			}
		    } else {
			rscore += mismatch_score;
		    }

		    //putchar(new_cons[j] == toupper(b)
		    //	    ? toupper(b)
		    //	    : tolower(b));
		}
		//putchar('\n');

		if (s->left != r_pos+2) {
		    s->left  = r_pos+2;
		    nseq++;
		}
	    }
	}

	vmessage("    Extended by %d, adjusting %d sequence clip%s\n",
		 best_pos, nseq, nseq == 1 ? "" : "s");

	bin_invalidate_consensus(io, crec, furthest_left, end + best_pos);
    } else {
	vmessage("    Unable to extend contig\n");
    }
    free(r);

    cache_decr(io, c);
    cache_flush(io);
    return 0;

 fail:
    /* A sequence record could not be fetched, copied or made writable */
    free(r);
    cache_decr(io, c);
    return -1;
}