Beispiel #1
0
/*
 * Debug function that doesn't use curses - handy for valgrind testing.
 *
 * Prints every sequence overlapping the 80 columns starting at xpos, first
 * as returned by get_seq() and then again after complementing a private
 * copy.  All temporary allocations made here are released so that they do
 * not show up as leaks when running under valgrind.
 *
 * This function never returns: it closes 'io', runs "ps" (listing any
 * process matching g_iotest) and then calls exit(0).
 */
static void test_mode(GapIO *io, contig_t **c, int xpos) {
    rangec_t *r;
    int nr = 0, i;

    r = contig_seqs_in_range(io, c, xpos, xpos+79, CSIR_SORT_BY_X, &nr);
    if (!r)
        nr = 0;

    for (i = 0; i < nr; i++) {
        seq_t *s = get_seq(io, r[i].rec);
        seq_t *sc;

        if (!s)
            continue;

        printf("%.*s: range %d..%d seq %d+%d st=%d en=%d %.*s\n",
               s->name_len, s->name,
               r[i].start, r[i].end,
               s->pos, s->len,
               s->left, s->right,
               ABS(s->len), s->seq);

        /* Work on a private copy so the original is left untouched */
        sc = dup_seq(s);
        if (!sc)
            continue;
        complement_seq_t(sc);

        printf("%.*s: range %d..%d seq %d+%d st=%d en=%d %.*s\n",
               sc->name_len, sc->name,
               r[i].start, r[i].end,
               sc->pos, sc->len,
               sc->left, sc->right,
               ABS(sc->len), sc->seq);

        free(sc);
    }

    free(r);

    gio_close(io);
    system("ps lx | grep g_iotest | grep -v grep");
    exit(0);
}
/*
 * Attempt to find edits. It's not 100% reliable, but works for most cases.
 * We look for lowercase bases and confidence 100 and 0 (if not N, '-' or
 * '*'). We cannot find deleted bases though.
 *
 * xx      Editor view. The search covers contig xx->cnum and starts at the
 *         base next to xx->cursor_apos.
 * dir     Non-zero to search forwards (towards the contig end), zero to
 *         search backwards.
 * strand  Unused.
 * value   Unused.
 *
 * When an edit is found the editing cursor is moved onto it with
 * edSetCursorPos().
 *
 * Returns 0 if an edit was found,
 *        -1 if not (or if the contig iterator could not be created).
 */
int edview_search_edit(edview *xx, int dir, int strand, char *value) {
    int start, end;
    contig_iterator *iter;
    rangec_t *(*ifunc)(GapIO *io, contig_iterator *ci);
    rangec_t *r;
    int best_pos, found = 0;
    int fpos = 0;
    tg_rec fseq = 0;

    if (dir) {
        start = xx->cursor_apos + 1;
        end = CITER_CEND;
        iter = contig_iter_new(xx->io, xx->cnum, 1,
                               CITER_FIRST | CITER_ISTART,
                               start, end);
        ifunc = contig_iter_next;
        best_pos = INT_MAX;
    } else {
        start = CITER_CSTART;
        end = xx->cursor_apos -1;
        iter = contig_iter_new(xx->io, xx->cnum, 1,
                               CITER_LAST | CITER_IEND,
                               start, end);
        ifunc = contig_iter_prev;
        best_pos = INT_MIN;
    }

    if (!iter)
        return -1;

    while ((r = ifunc(xx->io, iter))) {
        seq_t *s, *sorig;
        char *seq, *qual;
        int seq_len, off = 0, i;

        /* Once we have a hit, stop as soon as no later item can beat it */
        if (found && dir  && r->start > best_pos)
            break;
        if (found && !dir && r->end < best_pos)
            break;

        if (NULL == (s = sorig = cache_search(xx->io, GT_Seq, r->rec)))
            break;

        /* Work in contig orientation; complement a private copy if needed */
        if (r->comp ^ (s->len < 0)) {
            if (NULL == (s = dup_seq(sorig)))
                break;
            complement_seq_t(s);
        }

        seq  = s->seq;
        qual = s->conf;
        seq_len = ABS(s->len);

        /* Skip the part of the sequence left of the search start */
        if (r->start < start) {
            off      = start - r->start;
            seq     += off;
            qual    += off;
            seq_len -= off;
        }

        for (i = 0; i < seq_len; i++) {
            /* <ctype.h> functions require an unsigned char value */
            unsigned char base = (unsigned char) seq[i];
            int pos;

            if (!(islower(base) ||
                  qual[i] == 100 ||
                  (qual[i] == 0 && base != 'N' && base != '-'
                   && base != '*')))
                continue;

            pos = r->start + i + off;
            if (dir) {
                if (best_pos > pos && pos > xx->cursor_apos) {
                    found = 1;
                    best_pos = pos;
                    fpos = i + off;
                    fseq = r->rec;
                }
                /* Leftmost edit in this sequence; later ones are further */
                break;
            } else {
                /* Keep scanning: we want the rightmost edit before cursor */
                if (best_pos < pos && pos < xx->cursor_apos) {
                    found = 1;
                    best_pos = pos;
                    fpos = i + off;
                    fseq = r->rec;
                }
            }
        }

        if (s != sorig)
            free(s);
    }

    if (found) {
        edSetCursorPos(xx, fseq == xx->cnum ? GT_Contig : GT_Seq,
                       fseq, fpos, 1);
    }

    contig_iter_del(iter);

    return found ? 0 : -1;
}
/*
 * Searches the sequences of contig xx->cnum for a (possibly inexact) match
 * to a sequence pattern, starting next to the editing cursor.
 *
 * xx      Editor view; supplies the io handle, contig and cursor position.
 * dir     Non-zero to search forwards, zero to search backwards.
 * strand  '+' to search for the pattern as given, '-' for its complement,
 *         '=' for both.
 * value   Search string. It optionally includes two extra params separated
 *         by '#', ie <string>#<N.mismatches>#<where>. The string is
 *         modified in place (truncated at the first '#' and depadded).
 *
 * When a match is found the editing cursor is moved to its first base with
 * edSetCursorPos().
 *
 * Returns 0 if a match was found,
 *        -1 if not, or on failure (iterator creation or out of memory).
 */
int edview_search_sequence(edview *xx, int dir, int strand, char *value) {
    int mismatches = 0; /* exact match */
    int where = 2;      /* consensus */
    char *p;
    int start, end;
    int patlen;
    char *uppert, *upperb;
    int found = 0;
    tg_rec fseq = 0;
    int fpos = 0, i;
    contig_iterator *iter;
    rangec_t *(*ifunc)(GapIO *io, contig_iterator *ci);
    rangec_t *r;
    int best_pos;

    if (dir) {
        start = xx->cursor_apos + 1;
        end = CITER_CEND;
        iter = contig_iter_new(xx->io, xx->cnum, 1,
                               CITER_FIRST | CITER_ISTART,
                               start, end);
        ifunc = contig_iter_next;
        best_pos = INT_MAX;
    } else {
        start = CITER_CSTART;
        end = xx->cursor_apos -1;
        iter = contig_iter_new(xx->io, xx->cnum, 1,
                               CITER_LAST | CITER_IEND,
                               start, end);
        ifunc = contig_iter_prev;
        best_pos = INT_MIN;
    }

    if (!iter)
        return -1;

    /*
     * Parse value search string. It optionally includes two extra params
     * separated by #. Ie:
     *     <string>#<N.mismatches>#<where>.
     * <where> is 1 for readings, 2 for consensus, 3 for both.
     */
    if ((p = strchr(value, '#')) != NULL) {
        mismatches = atoi(p+1);
        *p = 0;
        if ((p = strchr(p+1, '#')) != NULL)
            where = atoi(p+1);
    }
    (void) where; /* parsed, but this function does not act on it */

    /* uppercase search string, remove pads, and store fwd/rev copies */
    patlen = strlen(value);
    depad_seq(value, &patlen, NULL);
    uppert = (char *)xmalloc(patlen + 1);
    upperb = (char *)xmalloc(patlen + 1);
    if (!uppert || !upperb) {
        /* Report failure rather than "found", and don't leak anything */
        free(uppert);
        free(upperb);
        contig_iter_del(iter);
        return -1;
    }

    uppert[patlen] = upperb[patlen] = 0;
    for (i = patlen-1; i >= 0; i--) {
        upperb[i] = uppert[i] = toupper((unsigned char) value[i]);
    }
    complement_seq(upperb, patlen);

    while ((r = ifunc(xx->io, iter))) {
        seq_t *s, *sorig;
        char *ind, *indt = NULL, *indb = NULL, *seq;
        int seq_len, off = 0;

        /* Once we have a hit, stop as soon as no later item can beat it */
        if (found && dir  && r->start > best_pos)
            break;
        if (found && !dir && r->end < best_pos)
            break;

        if (NULL == (s = sorig = cache_search(xx->io, GT_Seq, r->rec)))
            break;

        /* Work in contig orientation; complement a private copy if needed */
        if (r->comp ^ (s->len < 0)) {
            if (NULL == (s = dup_seq(sorig)))
                break;
            complement_seq_t(s);
        }

        seq = s->seq;
        seq_len = ABS(s->len);

        /* Restrict the searched region to the start..end range */
        if (r->start < start) {
            off      = start - r->start;
            seq     += off;
            seq_len -= off;
        }
        if (r->end - (patlen-1) > end)
            seq_len -= r->end - (patlen-1) - end;

        if (dir) {
            if (strand == '+' || strand == '=')
                indt = pstrnstr_inexact(seq, seq_len, uppert, patlen,
                                        mismatches, NULL);
            if (strand == '-' || strand == '=')
                indb = pstrnstr_inexact(seq, seq_len, upperb, patlen,
                                        mismatches, NULL);
        } else {
            if (strand == '+' || strand == '=')
                indt = prstrnstr_inexact(seq, seq_len, uppert, patlen,
                                         mismatches, NULL);
            if (strand == '-' || strand == '=')
                indb = prstrnstr_inexact(seq, seq_len, upperb, patlen,
                                         mismatches, NULL);
        }

        /*
         * With hits on both strands keep the one nearest the cursor: the
         * leftmost when searching forwards, the rightmost when searching
         * backwards.
         */
        if (indt && indb) {
            if (dir)
                ind = indt < indb ? indt : indb;
            else
                ind = indt > indb ? indt : indb;
        } else if (indt) {
            ind = indt;
        } else {
            ind = indb; /* may be NULL: no hit in this sequence */
        }

        if (ind) {
            int pos =  r->start + ind - seq + off;
            if (dir ? best_pos > pos : best_pos < pos) {
                found = 1;
                best_pos = pos;
                fpos = ind - s->seq;
                fseq = r->rec;
            }
            //printf("Matches #%"PRIrec": at abs pos %d\n", r->rec, pos);
        }

        if (s != sorig)
            free(s);
    }

    if (found) {
        edSetCursorPos(xx, fseq == xx->cnum ? GT_Contig : GT_Seq,
                       fseq, fpos, 1);
    }

    free(uppert);
    free(upperb);

    contig_iter_del(iter);

    return found ? 0 : -1;
}
/*
 * Checks a single reading for correct assembly by analysing the used
 * (unclipped) data: a window of win_len bases is slid along the used part
 * of the reading and the number of disagreements with the consensus is
 * counted for each window position.
 *
 * io        Gap5 database handle.
 * cons      Consensus; indexed here as cons[r->start + <0-based offset
 *           into the reading>].
 * contig    Unused.
 * r         Range entry of the reading to check (rec, start, comp).
 * maxperc   Mismatch threshold as a fraction of the window length; a
 *           window is reported when its mismatch count reaches
 *           0.5 + maxperc * win_len (truncated to int).
 * win_len   Window length. Reduced to fit if the used data is shorter.
 * ignore_N  If non-zero, reading bases outside ACGTU and pad characters
 *           never count as mismatches.
 *
 * Returns -1 for system error,
 *          0 if no window reached the threshold (or the used data is too
 *            short to hold a window),
 *          otherwise 10000 * worst_mismatch_count / win_len, after printing
 *          a report with vmessage().
 *
 * The position and length of the problem are not reported to the caller.
 */
int check_uassembly_single(GapIO *io, char *cons, int contig, rangec_t *r,
                           float maxperc, int win_len, int ignore_N) {
    int start, end;
    unsigned char *seq = NULL;
    unsigned char *con = (unsigned char *) cons;
    int i, j, mism = 0;
    int worst, worst_pos = -1;
    int score = 0;
    seq_t *s, *sorig;
    static int lookup[256];
    static int lookup_done = 0;

    /* Base -> class table; 0 means "unknown", which ignore_N skips */
    if (!lookup_done) {
        for (i = 0; i < 256; i++)
            lookup[i] = 0;
        lookup['A'] = lookup['a'] = 1;
        lookup['C'] = lookup['c'] = 2;
        lookup['G'] = lookup['g'] = 3;
        lookup['T'] = lookup['t'] = 4;
        lookup['U'] = lookup['u'] = 4;
        lookup['*'] = lookup[','] = lookup['-'] = 5;
        lookup_done = 1;
    }

    /* Get sequence */
    if (!(sorig = s = cache_search(io, GT_Seq, r->rec)))
        return -1;

    /* Complement data on-the-fly */
    if ((s->len < 0) ^ r->comp) {
        if (!(s = dup_seq(sorig)))
            return -1;
        complement_seq_t(s);
    }
    seq = (unsigned char *) s->seq;

    start = s->left;
    end = s->right;

    /* Initialise scoring for divergence checks */
    if (end - start - 1 < win_len) {
        win_len = end - start - 1;
    }
    if (win_len < 1) {
        /*
         * Too little used data for even a one base window. Carrying on
         * would divide by win_len == 0 when computing the score.
         */
        goto done;
    }
    worst = 0.5 + maxperc * win_len;

    /* Mismatch count for the first window */
    for (i=start-1, j=r->start + start-1; i < start-1 + win_len; i++, j++) {
        if (ignore_N) {
            if (lookup[seq[i]] && lookup[seq[i]] != lookup[con[j]])
                mism++;
        } else {
            if (lookup[seq[i]] != lookup[con[j]])
                mism++;
        }
    }

    /*
     * Loop through sequence looking for problems: slide the window one base
     * at a time, removing the base leaving on the left and adding the one
     * entering on the right.
     */
    if (ignore_N) {
        do {
            if (mism >= worst) {
                worst_pos = i;
                worst = mism;
            }

            mism -= lookup[seq[i-win_len]] &&
                lookup[seq[i-win_len]] != lookup[con[j-win_len]];
            i++; j++;

            if (i < end-1)
                mism += lookup[seq[i]] && lookup[seq[i]] != lookup[con[j]];
        } while (i < end);
    } else {
        do {
            if (mism >= worst) {
                worst_pos = i;
                worst = mism;
            }

            mism -= lookup[seq[i-win_len]] != lookup[con[j-win_len]];
            i++; j++;

            if (i < end-1)
                mism += lookup[seq[i]] != lookup[con[j]];
        } while (i < end);
    }

    /* Display problem, listing worst score */
    if (worst_pos != -1) {
        //*pos_p = io_relpos(io, rn);
        //*len_p = end - start + 1;

        vmessage("\nReading #%"PRIrec"(%s) has a local percentage "
                 "mismatch of %2.1f\n",
                 s->rec, s->name, 100 * (float)worst / win_len);
        vmessage("SEQ: %.*s\n", end-start+1, &seq[start-1]);
        vmessage("CON: %.*s\n", end-start+1, &con[r->start + start-1]);

        score = 10000 * (float)worst / win_len;
    }

 done:
    if (sorig != s)
        xfree(s);

    return score;
}
Beispiel #5
0
/**
 * Builds and returns MALIGN from a Gap5 IO handle for the contig 'cnum'.
 *
 * The requested start..end range is first widened using the clipped extents
 * of a sequence found at 'start' and of one found at 'end'. The clipped
 * portion of every sequence in the widened range is then copied into a
 * CONTIGL list which is handed to contigl_to_malign().
 *
 * Any '.' in the sequence data is replaced by 'N' in the copy, as '.' is
 * used as a pad symbol by the alignment code.
 *
 * As a side effect, sequences with out-of-bounds or inverted clip points
 * have their left/right values corrected in the cached record.
 *
 * Returns the result of contigl_to_malign() on success,
 *         NULL on failure (iterator/cache lookup failure or out of memory).
 *         NOTE(review): on failure the CONTIGL nodes already linked are not
 *         released, as no destructor for them is visible from here.
 */
MALIGN *build_malign(GapIO *io, tg_rec cnum, int start, int end) {
    CONTIGL *contig, *first_contig = NULL, *last_contig = NULL;
    int i, j;
    contig_iterator *citer;
    rangec_t *r;
    seq_t *s;

    /* Expand start and end to the range covered by seqs overlapping
     * start .. end
     */
    citer = contig_iter_new(io, cnum, 0,
                            CITER_FIRST | CITER_ICLIPPEDSTART,
                            start, start);
    if (citer) {
        r = contig_iter_next(io, citer);
        if (r && (s = cache_search(io, GT_Seq, r->rec)) != NULL) {
            start = ((s->len < 0) ^ r->comp)
                ? r->end - s->right - 2
                : r->start + s->left - 2;
        }

        contig_iter_del(citer);
    }

    /*
     * NOTE(review): this iterator is created with CITER_LAST but stepped
     * with contig_iter_next(); confirm that is the intended pairing.
     */
    citer = contig_iter_new(io, cnum, 0,
                            CITER_LAST | CITER_ICLIPPEDEND,
                            end, end);
    if (citer) {
        r = contig_iter_next(io, citer);
        if (r && (s = cache_search(io, GT_Seq, r->rec)) != NULL) {
            end = ((s->len < 0) ^ r->comp)
                ? r->end - s->left + 2
                : r->start + s->right + 2;
        }

        contig_iter_del(citer);
    }

    //printf("Generating data for %d..%d\n", start, end);

    /* Generate contigl linked list */
    //citer = contig_iter_new(io, cnum, 1, CITER_FIRST, CITER_CSTART, CITER_CEND);
    citer = contig_iter_new(io, cnum, 0, CITER_FIRST, start, end);
    if (!citer)
        return NULL;

    while ((r = contig_iter_next(io, citer))) {
        seq_t *sorig;
        char *seq;
        int len;

        assert((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISSEQ);

        sorig = s = cache_search(io, GT_Seq, r->rec);
        if (!s)
            goto fail;

        /* Check for out-of-bounds clip points.  It shouldn't happen, but
           gap5 databases have been seen with this problem, and we
           don't want to crash if there are any.
           NOTE(review): these two writes are made without a prior
           cache_rw(), unlike the zero-length fix below. */
        if (s->left < 1)            s->left = 1;
        if (s->right > ABS(s->len)) s->right = ABS(s->len);

        /* Fix reads of zero length */
        if (s->right < s->left) {
            sorig = s = cache_rw(io, s);
            if (!s)
                goto fail;
            s->right = s->left;
            if (s->right > ABS(s->len))
                s->left = s->right = ABS(s->len);
        }

        /* Work in contig orientation; complement a private copy if needed */
        if ((s->len < 0) ^ r->comp) {
            if (NULL == (s = dup_seq(sorig)))
                goto fail;
            complement_seq_t(s);
        }

        len = s->right - s->left + 1;
        if (NULL == (seq = malloc(len+1))) {
            if (s != sorig)
                free(s);
            goto fail;
        }

        for (j = 0, i = s->left-1; i < s->right; i++, j++) {
            /* Protect against the sequence containing "."; our pad sym */
            if (s->seq[i] == '.')
                seq[j] = 'N';
            else
                seq[j] = s->seq[i];
        }
        seq[j] = 0;

        /*
         * Only create the list node once nothing else can fail, so that the
         * error paths above have no half-built node to dispose of.
         */
        contig = create_contig_link();
        contig->id = r->rec;
        contig->mseg = create_mseg();

        init_mseg(contig->mseg, seq, len, r->start-1 + s->left-1);
        contig->mseg->comp = (s != sorig);

        if (last_contig) {
            last_contig->next = contig;
        } else {
            first_contig = contig;
        }
        last_contig = contig;

        if (s != sorig)
            free(s);
    }
    contig_iter_del(citer);

    /* for 454 data -6 to -10 seem to work fine */
    return contigl_to_malign(first_contig, -7, -7);

 fail:
    contig_iter_del(citer);
    return NULL;
}
Beispiel #6
0
/*
 * Takes a multiple alignment and updates the on-disk data structures to
 * match. This needs to correct confidence values, original positions and
 * tags too.
 *
 * io      Gap5 database handle.
 * cnum    Record number of the contig being updated.
 * malign  The alignment; each entry in malign->contigl carries a sequence
 *         record number (id) plus the new clipped sequence, its length,
 *         its offset and whether it was complemented (mseg).
 * indels  Array of con_indel_t. For each element, size > 0 inserts that
 *         many '*' columns at pos+1, size < 0 deletes -size pad columns
 *         at pos+1.
 *
 * Nothing is returned. If the new sequence differs from the old one by
 * anything other than added/removed pads the function calls abort().
 *
 * NOTE(review): results of cache_search(), cache_rw(), dup_seq() and
 * malloc() are used without NULL checks throughout.
 */
void update_io(GapIO *io, tg_rec cnum, MALIGN *malign, Array indels) {
    CONTIGL *cl;
    tg_rec rnum;
    range_t r, *r_out;
    bin_index_t *bin;
    contig_t *c = cache_search(io, GT_Contig, cnum);
    size_t i, nindel;

    /* Balanced by the cache_decr(io, c) at the end of the function */
    cache_incr(io, c);

    /*
     * To minimise number of data modifications we use a three step approach.
     *
     * Step 1: insert columns of pads, shifting reads as appropriate.
     * Step 2: edit sequence alignments as required, possibly involving
     *         moving sequences and/or adding and removing pads.
     * Step 3: remove columns of entire pads.
     *
     * This means that when we introduce a column of pads we don't have
     * to make edits to every single read position down stream, and can
     * instead make use of the optimised recursive bin functions to do this
     * for us.
     */

    /* Step 1: make indels */
    nindel = ArrayMax(indels);
    for (i = 0; i < nindel; i++) {
	con_indel_t *id = arrp(con_indel_t, indels, i);
	int j;

	if (id->size > 0) {
	    contig_insert_bases(io, &c, id->pos+1, '*', -1, id->size);
	} else {
	    /* Negative size: remove -size pads, one call per pad */
	    for (j = 0; j < -id->size; j++) {
		contig_delete_pad(io, &c, id->pos+1);
	    }
	}
    }

    /* Step 2: edit alignments */
    for (cl = malign->contigl; cl; cl = cl->next) {
	seq_t *s, *sorig;
	int len, update_range = 0;
	int shift;

	rnum = cl->id;
	
	/*
	 * sorig is the cached record; s is a private copy which is put into
	 * the same orientation as the alignment data before comparing.
	 */
	sorig = cache_search(io, GT_Seq, rnum);
	cache_incr(io, sorig);
	s = dup_seq(sorig);
	if (cl->mseg->comp)
	    complement_seq_t(s);

	/* Length of the clipped (left..right) portion of the old sequence */
	len = s->right - s->left + 1;

	/* Check if sequence has changed. If so assign a new one */
	if (cl->mseg->length != len ||
	    memcmp(s->seq + s->left-1, cl->mseg->seq, cl->mseg->length) != 0) {
	    /* New total = left cutoff + right cutoff + new clipped length */
	    int newlen = s->left-1 + ABS(s->len) - s->right + cl->mseg->length;
	    int i, j, np;
	    /* NOTE(review): allocated with malloc() but released with xfree() */
	    char   *newseq  = malloc(newlen+1);
	    int8_t *newconf = malloc(newlen+1);

	    /* Build new seq/conf arrays */
	    /* Left cutoff data is carried over unchanged */
	    memcpy(newseq,  s->seq,  s->left-1);
	    memcpy(newconf, s->conf, s->left-1);

	    /* Followed by the realigned clipped sequence */
	    memcpy(&newseq[s->left-1], cl->mseg->seq, cl->mseg->length);

	    /*
	     * Step through both old and new sequences working out how
	     * they differ. This will (*should*) be entire pad movements.
	     * i = index to old seq
	     * j = index to new seq
	     * np = number of pads added minus removed from old seq.
	     */
	    np = 0;
	    for (i =j =s->left-1;
		 i < ABS(s->len) && j < s->left-1 + cl->mseg->length;
		 ) {
		/* Bases match */
		/* ('.' in the old data may appear as 'N' in the new data) */
		if (toupper(newseq[j]) == toupper(s->seq[i]) ||
		    (s->seq[i] == '.' && newseq[j] == 'N')) {
		    /* Keep the case of the old base and copy its confidence */
		    if (isupper(s->seq[i]))
			newseq[j] = toupper(newseq[j]);
		    else
			newseq[j] = tolower(newseq[j]);
		    newconf[j] = s->conf[i];
		    i++, j++;
		    continue;
		}

		/* Pad removed */
		if (s->seq[i] == '*') {
		    i++;
		    tag_shift_for_delete(io, cnum, rnum, cl->mseg->offset,
					 cl->mseg->length, i+np--,
					 s->bin);
		    /*
		    if (io_length(io, rnum) < 0) {
			tag_shift_for_delete(io, rnum, r.length - i + 1);
		    } else {
			tag_shift_for_delete(io, rnum, i+np--);
		    }
		    */
		    continue;
		}

		/* Pad created */
		if (newseq[j] == '*') {
		    int k;
		    int ql = 0, qr = 0;
		    /* Confidence of the nearest non-pad base to the left... */
		    for (k = i-1; k >= 0; k--) {
			if (s->seq[k] != '*') {
			    ql = s->conf[k];
			    break;
			}
		    }
		    /* ...and to the right (within the clipped data) */
		    for (k = i+1; k < s->right; k++) {
			if (s->seq[k] != '*') {
			    qr = s->conf[k];
			    break;
			}
		    }
		    newconf[j] = MIN(ql, qr); /* min conf of neighbours */
		    j++;
		    tag_shift_for_insert(io, cnum, rnum, cl->mseg->offset,
					 cl->mseg->length, i+ ++np,
					 s->bin);
		    /*
		    if (io_length(io, rnum) < 0) {
			tag_shift_for_insert(io, rnum, r.length - i + 1);
		    } else {
			tag_shift_for_insert(io, rnum, i+ ++np);
		    }
		    */
		    continue;
		}

		/* Neither a match nor a pad change: the data is inconsistent */
		/* NOTE(review): message has no trailing newline */
		fprintf(stderr, "Alignment introduced non-pad character");
		abort();
	    }

	    /* Pads previously at the end of the reading & now removed */
	    while (i < s->right) {
		if (s->seq[i] == '*') {
		    i++;
		    tag_shift_for_delete(io, cnum, rnum, cl->mseg->offset,
					 cl->mseg->length, i+np--,
					 s->bin);
		    /*
		    if (io_length(io, rnum) < 0) {
			tag_shift_for_delete(io, rnum, r.length - i + 1);
		    } else {
			tag_shift_for_delete(io, rnum, i+np--);
		    }
		    */
		} else {
		    /* Error: clipped data that wasn't a pad */
		    abort();
		}
	    }

	    /* Should only be pads remaining in newseq, if anything */
	    /* The right clip point is set before any such trailing pads */
	    s->right = j;
	    for (; j < s->left-1 + cl->mseg->length; j++) {
		if (newseq[j] != '*') {
		    fprintf(stderr, "Alignment introduced non-pad character");
		    abort();
		}
		newconf[j] = 0;
	    }

	    /* Append on the right hand cutoff data */
	    for (; i < ABS(s->len); i++, j++) {
		newseq[j]  = s->seq[i];
		newconf[j] = s->conf[i];
	    }
	    /* Sanity check: we must have filled exactly newlen entries */
	    if (j != newlen) {
		abort();
	    }

	    /* Write it back out */
	    /* Copy newseq/newconf into seq_t */

	    /*
	     * From here until free(s) below, s->seq and s->conf point at the
	     * temporary newseq/newconf buffers rather than s's own data.
	     */
	    s->seq = newseq;
	    s->conf = newconf;
	    update_range = 0;
	    if (ABS(s->len) != j) {
		/* Length change implies updating the range array too */
		s->len = s->len >= 0 ? j : -j;
		update_range = 1;
	    }

	    /* Return the copy to the orientation of the stored record */
	    if (cl->mseg->comp)
		complement_seq_t(s);

	    /* The memcpy trashes the block pointer, so special care needed */
	    {
		sorig = cache_rw(io, sorig);
		void *blk = sorig->block;
		memcpy(sorig, s, sizeof(seq_t)); 
		sorig->block = blk;
	    }

	    /* Only a length change requires the cached item to be resized */
	    if (update_range)
		sorig = cache_item_resize(sorig, sizeof(*sorig) +
					  sequence_extra_len(sorig));

	    /*
	     * NOTE(review): presumably re-derives sorig's name/seq/conf
	     * pointers, which the memcpy above overwrote with s's values;
	     * the variable-length fields are then copied across from s.
	     */
	    sequence_reset_ptr(sorig);

	    if (s->name)
		memcpy(sorig->name,       s->name,       s->name_len+1);
	    if (s->trace_name)
		memcpy(sorig->trace_name, s->trace_name, s->trace_name_len+1);
	    if (s->alignment)
		memcpy(sorig->alignment,  s->alignment,  s->alignment_len+1);
	    memcpy(sorig->seq,  s->seq,  ABS(s->len));
	    memcpy(sorig->conf, s->conf, ABS(s->len));

	    xfree(newconf);
	    xfree(newseq);
	}

	/*
	 * Work out where the clipped data currently starts in the contig and
	 * compare with where the alignment says it should be. 'shift' is the
	 * length of the cutoff data preceding the clipped data in contig
	 * orientation.
	 */
	{
	    int st, en, or;
	    sequence_get_position(io, s->rec, NULL, &st, &en, &or);
	    if (or ^ (sorig->len < 0)) {
		shift = ABS(sorig->len) - sorig->right;
	    } else {
		shift = sorig->left-1;
	    }
	    st += shift;
	    if (st != cl->mseg->offset+1) {
		update_range = 1;
	    }
	}

	/* Done with the private copy; from here on only sorig is used */
	free(s);

	if (update_range) {
	    int bin_changed = 0;

	    /* Get old range and pair data */
	    s = sorig;
	    bin = cache_search(io, GT_Bin, s->bin);
	    r = *arrp(range_t, bin->rng, s->bin_index);
	    assert(r.rec == s->rec);

	    /* Update range, tedious and slow way */
	    /* i.e. remove the item and add it back at its new coordinates */
	    bin_remove_item(io, &c, GT_Seq, s->rec);
	    r.start = cl->mseg->offset + 1 - shift;
	    r.end   = r.start + ABS(s->len) - 1;
	    bin = bin_add_range(io, &c, &r, &r_out, NULL, 0);

	    /* Check if the new bin has a different complemented status too */
	    if (s->bin != bin->rec) {
		int old_comp = bin_get_orient(io, s->bin);
		int new_comp = bin_get_orient(io, bin->rec);

		if (new_comp != old_comp) {
		    //int tmp;
		    /* Flip the stored orientation to match the new bin */
		    s = cache_rw(io, s);
		    s->len *= -1;
		    s->flags ^= SEQ_COMPLEMENTED;
		    //tmp = s->left;
		    //s->left  = ABS(s->len) - (s->right-1);
		    //s->right = ABS(s->len) - (tmp-1);
		}

		bin_changed = 1;
	    }
	
	    /* Update seq bin & bin_index fields */
	    s = cache_rw(io, s);
	    s->bin = bin->rec;
	    s->bin_index = r_out - ArrayBase(range_t, bin->rng);

	    /* A failure here is reported via verror() but is not fatal */
	    if (bin_changed) {
		if (-1 == sequence_fix_anno_bins(io, &s)) {
		    verror(ERR_WARN, "update_io",
			   "sequence_fix_anno_bins() failure");
		}
	    }
	}

	/* Balances the cache_incr(io, sorig) at the top of the loop */
	cache_decr(io, sorig);
    }

    /* Step 3 (remove pad columns) done in calling function. */

    cache_decr(io, c);
}
Beispiel #7
0
/*
 * Compute a basic non-weighted consensus. We simply pick the basecall
 * most frequently used.
 *
 * io    Gap5 database handle, used to fetch each sequence with get_seq().
 * r     Array of 'nr' range entries for the sequences to include.
 * nr    Number of elements in r.
 * xpos  Contig position of the first consensus column.
 * wid   Number of consensus columns to compute.
 * cons  Output buffer of at least 'wid' bytes. It is NOT nul terminated.
 *       Each column holds one of "ACGT*N"; columns with no data are 'N'.
 *
 * Only the clipped (left..right) part of each sequence is counted.
 *
 * Returns 0 on success (including wid <= 0, where nothing is written),
 *        -1 if the working memory could not be allocated.
 *
 * FIXME: use a weighted sum based on confidence values instead?
 */
int calc_cons(GapIO *io, rangec_t *r, int nr, int xpos, int wid,
              char *cons) {
    int i, j;
    int (*cvec)[6];

    if (wid <= 0)
        return 0;

    /* Per column counts for A, C, G, T, pad and "other" */
    cvec = calloc((size_t)wid, sizeof(*cvec));
    if (!cvec)
        return -1;

    if (!lookup_done) {
        memset(lookup, 5, 256);
        lookup_done = 1;
        lookup['A'] = lookup['a'] = 0;
        lookup['C'] = lookup['c'] = 1;
        lookup['G'] = lookup['g'] = 2;
        lookup['T'] = lookup['t'] = 3;
        lookup['*'] = lookup[','] = 4;
    }

    /* Accumulate */
    for (i = 0; i < nr; i++) {
        int sp = r[i].start;
        seq_t *s = get_seq(io, r[i].rec);
        seq_t *sorig = s;
        unsigned char *seq;
        int left, right;

        if (!s)
            continue;

        /* Complement data on-the-fly */
        if ((s->len < 0) ^ r[i].comp) {
            s = dup_seq(sorig);
            if (!s)
                continue;
            complement_seq_t(s);
        }

        seq = (unsigned char *)s->seq;
        left = s->left;
        right = s->right;

        /* Sequence starts left of the window: skip the hidden part */
        if (sp < xpos) {
            seq   += xpos - sp;
            left  -= xpos - sp;
            right -= xpos - sp;
            sp = xpos;
        }
        if (left < 1)
            left = 1;

        /* Column index grows with j, so stop at the window's right edge */
        for (j = left-1; j < right && sp-xpos+j < wid; j++)
            cvec[sp-xpos+j][lookup[seq[j]]]++;

        if (s != sorig)
            free(s);
    }

    /* and speculate :-) */
    for (i = 0; i < wid; i++) {
        /* Defaults to 'N' when nothing was counted for this column */
        int max, max_base = 5;
        for (max = j = 0; j < 6; j++) {
            if (max < cvec[i][j]) {
                max = cvec[i][j];
                max_base = j;
            }
        }
        cons[i] = "ACGT*N"[max_base];
    }

    free(cvec);

    return 0;

}
Beispiel #8
0
/*
 * Draws one screenful of the contig: a consensus line, a position ruler
 * and then one line per sequence.
 *
 * io           Gap5 database handle.
 * c            The contig to display.
 * xpos         Contig position to centre the display on.
 * ypos         Number of leading sequences to skip (vertical scroll).
 * nlines       Number of output lines available, including the two header
 *              lines.
 * wid          Total display width in columns, including the name column.
 * mode         Bit mask of DISPLAY_COLOURS, DISPLAY_DIFFS, DISPLAY_CUTOFFS
 *              and DISPLAY_QUAL.
 * qual_cutoff  With DISPLAY_QUAL, bases with at least this confidence are
 *              shown in bold.
 * in_curses    Non-zero to draw with curses, zero to print to stdout.
 *
 * Nothing is drawn if 'wid' leaves no room beside the name column. The
 * sequence area is limited to what fits in the local line buffers.
 */
static void display_gap(GapIO *io, contig_t **c, int xpos, int ypos,
                        int nlines, int wid, int mode, int qual_cutoff,
                        int in_curses) {
    rangec_t *r;
    int i, nr = 0, lno, y;
    char line[1024], *lp;
    char cons[1024];
    int attr;
    static int lookup_1conf[256];
    static int lookup_4conf[256];
    static int lookup_init = 0;

    if (!lookup_init) {
        for (i = 0; i < 256; i++)
            lookup_1conf[i] = lookup_4conf[i] = 0;

        lookup_4conf['a'] = lookup_4conf['A'] = 0;
        lookup_4conf['c'] = lookup_4conf['C'] = 1;
        lookup_4conf['g'] = lookup_4conf['G'] = 2;
        lookup_4conf['t'] = lookup_4conf['T'] = 3;

        lookup_init = 1;
    }

    wid -= MAX_NAME_LEN+2;

    /*
     * cons[] receives 'wid' bytes and line[] up to wid+29 (one 10 char
     * field per multiple of 10 in a wid+19 wide range, plus the nul), so
     * keep wid within what both buffers can hold.
     */
    if (wid < 1)
        return;
    if (wid > (int)sizeof(line) - 30)
        wid = (int)sizeof(line) - 30;

    //if (xpos < wid/2 + (*c)->start)
    //	xpos = wid/2 + (*c)->start;

    xpos -= wid/2;

    /* Query visible objects */
    r = contig_seqs_in_range(io, c, xpos, xpos+wid-1, CSIR_SORT_BY_X, &nr);
    if (!r)
        nr = 0;

    /* Consensus */
    calc_cons(io, r, nr, xpos, wid, cons);
    if (in_curses) {
        clear();
        mvaddnstr(0, 1, contig_get_name(c), strlen(contig_get_name(c)));
        mvaddnstr(0, MAX_NAME_LEN+2, cons, wid);
    } else {
        printf(" %-*s %.*s\n", MAX_NAME_LEN, contig_get_name(c), wid, cons);
    }

    /* Position */
    for (lp = line, i = xpos; i < xpos+wid+19; i++) {
        if (i % 10 == 0) {
            sprintf(lp, "%10d", i-10);
            lp += 10;
        }
    }
    /*
     * NOTE(review): the two branches below use different start offsets
     * into line[] (10+m with m made non-negative, versus 9+(xpos-1)%10);
     * confirm which is intended.
     */
    if (in_curses) {
        int m = (xpos-1)%10;
        if (m < 0) m += 10;
        mvaddnstr(1, MAX_NAME_LEN+2, line+10+m, wid);
    } else {
        printf("%*s%.*s\n", MAX_NAME_LEN+2, "", wid,
               line+9+((xpos-1)%10));
    }


    /* Sequences */
    /* Skip the first ypos sequences */
    for (i = y = 0; i < nr && y < ypos; i++, y++);
    for (lno = 2; i < nr && lno < nlines; i++, lno++) {
        seq_t *s = get_seq(io, r[i].rec);
        seq_t *sorig = s;
        int sp = r[i].start;
        int l;
        unsigned char seq_a[MAX_SEQ_LEN], *seq = seq_a;
        int j, dir = '+';
        int left, right;
        char *conf;
        int nc;
        int *L;

        if (!s)
            continue;

        l  = s->len > 0 ? s->len : -s->len;
        nc = s->format == SEQ_FORMAT_CNF4 ? 4 : 1;
        L  = s->format == SEQ_FORMAT_CNF4 ? lookup_4conf : lookup_1conf;

        /* Never copy more than the local buffer can hold */
        if (l > MAX_SEQ_LEN)
            l = MAX_SEQ_LEN;

        /* Complement data on-the-fly */
        if ((s->len < 0) ^ r[i].comp) {
            s = dup_seq(sorig);
            if (!s)
                continue;
            dir = '-';
            complement_seq_t(s);
        }

        left = s->left;
        right = s->right;

        /* Local copy, as the bases are edited below for display */
        memcpy(seq, s->seq, l);
        conf = s->conf;

        /* Sequence starts left of the window: skip the hidden part */
        if (sp < xpos) {
            seq   += xpos - sp;
            conf  += nc * (xpos - sp);
            l     -= xpos - sp;
            left  -= xpos - sp;
            right -= xpos - sp;
            sp = xpos;
        }
        if (l > wid - (sp-xpos))
            l = wid - (sp-xpos);

        if (in_curses) {
            /* Test of sequence_get_position */
            /*
              int c, p;
              sequence_get_position(io, r[i].rec, &c, &p);
              s->name_len = sprintf(s->name, ":%d-%d:", p, p+ABS(s->len)-1);
            */
            mvaddch(lno, 0, dir);
            addnstr(s->name, MIN(MAX_NAME_LEN, s->name_len));
            move(lno, MAX_NAME_LEN+2+sp-xpos);
        } else {
            printf("%c%.*s%*s",
                   dir,
                   MIN(MAX_NAME_LEN, s->name_len), s->name,
                   MAX_NAME_LEN+1-MIN(MAX_NAME_LEN, s->name_len) +sp-xpos, "");
        }

        for (j = 0; j < l; j++) {
            /* Colour is chosen from the base before it is altered below */
            attr = (mode & DISPLAY_COLOURS) ? COLOR_PAIR(lookup[seq[j]]) : 0;

            if (mode & DISPLAY_DIFFS
                    && sp-xpos+j < wid && seq[j] == cons[sp-xpos+j])
                seq[j] = '.';
            if (j < left-1 || j > right-1)
                seq[j] = (mode & DISPLAY_CUTOFFS) ? tolower(seq[j]) : ' ';

            if (conf[j*nc+L[seq[j]]] >= qual_cutoff && mode & DISPLAY_QUAL) {
                attr |= A_BOLD;
            }

            if (in_curses) {
                addch(seq[j] | attr);
            } else {
                putchar(seq[j]);
            }
        }

        if (!in_curses)
            putchar('\n');

        if (s != sorig)
            free(s);
    }

    /* Useful debugging code to show bin locations. */
#if 0
    free(r);
    r = contig_bins_in_range(io, c, xpos, xpos+wid-1, &nr);
    /* Bins */
    for (i=0; i < nr && lno < nlines; i++, lno++) {
        bin_index_t *bin = (bin_index_t *)cache_search(io, GT_Bin, r[i].rec);
        unsigned char *seq, *seqm;
        int j, dir = "+-"[r[i].comp];
        int sp = r[i].start;
        int l = ABS(r[i].end - r[i].start + 1);
        char name[100];

        sprintf(name, "bin-%d", bin->rec);
        seqm = seq = malloc(l+1);
        memset(seq, '-', l);

        if (!(bin->start_used == 0 && bin->end_used == 0)) {
            if (r[i].comp) {
                memset(&seq[bin->size - bin->end_used - 1], '=',
                       bin->end_used - bin->start_used + 1);
            } else {
                memset(&seq[bin->start_used], '=',
                       bin->end_used - bin->start_used + 1);
            }
        }

        /*
        fprintf(stderr, "Bin-%d: %d+%d %d..%d\n",
        	bin->rec,
        	bin->pos, bin->size,
        	bin->start_used, bin->end_used);
        */

        if (sp < xpos) {
            seq   += xpos - sp;
            l     -= xpos - sp;
            sp = xpos;
        }
        if (l > wid - (sp-xpos))
            l = wid - (sp-xpos);

        if (in_curses) {
            mvaddch(lno, 0, dir);
            addnstr(name, strlen(name));
            move(lno, MAX_NAME_LEN+2+sp-xpos);
        } else {
            printf("%c%.*s%*s",
                   dir,
                   (int)MIN(MAX_NAME_LEN, strlen(name)),
                   name,
                   (int)(MAX_NAME_LEN+1-MIN(MAX_NAME_LEN,
                                            strlen(name)) +sp-xpos),
                   "");
        }

        for (j = 0; j < l; j++) {
            if (in_curses) {
                addch(seq[j]);
            } else {
                putchar(seq[j]);
            }
        }

        if (!in_curses)
            putchar('\n');

        free(seqm);
    }
#endif

    if (in_curses)
        refresh();

    free(r);
}
/*
 * Extends the right hand end of a single contig.
 *
 * The cutoff data of the sequences overlapping the current contig end is
 * used to build a consensus for up to ESZ bases beyond that end. The
 * furthest position with the best accumulated agreement is chosen, and the
 * clip point of each contributing sequence is then moved out as far as
 * that sequence agrees with the new consensus.
 *
 * io              Gap5 database handle.
 * crec            Record number of the contig to extend.
 * dir             Only used in the progress message; extending the left
 *                 end is not implemented here.
 * min_depth       Minimum depth for extension. If lower then even if the
 *                 data matches we'll not extend further.
 * match_score     (+ve) added for each base agreeing with the extension.
 * mismatch_score  (-ve) added for each base disagreeing with it. Together
 *                 these ensure that we don't extend into junk mismatching
 *                 DNA.
 *
 * Sequences whose cutoff data does not reach the contig end within CSZ
 * bases, or which disagrees with the existing consensus by more than 5%,
 * are ignored.
 *
 * Returns 0 on success (including when no extension was possible),
 *        -1 on failure.
 */
static int contig_extend_single(GapIO *io, tg_rec crec, int dir, int min_depth,
                                int match_score, int mismatch_score) {
    int end;
    rangec_t *r;
    int nr, i;
    contig_t *c;
    char cons[CSZ], new_cons[ESZ];
    int freqs[ESZ][4], depth[ESZ];
    double score, best_score;
    int best_pos, nseq;

    vmessage("Processing contig #%"PRIrec", %s end\n",
             crec, dir ? "left" : "right");

    for (i = 0; i < ESZ; i++) {
        freqs[i][0] = freqs[i][1] = freqs[i][2] = freqs[i][3] = 0;
        depth[i] = 0;
    }

    c = cache_search(io, GT_Contig, crec);
    if (NULL == c) return -1;
    cache_incr(io, c);

    if (consensus_valid_range(io, crec, NULL, &end) != 0) {
        cache_decr(io, c);
        return -1;
    }

    /* Consensus for the last CSZ bases of the contig: cons[CSZ-1] is 'end' */
    calculate_consensus_simple(io, crec, end-(CSZ-1), end, cons, NULL);

    /* Start */
    /* Not implemented for now: rev complement and go again! */

    /* End */
    r = contig_seqs_in_range(io, &c, end, end, 0, &nr);
    if (!r) {
        cache_decr(io, c);
        return -1;
    }

    /* Pass 1: gather base frequencies for the columns beyond 'end' */
    for (i = 0; i < nr; i++) {
        seq_t *s = cache_search(io, GT_Seq, r[i].rec);
        seq_t *sorig = s;
        int cend;
        int j, k, slen;

        if (!s) {
            free(r);
            cache_decr(io, c);
            return -1;
        }

        /* Work in contig orientation; complement a private copy if needed */
        if ((s->len < 0) ^ r[i].comp) {
            s = dup_seq(sorig);
            if (!s) {
                free(r);
                cache_decr(io, c);
                return -1;
            }
            complement_seq_t(s);
        }

        /* Contig position of the last unclipped base */
        cend = r[i].start + s->right-1;

        /* Does cutoff extend to contig end, if so does it match cons? */
        if (cend < end) {
            int mis = 0, len = 0;
            if (end - cend >= CSZ) {
                /*
                fprintf(stderr,"Skipping #%"PRIrec" due to length of cutoff\n",
                        r[i].rec);
                */
                if (sorig != s)
                    free(s);
                r[i].rec = 0; /* Mark for removal */
                continue;
            }

            for (k = s->right, j = cend+1; j <= end; j++, k++) {
                //printf("%d: %c %c\n", j, s->seq[k], cons[j-(end-(CSZ-1))]);
                if (s->seq[k] != cons[j-(end-(CSZ-1))])
                    mis++;
            }
            len = end - cend;
            if (100*mis/len > 5) {
                /*
                fprintf(stderr, "Skipping #%"PRIrec" due to high disagreement "
                        "with consensus.\n", r[i].rec);
                */
                if (sorig != s)
                    free(s);
                r[i].rec = 0;
                continue;
            }
        }

        /* So we got here, let's accumulate extension stats */
        slen = ABS(s->len);
        for (k = 0, j = end+1 - r[i].start; j < slen && k < ESZ; j++, k++) {
            //printf("%d: %c\n", j + r[i].start, s->seq[j]);
            if(s->seq[j] == 'N')
                continue;

            freqs[k][dna_lookup[(uint8_t) s->seq[j]]]++;
            depth[k]++;
        }

        if (sorig != s)
            free(s);
    }

    /* Pick the extension length with the best running agreement score */
    score = best_score = 0;
    best_pos = 0;

    for (i = 0; i < ESZ; i++) {
        int call = 0, best = 0, j;
        int others;
        double dd;

        /*
         * Also stop on zero depth, whatever min_depth is: there is nothing
         * to call and the score below would divide by zero.
         */
        if (depth[i] < min_depth || depth[i] < 1)
            break;

        for (j = 0; j < 4; j++) {
            if (best < freqs[i][j]) {
                best = freqs[i][j];
                call = j;
            }
        }
        new_cons[i] = "ACGT"[call];

        /* +fraction agreeing with the call, -fraction disagreeing */
        dd = (double)depth[i];
        others = freqs[i][0] + freqs[i][1] + freqs[i][2] + freqs[i][3]
            - freqs[i][call];
        score += freqs[i][call] / dd;
        score -= others / dd;

        if (best_score <= score) {
            best_score = score;
            best_pos = i+1;
        }
        /*
        printf("%3d %3d\t%c\t%3d %3d %3d %3d %7.1f\n",
               i, depth[i], "ACGT"[call],
               freqs[i][0], freqs[i][1], freqs[i][2], freqs[i][3],
               score);
        */
    }
    /* printf("Best score is %f at %d\n", best_score, best_pos); */

    /* Extend */
    nseq = 0;
    if (best_pos > 0) {
        int furthest_left = end;

        /* Pass 2: move each sequence's clip point out as far as it agrees */
        for (i = 0; i < nr; i++) {
            seq_t *s;
            int r_pos = 0;
            int have_pos = 0; /* set once r_pos holds a real clip point */
            int rscore = 0, best_r = 0;
            int j, k;

            if (r[i].rec == 0)
                continue;

            s = cache_search(io, GT_Seq, r[i].rec);
            if (!s || !(s = cache_rw(io, s)))
                continue; /* cannot edit this read; leave it as it is */

            if (furthest_left > r[i].start)
                furthest_left = r[i].start;

            /*
             * end + best_pos is the furthest right we can go, but this
             * specific read may not be justified in reaching that far
             * if it has too many disagreements.
             *
             * If no extension base of the read agrees at all then its
             * clip point is left alone.
             */
            if ((s->len > 0) ^ r[i].comp) {
                int len = ABS(s->len);

                //printf(">%s\t", s->name);

                //for (k = s->right, j = 0; j < best_pos && k < len; j++, k++) {
                for (k = end - r[i].start + 1, j = 0; j < best_pos && k < len; j++, k++) {
                    if (new_cons[j] == toupper((unsigned char) s->seq[k])) {
                        rscore += match_score;
                        if (best_r <= rscore) {
                            best_r  = rscore;
                            r_pos = k+1;
                            have_pos = 1;
                        }
                    } else {
                        rscore += mismatch_score;
                    }

                    //putchar(new_cons[j] == toupper(s->seq[k])
                    //	    ? toupper(s->seq[k])
                    //        : tolower(s->seq[k]));
                }
                //putchar('\n');

                if (have_pos && s->right != r_pos) {
                    s->right  = r_pos;
                    nseq++;
                }
            } else {
                //printf("<%s\t", s->name);

                /*
                 * Stored data is in the opposite orientation to the
                 * contig, so walk it backwards, complementing each base.
                 */
                //for (k = s->left-2, j = 0; j < best_pos && k >= 0; j++, k--) {
                for (k = r[i].end - end - 1, j = 0; j < best_pos && k >= 0; j++, k--) {
                    char b = complement_base(s->seq[k]);
                    /* Fold case, as for the uncomplemented reads above */
                    if (new_cons[j] == toupper((unsigned char) b)) {
                        rscore += match_score;
                        if (best_r <= rscore) {
                            best_r  = rscore;
                            r_pos = k-1;
                            have_pos = 1;
                        }
                    } else {
                        rscore += mismatch_score;
                    }

                    //putchar(new_cons[j] == toupper(b)
                    //	    ? toupper(b)
                    //	    : tolower(b));
                }
                //putchar('\n');

                if (have_pos && s->left != r_pos+2) {
                    s->left  = r_pos+2;
                    nseq++;
                }
            }
        }

        vmessage("    Extended by %d, adjusting %d sequence clip%s\n",
                 best_pos, nseq, nseq == 1 ? "" : "s");

        bin_invalidate_consensus(io, crec, furthest_left, end + best_pos);
    } else {
        vmessage("    Unable to extend contig\n");
    }
    free(r);

    cache_decr(io, c);
    cache_flush(io);
    return 0;
}