Exemple #1
0
/* Debug functions that don't use curses - handy for valgrind testing */
/*
 * Dumps every sequence overlapping the 80 columns starting at xpos to
 * stdout, once as fetched by get_seq() and once more after duplicating and
 * complementing it.
 *
 * io    Database handle; closed before exiting.
 * c     Contig to query.
 * xpos  First column of the queried range (xpos .. xpos+79).
 *
 * Never returns: closes io, runs a "ps" listing filtered on g_iotest and
 * then calls exit(0).
 */
static void test_mode(GapIO *io, contig_t **c, int xpos) {
    rangec_t *r;
    int nr, i;

    r = contig_seqs_in_range(io, c, xpos, xpos+79, CSIR_SORT_BY_X, &nr);
    for (i = 0; r && i < nr; i++) {
        seq_t *s = get_seq(io, r[i].rec);
        seq_t *comp;

        if (!s)
            continue;

        printf("%.*s: range %d..%d seq %d+%d st=%d en=%d %.*s\n",
               s->name_len, s->name,
               r[i].start, r[i].end,
               s->pos, s->len,
               s->left, s->right,
               ABS(s->len), s->seq);

        /*
         * Work on a private copy so the original record is not modified.
         * The copy is released below; leaking it on every iteration would
         * drown out real findings in a valgrind run.
         */
        comp = dup_seq(s);
        if (!comp)
            continue;
        complement_seq_t(comp);

        printf("%.*s: range %d..%d seq %d+%d st=%d en=%d %.*s\n",
               comp->name_len, comp->name,
               r[i].start, r[i].end,
               comp->pos, comp->len,
               comp->left, comp->right,
               ABS(comp->len), comp->seq);

        free(comp);
    }
    free(r);

    gio_close(io);
    system("ps lx | grep g_iotest | grep -v grep");
    exit(0);
}
Exemple #2
0
/*
 * Timing harness: computes the consensus for 1000 pseudo-random windows of
 * the contig and prints each one to stdout, with a progress dot per window
 * on stderr.
 *
 * The generator is seeded with 0, so the same sequence of windows is used
 * on every run. Closes io and exits the process when done; never returns.
 */
static void benchmark(GapIO *io, contig_t **c) {
    enum { N_WINDOWS = 1000, MAX_START = 2000000, MAX_SPAN = 1000 };
    char consensus[10000];
    int iter = 0;

    srandom(0);
    fprintf(stderr, "=== Benchmarking ===\n");

    while (iter < N_WINDOWS) {
        /* Start is drawn before span; the order fixes the window sequence */
        int start = random() % MAX_START;
        int span  = random() % MAX_SPAN;
        int n_hits;
        rangec_t *hits;

        hits = contig_seqs_in_range(io, c, start, start + span, 0, &n_hits);
        calc_cons(io, hits, n_hits, start, span, consensus);
        printf("%.*s\n", span, consensus);

        fputc('.', stderr);
        fflush(stderr);

        free(hits);
        iter++;
    }

    gio_close(io);
    exit(0);
}
Exemple #3
0
/*
 * Draws one screenful of a contig: a consensus line, a position ruler and
 * then one line per sequence overlapping the visible window.
 *
 * io           Database handle.
 * c            Contig being displayed.
 * xpos         Contig position to centre the window on.
 * ypos         Number of leading sequences (in query order) to skip.
 * nlines       Total number of output lines available; sequences start on
 *              line 2.
 * wid          Total display width, including the MAX_NAME_LEN+2 columns
 *              used for the name.
 * mode         Bit flags: DISPLAY_COLOURS, DISPLAY_DIFFS, DISPLAY_CUTOFFS
 *              and DISPLAY_QUAL.
 * qual_cutoff  Confidence at or above which a base is drawn bold when
 *              DISPLAY_QUAL is set.
 * in_curses    Non-zero to draw with curses, zero to print to stdout.
 */
static void display_gap(GapIO *io, contig_t **c, int xpos, int ypos,
                        int nlines, int wid, int mode, int qual_cutoff,
                        int in_curses) {
    rangec_t *r;
    int i, nr, lno, y;
    char line[1024] = {0}, *lp;
    char cons[1024];
    int attr;
    static int lookup_1conf[256];
    static int lookup_4conf[256];
    static int lookup_init = 0;

    /*
     * Base -> index of that base's value within a per-base confidence
     * record. Single-confidence sequences always use index 0.
     */
    if (!lookup_init) {
        for (i = 0; i < 256; i++)
            lookup_1conf[i] = lookup_4conf[i] = 0;

        lookup_4conf['a'] = lookup_4conf['A'] = 0;
        lookup_4conf['c'] = lookup_4conf['C'] = 1;
        lookup_4conf['g'] = lookup_4conf['G'] = 2;
        lookup_4conf['t'] = lookup_4conf['T'] = 3;

        /* Build the tables once only */
        lookup_init = 1;
    }

    wid -= MAX_NAME_LEN+2;

    /*
     * line[] and cons[] are fixed size. The ruler needs up to roughly
     * wid+30 bytes, so cap the window rather than overrun the stack on a
     * very wide terminal.
     */
    if (wid > (int)sizeof(line) - 40)
        wid = (int)sizeof(line) - 40;

    //if (xpos < wid/2 + (*c)->start)
    //	xpos = wid/2 + (*c)->start;

    /* From here on xpos is the left-most visible column */
    xpos -= wid/2;

    /* Query visible objects */
    r = contig_seqs_in_range(io, c, xpos, xpos+wid-1, CSIR_SORT_BY_X, &nr);

    /* Consensus */
    calc_cons(io, r, nr, xpos, wid, cons);
    if (in_curses) {
        clear();
        mvaddnstr(0, 1, contig_get_name(c), strlen(contig_get_name(c)));
        mvaddnstr(0, MAX_NAME_LEN+2, cons, wid);
    } else {
        printf(" %-*s %.*s\n", MAX_NAME_LEN, contig_get_name(c), wid, cons);
    }

    /*
     * Position: one right-aligned 10 character label per multiple of 10
     * covered by the (slightly over-long) range, packed back to back.
     */
    for (lp = line, i = xpos; i < xpos+wid+19; i++) {
        if (i % 10 == 0) {
            snprintf(lp, sizeof(line) - (size_t)(lp - line), "%10d", i-10);
            lp += 10;
        }
    }
    if (in_curses) {
        /* C's % may be negative; normalise into 0..9 */
        int m = (xpos-1)%10;
        if (m < 0) m += 10;
        mvaddnstr(1, MAX_NAME_LEN+2, line+10+m, wid);
    } else {
        /*
         * NOTE(review): this branch uses offset 9 and an un-normalised
         * remainder whereas the curses branch uses 10 and normalises.
         * The offset stays within line[] (0..18) but the two rulers may
         * not line up identically - confirm which is intended.
         */
        printf("%*s%.*s\n", MAX_NAME_LEN+2, "", wid,
               line+9+((xpos-1)%10));
    }


    /* Sequences: skip the first ypos entries, then draw until out of lines */
    for (i = y = 0; i < nr && y < ypos; i++, y++);
    for (lno = 2; i < nr && lno < nlines; i++, lno++) {
        seq_t *s = get_seq(io, r[i].rec);
        seq_t *sorig = s;
        int sp = r[i].start;
        int l = s->len > 0 ? s->len : -s->len;
        unsigned char seq_a[MAX_SEQ_LEN], *seq = seq_a;
        int j, dir = '+';
        int left, right;
        char *conf;
        int nc = s->format == SEQ_FORMAT_CNF4 ? 4 : 1;
        int *L = s->format == SEQ_FORMAT_CNF4 ? lookup_4conf : lookup_1conf;

        /* Complement data on-the-fly */
        if ((s->len < 0) ^ r[i].comp) {
            dir = '-';
            s = dup_seq(s);
            complement_seq_t(s);
        }

        left = s->left;
        right = s->right;

        /*
         * Bases are edited in place below, so work on a local copy.
         * seq_a is a fixed MAX_SEQ_LEN buffer: never copy more than fits.
         */
        if (l > MAX_SEQ_LEN)
            l = MAX_SEQ_LEN;
        memcpy(seq, s->seq, l);
        conf = s->conf;

        /* Clip the part of the sequence left of the window... */
        if (sp < xpos) {
            seq   += xpos - sp;
            conf  += nc * (xpos - sp);
            l     -= xpos - sp;
            left  -= xpos - sp;
            right -= xpos - sp;
            sp = xpos;
        }
        /* ...and the part to the right of it */
        if (l > wid - (sp-xpos))
            l = wid - (sp-xpos);

        if (in_curses) {
            /* Test of sequence_get_position */
            /*
              int c, p;
              sequence_get_position(io, r[i].rec, &c, &p);
              s->name_len = sprintf(s->name, ":%d-%d:", p, p+ABS(s->len)-1);
            */
            mvaddch(lno, 0, dir);
            addnstr(s->name, MIN(MAX_NAME_LEN, s->name_len));
            move(lno, MAX_NAME_LEN+2+sp-xpos);
        } else {
            printf("%c%.*s%*s",
                   dir,
                   MIN(MAX_NAME_LEN, s->name_len), s->name,
                   MAX_NAME_LEN+1-MIN(MAX_NAME_LEN, s->name_len) +sp-xpos, "");
        }

        for (j = 0; j < l; j++) {
            /* Colour is chosen from the real base, before any rewriting */
            attr = (mode & DISPLAY_COLOURS) ? COLOR_PAIR(lookup[seq[j]]) : 0;

            /* Bases agreeing with the consensus are shown as '.' */
            if (mode & DISPLAY_DIFFS
                    && sp-xpos+j < wid && seq[j] == cons[sp-xpos+j])
                seq[j] = '.';
            /* Outside left..right: lower case if shown, otherwise blank */
            if (j < left-1 || j > right-1)
                seq[j] = (mode & DISPLAY_CUTOFFS) ? tolower(seq[j]) : ' ';

            if (conf[j*nc+L[seq[j]]] >= qual_cutoff && mode & DISPLAY_QUAL) {
                attr |= A_BOLD;
            }

            if (in_curses) {
                addch(seq[j] | attr);
            } else {
                putchar(seq[j]);
            }
        }

        if (!in_curses)
            putchar('\n');

        /* Only the complemented duplicate belongs to us */
        if (s != sorig)
            free(s);
    }

    /* Useful debugging code to show bin locations. */
#if 0
    free(r);
    r = contig_bins_in_range(io, c, xpos, xpos+wid-1, &nr);
    /* Bins */
    for (i=0; i < nr && lno < nlines; i++, lno++) {
        bin_index_t *bin = (bin_index_t *)cache_search(io, GT_Bin, r[i].rec);
        unsigned char *seq, *seqm;
        int j, dir = "+-"[r[i].comp];
        int sp = r[i].start;
        int l = ABS(r[i].end - r[i].start + 1);
        char name[100];

        sprintf(name, "bin-%d", bin->rec);
        seqm = seq = malloc(l+1);
        memset(seq, '-', l);

        if (!(bin->start_used == 0 && bin->end_used == 0)) {
            if (r[i].comp) {
                memset(&seq[bin->size - bin->end_used - 1], '=',
                       bin->end_used - bin->start_used + 1);
            } else {
                memset(&seq[bin->start_used], '=',
                       bin->end_used - bin->start_used + 1);
            }
        }

        /*
        fprintf(stderr, "Bin-%d: %d+%d %d..%d\n",
        	bin->rec,
        	bin->pos, bin->size,
        	bin->start_used, bin->end_used);
        */

        if (sp < xpos) {
            seq   += xpos - sp;
            l     -= xpos - sp;
            sp = xpos;
        }
        if (l > wid - (sp-xpos))
            l = wid - (sp-xpos);

        if (in_curses) {
            mvaddch(lno, 0, dir);
            addnstr(name, strlen(name));
            move(lno, MAX_NAME_LEN+2+sp-xpos);
        } else {
            printf("%c%.*s%*s",
                   dir,
                   (int)MIN(MAX_NAME_LEN, strlen(name)),
                   name,
                   (int)(MAX_NAME_LEN+1-MIN(MAX_NAME_LEN,
                                            strlen(name)) +sp-xpos),
                   "");
        }

        for (j = 0; j < l; j++) {
            if (in_curses) {
                addch(seq[j]);
            } else {
                putchar(seq[j]);
            }
        }

        if (!in_curses)
            putchar('\n');

        free(seqm);
    }
#endif

    if (in_curses)
        refresh();

    free(r);
}
/*
 * Extends the right hand end of a single contig.
 *
 * The reads overlapping the last valid consensus base are examined. Reads
 * whose clipped-off data before that point disagrees with the consensus
 * are dropped; the remaining reads vote, column by column, on up to ESZ
 * bases beyond the contig end. The extension stops at the best scoring
 * column and each contributing read has its clip point moved accordingly.
 *
 * crec is the contig record to extend.
 *
 * Dir is currently only used to label the progress message ("left" when
 * non-zero, "right" otherwise); only the right end is processed.
 *
 * Min_depth is the minimum depth for extension. If lower then even if the
 * data matches we'll not extend further.
 *
 * Match_score (+ve) and mismatch_score (-ve) are accumulated during
 * extension to ensure that we don't extend into junk mismatching DNA.
 *
 * Returns 0 on success (including when no extension was possible);
 *        -1 if the contig, its valid range or its reads could not be
 *           obtained.
 */
static int contig_extend_single(GapIO *io, tg_rec crec, int dir, int min_depth,
				int match_score, int mismatch_score) {
    int end;
    rangec_t *r;
    int nr, i;
    contig_t *c;
    char cons[CSZ], new_cons[ESZ];
    int freqs[ESZ][4], depth[ESZ];
    double score, best_score;
    int best_pos, nseq;

    vmessage("Processing contig #%"PRIrec", %s end\n",
	     crec, dir ? "left" : "right");

    for (i = 0; i < ESZ; i++) {
	freqs[i][0] = freqs[i][1] = freqs[i][2] = freqs[i][3] = 0;
	depth[i] = 0;
    }

    c = cache_search(io, GT_Contig, crec);
    if (NULL == c) return -1;
    cache_incr(io, c);

    if (consensus_valid_range(io, crec, NULL, &end) != 0) {
	cache_decr(io, c);
	return -1;
    }

    /* cons[] holds the last CSZ consensus bases; cons[CSZ-1] is at 'end' */
    calculate_consensus_simple(io, crec, end-(CSZ-1), end, cons, NULL);

    /* Start */
    /* Not implemented for now: rev complement and go again! */

    /* End */
    r = contig_seqs_in_range(io, &c, end, end, 0, &nr);
    if (!r) {
	cache_decr(io, c);
	return -1;
    }

    /*
     * Pass 1: decide which reads may take part and accumulate base counts
     * for the columns past the contig end. Rejected reads are flagged by
     * zeroing r[i].rec.
     */
    for (i = 0; i < nr; i++) {
	seq_t *s = cache_search(io, GT_Seq, r[i].rec);
	seq_t *sorig = s;
	int cend;
	int j, k, slen;

	if (NULL == s) {
	    r[i].rec = 0; /* Cannot be loaded, so cannot take part */
	    continue;
	}

	/* Work in contig orientation, on a private complemented copy */
	if ((s->len < 0) ^ r[i].comp) {
	    s = dup_seq(s);
	    if (NULL == s) {
		r[i].rec = 0;
		continue;
	    }
	    complement_seq_t(s);
	}

	/* Contig position of this read's right clip point */
	cend = r[i].start + s->right-1;

	/* Does cutoff extend to contig end, if so does it match cons? */
	if (cend < end) {
	    int mis = 0, len;

	    /* Clipped region is longer than the consensus we fetched */
	    if (end - cend >= CSZ) {
		if (sorig != s)
		    free(s);
		r[i].rec = 0; /* Mark for removal */
		continue;
	    }

	    for (k = s->right, j = cend+1; j <= end; j++, k++) {
		if (s->seq[k] != cons[j-(end-(CSZ-1))])
		    mis++;
	    }

	    /* More than 5% disagreement with the consensus: reject */
	    len = end - cend;
	    if (100*mis/len > 5) {
		if (sorig != s)
		    free(s);
		r[i].rec = 0;
		continue;
	    }
	}

	/* So we got here, let's accumulate extension stats */
	slen = ABS(s->len);
	for (k = 0, j = end+1 - r[i].start; j < slen && k < ESZ; j++, k++) {
	    int base;

	    if(s->seq[j] == 'N')
		continue;

	    /*
	     * freqs[][] only has room for four base types. Ignore anything
	     * dna_lookup does not map to 0..3 rather than index out of
	     * bounds (presumably pads and ambiguity codes - TODO confirm).
	     */
	    base = dna_lookup[(uint8_t) s->seq[j]];
	    if (base < 0 || base > 3)
		continue;

	    freqs[k][base]++;
	    depth[k]++;
	}

	if (sorig != s)
	    free(s);
    }

    /*
     * Pass 2: call a base for each extension column and keep a running
     * score of agreement minus disagreement. best_pos is the number of
     * columns up to and including the best scoring one.
     */
    score = best_score = 0;
    best_pos = 0;

    for (i = 0; i < ESZ; i++) {
	int call = 0, best = 0, j;
	double dd;

	if (depth[i] < min_depth)
	    break;

	/*
	 * Nothing counted here (only reachable when min_depth <= 0).
	 * There is no base to call and the score would become 0/0, after
	 * which no later column could improve on best_score, so stop.
	 */
	if (depth[i] <= 0)
	    break;

	for (j = 0; j < 4; j++) {
	    if (best < freqs[i][j]) {
		best = freqs[i][j];
		call = j;
	    }
	}
	new_cons[i] = "ACGT"[call];

	/* depth[i] is the sum of freqs[i][0..3] */
	dd = (double)depth[i];
	score += freqs[i][call] / dd;
	score -= (depth[i] - freqs[i][call]) / dd;

	if (best_score <= score) {
	    best_score = score;
	    best_pos = i+1;
	}
    }

    /* Extend */
    nseq = 0;
    if (best_pos > 0) {
	int furthest_left = end;

	for (i = 0; i < nr; i++) {
	    seq_t *s;
	    int r_pos;
	    int read_score;

	    if (r[i].rec == 0)
		continue;

	    /* Reads that cannot be loaded or made writable are left as is */
	    s = cache_search(io, GT_Seq, r[i].rec);
	    if (NULL == s)
		continue;
	    s = cache_rw(io, s);
	    if (NULL == s)
		continue;

	    if (furthest_left > r[i].start)
		furthest_left = r[i].start;

	    /*
	     * end + best_pos is the furthest right we can go, but this
	     * specific read may not be justified in reaching that far
	     * if it has too many disagreements.
	     *
	     * NOTE(review): r_pos starts at 0 in both branches, so a read
	     * with no base matching the new consensus ends up with right
	     * set to 0 (or left set to 2) - confirm this is intended.
	     */
	    if ((s->len > 0) ^ r[i].comp) {
		/* Stored in contig orientation: move the right clip */
		int best_r = 0, j, k;
		int len = ABS(s->len);

		r_pos = 0;
		read_score = 0;
		for (k = end - r[i].start + 1, j = 0; j < best_pos && k < len; j++, k++) {
		    if (new_cons[j] == toupper((unsigned char) s->seq[k])) {
			read_score += match_score;
			if (best_r <= read_score) {
			    best_r  = read_score;
			    r_pos = k+1;
			}
		    } else {
			read_score += mismatch_score;
		    }
		}

		if (s->right != r_pos) {
		    s->right  = r_pos;
		    nseq++;
		}
	    } else {
		/*
		 * Stored complemented relative to the contig: walk the
		 * read backwards and move the left clip instead.
		 */
		int best_r = 0, j, k;

		r_pos = 0;
		read_score = 0;
		for (k = r[i].end - end - 1, j = 0; j < best_pos && k >= 0; j++, k--) {
		    /* Upper-cased to match the forward strand comparison */
		    int b = toupper((unsigned char) complement_base(s->seq[k]));
		    if (new_cons[j] == b) {
			read_score += match_score;
			if (best_r <= read_score) {
			    best_r  = read_score;
			    r_pos = k-1;
			}
		    } else {
			read_score += mismatch_score;
		    }
		}

		if (s->left != r_pos+2) {
		    s->left  = r_pos+2;
		    nseq++;
		}
	    }
	}

	vmessage("    Extended by %d, adjusting %d sequence clip%s\n",
		 best_pos, nseq, nseq == 1 ? "" : "s");

	bin_invalidate_consensus(io, crec, furthest_left, end + best_pos);
    } else {
	vmessage("    Unable to extend contig\n");
    }
    free(r);

    cache_decr(io, c);
    cache_flush(io);
    return 0;
}