Example #1
0
/*
 * Moves the "current contig" of io forward by one, wrapping round to
 * contig number 0 once io->contig_num reaches io->db->Ncontigs.
 *
 * The reference held on *cp is released first; *cp is then refilled by
 * gio_read_contig() for the new contig number and its reference count is
 * incremented.
 */
void next_contig(GapIO *io, contig_t **cp) {
    /* Release the contig we are stepping away from */
    cache_decr(io, *cp);

    /* Advance, returning to the first contig after the last one */
    io->contig_num++;
    if (io->contig_num == io->db->Ncontigs)
        io->contig_num = 0;

    /* Fetch the new current contig and take a reference on it */
    gio_read_contig(io, io->contig_num, cp);
    cache_incr(io, *cp);
}
Example #2
0
/*
 * Exports Scaffold information to an AGP file
 *
 * For every scaffold in io->scaffold one "W" (component) line is written
 * per member contig. Between consecutive members an "N" (gap) line is
 * written using the member's gap_size. Coordinates on each scaffold start
 * at 1 and the part number counts up from 1.
 *
 * io	Database handle
 * fn	Name of the file to create (opened with mode "w+")
 *
 * Returns 0 on success
 *        -1 on failure
 */
int scaffold_to_agp(GapIO *io, char *fn) {
    FILE *fp;
    int i, j;

    if (NULL == (fp = fopen(fn, "w+"))) {
	verror(ERR_WARN, "scaffold_to_agp", "%s: %s", fn, strerror(errno));
	return -1;
    }

    for (i = 0; io->scaffold && i < ArrayMax(io->scaffold); i++) {
	scaffold_t *f = cache_search(io, GT_Scaffold,
				     arr(tg_rec, io->scaffold, i));
	int start = 1;	/* Next free coordinate on this scaffold */
	int k = 1;	/* Part number of the next line written */

	if (!f) {
	    verror(ERR_WARN, "scaffold_to_agp", "Failed to load scaffold\n");
	    fclose(fp);
	    return -1;
	}

	/* Keep the scaffold in the cache while its members are loaded */
	cache_incr(io, f);

	for (j = 0; f->contig && j < ArrayMax(f->contig); j++) {
	    scaffold_member_t *m = arrp(scaffold_member_t, f->contig, j);
	    contig_t *c = cache_search(io, GT_Contig, m->rec);
	    int ustart, uend;
	    int len;

	    if (!c) {
		/* c->name is needed below, so a missing contig is fatal */
		verror(ERR_WARN, "scaffold_to_agp",
		       "Failed to load contig\n");
		cache_decr(io, f);
		fclose(fp);
		return -1;
	    }

	    /* Get the unpadded clipped contig length */
	    consensus_valid_range(io, m->rec, &ustart, &uend);
	    consensus_unpadded_pos(io, m->rec, uend, &uend);
	    len = uend - ustart + 1;

	    /* Every member except the first is preceded by a gap line */
	    if (j) {
		int gap = m->gap_size;
		fprintf(fp, "%s\t%d\t%d\t%d\tN\t%d\tfragment\tyes\n",
			f->name, start, start+gap-1, k++, gap);
		start += gap;
	    }
	    fprintf(fp, "%s\t%d\t%d\t%d\tW\t%s\t%d\t%d\t+\n",
		    f->name, start, start + len-1,
		    k++, c->name, ustart, uend);
	    start += len;
	}

	cache_decr(io, f);
    }

    /* fclose flushes buffered output, so its failure is a write failure */
    if (0 != fclose(fp)) {
	verror(ERR_WARN, "scaffold_to_agp", "%s: %s", fn, strerror(errno));
	return -1;
    }

    return 0;
}
Example #3
0
/*
 * Moves the "current contig" of io back by one, wrapping round to the
 * last contig (io->db->Ncontigs - 1) when already at contig number 0.
 *
 * The reference held on *cp is released first; *cp is then refilled by
 * gio_read_contig() for the new contig number and its reference count is
 * incremented.
 */
void prev_contig(GapIO *io, contig_t **cp) {
    /* Release the contig we are stepping away from */
    cache_decr(io, *cp);

    /* Step back, jumping to the final contig when at the first */
    if (io->contig_num == 0)
        io->contig_num = io->db->Ncontigs - 1;
    else
        io->contig_num--;

    /* Fetch the new current contig and take a reference on it */
    gio_read_contig(io, io->contig_num, cp);
    cache_incr(io, *cp);
}
Example #4
0
/*
 * Adjusts the annotations attached to sequence srec after a base has been
 * removed from it.
 *
 * Annotation ranges of contig crec between start+pos and end are visited.
 * Each one whose pair_rec is srec is taken out of its bin, has its end
 * reduced by one (and its start too if it began beyond start+pos), and is
 * re-added to bin brec. An annotation left with end < start is deallocated
 * instead of being re-added.
 *
 * io	 Database handle
 * crec	 Contig record to scan
 * srec	 Sequence record whose annotations are shifted
 * start Offset added to pos to give the first position scanned
 * end	 Last position scanned
 * pos	 Position of the removed base relative to start
 * brec	 Bin record that adjusted annotations are added back to
 *
 * Records that cannot be loaded are skipped rather than dereferenced.
 */
static void tag_shift_for_delete(GapIO *io, tg_rec crec, tg_rec srec,
				 int start, int end, int pos, tg_rec brec) {
    contig_iterator *ci;
    rangec_t *r;
    contig_t *c = cache_search(io, GT_Contig, crec);

    //printf("< tag in seq %"PRIrec" at %d\n", srec, pos);

    if (!c)
	return;

    cache_incr(io, c);

    ci = contig_iter_new_by_type(io, crec, 0, CITER_FIRST | CITER_ISTART,
				 start+pos, end, GRANGE_FLAG_ISANNO);
    if (!ci) {
	cache_decr(io, c);
	return;
    }

    while ((r = contig_iter_next(io, ci))) {
	range_t r2, *r_out;
	anno_ele_t *a;
	bin_index_t *bin;

	/* Only annotations belonging to this sequence are affected */
	if (r->pair_rec != srec)
	    continue;

	bin_remove_item(io, &c, GT_AnnoEle, r->rec);
	r2.start    = (r->start > start+pos) ? r->start-1 : r->start;
	r2.end      = r->end-1;
	r2.mqual    = r->mqual;
	r2.rec      = r->rec;
	r2.pair_rec = r->pair_rec;
	r2.flags    = r->flags;

	if (r2.end < r2.start) {
	    /* Tag entirely removed now, it must have been on a pad */
	    a = cache_search(io, GT_AnnoEle, r->rec);
	    if (a && (a = cache_rw(io, a)))
		cache_deallocate(io, a);
	    continue;
	}

	bin = bin_add_to_range(io, &c, brec, &r2, &r_out, NULL, 0);
	if (!bin)
	    continue;

	a = cache_search(io, GT_AnnoEle, r->rec);
	if (!a)
	    continue;

	if (a->bin != bin->rec /*||
	    a->idx != r_out - ArrayBase(range_t, bin->rng)*/) {
	    /* Annotation moved bins */
	    a = cache_rw(io, a);
	    if (a)
		a->bin = bin->rec;
	    //a->bin_idx = r_out - ArrayBase(range_t, bin->rng);
	}
    }

    cache_decr(io, c);
    contig_iter_del(ci);
}
Example #5
0
/*
 * Creates an anno_ele as per anno_ele_new, but also adds it to an object
 * and creates the bin Range entry too.
 *
 * io	    Database handle
 * obj_type GT_Contig, or another type (e.g. GT_Seq) whose contig position
 *	    is looked up with sequence_get_position2()
 * obj_rec  Record number of the annotated object
 * anno_rec Not used by this function
 * type	    Annotation type; also stored in the range's mqual field
 * comment  Comment passed through to anno_ele_new()
 * start    Start of the annotation. For non-contig objects this is
 * end	    relative to the object and is offset by the object's start
 *	    position in the contig; likewise for end.
 * dir	    Direction passed through to anno_ele_new()
 *
 * Returns the record number of the new annotation element.
 *         -1 if the object position, the contig or the new element could
 *         not be loaded.
 *
 * If the range cannot be added to a bin an ERR_FATAL message is logged,
 * the element's bin is set to 0 and its record number is still returned.
 */
tg_rec anno_ele_add(GapIO *io, int obj_type, tg_rec obj_rec, tg_rec anno_rec,
		    int type, char *comment, int start, int end, char dir) {
    range_t r;
    anno_ele_t *e;
    contig_t *c;
    tg_rec crec;
    bin_index_t *bin;
    tg_rec seq_bin = 0;

    /* Find contig for obj_rec/obj_type */
    if (obj_type == GT_Contig) {
	crec = obj_rec;
    } else {
	int st, en;

	/*
	 * Without this check crec and st would be used uninitialised.
	 * NOTE(review): assumes -1 is the failure return; confirm.
	 */
	if (-1 == sequence_get_position2(io, obj_rec, &crec, &st, &en, NULL,
					 &seq_bin, NULL, NULL))
	    return -1;

	start += st;
	end += st;
    }

    c = (contig_t *)cache_search(io, GT_Contig, crec);
    if (!c)
	return -1;
    cache_incr(io, c);

    r.start    = start;
    r.end      = end;
    r.flags    = GRANGE_FLAG_ISANNO;
    r.mqual    = type;
    r.pair_rec = obj_rec;

    if (GT_Seq == obj_type)
	r.flags |= GRANGE_FLAG_TAG_SEQ;

    r.rec = anno_ele_new(io, 0, obj_type, obj_rec, 0, type, dir, comment);
    e = (anno_ele_t *)cache_search(io, GT_AnnoEle, r.rec);
    if (!e || !(e = cache_rw(io, e))) {
	cache_decr(io, c);
	return -1;
    }

    /* Use the sequence's own bin when known, else pick one by position */
    if (seq_bin)
	bin = bin_add_to_range(io, &c, seq_bin, &r, NULL, NULL, 0);
    else
	bin = bin_add_range(io, &c, &r, NULL, NULL, 0);

    if (!bin)
	verror(ERR_FATAL, "anno_ele_add", "%s returned NULL",
	       seq_bin ? "bin_add_to_range" : "bin_add_range");

    e->bin = bin ? bin->rec : 0;

    cache_decr(io, c);
    return r.rec;
}
Example #6
0
/*
 * Post-processing for break contig / disassemble readings: prunes bins
 * that have become redundant in one contig.
 *
 * Intended behaviour (part of which is still to be written):
 * 1) A contig with no data at all is destroyed.
 * 2) A bin that is empty, along with everything beneath it, is removed.
 * 3) A bin that is empty, along with everything above it, has its parents
 *    removed and the contig relinked to the new root. (TODO)
 *
 * remove_empty_bins_r() does the tree walk. A non-zero return from it
 * causes the contig to be destroyed; otherwise "first" holds the bin that
 * should be the contig root, and the tree is relinked if that differs from
 * the current root.
 */
static void remove_empty_bins(GapIO *io, tg_rec contig) {
    contig_t *c = cache_search(io, GT_Contig, contig);
    tg_rec first = 0;
    bin_index_t *bin;
    tg_rec old_parent, old_root, cdummy;
    int offset;

    cache_incr(io, c);

    /* No root bin at all: nothing to tidy */
    if (!c->bin) {
	cache_decr(io, c);
	return;
    }

    /* Nothing left in the tree: drop the contig itself */
    if (remove_empty_bins_r(io, c->bin, &first)) {
	cache_decr(io, c);
	contig_destroy(io, contig);
	return;
    }

    /* Root is already the right bin */
    if (first == c->bin) {
	cache_decr(io, c);
	return;
    }

    /* Detach "first" from its parent and hang it directly off the contig */
    bin = cache_search(io, GT_Bin, first);
    bin = cache_rw(io, bin);
    old_parent = bin->parent;

    /* The bin's position must be expressed relative to the contig */
    bin_get_position(io, bin, &cdummy, &offset);
    assert(cdummy == contig);

    bin->pos         = offset;
    bin->parent      = contig;
    bin->parent_type = GT_Contig;
    bin->flags      |= BIN_BIN_UPDATED;

    c = cache_rw(io, c);
    old_root = c->bin;
    c->bin   = first;

    /* Unlink "first" from the bin that used to own it */
    bin = cache_search(io, GT_Bin, old_parent);
    bin = cache_rw(io, bin);
    if (bin->child[0] == first)
	bin->child[0] = 0;
    if (bin->child[1] == first)
	bin->child[1] = 0;

    /* Everything still reachable from the previous root is now waste */
    bin_destroy_recurse(io, old_root);

    cache_decr(io, c);
}
Example #7
0
/*
 * Deletes annotations from one contig.
 *
 * io	   Database handle
 * crec	   Contig record to process
 * h	   Hash of 4-character annotation types to delete, or NULL to
 *	   delete every annotation
 * verbose If non-zero, a message is printed for each annotation removed
 *
 * Returns 0 on success
 *        -1 on failure
 */
static int delete_tag_single_contig(GapIO *io, tg_rec crec,
				    HashTable *h, int verbose) {
    contig_iterator *iter;
    rangec_t *r;
    contig_t *c;
    int status = 0;

    iter = contig_iter_new_by_type(io, crec, 1, CITER_FIRST,
				   CITER_CSTART, CITER_CEND,
				   GRANGE_FLAG_ISANNO);
    if (NULL == iter)
	return -1;

    c = cache_search(io, GT_Contig, crec);
    if (NULL == c) {
	contig_iter_del(iter);
	return -1;
    }
    cache_incr(io, c);

    for (r = contig_iter_next(io, iter);
	 r != NULL;
	 r = contig_iter_next(io, iter)) {
	char t[5];

	(void)type2str(r->mqual, t);

	/* With a type filter present, leave unlisted types alone */
	if (h && !HashTableSearch(h, t, 4))
	    continue;

	if (verbose)
	    vmessage("Removing anno %s #%"PRIrec"\tContig %s\t%d..%d\n",
		     t, r->rec, c->name, r->start, r->end);

	/* FIXME: Need to reclaim the GT_AnnoEle record itself */
	if (bin_remove_item(io, &c, GT_AnnoEle, r->rec)) {
	    status = -1;
	    break;
	}
    }

    contig_iter_del(iter);
    cache_decr(io, c);

    return status;
}
Example #8
0
/*
 * ----------------------------------------------------------------------
 * Remove Pad Columns. Sometimes we don't want to realign data, we just
 * want to remove (aligned) columns of pads.
 * ----------------------------------------------------------------------
 *
 * io	       Database handle
 * ncontigs    Number of entries in contigs[]
 * contigs     Contig records with the start..end range to process in each
 * percent_pad Minimum percentage of pads (counts[4] relative to depth) a
 *	       column must have before it is removed
 * quiet       If zero, progress and per-column messages are printed
 *
 * A column is removed when its consensus call is 4 and its pad percentage
 * is at least percent_pad.
 *
 * Returns 0 on success
 *        -1 on failure (contig not found, out of memory, or consensus
 *           calculation failed). Columns already removed stay removed.
 */
int remove_pad_columns(GapIO *io, int ncontigs, contig_list_t *contigs,
		       int percent_pad, int quiet) {
    int i;
    int ret = -1;
    consensus_t *cons = NULL;	/* Scratch buffer, reused between contigs */
    size_t max_alloc = 0;	/* Elements currently allocated in cons */

    for (i = 0; i < ncontigs; i++) {
	tg_rec cnum = contigs[i].contig;
	size_t len, j;
	int ndel = 0;
	contig_t *c;

	if (!quiet) {
	    vmessage("Processing contig %d of %d (#%"PRIrec")\n",
		     i+1, ncontigs, cnum);
	    UpdateTextOutput();
	}

	/* An inverted range has no columns; len would otherwise wrap */
	if (contigs[i].end < contigs[i].start)
	    continue;

	c = cache_search(io, GT_Contig, cnum);
	if (!c)
	    goto done;

	cache_incr(io, c);

	len = (size_t)(contigs[i].end - contigs[i].start) + 1;
	if (max_alloc < len) {
	    /* Keep the old buffer reachable so it can be freed on failure */
	    consensus_t *tmp = realloc(cons, len * sizeof(*cons));
	    if (!tmp) {
		cache_decr(io, c);
		goto done;
	    }
	    cons = tmp;
	    max_alloc = len;
	}

	if (0 != calculate_consensus(io, cnum,
				     contigs[i].start, contigs[i].end,
				     cons)) {
	    cache_decr(io, c);
	    goto done;
	}

	for (j = 0; j < len; j++) {
	    if (cons[j].call != 4)
		continue;

	    /* Avoid dividing by zero on a column with no depth */
	    if (cons[j].depth == 0)
		continue;

	    if (100 * cons[j].counts[4] / cons[j].depth < percent_pad)
		continue;

	    if (!quiet)
		vmessage("  Removing column %d %d%% pad (%d of %d), conf. %f)\n",
			 (int)j+contigs[i].start,
			 100 * cons[j].counts[4] / cons[j].depth,
			 cons[j].counts[4], cons[j].depth,
			 cons[j].scores[cons[j].call]);

	    /*
	     * Each deletion moves later columns left by one, so correct
	     * the consensus index by the number removed so far.
	     */
	    contig_delete_base(io, &c, contigs[i].start + j - ndel);
	    ndel++;
	}

	cache_decr(io, c);
    }

    ret = 0;

 done:
    free(cons);

    return ret;
}
Example #9
0
/*
 * Takes a multiple alignment and updates the on-disk data structures to
 * match. This needs to correct confidence values, original positions and
 * tags too.
 *
 * io	  Database handle
 * cnum	  Record number of the contig that was aligned
 * malign Multiple alignment; malign->contigl is walked as a linked list
 *	  whose entries supply a sequence record (id) and the aligned
 *	  segment (mseg: seq, length, offset, comp)
 * indels Array of con_indel_t (pos, size). size > 0 inserts that many '*'
 *	  at pos+1; otherwise -size pads are deleted at pos+1.
 *
 * The function has no return value. Inconsistent alignment data (a
 * non-pad difference between old and new sequence) ends in abort().
 */
void update_io(GapIO *io, tg_rec cnum, MALIGN *malign, Array indels) {
    CONTIGL *cl;
    tg_rec rnum;
    range_t r, *r_out;
    bin_index_t *bin;
    contig_t *c = cache_search(io, GT_Contig, cnum);
    size_t i, nindel;

    cache_incr(io, c);

    /*
     * To minimise number of data modifications we use a three step approach.
     *
     * Step 1: insert columns of pads, shifting reads as appropriate.
     * Step 2: edit sequence alignments as required, possibly involving
     *         moving sequences and/or adding and removing pads.
     * Step 3: remove columns of entire pads.
     *
     * This means that when we introduce a column of pads we don't have
     * to make edits to every single read position down stream, and can
     * instead make use of the optimised recursive bin functions to do this
     * for us.
     */

    /* Step 1: make indels */
    nindel = ArrayMax(indels);
    for (i = 0; i < nindel; i++) {
	con_indel_t *id = arrp(con_indel_t, indels, i);
	int j;

	if (id->size > 0) {
	    contig_insert_bases(io, &c, id->pos+1, '*', -1, id->size);
	} else {
	    for (j = 0; j < -id->size; j++) {
		contig_delete_pad(io, &c, id->pos+1);
	    }
	}
    }

    /* Step 2: edit alignments */
    for (cl = malign->contigl; cl; cl = cl->next) {
	seq_t *s, *sorig;
	int len, update_range = 0;
	int shift;

	rnum = cl->id;
	
	/*
	 * Work on a private copy (s) of the cached sequence (sorig), put
	 * into the same orientation as the alignment segment.
	 */
	sorig = cache_search(io, GT_Seq, rnum);
	cache_incr(io, sorig);
	s = dup_seq(sorig);
	if (cl->mseg->comp)
	    complement_seq_t(s);

	/* Length of the clipped (left..right) portion */
	len = s->right - s->left + 1;

	/* Check if sequence has changed. If so assign a new one */
	if (cl->mseg->length != len ||
	    memcmp(s->seq + s->left-1, cl->mseg->seq, cl->mseg->length) != 0) {
	    /* left cutoff + right cutoff + new clipped segment */
	    int newlen = s->left-1 + ABS(s->len) - s->right + cl->mseg->length;
	    int i, j, np;
	    char   *newseq  = malloc(newlen+1);
	    int8_t *newconf = malloc(newlen+1);

	    /* Build new seq/conf arrays */
	    memcpy(newseq,  s->seq,  s->left-1);
	    memcpy(newconf, s->conf, s->left-1);

	    memcpy(&newseq[s->left-1], cl->mseg->seq, cl->mseg->length);

	    /*
	     * Step through both old and new sequences working out how
	     * they differ. This will (*should*) be entire pad movements.
	     * i = index to old seq
	     * j = index to new seq
	     * np = number of pads added minus removed from old seq.
	     */
	    np = 0;
	    for (i =j =s->left-1;
		 i < ABS(s->len) && j < s->left-1 + cl->mseg->length;
		 ) {
		/* Bases match */
		if (toupper(newseq[j]) == toupper(s->seq[i]) ||
		    (s->seq[i] == '.' && newseq[j] == 'N')) {
		    /* Keep the case of the old base and its confidence */
		    if (isupper(s->seq[i]))
			newseq[j] = toupper(newseq[j]);
		    else
			newseq[j] = tolower(newseq[j]);
		    newconf[j] = s->conf[i];
		    i++, j++;
		    continue;
		}

		/* Pad removed */
		if (s->seq[i] == '*') {
		    i++;
		    tag_shift_for_delete(io, cnum, rnum, cl->mseg->offset,
					 cl->mseg->length, i+np--,
					 s->bin);
		    /*
		    if (io_length(io, rnum) < 0) {
			tag_shift_for_delete(io, rnum, r.length - i + 1);
		    } else {
			tag_shift_for_delete(io, rnum, i+np--);
		    }
		    */
		    continue;
		}

		/* Pad created */
		if (newseq[j] == '*') {
		    int k;
		    int ql = 0, qr = 0;
		    /* Confidence of nearest non-pad base on each side */
		    for (k = i-1; k >= 0; k--) {
			if (s->seq[k] != '*') {
			    ql = s->conf[k];
			    break;
			}
		    }
		    for (k = i+1; k < s->right; k++) {
			if (s->seq[k] != '*') {
			    qr = s->conf[k];
			    break;
			}
		    }
		    newconf[j] = MIN(ql, qr); /* min conf of neighbours */
		    j++;
		    tag_shift_for_insert(io, cnum, rnum, cl->mseg->offset,
					 cl->mseg->length, i+ ++np,
					 s->bin);
		    /*
		    if (io_length(io, rnum) < 0) {
			tag_shift_for_insert(io, rnum, r.length - i + 1);
		    } else {
			tag_shift_for_insert(io, rnum, i+ ++np);
		    }
		    */
		    continue;
		}

		/* Neither a match nor a pad change: alignment is invalid */
		fprintf(stderr, "Alignment introduced non-pad character");
		abort();
	    }

	    /* Pads previously at the end of the reading & now removed */
	    while (i < s->right) {
		if (s->seq[i] == '*') {
		    i++;
		    tag_shift_for_delete(io, cnum, rnum, cl->mseg->offset,
					 cl->mseg->length, i+np--,
					 s->bin);
		    /*
		    if (io_length(io, rnum) < 0) {
			tag_shift_for_delete(io, rnum, r.length - i + 1);
		    } else {
			tag_shift_for_delete(io, rnum, i+np--);
		    }
		    */
		} else {
		    /* Error: clipped data that wasn't a pad */
		    abort();
		}
	    }

	    /* Should only be pads remaining in newseq, if anything */
	    s->right = j;
	    for (; j < s->left-1 + cl->mseg->length; j++) {
		if (newseq[j] != '*') {
		    fprintf(stderr, "Alignment introduced non-pad character");
		    abort();
		}
		newconf[j] = 0;
	    }

	    /* Append on the right hand cutoff data */
	    for (; i < ABS(s->len); i++, j++) {
		newseq[j]  = s->seq[i];
		newconf[j] = s->conf[i];
	    }
	    /* Sanity check: everything must add up to the computed length */
	    if (j != newlen) {
		abort();
	    }

	    /* Write it back out */
	    /* Copy newseq/newconf into seq_t */

	    s->seq = newseq;
	    s->conf = newconf;
	    update_range = 0;
	    if (ABS(s->len) != j) {
		/* Length change implies updating the range array too */
		s->len = s->len >= 0 ? j : -j;
		update_range = 1;
	    }

	    /* Return the copy to the stored orientation */
	    if (cl->mseg->comp)
		complement_seq_t(s);

	    /* The memcpy trashes the block pointer, so special care needed */
	    {
		sorig = cache_rw(io, sorig);
		void *blk = sorig->block;
		memcpy(sorig, s, sizeof(seq_t)); 
		sorig->block = blk;
	    }

	    /* A new length needs a differently sized cache item */
	    if (update_range)
		sorig = cache_item_resize(sorig, sizeof(*sorig) +
					  sequence_extra_len(sorig));

	    sequence_reset_ptr(sorig);

	    /* Copy the variable length fields across from the edited copy */
	    if (s->name)
		memcpy(sorig->name,       s->name,       s->name_len+1);
	    if (s->trace_name)
		memcpy(sorig->trace_name, s->trace_name, s->trace_name_len+1);
	    if (s->alignment)
		memcpy(sorig->alignment,  s->alignment,  s->alignment_len+1);
	    memcpy(sorig->seq,  s->seq,  ABS(s->len));
	    memcpy(sorig->conf, s->conf, ABS(s->len));

	    xfree(newconf);
	    xfree(newseq);
	}

	/*
	 * Work out where the clipped start of the sequence currently is and
	 * compare it with where the alignment wants it (mseg->offset+1).
	 * "shift" is the cutoff length preceding the clipped data.
	 */
	{
	    int st, en, or;
	    sequence_get_position(io, s->rec, NULL, &st, &en, &or);
	    if (or ^ (sorig->len < 0)) {
		shift = ABS(sorig->len) - sorig->right;
	    } else {
		shift = sorig->left-1;
	    }
	    st += shift;
	    if (st != cl->mseg->offset+1) {
		update_range = 1;
	    }
	}

	/* Finished with the private copy; s is reassigned to sorig below */
	free(s);

	if (update_range) {
	    int bin_changed = 0;

	    /* Get old range and pair data */
	    s = sorig;
	    bin = cache_search(io, GT_Bin, s->bin);
	    r = *arrp(range_t, bin->rng, s->bin_index);
	    assert(r.rec == s->rec);

	    /* Update range, tedious and slow way */
	    bin_remove_item(io, &c, GT_Seq, s->rec);
	    r.start = cl->mseg->offset + 1 - shift;
	    r.end   = r.start + ABS(s->len) - 1;
	    bin = bin_add_range(io, &c, &r, &r_out, NULL, 0);

	    /* Check if the new bin has a different complemented status too */
	    if (s->bin != bin->rec) {
		int old_comp = bin_get_orient(io, s->bin);
		int new_comp = bin_get_orient(io, bin->rec);

		if (new_comp != old_comp) {
		    //int tmp;
		    s = cache_rw(io, s);
		    s->len *= -1;
		    s->flags ^= SEQ_COMPLEMENTED;
		    //tmp = s->left;
		    //s->left  = ABS(s->len) - (s->right-1);
		    //s->right = ABS(s->len) - (tmp-1);
		}

		bin_changed = 1;
	    }
	
	    /* Update seq bin & bin_index fields */
	    s = cache_rw(io, s);
	    s->bin = bin->rec;
	    s->bin_index = r_out - ArrayBase(range_t, bin->rng);

	    if (bin_changed) {
		if (-1 == sequence_fix_anno_bins(io, &s)) {
		    verror(ERR_WARN, "update_io",
			   "sequence_fix_anno_bins() failure");
		}
	    }
	}

	cache_decr(io, sorig);
    }

    /* Step 3 (remove pad columns) done in calling function. */

    cache_decr(io, c);
}
Example #10
0
/*
 * Entry point for the viewer.
 *
 * Options:
 *   -h      Print usage and exit
 *   -d      Enable DISPLAY_DIFFS
 *   -c/-C   Enable / disable DISPLAY_CUTOFFS
 *   -l N    Non-interactive mode; N is passed to print_output()
 *   -x N    Start at contig number N (1-based on the command line)
 *   -e      Open the database read/write instead of read-only
 *
 * The first non-option argument is the database name; an optional second
 * one is the starting position.
 *
 * Returns 0 on success, 1 on a usage or open/read error.
 */
int main(int argc, char **argv) {
    GapIO *io;
    int xpos = 0;
    int opt;
    int lp_mode = 0;
    int mode = DISPLAY_QUAL | DISPLAY_CUTOFFS;
    extern char *optarg;
    contig_t *c = NULL;
    int cnum = 0;
    int read_only = 1;

    while ((opt = getopt(argc, argv, "hl:dcCx:e")) != -1) {
        switch (opt) {
        case '?':
        case 'h':
            usage();
            return 0;

        case 'd':
            mode |= DISPLAY_DIFFS;
            break;

        case 'c':
            mode |= DISPLAY_CUTOFFS;
            break;

        case 'C':
            mode &= ~DISPLAY_CUTOFFS;
            break;

        case 'l':
            lp_mode = atoi(optarg);
            break;

        case 'x':
            /* Contigs are numbered from 1 on the command line */
            cnum = atoi(optarg)-1;
            break;

        case 'e':
            read_only = 0;
            break;

        default:
            if (opt == ':')
                fprintf(stderr, "Missing parameter\n");
            else
                fprintf(stderr, "Unknown option '%c'\n", opt);
            usage();
            return 1;
        }
    }

    /* A database name is mandatory */
    if (optind == argc) {
        usage();
        return 1;
    }

    if (NULL == (io = gio_open(argv[optind], read_only, 0))) {
        /* Report the name we tried, which is not argv[1] after options */
        fprintf(stderr, "Unable to open db: %s\n", argv[optind]);
        return 1;
    }
    optind++;

    if (optind != argc) {
        xpos = atoi(argv[optind]);
    }

    io->contig_num = cnum;
    gio_read_contig(io, cnum, &c);
    /*
     * NOTE(review): relies on gio_read_contig() leaving c NULL when the
     * contig cannot be read - confirm.
     */
    if (!c) {
        fprintf(stderr, "Unable to read contig %d\n", cnum+1);
        gio_close(io);
        return 1;
    }
    cache_incr(io, c);

#ifdef TEST_MODE
    //test_mode(io, &c, xpos);
    test_mode2(io, &c, xpos);
    //benchmark(io, &c);
    //test_mode3(io, arr(GCardinal, io->contig_order, cnum), xpos);
#endif

    if (lp_mode) {
        print_output(io, &c, xpos, lp_mode, mode);
        gio_close(io);
    } else {
        init_curses();
        curses_loop(io, &c, xpos, mode);
        endwin();

        if (io->cache && io->debug_level > 0) {
            fputs("\n=== cache ===", stderr);
            HacheTableStats(io->cache, stderr);
        }

        gio_close(io);
    }

    /* The banner is only shown after an interactive session */
    if (!lp_mode) {
        printf("\n\n\tg_view:\tShort Read Alignment Viewer, version 1.2.11"SVN_VERS"\n");
        printf("\n\tAuthor:\tJames Bonfield ([email protected])\n");
        printf("\t\t2007-2011, Wellcome Trust Sanger Institute\n\n");
    }

    return 0;
}
Example #11
0
/*
 * Searches the consensus for the next position whose discrepancy score is
 * at least the given threshold, and moves the editor cursor there.
 *
 * xx	  Editor view; the search starts next to xx->cursor_apos
 * dir	  Non-zero to search forwards, zero to search backwards
 * strand Not used by this function
 * value  Threshold as a decimal string (parsed with atof)
 *
 * The consensus is computed WIN_WIDTH columns at a time. Forward searches
 * scan each window left to right, backward searches right to left, so the
 * hit nearest to the cursor is reported.
 *
 * Returns 0 if a position was found (cursor moved)
 *        -1 if not found, or the contig/consensus could not be obtained
 */
int edview_search_cons_discrep(edview *xx, int dir, int strand, char *value) {
    int start, end;
    int found = 0, at_end = 0;
    int fpos, i;
    double qval = atof(value);
    consensus_t cons[WIN_WIDTH+1];
    contig_t *c;

    /* Set initial start positions */
    if (dir) {
	start = xx->cursor_apos + 1;
	end   = start + (WIN_WIDTH-1);
    } else {
	end   = xx->cursor_apos - 1;
	start = end - (WIN_WIDTH-1);
    }
    fpos = xx->cursor_apos;

    /* Loop WIN_WIDTH block at a time */
    c = cache_search(xx->io, GT_Contig, xx->cnum);
    if (!c)
	return -1;
    cache_incr(xx->io, c);
    do {
	/* cons[] is not meaningful if the calculation failed */
	if (0 != calculate_consensus(xx->io, xx->cnum, start, end, cons))
	    break;

	if (dir) {
	    for (i = 0; i < WIN_WIDTH; i++) {
		if (cons[i].discrep >= qval) {
		    found = 1;
		    break;
		}
	    }
	} else {
	    /* Index 0 must be included or one column per window is missed */
	    for (i = WIN_WIDTH-1; i >= 0; i--) {
		if (cons[i].discrep >= qval) {
		    found = 1;
		    break;
		}
	    }
	}

	if (found) {
	    fpos = start + i;
	    break;
	}

	/* Next search region - the adjacent, non-overlapping window */
	if (dir) {
	    start += WIN_WIDTH;
	    end   += WIN_WIDTH;

	    if (start > c->end)
		at_end = 1;
	} else {
	    start -= WIN_WIDTH;
	    end   -= WIN_WIDTH;

	    if (end < c->start)
		at_end = 1;
	}
    } while (!at_end);
    cache_decr(xx->io, c);

    if (found) {
	edSetCursorPos(xx, GT_Contig, xx->cnum, fpos, 1);
	return 0;
    }

    return -1;
}
Example #12
0
/*
 * A recursive break contig function.
 * h		Hash keyed on sequence record; the value is 0 when the
 *		sequence stays in the left contig and 1 when it moves right
 * cl		The left contig
 * cr		The right contig
 * bin_num	The current bin being moved or split.
 * pos		The contig break point.
 * offset	The absolute positional offset of this bin in original contig
 * level	Recursion depth, used only to indent the debug output
 * pleft	The parent bin/contig record num in the left new contig
 * pright	The parent bin/contig record num in the right new contig
 * child_no     0 or 1 - whether this bin is the left/right child of its parent
 * complement	Accumulated complement state of the parent bins; toggled
 *		here when this bin has BIN_COMPLEMENTED set
 *
 * Returns 0 on success
 *        -1 on failure. The reference taken on the bin is released on
 *           failure as well as on success.
 */
static int break_contig_recurse(GapIO *io, HacheTable *h,
				contig_t *cl, contig_t *cr,
				tg_rec bin_num, int pos, int offset,
				int level, tg_rec pleft, tg_rec pright,
				int child_no, int complement) {
    int i, j, f_a, f_b;
    tg_rec rbin;
    bin_index_t *bin = get_bin(io, bin_num), *bin_dup ;
    //int bin_min, bin_max;
    int nseqs;
    tg_rec opright; /* old pright, needed if we revert back */

    cache_incr(io, bin);

    if (bin->flags & BIN_COMPLEMENTED) {
	complement ^= 1;
    }

    /* f_a/f_b are referenced by the NMIN/NMAX coordinate macros */
    if (complement) {
	f_a = -1;
	f_b = offset + bin->size-1;
    } else {
	f_a = +1;
	f_b = offset;
    }

    printf("%*sBreak offset %d pos %d => test bin %"PRIrec": %d..%d\n",
	   level*4, "",
	   offset, pos, bin->rec,
	   NMIN(bin->start_used, bin->end_used),
	   NMAX(bin->start_used, bin->end_used));

    /* nseqs is rebuilt below via bin_incr_nseq() */
    bin = cache_rw(io, bin);
    nseqs = bin->nseqs;
    bin->nseqs = 0;

    /* Invalidate any cached data */
    bin_invalidate_track(io, bin, TRACK_ALL);
    if (bin->flags & BIN_CONS_VALID) {
	bin->flags |= BIN_BIN_UPDATED;
	bin->flags &= ~BIN_CONS_VALID;
    }

    //bin_min = bin->rng ? NMIN(bin->start_used, bin->end_used) : offset;
    //bin_max = bin->rng ? NMAX(bin->start_used, bin->end_used) : offset;

    /*
     * Add to right parent if this bin is to the right of pos,
     * or if the used portion is to the right and we have no left child.
     *
     * FIXME: Not a valid assumption!
     * The used portion of a bin is not a placeholder for the used portion
     * of all the the children beneath it. Therefore if the used portion of
     * this bin is > pos (and we have no left child) it still doesn't mean
     * that the absolute positions of the used portion of the right child
     * won't be < pos.
     */
    if (offset >= pos /*|| (bin_min >= pos && !bin->child[0])*/) {
	printf("%*sADD_TO_RIGHT pl=%"PRIrec" pr=%"PRIrec"\n",
	       level*4, "", pleft, pright);
	if (0 != break_contig_move_bin(io, bin,
				       cl, pleft, cr, pright, 
				       child_no)) {
	    /* Don't leave the bin pinned in the cache on failure */
	    cache_decr(io, bin);
	    return -1;
	}

	bin_incr_nseq(io, bin, nseqs);
	cache_decr(io, bin);

	return 0;
    }

    /*
     * Add to left parent if this bin is entirely to the left of pos,
     * or if the used portion is to the left and we have no right child.
     */
    if (offset + bin->size < pos /*|| (bin_max < pos && !bin->child[1])*/) {
	printf("%*sADD_TO_LEFT\n", level*4, "");

	//if (0 != break_contig_move_bin(io, bin, cr, pright, cl, pleft, child_no))
	//return -1;

	bin_incr_nseq(io, bin, nseqs);
	cache_decr(io, bin);
	
	return 0;
    }

    /*
     * Nominally the bin overlaps both left and right and so needs duplicating.
     * There are cases though at the roots of our trees where duplicating is
     * unnecessary as it leads to empty bins at the root. In this case
     * we skip creating a duplicate for the right, or alternatively steal
     * the left root bin and use that instead.
     *
     * Similarly the range_t array will either be left where it is, moved to
     * the right contig, or split in half (creating a new one for the right).
     *
     * FIXED: always need this. Eg:
     *
     * |-------------empty--------------|
     * |----------------|---------------|
     * |--------|-------|--------|------|
     *             ^
     *             |
     *             break here
     *
     * In this case we need to duplicate the parent as it overlaps the left
     * bin, which may (or may not) have data that needs to end up in the right
     * hand contig. Just duplicate for now and free later on if needed.
     */
    if (1 /* always! */ || pright != cr->rec ||
	(bin->rng && NMAX(bin->start_used, bin->end_used) >= pos)) {
	//printf("NMAX=%d >= %d\n", NMAX(bin->start_used, bin->end_used), pos);

	rbin = 0;

	/* Possibly steal left contig's bin */
	if (pleft == cl->rec && NMIN(bin->start_used, bin->end_used) >= pos) {
#if 0
	    /* Currently this doesn't always work */
	    if (bin->child[1]) {
		bin_index_t *ch = get_bin(io, bin->child[1]);
		if (NMIN(ch->pos, ch->pos + ch->size-1) >= pos) {
		    rbin = cl->bin;
		    cl->bin = bin->child[0];
		}
	    }
#else
	    pleft = bin->rec;
#endif
	} else {
	    pleft = bin->rec;
	}

	/* Create new bin, or use root of contig if it's unused so far */
	if (!rbin && pright == cr->rec) {
	    rbin = cr->bin;
	}

	/* Otherwise we genuinely need a duplicate */
	if (!rbin)
	    rbin = bin_new(io, 0, 0, 0, GT_Bin);

	/* Initialise with duplicate values from left bin */
	bin_dup = get_bin(io, rbin);
	bin_dup = cache_rw(io, bin_dup);
	bin_dup->size = bin->size;
	bin_dup->pos = bin->pos;
	bin_dup->parent = pright;
	bin_dup->parent_type = (pright == cr->rec ? GT_Contig : GT_Bin);
	bin_dup->flags = bin->flags | BIN_BIN_UPDATED;
	bin_dup->start_used = bin->start_used;
	bin_dup->end_used = bin->end_used;

	/*
	 * Shift bin to offset if it's the contig root.
	 * It'll be shifted back by the correct amount later.
	 */
	if (pright == cr->rec) {
	    printf("moving root bin to offset=%d comp=%d\n", offset, complement);
	    bin_dup->pos = offset;
	}

	printf("%*sCreated dup for right, rec %"PRIrec"\n",
	       level*4,"", bin_dup->rec);
	/* NOTE(review): the result of this call is not checked */
	break_contig_move_bin(io, bin_dup, cl, 0, cr, pright, child_no);
	opright = pright;
	pright = bin_dup->rec;
    } else {
	bin_dup = NULL;
	pleft = bin->rec;
    }

    if (!bin->rng) {
	/* Empty bin */
	printf("%*sEMPTY range\n", level*4, "");
	bin->start_used = bin->end_used = 0;
	bin->flags |= BIN_BIN_UPDATED;
	if (bin_dup) {
	    bin_dup->start_used = bin_dup->end_used = 0;
	    bin_dup->flags |= BIN_BIN_UPDATED;
	}
	    
    } else if (NMIN(bin->start_used, bin->end_used) >= pos) {
	/* Move range to right contig */
	printf("%*sDUP %"PRIrec", MOVE Array to right\n",
	       level*4, "", bin_dup->rec);

	bin_dup->rng = bin->rng;
	bin_dup->rng_rec = bin->rng_rec;
	bin_dup->rng_free = bin->rng_free;
	if (bin_dup->rng_rec)
	    bin_dup->flags |= BIN_RANGE_UPDATED;

	/* Only clear the source when it is a genuinely different bin */
	if (bin->rec != bin_dup->rec) {
	    bin->rng = NULL;
	    bin->rng_rec = 0;
	    bin->rng_free = -1;
	    bin->flags |= BIN_BIN_UPDATED;
	}

	bin->start_used = bin->end_used = 0;
	break_contig_reparent_seqs(io, bin_dup);

	if (bin_dup->rng) {
	    int n = ArrayMax(bin_dup->rng);
	    for (i = j = 0; i < n; i++) {
		range_t *r = arrp(range_t, bin_dup->rng, i);
		if (r->flags & GRANGE_FLAG_UNUSED)
		    continue;

		/* Record every non-annotation item as being on the right */
		if ((r->flags & GRANGE_FLAG_ISMASK) != GRANGE_FLAG_ISANNO) {
		    HacheData hd; hd.i = 1;
		    HacheTableAdd(h, (char *)&r->rec, sizeof(r->rec), hd,NULL);
		    j++;
		}
	    }
	    bin_incr_nseq(io, bin_dup, j);
	}
    } else if (NMAX(bin->start_used, bin->end_used) < pos) {
	/* Range array already in left contig, so do nothing */
	printf("%*sMOVE Array to left\n", level*4, "");

	if (bin_dup)
	    bin_dup->start_used = bin_dup->end_used = 0;

	if (bin->rng) {
	    int n = ArrayMax(bin->rng);
	    for (i = j = 0; i < n; i++) {
		range_t *r = arrp(range_t, bin->rng, i);
		if (r->flags & GRANGE_FLAG_UNUSED)
		    continue;

		/* Record every non-annotation item as being on the left */
		if ((r->flags & GRANGE_FLAG_ISMASK) != GRANGE_FLAG_ISANNO) {
		    HacheData hd; hd.i = 0;
		    HacheTableAdd(h, (char *)&r->rec, sizeof(r->rec), hd,NULL);
		    j++;
		}
	    }
	    bin_incr_nseq(io, bin, j);
	}
    } else {
	/* Range array covers pos, so split in two */
	int n, nl = 0, nr = 0;
	int lmin = bin->size, lmax = 0, rmin = bin->size, rmax = 0;

	printf("%*sDUP %"PRIrec", SPLIT array\n", level*4, "", bin_dup->rec);

	bin->flags |= BIN_RANGE_UPDATED;
	bin_dup->flags |= BIN_RANGE_UPDATED;

	bin_dup->rng = ArrayCreate(sizeof(range_t), 0);
	bin_dup->rng_free = -1;

	/* Pass 1 - hash sequences */
	n = ArrayMax(bin->rng);
	for (i = 0; i < n; i++) {
	    range_t *r = arrp(range_t, bin->rng, i);
	    int cstart; /* clipped sequence positions */
	    seq_t *s;

	    if (r->flags & GRANGE_FLAG_UNUSED)
		continue;

	    if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISANNO)
		continue;

	    s = (seq_t *)cache_search(io, GT_Seq, r->rec);
	    if ((s->len < 0) ^ complement) {
		cstart = NMAX(r->start, r->end) - (s->right-1);
	    } else {
		cstart = NMIN(r->start, r->end) + s->left-1;
	    }
	    
	    if (cstart >= pos)  {
		HacheData hd; hd.i = 1;
		HacheTableAdd(h, (char *)&r->rec, sizeof(r->rec), hd, NULL);
	    } else {
		HacheData hd; hd.i = 0;
		HacheTableAdd(h, (char *)&r->rec, sizeof(r->rec), hd, NULL);
	    }
	}
	
	/* Pass 2 - do the moving of anno/seqs */
	n = ArrayMax(bin->rng);
	for (i = j = 0; i < n; i++) {
	    range_t *r = arrp(range_t, bin->rng, i), *r2;
	    int cstart; /* clipped sequence positions */

	    if (r->flags & GRANGE_FLAG_UNUSED)
		continue;

	    if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISANNO) {
		cstart = NMAX(r->start, r->end);
	    } else {
		seq_t *s = (seq_t *)cache_search(io, GT_Seq, r->rec);
		if ((s->len < 0) ^ complement) {
		    cstart = NMAX(r->start, r->end) - (s->right-1);
		} else {
		    cstart = NMIN(r->start, r->end) + s->left-1;
		}
	    }
	    
	    if (cstart >= pos &&
		((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISANNO)) {
		anno_ele_t *a = (anno_ele_t *)cache_search(io,
							   GT_AnnoEle,
							   r->rec);
		/* If it's an annotation on a sequence < pos then we
		 * still don't move.
		 *
		 * FIXME: we have no guarantee that the sequence being
		 * annotated is in the same bin as this annotation, as
		 * they may be different sizes and end up in different
		 * bins. (Should we enforce anno always in same bin as seq?
		 * If so, consensus annos fit anywhere?)
		 */
		if (a->obj_type == GT_Seq) {
		    HacheItem *hi = HacheTableSearch(h,
						     (char *)&r->pair_rec,
						     sizeof(r->pair_rec));

		    if (hi) {
			/* Sequence stays left, so force the tag left too */
			if (hi->data.i == 0)
			    cstart = pos-1;
		    } else {
			puts("FIXME: annotation for seq in unknown place - "
			     "work out correct location and move if needed.");
		    }
		}
	    }

	    if (cstart >= pos) {
		/* Append to the right-hand array and track its extents */
		r2 = (range_t *)ArrayRef(bin_dup->rng, ArrayMax(bin_dup->rng));
		*r2 = *r;
		if (rmin > r->start) rmin = r->start;
		if (rmin > r->end)   rmin = r->end;
		if (rmax < r->start) rmax = r->start;
		if (rmax < r->end)   rmax = r->end;
		if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISSEQ)
		    nr++;
	    } else {
		if (lmin > r->start) lmin = r->start;
		if (lmin > r->end)   lmin = r->end;
		if (lmax < r->start) lmax = r->start;
		if (lmax < r->end)   lmax = r->end;

		/* Compact the left-hand array in place */
		if (j != i) {
		    r2 = arrp(range_t, bin->rng, j);
		    *r2 = *r;
		}
		j++;
		if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISSEQ)
		    nl++;
	    }
	}
	bin_incr_nseq(io, bin, nl);
	bin_incr_nseq(io, bin_dup, nr);


	ArrayMax(bin->rng) = j;

#if 0
	/*
	 * Right now this causes problems, but I'm not sure why. Try again
	 * after we've fixed the bin->nseqs issues and other deallocation
	 * woes.
	 */

	if (ArrayMax(bin_dup->rng) == 0 && bin_dup->parent_type == GT_Bin) {
	    /* We didn't need it afterall! Odd. */
	    bin_index_t *pb;

	    printf("Purging bin %d that we didn't need afterall\n",
		   bin_dup->rec);
	    cache_rec_deallocate(io, GT_Bin, bin_dup->rec);
	    pb = cache_search(io, GT_Bin, bin_dup->parent);
	    if (pb->child[0] == bin_dup->rec)
		pb->child[0] = 0;
	    if (pb->child[1] == bin_dup->rec)
		pb->child[1] = 0;
	    bin_dup = NULL;
	    pright = opright;
	}
#endif

	if (bin_dup)
	    break_contig_reparent_seqs(io, bin_dup);

	if (lmin < lmax) {
	    bin->start_used     = lmin;
	    bin->end_used       = lmax;
	} else {
	    /* No data left in bin */
	    bin->start_used = 0;
	    bin->end_used = 0;
	}

	printf("%*sLeft=>%d..%d right=>%d..%d\n", level*4, "",
	       lmin, lmax, rmin, rmax);

	if (bin_dup) {
	    if (rmin < rmax) {
		bin_dup->start_used = rmin;
		bin_dup->end_used   = rmax;
	    } else {
		/* No data moved in bin */
		bin_dup->start_used = 0;
		bin_dup->end_used   = 0;
	    }
	}
    }


    /* Recurse */
    for (i = 0; i < 2; i++) {
	bin_index_t *ch;
	if (!bin->child[i])
	    continue;

	ch = get_bin(io, bin->child[i]);
	if (0 != break_contig_recurse(io, h, cl, cr, bin->child[i], pos,
				      NMIN(ch->pos, ch->pos + ch->size-1),
				      level+1, pleft, pright,
				      i, complement)) {
	    /* Don't leave the bin pinned in the cache on failure */
	    cache_decr(io, bin);
	    return -1;
	}
    }

    cache_decr(io, bin);
    //    if (bin_dup)
    //	cache_decr(io, bin_dup);

    return 0;
}
Example #13
0
/*
 * Open a database, optionally in read-only mode and creating if desired too.
 *
 * If connecting read/write fails the connect is retried once in read-only
 * mode; io->read_only records the mode that was finally used.
 *
 * Returns GapIO pointer to DB on success.
 *         NULL on failure.
 */
GapIO *gio_open(char *fn, int ro, int create) {
    GapIO *io;
    char *cp;

    if (NULL == (io = (GapIO *)calloc(1, sizeof(*io))))
	return NULL;

    /*
     * Copy the name: only the component following the last '/', so the
     * separator itself is not part of io->name.
     */
    if ((cp = strrchr(fn, '/')))
	cp++;
    else
	cp = fn;
    if (NULL == (io->name = strdup(cp)))
	goto fail;

    io->iface = get_iface_g();
    if (create) {
	if (-1 == io->iface->create(fn))
	    goto fail;
    }

    io->min_bin_size = MIN_BIN_SIZE; /* default */

    /* Initialise the cache */
    cache_create(io);

    if (NULL == (io->dbh = io->iface->connect(fn, ro))) {
	/* Already read-only, so there is nothing weaker left to try */
	if (ro)
	    goto fail;

	/* Read/write connect failed: retry once read-only */
	ro = 1;
	if (NULL == (io->dbh = io->iface->connect(fn, ro)))
	    goto fail;
    }

    io->read_only = ro;

    if (create) {
	io->iface->database.create(io->dbh, NULL);
    }

    /* Cache the GDatabase struct */
    if (NULL == (io->db = cache_search(io, GT_Database, 0)))
	goto fail;
    cache_incr(io, io->db);

    if (io->db->version > DB_VERSION) {
	verror(ERR_WARN, "Open Database",
	       "Database version %d is too new for this version of gap5",
	       io->db->version);
	goto fail;
    }

    /* Load the contigs array */
    io->contig_order = cache_search(io, GT_RecArray, io->db->contig_order);
    cache_incr(io, io->contig_order);

    /* Load the library array */
    io->library = cache_search(io, GT_RecArray, io->db->library);
    cache_incr(io, io->library);

    /* Initialise the contig and cursor registration hashes */
    contig_register_init(io);

    io->iface->setopt(io->dbh, OPT_COMP_MODE, COMP_MODE_ZLIB);

    return io;

 fail:
    /*
     * Release what this function allocated directly.
     * NOTE(review): the cache built by cache_create() and any handle
     * returned by connect() presumably need their own teardown calls;
     * confirm which destructors apply and add them here.
     */
    free(io->name);
    free(io);
    return NULL;
}
Example #14
0
/*
 * Complements a scaffold; both complementing each contig within it and
 * reversing the order of contigs in the scaffold.
 *
 * The positions in io->contig_order that currently hold members of this
 * scaffold are rewritten, in place, with the members in their new
 * (reversed) order. REG_BUFFER_START, REG_ORDER and REG_BUFFER_END
 * notifications are then sent for every member contig.
 *
 * A scaffold with no member contigs is left untouched and is a success.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int complement_scaffold(GapIO *io, tg_rec srec) {
    scaffold_t *f;
    int i, j, nc = ArrayMax(io->contig_order);
    int nmemb;
    scaffold_member_t *contigs;
    tg_rec *crecs;
    HashTable *h;
    reg_order ro;
    reg_buffer_start rs;
    reg_buffer_end re;

    if (!(f = cache_search(io, GT_Scaffold, srec)))
	return -1;
    if (!(f = cache_rw(io, f)))
	return -1;
    cache_incr(io, f);

    /* Nothing to do for a scaffold without a (non-empty) contig array */
    if (!f->contig || ArrayMax(f->contig) == 0) {
	cache_decr(io, f);
	return 0;
    }

    contigs = ArrayBase(scaffold_member_t, f->contig);
    nmemb   = ArrayMax(f->contig);

    /*
     * Build a hash table of the contigs in this scaffold. This is done
     * before anything is modified so that running out of memory here
     * leaves both the scaffold and the contig order untouched. The set of
     * keys is the same whichever order the members are in.
     */
    if (!(h = HashTableCreate(nc, 0))) {
	cache_decr(io, f);
	return -1;
    }
    for (i = 0; i < nmemb; i++) {
	HashData hd;
	hd.i = 0;
	if (!HashTableAdd(h, (char *)&contigs[i].rec, sizeof(tg_rec),
			  hd, NULL)) {
	    HashTableDestroy(h, 0);
	    cache_decr(io, f);
	    return -1;
	}
    }

    /* Complement contigs */
    for (i = 0; i < nmemb; i++) {
	complement_contig(io, contigs[i].rec);
    }

    /* Reverse the order of the contigs in the scaffold array */
    for (i = 0, j = nmemb-1; i < j; i++, j--) {
	scaffold_member_t cr1 = contigs[i];
	contigs[i] = contigs[j];
	contigs[j] = cr1;
    }

    /*
     * Reverse the order of contigs in the contig_order array too.
     * This is the part that really matters. It's also hard as the contigs
     * in the contig order array could be in any order and not adjacent.
     * For our purposes we'll just ensure the contigs in this scaffold in
     * the contig order array match our freshly complemented scaffold
     * ordering.
     *
     * We iterate through contig_order copying out the next contig of the
     * new scaffold order whenever an entry matches the hash.
     */
    crecs = ArrayBase(tg_rec, io->contig_order);
    for (i = j = 0; i < nc; i++) {
	if (!HashTableSearch(h, (char *)&crecs[i], sizeof(tg_rec)))
	    continue;

	crecs[i] = contigs[j++].rec;
    }

    /* Send event messages around */
    rs.job = REG_BUFFER_START;
    for (i = 0; i < nc; i++) {
	if (!HashTableSearch(h, (char *)&crecs[i], sizeof(tg_rec)))
	    continue;

	contig_notify(io, crecs[i], (reg_data *)&rs);
    }

    /* Tell each member its new 1-based position in the contig order */
    ro.job = REG_ORDER;
    for (i = 0; i < nc; i++) {
	if (!HashTableSearch(h, (char *)&crecs[i], sizeof(tg_rec)))
	    continue;

	ro.pos = i+1;
	contig_notify(io, crecs[i], (reg_data *)&ro);
    }

    /* Notify the end of our updates */
    re.job = REG_BUFFER_END;
    for (i = 0; i < nc; i++) {
	if (!HashTableSearch(h, (char *)&crecs[i], sizeof(tg_rec)))
	    continue;

	contig_notify(io, crecs[i], (reg_data *)&re);
    }

    HashTableDestroy(h, 0);
    cache_decr(io, f);

    return 0;
}
Example #15
0
File: parse.c Project: dolfly/nmdb
/*
 * Parses an "increment" request and acts on it.
 *
 * Malformed requests are answered with ERR_BROKEN, and requests made while
 * settings.read_only is set with ERR_RO. Otherwise the cache is updated
 * and, unless the request is cache-only, the operation is queued with
 * put_in_queue(); in that case no reply is sent from here on success.
 * Cache-only requests are answered with REP_NOTIN on a cache miss or
 * REP_OK carrying the new value in network byte order.
 */
static void parse_incr(struct req_info *req)
{
	int cres, cache_only, rv;
	const unsigned char *key;
	uint32_t ksize;
	uint64_t need;
	int64_t increment, newval;
	const int max = 65536;

	/* Request format:
	 * 4		ksize
	 * ksize	key
	 * 8		increment (big endian int64_t)
	 */

	/* The length prefix must be present before it can be read */
	if (req->psize < sizeof(uint32_t)) {
		stats.net_broken_req++;
		req->reply_err(req, ERR_BROKEN);
		return;
	}

	ksize = * (uint32_t *) req->payload;
	ksize = ntohl(ksize);

	/* Sanity check on sizes:
	 * - the payload left after the 4 byte prefix must hold ksize + 8
	 * - ksize + 8 must be <= 2^16 = 64k
	 *
	 * The sum is computed in 64 bits: in 32 bits a ksize close to
	 * UINT32_MAX wraps around to a small number and slips through.
	 */
	need = (uint64_t) ksize + sizeof(int64_t);
	if ( (req->psize - sizeof(uint32_t) < need) ||
			(need > (uint64_t) max) ) {
		stats.net_broken_req++;
		req->reply_err(req, ERR_BROKEN);
		return;
	}

	if (settings.read_only) {
		req->reply_err(req, ERR_RO);
		return;
	}

	FILL_CACHE_FLAG(incr);

	key = req->payload + sizeof(uint32_t);
	increment = ntohll( * (int64_t *) (key + ksize) );

	cres = cache_incr(cache_table, key, ksize, increment, &newval);
	if (cres == -3) {
		req->reply_err(req, ERR_MEM);
		return;
	} else if (cres == -2) {
		/* the value was not NULL terminated */
		req->reply_mini(req, REP_NOMATCH);
		return;
	}

	if (!cache_only) {
		/* at this point, the cache_incr() was either successful or a
		 * miss, but we don't really care */
		rv = put_in_queue(req, REQ_INCR, 1, key, ksize,
				(unsigned char *) &increment,
				sizeof(increment));
		if (!rv) {
			req->reply_err(req, ERR_MEM);
			return;
		}
	} else {
		if (cres == -1) {
			req->reply_mini(req, REP_NOTIN);
		} else {
			newval = htonll(newval);
			req->reply_long(req, REP_OK,
					(unsigned char *) &newval,
					sizeof(newval));
		}
	}

	return;
}
Example #16
0
/*
 * Open a database, optionally in read-only mode and creating if desired too.
 *
 * Returns GapIO pointer to DB on success.
 *         NULL on failure.
 */
GapIO *gio_open(char *fn, int ro, int create) {
    GapIO *io = (GapIO *)calloc(1, sizeof(*io));
    char *cp;
    int lock_err;

    /* Check locks */
    lock_err = actf_lock(ro, fn, create);
    if (!create && (lock_err == 3 || lock_err == 5)) {
	vmessage("Opening database in read only mode instead.\n");
	ro = 1;
	lock_err = actf_lock(ro, fn, create);
    }
    if (lock_err != 0) {
	verror(ERR_WARN, "Open Database",
	       "Unable to lock and/or open the database.");
	return NULL;
    }

    io->iface = get_iface_g();
    if (create) {
	if (0 != io->iface->create(fn)) {
	    xperror("In tg_gio.c:gio_open()", xperror_fatal);
	    return NULL;
	}
    }

    io->min_bin_size = MIN_BIN_SIZE; /* default */

    /* Initialise the cache */
    cache_create(io);

    if (NULL == (io->dbh = io->iface->connect(fn, ro))) {
	if (!ro) {
	    ro = 1;
	    if (NULL == (io->dbh = io->iface->connect(fn, ro)))
		return NULL;
	} else {
	    return NULL;
	}
    }

    io->read_only = ro;

    if (create) {
	io->iface->database.create(io->dbh, NULL, db_version);
    }

    /* Cache the GDatabase struct */
    if (NULL == (io->db = cache_search(io, GT_Database, 0)))
	return NULL;
    cache_incr(io, io->db);

    if (io->db->version > DB_VERSION) {
	verror(ERR_WARN, "Open Database",
	       "Database version %d is too new for this version of gap5",
	       io->db->version);
	return NULL;
    }

    /* Load the contigs array */
    io->contig_order = cache_search(io, GT_RecArray, io->db->contig_order);
    cache_incr(io, io->contig_order);

    /* Load the scaffold array */
    if (io->db->scaffold) {
	io->scaffold =
	    cache_search(io, GT_RecArray, io->db->scaffold);
	cache_incr(io, io->scaffold);
    } else {
	/* FIXME: create a dummy order of 1 per contig? */
	io->scaffold = 0;
    }
    
    /* Load the library array */
    io->library = cache_search(io, GT_RecArray, io->db->library);
    cache_incr(io, io->library);

    /* Initialise the contig and cursor registration hashes */
    contig_register_init(io);

    io->iface->setopt(io->dbh, OPT_COMP_MODE, COMP_MODE_ZLIB);

    /* Copy the name */
    if ((cp = strrchr(fn, '/')))
	cp++;
    else
	cp = fn;
    io->name = strdup(cp);

    io->last_bin = 0;
    io->incr_svalue = io->incr_rvalue = io->incr_avalue = 0;

    io->max_template_size = 0;

    io->debug_level = 0;
    io->debug_fp = stderr;

    //update_uniqueness_hash(io);

#ifdef DO_LOGGING
    open_log_file(io, fn);
#endif

    return io;
}
Example #17
0
/*
 * Breaks contig 'crec' in two at position 'cpos'.
 *
 * A new contig is created for the right hand side. Its name is the
 * original name (at most the first 1000 characters) with "#<n>" appended,
 * n counting up from 1 until contig_index_query() no longer returns a
 * positive value for that name. The bin tree is then divided by
 * break_contig_recurse() and the contig extents are recomputed. Contigs
 * left without a bin are destroyed.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int break_contig(GapIO *io, tg_rec crec, int cpos) {
    contig_t *cl;
    contig_t *cr;
    int cid;
    char cname[1024], *cname_end;
    int left_end, right_start;
    bin_index_t *bin;
    int do_comp = 0;
    HacheTable *h;

    if (!(cl = (contig_t *)cache_search(io, GT_Contig, crec)))
	return -1;

    //contig_dump_ps(io, &cl, "/tmp/tree.ps");

    /*
     * Our hash table is keyed on sequence record numbers for all sequences
     * in all bins spanning the break point. The value is either 0 or 1
     * for left/right contig.
     *
     * The purpose of this hash is to allow us to work out whether a tag
     * belongs in the left or right contig, as a tag could start beyond the
     * break point but be attached to a sequence before the break point.
     *
     * Further complicating this is that a tag could be in a smaller bin
     * than the sequence as it may not be as long. However we know
     * we'll recurse down these in a logical order so we can be sure
     * we've already "seen" the sequence that the tag has been
     * attached to.
     */
    if (!(h = HacheTableCreate(1024, HASH_DYNAMIC_SIZE)))
	return -1;

    /* strncpy() does not terminate when the source fills the limit */
    strncpy(cname, contig_get_name(&cl), 1000);
    cname[1000] = '\0';
    cname_end = cname + strlen(cname);
    cid = 1;
    do {
	snprintf(cname_end, sizeof(cname) - (size_t)(cname_end - cname),
		 "#%d", cid++);
    } while (contig_index_query(io, cname) > 0);

    if (!(cr = contig_new(io, cname)))
	goto fail;
    if (!(cl = cache_rw(io, cl)))
	goto fail;
    if (!(cr = cache_rw(io, cr)))
	goto fail;
    if (0 != contig_index_update(io, cname, strlen(cname), cr->rec))
	goto fail;
    printf("Break in contig %"PRIrec", pos %d\n", crec, cpos);

    printf("Existing left bin = %"PRIrec", right bin = %"PRIrec"\n",
	   cl->bin, cr->bin);

    cache_incr(io, cl);
    cache_incr(io, cr);

    /* Remember the orientation of the original root bin */
    if (!(bin = get_bin(io, cl->bin)))
	goto fail_decr;
    do_comp = bin->flags & BIN_COMPLEMENTED;

    if (0 != break_contig_recurse(io, h, cl, cr,
				  contig_get_bin(&cl), cpos,
				  contig_offset(io, &cl),
				  0, cl->rec, cr->rec, 0, 0))
	goto fail_decr;

    /* Recompute end positions */
    left_end    = contig_visible_end(io, cl->rec);
    right_start = contig_visible_start(io, cr->rec);

    /* Ensure start/end positions of contigs work out */
    if (!(bin = get_bin(io, cr->bin)))
	goto fail_decr;
    if (!(bin = cache_rw(io, bin)))
	goto fail_decr;

    //#define KEEP_POSITIONS 1
#ifndef KEEP_POSITIONS
    cr->start = 1;
    cr->end = cl->end - right_start + 1;
    bin->pos -= right_start-1;
#else
    cr->start = right_start;
    cr->end = cl->end;
#endif

    /* Give the right contig's root bin the orientation recorded above */
    if ((do_comp && !(bin->flags & BIN_COMPLEMENTED)) ||
	(!do_comp && (bin->flags & BIN_COMPLEMENTED))) {
	bin->flags ^= BIN_COMPLEMENTED;
    }

    cl->end = left_end;

    //    remove_redundant_bins(io, cl);
    //    remove_redundant_bins(io, cr);

    printf("Final left bin = %"PRIrec", right bin = %"PRIrec"\n",
	   cl->bin, cr->bin);

    HacheTableDestroy(h, 0);

    //if (cl->bin) contig_dump_ps(io, &cl, "/tmp/tree_l.ps");
    //if (cr->bin) contig_dump_ps(io, &cr, "/tmp/tree_r.ps");

    cache_flush(io);

    remove_empty_bins(io, cl->rec);
    remove_empty_bins(io, cr->rec);

    /* Empty contig? If so remove it completely */
    if (cl->bin == 0) {
	printf("Removing empty contig %"PRIrec"\n", cl->rec);
	contig_destroy(io, cl->rec);
    }
    if (cr->bin == 0) {
	printf("Removing empty contig %"PRIrec"\n", cr->rec);
	contig_destroy(io, cr->rec);
    }

    cache_decr(io, cl);
    cache_decr(io, cr);

    cache_flush(io);

    return 0;

 fail_decr:
    /* Drop the references taken once both contigs were writable */
    cache_decr(io, cl);
    cache_decr(io, cr);
 fail:
    HacheTableDestroy(h, 0);
    return -1;
}
Example #18
0
/*
 * Searches the consensus of the contig shown in 'xx' for a sequence,
 * starting next to the editor cursor and working in windows of WIN_WIDTH
 * bases.
 *
 * dir    - non-zero searches forwards (rightwards) from the cursor, zero
 *          searches backwards.
 * strand - '+' matches the pattern as given, '-' its complement as
 *          produced by complement_seq(), '=' either.
 * value  - the search string, optionally followed by
 *          #<N.mismatches>#<where>. It is modified in place: it is cut at
 *          the first '#' and passed through depad_seq(). The <where> field
 *          is accepted but has no effect here.
 *
 * On success the editor cursor is moved to the match.
 *
 * Returns 0 if a match was found
 *        -1 if not, or on failure
 */
int edview_search_consensus(edview *xx, int dir, int strand, char *value) {
    int mismatches = 0; /* exact match */
    char *p;
    int start, end;
    char cons[WIN_WIDTH+1];
    int patlen;
    char *uppert, *upperb;
    int found = 0, at_end = 0;
    tg_rec fseq = 0;
    int fpos, i, j;
    contig_t *c;

    /*
     * Parse value search string. It optionally includes two extra params
     * separated by #. Ie:
     *     <string>#<N.mismatches>#<where>.
     * <where> is 1 for readings, 2 for consensus, 3 for both.
     */
    if ((p = strchr(value, '#'))) {
	mismatches = atoi(p+1);
	*p = 0;
    }


    /* uppercase search string, remove pads, and store fwd/rev copies */
    patlen = strlen(value);
    depad_seq(value, &patlen, NULL);
    if (NULL == (uppert = (char *)xmalloc(patlen + 1)))
	return -1;
    if (NULL == (upperb = (char *)xmalloc(patlen + 1))) {
	free(uppert);
	return -1;
    }

    uppert[patlen] = upperb[patlen] = 0;
    for (i = patlen-1; i >= 0; i--) {
	/* toupper() requires an unsigned char value */
	upperb[i] = uppert[i] = toupper((unsigned char)value[i]);
    }
    complement_seq(upperb, patlen);


    /* First window: immediately after or immediately before the cursor */
    if (dir) {
	start = xx->cursor_apos + 1;
	end   = start + (WIN_WIDTH-1);
    } else {
	end   = xx->cursor_apos - 1;
	start = end - (WIN_WIDTH-1);
    }
    fpos = xx->cursor_apos;

    if (NULL == (c = cache_search(xx->io, GT_Contig, xx->cnum))) {
	free(uppert);
	free(upperb);
	return -1;
    }
    cache_incr(xx->io, c);

    /* Loop */
    do {
	char *ind, *indt = NULL, *indb = NULL;

	calculate_consensus_simple(xx->io, xx->cnum, start, end, cons, NULL);
	cons[WIN_WIDTH] = 0;

	if (dir) {
	    if (strand == '+' || strand == '=')
		indt = pstrstr_inexact(cons, uppert, mismatches, NULL);
	    if (strand == '-' || strand == '=')
		indb = pstrstr_inexact(cons, upperb, mismatches, NULL);
	} else {
	    if (strand == '+' || strand == '=')
		indt = prstrstr_inexact(cons, uppert, mismatches, NULL);
	    if (strand == '-' || strand == '=')
		indb = prstrstr_inexact(cons, upperb, mismatches, NULL);
	}

	/*
	 * With a hit on both strands take the one nearest the cursor:
	 * the leftmost when going forwards, the rightmost when going
	 * backwards.
	 */
	if (indt && indb) {
	    if (dir)
		ind = indt < indb ? indt : indb;
	    else
		ind = indt > indb ? indt : indb;
	} else if (indt) {
	    ind = indt;
	} else if (indb) {
	    ind = indb;
	} else {
	    ind = NULL;
	}

	if (ind != NULL) {
	    if (dir) {
		if (fpos <= start + ind-cons) {
		    found = 1;
		    fpos = start + ind-cons;
		    fseq = xx->cnum;
		}
	    } else {
		if (fpos >= start + ind-cons) {
		    found = 1;
		    fpos = start + ind-cons;
		    fseq = xx->cnum;
		}
	    }
	    break;
	}

	/* Next search region - overlapping by patlen+pads */
	if (dir) {
	    for (i = WIN_WIDTH-1, j = patlen; j && i; i--) {
		if (cons[i] != '*')
		    j--;
	    }
	    if (i == 0)
		break;
	    start += i;
	    end   += i;

	    if (start > c->end)
		at_end = 1;
	} else {
	    for (i = 0, j = patlen; j && i < WIN_WIDTH; i++) {
		if (cons[i] != '*')
		    j--;
	    }
	    if (i == WIN_WIDTH)
		break;

	    start -= WIN_WIDTH-i;
	    end   -= WIN_WIDTH-i;

	    if (end < c->start)
		at_end = 1;
	}
    } while (!at_end);
    cache_decr(xx->io, c);

    if (found) {
	edSetCursorPos(xx, fseq == xx->cnum ? GT_Contig : GT_Seq,
		       fseq, fpos, 1);
    }

    free(uppert);
    free(upperb);

    return found ? 0 : -1;
}
Example #19
0
/*
 * Extends the right hand end of a single contig.
 *
 * The reads covering the last valid consensus base are examined. Reads
 * whose cutoff data between their right clip point and the contig end
 * is CSZ or more bases long, or disagrees with the consensus at more
 * than 5% of positions, are ignored. The sequence of the remaining reads
 * beyond the contig end is used to call up to ESZ new consensus bases, and
 * the clip points of those reads are then moved to cover as much of the
 * best scoring extension as each read supports.
 *
 * Dir is currently only used in the progress message.
 *
 * Min_depth is the minimum depth for extension. If lower then even if the
 * data matches we'll not extend further. Extension also stops at the first
 * column with no data at all, whatever min_depth is.
 *
 * Match_score (+ve) and mismatch_score (-ve) are accumulated during
 * extension to ensure that we don't extend into junk mismatching DNA.
 *
 * Returns 0 on success (including when no extension was possible)
 *        -1 on failure
 */
static int contig_extend_single(GapIO *io, tg_rec crec, int dir, int min_depth,
				int match_score, int mismatch_score) {
    int end;
    rangec_t *r;
    int nr, i;
    contig_t *c;
    char cons[CSZ], new_cons[ESZ];
    int freqs[ESZ][4], depth[ESZ];
    double score, best_score;
    int best_pos, nseq;

    vmessage("Processing contig #%"PRIrec", %s end\n",
	     crec, dir ? "left" : "right");

    for (i = 0; i < ESZ; i++) {
	freqs[i][0] = freqs[i][1] = freqs[i][2] = freqs[i][3] = 0;
	depth[i] = 0;
    }

    c = cache_search(io, GT_Contig, crec);
    if (NULL == c) return -1;
    cache_incr(io, c);

    if (consensus_valid_range(io, crec, NULL, &end) != 0) {
	cache_decr(io, c);
	return -1;
    }

    /* cons[] holds the last CSZ consensus bases, so cons[CSZ-1] is 'end' */
    calculate_consensus_simple(io, crec, end-(CSZ-1), end, cons, NULL);

    /* Start */
    /* Not implemented for now: rev complement and go again! */

    /* End */
    r = contig_seqs_in_range(io, &c, end, end, 0, &nr);
    if (!r) {
	cache_decr(io, c);
	return -1;
    }

    for (i = 0; i < nr; i++) {
	seq_t *s = cache_search(io, GT_Seq, r[i].rec);
	seq_t *sorig = s;
	int cend;
	int j, k, slen;

	if (!s) {
	    /* Cannot load this read: leave it out of the extension */
	    r[i].rec = 0;
	    continue;
	}

	/* Work on a private complemented copy when orientations differ */
	if ((s->len < 0) ^ r[i].comp) {
	    s = dup_seq(s);
	    complement_seq_t(s);
	}

	cend = r[i].start + s->right-1;

	/* Does cutoff extend to contig end, if so does it match cons? */
	if (cend < end) {
	    int mis = 0, len = 0;
	    if (end - cend >= CSZ) {
		/*
		fprintf(stderr,"Skipping #%"PRIrec" due to length of cutoff\n",
			r[i].rec);
		*/
		if (sorig != s)
		    free(s);
		r[i].rec = 0; /* Mark for removal */
		continue;
	    }

	    for (k = s->right, j = cend+1; j <= end; j++, k++) {
		//printf("%d: %c %c\n", j, s->seq[k], cons[j-(end-(CSZ-1))]);
		if (s->seq[k] != cons[j-(end-(CSZ-1))])
		    mis++;
	    }
	    len = end - cend;
	    if (100*mis/len > 5) {
		/*
		fprintf(stderr, "Skipping #%"PRIrec" due to high disagreement "
			"with consensus.\n", r[i].rec);
		*/
		if (sorig != s)
		    free(s);
		r[i].rec = 0;
		continue;
	    }
	}

	/* So we got here, let's accumulate extension stats */
	slen = ABS(s->len);
	for (k = 0, j = end+1 - r[i].start; j < slen && k < ESZ; j++, k++) {
	    int base;

	    //printf("%d: %c\n", j + r[i].start, s->seq[j]);
	    if(s->seq[j] == 'N')
		continue;

	    /*
	     * freqs[] only has room for 4 base types.
	     * NOTE(review): dna_lookup[] is defined elsewhere; confirm
	     * whether it can yield codes outside 0..3. Such symbols are
	     * skipped in the same way as 'N'.
	     */
	    base = dna_lookup[(uint8_t) s->seq[j]];
	    if (base < 0 || base > 3)
		continue;

	    freqs[k][base]++;
	    depth[k]++;
	}

	if (sorig != s)
	    free(s);
    }

    score = best_score = 0;
    best_pos = 0;

    for (i = 0; i < ESZ; i++) {
	int call = 0, best = 0, others, j;
	double dd;

	/* A column without data cannot be called, whatever min_depth is */
	if (depth[i] < min_depth || depth[i] == 0)
	    break;

	for (j = 0; j < 4; j++) {
	    if (best < freqs[i][j]) {
		best = freqs[i][j];
		call = j;
	    }
	}
	new_cons[i] = "ACGT"[call];

	/* Reward the fraction agreeing with the call, penalise the rest */
	dd = (double)depth[i];
	others = freqs[i][0] + freqs[i][1] + freqs[i][2] + freqs[i][3]
	    - freqs[i][call];
	score += freqs[i][call] / dd;
	score -= others / dd;

	if (best_score <= score) {
	    best_score = score;
	    best_pos = i+1;
	}
	/*
	printf("%3d %3d\t%c\t%3d %3d %3d %3d %7.1f\n",
	       i, depth[i], "ACGT"[call],
	       freqs[i][0], freqs[i][1], freqs[i][2], freqs[i][3],
	       score);
	*/
    }
    /* printf("Best score is %f at %d\n", best_score, best_pos); */

    /* Extend */
    nseq = 0;
    if (best_pos > 0) {
	int furthest_left = end;

	for (i = 0; i < nr; i++) {
	    seq_t *s;
	    int r_pos;
	    int rscore;

	    if (r[i].rec == 0)
		continue;

	    s = cache_search(io, GT_Seq, r[i].rec);
	    if (s)
		s = cache_rw(io, s);
	    if (!s)
		continue;

	    if (furthest_left > r[i].start)
		furthest_left = r[i].start;

	    /*
	     * end + best_pos is the furthest right we can go, but this
	     * specific read may not be justified in reaching that far
	     * if it has too many disagreements.
	     */
	    if ((s->len > 0) ^ r[i].comp) {
		int best_r = 0, j, k;
		int len = ABS(s->len);

		//printf(">%s\t", s->name);

		r_pos = 0;
		rscore = 0;
		//for (k = s->right, j = 0; j < best_pos && k < len; j++, k++) {
		for (k = end - r[i].start + 1, j = 0;
		     j < best_pos && k < len;
		     j++, k++) {
		    if (new_cons[j] == toupper((unsigned char)s->seq[k])) {
			rscore += match_score;
			if (best_r <= rscore) {
			    best_r  = rscore;
			    r_pos = k+1;
			}
		    } else {
			rscore += mismatch_score;
		    }

		    //putchar(new_cons[j] == toupper(s->seq[k])
		    //	    ? toupper(s->seq[k])
		    //        : tolower(s->seq[k]));
		}
		//putchar('\n');

		if (s->right != r_pos) {
		    s->right  = r_pos;
		    nseq++;
		}
	    } else {
		int best_r = 0, j, k;

		//printf("<%s\t", s->name);

		r_pos = 0;
		rscore = 0;
		//for (k = s->left-2, j = 0; j < best_pos && k >= 0; j++, k--) {
		for (k = r[i].end - end - 1, j = 0;
		     j < best_pos && k >= 0;
		     j++, k--) {
		    char b = complement_base(s->seq[k]);
		    if (new_cons[j] == b) {
			rscore += match_score;
			if (best_r <= rscore) {
			    best_r  = rscore;
			    r_pos = k-1;
			}
		    } else {
			rscore += mismatch_score;
		    }

		    //putchar(new_cons[j] == toupper(b)
		    //	    ? toupper(b)
		    //	    : tolower(b));
		}
		//putchar('\n');

		if (s->left != r_pos+2) {
		    s->left  = r_pos+2;
		    nseq++;
		}
	    }
	}

	vmessage("    Extended by %d, adjusting %d sequence clip%s\n",
		 best_pos, nseq, nseq == 1 ? "" : "s");

	bin_invalidate_consensus(io, crec, furthest_left, end + best_pos);
    } else {
	vmessage("    Unable to extend contig\n");
    }
    free(r);

    cache_decr(io, c);
    cache_flush(io);
    return 0;
}