/*
 * Sets a scaffold name, keeping the scaffold name index in step.
 *
 * The scaffold is made writable, any existing name is removed from the
 * name index, the cached item is resized so the new name can be stored
 * inline (at n->data), and a non-empty new name is added to the index.
 *
 * io    Database handle; the index calls go through gio_base(io).
 * f     In/out scaffold pointer. It is updated to the writable (and
 *       possibly resized) item, so callers must use *f afterwards.
 * name  New NUL-terminated name; must not be NULL. An empty string stores
 *       an empty name and adds nothing to the index.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int scaffold_set_name(GapIO *io, scaffold_t **f, char *name) {
    scaffold_t *n;
    GapIO *iob = gio_base(io);
    void *db;
    tg_rec r;

    if (!name)
	return -1;

    if (!(n = cache_rw(io, *f)))
	return -1;

    /*
     * Publish the writable pointer straight away so the caller is not
     * left holding the pre-cache_rw pointer if a later step fails.
     */
    *f = n;

    /* Delete old name */
    if (n->name) {
	r = iob->iface->scaffold.index_del(iob->dbh, n->name, n->rec);
	/* A result of -1 is ignored here, as it always has been */
	if (r != -1 && r != io->db->scaffold_name_index) {
	    /* Only replace io->db once we know cache_rw succeeded */
	    if (!(db = cache_rw(io, io->db)))
		return -1;
	    io->db = db;
	    io->db->scaffold_name_index = r;
	}
    }

    if (NULL == (n = cache_item_resize(n, sizeof(*n) + strlen(name)+1)))
	return -1;

    *f = n;

    /* Add new name; it lives in the space just allocated after the struct */
    n->name   = (char *)(&n->data);
    strcpy(n->name, name);

    if (*name) {
	r = iob->iface->scaffold.index_add(iob->dbh, name, n->rec);
	if (r != -1 && r != io->db->scaffold_name_index) {
	    if (!(db = cache_rw(io, io->db)))
		return -1;
	    io->db = db;
	    io->db->scaffold_name_index = r;
	}
    }

    return 0;
}
/*
 * Creates a new scaffold, optionally named, and appends it to the
 * database's scaffold order.
 *
 * io    Database handle. io->db->scaffold must be non-zero.
 * name  Name for the scaffold, or NULL for an unnamed one. A named
 *       scaffold is also added to the "new_scaffolds" list.
 *
 * Returns scaffold pointer on success.
 *         NULL on failure
 *
 * NOTE(review): on a failure after cache_item_create() has succeeded the
 * freshly created record is left in place; nothing here deallocates it.
 */
scaffold_t *scaffold_new(GapIO *io, char *name) {
    tg_rec rec;
    scaffold_t *f, init_f;
    void *rw;

    if (!io->db->scaffold)
	return NULL;

    memset(&init_f, 0, sizeof(scaffold_t));
    init_f.name = name;

    /* Allocate the scaffold record */
    rec = cache_item_create(io, GT_Scaffold, &init_f);
    /* presumably -1 signals failure, as for other tg_rec results here */
    if (rec == -1)
	return NULL;

    /* Initialise it */
    if (!(f = (scaffold_t *)cache_search(io, GT_Scaffold, rec)))
	return NULL;
    if (!(f = cache_rw(io, f)))
	return NULL;

    if (name) {
	if (0 != scaffold_set_name(io, &f, name))
	    return NULL;
    } else {
	f->name = NULL;
    }

    /*
     * Add it to the scaffold order too. The cache_rw results are checked
     * before being stored so a failure cannot leave io->scaffold or
     * io->db set to NULL.
     */
    if (!(rw = cache_rw(io, io->scaffold)))
	return NULL;
    io->scaffold = rw;

    if (!(rw = cache_rw(io, io->db)))
	return NULL;
    io->db = rw;

    ARR(tg_rec, io->scaffold, io->db->Nscaffolds++) = rec;

    /* Add to the new scaffolds list */
    if (name)
	add_to_list("new_scaffolds", name);

    return f;
}
Exemple #3
0
/*
 * Given ranges contained within a bin this makes sure that all sequences
 * and annotations referred to in these ranges have their parent listed as
 * this bin. Sequences that are reparented also get their bin_index set to
 * the range's index.
 *
 * Ranges flagged GRANGE_FLAG_UNUSED are skipped. A record that cannot be
 * fetched or made writable is skipped too, and the remaining ranges are
 * still processed.
 *
 * Returns 0 on success
 *        -1 if any record could not be fetched or made writable
 */
static int break_contig_reparent_seqs(GapIO *io, bin_index_t *bin) {
    int i, nr = bin->rng ? ArrayMax(bin->rng) : 0;
    int ret = 0;

    for (i = 0; i < nr; i++) {
	range_t *r = arrp(range_t, bin->rng, i);

	if (r->flags & GRANGE_FLAG_UNUSED)
	    continue;

	if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISANNO) {
	    anno_ele_t *a = (anno_ele_t *)cache_search(io, GT_AnnoEle, r->rec);

	    if (!a) {
		ret = -1;
		continue;
	    }

	    if (a->bin != bin->rec) {
		if (!(a = cache_rw(io, a))) {
		    ret = -1;
		    continue;
		}
		a->bin = bin->rec;
	    }
	} else {
	    seq_t *seq = (seq_t *)cache_search(io, GT_Seq, r->rec);

	    if (!seq) {
		ret = -1;
		continue;
	    }

	    if (seq->bin != bin->rec) {
		if (!(seq = cache_rw(io, seq))) {
		    ret = -1;
		    continue;
		}
		seq->bin = bin->rec;
		seq->bin_index = i;
	    }
	}
    }

    return ret;
}
/*
 * Adds a contig to a scaffold array.
 * Gap size, type and evidence refer to the gap between this and the
 * "previous" contig - ie the last in the scaffold. More complex
 * scaffold manipulations will be handled elsewhere.
 *
 * Set these fields to 0 if you do not know them.
 *
 * If the contig currently belongs to a scaffold it is removed from that
 * scaffold first. The first member of a scaffold always gets a gap_size
 * of 0, whatever was passed in.
 *
 * Returns 0 on success (including when the contig is already listed)
 *        -1 on failure
 */
int scaffold_add(GapIO *io, tg_rec scaffold, tg_rec contig,
		 int gap_size, int gap_type, int evidence) {
    scaffold_t *f;
    contig_t *c;
    scaffold_member_t *m;
    size_t i;

    /* Check if this contig is in a scaffold, if so remove now */
    if (!(c = cache_search(io, GT_Contig, contig)))
	return -1;
    if (c->scaffold)
	scaffold_remove(io, c->scaffold, contig);

    if (!(f = cache_search(io, GT_Scaffold, scaffold)))
	return -1;

    /* Check if it already exists */
    for (i = 0; i < ArrayMax(f->contig); i++) {
	m = arrp(scaffold_member_t, f->contig, i);
	if (m->rec == contig)
	    return 0;
    }

    /* Append */
    if (!(f = cache_rw(io, f)))
	return -1;
    m = ArrayRef(f->contig, ArrayMax(f->contig)); // extend
    if (!m)
	return -1;
    m->rec = contig;
    m->gap_size = ArrayMax(f->contig) > 1 ? gap_size : 0;
    m->gap_type = gap_type;
    m->evidence = evidence;

    /*
     * Update the contig record too. It is looked up afresh because the
     * earlier pointer may have been superseded by scaffold_remove().
     */
    if (!(c = cache_search(io, GT_Contig, contig)))
	return -1;
    if (!(c = cache_rw(io, c)))
	return -1;
    c->scaffold = scaffold;

#if 0
    /* Add a scaffold link to the contig graph too */
    if (ArrayMax(f->contig) >= 2) {
	m = arrp(scaffold_member_t, f->contig, ArrayMax(f->contig)-2);
	contig_link_t lnk;

	lnk.rec1 = contig;
	lnk.rec2 = m->rec;
	/* Best guess */
	lnk.pos1 = 0; lnk.end1 = 1;
	lnk.pos2 = 0; lnk.end2 = 0;
	lnk.orientation = 0;
	lnk.size = 100;
	lnk.type = CLINK_TYPE_SCAFFOLD;
	lnk.score = 0;

	contig_add_link(io, &lnk);
    }
#endif

    return 0;
}
/*
 * Shifts the annotations attached to sequence 'srec' left by one base,
 * for use after a base (pad) has been removed from that sequence.
 *
 * Annotations are found by iterating contig 'crec' over start+pos .. end
 * and keeping those whose pair_rec is 'srec'. Each one is removed from
 * its bin, has its end moved one base left (and its start too when it
 * began beyond start+pos) and is added back via bin 'brec'. An annotation
 * whose range becomes empty is deallocated instead.
 *
 * io     Database handle
 * crec   Contig that is iterated
 * srec   Sequence whose annotations are shifted
 * start  Added to 'pos' to give the first contig position searched
 * end    Last position handed to the iterator
 * pos    Edit position relative to 'start'
 * brec   Bin passed to bin_add_to_range() when re-adding annotations
 *
 * There is no return value; a failure to fetch the contig or create the
 * iterator simply results in nothing being shifted.
 */
static void tag_shift_for_delete(GapIO *io, tg_rec crec, tg_rec srec,
				 int start, int end, int pos, tg_rec brec) {
    contig_iterator *ci;
    rangec_t *r;
    contig_t *c = cache_search(io, GT_Contig, crec);

    //printf("< tag in seq %"PRIrec" at %d\n", srec, pos);

    if (!c)
	return;

    cache_incr(io, c);

    ci = contig_iter_new_by_type(io, crec, 0, CITER_FIRST | CITER_ISTART,
				 start+pos, end, GRANGE_FLAG_ISANNO);
    if (!ci) {
	cache_decr(io, c);
	return;
    }

    while ((r = contig_iter_next(io, ci))) {
	range_t r2, *r_out;
	anno_ele_t *a;
	bin_index_t *bin;

	if (r->pair_rec != srec)
	    continue;

	bin_remove_item(io, &c, GT_AnnoEle, r->rec);

	/*
	 * Zero the whole range first: only six fields are set below and
	 * range_t may hold more (its definition is not visible here).
	 */
	memset(&r2, 0, sizeof(r2));
	r2.start    = (r->start > start+pos) ? r->start-1 : r->start;
	r2.end      = r->end-1;
	r2.mqual    = r->mqual;
	r2.rec      = r->rec;
	r2.pair_rec = r->pair_rec;
	r2.flags    = r->flags;

	if (r2.end < r2.start) {
	    /* Tag entirely removed now, it must have been on a pad */
	    a = cache_search(io, GT_AnnoEle, r->rec);
	    if (a && (a = cache_rw(io, a)))
		cache_deallocate(io, a);
	    continue;
	}

	bin = bin_add_to_range(io, &c, brec, &r2, &r_out, NULL, 0);
	if (!bin) {
	    verror(ERR_WARN, "tag_shift_for_delete",
		   "bin_add_to_range returned NULL");
	    continue;
	}

	a = cache_search(io, GT_AnnoEle, r->rec);
	if (!a)
	    continue;

	if (a->bin != bin->rec /*||
	    a->idx != r_out - ArrayBase(range_t, bin->rng)*/) {
	    /* Annotation moved bins */
	    if ((a = cache_rw(io, a)))
		a->bin = bin->rec;
	    //a->bin_idx = r_out - ArrayBase(range_t, bin->rng);
	}
    }

    cache_decr(io, c);
    contig_iter_del(ci);
}
Exemple #6
0
/*
 * Sets the annotation type, passed in as a string but held in a 4-byte int.
 * This also attempts to set the cached copy of the type held within the
 * bin range array.
 *
 * io   Database handle
 * e    In/out annotation pointer. It is replaced by the writable copy as
 *      soon as that has been obtained, so it is valid even when a later
 *      step fails.
 * str  Type string; only the first 4 characters are used.
 *
 * Returns 0 on success
 *        -1 on failure. If the failure is in the bin update, the
 *           annotation's own tag_type has already been changed.
 */
int anno_ele_set_type(GapIO *io, anno_ele_t **e, char *str) {
    int type;
    char stype[5];
    anno_ele_t *ae;

    if (!(ae = cache_rw(io, *e)))
	return -1;

    /*
     * Hand the writable pointer back immediately, as
     * anno_ele_set_direction() does, instead of only on full success.
     */
    *e = ae;

    /* Get integer type; stype is pre-zeroed so it is always terminated */
    memset(stype, 0, 5);
    strncpy(stype, str, 4);
    type = str2type(stype);

    /* Update annotation */
    ae->tag_type = type;

    /* Also update range_t cached copy of type */
    if (ae->bin) {
	bin_index_t *bin = (bin_index_t *)cache_search(io, GT_Bin, ae->bin);
	range_t *r = NULL;
	int i, nranges;

	if (!bin)
	    return -1;
	if (!(bin = cache_rw(io, bin)))
	    return -1;

	/*
	 * Find the index into the bin range.
	 * FIXME: we should add a bin_index element, as seen in seq_t,
	 * to avoid the brute force loop. This doesn't have to be
	 * permanently stored - a cached copy would suffice.
	 */
	nranges = bin->rng ? ArrayMax(bin->rng) : 0;
	for (i = 0; i < nranges; i++) {
	    r = arrp(range_t, bin->rng, i);
	    if (r->flags & GRANGE_FLAG_UNUSED)
		continue;

	    if (r->rec == ae->rec)
		break;
	}
	if (i == nranges)
	    return -1;

	/* The range's mqual field holds the cached annotation type */
	bin->flags |= BIN_RANGE_UPDATED;
	r->mqual = type;
    }

    return 0;
}
Exemple #7
0
/*
 * Tidies up after break contig or disassemble readings, looking for now
 * redundant bins.
 *
 * Intended functions (not all implemented yet!):
 * 1) If a contig is totally empty, remove the contig.
 * 2) If a bin is empty and all below it, remove the bin.
 * 3) If a bin is empty and all above it, remove parent bins and link
 *    contig to new root. (TODO)
 *
 * The recursive helper reports, via 'first', the bin that should become
 * the contig's root. When that differs from the current root the bin is
 * re-attached directly to the contig and the old root tree is destroyed.
 */
static void remove_empty_bins(GapIO *io, tg_rec contig) {
    contig_t *c = cache_search(io, GT_Contig, contig);
    tg_rec first = 0;

    cache_incr(io, c);

    /* Nothing to tidy when the contig has no bin tree at all */
    if (!c->bin) {
	cache_decr(io, c);
	return;
    }

    /* A non-zero result means the whole contig can go */
    if (remove_empty_bins_r(io, c->bin, &first)) {
	cache_decr(io, c);
	contig_destroy(io, contig);
	return;
    }

    if (first != c->bin) {
	bin_index_t *new_root, *old_parent;
	tg_rec parent_rec, old_root, owner;
	int offset;

	/* Cut out the offending waste */
	new_root = cache_search(io, GT_Bin, first);
	new_root = cache_rw(io, new_root);
	parent_rec = new_root->parent;

	/* Work out where the bin sits, before it is re-parented */
	bin_get_position(io, new_root, &owner, &offset);
	assert(owner == contig);

	new_root->pos = offset;
	new_root->parent = contig;
	new_root->parent_type = GT_Contig;
	new_root->flags |= BIN_BIN_UPDATED;

	c = cache_rw(io, c);
	old_root = c->bin;
	c->bin = first;

	/* Unhook the new root from the bin that used to hold it */
	old_parent = cache_search(io, GT_Bin, parent_rec);
	old_parent = cache_rw(io, old_parent);
	if (old_parent->child[0] == first)
	    old_parent->child[0] = 0;
	if (old_parent->child[1] == first)
	    old_parent->child[1] = 0;

	/* Recursively remove the bin tree from the old root */
	bin_destroy_recurse(io, old_root);
    }

    cache_decr(io, c);
}
Exemple #8
0
/*
 * Sets the flag field of a track.
 *
 * *t is replaced by the writable copy returned by cache_rw().
 *
 * Returns 0 on success
 *        -1 on failure
 */
int track_set_flag(GapIO *io, track_t **t, int value) {
    track_t *writable = cache_rw(io, *t);

    if (writable == NULL)
        return -1;

    writable->flag = value;
    *t = writable;

    return 0;
}
Exemple #9
0
/*
 * Sets the nitems field of a track.
 *
 * *t is replaced by the writable copy returned by cache_rw().
 *
 * Returns 0 on success
 *        -1 on failure
 */
int track_set_nitems(GapIO *io, track_t **t, int value) {
    track_t *writable = cache_rw(io, *t);

    if (writable == NULL)
        return -1;

    writable->nitems = value;
    *t = writable;

    return 0;
}
Exemple #10
0
/*
 * Sets the item_size field of a track.
 *
 * *t is replaced by the writable copy returned by cache_rw().
 *
 * Returns 0 on success
 *        -1 on failure
 */
int track_set_item_size(GapIO *io, track_t **t, int value) {
    track_t *writable = cache_rw(io, *t);

    if (writable == NULL)
        return -1;

    writable->item_size = value;
    *t = writable;

    return 0;
}
Exemple #11
0
/*
 * Sets the annotation direction, one of ANNO_DIR_* macros (+,-,.,?)
 *
 * *e is replaced by the writable copy returned by cache_rw().
 *
 * Returns 0 on success
 *        -1 on failure
 */
int anno_ele_set_direction(GapIO *io, anno_ele_t **e, char dir) {
    anno_ele_t *writable = cache_rw(io, *e);

    if (writable == NULL)
	return -1;

    writable->direction = dir;
    *e = writable;

    return 0;
}
Exemple #12
0
/*
 * Replaces the data array held by a track.
 *
 * Any array previously held is destroyed, so the track takes over 'value'.
 * Passing the array the track already holds is a no-op for the data;
 * previously that destroyed the array and then stored the dangling pointer.
 *
 * *t is replaced by the writable copy returned by cache_rw().
 *
 * Returns 0 on success
 *        -1 on failure
 */
int track_set_data(GapIO *io, track_t **t, Array value) {
    track_t *n;
    if (!(n = cache_rw(io, *t)))
        return -1;

    /* Only destroy the old array when it really is being replaced */
    if (n->data && n->data != value)
        ArrayDestroy(n->data);

    n->data = value;
    *t = n;

    return 0;
}
/*
 * Removes a contig from a scaffold.
 *
 * The contig's scaffold field is cleared and its entry is deleted from the
 * scaffold's member array, with later members moved down one slot.
 *
 * Returns 0 on success (also when the contig claimed membership but was
 *           not present in the member array)
 *        -1 on failure, including when the contig does not list
 *           'scaffold' as its scaffold
 */
int scaffold_remove(GapIO *io, tg_rec scaffold, tg_rec contig) {
    scaffold_t *f;
    scaffold_member_t *m, *m2;
    contig_t *c;
    size_t i;

    c = cache_search(io, GT_Contig, contig);
    f = cache_search(io, GT_Scaffold, scaffold);

    if (!c || !f)
	return -1;

    if (c->scaffold != scaffold) {
	verror(ERR_WARN, "scaffold_remove", "Attempted to remove contig #%"
	       PRIrec" from a scaffold #%"PRIrec" it is not a member of",
	       contig, scaffold);
	return -1;
    }

    if (!(c = cache_rw(io, c)))
	return -1;
    c->scaffold = 0;

    if (!(f = cache_rw(io, f)))
	return -1;

    for (i = 0; i < ArrayMax(f->contig); i++) {
	m = arrp(scaffold_member_t, f->contig, i);
	if (m->rec != contig)
	    continue;

	/* Shuffle array down over the removed entry */
	for (i++; i < ArrayMax(f->contig); i++) {
	    m2 = arrp(scaffold_member_t, f->contig, i);
	    *m = *m2;
	    m = m2;
	}
	ArrayMax(f->contig)--;

	/* Only the first match is removed, so stop explicitly */
	break;
    }

    return 0;
}
Exemple #14
0
/*
 * Creates an anno_ele as per anno_ele_new, but also adds it to an object
 * and creates the bin Range entry too.
 *
 * io        Database handle
 * obj_type  GT_Contig, or another type (handled as a sequence) in which
 *           case start/end are relative to the sequence's position
 * obj_rec   Record of the contig or sequence being annotated
 * anno_rec  Not used by this function
 * type      Annotation type, also cached in the range's mqual field
 * comment   Comment passed through to anno_ele_new()
 * start/end Extent of the annotation
 * dir       Direction passed through to anno_ele_new()
 *
 * Returns the record number of the new annotation
 *         -1 if the owning contig or the new annotation cannot be fetched
 *
 * NOTE(review): if no bin could be allocated an error is logged but the
 * record is still returned with its bin set to 0, as before.
 */
tg_rec anno_ele_add(GapIO *io, int obj_type, tg_rec obj_rec, tg_rec anno_rec,
		    int type, char *comment, int start, int end, char dir) {
    range_t r;
    anno_ele_t *e;
    contig_t *c;
    tg_rec crec;
    bin_index_t *bin;
    tg_rec seq_bin = 0;

    /* Find contig for obj_rec/obj_type */
    if (obj_type == GT_Contig) {
	crec = obj_rec;
    } else {
	int st, en;

	/*
	 * Without this check a failed lookup left crec and st
	 * uninitialised. Assumes -1 is the failure value - TODO confirm.
	 */
	if (-1 == sequence_get_position2(io, obj_rec, &crec, &st, &en, NULL,
					 &seq_bin, NULL, NULL))
	    return -1;

	start += st;
	end += st;
    }

    if (!(c = (contig_t *)cache_search(io, GT_Contig, crec)))
	return -1;
    cache_incr(io, c);

    /* Zero first: only some of the range's fields are assigned below */
    memset(&r, 0, sizeof(r));
    r.start    = start;
    r.end      = end;
    r.flags    = GRANGE_FLAG_ISANNO;
    r.mqual    = type;
    r.pair_rec = obj_rec;

    if (GT_Seq == obj_type)
	r.flags |= GRANGE_FLAG_TAG_SEQ;

    r.rec = anno_ele_new(io, 0, obj_type, obj_rec, 0, type, dir, comment);
    e = (anno_ele_t *)cache_search(io, GT_AnnoEle, r.rec);
    if (!e || !(e = cache_rw(io, e))) {
	cache_decr(io, c);
	return -1;
    }

    /* Annotations on a sequence go via that sequence's bin */
    if (seq_bin)
	bin = bin_add_to_range(io, &c, seq_bin, &r, NULL, NULL, 0);
    else
	bin = bin_add_range(io, &c, &r, NULL, NULL, 0);

    if (!bin) 
	verror(ERR_FATAL, "anno_ele_add", "bin_add_to_range returned NULL");

    e->bin = bin ? bin->rec : 0;

    cache_decr(io, c);
    return r.rec;
}
Exemple #15
0
/*
 * Looks for redundant bins at the root containing no data and just a single
 * child, and steps the contig's root bin down past them.
 *
 * FIXME: We need to compensate for bin position here. Hence this function
 * is not called for now.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int remove_redundant_bins(GapIO *io, contig_t *c) {
    tg_rec bnum;

    c = cache_rw(io, c);
    if (!c)
	return -1;

    bnum = c->bin;
    while (bnum) {
	bin_index_t *bin = get_bin(io, bnum);
	int has_both_children = bin->child[0] && bin->child[1];

	/* Stop at the first bin with a range array or two children */
	if (bin->rng || has_both_children)
	    break;

	/* Empty: descend into whichever child exists (0 if neither) */
	if (bin->child[0])
	    c->bin = bin->child[0];
	else
	    c->bin = bin->child[1];

	printf("Remove bin %"PRIrec"\n", bin->rec);
	bnum = c->bin;
    }

    return 0;
}
Exemple #16
0
/*
 * Sets the comment for an annotation element.
 *
 * The item is only resized when the new comment is longer than the one
 * currently stored; shorter comments are written over the existing one.
 * A NULL or empty comment clears any existing comment.
 *
 * io       Database handle
 * e        In/out annotation pointer; replaced by the writable (and
 *          possibly resized) item, including on the failure paths that
 *          occur after cache_rw().
 * comment  New comment text, or NULL
 *
 * Returns 0 on success
 *        -1 on failure
 */
int anno_ele_set_comment(GapIO *io, anno_ele_t **e, char *comment) {
    anno_ele_t *ae, *resized;
    size_t clen, old_len;

    if (!(ae = cache_rw(io, *e)))
	return -1;
    *e = ae;

    clen    = comment ? strlen(comment) : 0;
    old_len = ae->comment ? strlen(ae->comment) : 0;

    if (clen > old_len) {
	/* Keep ae intact if the resize fails */
	resized = cache_item_resize(ae, sizeof(*ae) + clen+1);
	if (!resized)
	    return -1;

	ae = resized;
	*e = ae;
	ae->comment = (char *)&ae->data;
    }

    if (clen)
	strcpy(ae->comment, comment);
    else if (ae->comment)
	ae->comment[0] = '\0'; /* previously the old text was left behind */

    return 0;
}
Exemple #17
0
/*
 * Removes an anno_ele from the gap database by retiring its entry in the
 * owning bin's range array.
 * FIXME: need to deallocate storage too. (See docs/TODO)
 *
 * The range entry is flagged unused and pushed onto the bin's free list
 * (rng_free), the bin's annotation count is decremented, and the bin's
 * used range is recomputed when the entry touched either end of it.
 *
 * Returns 0 on success
 *        -1 on failure (bin missing or empty, or annotation not in it)
 */
int anno_ele_destroy(GapIO *io, anno_ele_t *e) {
    bin_index_t *bin;
    range_t *r;
    int i;

    /* Find the bin range pointing to this object */
    bin = (bin_index_t *)cache_search(io, GT_Bin, e->bin);
    if (!bin)
	return -1;
    if (!bin->rng || ArrayMax(bin->rng) == 0)
	return -1;

    bin = cache_rw(io, bin);
    if (!bin)
	return -1;

    /* Linear scan for the in-use range carrying this record number */
    i = 0;
    while (i < ArrayMax(bin->rng)) {
	r = arrp(range_t, bin->rng, i);
	if (!(r->flags & GRANGE_FLAG_UNUSED) && r->rec == e->rec)
	    break;
	i++;
    }
    if (i == ArrayMax(bin->rng))
	return -1;

    /* Mark this bin range as unused; rec now links to the next free slot */
    r->rec = bin->rng_free;
    r->flags |= GRANGE_FLAG_UNUSED;

    bin->rng_free = i;
    bin->flags |= BIN_RANGE_UPDATED | BIN_BIN_UPDATED;

    bin_incr_nanno(io, bin, -1);

    /* The entry sat on an edge of the used extent, so recompute it */
    if (bin->start_used == r->start || bin->end_used == r->end)
	bin_set_used_range(io, bin);

    return 0;
}
Exemple #18
0
/*
 * Loads the BAF file 'fn' into the database 'io'.
 *
 * The file is read one block at a time with baf_next_block():
 *   CO  starts a contig via create_new_contig();
 *   RD  builds a seq_t via construct_seq_from_block() and stores it with
 *       save_range_sequence();
 *   AN  creates an annotation with anno_ele_new() and adds its range with
 *       bin_add_range(). These are skipped unless a->data_type has
 *       DATA_ANNO set;
 *   anything else is reported as unsupported.
 *
 * Other fields of 'a' used here: pair_reads and pair_queue (read pairing),
 * merge_contigs (passed to create_new_contig) and fast_mode (skips
 * finish_pairs).
 *
 * Progress is printed to stdout and the cache is flushed periodically.
 *
 * Returns 0 on success
 *        -1 if the file cannot be stat()ed or opened
 *
 * NOTE(review): 'c' stays NULL until the first CO block, yet the RD and AN
 * cases use it without a check - confirm that input always starts with CO.
 */
int parse_baf(GapIO *io, char *fn, tg_args *a) {
    int nseqs = 0, nobj = 0, ntags = 0, ncontigs = 0;
    struct stat sb;
    zfp *fp;
    off_t pos;
    contig_t *c = NULL;
    tg_pair_t *pair = NULL;
    baf_block *b, *co = NULL;
    /* State of the most recent CO/RD block, used to place AN blocks */
    int last_obj_type = 0;
    int last_obj_pos = 0;
    tg_rec last_obj_rec = 0;
    tg_rec last_cnt_rec = 0;
    int last_cnt_pos = 0;
    int last_obj_orient = 0;
    
    
	
    printf("Loading %s...\n", fn);
    /* sb.st_size is needed later for the percentage display */
    if (-1 == stat(fn, &sb) ||
	NULL == (fp = zfopen(fn, "r"))) {
	perror(fn);
	return -1;
    }

    if (a->pair_reads) {
	pair = create_pair(a->pair_queue);
    }

    /* Loop:
     * Read 1 block of data.
     * If contig, create contig
     * If read, insert it, insert to index.
     * Anything else - reject for now
     */
    pos = 0;
    while (b = baf_next_block(fp)) {
	/* Set for CO blocks, which are kept in 'co' until replaced */
	int delay_destroy = 0;

	switch (b->type) {
	case CO: {
	    char *contig = baf_block_value(b, CO);

	    if (co)
		baf_block_destroy(co);

	    co = b;
	    delay_destroy = 1;

	    ncontigs++;
	    
	    create_new_contig(io, &c, contig, a->merge_contigs);

	    /* For anno */
	    last_obj_type = GT_Contig;
	    last_obj_rec = c->rec;
	    last_obj_pos = c->start + 1;
	    last_cnt_rec = c->rec;
	    last_cnt_pos = c->start + 1;
	    last_obj_orient = 0;

	    break;
	}

	case RD: {
	    seq_t seq;
	    int flags;
	    char *tname;
	    tg_rec recno;
	    int is_pair = 0;

	    /* Construct seq struct */
	    if (-1 == construct_seq_from_block(a, &seq, b, &tname)) {
		fprintf(stderr, "Failed to parse read block for seq %d\n",
			nseqs);
		break;
	    }

	    /* Create range, save sequence */
	    flags = GRANGE_FLAG_TYPE_SINGLE;
	    
	    if (seq.flags & SEQ_END_REV)
		flags |= GRANGE_FLAG_END_REV;
	    else
		flags |= GRANGE_FLAG_END_FWD;
	    /* A negative length is how a complemented read is flagged here */
	    if (seq.len < 0)
		flags |= GRANGE_FLAG_COMP1;
		
	    if (pair) is_pair = 1;
		
	    recno = save_range_sequence(io, &seq, seq.mapping_qual, pair,
					is_pair, tname, c, a, flags, NULL);

	    /* For anno */
	    last_obj_type = GT_Seq;
	    last_obj_rec = recno;
	    if (seq.len >= 0) {
		last_obj_pos = seq.pos;
		last_obj_orient = 0;
	    } else {
		last_obj_pos = seq.pos - seq.len - 1;
		last_obj_orient = 1;
	    }

	    nseqs++;
	    
	    break;
	}

	case AN: {
	    /* NOTE(review): only some fields of 'r' are assigned below */
	    range_t r;
	    anno_ele_t *e;
	    char *typ = baf_block_value(b, AN);
	    char *loc = baf_block_value(b, LO);
	    char *len = baf_block_value(b, LL);
	    char *txt = baf_block_value(b, TX);
	    char *at  = baf_block_value(b, AT);
	    int an_pos;
	    bin_index_t *bin;
	    int anno_obj_type;

	    if (!(a->data_type & DATA_ANNO))
		break;

	    if (txt)
		unescape_line(txt);

	    /* An AT value starting with 'C' forces a contig annotation */
	    if (last_obj_type == GT_Contig || (at && *at == 'C'))
		anno_obj_type = GT_Contig;
	    else
		anno_obj_type = GT_Seq;

	    /*
	     * Work out the annotation position: no LO value means the last
	     * object's position, "@N" is taken as given, and anything else
	     * is an offset from the last contig or object position.
	     */
	    if (!loc) {
		an_pos = last_obj_pos;
	    } else {
		if (*loc == '@') {
		    an_pos = atoi(loc+1);
		} else {
		    if (anno_obj_type == GT_Contig) {
			if (last_obj_orient == 0)
			    an_pos = last_cnt_pos + atoi(loc)-1;
			else
			    an_pos = last_cnt_pos - (atoi(loc)-1)
				- (len ? atoi(len)-1 : 0);
		    } else {
			if (last_obj_orient == 0)
			    an_pos = last_obj_pos + atoi(loc)-1;
			else
			    an_pos = last_obj_pos - (atoi(loc)-1)
				- (len ? atoi(len)-1 : 0);
		    }
		}
	    }

	    r.start = an_pos;
	    r.end = an_pos + (len ? atoi(len)-1 : 0);

	    r.mqual = str2type(typ);
	    r.pair_rec = (anno_obj_type == GT_Contig)
		? last_cnt_rec
		: last_obj_rec;

	    r.flags = GRANGE_FLAG_ISANNO;
	    if (GT_Seq == anno_obj_type)
		r.flags |= GRANGE_FLAG_TAG_SEQ;
	    /*
	     * NOTE(review): anno_ele_new() is given 7 arguments here but 8
	     * (with a direction) in anno_ele_add() - confirm the prototype.
	     */
	    r.rec = anno_ele_new(io, 0, anno_obj_type, r.pair_rec, 0,
				 str2type(typ), txt);
	    e = (anno_ele_t *)cache_search(io, GT_AnnoEle, r.rec);
	    e = cache_rw(io, e);
	
	    /* NOTE(review): 'e' and 'bin' are dereferenced unchecked */
	    bin = bin_add_range(io, &c, &r, NULL, NULL, 0);
	    e->bin = bin->rec;

	    ntags++;
	    break;
	}

	case 0:
	    /* blank line */
	    break;

	default:
	    printf("Unsupported block type '%s'\n",
		   linetype2str(b->type));
	}

	if (!delay_destroy)
	    baf_block_destroy(b);

	/* Every 0x1000 blocks: show progress; every 0x4000: flush cache */
	if ((++nobj & 0xfff) == 0) {
	    int perc = 0;

	    pos = zftello(fp);
	    perc = 100.0 * pos / sb.st_size;
	    printf("\r%d%c", perc, (nobj & 0x3fff) ? '%' : '*');
	    fflush(stdout);
	    if ((nobj & 0x3fff) == 0)
		cache_flush(io);
	}

#if 1
	/*
	 * Timing report, printed when the percentage has advanced. The
	 * statics keep their values across calls to parse_baf() too.
	 */
	if ((nobj & 0x3fff) == 0) {
	    static int perc = 0;
	    if (perc < 100.0 * pos / sb.st_size) {
		perc = 100.0 * pos / sb.st_size;
		printf("\r%d%%", perc);
		//HacheTableStats(io->cache, stdout);
		//HacheTableStats(((GDB **)io->dbh)[0]->gfile->idx_hash, stdout);
		{
		    static struct timeval last, curr;
		    static int first = 1;
		    static int last_obj = 0;
		    static int last_contigs = 0;
		    long delta;

		    gettimeofday(&curr, NULL);
		    if (first) {
			last = curr;
			first = 0;
		    }

		    /* Elapsed microseconds since the previous report */
		    delta = (curr.tv_sec - last.tv_sec) * 1000000
			+ (curr.tv_usec - last.tv_usec);
		    printf(" - %g sec %d obj (%d contigs)\n", delta/1000000.0,
			   nobj - last_obj, ncontigs - last_contigs);
		    last = curr;
		    last_obj = nobj;
		    last_contigs = ncontigs;
		}
		fflush(stdout);
	    }
	}
#endif
    }
    
    if (pair && !a->fast_mode) {  
	finish_pairs(io, pair);
    }
    
    /* Release the last CO block, which was held back from the loop */
    if (co)
	baf_block_destroy(co);

    cache_flush(io);
    zfclose(fp);

    printf("\nLoaded %12d contigs\n",     ncontigs);
    printf("       %12d sequences\n",   nseqs);
    printf("       %12d annotations\n", ntags);

    if (pair) delete_pair(pair);

    if (c)
	cache_decr(io, c);

    return 0;
}
/**
 * Builds and returns MALIGN from a Gap5 IO handle for the contig 'cnum'.
 *
 * start/end are first widened using the first sequence found at 'start'
 * and the last found at 'end'. Every sequence in the resulting range then
 * contributes its clipped (left..right) bases as one CONTIGL entry, with
 * '.' characters replaced by 'N' because '.' is the pad symbol here.
 *
 * Returns the MALIGN built by contigl_to_malign() on success
 *         NULL if an iterator cannot be created, a sequence cannot be
 *              fetched or memory runs out
 *
 * NOTE(review): on those failure paths the CONTIGL entries built so far
 * are not released; no destructor for them is visible from here.
 */
MALIGN *build_malign(GapIO *io, tg_rec cnum, int start, int end) {
    CONTIGL *contig, *first_contig = NULL, *last_contig = NULL;
    int i, j;
    contig_iterator *citer;
    rangec_t *r;

    /* Expand start and end to the range covered by seqs overlapping
     * start .. end
     */

    {
	seq_t *s;
	citer = contig_iter_new(io, cnum, 0,
				CITER_FIRST | CITER_ICLIPPEDSTART,
				start, start);
	if (citer) {
	    r = contig_iter_next(io, citer);
	    if (r && (s = cache_search(io, GT_Seq, r->rec))) {
		start = ((s->len < 0) ^ r->comp)
		    ? r->end - s->right - 2
		    : r->start + s->left - 2;
	    }

	    contig_iter_del(citer);
	}
    }

    {
	seq_t *s;
	citer = contig_iter_new(io, cnum, 0,
				CITER_LAST | CITER_ICLIPPEDEND,
				end, end);
	if (citer) {
	    r = contig_iter_next(io, citer);
	    if (r && (s = cache_search(io, GT_Seq, r->rec))) {
		end = ((s->len < 0) ^ r->comp)
		    ? r->end - s->left + 2
		    : r->start + s->right + 2;
	    }

	    contig_iter_del(citer);
	}
    }
    
    //printf("Generating data for %d..%d\n", start, end);

    /* Generate contigl linked list */
    //citer = contig_iter_new(io, cnum, 1, CITER_FIRST, CITER_CSTART, CITER_CEND);
    citer = contig_iter_new(io, cnum, 0, CITER_FIRST, start, end);
    if (!citer)
	return NULL;
    
    while ((r = contig_iter_next(io, citer))) {
	seq_t *s, *sorig;
	char *seq;
	int len;

	assert((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISSEQ);

	contig = create_contig_link();
	contig->id = r->rec;
	contig->mseg = create_mseg();

	sorig = s = cache_search(io, GT_Seq, r->rec);
	if (!s) {
	    contig_iter_del(citer);
	    return NULL;
	}

	/* Check for out-of-bounds clip points.  It shouldn't happen, but
	   gap5 databases have been seen with this problem, and we
	   don't want to crash if there are any.
	   NOTE(review): these clamps write to the cached item before any
	   cache_rw() call. */
	if (s->left < 1)            s->left = 1;
	if (s->right > ABS(s->len)) s->right = ABS(s->len);

	/* Fix reads of zero length */
	if (s->right < s->left) {
	    sorig = s = cache_rw(io, s);
	    s->right = s->left;
	    if (s->right > ABS(s->len))
		s->left = s->right = ABS(s->len);
	}

	/* Work on a complemented duplicate when orientation demands it */
	if ((s->len < 0) ^ r->comp) {
	    s = dup_seq(s);
	    complement_seq_t(s);
	}

	len = s->right - s->left + 1;
	if (NULL == (seq = malloc(len+1))) {
	    /* Release what we own here rather than leaking it */
	    if (s != sorig)
		free(s);
	    contig_iter_del(citer);
	    return NULL;
	}

	for (j = 0, i = s->left-1; i < s->right; i++, j++) {
	    /* Protect against the sequence containing "."; our pad sym */
	    if (s->seq[i] == '.')
		seq[j] = 'N';
	    else
		seq[j] = s->seq[i];
	}
	seq[j] = 0;

	init_mseg(contig->mseg, seq, len, r->start-1 + s->left-1);
	/* s differs from sorig only when a complemented copy was made */
	contig->mseg->comp = (s != sorig);

	if (last_contig) {
	    last_contig->next = contig;
	} else {
	    first_contig = contig;
	}
	last_contig = contig;

	if (s != sorig)
	    free(s);
    }
    contig_iter_del(citer);

    /* for 454 data -6 to -10 seem to work fine */
    return contigl_to_malign(first_contig, -7, -7);
}
/*
 * Takes a multiple alignment and updates the on-disk data structures to
 * match. This needs to correct confidence values, original positions and
 * tags too.
 *
 * io      Database handle
 * cnum    Record of the contig that was realigned
 * malign  The alignment; its contigl list holds one entry per sequence,
 *         with the new clipped sequence in mseg->seq
 * indels  Array of con_indel_t (pos, size): a positive size inserts that
 *         many pad columns, a negative size deletes pads at pos
 *
 * There is no return value. Differences other than pads between an old
 * and a new sequence abort() the process.
 *
 * NOTE(review): the malloc() results for newseq/newconf are not checked,
 * and both buffers are released with xfree() rather than free().
 */
void update_io(GapIO *io, tg_rec cnum, MALIGN *malign, Array indels) {
    CONTIGL *cl;
    tg_rec rnum;
    range_t r, *r_out;
    bin_index_t *bin;
    contig_t *c = cache_search(io, GT_Contig, cnum);
    size_t i, nindel;

    cache_incr(io, c);

    /*
     * To minimise number of data modifications we use a three step approach.
     *
     * Step 1: insert columns of pads, shifting reads as appropriate.
     * Step 2: edit sequence alignments as required, possibly involving
     *         moving sequences and/or adding and removing pads.
     * Step 3: remove columns of entire pads.
     *
     * This means that when we introduce a column of pads we don't have
     * to make edits to every single read position down stream, and can
     * instead make use of the optimised recursive bin functions to do this
     * for us.
     */

    /* Step 1: make indels */
    nindel = ArrayMax(indels);
    for (i = 0; i < nindel; i++) {
	con_indel_t *id = arrp(con_indel_t, indels, i);
	int j;

	if (id->size > 0) {
	    contig_insert_bases(io, &c, id->pos+1, '*', -1, id->size);
	} else {
	    for (j = 0; j < -id->size; j++) {
		contig_delete_pad(io, &c, id->pos+1);
	    }
	}
    }

    /* Step 2: edit alignments */
    for (cl = malign->contigl; cl; cl = cl->next) {
	seq_t *s, *sorig;
	int len, update_range = 0;
	int shift;

	rnum = cl->id;
	
	/*
	 * sorig is the cached record; s is a private duplicate, turned to
	 * the alignment's orientation, that is edited and copied back.
	 */
	sorig = cache_search(io, GT_Seq, rnum);
	cache_incr(io, sorig);
	s = dup_seq(sorig);
	if (cl->mseg->comp)
	    complement_seq_t(s);

	len = s->right - s->left + 1;

	/* Check if sequence has changed. If so assign a new one */
	if (cl->mseg->length != len ||
	    memcmp(s->seq + s->left-1, cl->mseg->seq, cl->mseg->length) != 0) {
	    /* New total = left cutoff + right cutoff + new clipped part */
	    int newlen = s->left-1 + ABS(s->len) - s->right + cl->mseg->length;
	    /* NOTE(review): this 'i' shadows the outer size_t 'i' */
	    int i, j, np;
	    char   *newseq  = malloc(newlen+1);
	    int8_t *newconf = malloc(newlen+1);

	    /* Build new seq/conf arrays */
	    memcpy(newseq,  s->seq,  s->left-1);
	    memcpy(newconf, s->conf, s->left-1);

	    memcpy(&newseq[s->left-1], cl->mseg->seq, cl->mseg->length);

	    /*
	     * Step through both old and new sequences working out how
	     * they differ. This will (*should*) be entire pad movements.
	     * i = index to old seq
	     * j = index to new seq
	     * np = number of pads added minus removed from old seq.
	     */
	    np = 0;
	    for (i =j =s->left-1;
		 i < ABS(s->len) && j < s->left-1 + cl->mseg->length;
		 ) {
		/* Bases match */
		if (toupper(newseq[j]) == toupper(s->seq[i]) ||
		    (s->seq[i] == '.' && newseq[j] == 'N')) {
		    /* Carry the old base's case over to the new base */
		    if (isupper(s->seq[i]))
			newseq[j] = toupper(newseq[j]);
		    else
			newseq[j] = tolower(newseq[j]);
		    newconf[j] = s->conf[i];
		    i++, j++;
		    continue;
		}

		/* Pad removed */
		if (s->seq[i] == '*') {
		    i++;
		    tag_shift_for_delete(io, cnum, rnum, cl->mseg->offset,
					 cl->mseg->length, i+np--,
					 s->bin);
		    /*
		    if (io_length(io, rnum) < 0) {
			tag_shift_for_delete(io, rnum, r.length - i + 1);
		    } else {
			tag_shift_for_delete(io, rnum, i+np--);
		    }
		    */
		    continue;
		}

		/* Pad created */
		if (newseq[j] == '*') {
		    int k;
		    int ql = 0, qr = 0;
		    /* Confidence of the nearest non-pad base on each side */
		    for (k = i-1; k >= 0; k--) {
			if (s->seq[k] != '*') {
			    ql = s->conf[k];
			    break;
			}
		    }
		    for (k = i+1; k < s->right; k++) {
			if (s->seq[k] != '*') {
			    qr = s->conf[k];
			    break;
			}
		    }
		    newconf[j] = MIN(ql, qr); /* min conf of neighbours */
		    j++;
		    tag_shift_for_insert(io, cnum, rnum, cl->mseg->offset,
					 cl->mseg->length, i+ ++np,
					 s->bin);
		    /*
		    if (io_length(io, rnum) < 0) {
			tag_shift_for_insert(io, rnum, r.length - i + 1);
		    } else {
			tag_shift_for_insert(io, rnum, i+ ++np);
		    }
		    */
		    continue;
		}

		/* Neither a match nor a pad change: give up */
		fprintf(stderr, "Alignment introduced non-pad character");
		abort();
	    }

	    /* Pads previously at the end of the reading & now removed */
	    while (i < s->right) {
		if (s->seq[i] == '*') {
		    i++;
		    tag_shift_for_delete(io, cnum, rnum, cl->mseg->offset,
					 cl->mseg->length, i+np--,
					 s->bin);
		    /*
		    if (io_length(io, rnum) < 0) {
			tag_shift_for_delete(io, rnum, r.length - i + 1);
		    } else {
			tag_shift_for_delete(io, rnum, i+np--);
		    }
		    */
		} else {
		    /* Error: clipped data that wasn't a pad */
		    abort();
		}
	    }

	    /* Should only be pads remaining in newseq, if anything */
	    s->right = j;
	    for (; j < s->left-1 + cl->mseg->length; j++) {
		if (newseq[j] != '*') {
		    fprintf(stderr, "Alignment introduced non-pad character");
		    abort();
		}
		newconf[j] = 0;
	    }

	    /* Append on the right hand cutoff data */
	    for (; i < ABS(s->len); i++, j++) {
		newseq[j]  = s->seq[i];
		newconf[j] = s->conf[i];
	    }
	    /* Sanity check: everything must add up to the computed length */
	    if (j != newlen) {
		abort();
	    }

	    /* Write it back out */
	    /* Copy newseq/newconf into seq_t */

	    s->seq = newseq;
	    s->conf = newconf;
	    update_range = 0;
	    if (ABS(s->len) != j) {
		/* Length change implies updating the range array too */
		s->len = s->len >= 0 ? j : -j;
		update_range = 1;
	    }

	    /* Put the duplicate back into its stored orientation */
	    if (cl->mseg->comp)
		complement_seq_t(s);

	    /* The memcpy trashes the block pointer, so special care needed */
	    {
		sorig = cache_rw(io, sorig);
		void *blk = sorig->block;
		memcpy(sorig, s, sizeof(seq_t)); 
		sorig->block = blk;
	    }

	    if (update_range)
		sorig = cache_item_resize(sorig, sizeof(*sorig) +
					  sequence_extra_len(sorig));

	    /*
	     * presumably re-points the name/seq/conf members at sorig's own
	     * storage - the copies below write through them; TODO confirm
	     */
	    sequence_reset_ptr(sorig);

	    if (s->name)
		memcpy(sorig->name,       s->name,       s->name_len+1);
	    if (s->trace_name)
		memcpy(sorig->trace_name, s->trace_name, s->trace_name_len+1);
	    if (s->alignment)
		memcpy(sorig->alignment,  s->alignment,  s->alignment_len+1);
	    memcpy(sorig->seq,  s->seq,  ABS(s->len));
	    memcpy(sorig->conf, s->conf, ABS(s->len));

	    xfree(newconf);
	    xfree(newseq);
	}

	/*
	 * Compare where the clipped sequence currently starts with where
	 * the alignment puts it; a difference means the range must move.
	 */
	{
	    int st, en, or;
	    sequence_get_position(io, s->rec, NULL, &st, &en, &or);
	    if (or ^ (sorig->len < 0)) {
		shift = ABS(sorig->len) - sorig->right;
	    } else {
		shift = sorig->left-1;
	    }
	    st += shift;
	    if (st != cl->mseg->offset+1) {
		update_range = 1;
	    }
	}

	free(s);

	if (update_range) {
	    int bin_changed = 0;

	    /* Get old range and pair data */
	    s = sorig;
	    bin = cache_search(io, GT_Bin, s->bin);
	    r = *arrp(range_t, bin->rng, s->bin_index);
	    assert(r.rec == s->rec);

	    /* Update range, tedious and slow way */
	    bin_remove_item(io, &c, GT_Seq, s->rec);
	    r.start = cl->mseg->offset + 1 - shift;
	    r.end   = r.start + ABS(s->len) - 1;
	    bin = bin_add_range(io, &c, &r, &r_out, NULL, 0);

	    /* Check if the new bin has a different complemented status too */
	    if (s->bin != bin->rec) {
		int old_comp = bin_get_orient(io, s->bin);
		int new_comp = bin_get_orient(io, bin->rec);

		if (new_comp != old_comp) {
		    //int tmp;
		    s = cache_rw(io, s);
		    s->len *= -1;
		    s->flags ^= SEQ_COMPLEMENTED;
		    //tmp = s->left;
		    //s->left  = ABS(s->len) - (s->right-1);
		    //s->right = ABS(s->len) - (tmp-1);
		}

		bin_changed = 1;
	    }
	
	    /* Update seq bin & bin_index fields */
	    s = cache_rw(io, s);
	    s->bin = bin->rec;
	    s->bin_index = r_out - ArrayBase(range_t, bin->rng);

	    if (bin_changed) {
		if (-1 == sequence_fix_anno_bins(io, &s)) {
		    verror(ERR_WARN, "update_io",
			   "sequence_fix_anno_bins() failure");
		}
	    }
	}

	cache_decr(io, sorig);
    }

    /* Step 3 (remove pad columns) done in calling function. */

    cache_decr(io, c);
}
Exemple #21
0
/*
 * Breaks contig 'crec' in two at position 'cpos'.
 *
 * A new contig is created to act as the right-hand contig ('cr'). Its
 * name is the original name (truncated to 1000 characters) followed by
 * "#N", using the first N from 1 upwards that is not already in the
 * contig index. The bin tree is then divided by break_contig_recurse(),
 * the start/end of both contigs are recomputed, empty bins are removed
 * and a contig left with no bin is destroyed.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int break_contig(GapIO *io, tg_rec crec, int cpos) {
    contig_t *cl;
    contig_t *cr;
    int cid;
    char cname[1024], *cname_end;
    int left_end, right_start;
    bin_index_t *bin;
    int do_comp = 0;
    HacheTable *h;

    if (!(cl = (contig_t *)cache_search(io, GT_Contig, crec)))
	return -1;

    //contig_dump_ps(io, &cl, "/tmp/tree.ps");

    /*
     * Our hash table is keyed on sequence record numbers for all sequences
     * in all bins spanning the break point. The value is either 0 or 1
     * for left/right contig.
     * 
     * The purpose of this hash is to allow us to work out whether a tag
     * belongs in the left or right contig, as a tag could start beyond the
     * break point but be attached to a sequence before the break point.
     *
     * Further complicating this is that a tag could be in a smaller bin
     * than the sequence as it may not be as long. However we know
     * we'll recurse down these in a logical order so we can be sure
     * we've already "seen" the sequence that the tag has been
     * attached to.
     */
    if (!(h = HacheTableCreate(1024, HASH_DYNAMIC_SIZE)))
	return -1;

    /*
     * strncpy does not terminate the buffer when the source is 1000
     * characters or longer, so terminate it explicitly. The remaining
     * bytes of cname leave room for the "#N" suffix.
     */
    strncpy(cname, contig_get_name(&cl), 1000);
    cname[1000] = '\0';
    cname_end = cname + strlen(cname);
    cid = 1;
    do {
	sprintf(cname_end, "#%d", cid++);
    } while (contig_index_query(io, cname) > 0);

    /* Every failure exit from here on must release the hash table */
    if (!(cr = contig_new(io, cname))) {
	HacheTableDestroy(h, 0);
	return -1;
    }
    if (!(cl = cache_rw(io, cl)) || !(cr = cache_rw(io, cr))) {
	HacheTableDestroy(h, 0);
	return -1;
    }
    if (0 != contig_index_update(io, cname, strlen(cname), cr->rec)) {
	HacheTableDestroy(h, 0);
	return -1;
    }
    printf("Break in contig %"PRIrec", pos %d\n", crec, cpos);

    printf("Existing left bin = %"PRIrec", right bin = %"PRIrec"\n",
	   cl->bin, cr->bin);

    cache_incr(io, cl);
    cache_incr(io, cr);

    /* Remember the original root's orientation for the new right root */
    bin = get_bin(io, cl->bin);
    do_comp = bin->flags & BIN_COMPLEMENTED;

    break_contig_recurse(io, h, cl, cr,
			 contig_get_bin(&cl), cpos, contig_offset(io, &cl),
			 0, cl->rec, cr->rec, 0, 0);

    /* Recompute end positions */
    left_end    = contig_visible_end(io, cl->rec);
    right_start = contig_visible_start(io, cr->rec);

    /* Ensure start/end positions of contigs work out */
    bin = cache_rw(io, get_bin(io, cr->bin));

    //#define KEEP_POSITIONS 1
#ifndef KEEP_POSITIONS
    /* Renumber the right contig so that it starts at position 1 */
    cr->start = 1;
    cr->end = cl->end - right_start + 1;
    bin->pos -= right_start-1;
#else
    cr->start = right_start;
    cr->end = cl->end;
#endif

    /* Make the right root's complemented flag agree with the original */
    if ((do_comp && !(bin->flags & BIN_COMPLEMENTED)) ||
	(!do_comp && (bin->flags & BIN_COMPLEMENTED))) {
	bin->flags ^= BIN_COMPLEMENTED;
    }

    cl->end = left_end;

    //    remove_redundant_bins(io, cl);
    //    remove_redundant_bins(io, cr);

    printf("Final left bin = %"PRIrec", right bin = %"PRIrec"\n",
	   cl->bin, cr->bin);

    HacheTableDestroy(h, 0);

    //if (cl->bin) contig_dump_ps(io, &cl, "/tmp/tree_l.ps");
    //if (cr->bin) contig_dump_ps(io, &cr, "/tmp/tree_r.ps");

    cache_flush(io);

    remove_empty_bins(io, cl->rec);
    remove_empty_bins(io, cr->rec);

    /* Empty contig? If so remove it completely */
    if (cl->bin == 0) {
	printf("Removing empty contig %"PRIrec"\n", cl->rec);
	contig_destroy(io, cl->rec);
    }
    if (cr->bin == 0) {
	printf("Removing empty contig %"PRIrec"\n", cr->rec);
	contig_destroy(io, cr->rec);
    }

    cache_decr(io, cl);
    cache_decr(io, cr);

    cache_flush(io);

    return 0;
}
Exemple #22
0
/*
 * A recursive break contig function.
 *
 * Visits bin 'bin_num' and, depending on where it lies relative to the
 * break point, leaves it in the left contig, moves it wholesale to the
 * right contig, or duplicates it for the right contig and splits its range
 * array between the two copies before recursing into both children.
 *
 * io		The database handle.
 * h		Hash keyed on sequence record number; the value stored is 0
 *		when the sequence stays in the left contig and 1 when it
 *		ends up in the right contig.
 * cl, cr	The left and right contigs.
 * bin_num	The current bin being moved or split.
 * pos		The contig break point.
 * offset	The absolute positional offset of this bin in original contig
 * level	Recursion depth; only used to indent the diagnostic output.
 * pleft	The parent bin/contig record num in the left new contig
 * pright	The parent bin/contig record num in the right new contig
 * child_no     0 or 1 - whether this bin is the left/right child of its parent
 * complement	Complement state inherited from the caller; toggled locally
 *		when this bin has BIN_COMPLEMENTED set.
 *
 * The reference taken on the bin with cache_incr() is released on every
 * exit path, including failures.
 *
 * Returns 0 on success
 *        -1 on failure
 */
static int break_contig_recurse(GapIO *io, HacheTable *h,
				contig_t *cl, contig_t *cr,
				tg_rec bin_num, int pos, int offset,
				int level, tg_rec pleft, tg_rec pright,
				int child_no, int complement) {
    int i, j;
    tg_rec rbin;
    bin_index_t *bin, *bin_rw, *bin_dup;
    //int bin_min, bin_max;
    int nseqs;
    tg_rec opright; /* old pright, needed if we revert back */

    if (!(bin = get_bin(io, bin_num)))
	return -1;

    cache_incr(io, bin);

    if (bin->flags & BIN_COMPLEMENTED) {
	complement ^= 1;
    }

    printf("%*sBreak offset %d pos %d => test bin %"PRIrec": %d..%d\n",
	   level*4, "",
	   offset, pos, bin->rec,
	   NMIN(bin->start_used, bin->end_used),
	   NMAX(bin->start_used, bin->end_used));

    /* If this fails the reference is still held via the old pointer */
    if (!(bin_rw = cache_rw(io, bin)))
	goto fail;
    bin = bin_rw;

    /*
     * Zero the sequence count now; it is added back (in full or in part)
     * via bin_incr_nseq() once we know where the data ends up.
     */
    nseqs = bin->nseqs;
    bin->nseqs = 0;

    /* Invalidate any cached data */
    bin_invalidate_track(io, bin, TRACK_ALL);
    if (bin->flags & BIN_CONS_VALID) {
	bin->flags |= BIN_BIN_UPDATED;
	bin->flags &= ~BIN_CONS_VALID;
    }

    //bin_min = bin->rng ? NMIN(bin->start_used, bin->end_used) : offset;
    //bin_max = bin->rng ? NMAX(bin->start_used, bin->end_used) : offset;

    /*
     * Add to right parent if this bin is to the right of pos,
     * or if the used portion is to the right and we have no left child.
     *
     * FIXME: Not a valid assumption!
     * The used portion of a bin is not a placeholder for the used portion
     * of all the the children beneath it. Therefore if the used portion of
     * this bin is > pos (and we have no left child) it still doesn't mean
     * that the absolute positions of the used portion of the right child
     * won't be < pos.
     */
    if (offset >= pos /*|| (bin_min >= pos && !bin->child[0])*/) {
	printf("%*sADD_TO_RIGHT pl=%"PRIrec" pr=%"PRIrec"\n",
	       level*4, "", pleft, pright);
	if (0 != break_contig_move_bin(io, bin,
				       cl, pleft, cr, pright,
				       child_no))
	    goto fail;

	bin_incr_nseq(io, bin, nseqs);
	cache_decr(io, bin);

	return 0;
    }

    /*
     * Add to left parent if this bin is entirely to the left of pos,
     * or if the used portion is to the left and we have no right child.
     */
    if (offset + bin->size < pos /*|| (bin_max < pos && !bin->child[1])*/) {
	printf("%*sADD_TO_LEFT\n", level*4, "");

	//if (0 != break_contig_move_bin(io, bin, cr, pright, cl, pleft, child_no))
	//return -1;

	bin_incr_nseq(io, bin, nseqs);
	cache_decr(io, bin);

	return 0;
    }

    /*
     * Nominally the bin overlaps both left and right and so needs duplicating.
     * There are cases though at the roots of our trees where duplicating is
     * unnecessary as it leads to empty bins at the root. In this case
     * we skip creating a duplicate for the right, or alternatively steal
     * the left root bin and use that instead.
     *
     * Similarly the range_t array will either be left where it is, moved to
     * the right contig, or split in half (creating a new one for the right).
     *
     * FIXED: always need this. Eg:
     *
     * |-------------empty--------------|
     * |----------------|---------------|
     * |--------|-------|--------|------|
     *             ^
     *             |
     *             break here
     *
     * In this case we need to duplicate the parent as it overlaps the left
     * bin, which may (or may not) have data that needs to end up in the right
     * hand contig. Just duplicate for now and free later on if needed.
     */
    if (1 /* always! */ || pright != cr->rec ||
	(bin->rng && NMAX(bin->start_used, bin->end_used) >= pos)) {
	//printf("NMAX=%d >= %d\n", NMAX(bin->start_used, bin->end_used), pos);

	rbin = 0;

	/* Possibly steal left contig's bin */
	if (pleft == cl->rec && NMIN(bin->start_used, bin->end_used) >= pos) {
#if 0
	    /* Currently this doesn't always work */
	    if (bin->child[1]) {
		bin_index_t *ch = get_bin(io, bin->child[1]);
		if (NMIN(ch->pos, ch->pos + ch->size-1) >= pos) {
		    rbin = cl->bin;
		    cl->bin = bin->child[0];
		}
	    }
#else
	    pleft = bin->rec;
#endif
	} else {
	    pleft = bin->rec;
	}

	/* Create new bin, or use root of contig if it's unused so far */
	if (!rbin && pright == cr->rec) {
	    rbin = cr->bin;
	}

	/* Otherwise we genuinely need a duplicate */
	if (!rbin)
	    rbin = bin_new(io, 0, 0, 0, GT_Bin);

	/* Initialise with duplicate values from left bin */
	bin_dup = get_bin(io, rbin);
	if (!bin_dup || !(bin_dup = cache_rw(io, bin_dup)))
	    goto fail;
	bin_dup->size = bin->size;
	bin_dup->pos = bin->pos;
	bin_dup->parent = pright;
	bin_dup->parent_type = (pright == cr->rec ? GT_Contig : GT_Bin);
	bin_dup->flags = bin->flags | BIN_BIN_UPDATED;
	bin_dup->start_used = bin->start_used;
	bin_dup->end_used = bin->end_used;

	/*
	 * Shift bin to offset if it's the contig root.
	 * It'll be shifted back by the correct amount later.
	 */
	if (pright == cr->rec) {
	    printf("moving root bin to offset=%d comp=%d\n", offset, complement);
	    bin_dup->pos = offset;
	}

	printf("%*sCreated dup for right, rec %"PRIrec"\n",
	       level*4,"", bin_dup->rec);

	/* pfrom is 0 here, so this only links bin_dup beneath pright */
	if (0 != break_contig_move_bin(io, bin_dup, cl, 0, cr, pright,
				       child_no))
	    goto fail;
	opright = pright;
	pright = bin_dup->rec;
    } else {
	bin_dup = NULL;
	pleft = bin->rec;
    }

    if (!bin->rng) {
	/* Empty bin */
	printf("%*sEMPTY range\n", level*4, "");
	bin->start_used = bin->end_used = 0;
	bin->flags |= BIN_BIN_UPDATED;
	if (bin_dup) {
	    bin_dup->start_used = bin_dup->end_used = 0;
	    bin_dup->flags |= BIN_BIN_UPDATED;
	}

    } else if (NMIN(bin->start_used, bin->end_used) >= pos) {
	/* Move range to right contig */
	printf("%*sDUP %"PRIrec", MOVE Array to right\n",
	       level*4, "", bin_dup->rec);

	bin_dup->rng = bin->rng;
	bin_dup->rng_rec = bin->rng_rec;
	bin_dup->rng_free = bin->rng_free;
	if (bin_dup->rng_rec)
	    bin_dup->flags |= BIN_RANGE_UPDATED;

	/* Detach the array from the left bin unless both are the same bin */
	if (bin->rec != bin_dup->rec) {
	    bin->rng = NULL;
	    bin->rng_rec = 0;
	    bin->rng_free = -1;
	    bin->flags |= BIN_BIN_UPDATED;
	}

	bin->start_used = bin->end_used = 0;
	break_contig_reparent_seqs(io, bin_dup);

	if (bin_dup->rng) {
	    int n = ArrayMax(bin_dup->rng);
	    for (i = j = 0; i < n; i++) {
		range_t *r = arrp(range_t, bin_dup->rng, i);
		if (r->flags & GRANGE_FLAG_UNUSED)
		    continue;

		/* Record every non-annotation item as being on the right */
		if ((r->flags & GRANGE_FLAG_ISMASK) != GRANGE_FLAG_ISANNO) {
		    HacheData hd; hd.i = 1;
		    HacheTableAdd(h, (char *)&r->rec, sizeof(r->rec), hd,NULL);
		    j++;
		}
	    }
	    bin_incr_nseq(io, bin_dup, j);
	}
    } else if (NMAX(bin->start_used, bin->end_used) < pos) {
	/* Range array already in left contig, so do nothing */
	printf("%*sMOVE Array to left\n", level*4, "");

	if (bin_dup)
	    bin_dup->start_used = bin_dup->end_used = 0;

	if (bin->rng) {
	    int n = ArrayMax(bin->rng);
	    for (i = j = 0; i < n; i++) {
		range_t *r = arrp(range_t, bin->rng, i);
		if (r->flags & GRANGE_FLAG_UNUSED)
		    continue;

		/* Record every non-annotation item as being on the left */
		if ((r->flags & GRANGE_FLAG_ISMASK) != GRANGE_FLAG_ISANNO) {
		    HacheData hd; hd.i = 0;
		    HacheTableAdd(h, (char *)&r->rec, sizeof(r->rec), hd,NULL);
		    j++;
		}
	    }
	    bin_incr_nseq(io, bin, j);
	}
    } else {
	/* Range array covers pos, so split in two */
	int n, nl = 0, nr = 0;
	int lmin = bin->size, lmax = 0, rmin = bin->size, rmax = 0;

	printf("%*sDUP %"PRIrec", SPLIT array\n", level*4, "", bin_dup->rec);

	bin->flags |= BIN_RANGE_UPDATED;
	bin_dup->flags |= BIN_RANGE_UPDATED;

	bin_dup->rng = ArrayCreate(sizeof(range_t), 0);
	bin_dup->rng_free = -1;

	/* Pass 1 - hash sequences */
	n = ArrayMax(bin->rng);
	for (i = 0; i < n; i++) {
	    range_t *r = arrp(range_t, bin->rng, i);
	    int cstart; /* clipped sequence positions */
	    seq_t *s;
	    HacheData hd;

	    if (r->flags & GRANGE_FLAG_UNUSED)
		continue;

	    if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISANNO)
		continue;

	    s = (seq_t *)cache_search(io, GT_Seq, r->rec);
	    if (!s)
		goto fail;

	    if ((s->len < 0) ^ complement) {
		cstart = NMAX(r->start, r->end) - (s->right-1);
	    } else {
		cstart = NMIN(r->start, r->end) + s->left-1;
	    }

	    /* 1 => right contig, 0 => left contig */
	    hd.i = (cstart >= pos) ? 1 : 0;
	    HacheTableAdd(h, (char *)&r->rec, sizeof(r->rec), hd, NULL);
	}

	/* Pass 2 - do the moving of anno/seqs */
	n = ArrayMax(bin->rng);
	for (i = j = 0; i < n; i++) {
	    range_t *r = arrp(range_t, bin->rng, i), *r2;
	    int cstart; /* clipped sequence positions */

	    if (r->flags & GRANGE_FLAG_UNUSED)
		continue;

	    if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISANNO) {
		cstart = NMAX(r->start, r->end);
	    } else {
		seq_t *s = (seq_t *)cache_search(io, GT_Seq, r->rec);
		if (!s)
		    goto fail;

		if ((s->len < 0) ^ complement) {
		    cstart = NMAX(r->start, r->end) - (s->right-1);
		} else {
		    cstart = NMIN(r->start, r->end) + s->left-1;
		}
	    }

	    if (cstart >= pos &&
		((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISANNO)) {
		anno_ele_t *a = (anno_ele_t *)cache_search(io,
							   GT_AnnoEle,
							   r->rec);
		if (!a)
		    goto fail;

		/* If it's an annotation on a sequence < pos then we
		 * still don't move.
		 *
		 * FIXME: we have no guarantee that the sequence being
		 * annotated is in the same bin as this annotation, as
		 * they may be different sizes and end up in different
		 * bins. (Should we enforce anno always in same bin as seq?
		 * If so, consensus annos fit anywhere?)
		 */
		if (a->obj_type == GT_Seq) {
		    HacheItem *hi = HacheTableSearch(h,
						     (char *)&r->pair_rec,
						     sizeof(r->pair_rec));

		    if (hi) {
			/* Owning sequence stays left; force tag left too */
			if (hi->data.i == 0)
			    cstart = pos-1;
		    } else {
			puts("FIXME: annotation for seq in unknown place - "
			     "work out correct location and move if needed.");
		    }
		}
	    }

	    if (cstart >= pos) {
		/* Append a copy to the right-hand bin's array */
		r2 = (range_t *)ArrayRef(bin_dup->rng, ArrayMax(bin_dup->rng));
		*r2 = *r;
		if (rmin > r->start) rmin = r->start;
		if (rmin > r->end)   rmin = r->end;
		if (rmax < r->start) rmax = r->start;
		if (rmax < r->end)   rmax = r->end;
		if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISSEQ)
		    nr++;
	    } else {
		if (lmin > r->start) lmin = r->start;
		if (lmin > r->end)   lmin = r->end;
		if (lmax < r->start) lmax = r->start;
		if (lmax < r->end)   lmax = r->end;

		/* Compact the retained entries towards the array start */
		if (j != i) {
		    r2 = arrp(range_t, bin->rng, j);
		    *r2 = *r;
		}
		j++;
		if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISSEQ)
		    nl++;
	    }
	}
	bin_incr_nseq(io, bin, nl);
	bin_incr_nseq(io, bin_dup, nr);


	ArrayMax(bin->rng) = j;

#if 0
	/*
	 * Right now this causes problems, but I'm not sure why. Try again
	 * after we've fixed the bin->nseqs issues and other deallocation
	 * woes.
	 */

	if (ArrayMax(bin_dup->rng) == 0 && bin_dup->parent_type == GT_Bin) {
	    /* We didn't need it afterall! Odd. */
	    bin_index_t *pb;

	    printf("Purging bin %d that we didn't need afterall\n",
		   bin_dup->rec);
	    cache_rec_deallocate(io, GT_Bin, bin_dup->rec);
	    pb = cache_search(io, GT_Bin, bin_dup->parent);
	    if (pb->child[0] == bin_dup->rec)
		pb->child[0] = 0;
	    if (pb->child[1] == bin_dup->rec)
		pb->child[1] = 0;
	    bin_dup = NULL;
	    pright = opright;
	}
#endif

	if (bin_dup)
	    break_contig_reparent_seqs(io, bin_dup);

	if (lmin < lmax) {
	    bin->start_used     = lmin;
	    bin->end_used       = lmax;
	} else {
	    /* No data left in bin */
	    bin->start_used = 0;
	    bin->end_used = 0;
	}

	printf("%*sLeft=>%d..%d right=>%d..%d\n", level*4, "",
	       lmin, lmax, rmin, rmax);

	if (bin_dup) {
	    if (rmin < rmax) {
		bin_dup->start_used = rmin;
		bin_dup->end_used   = rmax;
	    } else {
		/* No data moved in bin */
		bin_dup->start_used = 0;
		bin_dup->end_used   = 0;
	    }
	}
    }


    /* Recurse */
    for (i = 0; i < 2; i++) {
	bin_index_t *ch;
	if (!bin->child[i])
	    continue;

	if (!(ch = get_bin(io, bin->child[i])))
	    goto fail;

	if (0 != break_contig_recurse(io, h, cl, cr, bin->child[i], pos,
				      NMIN(ch->pos, ch->pos + ch->size-1),
				      level+1, pleft, pright,
				      i, complement))
	    goto fail;
    }

    cache_decr(io, bin);
    //    if (bin_dup)
    //	cache_decr(io, bin_dup);

    return 0;

 fail:
    /* Drop the reference taken at the top so the bin is not pinned */
    cache_decr(io, bin);
    return -1;
}
/*
 * Complements a scaffold; both complementing each contig within it and
 * reversing the order of contigs in the scaffold.
 *
 * The entries of io->contig_order that belong to this scaffold are then
 * rewritten so that they follow the new scaffold order, and
 * REG_BUFFER_START, REG_ORDER and REG_BUFFER_END notifications are sent
 * to every contig in the scaffold.
 *
 * io    The database handle.
 * srec  Record number of the scaffold to complement.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int complement_scaffold(GapIO *io, tg_rec srec) {
    scaffold_t *f;
    int i, j, nc = ArrayMax(io->contig_order);
    scaffold_member_t *contigs;
    tg_rec *crecs;
    HashTable *h;
    void *order;
    reg_order ro;
    reg_buffer_start rs;
    reg_buffer_end re;

    if (!(f = cache_search(io, GT_Scaffold, srec)))
	return -1;
    if (!(f = cache_rw(io, f)))
	return -1;
    cache_incr(io, f);

    /* Complement contigs */
    contigs = ArrayBase(scaffold_member_t, f->contig);
    for (i = 0; i < ArrayMax(f->contig); i++) {
	complement_contig(io, contigs[i].rec);
    }

    /* Reverse the order of the contigs in the scaffold array */
    for (i = 0, j = ArrayMax(f->contig)-1; i < j; i++, j--) {
	scaffold_member_t cr1 = contigs[i];
	contigs[i] = contigs[j];
	contigs[j] = cr1;
    }

    /*
     * Reverse the order of contigs in the contig_order array too.
     * This is the part that really matters. It's also hard as the contigs
     * in the contig order array could be in any order and not adjacent.
     * For our purposes we'll just ensure the contigs in this scaffold in
     * the contig order array match our freshly complemented scaffold
     * ordering.
     *
     * We initially build a hash table of contigs in this scaffold, and
     * then iterate through contig_order copying out the new contigs whenever
     * one matches.
     */
    if (!(h = HashTableCreate(nc, 0))) {
	cache_decr(io, f);
	return -1;
    }
    for (i = 0; i < ArrayMax(f->contig); i++) {
	HashData hd;
	hd.i = 0;
	HashTableAdd(h, (char *)&contigs[i].rec, sizeof(tg_rec), hd, NULL);
    }

    /*
     * The contig order array is modified in place below, so request write
     * access first; this mirrors how io->scaffold is handled elsewhere in
     * this file before it is written to.
     */
    if (!(order = cache_rw(io, io->contig_order))) {
	HashTableDestroy(h, 0);
	cache_decr(io, f);
	return -1;
    }
    io->contig_order = order;

    /* Replace any contig matching the scaffold with the new order */
    crecs = ArrayBase(tg_rec, io->contig_order);
    for (i = j = 0; i < nc; i++) {
	if (!HashTableSearch(h, (char *)&crecs[i], sizeof(tg_rec)))
	    continue;

	crecs[i] = contigs[j++].rec;
    }

    /* Send event messages around */
    rs.job = REG_BUFFER_START;
    for (i = 0; i < nc; i++) {
	if (!HashTableSearch(h, (char *)&crecs[i], sizeof(tg_rec)))
	    continue;

	contig_notify(io, crecs[i], (reg_data *)&rs);
    }

    /* Tell each member contig its new (1-based) position in the order */
    ro.job = REG_ORDER;
    for (i = 0; i < nc; i++) {
	if (!HashTableSearch(h, (char *)&crecs[i], sizeof(tg_rec)))
	    continue;

	ro.pos = i+1;
	contig_notify(io, crecs[i], (reg_data *)&ro);
    }

    /* Notify the end of our updates */
    re.job = REG_BUFFER_END;
    for (i = 0; i < nc; i++) {
	if (!HashTableSearch(h, (char *)&crecs[i], sizeof(tg_rec)))
	    continue;

	contig_notify(io, crecs[i], (reg_data *)&re);
    }

    HashTableDestroy(h, 0);
    cache_decr(io, f);

    return 0;
}
/*
 * Given a contig order and a set of current scaffolds, this updates the
 * order of entries within each scaffold to match the contig order.
 *
 * For example if we have contigs in order 1 3 5 2 6 8 4 7 9 and
 * scaffolds {1 2 3 4} {5 6 7 8 9} we would shuffle the scaffold members
 * to        {1 3 2 4} {5 6 8 7 9}
 *
 * The purpose is for integration with contig shuffling in the Contig List
 * or Contig Selector. The master contig order array is what gets shuffled
 * manually by the user and it is also the definitive order to use when
 * outputting data (so it is completely under users control whether they
 * sort by name, size or scaffold).
 *
 * A database without scaffold support, or one with no contigs at all, is
 * treated as success with nothing to do.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int update_scaffold_order(GapIO *io) {
    int i, j, ret = -1;
    int nc;
    tg_rec *crecs;
    scaf_ctg_t *a;

    if (!io->scaffold)
	return 0; /* Not supported, but considered success */

    nc = ArrayMax(io->contig_order);
    if (nc <= 0)
	return 0; /* malloc(0) may return NULL, which is not a failure */

    a = malloc(nc * sizeof(*a));
    if (!a)
	return -1;

    /*
     * Produce an array of scaffold and contig recs, so we can sort on
     * both fields.
     */
    crecs = ArrayBase(tg_rec, io->contig_order);
    for (i = 0; i < nc; i++) {
	contig_t *c = cache_search(io, GT_Contig, crecs[i]);
	if (!c)
	    goto err;

	a[i].ctg_idx = i;
	a[i].scaffold = c->scaffold;
    }

    qsort(a, nc, sizeof(*a), scaf_ctg_sort);

    /*
     * Now recreate scaffold orders from the sorted contig list.
     */
    for (i = 0; i < nc; i++) {
	scaffold_t *f;
	int k;

	/* Contigs that are not in any scaffold */
	if (!a[i].scaffold)
	    continue;

	j = i;
	while (i < nc && a[i].scaffold == a[j].scaffold)
	    i++;

	/* j .. i-1 share the same scaffold */
	f = cache_search(io, GT_Scaffold, a[j].scaffold);
	if (!f)
	    goto err;

	if (!f->contig || ArrayMax(f->contig) != i-j) {
	    verror(ERR_WARN, "update_scaffold_order", "Scaffold %"PRIrec
		   " has different number of entries than contigs claim.",
		   f->rec);
	    goto err;
	}

	/* Only mark r/w and update if they differ */
	for (k = 0; k < ArrayMax(f->contig); k++) {
	    if ((arrp(scaffold_member_t, f->contig, k))->rec
		!= crecs[a[j+k].ctg_idx])
		break;
	}

	if (k != ArrayMax(f->contig)) {
	    if (!(f = cache_rw(io, f)))
		goto err;
	    for (k = 0; k < ArrayMax(f->contig); k++)
		(arrp(scaffold_member_t, f->contig, k))->rec
		    = crecs[a[j+k].ctg_idx];
	}

	/* Compensate for the i++ of the enclosing for loop */
	i--;
    }

    ret = 0;
 err:
    free(a);
    return ret;
}
Exemple #25
0
/*
 * Recursive worker for remove_empty_bins.
 *
 * brec   Record number of the bin to examine.
 * first  If non-NULL and currently zero, receives the record of the first
 *        bin found to be worth keeping: this bin when it holds data or
 *        when both subtrees reported one, otherwise whichever single
 *        subtree reported one.
 *
 * A bin is deallocated when it holds no used ranges and both children are
 * either absent or were themselves removed. When only a child was removed,
 * the link to it is cleared in this bin.
 *
 * Returns 1 if removed
 *         0 if not.
 */
static int remove_empty_bins_r(GapIO *io, tg_rec brec, tg_rec *first) {
    bin_index_t *b = cache_search(io, GT_Bin, brec);
    tg_rec kid[2], found[2];
    int gone[2]; /* Child was removed, or never existed */
    int no_data = 1;
    int k;

    /* The bin holds data as soon as one range is not flagged unused */
    if (b->rng) {
	for (k = 0; k < ArrayMax(b->rng); k++) {
	    range_t *rp = arrp(range_t, b->rng, k);
	    if (!(rp->flags & GRANGE_FLAG_UNUSED)) {
		no_data = 0;
		break;
	    }
	}
    }

    /* Copy the child records; 'b' is not pinned while we recurse */
    kid[0] = b->child[0];
    kid[1] = b->child[1];

    for (k = 0; k < 2; k++) {
	found[k] = 0;
	if (kid[k])
	    gone[k] = remove_empty_bins_r(io, kid[k], &found[k]);
	else
	    gone[k] = 1;
    }

    /* Nothing here and nothing below: delete this bin */
    if (no_data && gone[0] && gone[1]) {
	printf("Bin %"PRIrec": this & children are empty / non-existant\n",
	       brec);
	cache_rec_deallocate(io, GT_Bin, brec);
	return 1;
    }

    /* A real child vanished but we remain, so drop the stale links */
    if ((gone[0] && kid[0]) || (gone[1] && kid[1])) {
	b = cache_rw(io, cache_search(io, GT_Bin, brec));
	for (k = 0; k < 2; k++) {
	    if (!gone[k])
		continue;
	    b->flags |= BIN_BIN_UPDATED;
	    b->child[k] = 0;
	}
    }

    /* Report the first useful bin, unless the caller already has one */
    if (!first || *first)
	return 0;

    if (!no_data || (found[0] && found[1]))
	*first = brec;
    else if (found[0])
	*first = found[0];
    else if (found[1])
	*first = found[1];

    return 0;
}
Exemple #26
0
/*
 * Re-links a bin so that it hangs beneath parent 'pto' in contig 'cto',
 * and unlinks it from its previous parent 'pfrom' in contig 'cfrom'.
 *
 * bin       The bin being moved; its parent fields are updated here.
 * cfrom     Contig the bin is being taken from.
 * pfrom     Old parent record: cfrom->rec if the bin was that contig's
 *           root, a bin record otherwise, or 0 (or less) when there is
 *           no old parent to unlink from.
 * cto       Contig the bin is being added to.
 * pto       New parent record: cto->rec to make the bin the root of cto,
 *           otherwise the record of the new parent bin.
 * child_no  Which child slot (0 or 1) of the new parent bin to use.
 *
 * When the bin becomes the new root of 'cto', the root bin that 'cto'
 * previously had (if any, and if different) is deallocated.
 *
 * Returns 0 on success
 *        -1 on failure
 */
static int break_contig_move_bin(GapIO *io, bin_index_t *bin,
				 contig_t *cfrom, tg_rec pfrom,
				 contig_t *cto,   tg_rec pto,
				 int child_no) {
    /* Add to */
    if (pto == cto->rec) {
	/* Parent is a contig */
	if (cto->bin && bin->rec != cto->bin) {
	    /*
	     * Free the root bin being displaced. This must be the bin
	     * record held in cto->bin, not the contig's own record number.
	     */
	    cache_rec_deallocate(io, GT_Bin, cto->bin);
	}
	cto->bin = bin->rec;
	cto->start = 1;
	cto->end = bin->size;

	bin->parent = cto->rec;
	bin->parent_type = GT_Contig;
	bin->flags |= BIN_BIN_UPDATED;

    } else {
	/* Parent is a bin */
	bin_index_t *pb;

	if (!(pb = get_bin(io, pto)))
	    return -1;
	if (!(pb = cache_rw(io, pb)))
	    return -1;

	pb->child[child_no] = bin->rec;
	pb->flags |= BIN_BIN_UPDATED;

	bin->parent = pto;
	bin->parent_type = GT_Bin;
	bin->flags |= BIN_BIN_UPDATED;
    }

    /* Remove from: NB it may not exist? */
    if (pfrom == cfrom->rec) {
	/* Parent is a contig */
	if (cfrom->bin != bin->rec) {
	    fprintf(stderr, "pfrom incorrect\n");
	    return -1;
	}

	cfrom->bin = 0;
    } else if (pfrom > 0) {
	/* Parent is a bin */
	bin_index_t *pb;

	if (!(pb = get_bin(io, pfrom)))
	    return -1;
	if (!(pb = cache_rw(io, pb)))
	    return -1;

	if (pb->child[0] != bin->rec && pb->child[1] != bin->rec) {
	    fprintf(stderr, "pfrom incorrect\n");
	    return -1;
	}

	/* Clear whichever child slot referred to the moved bin */
	if (pb->child[0] == bin->rec)
	    pb->child[0] = 0;
	else
	    pb->child[1] = 0;
	pb->flags |= BIN_BIN_UPDATED;
    }

    return 0;
}
Exemple #27
0
/*
 * Increments the timestamp held in the database record of the base io
 * (as returned by gio_base) and returns the new value. The database
 * record is made writable first.
 */
int io_timestamp_incr(GapIO *io) {
    GapIO *base = gio_base(io);

    base->db = cache_rw(base, base->db);
    base->db->timestamp += 1;

    return base->db->timestamp;
}
/*
 * Extends the right hand end of a single contig.
 *
 * io              The database handle.
 * crec            Record number of the contig to extend.
 * dir             Only used to label the progress message ("left" when
 *                 non-zero, "right" otherwise); the extension itself
 *                 always works on the end reported by
 *                 consensus_valid_range().
 * min_depth       The minimum depth for extension. If lower then even if
 *                 the data matches we'll not extend further.
 * match_score     (+ve) and
 * mismatch_score  (-ve) are accumulated per read during extension to
 *                 ensure that we don't extend into junk mismatching DNA.
 *
 * Returns 0 on success (including when no extension was possible)
 *        -1 on failure
 */
static int contig_extend_single(GapIO *io, tg_rec crec, int dir, int min_depth,
				int match_score, int mismatch_score) {
    int end;
    rangec_t *r = NULL;
    int nr, i;
    contig_t *c;
    char cons[CSZ], new_cons[ESZ];
    int freqs[ESZ][4], depth[ESZ];
    double score, best_score;
    int best_pos, nseq;

    vmessage("Processing contig #%"PRIrec", %s end\n",
	     crec, dir ? "left" : "right");

    for (i = 0; i < ESZ; i++) {
	freqs[i][0] = freqs[i][1] = freqs[i][2] = freqs[i][3] = 0;
	depth[i] = 0;
    }

    c = cache_search(io, GT_Contig, crec);
    if (NULL == c) return -1;
    cache_incr(io, c);

    if (consensus_valid_range(io, crec, NULL, &end) != 0)
	goto fail;

    /* Consensus for the last CSZ bases; cons[CSZ-1] is position 'end' */
    calculate_consensus_simple(io, crec, end-(CSZ-1), end, cons, NULL);

    /* Start */
    /* Not implemented for now: rev complement and go again! */

    /* End */
    r = contig_seqs_in_range(io, &c, end, end, 0, &nr);
    if (!r)
	goto fail;

    /*
     * Pass 1: reject reads whose cutoff data disagrees with the existing
     * consensus, and accumulate base frequencies beyond the contig end
     * from the rest.
     */
    for (i = 0; i < nr; i++) {
	seq_t *s = cache_search(io, GT_Seq, r[i].rec);
	seq_t *sorig = s;
	int cend;
	int j, k, slen;

	if (!s)
	    goto fail;

	/* Work on a complemented private copy where necessary */
	if ((s->len < 0) ^ r[i].comp) {
	    if (!(s = dup_seq(s)))
		goto fail;
	    complement_seq_t(s);
	}

	cend = r[i].start + s->right-1;

	/* Does cutoff extend to contig end, if so does it match cons? */
	if (cend < end) {
	    int mis = 0, len;
	    if (end - cend >= CSZ) {
		/*
		fprintf(stderr,"Skipping #%"PRIrec" due to length of cutoff\n",
			r[i].rec);
		*/
		if (sorig != s)
		    free(s);
		r[i].rec = 0; /* Mark for removal */
		continue;
	    }

	    for (k = s->right, j = cend+1; j <= end; j++, k++) {
		//printf("%d: %c %c\n", j, s->seq[k], cons[j-(end-(CSZ-1))]);
		if (s->seq[k] != cons[j-(end-(CSZ-1))])
		    mis++;
	    }
	    len = end - cend;
	    if (100*mis/len > 5) {
		/*
		fprintf(stderr, "Skipping #%"PRIrec" due to high disagreement "
			"with consensus.\n", r[i].rec);
		*/
		if (sorig != s)
		    free(s);
		r[i].rec = 0;
		continue;
	    }
	}

	/* So we got here, let's accumulate extension stats */
	slen = ABS(s->len);
	for (k = 0, j = end+1 - r[i].start; j < slen && k < ESZ; j++, k++) {
	    int base;

	    //printf("%d: %c\n", j + r[i].start, s->seq[j]);
	    if(s->seq[j] == 'N')
		continue;

	    /* freqs[] only has room for four base types */
	    base = dna_lookup[(uint8_t) s->seq[j]];
	    if (base < 0 || base > 3)
		continue;

	    freqs[k][base]++;
	    depth[k]++;
	}

	if (sorig != s)
	    free(s);
    }

    /*
     * Call a consensus base per extension column and keep a running
     * agreement score; the extension length is where that score peaks.
     */
    score = best_score = 0;
    best_pos = 0;

    for (i = 0; i < ESZ; i++) {
	int call = 0, best = 0, others, j;
	double dd;

	/* depth 0 would also mean no callable base and a divide by zero */
	if (depth[i] < min_depth || depth[i] == 0)
	    break;

	for (j = 0; j < 4; j++) {
	    if (best < freqs[i][j]) {
		best = freqs[i][j];
		call = j;
	    }
	}
	new_cons[i] = "ACGT"[call];

	dd = (double)depth[i];
	others = freqs[i][0] + freqs[i][1] + freqs[i][2] + freqs[i][3]
	    - freqs[i][call];
	score += freqs[i][call] / dd;
	score -= others / dd;

	if (best_score <= score) {
	    best_score = score;
	    best_pos = i+1;
	}
	/*
	printf("%3d %3d\t%c\t%3d %3d %3d %3d %7.1f\n",
	       i, depth[i], "ACGT"[call],
	       freqs[i][0], freqs[i][1], freqs[i][2], freqs[i][3],
	       score);
	*/
    }
    /* printf("Best score is %f at %d\n", best_score, best_pos); */

    /* Extend */
    nseq = 0;
    if (best_pos > 0) {
	int furthest_left = end;

	for (i = 0; i < nr; i++) {
	    seq_t *s;
	    int r_pos;
	    int read_score;

	    /* Rejected in pass 1 */
	    if (r[i].rec == 0)
		continue;

	    s = cache_search(io, GT_Seq, r[i].rec);
	    if (!s || !(s = cache_rw(io, s)))
		goto fail;

	    if (furthest_left > r[i].start)
		furthest_left = r[i].start;

	    /*
	     * end + best_pos is the furthest right we can go, but this
	     * specific read may not be justified in reaching that far
	     * if it has too many disagreements.
	     */
	    if ((s->len > 0) ^ r[i].comp) {
		int best_r = 0, j, k;
		int len = ABS(s->len);

		//printf(">%s\t", s->name);

		r_pos = 0;
		read_score = 0;
		//for (k = s->right, j = 0; j < best_pos && k < len; j++, k++) {
		for (k = end - r[i].start + 1, j = 0; j < best_pos && k < len; j++, k++) {
		    if (new_cons[j] == toupper((unsigned char) s->seq[k])) {
			read_score += match_score;
			if (best_r <= read_score) {
			    best_r  = read_score;
			    r_pos = k+1;
			}
		    } else {
			read_score += mismatch_score;
		    }

		    //putchar(new_cons[j] == toupper(s->seq[k])
		    //	    ? toupper(s->seq[k])
		    //        : tolower(s->seq[k]));
		}
		//putchar('\n');

		if (s->right != r_pos) {
		    s->right  = r_pos;
		    nseq++;
		}
	    } else {
		int best_r = 0, j, k;

		//printf("<%s\t", s->name);

		r_pos = 0;
		read_score = 0;
		//for (k = s->left-2, j = 0; j < best_pos && k >= 0; j++, k--) {
		for (k = r[i].end - end - 1, j = 0; j < best_pos && k >= 0; j++, k--) {
		    char b = complement_base(s->seq[k]);
		    if (new_cons[j] == b) {
			read_score += match_score;
			if (best_r <= read_score) {
			    best_r  = read_score;
			    r_pos = k-1;
			}
		    } else {
			read_score += mismatch_score;
		    }

		    //putchar(new_cons[j] == toupper(b)
		    //	    ? toupper(b)
		    //	    : tolower(b));
		}
		//putchar('\n');

		if (s->left != r_pos+2) {
		    s->left  = r_pos+2;
		    nseq++;
		}
	    }
	}

	vmessage("    Extended by %d, adjusting %d sequence clip%s\n",
		 best_pos, nseq, nseq == 1 ? "" : "s");

	bin_invalidate_consensus(io, crec, furthest_left, end + best_pos);
    } else {
	vmessage("    Unable to extend contig\n");
    }
    free(r);

    cache_decr(io, c);
    cache_flush(io);
    return 0;

 fail:
    /* r is NULL until contig_seqs_in_range() succeeds; free(NULL) is safe */
    free(r);
    cache_decr(io, c);
    return -1;
}