Esempio n. 1
0
/*
 * Sets a scaffold name.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int scaffold_set_name(GapIO *io, scaffold_t **f, char *name) {
    scaffold_t *n;
    GapIO *iob = gio_base(io);

    if (!(n = cache_rw(io, *f)))
	return -1;

    /* Delete old name */
    if (n->name) {
	tg_rec r = iob->iface->scaffold.index_del(iob->dbh, n->name, n->rec);
	if (r != -1 && r != io->db->scaffold_name_index) {
	    io->db = cache_rw(io, io->db);
	    io->db->scaffold_name_index = r;
	}
    }

    if (NULL == (n = cache_item_resize(n, sizeof(*n) + strlen(name)+1)))
	return -1;

    *f = n;

    /* Add new name */
    n->name   = (char *)(&n->data);
    strcpy(n->name, name);

    if (*name) {
	tg_rec r = iob->iface->scaffold.index_add(iob->dbh, name, n->rec);
	if (r != -1 && r != io->db->scaffold_name_index) {
	    io->db = cache_rw(io, io->db);
	    io->db->scaffold_name_index = r;
	}
    }

    return 0;
}
Esempio n. 2
0
/*
 * Sets the comment for an annotation element.
 *
 * The element is made writable. If the new comment is longer than the one
 * currently held, the cached item is resized and the comment pointer is
 * re-aimed at the item's trailing data area; otherwise the existing buffer
 * is reused. On success *e is updated to the (possibly relocated) item.
 *
 * io       The I/O handle.
 * e        In/out pointer to the annotation element.
 * comment  The new NUL-terminated comment. An empty string clears any
 *          existing comment. NULL leaves the existing comment untouched.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int anno_ele_set_comment(GapIO *io, anno_ele_t **e, char *comment) {
    anno_ele_t *ae;
    size_t clen, old_len;

    if (!(ae = cache_rw(io, *e)))
	return -1;

    clen    = comment ? strlen(comment) : 0;
    old_len = ae->comment ? strlen(ae->comment) : 0;

    /* Only grow; a shorter comment fits in the buffer we already have */
    if (clen > old_len) {
	anno_ele_t *grown = cache_item_resize(ae, sizeof(*ae) + clen+1);
	if (!grown)
	    return -1;

	ae = grown;
	ae->comment = (char *)&ae->data;
    }

    if (clen) {
	strcpy(ae->comment, comment);
    } else if (comment && ae->comment) {
	/* Explicit empty string: truncate rather than keep the old text */
	ae->comment[0] = '\0';
    }

    *e = ae;

    return 0;
}
Esempio n. 3
0
/*
 * Takes a multiple alignment and updates the on-disk data structures to
 * match. This needs to correct confidence values, original positions and
 * tags too.
 *
 * io      The I/O handle used for all cache and bin operations.
 * cnum    Record number of the contig being updated (looked up as
 *         GT_Contig).
 * malign  The alignment; its contigl list is walked, one entry per
 *         sequence, using each entry's id and mseg (seq, length, offset,
 *         comp) fields.
 * indels  Array of con_indel_t; each element's pos and size describe a
 *         consensus insertion (size > 0) or deletion (size <= 0).
 *
 * Returns nothing. Unexpected alignment contents (a non-pad difference
 * between old and new sequence, or a length mismatch) call abort().
 *
 * NOTE(review): the results of cache_search(), cache_rw(), dup_seq() and
 * malloc() are used without being checked in this function.
 */
void update_io(GapIO *io, tg_rec cnum, MALIGN *malign, Array indels) {
    CONTIGL *cl;
    tg_rec rnum;
    range_t r, *r_out;
    bin_index_t *bin;
    contig_t *c = cache_search(io, GT_Contig, cnum);
    size_t i, nindel;

    /* Paired with the cache_decr(io, c) at the end of the function */
    cache_incr(io, c);

    /*
     * To minimise number of data modifications we use a three step approach.
     *
     * Step 1: insert columns of pads, shifting reads as appropriate.
     * Step 2: edit sequence alignments as required, possibly involving
     *         moving sequences and/or adding and removing pads.
     * Step 3: remove columns of entire pads.
     *
     * This means that when we introduce a column of pads we don't have
     * to make edits to every single read position down stream, and can
     * instead make use of the optimised recursive bin functions to do this
     * for us.
     */

    /* Step 1: make indels */
    nindel = ArrayMax(indels);
    for (i = 0; i < nindel; i++) {
	con_indel_t *id = arrp(con_indel_t, indels, i);
	int j;

	if (id->size > 0) {
	    /* Insertion: id->size pad characters ('*') at id->pos+1 */
	    contig_insert_bases(io, &c, id->pos+1, '*', -1, id->size);
	} else {
	    /* Deletion: remove -id->size pads, one call each, at id->pos+1 */
	    for (j = 0; j < -id->size; j++) {
		contig_delete_pad(io, &c, id->pos+1);
	    }
	}
    }

    /* Step 2: edit alignments */
    for (cl = malign->contigl; cl; cl = cl->next) {
	seq_t *s, *sorig;
	int len, update_range = 0;
	int shift;

	rnum = cl->id;
	
	/*
	 * sorig is the cached sequence (paired cache_incr/cache_decr around
	 * this iteration); s is a private copy from dup_seq() that is edited
	 * freely and released with free(s) further down.
	 */
	sorig = cache_search(io, GT_Seq, rnum);
	cache_incr(io, sorig);
	s = dup_seq(sorig);
	if (cl->mseg->comp)
	    complement_seq_t(s);

	/* Length of the region between the left and right clip points */
	len = s->right - s->left + 1;

	/* Check if sequence has changed. If so assign a new one */
	if (cl->mseg->length != len ||
	    memcmp(s->seq + s->left-1, cl->mseg->seq, cl->mseg->length) != 0) {
	    /*
	     * New total length = left cutoff (s->left-1)
	     *                  + right cutoff (ABS(s->len) - s->right)
	     *                  + the newly aligned portion.
	     */
	    int newlen = s->left-1 + ABS(s->len) - s->right + cl->mseg->length;
	    /* NOTE: this i shadows the size_t i of the enclosing function */
	    int i, j, np;
	    /* NOTE(review): allocated with malloc() but released with xfree() */
	    char   *newseq  = malloc(newlen+1);
	    int8_t *newconf = malloc(newlen+1);

	    /* Build new seq/conf arrays */
	    /* Left cutoff data is carried over unchanged */
	    memcpy(newseq,  s->seq,  s->left-1);
	    memcpy(newconf, s->conf, s->left-1);

	    /* Aligned portion comes from the alignment; conf is filled below */
	    memcpy(&newseq[s->left-1], cl->mseg->seq, cl->mseg->length);

	    /*
	     * Step through both old and new sequences working out how
	     * they differ. This will (*should*) be entire pad movements.
	     * i = index to old seq
	     * j = index to new seq
	     * np = number of pads added minus removed from old seq.
	     */
	    np = 0;
	    for (i =j =s->left-1;
		 i < ABS(s->len) && j < s->left-1 + cl->mseg->length;
		 ) {
		/* Bases match */
		if (toupper(newseq[j]) == toupper(s->seq[i]) ||
		    (s->seq[i] == '.' && newseq[j] == 'N')) {
		    /* Keep the letter case of the old base and its confidence */
		    if (isupper(s->seq[i]))
			newseq[j] = toupper(newseq[j]);
		    else
			newseq[j] = tolower(newseq[j]);
		    newconf[j] = s->conf[i];
		    i++, j++;
		    continue;
		}

		/* Pad removed */
		if (s->seq[i] == '*') {
		    /* i is advanced first, so i+np is evaluated post-increment */
		    i++;
		    tag_shift_for_delete(io, cnum, rnum, cl->mseg->offset,
					 cl->mseg->length, i+np--,
					 s->bin);
		    /*
		    if (io_length(io, rnum) < 0) {
			tag_shift_for_delete(io, rnum, r.length - i + 1);
		    } else {
			tag_shift_for_delete(io, rnum, i+np--);
		    }
		    */
		    continue;
		}

		/* Pad created */
		if (newseq[j] == '*') {
		    int k;
		    /* Default to 0 when no non-pad neighbour is found */
		    int ql = 0, qr = 0;
		    /* Nearest non-pad base to the left in the old sequence */
		    for (k = i-1; k >= 0; k--) {
			if (s->seq[k] != '*') {
			    ql = s->conf[k];
			    break;
			}
		    }
		    /* Nearest non-pad base to the right, up to the right clip */
		    for (k = i+1; k < s->right; k++) {
			if (s->seq[k] != '*') {
			    qr = s->conf[k];
			    break;
			}
		    }
		    newconf[j] = MIN(ql, qr); /* min conf of neighbours */
		    j++;
		    tag_shift_for_insert(io, cnum, rnum, cl->mseg->offset,
					 cl->mseg->length, i+ ++np,
					 s->bin);
		    /*
		    if (io_length(io, rnum) < 0) {
			tag_shift_for_insert(io, rnum, r.length - i + 1);
		    } else {
			tag_shift_for_insert(io, rnum, i+ ++np);
		    }
		    */
		    continue;
		}

		/* Neither a match nor a pad change: treated as fatal */
		fprintf(stderr, "Alignment introduced non-pad character");
		abort();
	    }

	    /* Pads previously at the end of the reading & now removed */
	    while (i < s->right) {
		if (s->seq[i] == '*') {
		    i++;
		    tag_shift_for_delete(io, cnum, rnum, cl->mseg->offset,
					 cl->mseg->length, i+np--,
					 s->bin);
		    /*
		    if (io_length(io, rnum) < 0) {
			tag_shift_for_delete(io, rnum, r.length - i + 1);
		    } else {
			tag_shift_for_delete(io, rnum, i+np--);
		    }
		    */
		} else {
		    /* Error: clipped data that wasn't a pad */
		    abort();
		}
	    }

	    /* Should only be pads remaining in newseq, if anything */
	    /* The right clip point is set before these trailing pads */
	    s->right = j;
	    for (; j < s->left-1 + cl->mseg->length; j++) {
		if (newseq[j] != '*') {
		    fprintf(stderr, "Alignment introduced non-pad character");
		    abort();
		}
		newconf[j] = 0;
	    }

	    /* Append on the right hand cutoff data */
	    for (; i < ABS(s->len); i++, j++) {
		newseq[j]  = s->seq[i];
		newconf[j] = s->conf[i];
	    }
	    /* Sanity check: we must have filled exactly newlen entries */
	    if (j != newlen) {
		abort();
	    }

	    /* Write it back out */
	    /* Copy newseq/newconf into seq_t */

	    /*
	     * s only borrows newseq/newconf here; both buffers are released
	     * at the end of this branch once copied into sorig.
	     */
	    s->seq = newseq;
	    s->conf = newconf;
	    update_range = 0;
	    if (ABS(s->len) != j) {
		/* Length change implies updating the range array too */
		/* The sign of s->len is preserved */
		s->len = s->len >= 0 ? j : -j;
		update_range = 1;
	    }

	    /* Undo the complement applied to the working copy above */
	    if (cl->mseg->comp)
		complement_seq_t(s);

	    /* The memcpy trashes the block pointer, so special care needed */
	    {
		sorig = cache_rw(io, sorig);
		void *blk = sorig->block;
		memcpy(sorig, s, sizeof(seq_t)); 
		sorig->block = blk;
	    }

	    /* Only resize the cached item when the sequence length changed */
	    if (update_range)
		sorig = cache_item_resize(sorig, sizeof(*sorig) +
					  sequence_extra_len(sorig));

	    /*
	     * After the struct memcpy, sorig's pointer fields hold s's
	     * values; presumably sequence_reset_ptr() re-aims them at
	     * sorig's own storage before the copies below - TODO confirm.
	     */
	    sequence_reset_ptr(sorig);

	    if (s->name)
		memcpy(sorig->name,       s->name,       s->name_len+1);
	    if (s->trace_name)
		memcpy(sorig->trace_name, s->trace_name, s->trace_name_len+1);
	    if (s->alignment)
		memcpy(sorig->alignment,  s->alignment,  s->alignment_len+1);
	    memcpy(sorig->seq,  s->seq,  ABS(s->len));
	    memcpy(sorig->conf, s->conf, ABS(s->len));

	    xfree(newconf);
	    xfree(newseq);
	}

	/*
	 * Work out where the sequence currently starts once the cutoff on
	 * the relevant side is skipped, and flag a range update if that
	 * differs from the position given by the alignment.
	 */
	{
	    int st, en, or;
	    sequence_get_position(io, s->rec, NULL, &st, &en, &or);
	    if (or ^ (sorig->len < 0)) {
		shift = ABS(sorig->len) - sorig->right;
	    } else {
		shift = sorig->left-1;
	    }
	    st += shift;
	    if (st != cl->mseg->offset+1) {
		update_range = 1;
	    }
	}

	/* Done with the working copy; s is re-pointed at sorig below */
	free(s);

	if (update_range) {
	    int bin_changed = 0;

	    /* Get old range and pair data */
	    s = sorig;
	    bin = cache_search(io, GT_Bin, s->bin);
	    r = *arrp(range_t, bin->rng, s->bin_index);
	    assert(r.rec == s->rec);

	    /* Update range, tedious and slow way */
	    /* Remove the item, then re-add it with the new start/end */
	    bin_remove_item(io, &c, GT_Seq, s->rec);
	    r.start = cl->mseg->offset + 1 - shift;
	    r.end   = r.start + ABS(s->len) - 1;
	    bin = bin_add_range(io, &c, &r, &r_out, NULL, 0);

	    /* Check if the new bin has a different complemented status too */
	    if (s->bin != bin->rec) {
		int old_comp = bin_get_orient(io, s->bin);
		int new_comp = bin_get_orient(io, bin->rec);

		if (new_comp != old_comp) {
		    /* Flip the sign of len and toggle the complemented flag */
		    //int tmp;
		    s = cache_rw(io, s);
		    s->len *= -1;
		    s->flags ^= SEQ_COMPLEMENTED;
		    //tmp = s->left;
		    //s->left  = ABS(s->len) - (s->right-1);
		    //s->right = ABS(s->len) - (tmp-1);
		}

		bin_changed = 1;
	    }
	
	    /* Update seq bin & bin_index fields */
	    s = cache_rw(io, s);
	    s->bin = bin->rec;
	    /* Index of the new range entry within the bin's range array */
	    s->bin_index = r_out - ArrayBase(range_t, bin->rng);

	    /* A failure here is only reported as a warning, not propagated */
	    if (bin_changed) {
		if (-1 == sequence_fix_anno_bins(io, &s)) {
		    verror(ERR_WARN, "update_io",
			   "sequence_fix_anno_bins() failure");
		}
	    }
	}

	/*
	 * NOTE(review): sorig may have been reassigned by cache_rw() or
	 * cache_item_resize() since the matching cache_incr(); whether the
	 * count carries over to the new pointer depends on those helpers.
	 */
	cache_decr(io, sorig);
    }

    /* Step 3 (remove pad columns) done in calling function. */

    cache_decr(io, c);
}