/* * Sets a scaffold name. * * Returns 0 on success * -1 on failure */ int scaffold_set_name(GapIO *io, scaffold_t **f, char *name) { scaffold_t *n; GapIO *iob = gio_base(io); if (!(n = cache_rw(io, *f))) return -1; /* Delete old name */ if (n->name) { tg_rec r = iob->iface->scaffold.index_del(iob->dbh, n->name, n->rec); if (r != -1 && r != io->db->scaffold_name_index) { io->db = cache_rw(io, io->db); io->db->scaffold_name_index = r; } } if (NULL == (n = cache_item_resize(n, sizeof(*n) + strlen(name)+1))) return -1; *f = n; /* Add new name */ n->name = (char *)(&n->data); strcpy(n->name, name); if (*name) { tg_rec r = iob->iface->scaffold.index_add(iob->dbh, name, n->rec); if (r != -1 && r != io->db->scaffold_name_index) { io->db = cache_rw(io, io->db); io->db->scaffold_name_index = r; } } return 0; }
/*
 * Sets the comment for an annotation element.
 *
 * The cached element is only resized when the new comment is longer than
 * the one currently stored; shorter comments are written over the existing
 * storage. A NULL or empty 'comment' clears any existing comment text.
 * On success *e is updated to point at the (possibly relocated) writable
 * element.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int anno_ele_set_comment(GapIO *io, anno_ele_t **e, char *comment) {
    anno_ele_t *ae;
    size_t clen;

    if (!(ae = cache_rw(io, *e)))
        return -1;

    clen = comment ? strlen(comment) : 0;
    if (clen > (ae->comment ? strlen(ae->comment) : 0)) {
        /* Grow the item; the comment lives inline, right after the struct. */
        anno_ele_t *grown = cache_item_resize(ae, sizeof(*ae) + clen+1);
        if (!grown)
            return -1;

        ae = grown;
        ae->comment = (char *)&ae->data;
    }

    if (clen) {
        strcpy(ae->comment, comment);
    } else if (ae->comment) {
        /* Empty/NULL comment: truncate so the old text does not linger. */
        ae->comment[0] = '\0';
    }

    *e = ae;

    return 0;
}
/*
 * Reports an unrecoverable condition inside update_io() and aborts, in
 * keeping with the abort() calls that function already uses for
 * inconsistencies it cannot recover from.
 */
static void update_io_fatal(const char *msg) {
    fprintf(stderr, "update_io: %s\n", msg);
    abort();
}

/*
 * Takes a multiple alignment and updates the on-disk data structures to
 * match. This needs to correct confidence values, original positions and
 * tags too.
 *
 * io      Database handle.
 * cnum    Record number of the contig being realigned.
 * malign  The new multiple alignment; each CONTIGL entry supplies a
 *         sequence record (cl->id) plus its aligned bases, length, offset
 *         and complement flag (cl->mseg).
 * indels  Array of con_indel_t; positive sizes insert pad columns,
 *         non-positive sizes delete pads one at a time.
 *
 * There is no return value. If the contig cannot be loaded a warning is
 * issued and nothing is changed. Failures after editing has started
 * (allocation failures, cache failures, or an alignment that differs from
 * the stored sequence by anything other than pads) terminate the process
 * via abort().
 */
void update_io(GapIO *io, tg_rec cnum, MALIGN *malign, Array indels) {
    CONTIGL *cl;
    tg_rec rnum;
    range_t r, *r_out;
    bin_index_t *bin;
    contig_t *c = cache_search(io, GT_Contig, cnum);
    size_t i, nindel;

    /* Nothing has been modified yet, so it is safe to simply give up. */
    if (!c) {
        verror(ERR_WARN, "update_io", "failed to load contig");
        return;
    }

    cache_incr(io, c);

    /*
     * To minimise number of data modifications we use a three step approach.
     *
     * Step 1: insert columns of pads, shifting reads as appropriate.
     * Step 2: edit sequence alignments as required, possibly involving
     *         moving sequences and/or adding and removing pads.
     * Step 3: remove columns of entire pads.
     *
     * This means that when we introduce a column of pads we don't have
     * to make edits to every single read position down stream, and can
     * instead make use of the optimised recursive bin functions to do this
     * for us.
     */

    /* Step 1: make indels */
    nindel = ArrayMax(indels);
    for (i = 0; i < nindel; i++) {
        con_indel_t *id = arrp(con_indel_t, indels, i);
        int j;

        if (id->size > 0) {
            contig_insert_bases(io, &c, id->pos+1, '*', -1, id->size);
        } else {
            for (j = 0; j < -id->size; j++) {
                contig_delete_pad(io, &c, id->pos+1);
            }
        }
    }

    /* Step 2: edit alignments */
    for (cl = malign->contigl; cl; cl = cl->next) {
        seq_t *s, *sorig;
        int len, update_range = 0;
        int shift;

        rnum = cl->id;

        sorig = cache_search(io, GT_Seq, rnum);
        if (!sorig)
            update_io_fatal("failed to load sequence");
        cache_incr(io, sorig);

        /* Work on a private copy, oriented the same way as the alignment. */
        s = dup_seq(sorig);
        if (!s)
            update_io_fatal("failed to duplicate sequence");
        if (cl->mseg->comp)
            complement_seq_t(s);

        len = s->right - s->left + 1;

        /* Check if sequence has changed. If so assign a new one */
        if (cl->mseg->length != len ||
            memcmp(s->seq + s->left-1, cl->mseg->seq, cl->mseg->length) != 0) {
            /* left cutoff + right cutoff + newly aligned portion */
            int newlen = s->left-1 + ABS(s->len) - s->right + cl->mseg->length;
            int oi, j, np;
            char *newseq = malloc(newlen+1);
            int8_t *newconf = malloc(newlen+1);

            if (!newseq || !newconf)
                update_io_fatal("out of memory");

            /* Build new seq/conf arrays */
            memcpy(newseq, s->seq, s->left-1);
            memcpy(newconf, s->conf, s->left-1);
            memcpy(&newseq[s->left-1], cl->mseg->seq, cl->mseg->length);

            /*
             * Step through both old and new sequences working out how
             * they differ. This will (*should*) be entire pad movements.
             * oi = index to old seq
             * j  = index to new seq
             * np = number of pads added minus removed from old seq.
             *
             * Characters are cast to unsigned char before being handed to
             * the <ctype.h> functions, which are undefined for negative
             * values other than EOF.
             */
            np = 0;
            for (oi = j = s->left-1;
                 oi < ABS(s->len) && j < s->left-1 + cl->mseg->length; ) {
                /* Bases match */
                if (toupper((unsigned char)newseq[j]) ==
                        toupper((unsigned char)s->seq[oi]) ||
                    (s->seq[oi] == '.' && newseq[j] == 'N')) {
                    /* Keep the case used by the stored sequence. */
                    if (isupper((unsigned char)s->seq[oi]))
                        newseq[j] = toupper((unsigned char)newseq[j]);
                    else
                        newseq[j] = tolower((unsigned char)newseq[j]);
                    newconf[j] = s->conf[oi];
                    oi++, j++;
                    continue;
                }

                /* Pad removed */
                if (s->seq[oi] == '*') {
                    oi++;
                    tag_shift_for_delete(io, cnum, rnum, cl->mseg->offset,
                                         cl->mseg->length, oi+np--, s->bin);
                    continue;
                }

                /* Pad created */
                if (newseq[j] == '*') {
                    int k;
                    int ql = 0, qr = 0;

                    /* Nearest non-pad confidence to the left... */
                    for (k = oi-1; k >= 0; k--) {
                        if (s->seq[k] != '*') {
                            ql = s->conf[k];
                            break;
                        }
                    }
                    /*
                     * ...and to the right.
                     * NOTE(review): this scan starts at oi+1 although
                     * s->seq[oi] is the old base following the new pad;
                     * confirm whether starting at oi was intended.
                     */
                    for (k = oi+1; k < s->right; k++) {
                        if (s->seq[k] != '*') {
                            qr = s->conf[k];
                            break;
                        }
                    }
                    newconf[j] = MIN(ql, qr); /* min conf of neighbours */
                    j++;
                    tag_shift_for_insert(io, cnum, rnum, cl->mseg->offset,
                                         cl->mseg->length, oi+ ++np, s->bin);
                    continue;
                }

                fprintf(stderr, "Alignment introduced non-pad character");
                abort();
            }

            /* Pads previously at the end of the reading & now removed */
            while (oi < s->right) {
                if (s->seq[oi] == '*') {
                    oi++;
                    tag_shift_for_delete(io, cnum, rnum, cl->mseg->offset,
                                         cl->mseg->length, oi+np--, s->bin);
                } else {
                    /* Error: clipped data that wasn't a pad */
                    abort();
                }
            }

            /* Should only be pads remaining in newseq, if anything */
            s->right = j;
            for (; j < s->left-1 + cl->mseg->length; j++) {
                if (newseq[j] != '*') {
                    fprintf(stderr, "Alignment introduced non-pad character");
                    abort();
                }
                newconf[j] = 0;
            }

            /* Append on the right hand cutoff data */
            for (; oi < ABS(s->len); oi++, j++) {
                newseq[j] = s->seq[oi];
                newconf[j] = s->conf[oi];
            }

            if (j != newlen) {
                abort();
            }

            /* Write it back out */

            /* Copy newseq/newconf into seq_t */
            s->seq  = newseq;
            s->conf = newconf;

            update_range = 0;
            if (ABS(s->len) != j) {
                /* Length change implies updating the range array too */
                s->len = s->len >= 0 ? j : -j;
                update_range = 1;
            }

            /* Restore the stored orientation before writing back. */
            if (cl->mseg->comp)
                complement_seq_t(s);

            /* The memcpy trashes the block pointer, so special care needed */
            {
                seq_t *rw = cache_rw(io, sorig);
                void *blk;

                if (!rw)
                    update_io_fatal("failed to make sequence writable");
                sorig = rw;

                blk = sorig->block;
                memcpy(sorig, s, sizeof(seq_t));
                sorig->block = blk;
            }

            if (update_range) {
                sorig = cache_item_resize(sorig, sizeof(*sorig) +
                                          sequence_extra_len(sorig));
                if (!sorig)
                    update_io_fatal("failed to resize sequence");
            }

            /*
             * The struct copy above left sorig's pointers aimed at the
             * temporary copy's buffers; reset them before copying the
             * variable length data across.
             */
            sequence_reset_ptr(sorig);

            if (s->name)
                memcpy(sorig->name, s->name, s->name_len+1);
            if (s->trace_name)
                memcpy(sorig->trace_name, s->trace_name, s->trace_name_len+1);
            if (s->alignment)
                memcpy(sorig->alignment, s->alignment, s->alignment_len+1);
            memcpy(sorig->seq, s->seq, ABS(s->len));
            memcpy(sorig->conf, s->conf, ABS(s->len));

            /* Both buffers came from malloc(), so release with free(). */
            free(newconf);
            free(newseq);
        }

        /* Has the start of the used (unclipped) portion moved? */
        {
            int st, en, or;

            sequence_get_position(io, s->rec, NULL, &st, &en, &or);
            if (or ^ (sorig->len < 0)) {
                shift = ABS(sorig->len) - sorig->right;
            } else {
                shift = sorig->left-1;
            }
            st += shift;
            if (st != cl->mseg->offset+1) {
                update_range = 1;
            }
        }

        free(s);

        if (update_range) {
            int bin_changed = 0;

            /* Get old range and pair data */
            s = sorig;
            bin = cache_search(io, GT_Bin, s->bin);
            if (!bin)
                update_io_fatal("failed to load bin");
            r = *arrp(range_t, bin->rng, s->bin_index);
            assert(r.rec == s->rec);

            /* Update range, tedious and slow way */
            bin_remove_item(io, &c, GT_Seq, s->rec);
            r.start = cl->mseg->offset + 1 - shift;
            r.end   = r.start + ABS(s->len) - 1;
            bin = bin_add_range(io, &c, &r, &r_out, NULL, 0);
            if (!bin)
                update_io_fatal("failed to add range to bin");

            /* Check if the new bin has a different complemented status too */
            if (s->bin != bin->rec) {
                int old_comp = bin_get_orient(io, s->bin);
                int new_comp = bin_get_orient(io, bin->rec);

                if (new_comp != old_comp) {
                    s = cache_rw(io, s);
                    if (!s)
                        update_io_fatal("failed to make sequence writable");
                    s->len *= -1;
                    s->flags ^= SEQ_COMPLEMENTED;
                    /*
                     * NOTE(review): an earlier, disabled variant also
                     * mirrored s->left / s->right around the sequence
                     * length here; confirm that leaving them untouched
                     * is intended.
                     */
                }

                bin_changed = 1;
            }

            /* Update seq bin & bin_index fields */
            s = cache_rw(io, s);
            if (!s)
                update_io_fatal("failed to make sequence writable");
            s->bin = bin->rec;
            s->bin_index = r_out - ArrayBase(range_t, bin->rng);

            if (bin_changed) {
                if (-1 == sequence_fix_anno_bins(io, &s)) {
                    verror(ERR_WARN, "update_io",
                           "sequence_fix_anno_bins() failure");
                }
            }
        }

        cache_decr(io, sorig);
    }

    /* Step 3 (remove pad columns) done in calling function. */

    cache_decr(io, c);
}