/*
 * Renames a scaffold and keeps the scaffold name index in step.
 *
 * The scaffold is first obtained through cache_rw(). Any existing name is
 * removed from the index, the item is resized so that the new name fits
 * in the storage starting at its 'data' member, and a non-empty new name
 * is then added to the index. Whenever the index call hands back a record
 * that differs from io->db->scaffold_name_index, the database record is
 * updated to match.
 *
 * io    Database handle
 * f     In/out scaffold pointer; overwritten with the resized item
 * name  New name (must not be NULL; it is passed to strlen())
 *
 * Returns 0 on success
 *        -1 on failure (cache_rw or cache_item_resize returned NULL)
 */
int scaffold_set_name(GapIO *io, scaffold_t **f, char *name) {
    GapIO *base = gio_base(io);
    scaffold_t *sc = cache_rw(io, *f);
    tg_rec idx;

    if (!sc)
        return -1;

    /* Drop the previous name from the index */
    if (sc->name) {
        idx = base->iface->scaffold.index_del(base->dbh, sc->name, sc->rec);
        if (idx != -1 && idx != io->db->scaffold_name_index) {
            io->db = cache_rw(io, io->db);
            io->db->scaffold_name_index = idx;
        }
    }

    /* Make room for the new name behind the fixed part of the struct */
    sc = cache_item_resize(sc, sizeof(*sc) + strlen(name)+1);
    if (NULL == sc)
        return -1;

    *f = sc;

    sc->name = (char *)(&sc->data);
    strcpy(sc->name, name);

    /* Empty names are stored but never indexed */
    if (!*name)
        return 0;

    idx = base->iface->scaffold.index_add(base->dbh, name, sc->rec);
    if (idx != -1 && idx != io->db->scaffold_name_index) {
        io->db = cache_rw(io, io->db);
        io->db->scaffold_name_index = idx;
    }

    return 0;
}
/*
 * Creates a new scaffold, optionally named.
 *
 * A scaffold record is created from a zeroed template, named via
 * scaffold_set_name() when 'name' is non-NULL, and its record number is
 * appended to the io->scaffold order array (bumping io->db->Nscaffolds).
 * Named scaffolds are also added to the "new_scaffolds" list.
 *
 * io    Database handle
 * name  Scaffold name, or NULL for an unnamed scaffold
 *
 * Returns scaffold pointer on success.
 *         NULL on failure (no scaffold array in the database, the new
 *         record could not be fetched or made writable, or naming failed)
 */
scaffold_t *scaffold_new(GapIO *io, char *name) {
    tg_rec rec;
    scaffold_t *f, init_f;

    if (!io->db->scaffold)
        return NULL;

    memset(&init_f, 0, sizeof(scaffold_t));
    init_f.name = name;

    /* Allocate our scaffold */
    rec = cache_item_create(io, GT_Scaffold, &init_f);

    /* Initialise it; bail out rather than dereference a failed lookup */
    f = (scaffold_t *)cache_search(io, GT_Scaffold, rec);
    if (!f)
        return NULL;
    if (!(f = cache_rw(io, f)))
        return NULL;

    if (name) {
        /* scaffold_set_name() reports -1 on failure; honour it */
        if (0 != scaffold_set_name(io, &f, name))
            return NULL;
    } else {
        f->name = NULL;
    }

    /* Add it to the scaffold order too */
    io->scaffold = cache_rw(io, io->scaffold);
    io->db = cache_rw(io, io->db);
    ARR(tg_rec, io->scaffold, io->db->Nscaffolds++) = rec;

    /* Add to the new scaffolds list */
    if (name)
        add_to_list("new_scaffolds", name);

    return f;
}
/*
 * Given ranges contained within a bin this makes sure that all sequences
 * and annotations referred to in these ranges have their parent listed as
 * this bin.
 *
 * Unused range slots are skipped. Annotation elements only have their
 * 'bin' field corrected; sequences additionally have 'bin_index' set to
 * the slot number within bin->rng. Items that already point at this bin
 * are left untouched.
 *
 * io   Database handle
 * bin  Bin whose range array is walked
 *
 * Returns 0 on success
 *        -1 on failure (an item could not be fetched or made writable)
 */
static int break_contig_reparent_seqs(GapIO *io, bin_index_t *bin) {
    int i, nr = bin->rng ? ArrayMax(bin->rng) : 0;

    for (i = 0; i < nr; i++) {
        range_t *r = arrp(range_t, bin->rng, i);

        if (r->flags & GRANGE_FLAG_UNUSED)
            continue;

        if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISANNO) {
            anno_ele_t *a = (anno_ele_t *)cache_search(io, GT_AnnoEle, r->rec);

            if (!a)
                return -1;

            if (a->bin != bin->rec) {
                if (!(a = cache_rw(io, a)))
                    return -1;
                a->bin = bin->rec;
            }
        } else {
            seq_t *seq = (seq_t *)cache_search(io, GT_Seq, r->rec);

            if (!seq)
                return -1;

            if (seq->bin != bin->rec) {
                if (!(seq = cache_rw(io, seq)))
                    return -1;
                seq->bin = bin->rec;
                seq->bin_index = i;
            }
        }
    }

    return 0;
}
/*
 * Adds a contig to a scaffold array.
 * Gap size, type and evidence refer to the gap between this and the
 * "previous" contig - ie the last in the scaffold. More complex
 * scaffold manipulations will be handled elsewhere.
 *
 * Set these fields to 0 if you do not know them.
 *
 * If the contig already records a scaffold it is first removed from it
 * with scaffold_remove(). If the contig is then still found in the target
 * scaffold's member array the call is a no-op. The gap size is forced to 0
 * for the first member of a scaffold.
 *
 * io        Database handle
 * scaffold  Record number of the scaffold to append to
 * contig    Record number of the contig being added
 * gap_size  Gap to the previous member
 * gap_type  Gap type
 * evidence  Evidence code for the gap
 *
 * Returns 0 on success
 *        -1 on failure (contig or scaffold could not be fetched or made
 *           writable, or the member array could not be extended)
 */
int scaffold_add(GapIO *io, tg_rec scaffold, tg_rec contig,
                 int gap_size, int gap_type, int evidence) {
    scaffold_t *f;
    contig_t *c;
    scaffold_member_t *m;
    int i;

    /* Check if this contig is in a scaffold, if so remove now */
    if (!(c = cache_search(io, GT_Contig, contig)))
        return -1;
    if (c->scaffold)
        scaffold_remove(io, c->scaffold, contig);

    if (!(f = cache_search(io, GT_Scaffold, scaffold)))
        return -1;

    /* Check if it already exists */
    for (i = 0; i < ArrayMax(f->contig); i++) {
        m = arrp(scaffold_member_t, f->contig, i);
        if (m->rec == contig)
            return 0;
    }

    /* Append */
    if (!(f = cache_rw(io, f)))
        return -1;

    /* Referencing the element one past the end extends the array */
    m = ArrayRef(f->contig, ArrayMax(f->contig));
    if (!m)
        return -1;

    m->rec = contig;
    m->gap_size = ArrayMax(f->contig) > 1 ? gap_size : 0;
    m->gap_type = gap_type;
    m->evidence = evidence;

    /*
     * Update the contig record too. It is looked up again rather than
     * reusing 'c' from before the scaffold_remove() call above.
     */
    c = cache_search(io, GT_Contig, contig);
    if (!c || !(c = cache_rw(io, c)))
        return -1;
    c->scaffold = scaffold;

#if 0
    /* Add a scaffold link to the contig graph too */
    if (ArrayMax(f->contig) >= 2) {
        m = arrp(scaffold_member_t, f->contig, ArrayMax(f->contig)-2);
        contig_link_t lnk;

        lnk.rec1 = contig;
        lnk.rec2 = m->rec;
        /* Best guess */
        lnk.pos1 = 0;
        lnk.end1 = 1;
        lnk.pos2 = 0;
        lnk.end2 = 0;
        lnk.orientation = 0;
        lnk.size = 100;
        lnk.type = CLINK_TYPE_SCAFFOLD;
        lnk.score = 0;

        contig_add_link(io, &lnk);
    }
#endif

    return 0;
}
/*
 * Adjusts the annotations attached to sequence 'srec' after a single base
 * (pad) has been deleted from it.
 *
 * Annotation ranges in contig 'crec' between start+pos and end are
 * iterated. Each one whose pair_rec is 'srec' is removed from its bin and
 * re-added to bin 'brec' with its end reduced by one, and its start
 * reduced by one too when it lies beyond start+pos. An annotation whose
 * end falls below its start after this is deallocated instead of being
 * re-added.
 *
 * io     Database handle
 * crec   Contig record
 * srec   Sequence record the tags must belong to
 * start  Offset added to 'pos' to give the iteration start
 * end    Iteration end
 * pos    Position of the deleted base, relative to 'start'
 * brec   Bin record the shifted tags are added back to
 */
static void tag_shift_for_delete(GapIO *io, tg_rec crec, tg_rec srec,
                                 int start, int end, int pos, tg_rec brec) {
    contig_iterator *ci;
    rangec_t *r;
    contig_t *c = cache_search(io, GT_Contig, crec);

    if (!c)
        return;

    cache_incr(io, c);

    ci = contig_iter_new_by_type(io, crec, 0, CITER_FIRST | CITER_ISTART,
                                 start+pos, end, GRANGE_FLAG_ISANNO);
    if (!ci) {
        cache_decr(io, c);
        return;
    }

    while ((r = contig_iter_next(io, ci))) {
        range_t r2, *r_out;
        anno_ele_t *a;
        bin_index_t *bin;

        if (r->pair_rec != srec)
            continue;

        bin_remove_item(io, &c, GT_AnnoEle, r->rec);
        r2.start    = (r->start > start+pos) ? r->start-1 : r->start;
        r2.end      = r->end-1;
        r2.mqual    = r->mqual;
        r2.rec      = r->rec;
        r2.pair_rec = r->pair_rec;
        r2.flags    = r->flags;

        if (r2.end < r2.start) {
            /* Tag entirely removed now, it must have been on a pad */
            a = cache_search(io, GT_AnnoEle, r->rec);
            if (a && (a = cache_rw(io, a)))
                cache_deallocate(io, a);
            continue;
        }

        bin = bin_add_to_range(io, &c, brec, &r2, &r_out, NULL, 0);
        if (!bin) {
            /* Cannot record a new bin for the tag; skip it, don't crash */
            verror(ERR_WARN, "tag_shift_for_delete",
                   "bin_add_to_range returned NULL");
            continue;
        }

        a = cache_search(io, GT_AnnoEle, r->rec);
        if (!a)
            continue;

        /* Only the bin record is tracked; no per-tag range index is kept */
        if (a->bin != bin->rec) {
            /* Annotation moved bins */
            if (!(a = cache_rw(io, a)))
                continue;
            a->bin = bin->rec;
        }
    }

    cache_decr(io, c);
    contig_iter_del(ci);
}
/*
 * Sets the annotation type, passed in as a string but held in a 4-byte int.
 * This also attempts to set the cached copy of the type held within the
 * bin range array.
 *
 * Only the first four characters of 'str' are used. When the annotation
 * has a bin, that bin's range array is searched linearly for the slot
 * whose record matches the annotation, and the slot's 'mqual' field
 * (which holds the cached type) is updated.
 *
 * io   Database handle
 * e    In/out annotation pointer; replaced by the cache_rw() result as
 *      soon as that succeeds, including on later failure paths
 * str  Type string
 *
 * Returns 0 on success
 *        -1 on failure (item or bin not writable/found, or no range slot
 *           refers to this annotation)
 */
int anno_ele_set_type(GapIO *io, anno_ele_t **e, char *str) {
    int type;
    char stype[5];
    anno_ele_t *ae;

    if (!(ae = cache_rw(io, *e)))
        return -1;

    /*
     * Publish the writable pointer immediately, as anno_ele_set_direction
     * does, so the caller is never left with the pre-cache_rw pointer when
     * one of the checks below fails after the annotation was modified.
     */
    *e = ae;

    /* Get integer type; stype is pre-zeroed so it stays NUL terminated */
    memset(stype, 0, 5);
    strncpy(stype, str, 4);
    type = str2type(stype);

    /* Update annotation */
    ae->tag_type = type;

    /* Also update range_t cached copy of type */
    if (ae->bin) {
        bin_index_t *bin = (bin_index_t *)cache_search(io, GT_Bin, ae->bin);
        range_t *r = NULL;
        int i, nranges;

        if (!bin)
            return -1;
        if (!(bin = cache_rw(io, bin)))
            return -1;

        /*
         * Find the index into the bin range.
         * FIXME: we should add a bin_index element, as seen in seq_t,
         * to avoid the brute force loop. This doesn't have to be
         * permanently stored - a cached copy would suffice.
         */
        nranges = bin->rng ? ArrayMax(bin->rng) : 0;
        for (i = 0; i < nranges; i++) {
            r = arrp(range_t, bin->rng, i);
            if (r->flags & GRANGE_FLAG_UNUSED)
                continue;

            if (r->rec == ae->rec)
                break;
        }
        if (i == nranges)
            return -1;

        bin->flags |= BIN_RANGE_UPDATED;
        r->mqual = type;
    }

    return 0;
}
/*
 * Tidies up after break contig or disassemble readings, looking for now
 * redundant bins.
 *
 * The work is delegated to remove_empty_bins_r(), whose results are used
 * here as follows:
 * 1) A non-zero return destroys the whole contig.
 * 2) Otherwise, if the bin it reports in 'first' is not the contig's
 *    current root bin, 'first' is re-parented directly onto the contig
 *    (taking the position computed by bin_get_position()), unlinked from
 *    its former parent bin, and the old root's bin tree is destroyed.
 *
 * A contig with no root bin is left alone.
 */
static void remove_empty_bins(GapIO *io, tg_rec contig) {
    contig_t *c = cache_search(io, GT_Contig, contig);
    tg_rec first = 0;
    tg_rec old_parent, old_root, owner;
    bin_index_t *node;
    int offset;

    cache_incr(io, c);

    /* Nothing to tidy when the contig has no bins at all */
    if (!c->bin) {
        cache_decr(io, c);
        return;
    }

    if (remove_empty_bins_r(io, c->bin, &first)) {
        cache_decr(io, c);
        contig_destroy(io, contig);
        return;
    }

    /* Root bin is unchanged: done */
    if (first == c->bin) {
        cache_decr(io, c);
        return;
    }

    /* Cut out the offending waste */
    node = cache_search(io, GT_Bin, first);
    node = cache_rw(io, node);
    old_parent = node->parent;

    /* Find new bin offset */
    bin_get_position(io, node, &owner, &offset);
    assert(owner == contig);

    node->pos = offset;
    node->parent = contig;
    node->parent_type = GT_Contig;
    node->flags |= BIN_BIN_UPDATED;

    c = cache_rw(io, c);
    old_root = c->bin;
    c->bin = first;

    /* Detach the new root from the bin that used to own it */
    node = cache_search(io, GT_Bin, old_parent);
    node = cache_rw(io, node);
    if (node->child[0] == first)
        node->child[0] = 0;
    if (node->child[1] == first)
        node->child[1] = 0;

    /* Recursively remove the bin tree from the old root */
    bin_destroy_recurse(io, old_root);

    cache_decr(io, c);
}
/*
 * Stores 'value' in the track's flag field.
 *
 * The track is obtained through cache_rw() and *t is replaced by the
 * pointer it returns.
 *
 * Returns 0 on success
 *        -1 on failure (cache_rw returned NULL; *t is left untouched)
 */
int track_set_flag(GapIO *io, track_t **t, int value) {
    track_t *writable = cache_rw(io, *t);

    if (writable == NULL)
        return -1;

    writable->flag = value;
    *t = writable;

    return 0;
}
/*
 * Stores 'value' in the track's nitems field.
 *
 * The track is obtained through cache_rw() and *t is replaced by the
 * pointer it returns.
 *
 * Returns 0 on success
 *        -1 on failure (cache_rw returned NULL; *t is left untouched)
 */
int track_set_nitems(GapIO *io, track_t **t, int value) {
    track_t *writable = cache_rw(io, *t);

    if (writable == NULL)
        return -1;

    writable->nitems = value;
    *t = writable;

    return 0;
}
/*
 * Stores 'value' in the track's item_size field.
 *
 * The track is obtained through cache_rw() and *t is replaced by the
 * pointer it returns.
 *
 * Returns 0 on success
 *        -1 on failure (cache_rw returned NULL; *t is left untouched)
 */
int track_set_item_size(GapIO *io, track_t **t, int value) {
    track_t *writable = cache_rw(io, *t);

    if (writable == NULL)
        return -1;

    writable->item_size = value;
    *t = writable;

    return 0;
}
/*
 * Sets the annotation direction, one of ANNO_DIR_* macros (+,-,.,?)
 *
 * The annotation is obtained through cache_rw(); *e is replaced by the
 * pointer it returns before the direction is written.
 *
 * Returns 0 on success
 *        -1 on failure (cache_rw returned NULL; *e is left untouched)
 */
int anno_ele_set_direction(GapIO *io, anno_ele_t **e, char dir) {
    anno_ele_t *writable = cache_rw(io, *e);

    if (writable == NULL)
        return -1;

    *e = writable;
    writable->direction = dir;

    return 0;
}
/*
 * Replaces the track's data array with 'value'.
 *
 * Any array previously held by the track is destroyed with ArrayDestroy()
 * before 'value' is stored. Passing the array the track already holds is
 * a no-op for the data itself, rather than destroying it and leaving a
 * dangling pointer behind.
 *
 * io     Database handle
 * t      In/out track pointer; replaced by the cache_rw() result
 * value  New data array; stored as-is, not copied
 *
 * Returns 0 on success
 *        -1 on failure (cache_rw returned NULL; *t is left untouched)
 */
int track_set_data(GapIO *io, track_t **t, Array value) {
    track_t *n;

    if (!(n = cache_rw(io, *t)))
        return -1;

    /* Guard against self-assignment before freeing the old array */
    if (n->data && n->data != value)
        ArrayDestroy(n->data);
    n->data = value;
    *t = n;

    return 0;
}
/*
 * Removes a contig from a scaffold.
 *
 * The contig must currently record 'scaffold' as its scaffold; otherwise a
 * warning is issued and nothing is changed. On success the contig's
 * scaffold field is zeroed and its entry is removed from the scaffold's
 * member array, with the later members shuffled down by one.
 *
 * io        Database handle
 * scaffold  Scaffold record number
 * contig    Contig record number
 *
 * Returns 0 on success
 *        -1 on failure (records not found or not writable, or the contig
 *           is not a member of this scaffold)
 */
int scaffold_remove(GapIO *io, tg_rec scaffold, tg_rec contig) {
    scaffold_t *f;
    scaffold_member_t *m, *m2;
    contig_t *c;
    int i;

    c = cache_search(io, GT_Contig, contig);
    f = cache_search(io, GT_Scaffold, scaffold);
    if (!c || !f)
        return -1;

    if (c->scaffold != scaffold) {
        verror(ERR_WARN, "scaffold_remove", "Attempted to remove contig #%"
               PRIrec" from a scaffold #%"PRIrec" it is not a member of",
               contig, scaffold);
        return -1;
    }

    /* Make both records writable before touching either of them */
    if (!(c = cache_rw(io, c)))
        return -1;
    if (!(f = cache_rw(io, f)))
        return -1;

    c->scaffold = 0;

    for (i = 0; i < ArrayMax(f->contig); i++) {
        m = arrp(scaffold_member_t, f->contig, i);
        if (m->rec != contig)
            continue;

        /* Shuffle array down */
        for (i++; i < ArrayMax(f->contig); i++) {
            m2 = arrp(scaffold_member_t, f->contig, i);
            *m = *m2;
            m = m2;
        }
        ArrayMax(f->contig)--;

        /* The shuffle has consumed the rest of the array */
        break;
    }

    return 0;
}
/*
 * Creates an anno_ele as per anno_ele_new, but also adds it to an object
 * and creates the bin Range entry too.
 *
 * For a contig annotation start/end are used as given. For any other
 * object type the object's position is fetched with
 * sequence_get_position2() and its start is added to both start and end;
 * the range is then added to the bin reported for the object when that
 * is non-zero.
 *
 * io        Database handle
 * obj_type  GT_Contig, or another type (treated as a sequence)
 * obj_rec   Record number of the annotated object
 * anno_rec  Not used by this function
 * type      Annotation type (integer form)
 * comment   Annotation comment
 * start     Start position
 * end       End position
 * dir       Direction, passed through to anno_ele_new()
 *
 * Returns the new annotation's record number.
 *         -1 if the contig or the new annotation could not be fetched.
 *
 * NOTE(review): the result of sequence_get_position2() is not checked
 * here; confirm what it leaves in crec/st on failure.
 */
tg_rec anno_ele_add(GapIO *io, int obj_type, tg_rec obj_rec, tg_rec anno_rec,
                    int type, char *comment, int start, int end, char dir) {
    range_t r;
    anno_ele_t *e;
    contig_t *c;
    tg_rec crec;
    bin_index_t *bin;
    tg_rec seq_bin = 0;

    /* Find contig for obj_rec/obj_type */
    if (obj_type == GT_Contig) {
        crec = obj_rec;
    } else {
        int st, en;
        sequence_get_position2(io, obj_rec, &crec, &st, &en, NULL,
                               &seq_bin, NULL, NULL);

        start += st;
        end += st;
    }

    c = (contig_t *)cache_search(io, GT_Contig, crec);
    if (!c)
        return -1;
    cache_incr(io, c);

    r.start    = start;
    r.end      = end;
    r.flags    = GRANGE_FLAG_ISANNO;
    r.mqual    = type;
    r.pair_rec = obj_rec;

    if (GT_Seq == obj_type)
        r.flags |= GRANGE_FLAG_TAG_SEQ;

    r.rec = anno_ele_new(io, 0, obj_type, obj_rec, 0, type, dir, comment);
    e = (anno_ele_t *)cache_search(io, GT_AnnoEle, r.rec);
    if (!e || !(e = cache_rw(io, e))) {
        cache_decr(io, c);
        return -1;
    }

    if (seq_bin)
        bin = bin_add_to_range(io, &c, seq_bin, &r, NULL, NULL, 0);
    else
        bin = bin_add_range(io, &c, &r, NULL, NULL, 0);

    if (!bin)
        verror(ERR_FATAL, "anno_ele_add", "bin_add_to_range returned NULL");

    /* A failed bin insertion leaves the annotation with no bin (0) */
    e->bin = bin ? bin->rec : 0;

    cache_decr(io, c);
    return r.rec;
}
/*
 * Looks for redundant bins at the root containing no data and just a single
 * child.
 *
 * Starting at the contig's root bin, each bin that has no range array and
 * fewer than two children is skipped over by pointing the contig's root
 * at its remaining child (or 0 when it has none). The walk stops at the
 * first bin that holds a range array or has both children.
 *
 * FIXME: We need to compensate for bin position here. Hence this function
 * is not called for now.
 *
 * Returns 0 on success
 *        -1 on failure (the contig could not be made writable)
 */
int remove_redundant_bins(GapIO *io, contig_t *c) {
    tg_rec bnum;

    c = cache_rw(io, c);
    if (!c)
        return -1;

    bnum = c->bin;
    while (bnum) {
        bin_index_t *bin = get_bin(io, bnum);
        tg_rec next_root;

        /* Stop at the first bin that carries data or really branches */
        if (bin->rng)
            break;
        if (bin->child[0] && bin->child[1])
            break;

        /* Empty */
        next_root = bin->child[0] ? bin->child[0] : bin->child[1];
        c->bin = next_root;
        printf("Remove bin %"PRIrec"\n", bin->rec);
        bnum = c->bin;
    }

    return 0;
}
/*
 * Sets the comment for an annotation element.
 *
 * The item is only resized when the new comment is longer than the one
 * currently stored; a shorter comment is copied over the existing
 * storage. A NULL or empty comment clears any existing comment text
 * instead of leaving the old text in place.
 *
 * io       Database handle
 * e        In/out annotation pointer; replaced by the cache_rw() /
 *          cache_item_resize() result
 * comment  New comment, may be NULL
 *
 * Returns 0 on success
 *        -1 on failure (item not writable or resize failed)
 */
int anno_ele_set_comment(GapIO *io, anno_ele_t **e, char *comment) {
    anno_ele_t *ae;
    size_t clen;

    if (!(ae = cache_rw(io, *e)))
        return -1;
    *e = ae;

    clen = comment ? strlen(comment) : 0;
    if (clen > (ae->comment ? strlen(ae->comment) : 0)) {
        /* Grow the item so the comment fits in its 'data' area */
        ae = cache_item_resize(ae, sizeof(*ae) + clen+1);
        if (!ae)
            return -1;
        ae->comment = (char *)&ae->data;
    }

    if (clen)
        strcpy(ae->comment, comment);
    else if (ae->comment)
        ae->comment[0] = '\0'; /* empty comment: truncate the old one */

    *e = ae;

    return 0;
}
/*
 * Removes an anno_ele from the gap database.
 * FIXME: need to deallocate storage too. (See docs/TODO)
 *
 * The annotation's bin is searched for the in-use range slot whose record
 * matches the annotation. That slot is flagged unused and pushed onto the
 * bin's free list (its 'rec' field takes the previous rng_free value and
 * rng_free becomes the slot index), the bin's annotation count is reduced
 * by one, and the bin's used range is recomputed when the slot touched
 * either end of it.
 *
 * Returns 0 on success
 *        -1 on failure (bin missing, empty or not writable, or no slot
 *           refers to this annotation)
 */
int anno_ele_destroy(GapIO *io, anno_ele_t *e) {
    bin_index_t *bin;
    range_t *slot = NULL;
    int idx;

    /* Find the bin range pointing to this object */
    bin = (bin_index_t *)cache_search(io, GT_Bin, e->bin);
    if (!bin)
        return -1;
    if (!bin->rng || ArrayMax(bin->rng) == 0)
        return -1;

    bin = cache_rw(io, bin);
    if (!bin)
        return -1;

    for (idx = 0; idx < ArrayMax(bin->rng); idx++) {
        range_t *cand = arrp(range_t, bin->rng, idx);

        if (!(cand->flags & GRANGE_FLAG_UNUSED) && cand->rec == e->rec) {
            slot = cand;
            break;
        }
    }

    if (!slot)
        return -1;

    /* Mark this bin range as unused */
    slot->rec = bin->rng_free;
    slot->flags |= GRANGE_FLAG_UNUSED;

    bin->rng_free = idx;
    bin->flags |= BIN_RANGE_UPDATED | BIN_BIN_UPDATED;

    bin_incr_nanno(io, bin, -1);

    /* The removed item may have defined an end of the used range */
    if (bin->start_used == slot->start || bin->end_used == slot->end)
        bin_set_used_range(io, bin);

    return 0;
}
/*
 * Loads a BAF file into the database.
 *
 * The file is read one block at a time:
 *   CO  creates (or merges into) a contig via create_new_contig().
 *   RD  builds a seq_t from the block and stores it with
 *       save_range_sequence().
 *   AN  creates an annotation attached to the most recent contig or
 *       read, when a->data_type includes DATA_ANNO.
 * Blank lines are ignored and other block types are reported as
 * unsupported. Progress is printed periodically and the cache is flushed
 * every 0x4000 blocks.
 *
 * io  Database handle
 * fn  BAF file name
 * a   Import options
 *
 * Returns 0 on success
 *        -1 on failure (file could not be stat'ed or opened)
 *
 * NOTE(review): the contig pointer is dereferenced straight after
 * create_new_contig() without a NULL check - confirm that function cannot
 * fail. The anno_ele_new() call below passes 7 arguments whereas
 * anno_ele_add() passes 8 (with a direction) - verify against the current
 * prototype.
 */
int parse_baf(GapIO *io, char *fn, tg_args *a) {
    int nseqs = 0, nobj = 0, ntags = 0, ncontigs = 0;
    struct stat sb;
    zfp *fp;
    off_t pos;
    contig_t *c = NULL;
    tg_pair_t *pair = NULL;
    baf_block *b, *co = NULL;
    int last_obj_type = 0;
    int last_obj_pos = 0;
    tg_rec last_obj_rec = 0;
    tg_rec last_cnt_rec = 0;
    int last_cnt_pos = 0;
    int last_obj_orient = 0;

    printf("Loading %s...\n", fn);
    if (-1 == stat(fn, &sb) ||
        NULL == (fp = zfopen(fn, "r"))) {
        perror(fn);
        return -1;
    }

    if (a->pair_reads) {
        pair = create_pair(a->pair_queue);
    }

    /* Loop:
     * Read 1 block of data.
     * If contig, create contig
     * If read, insert it, insert to index.
     * Anything else - reject for now
     */
    pos = 0;
    while ((b = baf_next_block(fp)) != NULL) {
        int delay_destroy = 0;

        switch (b->type) {
        case CO: {
            char *contig = baf_block_value(b, CO);

            /* The contig block is kept until the next CO or end of file */
            if (co)
                baf_block_destroy(co);
            co = b;
            delay_destroy = 1;

            ncontigs++;

            create_new_contig(io, &c, contig, a->merge_contigs);

            /* For anno */
            last_obj_type = GT_Contig;
            last_obj_rec = c->rec;
            last_obj_pos = c->start + 1;
            last_cnt_rec = c->rec;
            last_cnt_pos = c->start + 1;
            last_obj_orient = 0;

            break;
        }

        case RD: {
            seq_t seq;
            int flags;
            char *tname;
            tg_rec recno;
            int is_pair = 0;

            /* Construct seq struct */
            if (-1 == construct_seq_from_block(a, &seq, b, &tname)) {
                fprintf(stderr, "Failed to parse read block for seq %d\n",
                        nseqs);
                break;
            }

            /* Create range, save sequence */
            flags = GRANGE_FLAG_TYPE_SINGLE;

            if (seq.flags & SEQ_END_REV)
                flags |= GRANGE_FLAG_END_REV;
            else
                flags |= GRANGE_FLAG_END_FWD;
            if (seq.len < 0)
                flags |= GRANGE_FLAG_COMP1;

            if (pair) is_pair = 1;

            recno = save_range_sequence(io, &seq, seq.mapping_qual, pair,
                                        is_pair, tname, c, a, flags, NULL);

            /* For anno */
            last_obj_type = GT_Seq;
            last_obj_rec = recno;
            if (seq.len >= 0) {
                last_obj_pos = seq.pos;
                last_obj_orient = 0;
            } else {
                last_obj_pos = seq.pos - seq.len - 1;
                last_obj_orient = 1;
            }

            nseqs++;
            break;
        }

        case AN: {
            range_t r;
            anno_ele_t *e;
            char *typ = baf_block_value(b, AN);
            char *loc = baf_block_value(b, LO);
            char *len = baf_block_value(b, LL);
            char *txt = baf_block_value(b, TX);
            char *at  = baf_block_value(b, AT);
            int an_pos;
            bin_index_t *bin;
            int anno_obj_type;

            if (!(a->data_type & DATA_ANNO))
                break;

            if (txt)
                unescape_line(txt);

            /* Tags go on the contig if that was the last object or AT=C */
            if (last_obj_type == GT_Contig || (at && *at == 'C'))
                anno_obj_type = GT_Contig;
            else
                anno_obj_type = GT_Seq;

            if (!loc) {
                an_pos = last_obj_pos;
            } else {
                if (*loc == '@') {
                    /* '@' prefix: the location is used as given */
                    an_pos = atoi(loc+1);
                } else {
                    if (anno_obj_type == GT_Contig) {
                        if (last_obj_orient == 0)
                            an_pos = last_cnt_pos + atoi(loc)-1;
                        else
                            an_pos = last_cnt_pos - (atoi(loc)-1)
                                - (len ? atoi(len)-1 : 0);
                    } else {
                        if (last_obj_orient == 0)
                            an_pos = last_obj_pos + atoi(loc)-1;
                        else
                            an_pos = last_obj_pos - (atoi(loc)-1)
                                - (len ? atoi(len)-1 : 0);
                    }
                }
            }

            r.start = an_pos;
            r.end = an_pos + (len ? atoi(len)-1 : 0);
            r.mqual = str2type(typ);
            r.pair_rec = (anno_obj_type == GT_Contig)
                ? last_cnt_rec
                : last_obj_rec;

            r.flags = GRANGE_FLAG_ISANNO;
            if (GT_Seq == anno_obj_type)
                r.flags |= GRANGE_FLAG_TAG_SEQ;

            r.rec = anno_ele_new(io, 0, anno_obj_type, r.pair_rec, 0,
                                 str2type(typ), txt);
            e = (anno_ele_t *)cache_search(io, GT_AnnoEle, r.rec);
            if (!e || !(e = cache_rw(io, e))) {
                fprintf(stderr, "Failed to create annotation %d\n", ntags);
                break;
            }

            bin = bin_add_range(io, &c, &r, NULL, NULL, 0);
            if (!bin) {
                /* As in anno_ele_add: a failed insertion means no bin */
                fprintf(stderr, "Failed to add annotation %d to a bin\n",
                        ntags);
                e->bin = 0;
                break;
            }
            e->bin = bin->rec;

            ntags++;
            break;
        }

        case 0:
            /* blank line */
            break;

        default:
            printf("Unsupported block type '%s'\n",
                   linetype2str(b->type));
        }

        if (!delay_destroy)
            baf_block_destroy(b);

        if ((++nobj & 0xfff) == 0) {
            int perc = 0;

            pos = zftello(fp);
            perc = 100.0 * pos / sb.st_size;
            printf("\r%d%c", perc, (nobj & 0x3fff) ? '%' : '*');
            fflush(stdout);
            if ((nobj & 0x3fff) == 0)
                cache_flush(io);
        }

#if 1
        if ((nobj & 0x3fff) == 0) {
            static int perc = 0;
            if (perc < 100.0 * pos / sb.st_size) {
                perc = 100.0 * pos / sb.st_size;
                printf("\r%d%%", perc);
                {
                    /* Time and object counts since the previous report */
                    static struct timeval last, curr;
                    static int first = 1;
                    static int last_obj = 0;
                    static int last_contigs = 0;
                    long delta;

                    gettimeofday(&curr, NULL);
                    if (first) {
                        last = curr;
                        first = 0;
                    }

                    delta = (curr.tv_sec - last.tv_sec) * 1000000
                        + (curr.tv_usec - last.tv_usec);
                    printf(" - %g sec %d obj (%d contigs)\n", delta/1000000.0,
                           nobj - last_obj, ncontigs - last_contigs);
                    last = curr;
                    last_obj = nobj;
                    last_contigs = ncontigs;
                }
                fflush(stdout);
            }
        }
#endif
    }

    if (pair && !a->fast_mode) {
        finish_pairs(io, pair);
    }

    if (co)
        baf_block_destroy(co);

    cache_flush(io);
    zfclose(fp);

    printf("\nLoaded %12d contigs\n", ncontigs);
    printf("       %12d sequences\n", nseqs);
    printf("       %12d annotations\n", ntags);

    if (pair) delete_pair(pair);

    if (c)
        cache_decr(io, c);

    return 0;
}
/**
 * Builds and returns MALIGN from a Gap5 IO handle for the contig 'cnum'.
 *
 * start..end is first widened using the first sequence found at 'start'
 * and the last sequence found at 'end'. Every sequence in the widened
 * range then contributes one CONTIGL entry holding a copy of its clipped
 * bases (left..right), complemented where needed, with any '.' replaced
 * by 'N' because '.' is the pad symbol here. The list is handed to
 * contigl_to_malign().
 *
 * Out-of-range clip points found on a sequence are clamped in place so
 * that damaged databases do not cause a crash.
 *
 * @param io     Database handle
 * @param cnum   Contig record number
 * @param start  Start of the region of interest
 * @param end    End of the region of interest
 *
 * @return The MALIGN from contigl_to_malign(), or NULL if the main
 *         iterator could not be created, a sequence could not be fetched
 *         or duplicated, or memory ran out.
 *
 * NOTE(review): on the NULL-returning error paths inside the main loop,
 * CONTIGL entries built so far are not released, because no destructor
 * for them is visible from this function.
 */
MALIGN *build_malign(GapIO *io, tg_rec cnum, int start, int end) {
    CONTIGL *contig, *first_contig = NULL, *last_contig = NULL;
    int i, j;
    contig_iterator *citer;
    rangec_t *r;

    /* Expand start and end to the range covered by seqs overlapping
     * start .. end
     */
    {
        seq_t *s;

        citer = contig_iter_new(io, cnum, 0,
                                CITER_FIRST | CITER_ICLIPPEDSTART,
                                start, start);
        if (citer) {
            r = contig_iter_next(io, citer);
            if (r && (s = cache_search(io, GT_Seq, r->rec))) {
                start = ((s->len < 0) ^ r->comp)
                    ? r->end - s->right - 2
                    : r->start + s->left - 2;
            }
            contig_iter_del(citer);
        }
    }

    {
        seq_t *s;

        citer = contig_iter_new(io, cnum, 0,
                                CITER_LAST | CITER_ICLIPPEDEND,
                                end, end);
        if (citer) {
            r = contig_iter_next(io, citer);
            if (r && (s = cache_search(io, GT_Seq, r->rec))) {
                end = ((s->len < 0) ^ r->comp)
                    ? r->end - s->left + 2
                    : r->start + s->right + 2;
            }
            contig_iter_del(citer);
        }
    }

    /* Generate contigl linked list */
    citer = contig_iter_new(io, cnum, 0, CITER_FIRST, start, end);
    if (!citer)
        return NULL;

    while ((r = contig_iter_next(io, citer))) {
        seq_t *s, *sorig;
        char *seq;
        int len;

        assert((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISSEQ);

        /* Fetch the read before allocating anything for it */
        sorig = s = cache_search(io, GT_Seq, r->rec);
        if (!s) {
            contig_iter_del(citer);
            return NULL;
        }

        contig = create_contig_link();
        contig->id = r->rec;
        contig->mseg = create_mseg();

        /* Check for out-of-bounds clip points.  It shouldn't happen, but
         * gap5 databases have been seen with this problem, and we
         * don't want to crash if there are any.
         */
        if (s->left < 1)
            s->left = 1;

        if (s->right > ABS(s->len))
            s->right = ABS(s->len);

        /* Fix reads of zero length */
        if (s->right < s->left) {
            sorig = s = cache_rw(io, s);
            if (!s) {
                contig_iter_del(citer);
                return NULL;
            }
            s->right = s->left;
            if (s->right > ABS(s->len))
                s->left = s->right = ABS(s->len);
        }

        /* Work on a complemented private copy when orientation differs */
        if ((s->len < 0) ^ r->comp) {
            s = dup_seq(s);
            if (!s) {
                contig_iter_del(citer);
                return NULL;
            }
            complement_seq_t(s);
        }

        len = s->right - s->left + 1;
        if (NULL == (seq = malloc(len+1))) {
            /* Release what this iteration owns before giving up */
            if (s != sorig)
                free(s);
            contig_iter_del(citer);
            return NULL;
        }

        for (j = 0, i = s->left-1; i < s->right; i++, j++) {
            /* Protect against the sequence containing "."; our pad sym */
            if (s->seq[i] == '.')
                seq[j] = 'N';
            else
                seq[j] = s->seq[i];
        }
        seq[j] = 0;

        init_mseg(contig->mseg, seq, len, r->start-1 + s->left-1);
        /* A private copy only exists when the read was complemented */
        contig->mseg->comp = (s != sorig);

        if (last_contig) {
            last_contig->next = contig;
        } else {
            first_contig = contig;
        }
        last_contig = contig;

        if (s != sorig)
            free(s);
    }
    contig_iter_del(citer);

    /* for 454 data -6 to -10 seem to work fine */
    return contigl_to_malign(first_contig, -7, -7);
}
/*
 * Takes a multiple alignment and updates the on-disk data structures to
 * match. This needs to correct confidence values, original positions and
 * tags too.
 *
 * io      Database handle
 * cnum    Contig record number
 * malign  Realigned sequences (malign->contigl list)
 * indels  Array of con_indel_t describing pad columns to insert (size > 0)
 *         or pads to delete (size <= 0) at pos+1
 *
 * Inconsistencies between the old and the realigned sequence (anything
 * other than pad movement) abort() the program. Running out of memory
 * while rebuilding a sequence is reported with verror() and stops
 * processing at that sequence.
 */
void update_io(GapIO *io, tg_rec cnum, MALIGN *malign, Array indels) {
    CONTIGL *cl;
    tg_rec rnum;
    range_t r, *r_out;
    bin_index_t *bin;
    contig_t *c = cache_search(io, GT_Contig, cnum);
    size_t i, nindel;

    cache_incr(io, c);

    /*
     * To minimise number of data modifications we use a three step approach.
     *
     * Step 1: insert columns of pads, shifting reads as appropriate.
     * Step 2: edit sequence alignments as required, possibly involving
     *         moving sequences and/or adding and removing pads.
     * Step 3: remove columns of entire pads.
     *
     * This means that when we introduce a column of pads we don't have
     * to make edits to every single read position down stream, and can
     * instead make use of the optimised recursive bin functions to do this
     * for us.
     */

    /* Step 1: make indels */
    nindel = ArrayMax(indels);
    for (i = 0; i < nindel; i++) {
        con_indel_t *id = arrp(con_indel_t, indels, i);
        int j;

        if (id->size > 0) {
            contig_insert_bases(io, &c, id->pos+1, '*', -1, id->size);
        } else {
            for (j = 0; j < -id->size; j++) {
                contig_delete_pad(io, &c, id->pos+1);
            }
        }
    }

    /* Step 2: edit alignments */
    for (cl = malign->contigl; cl; cl = cl->next) {
        seq_t *s, *sorig;
        int len, update_range = 0;
        int shift;

        rnum = cl->id;

        sorig = cache_search(io, GT_Seq, rnum);
        cache_incr(io, sorig);
        /* Work on a private copy, in the alignment's orientation */
        s = dup_seq(sorig);
        if (cl->mseg->comp)
            complement_seq_t(s);

        len = s->right - s->left + 1;

        /* Check if sequence has changed. If so assign a new one */
        if (cl->mseg->length != len ||
            memcmp(s->seq + s->left-1, cl->mseg->seq, cl->mseg->length) != 0) {
            int newlen = s->left-1 + ABS(s->len) - s->right + cl->mseg->length;
            int i, j, np;
            char *newseq = malloc(newlen+1);
            int8_t *newconf = malloc(newlen+1);

            if (!newseq || !newconf) {
                verror(ERR_FATAL, "update_io", "Out of memory");
                free(newseq);
                free(newconf);
                free(s);
                cache_decr(io, sorig);
                cache_decr(io, c);
                return;
            }

            /* Build new seq/conf arrays */
            memcpy(newseq,  s->seq,  s->left-1);
            memcpy(newconf, s->conf, s->left-1);

            memcpy(&newseq[s->left-1], cl->mseg->seq, cl->mseg->length);

            /*
             * Step through both old and new sequences working out how
             * they differ. This will (*should*) be entire pad movements.
             * i  = index to old seq
             * j  = index to new seq
             * np = number of pads added minus removed from old seq.
             */
            np = 0;
            for (i = j = s->left-1;
                 i < ABS(s->len) && j < s->left-1 + cl->mseg->length;
                 ) {
                /*
                 * Bases match. The <ctype.h> calls take the characters
                 * as unsigned char: passing a negative plain char is
                 * undefined behaviour.
                 */
                if (toupper((unsigned char)newseq[j]) ==
                    toupper((unsigned char)s->seq[i]) ||
                    (s->seq[i] == '.' && newseq[j] == 'N')) {
                    /* Keep the case of the original base */
                    if (isupper((unsigned char)s->seq[i]))
                        newseq[j] = toupper((unsigned char)newseq[j]);
                    else
                        newseq[j] = tolower((unsigned char)newseq[j]);
                    newconf[j] = s->conf[i];
                    i++, j++;
                    continue;
                }

                /* Pad removed */
                if (s->seq[i] == '*') {
                    i++;
                    tag_shift_for_delete(io, cnum, rnum, cl->mseg->offset,
                                         cl->mseg->length, i+np--, s->bin);
                    continue;
                }

                /* Pad created */
                if (newseq[j] == '*') {
                    int k;
                    int ql = 0, qr = 0;

                    /* Confidence of the nearest non-pad on each side */
                    for (k = i-1; k >= 0; k--) {
                        if (s->seq[k] != '*') {
                            ql = s->conf[k];
                            break;
                        }
                    }
                    for (k = i+1; k < s->right; k++) {
                        if (s->seq[k] != '*') {
                            qr = s->conf[k];
                            break;
                        }
                    }
                    newconf[j] = MIN(ql, qr); /* min conf of neighbours */
                    j++;
                    tag_shift_for_insert(io, cnum, rnum, cl->mseg->offset,
                                         cl->mseg->length, i+ ++np, s->bin);
                    continue;
                }

                fprintf(stderr, "Alignment introduced non-pad character");
                abort();
            }

            /* Pads previously at the end of the reading & now removed */
            while (i < s->right) {
                if (s->seq[i] == '*') {
                    i++;
                    tag_shift_for_delete(io, cnum, rnum, cl->mseg->offset,
                                         cl->mseg->length, i+np--, s->bin);
                } else {
                    /* Error: clipped data that wasn't a pad */
                    abort();
                }
            }

            /* Should only be pads remaining in newseq, if anything */
            s->right = j;
            for (; j < s->left-1 + cl->mseg->length; j++) {
                if (newseq[j] != '*') {
                    fprintf(stderr, "Alignment introduced non-pad character");
                    abort();
                }
                newconf[j] = 0;
            }

            /* Append on the right hand cutoff data */
            for (; i < ABS(s->len); i++, j++) {
                newseq[j]  = s->seq[i];
                newconf[j] = s->conf[i];
            }
            if (j != newlen) {
                abort();
            }

            /* Write it back out */
            /* Copy newseq/newconf into seq_t */
            s->seq  = newseq;
            s->conf = newconf;

            update_range = 0;

            if (ABS(s->len) != j) {
                /* Length change implies updating the range array too */
                s->len = s->len >= 0 ? j : -j;
                update_range = 1;
            }

            /* Back to the stored orientation before writing */
            if (cl->mseg->comp)
                complement_seq_t(s);

            /* The memcpy trashes the block pointer, so special care needed */
            {
                sorig = cache_rw(io, sorig);
                void *blk = sorig->block;
                memcpy(sorig, s, sizeof(seq_t));
                sorig->block = blk;
            }

            if (update_range)
                sorig = cache_item_resize(sorig, sizeof(*sorig) +
                                          sequence_extra_len(sorig));

            sequence_reset_ptr(sorig);

            if (s->name)
                memcpy(sorig->name, s->name, s->name_len+1);
            if (s->trace_name)
                memcpy(sorig->trace_name, s->trace_name, s->trace_name_len+1);
            if (s->alignment)
                memcpy(sorig->alignment, s->alignment, s->alignment_len+1);
            memcpy(sorig->seq,  s->seq,  ABS(s->len));
            memcpy(sorig->conf, s->conf, ABS(s->len));

            xfree(newconf);
            xfree(newseq);
        }

        /* Has the start of the used portion moved within the contig? */
        {
            int st, en, or;
            sequence_get_position(io, s->rec, NULL, &st, &en, &or);
            if (or ^ (sorig->len < 0)) {
                shift = ABS(sorig->len) - sorig->right;
            } else {
                shift = sorig->left-1;
            }
            st += shift;

            if (st != cl->mseg->offset+1) {
                update_range = 1;
            }
        }

        free(s);

        if (update_range) {
            int bin_changed = 0;

            /* Get old range and pair data */
            s = sorig;
            bin = cache_search(io, GT_Bin, s->bin);
            r = *arrp(range_t, bin->rng, s->bin_index);
            assert(r.rec == s->rec);

            /* Update range, tedious and slow way */
            bin_remove_item(io, &c, GT_Seq, s->rec);
            r.start = cl->mseg->offset + 1 - shift;
            r.end   = r.start + ABS(s->len) - 1;
            bin = bin_add_range(io, &c, &r, &r_out, NULL, 0);

            /* Check if the new bin has a different complemented status too */
            if (s->bin != bin->rec) {
                int old_comp = bin_get_orient(io, s->bin);
                int new_comp = bin_get_orient(io, bin->rec);

                if (new_comp != old_comp) {
                    s = cache_rw(io, s);
                    s->len *= -1;
                    s->flags ^= SEQ_COMPLEMENTED;
                }

                bin_changed = 1;
            }

            /* Update seq bin & bin_index fields */
            s = cache_rw(io, s);
            s->bin = bin->rec;
            s->bin_index = r_out - ArrayBase(range_t, bin->rng);

            if (bin_changed) {
                if (-1 == sequence_fix_anno_bins(io, &s)) {
                    verror(ERR_WARN, "update_io",
                           "sequence_fix_anno_bins() failure");
                }
            }
        }

        cache_decr(io, sorig);
    }

    /* Step 3 (remove pad columns) done in calling function. */

    cache_decr(io, c);
}
/*
 * Breaks a contig in two at contig position 'cpos'.
 *
 * A new right-hand contig is created and named after the original with a
 * "#<n>" suffix, using the first n (from 1) for which
 * contig_index_query() does not return a positive value. The bin tree is
 * then split by break_contig_recurse(), the start/end coordinates of both
 * contigs are recomputed, empty bins are tidied away and a contig that
 * ends up with no root bin is destroyed.
 *
 * io    Database handle
 * crec  Record number of the contig to break
 * cpos  Break position within that contig
 *
 * Returns 0 on success
 *        -1 on failure (contig not found, hash table or new contig could
 *           not be created, or the contig name index update failed)
 */
int break_contig(GapIO *io, tg_rec crec, int cpos) {
    contig_t *cl;
    contig_t *cr;
    int cid;
    char cname[1024], *cname_end;
    int left_end, right_start;
    bin_index_t *bin;
    int do_comp = 0;
    HacheTable *h;

    cl = (contig_t *)cache_search(io, GT_Contig, crec);
    if (!cl)
        return -1;

    /*
     * Our hash table is keyed on sequence record numbers for all sequences
     * in all bins spanning the break point. The value is either 0 or 1
     * for left/right contig.
     *
     * The purpose of this hash is to allow us to work out whether a tag
     * belongs in the left or right contig, as a tag could start beyond the
     * break point but be attached to a sequence before the break point.
     *
     * Further complicating this is that a tag could be in a smaller bin
     * than the sequence as it may not be as long. However we know
     * we'll recurse down these in a logical order so we can be sure
     * we've already "seen" the sequence that the tag has been
     * attached to.
     */
    h = HacheTableCreate(1024, HASH_DYNAMIC_SIZE);
    if (!h)
        return -1;

    /*
     * strncpy() does not terminate the destination when the source is
     * 1000 characters or longer, so terminate explicitly. The remaining
     * space in cname leaves room for the "#<n>" suffix.
     */
    strncpy(cname, contig_get_name(&cl), 1000);
    cname[1000] = '\0';
    cname_end = cname + strlen(cname);
    cid = 1;
    do {
        sprintf(cname_end, "#%d", cid++);
    } while (contig_index_query(io, cname) > 0);

    if (!(cr = contig_new(io, cname))) {
        HacheTableDestroy(h, 0);
        return -1;
    }
    cl = cache_rw(io, cl);
    cr = cache_rw(io, cr);
    if (0 != contig_index_update(io, cname, strlen(cname), cr->rec)) {
        HacheTableDestroy(h, 0);
        return -1;
    }
    printf("Break in contig %"PRIrec", pos %d\n", crec, cpos);

    printf("Existing left bin = %"PRIrec", right bin = %"PRIrec"\n",
           cl->bin, cr->bin);

    cache_incr(io, cl);
    cache_incr(io, cr);

    /* Remember the root bin's orientation so the right root can match */
    bin = get_bin(io, cl->bin);
    do_comp = bin->flags & BIN_COMPLEMENTED;

    break_contig_recurse(io, h, cl, cr,
                         contig_get_bin(&cl), cpos, contig_offset(io, &cl),
                         0, cl->rec, cr->rec, 0, 0);

    /* Recompute end positions */
    left_end    = contig_visible_end(io, cl->rec);
    right_start = contig_visible_start(io, cr->rec);

    /* Ensure start/end positions of contigs work out */
    bin = cache_rw(io, get_bin(io, cr->bin));

//#define KEEP_POSITIONS 1
#ifndef KEEP_POSITIONS
    /* Renumber the right contig so that it starts at 1 */
    cr->start = 1;
    cr->end = cl->end - right_start + 1;
    bin->pos -= right_start-1;
#else
    cr->start = right_start;
    cr->end = cl->end;
#endif

    /* Give the right root bin the complement state the left root had */
    if ((do_comp && !(bin->flags & BIN_COMPLEMENTED)) ||
        (!do_comp && (bin->flags & BIN_COMPLEMENTED))) {
        bin->flags ^= BIN_COMPLEMENTED;
    }

    cl->end = left_end;

    printf("Final left bin = %"PRIrec", right bin = %"PRIrec"\n",
           cl->bin, cr->bin);

    HacheTableDestroy(h, 0);

    cache_flush(io);

    remove_empty_bins(io, cl->rec);
    remove_empty_bins(io, cr->rec);

    /* Empty contig? If so remove it completely */
    if (cl->bin == 0) {
        printf("Removing empty contig %"PRIrec"\n", cl->rec);
        contig_destroy(io, cl->rec);
    }
    if (cr->bin == 0) {
        printf("Removing empty contig %"PRIrec"\n", cr->rec);
        contig_destroy(io, cr->rec);
    }

    cache_decr(io, cl);
    cache_decr(io, cr);

    cache_flush(io);

    return 0;
}
/* * A recursive break contig function. * bin_num The current bin being moved or split. * pos The contig break point. * offset The absolute positional offset of this bin in original contig * pleft The parent bin/contig record num in the left new contig * pright The parent bin/contig record num in the right new contig * child_no 0 or 1 - whether this bin is the left/right child of its parent */ static int break_contig_recurse(GapIO *io, HacheTable *h, contig_t *cl, contig_t *cr, tg_rec bin_num, int pos, int offset, int level, tg_rec pleft, tg_rec pright, int child_no, int complement) { int i, j, f_a, f_b; tg_rec rbin; bin_index_t *bin = get_bin(io, bin_num), *bin_dup ; //int bin_min, bin_max; int nseqs; tg_rec opright; /* old pright, needed if we revert back */ cache_incr(io, bin); if (bin->flags & BIN_COMPLEMENTED) { complement ^= 1; } if (complement) { f_a = -1; f_b = offset + bin->size-1; } else { f_a = +1; f_b = offset; } printf("%*sBreak offset %d pos %d => test bin %"PRIrec": %d..%d\n", level*4, "", offset, pos, bin->rec, NMIN(bin->start_used, bin->end_used), NMAX(bin->start_used, bin->end_used)); bin = cache_rw(io, bin); nseqs = bin->nseqs; bin->nseqs = 0; /* Invalidate any cached data */ bin_invalidate_track(io, bin, TRACK_ALL); if (bin->flags & BIN_CONS_VALID) { bin->flags |= BIN_BIN_UPDATED; bin->flags &= ~BIN_CONS_VALID; } //bin_min = bin->rng ? NMIN(bin->start_used, bin->end_used) : offset; //bin_max = bin->rng ? NMAX(bin->start_used, bin->end_used) : offset; /* * Add to right parent if this bin is to the right of pos, * or if the used portion is to the right and we have no left child. * * FIXME: Not a valid assumption! * The used portion of a bin is not a placeholder for the used portion * of all the the children beneath it. Therefore if the used portion of * this bin is > pos (and we have no left child) it still doesn't mean * that the absolute positions of the used portion of the right child * won't be < pos. 
*/ if (offset >= pos /*|| (bin_min >= pos && !bin->child[0])*/) { printf("%*sADD_TO_RIGHT pl=%"PRIrec" pr=%"PRIrec"\n", level*4, "", pleft, pright); if (0 != break_contig_move_bin(io, bin, cl, pleft, cr, pright, child_no)) return -1; bin_incr_nseq(io, bin, nseqs); cache_decr(io, bin); return 0; } /* * Add to left parent if this bin is entirely to the left of pos, * or if the used portion is to the left and we have no right child. */ if (offset + bin->size < pos /*|| (bin_max < pos && !bin->child[1])*/) { printf("%*sADD_TO_LEFT\n", level*4, ""); //if (0 != break_contig_move_bin(io, bin, cr, pright, cl, pleft, child_no)) //return -1; bin_incr_nseq(io, bin, nseqs); cache_decr(io, bin); return 0; } /* * Nominally the bin overlaps both left and right and so needs duplicating. * There are cases though at the roots of our trees where duplicating is * unnecessary as it leads to empty bins at the root. In this case * we skip creating a duplicate for the right, or alternatively steal * the left root bin and use that instead. * * Similarly the range_t array will either be left where it is, moved to * the right contig, or split in half (creating a new one for the right). * * FIXED: always need this. Eg: * * |-------------empty--------------| * |----------------|---------------| * |--------|-------|--------|------| * ^ * | * break here * * In this case we need to duplicate the parent as it overlaps the left * bin, which may (or may not) have data that needs to end up in the right * hand contig. Just duplicate for now and free later on if needed. */ if (1 /* always! 
*/ || pright != cr->rec || (bin->rng && NMAX(bin->start_used, bin->end_used) >= pos)) { //printf("NMAX=%d >= %d\n", NMAX(bin->start_used, bin->end_used), pos); rbin = 0; /* Possibly steal left contig's bin */ if (pleft == cl->rec && NMIN(bin->start_used, bin->end_used) >= pos) { #if 0 /* Currently this doesn't always work */ if (bin->child[1]) { bin_index_t *ch = get_bin(io, bin->child[1]); if (NMIN(ch->pos, ch->pos + ch->size-1) >= pos) { rbin = cl->bin; cl->bin = bin->child[0]; } } #else pleft = bin->rec; #endif } else { pleft = bin->rec; } /* Create new bin, or use root of contig if it's unused so far */ if (!rbin && pright == cr->rec) { rbin = cr->bin; } /* Otherwise we genuingly need a duplicate */ if (!rbin) rbin = bin_new(io, 0, 0, 0, GT_Bin); /* Initialise with duplicate values from left bin */ bin_dup = get_bin(io, rbin); bin_dup = cache_rw(io, bin_dup); bin_dup->size = bin->size; bin_dup->pos = bin->pos; bin_dup->parent = pright; bin_dup->parent_type = (pright == cr->rec ? GT_Contig : GT_Bin); bin_dup->flags = bin->flags | BIN_BIN_UPDATED; bin_dup->start_used = bin->start_used; bin_dup->end_used = bin->end_used; /* * Shift bin to offset if it's the contig root. * It'll be shifted back by the correct amount later. 
*/ if (pright == cr->rec) { printf("moving root bin to offset=%d comp=%d\n", offset, complement); bin_dup->pos = offset; } printf("%*sCreated dup for right, rec %"PRIrec"\n", level*4,"", bin_dup->rec); break_contig_move_bin(io, bin_dup, cl, 0, cr, pright, child_no); opright = pright; pright = bin_dup->rec; } else { bin_dup = NULL; pleft = bin->rec; } if (!bin->rng) { /* Empty bin */ printf("%*sEMPTY range\n", level*4, ""); bin->start_used = bin->end_used = 0; bin->flags |= BIN_BIN_UPDATED; if (bin_dup) { bin_dup->start_used = bin_dup->end_used = 0; bin_dup->flags |= BIN_BIN_UPDATED; } } else if (NMIN(bin->start_used, bin->end_used) >= pos) { /* Move range to right contig */ printf("%*sDUP %"PRIrec", MOVE Array to right\n", level*4, "", bin_dup->rec); bin_dup->rng = bin->rng; bin_dup->rng_rec = bin->rng_rec; bin_dup->rng_free = bin->rng_free; if (bin_dup->rng_rec) bin_dup->flags |= BIN_RANGE_UPDATED; if (bin->rec != bin_dup->rec) { bin->rng = NULL; bin->rng_rec = 0; bin->rng_free = -1; bin->flags |= BIN_BIN_UPDATED; } bin->start_used = bin->end_used = 0; break_contig_reparent_seqs(io, bin_dup); if (bin_dup->rng) { int n = ArrayMax(bin_dup->rng); for (i = j = 0; i < n; i++) { range_t *r = arrp(range_t, bin_dup->rng, i), *r2; if (r->flags & GRANGE_FLAG_UNUSED) continue; if ((r->flags & GRANGE_FLAG_ISMASK) != GRANGE_FLAG_ISANNO) { HacheData hd; hd.i = 1; HacheTableAdd(h, (char *)&r->rec, sizeof(r->rec), hd,NULL); j++; } } bin_incr_nseq(io, bin_dup, j); } } else if (NMAX(bin->start_used, bin->end_used) < pos) { /* Range array already in left contig, so do nothing */ printf("%*sMOVE Array to left\n", level*4, ""); if (bin_dup) bin_dup->start_used = bin_dup->end_used = 0; if (bin->rng) { int n = ArrayMax(bin->rng); for (i = j = 0; i < n; i++) { range_t *r = arrp(range_t, bin->rng, i); if (r->flags & GRANGE_FLAG_UNUSED) continue; if ((r->flags & GRANGE_FLAG_ISMASK) != GRANGE_FLAG_ISANNO) { HacheData hd; hd.i = 0; HacheTableAdd(h, (char *)&r->rec, sizeof(r->rec), hd,NULL); 
j++; } } bin_incr_nseq(io, bin, j); } } else { /* Range array covers pos, so split in two */ int n, nl = 0, nr = 0; int lmin = bin->size, lmax = 0, rmin = bin->size, rmax = 0; printf("%*sDUP %"PRIrec", SPLIT array\n", level*4, "", bin_dup->rec); bin->flags |= BIN_RANGE_UPDATED; bin_dup->flags |= BIN_RANGE_UPDATED; bin_dup->rng = ArrayCreate(sizeof(range_t), 0); bin_dup->rng_free = -1; /* Pass 1 - hash sequences */ n = ArrayMax(bin->rng); for (i = 0; i < n; i++) { range_t *r = arrp(range_t, bin->rng, i); int cstart; /* clipped sequence positions */ seq_t *s; if (r->flags & GRANGE_FLAG_UNUSED) continue; if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISANNO) continue; s = (seq_t *)cache_search(io, GT_Seq, r->rec); if ((s->len < 0) ^ complement) { cstart = NMAX(r->start, r->end) - (s->right-1); } else { cstart = NMIN(r->start, r->end) + s->left-1; } if (cstart >= pos) { HacheData hd; hd.i = 1; HacheTableAdd(h, (char *)&r->rec, sizeof(r->rec), hd, NULL); } else { HacheData hd; hd.i = 0; HacheTableAdd(h, (char *)&r->rec, sizeof(r->rec), hd, NULL); } } /* Pass 2 - do the moving of anno/seqs */ n = ArrayMax(bin->rng); for (i = j = 0; i < n; i++) { range_t *r = arrp(range_t, bin->rng, i), *r2; int cstart; /* clipped sequence positions */ if (r->flags & GRANGE_FLAG_UNUSED) continue; if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISANNO) { cstart = NMAX(r->start, r->end); } else { seq_t *s = (seq_t *)cache_search(io, GT_Seq, r->rec); if ((s->len < 0) ^ complement) { cstart = NMAX(r->start, r->end) - (s->right-1); } else { cstart = NMIN(r->start, r->end) + s->left-1; } } if (cstart >= pos && ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISANNO)) { anno_ele_t *a = (anno_ele_t *)cache_search(io, GT_AnnoEle, r->rec); /* If it's an annotation on a sequence < pos then we * still don't move. * * FIXME: we have no guarantee that the sequence being * annotated is in the same bin as this annotation, as * they may be different sizes and end up in different * bins. 
(Should we enforce anno always in same bin as seq? * If so, consensus annos fit anywhere?) */ if (a->obj_type == GT_Seq) { HacheItem *hi = HacheTableSearch(h, (char *)&r->pair_rec, sizeof(r->pair_rec)); if (hi) { if (hi->data.i == 0) cstart = pos-1; } else { puts("FIXME: annotation for seq in unknown place - " "work out correct location and move if needed."); } } } if (cstart >= pos) { r2 = (range_t *)ArrayRef(bin_dup->rng, ArrayMax(bin_dup->rng)); *r2 = *r; if (rmin > r->start) rmin = r->start; if (rmin > r->end) rmin = r->end; if (rmax < r->start) rmax = r->start; if (rmax < r->end) rmax = r->end; if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISSEQ) nr++; } else { if (lmin > r->start) lmin = r->start; if (lmin > r->end) lmin = r->end; if (lmax < r->start) lmax = r->start; if (lmax < r->end) lmax = r->end; if (j != i) { r2 = arrp(range_t, bin->rng, j); *r2 = *r; } j++; if ((r->flags & GRANGE_FLAG_ISMASK) == GRANGE_FLAG_ISSEQ) nl++; } } bin_incr_nseq(io, bin, nl); bin_incr_nseq(io, bin_dup, nr); ArrayMax(bin->rng) = j; #if 0 /* * Right now this causes problems, but I'm not sure why. Try again * after we've fixed the bin->nseqs issues and other deallocation * woes. */ if (ArrayMax(bin_dup->rng) == 0 && bin_dup->parent_type == GT_Bin) { /* We didn't need it afterall! Odd. 
*/ bin_index_t *pb; printf("Purging bin %d that we didn't need afterall\n", bin_dup->rec); cache_rec_deallocate(io, GT_Bin, bin_dup->rec); pb = cache_search(io, GT_Bin, bin_dup->parent); if (pb->child[0] == bin_dup->rec) pb->child[0] = 0; if (pb->child[1] == bin_dup->rec) pb->child[1] = 0; bin_dup = NULL; pright = opright; } #endif if (bin_dup) break_contig_reparent_seqs(io, bin_dup); if (lmin < lmax) { bin->start_used = lmin; bin->end_used = lmax; } else { /* No data left in bin */ bin->start_used = 0; bin->end_used = 0; } printf("%*sLeft=>%d..%d right=>%d..%d\n", level*4, "", lmin, lmax, rmin, rmax); if (bin_dup) { if (rmin < rmax) { bin_dup->start_used = rmin; bin_dup->end_used = rmax; } else { /* No data moved in bin */ bin_dup->start_used = 0; bin_dup->end_used = 0; } } } /* Recurse */ for (i = 0; i < 2; i++) { bin_index_t *ch; if (!bin->child[i]) continue; ch = get_bin(io, bin->child[i]); if (0 != break_contig_recurse(io, h, cl, cr, bin->child[i], pos, NMIN(ch->pos, ch->pos + ch->size-1), level+1, pleft, pright, i, complement)) return -1; } cache_decr(io, bin); // if (bin_dup) // cache_decr(io, bin_dup); return 0; }
/*
 * Complements a scaffold; both complementing each contig within it and
 * reversing the order of contigs in the scaffold.
 *
 * The global contig order is updated too, so that the positions occupied
 * by this scaffold's contigs hold them in the new (reversed) order, and
 * REG_BUFFER_START / REG_ORDER / REG_BUFFER_END notifications are sent to
 * each affected contig.
 *
 * A failure to complement an individual contig does not stop the remaining
 * work, but is reported through the return value.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int complement_scaffold(GapIO *io, tg_rec srec) {
    scaffold_t *f;
    int i, j, nc, nmemb, ret = 0;
    scaffold_member_t *contigs;
    tg_rec *crecs;
    HashTable *h = NULL;
    void *order_rw;
    reg_order ro;
    reg_buffer_start rs;
    reg_buffer_end re;

    if (!(f = cache_search(io, GT_Scaffold, srec)))
        return -1;
    if (!(f = cache_rw(io, f)))
        return -1;
    cache_incr(io, f);

    /* A scaffold without members has nothing to complement or reorder */
    if (!f->contig || ArrayMax(f->contig) == 0) {
        cache_decr(io, f);
        return 0;
    }
    nmemb = ArrayMax(f->contig);

    /*
     * The contig order array is modified in place below, so obtain a
     * writable copy first, before anything has been changed.
     */
    if (!(order_rw = cache_rw(io, io->contig_order)))
        goto fail;
    io->contig_order = order_rw;
    nc = ArrayMax(io->contig_order);

    /* Complement contigs */
    contigs = ArrayBase(scaffold_member_t, f->contig);
    for (i = 0; i < nmemb; i++) {
        /* NOTE(review): assumes complement_contig returns 0 on success */
        if (complement_contig(io, contigs[i].rec) != 0)
            ret = -1;
    }

    /* Reverse the order of the contigs in the scaffold array */
    for (i = 0, j = nmemb-1; i < j; i++, j--) {
        scaffold_member_t cr1 = contigs[i];
        contigs[i] = contigs[j];
        contigs[j] = cr1;
    }

    /*
     * Reverse the order of contigs in the contig_order array too.
     * This is the part that really matters. It's also hard as the contigs
     * in the contig order array could be in any order and not adjacent.
     * For our purposes we'll just ensure the contigs in this scaffold in
     * the contig order array match our freshly complemented scaffold
     * ordering.
     *
     * We initially build a hash table of contigs in this scaffold, and
     * then iterate through contig_order copying out the new contigs whenever
     * one matches.
     */
    if (!(h = HashTableCreate(nc, 0)))
        goto fail;

    for (i = 0; i < nmemb; i++) {
        HashData hd;
        hd.i = 0;
        if (!HashTableAdd(h, (char *)&contigs[i].rec, sizeof(tg_rec),
                          hd, NULL))
            goto fail;
    }

    /* Replace any contig matching the scaffold with the new order */
    crecs = ArrayBase(tg_rec, io->contig_order);
    for (i = j = 0; i < nc && j < nmemb; i++) {
        if (!HashTableSearch(h, (char *)&crecs[i], sizeof(tg_rec)))
            continue;

        crecs[i] = contigs[j++].rec;
    }

    /* Send event messages around */
    rs.job = REG_BUFFER_START;
    for (i = 0; i < nc; i++) {
        if (!HashTableSearch(h, (char *)&crecs[i], sizeof(tg_rec)))
            continue;

        contig_notify(io, crecs[i], (reg_data *)&rs);
    }

    ro.job = REG_ORDER;
    for (i = 0; i < nc; i++) {
        if (!HashTableSearch(h, (char *)&crecs[i], sizeof(tg_rec)))
            continue;

        /* Positions reported to listeners are 1-based */
        ro.pos = i+1;
        contig_notify(io, crecs[i], (reg_data *)&ro);
    }

    /* Notify the end of our updates */
    re.job = REG_BUFFER_END;
    for (i = 0; i < nc; i++) {
        if (!HashTableSearch(h, (char *)&crecs[i], sizeof(tg_rec)))
            continue;

        contig_notify(io, crecs[i], (reg_data *)&re);
    }

    HashTableDestroy(h, 0);
    cache_decr(io, f);

    return ret;

 fail:
    if (h)
        HashTableDestroy(h, 0);
    cache_decr(io, f);
    return -1;
}
/*
 * Given a contig order and a set of current scaffolds, this updates the
 * order of entries within each scaffold to match the contig order.
 *
 * For example if we have contigs in order 1 3 5 2 6 8 4 7 9 and
 * scaffolds {1 2 3 4} {5 6 7 8 9} we would shuffle the scaffold members
 * to {1 3 2 4} {5 6 8 7 9}
 *
 * The purpose is for integration with contig shuffling in the Contig List
 * or Contig Selector.  The master contig order array is what gets shuffled
 * manually by the user and it is also the definitive order to use when
 * outputting data (so it is completely under users control whether they
 * sort by name, size or scaffold).
 *
 * A database without scaffold support, or without any contigs, is treated
 * as trivially successful.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int update_scaffold_order(GapIO *io) {
    int start, end, ret = -1;
    int nc;
    tg_rec *crecs;
    scaf_ctg_t *a;

    if (!io->scaffold)
        return 0; /* Not supported, but considered success */

    nc = ArrayMax(io->contig_order);
    if (nc <= 0)
        return 0; /* Nothing to order; also avoids malloc(0) */

    a = malloc(nc * sizeof(*a));
    if (!a)
        return -1;

    /*
     * Produce an array of scaffold and contig recs, so we can sort on
     * both fields.
     */
    crecs = ArrayBase(tg_rec, io->contig_order);
    for (start = 0; start < nc; start++) {
        contig_t *c = cache_search(io, GT_Contig, crecs[start]);
        if (!c)
            goto err;

        a[start].ctg_idx = start;
        a[start].scaffold = c->scaffold;
    }

    /* Afterwards entries sharing a scaffold are expected to be adjacent */
    qsort(a, nc, sizeof(*a), scaf_ctg_sort);

    /*
     * Now recreate scaffold orders from the sorted contig list.
     * Each pass handles one run [start, end) of entries with the same
     * scaffold; entries with no scaffold are stepped over one at a time.
     */
    for (start = 0; start < nc; start = end) {
        scaffold_t *f;
        int k, count;

        end = start + 1;
        if (!a[start].scaffold)
            continue;

        while (end < nc && a[end].scaffold == a[start].scaffold)
            end++;
        count = end - start;

        f = cache_search(io, GT_Scaffold, a[start].scaffold);
        if (!f)
            goto err;

        if (!f->contig || ArrayMax(f->contig) != count) {
            verror(ERR_WARN, "update_scaffold_order",
                   "Scaffold %"PRIrec" has different number of entries "
                   "than contigs claim.", f->rec);
            goto err;
        }

        /* Only mark r/w and update if they differ */
        for (k = 0; k < count; k++) {
            if ((arrp(scaffold_member_t, f->contig, k))->rec !=
                crecs[a[start+k].ctg_idx])
                break;
        }
        if (k == count)
            continue;

        if (!(f = cache_rw(io, f)))
            goto err;

        for (k = 0; k < count; k++)
            (arrp(scaffold_member_t, f->contig, k))->rec =
                crecs[a[start+k].ctg_idx];
    }

    ret = 0;
 err:
    free(a);

    return ret;
}
/*
 * Recursive worker for remove_empty_bins.
 *
 * Visits the bin 'brec' and everything beneath it.  A bin is deallocated
 * when it holds no live range items and neither child survives; a bin that
 * survives has the links to any removed children cleared.
 *
 * If 'first' is non-NULL and still zero on the way back up, it is set to
 * the first bin record found to hold data (or to this bin when both
 * subtrees reported one).
 *
 * Returns 1 if removed
 *         0 if not.
 */
static int remove_empty_bins_r(GapIO *io, tg_rec brec, tg_rec *first) {
    bin_index_t *b = cache_search(io, GT_Bin, brec);
    tg_rec kid[2], kid_first[2];
    int gone[2]; /* Child was removed, or never existed */
    int no_data = 1;
    int k;

    /* Any range entry not flagged as unused means the bin holds data */
    if (b->rng) {
        for (k = 0; k < ArrayMax(b->rng); k++) {
            range_t *item = arrp(range_t, b->rng, k);
            if (!(item->flags & GRANGE_FLAG_UNUSED)) {
                no_data = 0;
                break;
            }
        }
    }

    /* Copy the child records; 'b' is not pinned across the recursion */
    kid[0] = b->child[0];
    kid[1] = b->child[1];
    kid_first[0] = 0;
    kid_first[1] = 0;

    for (k = 0; k < 2; k++) {
        if (kid[k])
            gone[k] = remove_empty_bins_r(io, kid[k], &kid_first[k]);
        else
            gone[k] = 1;
    }

    /* Nothing here and nothing below: this bin goes too */
    if (no_data && gone[0] && gone[1]) {
        printf("Bin %"PRIrec": this & children are empty / non-existant\n",
               brec);
        cache_rec_deallocate(io, GT_Bin, brec);
        return 1;
    }

    /* We stay, but a real child was removed, so clear the stale links */
    if ((kid[0] && gone[0]) || (kid[1] && gone[1])) {
        b = cache_search(io, GT_Bin, brec);
        b = cache_rw(io, b);

        for (k = 0; k < 2; k++) {
            if (!gone[k])
                continue;
            b->flags |= BIN_BIN_UPDATED;
            b->child[k] = 0;
        }
    }

    /* Report the first useful bin, unless one has been found already */
    if (first && !*first) {
        if (!no_data || (kid_first[0] && kid_first[1]))
            *first = brec;
        else if (kid_first[0])
            *first = kid_first[0];
        else if (kid_first[1])
            *first = kid_first[1];
    }

    return 0;
}
/*
 * Relinks a bin from one parent to another.
 *
 * The bin is first attached beneath 'pto': as the root bin of contig 'cto'
 * when pto is that contig's record, otherwise as child number 'child_no'
 * of bin 'pto'.  It is then detached from 'pfrom': the root of contig
 * 'cfrom' when pfrom is that contig's record, otherwise a child slot of
 * bin 'pfrom'.  A pfrom of 0 (or less) means there is no old parent to
 * detach from.
 *
 * The caller is expected to pass in writable bin / contig structures, as
 * they are modified directly.
 *
 * Returns 0 on success
 *        -1 on failure
 */
static int break_contig_move_bin(GapIO *io, bin_index_t *bin,
                                 contig_t *cfrom, tg_rec pfrom,
                                 contig_t *cto,   tg_rec pto,
                                 int child_no) {
    /* Add to */
    if (pto == cto->rec) {
        /*
         * Parent is a contig.  Its previous root bin (if any, and if it
         * isn't the bin we're installing) is no longer referenced, so
         * free that bin - not the contig's own record number.
         */
        if (cto->bin && bin->rec != cto->bin) {
            cache_rec_deallocate(io, GT_Bin, cto->bin);
        }
        cto->bin = bin->rec;
        cto->start = 1;
        cto->end = bin->size;

        bin->parent = cto->rec;
        bin->parent_type = GT_Contig;
        bin->flags |= BIN_BIN_UPDATED;

    } else {
        /* Parent is a bin */
        bin_index_t *pb;

        if (!(pb = get_bin(io, pto)))
            return -1;
        if (!(pb = cache_rw(io, pb)))
            return -1;

        pb->child[child_no] = bin->rec;
        pb->flags |= BIN_BIN_UPDATED;

        bin->parent = pto;
        bin->parent_type = GT_Bin;
        bin->flags |= BIN_BIN_UPDATED;
    }

    /* Remove from: NB it may not exist? */
    if (pfrom == cfrom->rec) {
        /* Parent is a contig */
        if (cfrom->bin != bin->rec) {
            fprintf(stderr, "pfrom incorrect\n");
            return -1;
        }

        cfrom->bin = 0;
    } else if (pfrom > 0) {
        /* Parent is a bin */
        bin_index_t *pb;

        if (!(pb = get_bin(io, pfrom)))
            return -1;
        if (!(pb = cache_rw(io, pb)))
            return -1;

        if (pb->child[0] != bin->rec && pb->child[1] != bin->rec) {
            fprintf(stderr, "pfrom incorrect\n");
            return -1;
        }

        if (pb->child[0] == bin->rec)
            pb->child[0] = 0;
        else
            pb->child[1] = 0;
        pb->flags |= BIN_BIN_UPDATED;
    }

    return 0;
}
/*
 * Bumps the database timestamp by one.
 *
 * The change is applied to the base io returned by gio_base(), after
 * requesting a writable copy of its database record.
 *
 * Returns the new timestamp value.
 */
int io_timestamp_incr(GapIO *io) {
    GapIO *base = gio_base(io);

    base->db = cache_rw(base, base->db);
    base->db->timestamp += 1;

    return base->db->timestamp;
}
/*
 * Extends the right hand end of a single contig.
 *
 * The cutoff (clipped) data of the reads overlapping the current contig
 * end is piled up to form a provisional consensus for up to ESZ bases
 * beyond the end.  The contig is extended to the column where the running
 * agreement score peaks, and each read's clip point is then moved as far
 * into that extension as its own match/mismatch score justifies.
 *
 * Reads whose existing cutoff between their clip point and the contig end
 * is too long (CSZ or more) or disagrees with the consensus by more than
 * 5% are ignored.
 *
 * dir is currently only used in the progress message; extension of the
 * left end is not implemented here.
 *
 * Min_depth is the minimum depth for extension. If lower then even if the
 * data matches we'll not extend further.
 *
 * Match_score (+ve) and mismatch_score (-ve) are accumulated during
 * extension to ensure that we don't extend into junk mismatching DNA.
 *
 * Returns 0 on success (including when no extension was possible)
 *        -1 on failure
 */
static int contig_extend_single(GapIO *io, tg_rec crec, int dir, int min_depth,
                                int match_score, int mismatch_score) {
    int end;
    rangec_t *r;
    int nr, i;
    contig_t *c;
    char cons[CSZ], new_cons[ESZ];
    int freqs[ESZ][4], depth[ESZ];
    double score, best_score;
    int best_pos, nseq;

    vmessage("Processing contig #%"PRIrec", %s end\n",
             crec, dir ? "left" : "right");

    for (i = 0; i < ESZ; i++) {
        freqs[i][0] = freqs[i][1] = freqs[i][2] = freqs[i][3] = 0;
        depth[i] = 0;
    }

    c = cache_search(io, GT_Contig, crec);
    if (NULL == c) return -1;
    cache_incr(io, c);

    if (consensus_valid_range(io, crec, NULL, &end) != 0) {
        cache_decr(io, c);
        return -1;
    }

    /* The last CSZ consensus bases, so cons[CSZ-1] is position 'end' */
    calculate_consensus_simple(io, crec, end-(CSZ-1), end, cons, NULL);

    /* Start */
    /* Not implemented for now: rev complement and go again! */

    /* End */
    r = contig_seqs_in_range(io, &c, end, end, 0, &nr);
    if (!r) {
        cache_decr(io, c);
        return -1;
    }

    for (i = 0; i < nr; i++) {
        seq_t *s = cache_search(io, GT_Seq, r[i].rec);
        seq_t *sorig = s;
        int cend;
        int j, k, slen;

        if (!s)
            goto fail;

        /* Work on a private copy in contig orientation when needed */
        if ((s->len < 0) ^ r[i].comp) {
            s = dup_seq(s);
            if (!s)
                goto fail;
            complement_seq_t(s);
        }

        cend = r[i].start + s->right-1;

        /* Does cutoff extend to contig end, if so does it match cons? */
        if (cend < end) {
            int mis = 0, len = 0;
            if (end - cend >= CSZ) {
                /* Skipping due to length of cutoff */
                if (sorig != s)
                    free(s);
                r[i].rec = 0; /* Mark for removal */
                continue;
            }

            for (k = s->right, j = cend+1; j <= end; j++, k++) {
                if (s->seq[k] != cons[j-(end-(CSZ-1))])
                    mis++;
            }
            len = end - cend;
            if (100*mis/len > 5) {
                /* Skipping due to high disagreement with consensus */
                if (sorig != s)
                    free(s);
                r[i].rec = 0;
                continue;
            }
        }

        /* So we got here, let's accumulate extension stats */
        slen = ABS(s->len);
        for (k = 0, j = end+1 - r[i].start; j < slen && k < ESZ; j++, k++) {
            int b;

            if (s->seq[j] == 'N')
                continue;

            /*
             * Only count real A/C/G/T calls.  NOTE(review): this assumes
             * dna_lookup may yield values outside 0..3 for other symbols
             * (pads, ambiguity codes); indexing freqs[k][] with those
             * would run past the four columns.
             */
            b = dna_lookup[(uint8_t) s->seq[j]];
            if (b < 0 || b > 3)
                continue;

            freqs[k][b]++;
            depth[k]++;
        }

        if (sorig != s)
            free(s);
    }

    score = best_score = 0;
    best_pos = 0;

    for (i = 0; i < ESZ; i++) {
        int call = 0, best = 0, j;
        double dd;

        if (depth[i] < min_depth)
            break;

        for (j = 0; j < 4; j++) {
            if (best < freqs[i][j]) {
                best = freqs[i][j];
                call = j;
            }
        }

        /* No counted bases at all: nothing to call, and dd would be 0 */
        if (best == 0)
            break;

        new_cons[i] = "ACGT"[call];

        /* Reward the agreeing fraction, penalise the disagreeing one */
        dd = (double)depth[i];
        score += best / dd;
        score -= (depth[i] - best) / dd;

        if (best_score <= score) {
            best_score = score;
            best_pos = i+1;
        }
    }

    /* Extend */
    nseq = 0;
    if (best_pos > 0) {
        int furthest_left = end;

        for (i = 0; i < nr; i++) {
            seq_t *s;
            int r_pos;
            int rscore; /* this read's running match/mismatch score */

            if (r[i].rec == 0)
                continue;

            s = cache_search(io, GT_Seq, r[i].rec);
            if (!s)
                goto fail;
            if (!(s = cache_rw(io, s)))
                goto fail;

            if (furthest_left > r[i].start)
                furthest_left = r[i].start;

            /*
             * end + best_pos is the furthest right we can go, but this
             * specific read may not be justified in reaching that far
             * if it has too many disagreements.
             *
             * NOTE(review): if the read never reaches a non-negative
             * score then r_pos stays 0 and the clip is set from that;
             * confirm this is the intended outcome.
             */
            if ((s->len > 0) ^ r[i].comp) {
                int best_r = 0, j, k;
                int len = ABS(s->len);

                r_pos = 0;
                rscore = 0;
                for (k = end - r[i].start + 1, j = 0;
                     j < best_pos && k < len;
                     j++, k++) {
                    if (new_cons[j] == toupper((unsigned char)s->seq[k])) {
                        rscore += match_score;
                        if (best_r <= rscore) {
                            best_r = rscore;
                            r_pos = k+1;
                        }
                    } else {
                        rscore += mismatch_score;
                    }
                }

                if (s->right != r_pos) {
                    s->right = r_pos;
                    nseq++;
                }
            } else {
                /* Stored reversed: walk backwards and complement bases */
                int best_r = 0, j, k;

                r_pos = 0;
                rscore = 0;
                for (k = r[i].end - end - 1, j = 0;
                     j < best_pos && k >= 0;
                     j++, k--) {
                    char b = complement_base(s->seq[k]);
                    if (new_cons[j] == b) {
                        rscore += match_score;
                        if (best_r <= rscore) {
                            best_r = rscore;
                            r_pos = k-1;
                        }
                    } else {
                        rscore += mismatch_score;
                    }
                }

                if (s->left != r_pos+2) {
                    s->left = r_pos+2;
                    nseq++;
                }
            }
        }

        vmessage("    Extended by %d, adjusting %d sequence clip%s\n",
                 best_pos, nseq, nseq == 1 ? "" : "s");

        bin_invalidate_consensus(io, crec, furthest_left, end + best_pos);
    } else {
        vmessage("    Unable to extend contig\n");
    }

    free(r);

    cache_decr(io, c);
    cache_flush(io);
    return 0;

 fail:
    free(r);
    cache_decr(io, c);
    return -1;
}