/* Returns the length of sequence <seqnum> in <condenseq>, as computed by
   condenseq_seqlength_help() from the sequence's start position. */
GtUword gt_condenseq_seqlength(const GtCondenseq *condenseq, GtUword seqnum)
{
  /* Sequence 0 starts at position 0. Every later sequence starts one position
     past the value stored in ssptab for its predecessor (presumably the
     position of the separator between the two -- confirm against ssptab's
     construction). */
  const GtUword seqstart =
    (seqnum == 0) ? 0 : gt_intset_get(condenseq->ssptab, seqnum - 1) + 1;

  return condenseq_seqlength_help(condenseq, seqnum, seqstart);
}
GtUword gt_condenseq_each_redundant_range( GtCondenseq *condenseq, GtUword uid, GtRange urange, GtUword left_extend, GtUword right_extend, GtCondenseqProcessExtractedRange callback, void *callback_data, GtError *err) { int had_err = 0; GtUword num_ranges = (GtUword) 1, linkidx, orig_seqnum, orig_seqstart, orig_seqend; const GtCondenseqUnique *unique; GtRange extract; gt_assert(condenseq != NULL); gt_assert(uid < condenseq->udb_nelems); unique = &condenseq->uniques[uid]; /* handle unique itself */ orig_seqnum = gt_condenseq_pos2seqnum(condenseq, unique->orig_startpos); orig_seqstart = gt_condenseq_seqstartpos(condenseq, orig_seqnum); orig_seqend = orig_seqstart + condenseq_seqlength_help(condenseq, orig_seqnum, orig_seqstart) - 1; extract.start = unique->orig_startpos + urange.start; extract.start = extract.start < left_extend ? 0 : extract.start - left_extend; extract.start = extract.start < orig_seqstart ? orig_seqstart : extract.start; extract.end = unique->orig_startpos + urange.end + right_extend; extract.end = extract.end > orig_seqend ? orig_seqend : extract.end; gt_assert(extract.start <= extract.end); had_err = callback(callback_data, orig_seqnum, extract, err); for (linkidx = 0; !had_err && linkidx < unique->links.nextfreeuint32_t; ++linkidx) { const GtCondenseqLink *link = &condenseq->links[unique->links.spaceuint32_t[linkidx]]; /* the second part is a little heuristic, the len of the link could be completely comprised of insertions, which would place it downstream of urange.start. But we assume ~ similar lengths for links and their unique counterpart */ if (!(urange.end < link->unique_offset || urange.start > link->unique_offset + link->len - 1)) { GtUword shift; orig_seqnum = gt_condenseq_pos2seqnum(condenseq, link->orig_startpos); orig_seqstart = gt_condenseq_seqstartpos(condenseq, orig_seqnum); orig_seqend = orig_seqstart + condenseq_seqlength_help(condenseq, orig_seqnum, orig_seqstart) - 1; extract.start = link->orig_startpos < left_extend ? 
0 : link->orig_startpos - left_extend; if (urange.start < link->unique_offset) { shift = link->unique_offset - urange.start; extract.start = extract.start < shift ? 0 : extract.start - shift; } else { shift = urange.start - link->unique_offset; extract.start += shift; } extract.start = extract.start < orig_seqstart ? orig_seqstart : extract.start; /* see heuristic note above */ extract.end = link->orig_startpos + right_extend + link->len; if (urange.end < link->unique_offset + link->len - 1) { shift = (link->unique_offset + link->len - 1) - urange.end; extract.end = extract.end < shift ? 0 : extract.end - shift; } else { shift = urange.end - (link->unique_offset + link->len - 1); extract.end += shift; } extract.end = extract.end > orig_seqend ? orig_seqend : extract.end; gt_assert(extract.start <= extract.end); had_err = callback(callback_data, orig_seqnum, extract, err); num_ranges++; } } if (!had_err) return num_ranges; return 0; }
/* Writes all unique elements and all links stored in <condenseq> as GFF3
   features to the file "<basefilename>.gff3", where <basefilename> is the
   value of gt_condenseq_basefilename().

   Each feature has type "experimental_feature" and source "Condenseq"; its
   sequence ID is the description of the original sequence containing it and
   its coordinates are 1-based and relative to the start of that sequence.
   Uniques get the attributes Name=unique<idx> and ID=U<idx>. Links get
   Name=link<idx>, ID=L<idx> and Derives_from=U<unique_id>.

   Returns 0 on success. Returns -1 if the output file could not be opened,
   otherwise the first non-zero value returned by gt_genome_node_accept(). */
int gt_condenseq_output_to_gff3(const GtCondenseq *condenseq, GtError *err)
{
  int had_err = 0;
  GtUword idx, name_len, seqnum = 0, seqstart = 0, seqend = 0, desclen;
  GtStr *filename = NULL,
        *id = gt_str_new_cstr("U"),
        *name = gt_str_new_cstr("unique"),
        *parent_unique = gt_str_new_cstr("U"),
        *seqid = gt_str_new(),
        *source = gt_str_new_cstr("Condenseq");
  GtFile *outfile = NULL;
  GtGFF3Visitor *gffv = NULL;
  GtNodeVisitor *nodev = NULL;
  GtFeatureNode *fnode = NULL;
  GtGenomeNode *node = NULL;
  GtRange range;

  gt_assert(condenseq != NULL);

  filename = gt_str_new_cstr(gt_condenseq_basefilename(condenseq));
  name_len = gt_str_length(name);
  gt_str_append_cstr(filename, ".gff3");

  outfile = gt_file_new(gt_str_get(filename), "w", err);
  if (outfile == NULL) {
    /* gt_file_new() returns NULL (and sets err) when the file cannot be
       opened. Carrying on with a NULL handle would not write to the requested
       file while still reporting success, so report the failure instead. */
    had_err = -1;
  }
  else {
    nodev = gt_gff3_visitor_new(outfile);
    gffv = (GtGFF3Visitor *) nodev;
    gt_gff3_visitor_retain_id_attributes(gffv);

    /* A single feature node is reused for all features of one sequence; only
       its attributes and range are updated before it is handed to the
       visitor. This initial node is a placeholder that is replaced as soon as
       the first element is processed. */
    node = gt_feature_node_new(seqid, "experimental_feature",
                               (GtUword) 1, (GtUword) 1, GT_STRAND_BOTH);
    fnode = (GtFeatureNode*) node;
    gt_feature_node_set_source(fnode, source);

    /* Pass 1: unique elements. */
    for (idx = 0; !had_err && idx < condenseq->udb_nelems; ++idx) {
      GtCondenseqUnique uq = condenseq->uniques[idx];

      /* seqend is one past the last position of the current sequence. When
         the element starts at or beyond it, switch to the sequence that
         contains the element and create a fresh node with that sequence's
         description as sequence ID. */
      if (seqend <= uq.orig_startpos) {
        const char *desc;
        gt_genome_node_delete(node);
        seqnum = gt_condenseq_pos2seqnum(condenseq, uq.orig_startpos);
        seqstart = gt_condenseq_seqstartpos(condenseq, seqnum);
        seqend = seqstart +
                 condenseq_seqlength_help(condenseq, seqnum, seqstart);
        desc = gt_condenseq_description(condenseq, &desclen, seqnum);
        gt_str_reset(seqid);
        gt_str_append_cstr_nt(seqid, desc, desclen);
        node = gt_feature_node_new(seqid, "experimental_feature",
                                   (GtUword) 1, (GtUword) 1, GT_STRAND_BOTH);
        fnode = (GtFeatureNode*) node;
        gt_feature_node_set_source(fnode, source);
      }

      /* Cut name and id back to their fixed prefixes ("unique", "U") and
         append the index of the element. */
      gt_str_set_length(name, name_len);
      gt_str_append_uword(name, idx);
      gt_str_set_length(id, (GtUword) 1);
      gt_str_append_uword(id, idx);
      gt_feature_node_set_attribute(fnode, "Name", gt_str_get(name));
      gt_feature_node_set_attribute(fnode, "ID", gt_str_get(id));

      /* 1-based, inclusive coordinates relative to the sequence start. */
      range.start = uq.orig_startpos + 1 - seqstart;
      range.end = uq.orig_startpos + uq.len - seqstart;
      gt_genome_node_set_range(node, &range);
      had_err = gt_genome_node_accept(node, nodev, err);
    }

    /* Pass 2: links. Switch the prefixes to "link" / "L". Resetting seqend to
       0 makes the first link create a fresh node, so no attribute set for a
       unique is carried over. */
    gt_str_reset(name);
    gt_str_append_cstr(name, "link");
    gt_str_reset(id);
    gt_str_append_cstr(id, "L");
    name_len = gt_str_length(name);
    seqend = 0;

    for (idx = 0; !had_err && idx < condenseq->ldb_nelems; ++idx) {
      GtCondenseqLink link = condenseq->links[idx];

      if (seqend <= link.orig_startpos) {
        const char *desc;
        gt_genome_node_delete(node);
        seqnum = gt_condenseq_pos2seqnum(condenseq, link.orig_startpos);
        seqstart = gt_condenseq_seqstartpos(condenseq, seqnum);
        seqend = seqstart +
                 condenseq_seqlength_help(condenseq, seqnum, seqstart);
        desc = gt_condenseq_description(condenseq, &desclen, seqnum);
        gt_str_reset(seqid);
        gt_str_append_cstr_nt(seqid, desc, desclen);
        node = gt_feature_node_new(seqid, "experimental_feature",
                                   (GtUword) 1, (GtUword) 1, GT_STRAND_BOTH);
        fnode = (GtFeatureNode*) node;
        gt_feature_node_set_source(fnode, source);
      }

      gt_str_set_length(name, name_len);
      gt_str_append_uword(name, idx);
      gt_str_set_length(id, (GtUword) 1);
      gt_str_append_uword(id, idx);
      gt_feature_node_set_attribute(fnode, "Name", gt_str_get(name));
      gt_feature_node_set_attribute(fnode, "ID", gt_str_get(id));

      /* Refer to the unique this link points to, using the same "U<idx>"
         scheme as the IDs written in pass 1. */
      gt_str_set_length(parent_unique, (GtUword) 1);
      gt_str_append_uword(parent_unique, link.unique_id);
      gt_feature_node_set_attribute(fnode, "Derives_from",
                                    gt_str_get(parent_unique));

      /* 1-based, inclusive coordinates relative to the sequence start. */
      range.start = link.orig_startpos + 1 - seqstart;
      range.end = link.orig_startpos + link.len - seqstart;
      gt_genome_node_set_range(node, &range);
      had_err = gt_genome_node_accept(node, nodev, err);
    }

    gt_file_delete(outfile);
    gt_genome_node_delete(node);
    gt_node_visitor_delete(nodev);
  }

  gt_str_delete(filename);
  gt_str_delete(id);
  gt_str_delete(name);
  gt_str_delete(parent_unique);
  gt_str_delete(seqid);
  gt_str_delete(source);
  return had_err;
}
/* Returns the length of sequence <seqnum> in <condenseq>: looks up the start
   position of the sequence with gt_condenseq_seqstartpos() and passes it on to
   condenseq_seqlength_help(). */
GtUword gt_condenseq_seqlength(const GtCondenseq *condenseq, GtUword seqnum)
{
  return condenseq_seqlength_help(condenseq, seqnum,
                                  gt_condenseq_seqstartpos(condenseq, seqnum));
}