/* Decodes the complete sequence with number <id> and returns the result of
   gt_condenseq_extract_decoded_range() for it. The number of positions covered
   by that sequence is stored in <*length>. */
const char *gt_condenseq_extract_decoded(GtCondenseq *condenseq,
                                         GtUword *length,
                                         GtUword id)
{
  GtRange seq_range;
  const GtUword first_pos = gt_condenseq_seqstartpos(condenseq, id);
  GtUword last_pos;

  if (id < condenseq->orig_num_seq - 1) {
    /* the start of the following sequence is preceded by a separator, so the
       last position of this sequence lies two positions before it */
    last_pos = gt_condenseq_seqstartpos(condenseq, id + 1) - 2;
  }
  else {
    /* the last sequence extends to the end of the original data */
    last_pos = condenseq->orig_length - 1;
  }

  seq_range.start = first_pos;
  seq_range.end = last_pos;
  *length = last_pos - first_pos + 1;

  return gt_condenseq_extract_decoded_range(condenseq, seq_range, '\0');
}
/* Translates <urange>, given relative to the start of the unique element
   <uid>, into coordinates relative to the start of the sequence that contains
   this unique element. <urange> is modified in place, the number of that
   sequence is returned. */
GtUword gt_condenseq_unique_range_to_seqrange(GtCondenseq *condenseq,
                                              GtUword uid,
                                              GtRange *urange)
{
  const GtCondenseqUnique *unique;
  GtUword containing_seq,
          offset_in_seq;

  gt_assert(condenseq != NULL);
  gt_assert(uid < condenseq->udb_nelems);

  unique = &condenseq->uniques[uid];
  containing_seq = gt_condenseq_pos2seqnum(condenseq, unique->orig_startpos);
  /* distance of the unique element from the start of its sequence */
  offset_in_seq = unique->orig_startpos -
                  gt_condenseq_seqstartpos(condenseq, containing_seq);

  urange->start += offset_in_seq;
  urange->end += offset_in_seq;

  return containing_seq;
}
GtUword gt_condenseq_each_redundant_range( GtCondenseq *condenseq, GtUword uid, GtRange urange, GtUword left_extend, GtUword right_extend, GtCondenseqProcessExtractedRange callback, void *callback_data, GtError *err) { int had_err = 0; GtUword num_ranges = (GtUword) 1, linkidx, orig_seqnum, orig_seqstart, orig_seqend; const GtCondenseqUnique *unique; GtRange extract; gt_assert(condenseq != NULL); gt_assert(uid < condenseq->udb_nelems); unique = &condenseq->uniques[uid]; /* handle unique itself */ orig_seqnum = gt_condenseq_pos2seqnum(condenseq, unique->orig_startpos); orig_seqstart = gt_condenseq_seqstartpos(condenseq, orig_seqnum); orig_seqend = orig_seqstart + condenseq_seqlength_help(condenseq, orig_seqnum, orig_seqstart) - 1; extract.start = unique->orig_startpos + urange.start; extract.start = extract.start < left_extend ? 0 : extract.start - left_extend; extract.start = extract.start < orig_seqstart ? orig_seqstart : extract.start; extract.end = unique->orig_startpos + urange.end + right_extend; extract.end = extract.end > orig_seqend ? orig_seqend : extract.end; gt_assert(extract.start <= extract.end); had_err = callback(callback_data, orig_seqnum, extract, err); for (linkidx = 0; !had_err && linkidx < unique->links.nextfreeuint32_t; ++linkidx) { const GtCondenseqLink *link = &condenseq->links[unique->links.spaceuint32_t[linkidx]]; /* the second part is a little heuristic, the len of the link could be completely comprised of insertions, which would place it downstream of urange.start. But we assume ~ similar lengths for links and their unique counterpart */ if (!(urange.end < link->unique_offset || urange.start > link->unique_offset + link->len - 1)) { GtUword shift; orig_seqnum = gt_condenseq_pos2seqnum(condenseq, link->orig_startpos); orig_seqstart = gt_condenseq_seqstartpos(condenseq, orig_seqnum); orig_seqend = orig_seqstart + condenseq_seqlength_help(condenseq, orig_seqnum, orig_seqstart) - 1; extract.start = link->orig_startpos < left_extend ? 
0 : link->orig_startpos - left_extend; if (urange.start < link->unique_offset) { shift = link->unique_offset - urange.start; extract.start = extract.start < shift ? 0 : extract.start - shift; } else { shift = urange.start - link->unique_offset; extract.start += shift; } extract.start = extract.start < orig_seqstart ? orig_seqstart : extract.start; /* see heuristic note above */ extract.end = link->orig_startpos + right_extend + link->len; if (urange.end < link->unique_offset + link->len - 1) { shift = (link->unique_offset + link->len - 1) - urange.end; extract.end = extract.end < shift ? 0 : extract.end - shift; } else { shift = urange.end - (link->unique_offset + link->len - 1); extract.end += shift; } extract.end = extract.end > orig_seqend ? orig_seqend : extract.end; gt_assert(extract.start <= extract.end); had_err = callback(callback_data, orig_seqnum, extract, err); num_ranges++; } } if (!had_err) return num_ranges; return 0; }
/* Writes all unique and link elements of <condenseq> as GFF3 features of type
   "experimental_feature" to the file "<basefilename>.gff3".
   Unique elements get the ID "U<index>" and the name "unique<index>", links
   get the ID "L<index>", the name "link<index>" and a "Derives_from" attribute
   naming the unique element they refer to. Feature coordinates are 1-based and
   relative to the start of the sequence containing the element, the sequence
   description is used as sequence id.
   Returns 0 on success. If the output file cannot be opened or a node could
   not be processed by the visitor, a non-zero value is returned and <err> is
   set by the failing call. */
int gt_condenseq_output_to_gff3(const GtCondenseq *condenseq, GtError *err)
{
  int had_err = 0;
  GtUword idx,
          name_len,
          seqnum = 0,
          seqstart = 0,
          seqend = 0,
          desclen;
  GtStr *filename = NULL,
        *id = gt_str_new_cstr("U"),
        *name = gt_str_new_cstr("unique"),
        *parent_unique = gt_str_new_cstr("U"),
        *seqid = gt_str_new(),
        *source = gt_str_new_cstr("Condenseq");
  GtFile *outfile = NULL;
  GtGFF3Visitor *gffv = NULL;
  GtNodeVisitor *nodev = NULL;
  GtFeatureNode *fnode = NULL;
  GtGenomeNode *node = NULL;
  GtRange range;

  gt_assert(condenseq != NULL);

  filename = gt_str_new_cstr(gt_condenseq_basefilename(condenseq));
  name_len = gt_str_length(name);
  gt_str_append_cstr(filename, ".gff3");

  outfile = gt_file_new(gt_str_get(filename), "w", err);
  if (outfile == NULL) {
    /* opening failed, <err> was set by gt_file_new(); do not fall through
       and write the features somewhere else */
    had_err = -1;
  }
  else {
    nodev = gt_gff3_visitor_new(outfile);
    gffv = (GtGFF3Visitor *) nodev;
    gt_gff3_visitor_retain_id_attributes(gffv);

    /* initial node, replaced as soon as the first element is processed */
    node = gt_feature_node_new(seqid, "experimental_feature",
                               (GtUword) 1, (GtUword) 1, GT_STRAND_BOTH);
    fnode = (GtFeatureNode*) node;
    gt_feature_node_set_source(fnode, source);
  }

  /* unique elements */
  for (idx = 0; !had_err && idx < condenseq->udb_nelems; ++idx) {
    GtCondenseqUnique uq = condenseq->uniques[idx];

    if (seqend <= uq.orig_startpos) {
      /* element lies beyond the current sequence: start a new node with the
         description of the sequence containing it as sequence id */
      const char *desc;
      gt_genome_node_delete(node);
      seqnum = gt_condenseq_pos2seqnum(condenseq, uq.orig_startpos);
      seqstart = gt_condenseq_seqstartpos(condenseq, seqnum);
      seqend = seqstart +
               condenseq_seqlength_help(condenseq, seqnum, seqstart);
      desc = gt_condenseq_description(condenseq, &desclen, seqnum);
      gt_str_reset(seqid);
      gt_str_append_cstr_nt(seqid, desc, desclen);
      node = gt_feature_node_new(seqid, "experimental_feature",
                                 (GtUword) 1, (GtUword) 1, GT_STRAND_BOTH);
      fnode = (GtFeatureNode*) node;
      gt_feature_node_set_source(fnode, source);
    }

    /* cut name and id back to their prefix and append the current index */
    gt_str_set_length(name, name_len);
    gt_str_append_uword(name, idx);
    gt_str_set_length(id, (GtUword) 1);
    gt_str_append_uword(id, idx);
    gt_feature_node_set_attribute(fnode, "Name", gt_str_get(name));
    gt_feature_node_set_attribute(fnode, "ID", gt_str_get(id));

    /* 1 Based coordinates! */
    range.start = uq.orig_startpos + 1 - seqstart;
    range.end = uq.orig_startpos + uq.len - seqstart;
    gt_genome_node_set_range(node, &range);
    had_err = gt_genome_node_accept(node, nodev, err);
  }

  /* switch prefixes from unique elements to links */
  gt_str_reset(name);
  gt_str_append_cstr(name, "link");
  gt_str_reset(id);
  gt_str_append_cstr(id, "L");
  name_len = gt_str_length(name);
  /* forces a fresh node for the first link */
  seqend = 0;

  /* links */
  for (idx = 0; !had_err && idx < condenseq->ldb_nelems; ++idx) {
    GtCondenseqLink link = condenseq->links[idx];

    if (seqend <= link.orig_startpos) {
      const char *desc;
      gt_genome_node_delete(node);
      seqnum = gt_condenseq_pos2seqnum(condenseq, link.orig_startpos);
      seqstart = gt_condenseq_seqstartpos(condenseq, seqnum);
      seqend = seqstart +
               condenseq_seqlength_help(condenseq, seqnum, seqstart);
      desc = gt_condenseq_description(condenseq, &desclen, seqnum);
      gt_str_reset(seqid);
      gt_str_append_cstr_nt(seqid, desc, desclen);
      node = gt_feature_node_new(seqid, "experimental_feature",
                                 (GtUword) 1, (GtUword) 1, GT_STRAND_BOTH);
      fnode = (GtFeatureNode*) node;
      gt_feature_node_set_source(fnode, source);
    }

    gt_str_set_length(name, name_len);
    gt_str_append_uword(name, idx);
    gt_str_set_length(id, (GtUword) 1);
    gt_str_append_uword(id, idx);
    gt_feature_node_set_attribute(fnode, "Name", gt_str_get(name));
    gt_feature_node_set_attribute(fnode, "ID", gt_str_get(id));

    /* reference to the unique element this link is based on */
    gt_str_set_length(parent_unique, (GtUword) 1);
    gt_str_append_uword(parent_unique, link.unique_id);
    gt_feature_node_set_attribute(fnode, "Derives_from",
                                  gt_str_get(parent_unique));

    /* 1 Based coordinates! */
    range.start = link.orig_startpos + 1 - seqstart;
    range.end = link.orig_startpos + link.len - seqstart;
    gt_genome_node_set_range(node, &range);
    had_err = gt_genome_node_accept(node, nodev, err);
  }

  /* these only exist if the output file could be opened */
  if (outfile != NULL) {
    gt_file_delete(outfile);
    gt_genome_node_delete(node);
    gt_node_visitor_delete(nodev);
  }
  gt_str_delete(filename);
  gt_str_delete(id);
  gt_str_delete(name);
  gt_str_delete(parent_unique);
  gt_str_delete(seqid);
  gt_str_delete(source);

  return had_err;
}
/* Runner of the extraction tool: loads the condenseq archive named by
   <argv[parsed_args]> and writes the requested part of it to
   <arguments->outfp>.
   - If a position range is given, or sequences are selected while the mode is
     "concat", the corresponding positions are written as one stream of
     characters in which sequences are divided by the first character of
     <arguments->sepchar>; a newline is inserted every <arguments->width>
     characters if the width is not 0.
   - Otherwise the selected sequences (all of them if nothing was selected) are
     written one by one as FASTA entries.
   Returns 0 on success and -1, with <err> set, if the archive cannot be loaded
   or a requested range exceeds the available data. */
static int gt_condenseq_extract_runner(GT_UNUSED int argc,
                                       const char **argv,
                                       int parsed_args,
                                       void *tool_arguments,
                                       GtError *err)
{
  int had_err = 0;
  GtCondenserExtractArguments *arguments = tool_arguments;
  GtCondenseq *condenseq = NULL;
  GtLogger *logger = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);

  condenseq = gt_condenseq_new_from_file(argv[parsed_args], logger, err);
  if (condenseq == NULL) {
    had_err = -1;
  }

  if (!had_err) {
    const char *buffer = NULL;
    const char *desc = NULL;
    GtUword desclen,
            seqlen,
            rend = gt_condenseq_total_length(condenseq),
            send = gt_condenseq_num_of_sequences(condenseq);
    bool concat = strcmp(gt_str_get(arguments->mode), "concat") == 0;

    /* single sequence to extract = range of length 1 */
    if (arguments->seq != GT_UNDEF_UWORD) {
      arguments->seqrange.start = arguments->seqrange.end = arguments->seq;
    }

    /* no range given at all: extract all seqs */
    if (arguments->range.start == GT_UNDEF_UWORD &&
        arguments->seqrange.start == GT_UNDEF_UWORD) {
      arguments->seqrange.start = 0;
      arguments->seqrange.end = send - 1;
    }

    /* if seqs are specified, and concat is given, switch to posrange */
    if (concat && arguments->seqrange.start != GT_UNDEF_UWORD) {
      if (arguments->seqrange.end >= send) {
        had_err = -1;
        gt_error_set(err, "range end " GT_WU " excedes number of sequences "
                     GT_WU " (ranges are zero based sequence ids)",
                     arguments->seqrange.end, send);
      }
      else {
        arguments->range.start =
          gt_condenseq_seqstartpos(condenseq, arguments->seqrange.start);
        arguments->range.end =
          gt_condenseq_seqstartpos(condenseq, arguments->seqrange.end) +
          gt_condenseq_seqlength(condenseq, arguments->seqrange.end) - 1;
      }
    }

    /* extract sequence region */
    if (!had_err && arguments->range.start != GT_UNDEF_UWORD) {
      /* upper bound for the number of characters decoded in one go */
      const GtUword maxbuffsize = ((GtUword) 1) << 17; /* ~ 100000byte */
      GtUword clen,
              rstart,
              current_length = 0,
              i;
      const char sepchar = gt_str_get(arguments->sepchar)[0];

      if (arguments->range.end >= rend) {
        had_err = -1;
        gt_error_set(err, "range end " GT_WU " excedes length of sequence "
                     GT_WU " (ranges are zero based positions)",
                     arguments->range.end, rend);
      }
      if (!had_err) {
        rstart = arguments->range.start;
        rend = arguments->range.end;
        while (rstart <= rend) {
          GtRange cur_range;
          if (rend - rstart > maxbuffsize) {
            GtUword seqnum = gt_condenseq_pos2seqnum(condenseq,
                                                     rstart + maxbuffsize),
                    chunk_seqstart = gt_condenseq_seqstartpos(condenseq,
                                                              seqnum);
            if (chunk_seqstart > rstart + 1) {
              /* end this chunk on the separator in front of the sequence
                 that contains the end of the window */
              clen = chunk_seqstart - rstart;
            }
            else {
              /* that sequence starts at or before rstart, so there is no
                 separator inside the window to cut at (sequence longer than
                 maxbuffsize); take a chunk of fixed size instead of running
                 into an unsigned wrap-around */
              clen = maxbuffsize;
            }
          }
          else {
            /* remainder fits into a single chunk */
            clen = rend - rstart + 1;
          }

          cur_range.start = rstart;
          cur_range.end = rstart + clen - 1;
          buffer = gt_condenseq_extract_decoded_range(condenseq, cur_range,
                                                      sepchar);
          gt_assert(buffer != NULL);
          for (i = 0; i < clen; i++, current_length++) {
            /* line wrapping is carried over chunk borders */
            if (arguments->width && current_length == arguments->width) {
              gt_file_xfputc('\n', arguments->outfp);
              current_length = 0;
            }
            gt_file_xfputc(buffer[i], arguments->outfp);
          }
          rstart += clen;
        }
        gt_file_xfputc('\n', arguments->outfp);
      }
    }
    else if (!had_err) { /* extract seqwise and always fasta */
      GtUword seqnum,
              sstart = arguments->seqrange.start;

      if (arguments->seqrange.end >= send) {
        had_err = -1;
        gt_error_set(err, "range end " GT_WU " excedes number of sequences "
                     GT_WU " (ranges are zero based sequence ids)",
                     arguments->seqrange.end, send);
      }
      send = arguments->seqrange.end;
      for (seqnum = sstart; !had_err && seqnum <= send; ++seqnum) {
        buffer = gt_condenseq_extract_decoded(condenseq, &seqlen, seqnum);
        desc = gt_condenseq_description(condenseq, &desclen, seqnum);
        gt_fasta_show_entry_nt(desc, desclen, buffer, seqlen,
                               arguments->width, arguments->outfp);
      }
    }
  }

  gt_condenseq_delete(condenseq);
  gt_logger_delete(logger);
  return had_err;
}
/* Returns the length of the sequence with number <seqnum>: looks up the start
   position of that sequence and passes it on to condenseq_seqlength_help(). */
GtUword gt_condenseq_seqlength(const GtCondenseq *condenseq, GtUword seqnum)
{
  return condenseq_seqlength_help(condenseq,
                                  seqnum,
                                  gt_condenseq_seqstartpos(condenseq, seqnum));
}