void gt_gff3_output_leading_str(GtFeatureNode *fn, GtStr *outstr) { GtGenomeNode *gn; gt_assert(fn && outstr); gn = (GtGenomeNode*) fn; gt_str_append_str(outstr, gt_genome_node_get_seqid(gn)); gt_str_append_char(outstr, '\t'); gt_str_append_cstr(outstr, gt_feature_node_get_source(fn)); gt_str_append_char(outstr, '\t'); gt_str_append_cstr(outstr, gt_feature_node_get_type(fn)); gt_str_append_char(outstr, '\t'); gt_str_append_uword(outstr, gt_genome_node_get_start(gn)); gt_str_append_char(outstr, '\t'); gt_str_append_uword(outstr, gt_genome_node_get_end(gn)); gt_str_append_char(outstr, '\t'); if (gt_feature_node_score_is_defined(fn)) { char buf[BUFSIZ]; (void) snprintf(buf, BUFSIZ, "%.3g", gt_feature_node_get_score(fn)); gt_str_append_cstr(outstr, buf); } else gt_str_append_char(outstr, '.'); gt_str_append_char(outstr, '\t'); gt_str_append_char(outstr, GT_STRAND_CHARS[gt_feature_node_get_strand(fn)]); gt_str_append_char(outstr, '\t'); gt_str_append_char(outstr, GT_PHASE_CHARS[gt_feature_node_get_phase(fn)]); gt_str_append_char(outstr, '\t'); }
static void set_gff3_target_attribute(GthSA *sa, bool md5ids) { gt_assert(sa && !sa->gff3_target_attribute); sa->gff3_target_attribute = gt_str_new(); if (md5ids) { gt_assert(sa->ref_md5); gt_str_append_cstr(sa->gff3_target_attribute, GT_MD5_SEQID_PREFIX); gt_str_append_str(sa->gff3_target_attribute, sa->ref_md5); gt_str_append_char(sa->gff3_target_attribute, ':'); } gt_gff3_escape(sa->gff3_target_attribute, gt_str_get(sa->ref_id), gt_str_length(sa->ref_id)); gt_str_append_char(sa->gff3_target_attribute, ' '); gt_str_append_uword(sa->gff3_target_attribute, gth_sa_referencecutoff_start(sa) + 1); /* XXX: use reference dpstartpos */ gt_str_append_char(sa->gff3_target_attribute, ' '); gt_str_append_uword(sa->gff3_target_attribute, gth_sa_ref_total_length(sa) - /* XXX */ gth_sa_referencecutoff_end(sa)); gt_str_append_char(sa->gff3_target_attribute, ' '); if (sa->ref_strand_forward) { gt_str_append_char(sa->gff3_target_attribute, GT_STRAND_CHARS[GT_STRAND_FORWARD]); } else { gt_str_append_char(sa->gff3_target_attribute, GT_STRAND_CHARS[GT_STRAND_REVERSE]); } }
static void show_parse_file_status(GthShowVerbose showverbose, GtUword filenum, GtUword numoffiles, const char *filename) { GtStr *buf = gt_str_new(); gt_str_append_cstr(buf, "process file "); gt_str_append_uword(buf, filenum + 1); gt_str_append_char(buf, GT_PATH_SEPARATOR); gt_str_append_uword(buf, numoffiles); gt_str_append_cstr(buf, ": "); gt_str_append_cstr(buf, filename); showverbose(gt_str_get(buf)); gt_str_delete(buf); }
static void write_model(GtStr *str, const char *model_cstr, const GthBSSMModel *model) { GtUword i, j, k, l; gt_assert(str && model_cstr && model); gt_assert(model->hypothesis_num == 2 || model->hypothesis_num == 7); gt_str_append_cstr(str, " "); gt_str_append_cstr(str, model_cstr); gt_str_append_cstr(str, " = {\n"); gt_str_append_cstr(str, " hypothesis_num = "); gt_str_append_uword(str, model->hypothesis_num); gt_str_append_cstr(str, ",\n"); gt_str_append_cstr(str, " window_size_left = "); gt_str_append_uword(str, model->window_size_left); gt_str_append_cstr(str, ",\n"); gt_str_append_cstr(str, " window_size_right = "); gt_str_append_uword(str, model->window_size_right); gt_str_append_cstr(str, ",\n"); for (i = 0; i < model->hypothesis_num; i++) { gt_str_append_cstr(str, " {\n"); for (j = 0; j < WINSIZE + 2; j++) { gt_str_append_cstr(str, " {\n"); for (k = 0; k < 4; k++) { gt_str_append_cstr(str, " { "); for (l = 0; l < 4; l++) { if (l) gt_str_append_cstr(str, ", "); if (model->hypothesis_num == 2) { gt_str_append_double(str, model->hypotables.hypo2table[i][j][k][l], BSSM_PRECISION); } else { gt_str_append_double(str, model->hypotables.hypo7table[i][j][k][l], BSSM_PRECISION); } } gt_str_append_cstr(str, " },\n"); } gt_str_append_cstr(str, " },\n"); } gt_str_append_cstr(str, " },\n"); } gt_str_append_cstr(str, " }"); }
static void remove_bioseq_files(GtBioseq *bs) { GtStr *base = gt_str_new_cstr("stdin."); gt_str_append_uword(base, (GtUword) bs); remove_indexfile(GT_ENCSEQFILESUFFIX, gt_str_get(base)); remove_indexfile(GT_DESTABFILESUFFIX, gt_str_get(base)); remove_indexfile(GT_SSPTABFILESUFFIX, gt_str_get(base)); remove_indexfile(GT_SDSTABFILESUFFIX, gt_str_get(base)); remove_indexfile(GT_MD5TABFILESUFFIX, gt_str_get(base)); remove_indexfile(GT_OISTABFILESUFFIX, gt_str_get(base)); gt_str_delete(base); }
int gt_condenseq_output_to_gff3(const GtCondenseq *condenseq, GtError *err) { int had_err = 0; GtUword idx, name_len, seqnum = 0, seqstart = 0, seqend = 0, desclen; GtStr *filename = NULL, *id = gt_str_new_cstr("U"), *name = gt_str_new_cstr("unique"), *parent_unique = gt_str_new_cstr("U"), *seqid = gt_str_new(), *source = gt_str_new_cstr("Condenseq"); GtFile *outfile = NULL; GtGFF3Visitor *gffv = NULL; GtNodeVisitor *nodev = NULL; GtFeatureNode *fnode = NULL; GtGenomeNode *node = NULL; GtRange range; gt_assert(condenseq != NULL); filename = gt_str_new_cstr(gt_condenseq_basefilename(condenseq)); name_len = gt_str_length(name); gt_str_append_cstr(filename, ".gff3"); outfile = gt_file_new(gt_str_get(filename), "w", err); nodev = gt_gff3_visitor_new(outfile); gffv = (GtGFF3Visitor *) nodev; gt_gff3_visitor_retain_id_attributes(gffv); node = gt_feature_node_new(seqid, "experimental_feature", (GtUword) 1, (GtUword) 1, GT_STRAND_BOTH); fnode = (GtFeatureNode*) node; gt_feature_node_set_source(fnode, source); for (idx = 0; !had_err && idx < condenseq->udb_nelems; ++idx) { GtCondenseqUnique uq = condenseq->uniques[idx]; if (seqend <= uq.orig_startpos) { const char *desc; gt_genome_node_delete(node); seqnum = gt_condenseq_pos2seqnum(condenseq, uq.orig_startpos); seqstart = gt_condenseq_seqstartpos(condenseq, seqnum); seqend = seqstart + condenseq_seqlength_help(condenseq, seqnum, seqstart); desc = gt_condenseq_description(condenseq, &desclen, seqnum); gt_str_reset(seqid); gt_str_append_cstr_nt(seqid, desc, desclen); node = gt_feature_node_new(seqid, "experimental_feature", (GtUword) 1, (GtUword) 1, GT_STRAND_BOTH); fnode = (GtFeatureNode*) node; gt_feature_node_set_source(fnode, source); } gt_str_set_length(name, name_len); gt_str_append_uword(name, idx); gt_str_set_length(id, (GtUword) 1); gt_str_append_uword(id, idx); gt_feature_node_set_attribute(fnode, "Name", gt_str_get(name)); gt_feature_node_set_attribute(fnode, "ID", gt_str_get(id)); /* 1 Based coordinates! */ range.start = uq.orig_startpos + 1 - seqstart; range.end = uq.orig_startpos + uq.len - seqstart; gt_genome_node_set_range(node, &range); had_err = gt_genome_node_accept(node, nodev, err); } gt_str_reset(name); gt_str_append_cstr(name, "link"); gt_str_reset(id); gt_str_append_cstr(id, "L"); name_len = gt_str_length(name); seqend = 0; for (idx = 0; !had_err && idx < condenseq->ldb_nelems; ++idx) { GtCondenseqLink link = condenseq->links[idx]; if (seqend <= link.orig_startpos) { const char *desc; gt_genome_node_delete(node); seqnum = gt_condenseq_pos2seqnum(condenseq, link.orig_startpos); seqstart = gt_condenseq_seqstartpos(condenseq, seqnum); seqend = seqstart + condenseq_seqlength_help(condenseq, seqnum, seqstart); desc = gt_condenseq_description(condenseq, &desclen, seqnum); gt_str_reset(seqid); gt_str_append_cstr_nt(seqid, desc, desclen); node = gt_feature_node_new(seqid, "experimental_feature", (GtUword) 1, (GtUword) 1, GT_STRAND_BOTH); fnode = (GtFeatureNode*) node; gt_feature_node_set_source(fnode, source); } gt_str_set_length(name, name_len); gt_str_append_uword(name, idx); gt_str_set_length(id, (GtUword) 1); gt_str_append_uword(id, idx); gt_feature_node_set_attribute(fnode, "Name", gt_str_get(name)); gt_feature_node_set_attribute(fnode, "ID", gt_str_get(id)); gt_str_set_length(parent_unique, (GtUword) 1); gt_str_append_uword(parent_unique, link.unique_id); gt_feature_node_set_attribute(fnode, "Derives_from", gt_str_get(parent_unique)); /* 1 Based coordinates! */ range.start = link.orig_startpos + 1 - seqstart; range.end = link.orig_startpos + link.len - seqstart; gt_genome_node_set_range(node, &range); had_err = gt_genome_node_accept(node, nodev, err); } gt_file_delete(outfile); gt_genome_node_delete(node); gt_node_visitor_delete(nodev); gt_str_delete(filename); gt_str_delete(id); gt_str_delete(name); gt_str_delete(parent_unique); gt_str_delete(seqid); gt_str_delete(source); return had_err; }
static void make_sequence_region(GtHashmap *sequence_regions, GtStr *sequenceid, GthRegionFactory *srf, GthInput *input, GtUword filenum, GtUword seqnum) { GtUword offset_is_defined = false; GtRange range, descrange; GtGenomeNode *sr = NULL; gt_assert(sequence_regions && sequenceid && srf && input); if (gth_input_use_substring_spec(input)) { range.start = gth_input_genomic_substring_from(input); range.end = gth_input_genomic_substring_to(input); } else { range = gth_input_get_relative_genomic_range(input, filenum, seqnum); } if (srf->use_desc_ranges) { GtStr *description = gt_str_new(); gth_input_get_genomic_description(input, description, filenum, seqnum); if (!gt_parse_description_range(gt_str_get(description), &descrange)) offset_is_defined = true; gt_str_delete(description); } if (offset_is_defined) range = gt_range_offset(&range, descrange.start); else range = gt_range_offset(&range, 1); /* 1-based */ if (!gt_str_length(sequenceid) || (gt_cstr_table_get(srf->used_seqids, gt_str_get(sequenceid)) && !offset_is_defined)) { /* sequenceid is empty or exists already (and no offset has been parsed) -> make one up */ GtStr *seqid; char *base; base = gt_basename(gth_input_get_genomic_filename(input, filenum)); seqid = gt_str_new_cstr(base); gt_free(base); gt_str_append_char(seqid, '|'); gt_str_append_uword(seqid, seqnum + 1); /* 1-based */ seqid_store_add(srf->seqid_store, filenum, seqnum, seqid, GT_UNDEF_UWORD); gt_assert(!gt_cstr_table_get(srf->used_seqids, gt_str_get(seqid))); gt_cstr_table_add(srf->used_seqids, gt_str_get(seqid)); sr = gt_region_node_new(seqid_store_get(srf->seqid_store, filenum, seqnum), range.start, range.end); gt_hashmap_add(sequence_regions, (void*) gt_cstr_table_get(srf->used_seqids, gt_str_get(seqid)), sr); gt_str_delete(seqid); } else { /* sequenceid does not exists already (or an offset has been parsed) -> use this one */ if (!gt_cstr_table_get(srf->used_seqids, gt_str_get(sequenceid))) { /* no sequence region with this id exists -> create one */ gt_cstr_table_add(srf->used_seqids, gt_str_get(sequenceid)); seqid_store_add(srf->seqid_store, filenum, seqnum, sequenceid, offset_is_defined ? descrange.start : GT_UNDEF_UWORD); sr = gt_region_node_new(seqid_store_get(srf->seqid_store, filenum, seqnum), range.start, range.end); gt_hashmap_add(sequence_regions, (void*) gt_cstr_table_get(srf->used_seqids, gt_str_get(sequenceid)), sr); } else { GtRange prev_range, new_range; /* sequence region with this id exists already -> modify range */ sr = gt_hashmap_get(sequence_regions, gt_str_get(sequenceid)); gt_assert(sr); prev_range = gt_genome_node_get_range(sr); new_range = gt_range_join(&prev_range, &range); gt_genome_node_set_range(sr, &new_range); seqid_store_add(srf->seqid_store, filenum, seqnum, sequenceid, offset_is_defined ? descrange.start : GT_UNDEF_UWORD); } } gt_assert(sr); }
static int hmmsearch_process_coarse_hits( char *table_filename, GtCondenseq *ces, GtCondenseqHmmsearchArguments *arguments, GtLogger *logger, GtError *err) { int had_err = 0; GtStr *line = gt_str_new(); FILE *table = NULL; GtSplitter *splitter = gt_splitter_new(); GtStr *query = gt_str_new(), *fine_fasta_filename = gt_str_new_cstr("condenseq"); GtRBTree *sequences = NULL; GtUword filecount = (GtUword) 1; unsigned int querycount = 0; const GtUword fine_fasta_name_length = gt_str_length(fine_fasta_filename); const GtUword table_name_length = gt_str_length(arguments->outtable_filename); table = gt_xfopen(table_filename, "r"); sequences = gt_rbtree_new(hmmsearch_cmp_seqnum, hmmsearch_tree_free_node, NULL); while (!had_err && gt_str_read_next_line(line, table) == 0) { char *c_line = gt_str_get(line); GtUword uid; const GtUword target_column = 0, query_column = (GtUword) 3; if (c_line[0] != '#') { gt_splitter_split_non_empty(splitter, c_line, gt_str_length(line), ' '); gt_assert(gt_splitter_size(splitter) == (GtUword) 23); if (sscanf(gt_splitter_get_token(splitter, target_column), GT_WU, &uid) != 1) { gt_error_set(err, "couldn't parse target number: %s", gt_splitter_get_token(splitter, target_column)); had_err = -1; } if (gt_str_length(query) == 0 || strcmp(gt_str_get(query), gt_splitter_get_token(splitter, query_column)) != 0) { gt_str_set(query, gt_splitter_get_token(splitter, query_column)); gt_logger_log(logger, "new query: %s", gt_str_get(query)); querycount++; } if (!had_err && querycount == arguments->max_queries) { hmmsearch_create_fine_fas(fine_fasta_filename, sequences, ces); if (table_name_length != 0) gt_str_append_uword(arguments->outtable_filename, filecount++); had_err = hmmsearch_call_fine_search(table_name_length != 0 ? arguments->outtable_filename : NULL, gt_str_get(fine_fasta_filename), gt_str_get(arguments->hmmsearch_path), gt_str_get(arguments->hmm), logger, err); gt_rbtree_clear(sequences); gt_str_set_length(fine_fasta_filename, fine_fasta_name_length); if (table_name_length != 0) gt_str_set_length(arguments->outtable_filename, table_name_length); querycount = 0; } if (!had_err) { if (gt_condenseq_each_redundant_seq(ces, uid, hmmsearch_process_seq, sequences, err) == 0) { had_err = -1; } } gt_splitter_reset(splitter); } gt_str_reset(line); } gt_splitter_delete(splitter); gt_str_delete(line); gt_str_delete(query); gt_xfclose(table); if (!had_err) { hmmsearch_create_fine_fas(fine_fasta_filename, sequences, ces); if (table_name_length != 0) gt_str_append_uword(arguments->outtable_filename, filecount++); had_err = hmmsearch_call_fine_search(table_name_length != 0 ? arguments->outtable_filename : NULL, gt_str_get(fine_fasta_filename), gt_str_get(arguments->hmmsearch_path), gt_str_get(arguments->hmm), logger, err); } gt_log_log("created " GT_WU " files", filecount); gt_rbtree_delete(sequences); gt_str_delete(fine_fasta_filename); return had_err; }
/* Formats a given position number for short display in the ruler. */ void gt_format_ruler_label(char *txt, GtWord pos, const char *unitstr, size_t buflen) { double fpos; int logval; GtStr *formatstring; GtUword upos; gt_assert(txt); bool negative = false; if (pos < 0) { upos = (GtUword)-pos; negative = true; formatstring = gt_str_new_cstr("-%."); } else { upos = (GtUword)pos; formatstring = gt_str_new_cstr("%."); } logval = (int) floor(log10(upos)); if (upos >= 1000000000) { fpos = (double) upos / 1000000000; while (upos % 10 == 0) { upos /= 10; logval--; } /*@ignore@*/ gt_str_append_uword(formatstring, (GtUword) logval); gt_str_append_cstr(formatstring, "fG%s"); (void) snprintf(txt, buflen, gt_str_get(formatstring), fpos, unitstr); /*@end@*/ } else if (upos >= 1000000) { fpos = (double) upos / 1000000; while (upos % 10 == 0) { upos /= 10; logval--; } /*@ignore@*/ gt_str_append_uword(formatstring, (GtUword) logval); gt_str_append_cstr(formatstring, "fM%s"); (void) snprintf(txt, buflen, gt_str_get(formatstring), fpos, unitstr); /*@end@*/ } else if (upos >= 1000) { fpos = (double) upos / 1000; while (upos % 10 == 0) { upos /= 10; logval--; } /*@ignore@*/ gt_str_append_uword(formatstring, (GtUword) logval); gt_str_append_cstr(formatstring, "fk%s"); (void) snprintf(txt, buflen, gt_str_get(formatstring), fpos, unitstr); /*@end@*/ } else { /*@ignore@*/ (void) snprintf(txt, buflen, " %s"GT_WU"%s", negative ? "-" : "", upos, unitstr); /*@end@*/ } gt_str_delete(formatstring); }
static int bioseq_fill(GtBioseq *bs, bool recreate, GtError *err) { GtStr *bioseq_index_file = NULL, *bioseq_ois_file = NULL, *bioseq_sds_file = NULL, *bioseq_md5_file = NULL, *bioseq_des_file = NULL; int had_err = 0; GtStr *bioseq_basename; gt_assert(!bs->encseq); if (bs->use_stdin) { bioseq_basename = gt_str_new_cstr("stdin."); /* assign a unique name */ gt_str_append_uword(bioseq_basename, (GtUword) bs); } else bioseq_basename = bs->sequence_file; /* construct file names */ bioseq_index_file = gt_str_clone(bioseq_basename); gt_str_append_cstr(bioseq_index_file, GT_ENCSEQFILESUFFIX); bioseq_ois_file = gt_str_clone(bioseq_basename); gt_str_append_cstr(bioseq_ois_file, GT_OISTABFILESUFFIX); bioseq_sds_file = gt_str_clone(bioseq_basename); gt_str_append_cstr(bioseq_sds_file, GT_SDSTABFILESUFFIX); bioseq_md5_file = gt_str_clone(bioseq_basename); gt_str_append_cstr(bioseq_md5_file, GT_MD5TABFILESUFFIX); bioseq_des_file = gt_str_clone(bioseq_basename); gt_str_append_cstr(bioseq_des_file, GT_DESTABFILESUFFIX); /* construct the bioseq files if necessary */ if (recreate || bs->use_stdin || !gt_file_exists(gt_str_get(bioseq_index_file)) || !gt_file_exists(gt_str_get(bioseq_ois_file)) || !gt_file_exists(gt_str_get(bioseq_sds_file)) || !gt_file_exists(gt_str_get(bioseq_md5_file)) || !gt_file_exists(gt_str_get(bioseq_des_file)) || gt_file_is_newer(gt_str_get(bs->sequence_file), gt_str_get(bioseq_index_file))) { had_err = construct_bioseq_files(bs, bioseq_basename, err); } if (!had_err) { GtEncseqLoader *el = gt_encseq_loader_new(); gt_encseq_loader_disable_autosupport(el); gt_encseq_loader_require_lossless_support(el); gt_encseq_loader_require_description_support(el); gt_encseq_loader_require_md5_support(el); gt_encseq_loader_require_multiseq_support(el); bs->encseq = gt_encseq_loader_load(el, gt_str_get(bioseq_basename), err); if (bs->encseq == NULL) { had_err = -1; gt_assert(gt_error_is_set(err)); } gt_encseq_loader_delete(el); } if (!had_err) { gt_assert(bs->encseq); } /* free */ if (bs->use_stdin) gt_str_delete(bioseq_basename); gt_str_delete(bioseq_index_file); gt_str_delete(bioseq_ois_file); gt_str_delete(bioseq_md5_file); gt_str_delete(bioseq_sds_file); gt_str_delete(bioseq_des_file); return had_err; }