/*
 * Tool runner for LTRdigest.
 *
 * Builds a region mapping for the sequence source (either via the seqid2file
 * options or, as a legacy fallback, from an encoded sequence named by the last
 * command line argument), then chains node streams in this order:
 *   sorted GFF3 input -> [protein domain visitor] -> [PBS visitor]
 *   -> PPT visitor -> strand assignment visitor -> [tabular file output]
 *   -> GFF3 output
 * and finally pulls all nodes through the chain.
 *
 * argc/argv       command line; argv[parsed_args] is used as the GFF3 input
 * parsed_args     index of the first non-option argument
 * tool_arguments  a GtLTRdigestOptions object
 * err             receives the error message on failure
 *
 * Returns 0 on success and a non-zero value on failure.
 */
static int gt_ltrdigest_runner(GT_UNUSED int argc, const char **argv,
                               int parsed_args, void *tool_arguments,
                               GtError *err)
{
  GtLTRdigestOptions *arguments = tool_arguments;
  GtNodeStream *gff3_in_stream = NULL,
               *gff3_out_stream = NULL,
               *pdom_stream = NULL,
               *ppt_stream = NULL,
               *pbs_stream = NULL,
               *tab_out_stream = NULL,
               *sa_stream = NULL,
               *last_stream = NULL;
  int had_err = 0,
      tests_to_run = 0,
      arg = parsed_args;
  GtRegionMapping *rmap = NULL;
  GtPdomModelSet *ms = NULL;
  gt_error_check(err);
  gt_assert(arguments);

  /* determine and open sequence source */
  if (gt_seqid2file_option_used(arguments->s2fi)) {
    /* create region mapping */
    rmap = gt_seqid2file_region_mapping_new(arguments->s2fi, err);
    if (!rmap)
      had_err = -1;
  } else {
    GtEncseqLoader *el;
    GtEncseq *encseq;
    /* no new-style sequence source option given, fall back to legacy syntax */
    if (argc < 3) {
      gt_error_set(err, "missing mandatory argument(s)");
      had_err = -1;
    }
    if (!had_err) {
      el = gt_encseq_loader_new();
      gt_encseq_loader_disable_autosupport(el);
      gt_encseq_loader_require_md5_support(el);
      gt_encseq_loader_require_description_support(el);
      encseq = gt_encseq_loader_load(el, argv[argc-1], err);
      /* XXX: clip off terminal argument */
      gt_free((char*) argv[argc-1]);
      argv[argc-1] = NULL;
      argc--;
      gt_encseq_loader_delete(el);
      if (!encseq)
        had_err = -1;
      else {
        rmap = gt_region_mapping_new_encseq_seqno(encseq);
        gt_encseq_delete(encseq);
      }
    }
  }
  gt_assert(had_err || rmap);

  /* Always search for PPT. */
  tests_to_run |= GT_LTRDIGEST_RUN_PPT;

  /* Open tRNA library if given. */
  if (!had_err && arguments->trna_lib
        && gt_str_length(arguments->trna_lib) > 0) {
    tests_to_run |= GT_LTRDIGEST_RUN_PBS;
    arguments->trna_lib_bs = gt_bioseq_new(gt_str_get(arguments->trna_lib),
                                           err);
    if (gt_error_is_set(err))
      had_err = -1;
  }

  /* Set HMMER cutoffs. */
  if (!had_err && gt_str_array_size(arguments->hmm_files) > 0) {
    tests_to_run |= GT_LTRDIGEST_RUN_PDOM;
    if (!strcmp(gt_str_get(arguments->cutoffs), "GA")) {
      arguments->cutoff = GT_PHMM_CUTOFF_GA;
    } else if (!strcmp(gt_str_get(arguments->cutoffs), "TC")) {
      arguments->cutoff = GT_PHMM_CUTOFF_TC;
    } else if (!strcmp(gt_str_get(arguments->cutoffs), "NONE")) {
      arguments->cutoff = GT_PHMM_CUTOFF_NONE;
    } else {
      gt_error_set(err, "invalid cutoff setting!");
      had_err = -1;
    }
  }

  /* head of the stream chain: sorted GFF3 input */
  if (!had_err) {
    last_stream = gff3_in_stream = gt_gff3_in_stream_new_sorted(argv[arg]);
  }

  /* optional protein domain search */
  if (!had_err && gt_str_array_size(arguments->hmm_files) > 0) {
    GtNodeVisitor *pdom_v;
    ms = gt_pdom_model_set_new(arguments->hmm_files, err);
    if (ms != NULL) {
      pdom_v = gt_ltrdigest_pdom_visitor_new(ms, arguments->evalue_cutoff,
                                             arguments->chain_max_gap_length,
                                             arguments->cutoff, rmap, err);
      if (pdom_v == NULL)
        had_err = -1;
      if (!had_err) {
        if (arguments->output_all_chains)
          gt_ltrdigest_pdom_visitor_output_all_chains(
                                          (GtLTRdigestPdomVisitor*) pdom_v);
        last_stream = pdom_stream = gt_visitor_stream_new(last_stream, pdom_v);
      }
    } else
      had_err = -1;
  }

  /* optional PBS search, only if a tRNA library was loaded */
  if (!had_err && arguments->trna_lib_bs) {
    GtNodeVisitor *pbs_v;
    pbs_v = gt_ltrdigest_pbs_visitor_new(rmap, arguments->pbs_radius,
                                         arguments->max_edist,
                                         arguments->alilen,
                                         arguments->offsetlen,
                                         arguments->trnaoffsetlen,
                                         arguments->ali_score_match,
                                         arguments->ali_score_mismatch,
                                         arguments->ali_score_insertion,
                                         arguments->ali_score_deletion,
                                         arguments->trna_lib_bs, err);
    if (pbs_v != NULL)
      last_stream = pbs_stream = gt_visitor_stream_new(last_stream, pbs_v);
    else
      had_err = -1;
  }

  /* PPT search (always run) */
  if (!had_err) {
    GtNodeVisitor *ppt_v;
    ppt_v = gt_ltrdigest_ppt_visitor_new(rmap, arguments->ppt_len,
                                         arguments->ubox_len,
                                         arguments->ppt_pyrimidine_prob,
                                         arguments->ppt_purine_prob,
                                         arguments->bkg_a_prob,
                                         arguments->bkg_g_prob,
                                         arguments->bkg_t_prob,
                                         arguments->bkg_c_prob,
                                         arguments->ubox_u_prob,
                                         arguments->ppt_radius,
                                         arguments->max_ubox_dist, err);
    if (ppt_v != NULL)
      last_stream = ppt_stream = gt_visitor_stream_new(last_stream, ppt_v);
    else
      had_err = -1;
  }

  /* strand assignment */
  if (!had_err) {
    GtNodeVisitor *sa_v;
    sa_v = gt_ltrdigest_strand_assign_visitor_new();
    gt_assert(sa_v);
    last_stream = sa_stream = gt_visitor_stream_new(last_stream, sa_v);
  }

  if (!had_err) {
    /* attach tabular output stream, if requested */
    if (gt_str_length(arguments->prefix) > 0) {
      tab_out_stream = gt_ltrdigest_file_out_stream_new(
                                                last_stream, tests_to_run,
                                                rmap,
                                                gt_str_get(arguments->prefix),
                                                arguments->seqnamelen, err);
      /* only extend the chain if the stream really exists, so that
         last_stream never becomes NULL */
      if (!tab_out_stream)
        had_err = -1;
      else
        last_stream = tab_out_stream;
      if (!had_err && arguments->print_metadata) {
        had_err = gt_ltrdigest_file_out_stream_write_metadata(
                                  (GtLTRdigestFileOutStream*) tab_out_stream,
                                  tests_to_run,
                                  gt_str_get(arguments->trna_lib),
                                  argv[arg],
                                  arguments->ppt_len,
                                  arguments->ubox_len,
                                  arguments->ppt_radius,
                                  arguments->alilen,
                                  arguments->max_edist,
                                  arguments->offsetlen,
                                  arguments->trnaoffsetlen,
                                  arguments->pbs_radius,
                                  arguments->hmm_files,
                                  arguments->chain_max_gap_length,
                                  arguments->evalue_cutoff,
                                  err);
      }
      if (!had_err) {
        if (arguments->write_alignments)
          gt_ltrdigest_file_out_stream_enable_pdom_alignment_output(
                                                               tab_out_stream);
        if (arguments->write_aaseqs)
          gt_ltrdigest_file_out_stream_enable_aa_sequence_output(
                                                               tab_out_stream);
      }
    }

    /* Do not run the pipeline if setting up the tabular output failed;
       otherwise the pull result would overwrite the earlier error code. */
    if (!had_err) {
      last_stream = gff3_out_stream = gt_gff3_out_stream_new(last_stream,
                                                         arguments->outfp);
      /* pull the features through the stream and free them afterwards */
      had_err = gt_node_stream_pull(last_stream, err);
    }
  }

  gt_pdom_model_set_delete(ms);
  gt_node_stream_delete(gff3_out_stream);
  gt_node_stream_delete(ppt_stream);
  gt_node_stream_delete(pbs_stream);
  gt_node_stream_delete(sa_stream);
  gt_node_stream_delete(pdom_stream);
  gt_node_stream_delete(tab_out_stream);
  gt_node_stream_delete(gff3_in_stream);
  gt_bioseq_delete(arguments->trna_lib_bs);
  gt_region_mapping_delete(rmap);

  return had_err;
}
/*
 * Parse the command line of the pairwise comparison tool into <pw>.
 *
 * Exactly one of -ss (two strings), -ff (two files, or the keyword "fasta"
 * followed by two files), -a (character list and length) or -t (text) must be
 * given; -e and -p imply -ss.
 *
 * parsed_args  receives the number of arguments consumed by the parser
 * pw           option record to fill; its string/array members are allocated
 *              here
 * argc, argv   command line
 * err          receives the error message on failure
 *
 * Returns the option parser result; GT_OPTION_PARSER_ERROR is also returned
 * for argument-count violations and superfluous parameters.
 */
static GtOPrval parse_options(int *parsed_args, Cmppairwiseopt *pw, int argc,
                              const char **argv, GtError *err)
{
  GtOptionParser *op;
  GtOption *optionstrings,
           *optionfiles,
           *optioncharlistlen,
           *optiontext,
           *optionshowedist,
           *optionprint;
  GtStrArray *charlistlen;
  GtOPrval oprval;
  gt_error_check(err);

  charlistlen = gt_str_array_new();
  pw->strings = gt_str_array_new();
  pw->files = gt_str_array_new();
  pw->text = gt_str_new();
  pw->charlistlen = NULL;
  pw->fastasequences0 = NULL;
  pw->fastasequences1 = NULL;
  pw->showedist = false;
  pw->print = false;
  pw->fasta = false;

  op = gt_option_parser_new("options", "Apply function to pairs of strings.");
  gt_option_parser_set_mail_address(op, "<*****@*****.**>");

  optionstrings = gt_option_new_string_array("ss", "use two strings",
                                             pw->strings);
  gt_option_parser_add_option(op, optionstrings);

  optionfiles = gt_option_new_filename_array("ff", "use two files",
                                             pw->files);
  gt_option_parser_add_option(op, optionfiles);

  optioncharlistlen = gt_option_new_string_array("a",
                                             "use character list and length",
                                             charlistlen);
  gt_option_parser_add_option(op, optioncharlistlen);

  optiontext = gt_option_new_string("t", "use text", pw->text, NULL);
  gt_option_parser_add_option(op, optiontext);

  optionshowedist = gt_option_new_bool("e", "output unit edit distance",
                                       &pw->showedist, false);
  gt_option_parser_add_option(op, optionshowedist);

  optionprint = gt_option_new_bool("p", "print edist alignment",
                                   &pw->print, false);
  gt_option_parser_add_option(op, optionprint);

  /* the four input modes are mutually exclusive */
  gt_option_exclude(optionstrings, optionfiles);
  gt_option_exclude(optionstrings, optioncharlistlen);
  gt_option_exclude(optionstrings, optiontext);
  gt_option_exclude(optionfiles, optioncharlistlen);
  gt_option_exclude(optionfiles, optiontext);
  gt_option_exclude(optioncharlistlen, optiontext);
  gt_option_imply(optionshowedist, optionstrings);
  gt_option_imply(optionprint, optionstrings);

  oprval = gt_option_parser_parse(op, parsed_args, argc, argv, gt_versionfunc,
                                  err);
  if (oprval == GT_OPTION_PARSER_OK) {
    if (gt_option_is_set(optionstrings)) {
      if (gt_str_array_size(pw->strings) != 2UL) {
        gt_error_set(err, "option -ss requires two string arguments");
        oprval = GT_OPTION_PARSER_ERROR;
      }
    } else if (gt_option_is_set(optionfiles)) {
      if (gt_str_array_size(pw->files) != 2UL) {
        /* three arguments are accepted only in the form "fasta f1 f2" */
        if (gt_str_array_size(pw->files) == 3UL &&
            !strcmp(gt_str_array_get(pw->files,0),"fasta")) {
          pw->fasta = true;
        }
        if (!pw->fasta) {
          gt_error_set(err, "option -ff requires two filename arguments or "
                            "keyword fasta and two filename arguments in "
                            "FASTA format");
          oprval = GT_OPTION_PARSER_ERROR;
        }
      }
    } else if (gt_option_is_set(optioncharlistlen)) {
      /* initialised so that no indeterminate value is ever read when
         sscanf() fails to convert */
      GtWord readint = 0;
      if (gt_str_array_size(charlistlen) != 2UL) {
        gt_error_set(err, "option -a requires charlist and length argument");
        oprval = GT_OPTION_PARSER_ERROR;
      } else {
        pw->charlistlen = gt_malloc(sizeof *pw->charlistlen);
        pw->charlistlen->charlist
          = gt_str_ref(gt_str_array_get_str(charlistlen, 0));
        pw->charlistlen->len = 0;
        if (sscanf(gt_str_array_get(charlistlen,1UL), GT_WD, &readint) != 1 ||
            readint < 1L) {
          gt_error_set(err,
                       "option -a requires charlist and length argument");
          oprval = GT_OPTION_PARSER_ERROR;
        } else {
          /* store the length only after it was parsed and validated */
          pw->charlistlen->len = (GtUword) readint;
        }
      }
    } else if (!gt_option_is_set(optiontext)) {
      gt_error_set(err, "use exactly one of the options -ss, -ff, -a, -t");
      oprval = GT_OPTION_PARSER_ERROR;
    }
  }
  gt_option_parser_delete(op);
  if (oprval == GT_OPTION_PARSER_OK && *parsed_args != argc) {
    gt_error_set(err, "superfluous program parameters");
    oprval = GT_OPTION_PARSER_ERROR;
  }
  gt_str_array_delete(charlistlen);
  return oprval;
}
/*
 * Read the key/value lines of an FM-index ASCII project file from <fpin> and
 * store the parsed values in <fmindex>, <specialcharinfo> and
 * <storeindexpos>.
 *
 * All keys registered below must occur in the file; "storeindexpos" must be
 * 0 or 1. <indexname> is passed on to the scanner together with
 * FMASCIIFILESUFFIX (presumably for error messages -- not visible here).
 *
 * Returns 0 on success and -1 on failure (with <err> set by the scanner or
 * by this function).
 */
static int scanfmafileviafileptr(Fmindex *fmindex,
                                 GtSpecialcharinfo *specialcharinfo,
                                 bool *storeindexpos,
                                 const char *indexname,
                                 FILE *fpin,
                                 GtLogger *logger,
                                 GtError *err)
{
  /* NOTE(review): the GT_SCANNEDPRJKEY_ADD macro appears to refer to the
     local variable scannedprjkeytable by name -- do not rename it. */
  GtScannedprjkeytable *scannedprjkeytable;
  GtStr *currentline;
  unsigned int intstoreindexpos,
               linenum = 0;
  bool haserr = false;

  gt_error_check(err);

  /* register every key that has to be present in the project file */
  scannedprjkeytable = gt_scannedprjkeytable_new();
  GT_SCANNEDPRJKEY_ADD("bwtlength",&fmindex->bwtlength,NULL);
  GT_SCANNEDPRJKEY_ADD("longest",&fmindex->longestsuffixpos,NULL);
  GT_SCANNEDPRJKEY_ADD("storeindexpos",&intstoreindexpos,NULL);
  GT_SCANNEDPRJKEY_ADD("log2blocksize",&fmindex->log2bsize,NULL);
  GT_SCANNEDPRJKEY_ADD("log2markdist",&fmindex->log2markdist,NULL);
  GT_SCANNEDPRJKEY_ADD("specialcharacters",
                       &specialcharinfo->specialcharacters,NULL);
  GT_SCANNEDPRJKEY_ADD("specialranges",&specialcharinfo->specialranges,NULL);
  GT_SCANNEDPRJKEY_ADD("realspecialranges",&specialcharinfo->realspecialranges,
                       NULL);
  GT_SCANNEDPRJKEY_ADD("lengthofspecialprefix",
                       &specialcharinfo->lengthofspecialprefix,NULL);
  GT_SCANNEDPRJKEY_ADD("lengthofspecialsuffix",
                       &specialcharinfo->lengthofspecialsuffix,NULL);
  GT_SCANNEDPRJKEY_ADD("wildcards",&specialcharinfo->wildcards,NULL);
  GT_SCANNEDPRJKEY_ADD("wildcardranges",&specialcharinfo->wildcardranges,NULL);
  GT_SCANNEDPRJKEY_ADD("realwildcardranges",
                       &specialcharinfo->realwildcardranges,NULL);
  GT_SCANNEDPRJKEY_ADD("lengthofwildcardprefix",
                       &specialcharinfo->lengthofwildcardprefix,NULL);
  GT_SCANNEDPRJKEY_ADD("lengthofwildcardsuffix",
                       &specialcharinfo->lengthofwildcardsuffix,NULL);
  GT_SCANNEDPRJKEY_ADD("suffixlength",&fmindex->suffixlength,NULL);

  /* feed the file line by line into the scanner, stop at the first error */
  currentline = gt_str_new();
  while (gt_str_read_next_line(currentline, fpin) != EOF)
  {
    if (gt_scannedprjkey_analyze(indexname,
                                 FMASCIIFILESUFFIX,
                                 linenum,
                                 gt_str_get(currentline),
                                 gt_str_length(currentline),
                                 scannedprjkeytable,
                                 err) != 0)
    {
      haserr = true;
      break;
    }
    gt_str_reset(currentline);
    linenum++;
  }
  gt_str_delete(currentline);

  if (!haserr &&
      gt_scannedprjkey_allkeysdefined(indexname,FMASCIIFILESUFFIX,
                                      scannedprjkeytable,
                                      logger,err) != 0)
  {
    haserr = true;
  }

  /* translate the numeric flag into the boolean output parameter */
  if (!haserr)
  {
    switch (intstoreindexpos)
    {
      case 1U:
        *storeindexpos = true;
        break;
      case 0:
        *storeindexpos = false;
        break;
      default:
        gt_error_set(err,"illegal value in line matching \"storeindexpos=\"");
        haserr = true;
        break;
    }
  }

  gt_scannedprjkeytable_delete(scannedprjkeytable);
  return haserr ? -1 : 0;
}
/*
 * Open the file <path> with the given <mode>. The file mode is chosen by
 * gt_file_mode_determine() from <path>; the result of gt_file_open() is
 * returned unchanged. <mode> must not be NULL.
 */
GtFile* gt_file_new(const char *path, const char *mode, GtError *err)
{
  GtFile *file;

  gt_error_check(err);
  gt_assert(mode);

  file = gt_file_open(gt_file_mode_determine(path), path, mode, err);
  return file;
}
/*
 * Unit test for the encseq-based codon iterator.
 *
 * Encodes a fixed DNA test sequence and runs
 * gt_codon_iterator_encseq_single_test() for the forward, complement,
 * reverse-complement and reverse read modes against precomputed expected
 * strings. Afterwards it checks that iterators created with the values
 * 0, 1 and 2 as third constructor argument (the "lengths < 3" case) never
 * deliver a codon.
 *
 * Returns 0 if all checks pass, a non-zero value otherwise.
 */
int gt_codon_iterator_encseq_unit_test(GtError *err)
{
  const char *testseq    = "gctgatcgactgaacatagctagcacggccgcgcgatcgtacgatg",
             *testseq_rc = "catcgtacgatcgcgcggccgtgctagctatgttcagtcgatcagc",
             *testseq_rv = "gtagcatgctagcgcgccggcacgatcgatacaagtcagctagtcg",
             *testseq_cm = "cgactagctgacttgtatcgatcgtgccggcgcgctagcatgctac";
  GtAlphabet *alpha;
  GtEncseqBuilder *eb;
  GtEncseq *encseq;
  GtCodonIterator *ci;
  char base1, base2, base3;
  unsigned int codon_frame;
  int had_err = 0, i, short_len;
  gt_error_check(err);

  /* build the encoded test sequence */
  alpha = gt_alphabet_new_dna();
  gt_ensure(had_err, alpha != NULL);
  eb = gt_encseq_builder_new(alpha);
  gt_ensure(had_err, eb != NULL);
  gt_encseq_builder_add_cstr(eb, testseq, strlen(testseq), "foo");
  encseq = gt_encseq_builder_build(eb, NULL);
  gt_ensure(had_err, encseq != NULL);

  /* forward tests */
  if (!had_err)
    had_err = gt_codon_iterator_encseq_single_test(encseq, testseq, testseq,
                                                   GT_READMODE_FORWARD, err);

  /* complement tests */
  if (!had_err)
    had_err = gt_codon_iterator_encseq_single_test(encseq, testseq,
                                                   testseq_cm,
                                                   GT_READMODE_COMPL, err);

  /* revcompl tests */
  if (!had_err)
    had_err = gt_codon_iterator_encseq_single_test(encseq, testseq,
                                                   testseq_rc,
                                                   GT_READMODE_REVCOMPL, err);

  /* reverse tests */
  if (!had_err)
    had_err = gt_codon_iterator_encseq_single_test(encseq, testseq,
                                                   testseq_rv,
                                                   GT_READMODE_REVERSE, err);

  /* lengths < 3: the iterator must not yield anything */
  short_len = 0;
  while (!had_err && short_len < 3) {
    ci = gt_codon_iterator_encseq_new_with_readmode(encseq, 10, short_len,
                                                    GT_READMODE_REVCOMPL,
                                                    NULL);
    i = 10;
    while (gt_codon_iterator_next(ci, &base1, &base2, &base3, &codon_frame,
                                  NULL) == 0) {
      /* reaching this point means a codon was delivered: flag a failure */
      gt_ensure(had_err, false);
    }
    gt_ensure(had_err, i == 10);
    gt_codon_iterator_delete(ci);
    short_len++;
  }

  gt_encseq_delete(encseq);
  gt_encseq_builder_delete(eb);
  gt_alphabet_delete(alpha);
  return had_err;
}
static int gt_extract_feature_sequence_generic(GtStr *sequence, GtGenomeNode *gn, const char *type, bool join, GtStr *seqid, GtStrArray *target_ids, unsigned int *out_phase_offset, GtRegionMapping *region_mapping, GtError *err) { GtFeatureNode *fn; GtRange range; unsigned int phase_offset = 0; char *outsequence; const char *target; int had_err = 0; gt_error_check(err); fn = gt_genome_node_cast(gt_feature_node_class(), gn); gt_assert(fn); if (seqid) gt_str_append_str(seqid, gt_genome_node_get_seqid(gn)); if (target_ids && (target = gt_feature_node_get_attribute(fn, GT_GFF_TARGET))) { had_err = gt_gff3_parser_parse_all_target_attributes(target, false, target_ids, NULL, NULL, "", 0, err); } if (!had_err) { if (join) { GtFeatureNodeIterator *fni; GtFeatureNode *child; bool reverse_strand = false, first_child = true, first_child_of_type_seen = false; GtPhase phase = GT_PHASE_UNDEFINED; /* in this case we have to traverse the children */ fni = gt_feature_node_iterator_new_direct(gt_feature_node_cast(gn)); while (!had_err && (child = gt_feature_node_iterator_next(fni))) { if (first_child) { if (target_ids && (target = gt_feature_node_get_attribute(child, GT_GFF_TARGET))) { gt_str_array_reset(target_ids); had_err = gt_gff3_parser_parse_all_target_attributes(target, false, target_ids, NULL, NULL, "", 0, err); } first_child = false; } if (!had_err) { if (extract_join_feature((GtGenomeNode*) child, type, region_mapping, sequence, &reverse_strand, &first_child_of_type_seen, &phase, err)) { had_err = -1; } if (phase != GT_PHASE_UNDEFINED) { phase_offset = (int) phase; } } } gt_feature_node_iterator_delete(fni); gt_assert(phase_offset <= (unsigned int) GT_PHASE_UNDEFINED); if (!had_err && gt_str_length(sequence)) { if (reverse_strand) { had_err = gt_reverse_complement(gt_str_get(sequence), gt_str_length(sequence), err); } } } else if (gt_feature_node_get_type(fn) == type) { GtPhase phase = gt_feature_node_get_phase(fn); gt_assert(!had_err); if (phase != GT_PHASE_UNDEFINED) 
phase_offset = (unsigned int) phase; /* otherwise we only have to look at this feature */ range = gt_genome_node_get_range(gn); gt_assert(range.start); /* 1-based coordinates */ had_err = gt_region_mapping_get_sequence(region_mapping, &outsequence, gt_genome_node_get_seqid(gn), range.start, range.end, err); if (!had_err) { gt_str_append_cstr_nt(sequence, outsequence, gt_range_length(&range)); gt_free(outsequence); if (gt_feature_node_get_strand(fn) == GT_STRAND_REVERSE) { had_err = gt_reverse_complement(gt_str_get(sequence), gt_str_length(sequence), err); } } } } if (out_phase_offset && phase_offset != GT_PHASE_UNDEFINED) { *out_phase_offset = phase_offset; } return had_err; }
/*
 * Hashmap callback: build one gene feature from all transcripts of a gene.
 *
 * key    the gene ID (used to look up an optional gene name)
 * value  a GtHashmap of the gene's transcripts, processed with
 *        construct_mRNAs()
 * data   the ConstructionInfo; constructed genes are appended to its
 *        genome_nodes queue
 * err    receives the error message on failure
 *
 * The gene's range is the join of all mRNA ranges, its strand the join of
 * all mRNA strands; all mRNAs become children of the gene node.
 *
 * Returns 0 on success, otherwise the error code of the mRNA construction.
 */
static int construct_genes(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  GtHashmap *transcript_id_hash = (GtHashmap*) value;
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtQueue *genome_nodes;
  const char *gname;
  GtArray *mRNAs;
  GtGenomeNode *gene_node, *gn;
  GtStrand gene_strand;
  GtRange gene_range;
  GtStr *gene_seqid;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  /* validate the arguments before touching cinfo */
  gt_assert(key && value && data);

  genome_nodes = cinfo->genome_nodes;
  mRNAs = gt_array_new(sizeof (GtGenomeNode*));
  cinfo->mRNAs = mRNAs;
  had_err = gt_hashmap_foreach(transcript_id_hash, construct_mRNAs, cinfo,
                               err);
  if (!had_err) {
    gt_assert(gt_array_size(mRNAs)); /* at least one mRNA constructed */

    /* determine the range and the strand of the gene */
    gn = *(GtGenomeNode**) gt_array_get(mRNAs, 0);
    gene_range = gt_genome_node_get_range(gn);
    gene_strand = gt_feature_node_get_strand((GtFeatureNode*) gn);
    gene_seqid = gt_genome_node_get_seqid(gn);
    for (i = 1; i < gt_array_size(mRNAs); i++) {
      GtRange range;
      gn = *(GtGenomeNode**) gt_array_get(mRNAs, i);
      range = gt_genome_node_get_range(gn);
      gene_range = gt_range_join(&gene_range, &range);
      gene_strand = gt_strand_join(gene_strand,
                          gt_feature_node_get_strand((GtFeatureNode*) gn));
      gt_assert(gt_str_cmp(gene_seqid, gt_genome_node_get_seqid(gn)) == 0);
    }
    gene_node = gt_feature_node_new(gene_seqid, gt_ft_gene, gene_range.start,
                                    gene_range.end, gene_strand);
    if ((gname = gt_hashmap_get(cinfo->gene_id_to_name_mapping,
                                (const char*) key))) {
      gt_feature_node_add_attribute((GtFeatureNode*) gene_node, GT_GFF_NAME,
                                    gname);
    }

    /* register children */
    for (i = 0; i < gt_array_size(mRNAs); i++) {
      gn = *(GtGenomeNode**) gt_array_get(mRNAs, i);
      gt_feature_node_add_child((GtFeatureNode*) gene_node,
                                (GtFeatureNode*) gn);
    }

    /* store the gene */
    gt_queue_add(genome_nodes, gene_node);
  }

  /* Free the scratch array on every path (it used to leak on error) and do
     not leave a dangling pointer behind in cinfo.
     NOTE(review): mRNA nodes already collected when construct_mRNAs fails
     are not released here; their ownership is not visible in this function
     -- confirm. */
  gt_array_delete(mRNAs);
  cinfo->mRNAs = NULL;

  return had_err;
}
/*
 * Tool runner which renders the features of a GFF3 file as a multi-page
 * PDF or PostScript document using cairo.
 *
 * argv[parsed_args]    output file name
 * argv[parsed_args+1]  GFF3 input file
 * tool_arguments       a SketchPageArguments object
 * err                  receives the error message on failure
 *
 * The display range is cut into slices of arguments->width; each slice is
 * laid out and sketched below the previous one, and a new page is started
 * (after drawing the header) when the next slice does not fit any more.
 *
 * Returns 0 on success and a non-zero value on failure. All resources
 * acquired here are released on every path.
 */
static int gt_sketch_page_runner(GT_UNUSED int argc, const char **argv,
                                 int parsed_args, void *tool_arguments,
                                 GtError *err)
{
  SketchPageArguments *arguments = tool_arguments;
  int had_err = 0;
  GtFeatureIndex *features = NULL;
  GtRange qry_range, sequence_region_range;
  GtStyle *sty = NULL;
  GtStr *prog, *gt_style_file = NULL;
  GtDiagram *d = NULL;
  GtLayout *l = NULL;
  GtBioseq *bioseq = NULL;
  GtCanvas *canvas = NULL;
  char *seqid = NULL,        /* sequence region to draw, may be borrowed */
       *owned_seqid = NULL;  /* set only if seqid has to be freed here */
  const char *outfile = NULL;
  GtUword start, height, num_pages = 0;
  double offsetpos, usable_height;
  cairo_surface_t *surf = NULL;
  cairo_t *cr = NULL;
  bool has_seqid = false;
  GtTextWidthCalculator *twc = NULL;
  gt_error_check(err);

  features = gt_feature_index_memory_new();

  if (cairo_version() < CAIRO_VERSION_ENCODE(1, 8, 6))
    gt_warning("Your cairo library (version %s) is older than version 1.8.6! "
               "These versions contain a bug which may result in "
               "corrupted PDF output!", cairo_version_string());

  /* get style */
  sty = gt_style_new(err);
  if (!sty)
    had_err = -1;
  if (!had_err) {
    if (gt_str_length(arguments->stylefile) == 0) {
      /* no style file given: use the default one from the gtdata directory */
      prog = gt_str_new();
      gt_str_append_cstr_nt(prog, argv[0],
                            gt_cstr_length_up_to_char(argv[0], ' '));
      gt_style_file = gt_get_gtdata_path(gt_str_get(prog), err);
      gt_str_delete(prog);
      if (!gt_style_file)
        had_err = -1;
      else
        gt_str_append_cstr(gt_style_file, "/sketch/default.style");
    }
    else {
      gt_style_file = gt_str_ref(arguments->stylefile);
    }
  }
  if (!had_err)
    had_err = gt_style_load_file(sty, gt_str_get(gt_style_file), err);

  outfile = argv[parsed_args];

  /* get features */
  if (!had_err) {
    had_err = gt_feature_index_add_gff3file(features, argv[parsed_args+1],
                                            err);
  }

  /* determine the sequence region to draw */
  if (!had_err) {
    if (gt_str_length(arguments->seqid) == 0) {
      seqid = owned_seqid = gt_feature_index_get_first_seqid(features, err);
      if (seqid == NULL) {
        gt_error_set(err, "GFF input file must contain a sequence region!");
        had_err = -1;
      }
    }
    else {
      /* The lookup has to happen after the GFF3 file was added; querying the
         still empty index would reject every requested sequence region. */
      had_err = gt_feature_index_has_seqid(features, &has_seqid,
                                           gt_str_get(arguments->seqid), err);
      if (!had_err && !has_seqid) {
        gt_error_set(err,
                     "sequence region '%s' does not exist in GFF input file",
                     gt_str_get(arguments->seqid));
        had_err = -1;
      }
      else if (!had_err) {
        /* borrowed from arguments->seqid, must not be freed here */
        seqid = gt_str_get(arguments->seqid);
      }
    }
  }

  /* set text */
  if (gt_str_length(arguments->text) == 0) {
    gt_str_delete(arguments->text);
    arguments->text = gt_str_new_cstr(argv[parsed_args+1]);
  }

  if (!had_err) {
    /* set display range */
    had_err = gt_feature_index_get_range_for_seqid(features,
                                                   &sequence_region_range,
                                                   seqid, err);
  }

  /* optional sequence file for the GC content track */
  if (!had_err && gt_str_length(arguments->seqfile) > 0) {
    bioseq = gt_bioseq_new(gt_str_get(arguments->seqfile), err);
    if (!bioseq)
      had_err = -1;
  }

  if (!had_err) {
    qry_range.start = (arguments->range.start == GT_UNDEF_UWORD ?
                         sequence_region_range.start :
                         arguments->range.start);
    qry_range.end   = (arguments->range.end == GT_UNDEF_UWORD ?
                         sequence_region_range.end :
                         arguments->range.end);

    /* set output format */
    if (strcmp(gt_str_get(arguments->format), "pdf") == 0) {
      surf = cairo_pdf_surface_create(outfile,
                                      mm_to_pt(arguments->pwidth),
                                      mm_to_pt(arguments->pheight));
    }
    else if (strcmp(gt_str_get(arguments->format), "ps") == 0) {
      surf = cairo_ps_surface_create(outfile,
                                     mm_to_pt(arguments->pwidth),
                                     mm_to_pt(arguments->pheight));
    }
    if (!surf) {
      gt_error_set(err, "unsupported output format '%s'",
                   gt_str_get(arguments->format));
      had_err = -1;
    }
  }

  if (!had_err) {
    gt_log_log("created page with %.2f:%.2f dimensions\n",
               mm_to_pt(arguments->pwidth),
               mm_to_pt(arguments->pheight));

    offsetpos = TEXT_SPACER + arguments->theight + TEXT_SPACER;
    usable_height = mm_to_pt(arguments->pheight)
                      - arguments->theight
                      - arguments->theight
                      - 4*TEXT_SPACER;

    cr = cairo_create(surf);
    cairo_set_font_size(cr, 8);
    twc = gt_text_width_calculator_cairo_new(cr, sty, err);

    for (start = qry_range.start; start <= qry_range.end;
         start += arguments->width) {
      GtRange single_range;
      GtCustomTrack *ct = NULL;
      const char *seq;
      single_range.start = start;
      single_range.end = start + arguments->width;

      if (had_err)
        break;

      d = gt_diagram_new(features, seqid, &single_range, sty, err);
      if (!d) {
        had_err = -1;
        break;
      }
      if (bioseq) {
        seq = gt_bioseq_get_sequence(bioseq, 0);
        ct = gt_custom_track_gc_content_new(seq,
                                      gt_bioseq_get_sequence_length(bioseq, 0),
                                      800, 70, 0.4, true);
        gt_diagram_add_custom_track(d, ct);
      }

      l = gt_layout_new_with_twc(d, mm_to_pt(arguments->width), sty, twc,
                                 err);
      if (!l)
        had_err = -1;
      else
        had_err = gt_layout_get_height(l, &height, err);
      if (!had_err) {
        /* start a new page if this slice does not fit on the current one */
        if (gt_double_smaller_double(usable_height - 10 - 2*TEXT_SPACER
                                       - arguments->theight,
                                     offsetpos + height)) {
          draw_header(cr, gt_str_get(arguments->text), argv[parsed_args+1],
                      seqid, num_pages, mm_to_pt(arguments->pwidth),
                      mm_to_pt(arguments->pheight),
                      arguments->theight);
          cairo_show_page(cr);
          offsetpos = TEXT_SPACER + arguments->theight + TEXT_SPACER;
          num_pages++;
        }
        canvas = gt_canvas_cairo_context_new(sty, cr, offsetpos,
                                             mm_to_pt(arguments->pwidth),
                                             height, NULL, err);
        if (!canvas)
          had_err = -1;
        offsetpos += height;
        if (!had_err)
          had_err = gt_layout_sketch(l, canvas, err);
      }
      /* reset the pointers so that a later iteration which fails early
         cannot delete the same object twice */
      gt_canvas_delete(canvas);
      canvas = NULL;
      gt_layout_delete(l);
      l = NULL;
      gt_diagram_delete(d);
      d = NULL;
      if (ct)
        gt_custom_track_delete(ct);
    }

    /* finish the last page */
    draw_header(cr, gt_str_get(arguments->text), argv[parsed_args+1], seqid,
                num_pages, mm_to_pt(arguments->pwidth),
                mm_to_pt(arguments->pheight),
                arguments->theight);
    cairo_show_page(cr);
    num_pages++;
    gt_log_log("finished, should be "GT_WU" pages\n", num_pages);
  }

  /* cleanup, executed on success and on every error path */
  if (twc)
    gt_text_width_calculator_delete(twc);
  if (cr)
    cairo_destroy(cr);
  if (surf) {
    cairo_surface_flush(surf);
    cairo_surface_finish(surf);
    cairo_surface_destroy(surf);
    cairo_debug_reset_static_data();
  }
  if (bioseq)
    gt_bioseq_delete(bioseq);
  if (sty)
    gt_style_delete(sty);
  gt_free(owned_seqid);
  gt_str_delete(gt_style_file);
  gt_feature_index_delete(features);
  return had_err;
}
/*
 * Tool runner for searching a condensed (non-redundant) database with BLAST.
 *
 * If blastn or blastp is requested, the search runs in two stages:
 *   1. coarse: a BLAST database is created from "<dbpath>.fas" and searched
 *      with the queries; the unique-sequence indices of all hits are
 *      collected,
 *   2. fine: the hit sequences are extracted from the compressed database
 *      into "coarse_<db basename>.fas", a BLAST database is created from
 *      that file and searched again; the hits are written in tabular form
 *      to arguments->outfp.
 *
 * tool_arguments  a GtCondenserSearchArguments object
 * err             receives the error message on failure
 *
 * Returns 0 on success and a non-zero value on failure.
 */
static int gt_condenser_search_runner(GT_UNUSED int argc,
                                      GT_UNUSED const char **argv,
                                      GT_UNUSED int parsed_args,
                                      void *tool_arguments,
                                      GtError *err)
{
  GtCondenserSearchArguments *arguments = tool_arguments;
  int i, had_err = 0;
  char *querypath;
  GtStr* coarse_fname;
  char *db_basename = NULL;
  char *suffix_ptr = NULL;
  GtTimer *timer = NULL;
  GtLogger *logger = NULL;

  gt_error_check(err);
  /* validate before the first dereference */
  gt_assert(arguments);

  querypath = gt_str_get(arguments->querypath);
  coarse_fname = gt_str_new_cstr("coarse_");

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);

  db_basename = gt_basename(gt_str_get(arguments->dbpath));
  /* if first char is '.' this might be a hidden file */
  if (strlen(db_basename) > (size_t) 1 &&
      (suffix_ptr = strrchr(db_basename + 1, '.')) != NULL) {
    /* remove suffix */
    *suffix_ptr = '\0';
  }
  gt_str_append_cstr(coarse_fname, db_basename);
  gt_str_append_cstr(coarse_fname, ".fas");
  gt_free(db_basename);
  db_basename = NULL;
  suffix_ptr = NULL;

  if (arguments->blastn || arguments->blastp) {
    GtMatch *match;
    GtMatchIterator *mp = NULL;
    GtNREncseq *nrencseq = NULL;
    GtStr *fastaname = gt_str_clone(arguments->dbpath);
    HitPosition *hits;
    double eval, raw_eval = 0.0;
    GtUword coarse_db_len = 0;
    GtMatchIteratorStatus status;
    int curr_hits = 0,
        max_hits = 100;

    hits = gt_malloc(sizeof (*hits) * (size_t) max_hits);

    gt_str_append_cstr(fastaname, ".fas");

    for (i = 0; i < max_hits; i++) {
      hits[i].range = gt_malloc(sizeof (*hits[i].range) * (size_t) 1);
    }

    if (gt_showtime_enabled()) {
      timer = gt_timer_new_with_progress_description("initialization");
      gt_timer_start(timer);
    }

    /*extract sequences from compressed database*/
    if (!had_err) {
      nrencseq = gt_n_r_encseq_new_from_file(gt_str_get(arguments->dbpath),
                                             logger, err);
      if (nrencseq == NULL)
        had_err = -1;
    }

    if (!had_err) {
      if (arguments->ceval == GT_UNDEF_DOUBLE ||
          arguments->feval == GT_UNDEF_DOUBLE) {
        /* from NCBI BLAST tutorial:
           E = Kmne^{-lambdaS}
           calculates E-value for score S with natural scale parameters K for
           search space size and lambda for the scoring system
           E = mn2^-S'
           m being the subject (total) length, n the length of ONE query
           calculates E-value for bit-score S'
         */
        GtFastaReader *reader;
        GtCondenserSearchAvg avg = {0,0};
        reader = gt_fasta_reader_rec_new(arguments->querypath);
        had_err = gt_fasta_reader_run(reader, NULL, NULL,
                                      gt_condenser_search_cum_moving_avg,
                                      &avg,
                                      err);
        if (!had_err) {
          GtUword S = arguments->bitscore;
          gt_log_log(GT_WU " queries, avg query size: " GT_WU,
                     avg.count, avg.avg);
          raw_eval = 1/pow(2.0, (double) S) * avg.avg;
          gt_logger_log(logger, "Raw E-value set to %.4e", raw_eval);
          gt_assert(avg.avg != 0);
        }
        gt_fasta_reader_delete(reader);
      }
    }

    /*create BLAST database from compressed database fasta file*/
    if (!had_err) {
      if (timer != NULL)
        gt_timer_show_progress(timer, "create coarse BLAST db", stderr);
      if (arguments->blastn)
        had_err = gt_condenser_search_create_nucl_blastdb(
                                                  gt_str_get(fastaname), err);
      else
        had_err = gt_condenser_search_create_prot_blastdb(
                                                  gt_str_get(fastaname), err);
    }

    /* coarse BLAST search: collect indices and ranges of all hits */
    if (!had_err) {
      GtBlastProcessCall *call;

      if (timer != NULL)
        gt_timer_show_progress(timer, "coarse BLAST run", stderr);

      if (arguments->blastp)
        call = gt_blast_process_call_new_prot();
      else
        call = gt_blast_process_call_new_nucl();
      gt_blast_process_call_set_db(call, gt_str_get(fastaname));
      gt_blast_process_call_set_query(call, querypath);
      gt_blast_process_call_set_evalue(call, arguments->ceval);
      gt_blast_process_call_set_num_threads(call, arguments->blthreads);

      mp = gt_match_iterator_blast_process_new(call, err);
      if (!mp)
        had_err = -1;

      gt_blast_process_call_delete(call);

      while (!had_err &&
             (status = gt_match_iterator_next(mp, &match, err)) !=
             GT_MATCHER_STATUS_END)
      {
        if (status == GT_MATCHER_STATUS_OK) {
          GtUword hit_seq_id;
          char string[7];
          const char *dbseqid = gt_match_get_seqid2(match);
          if (sscanf(dbseqid,"%6s" GT_WU, string, &hit_seq_id) == 2) {
            gt_match_get_range_seq2(match, hits[curr_hits].range);
            hits[curr_hits].idx = hit_seq_id;
            gt_match_delete(match);
            curr_hits++;
            if (curr_hits == max_hits) {
              /* grow the hit table by another 100 entries */
              max_hits += 100;
              hits = gt_realloc(hits, sizeof (*hits) * (size_t) max_hits);
              for (i = max_hits - 100; i < max_hits; i++) {
                hits[i].range = gt_malloc(sizeof (*hits[i].range));
              }
            }
          } else {
            /* report first (dbseqid points into the match), then release
               the match, which used to leak on this path */
            gt_error_set(err, "could not parse unique db header %s", dbseqid);
            had_err = -1;
            gt_match_delete(match);
          }
        } else if (status == GT_MATCHER_STATUS_ERROR) {
          had_err = -1;
        }
      }
      gt_match_iterator_delete(mp);
    }

    /*extract sequences*/
    if (!had_err) {
      GtNREncseqDecompressor *decomp;
      GtFile *coarse_hits;
      if (timer != NULL)
        gt_timer_show_progress(timer, "extract coarse search hits", stderr);
      decomp = gt_n_r_encseq_decompressor_new(nrencseq);
      coarse_hits = gt_file_new(gt_str_get(coarse_fname),"w", err);
      if (coarse_hits == NULL) {
        had_err = -1;
      } else {
        /* TODO DW do NOT extract complete uniques! these could be complete
           chromosomes!! just extract something around it? maybe +- max query
           length*/
        for (i = 0; i < curr_hits; i++) {
          gt_n_r_encseq_decompressor_add_unique_idx_to_extract(decomp,
                                                               hits[i].idx);
        }
        had_err =
          gt_n_r_encseq_decompressor_start_unique_extraction(coarse_hits,
                                                             decomp,
                                                             &coarse_db_len,
                                                             err);
        /* the length is only meaningful if the extraction succeeded */
        gt_assert(had_err || coarse_db_len != 0);
        gt_file_delete(coarse_hits);
      }
      gt_n_r_encseq_decompressor_delete(decomp);
    }
    gt_n_r_encseq_delete(nrencseq);

    /* create BLAST database from decompressed database file */
    if (!had_err) {
      if (timer != NULL)
        gt_timer_show_progress(timer, "create fine BLAST db", stderr);
      if (arguments->blastn)
        had_err = gt_condenser_search_create_nucl_blastdb(
                                               gt_str_get(coarse_fname), err);
      else
        had_err = gt_condenser_search_create_prot_blastdb(
                                               gt_str_get(coarse_fname), err);
    }

    /* perform fine BLAST search */
    if (!had_err) {
      GtBlastProcessCall *call;

      if (timer != NULL)
        gt_timer_show_progress(timer, "fine BLAST run", stderr);

      if (arguments->feval == GT_UNDEF_DOUBLE) {
        eval = raw_eval * coarse_db_len;
      } else {
        eval = arguments->feval;
      }

      if (arguments->blastp)
        call = gt_blast_process_call_new_prot();
      else
        call = gt_blast_process_call_new_nucl();

      gt_blast_process_call_set_db(call, gt_str_get(coarse_fname));
      gt_blast_process_call_set_query(call, querypath);
      gt_blast_process_call_set_evalue(call, eval);
      gt_blast_process_call_set_num_threads(call, arguments->blthreads);

      gt_logger_log(logger, "Fine E-value set to: %.4e (len)" GT_WU, eval,
                    coarse_db_len);

      mp = gt_match_iterator_blast_process_new(call, err);
      if (!mp)
        had_err = -1;

      gt_blast_process_call_delete(call);

      if (!had_err) {
        GtUword numofhits = 0;
        while (!had_err &&
               (status = gt_match_iterator_next(mp, &match, err)) !=
               GT_MATCHER_STATUS_END)
        {
          if (status == GT_MATCHER_STATUS_OK) {
            GtMatchBlast *matchb = (GtMatchBlast*) match;
            GtRange range_seq1;
            GtRange range_seq2;
            numofhits++;
            gt_match_get_range_seq1(match, &range_seq1);
            gt_match_get_range_seq2(match, &range_seq2);
            gt_file_xprintf(
                    arguments->outfp,
                    "%s\t%s\t%.2f\t" GT_WU "\t" GT_WU "\t" GT_WU "\t" GT_WU "\t"
                    GT_WU "\t%g\t%.3f\n",
                    gt_match_get_seqid1(match),
                    gt_match_get_seqid2(match),
                    gt_match_blast_get_similarity(matchb),
                    gt_match_blast_get_align_length(matchb),
                    range_seq1.start,
                    range_seq1.end,
                    range_seq2.start,
                    range_seq2.end,
                    gt_match_blast_get_evalue(matchb),
                    (double) gt_match_blast_get_bitscore(matchb));
            gt_match_delete(match);
          } else if (status == GT_MATCHER_STATUS_ERROR) {
            had_err = -1;
          }
        }
        gt_log_log(GT_WU " hits found\n", numofhits);
      }
      gt_match_iterator_delete(mp);
    }

    if (!had_err && timer != NULL)
      gt_timer_show_progress_final(timer, stderr);
    gt_timer_delete(timer);

    /*cleanup*/
    for (i = 0; i < max_hits; i++) {
      gt_free(hits[i].range);
    }
    gt_free(hits);
    gt_str_delete(fastaname);
  }
  gt_str_delete(coarse_fname);
  gt_logger_delete(logger);
  return had_err;
}
/* Fills a path matrix by dynamic programming over <combinedscore_matrix>
   (one row per reading frame, one column per position of the query DNA) and
   passes it to mg_compute_gene_prediction(). The path matrix is released
   before returning. Returns the value of mg_compute_gene_prediction().

   NOTE(review): the path matrix is always allocated with 7 rows while the
   loops below run up to <rows>; presumably callers always pass rows <= 7 --
   confirm.
   NOTE(review): the running maximum starts at 1 for the very first cell and
   is reset to DBL_MIN (the smallest positive normalized double, not the most
   negative value) for every later cell -- confirm whether this is intended. */
int mg_computepath(CombinedScoreMatrixEntry **combinedscore_matrix,
                   HitInformation *hit_information,
                   unsigned long rows,
                   unsigned long contig_len,
                   ParseStruct *parsestruct_ptr, GtError * err)
{
  /* matrix holding, per cell, the best score and the row it came from */
  PathMatrixEntry **path_matrix;
  /* frames: the frame of the current row, the frame of the precursor under
     consideration and the list of possible precursor frames */
  short precursors[NUM_PRECURSORS],
        frame = 0,
        pre_frame = 0;
  /* row: matrix row; k: index into precursors[]; pre_row: matrix row of the
     precursor frame; best_row: row the current maximum was taken from */
  unsigned short row = 0,
                 k = 0,
                 pre_row = 0,
                 best_row = 0;
  /* position in the query DNA */
  unsigned long col = 0;
  /* q is the value added when a gene is left or entered on the forward or
     reverse strand */
  double q = ARGUMENTSSTRUCT(leavegene_value),
         candidate = 1,
         best = 1;
  int had_err = 0;

  /* allocate the path matrix */
  gt_array2dim_calloc(path_matrix, 7, contig_len);

  gt_error_check(err);

  /* the first column of the path matrix is copied from the first column of
     the combined-score matrix */
  for (row = 0; row < rows; row++) {
    path_matrix[row][0].score = combinedscore_matrix[row][0].matrix_score;
    path_matrix[row][0].path_frame = row;
  }

  /* compute the path column by column */
  for (col = 1; col < contig_len; col++) {
    for (row = 0; row < rows; row++) {
      /* convert the row counter into the corresponding reading frame */
      frame = get_current_frame(row);

      /* determine the possible precursor frames for the current frame and
         position */
      compute_precursors(frame, col, precursors);

      /* maximise over the defined precursors */
      for (k = 0; k < NUM_PRECURSORS; k++) {
        if (precursors[k] == UNDEFINED)
          break;

        pre_frame = precursors[k];
        /* convert the precursor frame into its matrix row */
        pre_row = get_matrix_row(pre_frame);

        /* score shared by all three cases below */
        candidate = path_matrix[pre_row][col-1].score +
                    combinedscore_matrix[row][col].matrix_score;

        if ((frame < 0 && pre_frame > 0) || (frame > 0 && pre_frame < 0)) {
          /* case 1: switch between reverse and forward strand */
          candidate += 2*q;
        }
        else if (frame != 0 && pre_frame != frame) {
          /* case 2: plain change of reading frame (+ to + or - to -) */
          candidate += q;
        }
        /* case 3: frame is kept, or change between coding and non-coding:
           nothing is added */

        /* remember the maximum and the row it was taken from */
        if (gt_double_compare(candidate, best) > 0) {
          best = candidate;
          best_row = pre_row;
        }
      }

      /* store maximum and precursor row, then reset the running values */
      path_matrix[row][col].score = best;
      path_matrix[row][col].path_frame = best_row;
      candidate = DBL_MIN;
      best = DBL_MIN;
      best_row = 0;
    }
  }

  /* run the gene prediction on the path matrix */
  had_err = mg_compute_gene_prediction(combinedscore_matrix,
                                       path_matrix,
                                       contig_len,
                                       hit_information,
                                       parsestruct_ptr, err);

  gt_array2dim_delete(path_matrix);

  return had_err;
}
/* Tool runner: creates a seed-extend match iterator for
   arguments->matchfilename and shows every match it delivers. Depending on
   the iterator state and the options a match is either re-extended greedily
   from its seed, shown from the encoded sequences, or shown as a plain
   alignment. Returns 0 on success and a non-zero value (with <err> set by
   the failing callee) otherwise. */
static int gt_show_seedext_runner(GT_UNUSED int argc,
                                  GT_UNUSED const char **argv,
                                  GT_UNUSED int parsed_args,
                                  void *tool_arguments,
                                  GtError *err)
{
  GtShowSeedextArguments *arguments = tool_arguments;
  GtSeedextendMatchIterator *semi;
  GtUword alignmentwidth;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments != NULL);

  /* alignments are shown with 70 columns; width 0 if none are shown */
  if (arguments->show_alignment)
  {
    alignmentwidth = 70;
  } else
  {
    alignmentwidth = 0;
  }

  /* Parse option string in first line of file specified by filename. */
  semi = gt_seedextend_match_iterator_new(arguments->matchfilename,err);
  if (semi == NULL)
  {
    had_err = -1;
  }

  /* Parse seed extensions. */
  if (!had_err)
  {
    const GtEncseq *aencseq = gt_seedextend_match_iterator_aencseq(semi),
                   *bencseq = gt_seedextend_match_iterator_bencseq(semi);
    GtAlignment *alignment = gt_alignment_new();
    Polishing_info *pol_info = NULL;
    GtSequencepairbuffer seqpairbuf = {NULL,NULL,0,0};
    /* the following two are used if seed_extend is set */
    GtGreedyextendmatchinfo *ggemi = NULL;
    GtProcessinfo_and_querymatchspaceptr info_and_space;
    const GtUchar *characters = gt_encseq_alphabetcharacters(aencseq);
    const GtUchar wildcardshow = gt_encseq_alphabetwildcardshow(aencseq);
    GtUchar *show_buffer = arguments->show_alignment
                             ? gt_alignment_buffer_new(alignmentwidth)
                             : NULL;
    GtLinspaceManagement *spacemanager = gt_linspaceManagement_new();
    GtScoreHandler *scorehandler = gt_scorehandler_new(0,1,0,1);
    GtQuerymatch *querymatchptr;

    /* polishing is used unless relaxed polishing was requested */
    if (!arguments->relax_polish)
    {
      double matchscore_bias = GT_DEFAULT_MATCHSCORE_BIAS;

      if (gt_seedextend_match_iterator_bias_parameters(semi))
      {
        matchscore_bias = gt_greedy_dna_sequence_bias_get(aencseq);
      }
      pol_info = polishing_info_new_with_bias(
                          gt_seedextend_match_iterator_errorpercentage(semi),
                          matchscore_bias,
                          gt_seedextend_match_iterator_history_size(semi));
    }
    if (arguments->seed_display)
    {
      gt_seedextend_match_iterator_seed_display_set(semi);
    }
    if (arguments->show_alignment || arguments->showeoplist)
    {
      gt_seedextend_match_iterator_querymatchoutoptions_set(
                          semi,
                          true,
                          arguments->showeoplist,
                          alignmentwidth,
                          !arguments->relax_polish,
                          arguments->seed_display);
    }
    if (arguments->seed_extend)
    {
      ggemi = gt_greedy_extend_matchinfo_new(
                          70,
                          GT_MAX_ALI_LEN_DIFF,
                          gt_seedextend_match_iterator_history_size(semi),
                          GT_MIN_PERC_MAT_HISTORY,
                          0, /* userdefinedleastlength */
                          GT_EXTEND_CHAR_ACCESS_ANY,
                          100,
                          pol_info);
    }
    if (pol_info != NULL)
    {
      gt_alignment_polished_ends(alignment,pol_info,false);
    }
    info_and_space.processinfo = ggemi;
    if (arguments->sortmatches)
    {
      (void) gt_seedextend_match_iterator_all_sorted(semi,true);
    }

    /* process the matches until the iterator is exhausted or an extension
       fails */
    while ((querymatchptr = gt_seedextend_match_iterator_next(semi)) != NULL)
    {
      if (!gt_seedextend_match_iterator_has_seedline(semi))
      {
        gt_show_seed_extend_plain(&seqpairbuf,
                                  spacemanager,
                                  scorehandler,
                                  alignment,
                                  show_buffer,
                                  alignmentwidth,
                                  arguments->showeoplist,
                                  characters,
                                  wildcardshow,
                                  aencseq,
                                  bencseq,
                                  querymatchptr);
        continue;
      }
      if (!arguments->seed_extend)
      {
        const GtUword query_totallength
          = gt_encseq_seqlength(bencseq,
                                gt_querymatch_queryseqnum(querymatchptr));

        gt_show_seed_extend_encseq(querymatchptr, aencseq, bencseq,
                                   query_totallength);
        continue;
      }
      if (aencseq != bencseq)
      {
        /* re-extension is only done when both sequences are identical */
        gt_assert(false);
        continue;
      }
      {
        const GtUword
          seedlen = gt_seedextend_match_iterator_seedlen(semi),
          seedpos1 = gt_seedextend_match_iterator_seedpos1(semi),
          seedpos2 = gt_seedextend_match_iterator_seedpos2(semi);

        info_and_space.querymatchspaceptr = querymatchptr;
        had_err = gt_greedy_extend_selfmatch_with_output(&info_and_space,
                                                         aencseq,
                                                         seedlen,
                                                         seedpos1,
                                                         seedpos2,
                                                         err);
        if (had_err)
        {
          break;
        }
      }
    }

    /* release everything created in this block */
    polishing_info_delete(pol_info);
    gt_greedy_extend_matchinfo_delete(ggemi);
    gt_free(show_buffer);
    gt_scorehandler_delete(scorehandler);
    gt_linspaceManagement_delete(spacemanager);
    gt_free(seqpairbuf.a_sequence);
    gt_free(seqpairbuf.b_sequence);
    gt_alignment_delete(alignment);
  }
  gt_seedextend_match_iterator_delete(semi);
  return had_err;
}
/* Tool runner: reads GFF3 features from the files given after the parsed
   options, passes them through a select (filter) stream -- optionally
   followed by a target-best select stream -- and writes the surviving
   features to arguments->outfp. If arguments->dropped_file is non-empty,
   dropped features are written to that file through a GFF3 visitor.
   Returns 0 on success and -1 on error (with <err> set). */
static int gt_select_runner(int argc, const char **argv, int parsed_args,
                            void *tool_arguments, GtError *err)
{
  SelectArguments *arguments = tool_arguments;
  GtNodeStream *gff3_in_stream, *select_stream,
               *targetbest_select_stream = NULL,
               *gff3_out_stream;
  int had_err = 0;
  GtFile *drop_file = NULL;
  GtNodeVisitor *gff3outvis = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  /* create a gff3 input stream */
  gff3_in_stream = gt_gff3_in_stream_new_unsorted(argc - parsed_args,
                                                  argv + parsed_args);
  if (arguments->verbose && arguments->outfp)
    gt_gff3_in_stream_show_progress_bar((GtGFF3InStream*) gff3_in_stream);

  /* create a filter stream */
  select_stream = gt_select_stream_new(gff3_in_stream, arguments->seqid,
                                       arguments->source,
                                       &arguments->contain_range,
                                       &arguments->overlap_range,
                                       arguments->strand,
                                       arguments->targetstrand,
                                       arguments->has_CDS,
                                       arguments->max_gene_length,
                                       arguments->max_gene_num,
                                       arguments->min_gene_score,
                                       arguments->max_gene_score,
                                       arguments->min_average_splice_site_prob,
                                       arguments->feature_num,
                                       arguments->filter_files,
                                       arguments->filter_logic,
                                       err);

  if (select_stream) {
    GtSelectStream *fs = (GtSelectStream*) select_stream;

    if (gt_str_length(arguments->dropped_file) > 0) {
      drop_file = gt_file_new(gt_str_get(arguments->dropped_file), "w", err);
      if (!drop_file) {
        /* the file for dropped features could not be opened; do not run the
           pipeline with an error already set and a visitor without file */
        had_err = -1;
      }
      else {
        gff3outvis = gt_gff3_visitor_new(drop_file);
        gt_select_stream_set_drophandler(fs, print_to_file_drophandler,
                                         (void*) gff3outvis);
      }
    }
    else
      gt_select_stream_set_drophandler(fs, default_drophandler, NULL);

    if (!had_err) {
      gt_select_stream_set_single_intron_factor(select_stream,
                                             arguments->single_intron_factor);

      if (arguments->targetbest)
        targetbest_select_stream =
          gt_targetbest_select_stream_new(select_stream);

      /* create a gff3 output stream */
      gff3_out_stream = gt_gff3_out_stream_new(arguments->targetbest
                                               ? targetbest_select_stream
                                               : select_stream,
                                               arguments->outfp);

      /* pull the features through the stream and free them afterwards */
      had_err = gt_node_stream_pull(gff3_out_stream, err);

      gt_node_stream_delete(gff3_out_stream);
    }

    /* free */
    gt_node_stream_delete(select_stream);
    gt_node_stream_delete(targetbest_select_stream);
  }
  else {
    had_err = -1;
  }

  gt_file_delete(drop_file);
  gt_node_visitor_delete(gff3outvis);
  gt_node_stream_delete(gff3_in_stream);

  return had_err;
}
/* Prepares <query> on the MySQL connection behind <rdb> and wraps it in a new
   GtRDBStmt. <num_params> must equal the number of parameter markers found
   in the query by the MySQL client library. Returns the new statement, or
   NULL with <err> set if the statement handle cannot be created, the query
   cannot be prepared or the parameter counts differ; in all error cases the
   MySQL statement handle is closed again. */
static GtRDBStmt* gt_rdb_mysql_prepare(GtRDB *rdb, const char *query,
                                       unsigned long num_params, GtError *err)
{
  GtRDBStmt *st = NULL;
  GtRDBStmtMySQL *stm = NULL;
  GtRDBMySQL *rdbm;
  int had_err = 0, retval = 0;
  /* we need these to keep track of result/parameter and string buffers */
  HashElemInfo str_buffer_hash = {
                 gt_ht_ptr_elem_hash, { free_str }, sizeof (GtStr*),
                 gt_ht_ptr_elem_cmp, NULL, NULL },
               buffer_hash = {
                 gt_ht_ptr_elem_hash, { free_buf }, sizeof (void*),
                 gt_ht_ptr_elem_cmp, NULL, NULL };
  MYSQL_STMT *tmp = NULL;

  gt_assert(rdb && query);
  gt_error_check(err);
  rdbm = gt_rdb_mysql_cast(rdb);

  tmp = mysql_stmt_init(&rdbm->conn);
  if (!tmp) {
    /* no handle was created, so there is nothing to close */
    gt_error_set(err, "could not allocate MySQL statement handle");
    return NULL;
  }

  if ((retval = mysql_stmt_prepare(tmp, query, strlen(query)))) {
    gt_error_set(err, GT_MYSQL_ERRMSG, retval, mysql_stmt_error(tmp));
    had_err = -1;
  }
  if (!had_err) {
    /* keep the full width of the count instead of truncating it to int */
    unsigned long param_count = mysql_stmt_param_count(tmp);
    if (param_count != num_params) {
      gt_error_set(err, "invalid parameter count: %lu expected, %lu given",
                   num_params, param_count);
      had_err = -1;
    }
  }
  if (had_err) {
    /* the error text has been copied into <err>; release the handle on
       every failure path */
    (void) mysql_stmt_close(tmp);
    return NULL;
  }

  st = gt_rdb_stmt_create(gt_rdb_stmt_mysql_class());
  stm = gt_rdb_stmt_mysql_cast(st);
  stm->num_params = num_params;
  stm->query = gt_str_new_cstr(query);
  stm->buffers = gt_hashtable_new(buffer_hash);
  stm->returned_strings = gt_hashtable_new(str_buffer_hash);
  stm->stmt = tmp;
  stm->update_maxlengths = true;
  stm->params = gt_calloc(num_params, sizeof (MYSQL_BIND));
  /* make mysql_stmt_store_result() update the max_length of result fields */
  mysql_stmt_attr_set(tmp, STMT_ATTR_UPDATE_MAX_LENGTH,
                      &stm->update_maxlengths);
  memset(stm->params, 0, num_params*sizeof (MYSQL_BIND));
  stm->conn = &rdbm->conn;

  return st;
}
/* Executes the prepared statement <st> on the first call (binding parameters,
   storing the result and binding one result buffer per field) and fetches
   one row on every call.
   Returns 0 if a row was fetched, 1 if the statement returned no result set
   or the last row has been read, and -1 on error (with <err> set). */
static int gt_rdb_stmt_mysql_exec(GtRDBStmt *st, GtError *err)
{
  GtRDBStmtMySQL *stm;
  int rval, had_err = 0, num_fields;
  MYSQL_RES *meta_res = NULL;

  gt_assert(st);
  gt_error_check(err);
  stm = gt_rdb_stmt_mysql_cast(st);

  if (!stm->executed) {
    if (stm->num_params > 0) {
      gt_assert(stm->stmt && stm->params);
      if ((rval = mysql_stmt_bind_param(stm->stmt, stm->params))) {
        gt_error_set(err, GT_MYSQL_ERRMSG, rval, mysql_stmt_error(stm->stmt));
        had_err = -1;
      }
    }
    if (!had_err && (rval = mysql_stmt_execute(stm->stmt))) {
      gt_error_set(err, GT_MYSQL_ERRMSG, rval, mysql_stmt_error(stm->stmt));
      had_err = -1;
    }
    if (!had_err) {
      stm->executed = true;
      /* buffer the complete result on the client */
      if (mysql_stmt_store_result(stm->stmt)) {
        gt_error_set(err, GT_MYSQL_ERRMSG, mysql_stmt_errno(stm->stmt),
                     mysql_stmt_error(stm->stmt));
        had_err = -1;
      }
    }
    if (!had_err) {
      int i;

      meta_res = mysql_stmt_result_metadata(stm->stmt);
      if (!meta_res) {
        /* no metadata: the statement did not return a result set, so there
           is nothing to fetch */
        return 1;
      }
      /* statement returned a result */
      num_fields = mysql_num_fields(meta_res);
      stm->results = gt_calloc(num_fields, sizeof (MYSQL_BIND));
      /* prepare result buffers for each field; every buffer is registered in
         stm->buffers, which keeps track of them */
      for (i = 0; i < num_fields; i++) {
        MYSQL_FIELD *field;
        field = mysql_fetch_field(meta_res);
        gt_assert(field);
        stm->results[i].buffer_type = field->type;
        switch (field->type) {
          case MYSQL_TYPE_DOUBLE:
            {
              double *dbl = gt_calloc(1, sizeof (double));
              gt_hashtable_add(stm->buffers, &dbl);
              stm->results[i].buffer_length = sizeof (double);
              stm->results[i].buffer = dbl;
            }
            break;
          case MYSQL_TYPE_LONG:
          case MYSQL_TYPE_INT24:
            {
              int *l = gt_calloc(1, sizeof (int));
              gt_hashtable_add(stm->buffers, &l);
              stm->results[i].is_unsigned = false;
              stm->results[i].buffer_length = sizeof (int);
              stm->results[i].buffer = l;
            }
            /* must not fall through: the cases below would replace the
               buffer by a smaller one */
            break;
          case MYSQL_TYPE_SHORT:
            {
              short int *l = gt_calloc(1, sizeof (short int));
              gt_hashtable_add(stm->buffers, &l);
              stm->results[i].is_unsigned = false;
              stm->results[i].buffer_length = sizeof (short int);
              stm->results[i].buffer = l;
            }
            break;
          case MYSQL_TYPE_TINY:
            {
              signed char *l = gt_calloc(1, sizeof (signed char));
              gt_hashtable_add(stm->buffers, &l);
              stm->results[i].is_unsigned = false;
              stm->results[i].buffer_length = sizeof (signed char);
              stm->results[i].buffer = l;
            }
            break;
          case MYSQL_TYPE_STRING:
          case MYSQL_TYPE_VAR_STRING:
          case MYSQL_TYPE_BLOB:
          case MYSQL_TYPE_TINY_BLOB:
          case MYSQL_TYPE_MEDIUM_BLOB:
          case MYSQL_TYPE_LONG_BLOB:
          case MYSQL_TYPE_BIT:
            {
              /* one extra byte beyond the reported maximal field length */
              char *str = gt_calloc(field->max_length+1, sizeof (char));
              unsigned long *length = gt_calloc(1, sizeof (unsigned long));
              gt_hashtable_add(stm->buffers, &str);
              gt_hashtable_add(stm->buffers, &length);
              stm->results[i].buffer = str;
              stm->results[i].buffer_length = field->max_length;
              stm->results[i].length = length;
            }
            break;
          default:
            /* unsupported data type; no buffer is set up for this field */
            break;
        }
      }
      if (mysql_stmt_bind_result(stm->stmt, stm->results)) {
        gt_error_set(err, GT_MYSQL_ERRMSG, mysql_stmt_errno(stm->stmt),
                     mysql_stmt_error(stm->stmt));
        had_err = -1;
      }
      mysql_free_result(meta_res);
    }
  }

  if (!had_err) {
    switch ((rval = mysql_stmt_fetch(stm->stmt))) {
      case 0:
      default:
        break;
      case MYSQL_NO_DATA:
        had_err = 1;  /* last row read */
        break;
      case 1:
        gt_error_set(err, GT_MYSQL_ERRMSG, mysql_stmt_errno(stm->stmt),
                     mysql_stmt_error(stm->stmt));
        had_err = -1;
        break;
    }
  }
  return had_err;
}
/* Tool runner: loads the encoded sequence named by argv[parsed_args] and
   outputs its sequences in the order selected by the options: inverted,
   shuffled, sorted or reverse sorted. A warning is issued if the encoded
   sequence has no description support. Returns 0 on success and -1 if the
   encoded sequence could not be loaded. */
static int gt_seqorder_runner(GT_UNUSED int argc, const char **argv,
                              int parsed_args, void *tool_arguments,
                              GtError *err)
{
  GtSeqorderArguments *arguments = tool_arguments;
  GtEncseqLoader *loader;
  GtEncseq *encseq;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments != NULL);

  /* load encseq */
  loader = gt_encseq_loader_new();
  encseq = gt_encseq_loader_load(loader, argv[parsed_args], err);
  if (encseq == NULL)
  {
    had_err = -1;
  }
  else if (!gt_encseq_has_description_support(encseq))
  {
    gt_warning("%s has no description support", argv[parsed_args]);
  }

  if (!had_err)
  {
    const GtUword nofseqs = gt_encseq_num_of_sequences(encseq);
    GtUword k;

    if (arguments->invert)
    {
      /* last sequence first */
      for (k = 0; k < nofseqs; k++)
      {
        gt_seqorder_output(nofseqs - 1 - k, encseq);
      }
    }
    else if (arguments->shuffle)
    {
      GtUword *order = gt_malloc(sizeof (GtUword) * nofseqs);

      gt_seqorder_get_shuffled_seqnums(nofseqs, order);
      for (k = 0; k < nofseqs; k++)
      {
        gt_seqorder_output(order[k], encseq);
      }
      gt_free(order);
    }
    else
    {
      GtSuffixsortspace *sssp;
      const GtUword laststart = gt_encseq_seqstartpos(encseq, nofseqs-1);

      gt_assert(arguments->sort || arguments->revsort);
      sssp = gt_suffixsortspace_new(nofseqs, laststart, false,NULL);
      gt_seqorder_sort(sssp, encseq);
      /* Use iterator over sequence separators:
         saves a lot of binary searches */
      for (k = 0; k < nofseqs; k++)
      {
        /* ascending rank for sort, descending rank for revsort */
        const GtUword rank = arguments->sort ? k : nofseqs - 1 - k;

        gt_seqorder_output(gt_encseq_seqnum(encseq,
                                 gt_suffixsortspace_getdirect(sssp, rank)),
                           encseq);
      }
      gt_suffixsortspace_delete(sssp, false);
    }
  }

  gt_encseq_loader_delete(loader);
  gt_encseq_delete(encseq);
  return had_err;
}
/* Runs the GenomeTools runtime <gtr> with the remaining command line
   <argc>/<argv>. Global options stored in <gtr> are handled first (logging,
   warnings, random seed, tool listing, manual page creation, 64-bit check,
   unit tests); each of list/manual/check/test returns immediately with the
   value of its handler. Otherwise argv[0] is run as a tool if the toolbox
   knows it, or as a Lua script if a file of that name exists. Finally the
   interactive Lua interpreter is started if requested and no error occurred.
   Returns EXIT_SUCCESS or EXIT_FAILURE in the non-early-return cases. */
int gtr_run(GtR *gtr, int argc, const char **argv, GtError *err)
{
  GtToolfunc toolfunc = NULL;
  GtTool *tool = NULL;
  char **nargv = NULL;
  void *mem, *map;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(gtr);

  if (gtr->debug)
    enable_logging(gt_str_get(gtr->debugfp), &gtr->logfp);
  if (gtr->quiet)
    gt_warning_disable();
  gtr->seed = gt_ya_rand_init(gtr->seed);
  gt_log_log("seed=%u", gtr->seed);

  if (gtr->list)
    return list_tools(gtr);
  if (gt_str_length(gtr->manoutdir) > 0)
    return create_manpages(gtr, gt_str_get(gtr->manoutdir), err);
  if (gtr->check64bit)
    return check64bit();
  if (gtr->test)
    return run_tests(gtr, err);

  if (gt_str_length(gtr->testspacepeak)) {
    /* allocate and map some memory and release it again right away */
    mem = gt_malloc(1 << 26); /* alloc 64 MB */
    map = gt_fa_xmmap_read(gt_str_get(gtr->testspacepeak), NULL);
    gt_fa_xmunmap(map);
    gt_free(mem);
  }

  if (argc == 0 && !gtr->interactive) {
    gt_error_set(err, "neither tool nor script specified; option -help lists "
                      "possible tools");
    had_err = -1;
  }

  if (!had_err && argc) {
    if (!gtr->tools || !gt_toolbox_has_tool(gtr->tools, argv[0])) {
      /* no tool found -> try to open script */
      if (gt_file_exists(argv[0])) {
        /* export script */
        gt_lua_set_script_dir(gtr->L, argv[0]);
        /* run script */
        nargv = gt_cstr_array_prefix_first(argv, gt_error_get_progname(err));
        gt_lua_set_arg(gtr->L, nargv[0], (const char**) nargv+1);
        if (luaL_dofile(gtr->L, argv[0])) {
          /* error */
          gt_assert(lua_isstring(gtr->L, -1)); /* error message on top */
          gt_error_set(err, "could not execute script %s",
                       lua_tostring(gtr->L, -1));
          had_err = -1;
          lua_pop(gtr->L, 1); /* pop error message */
        }
      }
      else {
        /* neither tool nor script found */
        gt_error_set(err, "neither tool nor script '%s' found; option -help "
                          "lists possible tools", argv[0]);
        had_err = -1;
      }
    }
    else {
      /* run tool: either as a plain tool function or as a GtTool object */
      if (!(toolfunc = gt_toolbox_get(gtr->tools, argv[0]))) {
        tool = gt_toolbox_get_tool(gtr->tools, argv[0]);
        gt_assert(tool);
      }
      nargv = gt_cstr_array_prefix_first(argv, gt_error_get_progname(err));
      gt_error_set_progname(err, nargv[0]);
      if (toolfunc)
        had_err = toolfunc(argc, (const char**) nargv, err);
      else
        had_err = gt_tool_run(tool, argc, (const char**) nargv, err);
    }
  }
  gt_cstr_array_delete(nargv);

  if (!had_err && gtr->interactive) {
    gt_showshortversion(gt_error_get_progname(err));
    gt_lua_set_arg(gtr->L, gt_error_get_progname(err), argv);
    run_interactive_lua_interpreter(gtr->L);
  }

  if (had_err)
    return EXIT_FAILURE;
  return EXIT_SUCCESS;
}
int gt_findsubquerygmatchforward(const GtEncseq *encseq, const void *genericindex, unsigned long totallength, Greedygmatchforwardfunction gmatchforward, const GtAlphabet *alphabet, const GtStrArray *queryfilenames, Definedunsignedlong minlength, Definedunsignedlong maxlength, bool showsequence, bool showquerypos, bool showsubjectpos, GtError *err) { Substringinfo substringinfo; Rangespecinfo rangespecinfo; bool haserr = false; GtSeqIterator *seqit; const GtUchar *query; unsigned long querylen; char *desc = NULL; int retval; uint64_t unitnum; gt_error_check(err); substringinfo.genericindex = genericindex; substringinfo.totallength = totallength; rangespecinfo.minlength = minlength; rangespecinfo.maxlength = maxlength; rangespecinfo.showsequence = showsequence; rangespecinfo.showquerypos = showquerypos; rangespecinfo.showsubjectpos = showsubjectpos; substringinfo.preprocessgmatchlength = showunitnum; substringinfo.processgmatchlength = showifinlengthrange; substringinfo.postprocessgmatchlength = NULL; substringinfo.alphabet = alphabet; substringinfo.processinfo = &rangespecinfo; substringinfo.gmatchforward = gmatchforward; substringinfo.encseq = encseq; seqit = gt_seqiterator_sequence_buffer_new(queryfilenames, err); if (!seqit) haserr = true; if (!haserr) { gt_seqiterator_set_symbolmap(seqit, gt_alphabet_symbolmap(alphabet)); for (unitnum = 0; /* Nothing */; unitnum++) { retval = gt_seqiterator_next(seqit, &query, &querylen, &desc, err); if (retval < 0) { haserr = true; break; } if (retval == 0) { break; } gmatchposinsinglesequence(&substringinfo, unitnum, query, querylen, desc); } gt_seqiterator_delete(seqit); } return haserr ? -1 : 0; }
/* Hashmap iteration callback: <key> is a transcript id, <value> the GtArray
   of genome nodes collected for that transcript and <data> the
   ConstructionInfo.

   First, every node flagged as stop codon is merged into the CDS it is
   adjacent to (or, in tidy mode, dropped if a CDS already contains it) and
   removed from the array. Then the remaining nodes are checked for
   consistent strand and sequence id, their ranges are joined, and an mRNA
   feature with all nodes as children is appended to cinfo->mRNAs.

   Returns 0 on success and -1 on error (with <err> set). */
static int construct_mRNAs(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtArray *gt_genome_node_array = (GtArray*) value,
          *mRNAs = (GtArray*) cinfo->mRNAs;
  GtGenomeNode *mRNA_node, *first_node = NULL, *gn;
  const char *tname;
  GtStrand mRNA_strand;
  GtRange mRNA_range;
  GtStr *mRNA_seqid = NULL;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);
  /* at least one node in array */
  gt_assert(gt_array_size(gt_genome_node_array));

  /* TODO: support discontinuous start/stop codons */
  i = 0;
  while (!had_err && i < gt_array_size(gt_genome_node_array)) {
    GtUword j;
    GtRange stop_codon_rng;
    bool found_cds = false;

    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    if (!gt_feature_node_get_attribute((GtFeatureNode*) gn,
                                       GTF_PARSER_STOP_CODON_FLAG)) {
      i++;
      continue;
    }

    /* look for a CDS which contains the stop codon or is adjacent to it */
    stop_codon_rng = gt_genome_node_get_range(gn);
    for (j = 0; !had_err && j < gt_array_size(gt_genome_node_array); j++) {
      GtGenomeNode* gn2;
      GtRange this_rng;
      const char *this_type;

      gn2 = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, j);
      if (gn == gn2)
        continue;
      this_rng = gt_genome_node_get_range(gn2);
      this_type = gt_feature_node_get_type((GtFeatureNode*) gn2);
      if (this_type == gt_symbol(gt_ft_CDS)) {
        if (gt_range_contains(&this_rng, &stop_codon_rng)) {
          if (cinfo->tidy) {
            gt_warning("stop codon on line %u in file %s is contained in "
                       "CDS in line %u",
                       gt_genome_node_get_line_number(gn),
                       gt_genome_node_get_filename(gn),
                       gt_genome_node_get_line_number(gn2));
            found_cds = true;
          }
          else {
            gt_error_set(err, "stop codon on line %u in file %s is "
                              "contained in CDS in line %u",
                         gt_genome_node_get_line_number(gn),
                         gt_genome_node_get_filename(gn),
                         gt_genome_node_get_line_number(gn2));
            had_err = -1;
          }
          break;
        }
        /* CDS ends directly before the stop codon: extend its end */
        if (this_rng.end + 1 == stop_codon_rng.start) {
          this_rng.end = stop_codon_rng.end;
          gt_genome_node_set_range(gn2, &this_rng);
          found_cds = true;
          break;
        }
        /* CDS starts directly after the stop codon: extend its start */
        if (this_rng.start == stop_codon_rng.end + 1) {
          this_rng.start = stop_codon_rng.start;
          gt_genome_node_set_range(gn2, &this_rng);
          found_cds = true;
          break;
        }
      }
    }

    if (found_cds) {
      /* the stop codon is covered by a CDS now. After the removal a
         different node occupies position i, so i is not advanced; otherwise
         that node would never be examined */
      gt_array_rem(gt_genome_node_array, i);
      gt_genome_node_delete(gn);
    }
    else {
      if (!had_err) {
        if (cinfo->tidy) {
          gt_warning("found stop codon on line %u in file %s with no "
                     "flanking CDS, ignoring it",
                     gt_genome_node_get_line_number(gn),
                     gt_genome_node_get_filename(gn));
        }
        else {
          gt_error_set(err, "found stop codon on line %u in file %s with no "
                            "flanking CDS",
                       gt_genome_node_get_line_number(gn),
                       gt_genome_node_get_filename(gn));
          had_err = -1;
        }
      }
      i++;
    }
  }

  if (!had_err) {
    /* a stop codon is only removed if another node (its CDS) was found */
    gt_assert(gt_array_size(gt_genome_node_array));

    /* determine the range and the strand of the mRNA. This is done after
       the stop codon pass, because that pass may have deleted the node
       which was first in the array */
    first_node = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, 0);
    mRNA_range = gt_genome_node_get_range(first_node);
    mRNA_strand = gt_feature_node_get_strand((GtFeatureNode*) first_node);
    mRNA_seqid = gt_genome_node_get_seqid(first_node);

    for (i = 1; !had_err && i < gt_array_size(gt_genome_node_array); i++) {
      GtRange range;
      GtStrand strand;

      gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
      range = gt_genome_node_get_range(gn);
      mRNA_range = gt_range_join(&mRNA_range, &range);
      strand = gt_feature_node_get_strand((GtFeatureNode*) gn);
      if (strand != mRNA_strand) {
        gt_error_set(err, "feature %s on line %u has strand %c, but the "
                          "parent transcript has strand %c",
                     (const char*) key,
                     gt_genome_node_get_line_number(gn),
                     GT_STRAND_CHARS[strand],
                     GT_STRAND_CHARS[mRNA_strand]);
        had_err = -1;
        break;
      }
      else {
        mRNA_strand = gt_strand_join(mRNA_strand, strand);
      }
      if (!had_err && gt_str_cmp(mRNA_seqid, gt_genome_node_get_seqid(gn))) {
        gt_error_set(err, "The features on lines %u and %u refer to different "
                     "genomic sequences (``seqname''), although they have the same "
                     "gene IDs (``gene_id'') which must be globally unique",
                     gt_genome_node_get_line_number(first_node),
                     gt_genome_node_get_line_number(gn));
        had_err = -1;
        break;
      }
    }
  }

  if (!had_err) {
    mRNA_node = gt_feature_node_new(mRNA_seqid, gt_ft_mRNA, mRNA_range.start,
                                    mRNA_range.end, mRNA_strand);
    gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node), "ID", key);
    gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node),
                                  "transcript_id", key);

    /* add the transcript name, if a non-empty one is known */
    if ((tname = gt_hashmap_get(cinfo->transcript_id_to_name_mapping,
                                (const char*) key))
          && strlen(tname) > 0) {
      gt_feature_node_add_attribute((GtFeatureNode*) mRNA_node, GT_GFF_NAME,
                                    tname);
    }

    /* register children; each child gets an additional reference */
    for (i = 0; i < gt_array_size(gt_genome_node_array); i++) {
      gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
      gt_feature_node_add_child((GtFeatureNode*) mRNA_node,
                                (GtFeatureNode*) gt_genome_node_ref(gn));
    }

    /* store the mRNA */
    gt_array_add(mRNAs, mRNA_node);
  }

  return had_err;
}
/* Hashmap iteration callback: <key> is a transcript id, <value> the GtArray
   of genome nodes collected for that transcript and <data> the
   ConstructionInfo. Joins the ranges and strands of all nodes, verifies that
   they share one sequence id and appends an mRNA feature having all nodes as
   children to cinfo->mRNAs. Returns 0 on success and -1 (with <err> set) if
   the nodes refer to different sequence ids. */
static int construct_mRNAs(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtArray *gt_genome_node_array = (GtArray*) value,
          *mRNAs = (GtArray*) cinfo->mRNAs;
  GtGenomeNode *mRNA_node, *first_node, *child;
  GtStr *mRNA_seqid;
  GtRange mRNA_range;
  GtStrand mRNA_strand;
  const char *tname;
  GtUword idx;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);
  /* at least one node in array */
  gt_assert(gt_array_size(gt_genome_node_array));

  /* the first node provides the initial range, strand and sequence id */
  first_node = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, 0);
  mRNA_range = gt_genome_node_get_range(first_node);
  mRNA_strand = gt_feature_node_get_strand((GtFeatureNode*) first_node);
  mRNA_seqid = gt_genome_node_get_seqid(first_node);

  /* fold in all remaining nodes; stops at the first sequence id mismatch */
  for (idx = 1; !had_err && idx < gt_array_size(gt_genome_node_array);
       idx++) {
    GtRange child_range;

    child = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, idx);
    child_range = gt_genome_node_get_range(child);
    mRNA_range = gt_range_join(&mRNA_range, &child_range);
    /* XXX: an error check is necessary here, otherwise gt_strand_join() can
       cause a failed assertion */
    mRNA_strand = gt_strand_join(mRNA_strand,
                          gt_feature_node_get_strand((GtFeatureNode*) child));
    if (gt_str_cmp(mRNA_seqid, gt_genome_node_get_seqid(child)) != 0) {
      gt_error_set(err, "The features on lines %u and %u refer to different "
                   "genomic sequences (``seqname''), although they have the same "
                   "gene IDs (``gene_id'') which must be globally unique",
                   gt_genome_node_get_line_number(first_node),
                   gt_genome_node_get_line_number(child));
      had_err = -1;
    }
  }
  if (had_err)
    return had_err;

  mRNA_node = gt_feature_node_new(mRNA_seqid, gt_ft_mRNA, mRNA_range.start,
                                  mRNA_range.end, mRNA_strand);

  /* add the transcript name, if one is known for this transcript id */
  tname = gt_hashmap_get(cinfo->transcript_id_to_name_mapping,
                         (const char*) key);
  if (tname) {
    gt_feature_node_add_attribute((GtFeatureNode*) mRNA_node, GT_GFF_NAME,
                                  tname);
  }

  /* register children */
  for (idx = 0; idx < gt_array_size(gt_genome_node_array); idx++) {
    child = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, idx);
    gt_feature_node_add_child((GtFeatureNode*) mRNA_node,
                              (GtFeatureNode*) child);
  }

  /* store the mRNA */
  gt_array_add(mRNAs, mRNA_node);

  return had_err;
}
/* Creates a model set from the HMM files named in <hmmfiles>.

   The files are concatenated into a single file in $TMPDIR (or /tmp) whose
   name is the MD5 fingerprint of the concatenated file names. Unless an
   index file (name plus GT_HMM_INDEX_SUFFIX) already exists for it, the
   concatenated file is (re)written and indexed by running `hmmpress -f`.

   Requires the hmmpress executable in PATH. Returns the new model set, or
   NULL with <err> set if hmmpress is unavailable, an HMM file does not exist
   or cannot be read, the concatenated file cannot be written, or hmmpress
   fails.

   NOTE(review): the file name is pasted unquoted into a shell command line;
   TMPDIR values containing shell metacharacters are not handled. */
GtPdomModelSet* gt_pdom_model_set_new(GtStrArray *hmmfiles, GtError *err)
{
  GtStr *concat_dbnames, *cmdline, *indexfilename = NULL;
  GtUword i;
  char *md5_hash;
  int ch; /* must be int: fgetc() returns EOF outside the range of char */
  const char *tmpdir;
  int had_err = 0, rval;
  FILE *dest;
  GtPdomModelSet *pdom_model_set;

  gt_assert(hmmfiles);
  gt_error_check(err);

  /* check that hmmpress can be executed at all */
  rval = system("hmmpress -h > /dev/null");
  if (rval == -1) {
    gt_error_set(err, "error executing system(hmmpress)");
    return NULL;
  }
#ifndef _WIN32
  if (WEXITSTATUS(rval) != 0) {
    gt_error_set(err, "cannot find the hmmpress executable in PATH");
    return NULL;
  }
#else
  /* XXX */
  gt_error_set(err, "hmmpress for Windows not implemented");
  return NULL;
#endif

  pdom_model_set = gt_calloc((size_t) 1, sizeof (GtPdomModelSet));
  concat_dbnames = gt_str_new();
  for (i = 0; i < gt_str_array_size(hmmfiles); i++) {
    const char *filename = gt_str_array_get(hmmfiles, i);
    if (!gt_file_exists(filename)) {
      gt_error_set(err, "invalid HMM file: %s", filename);
      gt_str_delete(concat_dbnames);
      gt_free(pdom_model_set);
      return NULL;
    }
    gt_str_append_cstr(concat_dbnames, filename);
  }

  /* name of the concatenated file: <tmpdir>/<md5 of all file names> */
  pdom_model_set->filename = gt_str_new();
  if (!(tmpdir = getenv("TMPDIR")))
    tmpdir = "/tmp";
  gt_str_append_cstr(pdom_model_set->filename, tmpdir);
  gt_str_append_char(pdom_model_set->filename, GT_PATH_SEPARATOR);
  md5_hash = gt_md5_fingerprint(gt_str_get(concat_dbnames),
                                gt_str_length(concat_dbnames));
  gt_str_append_cstr(pdom_model_set->filename, md5_hash);
  gt_free(md5_hash);
  gt_str_delete(concat_dbnames);
  indexfilename = gt_str_new_cstr(gt_str_get(pdom_model_set->filename));
  gt_str_append_cstr(indexfilename, GT_HMM_INDEX_SUFFIX);

  if (!gt_file_exists(gt_str_get(indexfilename))) {
    /* no index yet: concatenate the HMM files ... */
    dest = fopen(gt_str_get(pdom_model_set->filename), "w+");
    if (!dest) {
      gt_error_set(err, "could not create file %s",
                   gt_str_get(pdom_model_set->filename));
      had_err = -1;
    }
    else {
      bool write_failed;

      for (i = 0; !had_err && i < gt_str_array_size(hmmfiles); i++) {
        const char *filename = gt_str_array_get(hmmfiles, i);
        FILE *source = fopen(filename, "r");
        if (!source) {
          gt_error_set(err, "could not open HMM file %s", filename);
          had_err = -1;
        }
        else {
          while ((ch = fgetc(source)) != EOF)
            (void) fputc(ch, dest);
          (void) fclose(source);
        }
      }
      write_failed = ferror(dest) != 0;
      if (fclose(dest) != 0)
        write_failed = true;
      if (!had_err && write_failed) {
        gt_error_set(err, "could not write file %s",
                     gt_str_get(pdom_model_set->filename));
        had_err = -1;
      }
    }

    /* ... and index the result, but only if it was written completely */
    if (!had_err) {
      /* XXX: read hmmer path from env */
      cmdline = gt_str_new_cstr("hmmpress -f ");
      gt_str_append_str(cmdline, pdom_model_set->filename);
      gt_str_append_cstr(cmdline, "> /dev/null");   /* XXX: portability? */
      rval = system(gt_str_get(cmdline));
      gt_str_delete(cmdline);
      if (rval == -1) {
        /* do not return here: the model set and the index file name are
           released below */
        gt_error_set(err, "error executing system(hmmpress)");
        had_err = -1;
      }
#ifndef _WIN32
      if (!had_err && WEXITSTATUS(rval) != 0) {
        gt_error_set(err, "an error occurred during HMM preprocessing");
        had_err = -1;
      }
#else
      if (!had_err) {
        gt_error_set(err, "WEXITSTATUS not implemented on Windows");
        had_err = -1;
      }
#endif
    }
  }

  if (had_err) {
    gt_pdom_model_set_delete(pdom_model_set);
    pdom_model_set = NULL;
  }
  gt_str_delete(indexfilename);
  return pdom_model_set;
}
int gt_gtf_parser_parse(GtGTFParser *parser, GtQueue *genome_nodes, GtStr *filenamestr, GtFile *fpin, bool be_tolerant, GtError *err) { GtStr *seqid_str, *source_str, *line_buffer; char *line; size_t line_length; GtUword i, line_number = 0; GtGenomeNode *gn; GtRange range; GtPhase phase_value; GtStrand gt_strand_value; GtSplitter *splitter, *attribute_splitter; float score_value; char *seqname, *source, *feature, *start, *end, *score, *strand, *frame, *attributes, *token, *gene_id, *gene_name = NULL, *transcript_id, *transcript_name = NULL, **tokens; GtHashmap *transcript_id_hash; /* map from transcript id to array of genome nodes */ GtArray *gt_genome_node_array; ConstructionInfo cinfo; GTF_feature_type gtf_feature_type; GT_UNUSED bool gff_type_is_valid = false; const char *type = NULL; const char *filename; bool score_is_defined; int had_err = 0; gt_assert(parser && genome_nodes); gt_error_check(err); filename = gt_str_get(filenamestr); /* alloc */ line_buffer = gt_str_new(); splitter = gt_splitter_new(), attribute_splitter = gt_splitter_new(); #define HANDLE_ERROR \ if (had_err) { \ if (be_tolerant) { \ fprintf(stderr, "skipping line: %s\n", gt_error_get(err)); \ gt_error_unset(err); \ gt_str_reset(line_buffer); \ had_err = 0; \ continue; \ } \ else { \ had_err = -1; \ break; \ } \ } while (gt_str_read_next_line_generic(line_buffer, fpin) != EOF) { line = gt_str_get(line_buffer); line_length = gt_str_length(line_buffer); line_number++; had_err = 0; if (line_length == 0) { gt_warning("skipping blank line " GT_WU " in file \"%s\"", line_number, filename); } else if (line[0] == '#') { /* storing comment */ if (line_length >= 2 && line[1] == '#') gn = gt_comment_node_new(line+2); /* store '##' line as '#' line */ else gn = gt_comment_node_new(line+1); gt_genome_node_set_origin(gn, filenamestr, line_number); gt_queue_add(genome_nodes, gn); } else { /* process tab delimited GTF line */ gt_splitter_reset(splitter); gt_splitter_split(splitter, line, line_length, '\t'); 
if (gt_splitter_size(splitter) != 9UL) { gt_error_set(err, "line " GT_WU " in file \"%s\" contains " GT_WU " tab (\\t) " "separated fields instead of 9", line_number, filename, gt_splitter_size(splitter)); had_err = -1; break; } tokens = gt_splitter_get_tokens(splitter); seqname = tokens[0]; source = tokens[1]; feature = tokens[2]; start = tokens[3]; end = tokens[4]; score = tokens[5]; strand = tokens[6]; frame = tokens[7]; attributes = tokens[8]; /* parse feature */ if (GTF_feature_type_get(>f_feature_type, feature) == -1) { /* we skip unknown features */ fprintf(stderr, "skipping line " GT_WU " in file \"%s\": unknown " "feature: \"%s\"\n", line_number, filename, feature); gt_str_reset(line_buffer); continue; } /* translate into GFF3 feature type */ switch (gtf_feature_type) { case GTF_CDS: case GTF_stop_codon: gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker, gt_ft_CDS); type = gt_ft_CDS; break; case GTF_exon: gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker, gt_ft_exon); type = gt_ft_exon; } gt_assert(gff_type_is_valid); /* parse the range */ had_err = gt_parse_range(&range, start, end, line_number, filename, err); HANDLE_ERROR; /* process seqname (we have to do it here because we need the range) */ gt_region_node_builder_add_region(parser->region_node_builder, seqname, range); /* parse the score */ had_err = gt_parse_score(&score_is_defined, &score_value, score, line_number, filename, err); HANDLE_ERROR; /* parse the strand */ had_err = gt_parse_strand(>_strand_value, strand, line_number, filename, err); HANDLE_ERROR; /* parse the frame */ had_err = gt_parse_phase(&phase_value, frame, line_number, filename, err); HANDLE_ERROR; /* parse the attributes */ gt_splitter_reset(attribute_splitter); gene_id = NULL; transcript_id = NULL; gt_splitter_split(attribute_splitter, attributes, strlen(attributes), ';'); for (i = 0; i < gt_splitter_size(attribute_splitter); i++) { token = gt_splitter_get_token(attribute_splitter, i); /* skip 
leading blanks */ while (*token == ' ') token++; /* look for the two mandatory attributes */ if (strncmp(token, GENE_ID_ATTRIBUTE, strlen(GENE_ID_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(GENE_ID_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", GENE_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; gene_id = token + strlen(GENE_ID_ATTRIBUTE) + 1; } else if (strncmp(token, TRANSCRIPT_ID_ATTRIBUTE, strlen(TRANSCRIPT_ID_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(TRANSCRIPT_ID_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; transcript_id = token + strlen(TRANSCRIPT_ID_ATTRIBUTE) + 1; } else if (strncmp(token, GENE_NAME_ATTRIBUTE, strlen(GENE_NAME_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(GENE_NAME_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", GENE_NAME_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; gene_name = token + strlen(GENE_NAME_ATTRIBUTE) + 1; /* for output we want to strip quotes */ if (*gene_name == '"') gene_name++; if (gene_name[strlen(gene_name)-1] == '"') gene_name[strlen(gene_name)-1] = '\0'; } else if (strncmp(token, TRANSCRIPT_NAME_ATTRIBUTE, strlen(TRANSCRIPT_NAME_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(TRANSCRIPT_NAME_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", TRANSCRIPT_NAME_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; transcript_name = token + strlen(TRANSCRIPT_NAME_ATTRIBUTE) + 1; /* for output we want to strip quotes */ if (*transcript_name == '"') transcript_name++; if (transcript_name[strlen(transcript_name)-1] == '"') transcript_name[strlen(transcript_name)-1] = '\0'; } } /* check for the mandatory attributes */ if (!gene_id) { gt_error_set(err, "missing 
attribute \"%s\" on line " GT_WU " in file \"%s\"", GENE_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; if (!transcript_id) { gt_error_set(err, "missing attribute \"%s\" on line " GT_WU " in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; /* process the mandatory attributes */ if (!(transcript_id_hash = gt_hashmap_get(parser->gene_id_hash, gene_id))) { transcript_id_hash = gt_hashmap_new(GT_HASH_STRING, gt_free_func, (GtFree) gt_array_delete); gt_hashmap_add(parser->gene_id_hash, gt_cstr_dup(gene_id), transcript_id_hash); } gt_assert(transcript_id_hash); if (!(gt_genome_node_array = gt_hashmap_get(transcript_id_hash, transcript_id))) { gt_genome_node_array = gt_array_new(sizeof (GtGenomeNode*)); gt_hashmap_add(transcript_id_hash, gt_cstr_dup(transcript_id), gt_genome_node_array); } gt_assert(gt_genome_node_array); /* save optional gene_name and transcript_name attributes */ if (transcript_name && !gt_hashmap_get(parser->transcript_id_to_name_mapping, transcript_id)) { gt_hashmap_add(parser->transcript_id_to_name_mapping, gt_cstr_dup(transcript_id), gt_cstr_dup(transcript_name)); } if (gene_name && !gt_hashmap_get(parser->gene_id_to_name_mapping, gene_id)) { gt_hashmap_add(parser->gene_id_to_name_mapping, gt_cstr_dup(gene_id), gt_cstr_dup(gene_name)); } /* get seqid */ seqid_str = gt_hashmap_get(parser->seqid_to_str_mapping, seqname); if (!seqid_str) { seqid_str = gt_str_new_cstr(seqname); gt_hashmap_add(parser->seqid_to_str_mapping, gt_str_get(seqid_str), seqid_str); } gt_assert(seqid_str); /* construct the new feature */ gn = gt_feature_node_new(seqid_str, type, range.start, range.end, gt_strand_value); gt_genome_node_set_origin(gn, filenamestr, line_number); /* set source */ source_str = gt_hashmap_get(parser->source_to_str_mapping, source); if (!source_str) { source_str = gt_str_new_cstr(source); gt_hashmap_add(parser->source_to_str_mapping, gt_str_get(source_str), source_str); } 
gt_assert(source_str); gt_feature_node_set_source((GtFeatureNode*) gn, source_str); if (score_is_defined) gt_feature_node_set_score((GtFeatureNode*) gn, score_value); if (phase_value != GT_PHASE_UNDEFINED) gt_feature_node_set_phase((GtFeatureNode*) gn, phase_value); gt_array_add(gt_genome_node_array, gn); } gt_str_reset(line_buffer); } /* process all region nodes */ if (!had_err) gt_region_node_builder_build(parser->region_node_builder, genome_nodes); /* process all feature nodes */ cinfo.genome_nodes = genome_nodes; cinfo.gene_id_to_name_mapping = parser->gene_id_to_name_mapping; cinfo.transcript_id_to_name_mapping = parser->transcript_id_to_name_mapping; if (!had_err) { had_err = gt_hashmap_foreach(parser->gene_id_hash, construct_genes, &cinfo, err); } /* free */ gt_splitter_delete(splitter); gt_splitter_delete(attribute_splitter); gt_str_delete(line_buffer); return had_err; }
static int split_fasta_file(const char *filename, GtUword max_filesize, bool force, GtError *err) { GtFile *srcfp = NULL, *destfp = NULL; GtStr *destfilename = NULL; GtUword filenum = 0, bytecount = 0, separator_pos; int read_bytes, had_err = 0; char buf[BUFSIZ]; gt_error_check(err); gt_assert(filename && max_filesize); /* open source file */ srcfp = gt_file_xopen(filename, "r"); gt_assert(srcfp); /* read start characters */ if ((read_bytes = gt_file_xread(srcfp, buf, BUFSIZ)) == 0) { gt_error_set(err, "file \"%s\" is empty", filename); had_err = -1; } bytecount += read_bytes; /* make sure the file is in fasta format */ if (!had_err && buf[0] != '>') { gt_error_set(err, "file is not in FASTA format"); had_err = -1; } if (!had_err) { /* open destination file */ destfilename = gt_str_new(); gt_str_append_cstr_nt(destfilename, filename, gt_file_basename_length(filename)); gt_str_append_char(destfilename, '.'); gt_str_append_ulong(destfilename, ++filenum); gt_str_append_cstr(destfilename, gt_file_mode_suffix(gt_file_mode(srcfp))); if (!(destfp = gt_output_file_xopen_forcecheck(gt_str_get(destfilename), "w", force, err))) { had_err = -1; } if (!had_err) gt_file_xwrite(destfp, buf, read_bytes); while (!had_err && (read_bytes = gt_file_xread(srcfp, buf, BUFSIZ)) != 0) { if (bytecount + read_bytes > max_filesize) { int offset = bytecount < max_filesize ? 
max_filesize - bytecount : 0; if ((separator_pos = buf_contains_separator(buf, offset, read_bytes))) { separator_pos--; gt_assert(separator_pos < read_bytes); if (separator_pos) gt_file_xwrite(destfp, buf, separator_pos); /* close current file */ gt_file_delete(destfp); /* open new file */ gt_str_reset(destfilename); gt_str_append_cstr_nt(destfilename, filename, gt_file_basename_length(filename)); gt_str_append_char(destfilename, '.'); gt_str_append_ulong(destfilename, ++filenum); gt_str_append_cstr(destfilename, gt_file_mode_suffix(gt_file_mode(srcfp))); if (!(destfp = gt_output_file_xopen_forcecheck(gt_str_get(destfilename), "w", force, err))) { had_err = -1; break; } bytecount = read_bytes - separator_pos; /* reset */ gt_assert(buf[separator_pos] == '>'); gt_file_xwrite(destfp, buf + separator_pos, read_bytes - separator_pos); continue; } } bytecount += read_bytes; gt_file_xwrite(destfp, buf, read_bytes); } } /* free */ gt_str_delete(destfilename); /* close current file */ gt_file_delete(destfp); /* close source file */ gt_file_delete(srcfp); return had_err; }
static int gt_fasta_reader_fsm_run(GtFastaReader *fasta_reader, GtFastaReaderProcDescription proc_description, GtFastaReaderProcSequencePart proc_sequence_part, GtFastaReaderProcSequenceLength proc_sequence_length, void *data, GtError *err) { GtFastaReaderFSM *fr = gt_fasta_reader_fsm_cast(fasta_reader); unsigned char cc; GtFastaReaderState state = EXPECTING_SEPARATOR; GtUword sequence_length = 0, line_counter = 1; GtStr *description, *sequence; int had_err = 0; gt_error_check(err); gt_assert(fr); /* init */ description = gt_str_new(); sequence = gt_str_new(); /* at least one function has to be defined */ gt_assert(proc_description || proc_sequence_part || proc_sequence_length); /* rewind sequence file (to allow multiple calls) */ if (fr->sequence_file) gt_file_xrewind(fr->sequence_file); /* reading */ while (!had_err && gt_file_xread(fr->sequence_file, &cc, 1) != 0) { switch (state) { case EXPECTING_SEPARATOR: if (cc != GT_FASTA_SEPARATOR) { gt_error_set(err, "the first character of fasta file \"%s\" has to be '%c'", gt_str_get(fr->sequence_filename), GT_FASTA_SEPARATOR); had_err = -1; } else state = READING_DESCRIPTION; break; case READING_DESCRIPTION: if (cc == '\n') { if (proc_description) { had_err = proc_description(gt_str_get(description), gt_str_length(description), data, err); if (!had_err) gt_str_reset(description); } if (!had_err) { sequence_length = 0; line_counter++; state = READING_SEQUENCE_AFTER_NEWLINE; } } else if (proc_description && cc != '\r') gt_str_append_char(description, cc); break; case READING_SEQUENCE_AFTER_NEWLINE: if (cc == GT_FASTA_SEPARATOR) { if (!sequence_length) { gt_assert(line_counter); gt_error_set(err, "empty sequence after description given in line " ""GT_WU"", line_counter - 1); had_err = -1; break; } else { if (proc_sequence_part) { gt_assert(gt_str_length(sequence)); had_err = proc_sequence_part(gt_str_get(sequence), gt_str_length(sequence), data, err); } if (had_err) break; gt_str_reset(sequence); if (proc_sequence_length) 
had_err = proc_sequence_length(sequence_length, data, err); if (had_err) break; state = READING_DESCRIPTION; continue; } } /*@fallthrough@*/ case READING_SEQUENCE: if (cc == '\n') { line_counter++; state = READING_SEQUENCE_AFTER_NEWLINE; } else { sequence_length++; if (proc_sequence_part) { if (gt_str_length(sequence) == BUFSIZ) { had_err = proc_sequence_part(gt_str_get(sequence), gt_str_length(sequence), data, err); if (had_err) break; gt_str_reset(sequence); } if (cc != ' ' && cc != '\r') gt_str_append_char(sequence, cc); } } break; } } if (!had_err) { /* checks after reading */ switch (state) { case EXPECTING_SEPARATOR: gt_error_set(err, "sequence file \"%s\" is empty", gt_str_get(fr->sequence_filename)); had_err = -1; break; case READING_DESCRIPTION: gt_error_set(err, "unfinished fasta entry in line " GT_WU " of sequence file \"%s\"", line_counter, gt_str_get(fr->sequence_filename)); had_err = -1; break; case READING_SEQUENCE_AFTER_NEWLINE: case READING_SEQUENCE: if (!sequence_length) { gt_assert(line_counter); gt_error_set(err, "empty sequence after description given in line " ""GT_WU"", line_counter - 1); had_err = -1; } else { if (proc_sequence_part) { gt_assert(gt_str_length(sequence)); had_err = proc_sequence_part(gt_str_get(sequence), gt_str_length(sequence), data, err); } if (!had_err && proc_sequence_length) had_err = proc_sequence_length(sequence_length, data, err); } } } /* free */ gt_str_delete(sequence); gt_str_delete(description); return had_err; }
static int gt_xrf_abbr_parse_tree_validate_entries(const GtXRFAbbrParseTree *xrf_abbr_parse_tree, GtError *err) { GtUword i; GtHashmap *abbrvs; const char *value; int had_err = 0; gt_error_check(err); gt_assert(xrf_abbr_parse_tree); abbrvs = gt_hashmap_new(GT_HASH_STRING, NULL, NULL); for (i = 0; !had_err && i < gt_xrf_abbr_parse_tree_num_of_entries(xrf_abbr_parse_tree); i++) { GtXRFAbbrEntry *entry = *(GtXRFAbbrEntry**) gt_array_get(xrf_abbr_parse_tree->entries, i); if (!(value = gt_xrf_abbr_entry_get_value(entry, XRF_LABEL_ABBREVIATION))) { gt_error_set(err, "file \"%s\": line "GT_WU": required " "label \"" XRF_LABEL_ABBREVIATION "\" missing", gt_xrf_abbr_entry_filename(entry), gt_xrf_abbr_entry_line(entry)); had_err = -1; } if (!had_err) { gt_assert(value); if (gt_hashmap_get(abbrvs, value)) { gt_error_set(err, "file \"%s\": line "GT_WU": duplicate abbreviation " "\"%s\", must be unique", gt_xrf_abbr_entry_filename(entry), gt_xrf_abbr_entry_line(entry), value); had_err = -1; } else { gt_hashmap_add(abbrvs, (void*) value, (void*) value); } } if (!had_err && (value = gt_xrf_abbr_entry_get_value(entry, XRF_LABEL_SHORTHAND_NAME))) { if (strlen(value) >= 10) { gt_error_set(err, "file \"%s\": line "GT_WU": length of " "shorthand name \"%s\" " "is not less than 10 characters", gt_xrf_abbr_entry_filename(entry), gt_xrf_abbr_entry_line(entry), value); had_err = -1; } } if (!had_err && (value = gt_xrf_abbr_entry_get_value(entry, XRF_LABEL_LOCAL_ID_SYNTAX))) { GtError *regex_error = gt_error_new(); bool match; if (gt_grep(&match, value, "", regex_error)) { gt_error_set(err, "file \"%s\": line "GT_WU": invalid " "regular expression \"%s\" (%s)", gt_xrf_abbr_entry_filename(entry), gt_xrf_abbr_entry_line(entry), value, gt_error_get(regex_error)); had_err = -1; } gt_error_delete(regex_error); } } gt_hashmap_delete(abbrvs); return had_err; }
/* Entry point of the paircmp tool. Depending on the parsed options it either
   - prints the unit edit distance of the first two given strings together
     with their lengths and the error percentage,
   - prints an edit distance alignment of the first two given strings, or
   - runs all alignment check functions on the configured test cases (which
     are read from two FASTA files if the fasta option was used).
   Returns 0 on success or if the option parser requested an exit, and -1
   (with <err> set) if option parsing or reading a FASTA file failed. */
int gt_paircmp(int argc, const char **argv, GtError *err)
{
  int parsed_args, had_err = 0;
  Cmppairwiseopt cmppairwise;
  GtOPrval oprval;
  GtFastaReader *reader0 = NULL,
                *reader1 = NULL;

  gt_error_check(err);

  oprval = parse_options(&parsed_args, &cmppairwise, argc, argv, err);
  if (oprval == GT_OPTION_PARSER_OK) {
    gt_assert(parsed_args == argc);
    showsimpleoptions(&cmppairwise);
    if (cmppairwise.showedist) {
      GtUword edist, len1, len2, total_length;
      GtStr *s1, *s2;

      gt_assert(gt_str_array_size(cmppairwise.strings) >= 2);
      s1 = gt_str_array_get_str(cmppairwise.strings,0);
      s2 = gt_str_array_get_str(cmppairwise.strings,1UL);
      len1 = gt_str_length(s1);
      len2 = gt_str_length(s2);
      total_length = len1 + len2;
      edist = gt_computegreedyunitedist((const GtUchar *) gt_str_get(s1), len1,
                                        (const GtUchar *) gt_str_get(s2), len2);
      /* two empty strings have no errors; avoid a division by zero */
      printf(GT_WU " " GT_WU " " GT_WU " " GT_WU "%% errors\n",
             edist, len1, len2,
             total_length > 0 ? (200 * edist) / total_length : (GtUword) 0);
    }
    else if (cmppairwise.print) {
      const GtStr *str0, *str1;

      gt_assert(gt_str_array_size(cmppairwise.strings) >= 2);
      str0 = gt_str_array_get_str(cmppairwise.strings,0);
      str1 = gt_str_array_get_str(cmppairwise.strings,1);
      gt_print_edist_alignment((const GtUchar *) gt_str_get(str0),0,
                               gt_str_length(str0),
                               (const GtUchar *) gt_str_get(str1),0,
                               gt_str_length(str1));
    }
    else {
      size_t idx;
      Checkfunctiontabentry checkfunction_tab[] = {
        MAKECheckfunctiontabentry(gt_checkgreedyunitedist),
        MAKECheckfunctiontabentry(gt_checklinearspace),
        MAKECheckfunctiontabentry(gt_checklinearspace_local),
        MAKECheckfunctiontabentry(gt_checkaffinelinearspace),
        MAKECheckfunctiontabentry(gt_checkaffinelinearspace_local),
        MAKECheckfunctiontabentry(gt_checkdiagonalbandalign),
        MAKECheckfunctiontabentry(gt_checkdiagonalbandaffinealign)
      };

      if (cmppairwise.fasta) {
        gt_assert(gt_str_array_size(cmppairwise.files) == 3);
        cmppairwise.fastasequences0 = gt_str_array_new();
        cmppairwise.fastasequences1 = gt_str_array_new();
        /* a failing reader sets <err>; report it instead of going on with
           incomplete sequence sets */
        reader0 = gt_fasta_reader_rec_new(gt_str_array_get_str(
                                                    cmppairwise.files,1UL));
        had_err = gt_fasta_reader_run(reader0, NULL, save_fastaentry, NULL,
                                      cmppairwise.fastasequences0, err);
        if (!had_err) {
          reader1 = gt_fasta_reader_rec_new(gt_str_array_get_str(
                                                    cmppairwise.files,2UL));
          had_err = gt_fasta_reader_run(reader1, NULL, save_fastaentry, NULL,
                                        cmppairwise.fastasequences1, err);
        }
      }
      for (idx = 0;
           !had_err &&
           idx < sizeof checkfunction_tab/sizeof checkfunction_tab[0];
           idx++) {
        GtUword testcases;

        printf("run %s\n",checkfunction_tab[idx].name);
        testcases
          = applycheckfunctiontosimpleoptions(checkfunction_tab[idx].function,
                                              &cmppairwise);
        printf("# number of testcases for %s: " GT_WU "\n",
               checkfunction_tab[idx].name,testcases);
      }
      gt_fasta_reader_delete(reader0);
      gt_fasta_reader_delete(reader1);
    }
  }
  freesimpleoption(&cmppairwise);
  if (oprval == GT_OPTION_PARSER_REQUESTS_EXIT) {
    return 0;
  }
  if (oprval == GT_OPTION_PARSER_ERROR || had_err) {
    return -1;
  }
  return 0;
}
static int gt_genomediff_runner(int argc, const char **argv, int parsed_args, void *tool_arguments, GtError *err) { bool mirrored = false; int had_err = 0, i; GtEncseq *encseq = NULL; GtGenomediffArguments *arguments = tool_arguments; GtLogger *logger; GtShuUnitFileInfo *unit_info = NULL; GtTimer *timer = NULL; gt_error_check(err); gt_assert(arguments); logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stdout); gt_assert(logger); for (i = parsed_args; i < argc; i++) { gt_str_array_add_cstr(arguments->filenames, argv[i]); } if (gt_showtime_enabled()) { timer = gt_timer_new_with_progress_description("start"); gt_timer_start(timer); gt_assert(timer); } if (arguments->with_units) { gt_logger_log(logger, "unitfile option set, filename is %s\n", gt_str_get(arguments->unitfile)); } if (timer != NULL) gt_timer_show_progress(timer, "start shu search", stdout); if (gt_str_array_size(arguments->filenames) > 1UL) { GtEncseqEncoder *ee = gt_encseq_encoder_new(); gt_encseq_encoder_set_timer(ee, timer); gt_encseq_encoder_set_logger(ee, logger); /* kr only makes sense for dna, so we can check this already with ee */ gt_encseq_encoder_set_input_dna(ee); had_err = gt_encseq_encoder_encode(ee, arguments->filenames, gt_str_get(arguments->indexname), err); gt_encseq_encoder_delete(ee); } else { gt_str_append_str(arguments->indexname, gt_str_array_get_str(arguments->filenames, 0)); if (arguments->with_esa || arguments->with_pck) { GtStr *current_line = gt_str_new(); FILE *prj_fp; const char *buffer; char **elements = NULL; prj_fp = gt_fa_fopen_with_suffix(gt_str_get(arguments->indexname), GT_PROJECTFILESUFFIX,"rb",err); if (prj_fp == NULL) had_err = -1; while (!had_err && gt_str_read_next_line(current_line, prj_fp) != EOF) { buffer = gt_str_get(current_line); if (elements != NULL) { gt_free(elements[0]); gt_free(elements[1]); } gt_free(elements); elements = gt_cstr_split(buffer, '='); gt_log_log("%s", elements[0]); if (strcmp("mirrored", elements[0]) == 0) { 
gt_log_log("%s", elements[1]); if (strcmp("1", elements[1]) == 0) { mirrored = true; gt_log_log("sequences are treated as mirrored"); } } gt_str_reset(current_line); } gt_str_delete(current_line); if (elements != NULL) { gt_free(elements[0]); gt_free(elements[1]); } gt_free(elements); gt_fa_xfclose(prj_fp); } } if (!had_err) { GtEncseqLoader *el = gt_encseq_loader_new_from_options(arguments->loadopts, err); if (mirrored) gt_encseq_loader_mirror(el); encseq = gt_encseq_loader_load(el, gt_str_get(arguments->indexname), err); gt_encseq_loader_delete(el); } if (encseq == NULL) had_err = -1; if (!had_err) { unit_info = gt_shu_unit_info_new(encseq); if (arguments->with_units) had_err = gt_shu_unit_file_info_read(arguments->unitfile, unit_info, logger, err); } if (!had_err) { uint64_t **shusums = NULL; if (arguments->with_esa || arguments->with_pck) { shusums = gt_genomediff_shulen_sum(arguments, unit_info, logger, timer, err); if (shusums == NULL) had_err = -1; } else { const bool doesa = true; GenomediffInfo gd_info; Suffixeratoroptions sopts; sopts.beverbose = arguments->verbose; sopts.indexname = arguments->indexname; sopts.db = NULL; sopts.encopts = NULL; sopts.genomediff = true; sopts.inputindex = arguments->indexname; sopts.loadopts = arguments->loadopts; sopts.showprogress = false; sopts.idxopts = arguments->idxopts; gt_assert(unit_info != NULL); gt_array2dim_calloc(shusums, unit_info->num_of_genomes, unit_info->num_of_genomes); gd_info.shulensums = shusums; gd_info.unit_info = unit_info; had_err = runsuffixerator(doesa, &sopts, &gd_info, logger, err); } if (!had_err && shusums != NULL) { had_err = gt_genomediff_kr_calc(shusums, arguments, unit_info, arguments->with_pck, logger, timer, err); gt_array2dim_delete(shusums); } } if (timer != NULL) { gt_timer_show_progress_final(timer, stdout); gt_timer_delete(timer); } gt_logger_delete(logger); gt_encseq_delete(encseq); gt_shu_unit_info_delete(unit_info); return had_err; }
/* Maps the FM-index named <indexname> into <fmindex>: the ASCII parameter
   file (FMASCIIFILESUFFIX) is scanned, the BWT encoding is mapped, the
   derived key values are computed, and finally the data file
   (FMDATAFILESUFFIX) is mapped. On any failure the parts of <fmindex> set up
   so far are released with gt_freefmindex().
   Returns 0 on success and -1 (with <err> set) on failure. */
int gt_mapfmindex (Fmindex *fmindex,const char *indexname,
                   GtLogger *logger,GtError *err)
{
  GtSpecialcharinfo specialcharinfo;
  bool storeindexpos = true;
  int retval = 0;
  FILE *fmafile;

  gt_error_check(err);
  fmindex->mappedptr = NULL;
  fmindex->bwtformatching = NULL;
  fmindex->alphabet = NULL;

  /* step 1: parameters from the ASCII file */
  fmafile = gt_fa_fopen_with_suffix(indexname,FMASCIIFILESUFFIX,"rb",err);
  if (fmafile == NULL ||
      scanfmafileviafileptr(fmindex,
                            &specialcharinfo,
                            &storeindexpos,
                            indexname,
                            fmafile,
                            logger,
                            err) != 0) {
    retval = -1;
  }
  gt_fa_xfclose(fmafile);

  /* step 2: the BWT encoding */
  if (retval == 0) {
    fmindex->bwtformatching = mapbwtencoding(indexname,logger,err);
    if (fmindex->bwtformatching == NULL)
      retval = -1;
  }

  /* step 3: special positions and alphabet */
  if (retval == 0) {
    fmindex->specpos.nextfreeGtPairBwtidx
      = (GtUword) gt_determinenumberofspecialstostore(&specialcharinfo);
    fmindex->specpos.spaceGtPairBwtidx = NULL;
    fmindex->specpos.allocatedGtPairBwtidx = 0;
    fmindex->alphabet
      = gt_alphabet_ref(gt_encseq_alphabet(fmindex->bwtformatching));
    if (fmindex->alphabet == NULL)
      retval = -1;
  }

  /* step 4: derived values and the data file */
  if (retval == 0) {
    GtStr *datafilename;

    gt_computefmkeyvalues (fmindex,
                           &specialcharinfo,
                           fmindex->bwtlength,
                           fmindex->log2bsize,
                           fmindex->log2markdist,
                           gt_alphabet_num_of_chars(fmindex->alphabet),
                           fmindex->suffixlength,
                           storeindexpos);
    datafilename = gt_str_new_cstr(indexname);
    gt_str_append_cstr(datafilename,FMDATAFILESUFFIX);
    if (gt_fillfmmapspecstartptr(fmindex,storeindexpos,datafilename,err) != 0)
      retval = -1;
    gt_str_delete(datafilename);
  }

  if (retval != 0)
    gt_freefmindex(fmindex);
  return retval;
}
static int gt_encseq_info_runner(GT_UNUSED int argc, const char **argv, int parsed_args, void *tool_arguments, GtError *err) { GtEncseqInfoArguments *arguments = tool_arguments; int had_err = 0; GtAlphabet *alpha; const GtUchar *chars; gt_error_check(err); gt_assert(arguments); if (arguments->nomap) { GtEncseqMetadata *emd = gt_encseq_metadata_new(argv[parsed_args], err); if (!emd) had_err = -1; if (!had_err) { if (!arguments->noindexname) { gt_file_xprintf(arguments->outfp, "index name: "); gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]); } gt_file_xprintf(arguments->outfp, "file format version: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_version(emd)); gt_file_xprintf(arguments->outfp, "64-bit file: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_metadata_is64bit(emd) ? "yes" : "no"); gt_file_xprintf(arguments->outfp, "total length: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_total_length(emd)); gt_file_xprintf(arguments->outfp, "number of sequences: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_num_of_sequences(emd)); gt_file_xprintf(arguments->outfp, "number of files: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_num_of_files(emd)); gt_file_xprintf(arguments->outfp, "length of shortest/longest " "sequence: "); gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n", gt_encseq_metadata_min_seq_length(emd), gt_encseq_metadata_max_seq_length(emd)); gt_file_xprintf(arguments->outfp, "accesstype: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_access_type_str(gt_encseq_metadata_accesstype(emd))); alpha = gt_encseq_metadata_alphabet(emd); chars = gt_alphabet_characters(alpha); gt_file_xprintf(arguments->outfp, "alphabet size: "); gt_file_xprintf(arguments->outfp, "%u\n", gt_alphabet_num_of_chars(alpha)); gt_file_xprintf(arguments->outfp, "alphabet characters: "); gt_file_xprintf(arguments->outfp, "%.*s", gt_alphabet_num_of_chars(alpha), 
(char*) chars); if (gt_alphabet_is_dna(alpha)) gt_file_xprintf(arguments->outfp, " (DNA)"); if (gt_alphabet_is_protein(alpha)) gt_file_xprintf(arguments->outfp, " (Protein)"); gt_file_xprintf(arguments->outfp, "\n"); if (arguments->show_alphabet) { GtStr *out = gt_str_new(); gt_alphabet_to_str(alpha, out); gt_file_xprintf(arguments->outfp, "alphabet definition:\n"); gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out)); gt_str_delete(out); } } gt_encseq_metadata_delete(emd); } else { GtEncseqLoader *encseq_loader; GtEncseq *encseq; encseq_loader = gt_encseq_loader_new(); if (arguments->mirror) gt_encseq_loader_mirror(encseq_loader); if (!(encseq = gt_encseq_loader_load(encseq_loader, argv[parsed_args], err))) had_err = -1; if (!had_err) { const GtStrArray *filenames; GtUword i; if (!arguments->noindexname) { gt_file_xprintf(arguments->outfp, "index name: "); gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]); } gt_file_xprintf(arguments->outfp, "file format version: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_version(encseq)); gt_file_xprintf(arguments->outfp, "64-bit file: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_is_64_bit(encseq) ? 
"yes" : "no"); gt_file_xprintf(arguments->outfp, "total length: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_total_length(encseq)); gt_file_xprintf(arguments->outfp, "compressed size: "); gt_file_xprintf(arguments->outfp, ""GT_WU" bytes\n", gt_encseq_sizeofrep(encseq)); gt_file_xprintf(arguments->outfp, "number of sequences: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_num_of_sequences(encseq)); gt_file_xprintf(arguments->outfp, "number of files: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_num_of_files(encseq)); gt_file_xprintf(arguments->outfp, "length of shortest/longest " "sequence: "); gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n", gt_encseq_min_seq_length(encseq), gt_encseq_max_seq_length(encseq)); filenames = gt_encseq_filenames(encseq); gt_file_xprintf(arguments->outfp, "original filenames:\n"); for (i = 0; i < gt_str_array_size(filenames); i++) { gt_file_xprintf(arguments->outfp, "\t%s ("GT_WU" characters)\n", gt_str_array_get(filenames, i), (GtUword) gt_encseq_effective_filelength(encseq, i)); } alpha = gt_encseq_alphabet(encseq); chars = gt_alphabet_characters(alpha); gt_file_xprintf(arguments->outfp, "alphabet size: "); gt_file_xprintf(arguments->outfp, "%u\n", gt_alphabet_num_of_chars(alpha)); gt_file_xprintf(arguments->outfp, "alphabet characters: "); gt_file_xprintf(arguments->outfp, "%.*s", gt_alphabet_num_of_chars(alpha), (char*) chars); if (gt_alphabet_is_dna(alpha)) gt_file_xprintf(arguments->outfp, " (DNA)"); if (gt_alphabet_is_protein(alpha)) gt_file_xprintf(arguments->outfp, " (Protein)"); gt_file_xprintf(arguments->outfp, "\n"); if (arguments->show_alphabet) { GtStr *out = gt_str_new(); gt_alphabet_to_str(alpha, out); gt_file_xprintf(arguments->outfp, "alphabet definition:\n"); gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out)); gt_str_delete(out); } gt_file_xprintf(arguments->outfp, "character distribution:\n"); for (i = 0; i < gt_alphabet_num_of_chars(alpha); i++) { GtUword cc; 
cc = gt_encseq_charcount(encseq, gt_alphabet_encode(alpha, chars[i])); gt_file_xprintf(arguments->outfp, "\t%c: "GT_WU" (%.2f%%)\n", (char) chars[i], cc, (cc /(double) (gt_encseq_total_length(encseq) - gt_encseq_num_of_sequences(encseq)+1))*100); } gt_file_xprintf(arguments->outfp, "number of wildcards: "); gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n", gt_encseq_wildcards(encseq), gt_encseq_realwildcardranges(encseq)); gt_file_xprintf(arguments->outfp, "number of special characters: "); gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n", gt_encseq_specialcharacters(encseq), gt_encseq_realspecialranges(encseq)); gt_file_xprintf(arguments->outfp, "length of longest non-special " "character stretch: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_lengthoflongestnonspecial(encseq)); gt_file_xprintf(arguments->outfp, "accesstype: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_access_type_str(gt_encseq_accesstype_get(encseq))); gt_file_xprintf(arguments->outfp, "bits used per character: "); gt_file_xprintf(arguments->outfp, "%f\n", (double) ((uint64_t) CHAR_BIT * (uint64_t) gt_encseq_sizeofrep(encseq)) / (double) gt_encseq_total_length(encseq)); gt_file_xprintf(arguments->outfp, "has special ranges: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_has_specialranges(encseq) ? "yes" : "no"); gt_file_xprintf(arguments->outfp, "has description support: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_has_description_support(encseq) ? "yes" : "no"); if (gt_encseq_has_description_support(encseq)) { gt_file_xprintf(arguments->outfp, "length of longest description: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_max_desc_length(encseq)); } gt_file_xprintf(arguments->outfp, "has multiple sequence support: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_has_multiseq_support(encseq) ? "yes" : "no"); } gt_encseq_delete(encseq); gt_encseq_loader_delete(encseq_loader); } return had_err; }
/* Map the file <filename> into memory and bind every table that <setup>
   registers in a freshly created GtMapspec to its position inside the
   mapping.

   <setup> is invoked as setup(ms, data, false) to fill the specification
   table. The first entry is assigned at byte offset 0; each following entry
   is assigned at the running offset, which is advanced by
   sizeofunit * numofunits of the entry just processed and then rounded up to
   the next multiple of GT_WORDSIZE_INBYTES. The padding added this way is
   summed up separately.

   Two size checks are performed: the number of bytes mapped must equal the
   value computed by detexpectedaccordingtomapspec(), and the final offset
   must equal <expectedsize> plus the accumulated padding.

   <*mapped> always receives the pointer returned by gt_fa_mmap_read() (NULL
   if mapping failed). This function does not unmap it, not even when a later
   check fails.

   Returns 0 on success and -1 on failure; on the failures detected here
   <err> is set via gt_error_set(). */
int gt_mapspec_read(GtMapspecSetupFunc setup, void *data,
                    const GtStr *filename, unsigned long expectedsize,
                    void **mapped, GtError *err)
{
  GtMapspec *ms = gt_malloc(sizeof (GtMapspec));
  GtMapspecification *spec;
  void *mapping;
  size_t mappedbytes;
  uint64_t specbytes;
  unsigned long offset = 0,
                padtotal = 0;
  int had_err = 0;

  gt_error_check(err);
  GT_INITARRAY(&ms->mapspectable, GtMapspecification);
  setup(ms, data, false);

  mapping = gt_fa_mmap_read(gt_str_get(filename), &mappedbytes, err);
  if (mapping == NULL)
    had_err = -1;
  *mapped = mapping;

  /* bind the first table, which always starts at the beginning of the file */
  if (!had_err
        && assigncorrecttype(ms->mapspectable.spaceGtMapspecification,
                             mapping, 0, err) != 0)
    had_err = -1;

  /* the mapped size has to match what the specification table adds up to */
  if (!had_err)
  {
    specbytes = detexpectedaccordingtomapspec(&ms->mapspectable);
    if (specbytes != (uint64_t) mappedbytes)
    {
      gt_error_set(err,"%lu bytes read from %s, but " Formatuint64_t " expected",
                   (unsigned long) mappedbytes,
                   gt_str_get(filename),
                   PRINTuint64_tcast(specbytes));
      had_err = -1;
    }
  }

  if (!had_err)
  {
    spec = ms->mapspectable.spaceGtMapspecification;
    gt_assert(spec != NULL);

    /* offset directly behind the first table, rounded up to a word boundary */
    offset = CALLCASTFUNC(uint64_t,unsigned_long,
                          (uint64_t) (spec->sizeofunit * spec->numofunits));
    if (offset % (unsigned long) GT_WORDSIZE_INBYTES > 0)
    {
      size_t pad = GT_WORDSIZE_INBYTES - (offset % GT_WORDSIZE_INBYTES);

      offset += (unsigned long) pad;
      padtotal += (unsigned long) pad;
    }

    /* bind the remaining tables one after another */
    spec++;
    while (spec < ms->mapspectable.spaceGtMapspecification +
                  ms->mapspectable.nextfreeGtMapspecification)
    {
      if (assigncorrecttype(spec, mapping, offset, err) != 0)
      {
        had_err = -1;
        break;
      }
      offset = CALLCASTFUNC(uint64_t,unsigned_long,
                            (uint64_t) (offset + spec->sizeofunit *
                                                 spec->numofunits));
      if (offset % (unsigned long) GT_WORDSIZE_INBYTES > 0)
      {
        size_t pad = GT_WORDSIZE_INBYTES - (offset % GT_WORDSIZE_INBYTES);

        offset += (unsigned long) pad;
        padtotal += (unsigned long) pad;
      }
      spec++;
    }
  }

  /* the caller's expectation excludes padding, hence it is added here */
  if (!had_err && expectedsize + padtotal != offset)
  {
    gt_error_set(err,"mapping: expected file size is %lu bytes, "
                     "but file has %lu bytes", expectedsize, offset);
    had_err = -1;
  }

  GT_FREEARRAY(&ms->mapspectable,GtMapspecification);
  gt_free(ms);
  return had_err;
}
/* Attach a "clid" attribute (the cluster number) to the feature nodes in
   <nodes> that correspond to the sequences clustered in <cs>.

   Phase 1 walks every feature node tree in <nodes> and builds a map from
   description strings to feature nodes. A repeat_region node defines the
   prefix "<seqid>_<n>", where <n> is parsed from its ID attribute
   ("repeat_region<n>"). Each later node whose type matches <feature>
   ("lLTR" and "rLTR" both select gt_ft_long_terminal_repeat), or each
   protein_match node whose "name" attribute equals <feature>, is stored
   under the key "<prefix>_<start>_<end>" provided its range covers at least
   10 positions. Nodes visited before any usable repeat_region has been seen
   have no valid prefix and are skipped.

   Phase 2 iterates over all clusters of <cs>, takes the description of each
   cluster element from <encseq>, looks it up in the map and sets "clid" on
   the node found. Descriptions without a matching node are skipped.

   The function currently always returns 0: as in the original code, failures
   reported by the clustered set calls are not propagated.
   NOTE(review): a non-END error status of gt_clustered_set_iterator_next()
   is not distinguished from success here -- confirm the status values and
   propagate them if appropriate. */
static int cluster_annotate_nodes(GtClusteredSet *cs, GtEncseq *encseq,
                                  const char *feature, GtArray *nodes,
                                  GtError *err)
{
  GtFeatureNodeIterator *fni;
  GtFeatureNode *curnode = NULL, *tmp;
  GtClusteredSetIterator *csi = NULL;
  GtGenomeNode *gn;
  GtHashmap *desc2node;
  int had_err = 0;
  unsigned long num_of_clusters, i, elm;
  const char *fnt = NULL, *real_feature;
  char buffer[BUFSIZ];

  gt_error_check(err);

  /* empty prefix means: no usable repeat_region has been seen so far */
  buffer[0] = '\0';

  /* no copy needed, both candidates outlive this function call */
  if ((strcmp(feature, "lLTR") == 0) || (strcmp(feature, "rLTR") == 0))
    real_feature = gt_ft_long_terminal_repeat;
  else
    real_feature = feature;

  /* keys are heap copies made below; presumably released by free_hash */
  desc2node = gt_hashmap_new(GT_HASH_STRING, free_hash, NULL);

  for (i = 0; i < gt_array_size(nodes); i++) {
    gn = *(GtGenomeNode**) gt_array_get(nodes, i);
    if (gt_feature_node_try_cast(gn) == NULL)
      continue;
    fni = gt_feature_node_iterator_new((GtFeatureNode*) gn);
    while ((curnode = gt_feature_node_iterator_next(fni)) != NULL) {
      fnt = gt_feature_node_get_type(curnode);
      if (strcmp(fnt, gt_ft_repeat_region) == 0) {
        const char *rid;
        unsigned long id = 0;
        GtStr *seqid;

        seqid = gt_genome_node_get_seqid((GtGenomeNode*) curnode);
        rid = gt_feature_node_get_attribute(curnode, "ID");
        /* only accept IDs of the expected form; otherwise invalidate the
           prefix instead of building keys from an undefined number */
        if (rid != NULL && sscanf(rid, "repeat_region%lu", &id) == 1) {
          (void) snprintf(buffer, sizeof buffer, "%s_%lu",
                          gt_str_get(seqid), id);
        } else {
          buffer[0] = '\0';
        }
      } else {
        int wanted = 0;

        if (strcmp(fnt, gt_ft_protein_match) == 0) {
          /* protein matches are selected by their "name" attribute */
          const char *attr = gt_feature_node_get_attribute(curnode, "name");
          wanted = (attr != NULL && strcmp(feature, attr) == 0);
        } else if (strcmp(fnt, real_feature) == 0) {
          wanted = 1;
        }

        if (wanted && buffer[0] != '\0') {
          char header[BUFSIZ];
          GtRange range = gt_genome_node_get_range((GtGenomeNode*) curnode);

          /* features shorter than 10 positions are ignored */
          if ((range.end - range.start + 1) >= 10UL) {
            (void) snprintf(header, sizeof header, "%s_%lu_%lu", buffer,
                            range.start, range.end);
            gt_hashmap_add(desc2node, (void*) gt_cstr_dup(header),
                           (void*) curnode);
          }
        }
      }
    }
    gt_feature_node_iterator_delete(fni);
  }

  num_of_clusters = gt_clustered_set_num_of_clusters(cs, err);
  for (i = 0; i < num_of_clusters; i++) {
    csi = gt_clustered_set_get_iterator(cs, i ,err);
    if (csi != NULL) {
      while (!had_err && (gt_clustered_set_iterator_next(csi, &elm, err)
                            != GT_CLUSTERED_SET_ITERATOR_STATUS_END)) {
        char clid[BUFSIZ];
        const char *encseqdesc;
        char *encseqid;
        unsigned long desclen;

        /* the description is delivered with an explicit length, so build a
           NUL-terminated copy for the string-keyed lookup */
        encseqdesc = gt_encseq_description(encseq, &desclen, elm);
        encseqid = gt_calloc((size_t) (desclen + 1), sizeof (char));
        memcpy(encseqid, encseqdesc, (size_t) desclen);
        encseqid[desclen] = '\0';

        tmp = (GtFeatureNode*) gt_hashmap_get(desc2node, (void*) encseqid);
        if (tmp != NULL) {
          (void) snprintf(clid, sizeof clid, "%lu", i);
          gt_feature_node_set_attribute(tmp, "clid", clid);
        }
        gt_free(encseqid);
      }
    }
    gt_clustered_set_iterator_delete(csi, err);
    csi = NULL;
  }

  gt_hashmap_delete(desc2node);
  return had_err;
}