/*
 * Runner of the gdiffcalc tool.
 *
 * Appends the remaining command line arguments to arguments->filenames,
 * loads the encoded sequence named by arguments->indexname, optionally reads
 * the unit file, parses the ';'-separated table found in the first file
 * argument into a square matrix (num_of_genomes x num_of_genomes) and hands
 * that matrix to gt_genomediff_calculate_div_from_avg().
 *
 * Returns 0 on success and a non-zero value on failure (err is set).
 */
static int gt_gdiffcalc_runner(int argc, const char **argv, int parsed_args,
                               void *tool_arguments, GT_UNUSED GtError *err)
{
  GtGenomediffArguments *arguments = tool_arguments;
  int had_err = 0, i;
  GtUword lcounter = 0, zcounter = 0;
  double **shusums = NULL;
  GtEncseq *encseq = NULL;
  GtLogger *logger;
  GtShuUnitFileInfo *unit_info = NULL;
  GtTimer *timer = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stdout);
  gt_assert(logger);

  for (i = parsed_args; i < argc; i++) {
    gt_str_array_add_cstr(arguments->filenames, argv[i]);
  }

  if (gt_showtime_enabled()) {
    timer = gt_timer_new_with_progress_description("load encseq");
    gt_assert(timer); /* check before the timer is used for the first time */
    gt_timer_start(timer);
  }

  if (arguments->with_units) {
    gt_logger_log(logger, "unitfile option set, filename is %s\n",
                  gt_str_get(arguments->unitfile));
  }

  if (!had_err) {
    GtEncseqLoader *el = gt_encseq_loader_new_from_options(arguments->loadopts,
                                                           err);
    encseq = gt_encseq_loader_load(el, gt_str_get(arguments->indexname), err);
    gt_encseq_loader_delete(el);
  }
  if (encseq == NULL)
    had_err = -1;

  if (timer != NULL)
    gt_timer_show_progress(timer, "load units", stdout);

  if (!had_err) {
    unit_info = gt_shu_unit_info_new(encseq);
    if (arguments->with_units)
      had_err = gt_shu_unit_file_info_read(arguments->unitfile, unit_info,
                                           logger, err);
  }

  if (timer != NULL)
    gt_timer_show_progress(timer, "read table", stdout);

  /* the table is read from the first file argument, so there must be one */
  if (!had_err && gt_str_array_size(arguments->filenames) == 0) {
    gt_error_set(err, "missing table file argument");
    had_err = -1;
  }

  if (!had_err) {
    GtIO *table_file = NULL;
    GtTokenizer *tokenizer = NULL;
    GtStr *line = NULL;

    gt_assert(unit_info != NULL);
    gt_array2dim_calloc(shusums, unit_info->num_of_genomes,
                        unit_info->num_of_genomes);
    table_file = gt_io_new(gt_str_array_get(arguments->filenames, 0), "r");
    tokenizer = gt_tokenizer_new(table_file);
    line = gt_tokenizer_get_token(tokenizer);
    while (line != NULL && !had_err) {
      char *cline = gt_str_get(line);
      char *elem = strtok(cline, ";");
      zcounter = 0;
      while (elem != NULL && !had_err) {
        if (*elem != '#') {
          /* the table comes from a file: never write outside the matrix */
          if (lcounter >= unit_info->num_of_genomes ||
              zcounter >= unit_info->num_of_genomes) {
            gt_error_set(err, "table has more rows or columns than there are "
                              "genomes");
            had_err = -1;
            break;
          }
          if (1 != sscanf(elem, "%lf", &shusums[lcounter][zcounter])) {
            had_err = 1;
            gt_error_set(err, "couldn't scan");
            break;
          }
          gt_logger_log(logger,"wert: %lf", shusums[lcounter][zcounter]);
          zcounter++;
        }
        else {
          /* elements starting with '#' are names, they are only logged */
          gt_logger_log(logger, "name: %s", elem);
        }
        elem = strtok(NULL, ";");
      }
      gt_tokenizer_next_token(tokenizer);
      gt_str_delete(line);
      line = gt_tokenizer_get_token(tokenizer);
      lcounter++;
      gt_logger_log(logger, "line "GT_WU"", lcounter);
    }
    /* a token fetched right before leaving the loop on error is still ours */
    gt_str_delete(line);
    /* NOTE(review): the tokenizer is expected to own and release table_file;
       confirm against the GtTokenizer documentation */
    gt_tokenizer_delete(tokenizer);
  }

  if (!had_err) {
    GtUword num_of_seq, file_idx, seq_idx, startpos;
    GT_UNUSED GtUword oldpos = 0;

    gt_assert(unit_info != NULL);
    gt_assert(lcounter == zcounter);
    gt_assert(lcounter == unit_info->num_of_genomes);

    num_of_seq = gt_encseq_num_of_sequences(unit_info->encseq);
    for (seq_idx = 0; seq_idx < num_of_seq; seq_idx++) {
      startpos = gt_encseq_seqstartpos(unit_info->encseq, seq_idx);
      file_idx = gt_encseq_filenum(unit_info->encseq, startpos);
      gt_log_log("seq: "GT_WU" starts at: "GT_WU"\n"
                 "belonges to file: "GT_WU" which is part of genome: %s",
                 seq_idx, startpos, file_idx,
                 gt_str_array_get(unit_info->genome_names,
                                  unit_info->map_files[file_idx]));
      gt_assert(oldpos <= startpos);
      oldpos = startpos;
    }
  }

  if (!had_err && shusums != NULL) {
    had_err = gt_genomediff_calculate_div_from_avg(shusums, arguments,
                                                   unit_info, logger, timer,
                                                   err);
  }
  /* release the matrix on every path, not only on success */
  if (shusums != NULL) {
    gt_array2dim_delete(shusums);
  }

  if (timer != NULL) {
    gt_timer_show_progress_final(timer, stdout);
    gt_timer_delete(timer);
  }
  gt_logger_delete(logger);
  gt_encseq_delete(encseq);
  gt_shu_unit_info_delete(unit_info);

  return had_err;
}
/*
 * Parse the command line into <pw>.
 *
 * Exactly one of -ss (two strings), -ff (two files, or the keyword "fasta"
 * followed by two files), -a (character list and length) or -t (text) has to
 * be used; -e and -p imply -ss. No further program parameters are accepted.
 *
 * The string arrays and the text in <pw> are allocated here in any case;
 * pw->charlistlen is only allocated if -a was given with two arguments.
 *
 * Returns GT_OPTION_PARSER_OK on success; otherwise another GtOPrval and,
 * for the checks done in this function, <err> is set.
 */
static GtOPrval parse_options(int *parsed_args, Cmppairwiseopt *pw, int argc,
                              const char **argv, GtError *err)
{
  GtOptionParser *op;
  GtOption *optionstrings, *optionfiles, *optioncharlistlen, *optiontext,
           *optionshowedist, *optionprint;
  GtStrArray *charlistlen;
  GtOPrval oprval;

  gt_error_check(err);
  charlistlen = gt_str_array_new();
  pw->strings = gt_str_array_new();
  pw->files = gt_str_array_new();
  pw->text = gt_str_new();
  pw->charlistlen = NULL;
  pw->fastasequences0 = NULL;
  pw->fastasequences1 = NULL;
  pw->showedist = false;
  pw->print = false;
  pw->fasta = false;

  op = gt_option_parser_new("options", "Apply function to pairs of strings.");
  gt_option_parser_set_mail_address(op, "<*****@*****.**>");

  optionstrings = gt_option_new_string_array("ss", "use two strings",
                                             pw->strings);
  gt_option_parser_add_option(op, optionstrings);

  optionfiles = gt_option_new_filename_array("ff", "use two files",
                                             pw->files);
  gt_option_parser_add_option(op, optionfiles);

  optioncharlistlen = gt_option_new_string_array("a",
                                              "use character list and length",
                                              charlistlen);
  gt_option_parser_add_option(op, optioncharlistlen);

  optiontext = gt_option_new_string("t", "use text", pw->text, NULL);
  gt_option_parser_add_option(op, optiontext);

  optionshowedist = gt_option_new_bool("e", "output unit edit distance",
                                       &pw->showedist, false);
  gt_option_parser_add_option(op, optionshowedist);

  optionprint = gt_option_new_bool("p", "print edist alignment",
                                   &pw->print, false);
  gt_option_parser_add_option(op, optionprint);

  /* the four input modes are mutually exclusive */
  gt_option_exclude(optionstrings, optionfiles);
  gt_option_exclude(optionstrings, optioncharlistlen);
  gt_option_exclude(optionstrings, optiontext);
  gt_option_exclude(optionfiles, optioncharlistlen);
  gt_option_exclude(optionfiles, optiontext);
  gt_option_exclude(optioncharlistlen, optiontext);
  gt_option_imply(optionshowedist, optionstrings);
  gt_option_imply(optionprint, optionstrings);

  oprval = gt_option_parser_parse(op, parsed_args, argc, argv, gt_versionfunc,
                                  err);
  if (oprval == GT_OPTION_PARSER_OK) {
    if (gt_option_is_set(optionstrings)) {
      if (gt_str_array_size(pw->strings) != 2UL) {
        gt_error_set(err, "option -ss requires two string arguments");
        oprval = GT_OPTION_PARSER_ERROR;
      }
    }
    else if (gt_option_is_set(optionfiles)) {
      if (gt_str_array_size(pw->files) != 2UL) {
        /* three arguments are fine if the first one is the keyword "fasta" */
        if (gt_str_array_size(pw->files) == 3UL &&
            !strcmp(gt_str_array_get(pw->files,0),"fasta")) {
          pw->fasta = true;
        }
        if (!pw->fasta) {
          gt_error_set(err, "option -ff requires two filename arguments or "
                            "keyword fasta and two filename arguments in "
                            "FASTA format");
          oprval = GT_OPTION_PARSER_ERROR;
        }
      }
    }
    else if (gt_option_is_set(optioncharlistlen)) {
      GtWord readint = 0;

      if (gt_str_array_size(charlistlen) != 2UL) {
        gt_error_set(err, "option -a requires charlist and length argument");
        oprval = GT_OPTION_PARSER_ERROR;
      }
      else {
        pw->charlistlen = gt_malloc(sizeof *pw->charlistlen);
        pw->charlistlen->charlist
          = gt_str_ref(gt_str_array_get_str(charlistlen, 0));
        /* keep the length defined even if the argument cannot be parsed */
        pw->charlistlen->len = 0;
        if (sscanf(gt_str_array_get(charlistlen,1UL), GT_WD, &readint) != 1 ||
            readint < 1L) {
          gt_error_set(err,
                       "option -a requires charlist and length argument");
          oprval = GT_OPTION_PARSER_ERROR;
        }
        else {
          pw->charlistlen->len = (GtUword) readint;
        }
      }
    }
    else if (!gt_option_is_set(optiontext)) {
      gt_error_set(err, "use exactly one of the options -ss, -ff, -a, -t");
      oprval = GT_OPTION_PARSER_ERROR;
    }
  }
  gt_option_parser_delete(op);

  if (oprval == GT_OPTION_PARSER_OK && *parsed_args != argc) {
    gt_error_set(err, "superfluous program parameters");
    oprval = GT_OPTION_PARSER_ERROR;
  }
  gt_str_array_delete(charlistlen);
  return oprval;
}
/* Hashmap iteration callback: <value> is a NodeInfoElement, <data> the
   GtDiagram. Every block stored in the element is appended to the block list
   of the track chosen by the diagram's track selector function. The element
   and its per-type bookkeeping are released afterwards. Always returns 0. */
static int collect_blocks(GT_UNUSED void *key, void *value, void *data,
                          GT_UNUSED GtError *err)
{
  NodeInfoElement *ni = (NodeInfoElement*) value;
  GtDiagram *diagram = (GtDiagram*) data;
  GtStr *track_key = gt_str_new();
  GtUword type_idx;

  for (type_idx = 0; type_idx < gt_str_array_size(ni->types); type_idx++) {
    const char *type = gt_str_array_get(ni->types, type_idx);
    PerTypeInfo *type_info = gt_hashmap_get(ni->type_index, type);
    GtBlock *mainblock = NULL;
    GtUword tuple_idx;

    gt_assert(type_info);
    for (tuple_idx = 0; tuple_idx < gt_array_size(type_info->blocktuples);
         tuple_idx++) {
      GtBlockTuple *bt = *(GtBlockTuple**) gt_array_get(type_info->blocktuples,
                                                        tuple_idx);
      GtBlock *block;
      GtArray *track_blocks;

      if (bt->rep == GT_UNDEF_REPR && type_info->must_merge) {
        /* this tuple carries the main block: it is not added to a track
           itself but remembered for merging into the following blocks */
        mainblock = gt_block_ref(bt->block);
        gt_block_delete(mainblock);
        gt_free(bt);
        continue;
      }

      if (mainblock) {
        /* merge the tuple's block into a copy of the main block */
        block = gt_block_clone(mainblock);
        gt_block_merge(block, bt->block);
        gt_block_delete(bt->block);
      }
      else {
        block = bt->block;
      }
      gt_assert(block);

      /* let the track selector hook decide where the block belongs */
      gt_str_reset(track_key);
      diagram->select_func(block, track_key, diagram->ptr);

      track_blocks = (GtArray*) gt_hashmap_get(diagram->blocks,
                                               gt_str_get(track_key));
      if (!track_blocks) {
        /* first block of this track: create its list */
        track_blocks = gt_array_new(sizeof (GtBlock*));
        gt_hashmap_add(diagram->blocks, gt_cstr_dup(gt_str_get(track_key)),
                       track_blocks);
      }
      gt_assert(track_blocks);
      gt_array_add(track_blocks, block);
      gt_free(bt);
    }
    gt_array_delete(type_info->blocktuples);
    gt_hashmap_delete(type_info->rep_index);
    gt_block_delete(mainblock);
  }

  gt_hashmap_delete(ni->type_index);
  gt_str_array_delete(ni->types);
  gt_free(ni);
  gt_str_delete(track_key);
  return 0;
}
/* Write the statistics collected by the stat visitor <nv> to <outfp>.
   Counters which are zero are not shown; distributions and the table of used
   source tags are only shown if they are present in the visitor. */
void gt_stat_visitor_show_stats(GtNodeVisitor *nv, GtFile *outfp)
{
  GtStatVisitor *stats = stat_visitor_cast(nv);

  /* plain counters */
  if (stats->number_of_sequence_regions != 0) {
    gt_file_xprintf(outfp, "sequence regions: %lu (total length: %llu)\n",
                    stats->number_of_sequence_regions,
                    stats->total_length_of_sequence_regions);
  }
  if (stats->number_of_multi_features != 0) {
    gt_file_xprintf(outfp, "multi-features: %lu\n",
                    stats->number_of_multi_features);
  }
  if (stats->number_of_genes != 0) {
    gt_file_xprintf(outfp, "genes: %lu\n", stats->number_of_genes);
  }
  if (stats->number_of_protein_coding_genes != 0) {
    gt_file_xprintf(outfp, "protein-coding genes: %lu\n",
                    stats->number_of_protein_coding_genes);
  }
  if (stats->number_of_mRNAs != 0) {
    gt_file_xprintf(outfp, "mRNAs: %lu\n", stats->number_of_mRNAs);
  }
  if (stats->number_of_protein_coding_mRNAs != 0) {
    gt_file_xprintf(outfp, "protein-coding mRNAs: %lu\n",
                    stats->number_of_protein_coding_mRNAs);
  }
  if (stats->number_of_exons != 0) {
    gt_file_xprintf(outfp, "exons: %lu\n", stats->number_of_exons);
  }
  if (stats->number_of_CDSs != 0) {
    gt_file_xprintf(outfp, "CDSs: %lu\n", stats->number_of_CDSs);
  }
  if (stats->number_of_LTR_retrotransposons != 0) {
    gt_file_xprintf(outfp, "LTR_retrotransposons: %lu\n",
                    stats->number_of_LTR_retrotransposons);
  }

  /* distributions, each preceded by a heading line */
  if (stats->gene_length_distribution != NULL) {
    gt_file_xprintf(outfp, "gene length distribution:\n");
    gt_disc_distri_show(stats->gene_length_distribution, outfp);
  }
  if (stats->gene_score_distribution != NULL) {
    gt_file_xprintf(outfp, "gene score distribution:\n");
    gt_disc_distri_show(stats->gene_score_distribution, outfp);
  }
  if (stats->exon_length_distribution != NULL) {
    gt_file_xprintf(outfp, "exon length distribution:\n");
    gt_disc_distri_show(stats->exon_length_distribution, outfp);
  }
  if (stats->exon_number_distribution != NULL) {
    gt_file_xprintf(outfp, "exon number distribution:\n");
    gt_disc_distri_show(stats->exon_number_distribution, outfp);
  }
  if (stats->intron_length_distribution != NULL) {
    gt_file_xprintf(outfp, "intron length distribution:\n");
    gt_disc_distri_show(stats->intron_length_distribution, outfp);
  }
  if (stats->cds_length_distribution != NULL) {
    gt_file_xprintf(outfp, "CDS length distribution:\n");
    gt_disc_distri_show(stats->cds_length_distribution, outfp);
  }

  /* one line per source tag */
  if (stats->used_sources != NULL) {
    GtStrArray *source_tags;
    unsigned long idx;

    gt_file_xprintf(outfp, "used source tags:\n");
    source_tags = gt_cstr_table_get_all(stats->used_sources);
    idx = 0;
    while (idx < gt_str_array_size(source_tags)) {
      gt_file_xprintf(outfp, "%s\n", gt_str_array_get(source_tags, idx));
      idx++;
    }
    gt_str_array_delete(source_tags);
  }
}
/*
 * Validate the options of the occratio tool and derive the values computed
 * from them.
 *
 * If -mersizes is set, its arguments must be strictly increasing positive
 * integers; minmersize/maxmersize are taken from the first/last of them.
 * Otherwise minmersize, maxmersize and stepmersize define the mer sizes.
 * In both cases arguments->outputvector gets one bit set per mer size, and
 * arguments->outputmode is assembled from the arguments of -output.
 *
 * Returns 0 on success, -1 on failure (err is set).
 */
static int gt_tyr_occratio_arguments_check(int rest_argc,
                                           void *tool_arguments,
                                           GtError *err)
{
  Tyr_occratio_options *arguments = tool_arguments;
  bool haserr = false;
  Optionargmodedesc outputmodedesctable[] =
  {
    {"unique","number of unique mers",TYROCC_OUTPUTUNIQUE},
    {"nonunique","number of nonunique mers (single count)",
                 TYROCC_OUTPUTNONUNIQUE},
    {"nonuniquemulti","number of nonunique mers (multi count)",
                 TYROCC_OUTPUTNONUNIQUEMULTI},
    {"relative","fraction of unique/non-unique mers relative to all mers",
                 TYROCC_OUTPUTRELATIVE},
    {"total","number of all mers",TYROCC_OUTPUTTOTAL}
  };

  if (rest_argc != 0) {
    gt_error_set(err,"superfluous arguments");
    return -1;
  }

  if (gt_option_is_set(arguments->refoptionmersizes)) {
    unsigned long *mersizes = NULL;
    unsigned long idx,
                  numofmersizes = gt_str_array_size(arguments->mersizesstrings);

    if (numofmersizes == 0) {
      gt_error_set(err,"missing argument to option -mersizes:");
      haserr = true;
    }
    else {
      mersizes = gt_malloc(sizeof(*mersizes) * numofmersizes);
      for (idx=0; idx<numofmersizes; idx++) {
        long readnum;

        if (sscanf(gt_str_array_get(arguments->mersizesstrings,idx),
                   "%ld",&readnum) != 1 || readnum <= 0) {
          gt_error_set(err,"invalid argument \"%s\" of option -mersizes: "
                           "must be a positive integer",
                           gt_str_array_get(arguments->mersizesstrings,idx));
          haserr = true;
          break;
        }
        mersizes[idx] = (unsigned long) readnum;
        if (idx > 0 && mersizes[idx-1] >= mersizes[idx]) {
          gt_error_set(err,"invalid argument %s to option -mersizes: "
                           "positive numbers must be strictly increasing",
                           gt_str_array_get(arguments->mersizesstrings,idx));
          haserr = true;
          break;
        }
      }
    }
    if (!haserr) {
      gt_assert(mersizes != NULL);
      arguments->minmersize = mersizes[0];
      arguments->maxmersize = mersizes[numofmersizes-1];
      INITBITTAB(arguments->outputvector,arguments->maxmersize+1);
      for (idx=0; idx<numofmersizes; idx++) {
        SETIBIT(arguments->outputvector,mersizes[idx]);
      }
    }
    gt_free(mersizes);
  }
  else {
    if (arguments->minmersize == 0) {
      gt_error_set(err,"if option -mersizes is not used, then option "
                       "-minmersize is mandatory");
      haserr = true;
    }
    if (!haserr && arguments->maxmersize == 0) {
      gt_error_set(err,"if option -mersizes is not used, then option "
                       "-maxmersize is mandatory");
      haserr = true;
    }
    if (!haserr && arguments->minmersize > arguments->maxmersize) {
      gt_error_set(err,"minimum mer size must not be larger than "
                       "maximum mer size");
      haserr = true;
    }
    /* a step of 0 would never advance the loop over the mer sizes below */
    if (!haserr && arguments->stepmersize == 0) {
      gt_error_set(err,"step value must be positive");
      haserr = true;
    }
    if (!haserr &&
        arguments->minmersize+arguments->stepmersize > arguments->maxmersize) {
      gt_error_set(err,"minimum mer size + step value must be smaller or "
                       "equal to maximum mersize");
      haserr = true;
    }
    if (!haserr) {
      unsigned long outputval;

      INITBITTAB(arguments->outputvector,arguments->maxmersize+1);
      for (outputval = arguments->minmersize;
           outputval <= arguments->maxmersize;
           outputval += arguments->stepmersize) {
        SETIBIT(arguments->outputvector,outputval);
      }
    }
  }

  if (!haserr) {
    unsigned long idx;

    /* every argument of -output contributes one bit to the output mode */
    for (idx=0; idx<gt_str_array_size(arguments->outputspec); idx++) {
      if (optionargaddbitmask(outputmodedesctable,
                              sizeof (outputmodedesctable)/
                              sizeof (outputmodedesctable[0]),
                              &arguments->outputmode,
                              "-output",
                              gt_str_array_get(arguments->outputspec,idx),
                              err) != 0) {
        haserr = true;
        break;
      }
    }
  }
  if (!haserr) {
    if ((arguments->outputmode & TYROCC_OUTPUTRELATIVE) &&
        !(arguments->outputmode &
            (TYROCC_OUTPUTUNIQUE | TYROCC_OUTPUTNONUNIQUE |
                                   TYROCC_OUTPUTNONUNIQUEMULTI))) {
      gt_error_set(err,"argument relative to option -output requires that one "
                       "of the arguments unique, nonunique, or nonuniquemulti "
                       "is used");
      haserr = true;
    }
  }
  return haserr ? - 1: 0;
}
static int gt_encseq_info_runner(GT_UNUSED int argc, const char **argv, int parsed_args, void *tool_arguments, GtError *err) { GtEncseqInfoArguments *arguments = tool_arguments; int had_err = 0; GtAlphabet *alpha; const GtUchar *chars; gt_error_check(err); gt_assert(arguments); if (arguments->nomap) { GtEncseqMetadata *emd = gt_encseq_metadata_new(argv[parsed_args], err); if (!emd) had_err = -1; if (!had_err) { if (!arguments->noindexname) { gt_file_xprintf(arguments->outfp, "index name: "); gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]); } gt_file_xprintf(arguments->outfp, "file format version: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_version(emd)); gt_file_xprintf(arguments->outfp, "64-bit file: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_metadata_is64bit(emd) ? "yes" : "no"); gt_file_xprintf(arguments->outfp, "total length: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_total_length(emd)); gt_file_xprintf(arguments->outfp, "number of sequences: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_num_of_sequences(emd)); gt_file_xprintf(arguments->outfp, "number of files: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_num_of_files(emd)); gt_file_xprintf(arguments->outfp, "length of shortest/longest " "sequence: "); gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n", gt_encseq_metadata_min_seq_length(emd), gt_encseq_metadata_max_seq_length(emd)); gt_file_xprintf(arguments->outfp, "accesstype: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_access_type_str(gt_encseq_metadata_accesstype(emd))); alpha = gt_encseq_metadata_alphabet(emd); chars = gt_alphabet_characters(alpha); gt_file_xprintf(arguments->outfp, "alphabet size: "); gt_file_xprintf(arguments->outfp, "%u\n", gt_alphabet_num_of_chars(alpha)); gt_file_xprintf(arguments->outfp, "alphabet characters: "); gt_file_xprintf(arguments->outfp, "%.*s", gt_alphabet_num_of_chars(alpha), 
(char*) chars); if (gt_alphabet_is_dna(alpha)) gt_file_xprintf(arguments->outfp, " (DNA)"); if (gt_alphabet_is_protein(alpha)) gt_file_xprintf(arguments->outfp, " (Protein)"); gt_file_xprintf(arguments->outfp, "\n"); if (arguments->show_alphabet) { GtStr *out = gt_str_new(); gt_alphabet_to_str(alpha, out); gt_file_xprintf(arguments->outfp, "alphabet definition:\n"); gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out)); gt_str_delete(out); } } gt_encseq_metadata_delete(emd); } else { GtEncseqLoader *encseq_loader; GtEncseq *encseq; encseq_loader = gt_encseq_loader_new(); if (arguments->mirror) gt_encseq_loader_mirror(encseq_loader); if (!(encseq = gt_encseq_loader_load(encseq_loader, argv[parsed_args], err))) had_err = -1; if (!had_err) { const GtStrArray *filenames; GtUword i; if (!arguments->noindexname) { gt_file_xprintf(arguments->outfp, "index name: "); gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]); } gt_file_xprintf(arguments->outfp, "file format version: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_version(encseq)); gt_file_xprintf(arguments->outfp, "64-bit file: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_is_64_bit(encseq) ? 
"yes" : "no"); gt_file_xprintf(arguments->outfp, "total length: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_total_length(encseq)); gt_file_xprintf(arguments->outfp, "compressed size: "); gt_file_xprintf(arguments->outfp, ""GT_WU" bytes\n", gt_encseq_sizeofrep(encseq)); gt_file_xprintf(arguments->outfp, "number of sequences: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_num_of_sequences(encseq)); gt_file_xprintf(arguments->outfp, "number of files: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_num_of_files(encseq)); gt_file_xprintf(arguments->outfp, "length of shortest/longest " "sequence: "); gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n", gt_encseq_min_seq_length(encseq), gt_encseq_max_seq_length(encseq)); filenames = gt_encseq_filenames(encseq); gt_file_xprintf(arguments->outfp, "original filenames:\n"); for (i = 0; i < gt_str_array_size(filenames); i++) { gt_file_xprintf(arguments->outfp, "\t%s ("GT_WU" characters)\n", gt_str_array_get(filenames, i), (GtUword) gt_encseq_effective_filelength(encseq, i)); } alpha = gt_encseq_alphabet(encseq); chars = gt_alphabet_characters(alpha); gt_file_xprintf(arguments->outfp, "alphabet size: "); gt_file_xprintf(arguments->outfp, "%u\n", gt_alphabet_num_of_chars(alpha)); gt_file_xprintf(arguments->outfp, "alphabet characters: "); gt_file_xprintf(arguments->outfp, "%.*s", gt_alphabet_num_of_chars(alpha), (char*) chars); if (gt_alphabet_is_dna(alpha)) gt_file_xprintf(arguments->outfp, " (DNA)"); if (gt_alphabet_is_protein(alpha)) gt_file_xprintf(arguments->outfp, " (Protein)"); gt_file_xprintf(arguments->outfp, "\n"); if (arguments->show_alphabet) { GtStr *out = gt_str_new(); gt_alphabet_to_str(alpha, out); gt_file_xprintf(arguments->outfp, "alphabet definition:\n"); gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out)); gt_str_delete(out); } gt_file_xprintf(arguments->outfp, "character distribution:\n"); for (i = 0; i < gt_alphabet_num_of_chars(alpha); i++) { GtUword cc; 
cc = gt_encseq_charcount(encseq, gt_alphabet_encode(alpha, chars[i])); gt_file_xprintf(arguments->outfp, "\t%c: "GT_WU" (%.2f%%)\n", (char) chars[i], cc, (cc /(double) (gt_encseq_total_length(encseq) - gt_encseq_num_of_sequences(encseq)+1))*100); } gt_file_xprintf(arguments->outfp, "number of wildcards: "); gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n", gt_encseq_wildcards(encseq), gt_encseq_realwildcardranges(encseq)); gt_file_xprintf(arguments->outfp, "number of special characters: "); gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n", gt_encseq_specialcharacters(encseq), gt_encseq_realspecialranges(encseq)); gt_file_xprintf(arguments->outfp, "length of longest non-special " "character stretch: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_lengthoflongestnonspecial(encseq)); gt_file_xprintf(arguments->outfp, "accesstype: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_access_type_str(gt_encseq_accesstype_get(encseq))); gt_file_xprintf(arguments->outfp, "bits used per character: "); gt_file_xprintf(arguments->outfp, "%f\n", (double) ((uint64_t) CHAR_BIT * (uint64_t) gt_encseq_sizeofrep(encseq)) / (double) gt_encseq_total_length(encseq)); gt_file_xprintf(arguments->outfp, "has special ranges: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_has_specialranges(encseq) ? "yes" : "no"); gt_file_xprintf(arguments->outfp, "has description support: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_has_description_support(encseq) ? "yes" : "no"); if (gt_encseq_has_description_support(encseq)) { gt_file_xprintf(arguments->outfp, "length of longest description: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_max_desc_length(encseq)); } gt_file_xprintf(arguments->outfp, "has multiple sequence support: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_has_multiseq_support(encseq) ? "yes" : "no"); } gt_encseq_delete(encseq); gt_encseq_loader_delete(encseq_loader); } return had_err; }
/*
 * Create a model set from the HMM files named in <hmmfiles>.
 *
 * Requires the hmmpress executable in PATH. The name of the combined model
 * file is $TMPDIR (or /tmp) followed by the MD5 fingerprint of the
 * concatenated file names. If no index file (that name plus
 * GT_HMM_INDEX_SUFFIX) exists yet, all HMM files are concatenated into the
 * combined file and "hmmpress -f" is run on it.
 *
 * Returns the new model set, or NULL on failure (err is set).
 *
 * NOTE(review): file names are passed to system() unquoted; names containing
 * shell metacharacters are not handled.
 */
GtPdomModelSet* gt_pdom_model_set_new(GtStrArray *hmmfiles, GtError *err)
{
  GtStr *concat_dbnames, *cmdline, *indexfilename = NULL;
  GtUword i;
  char *md5_hash;
  const char *tmpdir;
  int had_err = 0, rval;
  int ch; /* int, not char: must be able to hold EOF next to every byte */
  FILE *dest;
  GtPdomModelSet *pdom_model_set;

  gt_assert(hmmfiles);
  gt_error_check(err);

  /* probe for hmmpress before doing any work */
  rval = system("hmmpress -h > /dev/null");
  if (rval == -1) {
    gt_error_set(err, "error executing system(hmmpress)");
    return NULL;
  }
#ifndef _WIN32
  if (WEXITSTATUS(rval) != 0) {
    gt_error_set(err, "cannot find the hmmpress executable in PATH");
    return NULL;
  }
#else
  /* XXX */
  gt_error_set(err, "hmmpress for Windows not implemented");
  return NULL;
#endif

  pdom_model_set = gt_calloc((size_t) 1, sizeof (GtPdomModelSet));
  concat_dbnames = gt_str_new();
  for (i = 0; i < gt_str_array_size(hmmfiles); i++) {
    const char *filename = gt_str_array_get(hmmfiles, i);
    if (!gt_file_exists(filename)) {
      gt_error_set(err, "invalid HMM file: %s", filename);
      gt_str_delete(concat_dbnames);
      gt_free(pdom_model_set);
      return NULL;
    }
    gt_str_append_cstr(concat_dbnames, filename);
  }

  /* name of the combined file: <tmpdir>/<md5 of concatenated file names> */
  pdom_model_set->filename = gt_str_new();
  if (!(tmpdir = getenv("TMPDIR")))
    tmpdir = "/tmp";
  gt_str_append_cstr(pdom_model_set->filename, tmpdir);
  gt_str_append_char(pdom_model_set->filename, GT_PATH_SEPARATOR);
  md5_hash = gt_md5_fingerprint(gt_str_get(concat_dbnames),
                                gt_str_length(concat_dbnames));
  gt_str_append_cstr(pdom_model_set->filename, md5_hash);
  gt_free(md5_hash);
  gt_str_delete(concat_dbnames);
  indexfilename = gt_str_new_cstr(gt_str_get(pdom_model_set->filename));
  gt_str_append_cstr(indexfilename, GT_HMM_INDEX_SUFFIX);

  if (!gt_file_exists(gt_str_get(indexfilename))) {
    /* no index yet: concatenate all HMM files into the combined file */
    dest = fopen(gt_str_get(pdom_model_set->filename), "w+");
    if (!dest) {
      gt_error_set(err, "could not create file %s",
                   gt_str_get(pdom_model_set->filename));
      had_err = -1;
    }
    else {
      for (i = 0; !had_err && i < gt_str_array_size(hmmfiles); i++) {
        const char *filename = gt_str_array_get(hmmfiles, i);
        FILE *source = fopen(filename, "r");
        if (!source) {
          gt_error_set(err, "could not open HMM file %s", filename);
          had_err = -1;
        }
        else {
          while ((ch = fgetc(source)) != EOF)
            (void) fputc(ch, dest);
          (void) fclose(source);
        }
      }
      /* fclose() flushes: a failure means the combined file is incomplete */
      if (fclose(dest) != 0 && !had_err) {
        gt_error_set(err, "could not write file %s",
                     gt_str_get(pdom_model_set->filename));
        had_err = -1;
      }
    }

    /* only preprocess a combined file which was written completely */
    if (!had_err) {
      /* XXX: read hmmer path from env */
      cmdline = gt_str_new_cstr("hmmpress -f ");
      gt_str_append_str(cmdline, pdom_model_set->filename);
      gt_str_append_cstr(cmdline, "> /dev/null");   /* XXX: portability? */

      rval = system(gt_str_get(cmdline));
      gt_str_delete(cmdline);
      if (rval == -1) {
        gt_error_set(err, "error executing system(hmmpress)");
        had_err = -1; /* fall through to the common cleanup below */
      }
      else {
#ifndef _WIN32
        if (WEXITSTATUS(rval) != 0) {
          gt_error_set(err, "an error occurred during HMM preprocessing");
          had_err = -1;
        }
#else
        gt_error_set(err, "WEXITSTATUS not implemented on Windows");
        had_err = -1;
#endif
      }
    }
  }

  if (had_err) {
    gt_pdom_model_set_delete(pdom_model_set);
    pdom_model_set = NULL;
  }
  gt_str_delete(indexfilename);
  return pdom_model_set;
}
int gt_gtf_parser_parse(GtGTFParser *parser, GtQueue *genome_nodes, GtStr *filenamestr, GtFile *fpin, bool be_tolerant, GtError *err) { GtStr *seqid_str, *source_str, *line_buffer; char *line; size_t line_length; GtUword i, line_number = 0; GtGenomeNode *gn; GtRange range; GtPhase phase_value; GtStrand gt_strand_value; GtSplitter *splitter, *attribute_splitter; float score_value; char *seqname, *source, *feature, *start, *end, *score, *strand, *frame, *attributes, *token, *gene_id, *gene_name = NULL, *transcript_id, *transcript_name = NULL, **tokens; GtHashmap *transcript_id_hash; /* map from transcript id to array of genome nodes */ GtArray *gt_genome_node_array; ConstructionInfo cinfo; GTF_feature_type gtf_feature_type; GT_UNUSED bool gff_type_is_valid = false; const char *type = NULL; const char *filename; bool score_is_defined; int had_err = 0; gt_assert(parser && genome_nodes); gt_error_check(err); filename = gt_str_get(filenamestr); /* alloc */ line_buffer = gt_str_new(); splitter = gt_splitter_new(), attribute_splitter = gt_splitter_new(); #define HANDLE_ERROR \ if (had_err) { \ if (be_tolerant) { \ fprintf(stderr, "skipping line: %s\n", gt_error_get(err)); \ gt_error_unset(err); \ gt_str_reset(line_buffer); \ had_err = 0; \ continue; \ } \ else { \ had_err = -1; \ break; \ } \ } while (gt_str_read_next_line_generic(line_buffer, fpin) != EOF) { line = gt_str_get(line_buffer); line_length = gt_str_length(line_buffer); line_number++; gene_name = gene_id = transcript_id = transcript_name = NULL; had_err = 0; if (line_length == 0) { gt_warning("skipping blank line " GT_WU " in file \"%s\"", line_number, filename); } else if (line[0] == '#') { /* storing comment */ if (line_length >= 2 && line[1] == '#') gn = gt_comment_node_new(line+2); /* store '##' line as '#' line */ else gn = gt_comment_node_new(line+1); gt_genome_node_set_origin(gn, filenamestr, line_number); gt_queue_add(genome_nodes, gn); } else { bool stop_codon = false; char *tokendup, *attrkey; 
GtStrArray *attrkeys, *attrvals; /* process tab delimited GTF line */ gt_splitter_reset(splitter); gt_splitter_split(splitter, line, line_length, '\t'); if (gt_splitter_size(splitter) != 9UL) { gt_error_set(err, "line " GT_WU " in file \"%s\" contains " GT_WU " tab (\\t) " "separated fields instead of 9", line_number, filename, gt_splitter_size(splitter)); had_err = -1; break; } tokens = gt_splitter_get_tokens(splitter); seqname = tokens[0]; source = tokens[1]; feature = tokens[2]; start = tokens[3]; end = tokens[4]; score = tokens[5]; strand = tokens[6]; frame = tokens[7]; attributes = tokens[8]; /* parse feature */ if (GTF_feature_type_get(>f_feature_type, feature) == -1) { /* we skip unknown features */ fprintf(stderr, "skipping line " GT_WU " in file \"%s\": unknown " "feature: \"%s\"\n", line_number, filename, feature); gt_str_reset(line_buffer); continue; } /* translate into GFF3 feature type */ switch (gtf_feature_type) { case GTF_stop_codon: stop_codon = true; case GTF_CDS: gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker, gt_ft_CDS); type = gt_ft_CDS; break; case GTF_exon: gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker, gt_ft_exon); type = gt_ft_exon; break; case GTF_start_codon: /* we can skip the start codons, they are part of the CDS anyway */ gt_str_reset(line_buffer); continue; } gt_assert(gff_type_is_valid); /* parse the range */ had_err = gt_parse_range(&range, start, end, line_number, filename, err); HANDLE_ERROR; /* process seqname (we have to do it here because we need the range) */ gt_region_node_builder_add_region(parser->region_node_builder, seqname, range); /* parse the score */ had_err = gt_parse_score(&score_is_defined, &score_value, score, line_number, filename, err); HANDLE_ERROR; /* parse the strand */ had_err = gt_parse_strand(>_strand_value, strand, line_number, filename, err); HANDLE_ERROR; /* parse the frame */ had_err = gt_parse_phase(&phase_value, frame, line_number, filename, err); HANDLE_ERROR; 
/* parse the attributes */ attrkeys = gt_str_array_new(); attrvals = gt_str_array_new(); gt_splitter_reset(attribute_splitter); gene_id = NULL; transcript_id = NULL; gt_splitter_split(attribute_splitter, attributes, strlen(attributes), ';'); for (i = 0; i < gt_splitter_size(attribute_splitter); i++) { token = gt_splitter_get_token(attribute_splitter, i); /* skip leading blanks */ while (*token == ' ') token++; tokendup = gt_cstr_dup(token); attrkey = strtok(tokendup, " "); if (attrkey) { char *attrval = strtok(NULL, " "); if (attrval == NULL || strcmp(attrval, "") == 0 || strcmp(attrval, "\"\"") == 0) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU " in file \"%s\"", attrkey,line_number,filename); had_err = -1; } HANDLE_ERROR; if (*attrval == '"') attrval++; if (attrval[strlen(attrval)-1] == '"') attrval[strlen(attrval)-1] = '\0'; gt_assert(attrkey && strlen(attrkey) > 0); gt_assert(attrval && strlen(attrval) > 0); gt_str_array_add_cstr(attrkeys, attrkey); gt_str_array_add_cstr(attrvals, attrval); } gt_free(tokendup); /* look for the two mandatory attributes */ if (strncmp(token, GENE_ID_ATTRIBUTE, strlen(GENE_ID_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(GENE_ID_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", GENE_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; gene_id = token + strlen(GENE_ID_ATTRIBUTE) + 1; if (*gene_id == '"') gene_id++; if (gene_id[strlen(gene_id)-1] == '"') gene_id[strlen(gene_id)-1] = '\0'; } else if (strncmp(token, TRANSCRIPT_ID_ATTRIBUTE, strlen(TRANSCRIPT_ID_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(TRANSCRIPT_ID_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; transcript_id = token + strlen(TRANSCRIPT_ID_ATTRIBUTE) + 1; if (*transcript_id == '"') transcript_id++; if 
(transcript_id[strlen(transcript_id)-1] == '"') transcript_id[strlen(transcript_id)-1] = '\0'; } else if (strncmp(token, GENE_NAME_ATTRIBUTE, strlen(GENE_NAME_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(GENE_NAME_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", GENE_NAME_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; gene_name = token + strlen(GENE_NAME_ATTRIBUTE) + 1; /* for output we want to strip quotes */ if (*gene_name == '"') gene_name++; if (gene_name[strlen(gene_name)-1] == '"') gene_name[strlen(gene_name)-1] = '\0'; } else if (strncmp(token, TRANSCRIPT_NAME_ATTRIBUTE, strlen(TRANSCRIPT_NAME_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(TRANSCRIPT_NAME_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", TRANSCRIPT_NAME_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; transcript_name = token + strlen(TRANSCRIPT_NAME_ATTRIBUTE) + 1; /* for output we want to strip quotes */ if (*transcript_name == '"') transcript_name++; if (transcript_name[strlen(transcript_name)-1] == '"') transcript_name[strlen(transcript_name)-1] = '\0'; } } /* check for the mandatory attributes */ if (!gene_id) { gt_error_set(err, "missing attribute \"%s\" on line " GT_WU " in file \"%s\"", GENE_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; if (!transcript_id) { gt_error_set(err, "missing attribute \"%s\" on line " GT_WU " in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; /* process the mandatory attributes */ if (!(transcript_id_hash = gt_hashmap_get(parser->gene_id_hash, gene_id))) { transcript_id_hash = gt_hashmap_new(GT_HASH_STRING, gt_free_func, (GtFree) gt_array_delete); gt_hashmap_add(parser->gene_id_hash, gt_cstr_dup(gene_id), transcript_id_hash); } gt_assert(transcript_id_hash); if (!(gt_genome_node_array = gt_hashmap_get(transcript_id_hash, transcript_id))) { 
gt_genome_node_array = gt_array_new(sizeof (GtGenomeNode*)); gt_hashmap_add(transcript_id_hash, gt_cstr_dup(transcript_id), gt_genome_node_array); } gt_assert(gt_genome_node_array); /* save optional gene_name and transcript_name attributes */ if (transcript_name && strlen(transcript_name) > 0 && !gt_hashmap_get(parser->transcript_id_to_name_mapping, transcript_id)) { gt_hashmap_add(parser->transcript_id_to_name_mapping, gt_cstr_dup(transcript_id), gt_cstr_dup(transcript_name)); } if (gene_name && strlen(gene_name) > 0 && !gt_hashmap_get(parser->gene_id_to_name_mapping, gene_id)) { gt_hashmap_add(parser->gene_id_to_name_mapping, gt_cstr_dup(gene_id), gt_cstr_dup(gene_name)); } /* get seqid */ seqid_str = gt_hashmap_get(parser->seqid_to_str_mapping, seqname); if (!seqid_str) { seqid_str = gt_str_new_cstr(seqname); gt_hashmap_add(parser->seqid_to_str_mapping, gt_str_get(seqid_str), seqid_str); } gt_assert(seqid_str); /* construct the new feature */ gn = gt_feature_node_new(seqid_str, type, range.start, range.end, gt_strand_value); gt_genome_node_set_origin(gn, filenamestr, line_number); if (stop_codon) { gt_feature_node_add_attribute((GtFeatureNode*) gn, GTF_PARSER_STOP_CODON_FLAG, "true"); } for (i = 0; i < gt_str_array_size(attrkeys); i++) { GtFeatureNode *fn = (GtFeatureNode *)gn; const char *key = gt_str_array_get(attrkeys, i); const char *val = gt_str_array_get(attrvals, i); /* Not a comprehensive solution to ensure correct encoding, just bare minimum required to get Cufflinks output parsed */ if (strcmp(val, "=") == 0) val = "%26"; if (gt_feature_node_get_attribute(fn, key) != NULL) { const char *oldval = gt_feature_node_get_attribute(fn, key); GtStr *newval = gt_str_new_cstr(oldval); gt_str_append_char(newval, ','); gt_str_append_cstr(newval, val); gt_feature_node_set_attribute(fn, key, gt_str_get(newval)); gt_str_delete(newval); } else gt_feature_node_add_attribute(fn, key, val); } gt_str_array_delete(attrkeys); gt_str_array_delete(attrvals); /* set source */ 
source_str = gt_hashmap_get(parser->source_to_str_mapping, source); if (!source_str) { source_str = gt_str_new_cstr(source); gt_hashmap_add(parser->source_to_str_mapping, gt_str_get(source_str), source_str); } gt_assert(source_str); gt_feature_node_set_source((GtFeatureNode*) gn, source_str); if (score_is_defined) gt_feature_node_set_score((GtFeatureNode*) gn, score_value); if (phase_value != GT_PHASE_UNDEFINED) gt_feature_node_set_phase((GtFeatureNode*) gn, phase_value); gt_array_add(gt_genome_node_array, gn); } gt_str_reset(line_buffer); } /* process all region nodes */ if (!had_err) gt_region_node_builder_build(parser->region_node_builder, genome_nodes); /* process all feature nodes */ cinfo.genome_nodes = genome_nodes; cinfo.tidy = be_tolerant; cinfo.gene_id_to_name_mapping = parser->gene_id_to_name_mapping; cinfo.transcript_id_to_name_mapping = parser->transcript_id_to_name_mapping; if (!had_err) { had_err = gt_hashmap_foreach(parser->gene_id_hash, construct_genes, &cinfo, err); } gt_hashmap_foreach(parser->gene_id_hash, delete_genes, NULL, err); /* free */ gt_splitter_delete(splitter); gt_splitter_delete(attribute_splitter); gt_str_delete(line_buffer); return had_err; }
/*
 * Refill the output buffer of <sb> from the FASTA input file(s).
 *
 * Characters are pulled one at a time via inlinebuf_getchar() until either
 * OUTBUFSIZE output positions have been produced or the last file listed in
 * pvt->filenametab hits EOF (in which case pvt->complete is set). Header
 * lines (introduced by FASTASEPARATOR) are forwarded to pvt->descptr if one
 * is present; whitespace in sequence lines is dropped; every other character
 * goes through process_char(). A SEPARATOR is written in front of every
 * sequence except the first one overall.
 *
 * Per-file counters are accumulated into pvt->filelengthtab (if not NULL).
 * On success pvt->nextfree is set to the number of output positions written.
 *
 * Returns 0 on success, the non-zero result of process_char() if it fails,
 * or -2 (with <err> set) if no FASTA header has been seen at all.
 */
static int gt_sequence_buffer_fasta_advance(GtSequenceBuffer *sb, GtError *err)
{
  int ch, rval = 0;
  GtUword outpos = 0,      /* next free position in pvt->outbuf */
          chars_added = 0, /* contributes to effectivelength of current file */
          chars_read = 0;  /* contributes to length of current file */
  GtSequenceBufferMembers *pvt;
  GtSequenceBufferFasta *sbf;

  gt_error_check(err);

  sbf = (GtSequenceBufferFasta*) sb;
  pvt = sb->pvt;

  for (;;) {
    /* output buffer exhausted: book the counters and hand back control */
    if (outpos >= (GtUword) OUTBUFSIZE) {
      if (pvt->filelengthtab != NULL) {
        pvt->filelengthtab[pvt->filenum].length += (uint64_t) chars_read;
        pvt->filelengthtab[pvt->filenum].effectivelength
          += (uint64_t) chars_added;
      }
      break;
    }

    /* a new input file has to be opened before anything can be read */
    if (sbf->nextfile) {
      if (pvt->filelengthtab != NULL) {
        pvt->filelengthtab[pvt->filenum].length = 0;
        pvt->filelengthtab[pvt->filenum].effectivelength = 0;
      }
      sbf->nextfile = false;
      sbf->indesc = false;
      sbf->firstseqinfile = true;
      chars_added = 0;
      chars_read = 0;
      pvt->linenum = (uint64_t) 1;
      pvt->inputstream = gt_file_xopen(gt_str_array_get(pvt->filenametab,
                                                  (GtUword) pvt->filenum),
                                       "rb");
      pvt->currentinpos = 0;
      pvt->currentfillpos = 0;
      continue;
    }

    ch = inlinebuf_getchar(sb, pvt->inputstream);

    /* end of the current file: close it and either finish or move on */
    if (ch == EOF) {
      gt_file_delete(pvt->inputstream);
      pvt->inputstream = NULL;
      if (pvt->filelengthtab != NULL) {
        pvt->filelengthtab[pvt->filenum].length += chars_read;
        pvt->filelengthtab[pvt->filenum].effectivelength += chars_added;
      }
      if ((GtUword) pvt->filenum == gt_str_array_size(pvt->filenametab)-1) {
        pvt->complete = true;
        break;
      }
      pvt->filenum++;
      sbf->nextfile = true;
      continue;
    }

    chars_read++;

    /* inside a header line: feed the description buffer, if any */
    if (sbf->indesc) {
      const bool end_of_header = (ch == NEWLINESYMBOL);
      if (end_of_header) {
        pvt->linenum++;
        sbf->indesc = false;
      }
      if (pvt->descptr != NULL) {
        if (end_of_header)
          gt_desc_buffer_finish(pvt->descptr);
        else if (ch != CRSYMBOL)
          gt_desc_buffer_append_char(pvt->descptr, ch);
      }
      continue;
    }

    /* whitespace inside sequence data is ignored */
    if (isspace((int) ch))
      continue;

    /* ordinary sequence character */
    if (ch != FASTASEPARATOR) {
      rval = process_char(sb, outpos, (unsigned char) ch, err);
      if (rval)
        return rval;
      outpos++;
      chars_added++;
      continue;
    }

    /* start of a new header line */
    if (sbf->firstoverallseq) {
      /* very first sequence: nothing to separate it from */
      sbf->firstoverallseq = false;
      sbf->firstseqinfile = false;
    }
    else {
      /* the separator between two files is not counted for either file */
      if (sbf->firstseqinfile)
        sbf->firstseqinfile = false;
      else
        chars_added++;
      pvt->outbuf[outpos++] = (unsigned char) SEPARATOR;
      pvt->lastspeciallength++;
    }
    sbf->indesc = true;
  }

  if (sbf->firstoverallseq) {
    gt_error_set(err,"no sequences in multiple fasta file(s) %s ...",
                 gt_str_array_get(pvt->filenametab,0));
    return -2;
  }
  pvt->nextfree = outpos;
  return 0;
}
/*
 * Deliver the next genome node of the plain GFF3 input stream <ns> in <gn>.
 *
 * If more than one node is still buffered, the front of the buffer is
 * returned right away. Otherwise the current input (one of is->files, or
 * standard input if no files were given) is opened on demand and the GFF3
 * parser is invoked up to twice to refill the buffer. When the parser
 * reports EOF the input is closed, the parser is reset, and the next file
 * (if any) is tried.
 *
 * Returns 0 and sets *gn to the next node, or to NULL once all input has
 * been consumed. On a parse error, or if "-" is given more than once as a
 * file argument, a non-zero code is returned and *gn is set to NULL. If the
 * optional sortedness check of the buffer fails, its error code is returned
 * and *gn is left untouched.
 */
static int gff3_in_stream_plain_next(GtNodeStream *ns, GtGenomeNode **gn,
                                     GtError *err)
{
  GtGFF3InStreamPlain *is = gff3_in_stream_plain_cast(ns);
  GtStr *current_name;
  bool has_file_args;
  int had_err = 0, parse_status;
  gt_error_check(err);

  /* at least two nodes are still buffered -> serve from the buffer */
  if (gt_queue_size(is->genome_node_buffer) > 1) {
    *gn = gt_queue_get(is->genome_node_buffer);
    return 0;
  }

  /* the buffer is empty or has one element */
  gt_assert(gt_queue_size(is->genome_node_buffer) <= 1);

  /* without file arguments the stream reads from standard input */
  has_file_args = gt_str_array_size(is->files) != 0;

  for (;;) {
    /* open the next input if none is open at the moment */
    if (!is->file_is_open) {
      if (has_file_args) {
        if (is->next_file == gt_str_array_size(is->files))
          break; /* every file has been processed */
        if (strcmp(gt_str_array_get(is->files, is->next_file), "-") != 0) {
          is->fpin = gt_file_xopen(gt_str_array_get(is->files,
                                                    is->next_file), "r");
        }
        else {
          /* "-" may appear only once among the file arguments */
          if (is->stdin_argument) {
            gt_error_set(err, "multiple specification of argument file \"-\"");
            had_err = -1;
            break;
          }
          is->fpin = gt_file_xopen(NULL, "r");
          is->stdin_argument = true;
        }
        is->file_is_open = true;
        is->next_file++;
      }
      else {
        if (is->stdin_processed)
          break;
        is->fpin = NULL;
        is->file_is_open = true;
      }
      is->line_number = 0;

      if (!had_err && is->progress_bar) {
        printf("processing file \"%s\"\n",
               has_file_args ? gt_str_array_get(is->files, is->next_file-1)
                             : "stdin");
        /* the progress bar is only started when is->fpin is set */
        if (is->fpin) {
          gt_progressbar_start(&is->line_number,
                               gt_file_number_of_lines(
                                 gt_str_array_get(is->files,
                                                  is->next_file-1)));
        }
      }
    }

    gt_assert(is->file_is_open);

    if (has_file_args)
      current_name = gt_str_array_get_str(is->files, is->next_file-1);
    else
      current_name = is->stdinstr;

    /* read two nodes: the second parser call is skipped after EOF/error */
    had_err = gt_gff3_parser_parse_genome_nodes(is->gff3_parser,
                                                &parse_status,
                                                is->genome_node_buffer,
                                                is->used_types, current_name,
                                                &is->line_number, is->fpin,
                                                err);
    if (!had_err && parse_status != EOF) {
      had_err = gt_gff3_parser_parse_genome_nodes(is->gff3_parser,
                                                  &parse_status,
                                                  is->genome_node_buffer,
                                                  is->used_types, current_name,
                                                  &is->line_number, is->fpin,
                                                  err);
    }
    if (had_err)
      break;

    if (parse_status == EOF) {
      /* end of current file */
      if (is->progress_bar)
        gt_progressbar_stop();
      gt_file_delete(is->fpin);
      is->fpin = NULL;
      is->file_is_open = false;
      gt_gff3_parser_reset(is->gff3_parser);
      if (has_file_args)
        continue; /* try the next file */
      is->stdin_processed = true;
      break;
    }

    gt_assert(gt_queue_size(is->genome_node_buffer));

    /* make sure the parsed nodes are sorted */
    if (is->ensure_sorting && gt_queue_size(is->genome_node_buffer) > 1) {
      GtGenomeNode *last_node = NULL;
      /* a sorted stream can have at most one input file */
      gt_assert(gt_str_array_size(is->files) == 0 ||
                gt_str_array_size(is->files) == 1);
      had_err = gt_queue_iterate(is->genome_node_buffer, buffer_is_sorted,
                                 &last_node, err);
    }
    if (!had_err)
      *gn = gt_queue_get(is->genome_node_buffer);
    return had_err;
  }

  /* no more input, or an error occurred */
  gt_assert(!gt_queue_size(is->genome_node_buffer));
  *gn = NULL;
  return had_err;
}