/* Attribute callback: appends <attr_name> to the GtStrArray handed in via
   <data>. The attribute value is only sanity-checked, never stored. */
static void store_attribute(const char *attr_name,
                            GT_UNUSED const char *attr_value, void *data)
{
  GtStrArray *names;

  gt_assert(attr_name && attr_value && data);
  names = data;
  gt_str_array_add_cstr(names, attr_name);
}
GtNodeStream* gt_gff3_in_stream_plain_new_sorted(const char *filename) { GtStrArray *files = gt_str_array_new(); if (filename) gt_str_array_add_cstr(files, filename); return gff3_in_stream_plain_new(files, true); }
/* Sequence-part callback: stores <seqpart> in the GtStrArray passed via
   <data>. <length> is ignored; always returns 0. */
static int save_fastaentry(const char *seqpart, GT_UNUSED GtUword length,
                           void *data, GT_UNUSED GtError* err)
{
  GtStrArray *collected;

  gt_error_check(err);
  collected = (GtStrArray*) data;
  gt_str_array_add_cstr(collected, seqpart);
  return 0;
}
/* Creates a plain GFF3 input stream over the first <num_of_files> entries of
   <filenames>. The collected list is handed to gff3_in_stream_plain_new()
   together with the flag <false>. */
GtNodeStream* gt_gff3_in_stream_plain_new_unsorted(int num_of_files,
                                                   const char **filenames)
{
  GtStrArray *input_files = gt_str_array_new();
  int idx = 0;

  while (idx < num_of_files) {
    gt_str_array_add_cstr(input_files, filenames[idx]);
    idx++;
  }
  return gff3_in_stream_plain_new(input_files, false);
}
/* Tool runner: collects the input files from argv[parsed_args..argc-1],
   determines the index name (the basename of the single input file if
   -indexname was not given), encodes the sequence files and optionally shows
   statistics. Returns 0 on success and a non-zero value on error. */
static int gt_encseq_encode_runner(GT_UNUSED int argc, const char **argv,
                                   int parsed_args,
                                   GT_UNUSED void *tool_arguments,
                                   GtError *err)
{
  GtEncseqEncodeArguments *arguments =
                                    (GtEncseqEncodeArguments*) tool_arguments;
  GtStrArray *infiles;
  int had_err = 0, argidx;

  gt_error_check(err);

  infiles = gt_str_array_new();
  for (argidx = parsed_args; argidx < argc; argidx++)
    gt_str_array_add_cstr(infiles, argv[argidx]);

  if (gt_str_length(arguments->indexname) == 0UL) {
    if (gt_str_array_size(infiles) <= 1UL) {
      /* no explicit index name: derive it from the (only) input file */
      char *derived_name = gt_basename(gt_str_array_get(infiles, 0UL));
      gt_str_set(arguments->indexname, derived_name);
      gt_free(derived_name);
    }
    else {
      gt_error_set(err,"if more than one input file is given, then "
                       "option -indexname is mandatory");
      had_err = -1;
    }
  }

  if (!had_err) {
    gt_assert(gt_str_length(arguments->indexname) > 0UL);
    had_err = encode_sequence_files(infiles,
                                    arguments->eopts,
                                    gt_str_get(arguments->indexname),
                                    arguments->verbose,
                                    arguments->no_esq_header,
                                    err);
    if (!had_err && arguments->showstats)
      show_encoded_statistics(infiles, gt_str_get(arguments->indexname));
  }

  gt_str_array_delete(infiles);
  return had_err;
}
/* Appends every value of the Lua table at stack position <index> to
   <outarray>. All values must pass lua_isstring(); otherwise <err> is set
   and -1 is returned. Returns 0 on success.
   On the error path the offending key/value pair is left on the Lua stack,
   exactly as before. */
int gt_lua_get_table_as_strarray(lua_State *L, int index,
                                 GtStrArray *outarray, GtError *err)
{
  int had_err = 0;

  gt_assert(lua_istable(L, index));
  lua_pushnil(L); /* nil key makes lua_next() start at the first entry */
  while (!had_err && (lua_next(L, index) != 0)) {
    if (!lua_isstring(L, -1)) {
      had_err = -1;
      gt_error_set(err, "table contains non-string value!");
      break;
    }
    gt_str_array_add_cstr(outarray, lua_tostring(L, -1));
    lua_pop(L, 1); /* drop the value, keep the key for the next lua_next() */
  }
  /* report the failure to the caller instead of unconditionally claiming
     success */
  return had_err;
}
/* Lua binding: takes a sequence file name (argument 1), creates a region
   mapping for it and pushes it as userdata tagged with
   REGION_MAPPING_METATABLE. Returns the number of Lua results (1). */
static int region_mapping_lua_new_seqfile(lua_State *L)
{
  GtRegionMapping **rm_slot;
  GtStrArray *seqfiles;
  const char *path;

  gt_assert(L);
  path = luaL_checkstring(L, 1);

  /* the userdata only holds a pointer to the region mapping */
  rm_slot = lua_newuserdata(L, sizeof (GtRegionMapping*));
  gt_assert(rm_slot);

  seqfiles = gt_str_array_new();
  gt_str_array_add_cstr(seqfiles, path);
  /* XXX: make second and third parameter available */
  *rm_slot = gt_region_mapping_new_seqfiles(seqfiles, false, false);
  gt_str_array_delete(seqfiles);

  luaL_getmetatable(L, REGION_MAPPING_METATABLE);
  lua_setmetatable(L, -2);
  return 1;
}
/* Extracts the sequences whose keys are listed in <fastakeyfile>.
   If exactly one argument is given and a description-keys file exists for
   it, the keys are looked up in that index; otherwise all <argc> arguments
   are treated as FASTA reference files. Returns 0 on success, -1 on error. */
static int process_fastakeyfile(GtStr *fastakeyfile, int argc,
                                const char **argv, unsigned long width,
                                GtFile *outfp, GtError *err)
{
  GtStr *indexname;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(gt_str_length(fastakeyfile));

  if (argc == 0) {
    gt_error_set(err,"option -keys requires at least one file argument");
    return -1;
  }

  indexname = gt_str_new_cstr(argv[0]);
  if (argc != 1 || !gt_deskeysfileexists(indexname)) {
    /* no usable index: scan the given FASTA files directly */
    GtStrArray *referencefiletab = gt_str_array_new();
    int argidx;

    for (argidx = 0; argidx < argc; argidx++)
      gt_str_array_add_cstr(referencefiletab, argv[argidx]);
    if (gt_extractkeysfromfastafile(true, outfp, width, fastakeyfile,
                                    referencefiletab, err) != 1)
      had_err = -1;
    gt_str_array_delete(referencefiletab);
  }
  else {
    if (gt_extractkeysfromfastaindex(indexname, fastakeyfile, width, err)
        != 0)
      had_err = -1;
  }
  gt_str_delete(indexname);
  return had_err;
}
/* Parses the command line of the file statistics tool. Registers the
   spliced alignment filter options and -v, and on success stores all
   remaining arguments as consensus files (advancing <*parsed_args> up to
   <argc>). Returns the option parser result. */
static GtOPrval gthfilestat_parse_options(int *parsed_args,
                                          GthFileStatInfo *file_stat_info,
                                          int argc, const char **argv,
                                          const GthPlugins *plugins,
                                          GtError *err)
{
  GtOptionParser *parser;
  GtOption *verbose_opt;
  GtOPrval status;
  bool verbose;

  gt_error_check(err);

  parser = gt_option_parser_new("[option ...] [file ...]",
                                "Show statistics about "
                                "spliced alignments in GenomeThreader output "
                                "files\n"
                                "containing intermediate results.");

  /* add sa_filter options */
  gth_sa_filter_register_options(parser, file_stat_info->sa_filter, false);

  /* -v */
  verbose_opt = gt_option_new_verbose(&verbose);
  gt_option_parser_add_option(parser, verbose_opt);

  gt_option_parser_set_mail_address(parser, "<*****@*****.**>");
  status = gt_option_parser_parse(parser, parsed_args, argc, argv,
                                  plugins->gth_version_func, err);

  if (verbose)
    file_stat_info->showverbose = gth_show_on_stdout;

  /* save consensus files */
  if (status == GT_OPTION_PARSER_OK) {
    for (; *parsed_args < argc; (*parsed_args)++) {
      gt_str_array_add_cstr(file_stat_info->consensusfiles,
                            argv[*parsed_args]);
    }
  }

  gt_option_parser_delete(parser);
  return status;
}
/* 'static' function */ GtStrArray* gt_trans_table_get_scheme_descriptions() { GtUword i; GtTranslationScheme *scheme; GtStr *str; GtStrArray *sa = gt_str_array_new(); str = gt_str_new(); for (i = 1UL; i < (GtUword) GT_SIZEOFTRANSRANGE; i++) { if (transnum2index[i] == GT_UNDEFTRANSNUM) continue; scheme = schemetable + transnum2index[i]; gt_str_reset(str); gt_str_append_uint(str, scheme->identity); gt_str_append_cstr(str, ": "); gt_str_append_cstr(str, scheme->name); gt_str_array_add_cstr(sa, gt_str_get(str)); } gt_str_delete(str); return sa; }
/* Registers <block> under feature type <gft> and representative <rep> in
   <ni>. The per-type bookkeeping (representative index and block tuple list)
   is created lazily the first time a type is seen; the type name is then
   also appended to ni->types. */
static void nodeinfo_add_block(NodeInfoElement *ni, const char *gft,
                               GtFeatureNode *rep, GtBlock *block)
{
  PerTypeInfo *per_type;
  GtBlockTuple *tuple;

  gt_assert(ni);
  tuple = blocktuple_new(gft, rep, block);

  if (ni->type_index == NULL) {
    ni->type_index = gt_hashmap_new(GT_HASH_STRING, NULL, gt_free_func);
  }

  per_type = gt_hashmap_get(ni->type_index, gft);
  if (per_type == NULL) {
    /* first block of this type */
    per_type = gt_calloc(1, sizeof (PerTypeInfo));
    per_type->rep_index = gt_hashmap_new(GT_HASH_DIRECT, NULL, NULL);
    per_type->blocktuples = gt_array_new(sizeof (GtBlockTuple*));
    gt_hashmap_add(ni->type_index, (char*) gft, per_type);
    gt_str_array_add_cstr(ni->types, gft);
  }

  gt_hashmap_add(per_type->rep_index, rep, tuple);
  if (rep != GT_UNDEF_REPR) {
    per_type->must_merge = true;
  }
  gt_array_add(per_type->blocktuples, tuple);
}
/* Unit test for GtTranslator. In this order it exercises: 3-frame translation of a valid sequence and comparison against the expected proteins; start codon, stop codon and arbitrary codon search (positive cases); codon search with an invalid codon list; translation of sequences with an invalid character at the beginning and in the middle; and start/stop/arbitrary codon searches that are expected to hit GT_TRANSLATOR_END. A private error object <test_err> is used for the calls under test; the checks are made with gt_ensure() (defined elsewhere, presumably recording failures in <had_err>/<err>). Returns <had_err>. */
int gt_translator_unit_test(GtError *err) { int had_err = 0; GtTranslatorStatus test_errnum; GtTranslator *tr; GtCodonIterator *ci; GtError *test_err; GtStrArray *codons, *invalidcodons; const char *seq = "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGT" "GGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACTGGT" "TACCTGCCGTGAGTAAATTAAAATTTTATTGACTTAGG"; const char *no_startcodon = "AAAAAAAAAATCATCTCCCCATTTTTTT"; const char *invalidseq = "ZAGCTTTTCATTCTGACTGCAAATATGTCTCTGTGT"; const char *invalidseq2 = "AGCTTTTCATTCTGACZTGCAAATATGTCTCTGTGT"; char translated; unsigned int frame; GtUword pos = 0; GtStr *protein[3]; gt_error_check(err); test_err = gt_error_new(); ci = gt_codon_iterator_simple_new(seq, (GtUword) strlen(seq), test_err); tr = gt_translator_new(ci); protein[0] = gt_str_new(); protein[1] = gt_str_new(); protein[2] = gt_str_new(); codons = gt_str_array_new(); gt_str_array_add_cstr(codons, "ACG"); gt_str_array_add_cstr(codons, "ACT"); invalidcodons = gt_str_array_new(); gt_str_array_add_cstr(invalidcodons, "ACG"); gt_str_array_add_cstr(invalidcodons, "AC"); /* do 3-frame translation */ gt_error_unset(test_err); test_errnum = gt_translator_next(tr, &translated, &frame, test_err); while (!test_errnum && translated) { gt_str_append_char(protein[frame], translated); test_errnum = gt_translator_next(tr, &translated, &frame, test_err); gt_ensure( test_errnum != GT_TRANSLATOR_ERROR && !gt_error_is_set(test_err)); } gt_ensure( test_errnum == GT_TRANSLATOR_END && !gt_error_is_set(test_err)); /* check 3-frame translation */ gt_ensure(strcmp(gt_str_get(protein[0]), "SFSF*LQRAICLCVD*KKSV**QLLNWLPAVSKLKFY*LR") == 0); gt_ensure(strcmp(gt_str_get(protein[1]), "AFHSDCNGQYVSVWIKKRVSDSSF*TGYLP*VN*NFIDL") == 0); gt_ensure(strcmp(gt_str_get(protein[2]), "LFILTATGNMSLCGLKKECLIAASELVTCRE*IKILLT*") == 0); /* find start codon -- positive */ gt_error_unset(test_err); gt_codon_iterator_rewind(ci); test_errnum = gt_translator_find_startcodon(tr, &pos, test_err); gt_ensure(!test_errnum && 
!gt_error_is_set(test_err)); gt_ensure(pos == 11UL); /* find stop codon -- positive */ gt_error_unset(test_err); gt_codon_iterator_rewind(ci); test_errnum = gt_translator_find_stopcodon(tr, &pos, test_err); gt_ensure(!test_errnum && !gt_error_is_set(test_err)); gt_ensure(pos == 12UL); /* find arbitrary codons -- positive */ gt_error_unset(test_err); gt_codon_iterator_rewind(ci); test_errnum = gt_translator_find_codon(tr, codons, &pos, test_err); gt_ensure(!test_errnum && !gt_error_is_set(test_err)); gt_ensure(pos == 14UL); /* find arbitrary codons -- negative (invalid codons) */ gt_error_unset(test_err); gt_codon_iterator_rewind(ci); test_errnum = gt_translator_find_codon(tr, invalidcodons, &pos, test_err); gt_ensure( test_errnum == GT_TRANSLATOR_ERROR && gt_error_is_set(test_err)); gt_error_unset(test_err); gt_codon_iterator_delete(ci); ci = gt_codon_iterator_simple_new(invalidseq, (GtUword) strlen(invalidseq), test_err); gt_ensure(ci && !gt_error_is_set(test_err)); gt_translator_reset(tr, ci); /* check translation of sequence with invalid beginning */ test_errnum = gt_translator_next(tr, &translated, &frame, test_err); gt_ensure(test_errnum && gt_error_is_set(test_err)); /* check translation of sequence with invalid character within */ gt_error_unset(test_err); gt_codon_iterator_delete(ci); ci = gt_codon_iterator_simple_new(invalidseq2, (GtUword) strlen(invalidseq2), test_err); gt_ensure(ci && !gt_error_is_set(test_err)); gt_translator_reset(tr, ci); test_errnum = gt_translator_next(tr, &translated, &frame, test_err); while (!test_errnum && translated) { gt_str_append_char(protein[frame], translated); test_errnum = gt_translator_next(tr, &translated, &frame, test_err); } gt_ensure( test_errnum == GT_TRANSLATOR_ERROR && gt_error_is_set(test_err)); /* find start codon -- fail */ gt_error_unset(test_err); gt_codon_iterator_delete(ci); ci = gt_codon_iterator_simple_new(no_startcodon, (GtUword) strlen(no_startcodon), test_err); gt_ensure(ci && 
!gt_error_is_set(test_err)); gt_translator_reset(tr, ci); test_errnum = gt_translator_find_startcodon(tr, &pos, test_err); gt_ensure( test_errnum == GT_TRANSLATOR_END && !gt_error_is_set(test_err)); /* find stop codon -- fail */ gt_error_unset(test_err); gt_codon_iterator_rewind(ci); test_errnum = gt_translator_find_stopcodon(tr, &pos, test_err); gt_ensure( test_errnum == GT_TRANSLATOR_END && !gt_error_is_set(test_err)); /* find arbitrary codons -- negative (none there) */ gt_error_unset(test_err); gt_codon_iterator_rewind(ci); test_errnum = gt_translator_find_codon(tr, codons, &pos, test_err); gt_ensure( test_errnum == GT_TRANSLATOR_END && !gt_error_is_set(test_err)); gt_codon_iterator_delete(ci); gt_translator_delete(tr); gt_str_delete(protein[0]); gt_str_delete(protein[1]); gt_str_delete(protein[2]); gt_str_array_delete(codons); gt_str_array_delete(invalidcodons); gt_error_delete(test_err); return had_err; }
int gt_gtf_parser_parse(GtGTFParser *parser, GtQueue *genome_nodes, GtStr *filenamestr, GtFile *fpin, bool be_tolerant, GtError *err) { GtStr *seqid_str, *source_str, *line_buffer; char *line; size_t line_length; GtUword i, line_number = 0; GtGenomeNode *gn; GtRange range; GtPhase phase_value; GtStrand gt_strand_value; GtSplitter *splitter, *attribute_splitter; float score_value; char *seqname, *source, *feature, *start, *end, *score, *strand, *frame, *attributes, *token, *gene_id, *gene_name = NULL, *transcript_id, *transcript_name = NULL, **tokens; GtHashmap *transcript_id_hash; /* map from transcript id to array of genome nodes */ GtArray *gt_genome_node_array; ConstructionInfo cinfo; GTF_feature_type gtf_feature_type; GT_UNUSED bool gff_type_is_valid = false; const char *type = NULL; const char *filename; bool score_is_defined; int had_err = 0; gt_assert(parser && genome_nodes); gt_error_check(err); filename = gt_str_get(filenamestr); /* alloc */ line_buffer = gt_str_new(); splitter = gt_splitter_new(), attribute_splitter = gt_splitter_new(); #define HANDLE_ERROR \ if (had_err) { \ if (be_tolerant) { \ fprintf(stderr, "skipping line: %s\n", gt_error_get(err)); \ gt_error_unset(err); \ gt_str_reset(line_buffer); \ had_err = 0; \ continue; \ } \ else { \ had_err = -1; \ break; \ } \ } while (gt_str_read_next_line_generic(line_buffer, fpin) != EOF) { line = gt_str_get(line_buffer); line_length = gt_str_length(line_buffer); line_number++; gene_name = gene_id = transcript_id = transcript_name = NULL; had_err = 0; if (line_length == 0) { gt_warning("skipping blank line " GT_WU " in file \"%s\"", line_number, filename); } else if (line[0] == '#') { /* storing comment */ if (line_length >= 2 && line[1] == '#') gn = gt_comment_node_new(line+2); /* store '##' line as '#' line */ else gn = gt_comment_node_new(line+1); gt_genome_node_set_origin(gn, filenamestr, line_number); gt_queue_add(genome_nodes, gn); } else { bool stop_codon = false; char *tokendup, *attrkey; 
GtStrArray *attrkeys, *attrvals; /* process tab delimited GTF line */ gt_splitter_reset(splitter); gt_splitter_split(splitter, line, line_length, '\t'); if (gt_splitter_size(splitter) != 9UL) { gt_error_set(err, "line " GT_WU " in file \"%s\" contains " GT_WU " tab (\\t) " "separated fields instead of 9", line_number, filename, gt_splitter_size(splitter)); had_err = -1; break; } tokens = gt_splitter_get_tokens(splitter); seqname = tokens[0]; source = tokens[1]; feature = tokens[2]; start = tokens[3]; end = tokens[4]; score = tokens[5]; strand = tokens[6]; frame = tokens[7]; attributes = tokens[8]; /* parse feature */ if (GTF_feature_type_get(>f_feature_type, feature) == -1) { /* we skip unknown features */ fprintf(stderr, "skipping line " GT_WU " in file \"%s\": unknown " "feature: \"%s\"\n", line_number, filename, feature); gt_str_reset(line_buffer); continue; } /* translate into GFF3 feature type */ switch (gtf_feature_type) { case GTF_stop_codon: stop_codon = true; case GTF_CDS: gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker, gt_ft_CDS); type = gt_ft_CDS; break; case GTF_exon: gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker, gt_ft_exon); type = gt_ft_exon; break; case GTF_start_codon: /* we can skip the start codons, they are part of the CDS anyway */ gt_str_reset(line_buffer); continue; } gt_assert(gff_type_is_valid); /* parse the range */ had_err = gt_parse_range(&range, start, end, line_number, filename, err); HANDLE_ERROR; /* process seqname (we have to do it here because we need the range) */ gt_region_node_builder_add_region(parser->region_node_builder, seqname, range); /* parse the score */ had_err = gt_parse_score(&score_is_defined, &score_value, score, line_number, filename, err); HANDLE_ERROR; /* parse the strand */ had_err = gt_parse_strand(>_strand_value, strand, line_number, filename, err); HANDLE_ERROR; /* parse the frame */ had_err = gt_parse_phase(&phase_value, frame, line_number, filename, err); HANDLE_ERROR; 
/* parse the attributes */ attrkeys = gt_str_array_new(); attrvals = gt_str_array_new(); gt_splitter_reset(attribute_splitter); gene_id = NULL; transcript_id = NULL; gt_splitter_split(attribute_splitter, attributes, strlen(attributes), ';'); for (i = 0; i < gt_splitter_size(attribute_splitter); i++) { token = gt_splitter_get_token(attribute_splitter, i); /* skip leading blanks */ while (*token == ' ') token++; tokendup = gt_cstr_dup(token); attrkey = strtok(tokendup, " "); if (attrkey) { char *attrval = strtok(NULL, " "); if (attrval == NULL || strcmp(attrval, "") == 0 || strcmp(attrval, "\"\"") == 0) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU " in file \"%s\"", attrkey,line_number,filename); had_err = -1; } HANDLE_ERROR; if (*attrval == '"') attrval++; if (attrval[strlen(attrval)-1] == '"') attrval[strlen(attrval)-1] = '\0'; gt_assert(attrkey && strlen(attrkey) > 0); gt_assert(attrval && strlen(attrval) > 0); gt_str_array_add_cstr(attrkeys, attrkey); gt_str_array_add_cstr(attrvals, attrval); } gt_free(tokendup); /* look for the two mandatory attributes */ if (strncmp(token, GENE_ID_ATTRIBUTE, strlen(GENE_ID_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(GENE_ID_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", GENE_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; gene_id = token + strlen(GENE_ID_ATTRIBUTE) + 1; if (*gene_id == '"') gene_id++; if (gene_id[strlen(gene_id)-1] == '"') gene_id[strlen(gene_id)-1] = '\0'; } else if (strncmp(token, TRANSCRIPT_ID_ATTRIBUTE, strlen(TRANSCRIPT_ID_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(TRANSCRIPT_ID_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; transcript_id = token + strlen(TRANSCRIPT_ID_ATTRIBUTE) + 1; if (*transcript_id == '"') transcript_id++; if 
(transcript_id[strlen(transcript_id)-1] == '"') transcript_id[strlen(transcript_id)-1] = '\0'; } else if (strncmp(token, GENE_NAME_ATTRIBUTE, strlen(GENE_NAME_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(GENE_NAME_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", GENE_NAME_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; gene_name = token + strlen(GENE_NAME_ATTRIBUTE) + 1; /* for output we want to strip quotes */ if (*gene_name == '"') gene_name++; if (gene_name[strlen(gene_name)-1] == '"') gene_name[strlen(gene_name)-1] = '\0'; } else if (strncmp(token, TRANSCRIPT_NAME_ATTRIBUTE, strlen(TRANSCRIPT_NAME_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(TRANSCRIPT_NAME_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", TRANSCRIPT_NAME_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; transcript_name = token + strlen(TRANSCRIPT_NAME_ATTRIBUTE) + 1; /* for output we want to strip quotes */ if (*transcript_name == '"') transcript_name++; if (transcript_name[strlen(transcript_name)-1] == '"') transcript_name[strlen(transcript_name)-1] = '\0'; } } /* check for the mandatory attributes */ if (!gene_id) { gt_error_set(err, "missing attribute \"%s\" on line " GT_WU " in file \"%s\"", GENE_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; if (!transcript_id) { gt_error_set(err, "missing attribute \"%s\" on line " GT_WU " in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; /* process the mandatory attributes */ if (!(transcript_id_hash = gt_hashmap_get(parser->gene_id_hash, gene_id))) { transcript_id_hash = gt_hashmap_new(GT_HASH_STRING, gt_free_func, (GtFree) gt_array_delete); gt_hashmap_add(parser->gene_id_hash, gt_cstr_dup(gene_id), transcript_id_hash); } gt_assert(transcript_id_hash); if (!(gt_genome_node_array = gt_hashmap_get(transcript_id_hash, transcript_id))) { 
gt_genome_node_array = gt_array_new(sizeof (GtGenomeNode*)); gt_hashmap_add(transcript_id_hash, gt_cstr_dup(transcript_id), gt_genome_node_array); } gt_assert(gt_genome_node_array); /* save optional gene_name and transcript_name attributes */ if (transcript_name && strlen(transcript_name) > 0 && !gt_hashmap_get(parser->transcript_id_to_name_mapping, transcript_id)) { gt_hashmap_add(parser->transcript_id_to_name_mapping, gt_cstr_dup(transcript_id), gt_cstr_dup(transcript_name)); } if (gene_name && strlen(gene_name) > 0 && !gt_hashmap_get(parser->gene_id_to_name_mapping, gene_id)) { gt_hashmap_add(parser->gene_id_to_name_mapping, gt_cstr_dup(gene_id), gt_cstr_dup(gene_name)); } /* get seqid */ seqid_str = gt_hashmap_get(parser->seqid_to_str_mapping, seqname); if (!seqid_str) { seqid_str = gt_str_new_cstr(seqname); gt_hashmap_add(parser->seqid_to_str_mapping, gt_str_get(seqid_str), seqid_str); } gt_assert(seqid_str); /* construct the new feature */ gn = gt_feature_node_new(seqid_str, type, range.start, range.end, gt_strand_value); gt_genome_node_set_origin(gn, filenamestr, line_number); if (stop_codon) { gt_feature_node_add_attribute((GtFeatureNode*) gn, GTF_PARSER_STOP_CODON_FLAG, "true"); } for (i = 0; i < gt_str_array_size(attrkeys); i++) { GtFeatureNode *fn = (GtFeatureNode *)gn; const char *key = gt_str_array_get(attrkeys, i); const char *val = gt_str_array_get(attrvals, i); /* Not a comprehensive solution to ensure correct encoding, just bare minimum required to get Cufflinks output parsed */ if (strcmp(val, "=") == 0) val = "%26"; if (gt_feature_node_get_attribute(fn, key) != NULL) { const char *oldval = gt_feature_node_get_attribute(fn, key); GtStr *newval = gt_str_new_cstr(oldval); gt_str_append_char(newval, ','); gt_str_append_cstr(newval, val); gt_feature_node_set_attribute(fn, key, gt_str_get(newval)); gt_str_delete(newval); } else gt_feature_node_add_attribute(fn, key, val); } gt_str_array_delete(attrkeys); gt_str_array_delete(attrvals); /* set source */ 
source_str = gt_hashmap_get(parser->source_to_str_mapping, source); if (!source_str) { source_str = gt_str_new_cstr(source); gt_hashmap_add(parser->source_to_str_mapping, gt_str_get(source_str), source_str); } gt_assert(source_str); gt_feature_node_set_source((GtFeatureNode*) gn, source_str); if (score_is_defined) gt_feature_node_set_score((GtFeatureNode*) gn, score_value); if (phase_value != GT_PHASE_UNDEFINED) gt_feature_node_set_phase((GtFeatureNode*) gn, phase_value); gt_array_add(gt_genome_node_array, gn); } gt_str_reset(line_buffer); } /* process all region nodes */ if (!had_err) gt_region_node_builder_build(parser->region_node_builder, genome_nodes); /* process all feature nodes */ cinfo.genome_nodes = genome_nodes; cinfo.tidy = be_tolerant; cinfo.gene_id_to_name_mapping = parser->gene_id_to_name_mapping; cinfo.transcript_id_to_name_mapping = parser->transcript_id_to_name_mapping; if (!had_err) { had_err = gt_hashmap_foreach(parser->gene_id_hash, construct_genes, &cinfo, err); } gt_hashmap_foreach(parser->gene_id_hash, delete_genes, NULL, err); /* free */ gt_splitter_delete(splitter); gt_splitter_delete(attribute_splitter); gt_str_delete(line_buffer); return had_err; }
/* Releases the result of gt_cstr_split(): every string and the array itself.
   Assumes the array is terminated by a NULL entry -- TODO confirm against
   the gt_cstr_split() documentation. NULL is accepted. */
static void gt_genomediff_free_split(char **elements)
{
  char **elem;

  if (elements == NULL)
    return;
  for (elem = elements; *elem != NULL; elem++)
    gt_free(*elem);
  gt_free(elements);
}

/* Tool runner of genomediff. Collects the input files from
   argv[parsed_args..argc-1]. More than one file: the files are encoded into
   the index named arguments->indexname. Otherwise the single argument is
   taken as the index name and, for -esa/-pck, its project file is scanned
   for a "mirrored=1" entry. The encoded sequence is loaded, the shulen sums
   are computed (from an existing index or via gt_runsuffixerator()) and
   passed to gt_genomediff_kr_calc().
   Returns 0 on success and a non-zero value on error. */
static int gt_genomediff_runner(int argc, const char **argv,
                                int parsed_args, void *tool_arguments,
                                GtError *err)
{
  bool mirrored = false;
  int had_err = 0,
      i;
  GtEncseq *encseq = NULL;
  GtGenomediffArguments *arguments = tool_arguments;
  GtLogger *logger;
  GtShuUnitFileInfo *unit_info = NULL;
  GtTimer *timer = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stdout);
  gt_assert(logger);

  for (i = parsed_args; i < argc; i++) {
    gt_str_array_add_cstr(arguments->filenames, argv[i]);
  }

  if (gt_showtime_enabled()) {
    timer = gt_timer_new_with_progress_description("start");
    gt_assert(timer);
    gt_timer_start(timer);
  }

  if (arguments->with_units) {
    gt_logger_log(logger, "unitfile option set, filename is %s\n",
                  gt_str_get(arguments->unitfile));
  }

  if (timer != NULL)
    gt_timer_show_progress(timer, "start shu search", stdout);

  if (gt_str_array_size(arguments->filenames) > 1UL) {
    GtEncseqEncoder *ee = gt_encseq_encoder_new();
    gt_encseq_encoder_set_timer(ee, timer);
    gt_encseq_encoder_set_logger(ee, logger);
    /* kr only makes sense for dna, so we can check this already with ee */
    gt_encseq_encoder_set_input_dna(ee);
    had_err = gt_encseq_encoder_encode(ee, arguments->filenames,
                                       gt_str_get(arguments->indexname), err);
    gt_encseq_encoder_delete(ee);
  }
  else {
    gt_str_append_str(arguments->indexname,
                      gt_str_array_get_str(arguments->filenames, 0));
    if (arguments->with_esa || arguments->with_pck) {
      GtStr *current_line = gt_str_new();
      FILE *prj_fp;
      char **elements = NULL;

      prj_fp = gt_fa_fopen_with_suffix(gt_str_get(arguments->indexname),
                                       GT_PROJECTFILESUFFIX,"rb",err);
      if (prj_fp == NULL)
        had_err = -1;
      while (!had_err &&
             gt_str_read_next_line(current_line, prj_fp) != EOF) {
        /* release the previous line's split before overwriting it */
        gt_genomediff_free_split(elements);
        elements = gt_cstr_split(gt_str_get(current_line), '=');
        if (elements != NULL && elements[0] != NULL) {
          gt_log_log("%s", elements[0]);
          /* a line without '=' has no value; do not touch elements[1] then */
          if (elements[1] != NULL && strcmp("mirrored", elements[0]) == 0) {
            gt_log_log("%s", elements[1]);
            if (strcmp("1", elements[1]) == 0) {
              mirrored = true;
              gt_log_log("sequences are treated as mirrored");
            }
          }
        }
        gt_str_reset(current_line);
      }
      gt_str_delete(current_line);
      gt_genomediff_free_split(elements);
      gt_fa_xfclose(prj_fp);
    }
  }

  if (!had_err) {
    GtEncseqLoader *el = gt_encseq_loader_new_from_options(arguments->loadopts,
                                                           err);
    if (mirrored)
      gt_encseq_loader_mirror(el);
    encseq = gt_encseq_loader_load(el, gt_str_get(arguments->indexname), err);
    gt_encseq_loader_delete(el);
  }
  if (encseq == NULL)
    had_err = -1;

  if (!had_err) {
    unit_info = gt_shu_unit_info_new(encseq);
    if (arguments->with_units)
      had_err = gt_shu_unit_file_info_read(arguments->unitfile, unit_info,
                                           logger, err);
  }

  if (!had_err) {
    uint64_t **shusums = NULL;

    if (arguments->with_esa || arguments->with_pck) {
      shusums = gt_genomediff_shulen_sum(arguments, unit_info,
                                         logger, timer, err);
      if (shusums == NULL)
        had_err = -1;
    }
    else {
      const bool doesa = true;
      GenomediffInfo gd_info;
      Suffixeratoroptions sopts;

      sopts.beverbose = arguments->verbose;
      sopts.indexname = arguments->indexname;
      sopts.db = NULL;
      sopts.encopts = NULL;
      sopts.genomediff = true;
      sopts.inputindex = arguments->indexname;
      sopts.loadopts = arguments->loadopts;
      sopts.showprogress = false;
      sopts.idxopts = arguments->idxopts;

      gt_assert(unit_info != NULL);
      gt_array2dim_calloc(shusums, unit_info->num_of_genomes,
                          unit_info->num_of_genomes);
      gd_info.shulensums = shusums;
      gd_info.unit_info = unit_info;
      had_err = gt_runsuffixerator(doesa, &sopts, &gd_info, logger, err);
    }
    if (!had_err && shusums != NULL) {
      had_err = gt_genomediff_kr_calc(shusums, arguments, unit_info,
                                      arguments->with_pck, logger, timer,
                                      err);
    }
    /* release the table on the error path as well */
    if (shusums != NULL) {
      gt_array2dim_delete(shusums);
    }
  }

  if (timer != NULL) {
    gt_timer_show_progress_final(timer, stdout);
    gt_timer_delete(timer);
  }
  gt_logger_delete(logger);
  gt_encseq_delete(encseq);
  gt_shu_unit_info_delete(unit_info);

  return had_err;
}
/* Tool runner of gdiffcalc. Loads the encoded sequence named
   arguments->indexname, sets up the unit information and reads a
   ';'-separated table of floating point values from the first file argument
   into a num_of_genomes x num_of_genomes matrix. Elements starting with '#'
   are logged as names and not stored. The matrix is then passed to
   gt_genomediff_calculate_div_from_avg().
   Returns 0 on success and a non-zero value on error. */
static int gt_gdiffcalc_runner(int argc, const char **argv, int parsed_args,
                               void *tool_arguments, GT_UNUSED GtError *err)
{
  GtGenomediffArguments *arguments = tool_arguments;
  int had_err = 0, i;
  GtUword lcounter = 0, zcounter = 0;
  double **shusums = NULL;
  GtEncseq *encseq = NULL;
  GtLogger *logger;
  GtShuUnitFileInfo *unit_info = NULL;
  GtTimer *timer = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stdout);
  gt_assert(logger);

  for (i = parsed_args; i < argc; i++) {
    gt_str_array_add_cstr(arguments->filenames, argv[i]);
  }

  if (gt_showtime_enabled()) {
    timer = gt_timer_new_with_progress_description("load encseq");
    gt_assert(timer);
    gt_timer_start(timer);
  }

  if (arguments->with_units) {
    gt_logger_log(logger, "unitfile option set, filename is %s\n",
                  gt_str_get(arguments->unitfile));
  }

  if (!had_err) {
    GtEncseqLoader *el = gt_encseq_loader_new_from_options(arguments->loadopts,
                                                           err);
    encseq = gt_encseq_loader_load(el, gt_str_get(arguments->indexname), err);
    gt_encseq_loader_delete(el);
  }
  if (encseq == NULL)
    had_err = -1;

  if (timer != NULL)
    gt_timer_show_progress(timer, "load units", stdout);

  if (!had_err) {
    unit_info = gt_shu_unit_info_new(encseq);
    if (arguments->with_units)
      had_err = gt_shu_unit_file_info_read(arguments->unitfile, unit_info,
                                           logger, err);
  }

  if (timer != NULL)
    gt_timer_show_progress(timer, "read table", stdout);

  if (!had_err) {
    GtIO *table_file = NULL;
    GtTokenizer *tokenizer = NULL;
    GtStr *line = NULL;

    gt_assert(unit_info != NULL);
    gt_array2dim_calloc(shusums, unit_info->num_of_genomes,
                        unit_info->num_of_genomes);

    table_file = gt_io_new(gt_str_array_get(arguments->filenames, 0), "r");
    tokenizer = gt_tokenizer_new(table_file);
    line = gt_tokenizer_get_token(tokenizer);
    while (line != NULL && !had_err) {
      char *cline = gt_str_get(line);
      char *elem = strtok(cline, ";");
      zcounter = 0;
      while (elem != NULL && !had_err) {
        if (*elem != '#') {
          /* never write outside of the allocated matrix */
          if (lcounter >= unit_info->num_of_genomes ||
              zcounter >= unit_info->num_of_genomes) {
            had_err = -1;
            gt_error_set(err, "table exceeds the expected dimension of "
                         GT_WU " x " GT_WU, unit_info->num_of_genomes,
                         unit_info->num_of_genomes);
            break;
          }
          if (1 != sscanf(elem, "%lf",
                          &shusums[lcounter][zcounter])) {
            had_err = 1;
            gt_error_set(err, "couldn't scan");
            break;
          }
          gt_logger_log(logger,"wert: %lf", shusums[lcounter][zcounter]);
          zcounter++;
        }
        else {
          gt_logger_log(logger, "name: %s", elem);
        }
        elem = strtok(NULL, ";");
      }
      gt_tokenizer_next_token(tokenizer);
      gt_str_delete(line);
      line = gt_tokenizer_get_token(tokenizer);
      lcounter++;
      gt_logger_log(logger, "line "GT_WU"", lcounter);
    }
    /* after an error the loop ends with a token still held */
    gt_str_delete(line);
    /* presumably the tokenizer owns <table_file> and releases it as well --
       TODO confirm against the GtTokenizer documentation */
    gt_tokenizer_delete(tokenizer);
  }

  if (!had_err) {
    GtUword num_of_seq, file_idx, seq_idx, startpos;
    GT_UNUSED GtUword oldpos = 0;

    gt_assert(unit_info != NULL);
    gt_assert(lcounter == zcounter);
    gt_assert(lcounter == unit_info->num_of_genomes);

    num_of_seq = gt_encseq_num_of_sequences(unit_info->encseq);

    for (seq_idx = 0; seq_idx < num_of_seq; seq_idx++) {
      startpos = gt_encseq_seqstartpos(unit_info->encseq, seq_idx);
      file_idx = gt_encseq_filenum(unit_info->encseq, startpos);
      gt_log_log("seq: "GT_WU" starts at: "GT_WU"\n"
                 "belonges to file: "GT_WU" which is part of genome: %s",
                 seq_idx, startpos, file_idx,
                 gt_str_array_get(unit_info->genome_names,
                                  unit_info->map_files[file_idx]));
      gt_assert(oldpos <= startpos);
      oldpos = startpos;
    }
  }

  if (!had_err && shusums != NULL) {
    had_err = gt_genomediff_calculate_div_from_avg(shusums, arguments,
                                                   unit_info, logger, timer,
                                                   err);
  }
  /* release the matrix on the error path as well */
  if (shusums != NULL) {
    gt_array2dim_delete(shusums);
  }

  if (timer != NULL) {
    gt_timer_show_progress_final(timer, stdout);
    gt_timer_delete(timer);
  }
  gt_logger_delete(logger);
  gt_encseq_delete(encseq);
  gt_shu_unit_info_delete(unit_info);

  return had_err;
}
/* Parses the command line of gthsplit and fills <gthsplitinfo> (split mode,
   range, verbosity, output file mode, force flag). On success all remaining
   arguments are stored as consensus files, advancing <*parsed_args> up to
   <argc>. Returns GT_OPTION_PARSER_ERROR (with <err> set) if the range does
   not divide 100 or if compression is requested without an input file;
   otherwise returns the option parser result. */
static GtOPrval gthsplit_parse_options(int *parsed_args,
                                       Gthsplitinfo *gthsplitinfo,
                                       int argc, const char **argv,
                                       const GthPlugins *plugins,
                                       GtError *err)
{
  GtOptionParser *op;
  GtOption *optalignmentscore, *optcoverage, *optrange, *optverbose, *optgzip,
           *optbzip2, *optforce;
  bool alignmentscore, coverage, verbose, gzip, bzip2;
  GtOPrval oprval;

  gt_error_check(err);

  op = gt_option_parser_new("-alignmentscore | -coverage [option ...] "
                            "[file ...]", "Split GenomeThreader output files "
                            "containing intermediate results.");

  /* specify all options with a corresponding help-text */
  optalignmentscore = gt_option_new_bool("alignmentscore", "split according to "
                                         "the overall alignment score (scr)",
                                         &alignmentscore, false);
  gt_option_parser_add_option(op, optalignmentscore);

  optcoverage = gt_option_new_bool("coverage", "split according to coverage "
                                   "(cov)", &coverage, false);
  gt_option_parser_add_option(op, optcoverage);

  optrange = gt_option_new_uint_max(RANGE_OPT_CSTR, "set the percentage range "
                                    "used to create the sets",
                                    &gthsplitinfo->range, DEFAULT_RANGE, 100);
  gt_option_parser_add_option(op, optrange);

  /* add sa_filter options */
  gth_sa_filter_register_options(op, gthsplitinfo->sa_filter, false);

  /* -v */
  optverbose = gt_option_new_verbose(&verbose);
  gt_option_parser_add_option(op, optverbose);

  optgzip = gt_option_new_bool("gzip", "write gzip compressed output file(s)",
                               &gzip, false);
  gt_option_parser_add_option(op, optgzip);

  optbzip2 = gt_option_new_bool("bzip2", "write bzip2 compressed output "
                                "file(s)", &bzip2, false);
  gt_option_parser_add_option(op, optbzip2);

  optforce = gt_option_new_bool(GT_FORCE_OPT_CSTR,"force writing to split "
                                "files", &gthsplitinfo->force, false);
  gt_option_parser_add_option(op, optforce);

  gt_option_exclude(optalignmentscore, optcoverage);
  gt_option_exclude(optgzip, optbzip2);
  gt_option_is_mandatory_either(optalignmentscore, optcoverage);

  gt_option_parser_set_mail_address(op, "<*****@*****.**>");
  oprval = gt_option_parser_parse(op, parsed_args, argc, argv,
                                  plugins->gth_version_func, err);

  if (oprval == GT_OPTION_PARSER_OK && alignmentscore)
    gthsplitinfo->splitmode = ALIGNMENTSCORE_SPLIT;
  if (oprval == GT_OPTION_PARSER_OK && coverage)
    gthsplitinfo->splitmode = COVERAGE_SPLIT;

  /* a range of 0 is rejected first: "100 % 0" would be a division by zero */
  if (oprval == GT_OPTION_PARSER_OK &&
      (gthsplitinfo->range == 0 || 100 % gthsplitinfo->range)) {
    gt_error_set(err, "argument to option %s must divide 100 without rest",
                 RANGE_OPT_CSTR);
    oprval = GT_OPTION_PARSER_ERROR;
  }

  if (oprval == GT_OPTION_PARSER_OK && verbose)
    gthsplitinfo->showverbose = gth_show_on_stdout;

  if (oprval == GT_OPTION_PARSER_OK && gzip)
    gthsplitinfo->file_mode = GT_FILE_MODE_GZIP;
  if (oprval == GT_OPTION_PARSER_OK && bzip2)
    gthsplitinfo->file_mode = GT_FILE_MODE_BZIP2;

  /* save consensus files */
  if (oprval == GT_OPTION_PARSER_OK) {
    while (*parsed_args < argc) {
      gt_str_array_add_cstr(gthsplitinfo->consensusfiles, argv[*parsed_args]);
      (*parsed_args)++;
    }
  }

  if (oprval == GT_OPTION_PARSER_OK &&
      !gt_str_array_size(gthsplitinfo->consensusfiles) &&
      (gt_option_is_set(optgzip) || gt_option_is_set(optbzip2))) {
    gt_error_set(err, "to use compression, at least on input file has to be "
                      "supplied");
    oprval = GT_OPTION_PARSER_ERROR;
  }

  gt_option_parser_delete(op);

  return oprval;
}
/*
 * Runner of the convertseq tool: reads every sequence from the files
 * argv[parsed_args .. argc-1] and, unless arguments->showseq is set, writes
 * it as a FASTA entry to arguments->outfp.
 *
 * Depending on <arguments> the sequence is reverse complemented (revcomp),
 * runs of non-ACGTU characters are collapsed to a single 'N'/'n'
 * (reduce_wc_dna), or runs of X/B/Z resp. x/b/z are collapsed to a single
 * 'N'/'n' (reduce_wc_prot). Lines are wrapped after arguments->fastawidth
 * emitted characters; a width of 0 disables wrapping.
 *
 * Returns 0 on success and a non-zero value (with <err> set by the failing
 * callee) otherwise.
 */
static int gt_convertseq_runner(int argc, const char **argv, int parsed_args,
                                void *tool_arguments, GtError *err)
{
  GtConvertseqArguments *arguments = tool_arguments;
  int had_err = 0, i;
  GtFilelengthvalues *flv;
  GtSeqIterator *seqit;
  GtSequenceBuffer *sb = NULL;
  GtStrArray *files;
  const GtUchar *sequence;
  char *desc;
  GtUword len, j;
  off_t totalsize;

  gt_error_check(err);
  gt_assert(arguments != NULL);

  files = gt_str_array_new();
  for (i = parsed_args; i < argc; i++) {
    gt_str_array_add_cstr(files, argv[i]);
  }
  totalsize = gt_files_estimate_total_size(files);

  flv = gt_calloc((size_t) gt_str_array_size(files),
                  sizeof (GtFilelengthvalues));

  sb = gt_sequence_buffer_new_guess_type(files, err);
  if (!sb) {
    had_err = -1;
  }
  if (!had_err) {
    gt_sequence_buffer_set_filelengthtab(sb, flv);
    /* read input using seqiterator */
    seqit = gt_seq_iterator_sequence_buffer_new_with_buffer(sb);
    if (arguments->verbose) {
      gt_progressbar_start(gt_seq_iterator_getcurrentcounter(seqit,
                                                       (GtUint64) totalsize),
                           (GtUint64) totalsize);
    }
    while (true) {
      GtUchar *seq = NULL;
      desc = NULL;
      j = 0UL; /* number of characters emitted on the current output line */
      had_err = gt_seq_iterator_next(seqit, &sequence, &len, &desc, err);
      /* anything but 1 ends the loop; the value is returned to the caller */
      if (had_err != 1)
        break;
      if (arguments->revcomp) {
        /* work on a private, zero-terminated copy of the sequence */
        GtUchar *newseq = gt_calloc((size_t) len+1, sizeof (GtUchar));
        memcpy(newseq, sequence, (size_t) len*sizeof (GtUchar));
        had_err = gt_reverse_complement((char*) newseq, len, err);
        if (had_err) {
          /* the copy is not yet owned by <seq>, release it before leaving */
          gt_free(newseq);
          break;
        }
        seq = newseq;
      } else
        seq = (GtUchar*) sequence;

      if (!arguments->showseq) {
        bool in_wildcard = false;
        GtUword pos;

        gt_file_xprintf(arguments->outfp, ">%s\n", desc);
        for (pos = 0; pos < len; pos++) {
          /* false iff the current character was swallowed by a wildcard run */
          bool emitted = true;

          if (arguments->reduce_wc_dna) {
            switch (seq[pos]) {
              case 'a':
              case 'A':
              case 'c':
              case 'C':
              case 'g':
              case 'G':
              case 't':
              case 'u':
              case 'T':
              case 'U':
                in_wildcard = false;
                gt_file_xfputc((int) seq[pos], arguments->outfp);
                break;
              default:
                if (!in_wildcard) {
                  in_wildcard = true;
                  if (isupper((int) seq[pos]))
                    gt_file_xfputc((int) 'N', arguments->outfp);
                  else
                    gt_file_xfputc((int) 'n', arguments->outfp);
                } else
                  emitted = false;
            }
          } else if (arguments->reduce_wc_prot) {
            switch (seq[pos]) {
              case 'X':
              case 'B':
              case 'Z':
                if (!in_wildcard) {
                  in_wildcard = true;
                  gt_file_xfputc((int) 'N', arguments->outfp);
                } else
                  emitted = false;
                break;
              case 'x':
              case 'b':
              case 'z':
                if (!in_wildcard) {
                  in_wildcard = true;
                  gt_file_xfputc((int) 'n', arguments->outfp);
                } else
                  emitted = false;
                break;
              default:
                in_wildcard = false;
                gt_file_xfputc((int) seq[pos], arguments->outfp);
            }
          } else {
            gt_file_xfputc((int) seq[pos], arguments->outfp);
          }

          /* Wrap only after a character was actually written; otherwise a
             swallowed wildcard following a line break would produce an empty
             line. */
          if (emitted) {
            j++;
            if (arguments->fastawidth > 0 && j % arguments->fastawidth == 0) {
              j = 0;
              gt_file_xprintf(arguments->outfp, "\n");
            }
          }
        }
        /* terminate the last line unless it was just closed by the wrap;
           decided on emitted characters, since reduction makes the output
           shorter than <len> */
        if (arguments->fastawidth == 0 || j != 0)
          gt_file_xprintf(arguments->outfp, "\n");
      }
      if (arguments->revcomp) {
        gt_free(seq);
      }
    }
    if (arguments->showflv) {
      for (j = 0; j < gt_str_array_size(files); j++) {
        fprintf(stderr, "file "GT_WU" (%s): "GT_WU"/"GT_WU"\n",
                j,
                gt_str_array_get(files, j),
                (GtUword) flv[j].length,
                (GtUword) flv[j].effectivelength);
      }
    }
    if (arguments->verbose) {
      gt_progressbar_stop();
    }
    gt_sequence_buffer_delete(sb);
    gt_seq_iterator_delete(seqit);
  }
  gt_str_array_delete(files);
  gt_free(flv);

  return had_err;
}
/*
 * Runner of the seqtranslate tool: iterates over all sequences contained in
 * the files argv[parsed_args .. argc-1] and hands each one to
 * gt_seqtranslate_do_translation(); if arguments->reverse is set, the reverse
 * complement of the sequence is handed over as well.
 *
 * Sequences shorter than GT_CODON_LENGTH are skipped with a warning.
 * Returns 0 on success, a non-zero value otherwise.
 */
static int gt_seqtranslate_runner(int argc, const char **argv, int parsed_args,
                                  void *tool_arguments, GT_UNUSED GtError *err)
{
  GtTranslateArguments *arguments = tool_arguments;
  GtSeqIterator *si = NULL;
  GtSequenceBuffer *sb = NULL;
  GtStrArray *infiles;
  GtStr *translations[3];
  int had_err = 0, argidx, frame;

  /* one string buffer per entry of the translations array */
  for (frame = 0; frame < 3; frame++)
    translations[frame] = gt_str_new();

  gt_error_check(err);
  gt_assert(arguments);

  /* collect the input file names */
  infiles = gt_str_array_new();
  argidx = parsed_args;
  while (argidx < argc) {
    gt_str_array_add_cstr(infiles, argv[argidx]);
    argidx++;
  }

  /* set up the sequence source */
  sb = gt_sequence_buffer_new_guess_type(infiles, err);
  if (sb == NULL) {
    had_err = -1;
  } else {
    si = gt_seq_iterator_sequence_buffer_new_with_buffer(sb);
    if (si == NULL)
      had_err = -1;
  }

  /* translate sequence by sequence until the input is exhausted or an error
     occurs */
  while (!had_err) {
    const GtUchar *sequence;
    char *desc;
    GtUword len;
    int rval = gt_seq_iterator_next(si, &sequence, &len, &desc, err);

    if (rval == 0)
      break;                    /* no more sequences */
    if (rval < 0) {
      had_err = -1;
      break;
    }

    if (len < GT_CODON_LENGTH) {
      gt_warning("sequence '%s' is shorter than codon length of %d, skipping",
                 desc, GT_CODON_LENGTH);
      continue;
    }

    /* forward strand */
    had_err = gt_seqtranslate_do_translation(arguments, (char*) sequence, len,
                                             desc, translations, false, err);
    if (had_err || !arguments->reverse)
      continue;

    /* reverse strand, computed on a private copy of the sequence */
    {
      char *revseq = gt_cstr_dup_nt((char*) sequence, len);

      had_err = gt_reverse_complement(revseq, len, err);
      if (!had_err) {
        had_err = gt_seqtranslate_do_translation(arguments, revseq, len, desc,
                                                 translations, true, err);
      }
      gt_free(revseq);
    }
  }

  for (frame = 0; frame < 3; frame++)
    gt_str_delete(translations[frame]);
  gt_str_array_delete(infiles);
  gt_seq_iterator_delete(si);
  gt_sequence_buffer_delete(sb);

  return had_err;
}
/*
 * Runner of the sequniq tool: writes every sequence of the files
 * argv[parsed_args .. argc-1] whose MD5 set lookup reports
 * GT_MD5SET_NOT_FOUND as a FASTA entry to arguments->outfp and counts all
 * other (non-error) sequences as duplicates.
 *
 * Two input paths exist: if arguments->seqit is false each file is loaded as
 * a GtBioseq, otherwise all files are streamed through a GtSeqIterator
 * (optionally with a progress bar when arguments->verbose is set).
 *
 * On success a summary line is printed to stderr. Returns 0 on success and
 * -1 on error.
 */
static int gt_sequniq_runner(int argc, const char **argv, int parsed_args,
                             void *tool_arguments, GtError *err)
{
  GtSequniqArguments *arguments = tool_arguments;
  GtUint64 duplicates = 0, num_of_sequences = 0;
  int i, had_err = 0;
  GtMD5Set *md5set;

  gt_error_check(err);
  gt_assert(arguments);

  md5set = gt_md5set_new(arguments->nofseqs);
  if (!arguments->seqit) {
    /* bioseq path: one file at a time */
    GtUword j;
    GtBioseq *bs;

    for (i = parsed_args; !had_err && i < argc; i++) {
      if (!(bs = gt_bioseq_new(argv[i], err)))
        had_err = -1;
      if (!had_err) {
        GtMD5SetStatus retval;
        for (j = 0; j < gt_bioseq_number_of_sequences(bs) && !had_err; j++) {
          char *seq = gt_bioseq_get_sequence(bs, j);
          retval = gt_md5set_add_sequence(md5set, seq,
                                          gt_bioseq_get_sequence_length(bs, j),
                                          arguments->rev, err);
          if (retval == GT_MD5SET_NOT_FOUND)
            gt_fasta_show_entry(gt_bioseq_get_description(bs, j), seq,
                                gt_bioseq_get_sequence_length(bs, j),
                                arguments->width, arguments->outfp);
          else if (retval != GT_MD5SET_ERROR)
            duplicates++;
          else
            had_err = -1;
          num_of_sequences++;
          gt_free(seq);
        }
        gt_bioseq_delete(bs);
      }
    }
  }
  else {
    /* iterator path: stream over all files */
    GtSeqIterator *seqit;
    GtStrArray *files;
    off_t totalsize;
    const GtUchar *sequence;
    char *desc;
    GtUword len;

    files = gt_str_array_new();
    for (i = parsed_args; i < argc; i++)
      gt_str_array_add_cstr(files, argv[i]);
    totalsize = gt_files_estimate_total_size(files);
    seqit = gt_seq_iterator_sequence_buffer_new(files, err);
    if (!seqit)
      had_err = -1;
    if (!had_err) {
      if (arguments->verbose) {
        gt_progressbar_start(gt_seq_iterator_getcurrentcounter(seqit,
                                                       (GtUint64) totalsize),
                             (GtUint64) totalsize);
      }
      while (!had_err) {
        GtMD5SetStatus retval;
        int rval = gt_seq_iterator_next(seqit, &sequence, &len, &desc, err);

        /* a negative return value is treated as a failure of the iterator
           (as the other runners in this file do); it must not be mistaken
           for the regular end of the input */
        if (rval < 0) {
          had_err = -1;
          break;
        }
        if (rval != 1)
          break;
        retval = gt_md5set_add_sequence(md5set, (const char*) sequence, len,
                                        arguments->rev, err);
        if (retval == GT_MD5SET_NOT_FOUND)
          gt_fasta_show_entry(desc, (const char*) sequence, len,
                              arguments->width, arguments->outfp);
        else if (retval != GT_MD5SET_ERROR)
          duplicates++;
        else
          had_err = -1;
        num_of_sequences++;
      }
      if (arguments->verbose)
        gt_progressbar_stop();
      gt_seq_iterator_delete(seqit);
    }
    gt_str_array_delete(files);
  }

  /* show statistics */
  if (!had_err) {
    /* avoid 0/0 (NaN) when the input contained no sequence at all */
    double removed_percent = 0.0;

    if (num_of_sequences > 0) {
      removed_percent = ((double) duplicates / (double) num_of_sequences)
                        * 100.0;
    }
    fprintf(stderr,
            "# "GT_WU" out of "GT_WU" sequences have been removed (%.3f%%)\n",
            (GtUword) duplicates, (GtUword) num_of_sequences,
            removed_percent);
  }

  gt_md5set_delete(md5set);
  return had_err;
}