/*
 * Unit test for GtSplitter.
 *
 * Splits six fixture strings and checks the number of tokens and the content
 * of every token. The fixtures cover: blank-separated and tab-separated
 * fields, the empty string, two consecutive delimiters (empty token in the
 * middle), a trailing delimiter (empty token at the end) and a string that
 * does not contain the delimiter at all.
 *
 * err: error object handed to gt_error_check() and used by gt_ensure().
 * Returns the value of <had_err>, which starts at 0; gt_ensure() (a macro
 * defined elsewhere) is expected to set it when a check fails.
 */
int gt_splitter_unit_test(GtError *err)
{
  /* Writable arrays instead of string literals: the splitter takes and
     returns non-const char pointers (presumably it tokenizes in place --
     confirm against the splitter implementation). */
  static char string_1[] = "a bb ccc dddd eeeee",
              string_2[] = "a\tbb\tccc\tdddd\teeeee",
              string_3[] = "",
              /* two consecutive blanks: the checks below expect an empty
                 token between "a" and "b" */
              string_4[] = "a  b",
              string_5[] = "ac bc ",
              string_6[] = "test";
  GtSplitter *s;
  int had_err = 0;
  gt_error_check(err);
  s = gt_splitter_new();

  /* string_1: blank separated fields */
  gt_ensure(!gt_splitter_size(s));
  gt_splitter_split(s, string_1, strlen(string_1), ' ');
  gt_ensure(gt_splitter_size(s) == 5);
  gt_ensure(strcmp(gt_splitter_get_token(s, 0), "a") == 0);
  gt_ensure(strcmp(gt_splitter_get_token(s, 1), "bb") == 0);
  gt_ensure(strcmp(gt_splitter_get_token(s, 2), "ccc") == 0);
  gt_ensure(strcmp(gt_splitter_get_token(s, 3), "dddd") == 0);
  gt_ensure(strcmp(gt_splitter_get_token(s, 4), "eeeee") == 0);
  gt_splitter_reset(s);

  /* string_2: tab separated fields */
  gt_ensure(!gt_splitter_size(s));
  gt_splitter_split(s, string_2, strlen(string_2), '\t');
  gt_ensure(gt_splitter_size(s) == 5);
  gt_ensure(strcmp(gt_splitter_get_token(s, 0), "a") == 0);
  gt_ensure(strcmp(gt_splitter_get_token(s, 1), "bb") == 0);
  gt_ensure(strcmp(gt_splitter_get_token(s, 2), "ccc") == 0);
  gt_ensure(strcmp(gt_splitter_get_token(s, 3), "dddd") == 0);
  gt_ensure(strcmp(gt_splitter_get_token(s, 4), "eeeee") == 0);
  gt_splitter_reset(s);

  /* string_3: the empty string yields exactly one empty token */
  gt_ensure(!gt_splitter_size(s));
  gt_splitter_split(s, string_3, strlen(string_3), '\t');
  gt_ensure(gt_splitter_size(s) == 1);
  gt_ensure(strcmp(gt_splitter_get_token(s, 0), "") == 0);
  gt_splitter_reset(s);

  /* string_4: consecutive delimiters yield an empty token in between */
  gt_ensure(!gt_splitter_size(s));
  gt_splitter_split(s, string_4, strlen(string_4), ' ');
  gt_ensure(gt_splitter_size(s) == 3);
  gt_ensure(strcmp(gt_splitter_get_token(s, 0), "a") == 0);
  gt_ensure(strcmp(gt_splitter_get_token(s, 1), "") == 0);
  gt_ensure(strcmp(gt_splitter_get_token(s, 2), "b") == 0);
  gt_splitter_reset(s);

  /* string_5: a trailing delimiter yields a trailing empty token */
  gt_ensure(!gt_splitter_size(s));
  gt_splitter_split(s, string_5, strlen(string_5), ' ');
  gt_ensure(gt_splitter_size(s) == 3);
  gt_ensure(strcmp(gt_splitter_get_token(s, 0), "ac") == 0);
  gt_ensure(strcmp(gt_splitter_get_token(s, 1), "bc") == 0);
  gt_ensure(strcmp(gt_splitter_get_token(s, 2), "") == 0);
  gt_splitter_reset(s);

  /* string_6: delimiter does not occur, the whole string is one token */
  gt_ensure(!gt_splitter_size(s));
  gt_splitter_split(s, string_6, strlen(string_6), ';');
  gt_ensure(gt_splitter_size(s) == 1);
  gt_ensure(strcmp(gt_splitter_get_token(s, 0), "test") == 0);

  /* free */
  gt_splitter_delete(s);

  return had_err;
}
/*
 * Reads the hit table <table_filename> line by line, collects the sequences
 * belonging to every hit into a tree and runs the fine search on them in
 * batches.
 *
 * For every non-comment line (lines starting with '#' are skipped):
 *   - the line is split at blanks (empty fields are dropped) and must consist
 *     of exactly 23 columns,
 *   - column 0 is parsed as the numeric id of the target,
 *   - column 3 is the query name; whenever it differs from the previous
 *     line's query, <querycount> is incremented,
 *   - when <querycount> equals arguments->max_queries the sequences collected
 *     so far are written out and searched, and the batch state is reset,
 *   - gt_condenseq_each_redundant_seq() is called for the target id with
 *     hmmsearch_process_seq and the tree; a result of 0 is treated as error.
 *
 * After the last line the remaining sequences are written out and searched.
 * If arguments->outtable_filename is not empty, a running number is appended
 * to it for every fine search.
 *
 * Returns 0 on success and a non-zero value on failure. For malformed table
 * lines <err> is set here; other failures come from the helpers called.
 */
static int hmmsearch_process_coarse_hits(
                                      char *table_filename,
                                      GtCondenseq *ces,
                                      GtCondenseqHmmsearchArguments *arguments,
                                      GtLogger *logger,
                                      GtError *err)
{
  int had_err = 0;
  GtStr *line = gt_str_new();
  FILE *table = NULL;
  GtSplitter *splitter = gt_splitter_new();
  GtStr *query = gt_str_new(),
        *fine_fasta_filename = gt_str_new_cstr("condenseq");
  GtRBTree *sequences = NULL;
  GtUword filecount = (GtUword) 1;
  unsigned int querycount = 0;
  /* lengths of the unmodified base names, used to cut off whatever has been
     appended once a batch is done */
  const GtUword fine_fasta_name_length = gt_str_length(fine_fasta_filename);
  const GtUword table_name_length =
    gt_str_length(arguments->outtable_filename);
  const GtUword target_column = 0,
                query_column = (GtUword) 3,
                expected_columns = (GtUword) 23;

  table = gt_xfopen(table_filename, "r");
  sequences = gt_rbtree_new(hmmsearch_cmp_seqnum,
                            hmmsearch_tree_free_node, NULL);

  while (!had_err && gt_str_read_next_line(line, table) == 0) {
    char *c_line = gt_str_get(line);
    GtUword uid = 0;

    if (c_line[0] != '#') {
      gt_splitter_split_non_empty(splitter, c_line, gt_str_length(line), ' ');

      /* The table is external input: report a malformed line as a regular
         error instead of relying on an assertion, which may be compiled out
         and would then be followed by reads of missing columns. */
      if (gt_splitter_size(splitter) != expected_columns) {
        gt_error_set(err, "line in hit table \"%s\" contains " GT_WU
                     " columns instead of " GT_WU,
                     table_filename, gt_splitter_size(splitter),
                     expected_columns);
        had_err = -1;
      }

      if (!had_err &&
          sscanf(gt_splitter_get_token(splitter, target_column),
                 GT_WU, &uid) != 1) {
        gt_error_set(err, "couldn't parse target number: %s",
                     gt_splitter_get_token(splitter, target_column));
        had_err = -1;
      }

      /* a change of the query name starts a new query */
      if (!had_err &&
          (gt_str_length(query) == 0 ||
           strcmp(gt_str_get(query),
                  gt_splitter_get_token(splitter, query_column)) != 0)) {
        gt_str_set(query, gt_splitter_get_token(splitter, query_column));
        gt_logger_log(logger, "new query: %s", gt_str_get(query));
        querycount++;
      }

      /* batch is full: search what has been collected, then start over */
      if (!had_err && querycount == arguments->max_queries) {
        hmmsearch_create_fine_fas(fine_fasta_filename, sequences, ces);
        if (table_name_length != 0)
          gt_str_append_uword(arguments->outtable_filename, filecount++);
        had_err =
          hmmsearch_call_fine_search(table_name_length != 0 ?
                                       arguments->outtable_filename :
                                       NULL,
                                     gt_str_get(fine_fasta_filename),
                                     gt_str_get(arguments->hmmsearch_path),
                                     gt_str_get(arguments->hmm),
                                     logger, err);
        gt_rbtree_clear(sequences);
        /* restore the base names (presumably hmmsearch_create_fine_fas
           appends to the fasta name -- confirm against its definition) */
        gt_str_set_length(fine_fasta_filename, fine_fasta_name_length);
        if (table_name_length != 0)
          gt_str_set_length(arguments->outtable_filename, table_name_length);
        querycount = 0;
      }

      if (!had_err) {
        if (gt_condenseq_each_redundant_seq(ces, uid,
                                            hmmsearch_process_seq,
                                            sequences, err) == 0) {
          had_err = -1;
        }
      }
      gt_splitter_reset(splitter);
    }
    gt_str_reset(line);
  }
  gt_splitter_delete(splitter);
  gt_str_delete(line);
  gt_str_delete(query);
  gt_xfclose(table);

  /* search the sequences collected since the last full batch */
  if (!had_err) {
    hmmsearch_create_fine_fas(fine_fasta_filename, sequences, ces);
    if (table_name_length != 0)
      gt_str_append_uword(arguments->outtable_filename, filecount++);
    had_err =
      hmmsearch_call_fine_search(table_name_length != 0 ?
                                   arguments->outtable_filename :
                                   NULL,
                                 gt_str_get(fine_fasta_filename),
                                 gt_str_get(arguments->hmmsearch_path),
                                 gt_str_get(arguments->hmm),
                                 logger, err);
  }
  /* NOTE(review): filecount starts at 1 and is only advanced when an output
     table name is given, so the number logged here is the next suffix rather
     than a count of files written -- confirm the intended meaning. */
  gt_log_log("created " GT_WU " files", filecount);
  gt_rbtree_delete(sequences);
  gt_str_delete(fine_fasta_filename);
  return had_err;
}
/*
 * Checks whether <value> is a valid, comma-separated list of database cross
 * references of the form "DBID:LOCALID".
 *
 * Every entry must
 *   - contain a colon,
 *   - have a non-empty part after the first colon (the local ID),
 *   - use a database abbreviation that is a key in xrc->abbrvs,
 *   - and, if the abbreviation entry defines "local_id_syntax", have a local
 *     ID matching that regular expression.
 *
 * Checking stops at the first entry that fails. Returns true if all entries
 * pass; otherwise returns false and sets <err> to a message describing the
 * first problem. <value> itself is not modified, a private copy is split.
 */
bool gt_xrf_checker_is_valid(GtXRFChecker *xrc, const char *value, GtError *err)
{
  bool valid = true;
  char *myvalue, *dbid = NULL, *localid = NULL;
  GtXRFAbbrEntry *e = NULL;
  GtUword nof_tokens, i;
  gt_assert(xrc && value);
  gt_error_check(err);

  /* duplicate only after the precondition check above, so that a NULL
     <value> is caught by the assertion and not inside the string routines */
  myvalue = gt_cstr_dup(value);

  /* XXX: Thread safety! (the splitter is shared state of the checker) */
  gt_splitter_reset(xrc->splitter);
  gt_splitter_split(xrc->splitter, myvalue, strlen(myvalue), ',');
  nof_tokens = gt_splitter_size(xrc->splitter);

  for (i = 0; valid && i < nof_tokens; i++) {
    dbid = gt_splitter_get_token(xrc->splitter, i);

    /* locate the separator between database ID and local ID */
    if (!(localid = strchr(dbid, ':'))) {
      gt_error_set(err, "xref \"%s\": separator colon missing", value);
      valid = false;
    }

    if (valid) {
      /* terminate the database ID at the colon; the local ID starts right
         behind it */
      *localid = '\0';
      if (*(++localid) == '\0') {
        gt_error_set(err, "xref \"%s\": local ID (part after colon) missing",
                     value);
        valid = false;
      }
    }

    if (valid) {
      gt_assert(dbid && localid);
      if (!(e = gt_hashmap_get(xrc->abbrvs, dbid))) {
        gt_error_set(err, "xref \"%s\": unknown database abbreviation \"%s\"",
                     value, dbid);
        valid = false;
      }
    }

    if (valid) {
      /* TODO: use #defines here. */
      const char *regex = NULL;
      gt_assert(e);
      if ((regex = gt_xrf_abbr_entry_get_value(e, "local_id_syntax"))) {
        bool match = false;
        GT_UNUSED int rval;
        rval = gt_grep(&match, regex, localid, NULL);
        gt_assert(rval == 0); /* regex format has been checked before */
        if (!match) {
          gt_error_set(err, "xref \"%s\": local ID \"%s\" does not "
                            "conform to syntax \"%s\" for the "
                            "%s database",
                       value, localid, regex, dbid);
          valid = false;
        }
      }
    }
  }

  gt_free(myvalue);
  return valid;
}
int gt_gtf_parser_parse(GtGTFParser *parser, GtQueue *genome_nodes, GtStr *filenamestr, GtFile *fpin, bool be_tolerant, GtError *err) { GtStr *seqid_str, *source_str, *line_buffer; char *line; size_t line_length; GtUword i, line_number = 0; GtGenomeNode *gn; GtRange range; GtPhase phase_value; GtStrand gt_strand_value; GtSplitter *splitter, *attribute_splitter; float score_value; char *seqname, *source, *feature, *start, *end, *score, *strand, *frame, *attributes, *token, *gene_id, *gene_name = NULL, *transcript_id, *transcript_name = NULL, **tokens; GtHashmap *transcript_id_hash; /* map from transcript id to array of genome nodes */ GtArray *gt_genome_node_array; ConstructionInfo cinfo; GTF_feature_type gtf_feature_type; GT_UNUSED bool gff_type_is_valid = false; const char *type = NULL; const char *filename; bool score_is_defined; int had_err = 0; gt_assert(parser && genome_nodes); gt_error_check(err); filename = gt_str_get(filenamestr); /* alloc */ line_buffer = gt_str_new(); splitter = gt_splitter_new(), attribute_splitter = gt_splitter_new(); #define HANDLE_ERROR \ if (had_err) { \ if (be_tolerant) { \ fprintf(stderr, "skipping line: %s\n", gt_error_get(err)); \ gt_error_unset(err); \ gt_str_reset(line_buffer); \ had_err = 0; \ continue; \ } \ else { \ had_err = -1; \ break; \ } \ } while (gt_str_read_next_line_generic(line_buffer, fpin) != EOF) { line = gt_str_get(line_buffer); line_length = gt_str_length(line_buffer); line_number++; had_err = 0; if (line_length == 0) { gt_warning("skipping blank line " GT_WU " in file \"%s\"", line_number, filename); } else if (line[0] == '#') { /* storing comment */ if (line_length >= 2 && line[1] == '#') gn = gt_comment_node_new(line+2); /* store '##' line as '#' line */ else gn = gt_comment_node_new(line+1); gt_genome_node_set_origin(gn, filenamestr, line_number); gt_queue_add(genome_nodes, gn); } else { /* process tab delimited GTF line */ gt_splitter_reset(splitter); gt_splitter_split(splitter, line, line_length, '\t'); 
if (gt_splitter_size(splitter) != 9UL) { gt_error_set(err, "line " GT_WU " in file \"%s\" contains " GT_WU " tab (\\t) " "separated fields instead of 9", line_number, filename, gt_splitter_size(splitter)); had_err = -1; break; } tokens = gt_splitter_get_tokens(splitter); seqname = tokens[0]; source = tokens[1]; feature = tokens[2]; start = tokens[3]; end = tokens[4]; score = tokens[5]; strand = tokens[6]; frame = tokens[7]; attributes = tokens[8]; /* parse feature */ if (GTF_feature_type_get(>f_feature_type, feature) == -1) { /* we skip unknown features */ fprintf(stderr, "skipping line " GT_WU " in file \"%s\": unknown " "feature: \"%s\"\n", line_number, filename, feature); gt_str_reset(line_buffer); continue; } /* translate into GFF3 feature type */ switch (gtf_feature_type) { case GTF_CDS: case GTF_stop_codon: gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker, gt_ft_CDS); type = gt_ft_CDS; break; case GTF_exon: gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker, gt_ft_exon); type = gt_ft_exon; } gt_assert(gff_type_is_valid); /* parse the range */ had_err = gt_parse_range(&range, start, end, line_number, filename, err); HANDLE_ERROR; /* process seqname (we have to do it here because we need the range) */ gt_region_node_builder_add_region(parser->region_node_builder, seqname, range); /* parse the score */ had_err = gt_parse_score(&score_is_defined, &score_value, score, line_number, filename, err); HANDLE_ERROR; /* parse the strand */ had_err = gt_parse_strand(>_strand_value, strand, line_number, filename, err); HANDLE_ERROR; /* parse the frame */ had_err = gt_parse_phase(&phase_value, frame, line_number, filename, err); HANDLE_ERROR; /* parse the attributes */ gt_splitter_reset(attribute_splitter); gene_id = NULL; transcript_id = NULL; gt_splitter_split(attribute_splitter, attributes, strlen(attributes), ';'); for (i = 0; i < gt_splitter_size(attribute_splitter); i++) { token = gt_splitter_get_token(attribute_splitter, i); /* skip 
leading blanks */ while (*token == ' ') token++; /* look for the two mandatory attributes */ if (strncmp(token, GENE_ID_ATTRIBUTE, strlen(GENE_ID_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(GENE_ID_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", GENE_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; gene_id = token + strlen(GENE_ID_ATTRIBUTE) + 1; } else if (strncmp(token, TRANSCRIPT_ID_ATTRIBUTE, strlen(TRANSCRIPT_ID_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(TRANSCRIPT_ID_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; transcript_id = token + strlen(TRANSCRIPT_ID_ATTRIBUTE) + 1; } else if (strncmp(token, GENE_NAME_ATTRIBUTE, strlen(GENE_NAME_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(GENE_NAME_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", GENE_NAME_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; gene_name = token + strlen(GENE_NAME_ATTRIBUTE) + 1; /* for output we want to strip quotes */ if (*gene_name == '"') gene_name++; if (gene_name[strlen(gene_name)-1] == '"') gene_name[strlen(gene_name)-1] = '\0'; } else if (strncmp(token, TRANSCRIPT_NAME_ATTRIBUTE, strlen(TRANSCRIPT_NAME_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(TRANSCRIPT_NAME_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", TRANSCRIPT_NAME_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; transcript_name = token + strlen(TRANSCRIPT_NAME_ATTRIBUTE) + 1; /* for output we want to strip quotes */ if (*transcript_name == '"') transcript_name++; if (transcript_name[strlen(transcript_name)-1] == '"') transcript_name[strlen(transcript_name)-1] = '\0'; } } /* check for the mandatory attributes */ if (!gene_id) { gt_error_set(err, "missing 
attribute \"%s\" on line " GT_WU " in file \"%s\"", GENE_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; if (!transcript_id) { gt_error_set(err, "missing attribute \"%s\" on line " GT_WU " in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; /* process the mandatory attributes */ if (!(transcript_id_hash = gt_hashmap_get(parser->gene_id_hash, gene_id))) { transcript_id_hash = gt_hashmap_new(GT_HASH_STRING, gt_free_func, (GtFree) gt_array_delete); gt_hashmap_add(parser->gene_id_hash, gt_cstr_dup(gene_id), transcript_id_hash); } gt_assert(transcript_id_hash); if (!(gt_genome_node_array = gt_hashmap_get(transcript_id_hash, transcript_id))) { gt_genome_node_array = gt_array_new(sizeof (GtGenomeNode*)); gt_hashmap_add(transcript_id_hash, gt_cstr_dup(transcript_id), gt_genome_node_array); } gt_assert(gt_genome_node_array); /* save optional gene_name and transcript_name attributes */ if (transcript_name && !gt_hashmap_get(parser->transcript_id_to_name_mapping, transcript_id)) { gt_hashmap_add(parser->transcript_id_to_name_mapping, gt_cstr_dup(transcript_id), gt_cstr_dup(transcript_name)); } if (gene_name && !gt_hashmap_get(parser->gene_id_to_name_mapping, gene_id)) { gt_hashmap_add(parser->gene_id_to_name_mapping, gt_cstr_dup(gene_id), gt_cstr_dup(gene_name)); } /* get seqid */ seqid_str = gt_hashmap_get(parser->seqid_to_str_mapping, seqname); if (!seqid_str) { seqid_str = gt_str_new_cstr(seqname); gt_hashmap_add(parser->seqid_to_str_mapping, gt_str_get(seqid_str), seqid_str); } gt_assert(seqid_str); /* construct the new feature */ gn = gt_feature_node_new(seqid_str, type, range.start, range.end, gt_strand_value); gt_genome_node_set_origin(gn, filenamestr, line_number); /* set source */ source_str = gt_hashmap_get(parser->source_to_str_mapping, source); if (!source_str) { source_str = gt_str_new_cstr(source); gt_hashmap_add(parser->source_to_str_mapping, gt_str_get(source_str), source_str); } 
gt_assert(source_str); gt_feature_node_set_source((GtFeatureNode*) gn, source_str); if (score_is_defined) gt_feature_node_set_score((GtFeatureNode*) gn, score_value); if (phase_value != GT_PHASE_UNDEFINED) gt_feature_node_set_phase((GtFeatureNode*) gn, phase_value); gt_array_add(gt_genome_node_array, gn); } gt_str_reset(line_buffer); } /* process all region nodes */ if (!had_err) gt_region_node_builder_build(parser->region_node_builder, genome_nodes); /* process all feature nodes */ cinfo.genome_nodes = genome_nodes; cinfo.gene_id_to_name_mapping = parser->gene_id_to_name_mapping; cinfo.transcript_id_to_name_mapping = parser->transcript_id_to_name_mapping; if (!had_err) { had_err = gt_hashmap_foreach(parser->gene_id_hash, construct_genes, &cinfo, err); } /* free */ gt_splitter_delete(splitter); gt_splitter_delete(attribute_splitter); gt_str_delete(line_buffer); return had_err; }