/* Parse one score line of a score-matrix file: a single alphabet character
   followed by one integer score per column. <parsed_characters> records
   which characters were already seen (indexed by byte value) to detect
   duplicate rows. <index_to_alpha_char_mapping> maps column index ->
   alphabet character (filled by parse_alphabet_line()).
   Returns 0 on success, -1 on error (<err> is set). */
static int parse_score_line(GtScoreMatrix *sm, GtTokenizer *tz,
                            GtArray *index_to_alpha_char_mapping,
                            char *parsed_characters, GtError *err)
{
  unsigned int num_of_chars, i = 0;
  char amino_acid;
  int score, had_err = 0;
  GtStr *token;
  gt_assert(sm && tz && index_to_alpha_char_mapping);
  gt_error_check(err);
  token = gt_tokenizer_get_token(tz);
  gt_assert(token);
  if (gt_str_length(token) != 1) {
    gt_error_set(err, "illegal character token '%s' on line %lu in file '%s'",
                 gt_str_get(token), gt_tokenizer_get_line_number(tz),
                 gt_tokenizer_get_filename(tz));
    had_err = -1;
  }
  amino_acid = gt_str_get(token)[0];
  /* check for character duplications; index via unsigned char, because a
     plain (possibly signed) char could produce a negative array index for
     byte values >= 0x80 */
  if (parsed_characters[(unsigned char) amino_acid]) {
    gt_error_set(err, "multiple character '%c' entry on line %lu in file '%s'",
                 amino_acid, gt_tokenizer_get_line_number(tz),
                 gt_tokenizer_get_filename(tz));
    had_err = -1;
  }
  parsed_characters[(unsigned char) amino_acid] = GT_UNDEF_CHAR;
  gt_str_delete(token);
  if (!had_err) {
    num_of_chars = gt_alphabet_num_of_chars(sm->alphabet);
    gt_tokenizer_next_token(tz);
    while ((token = gt_tokenizer_get_token(tz))) {
      unsigned int idx1, idx2;
      /* the tokenizer can return tokens which are empty except for a newline
         -> skip these */
      if (!strcmp(gt_str_get(token), "\n")) {
        gt_str_delete(token);
        gt_tokenizer_next_token(tz);
        if (gt_tokenizer_line_start(tz))
          break;
        continue;
      }
      /* token is not empty -> parse score */
      had_err = gt_parse_int_line(&score, gt_str_get(token),
                                  gt_tokenizer_get_line_number(tz),
                                  gt_tokenizer_get_filename(tz), err);
      if (had_err)
        break;
      idx1 = gt_alphabet_encode(sm->alphabet, amino_acid);
      idx2 = gt_alphabet_encode(sm->alphabet,
                                *(char*) gt_array_get(
                                           index_to_alpha_char_mapping, i));
      /* wildcard characters are stored in the extra row/column at
         position <num_of_chars> */
      gt_score_matrix_set_score(sm,
                                idx1 == WILDCARD ? num_of_chars : idx1,
                                idx2 == WILDCARD ? num_of_chars : idx2,
                                score);
      i++;
      gt_str_delete(token);
      gt_tokenizer_next_token(tz);
      if (gt_tokenizer_line_start(tz))
        break;
    }
  }
  return had_err;
}
/* Parse the alphabet line (column headers) of a score-matrix file and fill
   <index_to_alpha_char_mapping> with the characters in column order.
   Consumes tokens up to and including the end of the line.
   Returns 0 on success, -1 on error (<err> is set). */
static int parse_alphabet_line(GtArray *index_to_alpha_char_mapping,
                               GtTokenizer *tz, GtError *err)
{
  GtStr *token;
  /* UCHAR_MAX+1 slots so every possible byte value (0..UCHAR_MAX) has a
     valid index; the original UCHAR_MAX-sized array was one element short */
  char *tokenstr, amino_acid, parsed_characters[UCHAR_MAX + 1] = { 0 };
  int had_err = 0;
  gt_error_check(err);
  gt_assert(index_to_alpha_char_mapping && tz);
  gt_assert(!gt_array_size(index_to_alpha_char_mapping));
  while ((token = gt_tokenizer_get_token(tz))) {
    if (gt_str_length(token) > 2) {
      gt_error_set(err, "illegal character token '%s' on line %lu in file "
                   "'%s'", gt_str_get(token), gt_tokenizer_get_line_number(tz),
                   gt_tokenizer_get_filename(tz));
      had_err = -1;
      break;
    }
    tokenstr = gt_str_get(token);
    amino_acid = tokenstr[0];
    /* check for character duplications; index via unsigned char to avoid a
       negative array index if plain char is signed */
    if (parsed_characters[(unsigned char) amino_acid]) {
      gt_error_set(err, "the character '%c' appears more than once on line "
                   "%lu in file '%s'", amino_acid,
                   gt_tokenizer_get_line_number(tz),
                   gt_tokenizer_get_filename(tz));
      had_err = -1;
      break;
    }
    parsed_characters[(unsigned char) amino_acid] = GT_UNDEF_CHAR;
    /* a bare newline token ends the alphabet line */
    if (amino_acid == '\n') {
      gt_str_delete(token);
      gt_tokenizer_next_token(tz);
      gt_assert(!had_err);
      return 0;
    }
    gt_array_add(index_to_alpha_char_mapping, amino_acid);
    if (gt_str_length(token) == 2) {
      /* a two-character token is only legal if the second char is the
         terminating newline */
      if (tokenstr[1] != '\n') {
        gt_error_set(err, "illegal character token '%s' on line %lu in file "
                     "'%s'", gt_str_get(token),
                     gt_tokenizer_get_line_number(tz),
                     gt_tokenizer_get_filename(tz));
        had_err = -1;
        break;
      }
      gt_str_delete(token);
      gt_tokenizer_next_token(tz);
      gt_assert(!had_err);
      return 0;
    }
    gt_str_delete(token);
    gt_tokenizer_next_token(tz);
  }
  if (!had_err) {
    if (!gt_array_size(index_to_alpha_char_mapping)) {
      gt_error_set(err, "could not parse a single alphabet character in file "
                   "'%s' (file empty or directory?)",
                   gt_tokenizer_get_filename(tz));
      had_err = -1;
    }
  }
  gt_str_delete(token);
  return had_err;
}
/* Tool runner: loads an encoded sequence and (optionally) a genome-unit
   file, reads a semicolon-separated table of average shu-sums from the
   first filename argument, and hands the table to
   gt_genomediff_calculate_div_from_avg(). Returns 0 on success, non-zero
   on error (<err> is set). */
static int gt_gdiffcalc_runner(int argc, const char **argv, int parsed_args,
                               void *tool_arguments, GT_UNUSED GtError *err)
{
  GtGenomediffArguments *arguments = tool_arguments;
  int had_err = 0, i;
  GtUword lcounter = 0, zcounter = 0;
  double **shusums = NULL;
  GtEncseq *encseq = NULL;
  GtLogger *logger;
  GtShuUnitFileInfo *unit_info = NULL;
  GtTimer *timer = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stdout);
  gt_assert(logger);

  /* remaining command-line arguments are input filenames */
  for (i = parsed_args; i < argc; i++) {
    gt_str_array_add_cstr(arguments->filenames, argv[i]);
  }

  if (gt_showtime_enabled()) {
    timer = gt_timer_new_with_progress_description("load encseq");
    gt_timer_start(timer);
    gt_assert(timer);
  }

  if (arguments->with_units) {
    gt_logger_log(logger, "unitfile option set, filename is %s\n",
                  gt_str_get(arguments->unitfile));
  }

  if (!had_err) {
    GtEncseqLoader *el = gt_encseq_loader_new_from_options(arguments->loadopts,
                                                           err);
    encseq = gt_encseq_loader_load(el, gt_str_get(arguments->indexname), err);
    gt_encseq_loader_delete(el);
  }
  if (encseq == NULL)
    had_err = -1;

  if (timer != NULL)
    gt_timer_show_progress(timer, "load units", stdout);

  if (!had_err) {
    unit_info = gt_shu_unit_info_new(encseq);
    if (arguments->with_units)
      had_err = gt_shu_unit_file_info_read(arguments->unitfile, unit_info,
                                           logger, err);
  }

  if (timer != NULL)
    gt_timer_show_progress(timer, "read table", stdout);

  if (!had_err) {
    GtIO *table_file = NULL;
    GtTokenizer *tokenizer = NULL;
    GtStr *line = NULL;
    gt_assert(unit_info != NULL);
    gt_array2dim_calloc(shusums, unit_info->num_of_genomes,
                        unit_info->num_of_genomes);
    table_file = gt_io_new(gt_str_array_get(arguments->filenames, 0), "r");
    tokenizer = gt_tokenizer_new(table_file);
    line = gt_tokenizer_get_token(tokenizer);
    while (line != NULL && !had_err) {
      char *cline = gt_str_get(line);
      char *elem = strtok(cline, ";");
      zcounter = 0;
      while (elem != NULL && !had_err) {
        if (*elem != '#') {
          /* bounds check: only the gt_assert() below guarded against an
             oversized table before, which is compiled out in NDEBUG builds
             and would allow a heap overflow */
          if (lcounter >= unit_info->num_of_genomes ||
              zcounter >= unit_info->num_of_genomes) {
            had_err = 1;
            gt_error_set(err, "table contains more entries than genomes");
            break;
          }
          if (1 != sscanf(elem, "%lf", &shusums[lcounter][zcounter])) {
            had_err = 1;
            gt_error_set(err, "couldn't scan");
            break;
          }
          gt_logger_log(logger, "wert: %lf", shusums[lcounter][zcounter]);
          zcounter++;
        }
        else {
          /* dead post-increment removed: the printed value was the
             pre-increment pointer and elem is reassigned right after */
          gt_logger_log(logger, "name: %s", elem);
        }
        elem = strtok(NULL, ";");
      }
      gt_tokenizer_next_token(tokenizer);
      gt_str_delete(line);
      line = gt_tokenizer_get_token(tokenizer);
      lcounter++;
      /* GT_WU: lcounter is an unsigned GtUword (GT_WD would mismatch) */
      gt_logger_log(logger, "line "GT_WU"", lcounter);
    }
    /* release the (possibly still held) token and the tokenizer, which
       also frees the underlying table_file GtIO — both leaked before */
    gt_str_delete(line);
    gt_tokenizer_delete(tokenizer);
  }

  if (!had_err) {
    GtUword num_of_seq, file_idx, seq_idx, startpos;
    GT_UNUSED GtUword oldpos = 0;
    gt_assert(unit_info != NULL);
    gt_assert(lcounter == zcounter);
    gt_assert(lcounter == unit_info->num_of_genomes);
    num_of_seq = gt_encseq_num_of_sequences(unit_info->encseq);
    for (seq_idx = 0; seq_idx < num_of_seq; seq_idx++) {
      startpos = gt_encseq_seqstartpos(unit_info->encseq, seq_idx);
      file_idx = gt_encseq_filenum(unit_info->encseq, startpos);
      gt_log_log("seq: "GT_WU" starts at: "GT_WU"\n"
                 "belonges to file: "GT_WU" which is part of genome: %s",
                 seq_idx, startpos, file_idx,
                 gt_str_array_get(unit_info->genome_names,
                                  unit_info->map_files[file_idx]));
      gt_assert(oldpos <= startpos);
      oldpos = startpos;
    }
  }

  if (!had_err && shusums != NULL) {
    had_err = gt_genomediff_calculate_div_from_avg(shusums, arguments,
                                                   unit_info, logger, timer,
                                                   err);
    gt_array2dim_delete(shusums);
  }

  if (timer != NULL) {
    gt_timer_show_progress_final(timer, stdout);
    gt_timer_delete(timer);
  }
  gt_logger_delete(logger);
  gt_encseq_delete(encseq);
  gt_shu_unit_info_delete(unit_info);
  return had_err;
}