Example #1
0
/* Parse one score line of a score matrix file and store its scores in <sm>.

   The current token of <tz> has to be the single character naming the matrix
   row. Every following token on the same line is parsed as an integer score;
   the n-th score is assigned to the column denoted by the n-th entry of
   <index_to_alpha_char_mapping>. Characters which <sm->alphabet> encodes as
   WILDCARD are stored at index gt_alphabet_num_of_chars(sm->alphabet).

   <parsed_characters> is a table indexed by character value which records the
   row characters seen so far, so that a repeated row character is reported.
   Only the entries 0..127 are accessed by this function.

   Returns 0 on success. On failure a non-zero value is returned and <err> is
   set (for malformed scores the value and message come from
   gt_parse_int_line()). */
static int parse_score_line(GtScoreMatrix *sm, GtTokenizer *tz,
                            GtArray *index_to_alpha_char_mapping,
                            char *parsed_characters, GtError *err)
{
  unsigned int num_of_chars, i = 0;
  unsigned char table_idx;
  char amino_acid;
  int score, had_err = 0;
  GtStr *token;
  gt_assert(sm && tz && index_to_alpha_char_mapping);
  gt_error_check(err);
  token = gt_tokenizer_get_token(tz);
  gt_assert(token);
  amino_acid = gt_str_get(token)[0];
  /* convert via unsigned char: a plain char may be signed, and a byte >= 0x80
     would otherwise become a negative table index */
  table_idx = (unsigned char) amino_acid;
  if (gt_str_length(token) != 1 || table_idx > 0x7F) {
    gt_error_set(err, "illegal character token '%s' on line %lu in file '%s'",
                 gt_str_get(token), gt_tokenizer_get_line_number(tz),
                 gt_tokenizer_get_filename(tz));
    had_err = -1;
  }
  /* check for character duplications (only for a valid token, so that the
     first error is the one which gets reported) */
  else if (parsed_characters[table_idx]) {
    gt_error_set(err, "multiple character '%c' entry on line %lu in file '%s'",
                 amino_acid, gt_tokenizer_get_line_number(tz),
                 gt_tokenizer_get_filename(tz));
    had_err = -1;
  }
  else
    parsed_characters[table_idx] = GT_UNDEF_CHAR;
  gt_str_delete(token);
  if (!had_err) {
    num_of_chars = gt_alphabet_num_of_chars(sm->alphabet);
    gt_tokenizer_next_token(tz);
    while ((token = gt_tokenizer_get_token(tz))) {
      unsigned int idx1, idx2;
      /* the tokenizer can return tokens which are empty except for a newline
         -> skip these */
      if (!strcmp(gt_str_get(token), "\n")) {
        gt_str_delete(token);
        gt_tokenizer_next_token(tz);
        if (gt_tokenizer_line_start(tz))
          break;
        continue;
      }
      /* token is not empty -> parse score */
      had_err = gt_parse_int_line(&score, gt_str_get(token),
                                  gt_tokenizer_get_line_number(tz),
                                  gt_tokenizer_get_filename(tz), err);
      /* a line must not contain more scores than there are alphabet columns,
         otherwise the mapping below would be read out of bounds */
      if (!had_err && i >= gt_array_size(index_to_alpha_char_mapping)) {
        gt_error_set(err, "too many scores on line %lu in file '%s'",
                     gt_tokenizer_get_line_number(tz),
                     gt_tokenizer_get_filename(tz));
        had_err = -1;
      }
      if (had_err) {
        /* the token is still owned by us -> release it before leaving */
        gt_str_delete(token);
        break;
      }
      idx1 = gt_alphabet_encode(sm->alphabet, amino_acid);
      idx2 = gt_alphabet_encode(sm->alphabet, *(char*)
                                gt_array_get(index_to_alpha_char_mapping, i));
      gt_score_matrix_set_score(sm,
                                idx1 == WILDCARD ? num_of_chars : idx1,
                                idx2 == WILDCARD ? num_of_chars : idx2,
                                score);
      i++;
      gt_str_delete(token);
      gt_tokenizer_next_token(tz);
      /* the next token already belongs to the following line -> done */
      if (gt_tokenizer_line_start(tz))
        break;
    }
  }
  return had_err;
}
Example #2
0
/* Parse the alphabet line of a score matrix file.

   Tokens are read from <tz> and the first character of each token is appended
   to <index_to_alpha_char_mapping> (which has to be empty on entry), so that
   the array position of a character equals its column index. Parsing stops
   successfully at the first token which is a newline or which consists of a
   character directly followed by a newline.

   Returns 0 on success. Returns -1 and sets <err> if a token is longer than
   two characters, if a two-character token does not end with a newline, if a
   character appears more than once, or if the tokens are exhausted without a
   single character having been parsed. */
static int parse_alphabet_line(GtArray *index_to_alpha_char_mapping,
                               GtTokenizer *tz, GtError *err)
{
  GtStr *token;
  /* one slot for every possible unsigned char value (0..UCHAR_MAX) */
  char *tokenstr, amino_acid, parsed_characters[UCHAR_MAX + 1] = { 0 };
  int had_err = 0;
  gt_error_check(err);
  gt_assert(index_to_alpha_char_mapping && tz);
  gt_assert(!gt_array_size(index_to_alpha_char_mapping));
  while ((token = gt_tokenizer_get_token(tz))) {
    if (gt_str_length(token) > 2) {
      gt_error_set(err, "illegal character token '%s' on line %lu in file '%s'",
                gt_str_get(token), gt_tokenizer_get_line_number(tz),
                gt_tokenizer_get_filename(tz));
      had_err = -1;
      break;
    }
    tokenstr = gt_str_get(token);
    amino_acid = tokenstr[0];
    /* check for character duplications; the index is converted via
       unsigned char, because a plain char may be signed and would then yield
       a negative index for bytes >= 0x80 */
    if (parsed_characters[(unsigned char) amino_acid]) {
      gt_error_set(err, "the character '%c' appears more then once on line %lu "
                   "in file  '%s'", amino_acid,
                   gt_tokenizer_get_line_number(tz),
                   gt_tokenizer_get_filename(tz));
      had_err = -1;
      break;
    }
    parsed_characters[(unsigned char) amino_acid] = GT_UNDEF_CHAR;
    /* a newline token terminates the alphabet line */
    if (amino_acid == '\n') {
      gt_str_delete(token);
      gt_tokenizer_next_token(tz);
      gt_assert(!had_err);
      return 0;
    }
    gt_array_add(index_to_alpha_char_mapping, amino_acid);
    if (gt_str_length(token) == 2) {
      /* a two-character token is only legal as "<character><newline>", which
         denotes the last character of the alphabet line */
      if (tokenstr[1] != '\n') {
        gt_error_set(err, "illegal character token '%s' on line %lu in file "
                     "'%s'", gt_str_get(token),
                     gt_tokenizer_get_line_number(tz),
                     gt_tokenizer_get_filename(tz));
        had_err = -1;
        break;
      }
      gt_str_delete(token);
      gt_tokenizer_next_token(tz);
      gt_assert(!had_err);
      return 0;
    }
    gt_str_delete(token);
    gt_tokenizer_next_token(tz);
  }
  if (!had_err) {
    if (!gt_array_size(index_to_alpha_char_mapping)) {
      gt_error_set(err, "could not parse a single alphabet character in file "
                   "'%s' (file empty or directory?)",
                   gt_tokenizer_get_filename(tz));
      had_err = -1;
    }
  }
  /* releases the token of an error path which left the loop via break; after
     a regular loop termination the token is NULL */
  gt_str_delete(token);
  return had_err;
}
/* Runner of the gdiffcalc tool.

   Steps performed:
   1. append the remaining command line arguments (argv[parsed_args..argc-1])
      to <arguments->filenames>,
   2. load the encoded sequence named by <arguments->indexname>,
   3. create the unit info and, if <arguments->with_units> is set, read the
      unit file,
   4. read a table of floating point values from the first file in
      <arguments->filenames> into a num_of_genomes x num_of_genomes matrix;
      the elements of a table token are separated by ';' and elements which
      start with '#' are only logged as names,
   5. hand the matrix to gt_genomediff_calculate_div_from_avg().

   Returns 0 on success and a non-zero value on failure; the failures detected
   in this function itself set <err>. */
static int gt_gdiffcalc_runner(int argc, const char **argv, int parsed_args,
                              void *tool_arguments, GT_UNUSED GtError *err)
{
  GtGenomediffArguments *arguments = tool_arguments;
  int had_err = 0, i;
  GtUword lcounter = 0, zcounter = 0;
  double **shusums = NULL;
  GtEncseq              *encseq = NULL;
  GtLogger              *logger;
  GtShuUnitFileInfo     *unit_info = NULL;
  GtTimer               *timer = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose,
                         GT_LOGGER_DEFLT_PREFIX,
                         stdout);
  gt_assert(logger);

  for (i = parsed_args; i < argc; i++) {
    gt_str_array_add_cstr(arguments->filenames, argv[i]);
  }

  if (gt_showtime_enabled()) {
    timer = gt_timer_new_with_progress_description("load encseq");
    /* validate the timer before it is used for the first time */
    gt_assert(timer);
    gt_timer_start(timer);
  }

  if (arguments->with_units) {
    gt_logger_log(logger, "unitfile option set, filename is %s\n",
                  gt_str_get(arguments->unitfile));
  }

  if (!had_err) {
    GtEncseqLoader *el = gt_encseq_loader_new_from_options(arguments->loadopts,
                                                           err);
    encseq =
      gt_encseq_loader_load(el, gt_str_get(arguments->indexname), err);
    gt_encseq_loader_delete(el);
  }
  if (encseq == NULL)
    had_err = -1;

  if (timer != NULL)
    gt_timer_show_progress(timer, "load units", stdout);

  if (!had_err) {
    unit_info = gt_shu_unit_info_new(encseq);
    if (arguments->with_units)
      had_err = gt_shu_unit_file_info_read(arguments->unitfile, unit_info,
                                           logger, err);
  }

  if (timer != NULL)
    gt_timer_show_progress(timer, "read table", stdout);

  if (!had_err) {
    GtIO *table_file = NULL;
    GtTokenizer *tokenizer = NULL;
    GtStr *line = NULL;

    gt_assert(unit_info != NULL);
    gt_array2dim_calloc(shusums, unit_info->num_of_genomes,
                        unit_info->num_of_genomes);

    table_file = gt_io_new(gt_str_array_get(arguments->filenames, 0), "r");
    tokenizer = gt_tokenizer_new(table_file);
    line = gt_tokenizer_get_token(tokenizer);
    while (line != NULL && !had_err) {
      /* strtok() modifies the token buffer in place; this is fine, because
         the token is deleted right after it has been processed */
      char *cline = gt_str_get(line);
      char *elem = strtok(cline, ";");
      zcounter = 0;
      while (elem != NULL && !had_err) {
        if (*elem != '#') {
          /* the matrix has num_of_genomes rows and columns; refuse to write
             beyond it if the table contains more values than that */
          if (lcounter >= unit_info->num_of_genomes ||
              zcounter >= unit_info->num_of_genomes) {
            had_err = -1;
            gt_error_set(err, "table has more rows or columns than there are "
                         "genomes");
            break;
          }
          if (1 != sscanf(elem, "%lf",
                          &shusums[lcounter][zcounter])) {
            had_err = 1;
            gt_error_set(err, "couldn't scan");
            break;
          }
          gt_logger_log(logger,"wert: %lf", shusums[lcounter][zcounter]);
          zcounter++;
        }
        else {
          /* elements starting with '#' are names and carry no value */
          gt_logger_log(logger, "name: %s", elem);
        }
        elem = strtok(NULL, ";");
      }
      gt_tokenizer_next_token(tokenizer);
      gt_str_delete(line);
      line = gt_tokenizer_get_token(tokenizer);
      lcounter++;
      /* lcounter is unsigned -> use the unsigned format macro */
      gt_logger_log(logger, "line "GT_WU"", lcounter);
    }
    /* if the loop was left because of an error, the token fetched last is
       still pending; after a regular termination it is NULL */
    gt_str_delete(line);
    /* NOTE(review): the tokenizer is assumed to take ownership of table_file
       and to release it together with itself -- confirm against the
       GtTokenizer API before adding a separate gt_io_delete() */
    gt_tokenizer_delete(tokenizer);
  }
  if (!had_err) {
    GtUword num_of_seq, file_idx, seq_idx, startpos;
    GT_UNUSED GtUword oldpos = 0;

    gt_assert(unit_info != NULL);
    gt_assert(lcounter == zcounter);
    gt_assert(lcounter == unit_info->num_of_genomes);

    num_of_seq = gt_encseq_num_of_sequences(unit_info->encseq);

    /* debug output only: log for each sequence where it starts and to which
       file/genome it belongs, and check that start positions do not
       decrease */
    for (seq_idx = 0; seq_idx < num_of_seq; seq_idx++) {
      startpos = gt_encseq_seqstartpos(unit_info->encseq, seq_idx);
      file_idx = gt_encseq_filenum(unit_info->encseq, startpos);
      gt_log_log("seq: "GT_WU" starts at: "GT_WU"\n"
                 "belonges to file: "GT_WU" which is part of genome: %s",
                 seq_idx, startpos, file_idx,
                 gt_str_array_get(unit_info->genome_names,
                                  unit_info->map_files[file_idx]));
      gt_assert(oldpos <= startpos);
      oldpos = startpos;
    }
  }
  if (!had_err && shusums != NULL) {
    had_err = gt_genomediff_calculate_div_from_avg(shusums, arguments,
                                                   unit_info,
                                                   logger, timer, err);
  }
  /* release the matrix on every path, also if reading the table failed */
  if (shusums != NULL) {
    gt_array2dim_delete(shusums);
  }

  if (timer != NULL) {
    gt_timer_show_progress_final(timer, stdout);
    gt_timer_delete(timer);
  }
  gt_logger_delete(logger);
  gt_encseq_delete(encseq);
  gt_shu_unit_info_delete(unit_info);
  return had_err;
}