static int gt_gdiffcalc_runner(int argc, const char **argv, int parsed_args,
                              void *tool_arguments, GT_UNUSED GtError *err)
{
  GtGenomediffArguments *arguments = tool_arguments;
  int had_err = 0, i;
  GtUword lcounter = 0, zcounter = 0;
  double **shusums = NULL;
  GtEncseq              *encseq = NULL;
  GtLogger              *logger;
  GtShuUnitFileInfo     *unit_info = NULL;
  GtTimer               *timer = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose,
                         GT_LOGGER_DEFLT_PREFIX,
                         stdout);
  gt_assert(logger);

  for (i = parsed_args; i < argc; i++) {
    gt_str_array_add_cstr(arguments->filenames, argv[i]);
  }

  if (gt_showtime_enabled()) {
    timer = gt_timer_new_with_progress_description("load encseq");
    gt_timer_start(timer);
    gt_assert(timer);
  }

  if (arguments->with_units) {
    gt_logger_log(logger, "unitfile option set, filename is %s\n",
                  gt_str_get(arguments->unitfile));
  }

  if (!had_err) {
    GtEncseqLoader *el = gt_encseq_loader_new_from_options(arguments->loadopts,
                                                           err);
    encseq =
      gt_encseq_loader_load(el, gt_str_get(arguments->indexname), err);
    gt_encseq_loader_delete(el);
  }
  if (encseq == NULL)
    had_err = -1;

  if (timer != NULL)
    gt_timer_show_progress(timer, "load units", stdout);

  if (!had_err) {
    unit_info = gt_shu_unit_info_new(encseq);
    if (arguments->with_units)
      had_err = gt_shu_unit_file_info_read(arguments->unitfile, unit_info,
                                           logger, err);
  }

  if (timer != NULL)
    gt_timer_show_progress(timer, "read table", stdout);

  if (!had_err) {
    GtIO *table_file = NULL;
    GtTokenizer *tokenizer = NULL;
    GtStr *line = NULL;

    gt_assert(unit_info != NULL);
    gt_array2dim_calloc(shusums, unit_info->num_of_genomes,
                        unit_info->num_of_genomes);

    table_file = gt_io_new(gt_str_array_get(arguments->filenames, 0), "r");
    tokenizer = gt_tokenizer_new(table_file);
    line = gt_tokenizer_get_token(tokenizer);
    while (line != NULL && !had_err) {
      char *cline = gt_str_get(line);
      char *elem = strtok(cline, ";");
      zcounter = 0;
      while (elem != NULL && !had_err) {
        if (*elem != '#') {
          if (1 != sscanf(elem, "%lf",
                          &shusums[lcounter][zcounter])) {
            had_err = 1;
            gt_error_set(err, "couldn't scan");
            break;
          }
          gt_logger_log(logger,"wert: %lf", shusums[lcounter][zcounter]);
          zcounter++;
        }
        else {
          gt_logger_log(logger, "name: %s", elem++);
        }
        elem = strtok(NULL, ";");
      }
      gt_tokenizer_next_token(tokenizer);
      gt_str_delete(line);
      line = gt_tokenizer_get_token(tokenizer);
      lcounter++;
      gt_logger_log(logger, "line "GT_WD"", lcounter);
    }
  }
  if (!had_err) {
    GtUword num_of_seq, file_idx, seq_idx, startpos;
    GT_UNUSED GtUword oldpos = 0;

    gt_assert(unit_info != NULL);
    gt_assert(lcounter == zcounter);
    gt_assert(lcounter == unit_info->num_of_genomes);

    num_of_seq = gt_encseq_num_of_sequences(unit_info->encseq);

    for (seq_idx = 0; seq_idx < num_of_seq; seq_idx++) {
      startpos = gt_encseq_seqstartpos(unit_info->encseq, seq_idx);
      file_idx = gt_encseq_filenum(unit_info->encseq, startpos);
      gt_log_log("seq: "GT_WU" starts at: "GT_WU"\n"
                 "belonges to file: "GT_WU" which is part of genome: %s",
                 seq_idx, startpos, file_idx,
                 gt_str_array_get(unit_info->genome_names,
                                  unit_info->map_files[file_idx]));
      gt_assert(oldpos <= startpos);
      oldpos = startpos;
    }
  }
  if (!had_err && shusums != NULL) {
    had_err = gt_genomediff_calculate_div_from_avg(shusums, arguments,
                                                   unit_info,
                                                   logger, timer, err);
    gt_array2dim_delete(shusums);
  }

  if (timer != NULL) {
    gt_timer_show_progress_final(timer, stdout);
    gt_timer_delete(timer);
  }
  gt_logger_delete(logger);
  gt_encseq_delete(encseq);
  gt_shu_unit_info_delete(unit_info);
  return had_err;
}
Example #2
0
static GtOPrval parse_options(int *parsed_args,
                              Cmppairwiseopt *pw,
                              int argc, const char **argv, GtError *err)
{
  GtOptionParser *op;
  GtOption *optionstrings,
         *optionfiles,
         *optioncharlistlen,
         *optiontext,
         *optionshowedist,
         *optionprint;
  GtStrArray *charlistlen;
  GtOPrval oprval;

  gt_error_check(err);
  charlistlen = gt_str_array_new();
  pw->strings = gt_str_array_new();
  pw->files = gt_str_array_new();
  pw->text = gt_str_new();
  pw->charlistlen = NULL;
  pw->fastasequences0 = NULL;
  pw->fastasequences1 = NULL;
  pw->showedist = false;
  pw->print = false;
  pw->fasta = false;
  op = gt_option_parser_new("options", "Apply function to pairs of strings.");
  gt_option_parser_set_mail_address(op, "<*****@*****.**>");

  optionstrings = gt_option_new_string_array("ss", "use two strings",
                                             pw->strings);
  gt_option_parser_add_option(op, optionstrings);

  optionfiles = gt_option_new_filename_array("ff", "use two files",
                                             pw->files);
  gt_option_parser_add_option(op, optionfiles);

  optioncharlistlen = gt_option_new_string_array("a",
                                             "use character list and length",
                                             charlistlen);
  gt_option_parser_add_option(op, optioncharlistlen);

  optiontext = gt_option_new_string("t", "use text", pw->text, NULL);
  gt_option_parser_add_option(op, optiontext);

  optionshowedist = gt_option_new_bool("e", "output unit edit distance",
                      &pw->showedist, false);
  gt_option_parser_add_option(op, optionshowedist);

  optionprint = gt_option_new_bool("p", "print edist alignment",
                      &pw->print, false);
  gt_option_parser_add_option(op, optionprint);

  gt_option_exclude(optionstrings, optionfiles);
  gt_option_exclude(optionstrings, optioncharlistlen);
  gt_option_exclude(optionstrings, optiontext);
  gt_option_exclude(optionfiles, optioncharlistlen);
  gt_option_exclude(optionfiles, optiontext);
  gt_option_exclude(optioncharlistlen, optiontext);
  gt_option_imply(optionshowedist, optionstrings);
  gt_option_imply(optionprint, optionstrings);

  oprval = gt_option_parser_parse(op, parsed_args, argc, argv, gt_versionfunc,
                                  err);
  if (oprval == GT_OPTION_PARSER_OK)
  {
    if (gt_option_is_set(optionstrings))
    {
      if (gt_str_array_size(pw->strings) != 2UL)
      {
        gt_error_set(err, "option -ss requires two string arguments");
        oprval = GT_OPTION_PARSER_ERROR;
      }
    } else
    {
      if (gt_option_is_set(optionfiles))
      {
        if (gt_str_array_size(pw->files) != 2UL)
        {
          if (gt_str_array_size(pw->files) == 3UL &&
              !strcmp(gt_str_array_get(pw->files,0),"fasta"))
          {
            pw->fasta = true;
          }
          if (!pw->fasta)
          {
            gt_error_set(err, "option -ff requires two filename arguments or "
                              "keyword fasta and two filename arguments in "
                              "FASTA format");
            oprval = GT_OPTION_PARSER_ERROR;
          }
        }
      } else
      {
        if (gt_option_is_set(optioncharlistlen))
        {
          GtWord readint;
          if (gt_str_array_size(charlistlen) != 2UL)
          {
            gt_error_set(err,
                         "option -a requires charlist and length argument");
            oprval = GT_OPTION_PARSER_ERROR;
          }else
          {
            pw->charlistlen = gt_malloc(sizeof *pw->charlistlen);
            pw->charlistlen->charlist =
              gt_str_ref(gt_str_array_get_str(charlistlen,
                                                                    0));
            if (sscanf(gt_str_array_get(charlistlen,1UL), GT_WD, &readint) != 1
                || readint < 1L)
            {
              gt_error_set(err,
                           "option -a requires charlist and length argument");
              oprval = GT_OPTION_PARSER_ERROR;
            }
            pw->charlistlen->len = (GtUword) readint;
          }
        } else
        {
          if (!gt_option_is_set(optiontext))
          {
            gt_error_set(err,
                         "use exactly one of the options -ss, -ff, -a, -t");
            oprval = GT_OPTION_PARSER_ERROR;
          }
        }
      }
    }
  }
  gt_option_parser_delete(op);
  if (oprval == GT_OPTION_PARSER_OK && *parsed_args != argc)
  {
    gt_error_set(err, "superfluous program parameters");
    oprval = GT_OPTION_PARSER_ERROR;
  }
  gt_str_array_delete(charlistlen);
  return oprval;
}
Example #3
0
/* Create lists of all GtBlocks in the diagram. */
static int collect_blocks(GT_UNUSED void *key, void *value, void *data,
                          GT_UNUSED GtError *err)
{
  NodeInfoElement *ni = (NodeInfoElement*) value;
  GtDiagram *diagram = (GtDiagram*) data;
  GtBlock *block = NULL;
  GtStr *trackid_str;
  GtUword i = 0;
  trackid_str = gt_str_new();
  for (i = 0; i < gt_str_array_size(ni->types); i++) {
    const char *type;
    GtUword j;
    GtArray *list;
    PerTypeInfo *type_struc = NULL;
    GtBlock* mainblock = NULL;
    type = gt_str_array_get(ni->types, i);
    type_struc = gt_hashmap_get(ni->type_index, type);
    gt_assert(type_struc);
    for (j=0; j<gt_array_size(type_struc->blocktuples); j++)
    {
      GtBlockTuple *bt;
      bt  = *(GtBlockTuple**) gt_array_get(type_struc->blocktuples, j);
      if (bt->rep == GT_UNDEF_REPR && type_struc->must_merge)
      {
        block = mainblock = gt_block_ref(bt->block);
        gt_block_delete(mainblock);
        gt_free(bt);
        continue;
      }
      else
      {
        if (mainblock)
        {
          block = gt_block_clone(mainblock);
          gt_block_merge(block, bt->block);
          gt_block_delete(bt->block);
        } else block = bt->block;
      }
      gt_assert(block);
      gt_str_reset(trackid_str);
      /* execute hook for track selector function */
      diagram->select_func(block, trackid_str, diagram->ptr);

      if (!(list = (GtArray*) gt_hashmap_get(diagram->blocks,
                                             gt_str_get(trackid_str))))
      {
        list = gt_array_new(sizeof (GtBlock*));
        gt_hashmap_add(diagram->blocks, gt_cstr_dup(gt_str_get(trackid_str)),
                       list);
      };
      gt_assert(list);
      gt_array_add(list, block);
      gt_free(bt);
    }
    gt_array_delete(type_struc->blocktuples);
    gt_hashmap_delete(type_struc->rep_index);
    gt_block_delete(mainblock);
  }
  gt_hashmap_delete(ni->type_index);
  gt_str_array_delete(ni->types);
  gt_free(ni);
  gt_str_delete(trackid_str);
  return 0;
}
Example #4
0
void gt_stat_visitor_show_stats(GtNodeVisitor *nv, GtFile *outfp)
{
  GtStatVisitor *sv = stat_visitor_cast(nv);
  if (sv->number_of_sequence_regions) {
    gt_file_xprintf(outfp, "sequence regions: %lu (total length: %llu)\n",
                    sv->number_of_sequence_regions,
                    sv->total_length_of_sequence_regions);
  }
  if (sv->number_of_multi_features) {
    gt_file_xprintf(outfp, "multi-features: %lu\n",
                    sv->number_of_multi_features);
  }
  if (sv->number_of_genes)
    gt_file_xprintf(outfp, "genes: %lu\n", sv->number_of_genes);
  if (sv->number_of_protein_coding_genes) {
    gt_file_xprintf(outfp, "protein-coding genes: %lu\n",
                    sv->number_of_protein_coding_genes);
  }
  if (sv->number_of_mRNAs)
    gt_file_xprintf(outfp, "mRNAs: %lu\n", sv->number_of_mRNAs);
  if (sv->number_of_protein_coding_mRNAs) {
    gt_file_xprintf(outfp, "protein-coding mRNAs: %lu\n",
                    sv->number_of_protein_coding_mRNAs);
  }
  if (sv->number_of_exons)
    gt_file_xprintf(outfp, "exons: %lu\n", sv->number_of_exons);
  if (sv->number_of_CDSs)
    gt_file_xprintf(outfp, "CDSs: %lu\n", sv->number_of_CDSs);
  if (sv->number_of_LTR_retrotransposons) {
    gt_file_xprintf(outfp, "LTR_retrotransposons: %lu\n",
                    sv->number_of_LTR_retrotransposons);
  }
  if (sv->gene_length_distribution) {
    gt_file_xprintf(outfp, "gene length distribution:\n");
    gt_disc_distri_show(sv->gene_length_distribution, outfp);
  }
  if (sv->gene_score_distribution) {
    gt_file_xprintf(outfp, "gene score distribution:\n");
    gt_disc_distri_show(sv->gene_score_distribution, outfp);
  }
  if (sv->exon_length_distribution) {
    gt_file_xprintf(outfp, "exon length distribution:\n");
    gt_disc_distri_show(sv->exon_length_distribution, outfp);
  }
  if (sv->exon_number_distribution) {
    gt_file_xprintf(outfp, "exon number distribution:\n");
    gt_disc_distri_show(sv->exon_number_distribution, outfp);
  }
  if (sv->intron_length_distribution) {
    gt_file_xprintf(outfp, "intron length distribution:\n");
    gt_disc_distri_show(sv->intron_length_distribution, outfp);
  }
  if (sv->cds_length_distribution) {
    gt_file_xprintf(outfp, "CDS length distribution:\n");
    gt_disc_distri_show(sv->cds_length_distribution, outfp);
  }
  if (sv->used_sources) {
    GtStrArray *sources;
    unsigned long i;
    gt_file_xprintf(outfp, "used source tags:\n");
    sources = gt_cstr_table_get_all(sv->used_sources);
    for (i = 0; i < gt_str_array_size(sources); i++)
      gt_file_xprintf(outfp, "%s\n", gt_str_array_get(sources, i));
    gt_str_array_delete(sources);
  }
}
Example #5
0
static int gt_tyr_occratio_arguments_check(int rest_argc,
                                           void *tool_arguments,
                                           GtError *err)
{
  Tyr_occratio_options *arguments = tool_arguments;
  bool haserr = false;

  Optionargmodedesc outputmodedesctable[] =
  {
    {"unique","number of unique mers",TYROCC_OUTPUTUNIQUE},
    {"nonunique","number of nonunique mers (single count)",
                 TYROCC_OUTPUTNONUNIQUE},
    {"nonuniquemulti","number of nonunique mers (multi count)",
                 TYROCC_OUTPUTNONUNIQUEMULTI},
    {"relative","fraction of unique/non-unique mers relative to all mers",
                 TYROCC_OUTPUTRELATIVE},
    {"total","number of all mers",TYROCC_OUTPUTTOTAL}
  };
  if (rest_argc != 0)
  {
    gt_error_set(err,"superfluous arguments");
    return -1;
  }
  if (gt_option_is_set(arguments->refoptionmersizes))
  {
    unsigned long *mersizes = NULL;
    unsigned long idx,
                  numofmersizes = gt_str_array_size(arguments->mersizesstrings);
    if (numofmersizes == 0)
    {
      gt_error_set(err,"missing argument to option -mersizes:");
      haserr = true;
    } else
    {
      mersizes = gt_malloc(sizeof(*mersizes) * numofmersizes);
      for (idx=0; idx<numofmersizes; idx++)
      {
        long readnum;

        if (sscanf(gt_str_array_get(arguments->mersizesstrings,idx),
                   "%ld",&readnum) != 1 || readnum <= 0)
        {
          gt_error_set(err,"invalid argument \"%s\" of option -mersizes: "
                       "must be a positive integer",
                       gt_str_array_get(arguments->mersizesstrings,idx));
          haserr = true;
          break;
        }
        mersizes[idx] = (unsigned long) readnum;
        if (idx > 0 && mersizes[idx-1] >= mersizes[idx])
        {
          gt_error_set(err,"invalid argumnt %s to option -mersizes: "
                       "positive numbers must be strictly increasing",
                       gt_str_array_get(arguments->mersizesstrings,idx));
          haserr = true;
          break;
        }
      }
    }
    if (!haserr)
    {
      gt_assert(mersizes != NULL);
      arguments->minmersize = mersizes[0];
      arguments->maxmersize = mersizes[numofmersizes-1];
      INITBITTAB(arguments->outputvector,arguments->maxmersize+1);
      for (idx=0; idx<numofmersizes; idx++)
      {
        SETIBIT(arguments->outputvector,mersizes[idx]);
      }
    }
    gt_free(mersizes);
  } else
  {
    if (arguments->minmersize == 0)
    {
      gt_error_set(err,"if option -mersizes is not used, then option "
                       "-minmersize is mandatory");
      haserr = true;
    }
    if (!haserr)
    {
      if (arguments->maxmersize == 0)
      {
        gt_error_set(err,"if option -mersizes is not used, then option "
                         "-maxmersize is mandatory");
        haserr = true;
      }
    }
    if (!haserr)
    {
      if (arguments->minmersize > arguments->maxmersize)
      {
        gt_error_set(err,"minimum mer size must not be larger than "
                         "maximum mer size");
        haserr = true;
      }
    }
    if (!haserr)
    {
      if (arguments->minmersize+arguments->stepmersize > arguments->maxmersize)
      {
        gt_error_set(err,"minimum mer size + step value must be smaller or "
                         "equal to maximum mersize");
        haserr = true;
      }
    }
    if (!haserr)
    {
      unsigned long outputval;

      INITBITTAB(arguments->outputvector,arguments->maxmersize+1);
      for (outputval = arguments->minmersize;
           outputval <= arguments->maxmersize;
           outputval += arguments->stepmersize)
      {
        SETIBIT(arguments->outputvector,outputval);
      }
    }
  }
  if (!haserr)
  {
    unsigned long idx;
    for (idx=0; idx<gt_str_array_size(arguments->outputspec); idx++)
    {
      if (optionargaddbitmask(outputmodedesctable,
                           sizeof (outputmodedesctable)/
                           sizeof (outputmodedesctable[0]),
                           &arguments->outputmode,
                           "-output",
                           gt_str_array_get(arguments->outputspec,idx),
                           err) != 0)
      {
        haserr = true;
        break;
      }
    }
  }
  if (!haserr)
  {
    if ((arguments->outputmode & TYROCC_OUTPUTRELATIVE) &&
        !(arguments->outputmode &
            (TYROCC_OUTPUTUNIQUE | TYROCC_OUTPUTNONUNIQUE |
                                   TYROCC_OUTPUTNONUNIQUEMULTI)))
    {
      gt_error_set(err,"argument relative to option -output requires that one "
                   "of the arguments unique, nonunique, or nonuniquemulti "
                   "is used");
      haserr = true;
    }
  }
  return haserr ? - 1: 0;
}
Example #6
0
static int gt_encseq_info_runner(GT_UNUSED int argc, const char **argv,
                           int parsed_args, void *tool_arguments,
                           GtError *err)
{
  GtEncseqInfoArguments *arguments = tool_arguments;
  int had_err = 0;
  GtAlphabet *alpha;
  const GtUchar *chars;
  gt_error_check(err);
  gt_assert(arguments);

  if (arguments->nomap) {
    GtEncseqMetadata *emd = gt_encseq_metadata_new(argv[parsed_args], err);
    if (!emd)
      had_err = -1;

    if (!had_err) {
      if (!arguments->noindexname) {
        gt_file_xprintf(arguments->outfp, "index name: ");
        gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]);
      }

      gt_file_xprintf(arguments->outfp, "file format version: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                          gt_encseq_metadata_version(emd));

      gt_file_xprintf(arguments->outfp, "64-bit file: ");
      gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_metadata_is64bit(emd)
                                                  ? "yes"
                                                  : "no");

      gt_file_xprintf(arguments->outfp, "total length: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_metadata_total_length(emd));

      gt_file_xprintf(arguments->outfp, "number of sequences: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                      gt_encseq_metadata_num_of_sequences(emd));

      gt_file_xprintf(arguments->outfp, "number of files: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_metadata_num_of_files(emd));

      gt_file_xprintf(arguments->outfp, "length of shortest/longest "
                                        "sequence: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n",
                                        gt_encseq_metadata_min_seq_length(emd),
                                        gt_encseq_metadata_max_seq_length(emd));

      gt_file_xprintf(arguments->outfp, "accesstype: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                 gt_encseq_access_type_str(gt_encseq_metadata_accesstype(emd)));

      alpha = gt_encseq_metadata_alphabet(emd);
      chars = gt_alphabet_characters(alpha);
      gt_file_xprintf(arguments->outfp, "alphabet size: ");
      gt_file_xprintf(arguments->outfp, "%u\n",
                                        gt_alphabet_num_of_chars(alpha));
      gt_file_xprintf(arguments->outfp, "alphabet characters: ");
      gt_file_xprintf(arguments->outfp, "%.*s", gt_alphabet_num_of_chars(alpha),
                                        (char*) chars);
      if (gt_alphabet_is_dna(alpha))
        gt_file_xprintf(arguments->outfp, " (DNA)");
      if (gt_alphabet_is_protein(alpha))
        gt_file_xprintf(arguments->outfp, " (Protein)");
      gt_file_xprintf(arguments->outfp, "\n");
      if (arguments->show_alphabet) {
        GtStr *out = gt_str_new();
        gt_alphabet_to_str(alpha, out);
        gt_file_xprintf(arguments->outfp, "alphabet definition:\n");
        gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out));
        gt_str_delete(out);
      }

    }
    gt_encseq_metadata_delete(emd);
  } else {
    GtEncseqLoader *encseq_loader;
    GtEncseq *encseq;

    encseq_loader = gt_encseq_loader_new();
    if (arguments->mirror)
      gt_encseq_loader_mirror(encseq_loader);
    if (!(encseq = gt_encseq_loader_load(encseq_loader,
                                         argv[parsed_args], err)))
      had_err = -1;

    if (!had_err) {
      const GtStrArray *filenames;
      GtUword i;

      if (!arguments->noindexname) {
        gt_file_xprintf(arguments->outfp, "index name: ");
        gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]);
      }

      gt_file_xprintf(arguments->outfp, "file format version: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_version(encseq));

      gt_file_xprintf(arguments->outfp, "64-bit file: ");
      gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_is_64_bit(encseq)
                                                   ? "yes"
                                                   : "no");

      gt_file_xprintf(arguments->outfp, "total length: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_total_length(encseq));

      gt_file_xprintf(arguments->outfp, "compressed size: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU" bytes\n",
                                        gt_encseq_sizeofrep(encseq));

      gt_file_xprintf(arguments->outfp, "number of sequences: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_num_of_sequences(encseq));

      gt_file_xprintf(arguments->outfp, "number of files: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_num_of_files(encseq));

      gt_file_xprintf(arguments->outfp, "length of shortest/longest "
                                        "sequence: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n",
                                      gt_encseq_min_seq_length(encseq),
                                      gt_encseq_max_seq_length(encseq));

      filenames = gt_encseq_filenames(encseq);
      gt_file_xprintf(arguments->outfp, "original filenames:\n");
      for (i = 0; i < gt_str_array_size(filenames); i++) {
        gt_file_xprintf(arguments->outfp, "\t%s ("GT_WU" characters)\n",
                                          gt_str_array_get(filenames, i),
                                          (GtUword)
                                     gt_encseq_effective_filelength(encseq, i));
      }

      alpha = gt_encseq_alphabet(encseq);
      chars = gt_alphabet_characters(alpha);
      gt_file_xprintf(arguments->outfp, "alphabet size: ");
      gt_file_xprintf(arguments->outfp, "%u\n",
                                        gt_alphabet_num_of_chars(alpha));
      gt_file_xprintf(arguments->outfp, "alphabet characters: ");
      gt_file_xprintf(arguments->outfp, "%.*s", gt_alphabet_num_of_chars(alpha),
                                        (char*) chars);
      if (gt_alphabet_is_dna(alpha))
        gt_file_xprintf(arguments->outfp, " (DNA)");
      if (gt_alphabet_is_protein(alpha))
        gt_file_xprintf(arguments->outfp, " (Protein)");
      gt_file_xprintf(arguments->outfp, "\n");
      if (arguments->show_alphabet) {
        GtStr *out = gt_str_new();
        gt_alphabet_to_str(alpha, out);
        gt_file_xprintf(arguments->outfp, "alphabet definition:\n");
        gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out));
        gt_str_delete(out);
      }

      gt_file_xprintf(arguments->outfp, "character distribution:\n");
      for (i = 0; i < gt_alphabet_num_of_chars(alpha); i++) {
        GtUword cc;
        cc = gt_encseq_charcount(encseq, gt_alphabet_encode(alpha, chars[i]));
        gt_file_xprintf(arguments->outfp, "\t%c: "GT_WU" (%.2f%%)\n",
                                          (char) chars[i],
                                          cc,
                             (cc /(double) (gt_encseq_total_length(encseq)
                                  - gt_encseq_num_of_sequences(encseq)+1))*100);
      }

      gt_file_xprintf(arguments->outfp, "number of wildcards: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n",
                                        gt_encseq_wildcards(encseq),
                                        gt_encseq_realwildcardranges(encseq));

      gt_file_xprintf(arguments->outfp, "number of special characters: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n",
                                        gt_encseq_specialcharacters(encseq),
                                        gt_encseq_realspecialranges(encseq));

      gt_file_xprintf(arguments->outfp, "length of longest non-special "
                                        "character stretch: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                   gt_encseq_lengthoflongestnonspecial(encseq));

      gt_file_xprintf(arguments->outfp, "accesstype: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                   gt_encseq_access_type_str(gt_encseq_accesstype_get(encseq)));

      gt_file_xprintf(arguments->outfp, "bits used per character: ");
      gt_file_xprintf(arguments->outfp, "%f\n",
        (double) ((uint64_t) CHAR_BIT *
                  (uint64_t) gt_encseq_sizeofrep(encseq)) /
        (double) gt_encseq_total_length(encseq));

      gt_file_xprintf(arguments->outfp, "has special ranges: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                                        gt_encseq_has_specialranges(encseq)
                                          ? "yes"
                                          : "no");

      gt_file_xprintf(arguments->outfp, "has description support: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                                       gt_encseq_has_description_support(encseq)
                                          ? "yes"
                                          : "no");

      if (gt_encseq_has_description_support(encseq)) {
        gt_file_xprintf(arguments->outfp, "length of longest description: ");
        gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                          gt_encseq_max_desc_length(encseq));
      }

      gt_file_xprintf(arguments->outfp, "has multiple sequence support: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                                        gt_encseq_has_multiseq_support(encseq)
                                          ? "yes"
                                          : "no");
    }
    gt_encseq_delete(encseq);
    gt_encseq_loader_delete(encseq_loader);
  }

  return had_err;
}
GtPdomModelSet* gt_pdom_model_set_new(GtStrArray *hmmfiles, GtError *err)
{
  GtStr *concat_dbnames, *cmdline, *indexfilename = NULL;
  GtUword i;
  char *md5_hash, ch;
  const char *tmpdir;
  int had_err = 0, rval;
  FILE *dest;
  GtPdomModelSet *pdom_model_set;
  gt_assert(hmmfiles);
  gt_error_check(err);

  rval = system("hmmpress -h > /dev/null");
  if (rval == -1) {
    gt_error_set(err, "error executing system(hmmpress)");
    return NULL;
  }
#ifndef _WIN32
  if (WEXITSTATUS(rval) != 0) {
    gt_error_set(err, "cannot find the hmmpress executable in PATH");
    return NULL;
  }
#else
  /* XXX */
  gt_error_set(err, "hmmpress for Windows not implemented");
  return NULL;
#endif

  pdom_model_set = gt_calloc((size_t) 1, sizeof (GtPdomModelSet));
  concat_dbnames = gt_str_new();
  for (i = 0; !had_err && i < gt_str_array_size(hmmfiles); i++) {
    const char *filename = gt_str_array_get(hmmfiles, i);
    if (!gt_file_exists(filename)) {
      gt_error_set(err, "invalid HMM file: %s", filename);
      gt_str_delete(concat_dbnames);
      gt_free(pdom_model_set);
      return NULL;
    } else {
      gt_str_append_cstr(concat_dbnames, filename);
    }
  }
  if (!had_err) {
    pdom_model_set->filename = gt_str_new();
    if (!(tmpdir = getenv("TMPDIR")))
      tmpdir = "/tmp";
    gt_str_append_cstr(pdom_model_set->filename, tmpdir);
    gt_str_append_char(pdom_model_set->filename, GT_PATH_SEPARATOR);
    md5_hash = gt_md5_fingerprint(gt_str_get(concat_dbnames),
                                  gt_str_length(concat_dbnames));
    gt_str_append_cstr(pdom_model_set->filename, md5_hash);
    gt_free(md5_hash);
    gt_str_delete(concat_dbnames);
    indexfilename = gt_str_new_cstr(gt_str_get(pdom_model_set->filename));
    gt_str_append_cstr(indexfilename, GT_HMM_INDEX_SUFFIX);
  }

  if (!gt_file_exists(gt_str_get(indexfilename))) {
    dest = fopen(gt_str_get(pdom_model_set->filename), "w+");
    if (!dest) {
      gt_error_set(err, "could not create file %s",
                 gt_str_get(pdom_model_set->filename));
      had_err = -1;
    }
    if (!had_err) {
      for (i = 0; !had_err && i < gt_str_array_size(hmmfiles); i++) {
        FILE *source;
        const char *filename = gt_str_array_get(hmmfiles, i);
        source = fopen(filename, "r");
        if (!source) {
          gt_error_set(err, "could not open HMM file %s", filename);
          had_err = -1;
        }
        if (!had_err) {
          while (( ch = fgetc(source)) != EOF)
            (void) fputc(ch, dest);
          (void) fclose(source);
        }
      }
      (void) fclose(dest);
    }
    /* XXX: read hmmer path from env */
    cmdline = gt_str_new_cstr("hmmpress -f ");
    gt_str_append_str(cmdline, pdom_model_set->filename);
    gt_str_append_cstr(cmdline, "> /dev/null");   /* XXX: portability? */

    rval = system(gt_str_get(cmdline));
    gt_str_delete(cmdline);
    if (rval == -1) {
      gt_error_set(err, "error executing system(hmmpress)");
      return NULL;
    }
#ifndef _WIN32
    if (WEXITSTATUS(rval) != 0) {
      gt_error_set(err, "an error occurred during HMM preprocessing");
      had_err = -1;
    }
#else
    gt_error_set(err, "WEXITSTATUS not implemented on Windows");
    had_err = -1;
#endif
  }

  if (had_err) {
    gt_pdom_model_set_delete(pdom_model_set);
    pdom_model_set = NULL;
  }
  gt_str_delete(indexfilename);
  return pdom_model_set;
}
Example #8
0
int gt_gtf_parser_parse(GtGTFParser *parser, GtQueue *genome_nodes,
                        GtStr *filenamestr, GtFile *fpin, bool be_tolerant,
                        GtError *err)
{
  GtStr *seqid_str, *source_str, *line_buffer;
  char *line;
  size_t line_length;
  GtUword i, line_number = 0;
  GtGenomeNode *gn;
  GtRange range;
  GtPhase phase_value;
  GtStrand gt_strand_value;
  GtSplitter *splitter, *attribute_splitter;
  float score_value;
  char *seqname,
       *source,
       *feature,
       *start,
       *end,
       *score,
       *strand,
       *frame,
       *attributes,
       *token,
       *gene_id,
       *gene_name = NULL,
       *transcript_id,
       *transcript_name = NULL,
       **tokens;
  GtHashmap *transcript_id_hash; /* map from transcript id to array of genome
                                    nodes */
  GtArray *gt_genome_node_array;
  ConstructionInfo cinfo;
  GTF_feature_type gtf_feature_type;
  GT_UNUSED bool gff_type_is_valid = false;
  const char *type = NULL;
  const char *filename;
  bool score_is_defined;
  int had_err = 0;

  gt_assert(parser && genome_nodes);
  gt_error_check(err);

  filename = gt_str_get(filenamestr);

  /* alloc */
  line_buffer = gt_str_new();
  splitter = gt_splitter_new(),
  attribute_splitter = gt_splitter_new();

#define HANDLE_ERROR                                                   \
        if (had_err) {                                                 \
          if (be_tolerant) {                                           \
            fprintf(stderr, "skipping line: %s\n", gt_error_get(err)); \
            gt_error_unset(err);                                       \
            gt_str_reset(line_buffer);                                 \
            had_err = 0;                                               \
            continue;                                                  \
          }                                                            \
          else {                                                       \
            had_err = -1;                                              \
            break;                                                     \
          }                                                            \
        }

  while (gt_str_read_next_line_generic(line_buffer, fpin) != EOF) {
    line = gt_str_get(line_buffer);
    line_length = gt_str_length(line_buffer);
    line_number++;
    gene_name = gene_id = transcript_id = transcript_name = NULL;
    had_err = 0;

    if (line_length == 0) {
      gt_warning("skipping blank line " GT_WU " in file \"%s\"", line_number,
                 filename);
    }
    else if (line[0] == '#') {
      /* storing comment */
      if (line_length >= 2 && line[1] == '#')
        gn = gt_comment_node_new(line+2); /* store '##' line as '#' line */
      else
        gn = gt_comment_node_new(line+1);
      gt_genome_node_set_origin(gn, filenamestr, line_number);
      gt_queue_add(genome_nodes, gn);
    }
    else {
      bool stop_codon = false;
      char *tokendup, *attrkey;
      GtStrArray *attrkeys, *attrvals;

      /* process tab delimited GTF line */
      gt_splitter_reset(splitter);
      gt_splitter_split(splitter, line, line_length, '\t');
      if (gt_splitter_size(splitter) != 9UL) {
        gt_error_set(err, "line " GT_WU " in file \"%s\" contains " GT_WU
                     " tab (\\t) " "separated fields instead of 9", line_number,
                     filename,
                  gt_splitter_size(splitter));
        had_err = -1;
        break;
      }
      tokens = gt_splitter_get_tokens(splitter);
      seqname    = tokens[0];
      source     = tokens[1];
      feature    = tokens[2];
      start      = tokens[3];
      end        = tokens[4];
      score      = tokens[5];
      strand     = tokens[6];
      frame      = tokens[7];
      attributes = tokens[8];

      /* parse feature */
      if (GTF_feature_type_get(&gtf_feature_type, feature) == -1) {
        /* we skip unknown features */
        fprintf(stderr, "skipping line " GT_WU " in file \"%s\": unknown "
                "feature: \"%s\"\n", line_number, filename, feature);
        gt_str_reset(line_buffer);
        continue;
      }

      /* translate into GFF3 feature type */
      switch (gtf_feature_type) {
        case GTF_stop_codon:
          stop_codon = true;
        case GTF_CDS:
          gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker,
                                                       gt_ft_CDS);
          type = gt_ft_CDS;
          break;
        case GTF_exon:
          gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker,
                                                       gt_ft_exon);
          type = gt_ft_exon;
          break;
        case GTF_start_codon:
          /* we can skip the start codons, they are part of the CDS anyway */
          gt_str_reset(line_buffer);
          continue;
      }
      gt_assert(gff_type_is_valid);

      /* parse the range */
      had_err = gt_parse_range(&range, start, end, line_number, filename, err);
      HANDLE_ERROR;

      /* process seqname (we have to do it here because we need the range) */
      gt_region_node_builder_add_region(parser->region_node_builder, seqname,
                                        range);

      /* parse the score */
      had_err = gt_parse_score(&score_is_defined, &score_value, score,
                               line_number, filename, err);
      HANDLE_ERROR;

      /* parse the strand */
      had_err = gt_parse_strand(&gt_strand_value, strand, line_number, filename,
                               err);
      HANDLE_ERROR;

      /* parse the frame */
      had_err = gt_parse_phase(&phase_value, frame, line_number, filename, err);
      HANDLE_ERROR;

      /* parse the attributes */
      attrkeys = gt_str_array_new();
      attrvals = gt_str_array_new();
      gt_splitter_reset(attribute_splitter);
      gene_id = NULL;
      transcript_id = NULL;
      gt_splitter_split(attribute_splitter, attributes, strlen(attributes),
                        ';');
      for (i = 0; i < gt_splitter_size(attribute_splitter); i++) {
        token = gt_splitter_get_token(attribute_splitter, i);
        /* skip leading blanks */
        while (*token == ' ')
          token++;

        tokendup = gt_cstr_dup(token);
        attrkey = strtok(tokendup, " ");
        if (attrkey) {
          char *attrval = strtok(NULL, " ");
          if (attrval == NULL || strcmp(attrval, "") == 0 ||
              strcmp(attrval, "\"\"") == 0)
          {
            gt_error_set(err, "missing value to attribute \"%s\" on line "
                         GT_WU " in file \"%s\"", attrkey,line_number,filename);
            had_err = -1;
          }
          HANDLE_ERROR;

          if (*attrval == '"')
            attrval++;
          if (attrval[strlen(attrval)-1] == '"')
            attrval[strlen(attrval)-1] = '\0';
          gt_assert(attrkey && strlen(attrkey) > 0);
          gt_assert(attrval && strlen(attrval) > 0);
          gt_str_array_add_cstr(attrkeys, attrkey);
          gt_str_array_add_cstr(attrvals, attrval);
        }
        gt_free(tokendup);

        /* look for the two mandatory attributes */
        if (strncmp(token, GENE_ID_ATTRIBUTE, strlen(GENE_ID_ATTRIBUTE)) == 0) {
          if (strlen(token) + 2 < strlen(GENE_ID_ATTRIBUTE)) {
            gt_error_set(err, "missing value to attribute \"%s\" on line "
                         GT_WU "in file \"%s\"", GENE_ID_ATTRIBUTE, line_number,
                         filename);
            had_err = -1;
          }
          HANDLE_ERROR;
          gene_id = token + strlen(GENE_ID_ATTRIBUTE) + 1;
          if (*gene_id == '"')
            gene_id++;
          if (gene_id[strlen(gene_id)-1] == '"')
            gene_id[strlen(gene_id)-1] = '\0';
        }
        else if (strncmp(token, TRANSCRIPT_ID_ATTRIBUTE,
                         strlen(TRANSCRIPT_ID_ATTRIBUTE)) == 0) {
          if (strlen(token) + 2 < strlen(TRANSCRIPT_ID_ATTRIBUTE)) {
            gt_error_set(err, "missing value to attribute \"%s\" on line "
                         GT_WU "in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE,
                         line_number, filename);
            had_err = -1;
          }
          HANDLE_ERROR;
          transcript_id = token + strlen(TRANSCRIPT_ID_ATTRIBUTE) + 1;
          if (*transcript_id == '"')
            transcript_id++;
          if (transcript_id[strlen(transcript_id)-1] == '"')
            transcript_id[strlen(transcript_id)-1] = '\0';
        }
        else if (strncmp(token, GENE_NAME_ATTRIBUTE,
                         strlen(GENE_NAME_ATTRIBUTE)) == 0) {
          if (strlen(token) + 2 < strlen(GENE_NAME_ATTRIBUTE)) {
            gt_error_set(err, "missing value to attribute \"%s\" on line "
                         GT_WU "in file \"%s\"", GENE_NAME_ATTRIBUTE,
                         line_number, filename);
            had_err = -1;
          }
          HANDLE_ERROR;
          gene_name = token + strlen(GENE_NAME_ATTRIBUTE) + 1;
          /* for output we want to strip quotes */
          if (*gene_name == '"')
            gene_name++;
          if (gene_name[strlen(gene_name)-1] == '"')
            gene_name[strlen(gene_name)-1] = '\0';
        }
        else if (strncmp(token, TRANSCRIPT_NAME_ATTRIBUTE,
                         strlen(TRANSCRIPT_NAME_ATTRIBUTE)) == 0) {
          if (strlen(token) + 2 < strlen(TRANSCRIPT_NAME_ATTRIBUTE)) {
            gt_error_set(err, "missing value to attribute \"%s\" on line "
                         GT_WU "in file \"%s\"", TRANSCRIPT_NAME_ATTRIBUTE,
                         line_number, filename);
            had_err = -1;
          }
          HANDLE_ERROR;
          transcript_name = token + strlen(TRANSCRIPT_NAME_ATTRIBUTE) + 1;
          /* for output we want to strip quotes */
          if (*transcript_name == '"')
            transcript_name++;
          if (transcript_name[strlen(transcript_name)-1] == '"')
            transcript_name[strlen(transcript_name)-1] = '\0';
        }
      }

      /* check for the mandatory attributes */
      if (!gene_id) {
        gt_error_set(err, "missing attribute \"%s\" on line " GT_WU
                     " in file \"%s\"", GENE_ID_ATTRIBUTE, line_number,
                     filename);
        had_err = -1;
      }
      HANDLE_ERROR;
      if (!transcript_id) {
        gt_error_set(err, "missing attribute \"%s\" on line " GT_WU
                     " in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE, line_number,
                     filename);
        had_err = -1;
      }
      HANDLE_ERROR;

      /* process the mandatory attributes */
      if (!(transcript_id_hash = gt_hashmap_get(parser->gene_id_hash,
                                             gene_id))) {
        transcript_id_hash = gt_hashmap_new(GT_HASH_STRING, gt_free_func,
                                            (GtFree) gt_array_delete);
        gt_hashmap_add(parser->gene_id_hash, gt_cstr_dup(gene_id),
                    transcript_id_hash);
      }
      gt_assert(transcript_id_hash);

      if (!(gt_genome_node_array = gt_hashmap_get(transcript_id_hash,
                                            transcript_id))) {
        gt_genome_node_array = gt_array_new(sizeof (GtGenomeNode*));
        gt_hashmap_add(transcript_id_hash, gt_cstr_dup(transcript_id),
                    gt_genome_node_array);
      }
      gt_assert(gt_genome_node_array);

      /* save optional gene_name and transcript_name attributes */
      if (transcript_name && strlen(transcript_name) > 0
            && !gt_hashmap_get(parser->transcript_id_to_name_mapping,
                             transcript_id)) {
        gt_hashmap_add(parser->transcript_id_to_name_mapping,
                    gt_cstr_dup(transcript_id),
                    gt_cstr_dup(transcript_name));
      }
      if (gene_name && strlen(gene_name) > 0
            && !gt_hashmap_get(parser->gene_id_to_name_mapping,
                                    gene_id)) {
        gt_hashmap_add(parser->gene_id_to_name_mapping,
                    gt_cstr_dup(gene_id),
                    gt_cstr_dup(gene_name));
      }

      /* get seqid */
      seqid_str = gt_hashmap_get(parser->seqid_to_str_mapping, seqname);
      if (!seqid_str) {
        seqid_str = gt_str_new_cstr(seqname);
        gt_hashmap_add(parser->seqid_to_str_mapping, gt_str_get(seqid_str),
                       seqid_str);
      }
      gt_assert(seqid_str);

      /* construct the new feature */
      gn = gt_feature_node_new(seqid_str, type, range.start, range.end,
                                 gt_strand_value);
      gt_genome_node_set_origin(gn, filenamestr, line_number);
      if (stop_codon) {
        gt_feature_node_add_attribute((GtFeatureNode*) gn,
                                      GTF_PARSER_STOP_CODON_FLAG, "true");
      }
      for (i = 0; i < gt_str_array_size(attrkeys); i++) {
        GtFeatureNode *fn = (GtFeatureNode *)gn;
        const char *key = gt_str_array_get(attrkeys, i);
        const char *val = gt_str_array_get(attrvals, i);

        /* Not a comprehensive solution to ensure correct encoding, just bare
           minimum required to get Cufflinks output parsed */
        if (strcmp(val, "=") == 0)
          val = "%26";

        if (gt_feature_node_get_attribute(fn, key) != NULL) {
          const char *oldval = gt_feature_node_get_attribute(fn, key);
          GtStr *newval = gt_str_new_cstr(oldval);
          gt_str_append_char(newval, ',');
          gt_str_append_cstr(newval, val);
          gt_feature_node_set_attribute(fn, key, gt_str_get(newval));
          gt_str_delete(newval);
        }
        else
          gt_feature_node_add_attribute(fn, key, val);
      }
      gt_str_array_delete(attrkeys);
      gt_str_array_delete(attrvals);

      /* set source */
      source_str = gt_hashmap_get(parser->source_to_str_mapping, source);
      if (!source_str) {
        source_str = gt_str_new_cstr(source);
        gt_hashmap_add(parser->source_to_str_mapping, gt_str_get(source_str),
                    source_str);
      }
      gt_assert(source_str);
      gt_feature_node_set_source((GtFeatureNode*) gn, source_str);

      if (score_is_defined)
        gt_feature_node_set_score((GtFeatureNode*) gn, score_value);
      if (phase_value != GT_PHASE_UNDEFINED)
        gt_feature_node_set_phase((GtFeatureNode*) gn, phase_value);
      gt_array_add(gt_genome_node_array, gn);
    }

    gt_str_reset(line_buffer);
  }

  /* process all region nodes */
  if (!had_err)
    gt_region_node_builder_build(parser->region_node_builder, genome_nodes);

  /* process all feature nodes */
  cinfo.genome_nodes = genome_nodes;
  cinfo.tidy = be_tolerant;
  cinfo.gene_id_to_name_mapping = parser->gene_id_to_name_mapping;
  cinfo.transcript_id_to_name_mapping = parser->transcript_id_to_name_mapping;
  if (!had_err) {
    had_err = gt_hashmap_foreach(parser->gene_id_hash, construct_genes,
                                 &cinfo, err);
  }
  gt_hashmap_foreach(parser->gene_id_hash, delete_genes, NULL, err);

  /* free */
  gt_splitter_delete(splitter);
  gt_splitter_delete(attribute_splitter);
  gt_str_delete(line_buffer);

  return had_err;
}
static int gt_sequence_buffer_fasta_advance(GtSequenceBuffer *sb, GtError *err)
{
  int currentchar, ret = 0;
  GtUword currentoutpos = 0, currentfileadd = 0, currentfileread = 0;
  GtSequenceBufferMembers *pvt;
  GtSequenceBufferFasta *sbf;

  gt_error_check(err);

  sbf = (GtSequenceBufferFasta*) sb;
  pvt = sb->pvt;
  while (true)
  {
    if (currentoutpos >= (GtUword) OUTBUFSIZE)
    {
      if (pvt->filelengthtab != NULL)
      {
        pvt->filelengthtab[pvt->filenum].length
          += (uint64_t) currentfileread;
        pvt->filelengthtab[pvt->filenum].effectivelength
          += (uint64_t) currentfileadd;
      }
      break;
    }
    if (sbf->nextfile)
    {
      if (pvt->filelengthtab != NULL)
      {
        pvt->filelengthtab[pvt->filenum].length = 0;
        pvt->filelengthtab[pvt->filenum].effectivelength = 0;
      }
      sbf->nextfile = false;
      sbf->indesc = false;
      sbf->firstseqinfile = true;
      currentfileadd = 0;
      currentfileread = 0;
      pvt->linenum = (uint64_t) 1;
      pvt->inputstream = gt_file_xopen(gt_str_array_get(pvt->filenametab,
                                                  (GtUword) pvt->filenum),
                                       "rb");
      pvt->currentinpos = 0;
      pvt->currentfillpos = 0;
    } else
    {
      currentchar = inlinebuf_getchar(sb, pvt->inputstream);
      if (currentchar == EOF)
      {
        gt_file_delete(pvt->inputstream);
        pvt->inputstream = NULL;
        if (pvt->filelengthtab != NULL)
        {
          pvt->filelengthtab[pvt->filenum].length += currentfileread;
          pvt->filelengthtab[pvt->filenum].effectivelength += currentfileadd;
        }
        if ((GtUword) pvt->filenum
                                     == gt_str_array_size(pvt->filenametab)-1)
        {
          pvt->complete = true;
          break;
        }
        pvt->filenum++;
        sbf->nextfile = true;
      } else
      {
        currentfileread++;
        if (sbf->indesc)
        {
          if (currentchar == NEWLINESYMBOL)
          {
            pvt->linenum++;
            sbf->indesc = false;
          }
          if (pvt->descptr != NULL)
          {
            if (currentchar == NEWLINESYMBOL)
            {
              gt_desc_buffer_finish(pvt->descptr);
            } else
            {
              if (currentchar != CRSYMBOL)
                gt_desc_buffer_append_char(pvt->descptr, currentchar);
            }
          }
        } else
        {
          if (!isspace((int) currentchar))
          {
            if (currentchar == FASTASEPARATOR)
            {
              if (sbf->firstoverallseq)
              {
                sbf->firstoverallseq = false;
                sbf->firstseqinfile = false;
              } else
              {
                if (sbf->firstseqinfile)
                {
                  sbf->firstseqinfile = false;
                } else
                {
                  currentfileadd++;
                }
                pvt->outbuf[currentoutpos++] = (unsigned char) SEPARATOR;
                pvt->lastspeciallength++;
              }
              sbf->indesc = true;
            } else
            {
              if ((ret = process_char(sb, currentoutpos,
                                      (unsigned char) currentchar, err)))
                return ret;
              currentoutpos++;
              currentfileadd++;
            }
          }
        }
      }
    }
  }
  if (sbf->firstoverallseq)
  {
    gt_error_set(err,"no sequences in multiple fasta file(s) %s ...",
              gt_str_array_get(pvt->filenametab,0));
    return -2;
  }
  pvt->nextfree = currentoutpos;
  return 0;
}
Example #10
0
static int gff3_in_stream_plain_next(GtNodeStream *ns, GtGenomeNode **gn,
                                     GtError *err)
{
  GtGFF3InStreamPlain *is = gff3_in_stream_plain_cast(ns);
  GtStr *filenamestr;
  int had_err = 0, status_code;

  gt_error_check(err);

  if (gt_queue_size(is->genome_node_buffer) > 1) {
    /* we still have at least two nodes in the buffer -> serve from there */
    *gn = gt_queue_get(is->genome_node_buffer);
    return 0;
  }

  /* the buffer is empty or has one element */
  gt_assert(gt_queue_size(is->genome_node_buffer) <= 1);

  for (;;) {
    /* open file if necessary */
    if (!is->file_is_open) {
      if (gt_str_array_size(is->files) &&
          is->next_file == gt_str_array_size(is->files)) {
        break;
      }
      if (gt_str_array_size(is->files)) {
        if (strcmp(gt_str_array_get(is->files, is->next_file), "-") == 0) {
          if (is->stdin_argument) {
            gt_error_set(err, "multiple specification of argument file \"-\"");
            had_err = -1;
            break;
          }
          is->fpin = gt_file_xopen(NULL, "r");
          is->file_is_open = true;
          is->stdin_argument = true;
        }
        else {
          is->fpin = gt_file_xopen(gt_str_array_get(is->files,
                                                       is->next_file), "r");
          is->file_is_open = true;
        }
        is->next_file++;
      }
      else {
        if (is->stdin_processed)
          break;
        is->fpin = NULL;
        is->file_is_open = true;
      }
      is->line_number = 0;

      if (!had_err && is->progress_bar) {
        printf("processing file \"%s\"\n", gt_str_array_size(is->files)
               ? gt_str_array_get(is->files, is->next_file-1) : "stdin");
      }
      if (!had_err && is->fpin && is->progress_bar) {
        gt_progressbar_start(&is->line_number,
                            gt_file_number_of_lines(gt_str_array_get(is->files,
                                                             is->next_file-1)));
      }
    }

    gt_assert(is->file_is_open);

    filenamestr = gt_str_array_size(is->files)
                  ? gt_str_array_get_str(is->files, is->next_file-1)
                  : is->stdinstr;
    /* read two nodes */
    had_err = gt_gff3_parser_parse_genome_nodes(is->gff3_parser, &status_code,
                                                is->genome_node_buffer,
                                                is->used_types, filenamestr,
                                                &is->line_number, is->fpin,
                                                err);
    if (had_err)
      break;
    if (status_code != EOF) {
      had_err = gt_gff3_parser_parse_genome_nodes(is->gff3_parser, &status_code,
                                                  is->genome_node_buffer,
                                                  is->used_types, filenamestr,
                                                  &is->line_number, is->fpin,
                                                  err);
      if (had_err)
        break;
    }

    if (status_code == EOF) {
      /* end of current file */
      if (is->progress_bar) gt_progressbar_stop();
      gt_file_delete(is->fpin);
      is->fpin = NULL;
      is->file_is_open = false;
      gt_gff3_parser_reset(is->gff3_parser);
      if (!gt_str_array_size(is->files)) {
        is->stdin_processed = true;
        break;
      }
      continue;
    }

    gt_assert(gt_queue_size(is->genome_node_buffer));

    /* make sure the parsed nodes are sorted */
    if (is->ensure_sorting && gt_queue_size(is->genome_node_buffer) > 1) {
      GtGenomeNode *last_node = NULL;
      /* a sorted stream can have at most one input file */
      gt_assert(gt_str_array_size(is->files) == 0 ||
                gt_str_array_size(is->files) == 1);
      had_err = gt_queue_iterate(is->genome_node_buffer, buffer_is_sorted,
                                 &last_node, err);
    }
    if (!had_err) {
      *gn = gt_queue_get(is->genome_node_buffer);
    }
    return had_err;
  }
  gt_assert(!gt_queue_size(is->genome_node_buffer));
  *gn = NULL;
  return had_err;
}