Пример #1
0
/* Callback that records an attribute name: appends <attr_name> to the
   GtStrArray passed in via <data>. The attribute value itself is not stored,
   it is only required to be non-NULL. */
static void store_attribute(const char *attr_name,
                            GT_UNUSED const char *attr_value, void *data)
{
  GtStrArray *collected_names;
  gt_assert(attr_name && attr_value && data);
  collected_names = (GtStrArray*) data;
  gt_str_array_add_cstr(collected_names, attr_name);
}
Пример #2
0
/* Creates a plain GFF3 input stream for <filename>. If <filename> is NULL the
   stream is created with an empty file list. The second argument passed to
   gff3_in_stream_plain_new() is true here (presumably the "sorted" flag,
   going by this function's name). */
GtNodeStream* gt_gff3_in_stream_plain_new_sorted(const char *filename)
{
  GtStrArray *input_files;
  input_files = gt_str_array_new();
  if (filename != NULL) {
    gt_str_array_add_cstr(input_files, filename);
  }
  return gff3_in_stream_plain_new(input_files, true);
}
Пример #3
0
/* Callback that appends the sequence part <seqpart> to the GtStrArray passed
   in via <data>. <length> is ignored. Always returns 0. */
static int save_fastaentry(const char *seqpart, GT_UNUSED GtUword length,
                           void *data, GT_UNUSED GtError* err)
{
  GtStrArray *collected_sequences = data;
  gt_error_check(err);
  gt_str_array_add_cstr(collected_sequences, seqpart);
  return 0;
}
Пример #4
0
/* Creates a plain GFF3 input stream for the first <num_of_files> entries of
   <filenames>. The second argument passed to gff3_in_stream_plain_new() is
   false here (presumably the "sorted" flag, going by this function's name). */
GtNodeStream* gt_gff3_in_stream_plain_new_unsorted(int num_of_files,
                                                   const char **filenames)
{
  GtStrArray *input_files = gt_str_array_new();
  int file_idx = 0;
  while (file_idx < num_of_files) {
    gt_str_array_add_cstr(input_files, filenames[file_idx]);
    file_idx++;
  }
  return gff3_in_stream_plain_new(input_files, false);
}
Пример #5
0
/* Runner of the encseq encode tool.
   Collects the input files argv[parsed_args] .. argv[argc-1]. If no index
   name was given (arguments->indexname is empty), the index name is derived
   from the basename of the input file, which is only possible if exactly one
   input file was given. Afterwards the files are encoded and, if requested
   via arguments->showstats, statistics about the encoded sequences are shown.
   Returns 0 on success and a non-zero value on failure; the failures detected
   in this function itself set <err> and return -1. */
static int gt_encseq_encode_runner(int argc, const char **argv,
                               int parsed_args, void *tool_arguments,
                               GtError *err)
{
  int had_err = 0,
      i;
  GtEncseqEncodeArguments *arguments =
                                      (GtEncseqEncodeArguments*) tool_arguments;
  GtStrArray *infiles;
  gt_error_check(err);
  gt_assert(arguments);

  infiles = gt_str_array_new();
  for (i = parsed_args; i < argc; i++) {
    gt_str_array_add_cstr(infiles, argv[i]);
  }

  if (gt_str_length(arguments->indexname) == 0UL) {
    if (gt_str_array_size(infiles) == 0UL) {
      /* without this check the basename lookup below would access element 0
         of an empty array */
      gt_error_set(err,"no input file given, cannot derive an index name");
      had_err = -1;
    } else if (gt_str_array_size(infiles) > 1UL) {
      gt_error_set(err,"if more than one input file is given, then "
                       "option -indexname is mandatory");
      had_err = -1;
    } else {
      /* gt_basename() returns a new string which we have to free */
      char *basenameptr;
      basenameptr = gt_basename(gt_str_array_get(infiles, 0UL));
      gt_str_set(arguments->indexname, basenameptr);
      gt_free(basenameptr);
    }
  }

  if (!had_err) {
    gt_assert(gt_str_length(arguments->indexname) > 0UL);
    had_err = encode_sequence_files(infiles,
                                    arguments->eopts,
                                    gt_str_get(arguments->indexname),
                                    arguments->verbose,
                                    arguments->no_esq_header,
                                    err);
  }

  if (!had_err && arguments->showstats)
    show_encoded_statistics(infiles, gt_str_get(arguments->indexname));

  gt_str_array_delete(infiles);
  return had_err;
}
Пример #6
0
/* Appends all values of the Lua table at stack position <index> to
   <outarray>. Every value must be a string (or a number, which Lua converts
   to a string); otherwise <err> is set and -1 is returned. Returns 0 on
   success. The Lua stack is left as it was found in both cases.
   NOTE: lua_next() is called after a key has been pushed, so a relative
   (negative) <index> would no longer refer to the table; callers should pass
   an absolute stack index. */
int gt_lua_get_table_as_strarray(lua_State *L, int index, GtStrArray *outarray,
                                 GtError *err)
{
  int had_err = 0;
  gt_assert(lua_istable(L, index));
  lua_pushnil(L); /* nil key: start the traversal at the first entry */
  while (!had_err && (lua_next(L, index) != 0))
  {
    /* the stack now holds the key at -2 and the value at -1 */
    if (!lua_isstring(L, -1)) {
      gt_error_set(err, "table contains non-string value!");
      had_err = -1;
      /* the traversal is aborted: remove value and key to rebalance the
         stack */
      lua_pop(L, 2);
    }
    else {
      gt_str_array_add_cstr(outarray, lua_tostring(L, -1));
      /* remove the value, keep the key for the next lua_next() call */
      lua_pop(L, 1);
    }
  }
  return had_err;
}
Пример #7
0
/* Lua binding: takes a sequence file name (argument 1) and pushes a new
   userdata holding a GtRegionMapping created from that single file. The
   userdata gets the REGION_MAPPING_METATABLE metatable. Returns 1 (the number
   of results left on the Lua stack). */
static int region_mapping_lua_new_seqfile(lua_State *L)
{
    GtRegionMapping **region_mapping;
    GtStrArray *file_list;
    const char *path;

    gt_assert(L);
    path = luaL_checkstring(L, 1);

    /* the userdata only stores a pointer to the region mapping */
    region_mapping = lua_newuserdata(L, sizeof *region_mapping);
    gt_assert(region_mapping);

    /* the string array is only needed while the region mapping is created */
    file_list = gt_str_array_new();
    gt_str_array_add_cstr(file_list, path);
    /* XXX: make second and third parameter available */
    *region_mapping = gt_region_mapping_new_seqfiles(file_list, false, false);
    gt_str_array_delete(file_list);

    luaL_getmetatable(L, REGION_MAPPING_METATABLE);
    lua_setmetatable(L, -2);
    return 1;
}
Пример #8
0
/* Extracts the sequences whose keys are listed in <fastakeyfile>.
   If exactly one file argument is given and gt_deskeysfileexists() succeeds
   for it, the argument is treated as an index name and the keys are looked up
   via gt_extractkeysfromfastaindex(). Otherwise all <argc> entries of <argv>
   are handed to gt_extractkeysfromfastafile() as reference files, with
   <outfp> as output. <width> is passed through to both extraction functions.
   Returns 0 on success and -1 on failure. */
static int process_fastakeyfile(GtStr *fastakeyfile, int argc,
                                const char **argv, unsigned long width,
                                GtFile *outfp, GtError *err)
{
  GtStr *indexname;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(gt_str_length(fastakeyfile));

  if (argc == 0) {
    gt_error_set(err,"option -keys requires at least one file argument");
    return -1;
  }

  indexname = gt_str_new_cstr(argv[0]);
  if (argc == 1 && gt_deskeysfileexists(indexname)) {
    /* index based lookup: reports success with 0 */
    if (gt_extractkeysfromfastaindex(indexname,fastakeyfile,width,err) != 0)
      had_err = -1;
  }
  else {
    GtStrArray *referencefiles = gt_str_array_new();
    int argidx = 0;

    while (argidx < argc) {
      gt_str_array_add_cstr(referencefiles, argv[argidx]);
      argidx++;
    }
    /* file based lookup: every return value other than 1 is a failure */
    if (gt_extractkeysfromfastafile(true, outfp, width, fastakeyfile,
                                    referencefiles, err) != 1)
      had_err = -1;
    gt_str_array_delete(referencefiles);
  }
  gt_str_delete(indexname);
  return had_err;
}
Пример #9
0
/* Parses the command line of the file statistics tool.
   Registers the spliced alignment filter options of
   <file_stat_info>->sa_filter and the verbose option, then parses <argv>.
   If the verbose flag is set, <file_stat_info>->showverbose is set to
   gth_show_on_stdout. If parsing succeeded, all remaining arguments are
   appended to <file_stat_info>->consensusfiles and <*parsed_args> is advanced
   to <argc>. Returns the result of the option parser. */
static GtOPrval gthfilestat_parse_options(int *parsed_args,
                                          GthFileStatInfo *file_stat_info,
                                          int argc, const char **argv,
                                          const GthPlugins *plugins,
                                          GtError *err)
{
  GtOptionParser *parser;
  GtOption *verbose_option;
  GtOPrval oprval;
  bool verbose;
  gt_error_check(err);

  parser = gt_option_parser_new("[option ...] [file ...]",
                         "Show statistics about "
                         "spliced alignments in GenomeThreader output files\n"
                         "containing intermediate results.");

  /* add sa_filter options */
  gth_sa_filter_register_options(parser, file_stat_info->sa_filter, false);

  /* -v */
  verbose_option = gt_option_new_verbose(&verbose);
  gt_option_parser_add_option(parser, verbose_option);

  gt_option_parser_set_mail_address(parser, "<*****@*****.**>");
  oprval = gt_option_parser_parse(parser, parsed_args, argc, argv,
                                  plugins->gth_version_func, err);

  if (verbose) {
    file_stat_info->showverbose = gth_show_on_stdout;
  }

  /* save consensus files */
  if (oprval == GT_OPTION_PARSER_OK) {
    for (; *parsed_args < argc; (*parsed_args)++) {
      gt_str_array_add_cstr(file_stat_info->consensusfiles,
                            argv[*parsed_args]);
    }
  }

  gt_option_parser_delete(parser);

  return oprval;
}
Пример #10
0
/* 'static' function.
   Returns a new GtStrArray containing one description of the form
   "<identity>: <name>" for every translation number in the range
   1 .. GT_SIZEOFTRANSRANGE-1 which is mapped to a translation scheme. */
GtStrArray* gt_trans_table_get_scheme_descriptions()
{
    GtStrArray *descriptions = gt_str_array_new();
    GtStr *description = gt_str_new();
    GtUword transnum;

    for (transnum = 1UL; transnum < (GtUword) GT_SIZEOFTRANSRANGE;
         transnum++) {
        /* only translation numbers with a scheme get a description */
        if (transnum2index[transnum] != GT_UNDEFTRANSNUM) {
            GtTranslationScheme *scheme =
                                   &schemetable[transnum2index[transnum]];
            /* the buffer is reused for every description */
            gt_str_reset(description);
            gt_str_append_uint(description, scheme->identity);
            gt_str_append_cstr(description, ": ");
            gt_str_append_cstr(description, scheme->name);
            gt_str_array_add_cstr(descriptions, gt_str_get(description));
        }
    }
    gt_str_delete(description);
    return descriptions;
}
Пример #11
0
/* Registers <block> in <ni> under the feature type <gft> and the
   representative node <rep>.
   The type index of <ni> and the per-type information for <gft> are created
   on demand; a newly seen type is also appended to <ni>->types. The block
   tuple is stored both in the representative index and in the tuple array of
   its type. If <rep> differs from GT_UNDEF_REPR, the type is flagged with
   must_merge. */
static void nodeinfo_add_block(NodeInfoElement *ni, const char *gft,
                               GtFeatureNode *rep, GtBlock *block)
{
  PerTypeInfo *per_type;
  GtBlockTuple *tuple;
  gt_assert(ni);
  tuple = blocktuple_new(gft, rep, block);

  /* create the type index lazily */
  if (ni->type_index == NULL)
    ni->type_index = gt_hashmap_new(GT_HASH_STRING, NULL, gt_free_func);

  per_type = gt_hashmap_get(ni->type_index, gft);
  if (per_type == NULL) {
    /* first block of this type: set up its bookkeeping */
    per_type = gt_calloc(1, sizeof *per_type);
    per_type->rep_index = gt_hashmap_new(GT_HASH_DIRECT, NULL, NULL);
    per_type->blocktuples = gt_array_new(sizeof (GtBlockTuple*));
    gt_hashmap_add(ni->type_index, (char*) gft, per_type);
    gt_str_array_add_cstr(ni->types, gft);
  }

  gt_hashmap_add(per_type->rep_index, rep, tuple);
  if (rep != GT_UNDEF_REPR) {
    per_type->must_merge = true;
  }
  gt_array_add(per_type->blocktuples, tuple);
}
Пример #12
0
/* Unit test of the GtTranslator class.
   Exercises 3-frame translation, the search for start codons, stop codons and
   arbitrary codons (successful and unsuccessful), and the handling of
   sequences containing invalid characters. All calls under test report their
   errors to a private error object (test_err), so that expected failures do
   not end up in <err>.
   Returns had_err. NOTE(review): had_err is never assigned explicitly in this
   function; presumably the gt_ensure() macro sets it when a check fails --
   confirm against the macro definition. */
int gt_translator_unit_test(GtError *err)
{
  int had_err = 0;
  GtTranslatorStatus test_errnum;
  GtTranslator *tr;
  GtCodonIterator *ci;
  GtError *test_err;
  GtStrArray *codons, *invalidcodons;
  const char *seq = "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGT"
                    "GGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACTGGT"
                    "TACCTGCCGTGAGTAAATTAAAATTTTATTGACTTAGG";
  const char *no_startcodon = "AAAAAAAAAATCATCTCCCCATTTTTTT";
  const char *invalidseq  = "ZAGCTTTTCATTCTGACTGCAAATATGTCTCTGTGT";
  const char *invalidseq2 = "AGCTTTTCATTCTGACZTGCAAATATGTCTCTGTGT";

  char translated;
  unsigned int frame;
  GtUword pos = 0;
  GtStr *protein[3];
  gt_error_check(err);

  /* set up the fixtures: a translator on the valid sequence, one output
     string per reading frame, a list of valid codons and a list containing a
     codon which is too short ("AC") */
  test_err = gt_error_new();
  ci = gt_codon_iterator_simple_new(seq, (GtUword) strlen(seq), test_err);
  tr = gt_translator_new(ci);
  protein[0] = gt_str_new();
  protein[1] = gt_str_new();
  protein[2] = gt_str_new();
  codons = gt_str_array_new();
  gt_str_array_add_cstr(codons, "ACG");
  gt_str_array_add_cstr(codons, "ACT");
  invalidcodons = gt_str_array_new();
  gt_str_array_add_cstr(invalidcodons, "ACG");
  gt_str_array_add_cstr(invalidcodons, "AC");

  /* do 3-frame translation */
  gt_error_unset(test_err);
  test_errnum = gt_translator_next(tr, &translated, &frame, test_err);
  while (!test_errnum && translated) {
    /* each translated character is collected in the string of its frame */
    gt_str_append_char(protein[frame], translated);
    test_errnum = gt_translator_next(tr, &translated, &frame, test_err);
    gt_ensure(
           test_errnum != GT_TRANSLATOR_ERROR && !gt_error_is_set(test_err));
  }
  gt_ensure(
         test_errnum == GT_TRANSLATOR_END && !gt_error_is_set(test_err));

  /* check 3-frame translation */
  gt_ensure(strcmp(gt_str_get(protein[0]),
                         "SFSF*LQRAICLCVD*KKSV**QLLNWLPAVSKLKFY*LR") == 0);
  gt_ensure(strcmp(gt_str_get(protein[1]),
                         "AFHSDCNGQYVSVWIKKRVSDSSF*TGYLP*VN*NFIDL") == 0);
  gt_ensure(strcmp(gt_str_get(protein[2]),
                         "LFILTATGNMSLCGLKKECLIAASELVTCRE*IKILLT*") == 0);

  /* find start codon -- positive */
  gt_error_unset(test_err);
  gt_codon_iterator_rewind(ci);
  test_errnum = gt_translator_find_startcodon(tr, &pos, test_err);
  gt_ensure(!test_errnum && !gt_error_is_set(test_err));
  gt_ensure(pos == 11UL);

  /* find stop codon -- positive */
  gt_error_unset(test_err);
  gt_codon_iterator_rewind(ci);
  test_errnum = gt_translator_find_stopcodon(tr, &pos, test_err);
  gt_ensure(!test_errnum && !gt_error_is_set(test_err));
  gt_ensure(pos == 12UL);

  /* find arbitrary codons -- positive */
  gt_error_unset(test_err);
  gt_codon_iterator_rewind(ci);
  test_errnum = gt_translator_find_codon(tr, codons, &pos, test_err);
  gt_ensure(!test_errnum && !gt_error_is_set(test_err));
  gt_ensure(pos == 14UL);

  /* find arbitrary codons -- negative (invalid codons) */
  gt_error_unset(test_err);
  gt_codon_iterator_rewind(ci);
  test_errnum = gt_translator_find_codon(tr, invalidcodons, &pos, test_err);
  gt_ensure(
         test_errnum == GT_TRANSLATOR_ERROR && gt_error_is_set(test_err));

  /* replace the codon iterator by one running over a sequence which starts
     with an invalid character */
  gt_error_unset(test_err);
  gt_codon_iterator_delete(ci);
  ci = gt_codon_iterator_simple_new(invalidseq,
                                    (GtUword) strlen(invalidseq),
                                    test_err);
  gt_ensure(ci && !gt_error_is_set(test_err));
  gt_translator_reset(tr, ci);
  /* check translation of sequence with invalid beginning */
  test_errnum = gt_translator_next(tr, &translated, &frame, test_err);
  gt_ensure(test_errnum && gt_error_is_set(test_err));

  /* check translation of sequence with invalid character within */
  gt_error_unset(test_err);
  gt_codon_iterator_delete(ci);
  ci = gt_codon_iterator_simple_new(invalidseq2,
                                    (GtUword) strlen(invalidseq2),
                                    test_err);
  gt_ensure(ci && !gt_error_is_set(test_err));
  gt_translator_reset(tr, ci);
  test_errnum = gt_translator_next(tr, &translated, &frame, test_err);
  while (!test_errnum && translated) {
    /* the characters are appended to the strings already filled above; their
       content is not checked again, only the final status matters here */
    gt_str_append_char(protein[frame], translated);
    test_errnum = gt_translator_next(tr, &translated, &frame, test_err);
  }
  gt_ensure(
         test_errnum == GT_TRANSLATOR_ERROR && gt_error_is_set(test_err));

  /* find start codon -- fail */
  gt_error_unset(test_err);
  gt_codon_iterator_delete(ci);
  ci = gt_codon_iterator_simple_new(no_startcodon,
                                    (GtUword) strlen(no_startcodon),
                                    test_err);
  gt_ensure(ci && !gt_error_is_set(test_err));
  gt_translator_reset(tr, ci);
  test_errnum = gt_translator_find_startcodon(tr, &pos, test_err);
  gt_ensure(
         test_errnum == GT_TRANSLATOR_END && !gt_error_is_set(test_err));

  /* find stop codon -- fail */
  gt_error_unset(test_err);
  gt_codon_iterator_rewind(ci);
  test_errnum = gt_translator_find_stopcodon(tr, &pos, test_err);
  gt_ensure(
         test_errnum == GT_TRANSLATOR_END && !gt_error_is_set(test_err));

  /* find arbitrary codons -- negative (none there) */
  gt_error_unset(test_err);
  gt_codon_iterator_rewind(ci);
  test_errnum = gt_translator_find_codon(tr, codons, &pos, test_err);
  gt_ensure(
         test_errnum == GT_TRANSLATOR_END && !gt_error_is_set(test_err));

  /* free everything allocated by the test */
  gt_codon_iterator_delete(ci);
  gt_translator_delete(tr);
  gt_str_delete(protein[0]);
  gt_str_delete(protein[1]);
  gt_str_delete(protein[2]);
  gt_str_array_delete(codons);
  gt_str_array_delete(invalidcodons);
  gt_error_delete(test_err);

  return had_err;
}
Пример #13
0
int gt_gtf_parser_parse(GtGTFParser *parser, GtQueue *genome_nodes,
                        GtStr *filenamestr, GtFile *fpin, bool be_tolerant,
                        GtError *err)
{
  GtStr *seqid_str, *source_str, *line_buffer;
  char *line;
  size_t line_length;
  GtUword i, line_number = 0;
  GtGenomeNode *gn;
  GtRange range;
  GtPhase phase_value;
  GtStrand gt_strand_value;
  GtSplitter *splitter, *attribute_splitter;
  float score_value;
  char *seqname,
       *source,
       *feature,
       *start,
       *end,
       *score,
       *strand,
       *frame,
       *attributes,
       *token,
       *gene_id,
       *gene_name = NULL,
       *transcript_id,
       *transcript_name = NULL,
       **tokens;
  GtHashmap *transcript_id_hash; /* map from transcript id to array of genome
                                    nodes */
  GtArray *gt_genome_node_array;
  ConstructionInfo cinfo;
  GTF_feature_type gtf_feature_type;
  GT_UNUSED bool gff_type_is_valid = false;
  const char *type = NULL;
  const char *filename;
  bool score_is_defined;
  int had_err = 0;

  gt_assert(parser && genome_nodes);
  gt_error_check(err);

  filename = gt_str_get(filenamestr);

  /* alloc */
  line_buffer = gt_str_new();
  splitter = gt_splitter_new(),
  attribute_splitter = gt_splitter_new();

#define HANDLE_ERROR                                                   \
        if (had_err) {                                                 \
          if (be_tolerant) {                                           \
            fprintf(stderr, "skipping line: %s\n", gt_error_get(err)); \
            gt_error_unset(err);                                       \
            gt_str_reset(line_buffer);                                 \
            had_err = 0;                                               \
            continue;                                                  \
          }                                                            \
          else {                                                       \
            had_err = -1;                                              \
            break;                                                     \
          }                                                            \
        }

  while (gt_str_read_next_line_generic(line_buffer, fpin) != EOF) {
    line = gt_str_get(line_buffer);
    line_length = gt_str_length(line_buffer);
    line_number++;
    gene_name = gene_id = transcript_id = transcript_name = NULL;
    had_err = 0;

    if (line_length == 0) {
      gt_warning("skipping blank line " GT_WU " in file \"%s\"", line_number,
                 filename);
    }
    else if (line[0] == '#') {
      /* storing comment */
      if (line_length >= 2 && line[1] == '#')
        gn = gt_comment_node_new(line+2); /* store '##' line as '#' line */
      else
        gn = gt_comment_node_new(line+1);
      gt_genome_node_set_origin(gn, filenamestr, line_number);
      gt_queue_add(genome_nodes, gn);
    }
    else {
      bool stop_codon = false;
      char *tokendup, *attrkey;
      GtStrArray *attrkeys, *attrvals;

      /* process tab delimited GTF line */
      gt_splitter_reset(splitter);
      gt_splitter_split(splitter, line, line_length, '\t');
      if (gt_splitter_size(splitter) != 9UL) {
        gt_error_set(err, "line " GT_WU " in file \"%s\" contains " GT_WU
                     " tab (\\t) " "separated fields instead of 9", line_number,
                     filename,
                  gt_splitter_size(splitter));
        had_err = -1;
        break;
      }
      tokens = gt_splitter_get_tokens(splitter);
      seqname    = tokens[0];
      source     = tokens[1];
      feature    = tokens[2];
      start      = tokens[3];
      end        = tokens[4];
      score      = tokens[5];
      strand     = tokens[6];
      frame      = tokens[7];
      attributes = tokens[8];

      /* parse feature */
      if (GTF_feature_type_get(&gtf_feature_type, feature) == -1) {
        /* we skip unknown features */
        fprintf(stderr, "skipping line " GT_WU " in file \"%s\": unknown "
                "feature: \"%s\"\n", line_number, filename, feature);
        gt_str_reset(line_buffer);
        continue;
      }

      /* translate into GFF3 feature type */
      switch (gtf_feature_type) {
        case GTF_stop_codon:
          stop_codon = true;
        case GTF_CDS:
          gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker,
                                                       gt_ft_CDS);
          type = gt_ft_CDS;
          break;
        case GTF_exon:
          gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker,
                                                       gt_ft_exon);
          type = gt_ft_exon;
          break;
        case GTF_start_codon:
          /* we can skip the start codons, they are part of the CDS anyway */
          gt_str_reset(line_buffer);
          continue;
      }
      gt_assert(gff_type_is_valid);

      /* parse the range */
      had_err = gt_parse_range(&range, start, end, line_number, filename, err);
      HANDLE_ERROR;

      /* process seqname (we have to do it here because we need the range) */
      gt_region_node_builder_add_region(parser->region_node_builder, seqname,
                                        range);

      /* parse the score */
      had_err = gt_parse_score(&score_is_defined, &score_value, score,
                               line_number, filename, err);
      HANDLE_ERROR;

      /* parse the strand */
      had_err = gt_parse_strand(&gt_strand_value, strand, line_number, filename,
                               err);
      HANDLE_ERROR;

      /* parse the frame */
      had_err = gt_parse_phase(&phase_value, frame, line_number, filename, err);
      HANDLE_ERROR;

      /* parse the attributes */
      attrkeys = gt_str_array_new();
      attrvals = gt_str_array_new();
      gt_splitter_reset(attribute_splitter);
      gene_id = NULL;
      transcript_id = NULL;
      gt_splitter_split(attribute_splitter, attributes, strlen(attributes),
                        ';');
      for (i = 0; i < gt_splitter_size(attribute_splitter); i++) {
        token = gt_splitter_get_token(attribute_splitter, i);
        /* skip leading blanks */
        while (*token == ' ')
          token++;

        tokendup = gt_cstr_dup(token);
        attrkey = strtok(tokendup, " ");
        if (attrkey) {
          char *attrval = strtok(NULL, " ");
          if (attrval == NULL || strcmp(attrval, "") == 0 ||
              strcmp(attrval, "\"\"") == 0)
          {
            gt_error_set(err, "missing value to attribute \"%s\" on line "
                         GT_WU " in file \"%s\"", attrkey,line_number,filename);
            had_err = -1;
          }
          HANDLE_ERROR;

          if (*attrval == '"')
            attrval++;
          if (attrval[strlen(attrval)-1] == '"')
            attrval[strlen(attrval)-1] = '\0';
          gt_assert(attrkey && strlen(attrkey) > 0);
          gt_assert(attrval && strlen(attrval) > 0);
          gt_str_array_add_cstr(attrkeys, attrkey);
          gt_str_array_add_cstr(attrvals, attrval);
        }
        gt_free(tokendup);

        /* look for the two mandatory attributes */
        if (strncmp(token, GENE_ID_ATTRIBUTE, strlen(GENE_ID_ATTRIBUTE)) == 0) {
          if (strlen(token) + 2 < strlen(GENE_ID_ATTRIBUTE)) {
            gt_error_set(err, "missing value to attribute \"%s\" on line "
                         GT_WU "in file \"%s\"", GENE_ID_ATTRIBUTE, line_number,
                         filename);
            had_err = -1;
          }
          HANDLE_ERROR;
          gene_id = token + strlen(GENE_ID_ATTRIBUTE) + 1;
          if (*gene_id == '"')
            gene_id++;
          if (gene_id[strlen(gene_id)-1] == '"')
            gene_id[strlen(gene_id)-1] = '\0';
        }
        else if (strncmp(token, TRANSCRIPT_ID_ATTRIBUTE,
                         strlen(TRANSCRIPT_ID_ATTRIBUTE)) == 0) {
          if (strlen(token) + 2 < strlen(TRANSCRIPT_ID_ATTRIBUTE)) {
            gt_error_set(err, "missing value to attribute \"%s\" on line "
                         GT_WU "in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE,
                         line_number, filename);
            had_err = -1;
          }
          HANDLE_ERROR;
          transcript_id = token + strlen(TRANSCRIPT_ID_ATTRIBUTE) + 1;
          if (*transcript_id == '"')
            transcript_id++;
          if (transcript_id[strlen(transcript_id)-1] == '"')
            transcript_id[strlen(transcript_id)-1] = '\0';
        }
        else if (strncmp(token, GENE_NAME_ATTRIBUTE,
                         strlen(GENE_NAME_ATTRIBUTE)) == 0) {
          if (strlen(token) + 2 < strlen(GENE_NAME_ATTRIBUTE)) {
            gt_error_set(err, "missing value to attribute \"%s\" on line "
                         GT_WU "in file \"%s\"", GENE_NAME_ATTRIBUTE,
                         line_number, filename);
            had_err = -1;
          }
          HANDLE_ERROR;
          gene_name = token + strlen(GENE_NAME_ATTRIBUTE) + 1;
          /* for output we want to strip quotes */
          if (*gene_name == '"')
            gene_name++;
          if (gene_name[strlen(gene_name)-1] == '"')
            gene_name[strlen(gene_name)-1] = '\0';
        }
        else if (strncmp(token, TRANSCRIPT_NAME_ATTRIBUTE,
                         strlen(TRANSCRIPT_NAME_ATTRIBUTE)) == 0) {
          if (strlen(token) + 2 < strlen(TRANSCRIPT_NAME_ATTRIBUTE)) {
            gt_error_set(err, "missing value to attribute \"%s\" on line "
                         GT_WU "in file \"%s\"", TRANSCRIPT_NAME_ATTRIBUTE,
                         line_number, filename);
            had_err = -1;
          }
          HANDLE_ERROR;
          transcript_name = token + strlen(TRANSCRIPT_NAME_ATTRIBUTE) + 1;
          /* for output we want to strip quotes */
          if (*transcript_name == '"')
            transcript_name++;
          if (transcript_name[strlen(transcript_name)-1] == '"')
            transcript_name[strlen(transcript_name)-1] = '\0';
        }
      }

      /* check for the mandatory attributes */
      if (!gene_id) {
        gt_error_set(err, "missing attribute \"%s\" on line " GT_WU
                     " in file \"%s\"", GENE_ID_ATTRIBUTE, line_number,
                     filename);
        had_err = -1;
      }
      HANDLE_ERROR;
      if (!transcript_id) {
        gt_error_set(err, "missing attribute \"%s\" on line " GT_WU
                     " in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE, line_number,
                     filename);
        had_err = -1;
      }
      HANDLE_ERROR;

      /* process the mandatory attributes */
      if (!(transcript_id_hash = gt_hashmap_get(parser->gene_id_hash,
                                             gene_id))) {
        transcript_id_hash = gt_hashmap_new(GT_HASH_STRING, gt_free_func,
                                            (GtFree) gt_array_delete);
        gt_hashmap_add(parser->gene_id_hash, gt_cstr_dup(gene_id),
                    transcript_id_hash);
      }
      gt_assert(transcript_id_hash);

      if (!(gt_genome_node_array = gt_hashmap_get(transcript_id_hash,
                                            transcript_id))) {
        gt_genome_node_array = gt_array_new(sizeof (GtGenomeNode*));
        gt_hashmap_add(transcript_id_hash, gt_cstr_dup(transcript_id),
                    gt_genome_node_array);
      }
      gt_assert(gt_genome_node_array);

      /* save optional gene_name and transcript_name attributes */
      if (transcript_name && strlen(transcript_name) > 0
            && !gt_hashmap_get(parser->transcript_id_to_name_mapping,
                             transcript_id)) {
        gt_hashmap_add(parser->transcript_id_to_name_mapping,
                    gt_cstr_dup(transcript_id),
                    gt_cstr_dup(transcript_name));
      }
      if (gene_name && strlen(gene_name) > 0
            && !gt_hashmap_get(parser->gene_id_to_name_mapping,
                                    gene_id)) {
        gt_hashmap_add(parser->gene_id_to_name_mapping,
                    gt_cstr_dup(gene_id),
                    gt_cstr_dup(gene_name));
      }

      /* get seqid */
      seqid_str = gt_hashmap_get(parser->seqid_to_str_mapping, seqname);
      if (!seqid_str) {
        seqid_str = gt_str_new_cstr(seqname);
        gt_hashmap_add(parser->seqid_to_str_mapping, gt_str_get(seqid_str),
                       seqid_str);
      }
      gt_assert(seqid_str);

      /* construct the new feature */
      gn = gt_feature_node_new(seqid_str, type, range.start, range.end,
                                 gt_strand_value);
      gt_genome_node_set_origin(gn, filenamestr, line_number);
      if (stop_codon) {
        gt_feature_node_add_attribute((GtFeatureNode*) gn,
                                      GTF_PARSER_STOP_CODON_FLAG, "true");
      }
      for (i = 0; i < gt_str_array_size(attrkeys); i++) {
        GtFeatureNode *fn = (GtFeatureNode *)gn;
        const char *key = gt_str_array_get(attrkeys, i);
        const char *val = gt_str_array_get(attrvals, i);

        /* Not a comprehensive solution to ensure correct encoding, just bare
           minimum required to get Cufflinks output parsed */
        if (strcmp(val, "=") == 0)
          val = "%26";

        if (gt_feature_node_get_attribute(fn, key) != NULL) {
          const char *oldval = gt_feature_node_get_attribute(fn, key);
          GtStr *newval = gt_str_new_cstr(oldval);
          gt_str_append_char(newval, ',');
          gt_str_append_cstr(newval, val);
          gt_feature_node_set_attribute(fn, key, gt_str_get(newval));
          gt_str_delete(newval);
        }
        else
          gt_feature_node_add_attribute(fn, key, val);
      }
      gt_str_array_delete(attrkeys);
      gt_str_array_delete(attrvals);

      /* set source */
      source_str = gt_hashmap_get(parser->source_to_str_mapping, source);
      if (!source_str) {
        source_str = gt_str_new_cstr(source);
        gt_hashmap_add(parser->source_to_str_mapping, gt_str_get(source_str),
                    source_str);
      }
      gt_assert(source_str);
      gt_feature_node_set_source((GtFeatureNode*) gn, source_str);

      if (score_is_defined)
        gt_feature_node_set_score((GtFeatureNode*) gn, score_value);
      if (phase_value != GT_PHASE_UNDEFINED)
        gt_feature_node_set_phase((GtFeatureNode*) gn, phase_value);
      gt_array_add(gt_genome_node_array, gn);
    }

    gt_str_reset(line_buffer);
  }

  /* process all region nodes */
  if (!had_err)
    gt_region_node_builder_build(parser->region_node_builder, genome_nodes);

  /* process all feature nodes */
  cinfo.genome_nodes = genome_nodes;
  cinfo.tidy = be_tolerant;
  cinfo.gene_id_to_name_mapping = parser->gene_id_to_name_mapping;
  cinfo.transcript_id_to_name_mapping = parser->transcript_id_to_name_mapping;
  if (!had_err) {
    had_err = gt_hashmap_foreach(parser->gene_id_hash, construct_genes,
                                 &cinfo, err);
  }
  gt_hashmap_foreach(parser->gene_id_hash, delete_genes, NULL, err);

  /* free */
  gt_splitter_delete(splitter);
  gt_splitter_delete(attribute_splitter);
  gt_str_delete(line_buffer);

  return had_err;
}
Пример #14
0
/* Runner of the genomediff tool.
   The arguments argv[parsed_args] .. argv[argc-1] are appended to
   arguments->filenames. If more than one file is given, the files are
   encoded into an encoded sequence named arguments->indexname; otherwise the
   single argument is used as index name and, if an existing index is used
   (with_esa or with_pck), the project file of the index is scanned for a
   "mirrored=1" line. Afterwards the encoded sequence is loaded, the unit
   information is set up (optionally read from arguments->unitfile), the sums
   of the shustring lengths are computed and handed to
   gt_genomediff_kr_calc().
   Returns 0 on success and a non-zero value on failure. */
static int gt_genomediff_runner(int argc, const char **argv,
                                int parsed_args, void *tool_arguments,
                                GtError *err)
{
    bool mirrored = false;
    int had_err = 0,
        i;
    GtEncseq              *encseq = NULL;
    GtGenomediffArguments *arguments = tool_arguments;
    GtLogger              *logger;
    GtShuUnitFileInfo     *unit_info = NULL;
    GtTimer               *timer = NULL;

    gt_error_check(err);
    gt_assert(arguments);

    logger = gt_logger_new(arguments->verbose,
                           GT_LOGGER_DEFLT_PREFIX,
                           stdout);
    gt_assert(logger);

    for (i = parsed_args; i < argc; i++) {
        gt_str_array_add_cstr(arguments->filenames, argv[i]);
    }

    if (gt_showtime_enabled()) {
        timer = gt_timer_new_with_progress_description("start");
        gt_assert(timer);
        gt_timer_start(timer);
    }

    if (arguments->with_units) {
        gt_logger_log(logger, "unitfile option set, filename is %s\n",
                      gt_str_get(arguments->unitfile));
    }

    if (timer != NULL)
        gt_timer_show_progress(timer, "start shu search", stdout);

    if (gt_str_array_size(arguments->filenames) == 0UL) {
        /* without this check the single file branch below would access
           element 0 of an empty array */
        gt_error_set(err, "no input file or index name given");
        had_err = -1;
    }
    else if (gt_str_array_size(arguments->filenames) > 1UL) {
        GtEncseqEncoder *ee = gt_encseq_encoder_new();
        gt_encseq_encoder_set_timer(ee, timer);
        gt_encseq_encoder_set_logger(ee, logger);
        /* kr only makes sense for dna, so we can check this already with ee */
        gt_encseq_encoder_set_input_dna(ee);
        had_err = gt_encseq_encoder_encode(ee, arguments->filenames,
                                           gt_str_get(arguments->indexname), err);
        gt_encseq_encoder_delete(ee);
    }
    else {
        gt_str_append_str(arguments->indexname,
                          gt_str_array_get_str(arguments->filenames, 0));
        if (arguments->with_esa || arguments->with_pck) {
            GtStr *current_line = gt_str_new();
            FILE *prj_fp;
            const char *buffer;
            char **elements = NULL;

            prj_fp = gt_fa_fopen_with_suffix(gt_str_get(arguments->indexname),
                                             GT_PROJECTFILESUFFIX,"rb",err);
            if (prj_fp == NULL)
                had_err = -1;
            /* scan the "key=value" lines of the project file for the
               mirrored flag */
            while (!had_err && gt_str_read_next_line(current_line, prj_fp) != EOF) {
                buffer = gt_str_get(current_line);
                /* free the elements of the previous line */
                if (elements != NULL) {
                    gt_free(elements[0]);
                    gt_free(elements[1]);
                }
                gt_free(elements);
                elements = gt_cstr_split(buffer, '=');
                gt_log_log("%s", elements[0]);
                /* NOTE(review): the value may be missing if the line does not
                   contain a '=' (assumption, depends on gt_cstr_split()),
                   hence the NULL check */
                if (strcmp("mirrored", elements[0]) == 0 &&
                    elements[1] != NULL) {
                    gt_log_log("%s", elements[1]);
                    if (strcmp("1", elements[1]) == 0) {
                        mirrored = true;
                        gt_log_log("sequences are treated as mirrored");
                    }
                }
                gt_str_reset(current_line);
            }
            gt_str_delete(current_line);
            if (elements != NULL) {
                gt_free(elements[0]);
                gt_free(elements[1]);
            }
            gt_free(elements);
            /* the file handle is NULL if opening the project file failed */
            if (prj_fp != NULL)
                gt_fa_xfclose(prj_fp);
        }
    }

    if (!had_err) {
        GtEncseqLoader *el = gt_encseq_loader_new_from_options(arguments->loadopts,
                             err);
        if (mirrored)
            gt_encseq_loader_mirror(el);
        encseq =
            gt_encseq_loader_load(el, gt_str_get(arguments->indexname), err);
        gt_encseq_loader_delete(el);
    }
    if (encseq == NULL)
        had_err = -1;
    if (!had_err) {
        unit_info = gt_shu_unit_info_new(encseq);
        if (arguments->with_units)
            had_err = gt_shu_unit_file_info_read(arguments->unitfile, unit_info,
                                                 logger, err);
    }

    if (!had_err) {
        uint64_t **shusums = NULL;
        if (arguments->with_esa || arguments->with_pck) {
            /* use the existing index */
            shusums = gt_genomediff_shulen_sum(arguments, unit_info,
                                               logger, timer, err);
            if (shusums == NULL)
                had_err = -1;
        }
        else {
            /* no index given: the sums are collected while the enhanced
               suffix array is constructed */
            const bool doesa = true;
            GenomediffInfo gd_info;
            Suffixeratoroptions sopts;
            sopts.beverbose = arguments->verbose;
            sopts.indexname = arguments->indexname;
            sopts.db = NULL;
            sopts.encopts = NULL;
            sopts.genomediff = true;
            sopts.inputindex = arguments->indexname;
            sopts.loadopts = arguments->loadopts;
            sopts.showprogress = false;
            sopts.idxopts = arguments->idxopts;

            gt_assert(unit_info != NULL);
            gt_array2dim_calloc(shusums, unit_info->num_of_genomes,
                                unit_info->num_of_genomes);
            gd_info.shulensums = shusums;
            gd_info.unit_info = unit_info;
            had_err = gt_runsuffixerator(doesa, &sopts, &gd_info, logger, err);
        }
        if (shusums != NULL) {
            if (!had_err) {
                had_err = gt_genomediff_kr_calc(shusums, arguments, unit_info,
                                                arguments->with_pck, logger, timer,
                                                err);
            }
            /* the table has to be freed even if the computation failed */
            gt_array2dim_delete(shusums);
        }
    }

    if (timer != NULL) {
        gt_timer_show_progress_final(timer, stdout);
        gt_timer_delete(timer);
    }
    gt_logger_delete(logger);
    gt_encseq_delete(encseq);
    gt_shu_unit_info_delete(unit_info);

    return had_err;
}
Пример #15
0
static int gt_gdiffcalc_runner(int argc, const char **argv, int parsed_args,
                              void *tool_arguments, GT_UNUSED GtError *err)
{
  GtGenomediffArguments *arguments = tool_arguments;
  int had_err = 0, i;
  GtUword lcounter = 0, zcounter = 0;
  double **shusums = NULL;
  GtEncseq              *encseq = NULL;
  GtLogger              *logger;
  GtShuUnitFileInfo     *unit_info = NULL;
  GtTimer               *timer = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose,
                         GT_LOGGER_DEFLT_PREFIX,
                         stdout);
  gt_assert(logger);

  for (i = parsed_args; i < argc; i++) {
    gt_str_array_add_cstr(arguments->filenames, argv[i]);
  }

  if (gt_showtime_enabled()) {
    timer = gt_timer_new_with_progress_description("load encseq");
    gt_timer_start(timer);
    gt_assert(timer);
  }

  if (arguments->with_units) {
    gt_logger_log(logger, "unitfile option set, filename is %s\n",
                  gt_str_get(arguments->unitfile));
  }

  if (!had_err) {
    GtEncseqLoader *el = gt_encseq_loader_new_from_options(arguments->loadopts,
                                                           err);
    encseq =
      gt_encseq_loader_load(el, gt_str_get(arguments->indexname), err);
    gt_encseq_loader_delete(el);
  }
  if (encseq == NULL)
    had_err = -1;

  if (timer != NULL)
    gt_timer_show_progress(timer, "load units", stdout);

  if (!had_err) {
    unit_info = gt_shu_unit_info_new(encseq);
    if (arguments->with_units)
      had_err = gt_shu_unit_file_info_read(arguments->unitfile, unit_info,
                                           logger, err);
  }

  if (timer != NULL)
    gt_timer_show_progress(timer, "read table", stdout);

  if (!had_err) {
    GtIO *table_file = NULL;
    GtTokenizer *tokenizer = NULL;
    GtStr *line = NULL;

    gt_assert(unit_info != NULL);
    gt_array2dim_calloc(shusums, unit_info->num_of_genomes,
                        unit_info->num_of_genomes);

    table_file = gt_io_new(gt_str_array_get(arguments->filenames, 0), "r");
    tokenizer = gt_tokenizer_new(table_file);
    line = gt_tokenizer_get_token(tokenizer);
    while (line != NULL && !had_err) {
      char *cline = gt_str_get(line);
      char *elem = strtok(cline, ";");
      zcounter = 0;
      while (elem != NULL && !had_err) {
        if (*elem != '#') {
          if (1 != sscanf(elem, "%lf",
                          &shusums[lcounter][zcounter])) {
            had_err = 1;
            gt_error_set(err, "couldn't scan");
            break;
          }
          gt_logger_log(logger,"wert: %lf", shusums[lcounter][zcounter]);
          zcounter++;
        }
        else {
          gt_logger_log(logger, "name: %s", elem++);
        }
        elem = strtok(NULL, ";");
      }
      gt_tokenizer_next_token(tokenizer);
      gt_str_delete(line);
      line = gt_tokenizer_get_token(tokenizer);
      lcounter++;
      gt_logger_log(logger, "line "GT_WD"", lcounter);
    }
  }
  if (!had_err) {
    GtUword num_of_seq, file_idx, seq_idx, startpos;
    GT_UNUSED GtUword oldpos = 0;

    gt_assert(unit_info != NULL);
    gt_assert(lcounter == zcounter);
    gt_assert(lcounter == unit_info->num_of_genomes);

    num_of_seq = gt_encseq_num_of_sequences(unit_info->encseq);

    for (seq_idx = 0; seq_idx < num_of_seq; seq_idx++) {
      startpos = gt_encseq_seqstartpos(unit_info->encseq, seq_idx);
      file_idx = gt_encseq_filenum(unit_info->encseq, startpos);
      gt_log_log("seq: "GT_WU" starts at: "GT_WU"\n"
                 "belonges to file: "GT_WU" which is part of genome: %s",
                 seq_idx, startpos, file_idx,
                 gt_str_array_get(unit_info->genome_names,
                                  unit_info->map_files[file_idx]));
      gt_assert(oldpos <= startpos);
      oldpos = startpos;
    }
  }
  if (!had_err && shusums != NULL) {
    had_err = gt_genomediff_calculate_div_from_avg(shusums, arguments,
                                                   unit_info,
                                                   logger, timer, err);
    gt_array2dim_delete(shusums);
  }

  if (timer != NULL) {
    gt_timer_show_progress_final(timer, stdout);
    gt_timer_delete(timer);
  }
  gt_logger_delete(logger);
  gt_encseq_delete(encseq);
  gt_shu_unit_info_delete(unit_info);
  return had_err;
}
Пример #16
0
/*
 * Parse the command line of gthsplit.
 *
 * Registers all options, runs the option parser and, on success, transfers
 * the parsed values into <gthsplitinfo> (split mode, range, verbosity, output
 * compression mode, force flag). All arguments following the options are
 * stored as consensus files and *parsed_args is advanced past them.
 *
 * Returns the option parser result; GT_OPTION_PARSER_ERROR (with <err> set)
 * is also returned if the range does not divide 100 or if compression was
 * requested without any input file.
 */
static GtOPrval gthsplit_parse_options(int *parsed_args,
                                       Gthsplitinfo *gthsplitinfo,
                                       int argc, const char **argv,
                                       const GthPlugins *plugins, GtError *err)
{
  GtOptionParser *op;
  GtOption *optalignmentscore, *optcoverage, *optrange, *optverbose, *optgzip,
           *optbzip2, *optforce;
  bool alignmentscore, coverage, verbose, gzip, bzip2;
  GtOPrval oprval;

  gt_error_check(err);

  op = gt_option_parser_new("-alignmentscore | -coverage [option ...] "
                            "[file ...]", "Split GenomeThreader output files "
                            "containing intermediate results.");

  /* specify all options with a corresponding help-text */
  optalignmentscore = gt_option_new_bool("alignmentscore", "split according to "
                                      "the overall alignment score (scr)",
                                      &alignmentscore, false);
  gt_option_parser_add_option(op, optalignmentscore);

  optcoverage = gt_option_new_bool("coverage", "split according to coverage "
                                   "(cov)", &coverage, false);
  gt_option_parser_add_option(op, optcoverage);

  optrange = gt_option_new_uint_max(RANGE_OPT_CSTR, "set the percentage range "
                                 "used to create the sets",
                                 &gthsplitinfo->range, DEFAULT_RANGE, 100);
  gt_option_parser_add_option(op, optrange);

  /* add sa_filter options */
  gth_sa_filter_register_options(op, gthsplitinfo->sa_filter, false);

  /* -v */
  optverbose = gt_option_new_verbose(&verbose);
  gt_option_parser_add_option(op, optverbose);

  optgzip = gt_option_new_bool("gzip", "write gzip compressed output file(s)",
                               &gzip, false);
  gt_option_parser_add_option(op, optgzip);

  optbzip2 = gt_option_new_bool("bzip2", "write bzip2 compressed output "
                                "file(s)", &bzip2, false);
  gt_option_parser_add_option(op, optbzip2);

  optforce = gt_option_new_bool(GT_FORCE_OPT_CSTR,"force writing to split "
                                "files", &gthsplitinfo->force, false);
  gt_option_parser_add_option(op, optforce);

  /* exactly one split criterion, at most one compression method */
  gt_option_exclude(optalignmentscore, optcoverage);
  gt_option_exclude(optgzip, optbzip2);
  gt_option_is_mandatory_either(optalignmentscore, optcoverage);

  gt_option_parser_set_mail_address(op, "<*****@*****.**>");
  oprval = gt_option_parser_parse(op, parsed_args, argc, argv,
                                  plugins->gth_version_func, err);

  if (oprval == GT_OPTION_PARSER_OK && alignmentscore)
    gthsplitinfo->splitmode = ALIGNMENTSCORE_SPLIT;
  if (oprval == GT_OPTION_PARSER_OK && coverage)
    gthsplitinfo->splitmode = COVERAGE_SPLIT;
  /* a range of 0 is accepted by the option itself; test for it first so that
     the modulo below never divides by zero */
  if (oprval == GT_OPTION_PARSER_OK &&
      (gthsplitinfo->range == 0 || 100 % gthsplitinfo->range)) {
    gt_error_set(err, "argument to option %s must divide 100 without rest",
              RANGE_OPT_CSTR);
    oprval = GT_OPTION_PARSER_ERROR;
  }
  if (oprval == GT_OPTION_PARSER_OK && verbose)
    gthsplitinfo->showverbose = gth_show_on_stdout;
  if (oprval == GT_OPTION_PARSER_OK && gzip)
    gthsplitinfo->file_mode = GT_FILE_MODE_GZIP;
  if (oprval == GT_OPTION_PARSER_OK && bzip2)
    gthsplitinfo->file_mode = GT_FILE_MODE_BZIP2;

  /* save consensus files */
  if (oprval == GT_OPTION_PARSER_OK) {
    while (*parsed_args < argc) {
      gt_str_array_add_cstr(gthsplitinfo->consensusfiles, argv[*parsed_args]);
      (*parsed_args)++;
    }
  }

  /* compression was requested, but no input file was given */
  if (oprval == GT_OPTION_PARSER_OK &&
      !gt_str_array_size(gthsplitinfo->consensusfiles) &&
      (gt_option_is_set(optgzip) || gt_option_is_set(optbzip2))) {
    gt_error_set(err, "to use compression, at least one input file has to be "
                      "supplied");
    oprval = GT_OPTION_PARSER_ERROR;
  }

  gt_option_parser_delete(op);

  return oprval;
}
Пример #17
0
/*
 * Runner of the convertseq tool.
 *
 * Reads all sequences from the files given in argv[parsed_args..argc-1] and,
 * unless arguments->showseq is set, writes them in FASTA format to
 * arguments->outfp. Optionally the reverse complement is written
 * (arguments->revcomp) and runs of wildcards are reduced to a single 'N'/'n'
 * (arguments->reduce_wc_dna, arguments->reduce_wc_prot). If
 * arguments->fastawidth is greater than 0, sequence lines are wrapped after
 * that many written characters. With arguments->showflv the file length
 * values are printed to stderr.
 *
 * Returns 0 on success and a non-zero value on failure (with <err> set).
 */
static int gt_convertseq_runner(int argc, const char **argv, int parsed_args,
                              void *tool_arguments, GtError *err)
{
  GtConvertseqArguments *arguments = tool_arguments;
  int had_err = 0, i;
  GtFilelengthvalues *flv;
  GtSeqIterator *seqit;
  GtSequenceBuffer *sb = NULL;
  GtStrArray *files;
  const GtUchar *sequence;
  char *desc;
  GtUword len, j, pos;
  off_t totalsize;
  gt_error_check(err);
  gt_assert(arguments != NULL);

  files = gt_str_array_new();
  for (i = parsed_args; i < argc; i++)
  {
    gt_str_array_add_cstr(files, argv[i]);
  }
  totalsize = gt_files_estimate_total_size(files);

  flv = gt_calloc((size_t) gt_str_array_size(files),
                  sizeof (GtFilelengthvalues));

  sb = gt_sequence_buffer_new_guess_type(files, err);
  if (!sb) {
    had_err = -1;
  }
  if (!had_err) {
    gt_sequence_buffer_set_filelengthtab(sb, flv);
    /* read input using seqiterator */
    seqit = gt_seq_iterator_sequence_buffer_new_with_buffer(sb);
    if (arguments->verbose)
    {
      gt_progressbar_start(gt_seq_iterator_getcurrentcounter(seqit,
                                                           (GtUint64)
                                                           totalsize),
                           (GtUint64) totalsize);
    }
    while (true)
    {
      GtUchar *seq = NULL;
      desc = NULL;
      j = 0UL; /* number of characters written to the current output line */
      /* 1 means a sequence was delivered; any other value ends the loop and
         is returned as the result */
      had_err = gt_seq_iterator_next(seqit, &sequence, &len, &desc, err);
      if (had_err != 1)
        break;
      if (arguments->revcomp) {
        /* work on a private copy, the iterator's buffer is read-only */
        GtUchar *newseq = gt_calloc((size_t) len+1, sizeof (GtUchar));
        memcpy(newseq, sequence, (size_t) len*sizeof (GtUchar));
        had_err = gt_reverse_complement((char*) newseq, len, err);
        if (had_err) {
          gt_free(newseq); /* do not leak the copy on failure */
          break;
        }
        seq = newseq;
      } else seq = (GtUchar*) sequence;

      if (!arguments->showseq) {
        bool in_wildcard = false;
        gt_file_xprintf(arguments->outfp, ">%s\n", desc);
        for (pos = 0; pos < len; pos++) {
          /* false if the character was swallowed by wildcard reduction */
          bool emitted = true;
          if (arguments->reduce_wc_dna) {
            switch (seq[pos]) {
              case 'a':
              case 'A':
              case 'c':
              case 'C':
              case 'g':
              case 'G':
              case 't':
              case 'u':
              case 'T':
              case 'U':
                in_wildcard = false;
                gt_file_xfputc((int) seq[pos], arguments->outfp);
                break;
              default:
                if (!in_wildcard) {
                  /* first wildcard of a run: write a single N/n for it */
                  in_wildcard = true;
                  if (isupper((int) seq[pos]))
                    gt_file_xfputc((int) 'N', arguments->outfp);
                  else
                    gt_file_xfputc((int) 'n', arguments->outfp);
                }
                else
                  emitted = false;
            }
          }
          else if (arguments->reduce_wc_prot) {
            switch (seq[pos]) {
              case 'X':
              case 'B':
              case 'Z':
                if (!in_wildcard) {
                  in_wildcard = true;
                  gt_file_xfputc((int) 'N', arguments->outfp);
                }
                else
                  emitted = false;
                break;
              case 'x':
              case 'b':
              case 'z':
                if (!in_wildcard) {
                  in_wildcard = true;
                  gt_file_xfputc((int) 'n', arguments->outfp);
                }
                else
                  emitted = false;
                break;
              default:
                in_wildcard = false;
                gt_file_xfputc((int) seq[pos], arguments->outfp);
            }
          }
          else {
            gt_file_xfputc((int) seq[pos], arguments->outfp);
          }
          /* wrap only after a character was really written; otherwise a
             swallowed wildcard directly after a wrap would produce an empty
             line */
          if (emitted) {
            j++;
            if (arguments->fastawidth > 0 && j % arguments->fastawidth == 0) {
              j = 0;
              gt_file_xprintf(arguments->outfp, "\n");
            }
          }
        }
        /* terminate the last line unless a wrap has just done so; j counts
           the written characters, which may be fewer than len */
        if (arguments->fastawidth == 0 || j != 0)
            gt_file_xprintf(arguments->outfp, "\n");
      }
      if (arguments->revcomp) {
        gt_free(seq);
      }
    }
    if (arguments->showflv) {
      for (j=0;j<gt_str_array_size(files);j++) {
        fprintf(stderr, "file "GT_WU" (%s): "GT_WU"/"GT_WU"\n",
               j,
               gt_str_array_get(files, j),
               (GtUword) flv[j].length,
               (GtUword) flv[j].effectivelength);
      }
    }
    if (arguments->verbose)
    {
      gt_progressbar_stop();
    }
    gt_sequence_buffer_delete(sb);
    gt_seq_iterator_delete(seqit);
  }
  gt_str_array_delete(files);
  gt_free(flv);

  return had_err;
}
Пример #18
0
/*
 * Runner of the seqtranslate tool.
 *
 * Iterates over all sequences of the input files argv[parsed_args..argc-1]
 * and passes each one to gt_seqtranslate_do_translation(); if
 * arguments->reverse is set, the reverse complement is passed as well.
 * Sequences shorter than GT_CODON_LENGTH are skipped with a warning.
 *
 * Returns 0 on success and a non-zero value on failure.
 */
static int gt_seqtranslate_runner(int argc, const char **argv, int parsed_args,
                              void *tool_arguments, GT_UNUSED GtError *err)
{
  GtTranslateArguments *arguments = tool_arguments;
  GtSeqIterator *seqit = NULL;
  GtSequenceBuffer *buffer = NULL;
  GtStrArray *infiles;
  GtStr *translations[3];
  int had_err = 0,
      frame,
      argidx;

  /* one string buffer per reading frame */
  for (frame = 0; frame < 3; frame++)
    translations[frame] = gt_str_new();

  gt_error_check(err);
  gt_assert(arguments);

  /* collect the input file names */
  infiles = gt_str_array_new();
  for (argidx = parsed_args; argidx < argc; argidx++)
    gt_str_array_add_cstr(infiles, argv[argidx]);

  /* set up sequence buffer and iterator */
  buffer = gt_sequence_buffer_new_guess_type(infiles, err);
  if (buffer == NULL) {
    had_err = -1;
  }
  else {
    seqit = gt_seq_iterator_sequence_buffer_new_with_buffer(buffer);
    if (seqit == NULL)
      had_err = -1;
  }

  /* process the sequences one by one */
  while (!had_err) {
    char *desc;
    const GtUchar *sequence;
    GtUword len;
    int rval = gt_seq_iterator_next(seqit, &sequence, &len, &desc, err);

    if (rval == 0)
      break;                /* no more sequences */
    if (rval < 0) {
      had_err = -1;         /* iterator failed */
      break;
    }
    if (len < GT_CODON_LENGTH) {
      gt_warning("sequence '%s' is shorter than codon length of %d, skipping",
                 desc, GT_CODON_LENGTH);
      continue;
    }

    /* forward strand */
    had_err = gt_seqtranslate_do_translation(arguments, (char*) sequence, len,
                                             desc, translations, false, err);
    if (had_err || !arguments->reverse)
      continue;

    /* reverse strand, computed on a private copy of the sequence */
    {
      char *revseq = gt_cstr_dup_nt((char*) sequence, len);
      had_err = gt_reverse_complement(revseq, len, err);
      if (!had_err) {
        had_err = gt_seqtranslate_do_translation(arguments, revseq, len, desc,
                                                 translations, true, err);
      }
      gt_free(revseq);
    }
  }

  for (frame = 0; frame < 3; frame++)
    gt_str_delete(translations[frame]);
  gt_str_array_delete(infiles);
  gt_seq_iterator_delete(seqit);
  gt_sequence_buffer_delete(buffer);
  return had_err;
}
Пример #19
0
/*
 * Runner of the sequniq tool.
 *
 * Adds every sequence of the input files argv[parsed_args..argc-1] to an MD5
 * set. Sequences the set has not seen before are written in FASTA format to
 * arguments->outfp, all others are counted as duplicates. Depending on
 * arguments->seqit the input is read via GtBioseq objects (one per file) or
 * via a sequence iterator over all files. On success a statistics line is
 * printed to stderr.
 *
 * Returns 0 on success and -1 on failure (with <err> set).
 */
static int gt_sequniq_runner(int argc, const char **argv, int parsed_args,
                             void *tool_arguments, GtError *err)
{
  GtSequniqArguments *arguments = tool_arguments;
  GtUint64 duplicates = 0, num_of_sequences = 0;
  int i, had_err = 0;
  GtMD5Set *md5set;

  gt_error_check(err);
  gt_assert(arguments);
  md5set = gt_md5set_new(arguments->nofseqs);
  if (!arguments->seqit) {
    GtUword j;
    GtBioseq *bs;

    for (i = parsed_args; !had_err && i < argc; i++) {
      if (!(bs = gt_bioseq_new(argv[i], err)))
        had_err = -1;
      if (!had_err) {
        GtMD5SetStatus retval;
        for (j = 0; j < gt_bioseq_number_of_sequences(bs) && !had_err; j++) {
          char *seq = gt_bioseq_get_sequence(bs, j);
          retval = gt_md5set_add_sequence(md5set, seq,
                                          gt_bioseq_get_sequence_length(bs, j),
                                          arguments->rev, err);
          if (retval == GT_MD5SET_NOT_FOUND)
            gt_fasta_show_entry(gt_bioseq_get_description(bs, j), seq,
                                gt_bioseq_get_sequence_length(bs, j),
                                arguments->width, arguments->outfp);
          else if (retval != GT_MD5SET_ERROR)
            duplicates++;
          else
            had_err = -1;
          num_of_sequences++;
          gt_free(seq);
        }
        gt_bioseq_delete(bs);
      }
    }
  }
  else {
    GtSeqIterator *seqit;
    GtStrArray *files;
    off_t totalsize;
    const GtUchar *sequence;
    char *desc;
    GtUword len;

    files = gt_str_array_new();
    for (i = parsed_args; i < argc; i++)
      gt_str_array_add_cstr(files, argv[i]);
    totalsize = gt_files_estimate_total_size(files);
    seqit = gt_seq_iterator_sequence_buffer_new(files, err);
    if (!seqit)
      had_err = -1;
    if (!had_err) {
      if (arguments->verbose) {
        gt_progressbar_start(gt_seq_iterator_getcurrentcounter(seqit,
                                                          (GtUint64) totalsize),
                             (GtUint64) totalsize);
      }
      while (!had_err) {
        GtMD5SetStatus retval;
        int rval = gt_seq_iterator_next(seqit, &sequence, &len, &desc, err);
        if (rval < 0) {
          /* the iterator failed: report the error instead of treating it
             like the regular end of the input */
          had_err = -1;
          break;
        }
        if (rval != 1)
          break;

        retval = gt_md5set_add_sequence(md5set, (const char*) sequence, len,
                                        arguments->rev, err);
        if (retval == GT_MD5SET_NOT_FOUND)
          gt_fasta_show_entry(desc, (const char*) sequence, len,
                              arguments->width, arguments->outfp);
        else if (retval != GT_MD5SET_ERROR)
          duplicates++;
        else
          had_err = -1;
        num_of_sequences++;
      }
      if (arguments->verbose)
        gt_progressbar_stop();
      gt_seq_iterator_delete(seqit);
    }
    gt_str_array_delete(files);
  }

  /* show statistics */
  if (!had_err) {
    /* avoid 0/0 (printed as NaN) if the input contained no sequence */
    double removed_percent = 0.0;
    if (num_of_sequences > 0) {
      removed_percent =
        ((double) duplicates / (double) num_of_sequences) * 100.0;
    }
    fprintf(stderr,
            "# "GT_WU" out of "GT_WU" sequences have been removed (%.3f%%)\n",
            (GtUword)duplicates, (GtUword)num_of_sequences,
            removed_percent);
  }

  gt_md5set_delete(md5set);
  return had_err;
}