Exemplo n.º 1
0
/* Runner of the LTRdigest tool.
 *
 * 1. Determines the sequence source: either a region mapping built from the
 *    seqid2file options or, as a legacy fallback, an encoded sequence whose
 *    index name is the last command line argument (which is then clipped off
 *    <argv>).
 * 2. Evaluates the tRNA library and HMM options to decide which analyses run.
 * 3. Chains the node streams
 *      GFF3 in -> [pdom] -> [PBS] -> PPT -> strand assignment
 *              -> [tabular out] -> GFF3 out
 *    and pulls all nodes through the chain.
 *
 * <argv>/<parsed_args>: command line and index of the first non-option
 *                       argument (the GFF3 input file).
 * <tool_arguments>:     the parsed GtLTRdigestOptions.
 * <err>:                receives the error message on failure.
 *
 * Returns 0 on success; -1 for failures detected in this function, otherwise
 * the status reported by the called stream functions. */
static int gt_ltrdigest_runner(GT_UNUSED int argc, const char **argv,
                               int parsed_args, void *tool_arguments,
                               GtError *err)
{
  GtLTRdigestOptions *arguments = tool_arguments;
  GtNodeStream *gff3_in_stream  = NULL,
               *gff3_out_stream = NULL,
               *pdom_stream     = NULL,
               *ppt_stream      = NULL,
               *pbs_stream      = NULL,
               *tab_out_stream  = NULL,
               *sa_stream       = NULL,
               *last_stream     = NULL;
  int had_err      = 0,
      tests_to_run = 0,
      arg = parsed_args;
  GtRegionMapping *rmap = NULL;
  GtPdomModelSet *ms = NULL;
  gt_error_check(err);
  gt_assert(arguments);

  /* determine and open sequence source */
  if (gt_seqid2file_option_used(arguments->s2fi)) {
    /* create region mapping */
    rmap = gt_seqid2file_region_mapping_new(arguments->s2fi, err);
    if (!rmap)
      had_err = -1;
  } else {
    GtEncseqLoader *el;
    GtEncseq *encseq;
    /* no new-style sequence source option given, fall back to legacy syntax */
    if (argc < 3) {
      gt_error_set(err, "missing mandatory argument(s)");
      had_err = -1;
    }
    if (!had_err) {
      el = gt_encseq_loader_new();
      gt_encseq_loader_disable_autosupport(el);
      gt_encseq_loader_require_md5_support(el);
      gt_encseq_loader_require_description_support(el);
      encseq = gt_encseq_loader_load(el, argv[argc-1], err);
      /* XXX: clip off terminal argument */
      gt_free((char*) argv[argc-1]);
      argv[argc-1] = NULL;
      argc--;
      gt_encseq_loader_delete(el);
      if (!encseq)
        had_err = -1;
      else {
        /* the encseq handle is released right after the mapping is built */
        rmap = gt_region_mapping_new_encseq_seqno(encseq);
        gt_encseq_delete(encseq);
      }
    }
  }
  gt_assert(had_err || rmap);

  /* Always search for PPT. */
  tests_to_run |= GT_LTRDIGEST_RUN_PPT;

  /* Open tRNA library if given. */
  if (!had_err && arguments->trna_lib
        && gt_str_length(arguments->trna_lib) > 0)
  {
    tests_to_run |= GT_LTRDIGEST_RUN_PBS;
    arguments->trna_lib_bs = gt_bioseq_new(gt_str_get(arguments->trna_lib),
                                           err);
    if (gt_error_is_set(err))
      had_err = -1;
  }

  /* Set HMMER cutoffs. */
  if (!had_err && gt_str_array_size(arguments->hmm_files) > 0)
  {
    tests_to_run |= GT_LTRDIGEST_RUN_PDOM;
    if (!strcmp(gt_str_get(arguments->cutoffs), "GA")) {
      arguments->cutoff = GT_PHMM_CUTOFF_GA;
    } else if (!strcmp(gt_str_get(arguments->cutoffs), "TC")) {
      arguments->cutoff = GT_PHMM_CUTOFF_TC;
    } else if (!strcmp(gt_str_get(arguments->cutoffs), "NONE")) {
      arguments->cutoff = GT_PHMM_CUTOFF_NONE;
    } else {
      gt_error_set(err, "invalid cutoff setting!");
      had_err = -1;
    }
  }

  /* head of the stream chain: GFF3 input */
  if (!had_err) {
    last_stream = gff3_in_stream  = gt_gff3_in_stream_new_sorted(argv[arg]);
  }

  /* optional protein domain search */
  if (!had_err && gt_str_array_size(arguments->hmm_files) > 0) {
    GtNodeVisitor *pdom_v;
    ms = gt_pdom_model_set_new(arguments->hmm_files, err);
    if (ms != NULL) {
      pdom_v = gt_ltrdigest_pdom_visitor_new(ms, arguments->evalue_cutoff,
                                             arguments->chain_max_gap_length,
                                             arguments->cutoff, rmap, err);
      if (pdom_v == NULL)
        had_err = -1;
      if (!had_err) {
        if (arguments->output_all_chains)
          gt_ltrdigest_pdom_visitor_output_all_chains((GtLTRdigestPdomVisitor*)
                                                                        pdom_v);
        last_stream = pdom_stream = gt_visitor_stream_new(last_stream, pdom_v);
      }
    } else had_err = -1;
  }

  /* optional primer binding site search (only if a tRNA library was loaded) */
  if (!had_err && arguments->trna_lib_bs) {
    GtNodeVisitor *pbs_v;
    pbs_v = gt_ltrdigest_pbs_visitor_new(rmap, arguments->pbs_radius,
                                         arguments->max_edist,
                                         arguments->alilen,
                                         arguments->offsetlen,
                                         arguments->trnaoffsetlen,
                                         arguments->ali_score_match,
                                         arguments->ali_score_mismatch,
                                         arguments->ali_score_insertion,
                                         arguments->ali_score_deletion,
                                         arguments->trna_lib_bs, err);
    if (pbs_v != NULL)
      last_stream = pbs_stream = gt_visitor_stream_new(last_stream, pbs_v);
    else
      had_err = -1;
  }

  /* polypurine tract search */
  if (!had_err) {
    GtNodeVisitor *ppt_v;
    ppt_v = gt_ltrdigest_ppt_visitor_new(rmap, arguments->ppt_len,
                                         arguments->ubox_len,
                                         arguments->ppt_pyrimidine_prob,
                                         arguments->ppt_purine_prob,
                                         arguments->bkg_a_prob,
                                         arguments->bkg_g_prob,
                                         arguments->bkg_t_prob,
                                         arguments->bkg_c_prob,
                                         arguments->ubox_u_prob,
                                         arguments->ppt_radius,
                                         arguments->max_ubox_dist, err);
    if (ppt_v != NULL)
      last_stream = ppt_stream = gt_visitor_stream_new(last_stream, ppt_v);
    else
      had_err = -1;
  }

  /* strand assignment */
  if (!had_err) {
    GtNodeVisitor *sa_v;
    sa_v = gt_ltrdigest_strand_assign_visitor_new();
    gt_assert(sa_v);
    last_stream = sa_stream = gt_visitor_stream_new(last_stream, sa_v);
  }

  /* attach tabular output stream, if requested */
  if (!had_err && gt_str_length(arguments->prefix) > 0)
  {
    last_stream = tab_out_stream = gt_ltrdigest_file_out_stream_new(
                                                last_stream,
                                                tests_to_run,
                                                rmap,
                                                gt_str_get(arguments->prefix),
                                                arguments->seqnamelen,
                                                err);
    if (!tab_out_stream)
      had_err = -1;
    if (!had_err && arguments->print_metadata)
    {
      had_err = gt_ltrdigest_file_out_stream_write_metadata(
                                         (GtLTRdigestFileOutStream*)
                                                               tab_out_stream,
                                         tests_to_run,
                                         gt_str_get(arguments->trna_lib),
                                         argv[arg],
                                         arguments->ppt_len,
                                         arguments->ubox_len,
                                         arguments->ppt_radius,
                                         arguments->alilen,
                                         arguments->max_edist,
                                         arguments->offsetlen,
                                         arguments->trnaoffsetlen,
                                         arguments->pbs_radius,
                                         arguments->hmm_files,
                                         arguments->chain_max_gap_length,
                                         arguments->evalue_cutoff,
                                         err);
    }
    if (!had_err)
    {
      if (arguments->write_alignments)
        gt_ltrdigest_file_out_stream_enable_pdom_alignment_output(
                                                              tab_out_stream);
      if (arguments->write_aaseqs)
        gt_ltrdigest_file_out_stream_enable_aa_sequence_output(
                                                              tab_out_stream);
    }
  }

  /* Only finish the chain and run it if everything above succeeded.
     Otherwise <last_stream> may be NULL (failed tabular stream) and a
     previously recorded error would be overwritten by the pull result. */
  if (!had_err)
  {
    last_stream = gff3_out_stream = gt_gff3_out_stream_new(last_stream,
                                                           arguments->outfp);

    /* pull the features through the stream and free them afterwards */
    had_err = gt_node_stream_pull(last_stream, err);
  }

  /* release everything that was created above; unused handles are NULL */
  gt_pdom_model_set_delete(ms);
  gt_node_stream_delete(gff3_out_stream);
  gt_node_stream_delete(ppt_stream);
  gt_node_stream_delete(pbs_stream);
  gt_node_stream_delete(sa_stream);
  gt_node_stream_delete(pdom_stream);
  gt_node_stream_delete(tab_out_stream);
  gt_node_stream_delete(gff3_in_stream);
  gt_bioseq_delete(arguments->trna_lib_bs);
  gt_region_mapping_delete(rmap);

  return had_err;
}
Exemplo n.º 2
0
/* Parses the command line of the pairwise comparison tool into <pw>.
 *
 * Exactly one of the input options must be used:
 *   -ss  two strings
 *   -ff  two filenames, or the keyword "fasta" followed by two filenames
 *   -a   character list and a length >= 1
 *   -t   text
 * -e and -p imply -ss. No arguments may remain after the options.
 *
 * <parsed_args> receives the number of parsed arguments; <err> receives the
 * message on failure. Returns the status of the option parser, or
 * GT_OPTION_PARSER_ERROR if one of the additional checks fails. */
static GtOPrval parse_options(int *parsed_args,
                              Cmppairwiseopt *pw,
                              int argc, const char **argv, GtError *err)
{
  GtOptionParser *op;
  GtOption *optionstrings,
         *optionfiles,
         *optioncharlistlen,
         *optiontext,
         *optionshowedist,
         *optionprint;
  GtStrArray *charlistlen;
  GtOPrval oprval;

  gt_error_check(err);
  charlistlen = gt_str_array_new();
  pw->strings = gt_str_array_new();
  pw->files = gt_str_array_new();
  pw->text = gt_str_new();
  pw->charlistlen = NULL;
  pw->fastasequences0 = NULL;
  pw->fastasequences1 = NULL;
  pw->showedist = false;
  pw->print = false;
  pw->fasta = false;
  op = gt_option_parser_new("options", "Apply function to pairs of strings.");
  gt_option_parser_set_mail_address(op, "<*****@*****.**>");

  optionstrings = gt_option_new_string_array("ss", "use two strings",
                                             pw->strings);
  gt_option_parser_add_option(op, optionstrings);

  optionfiles = gt_option_new_filename_array("ff", "use two files",
                                             pw->files);
  gt_option_parser_add_option(op, optionfiles);

  optioncharlistlen = gt_option_new_string_array("a",
                                             "use character list and length",
                                             charlistlen);
  gt_option_parser_add_option(op, optioncharlistlen);

  optiontext = gt_option_new_string("t", "use text", pw->text, NULL);
  gt_option_parser_add_option(op, optiontext);

  optionshowedist = gt_option_new_bool("e", "output unit edit distance",
                      &pw->showedist, false);
  gt_option_parser_add_option(op, optionshowedist);

  optionprint = gt_option_new_bool("p", "print edist alignment",
                      &pw->print, false);
  gt_option_parser_add_option(op, optionprint);

  /* the four input options are mutually exclusive */
  gt_option_exclude(optionstrings, optionfiles);
  gt_option_exclude(optionstrings, optioncharlistlen);
  gt_option_exclude(optionstrings, optiontext);
  gt_option_exclude(optionfiles, optioncharlistlen);
  gt_option_exclude(optionfiles, optiontext);
  gt_option_exclude(optioncharlistlen, optiontext);
  gt_option_imply(optionshowedist, optionstrings);
  gt_option_imply(optionprint, optionstrings);

  oprval = gt_option_parser_parse(op, parsed_args, argc, argv, gt_versionfunc,
                                  err);
  if (oprval == GT_OPTION_PARSER_OK)
  {
    if (gt_option_is_set(optionstrings))
    {
      if (gt_str_array_size(pw->strings) != 2UL)
      {
        gt_error_set(err, "option -ss requires two string arguments");
        oprval = GT_OPTION_PARSER_ERROR;
      }
    } else if (gt_option_is_set(optionfiles))
    {
      if (gt_str_array_size(pw->files) != 2UL)
      {
        /* the only other accepted form is: fasta <file1> <file2> */
        if (gt_str_array_size(pw->files) == 3UL &&
            !strcmp(gt_str_array_get(pw->files,0),"fasta"))
        {
          pw->fasta = true;
        }
        if (!pw->fasta)
        {
          gt_error_set(err, "option -ff requires two filename arguments or "
                            "keyword fasta and two filename arguments in "
                            "FASTA format");
          oprval = GT_OPTION_PARSER_ERROR;
        }
      }
    } else if (gt_option_is_set(optioncharlistlen))
    {
      /* initialized so that a failed conversion never leaves it
         indeterminate */
      GtWord readint = 0;
      if (gt_str_array_size(charlistlen) != 2UL)
      {
        gt_error_set(err,
                     "option -a requires charlist and length argument");
        oprval = GT_OPTION_PARSER_ERROR;
      } else
      {
        pw->charlistlen = gt_malloc(sizeof *pw->charlistlen);
        pw->charlistlen->charlist =
          gt_str_ref(gt_str_array_get_str(charlistlen, 0));
        if (sscanf(gt_str_array_get(charlistlen,1UL), GT_WD, &readint) != 1
            || readint < 1L)
        {
          gt_error_set(err,
                       "option -a requires charlist and length argument");
          oprval = GT_OPTION_PARSER_ERROR;
          /* keep the field defined even though parsing failed */
          pw->charlistlen->len = 0;
        } else
        {
          pw->charlistlen->len = (GtUword) readint;
        }
      }
    } else if (!gt_option_is_set(optiontext))
    {
      gt_error_set(err,
                   "use exactly one of the options -ss, -ff, -a, -t");
      oprval = GT_OPTION_PARSER_ERROR;
    }
  }
  gt_option_parser_delete(op);
  if (oprval == GT_OPTION_PARSER_OK && *parsed_args != argc)
  {
    gt_error_set(err, "superfluous program parameters");
    oprval = GT_OPTION_PARSER_ERROR;
  }
  gt_str_array_delete(charlistlen);
  return oprval;
}
Exemplo n.º 3
0
/* Reads the key/value lines of an FM-index description from <fpin> and
 * stores the values in <fmindex>, <specialcharinfo> and <storeindexpos>.
 *
 * Every expected key is registered first, then each line of <fpin> is handed
 * to gt_scannedprjkey_analyze(). Afterwards it is verified that all keys were
 * defined and that "storeindexpos" is either 0 or 1.
 *
 * Returns 0 on success and -1 on failure (<err> is set by the failing check
 * or, for an illegal storeindexpos value, here). */
static int scanfmafileviafileptr(Fmindex *fmindex,
                                 GtSpecialcharinfo *specialcharinfo,
                                 bool *storeindexpos,
                                 const char *indexname,
                                 FILE *fpin,
                                 GtLogger *logger,
                                 GtError *err)
{
  bool haserr = false;
  /* NOTE(review): GT_SCANNEDPRJKEY_ADD presumably refers to this variable by
     name - keep the identifier unchanged */
  GtScannedprjkeytable *scannedprjkeytable;
  unsigned int intstoreindexpos;
  unsigned int lineno = 0;
  GtStr *line;

  gt_error_check(err);

  /* register all keys that have to appear in the file */
  scannedprjkeytable = gt_scannedprjkeytable_new();
  GT_SCANNEDPRJKEY_ADD("bwtlength",&fmindex->bwtlength,NULL);
  GT_SCANNEDPRJKEY_ADD("longest",&fmindex->longestsuffixpos,NULL);
  GT_SCANNEDPRJKEY_ADD("storeindexpos",&intstoreindexpos,NULL);
  GT_SCANNEDPRJKEY_ADD("log2blocksize",&fmindex->log2bsize,NULL);
  GT_SCANNEDPRJKEY_ADD("log2markdist",&fmindex->log2markdist,NULL);
  GT_SCANNEDPRJKEY_ADD("specialcharacters",
                       &specialcharinfo->specialcharacters,NULL);
  GT_SCANNEDPRJKEY_ADD("specialranges",&specialcharinfo->specialranges,NULL);
  GT_SCANNEDPRJKEY_ADD("realspecialranges",&specialcharinfo->realspecialranges,
                       NULL);
  GT_SCANNEDPRJKEY_ADD("lengthofspecialprefix",
                       &specialcharinfo->lengthofspecialprefix,NULL);
  GT_SCANNEDPRJKEY_ADD("lengthofspecialsuffix",
                       &specialcharinfo->lengthofspecialsuffix,NULL);
  GT_SCANNEDPRJKEY_ADD("wildcards",&specialcharinfo->wildcards,NULL);
  GT_SCANNEDPRJKEY_ADD("wildcardranges",&specialcharinfo->wildcardranges,NULL);
  GT_SCANNEDPRJKEY_ADD("realwildcardranges",
                       &specialcharinfo->realwildcardranges,NULL);
  GT_SCANNEDPRJKEY_ADD("lengthofwildcardprefix",
                       &specialcharinfo->lengthofwildcardprefix,NULL);
  GT_SCANNEDPRJKEY_ADD("lengthofwildcardsuffix",
                       &specialcharinfo->lengthofwildcardsuffix,NULL);
  GT_SCANNEDPRJKEY_ADD("suffixlength",&fmindex->suffixlength,NULL);

  /* analyze the file line by line; stop at the first bad line */
  line = gt_str_new();
  while (!haserr && gt_str_read_next_line(line, fpin) != EOF)
  {
    if (gt_scannedprjkey_analyze(indexname,
                                 FMASCIIFILESUFFIX,
                                 lineno,
                                 gt_str_get(line),
                                 gt_str_length(line),
                                 scannedprjkeytable,
                                 err) != 0)
    {
      haserr = true;
    } else
    {
      gt_str_reset(line);
      lineno++;
    }
  }
  gt_str_delete(line);

  if (!haserr && gt_scannedprjkey_allkeysdefined(indexname,FMASCIIFILESUFFIX,
                                                 scannedprjkeytable,
                                                 logger,err) != 0)
  {
    haserr = true;
  }

  /* translate the numeric flag into a boolean */
  if (!haserr)
  {
    switch (intstoreindexpos)
    {
      case 1U:
        *storeindexpos = true;
        break;
      case 0U:
        *storeindexpos = false;
        break;
      default:
        gt_error_set(err,"illegal value in line matching \"storeindexpos=\"");
        haserr = true;
        break;
    }
  }
  gt_scannedprjkeytable_delete(scannedprjkeytable);
  return haserr ? -1 : 0;
}
Exemplo n.º 4
0
/* Opens <path> with the given <mode>. The file mode passed to gt_file_open()
   is the one gt_file_mode_determine() reports for <path>. <mode> must not be
   NULL. Returns whatever gt_file_open() returns; <err> is handed through. */
GtFile* gt_file_new(const char *path, const char *mode, GtError *err)
{
  GtFile *file;

  gt_error_check(err);
  gt_assert(mode);
  file = gt_file_open(gt_file_mode_determine(path), path, mode, err);
  return file;
}
/* Unit test for the encseq-based codon iterator.
 *
 * Builds an encoded sequence from a fixed DNA string, runs the single-test
 * helper once per read mode (forward, complement, reverse complement,
 * reverse) against the matching expected string, and finally checks that
 * iterators over ranges shorter than one codon (lengths 0..2) never yield a
 * codon. Returns 0 if all checks pass, a non-zero value otherwise. */
int gt_codon_iterator_encseq_unit_test(GtError *err)
{
  /* the DNA under test and its expected appearance in each read mode */
  const char *seq_fwd     = "gctgatcgactgaacatagctagcacggccgcgcgatcgtacgatg";
  const char *seq_revcomp = "catcgtacgatcgcgcggccgtgctagctatgttcagtcgatcagc";
  const char *seq_rev     = "gtagcatgctagcgcgccggcacgatcgatacaagtcagctagtcg";
  const char *seq_comp    = "cgactagctgacttgtatcgatcgtgccggcgcgctagcatgctac";
  int had_err = 0;
  int i;
  int short_len = 0;
  GtAlphabet *alpha;
  GtEncseqBuilder *eb;
  GtEncseq *encseq;
  GtCodonIterator *iter;
  char c1, c2, c3;
  unsigned int fr;
  gt_error_check(err);

  /* set up the encoded sequence */
  alpha = gt_alphabet_new_dna();
  gt_ensure(had_err, alpha != NULL);
  eb = gt_encseq_builder_new(alpha);
  gt_ensure(had_err, eb != NULL);
  gt_encseq_builder_add_cstr(eb, seq_fwd, strlen(seq_fwd), "foo");
  encseq = gt_encseq_builder_build(eb, NULL);
  gt_ensure(had_err, encseq != NULL);

  /* forward tests */
  if (!had_err)
    had_err = gt_codon_iterator_encseq_single_test(encseq, seq_fwd, seq_fwd,
                                                   GT_READMODE_FORWARD, err);

  /* complement tests */
  if (!had_err)
    had_err = gt_codon_iterator_encseq_single_test(encseq, seq_fwd, seq_comp,
                                                   GT_READMODE_COMPL, err);

  /* revcompl tests */
  if (!had_err)
    had_err = gt_codon_iterator_encseq_single_test(encseq, seq_fwd,
                                                   seq_revcomp,
                                                   GT_READMODE_REVCOMPL, err);

  /* reverse tests */
  if (!had_err)
    had_err = gt_codon_iterator_encseq_single_test(encseq, seq_fwd, seq_rev,
                                                   GT_READMODE_REVERSE, err);

  /* lengths < 3 */
  while (!had_err && short_len < 3) {
    iter = gt_codon_iterator_encseq_new_with_readmode(encseq, 10, short_len,
                                                      GT_READMODE_REVCOMPL,
                                                      NULL);
    i = 10;
    /* any codon delivered here is a failure */
    while (!(gt_codon_iterator_next(iter, &c1, &c2, &c3, &fr, NULL))) {
      gt_ensure(had_err, false);
    }
    gt_ensure(had_err, i == 10);
    gt_codon_iterator_delete(iter);
    short_len++;
  }

  gt_encseq_delete(encseq);
  gt_encseq_builder_delete(eb);
  gt_alphabet_delete(alpha);
  return had_err;
}
/* Appends the sequence of feature <gn> to <sequence>.
 *
 * With <join> set, the direct children of <gn> are traversed and handed to
 * extract_join_feature() together with <type>; the collected sequence is
 * reverse-complemented if that helper reports the reverse strand. Without
 * <join>, the sequence of <gn> itself is fetched via <region_mapping>, but
 * only if its type is <type>, and reverse-complemented for reverse-strand
 * features.
 *
 * Optional outputs (each may be NULL):
 *   <seqid>            the sequence ID of <gn> is appended
 *   <target_ids>       IDs parsed from the Target attribute of <gn>; in join
 *                      mode they are replaced by those of the first child if
 *                      that child carries a Target attribute
 *   <out_phase_offset> the phase found, unless it equals GT_PHASE_UNDEFINED
 *
 * Returns 0 on success, a non-zero value otherwise (see <err>). */
static int gt_extract_feature_sequence_generic(GtStr *sequence,
                                GtGenomeNode *gn,
                                const char *type, bool join, GtStr *seqid,
                                GtStrArray *target_ids,
                                unsigned int *out_phase_offset,
                                GtRegionMapping *region_mapping, GtError *err)
{
  GtFeatureNode *fn;
  GtRange range;
  unsigned int phase_offset = 0;
  char *raw_seq;
  const char *target_attr;
  int had_err = 0;

  gt_error_check(err);
  fn = gt_genome_node_cast(gt_feature_node_class(), gn);
  gt_assert(fn);

  if (seqid)
    gt_str_append_str(seqid, gt_genome_node_get_seqid(gn));

  /* targets of the feature itself */
  if (target_ids) {
    target_attr = gt_feature_node_get_attribute(fn, GT_GFF_TARGET);
    if (target_attr) {
      had_err = gt_gff3_parser_parse_all_target_attributes(target_attr, false,
                                                           target_ids, NULL,
                                                           NULL, "", 0, err);
    }
  }

  if (!had_err && join) {
    /* in this case we have to traverse the children */
    GtFeatureNodeIterator *iter;
    GtFeatureNode *kid;
    bool on_reverse_strand = false,
         at_first_kid = true,
         type_seen = false;
    GtPhase kid_phase = GT_PHASE_UNDEFINED;

    iter = gt_feature_node_iterator_new_direct(gt_feature_node_cast(gn));
    while (!had_err && (kid = gt_feature_node_iterator_next(iter))) {
      if (at_first_kid) {
        at_first_kid = false;
        /* targets of the first child take precedence over the parent's */
        if (target_ids) {
          target_attr = gt_feature_node_get_attribute(kid, GT_GFF_TARGET);
          if (target_attr) {
            gt_str_array_reset(target_ids);
            had_err = gt_gff3_parser_parse_all_target_attributes(target_attr,
                                                                 false,
                                                                 target_ids,
                                                                 NULL,
                                                                 NULL, "", 0,
                                                                 err);
          }
        }
      }
      if (had_err)
        continue; /* the loop condition terminates the traversal */
      if (extract_join_feature((GtGenomeNode*) kid, type, region_mapping,
                               sequence, &on_reverse_strand, &type_seen,
                               &kid_phase, err)) {
        had_err = -1;
      }
      if (kid_phase != GT_PHASE_UNDEFINED)
        phase_offset = (unsigned int) kid_phase;
    }
    gt_feature_node_iterator_delete(iter);
    gt_assert(phase_offset <= (unsigned int) GT_PHASE_UNDEFINED);
    if (!had_err && gt_str_length(sequence) && on_reverse_strand) {
      had_err = gt_reverse_complement(gt_str_get(sequence),
                                      gt_str_length(sequence), err);
    }
  }
  else if (!had_err && gt_feature_node_get_type(fn) == type) {
    /* otherwise we only have to look at this feature */
    GtPhase own_phase = gt_feature_node_get_phase(fn);
    gt_assert(!had_err);
    if (own_phase != GT_PHASE_UNDEFINED)
      phase_offset = (unsigned int) own_phase;
    range = gt_genome_node_get_range(gn);
    gt_assert(range.start); /* 1-based coordinates */
    had_err = gt_region_mapping_get_sequence(region_mapping, &raw_seq,
                                             gt_genome_node_get_seqid(gn),
                                             range.start, range.end, err);
    if (!had_err) {
      gt_str_append_cstr_nt(sequence, raw_seq, gt_range_length(&range));
      gt_free(raw_seq);
      if (gt_feature_node_get_strand(fn) == GT_STRAND_REVERSE) {
        had_err = gt_reverse_complement(gt_str_get(sequence),
                                        gt_str_length(sequence), err);
      }
    }
  }

  if (out_phase_offset && phase_offset != GT_PHASE_UNDEFINED) {
    *out_phase_offset = phase_offset;
  }
  return had_err;
}
Exemplo n.º 7
0
/* Hashmap iteration callback which builds one gene feature.
 *
 * <key>   the gene ID (used to look up an optional gene name)
 * <value> hashmap of the transcripts of this gene
 * <data>  the ConstructionInfo shared by the whole construction
 *
 * All mRNAs of the gene are constructed via construct_mRNAs(). The gene
 * spans the joined range of its mRNAs, carries their joined strand, becomes
 * their parent and is appended to the queue of genome nodes.
 *
 * Returns 0 on success, otherwise the error status reported by the mRNA
 * construction (see <err>). */
static int construct_genes(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  GtHashmap *transcript_id_hash = (GtHashmap*) value;
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtQueue *genome_nodes = cinfo->genome_nodes;
  const char *gname;
  GtArray *mRNAs = gt_array_new(sizeof (GtGenomeNode*));
  GtGenomeNode *gene_node, *gn;
  GtStrand gene_strand;
  GtRange gene_range;
  GtStr *gene_seqid;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);
  /* construct_mRNAs() collects its results in this array */
  cinfo->mRNAs = mRNAs;
  had_err = gt_hashmap_foreach(transcript_id_hash, construct_mRNAs, cinfo, err);
  if (!had_err) {
    gt_assert(gt_array_size(mRNAs)); /* at least one mRNA constructed */

    /* determine the range and the strand of the gene */
    gn = *(GtGenomeNode**) gt_array_get(mRNAs, 0);
    gene_range = gt_genome_node_get_range(gn);
    gene_strand = gt_feature_node_get_strand((GtFeatureNode*) gn);
    gene_seqid = gt_genome_node_get_seqid(gn);
    for (i = 1; i < gt_array_size(mRNAs); i++) {
      GtRange range;
      gn = *(GtGenomeNode**) gt_array_get(mRNAs, i);
      range = gt_genome_node_get_range(gn);
      gene_range = gt_range_join(&gene_range, &range);
      gene_strand = gt_strand_join(gene_strand,
                          gt_feature_node_get_strand((GtFeatureNode*) gn));
      /* all mRNAs of one gene have to be on the same sequence */
      gt_assert(gt_str_cmp(gene_seqid, gt_genome_node_get_seqid(gn)) == 0);
    }

    gene_node = gt_feature_node_new(gene_seqid, gt_ft_gene, gene_range.start,
                                    gene_range.end, gene_strand);

    /* attach the gene name, if one is known for this gene ID */
    if ((gname = gt_hashmap_get(cinfo->gene_id_to_name_mapping,
                              (const char*) key))) {
      gt_feature_node_add_attribute((GtFeatureNode*) gene_node, GT_GFF_NAME,
                                      gname);
    }

    /* register children */
    for (i = 0; i < gt_array_size(mRNAs); i++) {
      gn = *(GtGenomeNode**) gt_array_get(mRNAs, i);
      gt_feature_node_add_child((GtFeatureNode*) gene_node,
                                (GtFeatureNode*) gn);
    }

    /* store the gene */
    gt_queue_add(genome_nodes, gene_node);
  }

  /* Free the array on every path; it used to leak when the mRNA construction
     failed.
     NOTE(review): mRNA nodes already stored in the array on the error path
     are not released here - confirm who owns them in that case. */
  gt_array_delete(mRNAs);

  return had_err;
}
Exemplo n.º 8
0
/* Runner of the sketch_page tool.
 *
 * Loads the style (the given style file or the default one from the gtdata
 * directory), reads the GFF3 file argv[parsed_args+1] into a feature index,
 * determines the sequence region to draw (the -seqid argument or the first
 * one in the file) and renders it in chunks of <arguments->width> positions
 * onto pages of a PDF or PostScript file named argv[parsed_args]. A new page
 * is started whenever the next chunk would not fit onto the current one.
 *
 * Returns 0 on success and a non-zero value on failure (see <err>). All
 * resources acquired here are released on every path. */
static int gt_sketch_page_runner(GT_UNUSED int argc,
                                 const char **argv,
                                 int parsed_args,
                                 void *tool_arguments,
                                 GtError *err)
{
  SketchPageArguments *arguments = tool_arguments;
  int had_err = 0;
  GtFeatureIndex *features = NULL;
  GtRange qry_range, sequence_region_range;
  GtStyle *sty = NULL;
  GtStr *prog, *gt_style_file = NULL;
  GtDiagram *d = NULL;
  GtLayout *l = NULL;
  GtBioseq *bioseq = NULL;
  GtCanvas *canvas = NULL;
  char *seqid = NULL;
  const char *outfile = NULL;
  GtUword start, height, num_pages = 0;
  double offsetpos, usable_height;
  cairo_surface_t *surf = NULL;
  cairo_t *cr = NULL;
  bool has_seqid = false,
       seqid_owned = false; /* true if <seqid> has to be freed here */
  GtTextWidthCalculator *twc;
  gt_error_check(err);

  features = gt_feature_index_memory_new();

  if (cairo_version() < CAIRO_VERSION_ENCODE(1, 8, 6))
    gt_warning("Your cairo library (version %s) is older than version 1.8.6! "
               "These versions contain a bug which may result in "
               "corrupted PDF output!", cairo_version_string());

  /* get style */
  sty = gt_style_new(err);
  if (!sty)
    had_err = -1;
  if (!had_err)
  {
    if (gt_str_length(arguments->stylefile) == 0)
    {
      /* no style file given: use the default one from the gtdata directory */
      prog = gt_str_new();
      gt_str_append_cstr_nt(prog, argv[0],
                            gt_cstr_length_up_to_char(argv[0], ' '));
      gt_style_file = gt_get_gtdata_path(gt_str_get(prog), err);
      gt_str_delete(prog);
      if (gt_style_file)
        gt_str_append_cstr(gt_style_file, "/sketch/default.style");
      else
        had_err = -1;
    }
    else
    {
      gt_style_file = gt_str_ref(arguments->stylefile);
    }
  }
  if (!had_err)
    had_err = gt_style_load_file(sty, gt_str_get(gt_style_file), err);

  outfile = argv[parsed_args];
  if (!had_err)
  {
    /* get features */
    had_err = gt_feature_index_add_gff3file(features, argv[parsed_args+1], err);
  }
  if (!had_err)
  {
    if (gt_str_length(arguments->seqid) == 0) {
      seqid = gt_feature_index_get_first_seqid(features, err);
      if (seqid == NULL)
      {
        gt_error_set(err, "GFF input file must contain a sequence region!");
        had_err = -1;
      }
      else
        seqid_owned = true;
    }
    else
    {
      /* the index has to be queried after the file was added, otherwise no
         sequence region can ever be found */
      had_err = gt_feature_index_has_seqid(features, &has_seqid,
                                           gt_str_get(arguments->seqid), err);
      if (!had_err && !has_seqid)
      {
        gt_error_set(err,
                     "sequence region '%s' does not exist in GFF input file",
                     gt_str_get(arguments->seqid));
        had_err = -1;
      }
      else if (!had_err)
        seqid = gt_str_get(arguments->seqid); /* owned by <arguments> */
    }
  }

  /* set text */
  if (gt_str_length(arguments->text) == 0)
  {
    gt_str_delete(arguments->text);
    arguments->text = gt_str_new_cstr(argv[parsed_args+1]);
  }

  if (!had_err)
  {
    /* set display range */
    had_err = gt_feature_index_get_range_for_seqid(features,
                                                   &sequence_region_range,
                                                   seqid, err);
  }

  /* load the optional sequence file before any output file is created */
  if (!had_err && gt_str_length(arguments->seqfile) > 0) {
    bioseq = gt_bioseq_new(gt_str_get(arguments->seqfile), err);
    if (!bioseq)
      had_err = -1;
  }

  if (!had_err)
  {
    qry_range.start = (arguments->range.start == GT_UNDEF_UWORD ?
                         sequence_region_range.start :
                         arguments->range.start);
    qry_range.end   = (arguments->range.end == GT_UNDEF_UWORD ?
                         sequence_region_range.end :
                         arguments->range.end);

    /* set output format */
    if (strcmp(gt_str_get(arguments->format), "pdf") == 0)
    {
      surf = cairo_pdf_surface_create(outfile,
                                      mm_to_pt(arguments->pwidth),
                                      mm_to_pt(arguments->pheight));
    }
    else if (strcmp(gt_str_get(arguments->format), "ps") == 0)
    {
      surf =  cairo_ps_surface_create(outfile,
                                      mm_to_pt(arguments->pwidth),
                                      mm_to_pt(arguments->pheight));
    }
    else
    {
      /* without a surface nothing can be drawn */
      gt_error_set(err, "unsupported output format '%s'",
                   gt_str_get(arguments->format));
      had_err = -1;
    }
  }

  if (!had_err)
  {
    gt_log_log("created page with %.2f:%.2f dimensions\n",
                                                  mm_to_pt(arguments->pwidth),
                                                  mm_to_pt(arguments->pheight));

    offsetpos = TEXT_SPACER + arguments->theight + TEXT_SPACER;
    usable_height = mm_to_pt(arguments->pheight)
                              - arguments->theight
                              - arguments->theight
                              - 4*TEXT_SPACER;

    cr = cairo_create(surf);
    cairo_set_font_size(cr, 8);
    twc = gt_text_width_calculator_cairo_new(cr, sty, err);
    for (start = qry_range.start; start <= qry_range.end;
         start += arguments->width)
    {
      GtRange single_range;
      GtCustomTrack *ct = NULL;
      const char *seq;
      /* forget the objects of the previous chunk, they are already deleted */
      d = NULL;
      l = NULL;
      canvas = NULL;
      single_range.start = start;
      single_range.end = start + arguments->width;

      if (had_err)
        break;

      d = gt_diagram_new(features, seqid, &single_range, sty, err);
      if (!d) {
        had_err = -1;
        break;
      }
      if (bioseq) {
        seq = gt_bioseq_get_sequence(bioseq, 0);
        ct = gt_custom_track_gc_content_new(seq,
                                      gt_bioseq_get_sequence_length(bioseq, 0),
                                      800, 70, 0.4, true);
        gt_diagram_add_custom_track(d, ct);
      }

      l = gt_layout_new_with_twc(d, mm_to_pt(arguments->width), sty, twc, err);
      if (!l)
        had_err = -1;
      if (!had_err)
        had_err = gt_layout_get_height(l, &height, err);
      if (!had_err) {
        /* start a new page if this chunk does not fit on the current one */
        if (gt_double_smaller_double(usable_height - 10 - 2*TEXT_SPACER
              - arguments->theight, offsetpos + height))
        {
          draw_header(cr, gt_str_get(arguments->text), argv[parsed_args+1],
                      seqid, num_pages, mm_to_pt(arguments->pwidth),
                      mm_to_pt(arguments->pheight),
                      arguments->theight);
          cairo_show_page(cr);
          offsetpos = TEXT_SPACER + arguments->theight + TEXT_SPACER;
          num_pages++;
        }
        canvas = gt_canvas_cairo_context_new(sty,
                                             cr,
                                             offsetpos,
                                             mm_to_pt(arguments->pwidth),
                                             height,
                                             NULL,
                                             err);
        if (!canvas)
          had_err = -1;
        offsetpos += height;
        if (!had_err)
          had_err = gt_layout_sketch(l, canvas, err);
      }
      gt_canvas_delete(canvas);
      gt_layout_delete(l);
      gt_diagram_delete(d);
      if (ct)
        gt_custom_track_delete(ct);
    }
    /* finish the last page */
    draw_header(cr, gt_str_get(arguments->text), argv[parsed_args+1], seqid,
                num_pages, mm_to_pt(arguments->pwidth),
                mm_to_pt(arguments->pheight),
                arguments->theight);
    cairo_show_page(cr);
    num_pages++;
    gt_log_log("finished, should be "GT_WU" pages\n", num_pages);
    gt_text_width_calculator_delete(twc);
    cairo_destroy(cr);
    cairo_surface_flush(surf);
    cairo_surface_finish(surf);
    cairo_surface_destroy(surf);
    cairo_debug_reset_static_data();
  }

  /* release everything, no matter how far we got */
  if (bioseq)
    gt_bioseq_delete(bioseq);
  gt_style_delete(sty);
  if (seqid_owned)
    gt_free(seqid);
  gt_str_delete(gt_style_file);
  gt_feature_index_delete(features);
  return had_err;
}
/* Tool runner for the condenser search. If <arguments->blastn> or
   <arguments->blastp> is set, a two-stage BLAST search is performed:
     1. a BLAST database is built from "<dbpath>.fas" and searched with the
        queries (coarse run, e-value <arguments->ceval>),
     2. the unique sequences hit in the coarse run are extracted into
        "coarse_<db basename without suffix>.fas",
     3. a BLAST database is built from that file and searched again (fine
        run); every fine hit is written as one tab separated line to
        <arguments->outfp>.
   If neither flag is set nothing is searched.
   Returns 0 on success and -1 if any step failed. */
static int gt_condenser_search_runner(GT_UNUSED int argc,
                                      GT_UNUSED const char **argv,
                                      GT_UNUSED int parsed_args,
                                      void *tool_arguments,
                                      GtError *err)
{
  GtCondenserSearchArguments *arguments = tool_arguments;
  int i, had_err = 0;
  char *querypath = NULL;
  GtStr* coarse_fname = NULL;
  char *db_basename = NULL;
  char *suffix_ptr = NULL;
  GtTimer *timer = NULL;
  GtLogger *logger = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  /* only touch <arguments> after it has been validated */
  querypath = gt_str_get(arguments->querypath);
  coarse_fname = gt_str_new_cstr("coarse_");

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);

  db_basename = gt_basename(gt_str_get(arguments->dbpath));
  /* if first char is '.' this might be a hidden file, so the search for the
     suffix separator starts at the second character */
  if (strlen(db_basename) > (size_t) 1 &&
      (suffix_ptr = strrchr(db_basename + 1, '.')) != NULL) {
    /* remove suffix */
    *suffix_ptr = '\0';
  }
  gt_str_append_cstr(coarse_fname, db_basename);
  gt_str_append_cstr(coarse_fname, ".fas");
  gt_free(db_basename);
  db_basename = NULL;
  suffix_ptr = NULL;

  if (arguments->blastn || arguments->blastp) {
    GtMatch              *match;
    GtMatchIterator      *mp = NULL;
    GtNREncseq           *nrencseq = NULL;
    GtStr                *fastaname = gt_str_clone(arguments->dbpath);
    HitPosition          *hits;
    double                eval,
                          raw_eval = 0.0;
    GtUword               coarse_db_len = 0;
    GtMatchIteratorStatus status;
    int                   curr_hits = 0,
                          max_hits = 100;

    hits = gt_malloc(sizeof (*hits) * (size_t) max_hits);

    gt_str_append_cstr(fastaname, ".fas");

    /* every slot owns its own range object; all <max_hits> slots are
       released in the cleanup section below */
    for (i=0; i < max_hits; i++) {
      hits[i].range = gt_malloc(sizeof (*hits[i].range) * (size_t) 1);
    }

    if (gt_showtime_enabled()) {
      timer = gt_timer_new_with_progress_description("initialization");
      gt_timer_start(timer);
    }

    /*extract sequences from compressed database*/
    if (!had_err) {
      nrencseq = gt_n_r_encseq_new_from_file(gt_str_get(arguments->dbpath),
                                             logger, err);
      if (nrencseq == NULL)
        had_err = -1;
    }
    if (!had_err) {
      if (arguments->ceval == GT_UNDEF_DOUBLE ||
          arguments->feval == GT_UNDEF_DOUBLE) {
        /* from NCBI BLAST tutorial:
           E = Kmne^{-lambdaS}
           calculates E-value for score S with natural scale parameters K for
           search space size and lambda for the scoring system
           E = mn2^-S'
           m being the subject (total) length, n the length of ONE query
           calculates E-value for bit-score S'
         */
        GtFastaReader *reader;
        GtCondenserSearchAvg avg = {0,0};
        reader = gt_fasta_reader_rec_new(arguments->querypath);
        had_err = gt_fasta_reader_run(reader, NULL, NULL,
                                      gt_condenser_search_cum_moving_avg,
                                      &avg,
                                      err);
        if (!had_err) {
          GtUword S = arguments->bitscore;
          gt_log_log(GT_WU " queries, avg query size: " GT_WU,
                     avg.count, avg.avg);
          raw_eval = 1/pow(2.0, (double) S) * avg.avg;
          gt_logger_log(logger, "Raw E-value set to %.4e", raw_eval);
          gt_assert(avg.avg != 0);
        }
        gt_fasta_reader_delete(reader);
      }
    }

    /*create BLAST database from compressed database fasta file*/
    if (!had_err) {
      if (timer != NULL)
        gt_timer_show_progress(timer, "create coarse BLAST db", stderr);
      if (arguments->blastn)
        had_err = gt_condenser_search_create_nucl_blastdb(gt_str_get(fastaname),
                                                          err);
      else
        had_err = gt_condenser_search_create_prot_blastdb(gt_str_get(fastaname),
                                                          err);
    }

    /* coarse BLAST run: collect index and range of every hit */
    if (!had_err) {
      GtBlastProcessCall *call;

      if (timer != NULL)
        gt_timer_show_progress(timer, "coarse BLAST run", stderr);

      if (arguments->blastp)
        call = gt_blast_process_call_new_prot();
      else
        call = gt_blast_process_call_new_nucl();
      gt_blast_process_call_set_db(call, gt_str_get(fastaname));
      gt_blast_process_call_set_query(call, querypath);
      gt_blast_process_call_set_evalue(call, arguments->ceval);
      gt_blast_process_call_set_num_threads(call, arguments->blthreads);

      mp = gt_match_iterator_blast_process_new(call, err);
      if (!mp)
        had_err = -1;

      gt_blast_process_call_delete(call);

      while (!had_err &&
             (status = gt_match_iterator_next(mp, &match, err)) !=
             GT_MATCHER_STATUS_END)
      {
        if (status == GT_MATCHER_STATUS_OK) {
          GtUword hit_seq_id;
          char string[7];
          const char *dbseqid = gt_match_get_seqid2(match);
          /* the header has to consist of a prefix of at most 6 characters
             followed by the numeric index of the unique sequence */
          if (sscanf(dbseqid,"%6s" GT_WU, string, &hit_seq_id) == 2) {
            gt_match_get_range_seq2(match, hits[curr_hits].range);
            hits[curr_hits].idx = hit_seq_id;
            curr_hits++;
            if (curr_hits == max_hits) {
              /* table is full: grow it by another 100 slots */
              max_hits += 100;
              hits = gt_realloc(hits, sizeof (*hits) * (size_t) max_hits);
              for (i=max_hits - 100; i < max_hits; i++) {
                hits[i].range = gt_malloc(sizeof (*hits[i].range));
              }
            }
          } else {
            gt_error_set(err, "could not parse unique db header %s", dbseqid);
            had_err = -1;
          }
          /* <dbseqid> was obtained from <match>, hence release the match
             only now; this also covers the parse error path */
          gt_match_delete(match);
        } else if (status == GT_MATCHER_STATUS_ERROR) {
          had_err = -1;
        }
      }
      gt_match_iterator_delete(mp);
    }
    /*extract sequences*/
    if (!had_err) {
      GtNREncseqDecompressor *decomp;
      GtFile *coarse_hits;
      if (timer != NULL)
        gt_timer_show_progress(timer, "extract coarse search hits", stderr);
      decomp = gt_n_r_encseq_decompressor_new(nrencseq);
      coarse_hits = gt_file_new(gt_str_get(coarse_fname),"w", err);
      /* do not continue with an unopened file and an already set <err> */
      if (coarse_hits == NULL)
        had_err = -1;
      if (!had_err) {
        /* TODO DW do NOT extract complete uniques! these could be complete
           chromosomes!! just extract something around it? maybe +- max query
           length*/
        for (i = 0; i < curr_hits; i++) {
          gt_n_r_encseq_decompressor_add_unique_idx_to_extract(decomp,
                                                               hits[i].idx);
        }
        had_err =
          gt_n_r_encseq_decompressor_start_unique_extraction(coarse_hits,
                                                             decomp,
                                                             &coarse_db_len,
                                                             err);
        /* the length is only meaningful if the extraction succeeded */
        gt_assert(had_err || coarse_db_len != 0);
      }
      gt_file_delete(coarse_hits);
      gt_n_r_encseq_decompressor_delete(decomp);
    }
    gt_n_r_encseq_delete(nrencseq);

    /* create BLAST database from decompressed database file */
    if (!had_err) {
      if (timer != NULL)
        gt_timer_show_progress(timer, "create fine BLAST db", stderr);
      if (arguments->blastn)
        had_err =
          gt_condenser_search_create_nucl_blastdb(gt_str_get(coarse_fname),
                                                  err);
      else
        had_err =
          gt_condenser_search_create_prot_blastdb(gt_str_get(coarse_fname),
                                                  err);
    }
    /* perform fine BLAST search */
    if (!had_err) {
      GtBlastProcessCall *call;

      if (timer != NULL)
        gt_timer_show_progress(timer, "fine BLAST run", stderr);

      /* without a user defined fine e-value, scale the raw e-value by the
         length of the extracted coarse hits */
      if (arguments->feval == GT_UNDEF_DOUBLE) {
        eval = raw_eval * coarse_db_len;
      } else {
        eval = arguments->feval;
      }

      if (arguments->blastp)
        call = gt_blast_process_call_new_prot();
      else
        call = gt_blast_process_call_new_nucl();

      gt_blast_process_call_set_db(call, gt_str_get(coarse_fname));
      gt_blast_process_call_set_query(call, querypath);
      gt_blast_process_call_set_evalue(call, eval);
      gt_blast_process_call_set_num_threads(call, arguments->blthreads);

      gt_logger_log(logger, "Fine E-value set to: %.4e (len)" GT_WU, eval,
                    coarse_db_len);

      mp = gt_match_iterator_blast_process_new(call, err);
      if (!mp)
        had_err = -1;

      gt_blast_process_call_delete(call);

      if (!had_err) {
        GtUword numofhits = 0;
        while (!had_err &&
               (status = gt_match_iterator_next(mp, &match, err)) !=
               GT_MATCHER_STATUS_END) {
          if (status == GT_MATCHER_STATUS_OK) {
            GtMatchBlast *matchb = (GtMatchBlast*) match;
            GtRange range_seq1;
            GtRange range_seq2;
            numofhits++;
            gt_match_get_range_seq1(match, &range_seq1);
            gt_match_get_range_seq2(match, &range_seq2);
            gt_file_xprintf(
                    arguments->outfp,
                    "%s\t%s\t%.2f\t" GT_WU "\t" GT_WU "\t" GT_WU "\t" GT_WU "\t"
                    GT_WU "\t%g\t%.3f\n",
                    gt_match_get_seqid1(match),
                    gt_match_get_seqid2(match),
                    gt_match_blast_get_similarity(matchb),
                    gt_match_blast_get_align_length(matchb),
                    range_seq1.start,
                    range_seq1.end,
                    range_seq2.start,
                    range_seq2.end,
                    gt_match_blast_get_evalue(matchb),
                    (double) gt_match_blast_get_bitscore(matchb));
            gt_match_delete(match);
          } else if (status == GT_MATCHER_STATUS_ERROR) {
            had_err = -1;
          }
        }
        gt_log_log(GT_WU " hits found\n", numofhits);
      }
      gt_match_iterator_delete(mp);

    }
    if (!had_err)
      if (timer != NULL)
        gt_timer_show_progress_final(timer, stderr);
    gt_timer_delete(timer);

    /*cleanup*/
    for (i=0; i < max_hits; i++) {
      gt_free(hits[i].range);
    }
    gt_free(hits);
    gt_str_delete(fastaname);
  }
  gt_str_delete(coarse_fname);
  gt_logger_delete(logger);
  return had_err;
}
Exemplo n.º 10
0
/* Computes the path matrix by dynamic programming over the combined score
   matrix (one column per position of the query DNA, one row per reading
   frame) and hands both matrices to mg_compute_gene_prediction().
   Returns the return value of mg_compute_gene_prediction(). */
int mg_computepath(CombinedScoreMatrixEntry **combinedscore_matrix,
                   HitInformation *hit_information,
                   unsigned long rows,
                   unsigned long contig_len,
                   ParseStruct *parsestruct_ptr, GtError * err)
{
  int had_err = 0;

  /* matrix used for the path computation */
  PathMatrixEntry **path_matrix;

  /* row: current matrix row; k: index into the precursor array;
     prev_row: matrix row of the precursor frame under consideration;
     best_prev_row: precursor row which delivered the current maximum */
  unsigned short row = 0,
    k = 0,
    prev_row = 0,
    best_prev_row = 0;

  /* position in the query DNA */
  unsigned long col = 0;

  /* reading frame of the current row, reading frame of the precursor under
     consideration, and the list of possible precursor frames */
  short frame = 0,
    prev_frame = 0,
    precursors[NUM_PRECURSORS];

  /* switch_cost is the value added when a gene is left or entered on the
     forward or reverse strand; candidate/best track the maximum */
  double switch_cost = ARGUMENTSSTRUCT(leavegene_value),
    candidate = 1,
    best = 1;

  /* allocate the path matrix.
     NOTE(review): 7 rows are allocated while the loops below run up to
     <rows>; presumably callers always pass rows <= 7 - confirm */
  gt_array2dim_calloc(path_matrix, 7, contig_len);

  gt_error_check(err);

  /* the first column of the path matrix is taken over from the first column
     of the combined score matrix */
  for (row = 0; row < rows; row++)
  {
    path_matrix[row][0].path_frame = row;
    path_matrix[row][0].score = combinedscore_matrix[row][0].matrix_score;
  }

  /* compute the optimal path column by column */
  for (col = 1; col < contig_len; col++)
  {
    for (row = 0; row < rows; row++)
    {
      /* convert the row number into the corresponding reading frame */
      frame = get_current_frame(row);
      /* determine the possible precursor frames for the current frame at
         this position of the query DNA */
      compute_precursors(frame, col, precursors);

      /* determine the maximum over all defined precursors (at most
         NUM_PRECURSORS) */
      k = 0;
      while (k < NUM_PRECURSORS && precursors[k] != UNDEFINED)
      {
        prev_frame = precursors[k];
        /* convert the precursor frame into its matrix row */
        prev_row = get_matrix_row(prev_frame);

        /* common part of all three cases of the DP recurrence */
        candidate = path_matrix[prev_row][col-1].score +
                    combinedscore_matrix[row][col].matrix_score;

        if ((frame < 0 && prev_frame > 0) || (frame > 0 && prev_frame < 0))
        {
          /* case 1: switch between reverse and forward strand */
          candidate += 2*switch_cost;
        }
        else if (frame != 0 && prev_frame != frame)
        {
          /* case 2: plain change of the reading frame, i.e. from + to + or
             from - to - */
          candidate += switch_cost;
        }
        /* case 3 (nothing added): the reading frame is kept, or a change
           between coding and non-coding */

        /* keep the maximum and remember the row it originates from */
        if (gt_double_compare(candidate, best) > 0)
        {
          best_prev_row = prev_row;
          best = candidate;
        }
        ++k;
      }

      /* store the maximum and the row of its precursor */
      path_matrix[row][col].path_frame = best_prev_row;
      path_matrix[row][col].score      = best;

      /* reset for the next cell.
         NOTE(review): DBL_MIN is the smallest positive normalized double,
         not the most negative value, and the very first cell starts from 1
         instead; confirm that this lower bound is intended */
      best_prev_row = 0;
      best = DBL_MIN;
      candidate = DBL_MIN;
    }
  }

  /* run the gene prediction on the computed path matrix */
  had_err = mg_compute_gene_prediction(combinedscore_matrix,
                                       path_matrix,
                                       contig_len,
                                       hit_information,
                                       parsestruct_ptr, err);

  gt_array2dim_delete(path_matrix);

  return had_err;
}
Exemplo n.º 11
0
/* Tool runner: iterates over the matches delivered by a seed extend match
   iterator created from <arguments->matchfilename> and shows each of them.
   Matches with a seed line are either re-extended (if
   <arguments->seed_extend> is set) or shown based on the encoded sequences;
   matches without a seed line are shown via gt_show_seed_extend_plain().
   Returns 0 on success, -1 if the iterator could not be created, or the
   error code of a failed extension. */
static int gt_show_seedext_runner(GT_UNUSED int argc,
                                  GT_UNUSED const char **argv,
                                  GT_UNUSED int parsed_args,
                                  void *tool_arguments,
                                  GtError *err)
{
  GtShowSeedextArguments *arguments = tool_arguments;
  GtSeedextendMatchIterator *semi;
  GtUword alignmentwidth;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments != NULL);
  /* alignments, if requested, are shown with a width of 70 columns */
  if (arguments->show_alignment)
  {
    alignmentwidth = 70;
  } else
  {
    alignmentwidth = 0;
  }
  /* Parse option string in first line of file specified by filename. */
  semi = gt_seedextend_match_iterator_new(arguments->matchfilename,err);
  if (semi == NULL)
  {
    had_err = -1;
  } else
  {
    /* Parse seed extensions. */
    const GtEncseq *aencseq, *bencseq;
    GtAlignment *alignment;
    Polishing_info *pol_info = NULL;
    GtSequencepairbuffer seqpairbuf = {NULL,NULL,0,0};
    /* the following two are used if seed_extend is set */
    GtGreedyextendmatchinfo *greedyextendmatchinfo = NULL;
    GtProcessinfo_and_querymatchspaceptr info_and_querymatch;
    const GtUchar *characters;
    GtUchar wildcardshow;
    GtUchar *alignment_show_buffer = NULL;
    GtLinspaceManagement *linspace_spacemanager;
    GtScoreHandler *linspace_scorehandler;

    aencseq = gt_seedextend_match_iterator_aencseq(semi);
    bencseq = gt_seedextend_match_iterator_bencseq(semi);
    alignment = gt_alignment_new();
    characters = gt_encseq_alphabetcharacters(aencseq);
    wildcardshow = gt_encseq_alphabetwildcardshow(aencseq);
    if (arguments->show_alignment)
    {
      alignment_show_buffer = gt_alignment_buffer_new(alignmentwidth);
    }
    linspace_spacemanager = gt_linspaceManagement_new();
    linspace_scorehandler = gt_scorehandler_new(0,1,0,1);

    /* polishing is only set up if it has not been relaxed by the user */
    if (!arguments->relax_polish)
    {
      double matchscore_bias;

      if (gt_seedextend_match_iterator_bias_parameters(semi))
      {
        matchscore_bias = gt_greedy_dna_sequence_bias_get(aencseq);
      } else
      {
        matchscore_bias = GT_DEFAULT_MATCHSCORE_BIAS;
      }
      pol_info = polishing_info_new_with_bias(
                          gt_seedextend_match_iterator_errorpercentage(semi),
                          matchscore_bias,
                          gt_seedextend_match_iterator_history_size(semi));
    }
    if (arguments->seed_display)
    {
      gt_seedextend_match_iterator_seed_display_set(semi);
    }
    if (arguments->show_alignment || arguments->showeoplist)
    {
      gt_seedextend_match_iterator_querymatchoutoptions_set(semi,
                                                       true,
                                                       arguments->showeoplist,
                                                       alignmentwidth,
                                                       !arguments->relax_polish,
                                                       arguments->seed_display);
    }
    if (arguments->seed_extend)
    {
      greedyextendmatchinfo
        = gt_greedy_extend_matchinfo_new(70,
                              GT_MAX_ALI_LEN_DIFF,
                              gt_seedextend_match_iterator_history_size(semi),
                              GT_MIN_PERC_MAT_HISTORY,
                              0, /* userdefinedleastlength */
                              GT_EXTEND_CHAR_ACCESS_ANY,
                              100,
                              pol_info);
    }
    if (pol_info != NULL)
    {
      gt_alignment_polished_ends(alignment,pol_info,false);
    }
    info_and_querymatch.processinfo = greedyextendmatchinfo;
    if (arguments->sortmatches)
    {
      (void) gt_seedextend_match_iterator_all_sorted(semi,true);
    }

    /* process the matches until the iterator is exhausted or an extension
       fails */
    for (;;)
    {
      GtQuerymatch *querymatchptr = gt_seedextend_match_iterator_next(semi);

      if (querymatchptr == NULL)
      {
        break;
      }
      if (!gt_seedextend_match_iterator_has_seedline(semi))
      {
        /* no seed available: show the match as it is */
        gt_show_seed_extend_plain(&seqpairbuf,
                                  linspace_spacemanager,
                                  linspace_scorehandler,
                                  alignment,
                                  alignment_show_buffer,
                                  alignmentwidth,
                                  arguments->showeoplist,
                                  characters,
                                  wildcardshow,
                                  aencseq,
                                  bencseq,
                                  querymatchptr);
        continue;
      }
      if (!arguments->seed_extend)
      {
        const GtUword query_totallength
          = gt_encseq_seqlength(bencseq,
                                gt_querymatch_queryseqnum(querymatchptr));

        gt_show_seed_extend_encseq(querymatchptr,
                                   aencseq,
                                   bencseq,
                                   query_totallength);
        continue;
      }
      if (aencseq != bencseq)
      {
        /* re-extension is only implemented for self matches */
        gt_assert(false);
        continue;
      }
      {
        const GtUword
          seedlen = gt_seedextend_match_iterator_seedlen(semi),
          seedpos1 = gt_seedextend_match_iterator_seedpos1(semi),
          seedpos2 = gt_seedextend_match_iterator_seedpos2(semi);

        info_and_querymatch.querymatchspaceptr = querymatchptr;
        had_err = gt_greedy_extend_selfmatch_with_output(
                              &info_and_querymatch,
                              aencseq,
                              seedlen,
                              seedpos1,
                              seedpos2,
                              err);
      }
      if (had_err)
      {
        break;
      }
    }

    /* release everything allocated in this block */
    polishing_info_delete(pol_info);
    gt_greedy_extend_matchinfo_delete(greedyextendmatchinfo);
    gt_free(alignment_show_buffer);
    gt_scorehandler_delete(linspace_scorehandler);
    gt_linspaceManagement_delete(linspace_spacemanager);
    gt_free(seqpairbuf.a_sequence);
    gt_free(seqpairbuf.b_sequence);
    gt_alignment_delete(alignment);
  }
  gt_seedextend_match_iterator_delete(semi);
  return had_err;
}
Exemplo n.º 12
0
/* Tool runner: builds the stream pipeline
     GFF3 input -> select (filter) [-> targetbest select] -> GFF3 output
   from the remaining command line arguments and the given options and pulls
   all nodes through it. If <arguments->dropped_file> is non-empty, dropped
   features are handed to a GFF3 visitor writing to that file, otherwise the
   default drop handler is installed.
   Returns 0 on success and a non-zero value on failure, in particular -1 if
   the select stream or the file for dropped features could not be created. */
static int gt_select_runner(int argc, const char **argv, int parsed_args,
                            void *tool_arguments, GtError *err)
{
  SelectArguments *arguments = tool_arguments;
  GtNodeStream *gff3_in_stream, *select_stream,
               *targetbest_select_stream = NULL, *gff3_out_stream;
  int had_err = 0;
  GtFile *drop_file = NULL;
  GtNodeVisitor *gff3outvis = NULL;
  gt_error_check(err);
  gt_assert(arguments);

  /* create a gff3 input stream */
  gff3_in_stream = gt_gff3_in_stream_new_unsorted(argc - parsed_args,
                                                  argv + parsed_args);
  if (arguments->verbose && arguments->outfp)
    gt_gff3_in_stream_show_progress_bar((GtGFF3InStream*) gff3_in_stream);

  /* create a filter stream */
  select_stream = gt_select_stream_new(gff3_in_stream, arguments->seqid,
                                       arguments->source,
                                       &arguments->contain_range,
                                       &arguments->overlap_range,
                                       arguments->strand,
                                       arguments->targetstrand,
                                       arguments->has_CDS,
                                       arguments->max_gene_length,
                                       arguments->max_gene_num,
                                       arguments->min_gene_score,
                                       arguments->max_gene_score,
                                       arguments->min_average_splice_site_prob,
                                       arguments->feature_num,
                                       arguments->filter_files,
                                       arguments->filter_logic,
                                       err);

  if (select_stream == NULL) {
    had_err = -1;
  } else {
    GtSelectStream *fs = (GtSelectStream*) select_stream;

    if (gt_str_length(arguments->dropped_file) > 0) {
      drop_file = gt_file_new(gt_str_get(arguments->dropped_file), "w", err);
      if (drop_file == NULL) {
        /* the file could not be opened: stop here instead of running the
           pipeline with an already set <err> and a visitor without a file */
        had_err = -1;
      } else {
        gff3outvis = gt_gff3_visitor_new(drop_file);
        gt_select_stream_set_drophandler(fs, print_to_file_drophandler,
                                         (void*) gff3outvis);
      }
    } else {
      gt_select_stream_set_drophandler(fs, default_drophandler, NULL);
    }

    if (!had_err) {
      gt_select_stream_set_single_intron_factor(select_stream,
                                              arguments->single_intron_factor);

      if (arguments->targetbest) {
        targetbest_select_stream =
          gt_targetbest_select_stream_new(select_stream);
      }

      /* create a gff3 output stream */
      gff3_out_stream = gt_gff3_out_stream_new(arguments->targetbest
                                               ? targetbest_select_stream
                                               : select_stream,
                                               arguments->outfp);

      /* pull the features through the stream and free them afterwards */
      had_err = gt_node_stream_pull(gff3_out_stream, err);

      gt_node_stream_delete(gff3_out_stream);
    }

    /* free */
    gt_node_stream_delete(select_stream);
    gt_node_stream_delete(targetbest_select_stream);
  }
  gt_file_delete(drop_file);
  gt_node_visitor_delete(gff3outvis);
  gt_node_stream_delete(gff3_in_stream);
  return had_err;
}
Exemplo n.º 13
0
/* Prepares the SQL statement <query> on the MySQL connection of <rdb> and
   wraps it into a new GtRDBStmt. <num_params> has to equal the number of
   parameter markers MySQL reports for the prepared statement.
   Returns the new statement, or NULL if the statement handle could not be
   created, the statement could not be prepared, or the parameter counts
   differ; in these cases <err> is set and the MySQL statement handle is
   closed again. */
static GtRDBStmt* gt_rdb_mysql_prepare(GtRDB *rdb, const char *query,
                                       unsigned long num_params, GtError *err)
{
  GtRDBStmt *st = NULL;
  GtRDBStmtMySQL *stm = NULL;
  GtRDBMySQL *rdbm;
  int had_err = 0, retval = 0;
  /* we need these to keep track of result/parameter and string buffers */
  HashElemInfo str_buffer_hash = {
      gt_ht_ptr_elem_hash,
      { free_str },
      sizeof (GtStr*),
      gt_ht_ptr_elem_cmp,
      NULL,
      NULL
    },
    buffer_hash = {
      gt_ht_ptr_elem_hash,
      { free_buf },
      sizeof (void*),
      gt_ht_ptr_elem_cmp,
      NULL,
      NULL
    };
  MYSQL_STMT *tmp = NULL;
  gt_assert(rdb && query);
  gt_error_check(err);

  rdbm = gt_rdb_mysql_cast(rdb);
  tmp = mysql_stmt_init(&rdbm->conn);
  if (!tmp) {
    /* no statement handle available, report the error of the connection */
    gt_error_set(err, GT_MYSQL_ERRMSG, mysql_errno(&rdbm->conn),
                 mysql_error(&rdbm->conn));
    had_err = -1;
  }
  if (!had_err && (retval = mysql_stmt_prepare(tmp, query, strlen(query)))) {
    gt_error_set(err, GT_MYSQL_ERRMSG, retval, mysql_stmt_error(tmp));
    had_err = -1;
  }
  if (!had_err) {
    unsigned long param_count = mysql_stmt_param_count(tmp);
    if (param_count != num_params) {
      gt_error_set(err, "invalid parameter count: %lu expected, %lu given",
                   num_params, param_count);
      had_err = -1;
    }
  }
  if (had_err && tmp) {
    /* the error message has been copied above, so the handle can be
       released on every failure path */
    mysql_stmt_close(tmp);
    tmp = NULL;
  }
  if (!had_err) {
    st = gt_rdb_stmt_create(gt_rdb_stmt_mysql_class());
    stm = gt_rdb_stmt_mysql_cast(st);
    stm->num_params = num_params;
    stm->query = gt_str_new_cstr(query);
    stm->buffers = gt_hashtable_new(buffer_hash);
    stm->returned_strings = gt_hashtable_new(str_buffer_hash);
    stm->stmt = tmp;
    stm->update_maxlengths = true;
    stm->params = gt_calloc(num_params, sizeof (MYSQL_BIND));
    /* request that max_length of the result fields is updated when the
       result is stored */
    mysql_stmt_attr_set(tmp, STMT_ATTR_UPDATE_MAX_LENGTH,
                        &stm->update_maxlengths);
    memset(stm->params, 0, num_params*sizeof (MYSQL_BIND));
    stm->conn = &rdbm->conn;
  }
  return st;
}
Exemplo n.º 14
0
/* Executes the prepared statement <st> on the first call (binding the
   parameters, storing the result and binding one result buffer per returned
   field) and fetches the next row of the result.
   Returns 0 if a row has been fetched, 1 if the statement returned no result
   set or no further row is available, and -1 on error (<err> is set). */
static int gt_rdb_stmt_mysql_exec(GtRDBStmt *st, GtError *err)
{
  GtRDBStmtMySQL *stm;
  int rval, had_err = 0, num_fields;
  MYSQL_RES *meta_res = NULL;
  gt_assert(st);
  gt_error_check(err);
  stm = gt_rdb_stmt_mysql_cast(st);
  if (!stm->executed) {
    if (stm->num_params > 0) {
      gt_assert(stm->stmt && stm->params);
      if ((rval = mysql_stmt_bind_param(stm->stmt, stm->params))) {
        gt_error_set(err, GT_MYSQL_ERRMSG, rval, mysql_stmt_error(stm->stmt));
        had_err = -1;
      }
    }
    if (!had_err && (rval = mysql_stmt_execute(stm->stmt))) {
      gt_error_set(err, GT_MYSQL_ERRMSG, rval, mysql_stmt_error(stm->stmt));
      had_err = -1;
    }
    if (!had_err) {
      stm->executed = true;
      if (mysql_stmt_store_result(stm->stmt)) {
        gt_error_set(err, GT_MYSQL_ERRMSG, mysql_stmt_errno(stm->stmt),
                     mysql_stmt_error(stm->stmt));
        had_err = -1;
      }
    }
    /* only look at the result if storing it succeeded, so that a failure is
       reported as -1 and not mistaken for "no result" */
    if (!had_err) {
      int i = 0;
      meta_res = mysql_stmt_result_metadata(stm->stmt);
      if (!meta_res) {
        /* statement returned no result set, nothing to fetch */
        return 1;
      }
      /* statement returned a result */
      num_fields = mysql_num_fields(meta_res);
      stm->results = gt_calloc(num_fields, sizeof (MYSQL_BIND));
      /* prepare result buffers for each field; every buffer is registered in
         stm->buffers */
      for (i=0;i<num_fields;i++) {
        MYSQL_FIELD *field;
        field = mysql_fetch_field(meta_res);
        stm->results[i].buffer_type = field->type;
        switch (field->type) {
          case MYSQL_TYPE_DOUBLE:
            {double *dbl = gt_calloc(1, sizeof (double));
            gt_hashtable_add(stm->buffers, &dbl);
            stm->results[i].buffer_length = sizeof (double);
            stm->results[i].buffer = dbl;}
            break;
          case MYSQL_TYPE_LONG:
          case MYSQL_TYPE_INT24:
            {int *l = gt_calloc(1, sizeof (int));
            gt_hashtable_add(stm->buffers, &l);
            stm->results[i].is_unsigned = false;
            stm->results[i].buffer_length = sizeof (int);
            stm->results[i].buffer = l;}
            /* must not fall through: the buffer has to match the width of
               the bound type */
            break;
          case MYSQL_TYPE_SHORT:
            {short int *l = gt_calloc(1, sizeof (short int));
            gt_hashtable_add(stm->buffers, &l);
            stm->results[i].is_unsigned = false;
            stm->results[i].buffer_length = sizeof (short int);
            stm->results[i].buffer = l;}
            break;
          case MYSQL_TYPE_TINY:
            {signed char *l = gt_calloc(1, sizeof (signed char));
            gt_hashtable_add(stm->buffers, &l);
            stm->results[i].is_unsigned = false;
            stm->results[i].buffer_length = sizeof (signed char);
            stm->results[i].buffer = l;}
            break;
          case MYSQL_TYPE_STRING:
          case MYSQL_TYPE_VAR_STRING:
          case MYSQL_TYPE_BLOB:
          case MYSQL_TYPE_TINY_BLOB:
          case MYSQL_TYPE_MEDIUM_BLOB:
          case MYSQL_TYPE_LONG_BLOB:
          case MYSQL_TYPE_BIT:
            /* one byte more than max_length is allocated, but only
               max_length bytes are announced to MySQL */
            {char *str = gt_calloc(field->max_length+1, sizeof (char));
            gt_hashtable_add(stm->buffers, &str);
            unsigned long *length = gt_calloc(1, sizeof (unsigned long));
            gt_hashtable_add(stm->buffers, &length);
            stm->results[i].buffer = str;
            stm->results[i].buffer_length = field->max_length;
            stm->results[i].length = length;}
            break;
          default:
            /* unsupported data type */
            break;
        }
      }
      if (mysql_stmt_bind_result(stm->stmt, stm->results)) {
        gt_error_set(err, GT_MYSQL_ERRMSG, mysql_stmt_errno(stm->stmt),
                     mysql_stmt_error(stm->stmt));
        had_err = -1;
      }
      mysql_free_result(meta_res);
    }
  }
  if (!had_err) {
    switch ((rval = mysql_stmt_fetch(stm->stmt))) {
      case 0:
      default:
        break;
      case MYSQL_NO_DATA:
        had_err = 1;  /* last row read */
        break;
      case 1:
        gt_error_set(err, GT_MYSQL_ERRMSG, mysql_stmt_errno(stm->stmt),
                     mysql_stmt_error(stm->stmt));
        had_err = -1;
        break;
    }
  }
  return had_err;
}
Exemplo n.º 15
0
/* Tool runner of seqorder: loads the encoded sequence named by
   argv[parsed_args] and emits its sequences via gt_seqorder_output() in the
   order selected in <tool_arguments> (inverted, shuffled, sorted or reverse
   sorted). Returns 0 on success and -1 if the encoded sequence could not be
   loaded (<err> is filled by the loader in that case). */
static int gt_seqorder_runner(GT_UNUSED int argc, const char **argv,
                              int parsed_args, void *tool_arguments, GtError *err)
{
    GtSeqorderArguments *arguments = tool_arguments;
    GtEncseqLoader *loader;
    GtEncseq *encseq;
    int had_err = 0;

    gt_error_check(err);
    gt_assert(arguments != NULL);

    /* load encseq */
    loader = gt_encseq_loader_new();
    encseq = gt_encseq_loader_load(loader, argv[parsed_args], err);

    if (encseq == NULL) {
        had_err = -1;
    } else {
        GtUword rank, nofseqs;

        if (!gt_encseq_has_description_support(encseq))
            gt_warning("%s has no description support", argv[parsed_args]);

        nofseqs = gt_encseq_num_of_sequences(encseq);

        if (arguments->invert) {
            /* last sequence first */
            for (rank = 0; rank < nofseqs; rank++)
                gt_seqorder_output(nofseqs - 1 - rank, encseq);
        } else if (arguments->shuffle) {
            GtUword *shuffled = gt_malloc(sizeof (GtUword) * nofseqs);

            gt_seqorder_get_shuffled_seqnums(nofseqs, shuffled);
            for (rank = 0; rank < nofseqs; rank++)
                gt_seqorder_output(shuffled[rank], encseq);
            gt_free(shuffled);
        } else {
            GtSuffixsortspace *sortspace;

            gt_assert(arguments->sort || arguments->revsort);
            /* Use iterator over sequence separators:
               saves a lot of binary searches */
            sortspace = gt_suffixsortspace_new(nofseqs,
                                               gt_encseq_seqstartpos(encseq,
                                                                     nofseqs-1),
                                               false, NULL);
            gt_seqorder_sort(sortspace, encseq);
            for (rank = 0; rank < nofseqs; rank++) {
                /* ascending order for -sort, descending order otherwise */
                GtUword slot = arguments->sort ? rank : nofseqs - 1 - rank;

                gt_seqorder_output(
                    gt_encseq_seqnum(encseq,
                                     gt_suffixsortspace_getdirect(sortspace,
                                                                  slot)),
                    encseq);
            }
            gt_suffixsortspace_delete(sortspace, false);
        }
    }

    gt_encseq_loader_delete(loader);
    gt_encseq_delete(encseq);
    return had_err;
}
Exemplo n.º 16
0
/* Run the runtime <gtr> with the given arguments.
   After applying the debug/quiet settings and seeding the random number
   generator, one of the special modes (tool listing, man page creation,
   64-bit check, unit tests) is executed if requested and its result is
   returned directly. Otherwise argv[0] is run as a tool if the toolbox knows
   it, or as a Lua script if a file of that name exists. If interactive mode
   is requested and no error occurred, the interactive Lua interpreter is
   started afterwards.
   Returns EXIT_SUCCESS or EXIT_FAILURE (except for the special modes). */
int gtr_run(GtR *gtr, int argc, const char **argv, GtError *err)
{
  char **nargv = NULL;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(gtr);

  if (gtr->debug)
    enable_logging(gt_str_get(gtr->debugfp), &gtr->logfp);
  if (gtr->quiet)
    gt_warning_disable();
  gtr->seed = gt_ya_rand_init(gtr->seed);
  gt_log_log("seed=%u", gtr->seed);

  /* special modes: their result is handed back to the caller as is */
  if (gtr->list)
    return list_tools(gtr);
  if (gt_str_length(gtr->manoutdir) > 0)
    return create_manpages(gtr, gt_str_get(gtr->manoutdir), err);
  if (gtr->check64bit)
    return check64bit();
  if (gtr->test)
    return run_tests(gtr, err);

  if (gt_str_length(gtr->testspacepeak)) {
    void *block, *mapping;
    block = gt_malloc(1 << 26); /* alloc 64 MB */
    mapping = gt_fa_xmmap_read(gt_str_get(gtr->testspacepeak), NULL);
    gt_fa_xmunmap(mapping);
    gt_free(block);
  }

  if (argc == 0) {
    if (!gtr->interactive) {
      gt_error_set(err, "neither tool nor script specified; option -help lists "
                        "possible tools");
      had_err = -1;
    }
  }
  else if (gtr->tools && gt_toolbox_has_tool(gtr->tools, argv[0])) {
    /* run tool */
    GtTool *tool = NULL;
    GtToolfunc toolfunc = gt_toolbox_get(gtr->tools, argv[0]);
    if (!toolfunc) {
      tool = gt_toolbox_get_tool(gtr->tools, argv[0]);
      gt_assert(tool);
    }
    nargv = gt_cstr_array_prefix_first(argv, gt_error_get_progname(err));
    gt_error_set_progname(err, nargv[0]);
    if (toolfunc)
      had_err = toolfunc(argc, (const char**) nargv, err);
    else
      had_err = gt_tool_run(tool, argc, (const char**) nargv, err);
  }
  else if (gt_file_exists(argv[0])) {
    /* no tool found, but a file of that name exists -> treat it as script */
    gt_lua_set_script_dir(gtr->L, argv[0]);
    nargv = gt_cstr_array_prefix_first(argv, gt_error_get_progname(err));
    gt_lua_set_arg(gtr->L, nargv[0], (const char**) nargv+1);
    if (luaL_dofile(gtr->L, argv[0])) {
      /* execution failed, the error message is on top of the Lua stack */
      gt_assert(lua_isstring(gtr->L, -1));
      gt_error_set(err, "could not execute script %s",
                   lua_tostring(gtr->L, -1));
      had_err = -1;
      lua_pop(gtr->L, 1); /* pop error message */
    }
  }
  else {
    /* neither tool nor script found */
    gt_error_set(err, "neither tool nor script '%s' found; option -help "
                      "lists possible tools", argv[0]);
    had_err = -1;
  }
  gt_cstr_array_delete(nargv);

  if (!had_err && gtr->interactive) {
    gt_showshortversion(gt_error_get_progname(err));
    gt_lua_set_arg(gtr->L, gt_error_get_progname(err), argv);
    run_interactive_lua_interpreter(gtr->L);
  }

  return had_err ? EXIT_FAILURE : EXIT_SUCCESS;
}
Exemplo n.º 17
0
/* Iterate over all sequences in <queryfilenames> and pass each of them,
   together with its running number and description, to
   gmatchposinsinglesequence(). The reporting options (<minlength>,
   <maxlength>, <showsequence>, <showquerypos>, <showsubjectpos>) are
   forwarded via a Rangespecinfo, the index related arguments via a
   Substringinfo.
   Returns 0 on success and -1 if the sequence iterator could not be created
   or reading a sequence failed. */
int gt_findsubquerygmatchforward(const GtEncseq *encseq,
                                 const void *genericindex,
                                 unsigned long totallength,
                                 Greedygmatchforwardfunction gmatchforward,
                                 const GtAlphabet *alphabet,
                                 const GtStrArray *queryfilenames,
                                 Definedunsignedlong minlength,
                                 Definedunsignedlong maxlength,
                                 bool showsequence,
                                 bool showquerypos,
                                 bool showsubjectpos,
                                 GtError *err)
{
    Rangespecinfo rangespecinfo;
    Substringinfo substringinfo;
    GtSeqIterator *seqit;
    const GtUchar *query;
    unsigned long querylen;
    char *desc = NULL;
    uint64_t unitnum = 0;
    int retval;

    gt_error_check(err);

    /* output options */
    rangespecinfo.minlength = minlength;
    rangespecinfo.maxlength = maxlength;
    rangespecinfo.showsequence = showsequence;
    rangespecinfo.showquerypos = showquerypos;
    rangespecinfo.showsubjectpos = showsubjectpos;

    /* index, alphabet and callbacks used for every query sequence */
    substringinfo.encseq = encseq;
    substringinfo.genericindex = genericindex;
    substringinfo.totallength = totallength;
    substringinfo.alphabet = alphabet;
    substringinfo.gmatchforward = gmatchforward;
    substringinfo.preprocessgmatchlength = showunitnum;
    substringinfo.processgmatchlength = showifinlengthrange;
    substringinfo.postprocessgmatchlength = NULL;
    substringinfo.processinfo = &rangespecinfo;

    seqit = gt_seqiterator_sequence_buffer_new(queryfilenames, err);
    if (!seqit)
    {
        return -1;
    }
    gt_seqiterator_set_symbolmap(seqit, gt_alphabet_symbolmap(alphabet));

    /* a positive return value means that another sequence was delivered */
    while ((retval = gt_seqiterator_next(seqit, &query, &querylen, &desc,
                                         err)) > 0)
    {
        gmatchposinsinglesequence(&substringinfo, unitnum, query, querylen,
                                  desc);
        unitnum++;
    }
    gt_seqiterator_delete(seqit);

    /* retval == 0: all sequences processed, retval < 0: read error */
    return retval < 0 ? -1 : 0;
}
Exemplo n.º 18
0
/* Hashmap iteration callback which builds one mRNA feature node out of all
   feature nodes collected for a single transcript.

   <key>   the transcript ID (used as a C string for the "ID" and
           "transcript_id" attributes),
   <value> the GtArray of GtGenomeNode* belonging to that transcript,
   <data>  the ConstructionInfo; the new mRNA node is appended to
           cinfo->mRNAs.

   In a first pass, every node flagged with GTF_PARSER_STOP_CODON_FLAG is
   merged into a directly flanking CDS (or, in tidy mode, dropped if it is
   contained in a CDS) and removed from the array. Afterwards the range,
   strand and sequence ID of the mRNA are derived from the remaining nodes,
   which become the children of the new mRNA node.

   Returns 0 on success and -1 (with <err> set) on failure. */
static int construct_mRNAs(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtArray *gt_genome_node_array = (GtArray*) value,
          *mRNAs = (GtArray*) cinfo->mRNAs;
  GtGenomeNode *mRNA_node, *first_node, *gn;
  const char *tname;
  GtStrand mRNA_strand;
  GtRange mRNA_range;
  GtStr *mRNA_seqid;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);
   /* at least one node in array */
  gt_assert(gt_array_size(gt_genome_node_array));

  /* TODO: support discontinuous start/stop codons */
  i = 0;
  while (!had_err && i < gt_array_size(gt_genome_node_array)) {
    GtUword j;
    GtRange stop_codon_rng;
    bool found_cds = false;

    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    if (!gt_feature_node_get_attribute((GtFeatureNode*) gn,
                                       GTF_PARSER_STOP_CODON_FLAG)) {
      i++;
      continue;
    }

    stop_codon_rng = gt_genome_node_get_range(gn);
    for (j = 0; !had_err && j < gt_array_size(gt_genome_node_array); j++) {
      GtGenomeNode* gn2;
      GtRange this_rng;
      const char *this_type;
      gn2 = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, j);
      if (gn == gn2) continue;
      this_rng = gt_genome_node_get_range(gn2);
      this_type = gt_feature_node_get_type((GtFeatureNode*) gn2);
      if (this_type == gt_symbol(gt_ft_CDS)) {
        if (gt_range_contains(&this_rng, &stop_codon_rng)) {
          if (cinfo->tidy) {
            gt_warning("stop codon on line %u in file %s is contained in "
                       "CDS in line %u",
                       gt_genome_node_get_line_number(gn),
                       gt_genome_node_get_filename(gn),
                       gt_genome_node_get_line_number(gn2));
            found_cds = true;
          } else {
            gt_error_set(err, "stop codon on line %u in file %s is "
                              "contained in CDS in line %u",
                         gt_genome_node_get_line_number(gn),
                         gt_genome_node_get_filename(gn),
                         gt_genome_node_get_line_number(gn2));
            had_err = -1;
          }
          break;
        }
        /* stop codon directly follows the CDS: extend the CDS to the right */
        if (this_rng.end + 1 == stop_codon_rng.start) {
          this_rng.end = stop_codon_rng.end;
          gt_genome_node_set_range(gn2, &this_rng);
          found_cds = true;
          break;
        }
        /* stop codon directly precedes the CDS: extend the CDS to the left */
        if (this_rng.start == stop_codon_rng.end + 1) {
          this_rng.start = stop_codon_rng.start;
          gt_genome_node_set_range(gn2, &this_rng);
          found_cds = true;
          break;
        }
      }
    }

    if (found_cds) {
      /* The stop codon has been absorbed. Removing it moves the following
         element into slot i, so i must NOT be advanced here; otherwise that
         element would never be inspected. */
      gt_array_rem(gt_genome_node_array, i);
      gt_genome_node_delete(gn);
      continue;
    }

    if (!had_err) {
      if (cinfo->tidy) {
        gt_warning("found stop codon on line %u in file %s with no "
                   "flanking CDS, ignoring it",
                   gt_genome_node_get_line_number(gn),
                   gt_genome_node_get_filename(gn));
      } else {
        gt_error_set(err, "found stop codon on line %u in file %s with no "
                          "flanking CDS",
                     gt_genome_node_get_line_number(gn),
                     gt_genome_node_get_filename(gn));
        had_err = -1;
      }
    }
    i++;
  }

  /* Determine the range and the strand of the mRNA. This is done only now,
     after the stop codon pass, because that pass may have removed (and
     deleted) the node which was stored at index 0 before. A stop codon is
     only removed when a different CDS node was found in the array, hence the
     array cannot have become empty. */
  gt_assert(gt_array_size(gt_genome_node_array));
  first_node = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, 0);
  mRNA_range = gt_genome_node_get_range(first_node);
  mRNA_strand = gt_feature_node_get_strand((GtFeatureNode*) first_node);
  mRNA_seqid = gt_genome_node_get_seqid(first_node);

  for (i = 1; !had_err && i < gt_array_size(gt_genome_node_array); i++) {
    GtRange range;
    GtStrand strand;
    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    range = gt_genome_node_get_range(gn);
    mRNA_range = gt_range_join(&mRNA_range, &range);
    strand = gt_feature_node_get_strand((GtFeatureNode*) gn);
    if (strand != mRNA_strand) {
      gt_error_set(err, "feature %s on line %u has strand %c, but the "
                        "parent transcript has strand %c",
                   (const char*) key,
                   gt_genome_node_get_line_number(gn),
                   GT_STRAND_CHARS[strand],
                   GT_STRAND_CHARS[mRNA_strand]);
      had_err = -1;
      break;
    } else {
      mRNA_strand = gt_strand_join(mRNA_strand, strand);
    }
    if (!had_err && gt_str_cmp(mRNA_seqid, gt_genome_node_get_seqid(gn))) {
      gt_error_set(err, "The features on lines %u and %u refer to different "
                "genomic sequences (``seqname''), although they have the same "
                "gene IDs (``gene_id'') which must be globally unique",
                gt_genome_node_get_line_number(first_node),
                gt_genome_node_get_line_number(gn));
      had_err = -1;
      break;
    }
  }

  if (!had_err) {
    mRNA_node = gt_feature_node_new(mRNA_seqid, gt_ft_mRNA, mRNA_range.start,
                                    mRNA_range.end, mRNA_strand);
    gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node), "ID", key);
    gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node), "transcript_id",
                                  key);

    /* attach the transcript name, if a non-empty one has been recorded */
    if ((tname = gt_hashmap_get(cinfo->transcript_id_to_name_mapping,
                              (const char*) key)) && strlen(tname) > 0) {
      gt_feature_node_add_attribute((GtFeatureNode*) mRNA_node, GT_GFF_NAME,
                                      tname);
    }

    /* register children */
    for (i = 0; i < gt_array_size(gt_genome_node_array); i++) {
      gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
      gt_feature_node_add_child((GtFeatureNode*) mRNA_node,
                                (GtFeatureNode*) gt_genome_node_ref(gn));
    }

    /* store the mRNA */
    gt_array_add(mRNAs, mRNA_node);
  }

  return had_err;
}
Exemplo n.º 19
0
/* Hashmap iteration callback which builds one mRNA feature node out of all
   feature nodes collected for a single transcript.

   <key>   the transcript ID (used as key into the transcript name mapping),
   <value> the GtArray of GtGenomeNode* belonging to that transcript,
   <data>  the ConstructionInfo; the new mRNA node is appended to
           cinfo->mRNAs.

   The mRNA spans the joined range of all nodes. All nodes must have the
   same strand and refer to the same sequence ID, otherwise an error is
   reported.

   Returns 0 on success and -1 (with <err> set) on failure. */
static int construct_mRNAs(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtArray *gt_genome_node_array = (GtArray*) value,
          *mRNAs = (GtArray*) cinfo->mRNAs;
  GtGenomeNode *mRNA_node, *first_node, *gn;
  const char *tname;
  GtStrand mRNA_strand;
  GtRange mRNA_range;
  GtStr *mRNA_seqid;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);
   /* at least one node in array */
  gt_assert(gt_array_size(gt_genome_node_array));

  /* determine the range and the strand of the mRNA */
  first_node = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, 0);
  mRNA_range = gt_genome_node_get_range(first_node);
  mRNA_strand = gt_feature_node_get_strand((GtFeatureNode*) first_node);
  mRNA_seqid = gt_genome_node_get_seqid(first_node);
  for (i = 1; i < gt_array_size(gt_genome_node_array); i++) {
    GtRange range;
    GtStrand strand;
    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    range = gt_genome_node_get_range(gn);
    mRNA_range = gt_range_join(&mRNA_range, &range);
    /* Report inconsistent strands as a regular error instead of handing
       them to gt_strand_join(), which can fail an assertion on them. */
    strand = gt_feature_node_get_strand((GtFeatureNode*) gn);
    if (strand != mRNA_strand) {
      gt_error_set(err, "feature %s on line %u has a strand which differs "
                        "from the strand of the parent transcript",
                   (const char*) key,
                   gt_genome_node_get_line_number(gn));
      had_err = -1;
      break;
    }
    mRNA_strand = gt_strand_join(mRNA_strand, strand);
    if (gt_str_cmp(mRNA_seqid, gt_genome_node_get_seqid(gn))) {
      gt_error_set(err, "The features on lines %u and %u refer to different "
                "genomic sequences (``seqname''), although they have the same "
                "gene IDs (``gene_id'') which must be globally unique",
                gt_genome_node_get_line_number(first_node),
                gt_genome_node_get_line_number(gn));
      had_err = -1;
      break;
    }
  }

  if (!had_err) {
    mRNA_node = gt_feature_node_new(mRNA_seqid, gt_ft_mRNA, mRNA_range.start,
                                    mRNA_range.end, mRNA_strand);

    /* attach the transcript name, if one has been recorded */
    if ((tname = gt_hashmap_get(cinfo->transcript_id_to_name_mapping,
                              (const char*) key))) {
      gt_feature_node_add_attribute((GtFeatureNode*) mRNA_node, GT_GFF_NAME,
                                      tname);
    }

    /* register children */
    for (i = 0; i < gt_array_size(gt_genome_node_array); i++) {
      gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
      gt_feature_node_add_child((GtFeatureNode*) mRNA_node,
                                (GtFeatureNode*) gn);
    }

    /* store the mRNA */
    gt_array_add(mRNAs, mRNA_node);
  }

  return had_err;
}
Exemplo n.º 20
0
/* Create a new protein domain model set from the HMM files in <hmmfiles>.

   The HMM files are concatenated into a single file located in $TMPDIR (or
   /tmp if TMPDIR is not set) whose name is the MD5 fingerprint of the
   concatenated file names. Unless an index file (name plus
   GT_HMM_INDEX_SUFFIX) already exists, the concatenated file is (re)written
   and indexed by calling the external ``hmmpress'' program.

   Returns the new model set, or NULL with <err> set if hmmpress is not
   available, an HMM file does not exist or cannot be read, the concatenated
   file cannot be created, or hmmpress fails. */
GtPdomModelSet* gt_pdom_model_set_new(GtStrArray *hmmfiles, GtError *err)
{
  GtStr *concat_dbnames, *cmdline, *indexfilename = NULL;
  GtUword i;
  char *md5_hash;
  const char *tmpdir;
  int had_err = 0, rval,
      ch; /* must be an int: fgetc() returns EOF out of band */
  FILE *dest;
  GtPdomModelSet *pdom_model_set;
  gt_assert(hmmfiles);
  gt_error_check(err);

  /* make sure hmmpress can be executed at all */
  rval = system("hmmpress -h > /dev/null");
  if (rval == -1) {
    gt_error_set(err, "error executing system(hmmpress)");
    return NULL;
  }
#ifndef _WIN32
  if (WEXITSTATUS(rval) != 0) {
    gt_error_set(err, "cannot find the hmmpress executable in PATH");
    return NULL;
  }
#else
  /* XXX */
  gt_error_set(err, "hmmpress for Windows not implemented");
  return NULL;
#endif

  pdom_model_set = gt_calloc((size_t) 1, sizeof (GtPdomModelSet));
  concat_dbnames = gt_str_new();
  for (i = 0; !had_err && i < gt_str_array_size(hmmfiles); i++) {
    const char *filename = gt_str_array_get(hmmfiles, i);
    if (!gt_file_exists(filename)) {
      gt_error_set(err, "invalid HMM file: %s", filename);
      gt_str_delete(concat_dbnames);
      gt_free(pdom_model_set);
      return NULL;
    } else {
      gt_str_append_cstr(concat_dbnames, filename);
    }
  }
  if (!had_err) {
    /* name of the concatenated file: <tmpdir>/<md5 of all file names> */
    pdom_model_set->filename = gt_str_new();
    if (!(tmpdir = getenv("TMPDIR")))
      tmpdir = "/tmp";
    gt_str_append_cstr(pdom_model_set->filename, tmpdir);
    gt_str_append_char(pdom_model_set->filename, GT_PATH_SEPARATOR);
    md5_hash = gt_md5_fingerprint(gt_str_get(concat_dbnames),
                                  gt_str_length(concat_dbnames));
    gt_str_append_cstr(pdom_model_set->filename, md5_hash);
    gt_free(md5_hash);
    gt_str_delete(concat_dbnames);
    indexfilename = gt_str_new_cstr(gt_str_get(pdom_model_set->filename));
    gt_str_append_cstr(indexfilename, GT_HMM_INDEX_SUFFIX);
  }

  if (!gt_file_exists(gt_str_get(indexfilename))) {
    /* no index yet: concatenate all HMM files ... */
    dest = fopen(gt_str_get(pdom_model_set->filename), "w+");
    if (!dest) {
      gt_error_set(err, "could not create file %s",
                 gt_str_get(pdom_model_set->filename));
      had_err = -1;
    }
    if (!had_err) {
      for (i = 0; !had_err && i < gt_str_array_size(hmmfiles); i++) {
        FILE *source;
        const char *filename = gt_str_array_get(hmmfiles, i);
        source = fopen(filename, "r");
        if (!source) {
          gt_error_set(err, "could not open HMM file %s", filename);
          had_err = -1;
        }
        if (!had_err) {
          while ((ch = fgetc(source)) != EOF)
            (void) fputc(ch, dest);
          (void) fclose(source);
        }
      }
      (void) fclose(dest);
    }

    /* ... and index the result, but only if it was written completely */
    if (!had_err) {
      /* XXX: read hmmer path from env */
      /* NOTE(review): the file name is passed to the shell unquoted; a
         TMPDIR containing blanks or shell metacharacters breaks this call */
      cmdline = gt_str_new_cstr("hmmpress -f ");
      gt_str_append_str(cmdline, pdom_model_set->filename);
      gt_str_append_cstr(cmdline, "> /dev/null");   /* XXX: portability? */

      rval = system(gt_str_get(cmdline));
      gt_str_delete(cmdline);
      if (rval == -1) {
        gt_error_set(err, "error executing system(hmmpress)");
        /* fall through to the common cleanup below instead of returning,
           so that the model set and the index file name are released */
        had_err = -1;
      }
#ifndef _WIN32
      if (!had_err && WEXITSTATUS(rval) != 0) {
        gt_error_set(err, "an error occurred during HMM preprocessing");
        had_err = -1;
      }
#else
      if (!had_err) {
        gt_error_set(err, "WEXITSTATUS not implemented on Windows");
        had_err = -1;
      }
#endif
    }
  }

  if (had_err) {
    gt_pdom_model_set_delete(pdom_model_set);
    pdom_model_set = NULL;
  }
  gt_str_delete(indexfilename);
  return pdom_model_set;
}
Exemplo n.º 21
0
/* Parse the GTF file <fpin> (named <filenamestr>) line by line.

   Comment lines are added to <genome_nodes> as comment nodes. Every feature
   line is split into its 9 tab separated fields, converted into a feature
   node and stored in <parser> under its gene_id/transcript_id. Lines with an
   unknown feature type are skipped with a message on stderr. After all lines
   have been read, the region nodes are built and construct_genes() is
   applied to all collected genes.

   If <be_tolerant> is true, a line with an unparsable range, score, strand,
   phase or with a missing mandatory attribute (value) is reported on stderr
   and skipped; otherwise parsing stops with an error. A line which does not
   consist of 9 fields always stops parsing.

   Returns 0 on success; otherwise <err> is set and a non-zero value is
   returned. */
int gt_gtf_parser_parse(GtGTFParser *parser, GtQueue *genome_nodes,
                        GtStr *filenamestr, GtFile *fpin, bool be_tolerant,
                        GtError *err)
{
  GtStr *seqid_str, *source_str, *line_buffer;
  char *line;
  size_t line_length;
  GtUword i, line_number = 0;
  GtGenomeNode *gn;
  GtRange range;
  GtPhase phase_value;
  GtStrand gt_strand_value;
  GtSplitter *splitter, *attribute_splitter;
  float score_value;
  char *seqname,
       *source,
       *feature,
       *start,
       *end,
       *score,
       *strand,
       *frame,
       *attributes,
       *token,
       *gene_id,
       *gene_name = NULL,
       *transcript_id,
       *transcript_name = NULL,
       **tokens;
  GtHashmap *transcript_id_hash; /* map from transcript id to array of genome
                                    nodes */
  GtArray *gt_genome_node_array;
  ConstructionInfo cinfo;
  GTF_feature_type gtf_feature_type;
  GT_UNUSED bool gff_type_is_valid = false;
  const char *type = NULL;
  const char *filename;
  bool score_is_defined;
  int had_err = 0;

  gt_assert(parser && genome_nodes);
  gt_error_check(err);

  filename = gt_str_get(filenamestr);

  /* alloc */
  line_buffer = gt_str_new();
  splitter = gt_splitter_new();
  attribute_splitter = gt_splitter_new();

/* Must only be used directly inside the line loop below (not inside a nested
   loop), because its continue/break have to act on the line loop. */
#define HANDLE_ERROR                                                \
        if (had_err) {                                              \
          if (be_tolerant) {                                        \
            fprintf(stderr, "skipping line: %s\n", gt_error_get(err)); \
            gt_error_unset(err);                                       \
            gt_str_reset(line_buffer);                                 \
            had_err = 0;                                            \
            continue;                                               \
          }                                                         \
          else {                                                    \
            had_err = -1;                                           \
            break;                                                  \
          }                                                         \
        }

  while (gt_str_read_next_line_generic(line_buffer, fpin) != EOF) {
    line = gt_str_get(line_buffer);
    line_length = gt_str_length(line_buffer);
    line_number++;
    had_err = 0;

    if (line_length == 0) {
      gt_warning("skipping blank line " GT_WU " in file \"%s\"", line_number,
                 filename);
    }
    else if (line[0] == '#') {
      /* storing comment */
      if (line_length >= 2 && line[1] == '#')
        gn = gt_comment_node_new(line+2); /* store '##' line as '#' line */
      else
        gn = gt_comment_node_new(line+1);
      gt_genome_node_set_origin(gn, filenamestr, line_number);
      gt_queue_add(genome_nodes, gn);
    }
    else {
      /* process tab delimited GTF line */
      gt_splitter_reset(splitter);
      gt_splitter_split(splitter, line, line_length, '\t');
      if (gt_splitter_size(splitter) != 9UL) {
        gt_error_set(err, "line " GT_WU " in file \"%s\" contains " GT_WU
                     " tab (\\t) " "separated fields instead of 9", line_number,
                     filename,
                  gt_splitter_size(splitter));
        had_err = -1;
        break;
      }
      tokens = gt_splitter_get_tokens(splitter);
      seqname    = tokens[0];
      source     = tokens[1];
      feature    = tokens[2];
      start      = tokens[3];
      end        = tokens[4];
      score      = tokens[5];
      strand     = tokens[6];
      frame      = tokens[7];
      attributes = tokens[8];

      /* parse feature */
      if (GTF_feature_type_get(&gtf_feature_type, feature) == -1) {
        /* we skip unknown features */
        fprintf(stderr, "skipping line " GT_WU " in file \"%s\": unknown "
                "feature: \"%s\"\n", line_number, filename, feature);
        gt_str_reset(line_buffer);
        continue;
      }

      /* translate into GFF3 feature type */
      switch (gtf_feature_type) {
        case GTF_CDS:
        case GTF_stop_codon:
          gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker,
                                                       gt_ft_CDS);
          type = gt_ft_CDS;
          break;
        case GTF_exon:
          gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker,
                                                       gt_ft_exon);
          type = gt_ft_exon;
      }
      gt_assert(gff_type_is_valid);

      /* parse the range */
      had_err = gt_parse_range(&range, start, end, line_number, filename, err);
      HANDLE_ERROR;

      /* process seqname (we have to do it here because we need the range) */
      gt_region_node_builder_add_region(parser->region_node_builder, seqname,
                                        range);

      /* parse the score */
      had_err = gt_parse_score(&score_is_defined, &score_value, score,
                               line_number, filename, err);
      HANDLE_ERROR;

      /* parse the strand */
      had_err = gt_parse_strand(&gt_strand_value, strand, line_number, filename,
                               err);
      HANDLE_ERROR;

      /* parse the frame */
      had_err = gt_parse_phase(&phase_value, frame, line_number, filename, err);
      HANDLE_ERROR;

      /* parse the attributes */
      gt_splitter_reset(attribute_splitter);
      gene_id = NULL;
      transcript_id = NULL;
      /* the optional names point into the line buffer as well and must not
         be carried over from a previous line */
      gene_name = NULL;
      transcript_name = NULL;
      gt_splitter_split(attribute_splitter, attributes, strlen(attributes),
                        ';');
      for (i = 0; i < gt_splitter_size(attribute_splitter); i++) {
        token = gt_splitter_get_token(attribute_splitter, i);
        /* skip leading blanks */
        while (*token == ' ')
          token++;
        /* An attribute needs at least one separator character and one value
           character after its name. On a missing value the attribute loop is
           left with had_err set; the error is handled after the loop. */
        /* look for the two mandatory attributes */
        if (strncmp(token, GENE_ID_ATTRIBUTE, strlen(GENE_ID_ATTRIBUTE)) == 0) {
          if (strlen(token) < strlen(GENE_ID_ATTRIBUTE) + 2) {
            gt_error_set(err, "missing value to attribute \"%s\" on line "
                         GT_WU " in file \"%s\"", GENE_ID_ATTRIBUTE,
                         line_number, filename);
            had_err = -1;
            break;
          }
          gene_id = token + strlen(GENE_ID_ATTRIBUTE) + 1;
        }
        else if (strncmp(token, TRANSCRIPT_ID_ATTRIBUTE,
                         strlen(TRANSCRIPT_ID_ATTRIBUTE)) == 0) {
          if (strlen(token) < strlen(TRANSCRIPT_ID_ATTRIBUTE) + 2) {
            gt_error_set(err, "missing value to attribute \"%s\" on line "
                         GT_WU " in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE,
                         line_number, filename);
            had_err = -1;
            break;
          }
          transcript_id = token + strlen(TRANSCRIPT_ID_ATTRIBUTE) + 1;
        }
        else if (strncmp(token, GENE_NAME_ATTRIBUTE,
                         strlen(GENE_NAME_ATTRIBUTE)) == 0) {
          if (strlen(token) < strlen(GENE_NAME_ATTRIBUTE) + 2) {
            gt_error_set(err, "missing value to attribute \"%s\" on line "
                         GT_WU " in file \"%s\"", GENE_NAME_ATTRIBUTE,
                         line_number, filename);
            had_err = -1;
            break;
          }
          gene_name = token + strlen(GENE_NAME_ATTRIBUTE) + 1;
          /* for output we want to strip quotes */
          if (*gene_name == '"')
            gene_name++;
          if (*gene_name != '\0' && gene_name[strlen(gene_name)-1] == '"')
            gene_name[strlen(gene_name)-1] = '\0';
        }
        else if (strncmp(token, TRANSCRIPT_NAME_ATTRIBUTE,
                         strlen(TRANSCRIPT_NAME_ATTRIBUTE)) == 0) {
          if (strlen(token) < strlen(TRANSCRIPT_NAME_ATTRIBUTE) + 2) {
            gt_error_set(err, "missing value to attribute \"%s\" on line "
                         GT_WU " in file \"%s\"", TRANSCRIPT_NAME_ATTRIBUTE,
                         line_number, filename);
            had_err = -1;
            break;
          }
          transcript_name = token + strlen(TRANSCRIPT_NAME_ATTRIBUTE) + 1;
          /* for output we want to strip quotes */
          if (*transcript_name == '"')
            transcript_name++;
          if (*transcript_name != '\0'
                && transcript_name[strlen(transcript_name)-1] == '"')
            transcript_name[strlen(transcript_name)-1] = '\0';
        }
      }
      /* skip the line or stop parsing if an attribute value was missing */
      HANDLE_ERROR;

      /* check for the mandatory attributes */
      if (!gene_id) {
        gt_error_set(err, "missing attribute \"%s\" on line " GT_WU
                     " in file \"%s\"", GENE_ID_ATTRIBUTE, line_number,
                     filename);
        had_err = -1;
      }
      HANDLE_ERROR;
      if (!transcript_id) {
        gt_error_set(err, "missing attribute \"%s\" on line " GT_WU
                     " in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE, line_number,
                     filename);
        had_err = -1;
      }
      HANDLE_ERROR;

      /* process the mandatory attributes */
      if (!(transcript_id_hash = gt_hashmap_get(parser->gene_id_hash,
                                             gene_id))) {
        transcript_id_hash = gt_hashmap_new(GT_HASH_STRING, gt_free_func,
                                            (GtFree) gt_array_delete);
        gt_hashmap_add(parser->gene_id_hash, gt_cstr_dup(gene_id),
                    transcript_id_hash);
      }
      gt_assert(transcript_id_hash);

      if (!(gt_genome_node_array = gt_hashmap_get(transcript_id_hash,
                                            transcript_id))) {
        gt_genome_node_array = gt_array_new(sizeof (GtGenomeNode*));
        gt_hashmap_add(transcript_id_hash, gt_cstr_dup(transcript_id),
                    gt_genome_node_array);
      }
      gt_assert(gt_genome_node_array);

      /* save optional gene_name and transcript_name attributes */
      if (transcript_name
            && !gt_hashmap_get(parser->transcript_id_to_name_mapping,
                             transcript_id)) {
        gt_hashmap_add(parser->transcript_id_to_name_mapping,
                    gt_cstr_dup(transcript_id),
                    gt_cstr_dup(transcript_name));
      }
      if (gene_name && !gt_hashmap_get(parser->gene_id_to_name_mapping,
                                    gene_id)) {
        gt_hashmap_add(parser->gene_id_to_name_mapping,
                    gt_cstr_dup(gene_id),
                    gt_cstr_dup(gene_name));
      }

      /* get seqid */
      seqid_str = gt_hashmap_get(parser->seqid_to_str_mapping, seqname);
      if (!seqid_str) {
        seqid_str = gt_str_new_cstr(seqname);
        gt_hashmap_add(parser->seqid_to_str_mapping, gt_str_get(seqid_str),
                       seqid_str);
      }
      gt_assert(seqid_str);

      /* construct the new feature */
      gn = gt_feature_node_new(seqid_str, type, range.start, range.end,
                                 gt_strand_value);
      gt_genome_node_set_origin(gn, filenamestr, line_number);

      /* set source */
      source_str = gt_hashmap_get(parser->source_to_str_mapping, source);
      if (!source_str) {
        source_str = gt_str_new_cstr(source);
        gt_hashmap_add(parser->source_to_str_mapping, gt_str_get(source_str),
                    source_str);
      }
      gt_assert(source_str);
      gt_feature_node_set_source((GtFeatureNode*) gn, source_str);

      if (score_is_defined)
        gt_feature_node_set_score((GtFeatureNode*) gn, score_value);
      if (phase_value != GT_PHASE_UNDEFINED)
        gt_feature_node_set_phase((GtFeatureNode*) gn, phase_value);
      gt_array_add(gt_genome_node_array, gn);
    }

    gt_str_reset(line_buffer);
  }

  /* process all region nodes */
  if (!had_err)
    gt_region_node_builder_build(parser->region_node_builder, genome_nodes);

  /* process all feature nodes */
  cinfo.genome_nodes = genome_nodes;
  cinfo.gene_id_to_name_mapping = parser->gene_id_to_name_mapping;
  cinfo.transcript_id_to_name_mapping = parser->transcript_id_to_name_mapping;
  if (!had_err) {
    had_err = gt_hashmap_foreach(parser->gene_id_hash, construct_genes,
                              &cinfo, err);
  }

  /* free */
  gt_splitter_delete(splitter);
  gt_splitter_delete(attribute_splitter);
  gt_str_delete(line_buffer);

  return had_err;
}
Exemplo n.º 22
0
/* Copy the content of the FASTA file <filename> into a series of numbered
   output files. Each output name is built from the first
   gt_file_basename_length(filename) characters of <filename>, a '.', a
   running number (starting at 1) and the suffix belonging to the file mode of
   the source file.
   A new output file is started as soon as the bytes already written to the
   current one plus the bytes just read exceed <max_filesize> and
   buf_contains_separator() reports a separator in the buffer; the new file
   then begins with that '>' character.
   <force> is handed through to gt_output_file_xopen_forcecheck().
   Returns 0 on success and -1 (with <err> set) if the source file is empty,
   does not start with '>' or an output file could not be opened. */
static int split_fasta_file(const char *filename, GtUword max_filesize,
                            bool force, GtError *err)
{
  GtFile *infile = NULL, *outfile = NULL;
  GtStr *outname = NULL;
  GtUword part_no = 0, part_bytes = 0, sep_pos;
  int nread, had_err = 0;
  char buf[BUFSIZ];

  gt_error_check(err);
  gt_assert(filename && max_filesize);

  /* source file */
  infile = gt_file_xopen(filename, "r");
  gt_assert(infile);

  /* the first buffer decides whether the input is usable at all */
  nread = gt_file_xread(infile, buf, BUFSIZ);
  if (nread == 0) {
    gt_error_set(err, "file \"%s\" is empty", filename);
    had_err = -1;
  }
  part_bytes += nread;

  if (!had_err && buf[0] != '>') {
    gt_error_set(err, "file is not in FASTA format");
    had_err = -1;
  }

  if (!had_err) {
    /* first destination file; it receives the first buffer unconditionally */
    outname = gt_str_new();
    gt_str_append_cstr_nt(outname, filename,
                          gt_file_basename_length(filename));
    gt_str_append_char(outname, '.');
    part_no++;
    gt_str_append_ulong(outname, part_no);
    gt_str_append_cstr(outname, gt_file_mode_suffix(gt_file_mode(infile)));
    outfile = gt_output_file_xopen_forcecheck(gt_str_get(outname), "w", force,
                                              err);
    if (outfile == NULL)
      had_err = -1;
    else
      gt_file_xwrite(outfile, buf, nread);

    while (!had_err) {
      nread = gt_file_xread(infile, buf, BUFSIZ);
      if (nread == 0)
        break;

      /* only look for a split point once the size limit would be exceeded;
         a result of 0 means "no separator", otherwise it is position + 1 */
      sep_pos = 0;
      if (part_bytes + nread > max_filesize) {
        int offset = 0;
        if (part_bytes < max_filesize)
          offset = max_filesize - part_bytes;
        sep_pos = buf_contains_separator(buf, offset, nread);
      }

      if (!sep_pos) {
        /* no split in this buffer: append it completely */
        part_bytes += nread;
        gt_file_xwrite(outfile, buf, nread);
        continue;
      }

      /* finish the current file with everything in front of the separator */
      sep_pos--;
      gt_assert(sep_pos < nread);
      if (sep_pos)
        gt_file_xwrite(outfile, buf, sep_pos);
      gt_file_delete(outfile);

      /* next destination file */
      gt_str_reset(outname);
      gt_str_append_cstr_nt(outname, filename,
                            gt_file_basename_length(filename));
      gt_str_append_char(outname, '.');
      part_no++;
      gt_str_append_ulong(outname, part_no);
      gt_str_append_cstr(outname, gt_file_mode_suffix(gt_file_mode(infile)));
      outfile = gt_output_file_xopen_forcecheck(gt_str_get(outname), "w",
                                                force, err);
      if (outfile == NULL) {
        had_err = -1;
        break;
      }

      /* the remainder of the buffer starts the new file */
      part_bytes = nread - sep_pos;
      gt_assert(buf[sep_pos] == '>');
      gt_file_xwrite(outfile, buf + sep_pos, nread - sep_pos);
    }
  }

  /* cleanup: name buffer, current destination file, source file */
  gt_str_delete(outname);
  gt_file_delete(outfile);
  gt_file_delete(infile);

  return had_err;
}
Exemplo n.º 23
0
/* Run the finite state machine based FASTA reader <fasta_reader> over its
   sequence file, reading it character by character.
   - <proc_description> (if given) is called once per entry with the
     description line (without the leading separator, '\r' and '\n').
   - <proc_sequence_part> (if given) is called with chunks of the sequence;
     a chunk is flushed when BUFSIZ characters have been collected and at the
     end of each entry.
   - <proc_sequence_length> (if given) is called once per entry with the
     number of sequence characters.
   At least one of the three callbacks has to be defined. <data> is passed
   through to the callbacks unchanged.
   Blanks and carriage returns inside sequence lines are neither stored nor
   counted, so the reported length always matches the characters handed to
   <proc_sequence_part>. A separator character only starts a new entry when
   it is the first character of a line.
   Returns 0 on success; on a malformed file or if a callback fails, a value
   != 0 is returned and <err> is set. */
static int gt_fasta_reader_fsm_run(GtFastaReader *fasta_reader,
                                   GtFastaReaderProcDescription
                                   proc_description,
                                   GtFastaReaderProcSequencePart
                                   proc_sequence_part,
                                   GtFastaReaderProcSequenceLength
                                   proc_sequence_length,
                                   void *data, GtError *err)
{
  GtFastaReaderFSM *fr = gt_fasta_reader_fsm_cast(fasta_reader);
  unsigned char cc;
  GtFastaReaderState state = EXPECTING_SEPARATOR;
  GtUword sequence_length = 0, line_counter = 1;
  GtStr *description, *sequence;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(fr);

  /* init */
  description = gt_str_new();
  sequence    = gt_str_new();

  /* at least one function has to be defined */
  gt_assert(proc_description || proc_sequence_part || proc_sequence_length);

  /* rewind sequence file (to allow multiple calls) */
  if (fr->sequence_file)
    gt_file_xrewind(fr->sequence_file);

  /* reading */
  while (!had_err && gt_file_xread(fr->sequence_file, &cc, 1) != 0) {
    switch (state) {
      case EXPECTING_SEPARATOR:
        if (cc != GT_FASTA_SEPARATOR) {
          gt_error_set(err,
                    "the first character of fasta file \"%s\" has to be '%c'",
                    gt_str_get(fr->sequence_filename), GT_FASTA_SEPARATOR);
          had_err = -1;
        }
        else
          state = READING_DESCRIPTION;
        break;
      case READING_DESCRIPTION:
        if (cc == '\n') {
          if (proc_description) {
            had_err = proc_description(gt_str_get(description),
                                       gt_str_length(description), data, err);
            if (!had_err)
              gt_str_reset(description);
          }
          if (!had_err) {
            sequence_length = 0;
            line_counter++;
            state = READING_SEQUENCE_AFTER_NEWLINE;
          }
        }
        else if (proc_description && cc != '\r')
          gt_str_append_char(description, cc);
        break;
      case READING_SEQUENCE_AFTER_NEWLINE:
        if (cc == GT_FASTA_SEPARATOR) {
          if (!sequence_length) {
            gt_assert(line_counter);
            gt_error_set(err, "empty sequence after description given in line "
                              ""GT_WU"", line_counter - 1);
            had_err = -1;
            break;
          }
          else {
            /* hand over what is left of the finished entry */
            if (proc_sequence_part) {
              gt_assert(gt_str_length(sequence));
              had_err = proc_sequence_part(gt_str_get(sequence),
                                           gt_str_length(sequence), data, err);
            }
            if (had_err)
              break;
            gt_str_reset(sequence);
            if (proc_sequence_length)
              had_err = proc_sequence_length(sequence_length, data, err);
            if (had_err)
              break;
            state = READING_DESCRIPTION;
            continue;
          }
        }
        /* not a separator: we are inside a sequence line now, a later
           separator character on this line must not start a new entry
           (a newline switches back below) */
        state = READING_SEQUENCE;
        /*@fallthrough@*/
      case READING_SEQUENCE:
        if (cc == '\n') {
          line_counter++;
          state = READING_SEQUENCE_AFTER_NEWLINE;
        }
        else if (cc != ' ' && cc != '\r') {
          /* blanks and carriage returns are skipped entirely, so that the
             length and the stored characters stay in sync and an entry
             consisting only of such characters is reported as empty */
          sequence_length++;
          if (proc_sequence_part) {
            if (gt_str_length(sequence) == BUFSIZ) {
              /* buffer full: flush before storing the next character */
              had_err = proc_sequence_part(gt_str_get(sequence),
                                           gt_str_length(sequence), data, err);
              if (had_err)
                break;
              gt_str_reset(sequence);
            }
            gt_str_append_char(sequence, cc);
          }
        }
        break;
    }
  }

  if (!had_err) {
    /* checks after reading */
    switch (state) {
      case EXPECTING_SEPARATOR:
        gt_error_set(err, "sequence file \"%s\" is empty",
                  gt_str_get(fr->sequence_filename));
        had_err = -1;
        break;
      case READING_DESCRIPTION:
        gt_error_set(err, "unfinished fasta entry in line " GT_WU
                     " of sequence file \"%s\"",
                     line_counter, gt_str_get(fr->sequence_filename));
        had_err = -1;
        break;
      case READING_SEQUENCE_AFTER_NEWLINE:
      case READING_SEQUENCE:
        if (!sequence_length) {
          gt_assert(line_counter);
          gt_error_set(err, "empty sequence after description given in line "
                            ""GT_WU"", line_counter - 1);
          had_err = -1;
        }
        else {
          /* the last entry is not terminated by a separator: flush it here */
          if (proc_sequence_part) {
            gt_assert(gt_str_length(sequence));
            had_err = proc_sequence_part(gt_str_get(sequence),
                                         gt_str_length(sequence), data, err);
          }
          if (!had_err && proc_sequence_length)
            had_err = proc_sequence_length(sequence_length, data, err);
        }
    }
  }

  /* free */
  gt_str_delete(sequence);
  gt_str_delete(description);

  return had_err;
}
/* Check all entries of <xrf_abbr_parse_tree> for consistency:
   - every entry needs a value for the XRF_LABEL_ABBREVIATION label,
   - abbreviations must be unique within the parse tree,
   - a shorthand name (if present) must be shorter than 10 characters,
   - a local ID syntax (if present) must be accepted by gt_grep() as a
     regular expression.
   Validation stops at the first violation. Returns 0 if all entries are
   valid, otherwise -1 with <err> set. */
static int gt_xrf_abbr_parse_tree_validate_entries(const GtXRFAbbrParseTree
                                                           *xrf_abbr_parse_tree,
                                                   GtError *err)
{
  GtUword idx = 0;
  GtHashmap *seen_abbrvs;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(xrf_abbr_parse_tree);

  /* keys and values are owned by the entries, hence no free functions */
  seen_abbrvs = gt_hashmap_new(GT_HASH_STRING, NULL, NULL);

  while (!had_err
           && idx < gt_xrf_abbr_parse_tree_num_of_entries(xrf_abbr_parse_tree))
  {
    GtXRFAbbrEntry *entry = *(GtXRFAbbrEntry**)
                                gt_array_get(xrf_abbr_parse_tree->entries, idx);
    const char *abbrv, *shorthand, *id_syntax;

    /* mandatory and unique abbreviation */
    abbrv = gt_xrf_abbr_entry_get_value(entry, XRF_LABEL_ABBREVIATION);
    if (!abbrv) {
      gt_error_set(err, "file \"%s\": line "GT_WU": required "
                        "label \"" XRF_LABEL_ABBREVIATION "\" missing",
                   gt_xrf_abbr_entry_filename(entry),
                   gt_xrf_abbr_entry_line(entry));
      had_err = -1;
    }
    else if (gt_hashmap_get(seen_abbrvs, abbrv)) {
      gt_error_set(err, "file \"%s\": line "GT_WU": duplicate abbreviation "
                        "\"%s\", must be unique",
                   gt_xrf_abbr_entry_filename(entry),
                   gt_xrf_abbr_entry_line(entry),
                   abbrv);
      had_err = -1;
    }
    else
      gt_hashmap_add(seen_abbrvs, (void*) abbrv, (void*) abbrv);

    /* optional shorthand name with a length limit */
    if (!had_err) {
      shorthand = gt_xrf_abbr_entry_get_value(entry, XRF_LABEL_SHORTHAND_NAME);
      if (shorthand && strlen(shorthand) >= 10) {
        gt_error_set(err, "file \"%s\": line "GT_WU": length of "
                          "shorthand name \"%s\" "
                          "is not less than 10 characters",
                     gt_xrf_abbr_entry_filename(entry),
                     gt_xrf_abbr_entry_line(entry), shorthand);
        had_err = -1;
      }
    }

    /* optional local ID syntax: try it on the empty string to find out
       whether the expression is accepted at all */
    if (!had_err) {
      id_syntax = gt_xrf_abbr_entry_get_value(entry,
                                              XRF_LABEL_LOCAL_ID_SYNTAX);
      if (id_syntax) {
        GtError *regex_error = gt_error_new();
        bool match;
        if (gt_grep(&match, id_syntax, "", regex_error)) {
          gt_error_set(err, "file \"%s\": line "GT_WU": invalid "
                            "regular expression \"%s\" (%s)",
                       gt_xrf_abbr_entry_filename(entry),
                       gt_xrf_abbr_entry_line(entry), id_syntax,
                       gt_error_get(regex_error));
          had_err = -1;
        }
        gt_error_delete(regex_error);
      }
    }

    idx++;
  }

  gt_hashmap_delete(seen_abbrvs);
  return had_err;
}
Exemplo n.º 25
0
/* Entry point of the paircmp tool. Parses the options given in <argv> and,
   depending on them, either
   - prints the greedy unit edit distance of two strings together with their
     lengths and an error percentage,
   - prints an edit distance alignment of two strings, or
   - runs all pairwise comparison check functions on the configured inputs
     (which are read from two FASTA files first if the fasta option is set).
   Returns 0 on success or if the option parser requests an exit, and -1 if
   option parsing or reading one of the FASTA files failed (<err> is set). */
int gt_paircmp(int argc, const char **argv, GtError *err)
{
  int parsed_args, had_err = 0;
  Cmppairwiseopt cmppairwise;
  GtOPrval oprval;
  GtFastaReader *reader0 = NULL,
                *reader1 = NULL;

  gt_error_check(err);

  oprval = parse_options(&parsed_args, &cmppairwise, argc, argv, err);
  if (oprval == GT_OPTION_PARSER_OK)
  {
    gt_assert(parsed_args == argc);
    showsimpleoptions(&cmppairwise);
    if (cmppairwise.showedist)
    {
      GtUword edist, len1, len2, percentage;
      GtStr *s1, *s2;

      gt_assert(gt_str_array_size(cmppairwise.strings) >= 2);
      s1 = gt_str_array_get_str(cmppairwise.strings,0);
      s2 = gt_str_array_get_str(cmppairwise.strings,1UL);
      len1 = gt_str_length(s1);
      len2 = gt_str_length(s2);
      edist = gt_computegreedyunitedist((const GtUchar *) gt_str_get(s1),
                                        len1,
                                        (const GtUchar *) gt_str_get(s2),
                                        len2);
      /* two empty strings would lead to a division by zero */
      percentage = (len1 + len2 > 0) ? (200 * edist)/(len1+len2) : 0;
      printf(GT_WU " " GT_WU " " GT_WU " " GT_WU "%% errors\n",
             edist, len1,len2,percentage);
    }
    else if (cmppairwise.print)
    {
      const GtStr *str0, *str1;

      gt_assert(gt_str_array_size(cmppairwise.strings) >= 2);
      str0 = gt_str_array_get_str(cmppairwise.strings,0);
      str1 = gt_str_array_get_str(cmppairwise.strings,1);
      gt_print_edist_alignment((const GtUchar *) gt_str_get(str0),0,
                               gt_str_length(str0),
                               (const GtUchar *) gt_str_get(str1),0,
                               gt_str_length(str1));
    } else
    {
      size_t idx;
      Checkfunctiontabentry checkfunction_tab[] = {
        MAKECheckfunctiontabentry(gt_checkgreedyunitedist),
        MAKECheckfunctiontabentry(gt_checklinearspace),
        MAKECheckfunctiontabentry(gt_checklinearspace_local),
        MAKECheckfunctiontabentry(gt_checkaffinelinearspace),
        MAKECheckfunctiontabentry(gt_checkaffinelinearspace_local),
        MAKECheckfunctiontabentry(gt_checkdiagonalbandalign),
        MAKECheckfunctiontabentry(gt_checkdiagonalbandaffinealign)
      };

      if (cmppairwise.fasta)
      {
        gt_assert(gt_str_array_size(cmppairwise.files) == 3);
        cmppairwise.fastasequences0 = gt_str_array_new();
        cmppairwise.fastasequences1 = gt_str_array_new();

        /* a failing reader sets <err>; stop instead of running the checks on
           incomplete input */
        reader0 = gt_fasta_reader_rec_new(gt_str_array_get_str(
                                                        cmppairwise.files,1UL));
        had_err = gt_fasta_reader_run(reader0, NULL, save_fastaentry,
                                      NULL, cmppairwise.fastasequences0, err);
        if (!had_err)
        {
          reader1 = gt_fasta_reader_rec_new (gt_str_array_get_str(
                                                        cmppairwise.files,2UL));
          had_err = gt_fasta_reader_run(reader1, NULL, save_fastaentry,
                                        NULL, cmppairwise.fastasequences1, err);
        }
      }
      for (idx = 0;
           !had_err &&
           idx < sizeof checkfunction_tab/sizeof checkfunction_tab[0];
           idx++)
      {
        GtUword testcases;

        printf("run %s\n",checkfunction_tab[idx].name);
        testcases
          = applycheckfunctiontosimpleoptions(checkfunction_tab[idx].function,
                                              &cmppairwise);
        printf("# number of testcases for %s: " GT_WU "\n",
               checkfunction_tab[idx].name,testcases);
      }
      gt_fasta_reader_delete(reader0);
      gt_fasta_reader_delete(reader1);
    }
  }
  freesimpleoption(&cmppairwise);
  if (oprval == GT_OPTION_PARSER_ERROR)
  {
    return -1;
  }
  /* GT_OPTION_PARSER_REQUESTS_EXIT leaves had_err at 0 */
  return had_err ? -1 : 0;
}
Exemplo n.º 26
0
/* Runner of the genomediff tool.
   The arguments <argv>[<parsed_args>..<argc>-1] are collected as file names.
   With more than one file name, an encoded sequence is built from them under
   the configured index name; otherwise the single file name is used as index
   name and, for the esa/pck variants, the project file of the index is
   scanned for a "mirrored=1" line.
   Afterwards the encoded sequence is loaded, the unit information is set up
   (optionally read from the unit file), the shulen sums are computed (from an
   existing esa/pck index or via runsuffixerator()) and passed to
   gt_genomediff_kr_calc().
   <tool_arguments> has to point to a GtGenomediffArguments object.
   Returns 0 on success, a value != 0 otherwise (<err> is set by the failing
   component). */
static int gt_genomediff_runner(int argc, const char **argv,
                                int parsed_args, void *tool_arguments,
                                GtError *err)
{
  bool mirrored = false;
  int had_err = 0,
      i;
  GtEncseq              *encseq = NULL;
  GtGenomediffArguments *arguments = tool_arguments;
  GtLogger              *logger;
  GtShuUnitFileInfo     *unit_info = NULL;
  GtTimer               *timer = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose,
                         GT_LOGGER_DEFLT_PREFIX,
                         stdout);
  gt_assert(logger);

  for (i = parsed_args; i < argc; i++) {
    gt_str_array_add_cstr(arguments->filenames, argv[i]);
  }

  if (gt_showtime_enabled()) {
    timer = gt_timer_new_with_progress_description("start");
    /* check the timer before it is used for the first time */
    gt_assert(timer);
    gt_timer_start(timer);
  }

  if (arguments->with_units) {
    gt_logger_log(logger, "unitfile option set, filename is %s\n",
                  gt_str_get(arguments->unitfile));
  }

  if (timer != NULL)
    gt_timer_show_progress(timer, "start shu search", stdout);

  if (gt_str_array_size(arguments->filenames) > 1UL) {
    GtEncseqEncoder *ee = gt_encseq_encoder_new();
    gt_encseq_encoder_set_timer(ee, timer);
    gt_encseq_encoder_set_logger(ee, logger);
    /* kr only makes sense for dna, so we can check this already with ee */
    gt_encseq_encoder_set_input_dna(ee);
    had_err = gt_encseq_encoder_encode(ee, arguments->filenames,
                                       gt_str_get(arguments->indexname), err);
    gt_encseq_encoder_delete(ee);
  }
  else {
    gt_str_append_str(arguments->indexname,
                      gt_str_array_get_str(arguments->filenames, 0));
    if (arguments->with_esa || arguments->with_pck) {
      GtStr *current_line = gt_str_new();
      FILE *prj_fp;

      prj_fp = gt_fa_fopen_with_suffix(gt_str_get(arguments->indexname),
                                       GT_PROJECTFILESUFFIX,"rb",err);
      if (prj_fp == NULL)
        had_err = -1;
      /* scan the "key=value" lines of the project file for the mirror flag */
      while (!had_err && gt_str_read_next_line(current_line, prj_fp) != EOF) {
        char **elements = gt_cstr_split(gt_str_get(current_line), '='),
             **element;

        if (elements[0] != NULL) {
          gt_log_log("%s", elements[0]);
          /* a line without '=' has no value: elements[1] is NULL then and
             must neither be printed nor compared */
          if (elements[1] != NULL && strcmp("mirrored", elements[0]) == 0) {
            gt_log_log("%s", elements[1]);
            if (strcmp("1", elements[1]) == 0) {
              mirrored = true;
              gt_log_log("sequences are treated as mirrored");
            }
          }
        }
        /* the split result is a NULL-terminated array: free every field,
           not only the first two */
        for (element = elements; *element != NULL; element++)
          gt_free(*element);
        gt_free(elements);
        gt_str_reset(current_line);
      }
      gt_str_delete(current_line);
      gt_fa_xfclose(prj_fp);
    }
  }

  if (!had_err) {
    GtEncseqLoader *el = gt_encseq_loader_new_from_options(arguments->loadopts,
                                                           err);
    if (mirrored)
      gt_encseq_loader_mirror(el);
    encseq =
      gt_encseq_loader_load(el, gt_str_get(arguments->indexname), err);
    gt_encseq_loader_delete(el);
  }
  if (encseq == NULL)
    had_err = -1;
  if (!had_err) {
    unit_info = gt_shu_unit_info_new(encseq);
    if (arguments->with_units)
      had_err = gt_shu_unit_file_info_read(arguments->unitfile, unit_info,
                                           logger, err);
  }

  if (!had_err) {
    uint64_t **shusums = NULL;
    if (arguments->with_esa || arguments->with_pck) {
      shusums = gt_genomediff_shulen_sum(arguments, unit_info,
                                         logger, timer, err);
      if (shusums == NULL)
        had_err = -1;
    }
    else {
      const bool doesa = true;
      GenomediffInfo gd_info;
      Suffixeratoroptions sopts;
      sopts.beverbose = arguments->verbose;
      sopts.indexname = arguments->indexname;
      sopts.db = NULL;
      sopts.encopts = NULL;
      sopts.genomediff = true;
      sopts.inputindex = arguments->indexname;
      sopts.loadopts = arguments->loadopts;
      sopts.showprogress = false;
      sopts.idxopts = arguments->idxopts;

      gt_assert(unit_info != NULL);
      gt_array2dim_calloc(shusums, unit_info->num_of_genomes,
                          unit_info->num_of_genomes);
      gd_info.shulensums = shusums;
      gd_info.unit_info = unit_info;
      had_err = runsuffixerator(doesa, &sopts, &gd_info, logger, err);
    }
    if (!had_err && shusums != NULL) {
      had_err = gt_genomediff_kr_calc(shusums, arguments, unit_info,
                                      arguments->with_pck, logger, timer, err);
    }
    /* release the table on the error path as well (it is allocated before
       runsuffixerator() is called) */
    if (shusums != NULL) {
      gt_array2dim_delete(shusums);
    }
  }

  if (timer != NULL) {
    gt_timer_show_progress_final(timer, stdout);
    gt_timer_delete(timer);
  }
  gt_logger_delete(logger);
  gt_encseq_delete(encseq);
  gt_shu_unit_info_delete(unit_info);

  return had_err;
}
Exemplo n.º 27
0
/* Fill <fmindex> from the index files belonging to <indexname>:
   the file with suffix FMASCIIFILESUFFIX is scanned by
   scanfmafileviafileptr(), the BWT is obtained from mapbwtencoding(), the
   alphabet is taken (as a new reference) from that encoded sequence, the key
   values are computed and finally the file with suffix FMDATAFILESUFFIX is
   handed to gt_fillfmmapspecstartptr().
   Each step is only performed if all previous steps succeeded. On failure
   <fmindex> is released with gt_freefmindex() and -1 is returned, otherwise
   the return value is 0. */
int gt_mapfmindex (Fmindex *fmindex,const char *indexname,
                GtLogger *logger,GtError *err)
{
  FILE *asciifp;
  bool failed, storeindexpos = true;
  GtSpecialcharinfo specialcharinfo;

  gt_error_check(err);

  /* start from a defined state, so that cleanup is safe at any point */
  fmindex->mappedptr = NULL;
  fmindex->bwtformatching = NULL;
  fmindex->alphabet = NULL;

  /* step 1: ASCII part of the index */
  asciifp = gt_fa_fopen_with_suffix(indexname,FMASCIIFILESUFFIX,"rb",err);
  failed = (asciifp == NULL);
  if (!failed &&
      scanfmafileviafileptr(fmindex,
                            &specialcharinfo,
                            &storeindexpos,
                            indexname,
                            asciifp,
                            logger,
                            err) != 0)
  {
    failed = true;
  }
  gt_fa_xfclose(asciifp);

  /* step 2: BWT */
  if (!failed)
  {
    fmindex->bwtformatching = mapbwtencoding(indexname,logger,err);
    failed = (fmindex->bwtformatching == NULL);
  }

  /* step 3: special positions and alphabet */
  if (!failed)
  {
    fmindex->specpos.nextfreeGtPairBwtidx
      = (GtUword) gt_determinenumberofspecialstostore(&specialcharinfo);
    fmindex->specpos.spaceGtPairBwtidx = NULL;
    fmindex->specpos.allocatedGtPairBwtidx = 0;
    fmindex->alphabet = gt_alphabet_ref(
                                  gt_encseq_alphabet(fmindex->bwtformatching));
    failed = (fmindex->alphabet == NULL);
  }

  /* step 4: key values and data part of the index */
  if (!failed)
  {
    GtStr *datafilename;

    gt_computefmkeyvalues (fmindex,
                           &specialcharinfo,
                           fmindex->bwtlength,
                           fmindex->log2bsize,
                           fmindex->log2markdist,
                           gt_alphabet_num_of_chars(fmindex->alphabet),
                           fmindex->suffixlength,
                           storeindexpos);
    datafilename = gt_str_new_cstr(indexname);
    gt_str_append_cstr(datafilename,FMDATAFILESUFFIX);
    failed = (gt_fillfmmapspecstartptr(fmindex,storeindexpos,datafilename,
                                       err) != 0);
    gt_str_delete(datafilename);
  }

  if (failed)
  {
    gt_freefmindex(fmindex);
    return -1;
  }
  return 0;
}
Exemplo n.º 28
0
static int gt_encseq_info_runner(GT_UNUSED int argc, const char **argv,
                           int parsed_args, void *tool_arguments,
                           GtError *err)
{
  GtEncseqInfoArguments *arguments = tool_arguments;
  int had_err = 0;
  GtAlphabet *alpha;
  const GtUchar *chars;
  gt_error_check(err);
  gt_assert(arguments);

  if (arguments->nomap) {
    GtEncseqMetadata *emd = gt_encseq_metadata_new(argv[parsed_args], err);
    if (!emd)
      had_err = -1;

    if (!had_err) {
      if (!arguments->noindexname) {
        gt_file_xprintf(arguments->outfp, "index name: ");
        gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]);
      }

      gt_file_xprintf(arguments->outfp, "file format version: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                          gt_encseq_metadata_version(emd));

      gt_file_xprintf(arguments->outfp, "64-bit file: ");
      gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_metadata_is64bit(emd)
                                                  ? "yes"
                                                  : "no");

      gt_file_xprintf(arguments->outfp, "total length: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_metadata_total_length(emd));

      gt_file_xprintf(arguments->outfp, "number of sequences: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                      gt_encseq_metadata_num_of_sequences(emd));

      gt_file_xprintf(arguments->outfp, "number of files: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_metadata_num_of_files(emd));

      gt_file_xprintf(arguments->outfp, "length of shortest/longest "
                                        "sequence: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n",
                                        gt_encseq_metadata_min_seq_length(emd),
                                        gt_encseq_metadata_max_seq_length(emd));

      gt_file_xprintf(arguments->outfp, "accesstype: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                 gt_encseq_access_type_str(gt_encseq_metadata_accesstype(emd)));

      alpha = gt_encseq_metadata_alphabet(emd);
      chars = gt_alphabet_characters(alpha);
      gt_file_xprintf(arguments->outfp, "alphabet size: ");
      gt_file_xprintf(arguments->outfp, "%u\n",
                                        gt_alphabet_num_of_chars(alpha));
      gt_file_xprintf(arguments->outfp, "alphabet characters: ");
      gt_file_xprintf(arguments->outfp, "%.*s", gt_alphabet_num_of_chars(alpha),
                                        (char*) chars);
      if (gt_alphabet_is_dna(alpha))
        gt_file_xprintf(arguments->outfp, " (DNA)");
      if (gt_alphabet_is_protein(alpha))
        gt_file_xprintf(arguments->outfp, " (Protein)");
      gt_file_xprintf(arguments->outfp, "\n");
      if (arguments->show_alphabet) {
        GtStr *out = gt_str_new();
        gt_alphabet_to_str(alpha, out);
        gt_file_xprintf(arguments->outfp, "alphabet definition:\n");
        gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out));
        gt_str_delete(out);
      }

    }
    gt_encseq_metadata_delete(emd);
  } else {
    GtEncseqLoader *encseq_loader;
    GtEncseq *encseq;

    encseq_loader = gt_encseq_loader_new();
    if (arguments->mirror)
      gt_encseq_loader_mirror(encseq_loader);
    if (!(encseq = gt_encseq_loader_load(encseq_loader,
                                         argv[parsed_args], err)))
      had_err = -1;

    if (!had_err) {
      const GtStrArray *filenames;
      GtUword i;

      if (!arguments->noindexname) {
        gt_file_xprintf(arguments->outfp, "index name: ");
        gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]);
      }

      gt_file_xprintf(arguments->outfp, "file format version: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_version(encseq));

      gt_file_xprintf(arguments->outfp, "64-bit file: ");
      gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_is_64_bit(encseq)
                                                   ? "yes"
                                                   : "no");

      gt_file_xprintf(arguments->outfp, "total length: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_total_length(encseq));

      gt_file_xprintf(arguments->outfp, "compressed size: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU" bytes\n",
                                        gt_encseq_sizeofrep(encseq));

      gt_file_xprintf(arguments->outfp, "number of sequences: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_num_of_sequences(encseq));

      gt_file_xprintf(arguments->outfp, "number of files: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_num_of_files(encseq));

      gt_file_xprintf(arguments->outfp, "length of shortest/longest "
                                        "sequence: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n",
                                      gt_encseq_min_seq_length(encseq),
                                      gt_encseq_max_seq_length(encseq));

      filenames = gt_encseq_filenames(encseq);
      gt_file_xprintf(arguments->outfp, "original filenames:\n");
      for (i = 0; i < gt_str_array_size(filenames); i++) {
        gt_file_xprintf(arguments->outfp, "\t%s ("GT_WU" characters)\n",
                                          gt_str_array_get(filenames, i),
                                          (GtUword)
                                     gt_encseq_effective_filelength(encseq, i));
      }

      alpha = gt_encseq_alphabet(encseq);
      chars = gt_alphabet_characters(alpha);
      gt_file_xprintf(arguments->outfp, "alphabet size: ");
      gt_file_xprintf(arguments->outfp, "%u\n",
                                        gt_alphabet_num_of_chars(alpha));
      gt_file_xprintf(arguments->outfp, "alphabet characters: ");
      gt_file_xprintf(arguments->outfp, "%.*s", gt_alphabet_num_of_chars(alpha),
                                        (char*) chars);
      if (gt_alphabet_is_dna(alpha))
        gt_file_xprintf(arguments->outfp, " (DNA)");
      if (gt_alphabet_is_protein(alpha))
        gt_file_xprintf(arguments->outfp, " (Protein)");
      gt_file_xprintf(arguments->outfp, "\n");
      if (arguments->show_alphabet) {
        GtStr *out = gt_str_new();
        gt_alphabet_to_str(alpha, out);
        gt_file_xprintf(arguments->outfp, "alphabet definition:\n");
        gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out));
        gt_str_delete(out);
      }

      gt_file_xprintf(arguments->outfp, "character distribution:\n");
      for (i = 0; i < gt_alphabet_num_of_chars(alpha); i++) {
        GtUword cc;
        cc = gt_encseq_charcount(encseq, gt_alphabet_encode(alpha, chars[i]));
        gt_file_xprintf(arguments->outfp, "\t%c: "GT_WU" (%.2f%%)\n",
                                          (char) chars[i],
                                          cc,
                             (cc /(double) (gt_encseq_total_length(encseq)
                                  - gt_encseq_num_of_sequences(encseq)+1))*100);
      }

      gt_file_xprintf(arguments->outfp, "number of wildcards: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n",
                                        gt_encseq_wildcards(encseq),
                                        gt_encseq_realwildcardranges(encseq));

      gt_file_xprintf(arguments->outfp, "number of special characters: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n",
                                        gt_encseq_specialcharacters(encseq),
                                        gt_encseq_realspecialranges(encseq));

      gt_file_xprintf(arguments->outfp, "length of longest non-special "
                                        "character stretch: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                   gt_encseq_lengthoflongestnonspecial(encseq));

      gt_file_xprintf(arguments->outfp, "accesstype: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                   gt_encseq_access_type_str(gt_encseq_accesstype_get(encseq)));

      gt_file_xprintf(arguments->outfp, "bits used per character: ");
      gt_file_xprintf(arguments->outfp, "%f\n",
        (double) ((uint64_t) CHAR_BIT *
                  (uint64_t) gt_encseq_sizeofrep(encseq)) /
        (double) gt_encseq_total_length(encseq));

      gt_file_xprintf(arguments->outfp, "has special ranges: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                                        gt_encseq_has_specialranges(encseq)
                                          ? "yes"
                                          : "no");

      gt_file_xprintf(arguments->outfp, "has description support: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                                       gt_encseq_has_description_support(encseq)
                                          ? "yes"
                                          : "no");

      if (gt_encseq_has_description_support(encseq)) {
        gt_file_xprintf(arguments->outfp, "length of longest description: ");
        gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                          gt_encseq_max_desc_length(encseq));
      }

      gt_file_xprintf(arguments->outfp, "has multiple sequence support: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                                        gt_encseq_has_multiseq_support(encseq)
                                          ? "yes"
                                          : "no");
    }
    gt_encseq_delete(encseq);
    gt_encseq_loader_delete(encseq_loader);
  }

  return had_err;
}
Exemplo n.º 29
0
/*
  Map the file <filename> for reading and bind every entry of the map
  specification table filled in by <setup> to its position inside the mapping.

  The first entry is bound at offset 0. Each following entry is bound at the
  running byte offset, which is advanced by sizeofunit * numofunits of the
  previous entry and then rounded up to a multiple of GT_WORDSIZE_INBYTES.
  Two consistency checks are performed: the number of mapped bytes must equal
  the size computed by detexpectedaccordingtomapspec(), and <expectedsize>
  plus the total amount of padding must equal the final byte offset.

  <*mapped> always receives the result of gt_fa_mmap_read() (NULL if mapping
  failed). The mapping is not released here, not even when a later check
  fails. Returns 0 on success and -1 on failure, in which case <err> is set by
  this function or by one of the called helpers.
*/
int  gt_mapspec_read(GtMapspecSetupFunc setup, void *data,
                     const GtStr *filename, unsigned long expectedsize,
                     void **mapped, GtError *err)
{
  GtMapspec *spec = gt_malloc(sizeof (GtMapspec));
  GtMapspecification *entry;
  void *region;
  size_t bytesmapped;
  uint64_t bytesspecified;
  unsigned long offset = 0,
                paddingsum = 0;
  int had_err = 0;

  gt_error_check(err);
  GT_INITARRAY(&spec->mapspectable, GtMapspecification);
  setup(spec, data, false);

  region = gt_fa_mmap_read(gt_str_get(filename), &bytesmapped, err);
  /* hand the mapping (or NULL) to the caller before any check can bail out */
  *mapped = region;
  if (region == NULL)
  {
    had_err = -1;
    goto cleanup;
  }

  /* the first entry always starts at the beginning of the mapping */
  if (assigncorrecttype(spec->mapspectable.spaceGtMapspecification,
                        region,0,err) != 0)
  {
    had_err = -1;
    goto cleanup;
  }

  bytesspecified = detexpectedaccordingtomapspec(&spec->mapspectable);
  if (bytesspecified != (uint64_t) bytesmapped)
  {
    gt_error_set(err,"%lu bytes read from %s, but " Formatuint64_t
                       " expected",
                       (unsigned long) bytesmapped,
                       gt_str_get(filename),
                       PRINTuint64_tcast(bytesspecified));
    had_err = -1;
    goto cleanup;
  }

  entry = spec->mapspectable.spaceGtMapspecification;
  gt_assert(entry != NULL);
  offset = CALLCASTFUNC(uint64_t,unsigned_long,
                        (uint64_t) (entry->sizeofunit *
                                    entry->numofunits));
  if (offset % (unsigned long) GT_WORDSIZE_INBYTES > 0)
  {
    /* round up to the next word boundary and remember the padding */
    const size_t pad
      = GT_WORDSIZE_INBYTES - (offset % GT_WORDSIZE_INBYTES);
    offset += (unsigned long) pad;
    paddingsum += (unsigned long) pad;
  }

  /* bind all remaining entries, each at the current (padded) offset */
  entry++;
  while (entry < spec->mapspectable.spaceGtMapspecification +
                 spec->mapspectable.nextfreeGtMapspecification)
  {
    if (assigncorrecttype(entry,region,offset,err) != 0)
    {
      had_err = -1;
      goto cleanup;
    }
    offset = CALLCASTFUNC(uint64_t,unsigned_long,
                          (uint64_t) (offset +
                                      entry->sizeofunit *
                                      entry->numofunits));
    if (offset % (unsigned long) GT_WORDSIZE_INBYTES > 0)
    {
      const size_t pad
        = GT_WORDSIZE_INBYTES - (offset % GT_WORDSIZE_INBYTES);
      offset += (unsigned long) pad;
      paddingsum += (unsigned long) pad;
    }
    entry++;
  }

  if (expectedsize + paddingsum != offset)
  {
    gt_error_set(err,"mapping: expected file size is %lu bytes, "
                     "but file has %lu bytes",
                     expectedsize,offset);
    had_err = -1;
  }

cleanup:
  /* the specification table is only needed while binding; always drop it */
  GT_FREEARRAY(&spec->mapspectable,GtMapspecification);
  gt_free(spec);
  return had_err;
}
Exemplo n.º 30
0
/*
  Annotate feature nodes with the number of the cluster their sequence was
  assigned to.

  First, all feature nodes reachable from <nodes> are indexed in a hashmap.
  The key has the form "<seqid>_<id>_<start>_<end>", where <seqid> and <id>
  are taken from the most recently visited repeat_region feature (<id> is
  parsed from its "ID" attribute, "repeat_region<id>") and <start>/<end> are
  the range of the indexed node. Indexed are protein_match features whose
  "name" attribute equals <feature>, and features whose type equals <feature>
  ("lLTR" and "rLTR" both map to the long_terminal_repeat type). Features
  spanning fewer than 10 positions are ignored.

  Second, for every member of every cluster in <cs>, the description of the
  member's sequence in <encseq> is looked up in the index and the node found
  gets the attribute "clid" set to the cluster number.

  Returns 0 on success. Returns -1 and sets <err> if a cluster member's
  description has no node in the index.
*/
static int cluster_annotate_nodes(GtClusteredSet *cs, GtEncseq *encseq,
                                  const char *feature, GtArray *nodes,
                                  GtError *err)
{
  GtFeatureNodeIterator *fni;
  GtFeatureNode *curnode = NULL, *tmp;
  GtClusteredSetIterator *csi = NULL;
  GtGenomeNode *gn;
  GtHashmap *desc2node;
  GtStr *seqid = NULL;
  int had_err = 0;
  unsigned long num_of_clusters, i, elm;
  const char *fnt = NULL;
  char buffer[BUFSIZ], *real_feature;
  gt_error_check(err);

  /* the key prefix is only written when a repeat_region is visited; start
     with an empty string so that it is never read uninitialized */
  buffer[0] = '\0';

  if ((strcmp(feature, "lLTR") == 0) || (strcmp(feature, "rLTR") == 0))
    real_feature = gt_cstr_dup(gt_ft_long_terminal_repeat);
  else
    real_feature = gt_cstr_dup(feature);

  desc2node = gt_hashmap_new(GT_HASH_STRING, free_hash, NULL);
  for (i = 0; i < gt_array_size(nodes); i++) {
    gn = *(GtGenomeNode**) gt_array_get(nodes, i);
    if (gt_feature_node_try_cast(gn) == NULL)
      continue;
    fni = gt_feature_node_iterator_new((GtFeatureNode*) gn);
    while ((curnode = gt_feature_node_iterator_next(fni)) != NULL) {
      char header[BUFSIZ];
      fnt = gt_feature_node_get_type(curnode);
      if (strcmp(fnt, gt_ft_repeat_region) == 0) {
        const char *rid;
        /* stays 0 if the ID attribute is missing or has another format */
        unsigned long id = 0;
        seqid = gt_genome_node_get_seqid((GtGenomeNode*) curnode);
        rid = gt_feature_node_get_attribute(curnode, "ID");
        if (rid != NULL)
          (void) sscanf(rid, "repeat_region%lu", &id);
        (void) snprintf(buffer, BUFSIZ, "%s_%lu", gt_str_get(seqid), id);
      } else if (strcmp(fnt, gt_ft_protein_match) == 0) {
        GtRange range;
        const char *attr;
        attr = gt_feature_node_get_attribute(curnode, "name");
        if (!attr)
          continue;
        if (strcmp(feature, attr) != 0)
          continue;
        range = gt_genome_node_get_range((GtGenomeNode*) curnode);
        if ((range.end - range.start + 1) < 10UL)
          continue;
        (void) snprintf(header, BUFSIZ, "%s_%lu_%lu", buffer, range.start,
                        range.end);
        gt_hashmap_add(desc2node, (void*) gt_cstr_dup(header), (void*) curnode);
      } else if (strcmp(fnt, real_feature) == 0) {
        GtRange range;
        range = gt_genome_node_get_range((GtGenomeNode*) curnode);
        if ((range.end - range.start + 1) < 10UL)
          continue;
        (void) snprintf(header, BUFSIZ, "%s_%lu_%lu", buffer, range.start,
                        range.end);
        gt_hashmap_add(desc2node, (void*) gt_cstr_dup(header), (void*) curnode);
      }
    }
    gt_feature_node_iterator_delete(fni);
  }
  gt_free(real_feature);

  num_of_clusters = gt_clustered_set_num_of_clusters(cs, err);
  for (i = 0; !had_err && i < num_of_clusters; i++) {
    /* description of a cluster member without a node in the index, if any */
    char *missing = NULL;
    csi = gt_clustered_set_get_iterator(cs, i ,err);
    if (csi != NULL) {
      while (missing == NULL
               && (gt_clustered_set_iterator_next(csi, &elm, err)
                     != GT_CLUSTERED_SET_ITERATOR_STATUS_END)) {
        char clid[BUFSIZ];
        const char *encseqdesc;
        char *encseqid;
        unsigned long desclen;
        encseqdesc = gt_encseq_description(encseq, &desclen, elm);
        /* copy the description into a NUL-terminated string of its own, as
           the hashmap is keyed by C strings */
        encseqid = gt_calloc((size_t) (desclen + 1), sizeof (char));
        (void) strncpy(encseqid, encseqdesc, (size_t) desclen);
        encseqid[desclen] = '\0';
        tmp = (GtFeatureNode*) gt_hashmap_get(desc2node, (void*) encseqid);
        if (tmp == NULL) {
          /* nothing to annotate; keep the description for the error message */
          missing = encseqid;
        } else {
          (void) snprintf(clid, BUFSIZ, "%lu", i);
          gt_feature_node_set_attribute(tmp, "clid", clid);
          gt_free(encseqid);
        }
      }
    }
    gt_clustered_set_iterator_delete(csi, err);
    csi = NULL;
    /* report the failure only after the iterator has been released, so that
       the cleanup above never runs with an already set error */
    if (missing != NULL) {
      gt_error_set(err, "no feature node found for sequence description "
                        "\"%s\"", missing);
      gt_free(missing);
      had_err = -1;
    }
  }
  gt_hashmap_delete(desc2node);
  return had_err;
}