Example #1
0
/* Returns the number of files of the encseq held by the GtEncseqCol
   obtained from <sc>. */
static GtUword gt_encseq_col_num_of_files(const GtSeqCol *sc)
{
  const GtEncseqCol *encseq_col = gt_encseq_col_cast(sc);

  gt_assert(encseq_col);
  return gt_encseq_num_of_files(encseq_col->encseq);
}
Example #2
0
/* Allocates a GtBUstate_shulen for <encseq> and fills in its fields.
   If <gd_info> is not NULL, its unit_info and shulensums are stored in the
   new state; otherwise both are created here via gt_shu_unit_info_new() and
   shulengthdist_new(). Returns the new state. */
GtBUstate_shulen *gt_sfx_multiesashulengthdist_new(const GtEncseq *encseq,
                                            GenomediffInfo *gd_info)
{
  GtBUstate_shulen *state = gt_malloc(sizeof *state);

  state->encseq = encseq;
  state->previousbucketlastsuffix = ULONG_MAX;
  state->idxoffset = 0;
  state->firstedgefromroot = false;
#ifdef SHUDEBUG
  state->nextid = 0;
#endif
  /* take the unit info from the caller if there is one */
  if (gd_info != NULL)
    state->unit_info = gd_info->unit_info;
  else
    state->unit_info = gt_shu_unit_info_new(encseq);

  state->numofdbfiles = gt_encseq_num_of_files(encseq);
#ifdef GENOMEDIFF_PAPER_IMPL
  /* one leafdist entry per file */
  state->leafdist
    = gt_malloc(state->numofdbfiles * sizeof *state->leafdist);
#endif
  state->file_to_genome_map = state->unit_info->map_files;
  /* same rule for the result table: caller-provided or freshly created */
  if (gd_info != NULL)
    state->shulengthdist = gd_info->shulensums;
  else
    state->shulengthdist = shulengthdist_new(state->numofdbfiles);

  state->stack = (void *) gt_GtArrayGtBUItvinfo_new_shulen();
  return state;
}
Example #3
0
/* Runs gt_esa_bottomup_shulen() over <ssar> with a temporary state built for
   <encseq> and, if that succeeds, prints the resulting table with
   shulengthdist_print(). The temporary state is freed before returning.
   Returns 0 on success and -1 if gt_esa_bottomup_shulen() reports an error.
   NOTE(review): only numofdbfiles, encseq, shulengthdist (and the
   conditionally compiled leafdist/nextid) are set on the state here; confirm
   gt_esa_bottomup_shulen() does not read any other field. */
int gt_multiesa2shulengthdist_print(Sequentialsuffixarrayreader *ssar,
                                    const GtEncseq *encseq,
                                    GtError *err)
{
  GtBUstate_shulen *bustate = gt_malloc(sizeof *bustate);
  int retval;

  bustate->numofdbfiles = gt_encseq_num_of_files(encseq);
  bustate->encseq = encseq;
#ifdef GENOMEDIFF_PAPER_IMPL
  bustate->leafdist
    = gt_malloc(bustate->numofdbfiles * sizeof *bustate->leafdist);
#endif
#ifdef SHUDEBUG
  bustate->nextid = 0;
#endif
  bustate->shulengthdist = shulengthdist_new(bustate->numofdbfiles);

  retval = (gt_esa_bottomup_shulen(ssar, bustate, err) != 0) ? -1 : 0;
  if (retval == 0)
  {
    shulengthdist_print(NULL,
                        (const uint64_t * const*) bustate->shulengthdist,
                        bustate->numofdbfiles);
  }

  /* release everything allocated above, regardless of the outcome */
  gt_array2dim_delete(bustate->shulengthdist);
#ifdef GENOMEDIFF_PAPER_IMPL
  gt_free(bustate->leafdist);
#endif
  gt_free(bustate);
  return retval;
}
Example #4
0
/* Lua binding: pushes the number of files of the encseq obtained via
   check_encseq(L, 1) and returns 1, i.e. one value was pushed. */
static int encseq_lua_num_of_files(lua_State *L)
{
  GtEncseq **encseq_arg = check_encseq(L, 1);

  lua_pushnumber(L, gt_encseq_num_of_files(*encseq_arg));
  return 1;
}
Example #5
0
/* Returns the length of sequence <seqnum> within file <filenum> of the
   collection <sc>. <filenum> must be smaller than the number of files
   (asserted). */
static GtUword gt_encseq_col_get_sequence_length(const GtSeqCol *sc,
                                                       GtUword filenum,
                                                       GtUword seqnum)
{
  GtEncseqCol *encseq_col = gt_encseq_col_cast(sc);
  GtUword global_seqnum;

  gt_assert(encseq_col &&
            filenum < gt_encseq_num_of_files(encseq_col->encseq));
  /* translate the per-file sequence number into an encseq-wide one */
  global_seqnum = gt_encseq_filenum_first_seqnum(encseq_col->encseq, filenum)
                  + seqnum;
  return gt_encseq_seqlength(encseq_col->encseq, global_seqnum);
}
Example #6
0
/* Returns the entry of the collection's md5_tab for sequence <seqnum> within
   file <filenum> of <sc>. <filenum> must be smaller than the number of files
   (asserted). */
static const char* gt_encseq_col_get_md5_fingerprint(const GtSeqCol *sc,
                                                     GtUword filenum,
                                                     GtUword seqnum)
{
  GtEncseqCol *encseq_col = gt_encseq_col_cast(sc);
  GtUword global_seqnum;

  gt_assert(encseq_col &&
            filenum < gt_encseq_num_of_files(encseq_col->encseq));
  /* the table is indexed by the encseq-wide sequence number */
  global_seqnum = gt_encseq_filenum_first_seqnum(encseq_col->encseq, filenum)
                  + seqnum;
  return gt_md5_tab_get(encseq_col->md5_tab, global_seqnum);
}
Example #7
0
/* Returns the number of sequences in file <filenum> of the collection <sc>.
   <filenum> must be smaller than the number of files (asserted). */
static GtUword gt_encseq_col_num_of_seqs(const GtSeqCol *sc,
                                               GtUword filenum)
{
  GtEncseqCol *encseq_col = gt_encseq_col_cast(sc);
  GtUword this_first, next_first;

  /* XXX cache function evaluated values */
  gt_assert(encseq_col &&
            filenum < gt_encseq_num_of_files(encseq_col->encseq));

  /* only one file: it owns every sequence */
  if (gt_encseq_num_of_files(encseq_col->encseq) == 1 && filenum == 0)
    return gt_encseq_num_of_sequences(encseq_col->encseq);

  /* last file: everything from its first sequence up to the total */
  if (filenum == gt_encseq_num_of_files(encseq_col->encseq) - 1) {
    return (gt_encseq_num_of_sequences(encseq_col->encseq)
              - gt_encseq_filenum_first_seqnum(encseq_col->encseq, filenum));
  }

  /* any other file: distance to the first sequence of the following file */
  gt_assert(filenum < gt_encseq_num_of_files(encseq_col->encseq) - 1);
  this_first = gt_encseq_filenum_first_seqnum(encseq_col->encseq, filenum);
  next_first = gt_encseq_filenum_first_seqnum(encseq_col->encseq,
                                              filenum + 1);
  return next_first - this_first;
}
Example #8
0
/* Lua binding: takes an encseq (argument 1) and a file number (argument 2)
   and pushes gt_encseq_filestartpos() for that file. Raises a Lua argument
   error if the file number is not smaller than the number of files.
   Returns 1, i.e. one value was pushed.
   NOTE(review): the result of luaL_checknumber() is stored in a GtUword
   without a sign check -- confirm how negative arguments should behave. */
static int encseq_lua_filestartpos(lua_State *L)
{
  GtEncseq **encseq_arg = check_encseq(L, 1);
  GtUword filenum = luaL_checknumber(L, 2);

  luaL_argcheck(L, filenum < gt_encseq_num_of_files(*encseq_arg), 2,
                "cannot exceed number of files");
  lua_pushnumber(L, gt_encseq_filestartpos(*encseq_arg, filenum));
  return 1;
}
Example #9
0
/* Returns a copy (made with gt_cstr_dup_nt()) of the description of sequence
   <seqnum> within file <filenum> of the collection <sc>. Asserts that
   <filenum> and the resulting encseq-wide sequence number are in range and
   that the description is non-empty. */
static char* gt_encseq_col_get_description(const GtSeqCol *sc,
                                           GtUword filenum,
                                           GtUword seqnum)
{
  GtEncseqCol *encseq_col = gt_encseq_col_cast(sc);
  GtUword global_seqnum, description_length;
  const char *description;

  gt_assert(encseq_col &&
            filenum < gt_encseq_num_of_files(encseq_col->encseq));
  global_seqnum = gt_encseq_filenum_first_seqnum(encseq_col->encseq, filenum)
                  + seqnum;
  gt_assert(global_seqnum < gt_encseq_num_of_sequences(encseq_col->encseq));

  /* the description comes with an explicit length, which is handed on to
     the duplication routine */
  description = gt_encseq_description(encseq_col->encseq,
                                      &description_length, global_seqnum);
  gt_assert(description && description_length > 0);
  return gt_cstr_dup_nt(description, description_length);
}
Example #10
0
/* Returns a newly allocated buffer holding the decoded characters from
   position <start> to <end> (relative to the sequence start) of sequence
   <seqnum> within file <filenum> of the collection <sc>. Asserts that
   <filenum> and the encseq-wide sequence number are in range and that
   <start> <= <end>.
   NOTE(review): the buffer has exactly end - start + 1 bytes, so there is no
   room for a terminator if the extraction fills all of them -- confirm
   callers rely on the known length. */
static char* gt_encseq_col_get_sequence(const GtSeqCol *sc,
                                        GtUword filenum,
                                        GtUword seqnum,
                                        GtUword start,
                                        GtUword end)
{
  GtEncseqCol *encseq_col = gt_encseq_col_cast(sc);
  GtUword global_seqnum, seq_offset;
  char *buffer;

  gt_assert(encseq_col &&
            filenum < gt_encseq_num_of_files(encseq_col->encseq));
  global_seqnum = gt_encseq_filenum_first_seqnum(encseq_col->encseq, filenum)
                  + seqnum;
  gt_assert(global_seqnum < gt_encseq_num_of_sequences(encseq_col->encseq));
  gt_assert(start <= end);

  /* <start> and <end> are shifted by the start position of the sequence */
  seq_offset = gt_encseq_seqstartpos(encseq_col->encseq, global_seqnum);
  buffer = gt_calloc(end - start + 1, sizeof (char));
  gt_encseq_extract_decoded(encseq_col->encseq, buffer, seq_offset + start,
                            seq_offset + end);
  return buffer;
}
Example #11
0
/* Tool runner: prints information about the encoded sequence index named by
   argv[parsed_args] to arguments->outfp.
   If arguments->nomap is set, only the metadata (GtEncseqMetadata) is read
   and a reduced report is printed; otherwise the encseq is loaded through a
   GtEncseqLoader (mirrored if arguments->mirror is set) and the full report
   is printed.
   Returns 0 on success and -1 if the metadata or the encseq could not be
   obtained; in that case nothing is printed. */
static int gt_encseq_info_runner(GT_UNUSED int argc, const char **argv,
                           int parsed_args, void *tool_arguments,
                           GtError *err)
{
  GtEncseqInfoArguments *arguments = tool_arguments;
  int had_err = 0;
  GtAlphabet *alpha;
  const GtUchar *chars;
  gt_error_check(err);
  gt_assert(arguments);

  if (arguments->nomap) {
    /* metadata-only mode */
    GtEncseqMetadata *emd = gt_encseq_metadata_new(argv[parsed_args], err);
    if (!emd)
      had_err = -1;

    if (!had_err) {
      /* the index name line can be suppressed with arguments->noindexname */
      if (!arguments->noindexname) {
        gt_file_xprintf(arguments->outfp, "index name: ");
        gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]);
      }

      gt_file_xprintf(arguments->outfp, "file format version: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                          gt_encseq_metadata_version(emd));

      gt_file_xprintf(arguments->outfp, "64-bit file: ");
      gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_metadata_is64bit(emd)
                                                  ? "yes"
                                                  : "no");

      gt_file_xprintf(arguments->outfp, "total length: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_metadata_total_length(emd));

      gt_file_xprintf(arguments->outfp, "number of sequences: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                      gt_encseq_metadata_num_of_sequences(emd));

      gt_file_xprintf(arguments->outfp, "number of files: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_metadata_num_of_files(emd));

      gt_file_xprintf(arguments->outfp, "length of shortest/longest "
                                        "sequence: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n",
                                        gt_encseq_metadata_min_seq_length(emd),
                                        gt_encseq_metadata_max_seq_length(emd));

      gt_file_xprintf(arguments->outfp, "accesstype: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                 gt_encseq_access_type_str(gt_encseq_metadata_accesstype(emd)));

      /* alphabet summary: size, characters, and a DNA/Protein tag */
      alpha = gt_encseq_metadata_alphabet(emd);
      chars = gt_alphabet_characters(alpha);
      gt_file_xprintf(arguments->outfp, "alphabet size: ");
      gt_file_xprintf(arguments->outfp, "%u\n",
                                        gt_alphabet_num_of_chars(alpha));
      gt_file_xprintf(arguments->outfp, "alphabet characters: ");
      /* the precision limits the output to the number of alphabet
         characters */
      gt_file_xprintf(arguments->outfp, "%.*s", gt_alphabet_num_of_chars(alpha),
                                        (char*) chars);
      if (gt_alphabet_is_dna(alpha))
        gt_file_xprintf(arguments->outfp, " (DNA)");
      if (gt_alphabet_is_protein(alpha))
        gt_file_xprintf(arguments->outfp, " (Protein)");
      gt_file_xprintf(arguments->outfp, "\n");
      /* optional: full alphabet definition as produced by
         gt_alphabet_to_str() */
      if (arguments->show_alphabet) {
        GtStr *out = gt_str_new();
        gt_alphabet_to_str(alpha, out);
        gt_file_xprintf(arguments->outfp, "alphabet definition:\n");
        gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out));
        gt_str_delete(out);
      }

    }
    /* NOTE(review): also reached with emd == NULL after a failed load;
       presumably gt_encseq_metadata_delete() accepts NULL -- confirm */
    gt_encseq_metadata_delete(emd);
  } else {
    /* full mode: load the encseq itself */
    GtEncseqLoader *encseq_loader;
    GtEncseq *encseq;

    encseq_loader = gt_encseq_loader_new();
    if (arguments->mirror)
      gt_encseq_loader_mirror(encseq_loader);
    if (!(encseq = gt_encseq_loader_load(encseq_loader,
                                         argv[parsed_args], err)))
      had_err = -1;

    if (!had_err) {
      const GtStrArray *filenames;
      GtUword i;

      /* the index name line can be suppressed with arguments->noindexname */
      if (!arguments->noindexname) {
        gt_file_xprintf(arguments->outfp, "index name: ");
        gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]);
      }

      gt_file_xprintf(arguments->outfp, "file format version: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_version(encseq));

      gt_file_xprintf(arguments->outfp, "64-bit file: ");
      gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_is_64_bit(encseq)
                                                   ? "yes"
                                                   : "no");

      gt_file_xprintf(arguments->outfp, "total length: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_total_length(encseq));

      gt_file_xprintf(arguments->outfp, "compressed size: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU" bytes\n",
                                        gt_encseq_sizeofrep(encseq));

      gt_file_xprintf(arguments->outfp, "number of sequences: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_num_of_sequences(encseq));

      gt_file_xprintf(arguments->outfp, "number of files: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_num_of_files(encseq));

      gt_file_xprintf(arguments->outfp, "length of shortest/longest "
                                        "sequence: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n",
                                      gt_encseq_min_seq_length(encseq),
                                      gt_encseq_max_seq_length(encseq));

      /* one line per original input file with its effective length */
      filenames = gt_encseq_filenames(encseq);
      gt_file_xprintf(arguments->outfp, "original filenames:\n");
      for (i = 0; i < gt_str_array_size(filenames); i++) {
        gt_file_xprintf(arguments->outfp, "\t%s ("GT_WU" characters)\n",
                                          gt_str_array_get(filenames, i),
                                          (GtUword)
                                     gt_encseq_effective_filelength(encseq, i));
      }

      /* alphabet summary: size, characters, and a DNA/Protein tag */
      alpha = gt_encseq_alphabet(encseq);
      chars = gt_alphabet_characters(alpha);
      gt_file_xprintf(arguments->outfp, "alphabet size: ");
      gt_file_xprintf(arguments->outfp, "%u\n",
                                        gt_alphabet_num_of_chars(alpha));
      gt_file_xprintf(arguments->outfp, "alphabet characters: ");
      /* the precision limits the output to the number of alphabet
         characters */
      gt_file_xprintf(arguments->outfp, "%.*s", gt_alphabet_num_of_chars(alpha),
                                        (char*) chars);
      if (gt_alphabet_is_dna(alpha))
        gt_file_xprintf(arguments->outfp, " (DNA)");
      if (gt_alphabet_is_protein(alpha))
        gt_file_xprintf(arguments->outfp, " (Protein)");
      gt_file_xprintf(arguments->outfp, "\n");
      /* optional: full alphabet definition as produced by
         gt_alphabet_to_str() */
      if (arguments->show_alphabet) {
        GtStr *out = gt_str_new();
        gt_alphabet_to_str(alpha, out);
        gt_file_xprintf(arguments->outfp, "alphabet definition:\n");
        gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out));
        gt_str_delete(out);
      }

      /* per-character counts; the percentage is taken relative to
         total length - number of sequences + 1 (presumably the length
         without the separators between sequences -- TODO confirm) */
      gt_file_xprintf(arguments->outfp, "character distribution:\n");
      for (i = 0; i < gt_alphabet_num_of_chars(alpha); i++) {
        GtUword cc;
        cc = gt_encseq_charcount(encseq, gt_alphabet_encode(alpha, chars[i]));
        gt_file_xprintf(arguments->outfp, "\t%c: "GT_WU" (%.2f%%)\n",
                                          (char) chars[i],
                                          cc,
                             (cc /(double) (gt_encseq_total_length(encseq)
                                  - gt_encseq_num_of_sequences(encseq)+1))*100);
      }

      gt_file_xprintf(arguments->outfp, "number of wildcards: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n",
                                        gt_encseq_wildcards(encseq),
                                        gt_encseq_realwildcardranges(encseq));

      gt_file_xprintf(arguments->outfp, "number of special characters: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n",
                                        gt_encseq_specialcharacters(encseq),
                                        gt_encseq_realspecialranges(encseq));

      gt_file_xprintf(arguments->outfp, "length of longest non-special "
                                        "character stretch: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                   gt_encseq_lengthoflongestnonspecial(encseq));

      gt_file_xprintf(arguments->outfp, "accesstype: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                   gt_encseq_access_type_str(gt_encseq_accesstype_get(encseq)));

      /* size of the representation in bits divided by the total length */
      gt_file_xprintf(arguments->outfp, "bits used per character: ");
      gt_file_xprintf(arguments->outfp, "%f\n",
        (double) ((uint64_t) CHAR_BIT *
                  (uint64_t) gt_encseq_sizeofrep(encseq)) /
        (double) gt_encseq_total_length(encseq));

      gt_file_xprintf(arguments->outfp, "has special ranges: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                                        gt_encseq_has_specialranges(encseq)
                                          ? "yes"
                                          : "no");

      gt_file_xprintf(arguments->outfp, "has description support: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                                       gt_encseq_has_description_support(encseq)
                                          ? "yes"
                                          : "no");

      /* the longest description length is only reported when descriptions
         are supported */
      if (gt_encseq_has_description_support(encseq)) {
        gt_file_xprintf(arguments->outfp, "length of longest description: ");
        gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                          gt_encseq_max_desc_length(encseq));
      }

      gt_file_xprintf(arguments->outfp, "has multiple sequence support: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                                        gt_encseq_has_multiseq_support(encseq)
                                          ? "yes"
                                          : "no");
    }
    /* NOTE(review): encseq is NULL here after a failed load; presumably
       gt_encseq_delete() accepts NULL -- confirm */
    gt_encseq_delete(encseq);
    gt_encseq_loader_delete(encseq_loader);
  }

  return had_err;
}