Ejemplo n.º 1
0
static int encseq_lua_has_multiseq_support(lua_State *L)
{
  GtEncseq **encseq;
  encseq = check_encseq(L, 1);
  lua_pushboolean(L, gt_encseq_has_multiseq_support(*encseq));
  return 1;
}
Ejemplo n.º 2
0
static void gt_querysubstringmatch(bool selfmatch,
                                   const GtEncseq *dbencseq,
                                   const ESASuffixptr *suftabpart,
                                   GtReadmode readmode,
                                   GtUword numberofsuffixes,
                                   uint64_t queryunitnum,
                                   GtQueryrepresentation *queryrep,
                                   GtUword minmatchlength,
                                   GtProcessquerymatch processquerymatch,
                                   void *processquerymatchinfo,
                                   GtQuerymatch *querymatchspaceptr)
{
  GtMMsearchiterator *mmsi;
  GtUword totallength, localqueryoffset = 0;
  uint64_t localqueryunitnum = queryunitnum;
  GtQuerysubstring querysubstring;

  gt_assert(numberofsuffixes > 0);
  totallength = gt_encseq_total_length(dbencseq);
  querysubstring.queryrep = queryrep;
  for (querysubstring.currentoffset = 0;
       querysubstring.currentoffset <= queryrep->seqlen - minmatchlength;
       querysubstring.currentoffset++)
  {
    GtUword dbstart;

    mmsi = gt_mmsearchiterator_new(dbencseq,
                                   suftabpart,
                                   0, /* leftbound */
                                   numberofsuffixes - 1, /* rightbound */
                                   0, /* offset */
                                   readmode,
                                   &querysubstring,
                                   minmatchlength);
    while (gt_mmsearchiterator_next(&dbstart,mmsi))
    {
      if (gt_mmsearch_isleftmaximal(dbencseq,
                                    readmode,
                                    dbstart,
                                    &querysubstring))
      {
        GtUword dbseqnum, dbseqstartpos, dbseqlen, extend;

        extend = gt_mmsearch_extendright(dbencseq,
                                         mmsi->esr,
                                         readmode,
                                         totallength,
                                         dbstart + minmatchlength,
                                         &querysubstring,
                                         minmatchlength);

        if (gt_encseq_has_multiseq_support(dbencseq))
        {
          dbseqnum = gt_encseq_seqnum(dbencseq,dbstart);
          dbseqstartpos = gt_encseq_seqstartpos(dbencseq,dbseqnum);
          dbseqlen = gt_encseq_seqlength(dbencseq,dbseqnum);
        } else
        {
          dbseqnum = dbseqstartpos = dbseqlen = 0;
        }
        gt_querymatch_init(querymatchspaceptr,
                           minmatchlength + extend,
                           dbstart,
                           dbseqnum,
                           dbstart - dbseqstartpos,
                           dbseqlen,
                           0, /* score */
                           0, /* edist */
                           selfmatch,
                           localqueryunitnum,
                           minmatchlength + extend,
                           localqueryoffset,
                           queryrep->seqlen);
        processquerymatch(processquerymatchinfo,querymatchspaceptr);
      }
    }
    gt_mmsearchiterator_delete(mmsi);
    mmsi = NULL;
    if (gt_mmsearch_accessquery(queryrep,querysubstring.currentoffset)
        == (GtUchar) SEPARATOR)
    {
      localqueryunitnum++;
      localqueryoffset = 0;
    } else
    {
      localqueryoffset++;
    }
  }
}
Ejemplo n.º 3
0
static int gt_encseq_check_runner(GT_UNUSED int argc, const char **argv,
                                  int parsed_args, void *tool_arguments,
                                  GtError *err)
{
  GtEncseqCheckArguments *arguments = tool_arguments;
  int had_err = 0;
  GtEncseqLoader *encseq_loader;
  GtEncseq *encseq;
  gt_error_check(err);
  gt_assert(arguments);

  encseq_loader = gt_encseq_loader_new();
  if (!(encseq = gt_encseq_loader_load(encseq_loader, argv[parsed_args], err)))
    had_err = -1;
  if (!had_err) {
    int readmode;
    gt_encseq_check_startpositions(encseq);
    for (readmode = 0; readmode < 4; readmode++)
    {
      if (gt_alphabet_is_dna(gt_encseq_alphabet(encseq)) ||
           ((GtReadmode) readmode) == GT_READMODE_FORWARD ||
           ((GtReadmode) readmode) == GT_READMODE_REVERSE)
      {
        if (gt_encseq_check_consistency(encseq,
                           gt_encseq_filenames(encseq),
                           (GtReadmode) readmode,
                           arguments->scantrials,
                           arguments->multicharcmptrials,
                           gt_encseq_has_multiseq_support(encseq),
                           err) != 0)
        {
          had_err = -1;
          break;
        }
      }
    }
    if (!had_err)
    {
      gt_encseq_check_specialranges(encseq);
    }
    if (!had_err)
    {
      gt_encseq_check_markpos(encseq);
    }
    if (!had_err)
    {
      had_err = gt_encseq_check_minmax(encseq, err);
    }
    if (!had_err &&
           arguments->prefixlength > 0)
    {
      if (gt_verifymappedstr(encseq,
                             arguments->prefixlength,
                             err) != 0)
      {
        had_err = -1;
      }
    }
  }
  gt_encseq_delete(encseq);
  gt_encseq_loader_delete(encseq_loader);
  return had_err;
}
Ejemplo n.º 4
0
static int gt_encseq_info_runner(GT_UNUSED int argc, const char **argv,
                           int parsed_args, void *tool_arguments,
                           GtError *err)
{
  GtEncseqInfoArguments *arguments = tool_arguments;
  int had_err = 0;
  GtAlphabet *alpha;
  const GtUchar *chars;
  gt_error_check(err);
  gt_assert(arguments);

  if (arguments->nomap) {
    GtEncseqMetadata *emd = gt_encseq_metadata_new(argv[parsed_args], err);
    if (!emd)
      had_err = -1;

    if (!had_err) {
      if (!arguments->noindexname) {
        gt_file_xprintf(arguments->outfp, "index name: ");
        gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]);
      }

      gt_file_xprintf(arguments->outfp, "file format version: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                          gt_encseq_metadata_version(emd));

      gt_file_xprintf(arguments->outfp, "64-bit file: ");
      gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_metadata_is64bit(emd)
                                                  ? "yes"
                                                  : "no");

      gt_file_xprintf(arguments->outfp, "total length: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_metadata_total_length(emd));

      gt_file_xprintf(arguments->outfp, "number of sequences: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                      gt_encseq_metadata_num_of_sequences(emd));

      gt_file_xprintf(arguments->outfp, "number of files: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_metadata_num_of_files(emd));

      gt_file_xprintf(arguments->outfp, "length of shortest/longest "
                                        "sequence: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n",
                                        gt_encseq_metadata_min_seq_length(emd),
                                        gt_encseq_metadata_max_seq_length(emd));

      gt_file_xprintf(arguments->outfp, "accesstype: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                 gt_encseq_access_type_str(gt_encseq_metadata_accesstype(emd)));

      alpha = gt_encseq_metadata_alphabet(emd);
      chars = gt_alphabet_characters(alpha);
      gt_file_xprintf(arguments->outfp, "alphabet size: ");
      gt_file_xprintf(arguments->outfp, "%u\n",
                                        gt_alphabet_num_of_chars(alpha));
      gt_file_xprintf(arguments->outfp, "alphabet characters: ");
      gt_file_xprintf(arguments->outfp, "%.*s", gt_alphabet_num_of_chars(alpha),
                                        (char*) chars);
      if (gt_alphabet_is_dna(alpha))
        gt_file_xprintf(arguments->outfp, " (DNA)");
      if (gt_alphabet_is_protein(alpha))
        gt_file_xprintf(arguments->outfp, " (Protein)");
      gt_file_xprintf(arguments->outfp, "\n");
      if (arguments->show_alphabet) {
        GtStr *out = gt_str_new();
        gt_alphabet_to_str(alpha, out);
        gt_file_xprintf(arguments->outfp, "alphabet definition:\n");
        gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out));
        gt_str_delete(out);
      }

    }
    gt_encseq_metadata_delete(emd);
  } else {
    GtEncseqLoader *encseq_loader;
    GtEncseq *encseq;

    encseq_loader = gt_encseq_loader_new();
    if (arguments->mirror)
      gt_encseq_loader_mirror(encseq_loader);
    if (!(encseq = gt_encseq_loader_load(encseq_loader,
                                         argv[parsed_args], err)))
      had_err = -1;

    if (!had_err) {
      const GtStrArray *filenames;
      GtUword i;

      if (!arguments->noindexname) {
        gt_file_xprintf(arguments->outfp, "index name: ");
        gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]);
      }

      gt_file_xprintf(arguments->outfp, "file format version: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_version(encseq));

      gt_file_xprintf(arguments->outfp, "64-bit file: ");
      gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_is_64_bit(encseq)
                                                   ? "yes"
                                                   : "no");

      gt_file_xprintf(arguments->outfp, "total length: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_total_length(encseq));

      gt_file_xprintf(arguments->outfp, "compressed size: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU" bytes\n",
                                        gt_encseq_sizeofrep(encseq));

      gt_file_xprintf(arguments->outfp, "number of sequences: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_num_of_sequences(encseq));

      gt_file_xprintf(arguments->outfp, "number of files: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_num_of_files(encseq));

      gt_file_xprintf(arguments->outfp, "length of shortest/longest "
                                        "sequence: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n",
                                      gt_encseq_min_seq_length(encseq),
                                      gt_encseq_max_seq_length(encseq));

      filenames = gt_encseq_filenames(encseq);
      gt_file_xprintf(arguments->outfp, "original filenames:\n");
      for (i = 0; i < gt_str_array_size(filenames); i++) {
        gt_file_xprintf(arguments->outfp, "\t%s ("GT_WU" characters)\n",
                                          gt_str_array_get(filenames, i),
                                          (GtUword)
                                     gt_encseq_effective_filelength(encseq, i));
      }

      alpha = gt_encseq_alphabet(encseq);
      chars = gt_alphabet_characters(alpha);
      gt_file_xprintf(arguments->outfp, "alphabet size: ");
      gt_file_xprintf(arguments->outfp, "%u\n",
                                        gt_alphabet_num_of_chars(alpha));
      gt_file_xprintf(arguments->outfp, "alphabet characters: ");
      gt_file_xprintf(arguments->outfp, "%.*s", gt_alphabet_num_of_chars(alpha),
                                        (char*) chars);
      if (gt_alphabet_is_dna(alpha))
        gt_file_xprintf(arguments->outfp, " (DNA)");
      if (gt_alphabet_is_protein(alpha))
        gt_file_xprintf(arguments->outfp, " (Protein)");
      gt_file_xprintf(arguments->outfp, "\n");
      if (arguments->show_alphabet) {
        GtStr *out = gt_str_new();
        gt_alphabet_to_str(alpha, out);
        gt_file_xprintf(arguments->outfp, "alphabet definition:\n");
        gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out));
        gt_str_delete(out);
      }

      gt_file_xprintf(arguments->outfp, "character distribution:\n");
      for (i = 0; i < gt_alphabet_num_of_chars(alpha); i++) {
        GtUword cc;
        cc = gt_encseq_charcount(encseq, gt_alphabet_encode(alpha, chars[i]));
        gt_file_xprintf(arguments->outfp, "\t%c: "GT_WU" (%.2f%%)\n",
                                          (char) chars[i],
                                          cc,
                             (cc /(double) (gt_encseq_total_length(encseq)
                                  - gt_encseq_num_of_sequences(encseq)+1))*100);
      }

      gt_file_xprintf(arguments->outfp, "number of wildcards: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n",
                                        gt_encseq_wildcards(encseq),
                                        gt_encseq_realwildcardranges(encseq));

      gt_file_xprintf(arguments->outfp, "number of special characters: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n",
                                        gt_encseq_specialcharacters(encseq),
                                        gt_encseq_realspecialranges(encseq));

      gt_file_xprintf(arguments->outfp, "length of longest non-special "
                                        "character stretch: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                   gt_encseq_lengthoflongestnonspecial(encseq));

      gt_file_xprintf(arguments->outfp, "accesstype: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                   gt_encseq_access_type_str(gt_encseq_accesstype_get(encseq)));

      gt_file_xprintf(arguments->outfp, "bits used per character: ");
      gt_file_xprintf(arguments->outfp, "%f\n",
        (double) ((uint64_t) CHAR_BIT *
                  (uint64_t) gt_encseq_sizeofrep(encseq)) /
        (double) gt_encseq_total_length(encseq));

      gt_file_xprintf(arguments->outfp, "has special ranges: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                                        gt_encseq_has_specialranges(encseq)
                                          ? "yes"
                                          : "no");

      gt_file_xprintf(arguments->outfp, "has description support: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                                       gt_encseq_has_description_support(encseq)
                                          ? "yes"
                                          : "no");

      if (gt_encseq_has_description_support(encseq)) {
        gt_file_xprintf(arguments->outfp, "length of longest description: ");
        gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                          gt_encseq_max_desc_length(encseq));
      }

      gt_file_xprintf(arguments->outfp, "has multiple sequence support: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                                        gt_encseq_has_multiseq_support(encseq)
                                          ? "yes"
                                          : "no");
    }
    gt_encseq_delete(encseq);
    gt_encseq_loader_delete(encseq_loader);
  }

  return had_err;
}