/*
 * Runs <hmmsearch_path> on the unique FASTA file obtained from <ces>.
 *
 * The child is started with the arguments
 *   --noali --notextw --domtblout <table_filename> <hmm_filename> <fasta>
 * and an empty environment. Everything the child writes to the pipe is read
 * until end of input; the lines are forwarded to gt_log_log() only when
 * logging is enabled.
 *
 * Returns 0 on success and -1 if the child could not be started (in which
 * case gt_safe_popen() received <err>). The value returned by
 * gt_safe_pclose() is deliberately ignored, as before.
 */
static int hmmsearch_call_coarse_search(GtCondenseq* ces,
                                        char *hmmsearch_path,
                                        char *table_filename,
                                        char *hmm_filename,
                                        GtLogger *logger,
                                        GtError *err) {
  int had_err = 0;
  char **hmmargs = NULL,
       *hmmenv[] = { NULL };
  GtStr *coarse_fas = gt_condenseq_unique_fasta_file(ces);
  GtSafePipe *pipe = NULL;
  gt_assert(coarse_fas != NULL);

  /* Array has to end with NULL: seven arguments are filled in below, the
     eighth slot stays zeroed by gt_calloc() and acts as the terminator */
  hmmargs = gt_calloc((size_t) 8, sizeof (*hmmargs));
  hmmargs[0] = hmmsearch_path;
  hmmargs[1] = gt_cstr_dup("--noali");
  hmmargs[2] = gt_cstr_dup("--notextw");
  hmmargs[3] = gt_cstr_dup("--domtblout");
  hmmargs[4] = table_filename;
  hmmargs[5] = hmm_filename;
  hmmargs[6] = gt_str_get(coarse_fas);

  gt_logger_log(logger, "calling: %s", hmmsearch_path);

  pipe = gt_safe_popen(hmmsearch_path, hmmargs, hmmenv, err);

  if (pipe == NULL)
    had_err = -1;

  /* only the duplicated option strings are owned by this function */
  gt_free(hmmargs[1]);
  gt_free(hmmargs[2]);
  gt_free(hmmargs[3]);
  gt_free(hmmargs);
  gt_str_delete(coarse_fas);

  /* pipe test for splint */
  if (!had_err && pipe != NULL) {
    GtStr *line = gt_str_new();
    /* Always consume the child's output up to end of input, not only when
       logging is enabled: closing or waiting on a pipe that still has a
       writer with pending output can kill the child (SIGPIPE) or stall it on
       a full pipe buffer before the table file is complete.
       NOTE(review): exact behaviour depends on gt_safe_pclose() - draining
       first is safe either way. */
    while (gt_str_read_next_line(line, pipe->read_fd) == 0) {
      if (gt_log_enabled())
        gt_log_log("%s", gt_str_get(line));
      gt_str_reset(line);
    }
    gt_str_delete(line);
    (void) gt_safe_pclose(pipe);
  }
  return had_err;
}
/*
 * Runs <hmmsearch_path> with <hmm_filename> against <fine_fasta_filename>
 * and copies every line the child writes to the pipe to stdout.
 *
 * If <table_filename> is not NULL, "--tblout <table_filename>" is inserted
 * in front of the two file arguments. The child gets an empty environment.
 *
 * Returns 0 on success, -1 if gt_safe_popen() failed.
 */
static int hmmsearch_call_fine_search(GtStr *table_filename,
                                      char *fine_fasta_filename,
                                      char *hmmsearch_path,
                                      char *hmm_filename,
                                      GtLogger *logger,
                                      GtError *err) {
  const int with_table = (table_filename != NULL);
  /* program, hmm file, fasta file and the terminating NULL; two more slots
     for "--tblout" and its value when a table is requested */
  const size_t slots = with_table ? (size_t) 6 : (size_t) 4;
  char *no_env[] = { NULL };
  char **arg_vec = gt_calloc(slots, sizeof (*arg_vec));
  unsigned int next = 0;
  GtSafePipe *child = NULL;
  GtStr *out_line = NULL;

  arg_vec[next++] = hmmsearch_path;
  if (with_table) {
    arg_vec[next++] = gt_cstr_dup("--tblout");
    arg_vec[next++] = gt_str_get(table_filename);
  }
  arg_vec[next++] = hmm_filename;
  arg_vec[next++] = fine_fasta_filename;
  /* the remaining slot was zeroed by gt_calloc() and terminates the list */
  gt_assert(arg_vec[next] == NULL);

  gt_logger_log(logger, "calling: %s", hmmsearch_path);

  child = gt_safe_popen(hmmsearch_path, arg_vec, no_env, err);

  /* slot 1 holds the only string duplicated above */
  if (with_table)
    gt_free(arg_vec[1]);
  gt_free(arg_vec);

  if (child == NULL)
    return -1;

  out_line = gt_str_new();
  while (gt_str_read_next_line(out_line, child->read_fd) == 0) {
    printf("%s\n", gt_str_get(out_line));
    gt_str_reset(out_line);
  }
  gt_str_delete(out_line);
  (void) gt_safe_pclose(child);
  return 0;
}
Example #3
0
/*
 * Runner of the genomediff tool.
 *
 * Steps performed, each one only if no error occurred before:
 *  1. The arguments from <parsed_args> on are appended to
 *     arguments->filenames.
 *  2. With more than one filename the files are encoded (as DNA) into the
 *     index arguments->indexname. Otherwise the single filename is appended
 *     to the index name and, if an ESA or packed index is used, the project
 *     file is scanned for a "mirrored=1" line.
 *  3. The encoded sequence is loaded (mirrored if requested by the project
 *     file) and the unit information is set up, optionally from the unit
 *     file.
 *  4. The shulen sums are obtained either from the existing index
 *     (gt_genomediff_shulen_sum) or by running the suffixerator, and are
 *     passed to gt_genomediff_kr_calc.
 *
 * <tool_arguments> must point to a GtGenomediffArguments object.
 * Returns 0 on success and a non-zero value on error.
 */
static int gt_genomediff_runner(int argc, const char **argv,
                                int parsed_args, void *tool_arguments,
                                GtError *err)
{
    bool mirrored = false;
    int had_err = 0,
        i;
    GtEncseq              *encseq = NULL;
    GtGenomediffArguments *arguments = tool_arguments;
    GtLogger              *logger;
    GtShuUnitFileInfo     *unit_info = NULL;
    GtTimer               *timer = NULL;

    gt_error_check(err);
    gt_assert(arguments);

    logger = gt_logger_new(arguments->verbose,
                           GT_LOGGER_DEFLT_PREFIX,
                           stdout);
    gt_assert(logger);

    for (i = parsed_args; i < argc; i++) {
        gt_str_array_add_cstr(arguments->filenames, argv[i]);
    }

    if (gt_showtime_enabled()) {
        timer = gt_timer_new_with_progress_description("start");
        /* check the timer before it is used, not afterwards */
        gt_assert(timer);
        gt_timer_start(timer);
    }

    if (arguments->with_units) {
        gt_logger_log(logger, "unitfile option set, filename is %s\n",
                      gt_str_get(arguments->unitfile));
    }

    if (timer != NULL)
        gt_timer_show_progress(timer, "start shu search", stdout);

    if (gt_str_array_size(arguments->filenames) > 1UL) {
        GtEncseqEncoder *ee = gt_encseq_encoder_new();
        gt_encseq_encoder_set_timer(ee, timer);
        gt_encseq_encoder_set_logger(ee, logger);
        /* kr only makes sense for dna, so we can check this already with ee */
        gt_encseq_encoder_set_input_dna(ee);
        had_err = gt_encseq_encoder_encode(ee, arguments->filenames,
                                           gt_str_get(arguments->indexname), err);
        gt_encseq_encoder_delete(ee);
    }
    else {
        gt_str_append_str(arguments->indexname,
                          gt_str_array_get_str(arguments->filenames, 0));
        if (arguments->with_esa || arguments->with_pck) {
            GtStr *current_line = gt_str_new();
            FILE *prj_fp;

            prj_fp = gt_fa_fopen_with_suffix(gt_str_get(arguments->indexname),
                                             GT_PROJECTFILESUFFIX,"rb",err);
            if (prj_fp == NULL)
                had_err = -1;
            while (!had_err && gt_str_read_next_line(current_line, prj_fp) != EOF) {
                unsigned long idx;
                char **elements = gt_cstr_split(gt_str_get(current_line), '=');

                gt_log_log("%s", elements[0]);
                /* a "mirrored" line without '=' has no value token; do not
                   hand a NULL pointer to strcmp() in that case */
                if (strcmp("mirrored", elements[0]) == 0 &&
                        elements[1] != NULL) {
                    gt_log_log("%s", elements[1]);
                    if (strcmp("1", elements[1]) == 0) {
                        mirrored = true;
                        gt_log_log("sequences are treated as mirrored");
                    }
                }
                /* free every token, not only the first two: a line with
                   several '=' yields more than two of them.
                   NOTE(review): relies on gt_cstr_split() returning a
                   NULL-terminated array - confirm against its header. */
                for (idx = 0; elements[idx] != NULL; idx++)
                    gt_free(elements[idx]);
                gt_free(elements);
                gt_str_reset(current_line);
            }
            gt_str_delete(current_line);
            /* nothing to close if opening the project file failed */
            if (prj_fp != NULL)
                gt_fa_xfclose(prj_fp);
        }
    }

    if (!had_err) {
        GtEncseqLoader *el = gt_encseq_loader_new_from_options(arguments->loadopts,
                             err);
        if (mirrored)
            gt_encseq_loader_mirror(el);
        encseq =
            gt_encseq_loader_load(el, gt_str_get(arguments->indexname), err);
        gt_encseq_loader_delete(el);
    }
    if (encseq == NULL)
        had_err = -1;
    if (!had_err) {
        unit_info = gt_shu_unit_info_new(encseq);
        if (arguments->with_units)
            had_err = gt_shu_unit_file_info_read(arguments->unitfile, unit_info,
                                                 logger, err);
    }

    if (!had_err) {
        uint64_t **shusums = NULL;
        if (arguments->with_esa || arguments->with_pck) {
            shusums = gt_genomediff_shulen_sum(arguments, unit_info,
                                               logger, timer, err);
            if (shusums == NULL)
                had_err = -1;
        }
        else {
            const bool doesa = true;
            GenomediffInfo gd_info;
            Suffixeratoroptions sopts;
            sopts.beverbose = arguments->verbose;
            sopts.indexname = arguments->indexname;
            sopts.db = NULL;
            sopts.encopts = NULL;
            sopts.genomediff = true;
            sopts.inputindex = arguments->indexname;
            sopts.loadopts = arguments->loadopts;
            sopts.showprogress = false;
            sopts.idxopts = arguments->idxopts;

            gt_assert(unit_info != NULL);
            gt_array2dim_calloc(shusums, unit_info->num_of_genomes,
                                unit_info->num_of_genomes);
            gd_info.shulensums = shusums;
            gd_info.unit_info = unit_info;
            had_err = gt_runsuffixerator(doesa, &sopts, &gd_info, logger, err);
        }
        if (shusums != NULL) {
            if (!had_err) {
                had_err = gt_genomediff_kr_calc(shusums, arguments, unit_info,
                                                arguments->with_pck, logger,
                                                timer, err);
            }
            /* release the table on the error path as well; it used to leak
               when gt_runsuffixerator() failed */
            gt_array2dim_delete(shusums);
        }
    }

    if (timer != NULL) {
        gt_timer_show_progress_final(timer, stdout);
        gt_timer_delete(timer);
    }
    gt_logger_delete(logger);
    gt_encseq_delete(encseq);
    gt_shu_unit_info_delete(unit_info);

    return had_err;
}
/*
 * Reads the hit table <table_filename> written by the coarse search and
 * runs the fine search on the sequences collected for the hits.
 *
 * For every line not starting with '#':
 *  - the line is split at blanks; column 0 is parsed as the number of the
 *    hit target, column 3 is the query name;
 *  - a change of the query name increments the query counter; when the
 *    counter equals arguments->max_queries the sequences collected so far
 *    are written out, searched with hmmsearch_call_fine_search() and the
 *    collection and the counter are reset;
 *  - gt_condenseq_each_redundant_seq() is called for the target with
 *    hmmsearch_process_seq and the collection <sequences>.
 * After the last line the remaining collected sequences are searched.
 *
 * If arguments->outtable_filename is not empty, a running number is appended
 * to it for every fine search.
 *
 * Returns 0 on success and a non-zero value on error. A table line with too
 * few columns is reported via <err> instead of being caught by an assertion
 * only.
 */
static int hmmsearch_process_coarse_hits(
                                       char *table_filename,
                                       GtCondenseq *ces,
                                       GtCondenseqHmmsearchArguments *arguments,
                                       GtLogger *logger,
                                       GtError *err) {
  int had_err = 0;
  GtStr *line = gt_str_new();
  FILE *table = NULL;
  GtSplitter *splitter = gt_splitter_new();
  GtStr *query = gt_str_new(),
        *fine_fasta_filename = gt_str_new_cstr("condenseq");
  GtRBTree *sequences = NULL;
  GtUword filecount = (GtUword) 1;
  unsigned int querycount = 0;
  /* lengths of the unmodified names, used to strip appended suffixes */
  const GtUword fine_fasta_name_length = gt_str_length(fine_fasta_filename);
  const GtUword table_name_length = gt_str_length(arguments->outtable_filename);

  table = gt_xfopen(table_filename, "r");

  sequences = gt_rbtree_new(hmmsearch_cmp_seqnum,
                            hmmsearch_tree_free_node, NULL);

  while (!had_err && gt_str_read_next_line(line, table) == 0) {
    char *c_line = gt_str_get(line);
    GtUword uid = 0;
    const GtUword target_column = 0,
          query_column = (GtUword) 3,
          /* 22 fixed columns plus the description; a description that
             contains blanks yields additional tokens */
          min_columns = (GtUword) 23;

    if (c_line[0] != '#') {
      gt_splitter_split_non_empty(splitter, c_line, gt_str_length(line), ' ');
      /* the table is external input: reject short lines with an error
         instead of reading tokens that do not exist when assertions are
         disabled */
      if (gt_splitter_size(splitter) < min_columns) {
        gt_error_set(err, "malformed line in hmmsearch table \"%s\": expected "
                     "at least " GT_WU " columns, found " GT_WU,
                     table_filename, min_columns, gt_splitter_size(splitter));
        had_err = -1;
      }
      else if (sscanf(gt_splitter_get_token(splitter, target_column),
                      GT_WU, &uid) != 1) {
        gt_error_set(err, "couldn't parse target number: %s",
                     gt_splitter_get_token(splitter, target_column));
        had_err = -1;
      }
      if (!had_err &&
          (gt_str_length(query) == 0 ||
           strcmp(gt_str_get(query),
                  gt_splitter_get_token(splitter, query_column)) != 0)) {
        gt_str_set(query, gt_splitter_get_token(splitter, query_column));
        gt_logger_log(logger, "new query: %s", gt_str_get(query));
        querycount++;
      }
      if (!had_err && querycount == arguments->max_queries) {
        hmmsearch_create_fine_fas(fine_fasta_filename, sequences, ces);
        if (table_name_length != 0)
          gt_str_append_uword(arguments->outtable_filename, filecount++);
        had_err =
          hmmsearch_call_fine_search(table_name_length != 0 ?
                                       arguments->outtable_filename :
                                       NULL,
                                     gt_str_get(fine_fasta_filename),
                                     gt_str_get(arguments->hmmsearch_path),
                                     gt_str_get(arguments->hmm),
                                     logger, err);
        gt_rbtree_clear(sequences);
        /* cut both names back to their original length for the next batch */
        gt_str_set_length(fine_fasta_filename, fine_fasta_name_length);
        if (table_name_length != 0)
          gt_str_set_length(arguments->outtable_filename, table_name_length);
        querycount = 0;
      }
      if (!had_err) {
        if (gt_condenseq_each_redundant_seq(ces, uid,
                                            hmmsearch_process_seq,
                                            sequences, err) == 0) {
          had_err = -1;
        }
      }
      gt_splitter_reset(splitter);
    }
    gt_str_reset(line);
  }
  gt_splitter_delete(splitter);
  gt_str_delete(line);
  gt_str_delete(query);
  gt_xfclose(table);

  /* search whatever has been collected since the last batch */
  if (!had_err) {
    hmmsearch_create_fine_fas(fine_fasta_filename, sequences, ces);
    if (table_name_length != 0)
      gt_str_append_uword(arguments->outtable_filename, filecount++);
    had_err =
      hmmsearch_call_fine_search(table_name_length != 0 ?
                                 arguments->outtable_filename :
                                 NULL,
                                 gt_str_get(fine_fasta_filename),
                                 gt_str_get(arguments->hmmsearch_path),
                                 gt_str_get(arguments->hmm),
                                 logger, err);
  }
  gt_log_log("created " GT_WU " files", filecount);
  gt_rbtree_delete(sequences);
  gt_str_delete(fine_fasta_filename);
  return had_err;
}
Example #5
0
/*
 * Parses the lines of a project file read from <fpin> and stores the values
 * relevant for the suffix array in <suffixarray>.
 *
 * Lines beginning with DBFILEKEY are skipped; every other line is handed to
 * gt_scannedprjkey_analyze() together with the table of registered keys.
 * After all lines have been read, the function checks that all keys are
 * defined and validates integer size, byte order, readmode and the
 * mirroring flag.
 *
 * <indexname> is only used for messages. Returns 0 on success and -1 on
 * error.
 */
static int scanprjfileuintkeysviafileptr(Suffixarray *suffixarray,
                                         const char *indexname,
                                         GtLogger *logger,
                                         FILE *fpin,
                                         GtError *err)
{
  /* values that are validated after parsing */
  uint32_t integersize, littleendian, readmodeint, mirrored;
  /* the following five variables are local as the parsed values are
     not required: they are determined by reading the encseq */
  GtSpecialcharinfo specialcharinfo;
  GtUword totallength,
          numofsequences,
          numofdbsequences,
          numofquerysequences;
  const size_t dbkey_len = strlen(DBFILEKEY);
  unsigned int linenum = 0;
  GtUword line_len;
  bool failed = false;
  GtScannedprjkeytable *scannedprjkeytable;
  GtStr *line_buf;

  gt_error_check(err);

  /* register every key together with the location its value is stored at */
  scannedprjkeytable = gt_scannedprjkeytable_new();
  GT_SCANNEDPRJKEY_ADD("totallength",&totallength,NULL);
  GT_SCANNEDPRJKEY_ADD("specialcharacters",
                       &specialcharinfo.specialcharacters,NULL);
  GT_SCANNEDPRJKEY_ADD("specialranges",
                       &specialcharinfo.specialranges,NULL);
  GT_SCANNEDPRJKEY_ADD("realspecialranges",
                       &specialcharinfo.realspecialranges,NULL);
  GT_SCANNEDPRJKEY_ADD("lengthofspecialprefix",
                       &specialcharinfo.lengthofspecialprefix,NULL);
  GT_SCANNEDPRJKEY_ADD("lengthofspecialsuffix",
                       &specialcharinfo.lengthofspecialsuffix,NULL);
  GT_SCANNEDPRJKEY_ADD("wildcards",
                       &specialcharinfo.wildcards,NULL);
  GT_SCANNEDPRJKEY_ADD("wildcardranges",
                       &specialcharinfo.wildcardranges,NULL);
  GT_SCANNEDPRJKEY_ADD("realwildcardranges",
                       &specialcharinfo.realwildcardranges,NULL);
  GT_SCANNEDPRJKEY_ADD("lengthofwildcardprefix",
                       &specialcharinfo.lengthofwildcardprefix,NULL);
  GT_SCANNEDPRJKEY_ADD("lengthofwildcardsuffix",
                       &specialcharinfo.lengthofwildcardsuffix,NULL);
  GT_SCANNEDPRJKEY_ADD("numofsequences",&numofsequences,NULL);
  GT_SCANNEDPRJKEY_ADD("numofdbsequences",&numofdbsequences,NULL);
  gt_scannedprjkey_add(scannedprjkeytable,"numofquerysequences",
                       &numofquerysequences,0,false,NULL);
  GT_SCANNEDPRJKEY_ADD("numberofallsortedsuffixes",
                       &suffixarray->numberofallsortedsuffixes,NULL);
  GT_SCANNEDPRJKEY_ADD("longest",&suffixarray->longest.valueunsignedlong,
                       &suffixarray->longest.defined);
  GT_SCANNEDPRJKEY_ADD("prefixlength",&suffixarray->prefixlength,NULL);
  GT_SCANNEDPRJKEY_ADD("largelcpvalues",
                       &suffixarray->numoflargelcpvalues.valueunsignedlong,
                       &suffixarray->numoflargelcpvalues.defined);
  gt_scannedprjkey_add(scannedprjkeytable,"averagelcp",
                       &suffixarray->averagelcp.valuedouble,
                       sizeof (suffixarray->averagelcp.valuedouble),
                       true,
                       &suffixarray->averagelcp.defined);
  GT_SCANNEDPRJKEY_ADD("maxbranchdepth",
                       &suffixarray->maxbranchdepth.valueunsignedlong,
                       &suffixarray->maxbranchdepth.defined);
  GT_SCANNEDPRJKEY_ADD("integersize",&integersize,NULL);
  GT_SCANNEDPRJKEY_ADD("littleendian",&littleendian,NULL);
  GT_SCANNEDPRJKEY_ADD("readmode",&readmodeint,NULL);
  GT_SCANNEDPRJKEY_ADD("mirrored",&mirrored,NULL);

  /* analyze the file line by line, ignoring the DBFILEKEY lines */
  line_buf = gt_str_new();
  while (gt_str_read_next_line(line_buf, fpin) != EOF)
  {
    bool is_dbfile_line;

    line_len = gt_str_length(line_buf);
    is_dbfile_line = dbkey_len <= (size_t) line_len &&
                     memcmp(DBFILEKEY,gt_str_get(line_buf),dbkey_len) == 0;
    if (!is_dbfile_line &&
        gt_scannedprjkey_analyze(indexname,
                                 GT_PROJECTFILESUFFIX,
                                 linenum,
                                 gt_str_get(line_buf),
                                 line_len,
                                 scannedprjkeytable,
                                 err) != 0)
    {
      failed = true;
      break;
    }
    gt_str_reset(line_buf);
    linenum++;
  }
  gt_str_delete(line_buf);

  if (!failed && gt_scannedprjkey_allkeysdefined(indexname,
                                                 GT_PROJECTFILESUFFIX,
                                                 scannedprjkeytable,
                                                 logger,err) != 0)
  {
    failed = true;
  }

  /* the integer size must be 32 or 64 and equal to the size of GtUword */
  if (!failed)
  {
    if (integersize != (uint32_t) 32 && integersize != (uint32_t) 64)
    {
      gt_error_set(err,"%s%s contains illegal line defining the integer size",
                   indexname,GT_PROJECTFILESUFFIX);
      failed = true;
    } else if (integersize != (uint32_t) (sizeof (GtUword) * CHAR_BIT))
    {
      gt_error_set(err,"index was generated for %u-bit integers while "
                        "this program uses %u-bit integers",
                        (unsigned int) integersize,
                        (unsigned int) (sizeof (GtUword) * CHAR_BIT));
      failed = true;
    }
  }

  /* byte order of this machine and of the index must agree */
  if (!failed)
  {
    const bool host_is_le = gt_is_little_endian();
    const bool index_is_le = (littleendian == (uint32_t) 1);

    if (host_is_le && !index_is_le)
    {
      gt_error_set(err,"computer has little endian byte order, while index "
                       "was built on computer with big endian byte order");
      failed = true;
    } else if (!host_is_le && index_is_le)
    {
      gt_error_set(err,"computer has big endian byte order, while index "
                       "was built on computer with little endian byte "
                       "order");
      failed = true;
    }
  }

  if (!failed)
  {
    /* the value is stored even if it turns out to be out of range */
    suffixarray->readmode = (GtReadmode) readmodeint;
    if (readmodeint > (uint32_t) 3)
    {
      gt_error_set(err,"illegal readmode %u",(unsigned int) readmodeint);
      failed = true;
    }
  }

  if (!failed)
  {
    suffixarray->mirroredencseq = (mirrored == (uint32_t) 1);
    if (mirrored > (uint32_t) 1)
    {
      gt_error_set(err,"illegal mirroring flag: only 0(=no mirroring) and "
                       "1 (=mirroring) is supported, but read %u",
                       (unsigned int) mirrored);
      failed = true;
    }
  }

  gt_scannedprjkeytable_delete(scannedprjkeytable);
  if (failed)
  {
    return -1;
  }
  return 0;
}
Example #6
0
/*
 * Parses the lines read from <fpin> and stores the parsed values in
 * <fmindex>, <specialcharinfo> and <storeindexpos>.
 *
 * Every line is handed to gt_scannedprjkey_analyze() together with the
 * table of registered keys. Afterwards all keys must be defined, and the
 * value of "storeindexpos" must be 0 or 1; it is converted to a boolean.
 *
 * <indexname> is only used for messages. Returns 0 on success and -1 on
 * error.
 */
static int scanfmafileviafileptr(Fmindex *fmindex,
                                 GtSpecialcharinfo *specialcharinfo,
                                 bool *storeindexpos,
                                 const char *indexname,
                                 FILE *fpin,
                                 GtLogger *logger,
                                 GtError *err)
{
  bool failed = false;
  GtScannedprjkeytable *scannedprjkeytable;
  unsigned int intstoreindexpos;
  unsigned int linenum = 0;
  GtStr *line_buf;

  gt_error_check(err);

  /* register every key together with the location its value is stored at */
  scannedprjkeytable = gt_scannedprjkeytable_new();
  GT_SCANNEDPRJKEY_ADD("bwtlength",&fmindex->bwtlength,NULL);
  GT_SCANNEDPRJKEY_ADD("longest",&fmindex->longestsuffixpos,NULL);
  GT_SCANNEDPRJKEY_ADD("storeindexpos",&intstoreindexpos,NULL);
  GT_SCANNEDPRJKEY_ADD("log2blocksize",&fmindex->log2bsize,NULL);
  GT_SCANNEDPRJKEY_ADD("log2markdist",&fmindex->log2markdist,NULL);
  GT_SCANNEDPRJKEY_ADD("specialcharacters",
                       &specialcharinfo->specialcharacters,NULL);
  GT_SCANNEDPRJKEY_ADD("specialranges",&specialcharinfo->specialranges,NULL);
  GT_SCANNEDPRJKEY_ADD("realspecialranges",&specialcharinfo->realspecialranges,
                       NULL);
  GT_SCANNEDPRJKEY_ADD("lengthofspecialprefix",
                       &specialcharinfo->lengthofspecialprefix,NULL);
  GT_SCANNEDPRJKEY_ADD("lengthofspecialsuffix",
                       &specialcharinfo->lengthofspecialsuffix,NULL);
  GT_SCANNEDPRJKEY_ADD("wildcards",&specialcharinfo->wildcards,NULL);
  GT_SCANNEDPRJKEY_ADD("wildcardranges",&specialcharinfo->wildcardranges,NULL);
  GT_SCANNEDPRJKEY_ADD("realwildcardranges",
                       &specialcharinfo->realwildcardranges,NULL);
  GT_SCANNEDPRJKEY_ADD("lengthofwildcardprefix",
                       &specialcharinfo->lengthofwildcardprefix,NULL);
  GT_SCANNEDPRJKEY_ADD("lengthofwildcardsuffix",
                       &specialcharinfo->lengthofwildcardsuffix,NULL);
  GT_SCANNEDPRJKEY_ADD("suffixlength",&fmindex->suffixlength,NULL);

  /* analyze the file line by line; stop at the first line that fails */
  line_buf = gt_str_new();
  while (gt_str_read_next_line(line_buf, fpin) != EOF)
  {
    if (gt_scannedprjkey_analyze(indexname,
                                 FMASCIIFILESUFFIX,
                                 linenum,
                                 gt_str_get(line_buf),
                                 gt_str_length(line_buf),
                                 scannedprjkeytable,
                                 err) != 0)
    {
      failed = true;
      break;
    }
    gt_str_reset(line_buf);
    linenum++;
  }
  gt_str_delete(line_buf);

  if (!failed && gt_scannedprjkey_allkeysdefined(indexname,FMASCIIFILESUFFIX,
                                                 scannedprjkeytable,
                                                 logger,err) != 0)
  {
    failed = true;
  }

  /* only 0 and 1 are valid values of the storeindexpos flag */
  if (!failed)
  {
    switch (intstoreindexpos)
    {
      case 1U:
        *storeindexpos = true;
        break;
      case 0U:
        *storeindexpos = false;
        break;
      default:
        gt_error_set(err,"illegal value in line matching \"storeindexpos=\"");
        failed = true;
        break;
    }
  }

  gt_scannedprjkeytable_delete(scannedprjkeytable);
  if (failed)
  {
    return -1;
  }
  return 0;
}
Example #7
0
/*
 * Parses the lines of a project file read from <fpin> and stores the values
 * relevant for the suffix array in <suffixarray>.
 *
 * Lines beginning with DBFILEKEY are skipped; every other line is handed to
 * analyzeuintline() together with the table <riktab> of registered keys.
 * After all lines have been read, the function checks that all keys are
 * defined and validates integer size, byte order and readmode.
 *
 * <indexname> is only used for messages. Returns 0 on success and -1 on
 * error.
 */
static int scanprjfileuintkeysviafileptr(Suffixarray *suffixarray,
                                         const GtStr *indexname,
                                         Verboseinfo *verboseinfo,
                                         FILE *fpin,
                                         GtError *err)
{
  /* values that are validated after parsing */
  uint32_t integersize, littleendian, readmodeint;
  unsigned int linenum;
  unsigned long currentlinelength;

  /* parsed into a local variable, not stored in <suffixarray> */
  DefinedSeqpos maxbranchdepth;
  size_t dbfilelen = strlen(DBFILEKEY);
  bool haserr = false;
  GtArray *riktab;
  GtStr *currentline;
  /* the following five variables are local as the parsed values are
     not required: they are determined by reading the encodedsequence */
  Seqpos totallength;
  Specialcharinfo specialcharinfo;
  unsigned long numofsequences,
                numofdbsequences,
                numofquerysequences;

  gt_error_check(err);
  /* register every key together with the location its value is stored at */
  riktab = gt_array_new(sizeofReadintkeys());
  SETREADINTKEYS("totallength",&totallength,NULL);
  SETREADINTKEYS("specialcharacters",
                 &specialcharinfo.specialcharacters,NULL);
  SETREADINTKEYS("specialranges",
                 &specialcharinfo.specialranges,NULL);
  SETREADINTKEYS("realspecialranges",
                 &specialcharinfo.realspecialranges,NULL);
  SETREADINTKEYS("lengthofspecialprefix",
                 &specialcharinfo.lengthofspecialprefix,NULL);
  SETREADINTKEYS("lengthofspecialsuffix",
                 &specialcharinfo.lengthofspecialsuffix,NULL);
  SETREADINTKEYS("numofsequences",&numofsequences,NULL);
  SETREADINTKEYS("numofdbsequences",&numofdbsequences,NULL);
  setreadintkeys(riktab,"numofquerysequences",&numofquerysequences,0,NULL);
  SETREADINTKEYS("longest",&suffixarray->longest.valueseqpos,
                           &suffixarray->longest.defined);
  SETREADINTKEYS("prefixlength",&suffixarray->prefixlength,NULL);
  SETREADINTKEYS("largelcpvalues",
                 &suffixarray->numoflargelcpvalues.valueseqpos,
                 &suffixarray->numoflargelcpvalues.defined);
  SETREADINTKEYS("maxbranchdepth",&maxbranchdepth.valueseqpos,
                 &maxbranchdepth.defined);
  SETREADINTKEYS("integersize",&integersize,NULL);
  SETREADINTKEYS("littleendian",&littleendian,NULL);
  SETREADINTKEYS("readmode",&readmodeint,NULL);
  currentline = gt_str_new();
  for (linenum = 0; gt_str_read_next_line(currentline, fpin) != EOF; linenum++)
  {
    currentlinelength = gt_str_length(currentline);
    if (dbfilelen <= (size_t) currentlinelength &&
       memcmp(DBFILEKEY,gt_str_get(currentline),dbfilelen) == 0)
    {
      /* Nothing: lines beginning with DBFILEKEY are not analyzed here */
    } else
    {
      if (analyzeuintline(indexname,
                         PROJECTFILESUFFIX,
                         linenum,
                         gt_str_get(currentline),
                         currentlinelength,
                         riktab,
                         err) != 0)
      {
        haserr = true;
        break;
      }
    }
    gt_str_reset(currentline);
  }
  gt_str_delete(currentline);
  if (!haserr && allkeysdefined(indexname,PROJECTFILESUFFIX,riktab,
                                verboseinfo,err) != 0)
  {
    haserr = true;
  }
  /* the integer size must be 32 or 64 ... */
  if (!haserr &&
      integersize != (uint32_t) 32 &&
      integersize != (uint32_t) 64)
  {
    gt_error_set(err,"%s%s contains illegal line defining the integer size",
                  gt_str_get(indexname),PROJECTFILESUFFIX);
    haserr = true;
  }
  /* ... and equal to the number of bits of Seqpos in this program */
  if (!haserr && integersize != (uint32_t) (sizeof (Seqpos) * CHAR_BIT))
  {
    gt_error_set(err,"index was generated for %u-bit integers while "
                      "this program uses %u-bit integers",
                      (unsigned int) integersize,
                      (unsigned int) (sizeof (Seqpos) * CHAR_BIT));
    haserr = true;
  }
  /* byte order of this machine and of the index must agree */
  if (!haserr)
  {
    if (gt_is_little_endian())
    {
      if (littleendian != (uint32_t) 1)
      {
        gt_error_set(err,"computer has little endian byte order, while index "
                      "was built on computer with big endian byte order");
        haserr = true;
      }
    } else
    {
      if (littleendian == (uint32_t) 1)
      {
        gt_error_set(err,"computer has big endian byte order, while index "
                      "was built on computer with little endian byte "
                      "order");
        haserr = true;
      }
    }
  }
  if (!haserr)
  {
    if (readmodeint > (uint32_t) 3)
    {
      gt_error_set(err,"illegal readmode %u",(unsigned int) readmodeint);
      haserr = true;
    }
    /* stored even if the value was rejected above */
    suffixarray->readmode = (Readmode) readmodeint;
  }
  gt_array_delete(riktab);
  return haserr ? -1 : 0;
}
Example #8
0
/*
 * Reads one key per line from the file <fileofkeystoextract>, looks each
 * key up in <keytab> (<numofkeys> keys of size <keysize>) and, for every
 * key found, writes the corresponding sequence of <encseq> to stdout using
 * lines of width <linewidth>.
 *
 * Keys that are not found are only counted; their number is printed as a
 * comment line at the end if no error occurred. A <linewidth> of 0 is
 * rejected. Returns 0 on success and -1 on error.
 */
static int itersearchoverallkeys(const GtEncseq *encseq,
                                 const char *keytab,
                                 unsigned long numofkeys,
                                 unsigned long keysize,
                                 const GtStr *fileofkeystoextract,
                                 unsigned long linewidth,
                                 GtError *err)
{
  FILE *keyfile;
  GtStr *line_buf;
  uint64_t lineno = 0;
  unsigned long hit, missing = 0;
  int retval = 0;
  Fastakeyquery query;

  if (linewidth == 0)
  {
    gt_error_set(err,"use option width to specify line width for formatting");
    return -1;
  }
  keyfile = gt_fa_fopen(gt_str_get(fileofkeystoextract),"r",err);
  if (keyfile == NULL)
  {
    return -1;
  }
  line_buf = gt_str_new();
  /* buffer for one key, reused for every line */
  query.fastakey = gt_malloc(sizeof (char) * (keysize+1));
  while (retval == 0 && gt_str_read_next_line(line_buf, keyfile) != EOF)
  {
    if (extractkeyfromcurrentline(&query,
                                  keysize,
                                  line_buf,
                                  lineno,
                                  fileofkeystoextract,
                                  err) != 0)
    {
      retval = -1;
    } else
    {
      hit = searchfastaqueryindes(query.fastakey,keytab,numofkeys,keysize);
      if (hit >= numofkeys)
      {
        /* a result outside of the table means that the key is unknown */
        missing++;
      } else if (giextract_encodedseq2fasta(stdout,
                                            encseq,
                                            hit,
                                            &query,
                                            linewidth,
                                            err) != 0)
      {
        retval = -1;
      }
    }
    if (retval == 0)
    {
      gt_str_reset(line_buf);
      lineno++;
    }
  }
  if (retval == 0 && missing > 0)
  {
    printf("# number of unsatified fastakey-queries: %lu\n",missing);
  }
  gt_str_delete(line_buf);
  gt_fa_fclose(keyfile);
  gt_free(query.fastakey);
  return retval;
}
Example #9
0
/*
 * Extracts the keys from the description table of the index <indexname>.
 *
 * Every line of the description table is passed to desc2key() to obtain
 * its key. All keys must be non-empty and of the same length, which must
 * not exceed CHAR_MAX.
 *
 * If <sortkeys> is false, the key length followed by all keys (each
 * terminated by '\0') is written to the keys table of the index; the keys
 * must appear in strictly increasing lexicographic order.
 * If <sortkeys> is true, the encoded sequence is loaded, the keys (at most
 * MAXFIXEDKEYSIZE characters) are stored together with their line numbers,
 * sorted with compareFixedkeys and the sequences are written to stdout in
 * that order. In this mode the number of descriptions must equal the number
 * of sequences of the index; a mismatch is reported via <err> and never
 * writes beyond the key table.
 *
 * Returns 0 on success and -1 on error.
 */
int gt_extractkeysfromdesfile(const char *indexname,
                              bool sortkeys,
                              GtLogger *logger,
                              GtError *err)
{
  FILE *fpin, *fpout = NULL;
  GtStr *line = NULL;
  const char *keyptr;
  unsigned long keylen, constantkeylen = 0, linenum;
  bool haserr = false, firstdesc = true;
  char *previouskey = NULL;
  Fixedsizekey *keytab = NULL, *keytabptr = NULL;
  GtEncseq *encseq = NULL;
  unsigned long numofentries = 0;
  const unsigned long linewidth = 60UL;

  fpin = gt_fa_fopen_with_suffix(indexname,GT_DESTABFILESUFFIX,"rb",err);
  if (fpin == NULL)
  {
    return -1;
  }
  if (!sortkeys)
  {
    fpout = gt_fa_fopen_with_suffix(indexname,GT_KEYSTABFILESUFFIX,"wb",err);
    if (fpout == NULL)
    {
      haserr = true;
    }
  }
  if (!haserr)
  {
    line = gt_str_new();
  }
  for (linenum = 0; !haserr && gt_str_read_next_line(line, fpin) != EOF;
       linenum++)
  {
    keyptr = desc2key(&keylen,gt_str_get(line),err);
    if (keyptr == NULL)
    {
      haserr = true;
      break;
    }
    if (keylen == 0)
    {
      gt_error_set(err,"key of length 0 in \"%s\" not expected",
                   gt_str_get(line));
      haserr = true;
      break;
    }
    if (firstdesc)
    {
      /* the first key determines the length all other keys must have */
      if (keylen > (unsigned long) CHAR_MAX)
      {
        gt_error_set(err,"key \"%*.*s\" of length %lu not allowed; "
                         "no key must be larger than %d",
                          (int) keylen,(int) keylen,keyptr,keylen,CHAR_MAX);
        haserr = true;
        break;
      }
      constantkeylen = keylen;
      previouskey = gt_malloc(sizeof (char) * (constantkeylen+1));
      firstdesc = false;
      if (!sortkeys)
      {
        /* the key length is the first byte of the keys table */
        gt_xfputc((char) constantkeylen,fpout);
      } else
      {
        GtEncseqLoader *el;
        if (constantkeylen > (unsigned long) MAXFIXEDKEYSIZE)
        {
          gt_error_set(err,"key \"%*.*s\" of length %lu not allowed; "
                           "no key must be larger than %d",
                            (int) keylen,(int) keylen,keyptr,keylen,
                            MAXFIXEDKEYSIZE);
          haserr = true;
          break;
        }
        el = gt_encseq_loader_new();
        gt_encseq_loader_set_logger(el, logger);
        encseq = gt_encseq_loader_load(el, indexname, err);
        gt_encseq_loader_delete(el);
        if (encseq == NULL)
        {
          haserr = true;
          break;
        }
        /* one table entry per sequence of the index */
        numofentries = gt_encseq_num_of_sequences(encseq);
        gt_assert(numofentries > 0);
        keytab = gt_malloc(sizeof (*keytab) * numofentries);
        keytabptr = keytab;
      }
    } else
    {
      if (constantkeylen != keylen)
      {
        gt_error_set(err,"key \"%*.*s\" of length %lu: all keys must be of "
                         "the same length which for all previously seen "
                         "headers is %lu",
                         (int) keylen,(int) keylen,keyptr,keylen,
                         constantkeylen);
        haserr = true;
        break;
      }
      gt_assert(previouskey != NULL);
      if (!sortkeys && strncmp(previouskey,keyptr,(size_t) constantkeylen) >= 0)
      {
        gt_error_set(err,"previous key \"%s\" is not lexicographically smaller "
                         "than current key \"%*.*s\"",
                         previouskey,(int) keylen,(int) keylen,keyptr);
        haserr = true;
        break;
      }
    }
    if (!sortkeys)
    {
      gt_xfwrite(keyptr,sizeof *keyptr,(size_t) keylen,fpout);
      gt_xfputc('\0',fpout);
    } else
    {
      gt_assert(keytabptr != NULL);
      /* the description table is a file of its own and may contain more
         lines than the index has sequences: never write past the table */
      if (keytabptr >= keytab + numofentries)
      {
        gt_error_set(err,"description table of index \"%s\" contains more "
                         "than %lu keys, but the index has only %lu "
                         "sequences",
                         indexname,numofentries,numofentries);
        haserr = true;
        break;
      }
      strncpy(keytabptr->key,keyptr,(size_t) constantkeylen);
      keytabptr->key[constantkeylen] = '\0';
      keytabptr->seqnum = linenum;
      keytabptr++;
    }
    /* remember the key for the order check of the next line; strncpy does
       not terminate the copy, so this is done explicitly */
    strncpy(previouskey,keyptr,(size_t) constantkeylen);
    previouskey[constantkeylen] = '\0';
    gt_str_reset(line);
  }
  if (!haserr)
  {
    gt_logger_log(logger,"number of keys of length %lu = %lu",
                constantkeylen,linenum);
  }
  gt_str_delete(line);
  gt_fa_fclose(fpin);
  gt_fa_fclose(fpout);
  gt_free(previouskey);
  /* keytab is NULL if the description table was empty: nothing to output */
  if (!haserr && sortkeys && keytab != NULL)
  {
    if (keytabptr != keytab + numofentries)
    {
      /* fewer keys than sequences: the rest of the table is uninitialized
         and must not be sorted or used */
      gt_error_set(err,"description table of index \"%s\" contains %lu "
                       "keys, but the index has %lu sequences",
                       indexname,linenum,numofentries);
      haserr = true;
    } else
    {
      qsort(keytab,(size_t) numofentries,sizeof (*keytab),compareFixedkeys);
      for (keytabptr = keytab; !haserr && keytabptr < keytab + numofentries;
           keytabptr++)
      {
        if (giextract_encodedseq2fasta(stdout,
                                       encseq,
                                       keytabptr->seqnum,
                                       NULL,
                                       linewidth,
                                       err) != 0)
        {
          haserr = true;
          break;
        }
      }
    }
  }
  if (encseq != NULL)
  {
    gt_encseq_delete(encseq);
    encseq = NULL;
  }
  gt_free(keytab);
  return haserr ? -1 : 0;
}
Example #10
0
/*
  Read the file named by <fileofkeystoextract> line by line, turn every
  line into a Fastakeyquery via extractkeyfromcurrentline(), sort the
  resulting table with comparefastakeys() and pass it through
  remdupsfastakeyqueries().

  verbose              if true, progress messages are printed to stdout.
  numofqueries         output parameter; first set to the number of lines
                       of the file, finally to the value returned by
                       remdupsfastakeyqueries() (presumably the number of
                       distinct keys -- confirm against that helper).
  fileofkeystoextract  name of the file to read.
  err                  receives the error message on failure.

  Returns the table of queries on success. Returns NULL if the file has no
  lines, cannot be opened, or a line is rejected by
  extractkeyfromcurrentline(). Note that *numofqueries has already been
  overwritten with the line count when NULL is returned.
*/
static Fastakeyquery *readfileofkeystoextract(bool verbose,
                                              unsigned long *numofqueries,
                                              const GtStr *fileofkeystoextract,
                                              GtError *err)
{
  FILE *fp;
  GtStr *currentline;
  bool haserr = false;
  uint64_t linenum;
  Fastakeyquery *fastakeyqueries;
  /* SKDEBUG is forced off; define it instead to dump the final table */
#undef SKDEBUG
#ifdef SKDEBUG
  unsigned long i;
#endif

  gt_error_check(err);
  *numofqueries = gt_file_number_of_lines(gt_str_get(fileofkeystoextract));
  if (*numofqueries == 0)
  {
    gt_error_set(err,"empty file \"%s\" not allowed",
                 gt_str_get(fileofkeystoextract));
    return NULL;
  }
  fp = gt_fa_fopen(gt_str_get(fileofkeystoextract),"r",err);
  if (fp == NULL)
  {
    return NULL;
  }
  if (verbose)
  {
    printf("# opened keyfile \"%s\"\n",gt_str_get(fileofkeystoextract));
  }
  /*
    Zero-initialize the table: if parsing stops early, the error path below
    hands all *numofqueries slots to fastakeyqueries_delete(), so the slots
    that were never filled must not contain indeterminate values.
  */
  fastakeyqueries = gt_calloc((size_t) *numofqueries,
                              sizeof (*fastakeyqueries));
  currentline = gt_str_new();
  for (linenum = 0; gt_str_read_next_line(currentline, fp) != EOF; linenum++)
  {
    if (extractkeyfromcurrentline(fastakeyqueries + linenum,
                                  0,
                                  currentline,
                                  linenum,
                                  fileofkeystoextract,
                                  err) != 0)
    {
      haserr = true;
      break;
    }
    /* the line buffer is reused for the next line */
    gt_str_reset(currentline);
  }
  gt_str_delete(currentline);
  gt_fa_fclose(fp);
  if (haserr)
  {
    fastakeyqueries_delete(fastakeyqueries,*numofqueries);
    return NULL;
  }
  qsort(fastakeyqueries,(size_t) *numofqueries,sizeof (*fastakeyqueries),
        comparefastakeys);
  if (verbose)
  {
    printf("# %lu fastakey-queries successfully parsed and sorted\n",
            *numofqueries);
  }
  /* the helper works on the sorted table and reports the new entry count */
  *numofqueries = remdupsfastakeyqueries(fastakeyqueries,*numofqueries,verbose);
#ifdef SKDEBUG
  for (i=0; i<*numofqueries; i++)
  {
    printf("%lu %s\n",i,fastakeyqueries[i].fastakey);
  }
#endif
  return fastakeyqueries;
}