Beispiel #1
0
int gt_hcr_decoder_decode_range(GtHcrDecoder *hcr_dec, const char *name,
                                GtUword start, GtUword end,
                                GtTimer *timer, GtError *err)
{
    char qual[BUFSIZ] = {0},
                        seq[BUFSIZ] = {0};
    GtStr *desc = gt_str_new();
    int had_err = 0;
    GtUword cur_width,
            cur_read;
    size_t i;
    FILE *output;
    GT_UNUSED GtHcrSeqDecoder *seq_dec;

    gt_error_check(err);
    gt_assert(hcr_dec && name);
    seq_dec = hcr_dec->seq_dec;
    gt_assert(start <= end);
    gt_assert(start < seq_dec->num_of_reads && end < seq_dec->num_of_reads);
    if (timer != NULL)
        gt_timer_show_progress(timer, "decode hcr", stdout);
    output = gt_fa_fopen_with_suffix(name, HCRFILEDECODEDSUFFIX, "w", err);
    if (output == NULL)
        had_err = -1;

    for (cur_read = start; had_err == 0 && cur_read <= end; cur_read++) {
        if (gt_hcr_decoder_decode(hcr_dec, cur_read, seq, qual, desc, err) != 0)
            had_err = -1;
        else {
            gt_xfputc(HCR_DESCSEPSEQ, output);
            if (hcr_dec->encdesc != NULL)
                gt_xfputs(gt_str_get(desc), output);
            else
                fprintf(output, ""GT_WU"", cur_read);
            gt_xfputc('\n', output);
            for (i = 0, cur_width = 0; i < strlen(seq); i++, cur_width++) {
                if (cur_width == HCR_LINEWIDTH) {
                    cur_width = 0;
                    gt_xfputc('\n', output);
                }
                gt_xfputc(seq[i], output);
            }
            gt_xfputc('\n', output);
            gt_xfputc(HCR_DESCSEPQUAL, output);
            gt_xfputc('\n', output);
            for (i = 0, cur_width = 0; i < strlen(qual); i++, cur_width++) {
                if (cur_width == HCR_LINEWIDTH) {
                    cur_width = 0;
                    gt_xfputc('\n', output);
                }
                gt_xfputc(qual[i], output);
            }
            gt_xfputc('\n', output);
        }
    }
    gt_fa_xfclose(output);
    gt_str_delete(desc);
    return had_err;
}
Beispiel #2
0
static int hcr_write_seq_qual_data(const char *name, GtHcrEncoder *hcr_enc,
                                   GtTimer *timer, GtError *err)
{
    int had_err = 0;
    FILE *fp;
    GtUword dummy = 0;
    GtWord pos;

    gt_error_check(err);

    fp = gt_fa_fopen_with_suffix(name, HCRFILESUFFIX, "wb", err);
    if (fp == NULL)
        had_err = -1;

    if (!had_err) {
        if (timer != NULL)
            gt_timer_show_progress(timer, "write sequences and qualities encoding",
                                   stdout);
        hcr_write_file_info(fp, hcr_enc);

        had_err = hcr_write_seqdistrtab(fp, hcr_enc);

        if (!had_err) {
            bool is_not_at_pageborder;

            pos = ftell(fp);
            gt_xfwrite_one(&dummy, fp);

            is_not_at_pageborder = (ftell(fp) % hcr_enc->pagesize) != 0;

            if (is_not_at_pageborder)
                hcr_enc->seq_encoder->start_of_encoding =
                    (ftell(fp) / hcr_enc->pagesize + 1) * hcr_enc->pagesize;
            else
                hcr_enc->seq_encoder->start_of_encoding = ftell(fp);

            if (hcr_enc->page_sampling)
                hcr_enc->seq_encoder->sampling =
                    gt_sampling_new_page(hcr_enc->sampling_rate,
                                         (off_t) hcr_enc->seq_encoder->start_of_encoding);
            else if (hcr_enc->regular_sampling)
                hcr_enc->seq_encoder->sampling =
                    gt_sampling_new_regular(hcr_enc->sampling_rate,
                                            (off_t) hcr_enc->seq_encoder->start_of_encoding);

            had_err = hcr_write_seqs(fp, hcr_enc, err);
        }
        if (!had_err) {
            gt_assert(fp);
            gt_xfseek(fp, pos, SEEK_SET);
            gt_xfwrite_one(&hcr_enc->seq_encoder->startofsamplingtab, fp);
        }
        gt_fa_xfclose(fp);
    }
    return 0;
}
Beispiel #3
0
static int construct_bioseq_files(GtBioseq *bs, GtStr *bioseq_indexname,
                                  GtError *err)
{
  GtStr *sequence_filename;
  GtEncseqEncoder *ee;
  GtStrArray *indexfn;
  int had_err = 0;

  gt_error_check(err);

  /* register the signal handler to remove incomplete files upon termination */
  if (!bs->use_stdin) {
    gt_bioseq_index_filename = gt_str_get(bs->sequence_file);
    gt_sig_register_all(remove_bioseq_files);
  }

  /* if stdin is used as input, we need to create a tempfile containing the
     sequence as GtEncseq cannot be built from stdin directly */
  if (bs->use_stdin) {
    GtStr *tmpfilename;
    FILE *tmpfile = NULL;
    int i;
    char buf[BUFSIZ];
    tmpfilename = gt_str_new();
    tmpfile = gt_xtmpfp(tmpfilename);
    gt_assert(tmpfile);
    i = 1;
    while (i > 0) {
      i = fread(buf, 1, BUFSIZ, stdin);
      if (i > 0) fwrite(buf, 1, i, tmpfile);
    }
    gt_fa_xfclose(tmpfile);
    sequence_filename = tmpfilename;
  } else {
    sequence_filename = gt_str_ref(bs->sequence_file);
  }
  gt_assert(gt_str_length(sequence_filename) > 0);
  ee = gt_encseq_encoder_new();
  gt_encseq_encoder_enable_description_support(ee);
  gt_encseq_encoder_enable_md5_support(ee);
  gt_encseq_encoder_enable_multiseq_support(ee);
  gt_encseq_encoder_enable_lossless_support(ee);
  indexfn = gt_str_array_new();
  gt_str_array_add(indexfn, sequence_filename);
  gt_str_delete(sequence_filename);
  had_err = gt_encseq_encoder_encode(ee, indexfn,
                                     gt_str_get(bioseq_indexname), err);
  /* unregister the signal handler */
   if (!bs->use_stdin)
    gt_sig_unregister_all();

  gt_str_array_delete(indexfn);
  gt_encseq_encoder_delete(ee);
  return had_err;
}
Beispiel #4
0
static void write_fingerprints(char **md5_fingerprints,
                               GtUword num_of_md5s,
                               GtStr *fingerprints_filename,
                               bool use_file_locking)
{
  FILE *fingerprints_file;
  gt_assert(md5_fingerprints && num_of_md5s && fingerprints_filename);
  fingerprints_file = gt_fa_xfopen(gt_str_get(fingerprints_filename), "w");
  if (use_file_locking)
    gt_fa_lock_exclusive(fingerprints_file);
  dump_md5_fingerprints(md5_fingerprints, num_of_md5s, fingerprints_file);
  if (use_file_locking)
    gt_fa_unlock(fingerprints_file);
  gt_fa_xfclose(fingerprints_file);
}
Beispiel #5
0
void gt_md5_tab_delete(GtMD5Tab *md5_tab)
{
  GtUword i;
  if (!md5_tab) return;
  if (md5_tab->reference_count) {
    md5_tab->reference_count--;
    return;
  }
  gt_fa_xmunmap(md5_tab->fingerprints);
  gt_fa_unlock(md5_tab->fingerprints_file);
  gt_fa_xfclose(md5_tab->fingerprints_file);
  gt_hashmap_delete(md5_tab->md5map);
  if (md5_tab->owns_md5s) {
    for (i = 0; i < md5_tab->num_of_md5s; i++)
      gt_free(md5_tab->md5_fingerprints[i]);
    gt_free(md5_tab->md5_fingerprints);
  }
  gt_free(md5_tab);
}
Beispiel #6
0
/* The following function processes a file. That is, it checkes if the file with
   name <filename> is already contained in the array <files>. If so, the index
   refering to this array is returned. Otherwise, the hash of the file content
   is compared with the hash <filehash>. If the hashes are the same, the
   filename is added to the array and the index is returned. Otherwise, the
   function calls exit(). */
static GtUword process_file(GthInput *input, char *filename,
                                  char *filehash, bool isreferencefile,
                                  GthAlphatype alphatype)
{
  GtWord fileindex;
  FILE *fp;

  if (isreferencefile)
    fileindex = gth_input_determine_reference_file_index(input, filename);
  else
    fileindex = gth_input_determine_genomic_file_index(input, filename);

  if (fileindex == -1) {
    /* file is not contained in array yet -> open file */
    fp = gt_fa_xfopen(filename, "r");

    /* check the hash */
    if (!hashes_are_the_same(filehash, fp)) {
      fprintf(stderr, "apparently file \"%s\" has changed\n", filename);
      exit(EXIT_FAILURE);
    }

    /* hashes equal -> store new file in array and return index number */
    gt_fa_xfclose(fp);
    if (isreferencefile) {
      gth_input_add_reference_file(input, filename, alphatype);
      fileindex = gth_input_num_of_ref_files(input) - 1;
    }
    else {
      gth_input_add_genomic_file(input, filename);
      fileindex = gth_input_num_of_gen_files(input) - 1;
    }
    return fileindex;
  }

  /* file is already contained in array -> return index number */
  return fileindex;
}
Beispiel #7
0
int gt_alphabet_to_file(const GtAlphabet *alphabet, const char *indexname,
                        GtError *err)
{
  FILE *al1fp;
  bool haserr = false;

  gt_error_check(err);
  al1fp = gt_fa_fopen_with_suffix(indexname,GT_ALPHABETFILESUFFIX,"wb",err);
  if (al1fp == NULL)
  {
    haserr = true;
  }
  if (!haserr)
  {
    GtStr *buf = gt_str_new();
    gt_alphabet_to_str(alphabet, buf);
    gt_xfwrite(gt_str_get(buf), sizeof (char), (size_t) gt_str_length(buf),
               al1fp);
    gt_fa_xfclose(al1fp);
    gt_str_delete(buf);
  }
  return haserr ? -1 : 0;
}
Beispiel #8
0
static bool scanprjfileuintkeys(Suffixarray *suffixarray,
                                const char *indexname,
                                GtLogger *logger,
                                GtError *err)
{
  bool haserr = false;
  FILE *fp;

  gt_error_check(err);
  fp = gt_fa_fopen_with_suffix(indexname,GT_PROJECTFILESUFFIX,"rb",err);
  if (fp == NULL)
  {
    haserr = true;
  }
  if (!haserr && scanprjfileuintkeysviafileptr(suffixarray,
                                               indexname,logger,
                                               fp,err) != 0)
  {
    haserr = true;
  }
  gt_fa_xfclose(fp);
  return haserr;
}
Beispiel #9
0
static bool scanprjfileuintkeys(Suffixarray *suffixarray,
                                const GtStr *indexname,
                                Verboseinfo *verboseinfo,
                                GtError *err)
{
  bool haserr = false;
  FILE *fp;

  gt_error_check(err);
  fp = opensfxfile(indexname,PROJECTFILESUFFIX,"rb",err);
  if (fp == NULL)
  {
    haserr = true;
  }
  if (!haserr && scanprjfileuintkeysviafileptr(suffixarray,
                                               indexname,verboseinfo,
                                               fp,err) != 0)
  {
    haserr = true;
  }
  gt_fa_xfclose(fp);
  return haserr;
}
Beispiel #10
0
static bool read_fingerprints(GtMD5Tab *md5_tab,
                              const char *fingerprints_filename,
                              bool use_file_locking)
{
  bool reading_succeeded = true;
  size_t len;
  gt_assert(md5_tab && fingerprints_filename);
  /* open file */
  gt_assert(gt_file_exists(fingerprints_filename));
  if (use_file_locking) {
    md5_tab->fingerprints_file = gt_fa_xfopen(fingerprints_filename, "r");
    gt_fa_lock_shared(md5_tab->fingerprints_file);
  }
  md5_tab->fingerprints = gt_fa_xmmap_read(fingerprints_filename, &len);
  if (len != md5_tab->num_of_md5s * 33) {
    gt_fa_xmunmap(md5_tab->fingerprints);
    md5_tab->fingerprints = NULL;
    gt_fa_unlock(md5_tab->fingerprints_file);
    gt_fa_xfclose(md5_tab->fingerprints_file);
    md5_tab->fingerprints_file = NULL;
    reading_succeeded = false;
  }
  return reading_succeeded;
}
Beispiel #11
0
static int gt_genomediff_runner(int argc, const char **argv,
                                int parsed_args, void *tool_arguments,
                                GtError *err)
{
    bool mirrored = false;
    int had_err = 0,
        i;
    GtEncseq              *encseq = NULL;
    GtGenomediffArguments *arguments = tool_arguments;
    GtLogger              *logger;
    GtShuUnitFileInfo     *unit_info = NULL;
    GtTimer               *timer = NULL;

    gt_error_check(err);
    gt_assert(arguments);

    logger = gt_logger_new(arguments->verbose,
                           GT_LOGGER_DEFLT_PREFIX,
                           stdout);
    gt_assert(logger);

    for (i = parsed_args; i < argc; i++) {
        gt_str_array_add_cstr(arguments->filenames, argv[i]);
    }

    if (gt_showtime_enabled()) {
        timer = gt_timer_new_with_progress_description("start");
        gt_timer_start(timer);
        gt_assert(timer);
    }

    if (arguments->with_units) {
        gt_logger_log(logger, "unitfile option set, filename is %s\n",
                      gt_str_get(arguments->unitfile));
    }

    if (timer != NULL)
        gt_timer_show_progress(timer, "start shu search", stdout);

    if (gt_str_array_size(arguments->filenames) > 1UL) {
        GtEncseqEncoder *ee = gt_encseq_encoder_new();
        gt_encseq_encoder_set_timer(ee, timer);
        gt_encseq_encoder_set_logger(ee, logger);
        /* kr only makes sense for dna, so we can check this already with ee */
        gt_encseq_encoder_set_input_dna(ee);
        had_err = gt_encseq_encoder_encode(ee, arguments->filenames,
                                           gt_str_get(arguments->indexname), err);
        gt_encseq_encoder_delete(ee);
    }
    else {
        gt_str_append_str(arguments->indexname,
                          gt_str_array_get_str(arguments->filenames, 0));
        if (arguments->with_esa || arguments->with_pck) {
            GtStr *current_line = gt_str_new();
            FILE *prj_fp;
            const char *buffer;
            char **elements = NULL;

            prj_fp = gt_fa_fopen_with_suffix(gt_str_get(arguments->indexname),
                                             GT_PROJECTFILESUFFIX,"rb",err);
            if (prj_fp == NULL)
                had_err = -1;
            while (!had_err && gt_str_read_next_line(current_line, prj_fp) != EOF) {
                buffer = gt_str_get(current_line);
                if (elements != NULL) {
                    gt_free(elements[0]);
                    gt_free(elements[1]);
                }
                gt_free(elements);
                elements = gt_cstr_split(buffer, '=');
                gt_log_log("%s", elements[0]);
                if (strcmp("mirrored", elements[0]) == 0) {
                    gt_log_log("%s", elements[1]);
                    if (strcmp("1", elements[1]) == 0) {
                        mirrored = true;
                        gt_log_log("sequences are treated as mirrored");
                    }
                }
                gt_str_reset(current_line);
            }
            gt_str_delete(current_line);
            if (elements != NULL) {
                gt_free(elements[0]);
                gt_free(elements[1]);
            }
            gt_free(elements);
            gt_fa_xfclose(prj_fp);
        }
    }

    if (!had_err) {
        GtEncseqLoader *el = gt_encseq_loader_new_from_options(arguments->loadopts,
                             err);
        if (mirrored)
            gt_encseq_loader_mirror(el);
        encseq =
            gt_encseq_loader_load(el, gt_str_get(arguments->indexname), err);
        gt_encseq_loader_delete(el);
    }
    if (encseq == NULL)
        had_err = -1;
    if (!had_err) {
        unit_info = gt_shu_unit_info_new(encseq);
        if (arguments->with_units)
            had_err = gt_shu_unit_file_info_read(arguments->unitfile, unit_info,
                                                 logger, err);
    }

    if (!had_err) {
        uint64_t **shusums = NULL;
        if (arguments->with_esa || arguments->with_pck) {
            shusums = gt_genomediff_shulen_sum(arguments, unit_info,
                                               logger, timer, err);
            if (shusums == NULL)
                had_err = -1;
        }
        else {
            const bool doesa = true;
            GenomediffInfo gd_info;
            Suffixeratoroptions sopts;
            sopts.beverbose = arguments->verbose;
            sopts.indexname = arguments->indexname;
            sopts.db = NULL;
            sopts.encopts = NULL;
            sopts.genomediff = true;
            sopts.inputindex = arguments->indexname;
            sopts.loadopts = arguments->loadopts;
            sopts.showprogress = false;
            sopts.idxopts = arguments->idxopts;

            gt_assert(unit_info != NULL);
            gt_array2dim_calloc(shusums, unit_info->num_of_genomes,
                                unit_info->num_of_genomes);
            gd_info.shulensums = shusums;
            gd_info.unit_info = unit_info;
            had_err = gt_runsuffixerator(doesa, &sopts, &gd_info, logger, err);
        }
        if (!had_err && shusums != NULL) {
            had_err = gt_genomediff_kr_calc(shusums, arguments, unit_info,
                                            arguments->with_pck, logger, timer, err);
            gt_array2dim_delete(shusums);
        }
    }

    if (timer != NULL) {
        gt_timer_show_progress_final(timer, stdout);
        gt_timer_delete(timer);
    }
    gt_logger_delete(logger);
    gt_encseq_delete(encseq);
    gt_shu_unit_info_delete(unit_info);

    return had_err;
}
static int readfirstvaluesfromfile(GtEncseqMetadata *emd,
                                   const char *indexname, GtError *err)
{
  FILE *fp;
  bool had_err = false;
  unsigned long cc, byteoffset = 0, alphatype;
  char *alphadef;

  gt_error_check(err);
  fp = gt_fa_fopen_with_suffix(indexname, GT_ENCSEQFILESUFFIX, "rb", err);
  if (fp == NULL)
  {
    had_err = true;
  }
  NEXTFREAD(emd->is64bit);
  if (!had_err)
  {
    if ((int) emd->is64bit > 1)
    {
      gt_error_set(err, "illegal platform code %u in \"%s%s\"", emd->is64bit,
                   indexname, GT_ENCSEQFILESUFFIX);
      had_err = true;
    }
    if (!had_err && ((emd->is64bit && sizeof (unsigned long) != (size_t) 8)
          || (!emd->is64bit && sizeof (unsigned long) == (size_t) 8)))
    {
      gt_error_set(err, "trying to load 64-bit index \"%s%s\" on a 32-bit "
                        "system or vice versa -- please use correct index "
                        "for this platform", indexname, GT_ENCSEQFILESUFFIX);
      had_err = true;
    }
  }
  NEXTFREAD(emd->version);
  if (!had_err)
  {
    if (emd->version < GT_ENCSEQ_VERSION)    {
      gt_error_set(err, "index \"%s%s\" is format version %lu, current is "
                        "%lu -- please re-encode",
                        indexname, GT_ENCSEQFILESUFFIX,
                        emd->version, GT_ENCSEQ_VERSION);
      had_err = true;
    }
  }
  NEXTFREAD(cc);
  if (!had_err)
  {
    if (cc >= (unsigned long) GT_ACCESS_TYPE_UNDEFINED)
    {
      gt_error_set(err, "illegal type %lu in \"%s%s\"", cc,
                   indexname, GT_ENCSEQFILESUFFIX);
      had_err = true;
    }
  }
  if (!had_err) {
    emd->sat = (GtEncseqAccessType) cc;
    NEXTFREAD(emd->totallength);
    NEXTFREAD(emd->numofdbsequences);
    NEXTFREAD(emd->numofdbfiles);
    NEXTFREAD(emd->lengthofdbfilenames);
    NEXTFREAD(emd->specialcharinfo);
    NEXTFREAD(emd->minseqlen);
    NEXTFREAD(emd->maxseqlen);
  }
  NEXTFREAD(alphatype);
  if (!had_err) {
    if (alphatype > 2UL) {
      gt_error_set(err, "illegal alphabet type %lu in \"%s%s\"", alphatype,
                   indexname, GT_ENCSEQFILESUFFIX);
      had_err = true;
    }
  }
  if (!had_err) {
    NEXTFREAD(emd->lengthofalphadef);
    switch (alphatype) {
      case 0:
        emd->alpha = gt_alphabet_new_dna();
        break;
      case 1:
        emd->alpha = gt_alphabet_new_protein();
        break;
      case 2:
        gt_assert(emd->lengthofalphadef > 0);
        emd->customalphabet = true;
        alphadef = gt_malloc(sizeof (char) * emd->lengthofalphadef);
        NEXTFREADWSIZE(*(alphadef), emd->lengthofalphadef);
        emd->alpha = gt_alphabet_new_from_string(alphadef,
                                                 emd->lengthofalphadef,
                                                 err);
        if (!emd->alpha) {
          had_err = true;
        }
        gt_free(alphadef);
        break;
    }
    gt_assert(emd->alpha != NULL);
  }
  gt_fa_xfclose(fp);
  return had_err ? -1 : 0;
}
Beispiel #13
0
int gt_pbs_unit_test(GtError *err)
{
  int had_err = 0;
  GtLTRElement element;
  GtPBSOptions o;
  GtStr *tmpfilename;
  FILE *tmpfp;
  GtPBSResults *res;
  GtPBSHit *hit;
  double score1, score2;
  GtRange rng;
  char *rev_seq,
       *seq,
       tmp[BUFSIZ];
  const char *fullseq =                           "aaaaaaaaaaaaaaaaaaaa"
                    "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                    "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                    "acatactaggatgctag" /* <- PBS forward */
                                     "aatatagtttcgaatatagcactgcatttcgaa"
                    "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                    "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                    "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                    "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                    "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                    "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                    "tatagcactgcatttcgaatatagtttcgaatatag"
                                   /* PBS reverse -> */ "gatcctaaggctac"
                    "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                    "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                    "aaaaaaaaaaaaaaaaaaaa";

  /* notice previous errors */
  gt_error_check(err);

  /* create temporary tRNA library file */
  tmpfilename = gt_str_new();
  tmpfp = gt_xtmpfp(tmpfilename);
  fprintf(tmpfp, ">test1\nccccccccccccccctagcatcctagtatgtccc\n"
                 ">test2\ncccccccccgatcctagggctaccctttc\n");
  gt_fa_xfclose(tmpfp);
  ensure(had_err, gt_file_exists(gt_str_get(tmpfilename)));

  /* setup testing parameters */
  o.radius = 30;
  o.max_edist = 1;
  o.alilen.start = 11;
  o.alilen.end = 30;
  o.offsetlen.start = 0;
  o.offsetlen.end = 5;
  o.trnaoffsetlen.start = 0;
  o.trnaoffsetlen.end =  40;
  o.ali_score_match = 5;
  o.ali_score_mismatch = -10;
  o.ali_score_insertion = o.ali_score_deletion = -20;
  o.trna_lib = gt_bioseq_new(gt_str_get(tmpfilename), err);
  ensure(had_err, gt_bioseq_number_of_sequences(o.trna_lib) == 2);

  element.leftLTR_5 = 20;
  element.leftLTR_3 = 119;
  element.rightLTR_5 = 520;
  element.rightLTR_3 = 619;

  /* setup sequences */
  seq     = gt_malloc(600 * sizeof (char));
  rev_seq = gt_malloc(600 * sizeof (char));
  memcpy(seq,     fullseq + 20, 600);
  memcpy(rev_seq, fullseq + 20, 600);
  gt_reverse_complement(rev_seq, 600, err);

  /* try to find PBS in sequences */
  res = gt_pbs_find(seq, rev_seq, &element, &o, err);
  ensure(had_err, res != NULL);
  ensure(had_err, gt_pbs_results_get_number_of_hits(res) == 2);

  /* check first hit on forward strand */
  hit = gt_pbs_results_get_ranked_hit(res, 0);
  ensure(had_err, hit != NULL);
  ensure(had_err, gt_pbs_hit_get_alignment_length(hit) == 17);
  ensure(had_err, gt_pbs_hit_get_edist(hit) == 0);
  ensure(had_err, gt_pbs_hit_get_offset(hit) == 0);
  ensure(had_err, gt_pbs_hit_get_tstart(hit) == 3);
  ensure(had_err, strcmp(gt_pbs_hit_get_trna(hit), "test1") == 0);
  rng = gt_pbs_hit_get_coords(hit);
  ensure(had_err, rng.start == 120);
  ensure(had_err, rng.end == 136);
  score1 = gt_pbs_hit_get_score(hit);
  ensure(had_err, gt_pbs_hit_get_strand(hit) == GT_STRAND_FORWARD);
  memset(tmp, 0, BUFSIZ-1);
  memcpy(tmp, fullseq + (rng.start * sizeof (char)),
         (rng.end - rng.start + 1) * sizeof (char));
  ensure(had_err, strcmp(tmp, "acatactaggatgctag" ) == 0);

  /* check second hit on reverse strand */
  hit = gt_pbs_results_get_ranked_hit(res, 1);
  ensure(had_err, hit != NULL);
  ensure(had_err, gt_pbs_hit_get_alignment_length(hit) == 14);
  ensure(had_err, gt_pbs_hit_get_edist(hit) == 1);
  ensure(had_err, gt_pbs_hit_get_offset(hit) == 0);
  ensure(had_err, gt_pbs_hit_get_tstart(hit) == 6);
  ensure(had_err, strcmp(gt_pbs_hit_get_trna(hit), "test2") == 0);
  rng = gt_pbs_hit_get_coords(hit);
  ensure(had_err, rng.start == 506);
  ensure(had_err, rng.end == 519);
  score2 = gt_pbs_hit_get_score(hit);
  ensure(had_err, gt_double_compare(score1, score2) > 0);
  ensure(had_err, gt_pbs_hit_get_strand(hit) == GT_STRAND_REVERSE);
  memset(tmp, 0, BUFSIZ-1);
  memcpy(tmp, fullseq + (rng.start * sizeof (char)),
         (rng.end - rng.start + 1) * sizeof (char));
  ensure(had_err, strcmp(tmp, "gatcctaaggctac" ) == 0);

  /* clean up */
  gt_xremove(gt_str_get(tmpfilename));
  ensure(had_err, !gt_file_exists(gt_str_get(tmpfilename)));
  gt_str_delete(tmpfilename);
  gt_bioseq_delete(o.trna_lib);
  gt_free(rev_seq);
  gt_free(seq);
  gt_pbs_results_delete(res);

  return had_err;
}
Beispiel #14
0
int gt_mapfmindex (Fmindex *fmindex,const char *indexname,
                GtLogger *logger,GtError *err)
{
  FILE *fpin = NULL;
  bool haserr = false, storeindexpos = true;
  GtSpecialcharinfo specialcharinfo;

  gt_error_check(err);
  fmindex->mappedptr = NULL;
  fmindex->bwtformatching = NULL;
  fmindex->alphabet = NULL;
  fpin = gt_fa_fopen_with_suffix(indexname,FMASCIIFILESUFFIX,"rb",err);
  if (fpin == NULL)
  {
    haserr = true;
  }
  if (!haserr)
  {
    if (scanfmafileviafileptr(fmindex,
                              &specialcharinfo,
                              &storeindexpos,
                              indexname,
                              fpin,
                              logger,
                              err) != 0)
    {
      haserr = true;
    }
  }
  gt_fa_xfclose(fpin);
  if (!haserr)
  {
    fmindex->bwtformatching = mapbwtencoding(indexname,logger,err);
    if (fmindex->bwtformatching == NULL)
    {
      haserr = true;
    }
  }
  if (!haserr)
  {
    fmindex->specpos.nextfreeGtPairBwtidx
      = (unsigned long) gt_determinenumberofspecialstostore(&specialcharinfo);
    fmindex->specpos.spaceGtPairBwtidx = NULL;
    fmindex->specpos.allocatedGtPairBwtidx = 0;
    fmindex->alphabet = gt_alphabet_ref(
                                  gt_encseq_alphabet(fmindex->bwtformatching));
    if (fmindex->alphabet == NULL)
    {
      haserr = true;
    }
  }
  if (!haserr)
  {
    GtStr *tmpfilename;

    gt_computefmkeyvalues (fmindex,
                           &specialcharinfo,
                           fmindex->bwtlength,
                           fmindex->log2bsize,
                           fmindex->log2markdist,
                           gt_alphabet_num_of_chars(fmindex->alphabet),
                           fmindex->suffixlength,
                           storeindexpos);
    tmpfilename = gt_str_new_cstr(indexname);
    gt_str_append_cstr(tmpfilename,FMDATAFILESUFFIX);
    if (gt_fillfmmapspecstartptr(fmindex,storeindexpos,tmpfilename,err) != 0)
    {
      haserr = true;
    }
    gt_str_delete(tmpfilename);
  }
  if (haserr)
  {
    gt_freefmindex(fmindex);
  }
  return haserr ? -1 : 0;
}
static int enumeratelcpintervals(const char *inputindex,
                                 Sequentialsuffixarrayreader *ssar,
                                 const char *storeindex,
                                 bool storecounts,
                                 GtUword mersize,
                                 GtUword minocc,
                                 GtUword maxocc,
                                 bool performtest,
                                 GtLogger *logger,
                                 GtError *err)
{
  TyrDfsstate *state;
  bool haserr = false;
  unsigned int alphasize;

  gt_error_check(err);
  state = gt_malloc(sizeof (*state));
  GT_INITARRAY(&state->occdistribution,Countwithpositions);
  state->esrspace = gt_encseq_create_reader_with_readmode(
                                   gt_encseqSequentialsuffixarrayreader(ssar),
                                   gt_readmodeSequentialsuffixarrayreader(ssar),
                                   0);
  state->mersize = (GtUword) mersize;
  state->encseq = gt_encseqSequentialsuffixarrayreader(ssar);
  alphasize = gt_alphabet_num_of_chars(gt_encseq_alphabet(state->encseq));
  state->readmode = gt_readmodeSequentialsuffixarrayreader(ssar);
  state->storecounts = storecounts;
  state->minocc = minocc;
  state->maxocc = maxocc;
  state->totallength = gt_encseq_total_length(state->encseq);
  state->performtest = performtest;
  state->countoutputmers = 0;
  state->merindexfpout = NULL;
  state->countsfilefpout = NULL;
  GT_INITARRAY(&state->largecounts,Largecount);
  if (strlen(storeindex) == 0)
  {
    state->sizeofbuffer = 0;
    state->bytebuffer = NULL;
  } else
  {
    state->sizeofbuffer = MERBYTES(mersize);
    state->bytebuffer = gt_malloc(sizeof *state->bytebuffer
                                  * state->sizeofbuffer);
  }
  if (performtest)
  {
    state->currentmer = gt_malloc(sizeof *state->currentmer
                                  * state->mersize);
    state->suftab = gt_suftabSequentialsuffixarrayreader(ssar);
  } else
  {
    state->currentmer = NULL;
    state->suftab = NULL;
  }
  if (state->mersize > state->totallength)
  {
    gt_error_set(err,"mersize "GT_WU" > "GT_WU" = totallength not allowed",
                 state->mersize,
                 state->totallength);
    haserr = true;
  } else
  {
    if (strlen(storeindex) == 0)
    {
      state->processoccurrencecount = adddistpos2distribution;
    } else
    {
      state->merindexfpout = gt_fa_fopen_with_suffix(storeindex,MERSUFFIX,
                                                    "wb",err);
      if (state->merindexfpout == NULL)
      {
        haserr = true;
      } else
      {
        if (state->storecounts)
        {
          state->countsfilefpout
            = gt_fa_fopen_with_suffix(storeindex,COUNTSSUFFIX,"wb",err);
          if (state->countsfilefpout == NULL)
          {
            haserr = true;
          }
        }
      }
      state->processoccurrencecount = outputsortedstring2index;
    }
    if (!haserr)
    {
      if (gt_depthfirstesa(ssar,
                          tyr_allocateDfsinfo,
                          tyr_freeDfsinfo,
                          tyr_processleafedge,
                          NULL,
                          tyr_processcompletenode,
                          tyr_assignleftmostleaf,
                          tyr_assignrightmostleaf,
                          (Dfsstate*) state,
                          logger,
                          err) != 0)
      {
        haserr = true;
      }
      if (strlen(storeindex) == 0)
      {
        showfinalstatistics(state,inputindex,logger);
      }
    }
    if (!haserr)
    {
      if (state->countsfilefpout != NULL)
      {
        gt_logger_log(logger,"write "GT_WU" mercounts > "GT_WU
                      " to file \"%s%s\"",
                      state->largecounts.nextfreeLargecount,
                      (GtUword) MAXSMALLMERCOUNT,
                      storeindex,
                      COUNTSSUFFIX);
        gt_xfwrite(state->largecounts.spaceLargecount, sizeof (Largecount),
                  (size_t) state->largecounts.nextfreeLargecount,
                  state->countsfilefpout);
      }
    }
    if (!haserr)
    {
      gt_logger_log(logger,"number of "GT_WU"-mers in index: "GT_WU"",
                  mersize,
                  state->countoutputmers);
      gt_logger_log(logger,"index size: %.2f megabytes\n",
                  GT_MEGABYTES(state->countoutputmers * state->sizeofbuffer +
                               sizeof (GtUword) * EXTRAINTEGERS));
    }
  }
  /* now out EXTRAINTEGERS integer values */
  if (!haserr && state->merindexfpout != NULL)
  {
    outputbytewiseUlongvalue(state->merindexfpout,
                             (GtUword) state->mersize);
    outputbytewiseUlongvalue(state->merindexfpout,(GtUword) alphasize);
  }
  gt_fa_xfclose(state->merindexfpout);
  gt_fa_xfclose(state->countsfilefpout);
  GT_FREEARRAY(&state->occdistribution,Countwithpositions);
  gt_free(state->currentmer);
  gt_free(state->bytebuffer);
  GT_FREEARRAY(&state->largecounts,Largecount);
  gt_encseq_reader_delete(state->esrspace);
  gt_free(state);
  return haserr ? -1 : 0;
}
static int gt_compressedbits_runner(GT_UNUSED int argc,
                                    GT_UNUSED const char **argv,
                                    GT_UNUSED int parsed_args,
                                    void *tool_arguments,
                                    GtError *err)
{
  GtCompressdbitsArguments *arguments = tool_arguments;
  int had_err = 0;
  unsigned long idx;
  unsigned long long num_of_bits = 0ULL;
  GtBitsequence *bits = NULL;
  GtCompressedBitsequence *cbs = NULL, *read_cbs = NULL;
  GtStr *filename = gt_str_new();
  FILE *fp = NULL;

  gt_error_check(err);
  gt_assert(arguments);
  gt_assert(argc == parsed_args);

  if (gt_option_is_set(arguments->filename_op)) {
    FILE *file = NULL;
    gt_assert(arguments->filename != NULL);

    file = gt_xfopen(gt_str_get(arguments->filename), "r");
    if ((size_t) 1 != gt_xfread(&num_of_bits,
                                sizeof (num_of_bits), (size_t) 1, file)) {
      had_err = -1;
    }
    if (!had_err) {
      gt_log_log("bits to read: %llu", num_of_bits);
      arguments->size = (unsigned long) GT_NUMOFINTSFORBITS(num_of_bits);
      bits = gt_malloc(sizeof (*bits) * arguments->size);
      if ((size_t) arguments->size !=
          gt_xfread(bits, sizeof (*bits),
                    (size_t) arguments->size, file)) {
        had_err = -1;
      }
    }
    gt_xfclose(file);
  }
  else {
    bits = gt_calloc(sizeof (*bits), (size_t) arguments->size);
    num_of_bits = (unsigned long long) (GT_INTWORDSIZE * arguments->size);

    if (arguments->fill_random) {
      for (idx = 0; idx < arguments->size; idx++) {
        bits[idx] =
          (GtBitsequence) (0xAAAAAAAAAAAAAAAAUL ^ gt_rand_max(ULONG_MAX));
      }
    }
    else {
      for (idx = 0; idx < arguments->size; idx++)
        bits[idx] = (GtBitsequence) (0xAAAAAAAAAAAAAAAAUL ^ idx);
    }
  }

  if (!had_err) {
    fp = gt_xtmpfp(filename);
    gt_fa_xfclose(fp);
    fp = NULL;

    gt_log_log("filename: %s", gt_str_get(filename));
    gt_log_log("size in words: %lu", arguments->size);
    cbs = gt_compressed_bitsequence_new(
                            bits, arguments->samplerate,
                            (unsigned long) num_of_bits);
    gt_log_log("original size in MB: %2.3f",
               (sizeof (*bits) * arguments->size) / (1024.0 * 1024.0));
    gt_log_log("compressed size in MB: %2.3f",
               gt_compressed_bitsequence_size(cbs) / (1024.0 * 1024.0));
    gt_log_log("popcount table size thereof in MB: %2.3f",
               gt_popcount_tab_calculate_size(15U) / (1024.0 * 1024.0));
    had_err = gt_compressed_bitsequence_write(cbs, gt_str_get(filename), err);
  }
  if (!had_err)
  {
    read_cbs =
      gt_compressed_bitsequence_new_from_file(gt_str_get(filename), err);
    if (read_cbs == NULL)
      had_err = -1;
  }
  if (!had_err && bits != NULL && arguments->check_consistency) {
    for (idx = 0; (unsigned long long) idx < num_of_bits; ++idx) {
      int GT_UNUSED bit = gt_compressed_bitsequence_access(read_cbs, idx);
      int GT_UNUSED original = GT_ISIBITSET(bits, idx) ? 1 : 0;
      gt_assert(gt_compressed_bitsequence_access(cbs, idx) == bit);
      gt_assert(original == bit);
    }
  }
  gt_compressed_bitsequence_delete(cbs);
  gt_compressed_bitsequence_delete(read_cbs);
  gt_free(bits);
  gt_str_delete(filename);
  return had_err;
}