Example #1
0
/* Creates a new, empty base/quality distribution for <alpha>.
   The frequency table is zero-initialised and has HCR_HIGHESTQUALVALUE + 1
   rows and gt_alphabet_size(<alpha>) columns; the last column index is
   recorded as the wildcard index. min_qual and max_qual start at the opposite
   extremes (highest resp. lowest quality value). The limits of <qrange> are
   copied with overflow checking and the <alpha> pointer is stored as given. */
static GtBaseQualDistr* hcr_base_qual_distr_new(GtAlphabet *alpha,
        GtQualRange qrange)
{
    const unsigned int num_symbols = gt_alphabet_size(alpha);
    GtBaseQualDistr *new_distr = gt_calloc((size_t) 1,
                                           sizeof (GtBaseQualDistr));

    /* table dimensions and the table itself */
    new_distr->nrows = HCR_HIGHESTQUALVALUE + 1U;
    new_distr->ncols = num_symbols;
    gt_array2dim_calloc(new_distr->distr, HCR_HIGHESTQUALVALUE + 1UL,
                        num_symbols);

    /* symbol related settings */
    new_distr->alpha = alpha;
    new_distr->wildcard_indx = num_symbols - 1;

    /* quality related settings */
    new_distr->qual_offset = HCR_LOWESTQUALVALUE;
    new_distr->max_qual = HCR_LOWESTQUALVALUE;
    new_distr->min_qual = HCR_HIGHESTQUALVALUE;
    gt_safe_assign(new_distr->qrange_start, qrange.start);
    gt_safe_assign(new_distr->qrange_end, qrange.end);

    return new_distr;
}
Example #2
0
/* Callback that writes one (base, quality, frequency) record to the output
   stream of the WriteNodeInfo passed via <pt>.
   <symbol> is a combined code: <symbol> modulo the alphabet size is the base
   code, <symbol> divided by the alphabet size plus info->qual_offset is the
   quality value. The base is written as upper case character, the quality as
   one GtUchar and <freq> as GtUint64, in that order.
   <code> and <code_length> are not used. Always returns 0. */
static int hcr_huffman_write_base_qual_freq(GtUword symbol,
        GtUint64 freq,
        GT_UNUSED GtBitsequence code,
        GT_UNUSED unsigned code_length,
        void *pt)
{
    GtUchar base,
            qual;
    WriteNodeInfo *info = (WriteNodeInfo*)pt;

    gt_safe_assign(base, (symbol % gt_alphabet_size(info->alpha)));
    /* the last column of the distribution stands for the wildcard symbol */
    if (base == (GtUchar) gt_alphabet_size(info->alpha) - 1)
        base = (GtUchar) WILDCARD;
    /* toupper() is only defined for values representable as unsigned char
       (or EOF), so the decoded character must not be passed as plain char */
    gt_safe_assign(base,
                   (toupper((unsigned char)
                            gt_alphabet_decode(info->alpha, base))));

    gt_xfwrite_one(&base, info->output);

    gt_safe_assign(qual,
                   (symbol / gt_alphabet_size(info->alpha) + info->qual_offset));

    gt_xfwrite_one(&qual, info->output);
    gt_xfwrite_one(&freq, info->output);
    return 0;
}
Example #3
0
/* Reads a base/quality distribution from the current position of <fp>.
   The expected layout is a GtUword holding the number of records, followed by
   that many records, each consisting of one character (the base), one
   unsigned char (the quality) and one GtUint64 (the frequency).
   Characters are encoded with <alpha>; wildcards are mapped to the last
   column of the table. The smallest and largest quality seen are stored in
   the result, which is finally passed to hcr_base_qual_distr_trim().
   All fields not set here are zero-initialised. */
static GtBaseQualDistr* hcr_base_qual_distr_new_from_file(FILE *fp,
        GtAlphabet *alpha)
{
    GtBaseQualDistr *bqd;
    char read_char_code;
    GtUchar cur_char_code;
    unsigned char cur_qual;
    unsigned alpha_size,
             min_qual = HCR_HIGHESTQUALVALUE,
             max_qual = HCR_LOWESTQUALVALUE;
    GtUword numofleaves,
            i;
    GtUint64 cur_freq;
    GT_UNUSED size_t read,
              one = (size_t) 1;

    alpha_size = gt_alphabet_size(alpha);
    /* zero-initialise, as hcr_base_qual_distr_new() does, so that no field
       is left with an indeterminate value */
    bqd = gt_calloc((size_t) 1, sizeof (GtBaseQualDistr));
    gt_array2dim_calloc(bqd->distr, HCR_HIGHESTQUALVALUE + 1UL, alpha_size);
    bqd->ncols = alpha_size;
    bqd->nrows = HCR_HIGHESTQUALVALUE + 1U;
    bqd->qual_offset = HCR_LOWESTQUALVALUE;
    bqd->wildcard_indx = alpha_size - 1;

    read = gt_xfread_one(&numofleaves, fp);
    gt_assert(read == one);
    for (i = 0; i < numofleaves; i++) {
        read = gt_xfread_one(&read_char_code, fp);
        gt_assert(read == one);
        read = gt_xfread_one(&cur_qual, fp);
        gt_assert(read == one);
        read = gt_xfread_one(&cur_freq, fp);
        gt_assert(read == one);
        cur_char_code = gt_alphabet_encode(alpha, read_char_code);
        if (cur_char_code == (GtUchar) WILDCARD)
            gt_safe_assign(cur_char_code, bqd->wildcard_indx);
        /* both indices come from the file, make sure they fit the table */
        gt_assert((unsigned) cur_qual <= (unsigned) HCR_HIGHESTQUALVALUE);
        gt_assert((unsigned) cur_char_code < alpha_size);
        bqd->distr[cur_qual][cur_char_code] = cur_freq;
        if ((unsigned) cur_qual > max_qual)
            max_qual = cur_qual;
        if ((unsigned) cur_qual < min_qual)
            min_qual = cur_qual;
    }

    bqd->min_qual = min_qual;
    bqd->max_qual = max_qual;
    hcr_base_qual_distr_trim(bqd);
    return bqd;
}
Example #4
0
/* Returns a new iterator over the byte range
   [<start_of_encoding>, <end_of_encoding>) of the file named <filename>.
   The data is meant to be handed out in chunks of HCR_PAGES_PER_CHUNK memory
   pages; <start_of_encoding> has to be a multiple of the page size. No data
   is mapped yet. Only the character pointer of <filename> is stored, not a
   copy of the string. */
static HcrHuffDataIterator *decoder_init_data_iterator(GtWord start_of_encoding,
                                                       GtWord end_of_encoding,
                                                       const GtStr *filename)
{
  HcrHuffDataIterator *iter = gt_malloc(sizeof (*iter));

  /* where to read */
  iter->path = gt_str_get(filename);
  iter->start = (size_t) start_of_encoding;
  iter->pos = iter->start;
  iter->end = (size_t) end_of_encoding;
  iter->data = NULL;

  /* how much to read at once */
  iter->pagesize = gt_pagesize();
  iter->pages_per_chunk = HCR_PAGES_PER_CHUNK;
  gt_assert(iter->start % iter->pagesize == 0);
  iter->blocksize = iter->pagesize * iter->pages_per_chunk;
  gt_safe_assign(iter->bitseq_per_chunk,
                 (iter->blocksize / sizeof (GtBitsequence)));

  return iter;
}
Example #5
0
/* Chunk provider callback: unmaps the chunk handed out before and, as long as
   the iterator <meminfo> (a HcrHuffDataIterator) has not reached its end,
   maps the next block of the file.
   On success 1 is returned, <*bits> points to the mapped data and <*length>
   is the number of GtBitsequence elements in it (fewer than a full chunk if
   the block reaches beyond the end position). If nothing is left, 0 is
   returned, <*bits> is NULL and <*length> is 0. <*offset> and <*pad_length>
   are set to 0 in both cases. */
static int get_next_file_chunk_for_huffman(GtBitsequence **bits,
        GtUword *length,
        GtUword *offset,
        GtUword *pad_length,
        void *meminfo)
{
    enum { CHUNK_EMPTY = 0, CHUNK_SUCCESS = 1 };
    HcrHuffDataIterator *iter;

    gt_assert(meminfo);
    gt_assert(bits && length && offset && pad_length);
    iter = (HcrHuffDataIterator*) meminfo;

    gt_log_log("pos in iter: "GT_WU"", (GtUword) iter->pos);

    /* common to both outcomes: drop the old mapping, reset offset/padding */
    gt_fa_xmunmap(iter->data);
    iter->data = NULL;
    *offset = 0;
    *pad_length = 0;

    if (iter->pos >= iter->end) {
        *bits = NULL;
        *length = 0;
        return CHUNK_EMPTY;
    }

    iter->data = gt_fa_xmmap_read_range(iter->path,
                                        (size_t) iter->blocksize,
                                        iter->pos);
    iter->pos += iter->blocksize;
    if (iter->pos <= iter->end) {
        *length = iter->bitseq_per_chunk;
    }
    else {
        /* last block: only count the bytes up to the end position */
        gt_safe_assign(*length,
                       (iter->blocksize - (iter->pos - iter->end)));
        *length /= sizeof (GtBitsequence);
    }
    *bits = iter->data;
    return CHUNK_SUCCESS;
}
/* Binary search in the uniques of <condenseq> for <position>.
   Returns the index of the last unique whose orig_startpos is less than or
   equal to <position>, or condenseq->udb_nelems if <position> is smaller than
   the orig_startpos of every visited unique. <condenseq> has to contain at
   least one unique.
   NOTE(review): presumably the uniques are sorted by ascending orig_startpos,
   which the search relies on -- confirm against the code filling the array. */
GtUword gt_condenseq_uniques_position_binsearch(const GtCondenseq *condenseq,
                                                GtUword position)
{
  GtWord lower, upper, mid;

  gt_assert(condenseq && condenseq->udb_nelems > 0);
  /* lower and upper are exclusive borders of the remaining interval */
  lower = (GtWord) -1;
  gt_safe_assign(upper, condenseq->udb_nelems);
  for (mid = GT_DIV2(lower + upper);
       upper - lower > (GtWord) 1;
       mid = GT_DIV2(lower + upper)) {
    if (position < condenseq->uniques[mid].orig_startpos)
      upper = mid;
    else
      lower = mid;
  }
  /* lower never moved: nothing starts at or before position; the array is
     only accessed if lower is a valid index */
  if (lower <= (GtWord) -1 ||
      condenseq->uniques[mid].orig_startpos > position)
    return condenseq->udb_nelems;
  return (GtUword) mid;
}
Example #7
0
/* Encodes all reads of the FASTQ files in <hcr_enc>->files and writes them to
   <fp>, beginning at seq_encoder->start_of_encoding. While writing, the
   sampling of the sequence encoder is asked before each read whether a new
   sample has to be started; if so the stream is flushed and advanced and
   the resulting file position is stored as sample for the current read.
   Afterwards the position following the encoded data is stored in
   seq_encoder->startofsamplingtab and the sampling is written to <fp>.
   seq_encoder->total_num_of_symbols is recomputed as the sum of all read
   lengths.
   Returns 0 on success; on failure a non-zero value is returned and <err> is
   set. */
static int hcr_write_seqs(FILE *fp, GtHcrEncoder *hcr_enc, GtError *err)
{
    int had_err = 0, seqit_err = 0;
    GtUword bits_to_write = 0,
            len,
            read_counter = 0,
            page_counter = 0,
            bits_left_in_page,
            cur_read = 0;
    GtWord filepos;
    GtSeqIterator *seqit;
    const GtUchar *seq,
          *qual;
    char *desc;
    GtBitOutStream *bitstream;

    gt_error_check(err);
    gt_assert(hcr_enc->seq_encoder->sampling);

    gt_safe_assign(bits_left_in_page, (hcr_enc->pagesize * 8));

    gt_xfseek(fp, hcr_enc->seq_encoder->start_of_encoding, SEEK_SET);
    bitstream = gt_bitoutstream_new(fp);

    seqit = gt_seq_iterator_fastq_new(hcr_enc->files, err);
    if (!seqit) {
        gt_assert(gt_error_is_set(err));
        had_err = -1;
    }

    if (!had_err) {
        gt_seq_iterator_set_quality_buffer(seqit, &qual);
        gt_seq_iterator_set_symbolmap(seqit,
                                      gt_alphabet_symbolmap(hcr_enc->seq_encoder->alpha));
        hcr_enc->seq_encoder->total_num_of_symbols = 0;
        while (!had_err &&
                (seqit_err = gt_seq_iterator_next(seqit,
                             &seq,
                             &len,
                             &desc, err)) == 1) {

            /* count the bits */
            bits_to_write = hcr_write_seq(hcr_enc->seq_encoder, seq, qual, len,
                                          bitstream, true);

            /* check if a new sample has to be added */
            if (gt_sampling_is_next_element_sample(hcr_enc->seq_encoder->sampling,
                                                   page_counter,
                                                   read_counter,
                                                   bits_to_write,
                                                   bits_left_in_page)) {
                gt_bitoutstream_flush_advance(bitstream);

                filepos = gt_bitoutstream_pos(bitstream);
                if (filepos < 0) {
                    had_err = -1;
                    gt_error_set(err, "error by ftell: %s", strerror(errno));
                }
                else {
                    gt_sampling_add_sample(hcr_enc->seq_encoder->sampling,
                                           (size_t) filepos,
                                           cur_read);

                    /* a new sample starts with fresh counters */
                    read_counter = 0;
                    page_counter = 0;
                    gt_safe_assign(bits_left_in_page, (hcr_enc->pagesize * 8));
                }
            }

            if (!had_err) {
                /* do the writing */
                bits_to_write = hcr_write_seq(hcr_enc->seq_encoder,
                                              seq, qual, len, bitstream, false);

                /* update counter for sampling */
                while (bits_left_in_page < bits_to_write) {
                    page_counter++;
                    bits_to_write -= bits_left_in_page;
                    gt_safe_assign(bits_left_in_page, (hcr_enc->pagesize * 8));
                }
                bits_left_in_page -= bits_to_write;
                /* always set first page as written */
                if (page_counter == 0)
                    page_counter++;
                read_counter++;
                hcr_enc->seq_encoder->total_num_of_symbols += len;
                cur_read++;
            }
        }
        /* a value other than 0 from the iterator means it failed */
        if (!had_err && seqit_err) {
            had_err = seqit_err;
            gt_assert(gt_error_is_set(err));
        }
        /* the read count can only be complete if nothing went wrong; checking
           it on error paths would abort instead of reporting the error */
        gt_assert(had_err || hcr_enc->num_of_reads == cur_read);
    }

    if (!had_err) {
        gt_bitoutstream_flush(bitstream);
        filepos = gt_bitoutstream_pos(bitstream);
        if (filepos < 0) {
            had_err = -1;
            gt_error_set(err, "error by ftell: %s", strerror(errno));
        }
        else {
            hcr_enc->seq_encoder->startofsamplingtab = filepos;
            gt_log_log("start of samplingtab: "GT_WU"",
                       hcr_enc->seq_encoder->startofsamplingtab);
            if (hcr_enc->seq_encoder->sampling != NULL)
                gt_sampling_write(hcr_enc->seq_encoder->sampling, fp);
        }
    }
    gt_bitoutstream_delete(bitstream);
    gt_seq_iterator_delete(seqit);
    return had_err;
}
/* Checks and completes the parsed arguments in <tool_arguments> (a
   GtCsrHcrEncodeArguments):
   - at least one input file has to be given,
   - if no name was given it is derived from the single input file by
     removing the directory part and the last '.'-separated suffix (a file
     name without any '.' is used as a whole); with more than one file the
     name is mandatory,
   - the sampling method ("page", "regular" or "none") is translated into the
     pagewise/regular flags and an unset sampling rate is replaced by the
     default of the method; a rate of 0 for "page"/"regular" or a non-zero
     rate for "none" is rejected,
   - the quality range, if set, has to fit into unsigned int and is copied to
     qrng.
   Returns 0 on success, -1 with <err> set otherwise. */
static int gt_compreads_compress_arguments_check(GT_UNUSED int rest_argc,
                                       void *tool_arguments,
                                       GtError *err)
{
  int had_err = 0;
  GtCsrHcrEncodeArguments *arguments = tool_arguments;
  GtSplitter *splitter = NULL;
  GtStr *buffer;
  gt_error_check(err);
  gt_assert(arguments);

  if (gt_str_array_size(arguments->files) == 0) {
    gt_error_set(err, "option \"-files\" is mandatory and requires"
                      " at least one filename as argument!");
    had_err = -1;
  }

  if (!had_err) {
    if (gt_str_length(arguments->name) == 0) {
      if (gt_str_array_size(arguments->files) > 1UL) {
        gt_error_set(err, "option \"-name\" needs to be specified"
                          " if more than one file is given");
        had_err = -1;
      }
      else {
        GtUword i, num_tokens;
        char *basename;
        splitter = gt_splitter_new();
        basename = gt_basename(gt_str_array_get(arguments->files, 0));
        /* the splitter gets its own copy, basename is still needed below */
        buffer = gt_str_new_cstr(basename);
        gt_splitter_split(splitter, gt_str_get(buffer), gt_str_length(buffer),
                          '.');
        num_tokens = gt_splitter_size(splitter);
        if (num_tokens < 2UL) {
          /* no suffix to strip: dropping the last token would leave the name
             empty (and num_tokens - 1 would wrap around for 0 tokens) */
          gt_str_append_cstr(arguments->name, basename);
        }
        else {
          /* join all tokens but the last one, i.e. strip the suffix */
          for (i = 0; i < num_tokens - 1; i++) {
            gt_str_append_cstr(arguments->name,
                               gt_splitter_get_token(splitter, i));
            if (i < num_tokens - 2)
              gt_str_append_char(arguments->name, '.');
          }
        }
        gt_free(basename);
        gt_splitter_delete(splitter);
        gt_str_delete(buffer);
      }
    }
  }

  if (!had_err) {
    char *sampling_type = gt_str_get(arguments->method);
    static const char *methods[] = { "page", "regular", "none" };

    if (!strcmp(methods[0], sampling_type)) {
      arguments->pagewise = true;
      if (arguments->srate == GT_UNDEF_UWORD)
        arguments->srate = GT_SAMPLING_DEFAULT_PAGE_RATE;
      else if (arguments->srate == 0) {
        gt_error_set(err, "page sampling was chosen, but sampling"
                          " rate was set to "GT_WU"! this seems wrong.",
                     arguments->srate);
        had_err = -1;
      }
    }
    else if (!strcmp(methods[1], sampling_type)) {
      arguments->regular = true;
      if (arguments->srate == GT_UNDEF_UWORD)
        arguments->srate = GT_SAMPLING_DEFAULT_REGULAR_RATE;
      else if (arguments->srate == 0) {
        gt_error_set(err, "regular sampling was chosen, but sampling rate "
                          " was set to "GT_WU"! this seems wrong.",
                     arguments->srate);
        had_err = -1;
      }
    }
    else if (!strcmp(methods[2], sampling_type)) {
      if (arguments->srate == GT_UNDEF_UWORD)
        arguments->srate = 0;
      else if (arguments->srate != 0) {
        gt_error_set(err, "no sampling was chosen, but sampling rate was"
                          " set to "GT_WU"! this seems wrong.",
                          arguments->srate);
        had_err = -1;
      }
    }
    else {
      gt_error_set(err, "somethings wrong with the stype option");
      had_err = -1;
    }
  }

  if (!had_err) {
    /* the range is only looked at if its start was set */
    if (arguments->arg_range.start != GT_UNDEF_UWORD) {
      if (arguments->arg_range.start <= (GtUword) UINT_MAX) {
        gt_safe_assign(arguments->qrng.start, arguments->arg_range.start);
        if (arguments->arg_range.end <= (GtUword) UINT_MAX)
          gt_safe_assign(arguments->qrng.end, arguments->arg_range.end);
        else
          had_err = -1;
      }
      else
        had_err = -1;
    }
    if (had_err)
      gt_error_set(err, "Range for qualities: value to large! larger than %u",
                   UINT_MAX);
  }
  return had_err;
}
/* Runs the compression: loads the encoded sequence named by
   <argv>[<parsed_args>], fills in all parameters the user left undefined,
   checks the parameters against each other and finally lets a
   GtCondenseqCreator build the index <arguments>->indexname (the basename of
   the input if no name was given).
   If <arguments>->kdb is set, the second logger writes to the file
   "kmer_db.out" in the current directory.
   Returns 0 on success; on failure a non-zero value is returned and <err> is
   set. */
static int gt_condenseq_compress_runner(GT_UNUSED int argc, const char **argv,
                                        int parsed_args, void *tool_arguments,
                                        GtError *err)
{
  GtCondenseqCompressArguments *arguments = tool_arguments;
  GtLogger *logger,
           *kdb_logger;
  FILE *kmer_fp = NULL;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);
  kdb_logger = gt_logger_new(arguments->kdb, GT_LOGGER_DEFLT_PREFIX, stderr);
  if (arguments->kdb) {
    kmer_fp = gt_fa_fopen("kmer_db.out", "w", err);
    /* without this check a NULL stream would become the log target and be
       closed at the end, and work would go on with <err> already set */
    if (kmer_fp == NULL)
      had_err = -1;
    else
      gt_logger_set_target(kdb_logger, kmer_fp);
  }

  if (gt_str_length(arguments->indexname) == 0UL) {
    char *basenameptr;
    basenameptr = gt_basename(argv[parsed_args]);
    gt_str_set(arguments->indexname, basenameptr);
    gt_free(basenameptr);
  }

  if (!had_err) {
    GtEncseqLoader *es_l = gt_encseq_loader_new();
    arguments->input_es = gt_encseq_loader_load(es_l, argv[parsed_args], err);
    if (arguments->input_es == NULL)
      had_err = -1;
    gt_encseq_loader_delete(es_l);
  }

  if (!had_err) {
    /* first derive undefined values top down from the ones given:
       initsize -> minalignlength -> windowsize */
    if (arguments->minalignlength == GT_UNDEF_UWORD)
      arguments->minalignlength = arguments->initsize != GT_UNDEF_UWORD ?
                                  arguments->initsize / (GtUword) 3UL :
                                  GT_UNDEF_UWORD;
    if (arguments->windowsize == GT_UNDEF_UINT)
      arguments->windowsize = arguments->minalignlength != GT_UNDEF_UWORD ?
                              (unsigned int) (arguments->minalignlength / 5U) :
                              GT_UNDEF_UINT;
    if (arguments->windowsize < 4U)
      arguments->windowsize = 4U;
    if (arguments->kmersize == GT_UNDEF_UINT) {
      unsigned int size =
        gt_alphabet_num_of_chars(gt_encseq_alphabet(arguments->input_es));
      /* size^k ~= 100000 */
      gt_safe_assign(arguments->kmersize,
                     gt_round_to_long(gt_log_base(100000.0, (double) size)));
      gt_logger_log(logger, "|A|: %u, k: %u",
                    size, arguments->kmersize);
    }

    /* then derive what is still undefined bottom up:
       kmersize -> windowsize -> minalignlength -> initsize */
    if (arguments->windowsize == GT_UNDEF_UINT) {
      arguments->windowsize = 5U * arguments->kmersize;
    }
    if (arguments->minalignlength == GT_UNDEF_UWORD) {
      arguments->minalignlength = (GtUword) (3UL * arguments->windowsize);
    }
    if (arguments->initsize == GT_UNDEF_UWORD) {
      arguments->initsize = (GtUword) (3UL * arguments->minalignlength);
    }
  }
  if (!had_err &&
      arguments->windowsize <= arguments->kmersize) {
    gt_error_set(err, "-windowsize (%u) must be larger -kmersize (%u)!",
                 arguments->windowsize, arguments->kmersize);
    had_err = -1;
  }
  if (!had_err &&
      arguments->minalignlength < (GtUword) arguments->windowsize) {
    gt_error_set(err, "-alignlength (" GT_WU ") must be at least "
                 "-windowsize (%u)!", arguments->minalignlength,
                 arguments->windowsize);
    had_err = -1;
  }
  if (!had_err && (arguments->initsize < arguments->minalignlength)) {
    gt_error_set(err, "-initsize (" GT_WU ") must be at least "
                 "-alignlength (" GT_WU ")!", arguments->initsize,
                 arguments->minalignlength);
    had_err = -1;
  }

  if (!had_err) {
    GtCondenseqCreator *ces_c;

    ces_c = gt_condenseq_creator_new(arguments->initsize,
                                     arguments->minalignlength,
                                     arguments->xdrop,
                                     &(arguments->scores),
                                     arguments->kmersize,
                                     arguments->windowsize,
                                     logger,
                                     err);
    if (ces_c == NULL) {
      had_err = -1;
    }
    else {
      /* undefined cutoff: use mean, 0: no cutoff, otherwise fixed value */
      if (arguments->cutoff_value == GT_UNDEF_UWORD)
        gt_condenseq_creator_use_mean_cutoff(ces_c);
      else if (arguments->cutoff_value == 0)
        gt_condenseq_creator_disable_cutoff(ces_c);
      else
        gt_condenseq_creator_set_cutoff(ces_c, arguments->cutoff_value);
      gt_condenseq_creator_set_mean_fraction(ces_c, arguments->fraction);
      /* NOTE(review): a set prune flag *disables* pruning here -- confirm
         that this is the meaning of the corresponding option */
      if (arguments->prune)
        gt_condenseq_creator_disable_prune(ces_c);
      if (arguments->brute)
        gt_condenseq_creator_enable_brute_force(ces_c);
      if (!arguments->diags)
        gt_condenseq_creator_disable_diagonals(ces_c);
      if (arguments->full_diags)
        gt_condenseq_creator_enable_full_diagonals(ces_c);
      if (arguments->clean_percent != GT_UNDEF_UINT)
        gt_condenseq_creator_set_diags_clean_limit(ces_c,
                                                   arguments->clean_percent);

      had_err = gt_condenseq_creator_create(ces_c,
                                            arguments->indexname,
                                            arguments->input_es,
                                            logger, kdb_logger, err);

      gt_condenseq_creator_delete(ces_c);
    }
  }

  gt_logger_delete(logger);
  gt_logger_delete(kdb_logger);
  /* only non-NULL if -kdb was set and the file could be opened */
  if (kmer_fp != NULL)
    gt_fa_fclose(kmer_fp);
  return had_err;
}