/* Create a new, empty base/quality distribution for the alphabet 'alpha'.
   The count table gets HCR_HIGHESTQUALVALUE + 1 rows and one column per
   alphabet symbol (as reported by gt_alphabet_size()); the last column index
   is recorded as the wildcard index. 'min_qual' is initialized with the
   highest and 'max_qual' with the lowest quality value. The bounds of
   'qrange' are stored with checked assignments. The pointer 'alpha' is
   stored as is. Returns the newly allocated distribution. */
static GtBaseQualDistr* hcr_base_qual_distr_new(GtAlphabet *alpha,
                                                GtQualRange qrange)
{
  const unsigned num_of_symbols = gt_alphabet_size(alpha);
  GtBaseQualDistr *new_distr = gt_calloc((size_t) 1, sizeof (*new_distr));

  gt_array2dim_calloc(new_distr->distr, HCR_HIGHESTQUALVALUE + 1UL,
                      num_of_symbols);

  /* table geometry */
  new_distr->nrows = HCR_HIGHESTQUALVALUE + 1U;
  new_distr->ncols = num_of_symbols;
  new_distr->wildcard_indx = num_of_symbols - 1;
  new_distr->qual_offset = HCR_LOWESTQUALVALUE;

  /* observed quality bounds start out inverted */
  new_distr->max_qual = HCR_LOWESTQUALVALUE;
  new_distr->min_qual = HCR_HIGHESTQUALVALUE;

  /* requested quality range */
  gt_safe_assign(new_distr->qrange_start, qrange.start);
  gt_safe_assign(new_distr->qrange_end, qrange.end);

  new_distr->alpha = alpha;
  return new_distr;
}
/* Write one (base, quality, frequency) record for 'symbol' to the output
   stream stored in 'pt', which must point to a WriteNodeInfo.

   'symbol' is split into a base code (symbol modulo alphabet size) and a
   quality (symbol divided by alphabet size, plus info->qual_offset). The
   base code equal to alphabet size - 1 is mapped to WILDCARD before it is
   decoded; the decoded character is written in upper case. 'code' and
   'code_length' are ignored. Always returns 0. */
static int hcr_huffman_write_base_qual_freq(GtUword symbol,
                                            GtUint64 freq,
                                            GT_UNUSED GtBitsequence code,
                                            GT_UNUSED unsigned code_length,
                                            void *pt)
{
  GtUchar base, qual;
  WriteNodeInfo *info = (WriteNodeInfo*) pt;
  /* query the alphabet size once instead of for every use */
  const unsigned alpha_size = gt_alphabet_size(info->alpha);

  gt_safe_assign(base, (symbol % alpha_size));
  /* compare as unsigned values so that the subtraction is not applied to an
     already truncated GtUchar */
  if ((unsigned) base == alpha_size - 1U)
    base = (GtUchar) WILDCARD;
  /* toupper() requires a value representable as unsigned char (or EOF);
     passing a plain, possibly negative char is undefined behaviour */
  gt_safe_assign(base,
                 (toupper((unsigned char)
                          gt_alphabet_decode(info->alpha, base))));
  gt_xfwrite_one(&base, info->output);

  gt_safe_assign(qual, (symbol / alpha_size + info->qual_offset));
  gt_xfwrite_one(&qual, info->output);

  gt_xfwrite_one(&freq, info->output);
  return 0;
}
/* Create a base/quality distribution from the data stored in 'fp'.

   Layout as read by this function: one GtUword holding the number of
   records, followed by that many records consisting of a character (char),
   a quality (unsigned char) and a frequency (GtUint64). Each character is
   encoded with 'alpha'; the WILDCARD code is mapped to the last column of
   the table. The smallest and largest quality seen are recorded and the
   distribution is passed to hcr_base_qual_distr_trim() before it is
   returned.

   The struct is zero-initialized and 'alpha' is stored, so that no member is
   left indeterminate (the quality range members are not known here and stay
   0). */
static GtBaseQualDistr* hcr_base_qual_distr_new_from_file(FILE *fp,
                                                          GtAlphabet *alpha)
{
  GtBaseQualDistr *bqd;
  char read_char_code;
  GtUchar cur_char_code;
  unsigned char cur_qual;
  unsigned alpha_size,
           min_qual = HCR_HIGHESTQUALVALUE,
           max_qual = HCR_LOWESTQUALVALUE;
  GtUword numofleaves,
          i;
  GtUint64 cur_freq;
  /* only inspected by assertions, hence possibly unused */
  GT_UNUSED size_t read,
            one = (size_t) 1;

  alpha_size = gt_alphabet_size(alpha);
  bqd = gt_calloc((size_t) 1, sizeof (*bqd));
  gt_array2dim_calloc(bqd->distr, HCR_HIGHESTQUALVALUE + 1UL, alpha_size);
  bqd->ncols = alpha_size;
  bqd->nrows = HCR_HIGHESTQUALVALUE + 1U;
  bqd->qual_offset = HCR_LOWESTQUALVALUE;
  bqd->wildcard_indx = alpha_size - 1;
  bqd->alpha = alpha;

  read = gt_xfread_one(&numofleaves, fp);
  gt_assert(read == one);
  for (i = 0; i < numofleaves; i++) {
    read = gt_xfread_one(&read_char_code, fp);
    gt_assert(read == one);
    read = gt_xfread_one(&cur_qual, fp);
    gt_assert(read == one);
    read = gt_xfread_one(&cur_freq, fp);
    gt_assert(read == one);

    cur_char_code = gt_alphabet_encode(alpha, read_char_code);
    if (cur_char_code == (GtUchar) WILDCARD)
      gt_safe_assign(cur_char_code, bqd->wildcard_indx);

    /* both indices originate from the file: make sure they address a cell
       inside the table before writing to it */
    gt_assert((unsigned) cur_qual < bqd->nrows);
    gt_assert((unsigned) cur_char_code < bqd->ncols);
    bqd->distr[cur_qual][cur_char_code] = cur_freq;

    if ((unsigned) cur_qual > max_qual)
      max_qual = cur_qual;
    if ((unsigned) cur_qual < min_qual)
      min_qual = cur_qual;
  }

  bqd->min_qual = min_qual;
  bqd->max_qual = max_qual;
  hcr_base_qual_distr_trim(bqd);
  return bqd;
}
/* Allocate and set up an iterator over the byte range
   [start_of_encoding, end_of_encoding) of the file named by 'filename'.
   The iterator stores the pointer returned by gt_str_get(filename) without
   copying it. One chunk covers HCR_PAGES_PER_CHUNK pages of gt_pagesize()
   bytes; 'start_of_encoding' is asserted to be a multiple of the page size.
   No data is mapped yet ('data' is NULL). */
static HcrHuffDataIterator *decoder_init_data_iterator(
                                                    GtWord start_of_encoding,
                                                    GtWord end_of_encoding,
                                                    const GtStr *filename)
{
  HcrHuffDataIterator *iter = gt_malloc(sizeof (*iter));

  /* source file and byte range */
  iter->path = gt_str_get(filename);
  iter->start = (size_t) start_of_encoding;
  iter->pos = iter->start;
  iter->end = (size_t) end_of_encoding;

  /* chunk geometry */
  iter->pages_per_chunk = HCR_PAGES_PER_CHUNK;
  iter->pagesize = gt_pagesize();
  gt_assert(iter->start % iter->pagesize == 0);
  iter->blocksize = iter->pagesize * iter->pages_per_chunk;
  gt_safe_assign(iter->bitseq_per_chunk,
                 (iter->blocksize / sizeof (GtBitsequence)));

  iter->data = NULL;
  return iter;
}
/* Provide the next chunk of encoded data from the HcrHuffDataIterator passed
   as 'meminfo'.

   Any previously mapped chunk is unmapped first. While the iterator position
   is below its end, the next 'blocksize' bytes are mapped, the position is
   advanced, and 'bits'/'length' are set to the mapping and the number of
   GtBitsequence elements in it (shortened for the last chunk if it extends
   past the end position); 1 is returned. Otherwise 'bits' is set to NULL,
   'length' to 0 and 0 is returned. 'offset' and 'pad_length' are always set
   to 0. */
static int get_next_file_chunk_for_huffman(GtBitsequence **bits,
                                           GtUword *length,
                                           GtUword *offset,
                                           GtUword *pad_length,
                                           void *meminfo)
{
  const int empty = 0,
            success = 1;
  HcrHuffDataIterator *iter;

  gt_assert(meminfo);
  gt_assert(bits && length && offset && pad_length);
  iter = (HcrHuffDataIterator*) meminfo;
  gt_log_log("pos in iter: "GT_WU"", (GtUword) iter->pos);

  /* common to both outcomes: drop the old mapping, reset offset/padding */
  gt_fa_xmunmap(iter->data);
  iter->data = NULL;
  *offset = 0;
  *pad_length = 0;

  if (!(iter->pos < iter->end)) {
    /* nothing left to hand out */
    *bits = NULL;
    *length = 0;
    return empty;
  }

  iter->data = gt_fa_xmmap_read_range(iter->path,
                                      (size_t) iter->blocksize,
                                      iter->pos);
  iter->pos += iter->blocksize;
  if (iter->pos > iter->end) {
    /* last chunk: only the part up to the end position counts */
    gt_safe_assign(*length, (iter->blocksize - (iter->pos - iter->end)));
    *length /= sizeof (GtBitsequence);
  }
  else {
    *length = iter->bitseq_per_chunk;
  }
  *bits = iter->data;
  return success;
}
/* Binary search in condenseq->uniques for 'position'.
   Returns the index of the element the search ends on if its orig_startpos
   is <= position, and condenseq->udb_nelems if the search ends before the
   first element. The search presumes that the elements are ordered by
   ascending orig_startpos (not checked here). 'condenseq' must be non-NULL
   and contain at least one unique element. */
GtUword gt_condenseq_uniques_position_binsearch(const GtCondenseq *condenseq,
                                                GtUword position)
{
  GtWord left = (GtWord) -1,
         right,
         mid;

  gt_assert(condenseq && condenseq->udb_nelems > 0);
  gt_safe_assign(right, condenseq->udb_nelems);

  /* 'left' and 'right' are exclusive borders of the remaining candidates */
  while (right - left > (GtWord) 1) {
    mid = GT_DIV2(left + right);
    if (position < condenseq->uniques[mid].orig_startpos)
      right = mid;
    else
      left = mid;
  }
  mid = GT_DIV2(left + right);

  /* 'left' still being -1 means no element was accepted; the element is
     only looked at if the index is valid */
  if (left > (GtWord) -1 &&
      condenseq->uniques[mid].orig_startpos <= position)
    return (GtUword) mid;
  return condenseq->udb_nelems;
}
/* Encode all reads of hcr_enc->files and write them to 'fp', starting at
   hcr_enc->seq_encoder->start_of_encoding. After the reads, the position of
   the stream is stored as start of the sampling table and the sampling table
   is written.

   For every read, hcr_write_seq() is first called in counting mode; if
   gt_sampling_is_next_element_sample() requests a new sample, the bit stream
   is flushed and advanced and the sample (file position, read number) is
   added before the read is actually written. Page/read counters used for
   the sampling decision are updated after each written read.

   Returns 0 on success; on failure a negative value is returned and 'err' is
   set. The check that the number of written reads equals
   hcr_enc->num_of_reads is only performed if no error occurred, so that a
   failure while iterating is reported to the caller instead of tripping the
   assertion. */
static int hcr_write_seqs(FILE *fp, GtHcrEncoder *hcr_enc, GtError *err)
{
  int had_err = 0,
      seqit_err = 0;
  GtUword bits_to_write = 0,
          len,
          read_counter = 0,
          page_counter = 0,
          bits_left_in_page,
          cur_read = 0;
  GtWord filepos;
  GtSeqIterator *seqit;
  const GtUchar *seq,
                *qual;
  char *desc;
  GtBitOutStream *bitstream;

  gt_error_check(err);
  gt_assert(hcr_enc->seq_encoder->sampling);

  gt_safe_assign(bits_left_in_page, (hcr_enc->pagesize * 8));

  gt_xfseek(fp, hcr_enc->seq_encoder->start_of_encoding, SEEK_SET);
  bitstream = gt_bitoutstream_new(fp);

  seqit = gt_seq_iterator_fastq_new(hcr_enc->files, err);
  if (!seqit) {
    gt_assert(gt_error_is_set(err));
    had_err = -1;
  }

  if (!had_err) {
    gt_seq_iterator_set_quality_buffer(seqit, &qual);
    gt_seq_iterator_set_symbolmap(seqit,
                          gt_alphabet_symbolmap(hcr_enc->seq_encoder->alpha));
    hcr_enc->seq_encoder->total_num_of_symbols = 0;

    while (!had_err &&
           (seqit_err = gt_seq_iterator_next(seqit, &seq, &len, &desc,
                                             err)) == 1) {
      /* count the bits */
      bits_to_write = hcr_write_seq(hcr_enc->seq_encoder, seq, qual, len,
                                    bitstream, true);

      /* check if a new sample has to be added */
      if (gt_sampling_is_next_element_sample(hcr_enc->seq_encoder->sampling,
                                             page_counter,
                                             read_counter,
                                             bits_to_write,
                                             bits_left_in_page)) {
        gt_bitoutstream_flush_advance(bitstream);

        filepos = gt_bitoutstream_pos(bitstream);
        if (filepos < 0) {
          had_err = -1;
          gt_error_set(err, "error by ftell: %s", strerror(errno));
        }
        else {
          gt_sampling_add_sample(hcr_enc->seq_encoder->sampling,
                                 (size_t) filepos,
                                 cur_read);
          read_counter = 0;
          page_counter = 0;
          gt_safe_assign(bits_left_in_page, (hcr_enc->pagesize * 8));
        }
      }

      if (!had_err) {
        /* do the writing */
        bits_to_write = hcr_write_seq(hcr_enc->seq_encoder, seq, qual, len,
                                      bitstream, false);

        /* update counter for sampling */
        while (bits_left_in_page < bits_to_write) {
          page_counter++;
          bits_to_write -= bits_left_in_page;
          gt_safe_assign(bits_left_in_page, (hcr_enc->pagesize * 8));
        }
        bits_left_in_page -= bits_to_write;
        /* always set first page as written */
        if (page_counter == 0)
          page_counter++;
        read_counter++;
        hcr_enc->seq_encoder->total_num_of_symbols += len;
        cur_read++;
      }
    }

    /* a non-zero result of the iterator after the loop is an error */
    if (!had_err && seqit_err) {
      had_err = seqit_err;
      gt_assert(gt_error_is_set(err));
    }
    /* the read count can only be expected to match after a complete run */
    gt_assert(had_err || hcr_enc->num_of_reads == cur_read);
  }

  if (!had_err) {
    gt_bitoutstream_flush(bitstream);
    filepos = gt_bitoutstream_pos(bitstream);
    if (filepos < 0) {
      had_err = -1;
      gt_error_set(err, "error by ftell: %s", strerror(errno));
    }
    else {
      hcr_enc->seq_encoder->startofsamplingtab = filepos;
      gt_log_log("start of samplingtab: "GT_WU"",
                 hcr_enc->seq_encoder->startofsamplingtab);
      if (hcr_enc->seq_encoder->sampling != NULL)
        gt_sampling_write(hcr_enc->seq_encoder->sampling, fp);
    }
  }

  gt_bitoutstream_delete(bitstream);
  gt_seq_iterator_delete(seqit);
  return had_err;
}
/* Validate and complete the arguments of the compress tool.

   - at least one input file is required;
   - if no name was given, it is derived from the basename of the single
     input file by removing everything from the last '.' on (with more than
     one input file a name is mandatory). If that leaves nothing (e.g. the
     file name contains no '.'), the complete basename is used, so that the
     name is never left empty;
   - the sampling method ("page", "regular", "none") selects the matching
     flag and default sampling rate; a rate of 0 is rejected for "page" and
     "regular", a rate other than 0 is rejected for "none";
   - if a quality range was given, its bounds have to fit into an unsigned
     int and are copied to arguments->qrng.

   Returns 0 on success and -1 (with 'err' set) otherwise. */
static int gt_compreads_compress_arguments_check(GT_UNUSED int rest_argc,
                                                 void *tool_arguments,
                                                 GtError *err)
{
  int had_err = 0;
  GtCsrHcrEncodeArguments *arguments = tool_arguments;
  GtSplitter *splitter = NULL;
  GtStr *buffer;

  gt_error_check(err);
  gt_assert(arguments);

  if (gt_str_array_size(arguments->files) == 0) {
    gt_error_set(err, "option \"-files\" is mandatory and requires"
                      " at least one filename as argument!");
    had_err = -1;
  }

  if (!had_err) {
    if (gt_str_length(arguments->name) == 0) {
      if (gt_str_array_size(arguments->files) > 1UL) {
        gt_error_set(err, "option \"-name\" needs to be specified"
                          " if more than one file is given");
        had_err = -1;
      }
      else {
        GtUword i,
                ntokens;
        char *basename;

        splitter = gt_splitter_new();
        basename = gt_basename(gt_str_array_get(arguments->files, 0));
        /* the splitter works on a copy, 'basename' itself stays intact */
        buffer = gt_str_new_cstr(basename);
        gt_splitter_split(splitter, gt_str_get(buffer),
                          gt_str_length(buffer), '.');
        ntokens = gt_splitter_size(splitter);
        /* join all tokens but the last one; 'i + 1 < ntokens' cannot wrap
           around for 0 or 1 tokens, unlike 'ntokens - 1' */
        for (i = 0; i + 1 < ntokens; i++) {
          if (i > 0)
            gt_str_append_char(arguments->name, '.');
          gt_str_append_cstr(arguments->name,
                             gt_splitter_get_token(splitter, i));
        }
        /* no usable prefix: fall back to the complete basename */
        if (gt_str_length(arguments->name) == 0)
          gt_str_append_cstr(arguments->name, basename);
        gt_free(basename);
        gt_splitter_delete(splitter);
        gt_str_delete(buffer);
      }
    }
  }

  if (!had_err) {
    char *sampling_type = gt_str_get(arguments->method);
    static const char *methods[] = { "page", "regular", "none" };

    if (!strcmp(methods[0], sampling_type)) {
      arguments->pagewise = true;
      if (arguments->srate == GT_UNDEF_UWORD)
        arguments->srate = GT_SAMPLING_DEFAULT_PAGE_RATE;
      else if (arguments->srate == 0) {
        gt_error_set(err, "page sampling was chosen, but sampling"
                          " rate was set to "GT_WU"! this seems wrong.",
                     arguments->srate);
        had_err = -1;
      }
    }
    else if (!strcmp(methods[1], sampling_type)) {
      arguments->regular = true;
      if (arguments->srate == GT_UNDEF_UWORD)
        arguments->srate = GT_SAMPLING_DEFAULT_REGULAR_RATE;
      else if (arguments->srate == 0) {
        gt_error_set(err, "regular sampling was chosen, but sampling rate "
                          " was set to "GT_WU"! this seems wrong.",
                     arguments->srate);
        had_err = -1;
      }
    }
    else if (!strcmp(methods[2], sampling_type)) {
      if (arguments->srate == GT_UNDEF_UWORD)
        arguments->srate = 0;
      else if (arguments->srate != 0) {
        gt_error_set(err, "no sampling was chosen, but sampling rate was"
                          " set to "GT_WU"! this seems wrong.",
                     arguments->srate);
        had_err = -1;
      }
    }
    else {
      gt_error_set(err, "somethings wrong with the stype option");
      had_err = -1;
    }
  }

  if (!had_err) {
    if (arguments->arg_range.start != GT_UNDEF_UWORD) {
      if (arguments->arg_range.start <= (GtUword) UINT_MAX) {
        gt_safe_assign(arguments->qrng.start, arguments->arg_range.start);
        if (arguments->arg_range.end <= (GtUword) UINT_MAX)
          gt_safe_assign(arguments->qrng.end, arguments->arg_range.end);
        else
          had_err = -1;
      }
      else
        had_err = -1;
    }
    if (had_err)
      gt_error_set(err, "Range for qualities: value to large! larger than %u",
                   UINT_MAX);
  }
  return had_err;
}
/* Runner of the condenseq compress tool.

   Loads the encoded sequence named by argv[parsed_args], derives all
   parameters that were left undefined, checks the parameters for
   consistency, and creates the condenseq index via a GtCondenseqCreator.
   If arguments->indexname is empty, the basename of the input is used.
   With arguments->kdb set, the k-mer database logger writes to the file
   "kmer_db.out"; failing to open that file is reported as an error.

   Derivation of undefined parameters (in this order):
     minalignlength = initsize / 3       (if initsize is defined)
     windowsize     = minalignlength / 5 (if minalignlength is defined)
     windowsize is raised to at least 4
     kmersize       = rounded logarithm of 100000 to base alphabet size
     windowsize     = 5 * kmersize
     minalignlength = 3 * windowsize
     initsize       = 3 * minalignlength

   Returns 0 on success, a negative value otherwise ('err' is set). */
static int gt_condenseq_compress_runner(GT_UNUSED int argc,
                                        const char **argv,
                                        int parsed_args,
                                        void *tool_arguments,
                                        GtError *err)
{
  GtCondenseqCompressArguments *arguments = tool_arguments;
  GtLogger *logger,
           *kdb_logger;
  FILE *kmer_fp = NULL;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);
  kdb_logger = gt_logger_new(arguments->kdb, GT_LOGGER_DEFLT_PREFIX, stderr);
  if (arguments->kdb) {
    kmer_fp = gt_fa_fopen("kmer_db.out", "w", err);
    /* do not continue with an error set in 'err' and a NULL log target */
    if (kmer_fp == NULL)
      had_err = -1;
    else
      gt_logger_set_target(kdb_logger, kmer_fp);
  }

  if (gt_str_length(arguments->indexname) == 0UL) {
    char *basenameptr;
    basenameptr = gt_basename(argv[parsed_args]);
    gt_str_set(arguments->indexname, basenameptr);
    gt_free(basenameptr);
  }

  if (!had_err) {
    GtEncseqLoader *es_l = gt_encseq_loader_new();
    arguments->input_es = gt_encseq_loader_load(es_l, argv[parsed_args], err);
    if (arguments->input_es == NULL)
      had_err = -1;
    gt_encseq_loader_delete(es_l);
  }

  if (!had_err) {
    if (arguments->minalignlength == GT_UNDEF_UWORD)
      arguments->minalignlength = arguments->initsize != GT_UNDEF_UWORD ?
                                  arguments->initsize / (GtUword) 3UL :
                                  GT_UNDEF_UWORD;
    if (arguments->windowsize == GT_UNDEF_UINT)
      arguments->windowsize = arguments->minalignlength != GT_UNDEF_UWORD ?
                              (unsigned int) (arguments->minalignlength / 5U) :
                              GT_UNDEF_UINT;
    if (arguments->windowsize < 4U)
      arguments->windowsize = 4U;
    if (arguments->kmersize == GT_UNDEF_UINT) {
      unsigned int size =
        gt_alphabet_num_of_chars(gt_encseq_alphabet(arguments->input_es));
      /* size^k ~= 100000 */
      gt_safe_assign(arguments->kmersize,
                     gt_round_to_long(gt_log_base(100000.0, (double) size)));
      gt_logger_log(logger, "|A|: %u, k: %u", size, arguments->kmersize);
    }
    if (arguments->windowsize == GT_UNDEF_UINT) {
      arguments->windowsize = 5U * arguments->kmersize;
    }
    if (arguments->minalignlength == GT_UNDEF_UWORD) {
      arguments->minalignlength = (GtUword) (3UL * arguments->windowsize);
    }
    if (arguments->initsize == GT_UNDEF_UWORD) {
      arguments->initsize = (GtUword) (3UL * arguments->minalignlength);
    }
  }

  if (!had_err && arguments->windowsize <= arguments->kmersize) {
    gt_error_set(err, "-windowsize (%u) must be larger -kmersize (%u)!",
                 arguments->windowsize, arguments->kmersize);
    had_err = -1;
  }
  if (!had_err &&
      arguments->minalignlength < (GtUword) arguments->windowsize) {
    gt_error_set(err, "-alignlength (" GT_WU ") must be at least "
                      "-windowsize (%u)!",
                 arguments->minalignlength, arguments->windowsize);
    had_err = -1;
  }
  if (!had_err && (arguments->initsize < arguments->minalignlength)) {
    gt_error_set(err, "-initsize (" GT_WU ") must be at least "
                      "-alignlength (" GT_WU ")!",
                 arguments->initsize, arguments->minalignlength);
    had_err = -1;
  }

  if (!had_err) {
    GtCondenseqCreator *ces_c =
      gt_condenseq_creator_new(arguments->initsize,
                               arguments->minalignlength,
                               arguments->xdrop,
                               &(arguments->scores),
                               arguments->kmersize,
                               arguments->windowsize,
                               logger,
                               err);
    if (ces_c == NULL)
      had_err = -1;

    if (!had_err) {
      /* cutoff: undefined selects the mean cutoff, 0 disables it */
      if (arguments->cutoff_value == GT_UNDEF_UWORD)
        gt_condenseq_creator_use_mean_cutoff(ces_c);
      else if (arguments->cutoff_value == 0)
        gt_condenseq_creator_disable_cutoff(ces_c);
      else
        gt_condenseq_creator_set_cutoff(ces_c, arguments->cutoff_value);

      gt_condenseq_creator_set_mean_fraction(ces_c, arguments->fraction);

      if (arguments->prune)
        gt_condenseq_creator_disable_prune(ces_c);
      if (arguments->brute)
        gt_condenseq_creator_enable_brute_force(ces_c);
      if (!arguments->diags)
        gt_condenseq_creator_disable_diagonals(ces_c);
      if (arguments->full_diags)
        gt_condenseq_creator_enable_full_diagonals(ces_c);
      if (arguments->clean_percent != GT_UNDEF_UINT)
        gt_condenseq_creator_set_diags_clean_limit(ces_c,
                                                   arguments->clean_percent);

      had_err = gt_condenseq_creator_create(ces_c,
                                            arguments->indexname,
                                            arguments->input_es,
                                            logger, kdb_logger, err);

      gt_condenseq_creator_delete(ces_c);
    }
  }

  gt_logger_delete(logger);
  gt_logger_delete(kdb_logger);
  /* only non-NULL if the k-mer database log file was opened above */
  if (kmer_fp != NULL)
    gt_fa_fclose(kmer_fp);
  return had_err;
}