/*
 * Build the encoded sequence index files for <bs> under the index name
 * <bioseq_indexname>.
 *
 * If <bs> reads from stdin, the input is first copied into a temporary file,
 * because a GtEncseq cannot be built from stdin directly.  Otherwise the
 * sequence file of <bs> is encoded and a signal handler is registered for the
 * duration of the encoding to remove incomplete files upon termination.
 *
 * Returns 0 on success.  Returns -1 with <err> set if copying stdin to the
 * temporary file fails; otherwise returns the result of
 * gt_encseq_encoder_encode().
 */
static int construct_bioseq_files(GtBioseq *bs, GtStr *bioseq_indexname,
                                  GtError *err)
{
  GtStr *sequence_filename;
  GtEncseqEncoder *ee;
  GtStrArray *indexfn;
  int had_err = 0;
  gt_error_check(err);

  /* register the signal handler to remove incomplete files upon termination */
  if (!bs->use_stdin) {
    gt_bioseq_index_filename = gt_str_get(bs->sequence_file);
    gt_sig_register_all(remove_bioseq_files);
  }

  /* if stdin is used as input, we need to create a tempfile containing the
     sequence as GtEncseq cannot be built from stdin directly */
  if (bs->use_stdin) {
    GtStr *tmpfilename;
    FILE *tmpfile = NULL;
    size_t nread; /* fread()/fwrite() count in size_t, not int */
    char buf[BUFSIZ];
    tmpfilename = gt_str_new();
    tmpfile = gt_xtmpfp(tmpfilename);
    gt_assert(tmpfile);
    while ((nread = fread(buf, 1, sizeof buf, stdin)) > 0) {
      /* a short write would silently truncate the sequence to be encoded */
      if (fwrite(buf, 1, nread, tmpfile) != nread) {
        gt_error_set(err, "cannot write sequence data read from stdin to "
                          "temporary file");
        had_err = -1;
        break;
      }
    }
    /* fread() returns 0 for both EOF and error; tell them apart */
    if (!had_err && ferror(stdin)) {
      gt_error_set(err, "error while reading sequence data from stdin");
      had_err = -1;
    }
    gt_fa_xfclose(tmpfile);
    sequence_filename = tmpfilename;
  }
  else {
    sequence_filename = gt_str_ref(bs->sequence_file);
  }

  if (had_err) {
    /* only reachable in the stdin case, where no signal handler has been
       registered, so there is nothing to unregister here */
    gt_str_delete(sequence_filename);
    return had_err;
  }

  gt_assert(gt_str_length(sequence_filename) > 0);
  ee = gt_encseq_encoder_new();
  gt_encseq_encoder_enable_description_support(ee);
  gt_encseq_encoder_enable_md5_support(ee);
  gt_encseq_encoder_enable_multiseq_support(ee);
  gt_encseq_encoder_enable_lossless_support(ee);
  indexfn = gt_str_array_new();
  gt_str_array_add(indexfn, sequence_filename);
  /* our reference is no longer needed once the name is in the array */
  gt_str_delete(sequence_filename);
  had_err = gt_encseq_encoder_encode(ee, indexfn,
                                     gt_str_get(bioseq_indexname), err);

  /* unregister the signal handler */
  if (!bs->use_stdin)
    gt_sig_unregister_all();

  gt_str_array_delete(indexfn);
  gt_encseq_encoder_delete(ee);
  return had_err;
}
GtHcrEncoder *gt_hcr_encoder_new(GtStrArray *files, GtAlphabet *alpha, bool descs, GtQualRange qrange, GtTimer *timer, GtError *err) { GtBaseQualDistr *bqd; GtHcrEncoder *hcr_enc; GtSeqIterator *seqit; GtStrArray *file; int had_err = 0, status; GtUword len1, len2, i, num_of_reads = 0; const GtUchar *seq, *qual; char *desc; gt_error_check(err); gt_assert(alpha && files); if (timer != NULL) gt_timer_show_progress(timer, "get <base,qual> distr", stdout); if (qrange.start != GT_UNDEF_UINT) if (qrange.start == qrange.end) { gt_error_set(err, "qrange.start must unequal qrange.end"); return NULL; } hcr_enc = gt_malloc(sizeof (GtHcrEncoder)); hcr_enc->files = files; hcr_enc->num_of_files = gt_str_array_size(files); hcr_enc->num_of_reads = 0; hcr_enc->page_sampling = false; hcr_enc->regular_sampling = false; hcr_enc->sampling_rate = 0; hcr_enc->pagesize = gt_pagesize(); if (descs) { hcr_enc->encdesc_encoder = gt_encdesc_encoder_new(); if (timer != NULL) gt_encdesc_encoder_set_timer(hcr_enc->encdesc_encoder, timer); } else hcr_enc->encdesc_encoder = NULL; hcr_enc->seq_encoder = gt_malloc(sizeof (GtHcrSeqEncoder)); hcr_enc->seq_encoder->alpha = alpha; hcr_enc->seq_encoder->sampling = NULL; hcr_enc->seq_encoder->fileinfos = gt_calloc((size_t) hcr_enc->num_of_files, sizeof (*(hcr_enc->seq_encoder->fileinfos))); hcr_enc->seq_encoder->qrange = qrange; bqd = hcr_base_qual_distr_new(alpha, qrange); /* check if reads in the same file are of same length and get <base, quality> pair distribution */ for (i = 0; i < hcr_enc->num_of_files; i++) { file = gt_str_array_new(); gt_str_array_add(file, gt_str_array_get_str(files, i)); seqit = gt_seq_iterator_fastq_new(file, err); if (!seqit) { gt_error_set(err, "cannot initialize GtSeqIteratorFastQ object"); had_err = -1; } if (!had_err) { gt_seq_iterator_set_symbolmap(seqit, gt_alphabet_symbolmap(alpha)); gt_seq_iterator_set_quality_buffer(seqit, &qual); status = gt_seq_iterator_next(seqit, &seq, &len1, &desc, err); if (status == 1) { 
num_of_reads = 1UL; while (!had_err) { status = gt_seq_iterator_next(seqit, &seq, &len2, &desc, err); if (status == -1) had_err = -1; if (status != 1) break; if (len2 != len1) { gt_error_set(err, "reads have to be of equal length"); had_err = -1; break; } if (hcr_base_qual_distr_add(bqd, qual, seq, len1) != 0) had_err = -1; len1 = len2; num_of_reads++; } } else if (status == -1) had_err = -1; if (!had_err) { if (i == 0) hcr_enc->seq_encoder->fileinfos[i].readnum = num_of_reads; else hcr_enc->seq_encoder->fileinfos[i].readnum = hcr_enc->seq_encoder->fileinfos[i - 1].readnum + num_of_reads; hcr_enc->seq_encoder->fileinfos[i].readlength = len1; } } hcr_enc->num_of_reads += num_of_reads; gt_str_array_delete(file); gt_seq_iterator_delete(seqit); } if (!had_err) hcr_base_qual_distr_trim(bqd); if (!had_err) { if (timer != NULL) gt_timer_show_progress(timer, "build huffman tree for sequences and" " qualities", stdout); hcr_enc->seq_encoder->huffman = gt_huffman_new(bqd, hcr_base_qual_distr_func, (GtUword) bqd->ncols * bqd->nrows); } if (!had_err) { hcr_enc->seq_encoder->qual_offset = bqd->qual_offset; hcr_base_qual_distr_delete(bqd); return hcr_enc; } return NULL; }
/*
 * Make sure <rm>->seq_col is the sequence collection to use for <seqid>.
 *
 * With a mapping, the sequence file for <seqid> is looked up via
 * region_mapping_map() and the collection is reloaded whenever no file is
 * cached or <seqid> differs from the cached sequence name.  Without a
 * mapping, the collection is created once from the encseq or from the
 * sequence filenames, and, if <rm>->usedesc is set, the seqid-to-seqnum
 * mapping is rebuilt from it.
 *
 * Returns 0 on success and -1 on failure.
 */
static int update_seq_col_if_necessary(GtRegionMapping *rm, GtStr *seqid,
                                       GtError *err)
{
  int had_err = 0;
  gt_error_check(err);
  gt_assert(rm && seqid);

  /* for mappings, we need to load the changed sequence, if needed... */
  if (rm->mapping) {
    if (!rm->sequence_file || (gt_str_cmp(rm->sequence_name, seqid))) {
      gt_str_delete(rm->sequence_file);
      /* ignore MD5 hashes when using region mappings */
      if (gt_md5_seqid_has_prefix(gt_str_get(seqid))) {
        /* skip the MD5 prefix and map the remainder of the seqid */
        rm->sequence_file = region_mapping_map(rm, gt_str_get(seqid)
                                                   + GT_MD5_SEQID_TOTAL_LEN,
                                               err);
      }
      else
        rm->sequence_file = region_mapping_map(rm, gt_str_get(seqid), err);

      if (!rm->sequence_file)
        had_err = -1;
      else {
        /* load new seqcol */
        if (!rm->sequence_filenames)
          rm->sequence_filenames = gt_str_array_new();
        else
          gt_str_array_reset(rm->sequence_filenames);
        gt_str_array_add(rm->sequence_filenames, rm->sequence_file);

        if (!rm->sequence_name)
          rm->sequence_name = gt_str_new();
        else
          gt_str_reset(rm->sequence_name);
        gt_str_append_str(rm->sequence_name, seqid);

        gt_seq_col_delete(rm->seq_col);
        rm->seq_col = gt_bioseq_col_new(rm->sequence_filenames, err);
        if (!rm->seq_col) {
          had_err = -1;
          /* sequence_name already equals seqid; drop the cached file so the
             next call for the same seqid retries the load instead of
             returning success with seq_col == NULL */
          gt_str_delete(rm->sequence_file);
          rm->sequence_file = NULL;
        }
      }
    }
  }
  else {
    /* ...otherwise, just make sure the seqcol is loaded */
    if (!rm->seq_col) {
      if (rm->encseq) {
        if (!(rm->seq_col = gt_encseq_col_new(rm->encseq, err)))
          had_err = -1;
      }
      else {
        gt_assert(rm->sequence_filenames);
        if (!(rm->seq_col = gt_bioseq_col_new(rm->sequence_filenames, err)))
          had_err = -1;
      }
    }
    if (!had_err && rm->usedesc) {
      /* replace any previous mapping with one built from the current seqcol */
      if (rm->seqid2seqnum_mapping)
        gt_seqid2seqnum_mapping_delete(rm->seqid2seqnum_mapping);
      rm->seqid2seqnum_mapping =
        gt_seqid2seqnum_mapping_new_seqcol(rm->seq_col, err);
      if (!rm->seqid2seqnum_mapping) {
        had_err = -1;
      }
    }
  }
  return had_err;
}