Exemple #1
0
/* Builds the index files named <bioseq_indexname> for the sequence data of
   <bs> by running a GtEncseqEncoder with description, MD5, multiseq and
   lossless support enabled.
   If <bs> reads from stdin, the input is first copied into a temporary file,
   because GtEncseq cannot be built from stdin directly.
   Returns the result of gt_encseq_encoder_encode(), or -1 (with <err> set) if
   copying stdin to the temporary file failed. */
static int construct_bioseq_files(GtBioseq *bs, GtStr *bioseq_indexname,
                                  GtError *err)
{
  GtStr *sequence_filename;
  GtEncseqEncoder *ee;
  GtStrArray *indexfn;
  int had_err = 0;

  gt_error_check(err);

  /* register the signal handler to remove incomplete files upon termination */
  if (!bs->use_stdin) {
    gt_bioseq_index_filename = gt_str_get(bs->sequence_file);
    gt_sig_register_all(remove_bioseq_files);
  }

  /* if stdin is used as input, we need to create a tempfile containing the
     sequence as GtEncseq cannot be built from stdin directly */
  if (bs->use_stdin) {
    FILE *tmpfile;
    char buf[BUFSIZ];
    size_t nread; /* fread() returns size_t, so do not narrow it to int */

    sequence_filename = gt_str_new();
    tmpfile = gt_xtmpfp(sequence_filename);
    gt_assert(tmpfile);
    while ((nread = fread(buf, 1, sizeof buf, stdin)) > 0) {
      /* a short write would silently truncate the sequence data */
      if (fwrite(buf, 1, nread, tmpfile) != nread) {
        gt_error_set(err, "could not write sequence data to temporary file");
        had_err = -1;
        break;
      }
    }
    /* fread() returns 0 on both EOF and error; tell the two apart */
    if (!had_err && ferror(stdin)) {
      gt_error_set(err, "could not read sequence data from stdin");
      had_err = -1;
    }
    gt_fa_xfclose(tmpfile);
  } else {
    sequence_filename = gt_str_ref(bs->sequence_file);
  }
  gt_assert(gt_str_length(sequence_filename) > 0);

  /* had_err can only be set here in the stdin case, in which no signal
     handler has been registered, so skipping this block skips no cleanup */
  if (!had_err) {
    ee = gt_encseq_encoder_new();
    gt_encseq_encoder_enable_description_support(ee);
    gt_encseq_encoder_enable_md5_support(ee);
    gt_encseq_encoder_enable_multiseq_support(ee);
    gt_encseq_encoder_enable_lossless_support(ee);
    indexfn = gt_str_array_new();
    gt_str_array_add(indexfn, sequence_filename);
    had_err = gt_encseq_encoder_encode(ee, indexfn,
                                       gt_str_get(bioseq_indexname), err);
    /* unregister the signal handler */
    if (!bs->use_stdin)
      gt_sig_unregister_all();

    gt_str_array_delete(indexfn);
    gt_encseq_encoder_delete(ee);
  }
  gt_str_delete(sequence_filename);
  return had_err;
}
Exemple #2
0
/* Creates a new GtHcrEncoder for the FASTQ files listed in <files>, using the
   alphabet <alpha> and the quality range <qrange>.
   If <descs> is true, a description encoder is created as well. If <timer> is
   not NULL, progress is reported to stdout (and the timer is handed to the
   description encoder).
   All files are scanned once to verify that the reads within a file have equal
   length and to collect the <base, quality> distribution from which the
   Huffman code is built.
   <files> is stored in the encoder as given (no reference is taken here).
   Returns the new encoder, or NULL on failure; on failure everything allocated
   by this function is released again. */
GtHcrEncoder *gt_hcr_encoder_new(GtStrArray *files, GtAlphabet *alpha,
                                 bool descs, GtQualRange qrange, GtTimer *timer,
                                 GtError *err)
{
    GtBaseQualDistr *bqd;
    GtHcrEncoder *hcr_enc;
    GtSeqIterator *seqit;
    GtStrArray *file;
    int had_err = 0,
        status;
    GtUword len1,
            len2,
            i,
            num_of_reads;
    const GtUchar *seq,
          *qual;
    char *desc;

    gt_error_check(err);
    gt_assert(alpha && files);

    if (timer != NULL)
        gt_timer_show_progress(timer, "get <base,qual> distr", stdout);

    /* a defined quality range must not be empty */
    if (qrange.start != GT_UNDEF_UINT && qrange.start == qrange.end) {
        gt_error_set(err, "qrange.start must unequal qrange.end");
        return NULL;
    }

    hcr_enc = gt_malloc(sizeof (GtHcrEncoder));
    hcr_enc->files = files;
    hcr_enc->num_of_files = gt_str_array_size(files);
    hcr_enc->num_of_reads = 0;
    hcr_enc->page_sampling = false;
    hcr_enc->regular_sampling = false;
    hcr_enc->sampling_rate = 0;
    hcr_enc->pagesize = gt_pagesize();
    if (descs) {
        hcr_enc->encdesc_encoder = gt_encdesc_encoder_new();
        if (timer != NULL)
            gt_encdesc_encoder_set_timer(hcr_enc->encdesc_encoder, timer);
    }
    else
        hcr_enc->encdesc_encoder = NULL;

    hcr_enc->seq_encoder = gt_malloc(sizeof (GtHcrSeqEncoder));
    hcr_enc->seq_encoder->alpha = alpha;
    hcr_enc->seq_encoder->sampling = NULL;
    hcr_enc->seq_encoder->fileinfos = gt_calloc((size_t) hcr_enc->num_of_files,
                                      sizeof (*(hcr_enc->seq_encoder->fileinfos)));
    hcr_enc->seq_encoder->qrange = qrange;
    bqd = hcr_base_qual_distr_new(alpha, qrange);

    /* check if reads in the same file are of same length and get
       <base, quality> pair distribution; stop at the first failing file so
       that no further call is made with an already set error object */
    for (i = 0; !had_err && i < hcr_enc->num_of_files; i++) {
        /* per-file state: a file without reads must not inherit the read
           count or read length of the previous file */
        num_of_reads = 0;
        len1 = 0;

        file = gt_str_array_new();
        gt_str_array_add(file, gt_str_array_get_str(files, i));
        seqit = gt_seq_iterator_fastq_new(file, err);
        if (!seqit) {
            gt_error_set(err, "cannot initialize GtSeqIteratorFastQ object");
            had_err = -1;
        }
        if (!had_err) {
            gt_seq_iterator_set_symbolmap(seqit, gt_alphabet_symbolmap(alpha));
            gt_seq_iterator_set_quality_buffer(seqit, &qual);
            status = gt_seq_iterator_next(seqit, &seq, &len1, &desc, err);

            if (status == 1) {
                /* NOTE(review): the first read of each file is counted but
                   never passed to hcr_base_qual_distr_add(); only the reads
                   fetched inside the loop below are. Confirm whether this is
                   intended before changing it. */
                num_of_reads = 1UL;
                while (!had_err) {
                    status = gt_seq_iterator_next(seqit, &seq, &len2, &desc,
                                                  err);
                    if (status == -1)
                        had_err = -1;
                    if (status != 1)
                        break;
                    if (len2 != len1) {
                        gt_error_set(err, "reads have to be of equal length");
                        had_err = -1;
                        break;
                    }
                    if (hcr_base_qual_distr_add(bqd, qual, seq, len1) != 0)
                        had_err = -1;
                    len1 = len2;
                    num_of_reads++;
                }
            }
            else if (status == -1)
                had_err = -1;

            if (!had_err) {
                /* readnum holds the cumulative read count up to this file */
                if (i == 0)
                    hcr_enc->seq_encoder->fileinfos[i].readnum = num_of_reads;
                else
                    hcr_enc->seq_encoder->fileinfos[i].readnum =
                        hcr_enc->seq_encoder->fileinfos[i - 1].readnum
                        + num_of_reads;
                hcr_enc->seq_encoder->fileinfos[i].readlength = len1;
            }
        }
        hcr_enc->num_of_reads += num_of_reads;
        gt_str_array_delete(file);
        gt_seq_iterator_delete(seqit);
    }

    if (had_err) {
        /* release everything allocated above; <files> is owned by the caller */
        hcr_base_qual_distr_delete(bqd);
        gt_free(hcr_enc->seq_encoder->fileinfos);
        gt_free(hcr_enc->seq_encoder);
        if (hcr_enc->encdesc_encoder != NULL)
            gt_encdesc_encoder_delete(hcr_enc->encdesc_encoder);
        gt_free(hcr_enc);
        return NULL;
    }

    hcr_base_qual_distr_trim(bqd);

    if (timer != NULL)
        gt_timer_show_progress(timer, "build huffman tree for sequences and"
                               " qualities", stdout);
    hcr_enc->seq_encoder->huffman =
        gt_huffman_new(bqd,
                       hcr_base_qual_distr_func,
                       (GtUword) bqd->ncols * bqd->nrows);

    hcr_enc->seq_encoder->qual_offset = bqd->qual_offset;
    hcr_base_qual_distr_delete(bqd);
    return hcr_enc;
}
/* Makes sure that <rm> holds a sequence collection usable for <seqid>.
   With a mapping, the sequence file belonging to <seqid> is looked up and
   (re)loaded whenever no file is loaded yet or <seqid> differs from the
   sequence name loaded last. Without a mapping, the sequence collection is
   created once from the encseq or the sequence filenames, and the
   seqid-to-seqnum mapping is rebuilt if descriptions are used.
   Returns 0 on success and -1 on failure. */
static int update_seq_col_if_necessary(GtRegionMapping *rm, GtStr *seqid,
                                       GtError *err)
{
  size_t md5_skip;

  gt_error_check(err);
  gt_assert(rm && seqid);

  if (!rm->mapping) {
    /* no mapping: just make sure the seqcol is loaded */
    if (!rm->seq_col) {
      if (rm->encseq)
        rm->seq_col = gt_encseq_col_new(rm->encseq, err);
      else {
        gt_assert(rm->sequence_filenames);
        rm->seq_col = gt_bioseq_col_new(rm->sequence_filenames, err);
      }
      if (!rm->seq_col)
        return -1;
    }
    if (rm->usedesc) {
      /* replace a previously built seqid-to-seqnum mapping */
      if (rm->seqid2seqnum_mapping)
        gt_seqid2seqnum_mapping_delete(rm->seqid2seqnum_mapping);
      rm->seqid2seqnum_mapping =
                           gt_seqid2seqnum_mapping_new_seqcol(rm->seq_col, err);
      if (!rm->seqid2seqnum_mapping)
        return -1;
    }
    return 0;
  }

  /* mapping in use: nothing to do if the right sequence is already loaded */
  if (rm->sequence_file && !gt_str_cmp(rm->sequence_name, seqid))
    return 0;

  gt_str_delete(rm->sequence_file);
  /* ignore MD5 hashes when using region mappings: skip over the prefix */
  md5_skip = gt_md5_seqid_has_prefix(gt_str_get(seqid))
               ? GT_MD5_SEQID_TOTAL_LEN
               : 0;
  rm->sequence_file = region_mapping_map(rm, gt_str_get(seqid) + md5_skip, err);
  if (!rm->sequence_file)
    return -1;

  /* the filename array holds exactly the newly mapped file */
  if (rm->sequence_filenames)
    gt_str_array_reset(rm->sequence_filenames);
  else
    rm->sequence_filenames = gt_str_array_new();
  gt_str_array_add(rm->sequence_filenames, rm->sequence_file);

  /* remember for which seqid the file was loaded */
  if (rm->sequence_name)
    gt_str_reset(rm->sequence_name);
  else
    rm->sequence_name = gt_str_new();
  gt_str_append_str(rm->sequence_name, seqid);

  /* load new seqcol */
  gt_seq_col_delete(rm->seq_col);
  rm->seq_col = gt_bioseq_col_new(rm->sequence_filenames, err);
  return rm->seq_col ? 0 : -1;
}