Esempio n. 1
0
/* Parse the <start> and <end> column strings taken from the current line of
   <bed_file> into <range> and convert the result from BED coordinates to
   1-based coordinates. A non-zero <offset> is afterwards applied to the range
   with gt_range_offset().
   If <thick> is false, a range of length 0 is reported as an error. If <thick>
   is true, such a range is accepted and left for the caller to handle.
   Returns the non-zero result of gt_parse_range() if parsing failed, -1 if a
   zero-length feature was rejected (<err> is set in that case), and the
   (zero) parse result otherwise. On failure no offset is applied to <range>
   and its content must not be used by the caller. */
static int parse_bed_range(GtRange *range, GtStr *start, GtStr *end,
                           GtWord offset, GtIO *bed_file, bool thick,
                           GtError *err)
{
  int had_err;
  gt_error_check(err);
  had_err = gt_parse_range(range, gt_str_get(start), gt_str_get(end),
                           gt_io_get_line_number(bed_file),
                           gt_io_get_filename(bed_file), err);
  /* Do not touch <range> if parsing failed: nothing visible here guarantees
     that it holds a meaningful value in that case. */
  if (had_err)
    return had_err;
  /* BED has a weird numbering scheme: positions are 0-based, but the end
     position is not part of the feature. Transform to 1-based coordinates. */
  range->start++;
  /* Ranges defining a 'thick' region sometimes come with length 0 to
     designate that there are no thick regions. So do not fail here and
     handle that case later. */
  if (!thick && range->start > range->end) {
    gt_error_set(err, "file \"%s\": line "GT_WU": BED feature has length 0",
                 gt_io_get_filename(bed_file),
                 gt_io_get_line_number(bed_file));
    /* bail out before the offset is applied to an inverted range */
    return -1;
  }
  if (offset)
    *range = gt_range_offset(range, offset);
  return had_err;
}
Esempio n. 2
0
/* Parse the <start> and <end> column strings taken from the current line of
   <bed_file> into <range> and convert the result from BED coordinates to
   1-based coordinates. A non-zero <offset> is afterwards applied to the range
   with gt_range_offset().
   Returns the non-zero result of gt_parse_range() if parsing failed, -1 if the
   feature has length 0 (<err> is set in that case), and the (zero) parse
   result otherwise. On failure no offset is applied to <range> and its content
   must not be used by the caller. */
static int parse_bed_range(GtRange *range, GtStr *start, GtStr *end,
                           long offset, GtIO *bed_file, GtError *err)
{
  int had_err;
  gt_error_check(err);
  had_err = gt_parse_range(range, gt_str_get(start), gt_str_get(end),
                           gt_io_get_line_number(bed_file),
                           gt_io_get_filename(bed_file), err);
  /* Do not touch <range> if parsing failed: nothing visible here guarantees
     that it holds a meaningful value in that case. */
  if (had_err)
    return had_err;
  /* BED has a weird numbering scheme: positions are 0-based, but the end
     position is not part of the feature. Transform to 1-based coordinates. */
  range->start++;
  if (range->start > range->end) {
    gt_error_set(err, "file \"%s\": line %lu: BED feature has length 0",
                 gt_io_get_filename(bed_file), gt_io_get_line_number(bed_file));
    /* bail out before the offset is applied to an inverted range */
    return -1;
  }
  if (offset)
    *range = gt_range_offset(range, offset);
  return had_err;
}
Esempio n. 3
0
/* Make sure that <sequence_regions> contains a region node covering genomic
   sequence <seqnum> of file <filenum> and record the sequence ID used for it
   in the seqid store of <srf>.

   The range of the sequence is taken from the substring specification of
   <input> if one is used, otherwise from its relative genomic range. If <srf>
   uses description ranges and the genomic description can be parsed with
   gt_parse_description_range(), the range is shifted by the parsed start
   position; otherwise it is shifted by 1 to make it 1-based.

   Three cases are distinguished:
   - <sequenceid> is empty, or it is already in use and no offset was parsed:
     a new ID "<file basename>|<seqnum + 1>" is made up and a new region node
     is created for it.
   - <sequenceid> is not in use yet: a new region node is created for it.
   - <sequenceid> is in use and an offset was parsed: the range of the
     existing region node is joined with the new range.

   The keys added to <sequence_regions> are the strings returned by
   gt_cstr_table_get() on <srf>->used_seqids. */
static void make_sequence_region(GtHashmap *sequence_regions,
                                 GtStr *sequenceid,
                                 GthRegionFactory *srf,
                                 GthInput *input,
                                 GtUword filenum,
                                 GtUword seqnum)
{
    bool offset_is_defined = false;
    GtUword seqid_offset;
    GtRange range, descrange;
    GtGenomeNode *sr = NULL;
    gt_assert(sequence_regions && sequenceid && srf && input);
    if (gth_input_use_substring_spec(input)) {
        range.start = gth_input_genomic_substring_from(input);
        range.end   = gth_input_genomic_substring_to(input);
    }
    else {
        range = gth_input_get_relative_genomic_range(input, filenum, seqnum);
    }
    if (srf->use_desc_ranges) {
        GtStr *description = gt_str_new();
        gth_input_get_genomic_description(input, description, filenum, seqnum);
        /* a zero return value means a range was found in the description */
        if (!gt_parse_description_range(gt_str_get(description), &descrange))
            offset_is_defined = true;
        gt_str_delete(description);
    }
    /* <descrange> is only read if it has been filled in above */
    seqid_offset = offset_is_defined ? descrange.start : GT_UNDEF_UWORD;
    if (offset_is_defined)
        range = gt_range_offset(&range, descrange.start);
    else
        range = gt_range_offset(&range, 1); /* 1-based */
    if (!gt_str_length(sequenceid) ||
            (gt_cstr_table_get(srf->used_seqids, gt_str_get(sequenceid)) &&
             !offset_is_defined)) {
        /* sequenceid is empty or exists already (and no offset has been parsed)
           -> make one up */
        GtStr *seqid;
        char *base;
        base = gt_basename(gth_input_get_genomic_filename(input, filenum));
        seqid = gt_str_new_cstr(base);
        gt_free(base);
        gt_str_append_char(seqid, '|');
        gt_str_append_uword(seqid, seqnum + 1); /* 1-based */
        seqid_store_add(srf->seqid_store, filenum, seqnum, seqid, GT_UNDEF_UWORD);
        gt_assert(!gt_cstr_table_get(srf->used_seqids, gt_str_get(seqid)));
        gt_cstr_table_add(srf->used_seqids, gt_str_get(seqid));
        sr = gt_region_node_new(seqid_store_get(srf->seqid_store, filenum, seqnum),
                                range.start, range.end);
        /* use the string kept by the table as key, <seqid> is deleted below */
        gt_hashmap_add(sequence_regions,
                       (void*) gt_cstr_table_get(srf->used_seqids,
                               gt_str_get(seqid)),
                       sr);
        gt_str_delete(seqid);
    }
    else {
        /* sequenceid does not exist already (or an offset has been parsed)
           -> use this one */
        if (!gt_cstr_table_get(srf->used_seqids, gt_str_get(sequenceid))) {
            /* no sequence region with this id exists -> create one */
            gt_cstr_table_add(srf->used_seqids, gt_str_get(sequenceid));
            seqid_store_add(srf->seqid_store, filenum, seqnum, sequenceid,
                            seqid_offset);
            sr = gt_region_node_new(seqid_store_get(srf->seqid_store, filenum,
                                                    seqnum), range.start, range.end);
            gt_hashmap_add(sequence_regions,
                           (void*) gt_cstr_table_get(srf->used_seqids,
                                   gt_str_get(sequenceid)),
                           sr);
        }
        else {
            GtRange prev_range, new_range;
            /* sequence region with this id exists already -> modify range */
            sr = gt_hashmap_get(sequence_regions, gt_str_get(sequenceid));
            gt_assert(sr);
            prev_range = gt_genome_node_get_range(sr);
            new_range = gt_range_join(&prev_range, &range);
            gt_genome_node_set_range(sr, &new_range);
            seqid_store_add(srf->seqid_store, filenum, seqnum, sequenceid,
                            seqid_offset);
        }
    }
    gt_assert(sr);
}