/*
 * Look up the entry of <seqid_info> which matches <inrange>.
 *
 * The entries are scanned in array order. The first entry whose description
 * range has an undefined end (GT_UNDEF_UWORD) or contains <inrange> is used:
 * its sequence number, file number and description range are stored in
 * <seqnum>, <filenum> and <outrange> and 0 is returned.
 * If no entry matches, <err> is set, the output parameters are left untouched
 * and -1 is returned. <filename> is not used.
 */
static int seqid_info_get(SeqidInfo *seqid_info, GtUword *seqnum,
                          GtUword *filenum, GtRange *outrange,
                          const GtRange *inrange,
                          GT_UNUSED const char *filename,
                          const char *seqid, GtError *err)
{
  SeqidInfoElem *seqid_info_elem;
  GtUword i;
  gt_error_check(err);
  /* <filenum> is written on every match, so it is required like the others */
  gt_assert(seqid_info && seqnum && filenum && outrange && inrange);
  for (i = 0; i < gt_array_size(seqid_info); i++) {
    seqid_info_elem = gt_array_get(seqid_info, i);
    /* an undefined end matches every requested range */
    if (seqid_info_elem->descrange.end == GT_UNDEF_UWORD ||
        gt_range_contains(&seqid_info_elem->descrange, inrange)) {
      *seqnum = seqid_info_elem->seqnum;
      *filenum = seqid_info_elem->filenum;
      *outrange = seqid_info_elem->descrange;
      return 0;
    }
  }
  gt_error_set(err,
               "cannot find a sequence with ID \"%s\" "
               "(range " GT_WU "," GT_WU ")",
               seqid, inrange->start, inrange->end);
  return -1;
}
Example #2
0
/* Returns true if <contain_range> is defined (its start differs from
   GT_UNDEF_ULONG) and does not contain the range of <fn>, false otherwise. */
static bool filter_contain_range(GtFeatureNode *fn, GtRange contain_range)
{
  GtRange fn_range;
  gt_assert(fn);
  fn_range = gt_genome_node_get_range((GtGenomeNode*) fn);
  return contain_range.start != GT_UNDEF_ULONG &&
         !gt_range_contains(&contain_range, &fn_range);
}
Example #3
0
/* Returns true if the genomic range extracted for <sa_1> contains the one
   extracted for <sa_2> and compatible() holds for the pair, false otherwise.
   compatible() is only evaluated if the containment test succeeds. */
static bool contains(const ConsensusSA *csa,
                     unsigned long sa_1, unsigned long sa_2)
{
  GtRange outer, inner;
  gt_assert(csa);

  outer = extract_genomic_range(csa, sa_1);
  inner = extract_genomic_range(csa, sa_2);

  return gt_range_contains(&outer, &inner) && compatible(csa, sa_1, sa_2);
}
/*
 * Map <seqid> (restricted to <inrange>) to a sequence number and file number.
 *
 * The result of the last successful lookup is cached in <mapping>: if <seqid>
 * equals the cached ID and the cached range either has an undefined end
 * (GT_UNDEF_UWORD) or contains <inrange>, the cached values are returned
 * directly. Otherwise the ID is looked up in the hashmap of <mapping> and
 * resolved via seqid_info_get(); on success the cache is updated.
 *
 * <seqnum> and <filenum> receive the result and are mandatory. <offset> is
 * optional; if given it receives the start of the matching range.
 * Returns 0 on success. Returns -1 and sets <err> if <seqid> is unknown or
 * no entry matches <inrange>.
 */
int gt_seqid2seqnum_mapping_map(GtSeqid2SeqnumMapping *mapping,
                                const char *seqid, const GtRange *inrange,
                                GtUword *seqnum, GtUword *filenum,
                                GtUword *offset, GtError *err)
{
  SeqidInfo *seqid_info;
  GtRange outrange;
  gt_error_check(err);
  /* <filenum> is written unconditionally below and <inrange> is required by
     seqid_info_get() on every cache miss, so both are checked up front */
  gt_assert(mapping && seqid && inrange && seqnum && filenum);
  /* try to answer request from cache */
  if (mapping->cached_seqid && !strcmp(seqid, mapping->cached_seqid) &&
      (mapping->cached_range.end == GT_UNDEF_UWORD ||
       gt_range_contains(&mapping->cached_range, inrange))) {
    *seqnum = mapping->cached_seqnum;
    *filenum = mapping->cached_filenum;
    if (offset)
      *offset = mapping->cached_range.start;
    return 0;
  }
  /* cache miss -> regular mapping */
  if (!(seqid_info = gt_hashmap_get(mapping->map, seqid))) {
    gt_error_set(err, "no sequence with ID \"%s\" found in input sequence(s)",
                 seqid);
    return -1;
  }
  /* get results from seqid info */
  if (seqid_info_get(seqid_info, seqnum, filenum, &outrange, inrange,
                     mapping->filename, seqid, err)) {
    return -1;
  }
  /* report offset */
  if (offset)
    *offset = outrange.start;
  /* store result in cache; the cached ID is the pointer returned by
     gt_hashmap_get_key(), not the caller's <seqid> */
  mapping->cached_seqid = gt_hashmap_get_key(mapping->map, seqid);
  gt_assert(mapping->cached_seqid);
  mapping->cached_seqnum = *seqnum;
  mapping->cached_filenum = *filenum;
  mapping->cached_range = outrange;
  return 0;
}
/*
 * Attach an ORF feature for <orf_rng> to the feature tree rooted at <gf>.
 *
 * Both coordinates of <orf_rng> are incremented by one before use (presumably
 * a conversion from 0-based to 1-based positions -- confirm against callers).
 * The tree is traversed completely; the last visited node whose type is not
 * GT_ORF_TYPE and whose range contains the ORF becomes the parent. If such a
 * node exists, a new GT_ORF_TYPE feature on <strand> with source
 * GT_ORF_FINDER_TAG and a "frame" attribute holding <orf_frame> is added to it
 * as a child. If no node contains the ORF, nothing is attached.
 * <err> is not used.
 */
static void orf_attach_results_to_gff3(GtFeatureNode *gf,
                                       GtRange orf_rng, unsigned int orf_frame,
                                       GtStrand strand, GT_UNUSED GtError *err)
{
  GtGenomeNode *child;
  GtStr *tag;
  GtFeatureNodeIterator *gfi;
  GtFeatureNode *curnode = NULL, *parent_node = NULL;
  GtRange gfi_range;
  /* three decimal digits per byte are an upper bound for the number of digits
     of an unsigned int, plus one byte for the terminator */
  char frame_buf[3 * sizeof (unsigned int) + 1];

  tag = gt_str_new_cstr(GT_ORF_FINDER_TAG);

  orf_rng.start++; orf_rng.end++;

  /* bounded formatting with the conversion matching the unsigned argument */
  (void) snprintf(frame_buf, sizeof frame_buf, "%u", orf_frame);

  gfi = gt_feature_node_iterator_new(gf);

  while ((curnode = gt_feature_node_iterator_next(gfi))) {
    /* ORF features themselves never become parents */
    if (strcmp(gt_feature_node_get_type(curnode),
                                              (const char*) GT_ORF_TYPE) != 0) {
      gfi_range = gt_genome_node_get_range((GtGenomeNode*) curnode);
      if (gt_range_contains(&gfi_range, &orf_rng)) {
        /* no break: a later containing node replaces an earlier one */
        parent_node = curnode;
      }
    }
  }
  if (parent_node) {
    child = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*) gf),
                                GT_ORF_TYPE,
                                orf_rng.start,
                                orf_rng.end,
                                strand);
    gt_feature_node_set_source((GtFeatureNode*) child, tag);
    gt_feature_node_set_attribute((GtFeatureNode*) child, "frame", frame_buf);
    gt_feature_node_add_child(parent_node,(GtFeatureNode*) child);
  }
  gt_str_delete(tag);
  gt_feature_node_iterator_delete(gfi);
}
/*
 * Look up the entry of <seqid_info> which matches <inrange>.
 *
 * The entries are scanned in array order. The first entry whose description
 * range has an undefined end (GT_UNDEF_ULONG) or contains <inrange> is used:
 * its sequence number, file number and description range are stored in
 * <seqnum>, <filenum> and <outrange> and 0 is returned.
 * If no entry matches, <err> is set (mentioning <seqid>, <inrange> and
 * <filename>), the output parameters are left untouched and -1 is returned.
 */
static int seqid_info_get(SeqidInfo *seqid_info, unsigned long *seqnum,
                          unsigned long *filenum, GtRange *outrange,
                          const GtRange *inrange, const char *filename,
                          const char *seqid, GtError *err)
{
  SeqidInfoElem *seqid_info_elem;
  unsigned long i;
  gt_error_check(err);
  /* <filenum> is written on every match, so it is required like the others */
  gt_assert(seqid_info && seqnum && filenum && outrange && inrange);
  for (i = 0; i < gt_array_size(seqid_info); i++) {
    seqid_info_elem = gt_array_get(seqid_info, i);
    /* an undefined end matches every requested range */
    if (seqid_info_elem->descrange.end == GT_UNDEF_ULONG ||
        gt_range_contains(&seqid_info_elem->descrange, inrange)) {
      *seqnum = seqid_info_elem->seqnum;
      *filenum = seqid_info_elem->filenum;
      *outrange = seqid_info_elem->descrange;
      return 0;
    }
  }
  gt_error_set(err, "cannot find sequence ID \"%s\" (with range %lu,%lu) in "
               "sequence file \"%s\"", seqid, inrange->start, inrange->end,
               filename);
  return -1;
}
Example #7
0
/*
 * Build one mRNA feature from the nodes collected for a single transcript.
 *
 * <key> is the transcript ID (used as a C string), <value> the GtArray of
 * GtGenomeNode pointers belonging to the transcript (must not be empty) and
 * <data> the ConstructionInfo.
 *
 * Processing steps:
 * 1. Every node carrying GTF_PARSER_STOP_CODON_FLAG is merged into a directly
 *    adjacent CDS node of the array (the CDS range is extended to cover the
 *    stop codon) and then removed from the array and deleted. A stop codon
 *    contained in a CDS is an error, or, if cinfo->tidy is set, a warning
 *    followed by removal of the stop codon. A stop codon without flanking CDS
 *    is an error, or, if cinfo->tidy is set, a warning (the node is kept).
 * 2. The mRNA range is the join of all remaining node ranges. All nodes must
 *    agree in strand and sequence ID, otherwise an error is reported.
 * 3. A new mRNA feature with "ID" and "transcript_id" attributes (and
 *    GT_GFF_NAME, if a non-empty name is registered for the transcript) is
 *    created, gets a new reference to every remaining node as a child and is
 *    appended to cinfo->mRNAs.
 *
 * Returns 0 on success and -1 (with <err> set) on failure.
 */
static int construct_mRNAs(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtArray *gt_genome_node_array = (GtArray*) value,
          *mRNAs = (GtArray*) cinfo->mRNAs;
  GtGenomeNode *mRNA_node, *first_node, *gn;
  const char *tname;
  GtStrand mRNA_strand;
  GtRange mRNA_range;
  GtStr *mRNA_seqid;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);
   /* at least one node in array */
  gt_assert(gt_array_size(gt_genome_node_array));

  /* TODO: support discontinuous start/stop codons */
  i = 0;
  while (!had_err && i < gt_array_size(gt_genome_node_array)) {
    GtUword j;
    GtRange stop_codon_rng;
    bool found_cds = false;
    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    if (!gt_feature_node_get_attribute((GtFeatureNode*) gn,
        GTF_PARSER_STOP_CODON_FLAG)) {
      /* not a stop codon, nothing to merge */
      i++;
      continue;
    }
    stop_codon_rng = gt_genome_node_get_range(gn);
    for (j = 0; !had_err && j < gt_array_size(gt_genome_node_array); j++) {
      GtGenomeNode* gn2;
      GtRange this_rng;
      const char *this_type;
      gn2 = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, j);
      if (gn == gn2) continue;
      this_rng = gt_genome_node_get_range(gn2);
      this_type = gt_feature_node_get_type((GtFeatureNode*) gn2);
      if (this_type == gt_symbol(gt_ft_CDS)) {
        if (gt_range_contains(&this_rng, &stop_codon_rng)) {
          if (cinfo->tidy) {
            gt_warning("stop codon on line %u in file %s is contained in "
                       "CDS in line %u",
                       gt_genome_node_get_line_number(gn),
                       gt_genome_node_get_filename(gn),
                       gt_genome_node_get_line_number(gn2));
            found_cds = true;
          } else {
            gt_error_set(err, "stop codon on line %u in file %s is "
                              "contained in CDS in line %u",
                         gt_genome_node_get_line_number(gn),
                         gt_genome_node_get_filename(gn),
                         gt_genome_node_get_line_number(gn2));
            had_err = -1;
          }
          break;
        }
        /* stop codon directly follows the CDS: extend the CDS end */
        if (this_rng.end + 1 == stop_codon_rng.start) {
          this_rng.end = stop_codon_rng.end;
          gt_genome_node_set_range(gn2, &this_rng);
          found_cds = true;
          break;
        }
        /* stop codon directly precedes the CDS: extend the CDS start */
        if (this_rng.start == stop_codon_rng.end + 1) {
          this_rng.start = stop_codon_rng.start;
          gt_genome_node_set_range(gn2, &this_rng);
          found_cds = true;
          break;
        }
      }
    }
    if (found_cds) {
      /* gt_array_rem() moves the following elements down by one, so the next
         unprocessed node is now at index i and i must not be advanced */
      gt_array_rem(gt_genome_node_array, i);
      gt_genome_node_delete(gn);
    } else {
      if (!had_err) {
        if (cinfo->tidy) {
          gt_warning("found stop codon on line %u in file %s with no "
                     "flanking CDS, ignoring it",
                     gt_genome_node_get_line_number(gn),
                     gt_genome_node_get_filename(gn));
        } else {
          gt_error_set(err, "found stop codon on line %u in file %s with no "
                            "flanking CDS",
                       gt_genome_node_get_line_number(gn),
                       gt_genome_node_get_filename(gn));
          had_err = -1;
        }
      }
      i++;
    }
  }

  /* A stop codon is only removed if another node (the CDS it was merged into)
     stays in the array, hence the array cannot have become empty. */
  gt_assert(gt_array_size(gt_genome_node_array));

  /* Determine the range and the strand of the mRNA. This is done after the
     stop codon handling, because that step may extend the range of the first
     node or remove and delete it. */
  first_node = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, 0);
  mRNA_range = gt_genome_node_get_range(first_node);
  mRNA_strand = gt_feature_node_get_strand((GtFeatureNode*) first_node);
  mRNA_seqid = gt_genome_node_get_seqid(first_node);

  for (i = 1; !had_err && i < gt_array_size(gt_genome_node_array); i++) {
    GtRange range;
    GtStrand strand;
    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    range = gt_genome_node_get_range(gn);
    mRNA_range = gt_range_join(&mRNA_range, &range);
    strand = gt_feature_node_get_strand((GtFeatureNode*) gn);
    if (strand != mRNA_strand) {
      gt_error_set(err, "feature %s on line %u has strand %c, but the "
                        "parent transcript has strand %c",
                   (const char*) key,
                   gt_genome_node_get_line_number(gn),
                   GT_STRAND_CHARS[strand],
                   GT_STRAND_CHARS[mRNA_strand]);
      had_err = -1;
      break;
    } else {
      mRNA_strand = gt_strand_join(mRNA_strand, strand);
    }
    if (!had_err && gt_str_cmp(mRNA_seqid, gt_genome_node_get_seqid(gn))) {
      gt_error_set(err, "The features on lines %u and %u refer to different "
                "genomic sequences (``seqname''), although they have the same "
                "gene IDs (``gene_id'') which must be globally unique",
                gt_genome_node_get_line_number(first_node),
                gt_genome_node_get_line_number(gn));
      had_err = -1;
      break;
    }
  }

  if (!had_err) {
    mRNA_node = gt_feature_node_new(mRNA_seqid, gt_ft_mRNA, mRNA_range.start,
                                    mRNA_range.end, mRNA_strand);
    gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node), "ID", key);
    gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node), "transcript_id",
                                  key);

    /* the name attribute is only set for a registered, non-empty name */
    if ((tname = gt_hashmap_get(cinfo->transcript_id_to_name_mapping,
                              (const char*) key)) && strlen(tname) > 0) {
      gt_feature_node_add_attribute((GtFeatureNode*) mRNA_node, GT_GFF_NAME,
                                      tname);
    }

    /* register children */
    for (i = 0; i < gt_array_size(gt_genome_node_array); i++) {
      gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
      gt_feature_node_add_child((GtFeatureNode*) mRNA_node,
                                (GtFeatureNode*) gt_genome_node_ref(gn));
    }

    /* store the mRNA */
    gt_array_add(mRNAs, mRNA_node);
  }

  return had_err;
}