Ejemplo n.º 1
0
/* Fill the (empty) <inverted_chain> from the (non-empty) <chain>: the file and
   sequence numbers are copied, the start of the first and the end of the last
   forward range are stored as startpos/endpos, and for every pair of
   neighbouring forward ranges the region strictly between them is appended to
   inverted_chain->forwardranges. */
static void convert_chain_to_inverted_chain(GthInvertedChain *inverted_chain,
                                            GthChain *chain)
{
  unsigned long idx, num_of_gaps;
  GtRange *left_exon, *right_exon, intron;

  /* the target must be empty, the source must not be */
  gt_assert(!gt_array_size(inverted_chain->forwardranges));
  gt_assert(gt_array_size(chain->forwardranges));

  /* n ranges enclose n - 1 gaps */
  num_of_gaps = gt_array_size(chain->forwardranges) - 1;

  /* file and sequence numbers are taken over unchanged */
  inverted_chain->gen_file_num = chain->gen_file_num;
  inverted_chain->gen_seq_num  = chain->gen_seq_num;
  inverted_chain->ref_file_num = chain->ref_file_num;
  inverted_chain->ref_seq_num  = chain->ref_seq_num;

  /* outer borders of the chain */
  left_exon  = gt_array_get_first(chain->forwardranges);
  right_exon = gt_array_get_last(chain->forwardranges);
  inverted_chain->startpos = left_exon->start;
  inverted_chain->endpos   = right_exon->end;

  /* every gap between two (potential) exons becomes a (potential) intron */
  for (idx = 0; idx < num_of_gaps; idx++) {
    left_exon    = gt_array_get(chain->forwardranges, idx);
    right_exon   = gt_array_get(chain->forwardranges, idx + 1);
    intron.start = left_exon->end + 1;
    intron.end   = right_exon->start - 1;
    gt_array_add(inverted_chain->forwardranges, intron);
  }
}
Ejemplo n.º 2
0
GtRange gth_sa_range_forward(const GthSA *sa)
{
  GtRange range;
  GtUword leftgenomicborder, rightgenomicborder;

  gt_assert(sa);

  leftgenomicborder  = ((Exoninfo*) gt_array_get_first(sa->exons))
                       ->leftgenomicexonborder;
  rightgenomicborder = ((Exoninfo*) gt_array_get_last(sa->exons))
                       ->rightgenomicexonborder;

  if (sa->gen_strand_forward) {
    range.start = leftgenomicborder;
    range.end = rightgenomicborder;
  }
  else {
    /* genomic offset is defined */
    gt_assert(sa->gen_offset != GT_UNDEF_UWORD);
    range.start  = sa->gen_total_length - 1
                   - (rightgenomicborder - sa->gen_offset)
                   + sa->gen_offset;
    range.end = sa->gen_total_length - 1
                - (leftgenomicborder - sa->gen_offset)
                + sa->gen_offset;
  }

  return range;
}
Ejemplo n.º 3
0
/* Partition pgl->alignments into clusters which are appended to
   pgl->saclusters. The first alignment becomes the representative of the first
   cluster. Each following alignment is compared with the representative of the
   current cluster via gth_sa_cmp_genomic_actual(): on a result of 0 it is
   added to the members of the current cluster, otherwise (or always, if
   <disableclustersas> is set) the current cluster is stored and the alignment
   becomes the representative of a new one. A representative is not added to
   the members array of its own cluster. */
void assemble_cluster(GthPGL *pgl, bool disableclustersas)
{
  GthSACluster *current;
  GthSA *candidate;
  GtUword idx = 1;
  bool open_new_cluster;

  /* the first alignment opens the first cluster */
  current = gt_malloc(sizeof *current);
  current->representative = *(GthSA**) gt_array_get_first(pgl->alignments);
  current->members = gt_array_new(sizeof (GthSA*));

  while (idx < gt_array_size(pgl->alignments)) {
    candidate = *(GthSA**) gt_array_get(pgl->alignments, idx);
    open_new_cluster = disableclustersas ||
                       gth_sa_cmp_genomic_actual(&current->representative,
                                                 &candidate);
    if (!open_new_cluster) {
      /* same as the representative -> becomes a member of this cluster */
      gt_array_add(current->members, candidate);
    }
    else {
      /* different -> store the finished cluster and open a new one */
      gt_array_add(pgl->saclusters, current);
      current = gt_malloc(sizeof *current);
      current->representative = candidate;
      current->members = gt_array_new(sizeof (GthSA*));
    }
    idx++;
  }

  /* the cluster still being built has not been stored yet */
  gt_array_add(pgl->saclusters, current);
}
Ejemplo n.º 4
0
/* Return the length (as computed by gt_range_length()) of the range which
   reaches from the start of the first element of <ranges> to the end of its
   last element. */
GtUword gt_ranges_spanned_length(const GtArray *ranges)
{
  const GtRange *first, *last;
  GtRange span;

  gt_assert(ranges);

  first = gt_array_get_first(ranges);
  last  = gt_array_get_last(ranges);
  span.start = first->start;
  span.end   = last->end;

  return gt_range_length(&span);
}
Ejemplo n.º 5
0
/* Return the range from the start of the first forward range of <chain> to
   the end of its last forward range. */
static GtRange chain_get_genomicrange(GthChain *chain)
{
  const GtRange *first, *last;
  GtRange spanned;

  gt_assert(chain);

  first = gt_array_get_first(chain->forwardranges);
  last  = gt_array_get_last(chain->forwardranges);
  spanned.start = first->start;
  spanned.end   = last->end;

  gt_assert(spanned.start <= spanned.end);
  return spanned;
}
Ejemplo n.º 6
0
/* Sort the (non-empty) array <matches> in place with compare_matches() and
   append one Bucket to <buckets> for every maximal run of consecutive matches
   sharing the same reference and genomic sequence number. Each bucket stores
   both sequence numbers, the index of its first match (startpos) and the
   number of matches it contains (length). The largest length found among all
   elements of <buckets> is written to <maxbucketlength>. */
static void sort_matches_and_calc_buckets(GtArray *matches, GtArray *buckets,
                                          GtUword *maxbucketlength)
{
  GtUword idx, run_start = 0, longest = 0;
  GthMatch *match;
  Bucket current, *stored;

  gt_assert(gt_array_size(matches));

  /* bring matches with equal sequence numbers next to each other */
  qsort(gt_array_get_space(matches), gt_array_size(matches), sizeof (GthMatch),
        compare_matches);

  /* the first match opens the first bucket */
  match = gt_array_get_first(matches);
  current.seqnum1  = match->Storeseqnumreference;
  current.seqnum2  = match->Storeseqnumgenomic;
  current.startpos = 0;

  for (idx = 1; idx < gt_array_size(matches); idx++) {
    match = gt_array_get(matches, idx);
    if (match->Storeseqnumreference == current.seqnum1 &&
        match->Storeseqnumgenomic == current.seqnum2) {
      continue; /* still inside the current bucket */
    }

    /* the run [run_start, idx - 1] is complete -> store it */
    current.length = idx - run_start;
    gt_array_add(buckets, current);

    /* the match at idx opens the next bucket */
    run_start        = idx;
    current.seqnum1  = match->Storeseqnumreference;
    current.seqnum2  = match->Storeseqnumgenomic;
    current.startpos = idx;
  }

  /* the bucket which was open when the matches ran out */
  current.length = idx - run_start;
  gt_array_add(buckets, current);

  /* determine the maximum bucket length */
  for (idx = 0; idx < gt_array_size(buckets); idx++) {
    stored = gt_array_get(buckets, idx);
    if (stored->length > longest)
      longest = stored->length;
  }
  *maxbucketlength = longest;

  gt_assert(sum_of_bucket_lengths_equals_num_of_matches(buckets,
                                                     gt_array_size(matches)));
}
Ejemplo n.º 7
0
GtRange gth_sa_range_actual(const GthSA *sa)
{
  GtRange range;

  gt_assert(sa);

  range.start = ((Exoninfo*) gt_array_get_first(sa->exons))
                ->leftgenomicexonborder;
  range.end = ((Exoninfo*) gt_array_get_last(sa->exons))
              ->rightgenomicexonborder;

  return range;
}
Ejemplo n.º 8
0
/* Replace the forward ranges of <chain> by the merged genomic ranges
   (startpos2..endpos2) of all <fragments> which overlap the range currently
   spanned by the chain. The overlapping fragment ranges are sorted with
   gt_range_compare(); a range whose start does not lie beyond the end of the
   previously stored range is merged into it, any other range is stored
   separately. If <comments> is set, the forward ranges are shown on <outfp>
   before they are modified. At least one fragment is expected to overlap. */
static void enrich_chain(GthChain *chain, GtFragment *fragments,
                         unsigned long num_of_fragments, bool comments,
                         GtFile *outfp)
{
  GtRange dp_range, candidate, *last_range;
  GtArray *overlapping;
  unsigned long idx;

  gt_assert(chain && fragments && num_of_fragments);

  if (comments) {
    gt_file_xprintf(outfp, "%c enrich global chain with the following "
                              "forward ranges:\n",COMMENTCHAR);
    gt_file_xprintf(outfp, "%c ", COMMENTCHAR);
    gt_ranges_show(chain->forwardranges, outfp);
  }

  /* genomic range spanned by the current DP ranges */
  dp_range = chain_get_genomicrange(chain);

  /* collect the genomic range of each fragment which overlaps the DP range */
  overlapping = gt_array_new(sizeof (GtRange));
  for (idx = 0; idx < num_of_fragments; idx++) {
    candidate.start = fragments[idx].startpos2;
    candidate.end   = fragments[idx].endpos2;
    if (gt_range_overlap(&dp_range, &candidate))
      gt_array_add(overlapping, candidate);
  }
  gt_assert(gt_array_size(overlapping));

  /* order the collected ranges */
  qsort(gt_array_get_space(overlapping), gt_array_size(overlapping),
        sizeof (GtRange), (GtCompare) gt_range_compare);

  /* start over with an empty DP range array, seeded with the first range */
  gt_array_reset(chain->forwardranges);
  candidate = *(GtRange*) gt_array_get_first(overlapping);
  gt_array_add(chain->forwardranges, candidate);

  /* merge or append the remaining ranges */
  for (idx = 1; idx < gt_array_size(overlapping); idx++) {
    candidate  = *(GtRange*) gt_array_get(overlapping, idx);
    /* fetched anew in every iteration, because adding may move the storage */
    last_range = gt_array_get_last(chain->forwardranges);
    if (candidate.start > last_range->end) {
      /* no overlap with the last stored range -> store separately */
      gt_array_add(chain->forwardranges, candidate);
      continue;
    }
    /* overlap -> extend the last stored range, if the candidate ends later */
    if (last_range->end < candidate.end)
      last_range->end = candidate.end;
  }

  gt_array_delete(overlapping);
}
Ejemplo n.º 9
0
/* Return true if the start of the first element of <ranges> is not smaller
   than region->start and the end of the last element of <ranges> is not
   larger than region->end, false otherwise. */
bool gt_ranges_borders_are_in_region(GtArray *ranges, const GtRange *region)
{
  gt_assert(ranges && region);

  /* the end is only inspected if the start border already lies inside */
  return ((GtRange*) gt_array_get_first(ranges))->start >= region->start &&
         ((GtRange*) gt_array_get_last(ranges))->end <= region->end;
}
Ejemplo n.º 10
0
/* Copy the core of <src> to <dest> (via chain_copy_core()) and add exactly one
   forward and one reverse range to <dest>, each reaching from the start of the
   first to the end of the last corresponding range of <src>. <src> must have
   equally many forward and reverse ranges. */
void gth_chain_contract(GthChain *dest, const GthChain *src)
{
  const GtRange *first, *last;
  GtRange contracted_forward, contracted_reverse;

  gt_assert(gt_array_size(src->forwardranges) ==
            gt_array_size(src->reverseranges));

  chain_copy_core(dest, src);

  /* span of all forward ranges */
  first = gt_array_get_first(src->forwardranges);
  last  = gt_array_get_last(src->forwardranges);
  contracted_forward.start = first->start;
  contracted_forward.end   = last->end;

  /* span of all reverse ranges */
  first = gt_array_get_first(src->reverseranges);
  last  = gt_array_get_last(src->reverseranges);
  contracted_reverse.start = first->start;
  contracted_reverse.end   = last->end;

  /* both spans are stored only after both have been computed */
  gt_array_add(dest->forwardranges, contracted_forward);
  gt_array_add(dest->reverseranges, contracted_reverse);
}
Ejemplo n.º 11
0
/* Append a new element consisting of <seqnum>, <filenum> and <range> to
   <seqid_info> and return 0. This is only done if both the end of <range> and
   the end of the description range of the first element already stored in
   <seqid_info> are defined; otherwise nothing is added, <err> is set (naming
   <filename> and <seqid>) and -1 is returned. */
static int seqid_info_add(SeqidInfo *seqid_info, unsigned long seqnum,
                          unsigned long filenum, const GtRange *range,
                          const char *filename, const char *seqid, GtError *err)
{
  SeqidInfoElem *first_elem, new_elem;

  gt_error_check(err);
  gt_assert(range);

  first_elem = gt_array_get_first(seqid_info);
  if (range->end != GT_UNDEF_ULONG &&
      first_elem->descrange.end != GT_UNDEF_ULONG) {
    /* both description ranges are defined -> store the new element */
    new_elem.seqnum    = seqnum;
    new_elem.filenum   = filenum;
    new_elem.descrange = *range;
    gt_array_add(seqid_info, new_elem);
    return 0;
  }

  gt_error_set(err, "sequence file \"%s\" does contain multiple sequences "
                "with ID \"%s\" and not all of them have description ranges",
                filename, seqid);
  return -1;
}
/* Append a new element consisting of <seqnum>, <filenum> and <range> to
   <seqid_info> and return 0. This is only done if both the end of <range> and
   the end of the description range of the first element already stored in
   <seqid_info> are defined; otherwise nothing is added, <err> is set (naming
   <seqid>) and -1 is returned. <filename> is not used. */
static int seqid_info_add(SeqidInfo *seqid_info, GtUword seqnum,
                          GtUword filenum, const GtRange *range,
                          GT_UNUSED const char *filename,
                          const char *seqid, GtError *err)
{
  SeqidInfoElem *first_elem, new_elem;
  bool range_missing;

  gt_error_check(err);
  gt_assert(range);

  first_elem = gt_array_get_first(seqid_info);
  range_missing = range->end == GT_UNDEF_UWORD ||
                  first_elem->descrange.end == GT_UNDEF_UWORD;
  if (range_missing) {
    gt_error_set(err, "input sequence(s) contain multiple sequences "
                 "with ID \"%s\" and not all of them have description ranges",
                 seqid);
    return -1;
  }

  /* both description ranges are defined -> store the new element */
  new_elem.seqnum    = seqnum;
  new_elem.filenum   = filenum;
  new_elem.descrange = *range;
  gt_array_add(seqid_info, new_elem);
  return 0;
}
Ejemplo n.º 13
0
/* Check the phases of the CDS features in the (non-empty) array
   <cds_features>, which is reversed in place first if its first feature lies
   on the reverse strand. The first feature may have any defined phase; the
   phase every following feature must have is derived from the length and
   phase of its predecessor.

   A feature with a wrong phase is handled as follows:
   - it was already recorded in v->cds_features: if v->tidy is set, <is_multi>
     is not set and the feature has no children, it is marked for splitting
     (with a warning); otherwise an error is reported.
   - it was not recorded yet: if v->tidy is set, the phase is corrected (with
     a warning, unless <second_pass> is set); otherwise an error is reported.

   Every feature processed without error is recorded in v->cds_features.
   Returns 0 on success and -1 (with <err> set) on error. */
static int check_cds_phases(GtArray *cds_features, GtCDSCheckVisitor *v,
                            bool is_multi, bool second_pass, GtError *err)
{
  GtPhase expected_phase = GT_PHASE_ZERO, phase;
  GtFeatureNode *cds;
  GtGenomeNode *node;
  unsigned long idx = 0, length;
  bool wrong_phase;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(cds_features);
  gt_assert(gt_array_size(cds_features));

  /* the strand of the first feature decides about the processing order */
  cds = *(GtFeatureNode**) gt_array_get_first(cds_features);
  if (gt_feature_node_get_strand(cds) == GT_STRAND_REVERSE)
    gt_array_reverse(cds_features);

  while (!had_err && idx < gt_array_size(cds_features)) {
    cds   = *(GtFeatureNode**) gt_array_get(cds_features, idx);
    node  = (GtGenomeNode*) cds;
    phase = gt_feature_node_get_phase(cds);

    /* the first phase can be anything (except being undefined), because the
       GFF3 spec says:

       NOTE 4 - CDS features MUST have have a defined phase field. Otherwise it
       is not possible to infer the correct polypeptides corresponding to
       partially annotated genes. */
    if (idx == 0)
      wrong_phase = phase == GT_PHASE_UNDEFINED;
    else
      wrong_phase = phase != expected_phase;

    if (wrong_phase) {
      if (!gt_hashmap_get(v->cds_features, cds)) {
        /* feature has not been recorded before */
        if (!v->tidy) {
          gt_error_set(err, "%s feature on line %u in file \"%s\" has the "
                       "wrong phase %c (should be %c)", gt_ft_CDS,
                       gt_genome_node_get_line_number(node),
                       gt_genome_node_get_filename(node),
                       GT_PHASE_CHARS[phase],
                       GT_PHASE_CHARS[expected_phase]);
          had_err = -1;
        }
        else {
          if (!second_pass) {
            gt_warning("%s feature on line %u in file \"%s\" has the wrong "
                       "phase %c -> correcting it to %c", gt_ft_CDS,
                       gt_genome_node_get_line_number(node),
                       gt_genome_node_get_filename(node),
                       GT_PHASE_CHARS[phase],
                       GT_PHASE_CHARS[expected_phase]);
          }
          gt_feature_node_set_phase(cds, expected_phase);
        }
      }
      else if (v->tidy && !is_multi && !gt_feature_node_has_children(cds)) {
        /* feature was recorded before, but it can be split */
        gt_warning("%s feature on line %u in file \"%s\" has multiple "
                   "parents which require different phases; split feature",
                   gt_ft_CDS,
                   gt_genome_node_get_line_number(node),
                   gt_genome_node_get_filename(node));
        gt_hashmap_add(v->cds_features_to_split, cds, cds);
        v->splitting_is_necessary = true; /* split later */
      }
      else {
        /* feature was recorded before and cannot be split */
        gt_error_set(err, "%s feature on line %u in file \"%s\" has multiple "
                     "parents which require different phases",
                     gt_ft_CDS,
                     gt_genome_node_get_line_number(node),
                     gt_genome_node_get_filename(node));
        had_err = -1;
      }
    }

    if (!had_err) {
      /* read the phase again, it might have been corrected above */
      phase  = gt_feature_node_get_phase(cds);
      length = gt_genome_node_get_length(node);
      expected_phase = (3 - (length - phase) % 3) % 3;
      gt_hashmap_add(v->cds_features, cds, cds); /* record CDS feature */
    }
    idx++;
  }

  return had_err;
}
Ejemplo n.º 14
0
/* Extend the outer borders of <chain> by DPEXTENSION: the start of the first
   and the end of the last range are moved outwards, for the forward ranges
   limited by <gen_seq_bounds> and for the reverse ranges limited by
   <gen_seq_bounds_rc>. <gen_total_length> and <gen_offset> are only used for
   the final consistency assertion.
   XXX: change this function: add more sophisticated extension strategy */
void gth_chain_extend_borders(GthChain *chain, const GtRange *gen_seq_bounds,
                              const GtRange *gen_seq_bounds_rc,
                              GT_UNUSED unsigned long gen_total_length,
                              GT_UNUSED unsigned long gen_offset)
{
  GtRange *border_range;
  long extended;

  /* the chain contains at least one range */
  gt_assert(gt_array_size(chain->forwardranges));
  /* the borders of both range arrays lie in their genomic region */
  gt_assert(gt_ranges_borders_are_in_region(chain->forwardranges,
                                            gen_seq_bounds));
  gt_assert(gt_ranges_borders_are_in_region(chain->reverseranges,
                                            gen_seq_bounds_rc));
  /* the forward ranges are consecutive */
  gt_assert(gt_ranges_are_consecutive(chain->forwardranges));
  /* the sequence bounds are valid */
  gt_assert(gen_seq_bounds->start <= gen_seq_bounds->end);
  gt_assert(gen_seq_bounds_rc->start <= gen_seq_bounds_rc->end);

  /* start border, forward strand: move left, but not beyond the bounds */
  border_range = gt_array_get_first(chain->forwardranges);
  extended = gt_safe_cast2long(border_range->start) - DPEXTENSION;
  if (extended < gt_safe_cast2long(gen_seq_bounds->start))
    extended = gen_seq_bounds->start;
  border_range->start = gt_safe_cast2ulong(extended);

  /* start border, reverse complement strand */
  border_range = gt_array_get_first(chain->reverseranges);
  extended = gt_safe_cast2long(border_range->start) - DPEXTENSION;
  if (extended < gt_safe_cast2long(gen_seq_bounds_rc->start))
    extended = gen_seq_bounds_rc->start;
  border_range->start = gt_safe_cast2ulong(extended);

  /* end border, forward strand: move right, but not beyond the bounds */
  border_range = gt_array_get_last(chain->forwardranges);
  extended = gt_safe_cast2long(border_range->end) + DPEXTENSION;
  if (extended > gt_safe_cast2long(gen_seq_bounds->end))
    extended = gen_seq_bounds->end;
  border_range->end = gt_safe_cast2ulong(extended);

  /* end border, reverse complement strand */
  border_range = gt_array_get_last(chain->reverseranges);
  extended = gt_safe_cast2long(border_range->end) + DPEXTENSION;
  if (extended > gt_safe_cast2long(gen_seq_bounds_rc->end))
    extended = gen_seq_bounds_rc->end;
  border_range->end = gt_safe_cast2ulong(extended);

  gt_assert(chain_is_filled_and_consistent(chain, gen_total_length,
                                           gen_offset));
}
Ejemplo n.º 15
0
/* Determine sa->polyAtailpos by scanning <ref_seq_tran>, which is indexed by
   reference position and compared against characters encoded with
   <ref_alphabet> (presumably the encoded reference sequence -- TODO confirm).

   sa->polyAtailpos is reset to 0/0 first and keeps this value if no run is
   found. Two scans are performed:
   1. a forward scan for 'A' up to the end of the reference. If it yields at
      least MINIMUMPOLYATAILLENGTH matches, start is set to the beginning of
      the run and end to its last position.
   2. only if the first scan fails: a backward scan for 'T' down to position
      0. On success start is set to the position where the run was entered and
      end to the position where it was left, i.e. start is the larger
      coordinate here.

   In both scans <ppa> counts the matching characters and <mma> the tolerated
   mismatches: a single mismatch is accepted once at least one match has been
   seen, any further mismatch either ends the scan (run is long enough) or
   resets both counters (run is too short). */
void gth_sa_calc_polyAtailpos(GthSA *sa, const unsigned char *ref_seq_tran,
                              GtAlphabet *ref_alphabet)
{
  GtUword ppa, mma, rightreferenceborder, referencelength;
  GtWord i, leftreferenceborder;

  /* default: no poly(A) tail */
  sa->polyAtailpos.start = 0;
  sa->polyAtailpos.end = 0;
  ppa = mma = 0;

  /* outer reference borders of the alignment */
  rightreferenceborder = ((Exoninfo*) gt_array_get_last(sa->exons))
                         ->rightreferenceexonborder;
  leftreferenceborder  = ((Exoninfo*) gt_array_get_first(sa->exons))
                         ->leftreferenceexonborder;

  /* setting i: start directly behind the alignment if this position is not
     in front of (referencelength - 1 - CALCPOLYATAILWINDOW), otherwise start
     at that position (or at 0, if the reference is shorter than that).
     NOTE(review): the subtraction in the first condition looks like it can
     wrap around for short references if GtUword is unsigned; such references
     end up in the else branch, which handles them explicitly -- confirm. */
  referencelength = gth_sa_ref_total_length(sa);
  if ((rightreferenceborder + 1) >=
      (referencelength - 1 - CALCPOLYATAILWINDOW)) {
    i = gt_safe_cast2long(rightreferenceborder + 1);
  }
  else {
    if (referencelength < 1 + CALCPOLYATAILWINDOW)
      i = 0;
    else
      i =  referencelength - 1 - CALCPOLYATAILWINDOW;
  }

  /* forward scan for 'A' */
  for (/* i already set */; i < gt_safe_cast2long(referencelength); i++) {
    if (ref_seq_tran[i] == gt_alphabet_encode(ref_alphabet, 'A'))
      ppa++;
    else {
      if (ppa > 0 && mma < 1) {
        /* first mismatch after at least one match is tolerated */
        mma++;
        continue;
      }
      else {
        if (ppa >= MINIMUMPOLYATAILLENGTH)
          break; /* run is long enough; i stays on the terminating position */
        else {
          /* run too short -> start counting anew */
          ppa = mma = 0;
          continue;
        }
      }
    }
  }

  if (ppa >= MINIMUMPOLYATAILLENGTH) {
    /* i is one behind the run, which consists of ppa + mma positions */
    sa->polyAtailpos.start = gt_safe_cast2ulong(i - ppa - mma);
    sa->polyAtailpos.end = i - 1;
  }
  else {
    /* forward scan failed -> try a backward scan for 'T' */
    ppa = mma = 0;

    /* setting i: start directly in front of the alignment if this position
       is not larger than CALCPOLYATAILWINDOW, otherwise at
       CALCPOLYATAILWINDOW - 1. If leftreferenceborder is 0, i becomes -1 and
       the loop below is not entered. */
    if ((leftreferenceborder - 1) <= CALCPOLYATAILWINDOW)
      i = leftreferenceborder - 1;
    else
      i =  CALCPOLYATAILWINDOW - 1;

    /* backward scan for 'T', same mismatch handling as in the forward scan */
    for (/* i already set */; i >= 0; i--) {
      if (ref_seq_tran[i] == gt_alphabet_encode(ref_alphabet, 'T'))
        ppa++;
      else {
        if (ppa > 0 && mma < 1) {
          mma++;
          continue;
        }
        else {
          if (ppa >= MINIMUMPOLYATAILLENGTH)
            break;
          else {
            ppa = mma = 0;
            continue;
          }
        }
      }
    }

    if (ppa >= MINIMUMPOLYATAILLENGTH) {
      /* i is one in front of the run; start gets the larger coordinate */
      sa->polyAtailpos.start  = gt_safe_cast2ulong(i + ppa + mma);
      sa->polyAtailpos.end = i + 1;
    }
  }
}