Exemplo n.º 1
0
/* Determine the length of the sequence identified by <seqid> and store it in
   <*length>. Which lookup is used depends on the flags of <rm> (raw sequence,
   MD5 sequence id, description mapping, description matching, sequence
   number, or simply the first sequence of the collection).
   Returns 0 on success and the non-zero error code of the failing step
   otherwise; <*length> is only written on success. */
int gt_region_mapping_get_sequence_length(GtRegionMapping *rm,
                                          unsigned long *length, GtStr *seqid,
                                          GtError *err)
{
  unsigned long filenum, seqnum;
  int had_err;
  gt_error_check(err);
  GT_UNUSED GtRange range;
  gt_assert(rm && length && seqid);
  if (rm->userawseq) {
    /* the length is a result, not a status: hand it back through <length>
       and report success, like every other branch does */
    *length = rm->rawlength;
    return 0;
  }
  had_err = update_seq_col_if_necessary(rm, seqid, err);
  if (!had_err) {
    if (gt_md5_seqid_has_prefix(gt_str_get(seqid))) {
      had_err = gt_seq_col_md5_to_sequence_length(rm->seq_col, length, seqid,
                                                  err);
    }
    else if (rm->usedesc) {
      gt_assert(rm->seqid2seqnum_mapping);
      /* NOTE(review): <range> is handed to the mapping without having been
         initialized here -- confirm whether the callee reads it */
      had_err = gt_seqid2seqnum_mapping_map(rm->seqid2seqnum_mapping,
                                            gt_str_get(seqid), &range, &seqnum,
                                            &filenum, NULL, err);
      if (!had_err)
        *length = gt_seq_col_get_sequence_length(rm->seq_col, filenum, seqnum);
    }
    else if (rm->matchdesc) {
      had_err = gt_seq_col_grep_desc_sequence_length(rm->seq_col, length,
                                                     seqid, err);
    }
    else if (rm->useseqno) {
      unsigned long seqno = GT_UNDEF_ULONG;
      gt_assert(rm->encseq);
      /* the sequence id must look like "seq<number>" */
      if (1 != sscanf(gt_str_get(seqid), "seq%lu", &seqno)) {
        gt_error_set(err, "seqid '%s' does not have the form 'seqX' "
                          "where X is a sequence number in the encoded "
                          "sequence", gt_str_get(seqid));
        had_err = -1;
      }
      gt_assert(had_err || seqno != GT_UNDEF_ULONG);
      /* reject sequence numbers beyond the end of the encoded sequence */
      if (!had_err && seqno >= gt_encseq_num_of_sequences(rm->encseq)) {
          gt_error_set(err, "trying to access sequence %lu, but encoded "
                            "sequence contains only %lu sequences",
                            seqno, gt_encseq_num_of_sequences(rm->encseq));
          had_err = -1;
      }
      if (!had_err) {
        *length = gt_encseq_seqlength(rm->encseq, seqno);
      }
    }
    else
      /* no special lookup configured: use the first sequence (file 0,
         sequence 0) of the collection */
      *length = gt_seq_col_get_sequence_length(rm->seq_col, 0, 0);
  }
  return had_err;
}
Exemplo n.º 2
0
/* Look up the sequence whose description matches <seqid> in the encseq-backed
   sequence collection <sc> and store its length in <*length>.
   Returns the result of the description lookup (0 on success); <*length> is
   only written if the lookup succeeded. */
static int gt_encseq_col_grep_desc_sequence_length(GtSeqCol *sc,
                                                   GtUword *length,
                                                   GtStr *seqid,
                                                   GtError *err)
{
  GtUword filenum = 0, seqnum = 0;
  int had_err;
  GtEncseqCol *esc;
  esc = gt_encseq_col_cast(sc);
  gt_error_check(err);
  gt_assert(esc && length && seqid);
  had_err = gt_encseq_col_do_grep_desc(esc, &filenum, &seqnum, seqid, err);
  if (!had_err) {
    /* the file number comes before the sequence number */
    *length = gt_seq_col_get_sequence_length(sc, filenum, seqnum);
  }
  return had_err;
}
Exemplo n.º 3
0
/* Extract the part <start>..<end> of the sequence identified by <seqid> and
   store a pointer to it in <*seq>. <start> and <end> are used as 1-based,
   inclusive coordinates (index <start> - 1 is the first character copied).
   In the raw sequence and sequence number modes the buffer is allocated here
   with exactly <end> - <start> + 1 bytes, i.e. without room for a terminating
   '\0'. NOTE(review): the caller presumably has to free <*seq> -- confirm.
   Returns 0 on success; otherwise a non-zero value is returned and <*seq> is
   not written by this function. */
int gt_region_mapping_get_sequence(GtRegionMapping *rm, char **seq,
                                   GtStr *seqid, unsigned long start,
                                   unsigned long end, GtError *err)
{
  int had_err = 0;
  unsigned long offset = 1;
  GtRange range = {GT_UNDEF_ULONG, GT_UNDEF_ULONG};
  gt_error_check(err);
  gt_assert(rm && seq && seqid && gt_str_length(seqid) > 0);

  /* handle rawseq access first; this returns unconditionally, so no later
     branch has to deal with raw sequences */
  if (rm->userawseq) {
    gt_assert(!rm->seqid2seqnum_mapping);
    *seq = gt_calloc(end - start + 1, sizeof (char));
    /* strncpy() stops at a '\0' in the raw sequence; the remainder of the
       buffer stays zeroed */
    strncpy(*seq, rm->rawseq + start - 1, (end - start + 1) * sizeof (char));
    return 0;
  }

  /* make sure that correct sequence is loaded */
  had_err = update_seq_col_if_necessary(rm, seqid, err);

  /* MD5 sequence id */
  if (!had_err) {
    if (gt_md5_seqid_has_prefix(gt_str_get(seqid))) {
      had_err = gt_seq_col_md5_to_seq(rm->seq_col, seq, start - offset,
                                      end - offset, seqid, err);
      return had_err;
    }
  }

  /* ``regular'' sequence ID */
  if (!had_err) {
    gt_assert(!rm->usedesc || rm->seqid2seqnum_mapping);
    gt_assert(rm->mapping || rm->seq_col);
    if (rm->usedesc) {
      unsigned long seqnum, filenum;
      gt_assert(rm->seqid2seqnum_mapping);
      range.start = start;
      range.end = end;
      /* besides the sequence/file number, the mapping reports the offset
         which has to be subtracted from <start> and <end> */
      had_err = gt_seqid2seqnum_mapping_map(rm->seqid2seqnum_mapping,
                                            gt_str_get(seqid), &range, &seqnum,
                                            &filenum, &offset, err);

      if (!had_err) {
        /* refuse ranges which reach beyond the end of the mapped sequence */
        if (range.end != GT_UNDEF_ULONG && range.start != GT_UNDEF_ULONG &&
              range.end >= gt_seq_col_get_sequence_length(rm->seq_col, filenum,
                                                          seqnum)
              + offset) {
          gt_error_set(err, "trying to extract range %lu-%lu on sequence "
                       "``%s'' which is not covered by that sequence (with "
                       "boundaries %lu-%lu). Has the sequence-region "
                       "to sequence mapping been defined correctly?",
                       start, end, gt_str_get(seqid),
                       range.start, range.end);
          had_err = -1;
        }
      }
      if (!had_err) {
        *seq = gt_seq_col_get_sequence(rm->seq_col, filenum, seqnum,
                                       start - offset, end - offset);
      }
    } else if (rm->matchdesc) {
      gt_assert(!rm->seqid2seqnum_mapping);
      gt_assert(rm->seq_col);
      had_err = gt_seq_col_grep_desc(rm->seq_col, seq, start - 1, end - 1,
                                     seqid, err);
    } else if (rm->useseqno) {
      unsigned long seqno = GT_UNDEF_ULONG;
      gt_assert(rm->encseq);
      /* the sequence id must look like "seq<number>" */
      if (1 != sscanf(gt_str_get(seqid), "seq%lu", &seqno)) {
        gt_error_set(err, "seqid '%s' does not have the form 'seqX' "
                          "where X is a sequence number in the encoded "
                          "sequence", gt_str_get(seqid));
        had_err = -1;
      }
      gt_assert(had_err || seqno != GT_UNDEF_ULONG);
      if (!had_err && seqno >= gt_encseq_num_of_sequences(rm->encseq)) {
          gt_error_set(err, "trying to access sequence %lu, but encoded "
                            "sequence contains only %lu sequences",
                            seqno, gt_encseq_num_of_sequences(rm->encseq));
          had_err = -1;
      }
      if (!had_err) {
        unsigned long seqlength = gt_encseq_seqlength(rm->encseq, seqno);
        if (start > seqlength || end > seqlength) {
          gt_error_set(err, "trying to extract range %lu-%lu on sequence "
                       "``%s'' which is not covered by that sequence (only "
                       "%lu characters in size). Has the sequence-region "
                       "to sequence mapping been defined correctly?",
                       start, end, gt_str_get(seqid), seqlength);
          had_err = -1;
        }
      }
      if (!had_err) {
        unsigned long seqstartpos;
        *seq = gt_calloc(end - start + 1, sizeof (char));
        /* translate the sequence-relative, 1-based coordinates into
           positions in the encoded sequence */
        seqstartpos = gt_encseq_seqstartpos(rm->encseq, seqno);
        gt_encseq_extract_decoded(rm->encseq, *seq, seqstartpos + start - 1,
                                  seqstartpos + end - 1);
      }
    } else {
      /* no special lookup configured: use the first sequence (file 0,
         sequence 0) of the collection */
      unsigned long seqlength;
      gt_assert(rm->seq_col);
      seqlength = gt_seq_col_get_sequence_length(rm->seq_col, 0, 0);
      if (start > seqlength || end > seqlength) {
        had_err = -1;
        gt_error_set(err, "trying to extract range %lu-%lu on sequence "
                     "``%s'' which is not covered by that sequence (only "
                     "%lu characters in size). Has the sequence-region "
                     "to sequence mapping been defined correctly?",
                     start, end, gt_str_get(seqid), seqlength);
      }
      if (!had_err) {
        *seq = gt_seq_col_get_sequence(rm->seq_col, 0, 0, start - offset,
                                       end - offset);
      }
    }
  }
  return had_err;
}