/* Resolve <seqid> to a (file number, sequence number) pair via
   gt_encseq_col_do_grep_desc() and, if that succeeds, store the result of
   gt_seq_col_get_sequence() for the positions <start>..<end> in <*seq>.
   <start> and <end> are forwarded unchanged.
   Returns 0 on success; otherwise the non-zero code reported by the lookup,
   in which case <*seq> is not written. */
static int gt_encseq_col_grep_desc(GtSeqCol *sc, char **seq, GtUword start,
                                   GtUword end, GtStr *seqid, GtError *err)
{
  GtUword file_idx = 0,
          seq_idx = 0;
  GtEncseqCol *esc = gt_encseq_col_cast(sc);
  int had_err;

  gt_error_check(err);
  gt_assert(esc && seq && seqid);

  had_err = gt_encseq_col_do_grep_desc(esc, &file_idx, &seq_idx, seqid, err);
  if (had_err) {
    /* lookup failed; leave <*seq> alone and pass the error code on */
    return had_err;
  }

  *seq = gt_seq_col_get_sequence(sc, file_idx, seq_idx, start, end);
  return had_err;
}
/* Fetch the part <start>..<end> of the sequence identified by <seqid> from the
   source configured in <rm> and store it in <*seq>.

   The source is selected by the flags of <rm>, checked in this order:
   raw sequence, MD5 sequence ID, description mapping (usedesc), description
   matching (matchdesc), sequence number of the form ``seqX'' (useseqno) and,
   as fallback, the first sequence of the sequence collection.

   <start> and <end> are shifted down by an offset (1 unless the
   seqid-to-seqnum mapping changes it) before they are handed to the
   underlying sequence store, so they are presumably 1-based -- confirm
   against the callers.

   In the branches which allocate here (raw sequence, useseqno) the buffer
   holds exactly <end> - <start> + 1 bytes; no extra byte for a terminator is
   reserved.
   NOTE(review): no branch checks that <start> is >= 1 or that <start> <= <end>.

   Returns 0 on success. On failure a non-zero value is returned; the checks
   performed in this function itself return -1 and set <err>. */
int gt_region_mapping_get_sequence(GtRegionMapping *rm, char **seq,
                                   GtStr *seqid, unsigned long start,
                                   unsigned long end, GtError *err)
{
  int had_err = 0;
  unsigned long offset = 1;
  GtRange mapped = {GT_UNDEF_ULONG, GT_UNDEF_ULONG};

  gt_error_check(err);
  gt_assert(rm && seq && seqid && gt_str_length(seqid) > 0);

  /* raw sequence access is served before anything else is consulted */
  if (rm->userawseq) {
    gt_assert(!rm->seqid2seqnum_mapping);
    *seq = gt_calloc(end - start + 1, sizeof (char));
    strncpy(*seq, rm->rawseq + start - 1, (end - start + 1) * sizeof (char));
    return 0;
  }

  /* make sure the sequence collection matching <seqid> is loaded */
  had_err = update_seq_col_if_necessary(rm, seqid, err);
  if (had_err) {
    return had_err;
  }

  /* MD5 sequence IDs are resolved by the sequence collection itself */
  if (gt_md5_seqid_has_prefix(gt_str_get(seqid))) {
    had_err = gt_seq_col_md5_to_seq(rm->seq_col, seq, start - offset,
                                    end - offset, seqid, err);
    return had_err;
  }

  /* from here on: ``regular'' sequence IDs */
  gt_assert(!rm->usedesc || rm->seqid2seqnum_mapping);
  gt_assert(rm->mapping || rm->seq_col);

  if (rm->usedesc) {
    unsigned long seqnum, filenum;

    gt_assert(rm->seqid2seqnum_mapping);
    mapped.start = start;
    mapped.end = end;
    /* the mapping yields file/sequence number and may adjust <offset> */
    had_err = gt_seqid2seqnum_mapping_map(rm->seqid2seqnum_mapping,
                                          gt_str_get(seqid), &mapped, &seqnum,
                                          &filenum, &offset, err);
    if (had_err) {
      return had_err;
    }
    /* the sequence length is only queried if both boundaries are defined */
    if (mapped.end != GT_UNDEF_ULONG && mapped.start != GT_UNDEF_ULONG &&
        mapped.end >= gt_seq_col_get_sequence_length(rm->seq_col, filenum,
                                                     seqnum) + offset) {
      gt_error_set(err, "trying to extract range %lu-%lu on sequence "
                        "``%s'' which is not covered by that sequence (with "
                        "boundaries %lu-%lu). Has the sequence-region "
                        "to sequence mapping been defined correctly?",
                   start, end, gt_str_get(seqid), mapped.start, mapped.end);
      had_err = -1;
      return had_err;
    }
    *seq = gt_seq_col_get_sequence(rm->seq_col, filenum, seqnum,
                                   start - offset, end - offset);
    return had_err;
  }

  if (rm->matchdesc) {
    gt_assert(!rm->seqid2seqnum_mapping);
    gt_assert(rm->seq_col);
    had_err = gt_seq_col_grep_desc(rm->seq_col, seq, start - 1, end - 1, seqid,
                                   err);
    return had_err;
  }

  if (rm->useseqno) {
    unsigned long seqno = GT_UNDEF_ULONG,
                  seq_len,
                  seq_begin;

    gt_assert(rm->encseq);
    /* the sequence ID must carry the sequence number: ``seqX'' */
    if (1 != sscanf(gt_str_get(seqid), "seq%lu", &seqno)) {
      gt_error_set(err, "seqid '%s' does not have the form 'seqX' "
                        "where X is a sequence number in the encoded "
                        "sequence", gt_str_get(seqid));
      had_err = -1;
      return had_err;
    }
    gt_assert(had_err || seqno != GT_UNDEF_ULONG);

    if (seqno >= gt_encseq_num_of_sequences(rm->encseq)) {
      gt_error_set(err, "trying to access sequence %lu, but encoded "
                        "sequence contains only %lu sequences",
                   seqno, gt_encseq_num_of_sequences(rm->encseq));
      had_err = -1;
      return had_err;
    }

    seq_len = gt_encseq_seqlength(rm->encseq, seqno);
    if (start > seq_len || end > seq_len) {
      gt_error_set(err, "trying to extract range %lu-%lu on sequence "
                        "``%s'' which is not covered by that sequence (only "
                        "%lu characters in size). Has the sequence-region "
                        "to sequence mapping been defined correctly?",
                   start, end, gt_str_get(seqid), seq_len);
      had_err = -1;
      return had_err;
    }

    /* decode relative to the start position of the sequence in the encseq */
    *seq = gt_calloc(end - start + 1, sizeof (char));
    seq_begin = gt_encseq_seqstartpos(rm->encseq, seqno);
    gt_encseq_extract_decoded(rm->encseq, *seq, seq_begin + start - 1,
                              seq_begin + end - 1);
    return had_err;
  }

  if (rm->userawseq) {
    /* NOTE(review): looks unreachable unless update_seq_col_if_necessary()
       can switch on <userawseq>, because raw sequence access already returned
       above -- confirm before removing */
    gt_assert(!rm->seqid2seqnum_mapping);
    *seq = gt_calloc(end - start + 1, sizeof (char));
    strncpy(*seq, rm->rawseq + start - 1, (end - start + 1) * sizeof (char));
    return had_err;
  }

  /* fallback: use the first sequence of the first file in the collection */
  gt_assert(rm->seq_col);
  {
    unsigned long seq_len = gt_seq_col_get_sequence_length(rm->seq_col, 0, 0);

    if (start > seq_len || end > seq_len) {
      had_err = -1;
      gt_error_set(err, "trying to extract range %lu-%lu on sequence "
                        "``%s'' which is not covered by that sequence (only "
                        "%lu characters in size). Has the sequence-region "
                        "to sequence mapping been defined correctly?",
                   start, end, gt_str_get(seqid), seq_len);
      return had_err;
    }
    *seq = gt_seq_col_get_sequence(rm->seq_col, 0, 0, start - offset,
                                   end - offset);
  }
  return had_err;
}