/*
 * Locate the sequence whose description matches <seqid> (via grep_desc())
 * and store the range <start>-<end> of that sequence in <*seq>.
 *
 * Both <start> and <end> must be valid positions inside the sequence, i.e.
 * strictly smaller than its length; otherwise <err> is set and -1 is
 * returned. If grep_desc() fails, its error code is returned unchanged and
 * <*seq> is left untouched. Returns 0 on success.
 *
 * NOTE(review): <*seq> is produced by gt_bioseq_get_sequence_range();
 * presumably the caller owns the returned buffer -- confirm against that
 * function's contract.
 */
static int gt_bioseq_col_grep_desc(GtSeqCol *sc, char **seq,
                                   GtUword start, GtUword end,
                                   GtStr *seqid, GtError *err)
{
  GtUword filenum = 0, seqnum = 0, seqlength;
  int had_err;
  GtBioseqCol *bsc;
  bsc = gt_bioseq_col_cast(sc);
  gt_error_check(err);
  gt_assert(bsc && seq && seqid);
  had_err = grep_desc(bsc, &filenum, &seqnum, seqid, err);
  if (!had_err) {
    seqlength = gt_bioseq_get_sequence_length(bsc->bioseqs[filenum], seqnum);
    /* Compare against seqlength itself instead of seqlength - 1: GtUword is
       unsigned, so for an empty sequence "seqlength - 1" would wrap around
       to the maximum value and every range would be accepted. */
    if (start >= seqlength || end >= seqlength) {
      had_err = -1;
      gt_error_set(err, "trying to extract range "GT_WU"-"GT_WU" on sequence "
                        "``%s'' which is not covered by that sequence (only "
                        ""GT_WU" characters in size). Has the sequence-region "
                        "to sequence mapping been defined correctly?",
                   start, end, gt_str_get(seqid), seqlength);
    }
  }
  if (!had_err) {
    *seq = gt_bioseq_get_sequence_range(bsc->bioseqs[filenum], seqnum, start,
                                        end);
  }
  return had_err;
}
/*
 * Return the range <start>-<end> of sequence <seqnum> stored in the
 * <filenum>-th bioseq of the collection <sc>.
 *
 * <filenum> must be smaller than the number of sequence files held by the
 * collection (checked by assertion). The range itself is passed on to
 * gt_bioseq_get_sequence_range() without further validation here.
 */
static char* gt_bioseq_col_get_sequence(const GtSeqCol *sc, GtUword filenum,
                                        GtUword seqnum, GtUword start,
                                        GtUword end)
{
  GtBioseqCol *bsc = gt_bioseq_col_cast(sc);

  gt_assert(bsc && filenum < bsc->num_of_seqfiles);

  return gt_bioseq_get_sequence_range(bsc->bioseqs[filenum],
                                      seqnum,
                                      start,
                                      end);
}
/*
 * Create a new GtSeq holding the range <start>-<end> of sequence number
 * <idx> in <bs>. The new object uses the alphabet of the underlying encoded
 * sequence and gets the description of sequence <idx>.
 *
 * Preconditions (checked by assertion): <bs> is not NULL, <idx> is a valid
 * sequence number, <start> <= <end>, and <end> is a valid position inside
 * the sequence.
 *
 * NOTE(review): the extracted buffer is handed to gt_seq_new_own(), which
 * presumably takes ownership of it -- confirm against its documentation.
 */
GtSeq* gt_bioseq_get_seq_range(GtBioseq *bs, GtUword idx,
                               GtUword start, GtUword end)
{
  GtSeq *seq;
  gt_assert(bs);
  gt_assert(idx < gt_encseq_num_of_sequences(bs->encseq));
  gt_assert(end >= start);
  /* The requested range has to lie inside the sequence. Together with
     end >= start this also guarantees that the range length
     (end - start + 1) does not exceed the sequence length. The previous
     check used '>' on the range length, which rejected every valid range
     and only let oversized ones through. */
  gt_assert(end < gt_encseq_seqlength(bs->encseq, idx));
  seq = gt_seq_new_own(gt_bioseq_get_sequence_range(bs, idx, start, end),
                       end - start + 1,
                       gt_encseq_alphabet(bs->encseq));
  gt_seq_set_description(seq, gt_bioseq_get_description(bs, idx));
  return seq;
}
/*
 * Resolve the MD5 sequence ID <md5_seqid> to a bioseq and sequence number
 * (via md5_to_index()) and store the range <start>-<end> of that sequence
 * in <*seq>.
 *
 * <md5_seqid> must carry the MD5 seqid prefix and <err> must not be NULL
 * (both checked by assertion). Returns 0 on success; if md5_to_index()
 * fails, its error code is returned and <*seq> is left untouched.
 * No range check against the sequence length is performed here.
 */
static int gt_bioseq_col_md5_to_seq(GtSeqCol *sc, char **seq,
                                    GtUword start, GtUword end,
                                    GtStr *md5_seqid, GtError *err)
{
  GtBioseqCol *bsc = gt_bioseq_col_cast(sc);
  GtBioseq *bioseq = NULL;
  GtUword seqnum = GT_UNDEF_UWORD;
  int had_err;

  gt_error_check(err);
  gt_assert(bsc && seq && md5_seqid && err);
  gt_assert(gt_md5_seqid_has_prefix(gt_str_get(md5_seqid)));

  had_err = md5_to_index(&bioseq, &seqnum, bsc, md5_seqid, err);
  if (had_err)
    return had_err;

  /* a successful lookup must have filled in the sequence number */
  gt_assert(seqnum != GT_UNDEF_UWORD);
  *seq = gt_bioseq_get_sequence_range(bioseq, seqnum, start, end);
  return had_err;
}
/*
 * Produce the next fragment of the sequences in <shredder>.
 *
 * The fragment length is <minlength> plus, unless <minlength> equals
 * <maxlength>, a value drawn with gt_rand_max(maxlength - minlength). It is
 * shortened if it would run past the end of the current sequence. The final
 * length is stored in <*fragment_length> and <desc> is overwritten with the
 * description of the current sequence.
 *
 * Afterwards the shredder state advances: to the start of the next sequence
 * if the fragment reached the end of the current one, otherwise forward by
 * the fragment length minus the overlap, but by at least one position.
 *
 * Returns the fragment as obtained from gt_bioseq_get_sequence_range(), or
 * NULL (leaving <*fragment_length> and <desc> untouched) once all sequences
 * have been processed.
 */
static char* generate_fragment(GtShredder *shredder,
                               unsigned long *fragment_length, GtStr *desc)
{
  unsigned long seqlen, fraglen;
  char *frag;

  gt_assert(shredder && fragment_length);

  /* nothing left to shred */
  if (shredder->seqnum >= gt_bioseq_number_of_sequences(shredder->bioseq))
    return NULL;

  seqlen = gt_bioseq_get_sequence_length(shredder->bioseq, shredder->seqnum);

  /* only draw a random number if there is a real length interval */
  if (shredder->maxlength == shredder->minlength)
    fraglen = shredder->minlength;
  else
    fraglen = gt_rand_max(shredder->maxlength - shredder->minlength)
              + shredder->minlength;
  gt_assert(fraglen >= shredder->minlength);

  /* clip the fragment at the end of the sequence */
  if (shredder->pos + fraglen > seqlen)
    fraglen = seqlen - shredder->pos;
  *fragment_length = fraglen;

  gt_str_reset(desc);
  gt_str_append_cstr(desc, gt_bioseq_get_description(shredder->bioseq,
                                                     shredder->seqnum));

  gt_assert(shredder->pos + fraglen <= seqlen);
  /* NOTE(review): if fraglen is 0 at this point (e.g. an empty sequence),
     the end position "pos + fraglen - 1" wraps around -- confirm whether
     such input can occur. */
  frag = gt_bioseq_get_sequence_range(shredder->bioseq, shredder->seqnum,
                                      shredder->pos,
                                      shredder->pos + fraglen - 1);

  if (shredder->pos + fraglen == seqlen) {
    /* last fragment of this sequence: continue with the next one */
    shredder->seqnum++;
    shredder->pos = 0;
  }
  else if (fraglen > shredder->overlap) {
    shredder->pos += fraglen - shredder->overlap;
  }
  else {
    /* go at least one base further each step */
    shredder->pos++;
  }

  return frag;
}