/* If the sequence ID of <gn> carries the MD5 prefix, look up the description
   belonging to it via <region_mapping>, derive a new sequence ID from that
   description with gt_regular_seqid_save() and install it on the node.
   Feature nodes are handled by running m2i_change_seqid() over the node and
   its children; any other node gets its seqid changed directly.
   Nodes without an MD5 seqid are left untouched.
   Returns 0 on success, otherwise the non-zero status reported by the
   description lookup or by the traversal. */
static int md5_to_seqid(GtGenomeNode *gn, GtRegionMapping *region_mapping,
                        GtError *err)
{
  GtStr *old_seqid,
        *description,
        *regular_seqid;
  int rval;
  gt_error_check(err);
  gt_assert(gn && region_mapping);

  old_seqid = gt_genome_node_get_seqid(gn);
  if (!gt_md5_seqid_has_prefix(gt_str_get(old_seqid)))
    return 0; /* not an MD5 seqid -> nothing to change */

  description = gt_str_new();
  rval = gt_region_mapping_get_description(region_mapping, description,
                                           old_seqid, err);
  if (rval) {
    gt_str_delete(description);
    return rval;
  }

  regular_seqid = gt_str_new();
  gt_regular_seqid_save(regular_seqid, description);
  if (gt_feature_node_try_cast(gn)) {
    /* feature node -> change the seqid while traversing its children */
    M2IChangeSeqidInfo info;
    info.new_seqid = regular_seqid;
    info.region_mapping = region_mapping;
    rval = gt_feature_node_traverse_children((GtFeatureNode*) gn, &info,
                                             m2i_change_seqid, true, err);
  }
  else
    gt_genome_node_change_seqid(gn, regular_seqid);

  gt_str_delete(regular_seqid);
  gt_str_delete(description);
  return rval;
}
/* Determines the length of the sequence that <seqid> refers to in <rm> and
   stores it in <length>.
   The lookup follows the configuration of <rm>, in this order: raw sequence,
   MD5 sequence ID, description-based mapping (usedesc), description matching
   (matchdesc), sequence numbers of the form 'seqX' (useseqno) and finally the
   first sequence of the sequence collection.
   Returns 0 on success and a non-zero value otherwise. */
int gt_region_mapping_get_sequence_length(GtRegionMapping *rm,
                                          unsigned long *length, GtStr *seqid,
                                          GtError *err)
{
  unsigned long filenum, seqnum;
  /* the mapper receives a pointer to this range, so it must not be left
     indeterminate; start out "undefined" like
     gt_region_mapping_get_sequence() does */
  GtRange range = {GT_UNDEF_ULONG, GT_UNDEF_ULONG};
  int had_err;
  gt_error_check(err);
  gt_assert(rm && length && seqid);
  if (rm->userawseq) {
    /* the return value is the error status, the length goes into <length> */
    *length = rm->rawlength;
    return 0;
  }
  had_err = update_seq_col_if_necessary(rm, seqid, err);
  if (!had_err) {
    if (gt_md5_seqid_has_prefix(gt_str_get(seqid))) {
      had_err = gt_seq_col_md5_to_sequence_length(rm->seq_col, length, seqid,
                                                  err);
    }
    else if (rm->usedesc) {
      gt_assert(rm->seqid2seqnum_mapping);
      had_err = gt_seqid2seqnum_mapping_map(rm->seqid2seqnum_mapping,
                                            gt_str_get(seqid), &range, &seqnum,
                                            &filenum, NULL, err);
      if (!had_err) {
        *length = gt_seq_col_get_sequence_length(rm->seq_col, filenum,
                                                 seqnum);
      }
    }
    else if (rm->matchdesc) {
      had_err = gt_seq_col_grep_desc_sequence_length(rm->seq_col, length,
                                                     seqid, err);
    }
    else if (rm->useseqno) {
      unsigned long seqno = GT_UNDEF_ULONG;
      gt_assert(rm->encseq);
      if (1 != sscanf(gt_str_get(seqid), "seq%lu", &seqno)) {
        gt_error_set(err, "seqid '%s' does not have the form 'seqX' "
                          "where X is a sequence number in the encoded "
                          "sequence", gt_str_get(seqid));
        had_err = -1;
      }
      gt_assert(had_err || seqno != GT_UNDEF_ULONG);
      if (!had_err && seqno >= gt_encseq_num_of_sequences(rm->encseq)) {
        gt_error_set(err, "trying to access sequence %lu, but encoded "
                          "sequence contains only %lu sequences",
                     seqno, gt_encseq_num_of_sequences(rm->encseq));
        had_err = -1;
      }
      if (!had_err) {
        *length = gt_encseq_seqlength(rm->encseq, seqno);
      }
    }
    else {
      /* no special mode -> first sequence of the collection */
      *length = gt_seq_col_get_sequence_length(rm->seq_col, 0, 0);
    }
  }
  return had_err;
}
/* Resolves the MD5 sequence ID <md5_seqid> to a (bioseq, sequence number)
   pair via md5_to_index() and stores the length of that sequence in <len>.
   Returns 0 on success; if md5_to_index() fails, its status is returned and
   <len> is left untouched. */
int gt_bioseq_col_md5_to_sequence_length(GtSeqCol *sc, GtUword *len,
                                         GtStr *md5_seqid, GtError *err)
{
  GtBioseqCol *bsc = gt_bioseq_col_cast(sc);
  GtBioseq *hit = NULL;
  GtUword idx = GT_UNDEF_UWORD;
  int rval;
  gt_error_check(err);
  gt_assert(bsc && len && md5_seqid && err);
  gt_assert(gt_md5_seqid_has_prefix(gt_str_get(md5_seqid)));

  rval = md5_to_index(&hit, &idx, bsc, md5_seqid, err);
  if (rval)
    return rval;

  gt_assert(idx != GT_UNDEF_UWORD);
  *len = gt_bioseq_get_sequence_length(hit, idx);
  return 0;
}
/* Resolves the MD5 sequence ID <md5_seqid> to a (bioseq, sequence number)
   pair via md5_to_index() and appends the description of that sequence to
   <desc>.
   Returns 0 on success; if md5_to_index() fails, its status is returned and
   <desc> is left untouched. */
static int gt_bioseq_col_md5_to_description(GtSeqCol *sc, GtStr *desc,
                                            GtStr *md5_seqid, GtError *err)
{
  GtBioseqCol *bsc = gt_bioseq_col_cast(sc);
  GtBioseq *hit = NULL;
  GtUword idx = GT_UNDEF_UWORD;
  int rval;
  gt_error_check(err);
  gt_assert(bsc && desc && md5_seqid && err);
  gt_assert(gt_md5_seqid_has_prefix(gt_str_get(md5_seqid)));

  rval = md5_to_index(&hit, &idx, bsc, md5_seqid, err);
  if (rval)
    return rval;

  gt_assert(idx != GT_UNDEF_UWORD);
  gt_str_append_cstr(desc, gt_bioseq_get_description(hit, idx));
  return 0;
}
/* Resolves the MD5 sequence ID <md5_seqid> to a (bioseq, sequence number)
   pair via md5_to_index() and stores in <seq> whatever
   gt_bioseq_get_sequence_range() returns for the positions <start>..<end>
   of that sequence.
   Returns 0 on success; if md5_to_index() fails, its status is returned and
   <seq> is left untouched. */
static int gt_bioseq_col_md5_to_seq(GtSeqCol *sc, char **seq,
                                    GtUword start, GtUword end,
                                    GtStr *md5_seqid, GtError *err)
{
  GtBioseqCol *bsc = gt_bioseq_col_cast(sc);
  GtBioseq *hit = NULL;
  GtUword idx = GT_UNDEF_UWORD;
  int rval;
  gt_error_check(err);
  gt_assert(bsc && seq && md5_seqid && err);
  gt_assert(gt_md5_seqid_has_prefix(gt_str_get(md5_seqid)));

  rval = md5_to_index(&hit, &idx, bsc, md5_seqid, err);
  if (rval)
    return rval;

  gt_assert(idx != GT_UNDEF_UWORD);
  *seq = gt_bioseq_get_sequence_range(hit, idx, start, end);
  return 0;
}
/* Looks up the MD5 sequence ID <md5_seqid> in the MD5 table of the encoded
   sequence collection <sc> and stores the length of the matching sequence in
   <len>.
   Returns 0 on success. If the table has no entry for the ID, <err> is set,
   -1 is returned and <len> is left untouched. */
int gt_encseq_col_md5_to_sequence_length(GtSeqCol *sc, GtUword *len,
                                         GtStr *md5_seqid, GtError *err)
{
  GtEncseqCol *esc = gt_encseq_col_cast(sc);
  GtUword idx;
  gt_error_check(err);
  gt_assert(esc && len && md5_seqid && err);
  gt_assert(gt_md5_seqid_has_prefix(gt_str_get(md5_seqid)));

  /* the lookup key is everything following the MD5 prefix.
     NOTE(review): gt_encseq_col_md5_to_seq() and
     gt_encseq_col_md5_to_description() cut the key down to the bare hash
     before the lookup -- confirm that gt_md5_tab_map() copes with the
     untruncated key used here */
  idx = gt_md5_tab_map(esc->md5_tab,
                       gt_str_get(md5_seqid) + GT_MD5_SEQID_PREFIX_LEN);
  if (idx == GT_UNDEF_UWORD) {
    gt_error_set(err, "sequence %s not found", gt_str_get(md5_seqid));
    return -1;
  }

  gt_assert(idx < gt_encseq_num_of_sequences(esc->encseq));
  *len = gt_encseq_seqlength(esc->encseq, idx);
  return 0;
}
/* Extracts the positions <start>..<end> (relative to the sequence start) of
   the sequence identified by the MD5 sequence ID <md5_seqid> from the encoded
   sequence collection <sc>. The decoded characters are stored in a newly
   allocated buffer of end - start + 1 bytes which is returned in <seq>; no
   room for a terminator is added.
   Returns 0 on success. If the ID is long enough to carry a separator after
   the hash but does not, or if the hash is not contained in the MD5 table,
   <err> is set, -1 is returned and <seq> is left untouched. */
static int gt_encseq_col_md5_to_seq(GtSeqCol *sc, char **seq,
                                    GtUword start, GtUword end,
                                    GtStr *md5_seqid, GtError *err)
{
  GtUword seqnum = GT_UNDEF_UWORD;
  char seqid[GT_MD5_SEQID_HASH_LEN + 1];
  int had_err = 0;
  GtEncseqCol *esc;
  esc = gt_encseq_col_cast(sc);
  gt_error_check(err);
  gt_assert(esc && seq && start <= end && md5_seqid && err);
  gt_assert(gt_md5_seqid_has_prefix(gt_str_get(md5_seqid)));

  const char *cstrseqid = gt_str_get(md5_seqid);
  if (gt_str_length(md5_seqid) >= GT_MD5_SEQID_TOTAL_LEN &&
      cstrseqid[GT_MD5_SEQID_TOTAL_LEN-1] != GT_MD5_SEQID_SEPARATOR) {
    gt_error_set(err, "MD5 sequence id %s not terminated with '%c'",
                 gt_str_get(md5_seqid), GT_MD5_SEQID_SEPARATOR);
    had_err = -1;
  }

  /* only look anything up if the ID was accepted; otherwise the key would be
     uninitialized and the error would be reported twice */
  if (!had_err) {
    /* The key is always built, also for IDs too short to carry a separator.
       The prefix is asserted above, so the copy starts inside the string;
       strncpy() stops at its terminator and pads the rest of the key. */
    strncpy(seqid, cstrseqid + GT_MD5_SEQID_PREFIX_LEN,
            GT_MD5_SEQID_HASH_LEN);
    seqid[GT_MD5_SEQID_HASH_LEN] = '\0';
    seqnum = gt_md5_tab_map(esc->md5_tab, seqid);
    if (seqnum != GT_UNDEF_UWORD) {
      GtUword startpos = gt_encseq_seqstartpos(esc->encseq, seqnum),
              GT_UNUSED seqlength = gt_encseq_seqlength(esc->encseq, seqnum);
      *seq = gt_calloc(end - start + 1, sizeof (char));
      gt_encseq_extract_decoded(esc->encseq, (char*) *seq, startpos + start,
                                startpos + end);
    }
    else {
      gt_error_set(err, "sequence %s not found", gt_str_get(md5_seqid));
      had_err = -1;
    }
  }
  return had_err;
}
/* Appends the description of the sequence identified by the MD5 sequence ID
   <md5_seqid> in the encoded sequence collection <sc> to <desc>.
   Returns 0 on success. If the ID is long enough to carry a separator after
   the hash but does not, or if the hash is not contained in the MD5 table,
   <err> is set, -1 is returned and <desc> is left untouched. */
static int gt_encseq_col_md5_to_description(GtSeqCol *sc, GtStr *desc,
                                            GtStr *md5_seqid, GtError *err)
{
  GtUword seqnum = GT_UNDEF_UWORD;
  char seqid[GT_MD5_SEQID_HASH_LEN + 1];
  int had_err = 0;
  GtEncseqCol *esc;
  esc = gt_encseq_col_cast(sc);
  gt_error_check(err);
  gt_assert(esc && desc && md5_seqid && err);
  gt_assert(gt_md5_seqid_has_prefix(gt_str_get(md5_seqid)));

  const char *cstrseqid = gt_str_get(md5_seqid);
  if (gt_str_length(md5_seqid) >= GT_MD5_SEQID_TOTAL_LEN &&
      cstrseqid[GT_MD5_SEQID_TOTAL_LEN-1] != GT_MD5_SEQID_SEPARATOR) {
    gt_error_set(err, "MD5 sequence id %s not terminated with '%c'",
                 gt_str_get(md5_seqid), GT_MD5_SEQID_SEPARATOR);
    had_err = -1;
  }

  /* only look anything up if the ID was accepted; otherwise the key would be
     uninitialized and the error would be reported twice */
  if (!had_err) {
    /* The key is always built, also for IDs too short to carry a separator.
       The prefix is asserted above, so the copy starts inside the string;
       strncpy() stops at its terminator and pads the rest of the key. */
    strncpy(seqid, cstrseqid + GT_MD5_SEQID_PREFIX_LEN,
            GT_MD5_SEQID_HASH_LEN);
    seqid[GT_MD5_SEQID_HASH_LEN] = '\0';
    seqnum = gt_md5_tab_map(esc->md5_tab, seqid);
    if (seqnum != GT_UNDEF_UWORD) {
      const char *cdesc;
      GtUword desc_len;
      gt_assert(seqnum < gt_encseq_num_of_sequences(esc->encseq));
      cdesc = gt_encseq_description(esc->encseq, &desc_len, seqnum);
      /* the description is appended with an explicit length */
      gt_str_append_cstr_nt(desc, cdesc, desc_len);
    }
    else {
      gt_error_set(err, "sequence %s not found", gt_str_get(md5_seqid));
      had_err = -1;
    }
  }
  return had_err;
}
/* Returns the MD5 fingerprint of the sequence that <seqid> refers to in <rm>,
   or NULL if no fingerprint could be determined.
   Raw sequence mappings and MD5 sequence IDs are not implemented here (both
   are asserted against).
   In usedesc mode <range> and <offset> are handed to the seqid-to-seqnum
   mapping; in all other modes <offset> is set to 1. */
const char* gt_region_mapping_get_md5_fingerprint(GtRegionMapping *rm,
                                                  GtStr *seqid,
                                                  const GtRange *range,
                                                  unsigned long *offset,
                                                  GtError *err)
{
  const char *md5 = NULL;
  int had_err;
  unsigned long filenum, seqnum;
  gt_error_check(err);
  gt_assert(rm && seqid);
  gt_assert(!rm->userawseq); /* not implemented */
  gt_assert(!gt_md5_seqid_has_prefix(gt_str_get(seqid))); /* not implemented */
  had_err = update_seq_col_if_necessary(rm, seqid, err);
  if (!had_err) {
    if (rm->usedesc) {
      gt_assert(rm->seqid2seqnum_mapping);
      had_err = gt_seqid2seqnum_mapping_map(rm->seqid2seqnum_mapping,
                                            gt_str_get(seqid), range, &seqnum,
                                            &filenum, offset, err);
      if (!had_err)
        md5 = gt_seq_col_get_md5_fingerprint(rm->seq_col, filenum, seqnum);
    }
    else if (rm->matchdesc) {
      /* make sure a sequence collection exists before grepping in it */
      if (!rm->seq_col) {
        if (rm->encseq) {
          if (!(rm->seq_col = gt_encseq_col_new(rm->encseq, err)))
            had_err = -1;
        }
        else {
          if (!(rm->seq_col = gt_bioseq_col_new(rm->sequence_filenames, err)))
            had_err = -1;
        }
      }
      /* the status is deliberately dropped here: <md5> is only what
         gt_seq_col_grep_desc_md5() stores in it */
      if (!had_err)
        (void) gt_seq_col_grep_desc_md5(rm->seq_col, &md5, seqid, err);
      *offset = 1;
    }
    else if (rm->useseqno) {
      GtMD5Tab *tab = NULL;
      unsigned long seqno = GT_UNDEF_ULONG;
      gt_assert(rm->encseq);
      if (1 != sscanf(gt_str_get(seqid), "seq%lu", &seqno)) {
        gt_error_set(err, "seqid '%s' does not have the form 'seqX' "
                          "where X is a sequence number in the encoded "
                          "sequence", gt_str_get(seqid));
        had_err = -1;
      }
      gt_assert(had_err || seqno != GT_UNDEF_ULONG);
      if (!had_err && seqno >= gt_encseq_num_of_sequences(rm->encseq)) {
        /* note the blank after "encoded": the two literals are concatenated */
        gt_error_set(err, "trying to access sequence %lu, but encoded "
                          "sequence contains only %lu sequences",
                     seqno, gt_encseq_num_of_sequences(rm->encseq));
        had_err = -1;
      }
      if (!had_err) {
        tab = gt_encseq_get_md5_tab(rm->encseq, err);
        if (!tab)
          had_err = -1;
      }
      *offset = 1;
      if (!had_err)
        return gt_md5_tab_get(tab, seqno);
      else
        return NULL;
    }
    else {
      /* no special mode -> first sequence of the collection */
      md5 = gt_seq_col_get_md5_fingerprint(rm->seq_col, 0, 0);
      *offset = 1;
    }
  }
  return md5;
}
/* Appends the description of the sequence that <seqid> refers to in <rm> to
   <desc>.
   For raw sequence mappings the fixed text "<rawseq>" is appended. MD5
   sequence IDs are resolved through the sequence collection. All other IDs
   are resolved according to the mode of <rm>: usedesc, useseqno ('seqX'
   IDs), matchdesc or, by default, the first sequence of the collection.
   Returns 0 on success and a non-zero value otherwise. */
int gt_region_mapping_get_description(GtRegionMapping *rm, GtStr *desc,
                                      GtStr *seqid, GtError *err)
{
  int had_err = 0;
  gt_error_check(err);
  gt_assert(rm && desc && seqid);
  if (rm->userawseq) {
    gt_str_append_cstr(desc, "<rawseq>");
    return 0;
  }
  had_err = update_seq_col_if_necessary(rm, seqid, err);
  if (had_err)
    return had_err;

  /* MD5 sequence id -- only this case returns early; regular sequence IDs
     must fall through to the mode-specific lookups below */
  if (gt_md5_seqid_has_prefix(gt_str_get(seqid)))
    return gt_seq_col_md5_to_description(rm->seq_col, desc, seqid, err);

  /* ``regular'' sequence ID */
  if (rm->usedesc) {
    unsigned long filenum, seqnum;
    gt_assert(rm->seqid2seqnum_mapping);
    had_err = gt_seqid2seqnum_mapping_map(rm->seqid2seqnum_mapping,
                                          gt_str_get(seqid), NULL, &seqnum,
                                          &filenum, NULL, err);
    if (!had_err) {
      char *cdesc;
      cdesc = gt_seq_col_get_description(rm->seq_col, filenum, seqnum);
      gt_assert(cdesc);
      gt_str_append_cstr(desc, cdesc);
      gt_free(cdesc); /* the returned description is freed here */
    }
  }
  else if (rm->useseqno) {
    unsigned long seqno = GT_UNDEF_ULONG;
    gt_assert(rm->encseq);
    if (1 != sscanf(gt_str_get(seqid), "seq%lu", &seqno)) {
      gt_error_set(err, "seqid '%s' does not have the form 'seqX' "
                        "where X is a sequence number in the encoded "
                        "sequence", gt_str_get(seqid));
      had_err = -1;
    }
    gt_assert(had_err || seqno != GT_UNDEF_ULONG);
    if (!had_err && seqno >= gt_encseq_num_of_sequences(rm->encseq)) {
      /* note the blank after "encoded": the two literals are concatenated */
      gt_error_set(err, "trying to access sequence %lu, but encoded "
                        "sequence contains only %lu sequences",
                   seqno, gt_encseq_num_of_sequences(rm->encseq));
      had_err = -1;
    }
    if (!had_err) {
      unsigned long desclen;
      const char *edesc;
      edesc = gt_encseq_description(rm->encseq, &desclen, seqno);
      gt_str_append_cstr_nt(desc, edesc, desclen);
    }
  }
  else if (rm->matchdesc) {
    const char *md5;
    /* XXX: not beautiful, but works -- this may be LOTS faster */
    had_err = gt_seq_col_grep_desc_md5(rm->seq_col, &md5, seqid, err);
    if (!had_err) {
      /* NOTE(review): confirm that the value stored by
         gt_seq_col_grep_desc_md5() has the form
         gt_seq_col_md5_to_description() expects as an MD5 seqid */
      GtStr *md5_seqid = gt_str_new_cstr(md5);
      had_err = gt_seq_col_md5_to_description(rm->seq_col, desc, md5_seqid,
                                              err);
      gt_str_delete(md5_seqid);
    }
  }
  else {
    /* no special mode -> first sequence of the collection */
    char *cdesc;
    cdesc = gt_seq_col_get_description(rm->seq_col, 0, 0);
    gt_assert(cdesc);
    gt_str_append_cstr(desc, cdesc);
    gt_free(cdesc);
  }
  return had_err;
}
/* Extracts the part <start>..<end> of the sequence that <seqid> refers to in
   <rm> and returns it in <seq>. In the branches implemented here the result
   is a buffer of end - start + 1 characters without room for a terminator;
   the remaining branches return what the sequence collection delivers.
   <start> and <end> are shifted down by one (or by the offset reported by the
   seqid-to-seqnum mapping in usedesc mode) before the sequence is accessed.
   The lookup follows the configuration of <rm>, in this order: raw sequence,
   MD5 sequence ID, usedesc, matchdesc, useseqno ('seqX' IDs) and finally the
   first sequence of the collection.
   Returns 0 on success and a non-zero value otherwise. */
int gt_region_mapping_get_sequence(GtRegionMapping *rm, char **seq,
                                   GtStr *seqid, unsigned long start,
                                   unsigned long end, GtError *err)
{
  int had_err = 0;
  unsigned long offset = 1;
  GtRange range = {GT_UNDEF_ULONG, GT_UNDEF_ULONG};
  gt_error_check(err);
  gt_assert(rm && seq && seqid && gt_str_length(seqid) > 0);

  /* handle rawseq access first; this is the only place raw sequences are
     served, all code below runs with rm->userawseq unset */
  if (rm->userawseq) {
    gt_assert(!rm->seqid2seqnum_mapping);
    *seq = gt_calloc(end - start + 1, sizeof (char));
    strncpy(*seq, rm->rawseq + start - 1, (end - start + 1) * sizeof (char));
    return 0;
  }

  /* make sure that correct sequence is loaded */
  had_err = update_seq_col_if_necessary(rm, seqid, err);
  if (had_err)
    return had_err;

  /* MD5 sequence id */
  if (gt_md5_seqid_has_prefix(gt_str_get(seqid))) {
    return gt_seq_col_md5_to_seq(rm->seq_col, seq, start - offset,
                                 end - offset, seqid, err);
  }

  /* ``regular'' sequence ID */
  gt_assert(!rm->usedesc || rm->seqid2seqnum_mapping);
  gt_assert(rm->mapping || rm->seq_col);
  if (rm->usedesc) {
    unsigned long seqnum, filenum;
    gt_assert(rm->seqid2seqnum_mapping);
    range.start = start;
    range.end = end;
    had_err = gt_seqid2seqnum_mapping_map(rm->seqid2seqnum_mapping,
                                          gt_str_get(seqid), &range, &seqnum,
                                          &filenum, &offset, err);
    if (!had_err) {
      /* the requested end must not lie behind the mapped sequence */
      if (range.end != GT_UNDEF_ULONG && range.start != GT_UNDEF_ULONG &&
          range.end >= gt_seq_col_get_sequence_length(rm->seq_col, filenum,
                                                      seqnum) + offset) {
        gt_error_set(err, "trying to extract range %lu-%lu on sequence "
                          "``%s'' which is not covered by that sequence (with "
                          "boundaries %lu-%lu). Has the sequence-region "
                          "to sequence mapping been defined correctly?",
                     start, end, gt_str_get(seqid), range.start, range.end);
        had_err = -1;
      }
    }
    if (!had_err) {
      *seq = gt_seq_col_get_sequence(rm->seq_col, filenum, seqnum,
                                     start - offset, end - offset);
    }
  }
  else if (rm->matchdesc) {
    gt_assert(!rm->seqid2seqnum_mapping);
    gt_assert(rm->seq_col);
    had_err = gt_seq_col_grep_desc(rm->seq_col, seq, start - 1, end - 1,
                                   seqid, err);
  }
  else if (rm->useseqno) {
    unsigned long seqno = GT_UNDEF_ULONG;
    gt_assert(rm->encseq);
    if (1 != sscanf(gt_str_get(seqid), "seq%lu", &seqno)) {
      gt_error_set(err, "seqid '%s' does not have the form 'seqX' "
                        "where X is a sequence number in the encoded "
                        "sequence", gt_str_get(seqid));
      had_err = -1;
    }
    gt_assert(had_err || seqno != GT_UNDEF_ULONG);
    if (!had_err && seqno >= gt_encseq_num_of_sequences(rm->encseq)) {
      gt_error_set(err, "trying to access sequence %lu, but encoded "
                        "sequence contains only %lu sequences",
                   seqno, gt_encseq_num_of_sequences(rm->encseq));
      had_err = -1;
    }
    if (!had_err) {
      unsigned long seqlength = gt_encseq_seqlength(rm->encseq, seqno);
      if (start > seqlength || end > seqlength) {
        gt_error_set(err, "trying to extract range %lu-%lu on sequence "
                          "``%s'' which is not covered by that sequence (only "
                          "%lu characters in size). Has the sequence-region "
                          "to sequence mapping been defined correctly?",
                     start, end, gt_str_get(seqid), seqlength);
        had_err = -1;
      }
    }
    if (!had_err) {
      unsigned long seqstartpos;
      *seq = gt_calloc(end - start + 1, sizeof (char));
      seqstartpos = gt_encseq_seqstartpos(rm->encseq, seqno);
      gt_encseq_extract_decoded(rm->encseq, *seq, seqstartpos + start - 1,
                                seqstartpos + end - 1);
    }
  }
  else {
    /* no special mode -> first sequence of the collection */
    unsigned long seqlength;
    gt_assert(rm->seq_col);
    seqlength = gt_seq_col_get_sequence_length(rm->seq_col, 0, 0);
    if (start > seqlength || end > seqlength) {
      had_err = -1;
      gt_error_set(err, "trying to extract range %lu-%lu on sequence "
                        "``%s'' which is not covered by that sequence (only "
                        "%lu characters in size). Has the sequence-region "
                        "to sequence mapping been defined correctly?",
                   start, end, gt_str_get(seqid), seqlength);
    }
    if (!had_err) {
      *seq = gt_seq_col_get_sequence(rm->seq_col, 0, 0, start - offset,
                                     end - offset);
    }
  }
  return had_err;
}
/* Makes sure that <rm> holds a sequence collection suitable for <seqid>.
   Without a region mapping the collection is created once, from the encoded
   sequence if there is one and from the sequence file names otherwise; in
   usedesc mode the seqid-to-seqnum mapping is additionally (re)built from the
   collection on every call.
   With a region mapping the sequence file belonging to <seqid> is looked up
   and loaded into a fresh collection, unless the previous call already asked
   for the same <seqid>.
   Returns 0 on success and -1 if a lookup or construction step failed. */
static int update_seq_col_if_necessary(GtRegionMapping *rm, GtStr *seqid,
                                       GtError *err)
{
  gt_error_check(err);
  gt_assert(rm && seqid);

  if (!rm->mapping) {
    /* no mapping -> just make sure the seqcol is loaded */
    if (!rm->seq_col) {
      if (rm->encseq)
        rm->seq_col = gt_encseq_col_new(rm->encseq, err);
      else {
        gt_assert(rm->sequence_filenames);
        rm->seq_col = gt_bioseq_col_new(rm->sequence_filenames, err);
      }
      if (!rm->seq_col)
        return -1;
    }
    if (rm->usedesc) {
      /* drop a mapping left over from an earlier call and build a new one */
      if (rm->seqid2seqnum_mapping)
        gt_seqid2seqnum_mapping_delete(rm->seqid2seqnum_mapping);
      rm->seqid2seqnum_mapping =
        gt_seqid2seqnum_mapping_new_seqcol(rm->seq_col, err);
      if (!rm->seqid2seqnum_mapping)
        return -1;
    }
    return 0;
  }

  /* for mappings, we need to load the changed sequence, if needed... */
  if (rm->sequence_file && !gt_str_cmp(rm->sequence_name, seqid))
    return 0; /* same sequence as last time -> nothing to reload */

  gt_str_delete(rm->sequence_file);
  /* ignore MD5 hashes when using region mappings: for MD5 seqids only the
     part behind the first GT_MD5_SEQID_TOTAL_LEN characters is mapped */
  rm->sequence_file =
    region_mapping_map(rm,
                       gt_md5_seqid_has_prefix(gt_str_get(seqid))
                         ? gt_str_get(seqid) + GT_MD5_SEQID_TOTAL_LEN
                         : gt_str_get(seqid),
                       err);
  if (!rm->sequence_file)
    return -1;

  /* the file name list holds exactly the newly mapped file */
  if (rm->sequence_filenames)
    gt_str_array_reset(rm->sequence_filenames);
  else
    rm->sequence_filenames = gt_str_array_new();
  gt_str_array_add(rm->sequence_filenames, rm->sequence_file);

  /* remember which seqid the loaded file belongs to */
  if (rm->sequence_name)
    gt_str_reset(rm->sequence_name);
  else
    rm->sequence_name = gt_str_new();
  gt_str_append_str(rm->sequence_name, seqid);

  /* load new seqcol */
  gt_seq_col_delete(rm->seq_col);
  rm->seq_col = gt_bioseq_col_new(rm->sequence_filenames, err);
  return rm->seq_col ? 0 : -1;
}