/* Select the entry of <seqid_info> that matches <inrange>.

   The entries are scanned in array order and the first one is taken whose
   description range is either undefined (its end equals GT_UNDEF_UWORD) or
   contains <inrange>. On a match, the entry's sequence number, file number and
   description range are stored in <seqnum>, <filenum> and <outrange> and 0 is
   returned. If no entry matches, <err> is set and -1 is returned.

   All pointer arguments except <filename> (unused here) and <err> must be
   non-NULL. <seqid> is only used for the error message. */
static int seqid_info_get(SeqidInfo *seqid_info, GtUword *seqnum,
                          GtUword *filenum, GtRange *outrange,
                          const GtRange *inrange,
                          GT_UNUSED const char *filename, const char *seqid,
                          GtError *err)
{
  SeqidInfoElem *seqid_info_elem;
  GtUword i;
  gt_error_check(err);
  /* <filenum> is written unconditionally below, so it is part of the
     precondition just like <seqnum> and <outrange> */
  gt_assert(seqid_info && seqnum && filenum && outrange && inrange);
  for (i = 0; i < gt_array_size(seqid_info); i++) {
    seqid_info_elem = gt_array_get(seqid_info, i);
    if (seqid_info_elem->descrange.end == GT_UNDEF_UWORD ||
        gt_range_contains(&seqid_info_elem->descrange, inrange)) {
      *seqnum = seqid_info_elem->seqnum;
      *filenum = seqid_info_elem->filenum;
      *outrange = seqid_info_elem->descrange;
      return 0;
    }
  }
  /* the opening delimiter was '{' before, which did not match the closing
     ')' in the reported message */
  gt_error_set(err, "cannot find a sequence with ID \"%s\" "
                    "(range " GT_WU "," GT_WU ")",
               seqid, inrange->start, inrange->end);
  return -1;
}
/* Returns true if <contain_range> is defined (its start differs from
   GT_UNDEF_ULONG) and the range of <fn> is not contained in it; returns false
   otherwise. <fn> must not be NULL. */
static bool filter_contain_range(GtFeatureNode *fn, GtRange contain_range)
{
  GtRange node_range;
  gt_assert(fn);
  node_range = gt_genome_node_get_range((GtGenomeNode*) fn);
  return contain_range.start != GT_UNDEF_ULONG &&
         !gt_range_contains(&contain_range, &node_range);
}
/* Returns true if the genomic range of spliced alignment <sa_1> contains the
   genomic range of <sa_2> and both alignments are compatible (as decided by
   compatible()); false otherwise. <csa> must not be NULL. */
static bool contains(const ConsensusSA *csa, unsigned long sa_1,
                     unsigned long sa_2)
{
  GtRange outer, inner;
  gt_assert(csa);
  /* fetch both genomic ranges first, in the same order as before */
  outer = extract_genomic_range(csa, sa_1);
  inner = extract_genomic_range(csa, sa_2);
  /* compatibility is only evaluated when the containment test succeeds */
  return gt_range_contains(&outer, &inner) && compatible(csa, sa_1, sa_2);
}
/* Map <seqid> (restricted to <inrange>) to a sequence number and file number.

   The most recent successful lookup is cached in <mapping>: if <seqid> equals
   the cached ID and the cached range is undefined (end == GT_UNDEF_UWORD) or
   contains <inrange>, the cached values are returned directly. Otherwise the
   ID is looked up in the mapping's hashmap and the matching entry is selected
   via seqid_info_get(); the result then replaces the cache content.

   On success <*seqnum> and <*filenum> are set, <*offset> (if <offset> is not
   NULL) receives the start of the matched range, and 0 is returned. On
   failure <err> is set and -1 is returned.

   <mapping>, <seqid>, <seqnum> and <filenum> must be non-NULL. */
int gt_seqid2seqnum_mapping_map(GtSeqid2SeqnumMapping *mapping,
                                const char *seqid, const GtRange *inrange,
                                GtUword *seqnum, GtUword *filenum,
                                GtUword *offset, GtError *err)
{
  SeqidInfo *seqid_info;
  GtRange outrange;
  gt_error_check(err);
  /* <filenum> is written on every successful path (cache hit and miss), so
     it has to be checked together with <seqnum> */
  gt_assert(mapping && seqid && seqnum && filenum);

  /* try to answer request from cache */
  if (mapping->cached_seqid && !strcmp(seqid, mapping->cached_seqid) &&
      (mapping->cached_range.end == GT_UNDEF_UWORD ||
       gt_range_contains(&mapping->cached_range, inrange))) {
    *seqnum = mapping->cached_seqnum;
    *filenum = mapping->cached_filenum;
    if (offset)
      *offset = mapping->cached_range.start;
    return 0;
  }

  /* cache miss -> regular mapping */
  if (!(seqid_info = gt_hashmap_get(mapping->map, seqid))) {
    gt_error_set(err, "no sequence with ID \"%s\" found in input sequence(s)",
                 seqid);
    return -1;
  }

  /* get results from seqid info */
  if (seqid_info_get(seqid_info, seqnum, filenum, &outrange, inrange,
                     mapping->filename, seqid, err)) {
    return -1;
  }

  /* report offset */
  if (offset)
    *offset = outrange.start;

  /* store result in cache; the key stored in the map is remembered rather
     than the caller's <seqid> pointer (presumably because the caller's string
     may not outlive this call -- confirm against callers) */
  mapping->cached_seqid = gt_hashmap_get_key(mapping->map, seqid);
  gt_assert(mapping->cached_seqid);
  mapping->cached_seqnum = *seqnum;
  mapping->cached_filenum = *filenum;
  mapping->cached_range = outrange;
  return 0;
}
/* Attach an ORF feature for <orf_rng> below a suitable node of the feature
   tree rooted at <gf>.

   Both ends of <orf_rng> are incremented by one before use (presumably a
   conversion from 0-based to 1-based coordinates -- confirm against caller).
   All nodes reachable from <gf> are visited; among those whose type differs
   from GT_ORF_TYPE and whose range contains the ORF range, the one visited
   last becomes the parent. If such a parent exists, a new GT_ORF_TYPE feature
   with source GT_ORF_FINDER_TAG, the given <strand> and a "frame" attribute
   holding <orf_frame> is added to it as a child. If no node qualifies,
   nothing is attached. <err> is unused. */
static void orf_attach_results_to_gff3(GtFeatureNode *gf, GtRange orf_rng,
                                       unsigned int orf_frame, GtStrand strand,
                                       GT_UNUSED GtError *err)
{
  GtGenomeNode *child;
  GtStr *tag;
  GtFeatureNodeIterator *gfi;
  GtFeatureNode *curnode = NULL, *parent_node = NULL;
  GtRange gfi_range;
  /* large enough for any 32-bit unsigned value plus the terminator */
  char frame_buf[16];

  tag = gt_str_new_cstr(GT_ORF_FINDER_TAG);
  orf_rng.start++;
  orf_rng.end++;

  /* <orf_frame> is unsigned, hence %u; snprintf() bounds the write so an
     unexpectedly large frame value cannot overrun the buffer */
  (void) snprintf(frame_buf, sizeof frame_buf, "%u", orf_frame);

  gfi = gt_feature_node_iterator_new(gf);
  while ((curnode = gt_feature_node_iterator_next(gfi))) {
    /* never nest an ORF below another ORF feature */
    if (strcmp(gt_feature_node_get_type(curnode),
               (const char*) GT_ORF_TYPE) != 0) {
      gfi_range = gt_genome_node_get_range((GtGenomeNode*) curnode);
      if (gt_range_contains(&gfi_range, &orf_rng)) {
        /* later matches override earlier ones */
        parent_node = curnode;
      }
    }
  }

  if (parent_node) {
    child = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*) gf),
                                GT_ORF_TYPE, orf_rng.start, orf_rng.end,
                                strand);
    gt_feature_node_set_source((GtFeatureNode*) child, tag);
    gt_feature_node_set_attribute((GtFeatureNode*) child, "frame", frame_buf);
    gt_feature_node_add_child(parent_node, (GtFeatureNode*) child);
  }

  gt_str_delete(tag);
  gt_feature_node_iterator_delete(gfi);
}
/* Select the entry of <seqid_info> that matches <inrange>.

   The entries are scanned in array order and the first one is taken whose
   description range is either undefined (its end equals GT_UNDEF_ULONG) or
   contains <inrange>. On a match, the entry's sequence number, file number and
   description range are stored in <seqnum>, <filenum> and <outrange> and 0 is
   returned. If no entry matches, <err> is set (naming <seqid>, <inrange> and
   <filename>) and -1 is returned.

   <seqid_info>, <seqnum>, <filenum>, <outrange> and <inrange> must be
   non-NULL. */
static int seqid_info_get(SeqidInfo *seqid_info, unsigned long *seqnum,
                          unsigned long *filenum, GtRange *outrange,
                          const GtRange *inrange, const char *filename,
                          const char *seqid, GtError *err)
{
  SeqidInfoElem *seqid_info_elem;
  unsigned long i;
  gt_error_check(err);
  /* <filenum> is written unconditionally below, so it is part of the
     precondition just like <seqnum> and <outrange> */
  gt_assert(seqid_info && seqnum && filenum && outrange && inrange);
  for (i = 0; i < gt_array_size(seqid_info); i++) {
    seqid_info_elem = gt_array_get(seqid_info, i);
    if (seqid_info_elem->descrange.end == GT_UNDEF_ULONG ||
        gt_range_contains(&seqid_info_elem->descrange, inrange)) {
      *seqnum = seqid_info_elem->seqnum;
      *filenum = seqid_info_elem->filenum;
      *outrange = seqid_info_elem->descrange;
      return 0;
    }
  }
  gt_error_set(err, "cannot find sequence ID \"%s\" (with range %lu,%lu) in "
                    "sequence file \"%s\"",
               seqid, inrange->start, inrange->end, filename);
  return -1;
}
/* Build one mRNA feature node from the features collected for a single
   transcript and append it to <cinfo->mRNAs>.

   <key> is the transcript ID (a C string), <value> a non-empty GtArray of
   GtGenomeNode pointers belonging to that transcript and <data> the
   ConstructionInfo. The parameter layout is that of a key/value visitor
   callback (presumably invoked once per transcript -- confirm against the
   caller).

   Processing steps:
   1. Every node flagged with GTF_PARSER_STOP_CODON_FLAG is merged into a CDS
      node of the same transcript that directly flanks it (the CDS range is
      extended) and is then removed from the array and released. A stop codon
      that lies inside a CDS, or that has no flanking CDS, yields a warning if
      <cinfo->tidy> is set and an error otherwise.
   2. The ranges of the remaining nodes are joined to the mRNA range; all
      nodes must agree in strand and seqid.
   3. A new mRNA node carrying "ID", "transcript_id" and, if a non-empty name
      is known for the transcript, a name attribute is created; the remaining
      nodes become its children (each with a new reference).

   Returns 0 on success; on failure <err> is set and -1 is returned. */
static int construct_mRNAs(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtArray *gt_genome_node_array = (GtArray*) value,
          *mRNAs = (GtArray*) cinfo->mRNAs;
  GtGenomeNode *mRNA_node, *first_node, *gn;
  const char *tname;
  GtStrand mRNA_strand;
  GtRange mRNA_range;
  GtStr *mRNA_seqid;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);
  /* at least one node in array */
  gt_assert(gt_array_size(gt_genome_node_array));

  /* TODO: support discontinuous start/stop codons */
  /* Merge stop codons into their flanking CDS. The index is advanced only
     when the current element stays in the array: after gt_array_rem() the
     following element occupies slot <i>, and an unconditional increment
     would skip it. */
  i = 0;
  while (!had_err && i < gt_array_size(gt_genome_node_array)) {
    GtUword j;
    GtRange stop_codon_rng;
    bool found_cds = false;

    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    if (!gt_feature_node_get_attribute((GtFeatureNode*) gn,
                                       GTF_PARSER_STOP_CODON_FLAG)) {
      i++;
      continue;
    }

    stop_codon_rng = gt_genome_node_get_range(gn);
    for (j = 0; !had_err && j < gt_array_size(gt_genome_node_array); j++) {
      GtGenomeNode* gn2;
      GtRange this_rng;
      const char *this_type;
      gn2 = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, j);
      if (gn == gn2)
        continue;
      this_rng = gt_genome_node_get_range(gn2);
      this_type = gt_feature_node_get_type((GtFeatureNode*) gn2);
      if (this_type == gt_symbol(gt_ft_CDS)) {
        if (gt_range_contains(&this_rng, &stop_codon_rng)) {
          if (cinfo->tidy) {
            gt_warning("stop codon on line %u in file %s is contained in "
                       "CDS in line %u",
                       gt_genome_node_get_line_number(gn),
                       gt_genome_node_get_filename(gn),
                       gt_genome_node_get_line_number(gn2));
            found_cds = true;
          }
          else {
            gt_error_set(err, "stop codon on line %u in file %s is "
                              "contained in CDS in line %u",
                         gt_genome_node_get_line_number(gn),
                         gt_genome_node_get_filename(gn),
                         gt_genome_node_get_line_number(gn2));
            had_err = -1;
          }
          break;
        }
        /* stop codon directly follows the CDS: extend the CDS end */
        if (this_rng.end + 1 == stop_codon_rng.start) {
          this_rng.end = stop_codon_rng.end;
          gt_genome_node_set_range(gn2, &this_rng);
          found_cds = true;
          break;
        }
        /* stop codon directly precedes the CDS: extend the CDS start */
        if (this_rng.start == stop_codon_rng.end + 1) {
          this_rng.start = stop_codon_rng.start;
          gt_genome_node_set_range(gn2, &this_rng);
          found_cds = true;
          break;
        }
      }
    }

    if (found_cds) {
      /* the stop codon is now represented by the CDS; drop it and look at
         the element that moved into slot <i> next */
      gt_array_rem(gt_genome_node_array, i);
      gt_genome_node_delete(gn);
      continue;
    }

    if (!had_err) {
      if (cinfo->tidy) {
        gt_warning("found stop codon on line %u in file %s with no "
                   "flanking CDS, ignoring it",
                   gt_genome_node_get_line_number(gn),
                   gt_genome_node_get_filename(gn));
      }
      else {
        gt_error_set(err, "found stop codon on line %u in file %s with no "
                          "flanking CDS",
                     gt_genome_node_get_line_number(gn),
                     gt_genome_node_get_filename(gn));
        had_err = -1;
        break;
      }
    }
    i++;
  }

  /* Determine the initial range, strand and seqid of the mRNA only now, from
     the first node that is still in the array. Taking them before the stop
     codon pass would leave <first_node> pointing to a released node whenever
     element 0 was a merged stop codon, and the node that moved to index 0
     would be skipped by the loop below, which starts at 1. A node is removed
     above only if another node of the array served as its CDS, so the array
     cannot become empty. */
  gt_assert(gt_array_size(gt_genome_node_array));
  first_node = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, 0);
  mRNA_range = gt_genome_node_get_range(first_node);
  mRNA_strand = gt_feature_node_get_strand((GtFeatureNode*) first_node);
  mRNA_seqid = gt_genome_node_get_seqid(first_node);

  for (i = 1; !had_err && i < gt_array_size(gt_genome_node_array); i++) {
    GtRange range;
    GtStrand strand;
    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    range = gt_genome_node_get_range(gn);
    mRNA_range = gt_range_join(&mRNA_range, &range);
    strand = gt_feature_node_get_strand((GtFeatureNode*) gn);
    if (strand != mRNA_strand) {
      gt_error_set(err, "feature %s on line %u has strand %c, but the "
                        "parent transcript has strand %c",
                   (const char*) key,
                   gt_genome_node_get_line_number(gn),
                   GT_STRAND_CHARS[strand],
                   GT_STRAND_CHARS[mRNA_strand]);
      had_err = -1;
      break;
    }
    else {
      mRNA_strand = gt_strand_join(mRNA_strand, strand);
    }
    if (!had_err && gt_str_cmp(mRNA_seqid, gt_genome_node_get_seqid(gn))) {
      gt_error_set(err, "The features on lines %u and %u refer to different "
                   "genomic sequences (``seqname''), although they have the "
                   "same gene IDs (``gene_id'') which must be globally unique",
                   gt_genome_node_get_line_number(first_node),
                   gt_genome_node_get_line_number(gn));
      had_err = -1;
      break;
    }
  }

  if (!had_err) {
    mRNA_node = gt_feature_node_new(mRNA_seqid, gt_ft_mRNA, mRNA_range.start,
                                    mRNA_range.end, mRNA_strand);
    gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node), "ID", key);
    gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node),
                                  "transcript_id", key);

    if ((tname = gt_hashmap_get(cinfo->transcript_id_to_name_mapping,
                                (const char*) key)) &&
        strlen(tname) > 0) {
      gt_feature_node_add_attribute((GtFeatureNode*) mRNA_node, GT_GFF_NAME,
                                    tname);
    }

    /* register children */
    for (i = 0; i < gt_array_size(gt_genome_node_array); i++) {
      gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
      gt_feature_node_add_child((GtFeatureNode*) mRNA_node,
                                (GtFeatureNode*) gt_genome_node_ref(gn));
    }

    /* store the mRNA */
    gt_array_add(mRNAs, mRNA_node);
  }

  return had_err;
}