/* Print one FASTA entry to <outfp>, using <width> as line width.
   If <translate> is false, <sequence> is printed unchanged. Otherwise
   <sequence> is run through a GtTranslator and only the characters reported
   for frame 0 are collected and printed under the same <description>.
   Returns 0 on success and -1 if the translator ended with
   GT_TRANSLATOR_ERROR; in that case the characters collected so far are
   still printed. */
static int show_entry(GtStr *description, GtStr *sequence, bool translate,
                      GtUword width, GtFile *outfp)
{
  GtTranslatorStatus state;
  GtCodonIterator *codons;
  GtTranslator *translator;
  GtStr *aa;
  unsigned int reading_frame;
  char residue;
  int had_err = 0;

  /* untranslated output needs no further setup */
  if (!translate) {
    gt_fasta_show_entry(gt_str_get(description), gt_str_get(sequence),
                        gt_str_length(sequence), width, outfp);
    return had_err;
  }

  aa = gt_str_new();
  codons = gt_codon_iterator_simple_new(gt_str_get(sequence),
                                        gt_str_length(sequence), NULL);
  translator = gt_translator_new(codons);

  /* walk over all translated characters, keeping those of frame 0 only */
  for (state = gt_translator_next(translator, &residue, &reading_frame, NULL);
       state == GT_TRANSLATOR_OK;
       state = gt_translator_next(translator, &residue, &reading_frame,
                                  NULL)) {
    if (reading_frame == 0)
      gt_str_append_char(aa, residue);
  }
  if (state == GT_TRANSLATOR_ERROR)
    had_err = -1;

  gt_fasta_show_entry(gt_str_get(description), gt_str_get(aa),
                      gt_str_length(aa), width, outfp);

  gt_str_delete(aa);
  gt_translator_delete(translator);
  gt_codon_iterator_delete(codons);
  return had_err;
}
/* Tool runner: iterates over all sequences of the input files given after the
   parsed options, mutates each one with gt_mutate_seq() using
   <arguments->rate> and prints the result as FASTA to <arguments->outfp>
   with line width <arguments->width>.
   Returns the value of the last gt_bioseq_iterator_next() call, i.e. 0 once
   the iterator delivered no further GtBioseq and non-zero if it failed. */
static int gt_seqmutate_runner(int argc, const char **argv, int parsed_args,
                               void *tool_arguments, GtError *err)
{
  MutateArguments *arguments = tool_arguments;
  GtBioseqIterator *input;
  GtBioseq *bioseq;
  unsigned long seqnum;
  int had_err;

  gt_error_check(err);
  gt_assert(arguments);

  input = gt_bioseq_iterator_new(argc - parsed_args, argv + parsed_args);
  for (;;) {
    had_err = gt_bioseq_iterator_next(input, &bioseq, err);
    if (had_err || !bioseq)
      break;

    seqnum = 0;
    while (seqnum < gt_bioseq_number_of_sequences(bioseq)) {
      /* NOTE(review): the buffer returned by gt_bioseq_get_sequence() is not
         released here; confirm whether this API version hands ownership to
         the caller */
      GtSeq *mutant = gt_mutate_seq(gt_bioseq_get_description(bioseq, seqnum),
                                    gt_bioseq_get_sequence(bioseq, seqnum),
                                    gt_bioseq_get_sequence_length(bioseq,
                                                                  seqnum),
                                    gt_bioseq_get_alphabet(bioseq),
                                    arguments->rate);
      gt_fasta_show_entry(gt_seq_get_description(mutant),
                          gt_seq_get_orig(mutant),
                          gt_seq_length(mutant),
                          arguments->width, arguments->outfp);
      gt_seq_delete(mutant);
      seqnum++;
    }
    gt_bioseq_delete(bioseq);
  }

  gt_bioseq_iterator_delete(input);
  return had_err;
}
/* Node visitor callback for sequence nodes: emits the sequence stored in <sn>
   as a FASTA entry.
   The GFF3 version string is emitted via gff3_version_string() first. The
   FASTA directive line (GT_GFF_FASTA_DIRECTIVE) is written once per visitor,
   tracked by <fasta_directive_shown>. Output goes to the visitor's <outstr>
   if that is set, and to its <outfp> otherwise.
   Always returns 0; <err> is not written to. */
static int gff3_visitor_sequence_node(GtNodeVisitor *nv, GtSequenceNode *sn,
                                      GT_UNUSED GtError *err)
{
  GtGFF3Visitor *gff3_visitor;
  gt_error_check(err);
  /* validate the arguments before <nv> is handed to the cast below */
  gt_assert(nv && sn);
  gff3_visitor = gff3_visitor_cast(nv);
  gff3_version_string(nv);

  /* the FASTA directive must precede the first sequence only */
  if (!gff3_visitor->fasta_directive_shown) {
    if (!gff3_visitor->outstr)
      gt_file_xprintf(gff3_visitor->outfp, "%s\n", GT_GFF_FASTA_DIRECTIVE);
    else {
      gt_str_append_cstr(gff3_visitor->outstr, GT_GFF_FASTA_DIRECTIVE);
      gt_str_append_char(gff3_visitor->outstr, '\n');
    }
    gff3_visitor->fasta_directive_shown = true;
  }

  /* write the entry itself to the file or to the string buffer */
  if (!gff3_visitor->outstr) {
    gt_fasta_show_entry(gt_sequence_node_get_description(sn),
                        gt_sequence_node_get_sequence(sn),
                        gt_sequence_node_get_sequence_length(sn),
                        gff3_visitor->fasta_width, gff3_visitor->outfp);
  }
  else {
    gt_fasta_show_entry_str(gt_sequence_node_get_description(sn),
                            gt_sequence_node_get_sequence(sn),
                            gt_sequence_node_get_sequence_length(sn),
                            gff3_visitor->fasta_width, gff3_visitor->outstr);
  }
  return 0;
}
/* Callback which prints the sequence described by the TargetInfo passed in
   <data> (its <bioseq> and <seqnum> members) as a FASTA entry with width 0.
   <pos> is ignored. Always returns true. */
static bool show_target(GT_UNUSED unsigned long pos, void *data)
{
  TargetInfo *target = data;
  const char *header;

  gt_assert(target);
  header = gt_bioseq_get_description(target->bioseq, target->seqnum);
  gt_fasta_show_entry(header,
                      gt_bioseq_get_sequence(target->bioseq, target->seqnum),
                      gt_bioseq_get_sequence_length(target->bioseq,
                                                    target->seqnum),
                      0);
  return true;
}
/* Print all sequences of <bs> in FASTA format to <outfp>, using <width> as
   line width. The sequence buffer obtained for each entry is freed again
   after it has been printed. */
void gt_bioseq_show_as_fasta(GtBioseq *bs, GtUword width, GtFile *outfp)
{
  GtUword seqnum = 0;

  gt_assert(bs);
  while (seqnum < gt_bioseq_number_of_sequences(bs)) {
    char *residues = gt_bioseq_get_sequence(bs, seqnum);
    gt_fasta_show_entry(gt_bioseq_get_description(bs, seqnum),
                        residues,
                        gt_bioseq_get_sequence_length(bs, seqnum),
                        width, outfp);
    gt_free(residues);
    seqnum++;
  }
}
/* Print sequence number <seqnum> of <bs> in FASTA format to <outfp>, using
   <width> as line width. <seqnum> must be smaller than the number of
   sequences in <bs> (checked by assertion). */
void gt_bioseq_show_sequence_as_fasta(GtBioseq *bs, GtUword seqnum,
                                      GtUword width, GtFile *outfp)
{
  char *residues;

  gt_assert(bs);
  gt_assert(seqnum < gt_bioseq_number_of_sequences(bs));

  residues = gt_bioseq_get_sequence(bs, seqnum);
  gt_fasta_show_entry(gt_bioseq_get_description(bs, seqnum),
                      residues,
                      gt_bioseq_get_sequence_length(bs, seqnum),
                      width, outfp);
  /* the buffer was obtained above and is released here */
  gt_free(residues);
}
/* Tool runner: reads all sequences from the input files given after the
   parsed options and prints those passing the filter as FASTA to
   <arguments->outfp>.
   A sequence passes if its length is within [minlength, maxlength] and fewer
   than <maxseqnum> sequences have passed so far; each limit is ignored when
   set to GT_UNDEF_ULONG. A summary line is written to stderr if no error
   occurred.
   Returns the value of the last gt_bioseq_iterator_next() call (non-zero on
   failure). */
static int gt_seqfilter_runner(int argc, const char **argv, int parsed_args,
                               void *tool_arguments, GtError *err)
{
  SeqFilterArguments *arguments = tool_arguments;
  GtBioseqIterator *bsi;
  GtBioseq *bioseq;
  unsigned long i;
  unsigned long long passed = 0, filtered = 0, num_of_sequences = 0;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(tool_arguments);

  bsi = gt_bioseq_iterator_new(argc - parsed_args, argv + parsed_args);

  while (!(had_err = gt_bioseq_iterator_next(bsi, &bioseq, err)) && bioseq) {
    for (i = 0; i < gt_bioseq_number_of_sequences(bioseq); i++) {
      if ((arguments->minlength == GT_UNDEF_ULONG ||
           gt_bioseq_get_sequence_length(bioseq, i) >= arguments->minlength) &&
          (arguments->maxlength == GT_UNDEF_ULONG ||
           gt_bioseq_get_sequence_length(bioseq, i) <= arguments->maxlength) &&
          (arguments->maxseqnum == GT_UNDEF_ULONG ||
           passed + 1 <= arguments->maxseqnum)) {
        /* NOTE(review): the buffer returned by gt_bioseq_get_sequence() is
           not released here; confirm whether this API version hands
           ownership to the caller */
        gt_fasta_show_entry(gt_bioseq_get_description(bioseq, i),
                            gt_bioseq_get_sequence(bioseq, i),
                            gt_bioseq_get_sequence_length(bioseq, i),
                            arguments->width, arguments->outfp);
        passed++;
      }
      else
        filtered++;
      num_of_sequences++;
    }
    gt_bioseq_delete(bioseq);
  }

  /* show statistics */
  if (!had_err) {
    double removed_percent = 0.0;
    gt_assert(passed + filtered == num_of_sequences);
    /* avoid 0/0 (NaN) if the input did not contain any sequence */
    if (num_of_sequences > 0)
      removed_percent = ((double) filtered / num_of_sequences) * 100.0;
    fprintf(stderr,
            "# %llu out of %llu sequences have been removed (%.3f%%)\n",
            filtered, num_of_sequences, removed_percent);
  }

  gt_bioseq_iterator_delete(bsi);

  return had_err;
}
/* Translate <sequence> of the given <length> and print one FASTA entry per
   non-empty reading frame to <arguments->outfp>.
   Translated characters are collected in <translations>, indexed by the
   frame reported by the translator; entries 0..2 are printed and reset
   afterwards. Each header consists of <desc> followed by " (<frame><strand>)",
   where <frame> is 1-based and <strand> is "-" if <rev> is true and "+"
   otherwise.
   Returns 0 on success and -1 if the translator ended with
   GT_TRANSLATOR_ERROR; in that case nothing is printed and <translations>
   is not reset. */
static int gt_seqtranslate_do_translation(GtTranslateArguments *arguments,
                                          const char *sequence,
                                          GtUword length,
                                          const char *desc,
                                          GtStr **translations,
                                          bool rev,
                                          GtError *err)
{
  GtCodonIterator *codons;
  GtTranslator *translator;
  GtTranslatorStatus state;
  GtStr *header;
  unsigned int reading_frame, frameno;
  char residue;
  int had_err = 0;

  codons = gt_codon_iterator_simple_new(sequence, length, err);
  translator = gt_translator_new(codons);

  /* distribute the translated characters onto their frames */
  for (state = gt_translator_next(translator, &residue, &reading_frame, err);
       state == GT_TRANSLATOR_OK;
       state = gt_translator_next(translator, &residue, &reading_frame,
                                  err)) {
    gt_str_append_char(translations[reading_frame], residue);
  }
  gt_codon_iterator_delete(codons);
  gt_translator_delete(translator);
  if (state == GT_TRANSLATOR_ERROR)
    return -1;

  header = gt_str_new();
  for (frameno = 0; frameno < 3; frameno++) {
    GtStr *frame_translation = translations[frameno];
    if (gt_str_length(frame_translation) == 0)
      continue; /* nothing was translated in this frame */
    gt_str_append_cstr(header, desc);
    gt_str_append_cstr(header, " (");
    gt_str_append_ulong(header, frameno+1);
    if (rev)
      gt_str_append_cstr(header, "-");
    else
      gt_str_append_cstr(header, "+");
    gt_str_append_cstr(header, ")");
    gt_fasta_show_entry(gt_str_get(header),
                        gt_str_get(frame_translation),
                        gt_str_length(frame_translation),
                        arguments->fasta_width, arguments->outfp);
    /* make both buffers reusable for the next frame / next call */
    gt_str_reset(frame_translation);
    gt_str_reset(header);
  }
  gt_str_delete(header);
  return had_err;
}
/* Tool runner: for every GtBioseq read from the input files given after the
   parsed options, <arguments->coverage> shredders are created one after the
   other (configured with minlength, maxlength, overlap and sample
   probability). Every fragment a shredder delivers is printed as a FASTA
   entry with width 0, its description being extended by
   " [shreddered fragment]".
   Returns the value of the last gt_bioseq_iterator_next() call (non-zero on
   failure). */
static int gt_shredder_runner(GT_UNUSED int argc, const char **argv,
                              int parsed_args, void *tool_arguments,
                              GtError *err)
{
  GtShredderArguments *arguments = tool_arguments;
  GtBioseqIterator *input;
  GtBioseq *bioseq;
  GtStr *header;
  unsigned long round;
  int had_err;

  gt_error_check(err);
  gt_assert(arguments);

  /* init */
  header = gt_str_new();
  input = gt_bioseq_iterator_new(argc - parsed_args, argv + parsed_args);

  /* shredder */
  for (;;) {
    had_err = gt_bioseq_iterator_next(input, &bioseq, err);
    if (had_err || !bioseq)
      break;

    for (round = 0; round < arguments->coverage; round++) {
      unsigned long fragment_length;
      const char *fragment;
      GtShredder *shredder = gt_shredder_new(bioseq, arguments->minlength,
                                             arguments->maxlength);
      gt_shredder_set_overlap(shredder, arguments->overlap);
      gt_shredder_set_sample_probability(shredder,
                                         arguments->sample_probability);

      fragment = gt_shredder_shred(shredder, &fragment_length, header);
      while (fragment) {
        gt_str_append_cstr(header, " [shreddered fragment]");
        gt_fasta_show_entry(gt_str_get(header), fragment, fragment_length, 0);
        fragment = gt_shredder_shred(shredder, &fragment_length, header);
      }
      gt_shredder_delete(shredder);
    }
    gt_bioseq_delete(bioseq);
  }

  /* free */
  gt_bioseq_iterator_delete(input);
  gt_str_delete(header);

  return had_err;
}
/* Print the part of the raw sequence of <bs> between <frompos> and <topos>
   as a FASTA entry without description to <outfp>, using <width> as line
   width. The output starts at raw sequence offset <frompos> - 1 and covers
   <topos> - <frompos> + 1 characters.
   Returns -1 and sets <err> if <topos> exceeds the raw sequence length,
   0 otherwise.
   NOTE(review): <frompos> is not validated here; presumably the caller
   guarantees 1 <= frompos <= topos. */
static int extractseq_pos(GtFile *outfp, GtBioseq *bs,
                          unsigned long frompos, unsigned long topos,
                          unsigned long width, GtError *err)
{
  gt_error_check(err);
  gt_assert(bs);

  if (topos > gt_bioseq_get_raw_sequence_length(bs)) {
    gt_error_set(err,
              "argument %lu to option '-%s' is larger than sequence length %lu",
                 topos, TOPOS_OPTION_STR,
                 gt_bioseq_get_raw_sequence_length(bs));
    return -1;
  }

  gt_fasta_show_entry(NULL,
                      gt_bioseq_get_raw_sequence(bs) + frompos - 1,
                      topos - frompos + 1, width, outfp);
  return 0;
}
/* Write every sequence of the sequence file <filename> into a file of its
   own. The output file name is built as
   <splitdesc> '/' <sequence description> <suffix of filename>,
   and the sequence is written in FASTA format with line width <width>.
   <force> is passed on to gt_output_file_xopen_forcecheck().
   Returns 0 on success and -1 if the input could not be opened as GtBioseq
   or an output file could not be opened. */
static int split_description(const char *filename, GtStr *splitdesc,
                             unsigned long width, bool force, GtError *err)
{
  GtBioseq *bioseq;
  GtStr *outname;
  unsigned long seqnum = 0;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(filename && splitdesc && gt_str_length(splitdesc));

  outname = gt_str_new();
  bioseq = gt_bioseq_new(filename, err);
  if (!bioseq)
    had_err = -1;

  while (!had_err && seqnum < gt_bioseq_number_of_sequences(bioseq)) {
    GtFile *outfp;
    char *residues;

    /* assemble the name of the output file for this sequence */
    gt_str_reset(outname);
    gt_str_append_str(outname, splitdesc);
    gt_str_append_char(outname, '/');
    gt_str_append_cstr(outname, gt_bioseq_get_description(bioseq, seqnum));
    gt_str_append_cstr(outname, gt_file_suffix(filename));

    outfp = gt_output_file_xopen_forcecheck(gt_str_get(outname), "w", force,
                                            err);
    if (!outfp) {
      had_err = -1;
      break;
    }

    residues = gt_bioseq_get_sequence(bioseq, seqnum);
    gt_fasta_show_entry(gt_bioseq_get_description(bioseq, seqnum),
                        residues,
                        gt_bioseq_get_sequence_length(bioseq, seqnum),
                        width, outfp);
    gt_free(residues);
    gt_file_delete(outfp);
    seqnum++;
  }

  gt_bioseq_delete(bioseq);
  gt_str_delete(outname);

  return had_err;
}
/* Print all sequences of <bs> whose description matches <pattern> (as decided
   by gt_grep()) in FASTA format to <outfp>, using <width> as line width.
   Returns 0 on success; a non-zero result of gt_grep() stops the iteration
   and is returned. */
static int extractseq_match(GtFile *outfp, GtBioseq *bs, const char *pattern,
                            unsigned long width, GtError *err)
{
  unsigned long seqnum = 0;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(bs && pattern);

  while (!had_err && seqnum < gt_bioseq_number_of_sequences(bs)) {
    bool matched;
    const char *header = gt_bioseq_get_description(bs, seqnum);

    gt_assert(header);
    had_err = gt_grep(&matched, pattern, header, err);
    if (!had_err && matched) {
      /* NOTE(review): the buffer returned by gt_bioseq_get_sequence() is not
         released here; confirm whether this API version hands ownership to
         the caller */
      gt_fasta_show_entry(header,
                          gt_bioseq_get_sequence(bs, seqnum),
                          gt_bioseq_get_sequence_length(bs, seqnum),
                          width, outfp);
    }
    seqnum++;
  }

  return had_err;
}
/* Write the output for the protein domain hits <pdoms> of the domain named
   <pdomname>, using <desc> as FASTA description.
   Output files are looked up per domain name in the hashmaps of <ls> and
   opened (and registered there) on first use:
     <fileprefix>_pdom_<pdomname>.fas     extracted sequence (always)
     <fileprefix>_pdom_<pdomname>.ali     alignments (if write_pdom_alignments)
     <fileprefix>_pdom_<pdomname>_aa.fas  amino acids (if write_pdom_aaseqs)
   If there is more than one hit, the hits are asserted to be sorted according
   to gt_genome_node_cmp() and the array is reversed in place when the first
   hit lies on the reverse strand.
   The "pdom_alignment" and "pdom_aaseq" user data of every processed hit are
   released.
   Returns 0 on success and -1 if gt_extract_feature_sequence() failed for a
   hit; in that case no FASTA entries are written. */
static int write_pdom(GtLTRdigestFileOutStream *ls, GtArray *pdoms,
                      const char *pdomname, GT_UNUSED GtRegionMapping *rmap,
                      char *desc, GtError *err)
{
  GtFile *seqfile, *alifile = NULL, *aafile = NULL;
  GtStr *nt_seq, *aa_seq;
  GtUword idx, total_length = 0;
  int had_err = 0;

  gt_error_check(err);

  nt_seq = gt_str_new();
  aa_seq = gt_str_new();

  /* get protein domain output file */
  seqfile = (GtFile*) gt_hashmap_get(ls->pdomout_files, pdomname);
  if (!seqfile) {
    /* no file opened for this domain yet, do it */
    char path[GT_MAXFILENAMELEN];
    (void) snprintf(path, (size_t) (GT_MAXFILENAMELEN-1),
                    "%s_pdom_%s.fas", ls->fileprefix, pdomname);
    seqfile = gt_file_xopen(path, "w+");
    gt_hashmap_add(ls->pdomout_files, gt_cstr_dup(pdomname), seqfile);
  }

  /* get protein alignment output file */
  if (ls->write_pdom_alignments) {
    alifile = (GtFile*) gt_hashmap_get(ls->pdomali_files, pdomname);
    if (!alifile) {
      /* no file opened for this domain yet, do it */
      char path[GT_MAXFILENAMELEN];
      (void) snprintf(path, (size_t) (GT_MAXFILENAMELEN-1),
                      "%s_pdom_%s.ali", ls->fileprefix, pdomname);
      alifile = gt_file_xopen(path, "w+");
      gt_hashmap_add(ls->pdomali_files, gt_cstr_dup(pdomname), alifile);
    }
  }

  /* get amino acid sequence output file */
  if (ls->write_pdom_aaseqs) {
    aafile = (GtFile*) gt_hashmap_get(ls->pdomaa_files, pdomname);
    if (!aafile) {
      /* no file opened for this domain yet, do it */
      char path[GT_MAXFILENAMELEN];
      (void) snprintf(path, (size_t) (GT_MAXFILENAMELEN-1),
                      "%s_pdom_%s_aa.fas", ls->fileprefix, pdomname);
      aafile = gt_file_xopen(path, "w+");
      gt_hashmap_add(ls->pdomaa_files, gt_cstr_dup(pdomname), aafile);
    }
  }

  /* check the order of the hits and bring reverse strand hits into
     reversed order */
  if (gt_array_size(pdoms) > 1UL) {
    idx = 1UL;
    while (idx < gt_array_size(pdoms)) {
      gt_assert(gt_genome_node_cmp(*(GtGenomeNode**)gt_array_get(pdoms, idx),
                                   *(GtGenomeNode**)gt_array_get(pdoms, idx-1))
                >= 0);
      idx++;
    }
    if (gt_feature_node_get_strand(*(GtFeatureNode**)
                                   gt_array_get(pdoms, 0UL))
          == GT_STRAND_REVERSE) {
      gt_array_reverse(pdoms);
    }
  }

  /* output protein domain data */
  for (idx = 0; idx < gt_array_size(pdoms); idx++) {
    GtFeatureNode *hit = *(GtFeatureNode**) gt_array_get(pdoms, idx);
    GtGenomeNode *hit_node = (GtGenomeNode*) hit;
    GtStr *ali = gt_genome_node_get_user_data(hit_node, "pdom_alignment"),
          *aaseq = gt_genome_node_get_user_data(hit_node, "pdom_aaseq");
    GtRange hit_rng = gt_genome_node_get_range(hit_node);

    if (gt_extract_feature_sequence(nt_seq, hit_node,
                                    gt_symbol(gt_ft_protein_match), false,
                                    NULL, NULL, rmap, err)) {
      had_err = -1;
      break;
    }

    if (ls->write_pdom_alignments && ali) {
      char buf[BUFSIZ];
      /* write away alignment */
      (void) snprintf(buf, BUFSIZ-1, "Protein domain alignment in translated "
                                     "sequence for candidate\n'%s':\n\n",
                      desc);
      gt_file_xwrite(alifile, buf, strlen(buf));
      gt_file_xwrite(alifile, gt_str_get(ali), (size_t) gt_str_length(ali));
      gt_file_xwrite(alifile, "---\n\n", (size_t) 5);
    }

    if (ls->write_pdom_aaseqs && aaseq) {
      /* append amino acid sequence */
      gt_str_append_str(aa_seq, aaseq);
    }

    gt_genome_node_release_user_data(hit_node, "pdom_alignment");
    gt_genome_node_release_user_data(hit_node, "pdom_aaseq");
    total_length += gt_range_length(&hit_rng);
  }

  if (!had_err) {
    gt_fasta_show_entry(desc, gt_str_get(nt_seq), total_length, GT_FSWIDTH,
                        seqfile);
    if (ls->write_pdom_aaseqs) {
      gt_fasta_show_entry(desc, gt_str_get(aa_seq), gt_str_length(aa_seq),
                          GT_FSWIDTH, aafile);
    }
  }

  gt_str_delete(nt_seq);
  gt_str_delete(aa_seq);
  return had_err;
}
/* Extract the sequences whose keys are listed in <fileofkeystoextract> from
   the files in <referencefiletab> and print them in FASTA format to <outfp>,
   using <width> as line width.
   For every sequence delivered by the sequence iterator the key is derived
   from the description via desc2key() and looked up in the key queries.
   A query for which COMPLETE() holds prints the whole sequence under its
   original description. Any other query prints the range frompos..topos of
   the sequence under the header "<key> <frompos> <topos> <description>".
   Iteration stops as soon as all queries have been hit.
   With <verbose>, the estimated input size, a progress bar and the queries
   that were not hit are shown.
   Returns -1 if the key file could not be read, the iterator could not be
   created or a key could not be derived from a description. Otherwise the
   value last returned by gt_seqiterator_next() is returned unchanged.
   NOTE(review): this means a value of 1 is returned when the loop ends
   because all queries were hit; confirm what callers expect. */
int gt_extractkeysfromfastafile(bool verbose,
                                GtFile *outfp,
                                unsigned long width,
                                const GtStr *fileofkeystoextract,
                                GtStrArray *referencefiletab,
                                GtError *err)
{
  GtSeqIterator *seqit;
  Fastakeyquery *queries;
  const GtUchar *sequence;
  const char *keyptr;
  char *desc, *header = NULL, *key = NULL;
  unsigned long key_capacity = 0, len, keylen, numofqueries, pos, hits = 0;
  size_t header_capacity = 0, desclen;
  off_t totalsize;
  int had_err = 0;

  gt_error_check(err);
  queries = readfileofkeystoextract(verbose, &numofqueries,
                                    fileofkeystoextract, err);
  if (queries == NULL)
    return -1;

  totalsize = gt_files_estimate_total_size(referencefiletab);
  if (verbose)
  {
    printf("# estimated total size is " Formatuint64_t "\n",
           PRINTuint64_tcast(totalsize));
  }
  seqit = gt_seqiterator_sequence_buffer_new(referencefiletab, err);
  if (!seqit)
    had_err = -1;
  if (!had_err && verbose)
  {
    gt_progressbar_start(gt_seqiterator_getcurrentcounter(seqit,
                                                          (unsigned long long)
                                                          totalsize),
                         (unsigned long long) totalsize);
  }

  while (had_err != -1 && hits < numofqueries)
  {
    had_err = gt_seqiterator_next(seqit, &sequence, &len, &desc, err);
    if (had_err != 1)
      break;

    keyptr = desc2key(&keylen, desc, err);
    if (keyptr == NULL)
    {
      /* the loop condition terminates the iteration */
      had_err = -1;
      continue;
    }

    /* store a NUL-terminated copy of the key, growing the buffer on demand */
    if (key_capacity < keylen)
    {
      key = gt_realloc(key, sizeof (*key) * (keylen+1));
      key_capacity = keylen;
    }
    gt_assert(key != NULL);
    strncpy(key, keyptr, (size_t) keylen);
    key[keylen] = '\0';

    /* handle all consecutive queries carrying this key */
    pos = searchdesinfastakeyqueries(key, queries, numofqueries);
    while (pos < numofqueries && strcmp(queries[pos].fastakey, key) == 0)
    {
#ifndef NDEBUG
      if (queries[pos].markhit)
      {
        fprintf(stderr,"key %s was already found before\n",
                queries[pos].fastakey);
        exit(GT_EXIT_PROGRAMMING_ERROR);
      }
#endif
      desclen = strlen(desc);
      if (header_capacity < desclen + EXTRABUF + 1)
      {
        header_capacity = desclen + EXTRABUF + 1;
        header = gt_realloc(header, sizeof (*header) * header_capacity);
      }
      if (COMPLETE(queries + pos))
      {
        gt_fasta_show_entry(desc, (const char *) sequence, len, width, outfp);
      } else
      {
        (void) snprintf(header, header_capacity, "%*.*s %lu %lu %s",
                        (int) keylen, (int) keylen, key,
                        queries[pos].frompos,
                        queries[pos].topos,
                        desc);
        gt_fasta_show_entry(header,
                            (const char *)
                            (sequence + queries[pos].frompos - 1),
                            queries[pos].topos - queries[pos].frompos + 1,
                            width, outfp);
      }
      queries[pos].markhit = true;
      hits++;
      pos++;
    }
#ifdef SKDEBUG
    printf("%s 1 %lu\n", key, len);
#endif
  }

  gt_free(header);
  gt_free(key);
  if (verbose)
  {
    gt_progressbar_stop();
    outputnonmarked(queries, numofqueries);
  }
  fastakeyqueries_delete(queries, numofqueries);
  gt_seqiterator_delete(seqit);
  return had_err;
}
/* Tool runner: reads all sequences from the input files given after the
   parsed options and prints those passing the filter as FASTA to
   <arguments->outfp>.
   A sequence passes if all of the following hold, evaluated in this order:
   it is the <step>-th sequence since the last step position (or step is 1),
   it survives random sampling with probability <sample_prob> (no random
   number is drawn if sample_prob is 1.0), and its length lies within
   [minlength, maxlength] (limits set to GT_UNDEF_UWORD are ignored).
   Once <maxseqnum> sequences have passed, all remaining sequences are
   counted as filtered without being inspected.
   A summary line is written to stderr if no error occurred.
   Returns the value of the last gt_bioseq_iterator_next() call (non-zero on
   failure). */
static int gt_seqfilter_runner(int argc, const char **argv, int parsed_args,
                               void *tool_arguments, GtError *err)
{
  SeqFilterArguments *arguments = tool_arguments;
  GtBioseqIterator *bsi;
  GtBioseq *bioseq;
  GtUint64 passed = 0, filtered = 0, num_of_sequences = 0, steps = 0;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(tool_arguments);

  bsi = gt_bioseq_iterator_new(argc - parsed_args, argv + parsed_args);

  while (!(had_err = gt_bioseq_iterator_next(bsi, &bioseq, err)) &&
         bioseq != NULL) {
    GtUword i;
    GtUint64 current_num = gt_bioseq_number_of_sequences(bioseq);
    for (i = 0; i < current_num &&
                (arguments->maxseqnum == GT_UNDEF_UWORD ||
                 passed + 1 <= arguments->maxseqnum); i++) {
      char *seq;
      /* the order of the conditions matters: gt_rand_0_to_1() is only
         called for sequences at a step position */
      if ((arguments->step == 1 ||
           steps + 1 == arguments->step) &&
          (arguments->sample_prob == 1.0 ||
           gt_rand_0_to_1() <= arguments->sample_prob) &&
          (arguments->minlength == GT_UNDEF_UWORD ||
           gt_bioseq_get_sequence_length(bioseq, i) >= arguments->minlength) &&
          (arguments->maxlength == GT_UNDEF_UWORD ||
           gt_bioseq_get_sequence_length(bioseq, i) <= arguments->maxlength)) {
        seq = gt_bioseq_get_sequence(bioseq, i);
        gt_fasta_show_entry(gt_bioseq_get_description(bioseq, i),
                            seq,
                            gt_bioseq_get_sequence_length(bioseq, i),
                            arguments->width, arguments->outfp);
        gt_free(seq);
        passed++;
      }
      else {
        filtered++;
      }
      steps = (steps + 1 == arguments->step) ? 0 : steps + 1;
    }
    /* sequences skipped because maxseqnum was reached count as filtered */
    filtered += current_num - i;
    num_of_sequences += current_num;
    gt_bioseq_delete(bioseq);
  }

  /* show statistics */
  if (!had_err) {
    double removed_percent = 0.0;
    gt_assert(passed + filtered == num_of_sequences);
    /* avoid 0/0 (NaN) if the input did not contain any sequence */
    if (num_of_sequences > 0)
      removed_percent = ((double) filtered / num_of_sequences) * 100.0;
    fprintf(stderr,
            "# " GT_LLU " out of " GT_LLU
            " sequences have been removed (%.3f%%)\n",
            filtered, num_of_sequences, removed_percent);
  }

  gt_bioseq_iterator_delete(bsi);

  return had_err;
}
/* Tool runner: prints every sequence of the input files given after the
   parsed options to <arguments->outfp> in FASTA format, unless a sequence
   was already seen according to an MD5 set (gt_md5set_add_sequence(), with
   <arguments->rev> passed through).
   Depending on <arguments->seqit> the input is read either via GtBioseq
   objects (one per file) or via a GtSeqIterator over all files; in the
   latter case a progress bar is shown if <arguments->verbose> is set.
   A summary line is written to stderr if no error occurred.
   Returns 0 on success and -1 if an input could not be opened, the sequence
   iterator reported an error, or the MD5 set reported an error. */
static int gt_sequniq_runner(int argc, const char **argv, int parsed_args,
                             void *tool_arguments, GtError *err)
{
  GtSequniqArguments *arguments = tool_arguments;
  GtUint64 duplicates = 0, num_of_sequences = 0;
  int i, had_err = 0;
  GtMD5Set *md5set;

  gt_error_check(err);
  gt_assert(arguments);
  md5set = gt_md5set_new(arguments->nofseqs);
  if (!arguments->seqit) {
    GtUword j;
    GtBioseq *bs;

    for (i = parsed_args; !had_err && i < argc; i++) {
      if (!(bs = gt_bioseq_new(argv[i], err)))
        had_err = -1;
      if (!had_err) {
        GtMD5SetStatus retval;
        for (j = 0; j < gt_bioseq_number_of_sequences(bs) && !had_err; j++) {
          char *seq = gt_bioseq_get_sequence(bs, j);
          retval = gt_md5set_add_sequence(md5set, seq,
                                          gt_bioseq_get_sequence_length(bs, j),
                                          arguments->rev, err);
          if (retval == GT_MD5SET_NOT_FOUND)
            gt_fasta_show_entry(gt_bioseq_get_description(bs, j), seq,
                                gt_bioseq_get_sequence_length(bs, j),
                                arguments->width, arguments->outfp);
          else if (retval != GT_MD5SET_ERROR)
            duplicates++;
          else
            had_err = -1;
          num_of_sequences++;
          gt_free(seq);
        }
        gt_bioseq_delete(bs);
      }
    }
  }
  else {
    GtSeqIterator *seqit;
    GtStrArray *files;
    off_t totalsize;
    const GtUchar *sequence;
    char *desc;
    GtUword len;

    files = gt_str_array_new();
    for (i = parsed_args; i < argc; i++)
      gt_str_array_add_cstr(files, argv[i]);
    totalsize = gt_files_estimate_total_size(files);
    seqit = gt_seq_iterator_sequence_buffer_new(files, err);
    if (!seqit)
      had_err = -1;
    if (!had_err) {
      if (arguments->verbose) {
        gt_progressbar_start(gt_seq_iterator_getcurrentcounter(seqit,
                                                               (GtUint64)
                                                               totalsize),
                             (GtUint64) totalsize);
      }
      while (!had_err) {
        GtMD5SetStatus retval;
        int rval = gt_seq_iterator_next(seqit, &sequence, &len, &desc, err);
        if (rval != 1) {
          /* a negative value is a failure of the iterator and must not be
             mistaken for the regular end of the input */
          if (rval < 0)
            had_err = -1;
          break;
        }
        retval = gt_md5set_add_sequence(md5set, (const char*) sequence, len,
                                        arguments->rev, err);
        if (retval == GT_MD5SET_NOT_FOUND)
          gt_fasta_show_entry(desc, (const char*) sequence, len,
                              arguments->width, arguments->outfp);
        else if (retval != GT_MD5SET_ERROR)
          duplicates++;
        else
          had_err = -1;
        num_of_sequences++;
      }
      if (arguments->verbose)
        gt_progressbar_stop();
      gt_seq_iterator_delete(seqit);
    }
    gt_str_array_delete(files);
  }

  /* show statistics */
  if (!had_err) {
    double removed_percent = 0.0;
    /* avoid 0/0 (NaN) if the input did not contain any sequence */
    if (num_of_sequences > 0)
      removed_percent = ((double) duplicates / (double) num_of_sequences)
                        * 100.0;
    fprintf(stderr,
            "# "GT_WU" out of "GT_WU" sequences have been removed (%.3f%%)\n",
            (GtUword)duplicates, (GtUword)num_of_sequences, removed_percent);
  }
  gt_md5set_delete(md5set);
  return had_err;
}
/* Node stream callback: fetches the next node from the input stream of the
   LTRdigest file output stream <ns> into <gn> and, if it is a feature node
   from which an LTR element with a main node could be collected, writes that
   element to the output files of the stream:
     - one tab-separated line to <tabout_file> (element, LTR, TSD, PPT, PBS
       and protein domain columns; empty columns for missing features),
     - FASTA entries for PPT, PBS, protein domains (via write_pdom()),
       5' LTR, 3' LTR and the complete element.
   For elements on the reverse strand the right LTR is written as 5' LTR and
   the protein domain order is reversed.
   Nodes which are not feature nodes are passed through untouched
   (return value 0).
   Returns the error status of the first failing step, 0 otherwise. */
int gt_ltrfileout_stream_next(GtNodeStream *ns, GtGenomeNode **gn,
                              GtError *err)
{
  GtLTRdigestFileOutStream *ls;
  GtFeatureNode *fn;
  GtRange lltr_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD},
          rltr_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD},
          ppt_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD},
          pbs_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD};
  int had_err;
  GtUword i=0;

  gt_error_check(err);
  ls = gt_ltrdigest_file_out_stream_cast(ns);

  /* initialize this element */
  memset(&ls->element, 0, sizeof (GtLTRElement));

  /* get annotations from parser */
  had_err = gt_node_stream_next(ls->in_stream, gn, err);
  if (!had_err && *gn) {
    GtFeatureNodeIterator* gni;
    GtFeatureNode *mygn;

    /* only process feature nodes */
    if (!(fn = gt_feature_node_try_cast(*gn)))
      return 0;

    ls->element.pdomorder = gt_array_new(sizeof (const char*));

    /* fill LTRElement structure from GFF3 subgraph */
    gni = gt_feature_node_iterator_new(fn);
    for (mygn = fn; mygn != NULL; mygn = gt_feature_node_iterator_next(gni))
      (void) gt_genome_node_accept((GtGenomeNode*) mygn,
                                   (GtNodeVisitor*) ls->lv,
                                   err);
    gt_feature_node_iterator_delete(gni);
  }

  if (!had_err && ls->element.mainnode != NULL) {
    char desc[GT_MAXFASTAHEADER];
    GtFeatureNode *ltr3, *ltr5;
    GtStr *sdesc, *sreg, *seq;

    /* find sequence in GtEncseq */
    sreg = gt_genome_node_get_seqid((GtGenomeNode*) ls->element.mainnode);

    sdesc = gt_str_new();
    had_err = gt_region_mapping_get_description(ls->rmap, sdesc, sreg, err);

    if (!had_err) {
      GtRange rng;
      /* the sequence ID is the description, cut off after <seqnamelen>
         characters and with blanks replaced by underscores */
      ls->element.seqid = gt_calloc((size_t) ls->seqnamelen+1, sizeof (char));
      (void) snprintf(ls->element.seqid,
                      MIN((size_t) gt_str_length(sdesc),
                          (size_t) ls->seqnamelen)+1,
                      "%s", gt_str_get(sdesc));
      gt_cstr_rep(ls->element.seqid, ' ', '_');
      if (gt_str_length(sdesc) > (GtUword) ls->seqnamelen)
        ls->element.seqid[ls->seqnamelen] = '\0';

      (void) gt_ltrelement_format_description(&ls->element,
                                              ls->seqnamelen,
                                              desc,
                                              (size_t) (GT_MAXFASTAHEADER-1));

      /* output basic retrotransposon data */
      lltr_rng = gt_genome_node_get_range((GtGenomeNode*)
                                          ls->element.leftLTR);
      rltr_rng = gt_genome_node_get_range((GtGenomeNode*)
                                          ls->element.rightLTR);
      rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.mainnode);
      gt_file_xprintf(ls->tabout_file,
                      GT_WU"\t"GT_WU"\t"GT_WU"\t%s\t"GT_WU"\t"GT_WU"\t"GT_WU"\t"
                      GT_WU"\t"GT_WU"\t"GT_WU"\t",
                      rng.start,
                      rng.end,
                      gt_ltrelement_length(&ls->element),
                      ls->element.seqid,
                      lltr_rng.start,
                      lltr_rng.end,
                      gt_ltrelement_leftltrlen(&ls->element),
                      rltr_rng.start,
                      rltr_rng.end,
                      gt_ltrelement_rightltrlen(&ls->element));
    }
    /* the description is not needed any more; release it on the error path
       as well */
    gt_str_delete(sdesc);

    seq = gt_str_new();

    /* output TSDs */
    if (!had_err && ls->element.leftTSD != NULL) {
      GtRange tsd_rng;
      tsd_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.leftTSD);
      had_err = gt_extract_feature_sequence(seq,
                                     (GtGenomeNode*) ls->element.leftTSD,
                                     gt_symbol(gt_ft_target_site_duplication),
                                     false,
                                     NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_file_xprintf(ls->tabout_file,
                        ""GT_WU"\t"GT_WU"\t%s\t",
                        tsd_rng.start,
                        tsd_rng.end,
                        gt_str_get(seq));
      }
      gt_str_reset(seq);
    } else
      gt_file_xprintf(ls->tabout_file, "\t\t\t");

    if (!had_err && ls->element.rightTSD != NULL) {
      GtRange tsd_rng;
      tsd_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.rightTSD);
      had_err = gt_extract_feature_sequence(seq,
                                     (GtGenomeNode*) ls->element.rightTSD,
                                     gt_symbol(gt_ft_target_site_duplication),
                                     false,
                                     NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_file_xprintf(ls->tabout_file,
                        ""GT_WU"\t"GT_WU"\t%s\t",
                        tsd_rng.start,
                        tsd_rng.end,
                        gt_str_get(seq));
      }
      gt_str_reset(seq);
    } else
      gt_file_xprintf(ls->tabout_file, "\t\t\t");

    /* output PPT */
    if (!had_err && ls->element.ppt != NULL) {
      GtStrand ppt_strand = gt_feature_node_get_strand(ls->element.ppt);
      ppt_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.ppt);
      had_err = gt_extract_feature_sequence(seq,
                                            (GtGenomeNode*) ls->element.ppt,
                                            gt_symbol(gt_ft_RR_tract), false,
                                            NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_fasta_show_entry(desc, gt_str_get(seq), gt_range_length(&ppt_rng),
                            GT_FSWIDTH, ls->pptout_file);
        /* the last column is the distance between the PPT and the LTR it is
           measured against, which depends on the PPT strand */
        gt_file_xprintf(ls->tabout_file,
                        ""GT_WU"\t"GT_WU"\t%s\t%c\t%d\t",
                        ppt_rng.start,
                        ppt_rng.end,
                        gt_str_get(seq),
                        GT_STRAND_CHARS[ppt_strand],
                        (ppt_strand == GT_STRAND_FORWARD ?
                            abs((int) (rltr_rng.start - ppt_rng.end)) :
                            abs((int) (lltr_rng.end - ppt_rng.start))));
      }
      gt_str_reset(seq);
    } else
      gt_file_xprintf(ls->tabout_file, "\t\t\t\t\t");

    /* output PBS */
    if (!had_err && ls->element.pbs != NULL) {
      GtStrand pbs_strand;

      pbs_strand = gt_feature_node_get_strand(ls->element.pbs);
      pbs_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.pbs);
      had_err = gt_extract_feature_sequence(seq,
                                          (GtGenomeNode*) ls->element.pbs,
                                          gt_symbol(gt_ft_primer_binding_site),
                                          false, NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_fasta_show_entry(desc, gt_str_get(seq), gt_range_length(&pbs_rng),
                            GT_FSWIDTH, ls->pbsout_file);
        gt_file_xprintf(ls->tabout_file,
                        ""GT_WU"\t"GT_WU"\t%c\t%s\t%s\t%s\t%s\t%s\t",
                        pbs_rng.start,
                        pbs_rng.end,
                        GT_STRAND_CHARS[pbs_strand],
                        gt_feature_node_get_attribute(ls->element.pbs, "trna"),
                        gt_str_get(seq),
                        gt_feature_node_get_attribute(ls->element.pbs,
                                                      "pbsoffset"),
                        gt_feature_node_get_attribute(ls->element.pbs,
                                                      "trnaoffset"),
                        gt_feature_node_get_attribute(ls->element.pbs,
                                                      "edist"));
      }
      gt_str_reset(seq);
    } else
      gt_file_xprintf(ls->tabout_file, "\t\t\t\t\t\t\t\t");

    /* output protein domains */
    if (!had_err && ls->element.pdoms != NULL) {
      GtStr *pdomorderstr = gt_str_new();
      for (i=0; !had_err && i<gt_array_size(ls->element.pdomorder); i++) {
        const char* key = *(const char**) gt_array_get(ls->element.pdomorder,
                                                       i);
        GtArray *entry = (GtArray*) gt_hashmap_get(ls->element.pdoms, key);
        had_err = write_pdom(ls, entry, key, ls->rmap, desc, err);
      }

      if (GT_STRAND_REVERSE == gt_feature_node_get_strand(ls->element.mainnode))
        gt_array_reverse(ls->element.pdomorder);

      /* join the domain names with '/' */
      for (i=0 ;!had_err && i<gt_array_size(ls->element.pdomorder); i++) {
        const char* name = *(const char**) gt_array_get(ls->element.pdomorder,
                                                        i);
        gt_str_append_cstr(pdomorderstr, name);
        if (i != gt_array_size(ls->element.pdomorder)-1)
          gt_str_append_cstr(pdomorderstr, "/");
      }
      gt_file_xprintf(ls->tabout_file, "%s", gt_str_get(pdomorderstr));
      gt_str_delete(pdomorderstr);
    }

    /* output LTRs (we just expect them to exist) */
    switch (gt_feature_node_get_strand(ls->element.mainnode)) {
      case GT_STRAND_REVERSE:
        ltr5 = ls->element.rightLTR;
        ltr3 = ls->element.leftLTR;
        break;
      case GT_STRAND_FORWARD:
      default:
        ltr5 = ls->element.leftLTR;
        ltr3 = ls->element.rightLTR;
        break;
    }

    if (!had_err) {
      had_err = gt_extract_feature_sequence(seq, (GtGenomeNode*) ltr5,
                                          gt_symbol(gt_ft_long_terminal_repeat),
                                          false,
                                          NULL, NULL, ls->rmap, err);
    }
    if (!had_err) {
      gt_fasta_show_entry(desc, gt_str_get(seq), gt_str_length(seq),
                          GT_FSWIDTH, ls->ltr5out_file);
      gt_str_reset(seq);
    }
    if (!had_err) {
      had_err = gt_extract_feature_sequence(seq, (GtGenomeNode*) ltr3,
                                          gt_symbol(gt_ft_long_terminal_repeat),
                                          false,
                                          NULL, NULL, ls->rmap, err);
    }
    if (!had_err) {
      gt_fasta_show_entry(desc, gt_str_get(seq), gt_str_length(seq),
                          GT_FSWIDTH, ls->ltr3out_file);
      gt_str_reset(seq);
    }

    /* output complete oriented element */
    if (!had_err) {
      had_err = gt_extract_feature_sequence(seq,
                                           (GtGenomeNode*) ls->element.mainnode,
                                           gt_symbol(gt_ft_LTR_retrotransposon),
                                           false,
                                           NULL, NULL, ls->rmap, err);
    }
    if (!had_err) {
      gt_fasta_show_entry(desc,gt_str_get(seq), gt_str_length(seq),
                          GT_FSWIDTH, ls->elemout_file);
      gt_str_reset(seq);
    }
    gt_file_xprintf(ls->tabout_file, "\n");
    gt_str_delete(seq);
  }

  /* release everything collected for this element */
  gt_hashmap_delete(ls->element.pdoms);
  gt_array_delete(ls->element.pdomorder);
  gt_free(ls->element.seqid);
  return had_err;
}