/*
  Extract from the sequence files in <referencefiletab> all entries whose key
  (derived from the description via desc2key()) occurs in the list of queries
  read from <fileofkeystoextract>, and write them to <outfp> with line width
  <width>.

  A query for which COMPLETE() holds is written with its original description
  and its whole sequence.  Any other query is written with a header of the
  form "<key> <frompos> <topos> <description>" and the subsequence starting
  at offset frompos-1 with length topos-frompos+1.

  Scanning stops as soon as every query has been marked as hit.  If <verbose>
  is set, the estimated input size is printed, a progress bar is shown and
  outputnonmarked() is called for the queries at the end.

  Returns -1 if readfileofkeystoextract() yields NULL, if the sequence
  iterator cannot be created, or if desc2key() fails.  Otherwise the status
  of the last gt_seqiterator_next() call is returned; this is 1 when the scan
  ended because all queries were hit.
  NOTE(review): callers presumably depend on that 1, so it is kept as is.
*/
int gt_extractkeysfromfastafile(bool verbose,
                                GtFile *outfp,
                                unsigned long width,
                                const GtStr *fileofkeystoextract,
                                GtStrArray *referencefiletab,
                                GtError *err)
{
  GtSeqIterator *seqit;
  const GtUchar *sequence;
  char *desc, *headerbufferspace = NULL, *keyspace = NULL;
  const char *keyptr;
  unsigned long allockeyspace = 0, len, keylen, numofqueries, keyposition,
                countmarkhit = 0;
  int had_err = 0;
  bool progressbar_started = false;
  off_t totalsize;
  Fastakeyquery *fastakeyqueries;
  size_t headerbuffersize = 0, headerlength;

  gt_error_check(err);
  fastakeyqueries = readfileofkeystoextract(verbose,&numofqueries,
                                            fileofkeystoextract,err);
  if (fastakeyqueries == NULL)
  {
    return -1;
  }
  totalsize = gt_files_estimate_total_size(referencefiletab);
  if (verbose)
  {
    printf("# estimated total size is " Formatuint64_t "\n",
           PRINTuint64_tcast(totalsize));
  }
  seqit = gt_seqiterator_sequence_buffer_new(referencefiletab, err);
  if (!seqit)
  {
    had_err = -1;
  }
  if (!had_err && verbose)
  {
    gt_progressbar_start(gt_seqiterator_getcurrentcounter(seqit,
                                           (unsigned long long) totalsize),
                         (unsigned long long) totalsize);
    progressbar_started = true;
  }
  while (had_err != -1 && countmarkhit < numofqueries)
  {
    had_err = gt_seqiterator_next(seqit, &sequence, &len, &desc, err);
    if (had_err != 1)
    {
      break;
    }
    keyptr = desc2key(&keylen,desc,err);
    if (keyptr == NULL)
    {
      had_err = -1;
    } else
    {
      /* also allocate on first use, so that a key of length 0 does not
         leave keyspace at NULL */
      if (keyspace == NULL || allockeyspace < keylen)
      {
        keyspace = gt_realloc(keyspace,sizeof (*keyspace) * (keylen+1));
        allockeyspace = keylen;
      }
      gt_assert(keyspace != NULL);
      strncpy(keyspace,keyptr,(size_t) keylen);
      keyspace[keylen] = '\0';
      keyposition = searchdesinfastakeyqueries(keyspace,fastakeyqueries,
                                               numofqueries);
      /* handle all consecutive queries carrying the same key */
      while (keyposition < numofqueries &&
             strcmp(fastakeyqueries[keyposition].fastakey,keyspace) == 0)
      {
#ifndef NDEBUG
        if (fastakeyqueries[keyposition].markhit)
        {
          fprintf(stderr,"key %s was already found before\n",
                  fastakeyqueries[keyposition].fastakey);
          exit(GT_EXIT_PROGRAMMING_ERROR);
        }
#endif
        headerlength = strlen(desc);
        if (headerbuffersize < headerlength + EXTRABUF + 1)
        {
          headerbuffersize = headerlength + EXTRABUF + 1;
          headerbufferspace = gt_realloc(headerbufferspace,
                                         sizeof (*headerbufferspace)
                                         * headerbuffersize);
        }
        if (COMPLETE(fastakeyqueries + keyposition))
        {
          gt_fasta_show_entry(desc, (const char *) sequence, len, width,
                              outfp);
        } else
        {
          /* NOTE(review): frompos/topos are not checked against len here;
             presumably they are validated when the key file is read --
             confirm, otherwise this reads outside of the sequence */
          (void) snprintf(headerbufferspace,headerbuffersize,
                          "%*.*s %lu %lu %s",
                          (int) keylen,(int) keylen,keyspace,
                          fastakeyqueries[keyposition].frompos,
                          fastakeyqueries[keyposition].topos,
                          desc);
          gt_fasta_show_entry(headerbufferspace,
                              (const char *)
                              (sequence+fastakeyqueries[keyposition].
                                                        frompos - 1),
                              fastakeyqueries[keyposition].topos -
                              fastakeyqueries[keyposition].frompos+1,
                              width, outfp);
        }
        fastakeyqueries[keyposition].markhit = true;
        countmarkhit++;
        keyposition++;
      }
#ifdef SKDEBUG
      printf("%s 1 %lu\n",keyspace, len);
#endif
    }
  }
  gt_free(headerbufferspace);
  gt_free(keyspace);
  /* only stop a progress bar which has actually been started */
  if (progressbar_started)
  {
    gt_progressbar_stop();
  }
  if (verbose)
  {
    outputnonmarked(fastakeyqueries,numofqueries);
  }
  fastakeyqueries_delete(fastakeyqueries,numofqueries);
  gt_seqiterator_delete(seqit);
  return had_err;
}
/*
  Runner of the sequniq tool: read all sequences from the files
  argv[parsed_args] ... argv[argc-1], add each one to an MD5 set and write to
  arguments->outfp (line width arguments->width) only those sequences which
  gt_md5set_add_sequence() reports as GT_MD5SET_NOT_FOUND.  Any other
  non-error status is counted as a duplicate.

  If arguments->seqit is not set, each file is loaded via gt_bioseq_new();
  otherwise all files are streamed through a single sequence iterator
  (with a progress bar if arguments->verbose is set).

  On success a statistics line is written to stderr and 0 is returned.
  -1 is returned if a file cannot be opened, if reading a sequence fails or
  if the MD5 set reports GT_MD5SET_ERROR.
*/
static int gt_sequniq_runner(int argc, const char **argv, int parsed_args,
                             void *tool_arguments, GtError *err)
{
  GtSequniqArguments *arguments = tool_arguments;
  GtUint64 duplicates = 0, num_of_sequences = 0;
  int i, had_err = 0;
  GtMD5Set *md5set;

  gt_error_check(err);
  gt_assert(arguments);
  md5set = gt_md5set_new(arguments->nofseqs);

  if (!arguments->seqit) {
    GtUword j;
    GtBioseq *bs;

    for (i = parsed_args; !had_err && i < argc; i++) {
      if (!(bs = gt_bioseq_new(argv[i], err)))
        had_err = -1;
      if (!had_err) {
        GtMD5SetStatus retval;
        for (j = 0; j < gt_bioseq_number_of_sequences(bs) && !had_err; j++) {
          char *seq = gt_bioseq_get_sequence(bs, j);
          retval = gt_md5set_add_sequence(md5set, seq,
                                          gt_bioseq_get_sequence_length(bs, j),
                                          arguments->rev, err);
          if (retval == GT_MD5SET_NOT_FOUND)
            gt_fasta_show_entry(gt_bioseq_get_description(bs, j), seq,
                                gt_bioseq_get_sequence_length(bs, j),
                                arguments->width, arguments->outfp);
          else if (retval != GT_MD5SET_ERROR)
            duplicates++;
          else
            had_err = -1;
          num_of_sequences++;
          gt_free(seq);
        }
        gt_bioseq_delete(bs);
      }
    }
  }
  else {
    GtSeqIterator *seqit;
    GtStrArray *files;
    off_t totalsize;
    const GtUchar *sequence;
    char *desc;
    GtUword len;

    files = gt_str_array_new();
    for (i = parsed_args; i < argc; i++)
      gt_str_array_add_cstr(files, argv[i]);
    totalsize = gt_files_estimate_total_size(files);
    seqit = gt_seq_iterator_sequence_buffer_new(files, err);
    if (!seqit)
      had_err = -1;
    if (!had_err) {
      if (arguments->verbose) {
        gt_progressbar_start(gt_seq_iterator_getcurrentcounter(seqit,
                                                          (GtUint64) totalsize),
                             (GtUint64) totalsize);
      }
      while (!had_err) {
        GtMD5SetStatus retval;
        int rval = gt_seq_iterator_next(seqit, &sequence, &len, &desc, err);

        if (rval != 1) {
          /* any status other than 1 ends the loop; a negative one must not
             be dropped, otherwise a failed read is reported as success */
          if (rval < 0)
            had_err = -1;
          break;
        }
        retval = gt_md5set_add_sequence(md5set, (const char*) sequence, len,
                                        arguments->rev, err);
        if (retval == GT_MD5SET_NOT_FOUND)
          gt_fasta_show_entry(desc, (const char*) sequence, len,
                              arguments->width, arguments->outfp);
        else if (retval != GT_MD5SET_ERROR)
          duplicates++;
        else
          had_err = -1;
        num_of_sequences++;
      }
      if (arguments->verbose)
        gt_progressbar_stop();
      gt_seq_iterator_delete(seqit);
    }
    gt_str_array_delete(files);
  }

  /* show statistics */
  if (!had_err) {
    /* avoid 0.0/0.0 (printed as NaN) if no sequence has been read at all */
    double percent_removed = 0.0;

    if (num_of_sequences > 0) {
      percent_removed = ((double) duplicates / (double) num_of_sequences)
                        * 100.0;
    }
    fprintf(stderr,
            "# "GT_WU" out of "GT_WU" sequences have been removed (%.3f%%)\n",
            (GtUword) duplicates, (GtUword) num_of_sequences,
            percent_removed);
  }
  gt_md5set_delete(md5set);
  return had_err;
}
/*
  Runner of the convertseq tool: read all sequences from the files
  argv[parsed_args] ... argv[argc-1] through a sequence iterator and, unless
  arguments->showseq is set, write each of them in FASTA format to
  arguments->outfp.

  - arguments->revcomp: the sequence is reverse complemented (on a private
    copy) before it is written.
  - arguments->reduce_wc_dna: every run of characters other than
    a/c/g/t/u (either case) is written as a single 'N' or 'n', depending on
    the case of the first character of the run.
  - arguments->reduce_wc_prot: every run of X/B/Z or x/b/z is written as a
    single 'N' or 'n', chosen by the first character of the run.
  - arguments->fastawidth: if > 0, a newline is written after every
    fastawidth characters which were actually written.
  - arguments->showflv: the collected file length values are printed to
    stderr afterwards.
  - arguments->verbose: a progress bar is shown.

  Returns 0 on success, -1 if the sequence buffer cannot be created, and
  otherwise the non-zero status of the failing gt_seq_iterator_next() or
  gt_reverse_complement() call.
*/
static int gt_convertseq_runner(int argc, const char **argv, int parsed_args,
                                void *tool_arguments, GtError *err)
{
  GtConvertseqArguments *arguments = tool_arguments;
  int had_err = 0, i;
  GtFilelengthvalues *flv;
  GtSeqIterator *seqit;
  GtSequenceBuffer *sb = NULL;
  GtStrArray *files;
  const GtUchar *sequence;
  char *desc;
  GtUword len, j;
  off_t totalsize;

  gt_error_check(err);
  gt_assert(arguments != NULL);

  files = gt_str_array_new();
  for (i = parsed_args; i < argc; i++) {
    gt_str_array_add_cstr(files, argv[i]);
  }
  totalsize = gt_files_estimate_total_size(files);

  flv = gt_calloc((size_t) gt_str_array_size(files),
                  sizeof (GtFilelengthvalues));

  sb = gt_sequence_buffer_new_guess_type(files, err);
  if (!sb) {
    had_err = -1;
  }
  if (!had_err) {
    gt_sequence_buffer_set_filelengthtab(sb, flv);
    /* read input using seqiterator */
    seqit = gt_seq_iterator_sequence_buffer_new_with_buffer(sb);
    if (arguments->verbose) {
      gt_progressbar_start(gt_seq_iterator_getcurrentcounter(seqit,
                                                          (GtUint64) totalsize),
                           (GtUint64) totalsize);
    }
    while (true) {
      GtUchar *seq = NULL;

      desc = NULL;
      j = 0UL; /* number of characters written to the current output line */
      had_err = gt_seq_iterator_next(seqit, &sequence, &len, &desc, err);
      if (had_err != 1)
        break;
      if (arguments->revcomp) {
        GtUchar *newseq = gt_calloc((size_t) len+1, sizeof (GtUchar));
        memcpy(newseq, sequence, (size_t) len*sizeof (GtUchar));
        had_err = gt_reverse_complement((char*) newseq, len, err);
        if (had_err) {
          /* the copy is not yet owned by seq, so release it here */
          gt_free(newseq);
          break;
        }
        seq = newseq;
      } else
        seq = (GtUchar*) sequence;

      if (!arguments->showseq) {
        bool in_wildcard = false;
        GtUword pos;

        gt_file_xprintf(arguments->outfp, ">%s\n", desc);
        for (pos = 0; pos < len; pos++) {
          /* character to write for this position; emit is cleared when the
             position continues a wildcard run which was already written */
          int out = (int) seq[pos];
          bool emit = true;

          if (arguments->reduce_wc_dna) {
            switch (seq[pos]) {
              case 'a': case 'A':
              case 'c': case 'C':
              case 'g': case 'G':
              case 't': case 'T':
              case 'u': case 'U':
                in_wildcard = false;
                break;
              default:
                if (in_wildcard) {
                  emit = false;
                } else {
                  in_wildcard = true;
                  out = isupper((int) seq[pos]) ? (int) 'N' : (int) 'n';
                }
            }
          } else if (arguments->reduce_wc_prot) {
            switch (seq[pos]) {
              case 'X': case 'B': case 'Z':
                if (in_wildcard) {
                  emit = false;
                } else {
                  in_wildcard = true;
                  out = (int) 'N';
                }
                break;
              case 'x': case 'b': case 'z':
                if (in_wildcard) {
                  emit = false;
                } else {
                  in_wildcard = true;
                  out = (int) 'n';
                }
                break;
              default:
                in_wildcard = false;
            }
          }
          if (emit) {
            gt_file_xfputc(out, arguments->outfp);
            j++;
            /* wrap only after a character was written; checking on skipped
               positions would produce empty lines while j is 0 */
            if (arguments->fastawidth > 0
                  && j % arguments->fastawidth == 0) {
              j = 0;
              gt_file_xprintf(arguments->outfp, "\n");
            }
          }
        }
        /* terminate an unfinished line; decided by the written column, as
           wildcard reduction can make the output shorter than len */
        if (arguments->fastawidth == 0 || j != 0)
          gt_file_xprintf(arguments->outfp, "\n");
      }
      if (arguments->revcomp) {
        gt_free(seq);
      }
    }
    if (arguments->showflv) {
      for (j = 0; j < gt_str_array_size(files); j++) {
        fprintf(stderr, "file "GT_WU" (%s): "GT_WU"/"GT_WU"\n",
                j,
                gt_str_array_get(files, j),
                (GtUword) flv[j].length,
                (GtUword) flv[j].effectivelength);
      }
    }
    if (arguments->verbose) {
      gt_progressbar_stop();
    }
    gt_sequence_buffer_delete(sb);
    gt_seq_iterator_delete(seqit);
  }
  gt_str_array_delete(files);
  gt_free(flv);

  return had_err;
}