/* Create a sequence iterator over the files named in <filenametab>.
   A sequence buffer is created with gt_sequence_buffer_new_guess_type() and
   handed to gt_seq_iterator_sequence_buffer_new_with_buffer(); the reference
   held by this function is released again before returning.
   Returns NULL if the sequence buffer could not be created (the buffer
   constructor receives <err>). */
GtSeqIterator* gt_seq_iterator_sequence_buffer_new(const GtStrArray *filenametab,
                                                   GtError *err)
{
  GtSeqIterator *iterator = NULL;
  GtSequenceBuffer *buffer;

  buffer = gt_sequence_buffer_new_guess_type(filenametab, err);
  if (buffer != NULL) {
    iterator = gt_seq_iterator_sequence_buffer_new_with_buffer(buffer);
    /* release the reference held by this function */
    gt_sequence_buffer_delete(buffer);
  }
  return iterator;
}
/* Guess whether the files named in <filenames> contain protein sequences.
   At most the first 1000 characters delivered by the sequence buffer are
   inspected; the scan stops at the first character out of
   L, I, F, E, Q, P, X, Z.
   Returns 1 if such a character was seen (guess: protein), 0 if not
   (guess: DNA), and -1 if the sequence buffer could not be created or
   reading from it failed. */
int gt_files_guess_if_protein_sequences(const GtStrArray *filenames,
                                        GtError *err)
{
  unsigned int countnonbases = 0,
               currentposition;
  GtUchar currentchar;
  GtSequenceBuffer *fb;
  int retval;

  gt_error_check(err);
  fb = gt_sequence_buffer_new_guess_type(filenames, err);
  if (!fb)
    return -1;

  for (currentposition = 0; currentposition < 1000U; currentposition++) {
    retval = gt_sequence_buffer_next(fb, &currentchar, err);
    if (retval < 0) {
      gt_sequence_buffer_delete(fb);
      return -1;
    }
    if (retval == 0) {
      break; /* input exhausted */
    }
    switch (currentchar) {
      case 'L':
      case 'I':
      case 'F':
      case 'E':
      case 'Q':
      case 'P':
      case 'X':
      case 'Z':
        countnonbases++;
        break;
      default:
        break;
    }
    if (countnonbases > 0) {
      break; /* one such character is enough to decide */
    }
  }
  gt_sequence_buffer_delete(fb);
  if (countnonbases > 0) {
    return 1; /* guess it is a protein sequence */
  }
  return 0; /* guess it is a dna sequence */
}
/* Create a k-mer code iterator reading from the files in <filenametab>.
   With <plainformat> set the input is opened via
   gt_sequence_buffer_plain_new(), otherwise the format is guessed via
   gt_sequence_buffer_new_guess_type(). <symbolmap> is installed on the
   sequence buffer, and up to <kmersize> characters are read ahead into the
   cyclic window of the k-mer stream; if the input ends earlier, the iterator
   is marked as exhausted.
   Returns NULL if the sequence buffer cannot be created or reading fails;
   the partially built iterator is deleted in that case. */
GtKmercodeiterator *gt_kmercodeiterator_filetab_new(
                                            const GtStrArray *filenametab,
                                            unsigned int numofchars,
                                            unsigned int kmersize,
                                            const GtUchar *symbolmap,
                                            bool plainformat,
                                            GtError *err)
{
  GtKmercodeiterator *kci;
  GtUchar cc;
  int status;

  gt_error_check(err);
  kci = gt_malloc(sizeof (*kci));
  kci->esr = NULL;
  kci->hasprocessedfirst = false;
  kci->inputexhausted = false;
  kci->spwp = kmerstream_new(numofchars,kmersize);
  kci->totallength = 0;
  if (!plainformat) {
    kci->fb = gt_sequence_buffer_new_guess_type(filenametab, err);
  } else {
    kci->fb = gt_sequence_buffer_plain_new(filenametab);
  }
  if (kci->fb == NULL) {
    gt_kmercodeiterator_delete(kci);
    return NULL;
  }

  gt_sequence_buffer_set_symbolmap(kci->fb, symbolmap);
  /* fill the window with the first kmersize characters, if available */
  kci->currentposition = 0;
  while (kci->currentposition < (unsigned long) kmersize) {
    status = gt_sequence_buffer_next(kci->fb,&cc,err);
    if (status < 0) {
      gt_kmercodeiterator_delete(kci);
      return NULL;
    }
    if (status == 0) {
      kci->inputexhausted = true;
      break;
    }
    kci->spwp->windowwidth++;
    updatespecialpositions(kci->spwp,cc,false,0);
    kci->spwp->cyclicwindow[kci->spwp->windowwidth-1] = cc;
    kci->currentposition++;
  }
  return kci;
}
/* Scan <encseq> completely in <readmode> and cross-check three ways of
   obtaining each character:
     - random access via getencodedchar(),
     - sequential access via sequentialgetencodedchar(),
     - and, if <filenametab> is not NULL and <readmode> is Forwardmode, the
       characters delivered by a sequence buffer over the original files.
   Also verifies that the number of scanned positions equals the total
   length of <encseq>.
   Returns 0 on success, -1 on the first mismatch or on a failure of the
   sequence buffer. */
static int testfullscan(const GtStrArray *filenametab,
                        const Encodedsequence *encseq,
                        Readmode readmode,
                        GtError *err)
{
  Seqpos pos = 0, totallength;
  GtUchar ccscan = 0, ccra, ccsr;
  GtSequenceBuffer *fb = NULL;
  int retval;
  bool haserr = false;
  /* stays NULL when the sequence buffer cannot be created, so that the
     cleanup below never sees an indeterminate pointer */
  Encodedsequencescanstate *esr = NULL;
  unsigned long long fullscanpbar = 0;

  gt_error_check(err);
  totallength = getencseqtotallength(encseq);
  /* NOTE(review): the progress bar is started here unconditionally but only
     stopped inside the scan branch below; confirm whether
     gt_progressbar_stop() must also run when buffer creation fails. */
  gt_progressbar_start(&fullscanpbar,(unsigned long long) totallength);
  if (filenametab != NULL)
  {
    fb = gt_sequence_buffer_new_guess_type((GtStrArray*) filenametab, err);
    if (!fb)
    {
      haserr = true;
    }
    if (!haserr)
    {
      gt_sequence_buffer_set_symbolmap(fb, getencseqAlphabetsymbolmap(encseq));
    }
  }
  if (!haserr)
  {
    esr = newEncodedsequencescanstate();
    initEncodedsequencescanstate(esr,encseq,readmode,0);
    for (pos=0; /* Nothing */; pos++)
    {
      if (filenametab != NULL && readmode == Forwardmode)
      {
        /* the end of the scan is determined by the sequence buffer */
        retval = gt_sequence_buffer_next(fb,&ccscan,err);
        if (retval < 0)
        {
          haserr = true;
          break;
        }
        if (retval == 0)
        {
          break;
        }
      } else
      {
        if (pos >= totallength)
        {
          break;
        }
      }
      ccra = getencodedchar(encseq,pos,readmode); /* Random access */
      if (filenametab != NULL && readmode == Forwardmode)
      {
        if (ccscan != ccra)
        {
          gt_error_set(err,"access=%s, position=" FormatSeqpos
                           ": scan (readnextchar) = %u != "
                           "%u = random access",
                           encseqaccessname(encseq),
                           pos,
                           (unsigned int) ccscan,
                           (unsigned int) ccra);
          haserr = true;
          break;
        }
      }
      ccsr = sequentialgetencodedchar(encseq,esr,pos,readmode);
      if (ccra != ccsr)
      {
        gt_error_set(err,"access=%s, mode=%s: position=" FormatSeqpos
                         ": random access = %u != %u = sequential read",
                         encseqaccessname(encseq),
                         showreadmode(readmode),
                         pos,
                         (unsigned int) ccra,
                         (unsigned int) ccsr);
        haserr = true;
        break;
      }
      fullscanpbar++;
    }
    gt_progressbar_stop();
  }
  if (!haserr)
  {
    if (pos != totallength)
    {
      gt_error_set(err,"sequence length must be " FormatSeqpos " but is "
                       FormatSeqpos,totallength,pos);
      haserr = true;
    }
  }
  if (esr != NULL)
  {
    freeEncodedsequencescanstate(&esr);
  }
  gt_sequence_buffer_delete(fb);
  return haserr ? -1 : 0;
}
/* Tool runner: translate every sequence found in the input files given as
   argv[parsed_args..argc-1].
   Sequences shorter than GT_CODON_LENGTH are skipped with a warning. Every
   other sequence is passed to gt_seqtranslate_do_translation(); if
   arguments->reverse is set, a reverse-complemented copy is translated as
   well.
   Returns 0 on success and a non-zero error code otherwise. */
static int gt_seqtranslate_runner(int argc, const char **argv, int parsed_args,
                                  void *tool_arguments, GT_UNUSED GtError *err)
{
  GtTranslateArguments *arguments = tool_arguments;
  GtSeqIterator *seqit = NULL;
  GtSequenceBuffer *seqbuf = NULL;
  GtStrArray *infiles;
  GtStr *translations[3];
  const GtUchar *sequence;
  char *desc;
  GtUword len;
  int had_err = 0, status, idx;

  for (idx = 0; idx < 3; idx++) {
    translations[idx] = gt_str_new();
  }
  gt_error_check(err);
  gt_assert(arguments);

  /* collect the input file names */
  infiles = gt_str_array_new();
  for (idx = parsed_args; idx < argc; idx++) {
    gt_str_array_add_cstr(infiles, argv[idx]);
  }

  seqbuf = gt_sequence_buffer_new_guess_type(infiles, err);
  if (seqbuf != NULL) {
    seqit = gt_seq_iterator_sequence_buffer_new_with_buffer(seqbuf);
  }
  if (seqbuf == NULL || seqit == NULL) {
    had_err = -1;
  }

  while (!had_err) {
    status = gt_seq_iterator_next(seqit, &sequence, &len, &desc, err);
    if (status == 0) {
      break; /* no more sequences */
    }
    if (status < 0) {
      had_err = -1;
      break;
    }
    if (len < GT_CODON_LENGTH) {
      gt_warning("sequence '%s' is shorter than codon length of %d, skipping",
                 desc, GT_CODON_LENGTH);
      continue;
    }
    had_err = gt_seqtranslate_do_translation(arguments, (char*) sequence, len,
                                             desc, translations, false, err);
    if (had_err || !arguments->reverse) {
      continue;
    }
    /* translate the reverse complement from a private copy */
    {
      char *revseq = gt_cstr_dup_nt((char*) sequence, len);
      had_err = gt_reverse_complement(revseq, len, err);
      if (!had_err) {
        had_err = gt_seqtranslate_do_translation(arguments, revseq, len, desc,
                                                 translations, true, err);
      }
      gt_free(revseq);
    }
  }

  gt_str_delete(translations[0]);
  gt_str_delete(translations[1]);
  gt_str_delete(translations[2]);
  gt_str_array_delete(infiles);
  gt_seq_iterator_delete(seqit);
  gt_sequence_buffer_delete(seqbuf);

  return had_err;
}
/* Tool runner: read all sequences from the input files given as
   argv[parsed_args..argc-1] and, unless arguments->showseq is set, write
   them to arguments->outfp in FASTA format.
   - arguments->revcomp: output the reverse complement of each sequence.
   - arguments->reduce_wc_dna: every run of characters other than
     a/c/g/t/u (either case) is written as a single 'N' or 'n', depending on
     the case of the first character of the run.
   - arguments->reduce_wc_prot: every run of X/B/Z (either case) is written
     as a single 'N' or 'n', depending on the case of the first character of
     the run.
   - arguments->fastawidth: if > 0, a line break is written after that many
     output characters.
   - arguments->showflv: print the collected file length values to stderr.
   - arguments->verbose: show a progress bar.
   Returns 0 on success and a non-zero error code otherwise. */
static int gt_convertseq_runner(int argc, const char **argv, int parsed_args,
                                void *tool_arguments, GtError *err)
{
  GtConvertseqArguments *arguments = tool_arguments;
  int had_err = 0, i;
  GtFilelengthvalues *flv;
  GtSeqIterator *seqit;
  GtSequenceBuffer *sb = NULL;
  GtStrArray *files;
  const GtUchar *sequence;
  char *desc;
  GtUword len, j, pos;
  off_t totalsize;

  gt_error_check(err);
  gt_assert(arguments != NULL);

  files = gt_str_array_new();
  for (i = parsed_args; i < argc; i++)
  {
    gt_str_array_add_cstr(files, argv[i]);
  }
  totalsize = gt_files_estimate_total_size(files);

  flv = gt_calloc((size_t) gt_str_array_size(files),
                  sizeof (GtFilelengthvalues));

  sb = gt_sequence_buffer_new_guess_type(files, err);
  if (!sb) {
    had_err = -1;
  }
  if (!had_err) {
    gt_sequence_buffer_set_filelengthtab(sb, flv);
    /* read input using seqiterator */
    seqit = gt_seq_iterator_sequence_buffer_new_with_buffer(sb);
    if (arguments->verbose)
    {
      gt_progressbar_start(gt_seq_iterator_getcurrentcounter(seqit,
                                                         (GtUint64) totalsize),
                           (GtUint64) totalsize);
    }
    while (true)
    {
      GtUchar *seq = NULL;
      desc = NULL;
      j = 0UL; /* number of characters written to the current output line */
      had_err = gt_seq_iterator_next(seqit, &sequence, &len, &desc, err);
      if (had_err != 1)
        break; /* 0: no more sequences, otherwise: error */
      if (arguments->revcomp) {
        GtUchar *newseq = gt_calloc((size_t) len+1, sizeof (GtUchar));
        memcpy(newseq, sequence, (size_t) len*sizeof (GtUchar));
        had_err = gt_reverse_complement((char*) newseq, len, err);
        if (had_err) {
          /* seq does not own the copy yet, so release it here */
          gt_free(newseq);
          break;
        }
        seq = newseq;
      } else seq = (GtUchar*) sequence;

      if (!arguments->showseq) {
        bool in_wildcard = false;
        gt_file_xprintf(arguments->outfp, ">%s\n", desc);
        for (pos = 0; pos < len; pos++) {
          /* character to write for this position, -1 if nothing is written
             (continuation of a wildcard run) */
          int out = -1;

          if (arguments->reduce_wc_dna) {
            switch (seq[pos]) {
              case 'a':
              case 'A':
              case 'c':
              case 'C':
              case 'g':
              case 'G':
              case 't':
              case 'u':
              case 'T':
              case 'U':
                in_wildcard = false;
                out = (int) seq[pos];
                break;
              default:
                if (!in_wildcard) {
                  in_wildcard = true;
                  if (isupper((int) seq[pos]))
                    out = (int) 'N';
                  else
                    out = (int) 'n';
                }
                break;
            }
          }
          else if (arguments->reduce_wc_prot) {
            switch (seq[pos]) {
              case 'X':
              case 'B':
              case 'Z':
                if (!in_wildcard) {
                  in_wildcard = true;
                  out = (int) 'N';
                }
                break;
              case 'x':
              case 'b':
              case 'z':
                if (!in_wildcard) {
                  in_wildcard = true;
                  out = (int) 'n';
                }
                break;
              default:
                in_wildcard = false;
                out = (int) seq[pos];
                break;
            }
          }
          else {
            out = (int) seq[pos];
          }

          /* the line width is only checked after a character was actually
             written, so skipped wildcard positions cannot produce empty
             lines */
          if (out >= 0) {
            gt_file_xfputc(out, arguments->outfp);
            j++;
            if (arguments->fastawidth > 0 && j % arguments->fastawidth == 0) {
              j = 0;
              gt_file_xprintf(arguments->outfp, "\n");
            }
          }
        }
        /* terminate the last line unless it was just terminated by the
           width check; j counts written characters, which can differ from
           len when wildcard runs are reduced */
        if (arguments->fastawidth == 0 || j != 0)
          gt_file_xprintf(arguments->outfp, "\n");
      }
      if (arguments->revcomp) {
        gt_free(seq);
      }
    }
    if (arguments->showflv) {
      for (j=0;j<gt_str_array_size(files);j++) {
        fprintf(stderr, "file "GT_WU" (%s): "GT_WU"/"GT_WU"\n",
               j,
               gt_str_array_get(files, j),
               (GtUword) flv[j].length,
               (GtUword) flv[j].effectivelength);
      }
    }
    if (arguments->verbose)
    {
      gt_progressbar_stop();
    }
    gt_sequence_buffer_delete(sb);
    gt_seq_iterator_delete(seqit);
  }
  gt_str_array_delete(files);
  gt_free(flv);

  return had_err;
}