/*
  Hand out the next byte of <inputstream>, going through the input buffer
  kept in <fb>.

  As long as unread bytes remain in the buffer, the next one is returned and
  the read position advances. Once the buffer is used up it is refilled with
  up to INPUTFILEBUFFERSIZE bytes via gt_file_xread(); if that refill yields
  nothing, EOF is returned (the read position is left untouched in that case).
*/
static inline int ownbuffer_genfile_getc(GtFastaBuffer *fb,
                                         GtFile *inputstream)
{
  /* fast path: buffer still holds unread data */
  if (fb->currentinpos < fb->currentfillpos)
    return fb->inputbuffer[fb->currentinpos++];

  /* buffer exhausted: fetch the next chunk from the stream */
  fb->currentfillpos = gt_file_xread(inputstream, fb->inputbuffer,
                                     (size_t) INPUTFILEBUFFERSIZE);
  if (fb->currentfillpos == 0)
    return EOF;

  /* restart reading at the beginning of the freshly filled buffer */
  fb->currentinpos = 0;
  return fb->inputbuffer[fb->currentinpos++];
}
/*
  Return the next character for the FASTQ iterator <seqit>.

  If the use_ungetchar flag is set, the flag is cleared and the remembered
  character seqit->ungetchar is returned without touching the buffer.
  Otherwise the next byte of seqit->inbuf is returned, refilling the buffer
  from seqit->curfile (up to GT_SEQIT_QUAL_INBUFSIZE bytes) when it is used
  up; EOF is returned if the refill delivers no data. Every character taken
  from the buffer is also stored in seqit->ungetchar
  (presumably so that a caller can push it back by setting use_ungetchar --
  confirm against the callers).
*/
static inline int fastq_buf_getchar(GtSeqIteratorFastQ *seqit)
{
  /* a pending pushed-back character takes precedence over the buffer */
  if (seqit->use_ungetchar) {
    seqit->use_ungetchar = false;
    return seqit->ungetchar;
  }

  /* refill once all buffered bytes have been consumed */
  if (seqit->currentinpos >= seqit->currentfillpos) {
    seqit->currentfillpos = gt_file_xread(seqit->curfile, seqit->inbuf,
                                          GT_SEQIT_QUAL_INBUFSIZE);
    if (seqit->currentfillpos == 0)
      return EOF;
    seqit->currentinpos = 0;
  }

  /* remember the character being handed out, then advance */
  seqit->ungetchar = seqit->inbuf[seqit->currentinpos];
  seqit->currentinpos++;
  return seqit->ungetchar;
}
/*
  Split the FASTA file <filename> into a series of numbered part files.

  Each part file name is built from the first gt_file_basename_length()
  characters of <filename>, a '.', a running number starting at 1 and the
  suffix belonging to the file mode of the source file. The source is copied
  in chunks of BUFSIZ bytes. As soon as a chunk would push the current part
  beyond <max_filesize> bytes, the chunk is searched for a separator via
  buf_contains_separator(); if one is reported, the current part is closed
  just before it and a new part is started with the '>' as its first byte.
  If none is reported, the whole chunk goes into the current part, so parts
  can grow beyond <max_filesize>.

  <force> and <err> are passed on to gt_outputfile_xopen_forcecheck() when a
  part file is opened. Returns 0 on success and -1 (with <err> set) if the
  source is empty, does not start with '>', or a part file cannot be opened.
*/
static int split_fasta_file(const char *filename, unsigned long max_filesize,
                            bool force, GtError *err)
{
  GtFile *srcfp = NULL, *destfp = NULL;
  GtStr *destfilename = NULL;
  unsigned long filenum = 0, bytecount = 0, separator_pos;
  int read_bytes, had_err = 0;
  char buf[BUFSIZ];

  gt_error_check(err);
  gt_assert(filename && max_filesize);

  /* open source file */
  srcfp = gt_file_xopen(filename, "r");
  gt_assert(srcfp);

  /* read the first chunk and validate it */
  read_bytes = gt_file_xread(srcfp, buf, BUFSIZ);
  bytecount += read_bytes;
  if (read_bytes == 0) {
    gt_error_set(err, "file \"%s\" is empty", filename);
    had_err = -1;
  }
  else if (buf[0] != '>') {
    /* a FASTA file has to start with the separator */
    gt_error_set(err, "file is not in FASTA format");
    had_err = -1;
  }

  if (!had_err) {
    /* open the first part file */
    destfilename = gt_str_new();
    gt_str_append_cstr_nt(destfilename, filename,
                          gt_file_basename_length(filename));
    gt_str_append_char(destfilename, '.');
    gt_str_append_ulong(destfilename, ++filenum);
    gt_str_append_cstr(destfilename,
                       gt_file_mode_suffix(gt_file_mode(srcfp)));
    destfp = gt_outputfile_xopen_forcecheck(gt_str_get(destfilename), "w",
                                            force, err);
    if (!destfp)
      had_err = -1;
    else
      gt_file_xwrite(destfp, buf, read_bytes);

    /* copy the remaining chunks, switching part files where required */
    while (!had_err) {
      read_bytes = gt_file_xread(srcfp, buf, BUFSIZ);
      if (read_bytes == 0)
        break;

      /* 0 means: no split point in this chunk */
      separator_pos = 0;
      if (bytecount + read_bytes > max_filesize) {
        /* first position in the chunk at which the size limit is reached */
        int offset = 0;
        if (bytecount < max_filesize)
          offset = max_filesize - bytecount;
        separator_pos = buf_contains_separator(buf, offset, read_bytes);
      }

      if (!separator_pos) {
        /* chunk belongs completely to the current part */
        bytecount += read_bytes;
        gt_file_xwrite(destfp, buf, read_bytes);
        continue;
      }

      /* reported position is 1-based; turn it into an index into buf */
      separator_pos--;
      gt_assert(separator_pos < read_bytes);

      /* everything in front of the separator still goes to the old part */
      if (separator_pos)
        gt_file_xwrite(destfp, buf, separator_pos);

      /* close current file */
      gt_file_delete(destfp);

      /* open new file */
      gt_str_reset(destfilename);
      gt_str_append_cstr_nt(destfilename, filename,
                            gt_file_basename_length(filename));
      gt_str_append_char(destfilename, '.');
      gt_str_append_ulong(destfilename, ++filenum);
      gt_str_append_cstr(destfilename,
                         gt_file_mode_suffix(gt_file_mode(srcfp)));
      destfp = gt_outputfile_xopen_forcecheck(gt_str_get(destfilename), "w",
                                              force, err);
      if (!destfp) {
        had_err = -1;
        break;
      }

      /* the new part starts with the separator and the rest of the chunk */
      bytecount = read_bytes - separator_pos; /* reset */
      gt_assert(buf[separator_pos] == '>');
      gt_file_xwrite(destfp, buf + separator_pos, read_bytes - separator_pos);
    }
  }

  /* free */
  gt_str_delete(destfilename);

  /* close current file */
  gt_file_delete(destfp);

  /* close source file */
  gt_file_delete(srcfp);

  return had_err;
}
static int gt_fasta_reader_fsm_run(GtFastaReader *fasta_reader, GtFastaReaderProcDescription proc_description, GtFastaReaderProcSequencePart proc_sequence_part, GtFastaReaderProcSequenceLength proc_sequence_length, void *data, GtError *err) { GtFastaReaderFSM *fr = gt_fasta_reader_fsm_cast(fasta_reader); unsigned char cc; GtFastaReaderState state = EXPECTING_SEPARATOR; GtUword sequence_length = 0, line_counter = 1; GtStr *description, *sequence; int had_err = 0; gt_error_check(err); gt_assert(fr); /* init */ description = gt_str_new(); sequence = gt_str_new(); /* at least one function has to be defined */ gt_assert(proc_description || proc_sequence_part || proc_sequence_length); /* rewind sequence file (to allow multiple calls) */ if (fr->sequence_file) gt_file_xrewind(fr->sequence_file); /* reading */ while (!had_err && gt_file_xread(fr->sequence_file, &cc, 1) != 0) { switch (state) { case EXPECTING_SEPARATOR: if (cc != GT_FASTA_SEPARATOR) { gt_error_set(err, "the first character of fasta file \"%s\" has to be '%c'", gt_str_get(fr->sequence_filename), GT_FASTA_SEPARATOR); had_err = -1; } else state = READING_DESCRIPTION; break; case READING_DESCRIPTION: if (cc == '\n') { if (proc_description) { had_err = proc_description(gt_str_get(description), gt_str_length(description), data, err); if (!had_err) gt_str_reset(description); } if (!had_err) { sequence_length = 0; line_counter++; state = READING_SEQUENCE_AFTER_NEWLINE; } } else if (proc_description && cc != '\r') gt_str_append_char(description, cc); break; case READING_SEQUENCE_AFTER_NEWLINE: if (cc == GT_FASTA_SEPARATOR) { if (!sequence_length) { gt_assert(line_counter); gt_error_set(err, "empty sequence after description given in line " ""GT_WU"", line_counter - 1); had_err = -1; break; } else { if (proc_sequence_part) { gt_assert(gt_str_length(sequence)); had_err = proc_sequence_part(gt_str_get(sequence), gt_str_length(sequence), data, err); } if (had_err) break; gt_str_reset(sequence); if (proc_sequence_length) 
had_err = proc_sequence_length(sequence_length, data, err); if (had_err) break; state = READING_DESCRIPTION; continue; } } /*@fallthrough@*/ case READING_SEQUENCE: if (cc == '\n') { line_counter++; state = READING_SEQUENCE_AFTER_NEWLINE; } else { sequence_length++; if (proc_sequence_part) { if (gt_str_length(sequence) == BUFSIZ) { had_err = proc_sequence_part(gt_str_get(sequence), gt_str_length(sequence), data, err); if (had_err) break; gt_str_reset(sequence); } if (cc != ' ' && cc != '\r') gt_str_append_char(sequence, cc); } } break; } } if (!had_err) { /* checks after reading */ switch (state) { case EXPECTING_SEPARATOR: gt_error_set(err, "sequence file \"%s\" is empty", gt_str_get(fr->sequence_filename)); had_err = -1; break; case READING_DESCRIPTION: gt_error_set(err, "unfinished fasta entry in line " GT_WU " of sequence file \"%s\"", line_counter, gt_str_get(fr->sequence_filename)); had_err = -1; break; case READING_SEQUENCE_AFTER_NEWLINE: case READING_SEQUENCE: if (!sequence_length) { gt_assert(line_counter); gt_error_set(err, "empty sequence after description given in line " ""GT_WU"", line_counter - 1); had_err = -1; } else { if (proc_sequence_part) { gt_assert(gt_str_length(sequence)); had_err = proc_sequence_part(gt_str_get(sequence), gt_str_length(sequence), data, err); } if (!had_err && proc_sequence_length) had_err = proc_sequence_length(sequence_length, data, err); } } } /* free */ gt_str_delete(sequence); gt_str_delete(description); return had_err; }