/*
 * Refill the output buffer of the FASTA sequence buffer <sb>.
 *
 * Characters are fetched one at a time via inlinebuf_getchar() from the
 * current input file. While inside a description line (from a FASTASEPARATOR
 * up to the next NEWLINESYMBOL) the characters are forwarded to pvt->descptr,
 * if one is set; CRSYMBOL characters are dropped from the description.
 * Outside of a description, whitespace is skipped and every other character
 * is handed to process_char() together with the current output position.
 * A SEPARATOR is written to pvt->outbuf in front of every sequence except the
 * very first one.
 *
 * The loop ends when OUTBUFSIZE output positions have been produced or when
 * the last file named in pvt->filenametab hit EOF (pvt->complete is set in
 * that case). If pvt->filelengthtab is present, the number of characters read
 * (length) and the number of processed sequence characters plus separators
 * between sequences of the same file (effectivelength) are accumulated for
 * the current file.
 *
 * Returns 0 on success, with pvt->nextfree set to the number of produced
 * output positions. Returns the non-zero code of process_char() if that call
 * fails, and -2 with <err> set if no FASTA separator has been seen so far.
 */
static int gt_sequence_buffer_fasta_advance(GtSequenceBuffer *sb, GtError *err)
{
  GtSequenceBufferFasta *sbf;
  GtSequenceBufferMembers *pvt;
  GtUword outpos = 0,    /* next position to fill in pvt->outbuf */
          nof_added = 0, /* pending effectivelength of the current file */
          nof_read = 0;  /* pending length of the current file */
  int cc, ret = 0;

  gt_error_check(err);
  sbf = (GtSequenceBufferFasta*) sb;
  pvt = sb->pvt;

  for (;;)
  {
    /* output buffer full: book the pending counts and stop */
    if (outpos >= (GtUword) OUTBUFSIZE)
    {
      if (pvt->filelengthtab != NULL)
      {
        pvt->filelengthtab[pvt->filenum].length += (uint64_t) nof_read;
        pvt->filelengthtab[pvt->filenum].effectivelength
          += (uint64_t) nof_added;
      }
      break;
    }

    /* a new input file has to be opened before anything can be read */
    if (sbf->nextfile)
    {
      if (pvt->filelengthtab != NULL)
      {
        pvt->filelengthtab[pvt->filenum].length = 0;
        pvt->filelengthtab[pvt->filenum].effectivelength = 0;
      }
      sbf->nextfile = false;
      sbf->indesc = false;
      sbf->firstseqinfile = true;
      nof_added = 0;
      nof_read = 0;
      pvt->linenum = (uint64_t) 1;
      pvt->inputstream = gt_file_xopen(gt_str_array_get(pvt->filenametab,
                                                 (GtUword) pvt->filenum),
                                       "rb");
      pvt->currentinpos = 0;
      pvt->currentfillpos = 0;
      continue;
    }

    cc = inlinebuf_getchar(sb, pvt->inputstream);

    /* end of the current file: close it and move on to the next, if any */
    if (cc == EOF)
    {
      gt_file_delete(pvt->inputstream);
      pvt->inputstream = NULL;
      if (pvt->filelengthtab != NULL)
      {
        pvt->filelengthtab[pvt->filenum].length += nof_read;
        pvt->filelengthtab[pvt->filenum].effectivelength += nof_added;
      }
      if ((GtUword) pvt->filenum == gt_str_array_size(pvt->filenametab)-1)
      {
        pvt->complete = true;
        break;
      }
      pvt->filenum++;
      sbf->nextfile = true;
      continue;
    }

    nof_read++;

    /* inside a description line */
    if (sbf->indesc)
    {
      if (cc == NEWLINESYMBOL)
      {
        pvt->linenum++;
        sbf->indesc = false;
        if (pvt->descptr != NULL)
          gt_desc_buffer_finish(pvt->descptr);
      }
      else if (pvt->descptr != NULL && cc != CRSYMBOL)
      {
        gt_desc_buffer_append_char(pvt->descptr, cc);
      }
      continue;
    }

    /* whitespace between sequence characters is ignored */
    if (isspace((int) cc))
      continue;

    /* ordinary sequence character */
    if (cc != FASTASEPARATOR)
    {
      ret = process_char(sb, outpos, (unsigned char) cc, err);
      if (ret)
        return ret;
      outpos++;
      nof_added++;
      continue;
    }

    /* start of a new sequence entry */
    if (sbf->firstoverallseq)
    {
      /* nothing precedes the very first sequence, so no separator */
      sbf->firstoverallseq = false;
      sbf->firstseqinfile = false;
    }
    else
    {
      /* the separator in front of a file's first sequence is not added to
         that file's effective length */
      if (sbf->firstseqinfile)
        sbf->firstseqinfile = false;
      else
        nof_added++;
      pvt->outbuf[outpos++] = (unsigned char) SEPARATOR;
      pvt->lastspeciallength++;
    }
    sbf->indesc = true;
  }

  if (sbf->firstoverallseq)
  {
    gt_error_set(err,"no sequences in multiple fasta file(s) %s ...",
                 gt_str_array_get(pvt->filenametab,0));
    return -2;
  }
  pvt->nextfree = outpos;
  return 0;
}
/*
 * Split the FASTA file <filename> into numbered pieces.
 *
 * The destination files are named after the first
 * gt_file_basename_length(filename) characters of <filename>, followed by
 * '.', a running number starting at 1, and the file mode suffix of the source
 * file. The source is copied in chunks of at most BUFSIZ bytes. As soon as
 * writing a chunk would push the current piece beyond <max_filesize> bytes,
 * the chunk is searched with buf_contains_separator(), starting at the offset
 * where the size limit is crossed; if a position is reported, the current
 * piece ends right before it and the next piece starts there (the code
 * asserts that this position holds '>'). If nothing is reported, the whole
 * chunk goes to the current piece.
 *
 * <force> and <err> are passed on to gt_outputfile_xopen_forcecheck().
 *
 * Returns 0 on success and -1 if the source file is empty, does not start
 * with '>', or a destination file could not be opened; <err> is set for the
 * first two cases.
 */
static int split_fasta_file(const char *filename, unsigned long max_filesize,
                            bool force, GtError *err)
{
  char buf[BUFSIZ];
  GtFile *srcfp = NULL, *destfp = NULL;
  GtStr *destfilename = NULL;
  unsigned long filenum = 0, bytecount = 0, separator_pos;
  int read_bytes, had_err = 0;

  gt_error_check(err);
  gt_assert(filename && max_filesize);

  /* open source file */
  srcfp = gt_file_xopen(filename, "r");
  gt_assert(srcfp);

  /* read the first chunk and make sure it looks like FASTA */
  read_bytes = gt_file_xread(srcfp, buf, BUFSIZ);
  if (read_bytes == 0) {
    gt_error_set(err, "file \"%s\" is empty", filename);
    had_err = -1;
  }
  else if (buf[0] != '>') {
    gt_error_set(err, "file is not in FASTA format");
    had_err = -1;
  }
  bytecount += read_bytes;

  if (!had_err) {
    /* open first destination file */
    destfilename = gt_str_new();
    gt_str_append_cstr_nt(destfilename, filename,
                          gt_file_basename_length(filename));
    gt_str_append_char(destfilename, '.');
    gt_str_append_ulong(destfilename, ++filenum);
    gt_str_append_cstr(destfilename,
                       gt_file_mode_suffix(gt_file_mode(srcfp)));
    destfp = gt_outputfile_xopen_forcecheck(gt_str_get(destfilename), "w",
                                            force, err);
    if (!destfp)
      had_err = -1;
    else
      gt_file_xwrite(destfp, buf, read_bytes);

    while (!had_err) {
      read_bytes = gt_file_xread(srcfp, buf, BUFSIZ);
      if (read_bytes == 0)
        break; /* source exhausted */

      /* look for a split point only if this chunk exceeds the size limit */
      separator_pos = 0;
      if (bytecount + read_bytes > max_filesize) {
        int offset = 0;
        if (bytecount < max_filesize)
          offset = (int) (max_filesize - bytecount);
        separator_pos = buf_contains_separator(buf, offset, read_bytes);
      }

      if (!separator_pos) {
        /* no split in this chunk -> copy it as a whole */
        bytecount += read_bytes;
        gt_file_xwrite(destfp, buf, read_bytes);
        continue;
      }

      /* the reported position is off by one; 0 is reserved for "none" */
      separator_pos--;
      gt_assert(separator_pos < read_bytes);
      if (separator_pos)
        gt_file_xwrite(destfp, buf, separator_pos);

      /* close current file */
      gt_file_delete(destfp);

      /* open new file */
      gt_str_reset(destfilename);
      gt_str_append_cstr_nt(destfilename, filename,
                            gt_file_basename_length(filename));
      gt_str_append_char(destfilename, '.');
      gt_str_append_ulong(destfilename, ++filenum);
      gt_str_append_cstr(destfilename,
                         gt_file_mode_suffix(gt_file_mode(srcfp)));
      destfp = gt_outputfile_xopen_forcecheck(gt_str_get(destfilename), "w",
                                              force, err);
      if (!destfp) {
        had_err = -1;
        break;
      }

      /* the remainder of the chunk starts the new piece */
      bytecount = read_bytes - separator_pos; /* reset */
      gt_assert(buf[separator_pos] == '>');
      gt_file_xwrite(destfp, buf + separator_pos, read_bytes - separator_pos);
    }
  }

  /* free */
  gt_str_delete(destfilename);

  /* close current file */
  gt_file_delete(destfp);

  /* close source file */
  gt_file_delete(srcfp);

  return had_err;
}
/*
 * Deliver the next genome node of the plain GFF3 input stream <ns> in <*gn>.
 *
 * If the node buffer holds more than one node, the next one is served
 * directly. Otherwise the GFF3 parser is called (up to twice per round) on
 * the currently open input to refill the buffer; input files are opened one
 * after another as needed. Without any file names a NULL file pointer is
 * handed to the parser and the input is reported as "stdin"; the file name
 * "-" is opened via gt_file_xopen(NULL, "r") and may be given only once.
 * If is->ensure_sorting is set and more than one node is buffered, the buffer
 * is checked with buffer_is_sorted() before a node is handed out.
 *
 * Returns 0 on success; <*gn> is NULL once all input has been consumed.
 * Returns a non-zero value if opening, parsing or the sorting check failed;
 * in that case <*gn> is either NULL or left untouched.
 */
static int gff3_in_stream_plain_next(GtNodeStream *ns, GtGenomeNode **gn,
                                     GtError *err)
{
  GtGFF3InStreamPlain *is = gff3_in_stream_plain_cast(ns);
  GtStr *filenamestr;
  int had_err = 0, status_code, round;
  bool has_files;

  gt_error_check(err);

  if (gt_queue_size(is->genome_node_buffer) > 1) {
    /* we still have at least two nodes in the buffer -> serve from there */
    *gn = gt_queue_get(is->genome_node_buffer);
    return 0;
  }

  /* the buffer is empty or has one element */
  gt_assert(gt_queue_size(is->genome_node_buffer) <= 1);

  has_files = gt_str_array_size(is->files) != 0;

  for (;;) {
    /* open file if necessary */
    if (!is->file_is_open) {
      if (has_files) {
        const char *next_name;

        /* all given files have been processed */
        if (is->next_file == gt_str_array_size(is->files))
          break;

        next_name = gt_str_array_get(is->files, is->next_file);
        if (strcmp(next_name, "-") != 0) {
          is->fpin = gt_file_xopen(next_name, "r");
          is->file_is_open = true;
        }
        else {
          if (is->stdin_argument) {
            gt_error_set(err, "multiple specification of argument file \"-\"");
            had_err = -1;
            break;
          }
          is->fpin = gt_file_xopen(NULL, "r");
          is->file_is_open = true;
          is->stdin_argument = true;
        }
        is->next_file++;
      }
      else {
        /* no file names given */
        if (is->stdin_processed)
          break;
        is->fpin = NULL;
        is->file_is_open = true;
      }

      is->line_number = 0;

      /* had_err is always 0 here: the only failure above leaves the loop */
      if (is->progress_bar) {
        printf("processing file \"%s\"\n",
               has_files ? gt_str_array_get(is->files, is->next_file-1)
                         : "stdin");
        if (is->fpin) {
          gt_progressbar_start(&is->line_number,
                               gt_file_number_of_lines(
                                 gt_str_array_get(is->files,
                                                  is->next_file-1)));
        }
      }
    }

    gt_assert(is->file_is_open);

    if (has_files)
      filenamestr = gt_str_array_get_str(is->files, is->next_file-1);
    else
      filenamestr = is->stdinstr;

    /* call the parser at most twice; the second call is skipped if the first
       one failed or reported EOF */
    for (round = 0; round < 2; round++) {
      had_err = gt_gff3_parser_parse_genome_nodes(is->gff3_parser,
                                                  &status_code,
                                                  is->genome_node_buffer,
                                                  is->used_types, filenamestr,
                                                  &is->line_number, is->fpin,
                                                  err);
      if (had_err || status_code == EOF)
        break;
    }
    if (had_err)
      break;

    if (status_code == EOF) {
      /* end of current file */
      if (is->progress_bar)
        gt_progressbar_stop();
      gt_file_delete(is->fpin);
      is->fpin = NULL;
      is->file_is_open = false;
      gt_gff3_parser_reset(is->gff3_parser);
      if (has_files)
        continue; /* try the next file */
      is->stdin_processed = true;
      break;
    }

    gt_assert(gt_queue_size(is->genome_node_buffer));

    /* make sure the parsed nodes are sorted */
    if (is->ensure_sorting && gt_queue_size(is->genome_node_buffer) > 1) {
      GtGenomeNode *last_node = NULL;
      /* a sorted stream can have at most one input file */
      gt_assert(gt_str_array_size(is->files) == 0 ||
                gt_str_array_size(is->files) == 1);
      had_err = gt_queue_iterate(is->genome_node_buffer, buffer_is_sorted,
                                 &last_node, err);
    }
    if (!had_err)
      *gn = gt_queue_get(is->genome_node_buffer);
    return had_err;
  }

  /* no more input, or an error occurred */
  gt_assert(!gt_queue_size(is->genome_node_buffer));
  *gn = NULL;
  return had_err;
}