Beispiel #1
0
/* Deliver the next byte of <inputstream> through the buffer kept in <fb>.
   The buffer is refilled with up to INPUTFILEBUFFERSIZE bytes via
   gt_file_xread() once all buffered bytes have been handed out.
   Returns EOF when a refill yields no bytes. */
static inline int ownbuffer_genfile_getc(GtFastaBuffer *fb,
                                         GtFile *inputstream)
{
  /* fast path: there are still unread bytes in the buffer */
  if (fb->currentinpos < fb->currentfillpos)
    return fb->inputbuffer[fb->currentinpos++];

  /* buffer exhausted: try to fetch the next chunk */
  fb->currentfillpos = gt_file_xread(inputstream,
                                     fb->inputbuffer,
                                     (size_t) INPUTFILEBUFFERSIZE);
  if (fb->currentfillpos == 0)
    return EOF;

  /* hand out the first byte of the fresh chunk */
  fb->currentinpos = 0;
  return fb->inputbuffer[fb->currentinpos++];
}
Beispiel #2
0
/* Return the next character for the FASTQ iterator <seqit>.
   If a character was pushed back (use_ungetchar is set), the flag is cleared
   and the remembered character is returned again. Otherwise the next byte is
   taken from the input buffer, which is refilled from seqit->curfile with up
   to GT_SEQIT_QUAL_INBUFSIZE bytes when exhausted. Every byte taken from the
   buffer is remembered in seqit->ungetchar. Returns EOF when a refill yields
   no bytes. */
static inline int fastq_buf_getchar(GtSeqIteratorFastQ *seqit)
{
  /* a pushed-back character takes precedence over the buffer */
  if (seqit->use_ungetchar) {
    seqit->use_ungetchar = false;
    return seqit->ungetchar;
  }

  /* refill the buffer once everything in it has been consumed */
  if (seqit->currentinpos >= seqit->currentfillpos) {
    seqit->currentfillpos = gt_file_xread(seqit->curfile, seqit->inbuf,
                                           GT_SEQIT_QUAL_INBUFSIZE);
    if (seqit->currentfillpos == 0)
      return EOF;
    seqit->currentinpos = 0;
  }

  /* remember the character so that it can be handed out again later */
  seqit->ungetchar = seqit->inbuf[seqit->currentinpos++];
  return seqit->ungetchar;
}
Beispiel #3
0
static int split_fasta_file(const char *filename, unsigned long max_filesize,
                            bool force, GtError *err)
{
  GtFile *srcfp = NULL, *destfp = NULL;
  GtStr *destfilename = NULL;
  unsigned long filenum = 0, bytecount = 0, separator_pos;
  int read_bytes, had_err = 0;
  char buf[BUFSIZ];

  gt_error_check(err);
  gt_assert(filename && max_filesize);

  /* open source file */
  srcfp = gt_file_xopen(filename, "r");
  gt_assert(srcfp);

  /* read start characters */
  if ((read_bytes = gt_file_xread(srcfp, buf, BUFSIZ)) == 0) {
    gt_error_set(err, "file \"%s\" is empty", filename);
    had_err = -1;
  }
  bytecount += read_bytes;

  /* make sure the file is in fasta format */
  if (!had_err && buf[0] != '>') {
    gt_error_set(err, "file is not in FASTA format");
    had_err = -1;
  }

  if (!had_err) {
    /* open destination file */
    destfilename = gt_str_new();
    gt_str_append_cstr_nt(destfilename, filename,
                          gt_file_basename_length(filename));
    gt_str_append_char(destfilename, '.');
    gt_str_append_ulong(destfilename, ++filenum);
    gt_str_append_cstr(destfilename,
                       gt_file_mode_suffix(gt_file_mode(srcfp)));
    if (!(destfp = gt_outputfile_xopen_forcecheck(gt_str_get(destfilename), "w",
                                                  force, err))) {
      had_err = -1;
    }
    if (!had_err)
      gt_file_xwrite(destfp, buf, read_bytes);

    while (!had_err &&
           (read_bytes = gt_file_xread(srcfp, buf, BUFSIZ)) != 0) {
      if (bytecount + read_bytes > max_filesize) {
        int offset = bytecount < max_filesize ? max_filesize - bytecount : 0;
        if ((separator_pos = buf_contains_separator(buf, offset, read_bytes))) {
          separator_pos--;
          gt_assert(separator_pos < read_bytes);
          if (separator_pos)
            gt_file_xwrite(destfp, buf, separator_pos);
          /* close current file */
          gt_file_delete(destfp);
          /* open new file */
          gt_str_reset(destfilename);
          gt_str_append_cstr_nt(destfilename, filename,
                                gt_file_basename_length(filename));
          gt_str_append_char(destfilename, '.');
          gt_str_append_ulong(destfilename, ++filenum);
          gt_str_append_cstr(destfilename,
                             gt_file_mode_suffix(gt_file_mode(srcfp)));
          if (!(destfp =
                  gt_outputfile_xopen_forcecheck(gt_str_get(destfilename), "w",
                                                 force, err))) {
            had_err = -1;
            break;
          }
          bytecount = read_bytes - separator_pos; /* reset */
          gt_assert(buf[separator_pos] == '>');
          gt_file_xwrite(destfp, buf + separator_pos,
                         read_bytes - separator_pos);
          continue;
        }
      }
      bytecount += read_bytes;
      gt_file_xwrite(destfp, buf, read_bytes);
    }
  }

  /* free */
  gt_str_delete(destfilename);

  /* close current file */
  gt_file_delete(destfp);

  /* close source file */
  gt_file_delete(srcfp);

  return had_err;
}
/*
  Parse the sequence file of <fasta_reader> byte by byte with a small state
  machine and report the parsed entries through the given callbacks (at least
  one of them has to be non-NULL):

  - <proc_description> is called at the end of every description line with
    the collected description (carriage returns are dropped) and its length.
  - <proc_sequence_part> is called with a part of the current sequence
    whenever BUFSIZ sequence characters have been collected and with the
    (non-empty) rest at the end of every entry.
  - <proc_sequence_length> is called once per entry with the number of
    sequence characters of that entry.

  Blanks and carriage returns inside sequence lines are neither part of the
  delivered sequence nor counted in the sequence length, newlines only
  advance the line counter.

  <data> is passed through to all callbacks. If fr->sequence_file is non-NULL
  it is rewound first, so that the function can be called repeatedly.

  Returns 0 on success. Returns -1 and sets <err> if the input does not start
  with GT_FASTA_SEPARATOR, is empty, ends inside a description line, or
  contains an entry without sequence characters. A non-zero return value of a
  callback stops the parsing and is returned as is.
*/
static int gt_fasta_reader_fsm_run(GtFastaReader *fasta_reader,
                                   GtFastaReaderProcDescription
                                   proc_description,
                                   GtFastaReaderProcSequencePart
                                   proc_sequence_part,
                                   GtFastaReaderProcSequenceLength
                                   proc_sequence_length,
                                   void *data, GtError *err)
{
  GtFastaReaderFSM *fr = gt_fasta_reader_fsm_cast(fasta_reader);
  unsigned char cc;
  GtFastaReaderState state = EXPECTING_SEPARATOR;
  GtUword sequence_length = 0, line_counter = 1;
  GtStr *description, *sequence;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(fr);

  /* init */
  description = gt_str_new();
  sequence    = gt_str_new();

  /* at least one function has to be defined */
  gt_assert(proc_description || proc_sequence_part || proc_sequence_length);

  /* rewind sequence file (to allow multiple calls) */
  /* NOTE(review): a NULL sequence_file is not rewound but still passed to
     gt_file_xread() below -- presumably that means stdin; confirm against
     the GtFile implementation */
  if (fr->sequence_file)
    gt_file_xrewind(fr->sequence_file);

  /* reading */
  while (!had_err && gt_file_xread(fr->sequence_file, &cc, 1) != 0) {
    switch (state) {
      case EXPECTING_SEPARATOR:
        /* very first character of the input */
        if (cc != GT_FASTA_SEPARATOR) {
          gt_error_set(err,
                    "the first character of fasta file \"%s\" has to be '%c'",
                    gt_str_get(fr->sequence_filename), GT_FASTA_SEPARATOR);
          had_err = -1;
        }
        else
          state = READING_DESCRIPTION;
        break;
      case READING_DESCRIPTION:
        if (cc == '\n') {
          /* description line complete: report it and start a new sequence */
          if (proc_description) {
            had_err = proc_description(gt_str_get(description),
                                       gt_str_length(description), data, err);
            if (!had_err)
              gt_str_reset(description);
          }
          if (!had_err) {
            sequence_length = 0;
            line_counter++;
            state = READING_SEQUENCE_AFTER_NEWLINE;
          }
        }
        else if (proc_description && cc != '\r')
          gt_str_append_char(description, cc);
        break;
      case READING_SEQUENCE_AFTER_NEWLINE:
        if (cc == GT_FASTA_SEPARATOR) {
          /* a separator ends the current entry */
          if (!sequence_length) {
            gt_assert(line_counter);
            gt_error_set(err, "empty sequence after description given in line "
                              ""GT_WU"", line_counter - 1);
            had_err = -1;
            break;
          }
          else {
            /* deliver the rest of the sequence and its total length */
            if (proc_sequence_part) {
              gt_assert(gt_str_length(sequence));
              had_err = proc_sequence_part(gt_str_get(sequence),
                                           gt_str_length(sequence), data, err);
            }
            if (had_err)
              break;
            gt_str_reset(sequence);
            if (proc_sequence_length)
              had_err = proc_sequence_length(sequence_length, data, err);
            if (had_err)
              break;
            state = READING_DESCRIPTION;
            continue; /* next iteration of the reading loop */
          }
        }
        /* NOTE(review): nothing in this function ever assigns
           READING_SEQUENCE to <state>, so the separator check above is
           applied to every sequence character and not only to the first
           one of a line -- confirm whether that is intended */
        /*@fallthrough@*/
      case READING_SEQUENCE:
        if (cc == '\n') {
          line_counter++;
          state = READING_SEQUENCE_AFTER_NEWLINE;
        }
        else if (cc != ' ' && cc != '\r') {
          /* Blanks and carriage returns are skipped before counting and
             before flushing. Counting them made the reported length disagree
             with the delivered sequence, and a flush triggered by a skipped
             character left an empty buffer behind, which broke the
             non-empty assertions at the end of an entry. */
          sequence_length++;
          if (proc_sequence_part) {
            /* hand over a full buffer before storing the next character */
            if (gt_str_length(sequence) == BUFSIZ) {
              had_err = proc_sequence_part(gt_str_get(sequence),
                                           gt_str_length(sequence), data, err);
              if (had_err)
                break;
              gt_str_reset(sequence);
            }
            gt_str_append_char(sequence, cc);
          }
        }
        break;
    }
  }

  if (!had_err) {
    /* checks after reading */
    switch (state) {
      case EXPECTING_SEPARATOR:
        /* not a single byte was read */
        gt_error_set(err, "sequence file \"%s\" is empty",
                  gt_str_get(fr->sequence_filename));
        had_err = -1;
        break;
      case READING_DESCRIPTION:
        /* input ended before the description line was terminated */
        gt_error_set(err, "unfinished fasta entry in line " GT_WU
                     " of sequence file \"%s\"",
                     line_counter, gt_str_get(fr->sequence_filename));
        had_err = -1;
        break;
      case READING_SEQUENCE_AFTER_NEWLINE:
      case READING_SEQUENCE:
        /* input ended inside a sequence: finish the last entry */
        if (!sequence_length) {
          gt_assert(line_counter);
          gt_error_set(err, "empty sequence after description given in line "
                            ""GT_WU"", line_counter - 1);
          had_err = -1;
        }
        else {
          if (proc_sequence_part) {
            gt_assert(gt_str_length(sequence));
            had_err = proc_sequence_part(gt_str_get(sequence),
                                         gt_str_length(sequence), data, err);
          }
          if (!had_err && proc_sequence_length)
            had_err = proc_sequence_length(sequence_length, data, err);
        }
    }
  }

  /* free */
  gt_str_delete(sequence);
  gt_str_delete(description);

  return had_err;
}