Exemple #1
0
/**
 * Reads paired-end FASTQ records from two files in lockstep.
 *
 * Each iteration reads one four-line record from fq_file1 and one from
 * fq_file2, builds a fastq_read_t for each with fastq_read_new() and inserts
 * them into reads (the record from fq_file1 first). Reading stops once the
 * accumulated size reaches bytes, when fq_file1 has no more lines, or when a
 * record in either file is incomplete (the incomplete pair is discarded).
 *
 * @param reads    list receiving the new reads
 * @param bytes    size threshold; checked before each pair, so the returned
 *                 size can exceed it by up to one pair
 * @param fq_file1 first FASTQ file (fd is read with fgets)
 * @param fq_file2 second FASTQ file (fd is read with fgets)
 * @return sum of the header, sequence and quality line lengths of every pair
 *         inserted, measured with strlen() before the line ending is removed
 *
 * NOTE(review): lines longer than MAX_READ_ID_LENGTH / MAX_READ_SEQUENCE_LENGTH
 * are split by fgets and would desynchronize the four-line framing.
 */
size_t fastq_fread_bytes_aligner_pe(array_list_t *reads, size_t bytes, fastq_file_t *fq_file1, fastq_file_t *fq_file2) {
	size_t accumulated_size = 0;
	char header1[MAX_READ_ID_LENGTH];
	char header2[MAX_READ_ID_LENGTH];
	char read_separator[MAX_READ_ID_LENGTH];
	char sequence1[MAX_READ_SEQUENCE_LENGTH];
	char sequence2[MAX_READ_SEQUENCE_LENGTH];
	char qualities1[MAX_READ_SEQUENCE_LENGTH];
	char qualities2[MAX_READ_SEQUENCE_LENGTH];
	int header_length1, sequence_length1, quality_length1;
	int header_length2, sequence_length2, quality_length2;
	fastq_read_t *read1, *read2;

	while (accumulated_size < bytes && fgets(header1, MAX_READ_ID_LENGTH, fq_file1->fd) != NULL) {
		// remaining three lines of the record in the first file;
		// a truncated record must not be parsed from stale buffers
		if (fgets(sequence1, MAX_READ_SEQUENCE_LENGTH, fq_file1->fd) == NULL ||
		    fgets(read_separator, MAX_READ_ID_LENGTH, fq_file1->fd) == NULL ||
		    fgets(qualities1, MAX_READ_SEQUENCE_LENGTH, fq_file1->fd) == NULL) {
			break;
		}

		// matching record in the second file; stop if it is missing or truncated
		if (fgets(header2, MAX_READ_ID_LENGTH, fq_file2->fd) == NULL ||
		    fgets(sequence2, MAX_READ_SEQUENCE_LENGTH, fq_file2->fd) == NULL ||
		    fgets(read_separator, MAX_READ_ID_LENGTH, fq_file2->fd) == NULL ||
		    fgets(qualities2, MAX_READ_SEQUENCE_LENGTH, fq_file2->fd) == NULL) {
			break;
		}

		header_length1 = strlen(header1);
		sequence_length1 = strlen(sequence1);
		quality_length1 = strlen(qualities1);

		// '\n' char is removed, but '\0' is left
		chomp_at(header1, header_length1 - 1);
		chomp_at(sequence1, sequence_length1 - 1);
		chomp_at(qualities1, quality_length1 - 1);

		header_length2 = strlen(header2);
		sequence_length2 = strlen(sequence2);
		quality_length2 = strlen(qualities2);

		// '\n' char is removed, but '\0' is left
		chomp_at(header2, header_length2 - 1);
		chomp_at(sequence2, sequence_length2 - 1);
		chomp_at(qualities2, quality_length2 - 1);

		// the line buffers are reused on the next iteration
		// (presumably fastq_read_new copies them; verify against its definition)
		read1 = fastq_read_new(header1, sequence1, qualities1);
		read2 = fastq_read_new(header2, sequence2, qualities2);

		array_list_insert(read1, reads);
		array_list_insert(read2, reads);

		// lengths were taken before chomping, so they include the line ending
		accumulated_size += header_length1 + sequence_length1 + quality_length1 + header_length2 + sequence_length2 + quality_length2;
	}

	return accumulated_size;
}
Exemple #2
0
/**
 * Reads single-end FASTQ records from a file until a size threshold is reached.
 *
 * Each iteration reads one four-line record, builds a fastq_read_t with
 * fastq_read_new() and inserts it into reads. Reading stops once the
 * accumulated size reaches bytes, when the file has no more lines, or when a
 * record is incomplete (the incomplete record is discarded).
 *
 * @param reads   list receiving the new reads
 * @param bytes   size threshold; checked before each record, so the returned
 *                size can exceed it by up to one record
 * @param fq_file FASTQ file (fd is read with fgets)
 * @return sum of the header, sequence and quality line lengths of every record
 *         inserted, measured with strlen() before the line ending is removed
 */
size_t fastq_fread_bytes_se(array_list_t *reads, size_t bytes, fastq_file_t *fq_file) {
	size_t accumulated_size = 0;
	char header1[MAX_READ_ID_LENGTH];
	char sequence[MAX_READ_SEQUENCE_LENGTH];
	char header2[MAX_READ_ID_LENGTH];	// separator line: read to advance the file, content unused
	char qualities[MAX_READ_SEQUENCE_LENGTH];
	int header_length, sequence_length, quality_length;
	fastq_read_t *read;

	while (accumulated_size < bytes && fgets(header1, MAX_READ_ID_LENGTH, fq_file->fd) != NULL) {
		// a truncated record must not be parsed from stale or uninitialized buffers
		if (fgets(sequence, MAX_READ_SEQUENCE_LENGTH, fq_file->fd) == NULL ||
		    fgets(header2, MAX_READ_ID_LENGTH, fq_file->fd) == NULL ||
		    fgets(qualities, MAX_READ_SEQUENCE_LENGTH, fq_file->fd) == NULL) {
			break;
		}

		header_length = strlen(header1);
		sequence_length = strlen(sequence);
		quality_length = strlen(qualities);

		// '\n' char is removed, but '\0' is left
		chomp_at(header1, header_length - 1);
		chomp_at(sequence, sequence_length - 1);
		chomp_at(qualities, quality_length - 1);

		read = fastq_read_new(header1, sequence, qualities);
		array_list_insert(read, reads);

		// lengths were taken before chomping, so they include the line ending
		accumulated_size += header_length + sequence_length + quality_length;
	}

	return accumulated_size;
}
Exemple #3
0
/**
 * Reads up to num_reads single-end FASTQ records from a file.
 *
 * For each four-line record the header is cut at its first space and its
 * first character is skipped before the read is built with fastq_read_new()
 * and inserted into reads. Reading stops after num_reads records, when the
 * file has no more lines, or when a record is incomplete (the incomplete
 * record is discarded and not counted).
 *
 * @param reads     list receiving the new reads
 * @param num_reads maximum number of records to read
 * @param fq_file   FASTQ file (fd is read with fgets)
 * @return number of records inserted by this call
 */
size_t fastq_fread_se_ex(array_list_t *reads, size_t num_reads, fastq_file_t *fq_file) {
  size_t count = 0;
  char *p;
  char header1[MAX_READ_ID_LENGTH];
  char sequence[MAX_READ_SEQUENCE_LENGTH];
  char header2[MAX_READ_ID_LENGTH];   // separator line: read to advance the file, content unused
  char qualities[MAX_READ_SEQUENCE_LENGTH];
  int header_length, sequence_length, quality_length;
  fastq_read_t *read;

  while (count < num_reads && fgets(header1, MAX_READ_ID_LENGTH, fq_file->fd) != NULL) {
    // a truncated record must not be parsed from stale or uninitialized buffers
    if (fgets(sequence, MAX_READ_SEQUENCE_LENGTH, fq_file->fd) == NULL ||
        fgets(header2, MAX_READ_ID_LENGTH, fq_file->fd) == NULL ||
        fgets(qualities, MAX_READ_SEQUENCE_LENGTH, fq_file->fd) == NULL) {
      break;
    }

    header_length = strlen(header1);
    sequence_length = strlen(sequence);
    quality_length = strlen(qualities);

    // '\n' char is removed, but '\0' is left
    chomp_at(header1, header_length - 1);
    // keep only the part of the header before the first space
    if ((p = strstr(header1, " ")) != NULL) {
      *p = 0;
    }
    chomp_at(sequence, sequence_length - 1);
    chomp_at(qualities, quality_length - 1);

    // &header1[1] skips the first header character (presumably the leading '@')
    read = fastq_read_new(&header1[1], sequence, qualities);
    array_list_insert(read, reads);

    count++;
  }

  return count;
}
Exemple #4
0
/**
 * Reads single-end FASTQ records from a gzip-compressed file.
 *
 * Inflated text left over from a previous call (fq_gzfile->data, of length
 * fq_gzfile->data_size) is prepended to the newly inflated text. Compressed
 * input is read in CHUNK-sized pieces until at least 4 * num_reads newlines
 * have been inflated by this call or the stream ends. Up to 4 * num_reads
 * complete lines are then parsed: lines 0, 1 and 3 of each group of four are
 * taken as id, sequence and quality, and one read per group is built with
 * fastq_read_new() and inserted into reads. Unparsed text is stored back
 * into fq_gzfile->data for the next call.
 *
 * @param reads     list receiving the new reads
 * @param num_reads number of records requested
 * @param fq_gzfile gzip FASTQ file state (fd, zlib stream, carry-over buffer)
 * @return reads->size after the call, i.e. the total number of elements in
 *         the list, not only those added here. On failure a zlib code is
 *         returned converted to size_t: Z_ERRNO on a file read error,
 *         Z_DATA_ERROR or Z_MEM_ERROR from inflate(), Z_MEM_ERROR when a
 *         local allocation fails.
 *         NOTE(review): zlib error codes are negative, so callers see them
 *         as very large size_t values — confirm how callers detect errors.
 */
size_t fastq_gzread_se(array_list_t *reads, size_t num_reads, fastq_gzfile_t *fq_gzfile) {
	size_t count = 0;	/* complete lines parsed; count % 4 is the position inside a record */
	fastq_read_t *read;

	size_t num_lines_to_read = 4 * num_reads;	/* Each read consists of 4 lines */

	int max_data_len = CHUNK;
	int max_read_len = MAX_READ_SEQUENCE_LENGTH;	/* capacity of id/seq/qual; longer lines are truncated */
	int c = 0;
	int i = 0;		/* number of valid bytes in data */
	size_t lines = 0;	/* newlines inflated by this call */
	size_t parsed_chars;
	size_t line_start = 0;	/* offset in data of the line currently being scanned */
	size_t data_size;
	size_t result;
	char *data = NULL;
	char *saved;
	char *id = (char*) calloc (max_read_len, sizeof(char));
	char *seq = (char*) calloc (max_read_len, sizeof(char));
	char *qual = (char*) calloc (max_read_len, sizeof(char));

	// ZLIB variables
	unsigned have;
	unsigned char in[CHUNK];
	unsigned char out[CHUNK];

	if (id == NULL || seq == NULL || qual == NULL) {
		result = (size_t) Z_MEM_ERROR;
		goto cleanup;
	}

	// If there is some data from before calls, make room for it plus one chunk
	if (fq_gzfile->data != NULL && fq_gzfile->data_size > max_data_len) {
		max_data_len = fq_gzfile->data_size + max_data_len;
	}
	data = (char*) calloc (max_data_len, sizeof(char));
	if (data == NULL) {
		result = (size_t) Z_MEM_ERROR;
		goto cleanup;
	}
	if (fq_gzfile->data != NULL) {
		// the carry-over is raw bytes without a terminator, so copy by length
		memcpy(data, fq_gzfile->data, fq_gzfile->data_size);
		i = fq_gzfile->data_size;
	}

	do {
		fq_gzfile->strm.avail_in = fread(in, 1, CHUNK, fq_gzfile->fd);
		if (ferror(fq_gzfile->fd)) {
			(void)inflateEnd(&fq_gzfile->strm);
			result = (size_t) Z_ERRNO;
			goto cleanup;
		}
		if (fq_gzfile->strm.avail_in == 0)
			break;
		fq_gzfile->strm.next_in = in;

		/* run inflate() on input until output buffer not full */
		do {
			fq_gzfile->strm.avail_out = CHUNK;
			fq_gzfile->strm.next_out = out;
			fq_gzfile->ret = inflate(&fq_gzfile->strm, Z_NO_FLUSH);
			assert(fq_gzfile->ret != Z_STREAM_ERROR);  /* state not clobbered */
			switch (fq_gzfile->ret) {
			case Z_NEED_DICT:
				fq_gzfile->ret = Z_DATA_ERROR;     /* and fall through */
			case Z_DATA_ERROR:
			case Z_MEM_ERROR:
				(void)inflateEnd(&fq_gzfile->strm);
				result = (size_t) fq_gzfile->ret;
				goto cleanup;
			default:
				break;
			}
			have = CHUNK - fq_gzfile->strm.avail_out;
			// out[] holds unsigned bytes, so no value can compare equal to EOF;
			// every inflated byte is appended to data
			for (unsigned j = 0; j < have; j++) {
				c = out[j];
				max_data_len = consume_input(c, &data, max_data_len, i);
				if (c == '\n') {
					lines++;
				}
				i++;
			}
		} while (fq_gzfile->strm.avail_out == 0);

		/* done when inflate() says it's done */
	} while (lines < num_lines_to_read && fq_gzfile->ret != Z_STREAM_END);

	// split the buffered text into lines and build one read per four lines
	for (parsed_chars = 0; parsed_chars < (size_t) i && count < num_lines_to_read; parsed_chars++) {
		char *dst = NULL;

		if (data[parsed_chars] != '\n') {
			continue;
		}

		switch (count % 4) {
		case 0: dst = id;   break;
		case 1: dst = seq;  break;
		case 3: dst = qual; break;
		default:            break;	/* third line of a record is not stored */
		}
		if (dst != NULL) {
			// bounded copy: never write past the max_read_len-sized buffer
			size_t len = parsed_chars - line_start;
			if (len >= (size_t) max_read_len) {
				len = (size_t) max_read_len - 1;
			}
			memcpy(dst, data + line_start, len);
			dst[len] = '\0';
		}
		if (count % 4 == 3) {
			read = fastq_read_new(id, seq, qual);
			array_list_insert(read, reads);
		}
		count++;
		line_start = parsed_chars + 1;
	}

	// keep whatever was not parsed for the next call; always resize, because
	// data_size records the bytes in use and not the capacity of the buffer
	data_size = i - parsed_chars;
	saved = (char*) realloc(fq_gzfile->data, data_size > 0 ? data_size : 1);
	if (saved == NULL) {
		result = (size_t) Z_MEM_ERROR;
		goto cleanup;
	}
	fq_gzfile->data = saved;
	if (data_size > 0) {
		memcpy(fq_gzfile->data, data + parsed_chars, data_size);
	}
	fq_gzfile->data_size = data_size;

	result = reads->size;

cleanup:
	free(data);
	free(id);
	free(seq);
	free(qual);

	return result;
}
Exemple #5
0
/**
 * Reads single-end FASTQ records from a gzip-compressed file, by size.
 *
 * Inflated text left over from a previous call (fq_gzfile->data, of length
 * fq_gzfile->data_size) is prepended to the newly inflated text. Compressed
 * input is read in CHUNK-sized pieces until at least bytes_to_read bytes of
 * text are buffered or the stream ends. The text is then parsed in groups of
 * four lines (lines 0, 1 and 3 are id, sequence and quality); one read per
 * group is built with fastq_read_new() and inserted into reads. Parsing
 * stops after the first record that ends beyond bytes_to_read, or when the
 * buffered text runs out. Text after the last complete record, including a
 * partially received record, is stored back into fq_gzfile->data so the next
 * call resumes on a record boundary.
 *
 * @param reads         list receiving the new reads
 * @param bytes_to_read approximate amount of inflated text to consume
 * @param fq_gzfile     gzip FASTQ file state (fd, zlib stream, carry-over buffer)
 * @return number of bytes of inflated text consumed, i.e. the offset just
 *         past the last complete record parsed (0 if none was completed).
 *         On failure a zlib code is returned converted to size_t: Z_ERRNO on
 *         a file read error, Z_DATA_ERROR or Z_MEM_ERROR from inflate(),
 *         Z_MEM_ERROR when a local allocation fails.
 *         NOTE(review): zlib error codes are negative, so callers see them
 *         as very large size_t values — confirm how callers detect errors.
 */
size_t fastq_gzread_bytes_se(array_list_t *reads, size_t bytes_to_read, fastq_gzfile_t *fq_gzfile) {
  size_t count = 0;   /* complete lines parsed; count % 4 is the position inside a record */
  fastq_read_t *read;

  int max_data_len = CHUNK;
  int max_read_len = MAX_READ_SEQUENCE_LENGTH_GZ;   /* capacity of id/seq/qual; longer lines are truncated */
  int c = 0;
  int i = 0;          /* number of valid bytes in data */
  size_t parsed_chars;
  size_t line_start = 0;   /* offset in data of the line currently being scanned */
  size_t consumed = 0;     /* offset just past the last complete record */
  size_t data_size;
  size_t result;
  char *data = NULL;
  char *saved;
  char *id = (char*) calloc (max_read_len, sizeof(char));
  char *seq = (char*) calloc (max_read_len, sizeof(char));
  char *qual = (char*) calloc (max_read_len, sizeof(char));

  // ZLIB variables
  unsigned have;
  unsigned char in[CHUNK];
  unsigned char out[CHUNK];

  if (id == NULL || seq == NULL || qual == NULL) {
    result = (size_t) Z_MEM_ERROR;
    goto cleanup;
  }

  // If there is some data from before calls, make room for it plus one chunk
  if (fq_gzfile->data != NULL && fq_gzfile->data_size > max_data_len) {
    max_data_len = fq_gzfile->data_size + max_data_len;
  }
  data = (char*) calloc (max_data_len, sizeof(char));
  if (data == NULL) {
    result = (size_t) Z_MEM_ERROR;
    goto cleanup;
  }
  if (fq_gzfile->data != NULL) {
    // the carry-over is raw bytes without a terminator, so copy by length
    memcpy(data, fq_gzfile->data, fq_gzfile->data_size);
    i = fq_gzfile->data_size;
  }

  do {
    fq_gzfile->strm.avail_in = fread(in, 1, CHUNK, fq_gzfile->fd);
    if (ferror(fq_gzfile->fd)) {
      (void)inflateEnd(&fq_gzfile->strm);
      result = (size_t) Z_ERRNO;
      goto cleanup;
    }
    if (fq_gzfile->strm.avail_in == 0)
      break;
    fq_gzfile->strm.next_in = in;

    /* run inflate() on input until output buffer not full */
    do {
      fq_gzfile->strm.avail_out = CHUNK;
      fq_gzfile->strm.next_out = out;
      fq_gzfile->ret = inflate(&fq_gzfile->strm, Z_NO_FLUSH);
      assert(fq_gzfile->ret != Z_STREAM_ERROR);  /* state not clobbered */
      switch (fq_gzfile->ret) {
      case Z_NEED_DICT:
        fq_gzfile->ret = Z_DATA_ERROR;     /* and fall through */
      case Z_DATA_ERROR:
      case Z_MEM_ERROR:
        (void)inflateEnd(&fq_gzfile->strm);
        result = (size_t) fq_gzfile->ret;
        goto cleanup;
      default:
        break;
      }
      have = CHUNK - fq_gzfile->strm.avail_out;
      // out[] holds unsigned bytes, so no value can compare equal to EOF;
      // every inflated byte is appended to data
      for (unsigned j = 0; j < have; j++) {
        c = out[j];
        max_data_len = consume_input(c, &data, max_data_len, i);
        i++;
      }
    } while (fq_gzfile->strm.avail_out == 0);

    /* done when inflate() says it's done */
  } while ((size_t) i < bytes_to_read && fq_gzfile->ret != Z_STREAM_END);

  // split the buffered text into lines and build one read per four lines;
  // data itself is left untouched so an unfinished record can be carried over
  for (parsed_chars = 0; parsed_chars < (size_t) i; parsed_chars++) {
    char *dst = NULL;

    if (data[parsed_chars] != '\n') {
      continue;
    }

    switch (count % 4) {
    case 0: dst = id;   break;
    case 1: dst = seq;  break;
    case 3: dst = qual; break;
    default:            break;   /* third line of a record is not stored */
    }
    if (dst != NULL) {
      // bounded copy: never write past the max_read_len-sized buffer
      size_t len = parsed_chars - line_start;
      if (len >= (size_t) max_read_len) {
        len = (size_t) max_read_len - 1;
      }
      memcpy(dst, data + line_start, len);
      dst[len] = '\0';
    }
    if (count % 4 == 3) {
      read = fastq_read_new(id, seq, qual);
      array_list_insert(read, reads);
      consumed = parsed_chars + 1;
      // stop at the first record boundary past the requested size
      if (consumed > bytes_to_read) {
        break;
      }
    }
    count++;
    line_start = parsed_chars + 1;
  }

  // keep everything after the last complete record for the next call; always
  // resize, because data_size records the bytes in use and not the capacity
  data_size = i - consumed;
  saved = (char*) realloc(fq_gzfile->data, data_size > 0 ? data_size : 1);
  if (saved == NULL) {
    result = (size_t) Z_MEM_ERROR;
    goto cleanup;
  }
  fq_gzfile->data = saved;
  if (data_size > 0) {
    memcpy(fq_gzfile->data, data + consumed, data_size);
  }
  fq_gzfile->data_size = data_size;

  result = consumed;

cleanup:
  free(data);
  free(id);
  free(seq);
  free(qual);

  return result;
}