/**
 * Reads paired-end FASTQ records from two files in lockstep.
 *
 * Each iteration reads one four-line record (header, sequence, separator,
 * qualities) from fq_file1 and one from fq_file2, builds a read from each
 * with fastq_read_new() and appends both to `reads` (mate 1 first, then
 * mate 2). The loop ends once the accumulated size reaches `bytes`, or as
 * soon as either file cannot supply a complete four-line record; an
 * incomplete trailing record is discarded instead of being turned into a
 * read made of stale buffer contents.
 *
 * @param reads     list that receives the newly created reads
 * @param bytes     byte budget, checked before each pair is read (the last
 *                  pair may therefore overshoot it)
 * @param fq_file1  FASTQ file holding the first mates
 * @param fq_file2  FASTQ file holding the second mates
 * @return the summed raw lengths (line terminator included) of the header,
 *         sequence and quality lines of every pair that was appended
 */
size_t fastq_fread_bytes_aligner_pe(array_list_t *reads, size_t bytes, fastq_file_t *fq_file1, fastq_file_t *fq_file2) {
    size_t accumulated_size = 0;

    char header1[MAX_READ_ID_LENGTH];
    char header2[MAX_READ_ID_LENGTH];
    char read_separator[MAX_READ_ID_LENGTH];
    char sequence1[MAX_READ_SEQUENCE_LENGTH];
    char sequence2[MAX_READ_SEQUENCE_LENGTH];
    char qualities1[MAX_READ_SEQUENCE_LENGTH];
    char qualities2[MAX_READ_SEQUENCE_LENGTH];

    int header_length1, sequence_length1, quality_length1;
    int header_length2, sequence_length2, quality_length2;
    fastq_read_t *read1, *read2;

    while (accumulated_size < bytes) {
        // first file: every line of the record must be present
        if (fgets(header1, MAX_READ_ID_LENGTH, fq_file1->fd) == NULL
            || fgets(sequence1, MAX_READ_SEQUENCE_LENGTH, fq_file1->fd) == NULL
            || fgets(read_separator, MAX_READ_ID_LENGTH, fq_file1->fd) == NULL
            || fgets(qualities1, MAX_READ_SEQUENCE_LENGTH, fq_file1->fd) == NULL) {
            break;
        }

        // second file: a missing mate ends the read loop as well
        if (fgets(header2, MAX_READ_ID_LENGTH, fq_file2->fd) == NULL
            || fgets(sequence2, MAX_READ_SEQUENCE_LENGTH, fq_file2->fd) == NULL
            || fgets(read_separator, MAX_READ_ID_LENGTH, fq_file2->fd) == NULL
            || fgets(qualities2, MAX_READ_SEQUENCE_LENGTH, fq_file2->fd) == NULL) {
            break;
        }

        header_length1 = (int) strlen(header1);
        sequence_length1 = (int) strlen(sequence1);
        quality_length1 = (int) strlen(qualities1);
        header_length2 = (int) strlen(header2);
        sequence_length2 = (int) strlen(sequence2);
        quality_length2 = (int) strlen(qualities2);

        // Strip the trailing '\n' only when it is really there: the last
        // line of a file may lack it, and chopping blindly would eat a
        // real character. chomp_at() is presumably "terminate the string
        // at this index" -- confirm against its definition.
        if (header_length1 > 0 && header1[header_length1 - 1] == '\n') {
            chomp_at(header1, header_length1 - 1);
        }
        if (sequence_length1 > 0 && sequence1[sequence_length1 - 1] == '\n') {
            chomp_at(sequence1, sequence_length1 - 1);
        }
        if (quality_length1 > 0 && qualities1[quality_length1 - 1] == '\n') {
            chomp_at(qualities1, quality_length1 - 1);
        }
        if (header_length2 > 0 && header2[header_length2 - 1] == '\n') {
            chomp_at(header2, header_length2 - 1);
        }
        if (sequence_length2 > 0 && sequence2[sequence_length2 - 1] == '\n') {
            chomp_at(sequence2, sequence_length2 - 1);
        }
        if (quality_length2 > 0 && qualities2[quality_length2 - 1] == '\n') {
            chomp_at(qualities2, quality_length2 - 1);
        }

        read1 = fastq_read_new(header1, sequence1, qualities1);
        read2 = fastq_read_new(header2, sequence2, qualities2);
        array_list_insert(read1, reads);
        array_list_insert(read2, reads);

        // sizes are the raw line lengths, i.e. measured before chomping
        accumulated_size += header_length1 + sequence_length1 + quality_length1
                          + header_length2 + sequence_length2 + quality_length2;
    }

    return accumulated_size;
}
/**
 * Reads single-end FASTQ records until a byte budget is reached.
 *
 * Each iteration reads one four-line record (header, sequence, separator,
 * qualities), builds a read with fastq_read_new() and appends it to
 * `reads`. The loop ends once the accumulated size reaches `bytes` or the
 * file cannot supply a complete four-line record; an incomplete trailing
 * record is discarded instead of being turned into a read made of stale
 * buffer contents.
 *
 * @param reads    list that receives the newly created reads
 * @param bytes    byte budget, checked before each record is read (the last
 *                 record may therefore overshoot it)
 * @param fq_file  FASTQ file to read from
 * @return the summed raw lengths (line terminator included) of the header,
 *         sequence and quality lines of every record that was appended
 */
size_t fastq_fread_bytes_se(array_list_t *reads, size_t bytes, fastq_file_t *fq_file) {
    size_t accumulated_size = 0;

    char header1[MAX_READ_ID_LENGTH];
    char sequence[MAX_READ_SEQUENCE_LENGTH];
    char header2[MAX_READ_ID_LENGTH];      // separator ('+') line, read and ignored
    char qualities[MAX_READ_SEQUENCE_LENGTH];

    int header_length, sequence_length, quality_length;
    fastq_read_t *read;

    while (accumulated_size < bytes) {
        // every line of the record must be present
        if (fgets(header1, MAX_READ_ID_LENGTH, fq_file->fd) == NULL
            || fgets(sequence, MAX_READ_SEQUENCE_LENGTH, fq_file->fd) == NULL
            || fgets(header2, MAX_READ_ID_LENGTH, fq_file->fd) == NULL
            || fgets(qualities, MAX_READ_SEQUENCE_LENGTH, fq_file->fd) == NULL) {
            break;
        }

        header_length = (int) strlen(header1);
        sequence_length = (int) strlen(sequence);
        quality_length = (int) strlen(qualities);

        // Strip the trailing '\n' only when it is really there: the last
        // line of a file may lack it, and chopping blindly would eat a
        // real character. chomp_at() is presumably "terminate the string
        // at this index" -- confirm against its definition.
        if (header_length > 0 && header1[header_length - 1] == '\n') {
            chomp_at(header1, header_length - 1);
        }
        if (sequence_length > 0 && sequence[sequence_length - 1] == '\n') {
            chomp_at(sequence, sequence_length - 1);
        }
        if (quality_length > 0 && qualities[quality_length - 1] == '\n') {
            chomp_at(qualities, quality_length - 1);
        }

        read = fastq_read_new(header1, sequence, qualities);
        array_list_insert(read, reads);

        // sizes are the raw line lengths, i.e. measured before chomping
        accumulated_size += header_length + sequence_length + quality_length;
    }

    return accumulated_size;
}
/**
 * Reads up to `num_reads` single-end FASTQ records, keeping only the short
 * form of each read identifier.
 *
 * For every four-line record the header is cut at its first space and its
 * first character (the FASTQ '@' marker in a well-formed file) is skipped
 * before the read is created with fastq_read_new() and appended to `reads`.
 * Reading stops early when the file cannot supply a complete four-line
 * record; an incomplete trailing record is discarded instead of being
 * turned into a read made of stale buffer contents.
 *
 * @param reads      list that receives the newly created reads
 * @param num_reads  maximum number of records to read
 * @param fq_file    FASTQ file to read from
 * @return the number of reads appended to `reads`
 */
size_t fastq_fread_se_ex(array_list_t *reads, size_t num_reads, fastq_file_t *fq_file) {
    size_t count = 0;
    char *p;

    char header1[MAX_READ_ID_LENGTH];
    char sequence[MAX_READ_SEQUENCE_LENGTH];
    char header2[MAX_READ_ID_LENGTH];      // separator ('+') line, read and ignored
    char qualities[MAX_READ_SEQUENCE_LENGTH];

    int header_length, sequence_length, quality_length;
    fastq_read_t *read;

    while (count < num_reads) {
        // every line of the record must be present
        if (fgets(header1, MAX_READ_ID_LENGTH, fq_file->fd) == NULL
            || fgets(sequence, MAX_READ_SEQUENCE_LENGTH, fq_file->fd) == NULL
            || fgets(header2, MAX_READ_ID_LENGTH, fq_file->fd) == NULL
            || fgets(qualities, MAX_READ_SEQUENCE_LENGTH, fq_file->fd) == NULL) {
            break;
        }

        header_length = (int) strlen(header1);
        sequence_length = (int) strlen(sequence);
        quality_length = (int) strlen(qualities);

        // Strip the trailing '\n' only when it is really there: the last
        // line of a file may lack it, and chopping blindly would eat a
        // real character. chomp_at() is presumably "terminate the string
        // at this index" -- confirm against its definition.
        if (header_length > 0 && header1[header_length - 1] == '\n') {
            chomp_at(header1, header_length - 1);
        }
        // keep only the identifier part that precedes the first space
        if ((p = strchr(header1, ' ')) != NULL) {
            *p = '\0';
        }
        if (sequence_length > 0 && sequence[sequence_length - 1] == '\n') {
            chomp_at(sequence, sequence_length - 1);
        }
        if (quality_length > 0 && qualities[quality_length - 1] == '\n') {
            chomp_at(qualities, quality_length - 1);
        }

        // &header1[1] drops the first header character
        read = fastq_read_new(&header1[1], sequence, qualities);
        array_list_insert(read, reads);

        count++;
    }

    return count;
}
/**
 * Reads up to `num_reads` single-end FASTQ records from a zlib-compressed
 * file.
 *
 * The function works in three steps:
 *   1. bytes left over from the previous call (fq_gzfile->data) are copied
 *      to the front of a working buffer;
 *   2. compressed input is read and inflated, CHUNK bytes at a time, until
 *      at least 4 * num_reads newlines have been inflated in this call, the
 *      input is exhausted, or inflate() reports Z_STREAM_END;
 *   3. the buffer is split on '\n'; every group of four lines yields one
 *      read (line 1 = id, line 2 = sequence, line 3 ignored, line 4 =
 *      qualities) that is appended to `reads`. Parsing stops after
 *      4 * num_reads lines and whatever follows is stored back in
 *      fq_gzfile->data / fq_gzfile->data_size for the next call.
 *
 * The line pointers handed to fastq_read_new() point into the working
 * buffer, which is released before returning -- exactly like the temporary
 * copies used previously -- so fastq_read_new() is expected to copy its
 * arguments.
 *
 * @param reads      list that receives the newly created reads
 * @param num_reads  maximum number of records to parse in this call
 * @param fq_gzfile  compressed file state (FILE handle, z_stream, carry-over
 *                   buffer)
 * @return reads->size, i.e. the total number of items in `reads` after the
 *         call (not only the ones added by it). On failure a zlib status
 *         code (Z_ERRNO, Z_DATA_ERROR, Z_MEM_ERROR) is returned converted
 *         to size_t.
 */
size_t fastq_gzread_se(array_list_t *reads, size_t num_reads, fastq_gzfile_t *fq_gzfile) {
    fastq_read_t *read;
    size_t num_lines_to_read = 4 * num_reads;  /* each read consists of 4 lines */
    int max_data_len = CHUNK;                  /* current capacity of data */
    int i = 0;                                 /* number of valid bytes in data */
    size_t lines = 0;                          /* newlines inflated by this call */
    char *data;

    // ZLIB variables
    unsigned have;
    unsigned char in[CHUNK];
    unsigned char out[CHUNK];

    // Make room for the bytes carried over from the previous call
    if (fq_gzfile->data != NULL && fq_gzfile->data_size > max_data_len) {
        max_data_len = fq_gzfile->data_size + max_data_len;
    }
    data = calloc(max_data_len, sizeof(char));
    if (data == NULL) {
        return Z_MEM_ERROR;
    }
    if (fq_gzfile->data != NULL) {
        // raw byte copy: the leftover is not a NUL-terminated string
        memcpy(data, fq_gzfile->data, fq_gzfile->data_size);
        i = fq_gzfile->data_size;
    }

    do {
        fq_gzfile->strm.avail_in = fread(in, 1, CHUNK, fq_gzfile->fd);
        if (ferror(fq_gzfile->fd)) {
            (void)inflateEnd(&fq_gzfile->strm);
            free(data);
            return Z_ERRNO;
        }
        if (fq_gzfile->strm.avail_in == 0) {
            break;
        }
        fq_gzfile->strm.next_in = in;

        /* run inflate() on input until output buffer not full */
        do {
            fq_gzfile->strm.avail_out = CHUNK;
            fq_gzfile->strm.next_out = out;
            fq_gzfile->ret = inflate(&fq_gzfile->strm, Z_NO_FLUSH);
            assert(fq_gzfile->ret != Z_STREAM_ERROR);  /* state not clobbered */
            switch (fq_gzfile->ret) {
            case Z_NEED_DICT:
                fq_gzfile->ret = Z_DATA_ERROR;
                /* fall through */
            case Z_DATA_ERROR:
            case Z_MEM_ERROR:
                (void)inflateEnd(&fq_gzfile->strm);
                free(data);
                return fq_gzfile->ret;
            default:
                break;
            }

            have = CHUNK - fq_gzfile->strm.avail_out;
            for (unsigned j = 0; j < have; j++) {
                int c = out[j];
                // consume_input() presumably stores c at data[i], growing
                // the buffer when needed, and returns the (possibly new)
                // capacity -- confirm against its definition.
                max_data_len = consume_input(c, &data, max_data_len, i);
                if (c == '\n') {
                    lines++;
                }
                i++;
            }
        } while (fq_gzfile->strm.avail_out == 0);

        /* done when inflate() says it's done */
    } while (lines < num_lines_to_read && fq_gzfile->ret != Z_STREAM_END);

    // Split the buffer into lines; every fourth line completes a read
    size_t parsed_chars;
    size_t parsed_lines = 0;
    char *line = data;   /* start of the line currently being scanned */
    char *id = data;     /* first line of the current record */
    char *seq = data;    /* second line of the current record */

    for (parsed_chars = 0; parsed_chars < (size_t) i && parsed_lines < num_lines_to_read; parsed_chars++) {
        if (data[parsed_chars] != '\n') {
            continue;
        }
        data[parsed_chars] = '\0';

        switch (parsed_lines % 4) {
        case 0:
            id = line;
            break;
        case 1:
            seq = line;
            break;
        case 2:
            /* separator line, ignored */
            break;
        default:
            read = fastq_read_new(id, seq, line);
            array_list_insert(read, reads);
            break;
        }

        line = data + parsed_chars + 1;
        parsed_lines++;
    }

    // Keep the unparsed tail for the next call. The carry-over buffer is
    // always resized to the exact size needed, so it can never be smaller
    // than the data copied into it.
    size_t data_size = (size_t) i - parsed_chars;
    if (data_size > 0) {
        char *saved = realloc(fq_gzfile->data, data_size);
        if (saved == NULL) {
            fq_gzfile->data_size = 0;
            free(data);
            return Z_MEM_ERROR;
        }
        memcpy(saved, data + parsed_chars, data_size);
        fq_gzfile->data = saved;
    }
    fq_gzfile->data_size = data_size;

    free(data);

    return reads->size;
}
/**
 * Reads single-end FASTQ records from a zlib-compressed file until roughly
 * `bytes_to_read` bytes of uncompressed data have been consumed.
 *
 * The function works in three steps:
 *   1. bytes left over from the previous call (fq_gzfile->data) are copied
 *      to the front of a working buffer;
 *   2. compressed input is read and inflated, CHUNK bytes at a time, until
 *      the buffer holds at least `bytes_to_read` bytes, the input is
 *      exhausted, or inflate() reports Z_STREAM_END;
 *   3. the buffer is split on '\n'; every group of four lines yields one
 *      read (line 1 = id, line 2 = sequence, line 3 ignored, line 4 =
 *      qualities) that is appended to `reads`. Parsing stops at the first
 *      record that ends beyond `bytes_to_read`, or when the buffer holds no
 *      further complete record.
 *
 * Everything after the last complete record -- including a record that is
 * only partially present in the buffer -- is stored back untouched in
 * fq_gzfile->data / fq_gzfile->data_size, so the next call resumes on a
 * record boundary. Lines are NUL-terminated only once their record is
 * complete, which keeps the carried-over bytes intact.
 *
 * The line pointers handed to fastq_read_new() point into the working
 * buffer, which is released before returning -- exactly like the temporary
 * copies used previously -- so fastq_read_new() is expected to copy its
 * arguments.
 *
 * @param reads          list that receives the newly created reads
 * @param bytes_to_read  target amount of uncompressed data to consume
 * @param fq_gzfile      compressed file state (FILE handle, z_stream,
 *                       carry-over buffer)
 * @return the number of buffered bytes consumed as complete records
 *         (carried-over bytes included). On failure a zlib status code
 *         (Z_ERRNO, Z_DATA_ERROR, Z_MEM_ERROR) is returned converted to
 *         size_t.
 */
size_t fastq_gzread_bytes_se(array_list_t *reads, size_t bytes_to_read, fastq_gzfile_t *fq_gzfile) {
    fastq_read_t *read;
    int max_data_len = CHUNK;   /* current capacity of data */
    int i = 0;                  /* number of valid bytes in data */
    char *data;

    // ZLIB variables
    unsigned have;
    unsigned char in[CHUNK];
    unsigned char out[CHUNK];

    // Make room for the bytes carried over from the previous call
    if (fq_gzfile->data != NULL && fq_gzfile->data_size > max_data_len) {
        max_data_len = fq_gzfile->data_size + max_data_len;
    }
    data = calloc(max_data_len, sizeof(char));
    if (data == NULL) {
        return Z_MEM_ERROR;
    }
    if (fq_gzfile->data != NULL) {
        // raw byte copy: the leftover is not a NUL-terminated string
        memcpy(data, fq_gzfile->data, fq_gzfile->data_size);
        i = fq_gzfile->data_size;
    }

    do {
        fq_gzfile->strm.avail_in = fread(in, 1, CHUNK, fq_gzfile->fd);
        if (ferror(fq_gzfile->fd)) {
            (void)inflateEnd(&fq_gzfile->strm);
            free(data);
            return Z_ERRNO;
        }
        if (fq_gzfile->strm.avail_in == 0) {
            break;
        }
        fq_gzfile->strm.next_in = in;

        /* run inflate() on input until output buffer not full */
        do {
            fq_gzfile->strm.avail_out = CHUNK;
            fq_gzfile->strm.next_out = out;
            fq_gzfile->ret = inflate(&fq_gzfile->strm, Z_NO_FLUSH);
            assert(fq_gzfile->ret != Z_STREAM_ERROR);  /* state not clobbered */
            switch (fq_gzfile->ret) {
            case Z_NEED_DICT:
                fq_gzfile->ret = Z_DATA_ERROR;
                /* fall through */
            case Z_DATA_ERROR:
            case Z_MEM_ERROR:
                (void)inflateEnd(&fq_gzfile->strm);
                free(data);
                return fq_gzfile->ret;
            default:
                break;
            }

            have = CHUNK - fq_gzfile->strm.avail_out;
            for (unsigned j = 0; j < have; j++) {
                // consume_input() presumably stores the byte at data[i],
                // growing the buffer when needed, and returns the (possibly
                // new) capacity -- confirm against its definition.
                max_data_len = consume_input(out[j], &data, max_data_len, i);
                i++;
            }
        } while (fq_gzfile->strm.avail_out == 0);

        /* done when inflate() says it's done */
    } while ((size_t) i < bytes_to_read && fq_gzfile->ret != Z_STREAM_END);

    // Split the buffer into records of four lines each
    size_t field_start[4] = {0, 0, 0, 0};  /* offset of each line of the record */
    size_t field_end[4] = {0, 0, 0, 0};    /* offset of each line's '\n' */
    int field = 0;                         /* lines seen in the current record */
    size_t line_start = 0;
    size_t parsed_chars = 0;               /* offset just past the last complete record */

    for (size_t pos = 0; pos < (size_t) i; pos++) {
        if (data[pos] != '\n') {
            continue;
        }
        field_start[field] = line_start;
        field_end[field] = pos;
        line_start = pos + 1;
        field++;
        if (field < 4) {
            continue;
        }

        // Record complete: terminate its id, sequence and quality lines
        field = 0;
        data[field_end[0]] = '\0';
        data[field_end[1]] = '\0';
        data[field_end[3]] = '\0';
        read = fastq_read_new(data + field_start[0], data + field_start[1], data + field_start[3]);
        array_list_insert(read, reads);

        parsed_chars = pos + 1;
        if (parsed_chars > bytes_to_read) {
            break;
        }
    }

    // Keep the unparsed tail for the next call. The carry-over buffer is
    // always resized to the exact size needed, so it can never be smaller
    // than the data copied into it.
    size_t data_size = (size_t) i - parsed_chars;
    if (data_size > 0) {
        char *saved = realloc(fq_gzfile->data, data_size);
        if (saved == NULL) {
            fq_gzfile->data_size = 0;
            free(data);
            return Z_MEM_ERROR;
        }
        memcpy(saved, data + parsed_chars, data_size);
        fq_gzfile->data = saved;
    }
    fq_gzfile->data_size = data_size;

    free(data);

    return parsed_chars;
}