void *fastq_reader(void *input) { struct timeval start, end; double time; extern size_t fd_read_bytes; size_t read_bytes; //if (time_on) { start_timer(start); } wf_input_t *wf_input = (wf_input_t *) input; batch_t *new_batch = NULL; batch_t *batch = wf_input->batch; fastq_batch_reader_input_t *fq_reader_input = wf_input->fq_reader_input; array_list_t *reads = array_list_new(10000, 1.25f, COLLECTION_MODE_ASYNCHRONIZED); if (fq_reader_input->gzip) { //Gzip fastq file if (fq_reader_input->flags == SINGLE_END_MODE) { fastq_gzread_bytes_se(reads, fq_reader_input->batch_size, fq_reader_input->fq_gzip_file1); } else { //printf("Gzip Reader for pair-end not implemented\n");; fastq_gzread_bytes_pe(reads, fq_reader_input->batch_size, fq_reader_input->fq_gzip_file1, fq_reader_input->fq_gzip_file2); //fastq_fread_bytes_aligner_pe(reads, fq_reader_input->batch_size, // fq_reader_input->fq_gzip_file1, fq_reader_input->fq_gzip_file2); } } else { //Fastq file if (fq_reader_input->flags == SINGLE_END_MODE) { read_bytes = fastq_fread_bytes_se(reads, fq_reader_input->batch_size, fq_reader_input->fq_file1); } else { read_bytes = fastq_fread_bytes_aligner_pe(reads, fq_reader_input->batch_size, fq_reader_input->fq_file1, fq_reader_input->fq_file2); } fd_read_bytes += read_bytes; } size_t num_reads = array_list_size(reads); if (num_reads == 0) { array_list_free(reads, (void *)fastq_read_free); } else { mapping_batch_t *mapping_batch = mapping_batch_new(reads, batch->pair_input->pair_mng); new_batch = batch_new(batch->bwt_input, batch->region_input, batch->cal_input, batch->pair_input, batch->preprocess_rna, batch->sw_input, batch->writer_input, batch->mapping_mode, mapping_batch); } //if (time_on) { stop_timer(start, end, time); timing_add(time, FASTQ_READER, timing); } //printf("Read batch %i\n", num_reads); return new_batch; }
void *sa_fq_reader(void *input) { sa_wf_input_t *wf_input = (sa_wf_input_t *) input; sa_wf_batch_t *new_wf_batch = NULL; sa_wf_batch_t *curr_wf_batch = wf_input->wf_batch; fastq_batch_reader_input_t *fq_reader_input = wf_input->fq_reader_input; array_list_t *reads = array_list_new(fq_reader_input->batch_size, 1.25f, COLLECTION_MODE_ASYNCHRONIZED); if (fq_reader_input->gzip) { // Gzip fastq file if (fq_reader_input->flags == SINGLE_END_MODE) { fastq_gzread_bytes_se(reads, fq_reader_input->batch_size, fq_reader_input->fq_gzip_file1); } else { fastq_gzread_bytes_pe(reads, fq_reader_input->batch_size, fq_reader_input->fq_gzip_file1, fq_reader_input->fq_gzip_file2); } } else { // Fastq file if (fq_reader_input->flags == SINGLE_END_MODE) { fastq_fread_bytes_se(reads, fq_reader_input->batch_size, fq_reader_input->fq_file1); } else { fastq_fread_bytes_aligner_pe(reads, fq_reader_input->batch_size, fq_reader_input->fq_file1, fq_reader_input->fq_file2); } } size_t num_reads = array_list_size(reads); if (num_reads == 0) { array_list_free(reads, (void *)fastq_read_free); } else { sa_mapping_batch_t *sa_mapping_batch = sa_mapping_batch_new(reads); sa_mapping_batch->bam_format = wf_input->bam_format; new_wf_batch = sa_wf_batch_new(curr_wf_batch->options, curr_wf_batch->sa_index, curr_wf_batch->writer_input, sa_mapping_batch, NULL); } return new_wf_batch; }
int main (int argc, char *argv[]) { if(!strcmp("count-lines", argv[1])) { fastq_file_t *file = fastq_fopen(argv[2]); array_list_t *reads = array_list_new(2000000, 1.8, COLLECTION_MODE_SYNCHRONIZED); size_t nread = 1; int count = 0; while((nread = fastq_fread_se(reads, 100000, file)) != 0) { count += nread; for(int i=0; i<reads->size; i++) { fastq_read_print(array_list_get(i, reads)); } // printf("Size: %i, Capacity: %i\n", reads->size, reads->capacity); array_list_clear(reads, fastq_read_free); } // printf("Total num reads: %i\n", reads->size); // fastq_read_print(array_list_get(0, reads)); // fastq_read_print(array_list_get(reads->size-1, reads)); array_list_free(reads, fastq_read_free); fastq_fclose(file); } if(!strcmp("count-lines-gz", argv[1])) { fastq_gzfile_t *file = fastq_gzopen(argv[2]); // printf("=>%i\n", file->ret); array_list_t *reads = array_list_new(1000000, 1.8, COLLECTION_MODE_SYNCHRONIZED); size_t nread = 1; int count = 0; while((nread = fastq_gzread_se(reads, 100000, file)) != 0) { // nread = fastq_gzread_se(reads, 1000000, file); count += nread; // printf("Size: %i, Capacity: %i, count = %i, nread: %i\n", reads->size, reads->capacity, count, nread); for(int i=0; i<reads->size; i++) { fastq_read_print(array_list_get(i, reads)); } // fastq_read_print((fastq_read_t*)array_list_get(reads->size-1, reads)); array_list_clear(reads, fastq_read_free); } // printf("Total num reads: %i\n", count); // fastq_read_print(array_list_get(0, reads)); array_list_free(reads, fastq_read_free); fastq_gzclose(file); } if(!strcmp("count-bytes-gz", argv[1])) { fastq_gzfile_t *file = fastq_gzopen(argv[2]); // printf("=>%i\n", file->ret); array_list_t *reads = array_list_new(1000000, 1.8, COLLECTION_MODE_SYNCHRONIZED); size_t nread = 1; int count = 0; while((nread = fastq_gzread_bytes_se(reads, 10000000, file)) != 0) { // nread = fastq_gzread_bytes_se(reads, 100000, file); count += reads->size; // printf("Size: %i, Capacity: %i, count = %i, nread: %i\n", reads->size, 
reads->capacity, count, nread); for(int i=0; i<reads->size; i++) { fastq_read_print(array_list_get(i, reads)); } // fastq_read_print(array_list_get(reads->size-1, reads)); array_list_clear(reads, fastq_read_free); } // printf("Total num reads: %i\n", count); // fastq_read_print(array_list_get(0, reads)); array_list_free(reads, fastq_read_free); fastq_gzclose(file); } if(!strcmp("filter", argv[1])) { fastq_file_t *file = fastq_fopen(argv[2]); fastq_filter_options_t *fastq_filter_options = fastq_filter_options_new(50,150, 30, 80, 2, 100); array_list_t *reads = array_list_new(200000, 1.8, COLLECTION_MODE_SYNCHRONIZED); array_list_t *passed_reads ;//= array_list_new(200000, 1.8, COLLECTION_MODE_SYNCHRONIZED); array_list_t *failed_reads ;//= array_list_new(200000, 1.8, COLLECTION_MODE_SYNCHRONIZED); size_t nread = 1; int count = 0; while((nread = fastq_fread_se(reads, 1000000, file)) != 0) { count += reads->size; passed_reads = array_list_new(200000, 1.8, COLLECTION_MODE_SYNCHRONIZED); failed_reads = array_list_new(200000, 1.8, COLLECTION_MODE_SYNCHRONIZED); // for(int i=0; i<reads->size; i++) { // fastq_read_print(array_list_get(i, reads)); // } fastq_filter(reads, passed_reads, failed_reads, fastq_filter_options); fastq_read_print(array_list_get(0, passed_reads)); fastq_read_print(array_list_get(0, failed_reads)); printf("Total Reads: %lu, Passed Reads: %lu, Reads failed: %lu\n", reads->size, passed_reads->size, failed_reads->size); array_list_clear(reads, fastq_read_free); array_list_free(passed_reads, NULL); array_list_free(failed_reads, NULL); // fastq_read_print(array_list_get(0, passed_reads)); // fastq_read_print(array_list_get(0, failed_reads)); // printf("Total Reads: %lu, Passed Reads: %lu, Reads filter: %lu\n", reads->size, passed_reads->size, failed_reads->size); } // fastq_read_print(array_list_get(0, passed_reads)); // fastq_read_print(array_list_get(0, failed_reads)); // printf("Total Reads: %lu, Passed Reads: %lu, Reads filter: %lu\n", reads->size, 
passed_reads->size, failed_reads->size); fastq_filter_options_free(fastq_filter_options); array_list_free(reads, NULL); // array_list_free(passed_reads, fastq_read_free); // array_list_free(failed_reads, fastq_read_free); fastq_fclose(file); } return 0; }
/**
 * Reads paired-end reads from two gzip-compressed FASTQ files and appends
 * them to `reads` interleaved: read i of file 1 followed by read i of file 2.
 *
 * File 1 is read by byte budget (half of `bytes_to_read`) with
 * fastq_gzread_bytes_se(); file 2 is then read by read count with
 * fastq_gzread_se(), asking for as many reads as file 1 delivered. The work
 * is delegated to the single-end readers instead of inflating both streams
 * here.
 *
 * @param reads          destination list; receives the interleaved reads
 * @param bytes_to_read  byte budget for the whole pair chunk
 * @param fq_gzfile1     gzip FASTQ handle with the first mates
 * @param fq_gzfile2     gzip FASTQ handle with the second mates
 * @return  the value returned by fastq_gzread_bytes_se() for file 1 plus, for
 *          every second mate added, `length * 2 + strlen(id) - 1`
 */
size_t fastq_gzread_bytes_pe(array_list_t *reads, size_t bytes_to_read, fastq_gzfile_t *fq_gzfile1, fastq_gzfile_t *fq_gzfile2) {
  /* Staging lists: one per input file, merged into `reads` below. */
  array_list_t *list1 = array_list_new(1000, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
  array_list_t *list2 = array_list_new(1000, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);

  size_t bytes_processed = fastq_gzread_bytes_se(list1, bytes_to_read / 2, fq_gzfile1);
  size_t num_reads = array_list_size(list1);

  fastq_gzread_se(list2, num_reads, fq_gzfile2);

  /* Only build complete pairs. If file 2 supplied fewer reads than file 1
     (e.g. a truncated second file) the mate lookup would otherwise run past
     the end of list2 and the result would be dereferenced.
     NOTE(review): first mates left without a partner are not added to
     `reads` and are not released here - confirm the desired policy for
     unbalanced inputs. */
  size_t num_pairs = array_list_size(list2);
  if (num_pairs > num_reads) {
    num_pairs = num_reads;
  }

  for (size_t i = 0; i < num_pairs; i++) {
    fastq_read_t *fq_read = array_list_get(i, list2);

    array_list_insert(array_list_get(i, list1), reads);
    array_list_insert(fq_read, reads);

    bytes_processed += fq_read->length * 2 + strlen(fq_read->id) - 1;
  }

  /* The staging lists were previously never released, leaking two lists per
     call. They are freed without an item callback because the read objects
     are now referenced from `reads`. */
  array_list_free(list1, NULL);
  array_list_free(list2, NULL);

  return bytes_processed;
}