Exemplo n.º 1
0
int main (int argc, char *argv[]) {

	if(!strcmp("count-lines", argv[1])) {
		fastq_file_t *file = fastq_fopen(argv[2]);
		array_list_t *reads = array_list_new(2000000, 1.8, COLLECTION_MODE_SYNCHRONIZED);
		size_t nread = 1;
		int count = 0;
		while((nread = fastq_fread_se(reads, 100000, file)) != 0) {
			count += nread;
			for(int i=0; i<reads->size; i++) {
				fastq_read_print(array_list_get(i, reads));
			}
			//			printf("Size: %i, Capacity: %i\n", reads->size, reads->capacity);
			array_list_clear(reads, fastq_read_free);
		}
		//		printf("Total num reads: %i\n", reads->size);
		//		fastq_read_print(array_list_get(0, reads));
		//		fastq_read_print(array_list_get(reads->size-1, reads));
		array_list_free(reads, fastq_read_free);
		fastq_fclose(file);
	}

	if(!strcmp("count-lines-gz", argv[1])) {
		fastq_gzfile_t *file = fastq_gzopen(argv[2]);
		//		printf("=>%i\n", file->ret);
		array_list_t *reads = array_list_new(1000000, 1.8, COLLECTION_MODE_SYNCHRONIZED);
		size_t nread = 1;
		int count = 0;
		while((nread = fastq_gzread_se(reads, 100000, file)) != 0) {
			//			nread = fastq_gzread_se(reads, 1000000, file);
			count += nread;
			//			printf("Size: %i, Capacity: %i, count = %i, nread: %i\n", reads->size, reads->capacity, count, nread);
			for(int i=0; i<reads->size; i++) {
				fastq_read_print(array_list_get(i, reads));
			}
			//			fastq_read_print((fastq_read_t*)array_list_get(reads->size-1, reads));
			array_list_clear(reads, fastq_read_free);
		}
		//		printf("Total num reads: %i\n", count);
		//		fastq_read_print(array_list_get(0, reads));
		array_list_free(reads, fastq_read_free);
		fastq_gzclose(file);
	}

	if(!strcmp("count-bytes-gz", argv[1])) {
		fastq_gzfile_t *file = fastq_gzopen(argv[2]);
		//			printf("=>%i\n", file->ret);
		array_list_t *reads = array_list_new(1000000, 1.8, COLLECTION_MODE_SYNCHRONIZED);
		size_t nread = 1;
		int count = 0;
		while((nread = fastq_gzread_bytes_se(reads, 10000000, file)) != 0) {
			//				nread = fastq_gzread_bytes_se(reads, 100000, file);
			count += reads->size;
			//				printf("Size: %i, Capacity: %i, count = %i, nread: %i\n", reads->size, reads->capacity, count, nread);
			for(int i=0; i<reads->size; i++) {
				fastq_read_print(array_list_get(i, reads));
			}
			//				fastq_read_print(array_list_get(reads->size-1, reads));
			array_list_clear(reads, fastq_read_free);
		}
		//			printf("Total num reads: %i\n", count);
		//		fastq_read_print(array_list_get(0, reads));
		array_list_free(reads, fastq_read_free);
		fastq_gzclose(file);
	}

	if(!strcmp("filter", argv[1])) {
		fastq_file_t *file = fastq_fopen(argv[2]);
		fastq_filter_options_t *fastq_filter_options = fastq_filter_options_new(50,150, 30, 80, 2, 100);
		array_list_t *reads = array_list_new(200000, 1.8, COLLECTION_MODE_SYNCHRONIZED);
		array_list_t *passed_reads ;//= array_list_new(200000, 1.8, COLLECTION_MODE_SYNCHRONIZED);
		array_list_t *failed_reads ;//= array_list_new(200000, 1.8, COLLECTION_MODE_SYNCHRONIZED);
		size_t nread = 1;
		int count = 0;
		while((nread = fastq_fread_se(reads, 1000000, file)) != 0) {
			count += reads->size;
			passed_reads = array_list_new(200000, 1.8, COLLECTION_MODE_SYNCHRONIZED);
			failed_reads = array_list_new(200000, 1.8, COLLECTION_MODE_SYNCHRONIZED);
			//			for(int i=0; i<reads->size; i++) {
			//				fastq_read_print(array_list_get(i, reads));
			//			}
			fastq_filter(reads, passed_reads, failed_reads, fastq_filter_options);
			fastq_read_print(array_list_get(0, passed_reads));
			fastq_read_print(array_list_get(0, failed_reads));
			printf("Total Reads: %lu, Passed Reads: %lu, Reads failed: %lu\n", reads->size, passed_reads->size, failed_reads->size);
			array_list_clear(reads, fastq_read_free);
			array_list_free(passed_reads, NULL);
			array_list_free(failed_reads, NULL);
			//			fastq_read_print(array_list_get(0, passed_reads));
			//			fastq_read_print(array_list_get(0, failed_reads));
			//			printf("Total Reads: %lu, Passed Reads: %lu, Reads filter: %lu\n", reads->size, passed_reads->size, failed_reads->size);
		}
		//		fastq_read_print(array_list_get(0, passed_reads));
		//		fastq_read_print(array_list_get(0, failed_reads));
//		printf("Total Reads: %lu, Passed Reads: %lu, Reads filter: %lu\n", reads->size, passed_reads->size, failed_reads->size);

		fastq_filter_options_free(fastq_filter_options);
		array_list_free(reads, NULL);
		//		array_list_free(passed_reads, fastq_read_free);
		//		array_list_free(failed_reads, fastq_read_free);
		fastq_fclose(file);
	}

	return 0;
}
Exemplo n.º 2
0
/**
 * Reads paired-end reads from two gzip FASTQ files and appends them to
 * `reads` interleaved: read i of the first file followed by read i of the
 * second file.
 *
 * The first file is read with fastq_gzread_bytes_se using half of
 * `bytes_to_read`; the second file is then asked (fastq_gzread_se) for as
 * many reads as the first call produced.
 *
 * @param reads          destination list; receives the read pointers
 * @param bytes_to_read  byte budget for the pair of files (half is passed to
 *                       the reader of the first file)
 * @param fq_gzfile1     first gzip FASTQ file of the pair
 * @param fq_gzfile2     second gzip FASTQ file of the pair
 * @return the value returned by fastq_gzread_bytes_se for the first file plus,
 *         for every read taken from the second file,
 *         `length * 2 + strlen(id) - 1`
 */
size_t fastq_gzread_bytes_pe(array_list_t *reads, size_t bytes_to_read, fastq_gzfile_t *fq_gzfile1, fastq_gzfile_t *fq_gzfile2) {
  array_list_t *list1 = array_list_new(1000, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
  array_list_t *list2 = array_list_new(1000, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);

  size_t bytes_processed = fastq_gzread_bytes_se(list1, bytes_to_read / 2, fq_gzfile1);
  size_t num_reads = array_list_size(list1);
  fastq_gzread_se(list2, num_reads, fq_gzfile2);

  // NOTE(review): if the second file yields fewer reads than the first,
  // array_list_get(i, list2) has no element for the trailing indices and
  // fq_read is dereferenced regardless - confirm how mismatched inputs
  // should be handled.
  for (size_t i = 0; i < num_reads; i++) {
    array_list_insert(array_list_get(i, list1), reads);
    fastq_read_t *fq_read = array_list_get(i, list2);
    array_list_insert(fq_read, reads);

    bytes_processed += fq_read->length * 2 + strlen(fq_read->id) - 1;
  }

  // The temporary lists were never released, leaking two lists per call.
  // A NULL item callback is passed so that the reads, which are now
  // referenced from `reads`, are presumably left untouched - confirm against
  // array_list_free.
  array_list_free(list1, NULL);
  array_list_free(list2, NULL);

  return bytes_processed;
}