/*
 * FASTQ variant of the file-reader callback.
 *
 * Sets *full_entry to true unconditionally, requires new_entry to be true
 * (any other value is reported through die()), and then reads one record
 * from fp into seq via read_sequence_from_fastq(), using max_read_length
 * and fastq_ascii_offset.
 *
 * Returns the value produced by read_sequence_from_fastq().
 */
int file_reader_fastq(FILE * fp, Sequence * seq, int max_read_length, boolean new_entry, boolean * full_entry){
    int entry_length;

    *full_entry = true;

    /* This reader has no way to continue a partially read entry. */
    if (true != new_entry) {
        die("new_entry has to be true for fastq");
    }

    entry_length = read_sequence_from_fastq(fp, seq, max_read_length, fastq_ascii_offset);
    return entry_length;
}
示例#2
0
/*
 * Demultiplexer entry point.
 *
 * Reads FASTQ records from cmd.input_reads (or from stdin when no input file
 * is given) and appends every record to a file whose name is
 * cmd.output_folder followed by the index string exposed by the sequence
 * header.
 *
 * Returns 0 once the input is exhausted. Exits with -1 when the input file
 * cannot be opened or the output-name buffer cannot be allocated.
 */
int main( int argc, char ** argv){
    log_and_screen_printf("Demultiplexer\n\n");
    /* Never pass the version macros as the format string itself: a stray '%'
     * in them would be interpreted as a conversion specification. */
    log_and_screen_printf("%s", SVN_VERSION);
    log_and_screen_printf("%s", SVN_COMMIT_DATE);
    log_and_screen_printf("Compiled on %s at %s \n\n", __DATE__, __TIME__);

    /* argc < 2 also covers argc == 0, so a single check is enough. */
    if (argc < 2) {
        print_help();
    }
    log_write_timestamp(1);

    DemultiplexerCmdLine cmd = parse_args(argc, argv);

    FILE * in = stdin;
    if (cmd.input_reads != NULL) {
        in = fopen(cmd.input_reads, "r");
        if (in == NULL) {
            log_and_screen_printf("Unable to open file %s\n", cmd.input_reads);
            exit(-1);
        }
    }

    /* NOTE(review): 33 is presumably the quality ASCII offset - confirm
     * against sequence_new(). */
    Sequence * seq = sequence_new(cmd.max_read_length, cmd.max_name_length, 33);
    seq->header = new_sequence_header(CASAVA_1_8);
    header_function * f = (header_function *) seq->header;

    /* The index pointer is fetched once, before any record is read; the loop
     * below relies on it reflecting the record that was just read.
     * NOTE(review): confirm get_index() returns a stable internal buffer. */
    char * read_index = f->get_index(seq);

    /* Output name = folder prefix + index (at most MAX_FIELD_SIZE chars) + NUL.
     * The buffer holds chars, so it is sized with the element size of the
     * buffer itself rather than the size of a pointer. */
    size_t prefix_length = strlen(cmd.output_folder);
    size_t index_capacity = MAX_FIELD_SIZE + 1;
    char * output_file = calloc(prefix_length + index_capacity, sizeof *output_file);
    if (output_file == NULL) {
        log_and_screen_printf("Unable to allocate memory for the output file name\n");
        exit(-1);
    }
    char * index_pointer = output_file + prefix_length;
    strcpy(output_file, cmd.output_folder);
    printf("prefix: %s\n", output_file);

    while (read_sequence_from_fastq(in, seq, cmd.max_read_length)) {
        /* Bounded copy: an index longer than MAX_FIELD_SIZE is truncated
         * instead of writing past the end of output_file. */
        snprintf(index_pointer, index_capacity, "%s", read_index);
        append_sequence(output_file, seq, FASTQ);
    }

    if (in != stdin) {
        fclose(in);
    }
    free(output_file);
    /* seq is not released here: no matching destructor is visible in this
     * file, and the process ends right after. */

    return 0;
}
示例#3
0
/*
 * CUnit test: read_sequence_from_fastq() on four fixture files that contain
 * a long read, reads with bad characters, a read whose quality string is
 * longer than its sequence, and multi-line reads.
 *
 * Every fixture is opened with a NULL check: a missing file is recorded as a
 * test failure and the test stops, instead of passing a NULL FILE* to the
 * reader.
 */
void test_read_sequence_from_fastq_with_bad_reads_and_long_reads()
{

  int ascii_offset=33;

  //pre-allocate space where to read the sequences
  Sequence* seq = malloc(sizeof(Sequence));
  if (seq==NULL){
    fputs("Out of memory trying to allocate a Sequence",stderr);
      exit(1);
  }

  int max_read_length=200;
  alloc_sequence(seq,max_read_length,LINE_MAX,ascii_offset);

  int length_seq;

  FILE* fp1 = fopen("../data/test/basic/includes_one_read_that_is_too_long.fastq", "r");
  CU_ASSERT(fp1 != NULL);
  if (fp1 == NULL){
    free_sequence(&seq);
    return;
  }

  // @read1
  // ACGT
  // +
  // !!!!
  // @read2
  // CCCC
  // +
  // 5555
  // @read3
  // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
  // -
  // 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
  // @read4
  // ACGT
  // +
  // 3333

  length_seq = read_sequence_from_fastq(fp1,seq,max_read_length);
  CU_ASSERT_EQUAL(length_seq, 4);
  CU_ASSERT_STRING_EQUAL("read1",seq->name);
  CU_ASSERT_STRING_EQUAL("ACGT",seq->seq);
  CU_ASSERT((int) (seq->qual[0])==0); // ! is ascii 33; subtract 33 and get 0
  CU_ASSERT((int) (seq->qual[1])==0);
  CU_ASSERT((int) (seq->qual[2])==0);
  CU_ASSERT((int) (seq->qual[3])==0);

  length_seq = read_sequence_from_fastq(fp1,seq,max_read_length);
  CU_ASSERT_EQUAL(length_seq, 4);
  CU_ASSERT_STRING_EQUAL("read2",seq->name);
  CU_ASSERT_STRING_EQUAL("CCCC",seq->seq);
  CU_ASSERT((int) (seq->qual[0])==20); // 5 is ascii 53; subtract 33 and get 20
  CU_ASSERT((int) (seq->qual[1])==20);
  CU_ASSERT((int) (seq->qual[2])==20);
  CU_ASSERT((int) (seq->qual[3])==20);

  length_seq = read_sequence_from_fastq(fp1,seq,max_read_length);
  CU_ASSERT_EQUAL(length_seq, 100);
  CU_ASSERT_STRING_EQUAL("read3",seq->name);
  CU_ASSERT((int) (seq->qual[0])==15); // 0 translates as ascii 48; subtract 33 and get 15

  length_seq = read_sequence_from_fastq(fp1,seq,max_read_length);
  CU_ASSERT_EQUAL(length_seq, 4);
  CU_ASSERT_STRING_EQUAL("read4",seq->name);
  CU_ASSERT_STRING_EQUAL("ACGT",seq->seq);
  CU_ASSERT((int) (seq->qual[0])==18); // 3 is ascii 51; subtract 33 and get 18

  fclose(fp1);


  FILE* fp2 = fopen("../data/test/basic/includes_reads_with_bad_characters.fastq", "r");
  CU_ASSERT(fp2 != NULL);
  if (fp2 == NULL){
    free_sequence(&seq);
    return;
  }

  //@read1
  //ACGTACGTACGTACGT
  //+
  //WEW2WEW2WEW2WEWA
  //@read2
  //AAAA#5A
  //+
  //1234567
  //@read3
  //TTTT
  //+
  //3333

  length_seq = read_sequence_from_fastq(fp2,seq,max_read_length);
  CU_ASSERT_EQUAL(length_seq, 16);
  CU_ASSERT_STRING_EQUAL("read1",seq->name);
  CU_ASSERT_STRING_EQUAL("ACGTACGTACGTACGT",seq->seq);

  // read2 is not expected to be returned: the next record seen is read3
  length_seq = read_sequence_from_fastq(fp2,seq,max_read_length);
  CU_ASSERT_EQUAL(length_seq, 4);
  CU_ASSERT_STRING_EQUAL("read3",seq->name);
  CU_ASSERT_STRING_EQUAL("TTTT",seq->seq);

  // end of file is expected to be reported as length 0
  length_seq = read_sequence_from_fastq(fp2,seq,max_read_length);
  CU_ASSERT_EQUAL(length_seq, 0);

  fclose(fp2);


  FILE* fp3 = fopen("../data/test/basic/includes_one_read_where_quality_is_longer_than_seq.fastq", "r");
  CU_ASSERT(fp3 != NULL);
  if (fp3 == NULL){
    free_sequence(&seq);
    return;
  }

  //@read1
  //ACGTACGTACGTACGT
  //+
  //WEW2WEW2WEW2WEWA
  //@read2
  //AAAA#5A
  //+
  //!!!!!!!!!!!!!!!!!!!!!!
  //@read3
  //TTTT
  //+
  //3333

  length_seq = read_sequence_from_fastq(fp3,seq,max_read_length);
  CU_ASSERT_EQUAL(length_seq, 16);
  CU_ASSERT_STRING_EQUAL("read1",seq->name);
  CU_ASSERT_STRING_EQUAL("ACGTACGTACGTACGT",seq->seq);

  // read2 is not expected to be returned: the next record seen is read3
  length_seq = read_sequence_from_fastq(fp3,seq,max_read_length);
  CU_ASSERT_EQUAL(length_seq, 4);
  CU_ASSERT_STRING_EQUAL("read3",seq->name);
  CU_ASSERT_STRING_EQUAL("TTTT",seq->seq);

  length_seq = read_sequence_from_fastq(fp3,seq,max_read_length);
  CU_ASSERT_EQUAL(length_seq, 0);

  fclose(fp3);


  FILE* fp4 = fopen("../data/test/basic/includes_multiline_reads.fastq", "r");
  CU_ASSERT(fp4 != NULL);
  if (fp4 == NULL){
    free_sequence(&seq);
    return;
  }

  // @read1
  // ACGT
  // +
  // @@@@
  // @read2 45 bases
  // AAAAAAAAAAAAAAA
  // CCCCCCCCCCCCCCC
  // GGGGGGGGGGGGGGG
  // +
  // 222222222222222
  // 333333333333333
  // 444444444444444
  // @read3
  // TTT
  // -
  // ggg

  length_seq = read_sequence_from_fastq(fp4,seq,max_read_length);
  CU_ASSERT_EQUAL(length_seq, 4);
  CU_ASSERT_STRING_EQUAL("read1",seq->name);
  CU_ASSERT_STRING_EQUAL("ACGT",seq->seq);

  // the three sequence lines of read2 are expected to be concatenated
  length_seq = read_sequence_from_fastq(fp4,seq,max_read_length);
  CU_ASSERT_EQUAL(length_seq, 45);
  CU_ASSERT_STRING_EQUAL("read2",seq->name);
  CU_ASSERT_STRING_EQUAL("AAAAAAAAAAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGG",seq->seq);

  length_seq = read_sequence_from_fastq(fp4,seq,max_read_length);
  CU_ASSERT_EQUAL(length_seq, 3);
  CU_ASSERT_STRING_EQUAL("read3",seq->name);
  CU_ASSERT_STRING_EQUAL("TTT",seq->seq);

  length_seq = read_sequence_from_fastq(fp4,seq,max_read_length);
  CU_ASSERT_EQUAL(length_seq, 0);

  fclose(fp4);

  free_sequence(&seq);
}
示例#4
0
/*
 * CUnit test: read_sequence_from_fastq() on a one-entry fixture and on a
 * three-entry fixture, checking the returned length, name, bases and the
 * quality values after the ascii offset of 33 has been subtracted.
 *
 * Both fixtures are opened with a NULL check and closed when done; a missing
 * file is recorded as a test failure and the test stops.
 */
void test_read_sequence_from_fastq(){

  int ascii_offset = 33;

  // The same limit is used to size the Sequence and to bound every read, so
  // the reader is never allowed more bases than the buffer was allocated for.
  int max_read_length = 1000;

  //pre-allocate space where to read the sequences
  Sequence* seq = malloc(sizeof(Sequence));
  if (seq==NULL){
    fputs("Out of memory trying to allocate a Sequence",stderr);
      exit(1);
  }

  alloc_sequence(seq,max_read_length,LINE_MAX, ascii_offset);

  int length_seq;
  FILE* fp1 = fopen("../data/test/basic/one_entry.fastq", "r");
  CU_ASSERT(fp1 != NULL);
  if (fp1 == NULL){
    free_sequence(&seq);
    return;
  }

  // 1. Read from simple fastq:
  // >Zam
  // ACGT
  // +
  // &&&&
  // NOTE(review): a FASTQ header line normally starts with '@' rather than
  // '>' - confirm the fixture content shown above.

  length_seq = read_sequence_from_fastq(fp1,seq,max_read_length);

  CU_ASSERT_EQUAL(length_seq, 4);
  CU_ASSERT_STRING_EQUAL("Zam",seq->name);
  CU_ASSERT_STRING_EQUAL("ACGT",seq->seq);
  //  CU_ASSERT_STRING_EQUAL("&&&&",seq->qual);/Zam says - /changed this line when I changed the qual reading code to offset by 33
  CU_ASSERT((int) seq->qual[0] == 5);// & is quality 5

  fclose(fp1);


  FILE* fp2 = fopen("../data/test/basic/three_entries.fastq", "r");
  CU_ASSERT(fp2 != NULL);
  if (fp2 == NULL){
    free_sequence(&seq);
    return;
  }

  //2. Read from fastq:

  // @Zam1
  //ACGT
  //+
  //&&&&
  //@Zam2
  //AAAAAAAA
  //+
  //!((/8F+,
  //@Zam3
  //ATATATAT
  //TTTTTTTTTT
  //-
  //(((((((+AAAAAABAAA

  length_seq = read_sequence_from_fastq(fp2,seq,max_read_length);

  CU_ASSERT_EQUAL(length_seq, 4);
  CU_ASSERT_STRING_EQUAL("Zam1",seq->name);
  CU_ASSERT_STRING_EQUAL("ACGT",seq->seq);
  //  CU_ASSERT_STRING_EQUAL("&&&&",seq->qual);/Zam says - /changed this line when I changed the qual reading code to offset by 33
  CU_ASSERT((int) seq->qual[0] == 5);// & is quality 5


  length_seq = read_sequence_from_fastq(fp2,seq,max_read_length);

  CU_ASSERT_EQUAL(length_seq, 8);
  CU_ASSERT_STRING_EQUAL("Zam2",seq->name);
  CU_ASSERT_STRING_EQUAL("AAAAAAAA",seq->seq);
  CU_ASSERT((int) seq->qual[0] == 0);//! is quality 0 - take 33 off its ascii code, can check on http://www.asciitable.com/
  CU_ASSERT((int) seq->qual[1] == 7);// ( is quality 7
  CU_ASSERT((int) seq->qual[2] == 7);// ( is quality 7
  CU_ASSERT((int) seq->qual[3] == 14);// / is quality 14
  CU_ASSERT((int) seq->qual[4] == 23);// 8 is quality 23
  CU_ASSERT((int) seq->qual[5] == 37);// F is quality 37
  CU_ASSERT((int) seq->qual[6] == 10);// + is quality 10
  CU_ASSERT((int) seq->qual[7] == 11);// , is quality 11


  // the two sequence lines of Zam3 are expected to be concatenated
  length_seq = read_sequence_from_fastq(fp2,seq,max_read_length);

  CU_ASSERT_EQUAL(length_seq, 18);
  CU_ASSERT_STRING_EQUAL("Zam3",seq->name);
  CU_ASSERT_STRING_EQUAL("ATATATATTTTTTTTTTT",seq->seq);
  CU_ASSERT((int) seq->qual[0] == 7);// ( is quality 7

  // end of file is expected to be reported as length 0
  length_seq = read_sequence_from_fastq(fp2,seq,max_read_length);

  CU_ASSERT_EQUAL(length_seq, 0);

  fclose(fp2);
  free_sequence(&seq);
}
示例#5
0
/*
 * Share of `count` in `total`, as a percentage.
 * Returns 0.0 when total is 0 so that an empty input does not divide by zero.
 */
static double base_percentage(long long count, long long total){
  if (total == 0){
    return 0.0;
  }
  return (count*100.0)/total;
}

/*
 * Counts the bases of every entry in cmd_line.input_filename (FASTQ or
 * FASTA, selected by cmd_line.input_file_format) and prints, for A, C, G, T
 * and "everything else" (U), the count and its percentage of all characters
 * read. Upper and lower case bases are counted together.
 *
 * Returns 0 on completion; exits with 1 if the input file cannot be opened
 * or the Sequence cannot be allocated.
 */
int main(int argc, char **argv){
  CmdLine cmd_line = parse_cmdline(argc,argv,sizeof(Element));
  long long As=0;
  long long Cs=0;
  long long Gs=0;
  long long Ts=0;
  long long Us=0;

  int max_read_length = 1000;

  FILE* fp = fopen(cmd_line.input_filename, "r");
  if (fp == NULL){
    fprintf(stderr,"cannot open file:%s\n",cmd_line.input_filename);
    exit(1); //TODO - prefer to print warning and skip file and return an error code?
  }


  //----------------------------------
  // preallocate the memory used to read the sequences
  //----------------------------------
  Sequence * seq = malloc(sizeof(Sequence));
  if (seq == NULL){
    fputs("Out of memory trying to allocate Sequence\n",stderr);
    exit(1);
  }
  alloc_sequence(seq,max_read_length,LINE_MAX, cmd_line.quality_offset);

  int entry_length=0;
  boolean new_entry = true;
  boolean full_entry;
  do {

    switch (cmd_line.input_file_format) {
    case FASTQ:
      entry_length = read_sequence_from_fastq(fp,seq,max_read_length);
      break;

    case FASTA:
      // A FASTA entry may come back in several pieces: the next call starts
      // a new entry only if this call reported the current one as complete.
      entry_length = read_sequence_from_fasta(fp,seq,max_read_length,new_entry,&full_entry,0);
      new_entry = full_entry;
      break;

    default:
      // Any other format: nothing is read, which ends the loop.
      entry_length = 0;
      break;
    }

    int i;
    for(i=0;i<entry_length;i++){

      switch (seq->seq[i])
        {
        case 'A':
        case 'a':
          As++;
          break;
        case 'C':
        case 'c':
          Cs++;
          break;
        case 'G':
        case 'g':
          Gs++;
          break;
        case 'T':
        case 't':
          Ts++;
          break;
        default:
          // anything that is not A/C/G/T in either case
          Us++;
          break;
        }

    }

  } while (entry_length>0);

  // seq is left to be reclaimed at process exit, which follows immediately.
  fclose(fp);

  long long total = As + Cs + Gs + Ts + Us;

  // %lld is the standard conversion for long long
  printf("%lld As counted - %5.2f%%\n",As,base_percentage(As,total));
  printf("%lld Cs counted - %5.2f%%\n",Cs,base_percentage(Cs,total));
  printf("%lld Gs counted - %5.2f%%\n",Gs,base_percentage(Gs,total));
  printf("%lld Ts counted - %5.2f%%\n",Ts,base_percentage(Ts,total));
  printf("%lld Us counted - %5.2f%%\n",Us,base_percentage(Us,total));

  return 0;
}