/*
 * FASTQ variant of the file reader.
 *
 * fp              stream to read the next entry from
 * seq             Sequence that receives the entry
 * max_read_length limit forwarded to read_sequence_from_fastq()
 * new_entry       must be true; any other value is reported through die()
 * full_entry      out-parameter, unconditionally set to true
 *
 * Returns whatever read_sequence_from_fastq() returns when called with
 * fastq_ascii_offset (a name defined outside this function).
 */
int file_reader_fastq(FILE *fp, Sequence *seq, int max_read_length,
                      boolean new_entry, boolean *full_entry)
{
    /* The flag is written before the new_entry check, as callers may rely on it. */
    *full_entry = true;

    if (!(new_entry == true)) {
        die("new_entry has to be true for fastq");
    }

    int entry_length = read_sequence_from_fastq(fp, seq, max_read_length,
                                                fastq_ascii_offset);
    return entry_length;
}
int main( int argc, char ** argv){ log_and_screen_printf("Demultiplexer\n\n"); log_and_screen_printf(SVN_VERSION); log_and_screen_printf(SVN_COMMIT_DATE); log_and_screen_printf("Compiled on %s at %s \n\n", __DATE__, __TIME__); if (argc < 2) { print_help(); } log_write_timestamp(1); if (argc == 0) { print_help(); } DemultiplexerCmdLine cmd = parse_args(argc, argv); FILE * in = stdin; if (cmd.input_reads != NULL) { in = fopen(cmd.input_reads, "r"); if(in == NULL){ log_and_screen_printf("Unable to open file %s\n", cmd.input_reads); exit(-1); } } Sequence * seq = sequence_new(cmd.max_read_length, cmd.max_name_length, 33); seq->header = new_sequence_header(CASAVA_1_8); header_function * f = (header_function *) seq->header; char * index = f->get_index(seq); size_t prefix_length = strlen(cmd.output_folder); char * output_file = calloc(prefix_length + MAX_FIELD_SIZE + 1, sizeof(char *)); char * index_pointer = output_file + prefix_length; strcpy(output_file, cmd.output_folder); printf("prefix: %s\n", output_file); while (read_sequence_from_fastq(in, seq, cmd.max_read_length)) { strcpy(index_pointer, index); // printf("index: %s\n new output: %s\n", index, output_file); append_sequence(output_file, seq, FASTQ); } if (in != stdin) { fclose(in); } return 0; }
void test_read_sequence_from_fastq_with_bad_reads_and_long_reads() { int ascii_offset=33; //pre-allocate space where to read the sequences Sequence* seq = malloc(sizeof(Sequence)); if (seq==NULL){ fputs("Out of memory trying to allocate a Sequence",stderr); exit(1); } int max_read_length=200; alloc_sequence(seq,max_read_length,LINE_MAX,ascii_offset); int length_seq; FILE* fp1 = fopen("../data/test/basic/includes_one_read_that_is_too_long.fastq", "r"); // @read1 // ACGT // + // !!!! // @read2 // CCCC // + // 5555 // @read3 // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA // - // 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 // @read4 // ACGT // + // 3333 length_seq = read_sequence_from_fastq(fp1,seq,max_read_length); CU_ASSERT_EQUAL(length_seq, 4); CU_ASSERT_STRING_EQUAL("read1",seq->name); CU_ASSERT_STRING_EQUAL("ACGT",seq->seq); CU_ASSERT((int) (seq->qual[0])==0); CU_ASSERT((int) (seq->qual[1])==0); CU_ASSERT((int) (seq->qual[2])==0); CU_ASSERT((int) (seq->qual[3])==0); length_seq = read_sequence_from_fastq(fp1,seq,max_read_length); CU_ASSERT_EQUAL(length_seq, 4); CU_ASSERT_STRING_EQUAL("read2",seq->name); CU_ASSERT_STRING_EQUAL("CCCC",seq->seq); CU_ASSERT((int) (seq->qual[0])==20); CU_ASSERT((int) (seq->qual[1])==20); CU_ASSERT((int) (seq->qual[2])==20); CU_ASSERT((int) (seq->qual[3])==20); length_seq = read_sequence_from_fastq(fp1,seq,max_read_length); CU_ASSERT_EQUAL(length_seq, 100); CU_ASSERT_STRING_EQUAL("read3",seq->name); CU_ASSERT((int) (seq->qual[0])==15); // 0 translates as ascii 48; subtract 33 and get 15 length_seq = read_sequence_from_fastq(fp1,seq,max_read_length); CU_ASSERT_EQUAL(length_seq, 4); CU_ASSERT_STRING_EQUAL("read4",seq->name); CU_ASSERT_STRING_EQUAL("ACGT",seq->seq); CU_ASSERT((int) (seq->qual[0])==18); fclose(fp1); FILE* fp2 = fopen("../data/test/basic/includes_reads_with_bad_characters.fastq", "r"); //@read1 //ACGTACGTACGTACGT 
//+ //WEW2WEW2WEW2WEWA //@read2 //AAAA#5A //+ //1234567 //@read3 //TTTT //+ //3333 length_seq = read_sequence_from_fastq(fp2,seq,max_read_length); CU_ASSERT_EQUAL(length_seq, 16); CU_ASSERT_STRING_EQUAL("read1",seq->name); CU_ASSERT_STRING_EQUAL("ACGTACGTACGTACGT",seq->seq); length_seq = read_sequence_from_fastq(fp2,seq,max_read_length); CU_ASSERT_EQUAL(length_seq, 4); CU_ASSERT_STRING_EQUAL("read3",seq->name); CU_ASSERT_STRING_EQUAL("TTTT",seq->seq); length_seq = read_sequence_from_fastq(fp2,seq,max_read_length); CU_ASSERT_EQUAL(length_seq, 0); fclose(fp2); FILE* fp3 = fopen("../data/test/basic/includes_one_read_where_quality_is_longer_than_seq.fastq", "r"); //@read1 //ACGTACGTACGTACGT //+ //WEW2WEW2WEW2WEWA //@read2 //AAAA#5A //+ //!!!!!!!!!!!!!!!!!!!!!! //@read3 //TTTT //+ //3333 length_seq = read_sequence_from_fastq(fp3,seq,max_read_length); CU_ASSERT_EQUAL(length_seq, 16); CU_ASSERT_STRING_EQUAL("read1",seq->name); CU_ASSERT_STRING_EQUAL("ACGTACGTACGTACGT",seq->seq); length_seq = read_sequence_from_fastq(fp3,seq,max_read_length); CU_ASSERT_EQUAL(length_seq, 4); CU_ASSERT_STRING_EQUAL("read3",seq->name); CU_ASSERT_STRING_EQUAL("TTTT",seq->seq); length_seq = read_sequence_from_fastq(fp3,seq,max_read_length); CU_ASSERT_EQUAL(length_seq, 0); fclose(fp3); FILE* fp4 = fopen("../data/test/basic/includes_multiline_reads.fastq", "r"); // @read1 // ACGT // + // @@@@ // @read2 45 bases // AAAAAAAAAAAAAAA // CCCCCCCCCCCCCCC // GGGGGGGGGGGGGGG // + // 222222222222222 // 333333333333333 // 444444444444444 // @read3 // TTT // - // ggg length_seq = read_sequence_from_fastq(fp4,seq,max_read_length); CU_ASSERT_EQUAL(length_seq, 4); CU_ASSERT_STRING_EQUAL("read1",seq->name); CU_ASSERT_STRING_EQUAL("ACGT",seq->seq); length_seq = read_sequence_from_fastq(fp4,seq,max_read_length); CU_ASSERT_EQUAL(length_seq, 45); CU_ASSERT_STRING_EQUAL("read2",seq->name); CU_ASSERT_STRING_EQUAL("AAAAAAAAAAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGG",seq->seq); length_seq = 
read_sequence_from_fastq(fp4,seq,max_read_length); CU_ASSERT_EQUAL(length_seq, 3); CU_ASSERT_STRING_EQUAL("read3",seq->name); CU_ASSERT_STRING_EQUAL("TTT",seq->seq); length_seq = read_sequence_from_fastq(fp4,seq,max_read_length); CU_ASSERT_EQUAL(length_seq, 0); fclose(fp4); free_sequence(&seq); }
void test_read_sequence_from_fastq(){ int ascii_offset = 33; //pre-allocate space where to read the sequences Sequence* seq = malloc(sizeof(Sequence)); if (seq==NULL){ fputs("Out of memory trying to allocate a Sequence",stderr); exit(1); } alloc_sequence(seq,200,LINE_MAX, ascii_offset); int length_seq; FILE* fp1 = fopen("../data/test/basic/one_entry.fastq", "r"); // 1. Read from simple fasta: // >Zam // ACGT // + // &&&& length_seq = read_sequence_from_fastq(fp1,seq,1000); CU_ASSERT_EQUAL(length_seq, 4); CU_ASSERT_STRING_EQUAL("Zam",seq->name); CU_ASSERT_STRING_EQUAL("ACGT",seq->seq); // CU_ASSERT_STRING_EQUAL("&&&&",seq->qual);/Zam says - /changed this line when I changed the ual reading code to offset by 33 CU_ASSERT((int) seq->qual[0] == 5); FILE* fp2 = fopen("../data/test/basic/three_entries.fastq", "r"); //2. Read from fastq: // @Zam1 //ACGT //+ //&&&& //@Zam2 //AAAAAAAA //+ //!((/8F+, //@Zam3 //ATATATAT //TTTTTTTTTT //- //(((((((+AAAAAABAAA length_seq = read_sequence_from_fastq(fp2,seq, 1000); CU_ASSERT_EQUAL(length_seq, 4); CU_ASSERT_STRING_EQUAL("Zam1",seq->name); CU_ASSERT_STRING_EQUAL("ACGT",seq->seq); // CU_ASSERT_STRING_EQUAL("&&&&",seq->qual);/Zam says - /changed this line when I changed the ual reading code to offset by 33 CU_ASSERT((int) seq->qual[0] == 5); length_seq = read_sequence_from_fastq(fp2,seq,1000); CU_ASSERT_EQUAL(length_seq, 8); CU_ASSERT_STRING_EQUAL("Zam2",seq->name); CU_ASSERT_STRING_EQUAL("AAAAAAAA",seq->seq); CU_ASSERT((int) seq->qual[0] == 0);//! 
is quality 0 - take 33 off its ascii code, can check on http://www.asciitable.com/ CU_ASSERT((int) seq->qual[1] == 7);// ( is quality 7 CU_ASSERT((int) seq->qual[2] == 7);// ( is quality 7 CU_ASSERT((int) seq->qual[3] == 14);// / is quality 14 CU_ASSERT((int) seq->qual[4] == 23);// 8 is quality 23 CU_ASSERT((int) seq->qual[5] == 37);// F is quality 37 CU_ASSERT((int) seq->qual[6] == 10);// + is quality 10 CU_ASSERT((int) seq->qual[7] == 11);// , is quality 11 length_seq = read_sequence_from_fastq(fp2,seq,1000); CU_ASSERT_EQUAL(length_seq, 18); CU_ASSERT_STRING_EQUAL("Zam3",seq->name); CU_ASSERT_STRING_EQUAL("ATATATATTTTTTTTTTT",seq->seq); CU_ASSERT((int) seq->qual[0] == 7);// ( is quality 7 length_seq = read_sequence_from_fastq(fp2,seq,1000); CU_ASSERT_EQUAL(length_seq, 0); fclose(fp2); free_sequence(&seq); }
int main(int argc, char **argv){ CmdLine cmd_line = parse_cmdline(argc,argv,sizeof(Element)); long long As=0; long long Cs=0; long long Gs=0; long long Ts=0; long long Us=0; int max_read_length = 1000; FILE* fp = fopen(cmd_line.input_filename, "r"); if (fp == NULL){ fprintf(stderr,"cannot open file:%s\n",cmd_line.input_filename); exit(1); //TODO - prefer to print warning and skip file and return an error code? } //---------------------------------- // preallocate the memory used to read the sequences //---------------------------------- Sequence * seq = malloc(sizeof(Sequence)); if (seq == NULL){ fputs("Out of memory trying to allocate Sequence\n",stderr); exit(1); } alloc_sequence(seq,max_read_length,LINE_MAX, cmd_line.quality_offset); int entry_length=0; boolean new_entry = true; boolean full_entry; do { switch (cmd_line.input_file_format) { case FASTQ: entry_length = read_sequence_from_fastq(fp,seq,max_read_length); break; case FASTA: entry_length = read_sequence_from_fasta(fp,seq,max_read_length,new_entry,&full_entry,0); new_entry = full_entry; break; } int i; for(i=0;i<entry_length;i++){ //printf("index %i char %c\n",i,seq->seq[i]); switch (seq->seq[i]) { case 'A': As++; break; case 'C': Cs++; break; case 'G': Gs++; break; case 'T': Ts++; break; case 'a': As++; break; case 'c': Cs++; break; case 'g': Gs++; break; case 't': Ts++; break; default: Us++; } } } while (entry_length>0); long long total = As + Cs + Gs + Ts + Us; printf("%qd As counted - %5.2f%%\n",As, (As*100.0)/total); printf("%qd Cs counted - %5.2f%%\n",Cs,(Cs*100.0)/total); printf("%qd Gs counted - %5.2f%%\n",Gs,(Gs*100.0)/total); printf("%qd Ts counted - %5.2f%%\n",Ts,(Ts*100.0)/total); printf("%qd Us counted - %5.2f%%\n",Us,(Us*100.0)/total); return 0; }