void test_shift_last_kmer_to_start_of_sequence(){ Sequence * seq = malloc(sizeof(Sequence)); boolean full_entry; if (seq == NULL){ fputs("Out of memory trying to allocate Sequence\n",stderr); exit(1); } //pre-allocate space where to read the sequences alloc_sequence(seq,200,LINE_MAX, 0); FILE* fp1 = fopen("../data/test/basic/long_entries.fasta", "r"); int length_seq = read_sequence_from_fasta(fp1,seq,10,true,&full_entry,0); CU_ASSERT(seq->seq[0]=='A'); CU_ASSERT(seq->seq[1]=='C'); CU_ASSERT(seq->seq[2]=='G'); shift_last_kmer_to_start_of_sequence(seq, length_seq,3); CU_ASSERT(seq->seq[0]=='T'); CU_ASSERT(seq->seq[1]=='A'); CU_ASSERT(seq->seq[2]=='C'); CU_ASSERT(full_entry==false); fclose(fp1); }
void estimate_seq_error_rate_from_snps_for_each_colour(char* colourlist_snp_alleles, GraphInfo* db_graph_info, dBGraph* db_graph, int ref_colour, //long long genome_size, long double default_seq_err_rate, char* output_file) { int max_read_length = 2*(db_graph->kmer_size); // ****** malloc and initialisation // //---------------------------------- // allocate the memory used to read the sequences //---------------------------------- Sequence * seq = malloc(sizeof(Sequence)); if (seq == NULL){ die("Out of memory trying to allocate Sequence"); } alloc_sequence(seq,max_read_length,LINE_MAX); //We are going to load all the bases into a single sliding window KmerSlidingWindow* kmer_window = malloc(sizeof(KmerSlidingWindow)); if (kmer_window==NULL) { die("Failed to malloc kmer sliding window in align_list_of_fastaq_to_graph_and_print_coverages_in_all_colours. Exit.\n"); } // kmer_window->kmer = (BinaryKmer*) malloc(sizeof(BinaryKmer)*(max_read_length-db_graph->kmer_size-1)); kmer_window->kmer = (BinaryKmer*) malloc(sizeof(BinaryKmer)*(max_read_length-db_graph->kmer_size+1)); if (kmer_window->kmer==NULL) { die("Failed to malloc kmer_window->kmer in align_list_of_fastaq_to_graph_and_print_coverages_in_all_colours. Exit.\n"); } kmer_window->nkmers=0; //create file reader int file_reader_fasta(FILE * fp, Sequence * seq, int max_read_length, boolean new_entry, boolean * full_entry){ long long ret; int offset = 0; if (new_entry == false){ offset = db_graph->kmer_size; //die("new_entry must be true in hsi test function"); } ret = read_sequence_from_fasta(fp,seq,max_read_length,new_entry,full_entry,offset); return ret; }
void get_coverage_on_panels(int* percentage_coverage,int* median_coverage, StrBuf** panel_file_paths, int max_branch_len, dBGraph *db_graph, int ignore_first, int ignore_last , int NUM_PANELS) { int i; FILE* fp; AlleleInfo* ai = alloc_allele_info(); int number_of_reads = 0; int number_of_covered_reads =0; int num_kmers=0; Covg tot_pos_kmers; Covg tot_kmers; Covg med; //---------------------------------- // allocate the memory used to read the sequences //---------------------------------- Sequence * seq = malloc(sizeof(Sequence)); if (seq == NULL) { die("Out of memory trying to allocate Sequence"); } alloc_sequence(seq,max_branch_len,LINE_MAX); //We are going to load all the bases into a single sliding window KmerSlidingWindow* kmer_window = malloc(sizeof(KmerSlidingWindow)); if (kmer_window==NULL) { die("Failed to malloc kmer sliding window"); } CovgArray* working_ca = alloc_and_init_covg_array(max_branch_len); dBNode** array_nodes = (dBNode**) malloc(sizeof(dBNode*)*max_branch_len); Orientation* array_or =(Orientation*) malloc(sizeof(Orientation)*max_branch_len); kmer_window->kmer = (BinaryKmer*) malloc(sizeof(BinaryKmer)*(max_branch_len-db_graph->kmer_size+1)); if (kmer_window->kmer==NULL) { die("Failed to malloc kmer_window->kmer"); } kmer_window->nkmers=0; //create file readers int file_reader_fasta(FILE * fp, Sequence * seq, int max_read_length, boolean new_entry, boolean * full_entry) { long long ret; int offset = 0; if (new_entry == false) { offset = db_graph->kmer_size; } ret = read_sequence_from_fasta(fp,seq,max_read_length,new_entry,full_entry,offset); return ret; }
void test_read_sequence_from_fastq_with_bad_reads_and_long_reads() { int ascii_offset=33; //pre-allocate space where to read the sequences Sequence* seq = malloc(sizeof(Sequence)); if (seq==NULL){ fputs("Out of memory trying to allocate a Sequence",stderr); exit(1); } int max_read_length=200; alloc_sequence(seq,max_read_length,LINE_MAX,ascii_offset); int length_seq; FILE* fp1 = fopen("../data/test/basic/includes_one_read_that_is_too_long.fastq", "r"); // @read1 // ACGT // + // !!!! // @read2 // CCCC // + // 5555 // @read3 // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA // - // 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 // @read4 // ACGT // + // 3333 length_seq = read_sequence_from_fastq(fp1,seq,max_read_length); CU_ASSERT_EQUAL(length_seq, 4); CU_ASSERT_STRING_EQUAL("read1",seq->name); CU_ASSERT_STRING_EQUAL("ACGT",seq->seq); CU_ASSERT((int) (seq->qual[0])==0); CU_ASSERT((int) (seq->qual[1])==0); CU_ASSERT((int) (seq->qual[2])==0); CU_ASSERT((int) (seq->qual[3])==0); length_seq = read_sequence_from_fastq(fp1,seq,max_read_length); CU_ASSERT_EQUAL(length_seq, 4); CU_ASSERT_STRING_EQUAL("read2",seq->name); CU_ASSERT_STRING_EQUAL("CCCC",seq->seq); CU_ASSERT((int) (seq->qual[0])==20); CU_ASSERT((int) (seq->qual[1])==20); CU_ASSERT((int) (seq->qual[2])==20); CU_ASSERT((int) (seq->qual[3])==20); length_seq = read_sequence_from_fastq(fp1,seq,max_read_length); CU_ASSERT_EQUAL(length_seq, 100); CU_ASSERT_STRING_EQUAL("read3",seq->name); CU_ASSERT((int) (seq->qual[0])==15); // 0 translates as ascii 48; subtract 33 and get 15 length_seq = read_sequence_from_fastq(fp1,seq,max_read_length); CU_ASSERT_EQUAL(length_seq, 4); CU_ASSERT_STRING_EQUAL("read4",seq->name); CU_ASSERT_STRING_EQUAL("ACGT",seq->seq); CU_ASSERT((int) (seq->qual[0])==18); fclose(fp1); FILE* fp2 = fopen("../data/test/basic/includes_reads_with_bad_characters.fastq", "r"); //@read1 //ACGTACGTACGTACGT //+ //WEW2WEW2WEW2WEWA //@read2 //AAAA#5A //+ //1234567 //@read3 //TTTT //+ //3333 length_seq = read_sequence_from_fastq(fp2,seq,max_read_length); CU_ASSERT_EQUAL(length_seq, 16); CU_ASSERT_STRING_EQUAL("read1",seq->name); CU_ASSERT_STRING_EQUAL("ACGTACGTACGTACGT",seq->seq); length_seq = read_sequence_from_fastq(fp2,seq,max_read_length); CU_ASSERT_EQUAL(length_seq, 4); CU_ASSERT_STRING_EQUAL("read3",seq->name); CU_ASSERT_STRING_EQUAL("TTTT",seq->seq); length_seq = read_sequence_from_fastq(fp2,seq,max_read_length); CU_ASSERT_EQUAL(length_seq, 0); fclose(fp2); FILE* fp3 = fopen("../data/test/basic/includes_one_read_where_quality_is_longer_than_seq.fastq", "r"); //@read1 //ACGTACGTACGTACGT //+ //WEW2WEW2WEW2WEWA //@read2 //AAAA#5A //+ //!!!!!!!!!!!!!!!!!!!!!! //@read3 //TTTT //+ //3333 length_seq = read_sequence_from_fastq(fp3,seq,max_read_length); CU_ASSERT_EQUAL(length_seq, 16); CU_ASSERT_STRING_EQUAL("read1",seq->name); CU_ASSERT_STRING_EQUAL("ACGTACGTACGTACGT",seq->seq); length_seq = read_sequence_from_fastq(fp3,seq,max_read_length); CU_ASSERT_EQUAL(length_seq, 4); CU_ASSERT_STRING_EQUAL("read3",seq->name); CU_ASSERT_STRING_EQUAL("TTTT",seq->seq); length_seq = read_sequence_from_fastq(fp3,seq,max_read_length); CU_ASSERT_EQUAL(length_seq, 0); fclose(fp3); FILE* fp4 = fopen("../data/test/basic/includes_multiline_reads.fastq", "r"); // @read1 // ACGT // + // @@@@ // @read2 45 bases // AAAAAAAAAAAAAAA // CCCCCCCCCCCCCCC // GGGGGGGGGGGGGGG // + // 222222222222222 // 333333333333333 // 444444444444444 // @read3 // TTT // - // ggg length_seq = read_sequence_from_fastq(fp4,seq,max_read_length); CU_ASSERT_EQUAL(length_seq, 4); CU_ASSERT_STRING_EQUAL("read1",seq->name); CU_ASSERT_STRING_EQUAL("ACGT",seq->seq); length_seq = read_sequence_from_fastq(fp4,seq,max_read_length); CU_ASSERT_EQUAL(length_seq, 45); CU_ASSERT_STRING_EQUAL("read2",seq->name); CU_ASSERT_STRING_EQUAL("AAAAAAAAAAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGG",seq->seq); length_seq = read_sequence_from_fastq(fp4,seq,max_read_length); CU_ASSERT_EQUAL(length_seq, 3); CU_ASSERT_STRING_EQUAL("read3",seq->name); CU_ASSERT_STRING_EQUAL("TTT",seq->seq); length_seq = read_sequence_from_fastq(fp4,seq,max_read_length); CU_ASSERT_EQUAL(length_seq, 0); fclose(fp4); free_sequence(&seq); }
void test_read_sequence_from_fastq(){ int ascii_offset = 33; //pre-allocate space where to read the sequences Sequence* seq = malloc(sizeof(Sequence)); if (seq==NULL){ fputs("Out of memory trying to allocate a Sequence",stderr); exit(1); } alloc_sequence(seq,200,LINE_MAX, ascii_offset); int length_seq; FILE* fp1 = fopen("../data/test/basic/one_entry.fastq", "r"); // 1. Read from simple fasta: // >Zam // ACGT // + // &&&& length_seq = read_sequence_from_fastq(fp1,seq,1000); CU_ASSERT_EQUAL(length_seq, 4); CU_ASSERT_STRING_EQUAL("Zam",seq->name); CU_ASSERT_STRING_EQUAL("ACGT",seq->seq); // CU_ASSERT_STRING_EQUAL("&&&&",seq->qual);/Zam says - /changed this line when I changed the ual reading code to offset by 33 CU_ASSERT((int) seq->qual[0] == 5); FILE* fp2 = fopen("../data/test/basic/three_entries.fastq", "r"); //2. Read from fastq: // @Zam1 //ACGT //+ //&&&& //@Zam2 //AAAAAAAA //+ //!((/8F+, //@Zam3 //ATATATAT //TTTTTTTTTT //- //(((((((+AAAAAABAAA length_seq = read_sequence_from_fastq(fp2,seq, 1000); CU_ASSERT_EQUAL(length_seq, 4); CU_ASSERT_STRING_EQUAL("Zam1",seq->name); CU_ASSERT_STRING_EQUAL("ACGT",seq->seq); // CU_ASSERT_STRING_EQUAL("&&&&",seq->qual);/Zam says - /changed this line when I changed the ual reading code to offset by 33 CU_ASSERT((int) seq->qual[0] == 5); length_seq = read_sequence_from_fastq(fp2,seq,1000); CU_ASSERT_EQUAL(length_seq, 8); CU_ASSERT_STRING_EQUAL("Zam2",seq->name); CU_ASSERT_STRING_EQUAL("AAAAAAAA",seq->seq); CU_ASSERT((int) seq->qual[0] == 0);//! is quality 0 - take 33 off its ascii code, can check on http://www.asciitable.com/ CU_ASSERT((int) seq->qual[1] == 7);// ( is quality 7 CU_ASSERT((int) seq->qual[2] == 7);// ( is quality 7 CU_ASSERT((int) seq->qual[3] == 14);// / is quality 14 CU_ASSERT((int) seq->qual[4] == 23);// 8 is quality 23 CU_ASSERT((int) seq->qual[5] == 37);// F is quality 37 CU_ASSERT((int) seq->qual[6] == 10);// + is quality 10 CU_ASSERT((int) seq->qual[7] == 11);// , is quality 11 length_seq = read_sequence_from_fastq(fp2,seq,1000); CU_ASSERT_EQUAL(length_seq, 18); CU_ASSERT_STRING_EQUAL("Zam3",seq->name); CU_ASSERT_STRING_EQUAL("ATATATATTTTTTTTTTT",seq->seq); CU_ASSERT((int) seq->qual[0] == 7);// ( is quality 7 length_seq = read_sequence_from_fastq(fp2,seq,1000); CU_ASSERT_EQUAL(length_seq, 0); fclose(fp2); free_sequence(&seq); }
//test reading several short entries from a fasta file void test_read_sequence_from_fasta(){ Sequence * seq = malloc(sizeof(Sequence)); boolean full_entry; if (seq == NULL){ fputs("Out of memory trying to allocate Sequence\n",stderr); exit(1); } //pre-allocate space where to read the sequences alloc_sequence(seq,200,LINE_MAX, 0); int length_seq; FILE* fp1 = fopen("../data/test/basic/one_entry.fasta", "r"); if (fp1 == NULL){ fputs("cannot open file:../data/test/basic/one_entry.fasta\n",stderr); exit(1); } // 1. Read from simple fasta: // >Zam // ACGT // ACGTACGTACGT length_seq = read_sequence_from_fasta(fp1,seq,1000,true,&full_entry,0); CU_ASSERT_EQUAL(length_seq, 16); CU_ASSERT_STRING_EQUAL("Zam",seq->name); CU_ASSERT_EQUAL(1,seq->start); CU_ASSERT_EQUAL(16,seq->end); CU_ASSERT_STRING_EQUAL("ACGTACGTACGTACGT",seq->seq); CU_ASSERT(full_entry); fclose(fp1); FILE* fp2 = fopen("../data/test/basic/three_entries.fasta", "r"); // 2. Read from fasta: //>Zam1 //ACGT //ACGTACGTACGT //>Zam2 //ACGT //ACGTACGTACGT //TTTTTTTT //>Zam3 //ACGTNNACGTACGTACGT length_seq = read_sequence_from_fasta(fp2,seq,1000,true,&full_entry,0); CU_ASSERT_EQUAL(length_seq, 16); CU_ASSERT_STRING_EQUAL("Zam1",seq->name); CU_ASSERT_EQUAL(1,seq->start); CU_ASSERT_EQUAL(16,seq->end); CU_ASSERT_STRING_EQUAL("ACGTACGTACGTACGT",seq->seq); CU_ASSERT(full_entry); length_seq = read_sequence_from_fasta(fp2,seq,1000,true,&full_entry,0); CU_ASSERT_EQUAL(length_seq, 24); CU_ASSERT_STRING_EQUAL("Zam2",seq->name); CU_ASSERT_EQUAL(1,seq->start); CU_ASSERT_EQUAL(24,seq->end); CU_ASSERT_STRING_EQUAL("ACGTACGTACGTACGTTTTTTTTt",seq->seq); CU_ASSERT(full_entry); length_seq = read_sequence_from_fasta(fp2,seq,1000,true,&full_entry,0); CU_ASSERT_EQUAL(length_seq, 18); CU_ASSERT_STRING_EQUAL("Zam3",seq->name); CU_ASSERT_EQUAL(1,seq->start); CU_ASSERT_EQUAL(18,seq->end); CU_ASSERT_STRING_EQUAL("ACGTNNACGTACGTACGT",seq->seq); CU_ASSERT(full_entry); length_seq = read_sequence_from_fasta(fp2,seq,1000,true,&full_entry,0); CU_ASSERT_EQUAL(length_seq, 0); CU_ASSERT(full_entry); fclose(fp2); free_sequence(&seq); }
void test_read_sequence_from_fasta_when_file_has_bad_reads() { int length_seq; Sequence * seq = malloc(sizeof(Sequence)); boolean full_entry; if (seq == NULL){ fputs("Out of memory trying to allocate Sequence\n",stderr); exit(1); } //pre-allocate space where to read the sequences int max_read_length=100; alloc_sequence(seq,max_read_length,LINE_MAX, 0); FILE* fp2= fopen("../data/test/basic/includes_reads_that_have_bad_characters.fasta", "r"); // >read1 // AAAAAAAAAAAA9 // >read2 // ¡€#9∞§¶#¶•#•#•#ª#ª#ª#ªº#º#º#º–––– // >read3 4 c's // CCCC // >read4 10 Ts // TTTTTTTTTT // >read5 // $ // >read6 // AAAAAAAAAAAAAAAAAA#A // >read7 // AAA length_seq = read_sequence_from_fasta(fp2,seq,max_read_length,true,&full_entry,0); CU_ASSERT_EQUAL(length_seq, 13); CU_ASSERT_STRING_EQUAL("read1",seq->name); CU_ASSERT(full_entry); length_seq = read_sequence_from_fasta(fp2,seq,max_read_length,true,&full_entry,0); CU_ASSERT_EQUAL(length_seq, 63); CU_ASSERT_STRING_EQUAL("read2",seq->name); CU_ASSERT(full_entry); length_seq = read_sequence_from_fasta(fp2,seq,max_read_length,true,&full_entry,0); CU_ASSERT_EQUAL(length_seq, 4); CU_ASSERT_STRING_EQUAL("read3",seq->name); CU_ASSERT_STRING_EQUAL("CCCC",seq->seq); CU_ASSERT(full_entry); length_seq = read_sequence_from_fasta(fp2,seq,max_read_length,true,&full_entry,0); CU_ASSERT_EQUAL(length_seq, 10); CU_ASSERT_STRING_EQUAL("read4",seq->name); CU_ASSERT_STRING_EQUAL("TTTTTTTTTT",seq->seq); CU_ASSERT(full_entry); length_seq = read_sequence_from_fasta(fp2,seq,max_read_length,true,&full_entry,0); CU_ASSERT_EQUAL(length_seq, 1); CU_ASSERT_STRING_EQUAL("read5",seq->name); CU_ASSERT(full_entry); length_seq = read_sequence_from_fasta(fp2,seq,max_read_length,true,&full_entry,0); CU_ASSERT_EQUAL(length_seq, 20); CU_ASSERT_STRING_EQUAL("read6",seq->name); CU_ASSERT(full_entry); length_seq = read_sequence_from_fasta(fp2,seq,max_read_length,true,&full_entry,0); CU_ASSERT_EQUAL(length_seq, 3); CU_ASSERT_STRING_EQUAL("read7",seq->name); CU_ASSERT_STRING_EQUAL("AAA",seq->seq); CU_ASSERT(full_entry); length_seq = read_sequence_from_fasta(fp2,seq,max_read_length,true,&full_entry,0); CU_ASSERT_EQUAL(length_seq, 0); CU_ASSERT(full_entry); fclose(fp2); //now make sure we do not get trapped in an infinite loop if the last read of a file is bad FILE* fp3= fopen("../data/test/basic/includes_final_read_that_has_bad_characters.fasta", "r"); // >read1 // AAAAAAAAAAAA9 // >read2 // ¡€#9∞§¶#¶•#•#•#ª#ª#ª#ªº#º#º#º–––– // >read3 4 c's // CCCC // >read4 10 Ts // TTTTTTTTTT // >read5 // $ // >read6 // AAAAAAAAAAAAAAAAAA#A length_seq = read_sequence_from_fasta(fp3,seq,max_read_length,true,&full_entry,0); CU_ASSERT_EQUAL(length_seq, 13); CU_ASSERT_STRING_EQUAL("read1",seq->name); CU_ASSERT(full_entry); length_seq = read_sequence_from_fasta(fp3,seq,max_read_length,true,&full_entry,0); CU_ASSERT_EQUAL(length_seq, 63); CU_ASSERT_STRING_EQUAL("read2",seq->name); CU_ASSERT(full_entry); length_seq = read_sequence_from_fasta(fp3,seq,max_read_length,true,&full_entry,0); CU_ASSERT_EQUAL(length_seq, 4); CU_ASSERT_STRING_EQUAL("CCCC",seq->seq); CU_ASSERT_STRING_EQUAL("read3",seq->name); CU_ASSERT(full_entry); length_seq = read_sequence_from_fasta(fp3,seq,max_read_length,true,&full_entry,0); CU_ASSERT_EQUAL(length_seq, 10); CU_ASSERT_STRING_EQUAL("read4",seq->name); CU_ASSERT_STRING_EQUAL("TTTTTTTTTT",seq->seq); CU_ASSERT(full_entry); length_seq = read_sequence_from_fasta(fp3,seq,max_read_length,true,&full_entry,0); CU_ASSERT_EQUAL(length_seq, 1); CU_ASSERT_STRING_EQUAL("read5",seq->name); CU_ASSERT(full_entry); length_seq = read_sequence_from_fasta(fp3,seq,max_read_length,true,&full_entry,0); CU_ASSERT_EQUAL(length_seq, 20); CU_ASSERT_STRING_EQUAL("read6",seq->name); CU_ASSERT(full_entry); length_seq = read_sequence_from_fasta(fp3,seq,max_read_length,true,&full_entry,0); CU_ASSERT_EQUAL(length_seq, 0); CU_ASSERT(full_entry); fclose(fp3); free_sequence(&seq); }
void test_read_sequence_from_long_fasta(){ Sequence * seq = malloc(sizeof(Sequence)); boolean full_entry; if (seq == NULL){ fputs("Out of memory trying to allocate Sequence\n",stderr); exit(1); } //pre-allocate space where to read the sequences alloc_sequence(seq,200,LINE_MAX, 0); int length_seq; FILE* fp1 = fopen("../data/test/basic/long_entries.fasta", "r"); if (fp1 == NULL){ fputs("cannot open file: ../data/test/basic/long_entries.fasta\n",stderr); exit(1); } length_seq = read_sequence_from_fasta(fp1,seq,10,true,&full_entry,0); CU_ASSERT_EQUAL(length_seq, 10); CU_ASSERT_EQUAL(seq->start,1); CU_ASSERT_EQUAL(seq->end,10); CU_ASSERT_STRING_EQUAL("Mario",seq->name); CU_ASSERT_STRING_EQUAL("ACGTACGTAC",seq->seq); CU_ASSERT(full_entry==false); length_seq = read_sequence_from_fasta(fp1,seq,10,false,&full_entry,0); CU_ASSERT_EQUAL(length_seq, 10); CU_ASSERT_EQUAL(seq->start,11); CU_ASSERT_EQUAL(seq->end,20); CU_ASSERT_STRING_EQUAL("Mario",seq->name); CU_ASSERT_STRING_EQUAL("GTACGTAAAA",seq->seq); seq->seq[0]='T'; seq->seq[1]='T'; seq->seq[2]='T'; length_seq = read_sequence_from_fasta(fp1,seq,10,false,&full_entry,3); CU_ASSERT_EQUAL(length_seq, 10); CU_ASSERT_STRING_EQUAL("Mario",seq->name); CU_ASSERT_EQUAL(seq->start, 21); CU_ASSERT_EQUAL(seq->end,27); CU_ASSERT_STRING_EQUAL("TTTAAAAAAA",seq->seq); //finish off the entry length_seq = read_sequence_from_fasta(fp1,seq,1000,false,&full_entry,0); CU_ASSERT(full_entry == true); length_seq = read_sequence_from_fasta(fp1,seq,16,true,&full_entry,0); CU_ASSERT(full_entry == true); CU_ASSERT_EQUAL(seq->start, 1); CU_ASSERT_EQUAL(seq->end,16); CU_ASSERT_EQUAL(length_seq, 16); CU_ASSERT_STRING_EQUAL("Pepe",seq->name); length_seq = read_sequence_from_fasta(fp1,seq,3,true,&full_entry,0); CU_ASSERT(full_entry == false); CU_ASSERT_EQUAL(seq->start, 1); CU_ASSERT_EQUAL(seq->end,3); CU_ASSERT_EQUAL(length_seq, 3); CU_ASSERT_STRING_EQUAL("COCO",seq->name); CU_ASSERT_STRING_EQUAL("TTT",seq->seq); length_seq = read_sequence_from_fasta(fp1,seq,1000,false,&full_entry,0); CU_ASSERT(full_entry == true); CU_ASSERT_EQUAL(seq->start, 4); CU_ASSERT_EQUAL(seq->end,10); CU_ASSERT_EQUAL(length_seq,7); CU_ASSERT_STRING_EQUAL("COCO",seq->name); CU_ASSERT_STRING_EQUAL("TAAAATT",seq->seq); length_seq = read_sequence_from_fasta(fp1,seq,15,true,&full_entry,0); CU_ASSERT(full_entry == true); CU_ASSERT_EQUAL(seq->start, 1); CU_ASSERT_EQUAL(seq->end,15); CU_ASSERT_EQUAL(length_seq, 15); CU_ASSERT_STRING_EQUAL("CACHO",seq->name); CU_ASSERT_STRING_EQUAL("TTTTTTAAAGGATAT",seq->seq); length_seq = read_sequence_from_fasta(fp1,seq,15,true,&full_entry,0); CU_ASSERT(full_entry == true); CU_ASSERT_EQUAL(length_seq, 0); fclose(fp1); }
void align_list_of_fastaq_to_graph_and_print_coverages_in_all_colours(FileFormat format, char* list_of_fastaq, int max_read_length, int* array_of_colours, char** array_of_names_of_colours, int num_of_colours, dBGraph* db_graph,int fastq_ascii_offset, boolean is_for_testing, char** for_test_array_of_strings, int* for_test_index, boolean mark_nodes_for_dumping) { if ( (format != FASTA) && (format !=FASTQ) ) { die("Calling align_list_of_fastaq_to_graph_and_print_coverages_in_all_colours " "with file format not set to fasta or fastq"); } //For each file in list_of_fasta, go through the reads, and for each read, // print one "coverage read" per colour (space separated) // e.g. for a read print // >read_id colour 0 // coverages of each of the nodes in the ref (colour 0) // >read_id colour 1 // ... covgs in colour 1 // >read_id colour 2 // ... //---------------------------------- // allocate the memory used to read the sequences //---------------------------------- Sequence * seq = malloc(sizeof(Sequence)); if (seq == NULL){ die("Out of memory trying to allocate Sequence"); } alloc_sequence(seq,max_read_length,LINE_MAX); //We are going to load all the bases into a single sliding window KmerSlidingWindow* kmer_window = malloc(sizeof(KmerSlidingWindow)); if (kmer_window==NULL) { die("Failed to malloc kmer sliding window in " "align_list_of_fastaq_to_graph_and_print_coverages_in_all_colours. Exit."); } // kmer_window->kmer = (BinaryKmer*) malloc(sizeof(BinaryKmer)*(max_read_length-db_graph->kmer_size-1)); kmer_window->kmer = (BinaryKmer*) malloc(sizeof(BinaryKmer)*(max_read_length-db_graph->kmer_size+1)); if (kmer_window->kmer==NULL) { die("Failed to malloc kmer_window->kmer in " "align_list_of_fastaq_to_graph_and_print_coverages_in_all_colours. Exit."); } kmer_window->nkmers=0; //end of intialisation //create file readers int file_reader_fasta(FILE * fp, Sequence * seq, int max_read_length, boolean new_entry, boolean * full_entry){ long long ret; int offset = 0; if (new_entry == false){ offset = db_graph->kmer_size; //die("new_entry must be true in hsi test function"); } ret = read_sequence_from_fasta(fp,seq,max_read_length,new_entry,full_entry,offset); return ret; }
void test_get_next_gene_info() { uint16_t kmer_size = 31; int number_of_bits = 10; int bucket_size = 100; int max_retries = 10; dBGraph *db_graph= hash_table_new(number_of_bits, bucket_size, max_retries, kmer_size); int max_gene_len = 1500; uint64_t* kmer_covg_array = calloc(150, sizeof(uint64_t)); uint64_t* readlen_array = calloc(max_gene_len, sizeof(uint64_t)); StrBuf* list = strbuf_create("../data/test/myKrobe/predictor/gene_presence/sample1.fa.list"); unsigned long long num_bases = build_unclean_graph(db_graph, list, true, kmer_size, readlen_array, max_gene_len, kmer_covg_array, 150, false, 0); FILE* fp = fopen("../data/test/myKrobe/predictor/gene_presence/panel1.fasta", "r"); if (fp==NULL) { die("Cannot open this file: ../data/test/myKrobe/predictor/gene_presence/panel1.fasta"); } GeneInfo* gi = alloc_and_init_gene_info(); //---------------------------------- // allocate the memory used to read the sequences //---------------------------------- Sequence * seq = malloc(sizeof(Sequence)); if (seq == NULL){ die("Out of memory trying to allocate Sequence"); } alloc_sequence(seq,max_gene_len,LINE_MAX); //We are going to load all the bases into a single sliding window KmerSlidingWindow* kmer_window = malloc(sizeof(KmerSlidingWindow)); if (kmer_window==NULL) { die("Failed to malloc kmer sliding window"); } kmer_window->kmer = (BinaryKmer*) malloc(sizeof(BinaryKmer)*(max_gene_len-db_graph->kmer_size+1)); if (kmer_window->kmer==NULL) { die("Failed to malloc kmer_window->kmer"); } kmer_window->nkmers=0; // int max_gene_len = 5000; CovgArray* working_ca = alloc_and_init_covg_array(max_gene_len); //end of intialisation //create file readers int file_reader_fasta(FILE * fp, Sequence * seq, int max_read_length, boolean new_entry, boolean * full_entry){ long long ret; int offset = 0; if (new_entry == false){ offset = db_graph->kmer_size; } ret = read_sequence_from_fasta(fp,seq,max_read_length,new_entry,full_entry,offset); return ret; }
void test_count_reads_where_snp_makes_clean_bubble1() { if(NUMBER_OF_BITFIELDS_IN_BINARY_KMER > 1) { warn("Test not configured for NUMBER_OF_BITFIELDS_IN_BINARY_KMER > 1\n"); return; } // simple test. Use this refernece genome which I load into colour 0: /// file: data/test/genome_complexity/test_allele_clean_file1.fa // >ref contains CAAGTTC CACGTTC and CAGGTTC // CAAGTTCAGAGTTACTCACACCCGATCGATAAGCGGTACAGAGCACGTTCAGAAAAAAACAGGTTCAGA // >ref + SNP // TATCCATGTTCAGAGTTACTGACACCCGATCGATAAGCG //first set up the hash/graph int kmer_size = 7; int number_of_bits = 8; int bucket_size = 10; int max_retries = 10; dBGraph *hash_table = hash_table_new(number_of_bits, bucket_size, max_retries, kmer_size); if (hash_table==NULL) { die("Unable to alloc the hash table."); } // Read FASTA sequence int fq_quality_cutoff = 0; int homopolymer_cutoff = 0; boolean remove_duplicates_se = false; char ascii_fq_offset = 33; int into_colour = 0; unsigned int files_loaded = 0; unsigned long long bad_reads = 0, dup_reads = 0; unsigned long long seq_loaded = 0, seq_read = 0; load_se_filelist_into_graph_colour( "../data/test/genome_complexity/pop_first_test.colours", fq_quality_cutoff, homopolymer_cutoff, remove_duplicates_se, ascii_fq_offset, into_colour, hash_table, 1, // 0 => falist/fqlist; 1 => colourlist &files_loaded, &bad_reads, &dup_reads, &seq_read, &seq_loaded, NULL, 0, &subsample_null); //and use this file of reads: /data/test/genome_complexity/test_allele_clean_file2.fa // >read lies entirely in graph defined by test_allele_clean_file1.fa, and is clean (forms supernode at k=7) // GTTCAGAGTTACT // >read lies in graph defined by test_allele_clean_file1.fa, but overlaps a junction so is not clean // TTACTGACACCCGATCG // >read lies in ref but any change to the 7th base results in an overlap with the ref, so // GTTCAGAGTTACTCACA int col_genome=0; int reads_tested=0; int reads_where_snp_makes_clean_bubble = 0; dBNode* array_nodes[500]; Orientation array_or[500]; //---------------------------------- // allocate the memory used to read the sequences //---------------------------------- Sequence * seq = malloc(sizeof(Sequence)); if (seq == NULL){ die("Out of memory trying to allocate Sequence\n"); } int max_read_length=2000; alloc_sequence(seq,max_read_length,LINE_MAX); //We are going to load all the bases into a single sliding window KmerSlidingWindow* kmer_window = malloc(sizeof(KmerSlidingWindow)); if (kmer_window==NULL) { die("Failed to malloc kmer sliding window in test. Exit.\n"); } kmer_window->kmer = (BinaryKmer*) malloc(sizeof(BinaryKmer)*(max_read_length-hash_table->kmer_size-1)); if (kmer_window->kmer==NULL) { die("Failed to malloc kmer_window->kmer in test. Exit.\n"); } kmer_window->nkmers=0; //end of intialisation //create file readers int file_reader_fasta(FILE * fp, Sequence * seq, int max_read_length, boolean new_entry, boolean * full_entry){ long long ret; int offset = 0; if (new_entry == false){ offset = hash_table->kmer_size; //die("new_entry must be true in hsi test function"); } ret = read_sequence_from_fasta(fp,seq,max_read_length,new_entry,full_entry,offset); return ret; }
int main(int argc, char **argv){ CmdLine cmd_line = parse_cmdline(argc,argv,sizeof(Element)); long long As=0; long long Cs=0; long long Gs=0; long long Ts=0; long long Us=0; int max_read_length = 1000; FILE* fp = fopen(cmd_line.input_filename, "r"); if (fp == NULL){ fprintf(stderr,"cannot open file:%s\n",cmd_line.input_filename); exit(1); //TODO - prefer to print warning and skip file and return an error code? } //---------------------------------- // preallocate the memory used to read the sequences //---------------------------------- Sequence * seq = malloc(sizeof(Sequence)); if (seq == NULL){ fputs("Out of memory trying to allocate Sequence\n",stderr); exit(1); } alloc_sequence(seq,max_read_length,LINE_MAX, cmd_line.quality_offset); int entry_length=0; boolean new_entry = true; boolean full_entry; do { switch (cmd_line.input_file_format) { case FASTQ: entry_length = read_sequence_from_fastq(fp,seq,max_read_length); break; case FASTA: entry_length = read_sequence_from_fasta(fp,seq,max_read_length,new_entry,&full_entry,0); new_entry = full_entry; break; } int i; for(i=0;i<entry_length;i++){ //printf("index %i char %c\n",i,seq->seq[i]); switch (seq->seq[i]) { case 'A': As++; break; case 'C': Cs++; break; case 'G': Gs++; break; case 'T': Ts++; break; case 'a': As++; break; case 'c': Cs++; break; case 'g': Gs++; break; case 't': Ts++; break; default: Us++; } } } while (entry_length>0); long long total = As + Cs + Gs + Ts + Us; printf("%qd As counted - %5.2f%%\n",As, (As*100.0)/total); printf("%qd Cs counted - %5.2f%%\n",Cs,(Cs*100.0)/total); printf("%qd Gs counted - %5.2f%%\n",Gs,(Gs*100.0)/total); printf("%qd Ts counted - %5.2f%%\n",Ts,(Ts*100.0)/total); printf("%qd Us counted - %5.2f%%\n",Us,(Us*100.0)/total); return 0; }