コード例 #1
0
ファイル: test_seq.c プロジェクト: homonecloco/PyroClean
/*
 * Checks shift_last_kmer_to_start_of_sequence() with a kmer size of 3.
 *
 * Reads up to 10 bases of the first entry of long_entries.fasta, asserts the
 * buffer starts with "ACG", calls the shift, and then asserts the buffer
 * starts with "TAC". Also asserts that the 10-base read did not consume the
 * whole entry (full_entry == false).
 */
void test_shift_last_kmer_to_start_of_sequence(){
  Sequence * seq = malloc(sizeof(Sequence));
  boolean full_entry;

  if (seq == NULL){
    fputs("Out of memory trying to allocate Sequence\n",stderr);
    exit(1);
  }
  //pre-allocate space where to read the sequences
  alloc_sequence(seq,200,LINE_MAX, 0);

  FILE* fp1 = fopen("../data/test/basic/long_entries.fasta", "r");
  // Without this check a missing fixture would hand a NULL stream to the
  // reader and to fclose(), which is undefined behaviour.
  if (fp1 == NULL){
    fputs("cannot open file: ../data/test/basic/long_entries.fasta\n",stderr);
    exit(1);
  }

  int length_seq = read_sequence_from_fasta(fp1,seq,10,true,&full_entry,0);

  CU_ASSERT(seq->seq[0]=='A');
  CU_ASSERT(seq->seq[1]=='C');
  CU_ASSERT(seq->seq[2]=='G');

  shift_last_kmer_to_start_of_sequence(seq, length_seq,3);
  CU_ASSERT(seq->seq[0]=='T');
  CU_ASSERT(seq->seq[1]=='A');
  CU_ASSERT(seq->seq[2]=='C');

  CU_ASSERT(full_entry==false);
  fclose(fp1);

  // Release the sequence buffers, as the other tests in this file do.
  free_sequence(&seq);
}
コード例 #2
0
/*
 * NOTE: only the set-up portion of this function is visible here; the body
 * is cut off after the nested FASTA reader is defined.
 *
 * The visible part sizes the read buffer at twice the graph's kmer size and
 * allocates the Sequence and KmerSlidingWindow scratch structures, calling
 * die() if an allocation fails.
 *
 * Judging by the name, the remainder presumably estimates a sequencing error
 * rate per colour from SNP alleles -- confirm against the full source.
 */
void estimate_seq_error_rate_from_snps_for_each_colour(char* colourlist_snp_alleles, 
                                                       GraphInfo* db_graph_info, 
                                                       dBGraph* db_graph, 
                                                       int ref_colour, 
                                                       //long long genome_size,
                                                       long double default_seq_err_rate, 
                                                       char* output_file)
{

  // Reads handled here are at most two kmers long.
  int max_read_length = 2*(db_graph->kmer_size);

  // ****** malloc and initialisation //
  
  //----------------------------------
  // allocate the memory used to read the sequences
  //----------------------------------
  Sequence * seq = malloc(sizeof(Sequence));
  if (seq == NULL){
    die("Out of memory trying to allocate Sequence");
  }
  alloc_sequence(seq,max_read_length,LINE_MAX);
  
  //We are going to load all the bases into a single sliding window 
  KmerSlidingWindow* kmer_window = malloc(sizeof(KmerSlidingWindow));
  if (kmer_window==NULL)
  {
    // NOTE(review): the message names a different function than the one we
    // are in -- looks copy-pasted.
    die("Failed to malloc kmer sliding window in align_list_of_fastaq_to_graph_and_print_coverages_in_all_colours. Exit.\n");
  }
  

  //  kmer_window->kmer = (BinaryKmer*) malloc(sizeof(BinaryKmer)*(max_read_length-db_graph->kmer_size-1));
  // One slot per kmer start position in a read of max_read_length bases:
  // max_read_length - kmer_size + 1.
  kmer_window->kmer = (BinaryKmer*) malloc(sizeof(BinaryKmer)*(max_read_length-db_graph->kmer_size+1));
  if (kmer_window->kmer==NULL)
  {
    // NOTE(review): same copy-pasted function name as above.
    die("Failed to malloc kmer_window->kmer in align_list_of_fastaq_to_graph_and_print_coverages_in_all_colours. Exit.\n");
  }
  // The window starts out empty.
  kmer_window->nkmers=0;
  
  


  // FASTA reader callback (GCC nested function; it reads db_graph from the
  // enclosing scope). Forwards to read_sequence_from_fasta(), choosing the
  // offset argument from new_entry: 0 for a new entry, the graph's kmer size
  // when continuing one. Presumably the offset preserves the kmer carried
  // over from the previous chunk -- confirm against read_sequence_from_fasta.
  int file_reader_fasta(FILE * fp, Sequence * seq, int max_read_length, boolean new_entry, boolean * full_entry){
    const int start_offset = (new_entry == false) ? db_graph->kmer_size : 0;
    const long long bases_read =
      read_sequence_from_fasta(fp,seq,max_read_length,new_entry,full_entry,start_offset);

    return bases_read;
  }
コード例 #3
0
/*
 * NOTE: only the set-up portion of this function is visible here; the body
 * is cut off after the nested FASTA reader is defined.
 *
 * The visible part declares counters and allocates the scratch structures
 * (allele info, Sequence, KmerSlidingWindow, coverage array, node and
 * orientation arrays), all sized from max_branch_len.
 *
 * Judging by the name and the output parameters, the remainder presumably
 * fills percentage_coverage / median_coverage for each of the NUM_PANELS
 * panel files -- confirm against the full source.
 */
void get_coverage_on_panels(int* percentage_coverage,int* median_coverage,
                            StrBuf** panel_file_paths,
                            int max_branch_len, dBGraph *db_graph,
                            int ignore_first, int ignore_last , int NUM_PANELS)

{
  int i;
  FILE* fp;
  AlleleInfo* ai = alloc_allele_info();
  int number_of_reads = 0;
  int number_of_covered_reads =0; 
  int num_kmers=0;
  Covg tot_pos_kmers;
  Covg tot_kmers;
  Covg med;  
  //----------------------------------
  // allocate the memory used to read the sequences
  //----------------------------------
  Sequence * seq = malloc(sizeof(Sequence));
  if (seq == NULL)
  {
    die("Out of memory trying to allocate Sequence");
  }
  alloc_sequence(seq,max_branch_len,LINE_MAX);
  //We are going to load all the bases into a single sliding window 
  KmerSlidingWindow* kmer_window = malloc(sizeof(KmerSlidingWindow));
  if (kmer_window==NULL)
  {
    die("Failed to malloc kmer sliding window");
  }
  CovgArray* working_ca = alloc_and_init_covg_array(max_branch_len);
  // NOTE(review): the results of the next two mallocs are not checked for
  // NULL anywhere in the visible code.
  dBNode** array_nodes = (dBNode**) malloc(sizeof(dBNode*)*max_branch_len);
  Orientation* array_or =(Orientation*)  malloc(sizeof(Orientation)*max_branch_len);
  // One slot per kmer start position in a sequence of max_branch_len bases.
  kmer_window->kmer = (BinaryKmer*) malloc(sizeof(BinaryKmer)*(max_branch_len-db_graph->kmer_size+1));
  if (kmer_window->kmer==NULL)
  {
    die("Failed to malloc kmer_window->kmer");
  }
  // The window starts out empty.
  kmer_window->nkmers=0;
  // FASTA reader callback (GCC nested function; it reads db_graph from the
  // enclosing scope). Calls read_sequence_from_fasta() with offset 0 for a
  // new entry and with the graph's kmer size when continuing an entry.
  int file_reader_fasta(FILE * fp, Sequence * seq, int max_read_length, boolean new_entry, boolean * full_entry)
  {
    int start_offset;
    long long bases_read;

    if (new_entry == false)
    {
      start_offset = db_graph->kmer_size;
    }
    else
    {
      start_offset = 0;
    }

    bases_read = read_sequence_from_fasta(fp,seq,max_read_length,new_entry,full_entry,start_offset);
    return bases_read;
  }
コード例 #4
0
ファイル: test_seq.c プロジェクト: homonecloco/PyroClean
/*
 * Exercises read_sequence_from_fastq() on four fixture files: one containing
 * a 100-base read among short ones, one with invalid base characters, one
 * whose quality string is longer than its sequence, and one with multi-line
 * reads.
 *
 * The assertions expect malformed reads to be skipped (the reader moves on to
 * the next valid read), a return value of 0 at end of file, and quality
 * values stored with the ascii offset (33) already subtracted.
 *
 * Each fixture is checked after fopen(): a missing file aborts the test run
 * with a message instead of passing a NULL stream to the reader.
 */
void test_read_sequence_from_fastq_with_bad_reads_and_long_reads()
{

  int ascii_offset=33;

  //pre-allocate space where to read the sequences
  Sequence* seq = malloc(sizeof(Sequence));
  if (seq==NULL){
    fputs("Out of memory trying to allocate a Sequence",stderr);
    exit(1);
  }

  int max_read_length=200;
  alloc_sequence(seq,max_read_length,LINE_MAX,ascii_offset);

  int length_seq;

  FILE* fp1 = fopen("../data/test/basic/includes_one_read_that_is_too_long.fastq", "r");
  if (fp1 == NULL){
    fputs("cannot open file: ../data/test/basic/includes_one_read_that_is_too_long.fastq\n",stderr);
    exit(1);
  }

  // @read1
  // ACGT
  // +
  // !!!!
  // @read2
  // CCCC
  // +
  // 5555
  // @read3
  // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
  // -
  // 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
  // @read4
  // ACGT
  // +
  // 3333

  length_seq = read_sequence_from_fastq(fp1,seq,max_read_length);
  CU_ASSERT_EQUAL(length_seq, 4);
  CU_ASSERT_STRING_EQUAL("read1",seq->name);
  CU_ASSERT_STRING_EQUAL("ACGT",seq->seq);
  CU_ASSERT((int) (seq->qual[0])==0);
  CU_ASSERT((int) (seq->qual[1])==0);
  CU_ASSERT((int) (seq->qual[2])==0);
  CU_ASSERT((int) (seq->qual[3])==0);

  length_seq = read_sequence_from_fastq(fp1,seq,max_read_length);
  CU_ASSERT_EQUAL(length_seq, 4);
  CU_ASSERT_STRING_EQUAL("read2",seq->name);
  CU_ASSERT_STRING_EQUAL("CCCC",seq->seq);
  CU_ASSERT((int) (seq->qual[0])==20);
  CU_ASSERT((int) (seq->qual[1])==20);
  CU_ASSERT((int) (seq->qual[2])==20);
  CU_ASSERT((int) (seq->qual[3])==20);

  length_seq = read_sequence_from_fastq(fp1,seq,max_read_length);
  CU_ASSERT_EQUAL(length_seq, 100);
  CU_ASSERT_STRING_EQUAL("read3",seq->name);
  CU_ASSERT((int) (seq->qual[0])==15); // 0 translates as ascii 48; subtract 33 and get 15

  length_seq = read_sequence_from_fastq(fp1,seq,max_read_length);
  CU_ASSERT_EQUAL(length_seq, 4);
  CU_ASSERT_STRING_EQUAL("read4",seq->name);
  CU_ASSERT_STRING_EQUAL("ACGT",seq->seq);
  CU_ASSERT((int) (seq->qual[0])==18);

  fclose(fp1);

  FILE* fp2 = fopen("../data/test/basic/includes_reads_with_bad_characters.fastq", "r");
  if (fp2 == NULL){
    fputs("cannot open file: ../data/test/basic/includes_reads_with_bad_characters.fastq\n",stderr);
    exit(1);
  }

  //@read1
  //ACGTACGTACGTACGT
  //+
  //WEW2WEW2WEW2WEWA
  //@read2
  //AAAA#5A
  //+
  //1234567
  //@read3
  //TTTT
  //+
  //3333

  length_seq = read_sequence_from_fastq(fp2,seq,max_read_length);
  CU_ASSERT_EQUAL(length_seq, 16);
  CU_ASSERT_STRING_EQUAL("read1",seq->name);
  CU_ASSERT_STRING_EQUAL("ACGTACGTACGTACGT",seq->seq);

  // read2 contains invalid characters, so the next read returned is read3
  length_seq = read_sequence_from_fastq(fp2,seq,max_read_length);
  CU_ASSERT_EQUAL(length_seq, 4);
  CU_ASSERT_STRING_EQUAL("read3",seq->name);
  CU_ASSERT_STRING_EQUAL("TTTT",seq->seq);

  length_seq = read_sequence_from_fastq(fp2,seq,max_read_length);
  CU_ASSERT_EQUAL(length_seq, 0);

  fclose(fp2);

  FILE* fp3 = fopen("../data/test/basic/includes_one_read_where_quality_is_longer_than_seq.fastq", "r");
  if (fp3 == NULL){
    fputs("cannot open file: ../data/test/basic/includes_one_read_where_quality_is_longer_than_seq.fastq\n",stderr);
    exit(1);
  }

  //@read1
  //ACGTACGTACGTACGT
  //+
  //WEW2WEW2WEW2WEWA
  //@read2
  //AAAA#5A
  //+
  //!!!!!!!!!!!!!!!!!!!!!!
  //@read3
  //TTTT
  //+
  //3333

  length_seq = read_sequence_from_fastq(fp3,seq,max_read_length);
  CU_ASSERT_EQUAL(length_seq, 16);
  CU_ASSERT_STRING_EQUAL("read1",seq->name);
  CU_ASSERT_STRING_EQUAL("ACGTACGTACGTACGT",seq->seq);

  // read2 is malformed, so the next read returned is read3
  length_seq = read_sequence_from_fastq(fp3,seq,max_read_length);
  CU_ASSERT_EQUAL(length_seq, 4);
  CU_ASSERT_STRING_EQUAL("read3",seq->name);
  CU_ASSERT_STRING_EQUAL("TTTT",seq->seq);

  length_seq = read_sequence_from_fastq(fp3,seq,max_read_length);
  CU_ASSERT_EQUAL(length_seq, 0);

  fclose(fp3);

  FILE* fp4 = fopen("../data/test/basic/includes_multiline_reads.fastq", "r");
  if (fp4 == NULL){
    fputs("cannot open file: ../data/test/basic/includes_multiline_reads.fastq\n",stderr);
    exit(1);
  }

  // @read1
  // ACGT
  // +
  // @@@@
  // @read2 45 bases
  // AAAAAAAAAAAAAAA
  // CCCCCCCCCCCCCCC
  // GGGGGGGGGGGGGGG
  // +
  // 222222222222222
  // 333333333333333
  // 444444444444444
  // @read3
  // TTT
  // -
  // ggg

  length_seq = read_sequence_from_fastq(fp4,seq,max_read_length);
  CU_ASSERT_EQUAL(length_seq, 4);
  CU_ASSERT_STRING_EQUAL("read1",seq->name);
  CU_ASSERT_STRING_EQUAL("ACGT",seq->seq);

  length_seq = read_sequence_from_fastq(fp4,seq,max_read_length);
  CU_ASSERT_EQUAL(length_seq, 45);
  CU_ASSERT_STRING_EQUAL("read2",seq->name);
  CU_ASSERT_STRING_EQUAL("AAAAAAAAAAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGG",seq->seq);

  length_seq = read_sequence_from_fastq(fp4,seq,max_read_length);
  CU_ASSERT_EQUAL(length_seq, 3);
  CU_ASSERT_STRING_EQUAL("read3",seq->name);
  CU_ASSERT_STRING_EQUAL("TTT",seq->seq);

  length_seq = read_sequence_from_fastq(fp4,seq,max_read_length);
  CU_ASSERT_EQUAL(length_seq, 0);

  fclose(fp4);

  free_sequence(&seq);
}
コード例 #5
0
ファイル: test_seq.c プロジェクト: homonecloco/PyroClean
/*
 * Basic checks of read_sequence_from_fastq(): a single-entry file and a
 * three-entry file whose last read spans two sequence lines.
 *
 * Asserts the returned length, the read name, the bases, and that quality
 * values are stored with the ascii offset (33) subtracted. A final call at
 * end of file must return 0.
 *
 * Both fixtures are checked after fopen(), and both streams are closed.
 */
void test_read_sequence_from_fastq(){

  int ascii_offset = 33;

  //pre-allocate space where to read the sequences
  Sequence* seq = malloc(sizeof(Sequence));
  if (seq==NULL){
    fputs("Out of memory trying to allocate a Sequence",stderr);
    exit(1);
  }

  alloc_sequence(seq,200,LINE_MAX, ascii_offset);

  int length_seq;
  FILE* fp1 = fopen("../data/test/basic/one_entry.fastq", "r");
  if (fp1 == NULL){
    fputs("cannot open file: ../data/test/basic/one_entry.fastq\n",stderr);
    exit(1);
  }

  // 1. Read from a simple fastq file:
  // >Zam
  // ACGT
  // +
  // &&&&

  length_seq = read_sequence_from_fastq(fp1,seq,1000);

  CU_ASSERT_EQUAL(length_seq, 4);
  CU_ASSERT_STRING_EQUAL("Zam",seq->name);
  CU_ASSERT_STRING_EQUAL("ACGT",seq->seq);
  // Zam says: this used to compare seq->qual against "&&&&"; changed when the
  // quality-reading code started subtracting the offset of 33.
  CU_ASSERT((int) seq->qual[0] == 5);

  // The first stream was previously left open (leaked file handle).
  fclose(fp1);

  FILE* fp2 = fopen("../data/test/basic/three_entries.fastq", "r");
  if (fp2 == NULL){
    fputs("cannot open file: ../data/test/basic/three_entries.fastq\n",stderr);
    exit(1);
  }

  //2. Read from fastq:

  // @Zam1
  //ACGT
  //+
  //&&&&
  //@Zam2
  //AAAAAAAA
  //+
  //!((/8F+,
  //@Zam3
  //ATATATAT
  //TTTTTTTTTT
  //-
  //(((((((+AAAAAABAAA

  length_seq = read_sequence_from_fastq(fp2,seq, 1000);

  CU_ASSERT_EQUAL(length_seq, 4);
  CU_ASSERT_STRING_EQUAL("Zam1",seq->name);
  CU_ASSERT_STRING_EQUAL("ACGT",seq->seq);
  // As above: quality is compared after the offset of 33 has been removed.
  CU_ASSERT((int) seq->qual[0] == 5);

  length_seq = read_sequence_from_fastq(fp2,seq,1000);

  CU_ASSERT_EQUAL(length_seq, 8);
  CU_ASSERT_STRING_EQUAL("Zam2",seq->name);
  CU_ASSERT_STRING_EQUAL("AAAAAAAA",seq->seq);
  CU_ASSERT((int) seq->qual[0] == 0);//! is quality 0 - take 33 off its ascii code, can check on http://www.asciitable.com/
  CU_ASSERT((int) seq->qual[1] == 7);// ( is quality 7
  CU_ASSERT((int) seq->qual[2] == 7);// ( is quality 7
  CU_ASSERT((int) seq->qual[3] == 14);// / is quality 14
  CU_ASSERT((int) seq->qual[4] == 23);// 8 is quality 23
  CU_ASSERT((int) seq->qual[5] == 37);// F is quality 37
  CU_ASSERT((int) seq->qual[6] == 10);// + is quality 10
  CU_ASSERT((int) seq->qual[7] == 11);// , is quality 11

  length_seq = read_sequence_from_fastq(fp2,seq,1000);

  CU_ASSERT_EQUAL(length_seq, 18);
  CU_ASSERT_STRING_EQUAL("Zam3",seq->name);
  CU_ASSERT_STRING_EQUAL("ATATATATTTTTTTTTTT",seq->seq);
  CU_ASSERT((int) seq->qual[0] == 7);// ( is quality 7

  length_seq = read_sequence_from_fastq(fp2,seq,1000);

  CU_ASSERT_EQUAL(length_seq, 0);

  fclose(fp2);
  free_sequence(&seq);
}
コード例 #6
0
ファイル: test_seq.c プロジェクト: homonecloco/PyroClean
/*
 * Test reading several short entries from a fasta file.
 *
 * Reads a single-entry file and a three-entry file with
 * read_sequence_from_fasta(), always as new entries with offset 0, and
 * asserts the returned length, name, start/end coordinates, bases and the
 * full_entry flag. A final call at end of file must return 0.
 *
 * Both fixtures are checked after fopen().
 */
void test_read_sequence_from_fasta(){

  Sequence * seq = malloc(sizeof(Sequence));
  boolean full_entry;

  if (seq == NULL){
    fputs("Out of memory trying to allocate Sequence\n",stderr);
    exit(1);
  }

  //pre-allocate space where to read the sequences
  alloc_sequence(seq,200,LINE_MAX, 0);

  int length_seq;
  FILE* fp1 = fopen("../data/test/basic/one_entry.fasta", "r");

  if (fp1 == NULL){
    fputs("cannot open file:../data/test/basic/one_entry.fasta\n",stderr);
    exit(1);
  }

  // 1. Read from simple fasta:
  // >Zam
  // ACGT
  // ACGTACGTACGT

  length_seq = read_sequence_from_fasta(fp1,seq,1000,true,&full_entry,0);

  CU_ASSERT_EQUAL(length_seq, 16);
  CU_ASSERT_STRING_EQUAL("Zam",seq->name);
  CU_ASSERT_EQUAL(1,seq->start);
  CU_ASSERT_EQUAL(16,seq->end);
  CU_ASSERT_STRING_EQUAL("ACGTACGTACGTACGT",seq->seq);
  CU_ASSERT(full_entry);
  fclose(fp1);

  FILE* fp2 = fopen("../data/test/basic/three_entries.fasta", "r");

  // Same guard as for fp1: do not read from a NULL stream.
  if (fp2 == NULL){
    fputs("cannot open file: ../data/test/basic/three_entries.fasta\n",stderr);
    exit(1);
  }

  // 2. Read from fasta:
  //>Zam1
  //ACGT
  //ACGTACGTACGT
  //>Zam2
  //ACGT
  //ACGTACGTACGT
  //TTTTTTTT
  //>Zam3
  //ACGTNNACGTACGTACGT

  length_seq = read_sequence_from_fasta(fp2,seq,1000,true,&full_entry,0);

  CU_ASSERT_EQUAL(length_seq, 16);
  CU_ASSERT_STRING_EQUAL("Zam1",seq->name);
  CU_ASSERT_EQUAL(1,seq->start);
  CU_ASSERT_EQUAL(16,seq->end);
  CU_ASSERT_STRING_EQUAL("ACGTACGTACGTACGT",seq->seq);
  CU_ASSERT(full_entry);

  length_seq = read_sequence_from_fasta(fp2,seq,1000,true,&full_entry,0);

  CU_ASSERT_EQUAL(length_seq, 24);
  CU_ASSERT_STRING_EQUAL("Zam2",seq->name);
  CU_ASSERT_EQUAL(1,seq->start);
  CU_ASSERT_EQUAL(24,seq->end);
  // NOTE(review): the expected string ends in a lower-case 't', unlike the
  // fixture listing above -- confirm against the actual file contents.
  CU_ASSERT_STRING_EQUAL("ACGTACGTACGTACGTTTTTTTTt",seq->seq);
  CU_ASSERT(full_entry);

  length_seq = read_sequence_from_fasta(fp2,seq,1000,true,&full_entry,0);

  CU_ASSERT_EQUAL(length_seq, 18);
  CU_ASSERT_STRING_EQUAL("Zam3",seq->name);
  CU_ASSERT_EQUAL(1,seq->start);
  CU_ASSERT_EQUAL(18,seq->end);
  CU_ASSERT_STRING_EQUAL("ACGTNNACGTACGTACGT",seq->seq);
  CU_ASSERT(full_entry);

  length_seq = read_sequence_from_fasta(fp2,seq,1000,true,&full_entry,0);

  CU_ASSERT_EQUAL(length_seq, 0);
  CU_ASSERT(full_entry);

  fclose(fp2);
  free_sequence(&seq);

}
コード例 #7
0
ファイル: test_seq.c プロジェクト: homonecloco/PyroClean
/*
 * Checks read_sequence_from_fasta() on fasta files containing reads with
 * invalid characters.
 *
 * As asserted below, the fasta reader still returns such reads (with their
 * raw length and name) rather than skipping them. The second file ends with a
 * bad read; the final call on it must return 0, i.e. the reader must not loop
 * forever at end of file.
 *
 * Both fixtures are checked after fopen().
 */
void test_read_sequence_from_fasta_when_file_has_bad_reads()
{

  int length_seq;
  Sequence * seq = malloc(sizeof(Sequence));
  boolean full_entry;

  if (seq == NULL){
    fputs("Out of memory trying to allocate Sequence\n",stderr);
    exit(1);
  }
  //pre-allocate space where to read the sequences
  int max_read_length=100;
  alloc_sequence(seq,max_read_length,LINE_MAX, 0);

  FILE* fp2= fopen("../data/test/basic/includes_reads_that_have_bad_characters.fasta", "r");
  if (fp2 == NULL){
    fputs("cannot open file: ../data/test/basic/includes_reads_that_have_bad_characters.fasta\n",stderr);
    exit(1);
  }

  // >read1
  // AAAAAAAAAAAA9
  // >read2
  // ¡€#9∞§¶#¶•#•#•#ª#ª#ª#ªº#º#º#º––––
  // >read3 4 c's
  // CCCC
  // >read4 10 Ts
  // TTTTTTTTTT
  // >read5
  // $
  // >read6
  // AAAAAAAAAAAAAAAAAA#A
  // >read7
  // AAA

  length_seq = read_sequence_from_fasta(fp2,seq,max_read_length,true,&full_entry,0);

  CU_ASSERT_EQUAL(length_seq, 13);
  CU_ASSERT_STRING_EQUAL("read1",seq->name);
  CU_ASSERT(full_entry);

  length_seq = read_sequence_from_fasta(fp2,seq,max_read_length,true,&full_entry,0);

  CU_ASSERT_EQUAL(length_seq, 63);
  CU_ASSERT_STRING_EQUAL("read2",seq->name);
  CU_ASSERT(full_entry);

  length_seq = read_sequence_from_fasta(fp2,seq,max_read_length,true,&full_entry,0);

  CU_ASSERT_EQUAL(length_seq, 4);
  CU_ASSERT_STRING_EQUAL("read3",seq->name);
  CU_ASSERT_STRING_EQUAL("CCCC",seq->seq);
  CU_ASSERT(full_entry);

  length_seq = read_sequence_from_fasta(fp2,seq,max_read_length,true,&full_entry,0);

  CU_ASSERT_EQUAL(length_seq, 10);
  CU_ASSERT_STRING_EQUAL("read4",seq->name);
  CU_ASSERT_STRING_EQUAL("TTTTTTTTTT",seq->seq);
  CU_ASSERT(full_entry);

  length_seq = read_sequence_from_fasta(fp2,seq,max_read_length,true,&full_entry,0);

  CU_ASSERT_EQUAL(length_seq, 1);
  CU_ASSERT_STRING_EQUAL("read5",seq->name);
  CU_ASSERT(full_entry);

  length_seq = read_sequence_from_fasta(fp2,seq,max_read_length,true,&full_entry,0);

  CU_ASSERT_EQUAL(length_seq, 20);
  CU_ASSERT_STRING_EQUAL("read6",seq->name);
  CU_ASSERT(full_entry);

  length_seq = read_sequence_from_fasta(fp2,seq,max_read_length,true,&full_entry,0);

  CU_ASSERT_EQUAL(length_seq, 3);
  CU_ASSERT_STRING_EQUAL("read7",seq->name);
  CU_ASSERT_STRING_EQUAL("AAA",seq->seq);
  CU_ASSERT(full_entry);

  length_seq = read_sequence_from_fasta(fp2,seq,max_read_length,true,&full_entry,0);
  CU_ASSERT_EQUAL(length_seq, 0);
  CU_ASSERT(full_entry);

  fclose(fp2);

  //now make sure we do not get trapped in an infinite loop if the last read of a file is bad

  FILE* fp3= fopen("../data/test/basic/includes_final_read_that_has_bad_characters.fasta", "r");
  if (fp3 == NULL){
    fputs("cannot open file: ../data/test/basic/includes_final_read_that_has_bad_characters.fasta\n",stderr);
    exit(1);
  }

  // >read1
  // AAAAAAAAAAAA9
  // >read2
  // ¡€#9∞§¶#¶•#•#•#ª#ª#ª#ªº#º#º#º––––
  // >read3 4 c's
  // CCCC
  // >read4 10 Ts
  // TTTTTTTTTT
  // >read5
  // $
  // >read6
  // AAAAAAAAAAAAAAAAAA#A

  length_seq = read_sequence_from_fasta(fp3,seq,max_read_length,true,&full_entry,0);

  CU_ASSERT_EQUAL(length_seq, 13);
  CU_ASSERT_STRING_EQUAL("read1",seq->name);
  CU_ASSERT(full_entry);

  length_seq = read_sequence_from_fasta(fp3,seq,max_read_length,true,&full_entry,0);

  CU_ASSERT_EQUAL(length_seq, 63);
  CU_ASSERT_STRING_EQUAL("read2",seq->name);
  CU_ASSERT(full_entry);

  length_seq = read_sequence_from_fasta(fp3,seq,max_read_length,true,&full_entry,0);

  CU_ASSERT_EQUAL(length_seq, 4);
  CU_ASSERT_STRING_EQUAL("CCCC",seq->seq);
  CU_ASSERT_STRING_EQUAL("read3",seq->name);
  CU_ASSERT(full_entry);

  length_seq = read_sequence_from_fasta(fp3,seq,max_read_length,true,&full_entry,0);

  CU_ASSERT_EQUAL(length_seq, 10);
  CU_ASSERT_STRING_EQUAL("read4",seq->name);
  CU_ASSERT_STRING_EQUAL("TTTTTTTTTT",seq->seq);
  CU_ASSERT(full_entry);

  length_seq = read_sequence_from_fasta(fp3,seq,max_read_length,true,&full_entry,0);

  CU_ASSERT_EQUAL(length_seq, 1);
  CU_ASSERT_STRING_EQUAL("read5",seq->name);
  CU_ASSERT(full_entry);

  length_seq = read_sequence_from_fasta(fp3,seq,max_read_length,true,&full_entry,0);

  CU_ASSERT_EQUAL(length_seq, 20);
  CU_ASSERT_STRING_EQUAL("read6",seq->name);
  CU_ASSERT(full_entry);

  length_seq = read_sequence_from_fasta(fp3,seq,max_read_length,true,&full_entry,0);

  CU_ASSERT_EQUAL(length_seq, 0);
  CU_ASSERT(full_entry);

  fclose(fp3);

  free_sequence(&seq);

}
コード例 #8
0
ファイル: test_seq.c プロジェクト: homonecloco/PyroClean
/*
 * Checks chunked reading of long fasta entries with
 * read_sequence_from_fasta().
 *
 * Entries are read in pieces by passing a small max length and then
 * continuing with new_entry == false. The assertions cover the returned
 * length, seq->start / seq->end coordinates, the entry name, the bases, and
 * the full_entry flag. One continuation uses offset 3 after overwriting the
 * first three bases with 'T', and expects them to be kept in front of the
 * newly read bases.
 *
 * The sequence buffers are released at the end, as in the other tests.
 */
void test_read_sequence_from_long_fasta(){

  Sequence * seq = malloc(sizeof(Sequence));
  boolean full_entry;

  if (seq == NULL){
    fputs("Out of memory trying to allocate Sequence\n",stderr);
    exit(1);
  }
  //pre-allocate space where to read the sequences
  alloc_sequence(seq,200,LINE_MAX, 0);

  int length_seq;
  FILE* fp1 = fopen("../data/test/basic/long_entries.fasta", "r");

  if (fp1 == NULL){
    fputs("cannot open file: ../data/test/basic/long_entries.fasta\n",stderr);
    exit(1);
  }

  // first 10 bases of the first entry
  length_seq = read_sequence_from_fasta(fp1,seq,10,true,&full_entry,0);
  CU_ASSERT_EQUAL(length_seq, 10);
  CU_ASSERT_EQUAL(seq->start,1);
  CU_ASSERT_EQUAL(seq->end,10);
  CU_ASSERT_STRING_EQUAL("Mario",seq->name);
  CU_ASSERT_STRING_EQUAL("ACGTACGTAC",seq->seq);
  CU_ASSERT(full_entry==false);

  // next 10 bases of the same entry
  length_seq = read_sequence_from_fasta(fp1,seq,10,false,&full_entry,0);
  CU_ASSERT_EQUAL(length_seq, 10);
  CU_ASSERT_EQUAL(seq->start,11);
  CU_ASSERT_EQUAL(seq->end,20);
  CU_ASSERT_STRING_EQUAL("Mario",seq->name);
  CU_ASSERT_STRING_EQUAL("GTACGTAAAA",seq->seq);

  // continue with offset 3: the three bases planted here must survive at the
  // front of the buffer, followed by the newly read bases
  seq->seq[0]='T';
  seq->seq[1]='T';
  seq->seq[2]='T';
  length_seq = read_sequence_from_fasta(fp1,seq,10,false,&full_entry,3);
  CU_ASSERT_EQUAL(length_seq, 10);
  CU_ASSERT_STRING_EQUAL("Mario",seq->name);
  CU_ASSERT_EQUAL(seq->start, 21);
  CU_ASSERT_EQUAL(seq->end,27);
  CU_ASSERT_STRING_EQUAL("TTTAAAAAAA",seq->seq);

  //finish off the entry
  length_seq = read_sequence_from_fasta(fp1,seq,1000,false,&full_entry,0);
  CU_ASSERT(full_entry == true);

  length_seq = read_sequence_from_fasta(fp1,seq,16,true,&full_entry,0);
  CU_ASSERT(full_entry == true);
  CU_ASSERT_EQUAL(seq->start, 1);
  CU_ASSERT_EQUAL(seq->end,16);
  CU_ASSERT_EQUAL(length_seq, 16);
  CU_ASSERT_STRING_EQUAL("Pepe",seq->name);

  length_seq = read_sequence_from_fasta(fp1,seq,3,true,&full_entry,0);
  CU_ASSERT(full_entry == false);
  CU_ASSERT_EQUAL(seq->start, 1);
  CU_ASSERT_EQUAL(seq->end,3);
  CU_ASSERT_EQUAL(length_seq, 3);
  CU_ASSERT_STRING_EQUAL("COCO",seq->name);
  CU_ASSERT_STRING_EQUAL("TTT",seq->seq);

  length_seq = read_sequence_from_fasta(fp1,seq,1000,false,&full_entry,0);
  CU_ASSERT(full_entry == true);
  CU_ASSERT_EQUAL(seq->start, 4);
  CU_ASSERT_EQUAL(seq->end,10);
  CU_ASSERT_EQUAL(length_seq,7);
  CU_ASSERT_STRING_EQUAL("COCO",seq->name);
  CU_ASSERT_STRING_EQUAL("TAAAATT",seq->seq);

  length_seq = read_sequence_from_fasta(fp1,seq,15,true,&full_entry,0);
  CU_ASSERT(full_entry == true);
  CU_ASSERT_EQUAL(seq->start, 1);
  CU_ASSERT_EQUAL(seq->end,15);
  CU_ASSERT_EQUAL(length_seq, 15);
  CU_ASSERT_STRING_EQUAL("CACHO",seq->name);
  CU_ASSERT_STRING_EQUAL("TTTTTTAAAGGATAT",seq->seq);

  // end of file
  length_seq = read_sequence_from_fasta(fp1,seq,15,true,&full_entry,0);
  CU_ASSERT(full_entry == true);
  CU_ASSERT_EQUAL(length_seq, 0);

  fclose(fp1);

  // Previously leaked: release the sequence buffers.
  free_sequence(&seq);

}
コード例 #9
0
/*
 * NOTE: only the set-up portion of this function is visible here; the body
 * is cut off after the nested FASTA reader is defined.
 *
 * The visible part rejects any format other than FASTA or FASTQ via die(),
 * then allocates the Sequence buffer (sized by max_read_length) and the
 * KmerSlidingWindow used to hold the kmers of one read.
 */
void align_list_of_fastaq_to_graph_and_print_coverages_in_all_colours(FileFormat format, char* list_of_fastaq, int max_read_length, 
								      int* array_of_colours, char** array_of_names_of_colours,
								      int num_of_colours, dBGraph* db_graph,int fastq_ascii_offset,
								      boolean is_for_testing, char** for_test_array_of_strings, int* for_test_index,
								      boolean mark_nodes_for_dumping)
{

  // Only FASTA and FASTQ input are supported.
  if ( (format != FASTA) && (format !=FASTQ) )
    {
      die("Calling align_list_of_fastaq_to_graph_and_print_coverages_in_all_colours "
          "with file format not set to fasta or fastq");
    }

  //For each file in list_of_fasta, go through the reads, and for each read,
  // print one  "coverage read" per colour (space separated)
  // e.g. for a read print
  //    >read_id colour 0
  //    coverages of each of the nodes in the ref (colour 0)
  //   >read_id colour 1
  //     ... covgs in colour 1
  //   >read_id colour 2
  //     ... 
	
  
  //----------------------------------
  // allocate the memory used to read the sequences
  //----------------------------------
  Sequence * seq = malloc(sizeof(Sequence));
  if (seq == NULL){
    die("Out of memory trying to allocate Sequence");
  }
  alloc_sequence(seq,max_read_length,LINE_MAX);
  
  //We are going to load all the bases into a single sliding window 
  KmerSlidingWindow* kmer_window = malloc(sizeof(KmerSlidingWindow));
  if (kmer_window==NULL)
    {
      die("Failed to malloc kmer sliding window in "
          "align_list_of_fastaq_to_graph_and_print_coverages_in_all_colours. Exit.");
    }
  

  //  kmer_window->kmer = (BinaryKmer*) malloc(sizeof(BinaryKmer)*(max_read_length-db_graph->kmer_size-1));
  // One slot per kmer start position in a read of max_read_length bases:
  // max_read_length - kmer_size + 1.
  kmer_window->kmer = (BinaryKmer*) malloc(sizeof(BinaryKmer)*(max_read_length-db_graph->kmer_size+1));
  if (kmer_window->kmer==NULL)
    {
      die("Failed to malloc kmer_window->kmer in "
          "align_list_of_fastaq_to_graph_and_print_coverages_in_all_colours. Exit.");
    }
  // The window starts out empty.
  kmer_window->nkmers=0;
  
  
  //end of intialisation 
	  
	  
  // FASTA reader callback (GCC nested function; it reads db_graph from the
  // enclosing scope). A new entry is read with offset 0; a continued entry is
  // read with an offset equal to the graph's kmer size.
  int file_reader_fasta(FILE * fp, Sequence * seq, int max_read_length, boolean new_entry, boolean * full_entry){
    int start_offset = 0;
    long long bases_read;

    if (new_entry == false){
      start_offset = db_graph->kmer_size;
    }

    bases_read = read_sequence_from_fasta(fp,seq,max_read_length,new_entry,full_entry,start_offset);
    return bases_read;
  }
コード例 #10
0
/*
 * NOTE: only the set-up portion of this test is visible here; the body is
 * cut off after the nested FASTA reader is defined.
 *
 * The visible part builds a k=31 graph from the sample1.fa.list fixture,
 * opens the panel1.fasta fixture, and allocates the GeneInfo, Sequence,
 * KmerSlidingWindow and coverage-array scratch structures, all sized for
 * genes of up to max_gene_len bases.
 */
void test_get_next_gene_info()
{
  
  uint16_t kmer_size = 31;
  int number_of_bits = 10;
  int bucket_size = 100;
  int max_retries = 10;


  // NOTE(review): unlike some other tests, the result is not checked for NULL
  // in the visible code.
  dBGraph *db_graph= hash_table_new(number_of_bits, bucket_size,
				    max_retries, kmer_size);

  int max_gene_len = 1500;
  // NOTE(review): the two calloc results are not checked in the visible code.
  uint64_t* kmer_covg_array = calloc(150, sizeof(uint64_t));
  uint64_t* readlen_array = calloc(max_gene_len, sizeof(uint64_t));

  StrBuf* list = strbuf_create("../data/test/myKrobe/predictor/gene_presence/sample1.fa.list");
  // The 150 passed below matches the length of kmer_covg_array.
  unsigned long long  num_bases = build_unclean_graph(db_graph, 
						      list, true,
						      kmer_size,
						      readlen_array, max_gene_len,
						      kmer_covg_array, 150,
						      false, 0);

  FILE* fp = fopen("../data/test/myKrobe/predictor/gene_presence/panel1.fasta", "r");
  if (fp==NULL)
    {
      die("Cannot open this file: ../data/test/myKrobe/predictor/gene_presence/panel1.fasta");
    }
  
  GeneInfo* gi = alloc_and_init_gene_info();


  //----------------------------------
  // allocate the memory used to read the sequences
  //----------------------------------
  Sequence * seq = malloc(sizeof(Sequence));
  if (seq == NULL){
    die("Out of memory trying to allocate Sequence");
  }
  alloc_sequence(seq,max_gene_len,LINE_MAX);
  
  //We are going to load all the bases into a single sliding window 
  KmerSlidingWindow* kmer_window = malloc(sizeof(KmerSlidingWindow));
  if (kmer_window==NULL)
    {
      die("Failed to malloc kmer sliding window");
    }
  

  // One slot per kmer start position in a gene of max_gene_len bases.
  kmer_window->kmer = (BinaryKmer*) malloc(sizeof(BinaryKmer)*(max_gene_len-db_graph->kmer_size+1));
  if (kmer_window->kmer==NULL)
    {
      die("Failed to malloc kmer_window->kmer");
    }
  // The window starts out empty.
  kmer_window->nkmers=0;
  

  //  int max_gene_len = 5000;
  CovgArray* working_ca = alloc_and_init_covg_array(max_gene_len);
  //end of intialisation 
	  
	  
  // FASTA reader callback (GCC nested function; it reads db_graph from the
  // enclosing scope). Delegates to read_sequence_from_fasta(), passing offset
  // 0 for a new entry and the graph's kmer size for a continued one.
  int file_reader_fasta(FILE * fp, Sequence * seq, int max_read_length, boolean new_entry, boolean * full_entry){
    const int start_offset = (new_entry == false) ? db_graph->kmer_size : 0;
    const long long bases_read =
      read_sequence_from_fasta(fp,seq,max_read_length,new_entry,full_entry,start_offset);

    return bases_read;
  }
コード例 #11
0
/*
 * NOTE: only the set-up portion of this test is visible here; the body is
 * cut off after the nested FASTA reader is defined.
 *
 * The visible part skips the test when binary kmers use more than one
 * bitfield, builds a k=7 graph, loads the pop_first_test.colours fixture into
 * colour 0, and allocates the Sequence and KmerSlidingWindow scratch
 * structures for reads of up to 2000 bases.
 */
void test_count_reads_where_snp_makes_clean_bubble1()
{
  // The test is only written for single-bitfield binary kmers.
  if(NUMBER_OF_BITFIELDS_IN_BINARY_KMER > 1)
  {
    warn("Test not configured for NUMBER_OF_BITFIELDS_IN_BINARY_KMER > 1\n");
    return;
  }

  //  simple test. Use this reference genome which I load into colour 0:

  /// file: data/test/genome_complexity/test_allele_clean_file1.fa

  //  >ref contains CAAGTTC CACGTTC and CAGGTTC
  //  CAAGTTCAGAGTTACTCACACCCGATCGATAAGCGGTACAGAGCACGTTCAGAAAAAAACAGGTTCAGA
  //  >ref + SNP
  //  TATCCATGTTCAGAGTTACTGACACCCGATCGATAAGCG


  //first set up the hash/graph
  int kmer_size = 7;
  int number_of_bits = 8;
  int bucket_size = 10;
  int max_retries = 10;

  dBGraph *hash_table = hash_table_new(number_of_bits, bucket_size,
                                       max_retries, kmer_size);

  if (hash_table==NULL)
    {
      die("Unable to alloc the hash table.");
    }

  // Read FASTA sequence
  int fq_quality_cutoff = 0;
  int homopolymer_cutoff = 0;
  boolean remove_duplicates_se = false;
  char ascii_fq_offset = 33;
  int into_colour = 0;

  // Counters passed by address to the loader below.
  unsigned int files_loaded = 0;
  unsigned long long bad_reads = 0, dup_reads = 0;
  unsigned long long seq_loaded = 0, seq_read = 0;

  load_se_filelist_into_graph_colour(
    "../data/test/genome_complexity/pop_first_test.colours",
    fq_quality_cutoff, homopolymer_cutoff,
    remove_duplicates_se, ascii_fq_offset,
    into_colour, hash_table, 1, // 0 => falist/fqlist; 1 => colourlist
    &files_loaded, &bad_reads, &dup_reads, &seq_read, &seq_loaded,
    NULL, 0, &subsample_null);

  //and use this file of reads: /data/test/genome_complexity/test_allele_clean_file2.fa
  //  >read lies entirely in graph defined by test_allele_clean_file1.fa, and is clean (forms supernode at k=7)
  //  GTTCAGAGTTACT
  //  >read lies in graph defined by test_allele_clean_file1.fa, but overlaps a junction so is not clean
  //  TTACTGACACCCGATCG
  //  >read lies in ref but any change to the 7th base results in an overlap with the ref, so
  //  GTTCAGAGTTACTCACA

  int col_genome=0;
  int reads_tested=0;
  int reads_where_snp_makes_clean_bubble = 0;
  // Fixed-size working arrays; 500 is smaller than max_read_length (2000)
  // below -- NOTE(review): confirm the unseen code never indexes past 500.
  dBNode* array_nodes[500];
  Orientation array_or[500];
  
    //----------------------------------
  // allocate the memory used to read the sequences
  //----------------------------------
  Sequence * seq = malloc(sizeof(Sequence));
  if (seq == NULL){
    die("Out of memory trying to allocate Sequence\n");
  }
  int max_read_length=2000;
  alloc_sequence(seq,max_read_length,LINE_MAX);
  
  //We are going to load all the bases into a single sliding window 
  KmerSlidingWindow* kmer_window = malloc(sizeof(KmerSlidingWindow));
  if (kmer_window==NULL)
    {
      die("Failed to malloc kmer sliding window in test. Exit.\n");
    }
  

  // NOTE(review): this allocates max_read_length - kmer_size - 1 slots, but a
  // read of max_read_length bases contains max_read_length - kmer_size + 1
  // kmers, so the buffer looks two slots short for a full-length read.
  // Harmless for the short reads in this fixture -- confirm and consider
  // changing -1 to +1.
  kmer_window->kmer = (BinaryKmer*) malloc(sizeof(BinaryKmer)*(max_read_length-hash_table->kmer_size-1));
  if (kmer_window->kmer==NULL)
    {
      die("Failed to malloc kmer_window->kmer in test. Exit.\n");
    }
  // The window starts out empty.
  kmer_window->nkmers=0;
  
  
  //end of intialisation 
	  
	  
  // FASTA reader callback (GCC nested function; it reads hash_table from the
  // enclosing scope). Passes offset 0 to read_sequence_from_fasta() for a new
  // entry, and the graph's kmer size when continuing an entry.
  int file_reader_fasta(FILE * fp, Sequence * seq, int max_read_length, boolean new_entry, boolean * full_entry){
    const int start_offset = (new_entry == false) ? hash_table->kmer_size : 0;
    const long long bases_read =
      read_sequence_from_fasta(fp,seq,max_read_length,new_entry,full_entry,start_offset);

    return bases_read;
  }
コード例 #12
0
ファイル: content.c プロジェクト: richardmleggett/metacortex
/*
 * Counts the bases in a FASTA or FASTQ file and prints, for A, C, G, T
 * (upper or lower case) and for every other character ("Us"), the count and
 * its percentage of all characters read.
 *
 * The input file name, file format and quality offset come from the parsed
 * command line. Exits with status 1 if the file cannot be opened or the
 * Sequence cannot be allocated; returns 0 otherwise.
 */
int main(int argc, char **argv){
  CmdLine cmd_line = parse_cmdline(argc,argv,sizeof(Element));
  long long As=0;
  long long Cs=0;
  long long Gs=0;
  long long Ts=0;
  long long Us=0;

  int max_read_length = 1000;

  FILE* fp = fopen(cmd_line.input_filename, "r");
  if (fp == NULL){
    fprintf(stderr,"cannot open file:%s\n",cmd_line.input_filename);
    exit(1); //TODO - prefer to print warning and skip file and return an error code?
  }


  //----------------------------------
  // preallocate the memory used to read the sequences
  //----------------------------------
  Sequence * seq = malloc(sizeof(Sequence));
  if (seq == NULL){
    fputs("Out of memory trying to allocate Sequence\n",stderr);
    exit(1);
  }
  alloc_sequence(seq,max_read_length,LINE_MAX, cmd_line.quality_offset);

  int entry_length=0;
  boolean new_entry = true;
  boolean full_entry = true; // initialised so it is never read indeterminate
  do {

    switch (cmd_line.input_file_format) {
    case FASTQ:
      entry_length = read_sequence_from_fastq(fp,seq,max_read_length);
      break;

    case FASTA:
      entry_length = read_sequence_from_fasta(fp,seq,max_read_length,new_entry,&full_entry,0);
      // If the entry did not fit in one chunk, the next call continues it.
      new_entry = full_entry;
      break;

    default:
      // Unknown format: nothing can be read, so let the loop terminate.
      entry_length = 0;
      break;
    }

    int i;
    for(i=0;i<entry_length;i++){
      switch (seq->seq[i])
	{
	case 'A':
	case 'a':
	  As++;
	  break;
	case 'C':
	case 'c':
	  Cs++;
	  break;
	case 'G':
	case 'g':
	  Gs++;
	  break;
	case 'T':
	case 't':
	  Ts++;
	  break;
	default:
	  Us++;
	  break;
	}
    }

  } while (entry_length>0);

  fclose(fp);

  long long total = As + Cs + Gs + Ts + Us;

  // Avoid 0/0 (printed as "nan") when the input held no bases at all; every
  // count is 0 in that case, so each percentage prints as 0.00.
  double denom = (total > 0) ? (double) total : 1.0;

  // %lld is the standard conversion for long long; %qd is a BSD extension.
  printf("%lld As counted - %5.2f%%\n",As,(As*100.0)/denom);
  printf("%lld Cs counted - %5.2f%%\n",Cs,(Cs*100.0)/denom);
  printf("%lld Gs counted - %5.2f%%\n",Gs,(Gs*100.0)/denom);
  printf("%lld Ts counted - %5.2f%%\n",Ts,(Ts*100.0)/denom);
  printf("%lld Us counted - %5.2f%%\n",Us,(Us*100.0)/denom);

  return 0;
}