Exemplo n.º 1
0
int main(int argc, char **argv)
{
  setvbuf(stdout, NULL, _IOLBF, 0);
  CmdLine* cmd_line = cmd_line_alloc();
  if (cmd_line==NULL)
    {
      return -1;
    }
  parse_cmdline(cmd_line, argc, argv, sizeof(Element));
  dBGraph * db_graph = NULL;
  //Create the de Bruijn graph/hash table
  int max_retries=15;
  db_graph = hash_table_new(cmd_line->mem_height,
          cmd_line->mem_width,
          max_retries, 
          cmd_line->kmer_size);
  if (db_graph==NULL)
    {
      return -1;
    }
  //some setup
  /* Wrapper around read_sequence_from_fasta(): a fresh entry is read with
   * offset 0, a continued entry (new_entry == false) with the graph's
   * kmer_size as offset. Returns whatever the underlying reader returns. */
  int file_reader_fasta(FILE * fp, Sequence * seq, int max_read_length, boolean new_entry, boolean * full_entry){
    const int offset = (new_entry == false) ? db_graph->kmer_size : 0;

    return read_sequence_from_fasta(fp, seq, max_read_length, new_entry, full_entry, offset);
  }
Exemplo n.º 2
0
/* Reads the first chunk (at most 10 bases) of long_entries.fasta, checks that
 * it starts with "ACG", then calls shift_last_kmer_to_start_of_sequence() with
 * a length of 3 and checks that seq->seq now starts with "TAC".
 * The process exits with status 1 if memory or the fixture file is missing. */
void test_shift_last_kmer_to_start_of_sequence(){ 
  Sequence * seq = malloc(sizeof(Sequence));
  boolean full_entry;

  if (seq == NULL){
    fputs("Out of memory trying to allocate Sequence\n",stderr);
    exit(1);
  }
  //pre-allocate space where to read the sequences
  alloc_sequence(seq,200,LINE_MAX, 0);

  FILE* fp1 = fopen("../data/test/basic/long_entries.fasta", "r");

  // Without the fixture the reader (and fclose) would be handed a NULL stream.
  if (fp1 == NULL){
    fputs("cannot open file: ../data/test/basic/long_entries.fasta\n",stderr);
    exit(1);
  }

  int length_seq = read_sequence_from_fasta(fp1,seq,10,true,&full_entry,0);

  CU_ASSERT(seq->seq[0]=='A');
  CU_ASSERT(seq->seq[1]=='C');
  CU_ASSERT(seq->seq[2]=='G');
  
  shift_last_kmer_to_start_of_sequence(seq, length_seq,3);
  CU_ASSERT(seq->seq[0]=='T');
  CU_ASSERT(seq->seq[1]=='A');
  CU_ASSERT(seq->seq[2]=='C');
  
  // Only a chunk of the entry was requested, so the entry is not complete yet.
  CU_ASSERT(full_entry==false);
  fclose(fp1);
}
Exemplo n.º 3
0
/*
 * Reads FASTA entries from the input, converts each one according to
 * arg.conv (BASE_TO_SOLID or SOLID_TO_BASE) and prints the converted entry
 * as FASTA to the output.
 *
 * The input and output default to "-" before converter_parse() runs; "-"
 * selects stdin / stdout respectively, anything else is opened as a file.
 *
 * Returns 0 on success and 1 when a file cannot be opened or the output
 * file cannot be closed cleanly.
 */
int main(int argc, char **argv)
{
    FILE * in;
    FILE * out;
    boolean full_entry;
    int status = 0;
    int seq_len = MAX_SEQ_LENGTH ;//To make this parametizable in the future. 
    //Sequence * sequence_new(int max_read_length, int max_name_length, char offset);
    Sequence * seq_in = sequence_new(seq_len, 300, 0); //TODO: make this parametizable. 
    Sequence * seq_out = sequence_new(seq_len, 300, 0);
    struct arguments arg;
    arg.input_file = "-";
    arg.output_file = "-";
    arg.conv = BASE_TO_SOLID;
    
    converter_parse(argc, argv, &arg);
    
    if(strcmp(arg.input_file, "-") == 0){
        in = stdin;
    }else{
        in = fopen(arg.input_file, "r");
        if(in == NULL){
            fprintf(stderr, "Unable to open file %s to read.", arg.input_file);
            // Carrying on would pass a NULL stream to the reader below.
            return 1;
        }
    }
    
    if(strcmp(arg.output_file, "-") == 0){
        out = stdout;
    }else{
        out = fopen(arg.output_file, "w");
        if(out == NULL){
            if(in != stdin){
                fclose(in);
            }
            fprintf(stderr, "Unable to open file %s for writing", arg.output_file);
            // The input has just been closed and there is nowhere to write to.
            return 1;
        }
    }
    ////this routine can read long sequences (eg full chromosomes) , this is implemented by reading the sequence in chunks
    //int read_sequence_from_fasta(FILE * fp, Sequence * seq, int max_chunk_length, boolean new_entry, boolean * full_entry, int offset);
    
    while (read_sequence_from_fasta(in, seq_in, seq_len,  true,&full_entry ,0) > 0){
        if(arg.conv == BASE_TO_SOLID){
            base_sequence_to_cs_sequence(seq_in, seq_out);
        }else if(arg.conv == SOLID_TO_BASE){
            // Echo the entry being converted on stderr before converting it.
            fflush(stdout);
            fprintf(stderr, "input seq:\n");
            sequence_print_fasta(stderr, seq_in);
            fprintf(stderr, "out seq:\n");
            fflush(stderr);
            fflush(stdout);
            cs_sequence_to_base_sequence(seq_in, seq_out);
        }
        sequence_set_name(seq_in->name, seq_out);//TODO: use accessor
        sequence_print_fasta(out, seq_out);
    }

    // Release the streams this function opened; stdin/stdout are left alone.
    if(in != stdin){
        fclose(in);
    }
    if(out != stdout && fclose(out) != 0){
        // A failed close on a written file means buffered output was lost.
        fprintf(stderr, "Error closing file %s", arg.output_file);
        status = 1;
    }
    return status;
}
Exemplo n.º 4
0
void estimate_seq_error_rate_from_snps_for_each_colour(char* colourlist_snp_alleles, 
                                                       GraphInfo* db_graph_info, 
                                                       dBGraph* db_graph, 
                                                       int ref_colour, 
                                                       //long long genome_size,
                                                       long double default_seq_err_rate, 
                                                       char* output_file)
{

  int max_read_length = 2*(db_graph->kmer_size);

  // ****** malloc and initialisation //
  
  //----------------------------------
  // allocate the memory used to read the sequences
  //----------------------------------
  Sequence * seq = malloc(sizeof(Sequence));
  if (seq == NULL){
    die("Out of memory trying to allocate Sequence");
  }
  alloc_sequence(seq,max_read_length,LINE_MAX);
  
  //We are going to load all the bases into a single sliding window 
  KmerSlidingWindow* kmer_window = malloc(sizeof(KmerSlidingWindow));
  if (kmer_window==NULL)
  {
    die("Failed to malloc kmer sliding window in align_list_of_fastaq_to_graph_and_print_coverages_in_all_colours. Exit.\n");
  }
  

  //  kmer_window->kmer = (BinaryKmer*) malloc(sizeof(BinaryKmer)*(max_read_length-db_graph->kmer_size-1));
  kmer_window->kmer = (BinaryKmer*) malloc(sizeof(BinaryKmer)*(max_read_length-db_graph->kmer_size+1));
  if (kmer_window->kmer==NULL)
  {
    die("Failed to malloc kmer_window->kmer in align_list_of_fastaq_to_graph_and_print_coverages_in_all_colours. Exit.\n");
  }
  kmer_window->nkmers=0;
  
  


  //create file reader
  /* Wrapper around read_sequence_from_fasta(): a fresh entry is read with
   * offset 0, a continued entry (new_entry == false) with the graph's
   * kmer_size as offset. Returns whatever the underlying reader returns. */
  int file_reader_fasta(FILE * fp, Sequence * seq, int max_read_length, boolean new_entry, boolean * full_entry){
    const int offset = (new_entry == false) ? db_graph->kmer_size : 0;

    return read_sequence_from_fasta(fp, seq, max_read_length, new_entry, full_entry, offset);
  }
Exemplo n.º 5
0
void get_coverage_on_panels(int* percentage_coverage,int* median_coverage,
                            StrBuf** panel_file_paths,
                            int max_branch_len, dBGraph *db_graph,
                            int ignore_first, int ignore_last , int NUM_PANELS)

{
  int i;
  FILE* fp;
  AlleleInfo* ai = alloc_allele_info();
  int number_of_reads = 0;
  int number_of_covered_reads =0; 
  int num_kmers=0;
  Covg tot_pos_kmers;
  Covg tot_kmers;
  Covg med;  
  //----------------------------------
  // allocate the memory used to read the sequences
  //----------------------------------
  Sequence * seq = malloc(sizeof(Sequence));
  if (seq == NULL)
  {
    die("Out of memory trying to allocate Sequence");
  }
  alloc_sequence(seq,max_branch_len,LINE_MAX);
  //We are going to load all the bases into a single sliding window 
  KmerSlidingWindow* kmer_window = malloc(sizeof(KmerSlidingWindow));
  if (kmer_window==NULL)
  {
    die("Failed to malloc kmer sliding window");
  }
  CovgArray* working_ca = alloc_and_init_covg_array(max_branch_len);
  dBNode** array_nodes = (dBNode**) malloc(sizeof(dBNode*)*max_branch_len);
  Orientation* array_or =(Orientation*)  malloc(sizeof(Orientation)*max_branch_len);
  kmer_window->kmer = (BinaryKmer*) malloc(sizeof(BinaryKmer)*(max_branch_len-db_graph->kmer_size+1));
  if (kmer_window->kmer==NULL)
  {
    die("Failed to malloc kmer_window->kmer");
  }
  kmer_window->nkmers=0;
  //create file readers
  /* Wrapper around read_sequence_from_fasta(): a fresh entry is read with
   * offset 0, a continued entry (new_entry == false) with the graph's
   * kmer_size as offset. Returns whatever the underlying reader returns. */
  int file_reader_fasta(FILE * fp, Sequence * seq, int max_read_length, boolean new_entry, boolean * full_entry)
  {
    const int offset = (new_entry == false) ? db_graph->kmer_size : 0;

    return read_sequence_from_fasta(fp, seq, max_read_length, new_entry, full_entry, offset);
  }
Exemplo n.º 6
0
// Test reading several short entries from fasta files: first a file holding
// one entry, then a file holding three entries followed by end of file.
// Every read is checked for the returned length, the entry name, the
// start/end positions, the bases and the full_entry flag.
// The process exits with status 1 if memory or a fixture file is missing.
void test_read_sequence_from_fasta(){

  Sequence * seq = malloc(sizeof(Sequence));
  boolean full_entry;

  if (seq == NULL){
    fputs("Out of memory trying to allocate Sequence\n",stderr);
    exit(1);
  }

  //pre-allocate space where to read the sequences
  alloc_sequence(seq,200,LINE_MAX, 0);

  int length_seq;
  FILE* fp1 = fopen("../data/test/basic/one_entry.fasta", "r");
  
  if (fp1 == NULL){
    fputs("cannot open file:../data/test/basic/one_entry.fasta\n",stderr);
    exit(1);
  }

  // 1. Read from simple fasta:
  // >Zam
  // ACGT
  // ACGTACGTACGT
  
  length_seq = read_sequence_from_fasta(fp1,seq,1000,true,&full_entry,0);

  CU_ASSERT_EQUAL(length_seq, 16);
  CU_ASSERT_STRING_EQUAL("Zam",seq->name);
  CU_ASSERT_EQUAL(1,seq->start);
  CU_ASSERT_EQUAL(16,seq->end);
  CU_ASSERT_STRING_EQUAL("ACGTACGTACGTACGT",seq->seq);
  CU_ASSERT(full_entry);
  fclose(fp1);

  FILE* fp2 = fopen("../data/test/basic/three_entries.fasta", "r");

  // Same guard as for fp1: never hand a NULL stream to the reader.
  if (fp2 == NULL){
    fputs("cannot open file:../data/test/basic/three_entries.fasta\n",stderr);
    exit(1);
  }

  // 2. Read from fasta:
  //>Zam1
  //ACGT
  //ACGTACGTACGT
  //>Zam2
  //ACGT
  //ACGTACGTACGT
  //TTTTTTTT
  //>Zam3
  //ACGTNNACGTACGTACGT

  length_seq = read_sequence_from_fasta(fp2,seq,1000,true,&full_entry,0);
  
  CU_ASSERT_EQUAL(length_seq, 16);
  CU_ASSERT_STRING_EQUAL("Zam1",seq->name);
  CU_ASSERT_EQUAL(1,seq->start);
  CU_ASSERT_EQUAL(16,seq->end);
  CU_ASSERT_STRING_EQUAL("ACGTACGTACGTACGT",seq->seq);
  CU_ASSERT(full_entry);

  length_seq = read_sequence_from_fasta(fp2,seq,1000,true,&full_entry,0);
  
  CU_ASSERT_EQUAL(length_seq, 24);
  CU_ASSERT_STRING_EQUAL("Zam2",seq->name);
  CU_ASSERT_EQUAL(1,seq->start);
  CU_ASSERT_EQUAL(24,seq->end);
  CU_ASSERT_STRING_EQUAL("ACGTACGTACGTACGTTTTTTTTt",seq->seq);
  CU_ASSERT(full_entry);
   

  length_seq = read_sequence_from_fasta(fp2,seq,1000,true,&full_entry,0);

  
  CU_ASSERT_EQUAL(length_seq, 18);
  CU_ASSERT_STRING_EQUAL("Zam3",seq->name);
  CU_ASSERT_EQUAL(1,seq->start);
  CU_ASSERT_EQUAL(18,seq->end);
  CU_ASSERT_STRING_EQUAL("ACGTNNACGTACGTACGT",seq->seq);
  CU_ASSERT(full_entry);

  // A further read after the last entry is expected to return 0.
  length_seq = read_sequence_from_fasta(fp2,seq,1000,true,&full_entry,0);

  CU_ASSERT_EQUAL(length_seq, 0);
  CU_ASSERT(full_entry);
   
  fclose(fp2);
  free_sequence(&seq);

}
Exemplo n.º 7
0
/* Reads, entry by entry, two fasta files that contain reads with non-base
 * characters and checks the returned length and name of every entry (plus
 * the bases for the clean entries). The second file ends with a bad read,
 * to make sure reading terminates with a 0-length read instead of looping.
 * The process exits with status 1 if memory or a fixture file is missing. */
void test_read_sequence_from_fasta_when_file_has_bad_reads()
{

  int length_seq;
  Sequence * seq = malloc(sizeof(Sequence));
  boolean full_entry;
  
  if (seq == NULL){
    fputs("Out of memory trying to allocate Sequence\n",stderr);
    exit(1);
  }
  //pre-allocate space where to read the sequences
  int max_read_length=100;
  alloc_sequence(seq,max_read_length,LINE_MAX, 0);
 
  FILE* fp2= fopen("../data/test/basic/includes_reads_that_have_bad_characters.fasta", "r");

  // Never hand a NULL stream to the reader when the fixture is missing.
  if (fp2 == NULL){
    fputs("cannot open file: ../data/test/basic/includes_reads_that_have_bad_characters.fasta\n",stderr);
    exit(1);
  }

  // >read1
  // AAAAAAAAAAAA9
  // >read2
  // ¡€#9∞§¶#¶•#•#•#ª#ª#ª#ªº#º#º#º––––
  // >read3 4 c's
  // CCCC
  // >read4 10 Ts
  // TTTTTTTTTT
  // >read5
  // $
  // >read6
  // AAAAAAAAAAAAAAAAAA#A
  // >read7
  // AAA

  length_seq = read_sequence_from_fasta(fp2,seq,max_read_length,true,&full_entry,0);
  
  CU_ASSERT_EQUAL(length_seq, 13);
  CU_ASSERT_STRING_EQUAL("read1",seq->name);
  CU_ASSERT(full_entry);

  length_seq = read_sequence_from_fasta(fp2,seq,max_read_length,true,&full_entry,0);

  CU_ASSERT_EQUAL(length_seq, 63);
  CU_ASSERT_STRING_EQUAL("read2",seq->name);
  CU_ASSERT(full_entry);

  length_seq = read_sequence_from_fasta(fp2,seq,max_read_length,true,&full_entry,0);
  
  CU_ASSERT_EQUAL(length_seq, 4);
  CU_ASSERT_STRING_EQUAL("read3",seq->name);
  CU_ASSERT_STRING_EQUAL("CCCC",seq->seq);
  CU_ASSERT(full_entry);

  length_seq = read_sequence_from_fasta(fp2,seq,max_read_length,true,&full_entry,0);
  
  CU_ASSERT_EQUAL(length_seq, 10);
  CU_ASSERT_STRING_EQUAL("read4",seq->name);
  CU_ASSERT_STRING_EQUAL("TTTTTTTTTT",seq->seq);
  CU_ASSERT(full_entry);

  length_seq = read_sequence_from_fasta(fp2,seq,max_read_length,true,&full_entry,0);
  
  CU_ASSERT_EQUAL(length_seq, 1);
  CU_ASSERT_STRING_EQUAL("read5",seq->name);
  CU_ASSERT(full_entry);

  length_seq = read_sequence_from_fasta(fp2,seq,max_read_length,true,&full_entry,0);
 
  CU_ASSERT_EQUAL(length_seq, 20);
  CU_ASSERT_STRING_EQUAL("read6",seq->name);
  CU_ASSERT(full_entry);

  length_seq = read_sequence_from_fasta(fp2,seq,max_read_length,true,&full_entry,0);
 
  CU_ASSERT_EQUAL(length_seq, 3);
  CU_ASSERT_STRING_EQUAL("read7",seq->name);
  CU_ASSERT_STRING_EQUAL("AAA",seq->seq);
  CU_ASSERT(full_entry);

  // A further read after the last entry is expected to return 0.
  length_seq = read_sequence_from_fasta(fp2,seq,max_read_length,true,&full_entry,0);
  CU_ASSERT_EQUAL(length_seq, 0);
  CU_ASSERT(full_entry);
   
  fclose(fp2);


  //now make sure we do not get trapped in an infinite loop if the last read of a file is bad

  FILE* fp3= fopen("../data/test/basic/includes_final_read_that_has_bad_characters.fasta", "r");

  // Same guard for the second fixture.
  if (fp3 == NULL){
    fputs("cannot open file: ../data/test/basic/includes_final_read_that_has_bad_characters.fasta\n",stderr);
    exit(1);
  }

  // >read1
  // AAAAAAAAAAAA9
  // >read2
  // ¡€#9∞§¶#¶•#•#•#ª#ª#ª#ªº#º#º#º––––
  // >read3 4 c's
  // CCCC
  // >read4 10 Ts
  // TTTTTTTTTT
  // >read5
  // $
  // >read6
  // AAAAAAAAAAAAAAAAAA#A


  length_seq = read_sequence_from_fasta(fp3,seq,max_read_length,true,&full_entry,0);

  CU_ASSERT_EQUAL(length_seq, 13);
  CU_ASSERT_STRING_EQUAL("read1",seq->name);
  CU_ASSERT(full_entry);

  length_seq = read_sequence_from_fasta(fp3,seq,max_read_length,true,&full_entry,0);
  
  CU_ASSERT_EQUAL(length_seq, 63);
  CU_ASSERT_STRING_EQUAL("read2",seq->name);
  CU_ASSERT(full_entry);

  length_seq = read_sequence_from_fasta(fp3,seq,max_read_length,true,&full_entry,0);
  
  CU_ASSERT_EQUAL(length_seq, 4);
  CU_ASSERT_STRING_EQUAL("CCCC",seq->seq);
  CU_ASSERT_STRING_EQUAL("read3",seq->name);
  CU_ASSERT(full_entry);
  
  length_seq = read_sequence_from_fasta(fp3,seq,max_read_length,true,&full_entry,0);
  
  CU_ASSERT_EQUAL(length_seq, 10);
  CU_ASSERT_STRING_EQUAL("read4",seq->name);
  CU_ASSERT_STRING_EQUAL("TTTTTTTTTT",seq->seq);
  CU_ASSERT(full_entry);

  length_seq = read_sequence_from_fasta(fp3,seq,max_read_length,true,&full_entry,0);

  CU_ASSERT_EQUAL(length_seq, 1);
  CU_ASSERT_STRING_EQUAL("read5",seq->name);
  CU_ASSERT(full_entry);

  length_seq = read_sequence_from_fasta(fp3,seq,max_read_length,true,&full_entry,0);

  
  CU_ASSERT_EQUAL(length_seq, 20);
  CU_ASSERT_STRING_EQUAL("read6",seq->name);
  CU_ASSERT(full_entry);

  // The bad final read must still be followed by a clean 0-length read.
  length_seq = read_sequence_from_fasta(fp3,seq,max_read_length,true,&full_entry,0);

  CU_ASSERT_EQUAL(length_seq, 0);
  CU_ASSERT(full_entry);

  fclose(fp3);



  free_sequence(&seq);

}
Exemplo n.º 8
0
/* Exercises read_sequence_from_fasta() on long_entries.fasta when an entry is
 * read in several chunks: a first call with new_entry == true followed by
 * calls with new_entry == false, and one call with a non-zero offset.
 * The reads below are stateful and must stay in this order.
 * The process exits with status 1 if memory or the fixture file is missing.
 * NOTE(review): seq is never released in this test - confirm whether
 * free_sequence(&seq) should be called before returning. */
void test_read_sequence_from_long_fasta(){

  Sequence * seq = malloc(sizeof(Sequence));
  boolean full_entry;

  if (seq == NULL){							
    fputs("Out of memory trying to allocate Sequence\n",stderr);	
    exit(1);								
  }
  //pre-allocate space where to read the sequences
  alloc_sequence(seq,200,LINE_MAX, 0);

  int length_seq;
  FILE* fp1 = fopen("../data/test/basic/long_entries.fasta", "r");
  
  if (fp1 == NULL){							
    fputs("cannot open file: ../data/test/basic/long_entries.fasta\n",stderr);	
    exit(1);								
  }
  
  // First chunk of entry "Mario": 10 bases requested, entry not finished yet.
  length_seq = read_sequence_from_fasta(fp1,seq,10,true,&full_entry,0);
  CU_ASSERT_EQUAL(length_seq, 10);
  CU_ASSERT_EQUAL(seq->start,1);
  CU_ASSERT_EQUAL(seq->end,10);
  CU_ASSERT_STRING_EQUAL("Mario",seq->name);
  CU_ASSERT_STRING_EQUAL("ACGTACGTAC",seq->seq);
  CU_ASSERT(full_entry==false);

  // Second chunk of the same entry (new_entry == false): positions 11..20.
  length_seq = read_sequence_from_fasta(fp1,seq,10,false,&full_entry,0);
  CU_ASSERT_EQUAL(length_seq, 10);
  CU_ASSERT_EQUAL(seq->start,11);
  CU_ASSERT_EQUAL(seq->end,20);
  CU_ASSERT_STRING_EQUAL("Mario",seq->name);
  CU_ASSERT_STRING_EQUAL("GTACGTAAAA",seq->seq);
 
  // Third chunk with offset 3: the first three bases are overwritten by hand
  // with 'T' and are expected to survive the read; the assertions expect the
  // returned length to still be 10 while start/end advance by 7 (21..27).
  seq->seq[0]='T';
  seq->seq[1]='T';
  seq->seq[2]='T';
  length_seq = read_sequence_from_fasta(fp1,seq,10,false,&full_entry,3);
  CU_ASSERT_EQUAL(length_seq, 10);
  CU_ASSERT_STRING_EQUAL("Mario",seq->name);
  CU_ASSERT_EQUAL(seq->start, 21);
  CU_ASSERT_EQUAL(seq->end,27);
  CU_ASSERT_STRING_EQUAL("TTTAAAAAAA",seq->seq);

  //finish off the entry
  length_seq = read_sequence_from_fasta(fp1,seq,1000,false,&full_entry,0); 
  CU_ASSERT(full_entry == true);
  
  // Entry "Pepe": read in one call of 16 bases and reported as a full entry.
  length_seq = read_sequence_from_fasta(fp1,seq,16,true,&full_entry,0); 
  CU_ASSERT(full_entry == true); 
  CU_ASSERT_EQUAL(seq->start, 1); 
  CU_ASSERT_EQUAL(seq->end,16); 
  CU_ASSERT_EQUAL(length_seq, 16); 
  CU_ASSERT_STRING_EQUAL("Pepe",seq->name);
  
  // Entry "COCO": first 3 bases only, then the remaining 7 in a second call.
  length_seq = read_sequence_from_fasta(fp1,seq,3,true,&full_entry,0); 
  CU_ASSERT(full_entry == false);
  CU_ASSERT_EQUAL(seq->start, 1);
  CU_ASSERT_EQUAL(seq->end,3);
  CU_ASSERT_EQUAL(length_seq, 3);
  CU_ASSERT_STRING_EQUAL("COCO",seq->name);
  CU_ASSERT_STRING_EQUAL("TTT",seq->seq);

  length_seq = read_sequence_from_fasta(fp1,seq,1000,false,&full_entry,0);
  CU_ASSERT(full_entry == true);
  CU_ASSERT_EQUAL(seq->start, 4);
  CU_ASSERT_EQUAL(seq->end,10);
  CU_ASSERT_EQUAL(length_seq,7);
  CU_ASSERT_STRING_EQUAL("COCO",seq->name);
  CU_ASSERT_STRING_EQUAL("TAAAATT",seq->seq);

  // Entry "CACHO": 15 bases read in a single call.
  length_seq = read_sequence_from_fasta(fp1,seq,15,true,&full_entry,0);
  CU_ASSERT(full_entry == true);
  CU_ASSERT_EQUAL(seq->start, 1);
  CU_ASSERT_EQUAL(seq->end,15);
  CU_ASSERT_EQUAL(length_seq, 15);
  CU_ASSERT_STRING_EQUAL("CACHO",seq->name);
  CU_ASSERT_STRING_EQUAL("TTTTTTAAAGGATAT",seq->seq);


  // A further read after the last entry is expected to return 0.
  length_seq = read_sequence_from_fasta(fp1,seq,15,true,&full_entry,0);
  CU_ASSERT(full_entry == true);
  CU_ASSERT_EQUAL(length_seq, 0);

  fclose(fp1);


}
Exemplo n.º 9
0
int main(int argc, char **argv)
{
  setvbuf(stdout, NULL, _IOLBF, 0);




  CmdLine* cmd_line = cmd_line_alloc();
  if (cmd_line==NULL)
    {
      return -1;
    }
    // VERSION_STR is passed from the makefile -- usually last commit hash

  parse_cmdline(cmd_line, argc,argv,sizeof(Element));

  if (cmd_line->format==Stdout){
    printf("myKrobe.predictor for Staphylococcus, version %d.%d.%d.%d"VERSION_STR"\n",
           VERSION, SUBVERSION, SUBSUBVERSION, SUBSUBSUBVERSION);  
  }

  dBGraph * db_graph = NULL;



  boolean (*subsample_function)();
  
  //local func
  boolean subsample_as_specified()
  {
    double ran = drand48();
    if (ran <= cmd_line->subsample_propn)
      {
	return true;
      }
    return false;
  }
  //end of local func
  
  if (cmd_line->subsample==true)
    {
      subsample_function = &subsample_as_specified;
    }
  else
    {
      subsample_function = &subsample_null;
    }
  





  //  int lim = cmd_line->max_expected_sup_len;
    /* CovgArray* working_ca_for_median=alloc_and_init_covg_array(lim);//will die if fails to alloc
  if (working_ca_for_median==NULL)
    {
      return -1;
    }*/

  //Create the de Bruijn graph/hash table
  int max_retries=15;
  db_graph = hash_table_new(cmd_line->mem_height,
			    cmd_line->mem_width,
			    max_retries, 
			    cmd_line->kmer_size);
  if (db_graph==NULL)
    {
      return -1;
    }

  //some setup
  /* Wrapper around read_sequence_from_fasta(): a fresh entry is read with
   * offset 0, a continued entry (new_entry == false) with the graph's
   * kmer_size as offset. Returns whatever the underlying reader returns. */
  int file_reader_fasta(FILE * fp, Sequence * seq, int max_read_length, boolean new_entry, boolean * full_entry){
    const int offset = (new_entry == false) ? db_graph->kmer_size : 0;

    return read_sequence_from_fasta(fp, seq, max_read_length, new_entry, full_entry, offset);
  }
void align_list_of_fastaq_to_graph_and_print_coverages_in_all_colours(FileFormat format, char* list_of_fastaq, int max_read_length, 
								      int* array_of_colours, char** array_of_names_of_colours,
								      int num_of_colours, dBGraph* db_graph,int fastq_ascii_offset,
								      boolean is_for_testing, char** for_test_array_of_strings, int* for_test_index,
								      boolean mark_nodes_for_dumping)
{

  if ( (format != FASTA) && (format !=FASTQ) )
    {
      die("Calling align_list_of_fastaq_to_graph_and_print_coverages_in_all_colours "
          "with file format not set to fasta or fastq");
    }

  //For each file in list_of_fasta, go through the reads, and for each read,
  // print one  "coverage read" per colour (space separated)
  // e.g. for a read print
  //    >read_id colour 0
  //    coverages of each of the nodes in the ref (colour 0)
  //   >read_id colour 1
  //     ... covgs in colour 1
  //   >read_id colour 2
  //     ... 
	
  
  //----------------------------------
  // allocate the memory used to read the sequences
  //----------------------------------
  Sequence * seq = malloc(sizeof(Sequence));
  if (seq == NULL){
    die("Out of memory trying to allocate Sequence");
  }
  alloc_sequence(seq,max_read_length,LINE_MAX);
  
  //We are going to load all the bases into a single sliding window 
  KmerSlidingWindow* kmer_window = malloc(sizeof(KmerSlidingWindow));
  if (kmer_window==NULL)
    {
      die("Failed to malloc kmer sliding window in "
          "align_list_of_fastaq_to_graph_and_print_coverages_in_all_colours. Exit.");
    }
  

  //  kmer_window->kmer = (BinaryKmer*) malloc(sizeof(BinaryKmer)*(max_read_length-db_graph->kmer_size-1));
  kmer_window->kmer = (BinaryKmer*) malloc(sizeof(BinaryKmer)*(max_read_length-db_graph->kmer_size+1));
  if (kmer_window->kmer==NULL)
    {
      die("Failed to malloc kmer_window->kmer in "
          "align_list_of_fastaq_to_graph_and_print_coverages_in_all_colours. Exit.");
    }
  kmer_window->nkmers=0;
  
  
  //end of intialisation 
	  
	  
  //create file readers
  /* Wrapper around read_sequence_from_fasta(): a fresh entry is read with
   * offset 0, a continued entry (new_entry == false) with the graph's
   * kmer_size as offset. Returns whatever the underlying reader returns. */
  int file_reader_fasta(FILE * fp, Sequence * seq, int max_read_length, boolean new_entry, boolean * full_entry){
    const int offset = (new_entry == false) ? db_graph->kmer_size : 0;

    return read_sequence_from_fasta(fp, seq, max_read_length, new_entry, full_entry, offset);
  }
void test_get_next_gene_info()
{
  
  uint16_t kmer_size = 31;
  int number_of_bits = 10;
  int bucket_size = 100;
  int max_retries = 10;


  dBGraph *db_graph= hash_table_new(number_of_bits, bucket_size,
				    max_retries, kmer_size);

  int max_gene_len = 1500;
  uint64_t* kmer_covg_array = calloc(150, sizeof(uint64_t));
  uint64_t* readlen_array = calloc(max_gene_len, sizeof(uint64_t));

  StrBuf* list = strbuf_create("../data/test/myKrobe/predictor/gene_presence/sample1.fa.list");
  unsigned long long  num_bases = build_unclean_graph(db_graph, 
						      list, true,
						      kmer_size,
						      readlen_array, max_gene_len,
						      kmer_covg_array, 150,
						      false, 0);

  FILE* fp = fopen("../data/test/myKrobe/predictor/gene_presence/panel1.fasta", "r");
  if (fp==NULL)
    {
      die("Cannot open this file: ../data/test/myKrobe/predictor/gene_presence/panel1.fasta");
    }
  
  GeneInfo* gi = alloc_and_init_gene_info();


  //----------------------------------
  // allocate the memory used to read the sequences
  //----------------------------------
  Sequence * seq = malloc(sizeof(Sequence));
  if (seq == NULL){
    die("Out of memory trying to allocate Sequence");
  }
  alloc_sequence(seq,max_gene_len,LINE_MAX);
  
  //We are going to load all the bases into a single sliding window 
  KmerSlidingWindow* kmer_window = malloc(sizeof(KmerSlidingWindow));
  if (kmer_window==NULL)
    {
      die("Failed to malloc kmer sliding window");
    }
  

  kmer_window->kmer = (BinaryKmer*) malloc(sizeof(BinaryKmer)*(max_gene_len-db_graph->kmer_size+1));
  if (kmer_window->kmer==NULL)
    {
      die("Failed to malloc kmer_window->kmer");
    }
  kmer_window->nkmers=0;
  

  //  int max_gene_len = 5000;
  CovgArray* working_ca = alloc_and_init_covg_array(max_gene_len);
  //end of intialisation 
	  
	  
  //create file readers
  /* Wrapper around read_sequence_from_fasta(): a fresh entry is read with
   * offset 0, a continued entry (new_entry == false) with the graph's
   * kmer_size as offset. Returns whatever the underlying reader returns. */
  int file_reader_fasta(FILE * fp, Sequence * seq, int max_read_length, boolean new_entry, boolean * full_entry){
    const int offset = (new_entry == false) ? db_graph->kmer_size : 0;

    return read_sequence_from_fasta(fp, seq, max_read_length, new_entry, full_entry, offset);
  }
Exemplo n.º 12
0
void test_count_reads_where_snp_makes_clean_bubble1()
{
  if(NUMBER_OF_BITFIELDS_IN_BINARY_KMER > 1)
  {
    warn("Test not configured for NUMBER_OF_BITFIELDS_IN_BINARY_KMER > 1\n");
    return;
  }

  //  simple test. Use this refernece genome which I load into colour 0:

  /// file: data/test/genome_complexity/test_allele_clean_file1.fa

  //  >ref contains CAAGTTC CACGTTC and CAGGTTC
  //  CAAGTTCAGAGTTACTCACACCCGATCGATAAGCGGTACAGAGCACGTTCAGAAAAAAACAGGTTCAGA
  //  >ref + SNP
  //  TATCCATGTTCAGAGTTACTGACACCCGATCGATAAGCG


  //first set up the hash/graph
  int kmer_size = 7;
  int number_of_bits = 8;
  int bucket_size = 10;
  int max_retries = 10;

  dBGraph *hash_table = hash_table_new(number_of_bits, bucket_size,
                                       max_retries, kmer_size);

  if (hash_table==NULL)
    {
      die("Unable to alloc the hash table.");
    }

  // Read FASTA sequence
  int fq_quality_cutoff = 0;
  int homopolymer_cutoff = 0;
  boolean remove_duplicates_se = false;
  char ascii_fq_offset = 33;
  int into_colour = 0;

  unsigned int files_loaded = 0;
  unsigned long long bad_reads = 0, dup_reads = 0;
  unsigned long long seq_loaded = 0, seq_read = 0;

  load_se_filelist_into_graph_colour(
    "../data/test/genome_complexity/pop_first_test.colours",
    fq_quality_cutoff, homopolymer_cutoff,
    remove_duplicates_se, ascii_fq_offset,
    into_colour, hash_table, 1, // 0 => falist/fqlist; 1 => colourlist
    &files_loaded, &bad_reads, &dup_reads, &seq_read, &seq_loaded,
    NULL, 0, &subsample_null);

  //and use this file of reads: /data/test/genome_complexity/test_allele_clean_file2.fa
  //  >read lies entirely in graph defined by test_allele_clean_file1.fa, and is clean (forms supernode at k=7)
  //  GTTCAGAGTTACT
  //  >read lies in graph defined by test_allele_clean_file1.fa, but overlaps a junction so is not clean
  //  TTACTGACACCCGATCG
  //  >read lies in ref but any change to the 7th base results in an overlap with the ref, so
  //  GTTCAGAGTTACTCACA

  int col_genome=0;
  int reads_tested=0;
  int reads_where_snp_makes_clean_bubble = 0;
  dBNode* array_nodes[500];
  Orientation array_or[500];
  
    //----------------------------------
  // allocate the memory used to read the sequences
  //----------------------------------
  Sequence * seq = malloc(sizeof(Sequence));
  if (seq == NULL){
    die("Out of memory trying to allocate Sequence\n");
  }
  int max_read_length=2000;
  alloc_sequence(seq,max_read_length,LINE_MAX);
  
  //We are going to load all the bases into a single sliding window 
  KmerSlidingWindow* kmer_window = malloc(sizeof(KmerSlidingWindow));
  if (kmer_window==NULL)
    {
      die("Failed to malloc kmer sliding window in test. Exit.\n");
    }
  

  kmer_window->kmer = (BinaryKmer*) malloc(sizeof(BinaryKmer)*(max_read_length-hash_table->kmer_size-1));
  if (kmer_window->kmer==NULL)
    {
      die("Failed to malloc kmer_window->kmer in test. Exit.\n");
    }
  kmer_window->nkmers=0;
  
  
  //end of intialisation 
	  
	  
  //create file readers
  /* Wrapper around read_sequence_from_fasta(): a fresh entry is read with
   * offset 0, a continued entry (new_entry == false) with the hash table's
   * kmer_size as offset. Returns whatever the underlying reader returns. */
  int file_reader_fasta(FILE * fp, Sequence * seq, int max_read_length, boolean new_entry, boolean * full_entry){
    const int offset = (new_entry == false) ? hash_table->kmer_size : 0;

    return read_sequence_from_fasta(fp, seq, max_read_length, new_entry, full_entry, offset);
  }
Exemplo n.º 13
0
/*
 * Counts the bases in the FASTA or FASTQ file named on the command line and
 * prints, for A, C, G, T (either case) and "U" (every other character), the
 * count and its percentage of all characters counted.
 *
 * Returns 0 on success; exits with status 1 if the input file cannot be
 * opened or the Sequence cannot be allocated.
 */
int main(int argc, char **argv){
  CmdLine cmd_line = parse_cmdline(argc,argv,sizeof(Element));
  long long As=0;
  long long Cs=0;
  long long Gs=0;
  long long Ts=0;
  long long Us=0;

  int max_read_length = 1000;

  FILE* fp = fopen(cmd_line.input_filename, "r"); 
  if (fp == NULL){
    fprintf(stderr,"cannot open file:%s\n",cmd_line.input_filename);
    exit(1); //TODO - prefer to print warning and skip file and return an error code?
  }


  //----------------------------------
  // preallocate the memory used to read the sequences
  //----------------------------------
  Sequence * seq = malloc(sizeof(Sequence));
  if (seq == NULL){
    fputs("Out of memory trying to allocate Sequence\n",stderr);
    exit(1);
  }
  alloc_sequence(seq,max_read_length,LINE_MAX, cmd_line.quality_offset);
  
  int entry_length=0;
  boolean new_entry = true;
  boolean full_entry;
  do {

    switch (cmd_line.input_file_format) {
    case FASTQ:
      entry_length = read_sequence_from_fastq(fp,seq,max_read_length);
      break;
      
    case FASTA:
      entry_length = read_sequence_from_fasta(fp,seq,max_read_length,new_entry,&full_entry,0);
      // A partially read entry is continued by the next call.
      new_entry = full_entry;
      break;

    default:
      // Unknown format: nothing can be read, so stop the loop.
      entry_length = 0;
      break;
    }

    int i;
    for(i=0;i<entry_length;i++){
      //printf("index %i char %c\n",i,seq->seq[i]);

      switch (seq->seq[i])
        {
        case 'A':
        case 'a':
          As++;
          break;
        case 'C':
        case 'c':
          Cs++;
          break;
        case 'G':
        case 'g':
          Gs++;
          break;
        case 'T':
        case 't':
          Ts++;
          break;
        default:
          Us++;
        }
      
    }
    
  } while (entry_length>0);

  fclose(fp);
  
  long long total = As + Cs + Gs + Ts + Us;

  // With nothing counted every numerator is 0; dividing by 1 instead of 0
  // reports 0.00% rather than a NaN.
  double denom = (total > 0) ? (double) total : 1.0;

  // %lld is the standard conversion for long long (%qd is a BSD extension).
  printf("%lld As counted - %5.2f%%\n",As,(As*100.0)/denom);
  printf("%lld Cs counted - %5.2f%%\n",Cs,(Cs*100.0)/denom);
  printf("%lld Gs counted - %5.2f%%\n",Gs,(Gs*100.0)/denom);
  printf("%lld Ts counted - %5.2f%%\n",Ts,(Ts*100.0)/denom);
  printf("%lld Us counted - %5.2f%%\n",Us,(Us*100.0)/denom);
  
  return 0;
}