// Read exactly `size` bytes from `fh` (through the shared `buffer`) into `ptr`.
//
// On success the global byte counter `num_bytes_read` is advanced by the number
// of bytes read.  On a short read the failure is reported (naming the header
// field via `entry_name`), the kmer statistics collected so far are printed and
// the process exits with EXIT_FAILURE -- this function does not return then.
static void my_fread(FILE *fh, void *ptr, int size, const char* entry_name)
{
  const int got = fread_buf(fh, ptr, size, buffer);
  // int got = fread(ptr, 1, size, fh);

  // Happy path: everything requested arrived
  if(got == size)
  {
    num_bytes_read += got;
    return;
  }

  // Short read: treated as fatal
  report_error("Couldn't read '%s': expected %li; recieved: %li; (fatal)\n",
               entry_name, (long)size, (long)got);

  // Close off any kmer listing that was in progress before printing stats
  if(print_kmers)
  {
    printf("----\n");
  }

  print_kmer_stats();
  exit(EXIT_FAILURE);
}
Exemple #2
0
/*
 * kmer_contamination entry point.
 *
 * Prints a banner, parses the command line, loads the kmer table, loads the
 * read coverage into it, then prints the kmer statistics and the contaminated
 * kmers histogram.  Always returns 0.
 */
int main(int argc, char **argv)
{
    /* Use the locale configured in the environment. */
    setlocale(LC_ALL, "");

    /* Banner: tool name, version-control macros and build timestamp. */
    log_and_screen_printf("\nkmer_contamination.\n\n");
    log_and_screen_printf(SVN_VERSION);
    log_and_screen_printf(SVN_COMMIT_DATE);
    log_and_screen_printf("Compiled on %s at %s \n\n", __DATE__, __TIME__);

    /* Command line options; the element size is handed to the parser. */
    KmerStatsCmdLine options = parse_cmdline(argc, argv, sizeof(Element));

    /* Load the kmer table first, then the read coverage on top of it. */
    KmerHash *table = load_kmer_table(options);
    log_and_screen_printf("Kmers readed\n");
    load_reads_coverage_table(options, table);

    /* Reports. */
    print_kmer_stats(&options, table);
    print_contaminated_kmers_histogram(&options, table);

    log_and_screen_printf("\nDONE");
    return 0;
}
// Allocate `num_bytes` of heap memory, or report an error and exit with
// EXIT_FAILURE if the allocation fails.  A request for zero bytes is rounded
// up to one byte so that the returned pointer is never NULL (malloc(0) is
// allowed to return NULL, which must not be mistaken for out-of-memory).
static void* malloc_or_die(size_t num_bytes)
{
  void *mem = malloc(num_bytes > 0 ? num_bytes : 1);

  if(mem == NULL)
  {
    report_error("Out of memory");
    exit(EXIT_FAILURE);
  }

  return mem;
}

// Entry point of the CORTEX binary checker.
//
// Usage (as parsed below): zero or more of the flags --print_info,
// --print_kmers, --parse_kmers, followed by the path of the binary file as the
// last argument.
//
// The header is read and sanity checked (magic word, version, kmer size,
// bitfields, colours, shades, per-colour metadata, closing magic word).  If
// kmers are to be parsed or printed, every kmer record is then read and
// checked (oversized kmers, repeated all-zero kmers, zero coverage kmers) and
// the totals are compared against the expected number of kmers.
//
// Problems are reported through report_error() / report_warning(); only
// unreadable input, a bad magic word and allocation failure terminate the
// program with EXIT_FAILURE.  Otherwise the program exits with EXIT_SUCCESS.
int main(int argc, char** argv)
{
  char* filepath;

  if(argc < 2)
  {
    // NOTE(review): the code below assumes print_usage() does not return;
    // otherwise argv[argc-1] would not be a file path -- confirm
    print_usage();
  }
  else if(argc > 2)
  {
    // Explicit flags were given: start from "everything off"
    print_info = 0;
    print_kmers = 0;
    parse_kmers = 0;

    int i;

    // All arguments except the last one (the file path) must be known flags
    for(i = 1; i < argc-1; i++)
    {
      if(strcasecmp(argv[i], "--print_info") == 0)
      {
        print_info = 1;
      }
      else if(strcasecmp(argv[i], "--print_kmers") == 0)
      {
        print_kmers = 1;
      }
      else if(strcasecmp(argv[i], "--parse_kmers") == 0)
      {
        // Parsing kmers implies printing the header info
        print_info = 1;
        parse_kmers = 1;
      }
      else
        print_usage();
    }
  }

  filepath = argv[argc-1];

  if(print_info)
    printf("Loading file: %s\n", filepath);

  file_size = get_file_size(filepath);

  // The file holds raw binary data: open in binary mode so that no newline
  // translation is applied on platforms that distinguish text and binary
  FILE* fh = fopen(filepath, "rb");

  if(fh == NULL)
  {
    report_error("cannot open file '%s'\n", filepath);
    exit(EXIT_FAILURE);
  }

  // file_size == -1 is treated as "size unknown"
  if(file_size != -1 && print_info)
  {
    char str[31];
    bytes_to_str(file_size, 0, str);
    printf("File size: %s\n", str);
  }

  buffer = buffer_new(BUFFER_SIZE);

  /*
  // Check sizes
  printf("-- Datatypes --\n");
  printf("int: %i\n", (int)sizeof(int));
  printf("long: %i\n", (int)sizeof(long));
  printf("long long: %i\n", (int)sizeof(long long));
  printf("double: %i\n", (int)sizeof(double));
  printf("long double: %i\n", (int)sizeof(long double));
  */

  if(print_info)
    printf("----\n");

  unsigned int i;

  // Read magic word at the start of header
  char magic_word[7];
  magic_word[6] = '\0';

  my_fread(fh, magic_word, strlen("CORTEX"), "Magic word");

  if(strcmp(magic_word, "CORTEX") != 0)
  {
    fprintf(stderr, "Magic word doesn't match 'CORTEX' (start)\n");
    exit(EXIT_FAILURE);
  }

  // Read version number
  my_fread(fh, &version, sizeof(uint32_t), "binary version");
  my_fread(fh, &kmer_size, sizeof(uint32_t), "kmer size");
  my_fread(fh, &num_of_bitfields, sizeof(uint32_t), "number of bitfields");
  my_fread(fh, &num_of_colours, sizeof(uint32_t), "number of colours");

  if(print_info)
  {
    printf("binary version: %i\n", (int)version);
    printf("kmer size: %i\n", (int)kmer_size);
    printf("bitfields: %i\n", (int)num_of_bitfields);
    printf("colours: %i\n", (int)num_of_colours);
  }

  // From version 7 the header also stores the kmer count and shade count
  if(version >= 7)
  {
    my_fread(fh, &expected_num_of_kmers, sizeof(uint64_t), "number of kmers");
    my_fread(fh, &num_of_shades, sizeof(uint32_t), "number of shades");

    if(print_info)
    {
      char tmp[256];
      printf("kmers: %s\n", ulong_to_str(expected_num_of_kmers,tmp));
      printf("shades: %i\n", (int)num_of_shades);
    }
  }

  // Checks (reported, but parsing carries on afterwards)

  if(version > 7 || version < 4)
    report_error("Sorry, we only support binary versions 4, 5, 6 & 7\n");

  if(kmer_size % 2 == 0)
    report_error("kmer size is not an odd number\n");

  if(kmer_size < 3)
    report_error("kmer size is less than three\n");

  if(num_of_bitfields * 32 < kmer_size)
    report_error("Not enough bitfields for kmer size\n");

  if((num_of_bitfields-1)*32 >= kmer_size)
    report_error("using more than the minimum number of bitfields\n");

  if(num_of_colours == 0)
    report_error("number of colours is zero\n");

  // x & (x-1) is non-zero exactly when x has more than one bit set
  if(num_of_shades != 0 && (num_of_shades & (num_of_shades-1)))
    report_error("number of shades is not a power of 2\n");

  //

  // Read array of mean read lengths per colour
  uint32_t *mean_read_lens_per_colour
    = malloc_or_die(num_of_colours*sizeof(uint32_t));

  my_fread(fh, mean_read_lens_per_colour, sizeof(uint32_t) * num_of_colours,
           "mean read length for each colour");

  // Read array of total seq loaded per colour
  uint64_t *total_seq_loaded_per_colour
    = malloc_or_die(num_of_colours*sizeof(uint64_t));

  my_fread(fh, total_seq_loaded_per_colour, sizeof(uint64_t) * num_of_colours,
           "total sequance loaded for each colour");

  for(i = 0; i < num_of_colours; i++)
  {
    sum_of_seq_loaded += total_seq_loaded_per_colour[i];
  }

  // From version 6 the header also stores sample names, sequencing error
  // rates and cleaning information for every colour
  if(version >= 6)
  {
    sample_names = malloc_or_die(sizeof(char*) * num_of_colours);

    for(i = 0; i < num_of_colours; i++)
    {
      uint32_t str_length;
      my_fread(fh, &str_length, sizeof(uint32_t), "sample name length");

      if(str_length == 0)
      {
        sample_names[i] = NULL;
      }
      else
      {
        // Widen before adding one so a huge length from the file cannot wrap
        // around to a zero byte allocation
        sample_names[i] = malloc_or_die((size_t)str_length + 1);
        my_fread(fh, sample_names[i], str_length, "sample name");
        sample_names[i][str_length] = '\0';

        // Check sample length is as long as we were told
        size_t sample_name_len = strlen(sample_names[i]);

        if(sample_name_len != str_length)
        {
          // Premature \0 in string
          // (casts make the arguments match the %lu conversions)
          report_warning("Sample %i name has length %lu but is only %lu chars "
                         "long (premature '\\0')\n",
                         i, (unsigned long)str_length,
                         (unsigned long)sample_name_len);
        }
      }
    }

    seq_error_rates = malloc_or_die(sizeof(long double) * num_of_colours);
    my_fread(fh, seq_error_rates, sizeof(long double) * num_of_colours,
             "seq error rates");

    cleaning_infos = malloc_or_die(sizeof(CleaningInfo) * num_of_colours);

    for(i = 0; i < num_of_colours; i++)
    {
      // Four single byte flags ...
      my_fread(fh, &(cleaning_infos[i].tip_cleaning), 1, "tip cleaning");
      my_fread(fh, &(cleaning_infos[i].remove_low_covg_supernodes), 1,
               "remove low covg supernodes");
      my_fread(fh, &(cleaning_infos[i].remove_low_covg_kmers), 1,
               "remove low covg kmers");
      my_fread(fh, &(cleaning_infos[i].cleaned_against_graph), 1,
               "cleaned against graph");

      // ... followed by two 32 bit thresholds
      my_fread(fh, &(cleaning_infos[i].remove_low_covg_supernodes_thresh),
               sizeof(int32_t), "remove low covg supernode threshold");
    
      my_fread(fh, &(cleaning_infos[i].remove_low_covg_kmers_thresh),
               sizeof(int32_t), "remove low covg kmer threshold");

      if(version > 6)
      {
        if(cleaning_infos[i].remove_low_covg_supernodes_thresh < 0)
        {
          report_warning("Binary header gives sample %i a cleaning threshold of "
                         "%i for supernodes (should be >= 0)\n",
                         i, cleaning_infos[i].remove_low_covg_supernodes_thresh);
        }
        if(cleaning_infos[i].remove_low_covg_kmers_thresh < 0)
        {
          report_warning("Binary header gives sample %i a cleaning threshold of "
                         "%i for kmers (should be >= 0)\n",
                         i, cleaning_infos[i].remove_low_covg_kmers_thresh);
        }
      }

      // A positive threshold makes no sense if that cleaning was not done
      if(!cleaning_infos[i].remove_low_covg_supernodes &&
         cleaning_infos[i].remove_low_covg_supernodes_thresh > 0)
      {
        report_warning("Binary header gives sample %i a cleaning threshold of "
                       "%i for supernodes when no cleaning was performed\n",
                       i, cleaning_infos[i].remove_low_covg_supernodes_thresh);
      }

      if(!cleaning_infos[i].remove_low_covg_kmers &&
         cleaning_infos[i].remove_low_covg_kmers_thresh > 0)
      {
        report_warning("Binary header gives sample %i a cleaning threshold of "
                       "%i for kmers when no cleaning was performed\n",
                       i, cleaning_infos[i].remove_low_covg_kmers_thresh);
      }

      uint32_t name_length;
      my_fread(fh, &name_length, sizeof(uint32_t), "graph name length");

      if(name_length == 0)
      {
        cleaning_infos[i].name_of_graph_clean_against = NULL;
      }
      else
      {
        // Widen before adding one (see sample names above)
        cleaning_infos[i].name_of_graph_clean_against
          = malloc_or_die((size_t)name_length + 1);

        my_fread(fh, cleaning_infos[i].name_of_graph_clean_against,
                 name_length, "graph name length");

        cleaning_infos[i].name_of_graph_clean_against[name_length] = '\0';
      
        // Check sample length is as long as we were told
        size_t cleaned_name_len
          = strlen(cleaning_infos[i].name_of_graph_clean_against);

        if(cleaned_name_len != name_length)
        {
          // Premature \0 in string
          // cleaned_name_len <= name_length here, so it fits the %u conversion
          report_warning("Sample [%i] cleaned-against-name has length %u but is "
                         "only %u chars long (premature '\\0')\n",
                         i, name_length, (unsigned int)cleaned_name_len);
        }
      }
    }
  }

  // Print colour info

  if(print_info)
  {
    for(i = 0; i < num_of_colours; i++)
    {
      printf("-- Colour %i --\n", i);

      if(version >= 6)
      {
        // Version 6 only output
        // A zero length name is stored as NULL: print it as an empty string
        printf("  sample name: '%s'\n",
               (sample_names[i] == NULL ? "" : sample_names[i]));
      }

      char tmp[32];

      printf("  mean read length: %u\n",
             (unsigned int)mean_read_lens_per_colour[i]);
      printf("  total sequence loaded: %s\n",
             ulong_to_str(total_seq_loaded_per_colour[i], tmp));
      
      if(version >= 6)
      {
        // Version 6 only output
        printf("  sequence error rate: %Lf\n", seq_error_rates[i]);

        printf("  tip clipping: %s\n",
               (cleaning_infos[i].tip_cleaning == 0 ? "no" : "yes"));

        printf("  remove low coverage supernodes: %s [threshold: %i]\n",
               cleaning_infos[i].remove_low_covg_supernodes ? "yes" : "no",
               cleaning_infos[i].remove_low_covg_supernodes_thresh);

        printf("  remove low coverage kmers: %s [threshold: %i]\n",
               cleaning_infos[i].remove_low_covg_kmers ? "yes" : "no",
               cleaning_infos[i].remove_low_covg_kmers_thresh);

        printf("  cleaned against graph: %s [against: '%s']\n",
               cleaning_infos[i].cleaned_against_graph ? "yes" : "no",
               (cleaning_infos[i].name_of_graph_clean_against == NULL
                  ? "" : cleaning_infos[i].name_of_graph_clean_against));
      }
    }

    printf("--\n");
  }

  // Read magic word at the end of header
  my_fread(fh, magic_word, strlen("CORTEX"), "magic word (end)");

  if(strcmp(magic_word, "CORTEX") != 0)
  {
    report_error("magic word doesn't match 'CORTEX' (end): '%s'\n", magic_word);
    exit(EXIT_FAILURE);
  }

  // Calculate number of kmers
  // (versions before 7 do not store it, so derive it from the file size)
  if(version < 7 && file_size != -1)
  {
    size_t bytes_remaining = file_size - num_bytes_read;
    size_t num_bytes_per_kmer = sizeof(uint64_t) * num_of_bitfields +
                                sizeof(uint32_t) * num_of_colours +
                                sizeof(uint8_t) * num_of_colours;

    expected_num_of_kmers = bytes_remaining / num_bytes_per_kmer;

    size_t excess = bytes_remaining - (expected_num_of_kmers * num_bytes_per_kmer);

    if(excess > 0)
    {
      // size_t values are cast so that they match the %lu conversions
      report_error("Excess bytes. Bytes:\n  file size: %lu;\n  for kmers: %lu;"
                   "\n  num kmers: %lu;\n  per kmer: %lu;\n  excess: %lu\n",
                   file_size, (unsigned long)bytes_remaining,
                   expected_num_of_kmers,
                   (unsigned long)num_bytes_per_kmer, (unsigned long)excess);
    }
  }

  if(print_info)
  {
    char num_str[50];
    printf("Expected number of kmers: %s\n",
           ulong_to_str(expected_num_of_kmers, num_str));
    printf("----\n");
  }

  // Finished parsing header
  if(!parse_kmers && !print_kmers)
  {
    print_kmer_stats();
    fclose(fh);
    exit(EXIT_SUCCESS);
  }


  shade_bytes = num_of_shades >> 3;
  size_t shade_array_bytes = shade_bytes * num_of_colours;

  // Kmer data
  // (allocation failure is fatal and exits with EXIT_FAILURE)
  uint64_t* kmer = malloc_or_die(sizeof(uint64_t) * num_of_bitfields);
  uint32_t* covgs = malloc_or_die(sizeof(uint32_t) * num_of_colours);
  uint8_t* edges = malloc_or_die(sizeof(uint8_t) * num_of_colours);
  uint8_t* shade_data = malloc_or_die(shade_array_bytes);
  uint8_t* shend_data = malloc_or_die(shade_array_bytes);

  // Convert values to strings
  // One extra byte for the terminating '\0': seq is printed with "%s" below.
  // The terminator is set here in case binary_kmer_to_seq() does not write it.
  char* seq = malloc_or_die((size_t)kmer_size + 1);
  seq[kmer_size] = '\0';
  char kmer_colour_edge_str[9];

  // Check top word of each kmer
  // Each base takes two bits; bits above bits_in_top_word must be zero.  When
  // the kmer fills the top word completely there are no spare bits to check.
  int bits_in_top_word = 2 * (kmer_size % 32);
  uint64_t top_word_mask = (bits_in_top_word == 0)
                             ? (uint64_t)0
                             : (~(uint64_t)0) << bits_in_top_word;

  size_t num_bytes_per_bkmer = sizeof(uint64_t)*num_of_bitfields;

  // Read kmer in bytes so we can see if there are extra bytes at the end of
  // the file
  size_t bytes_read;

  // while((bytes_read = fread(kmer, 1, num_bytes_per_bkmer, fh)) > 0)
  while((bytes_read = fread_buf(fh, kmer, num_bytes_per_bkmer, buffer)) > 0)
  {
    if(bytes_read != num_bytes_per_bkmer)
    {
      report_error("unusual extra bytes [%i] at the end of the file\n",
                   (int)bytes_read);
      break;
    }
    num_bytes_read += bytes_read;

    my_fread(fh, covgs, sizeof(uint32_t) * num_of_colours, "kmer covg");
    my_fread(fh, edges, sizeof(uint8_t) * num_of_colours, "kmer edges");

    if(version >= 7)
    {
      // Shades and shade ends are interleaved per colour in the file but kept
      // in two separate arrays here
      uint8_t *shades = shade_data, *shends = shend_data;
      for(i = 0; i < num_of_colours; i++)
      {
        my_fread(fh, shades, sizeof(uint8_t) * shade_bytes, "shades");
        my_fread(fh, shends, sizeof(uint8_t) * shade_bytes, "shade ends");
        shades += shade_bytes;
        shends += shade_bytes;
      }
    }

    //
    // Kmer checks
    //

    // Check top bits of kmer
    if(kmer[0] & top_word_mask)
    {
      // Only the first offender is reported in detail
      if(num_of_oversized_kmers == 0)
      {
        report_error("oversized kmer [index: %lu]\n", num_of_kmers_read);

        for(i = 0; i < num_of_bitfields; i++)
        {
          fprintf(stderr, "  word %i: ", i);
          print_binary(stderr, kmer[i]);
          fprintf(stderr, "\n");
        }
      }

      num_of_oversized_kmers++;
    }

    // Check for all-zeros (i.e. all As kmer: AAAAAA)
    uint64_t kmer_words_or = 0;

    for(i = 0; i < num_of_bitfields; i++)
      kmer_words_or |= kmer[i];

    if(kmer_words_or == 0)
    {
      // One all-zero kmer is fine; the error is reported on the second one
      if(num_of_all_zero_kmers == 1)
      {
        report_error("more than one all 'A's kmers seen [index: %lu]\n",
                     num_of_kmers_read);
      }

      num_of_all_zero_kmers++;
    }

    // Check covg is 0 for all colours
    // (the loop stops at the first colour with non-zero coverage)
    for(i = 0; i < num_of_colours && covgs[i] == 0; i++);

    if(i == num_of_colours)
    {
      if(num_of_zero_covg_kmers == 0)
      {
        report_warning("a kmer has zero coverage in all colours [index: %lu]\n",
                       num_of_kmers_read);
      }

      num_of_zero_covg_kmers++;
    }

    // Print?
    if(print_kmers)
    {
      binary_kmer_to_seq(kmer, seq, kmer_size, num_of_bitfields);
      printf("%s", seq);

      // Print coverages
      for(i = 0; i < num_of_colours; i++)
        printf(" %lu", (unsigned long)covgs[i]);

      // Print edges
      for(i = 0; i < num_of_colours; i++)
        printf(" %s", get_edges_str(edges[i], kmer_colour_edge_str));

      if(version >= 7 && num_of_shades > 0)
      {
        for(i = 0; i < num_of_colours; i++)
        {
          putc(' ', stdout);
          print_colour_shades(shade_data + i*shade_bytes, shend_data + i*shade_bytes);
        }
      }

      putc('\n', stdout);
    }

    num_of_kmers_read++;

    for(i = 0; i < num_of_colours; i++)
      sum_of_covgs_read += covgs[i];
  }

  if(num_of_kmers_read != expected_num_of_kmers)
  {
    report_error("Expected %lu kmers, read %lu\n",
                 expected_num_of_kmers, num_of_kmers_read);
  }

  if(print_kmers && print_info)
    printf("----\n");

  // check for various reading errors
  // NOTE(review): errno is not cleared before reading, so a value left over
  // from an earlier call may be reported here -- confirm this is intended
  if(errno != 0)
  {
    report_error("errno set [%i]\n", (int)errno);
  }

  int err;
  if((err = ferror(fh)) != 0)
  {
    report_error("occurred after file reading [%i]\n", err);
  }

  // For testing output
  //num_of_bitfields = 2;
  //num_of_kmers_read = 3600000000;
  //num_of_kmers_read = 12345;
  //num_of_kmers_read = 3581787;
  //num_of_kmers_read = 0;

  print_kmer_stats();

  fclose(fh);

  free(kmer);
  free(covgs);
  free(edges);
  free(shade_data);
  free(shend_data);
  free(seq);
  free(mean_read_lens_per_colour);
  free(total_seq_loaded_per_colour);

  buffer_free(buffer);

  if((print_kmers || parse_kmers) && print_info)
  {
    printf("----\n");
    if(num_warnings > 0 || num_errors > 0)
      printf("Warnings: %u; Errors: %u\n", num_warnings, num_errors);
    if(num_errors == 0)
      printf(num_warnings ? "Binary may be ok\n" : "Binary is valid\n");
  }

  exit(EXIT_SUCCESS);
}