Esempio n. 1
0
//outputs fastQ unless add_greedy_bases_for_better_bwt_compression==true, in which case is for 1000genomes, and they want fastA
inline void error_correct_file_against_graph(char* fastq_file, char quality_cutoff, char ascii_qual_offset,
        dBGraphEc *db_graph, char* outfile,
        uint64_t *bases_modified_count_array,//distribution across reads; how many of the read_length bases are fixed
        uint64_t *posn_modified_count_array,//where in the read are we making corrections?
        int bases_modified_count_array_size,
        int min_read_len,
        HandleLowQualUncorrectable policy,
        boolean add_greedy_bases_for_better_bwt_compression,
        int num_greedy_bases,
        boolean rev_comp_read_if_on_reverse_strand)
{
    int max_read_len = bases_modified_count_array_size - 2;
    int read_len_upper_bound = max_read_len + num_greedy_bases;
    int read_len_lower_bound = min_read_len + num_greedy_bases;
    int read_len_final = 0;

    //reset the stats arrays, we get stats per input file
    set_uint64_t_array(bases_modified_count_array,bases_modified_count_array_size, (uint64_t) 0);
    set_uint64_t_array(posn_modified_count_array, bases_modified_count_array_size, (uint64_t) 0);


    //set some variables, quality etc
    quality_cutoff+=ascii_qual_offset;
    short kmer_size = db_graph->kmer_size;

    StrBuf* uncorrectedGoodQual_file = strbuf_create(outfile);
    StrBuf* uncorrectedLowQual_file = strbuf_create(outfile);
    StrBuf* corrected_file = strbuf_create(outfile);
    StrBuf* discarded_undefined_file = strbuf_create(outfile);
    StrBuf* discarded_uncorrectable_file = strbuf_create(outfile);
    StrBuf* discarded_shortread_file = strbuf_create(outfile);


    strbuf_append_str(uncorrectedGoodQual_file, ".printuncorrectedgoodqual");
    strbuf_append_str(uncorrectedLowQual_file, ".printuncorrectedlowqual");
    strbuf_append_str(corrected_file, ".printcorrected");
    strbuf_append_str(discarded_undefined_file, ".discardundefinedbase");
    strbuf_append_str(discarded_uncorrectable_file, ".discarduncorrectable");
    strbuf_append_str(discarded_shortread_file, ".discardshortread");

    FILE* uncorrectedGoodQual_fp = fopen(uncorrectedGoodQual_file->buff, "w");
    FILE* uncorrectedLowQual_fp = fopen(uncorrectedLowQual_file->buff, "w");
    FILE* corrected_fp = fopen(corrected_file->buff, "w");
    FILE* discarded_undefined_fp = fopen(discarded_undefined_file->buff, "w");
    FILE* discarded_uncorrectable_fp = fopen(discarded_uncorrectable_file->buff, "w");
    FILE* discarded_shortread_fp = fopen(discarded_shortread_file->buff, "w");

    char* suff1 = ".distrib_num_modified_bases";
    char* suff2 = ".distrib_posn_modified_bases";
    char* suff3 =".read_stats";
    char* stat1 = (char*) malloc(sizeof(char)*(strlen(outfile)+strlen(suff1)+1));
    char* stat2 = (char*) malloc(sizeof(char)*(strlen(outfile)+strlen(suff2)+1));
    char* stat3 = (char*) malloc(sizeof(char)*(strlen(outfile)+strlen(suff3)+1));

    if ( (stat1==NULL) || (stat2==NULL) || (stat3==NULL) )
    {
        die("Unable to malloc FILENAME strings. Something badly wrong with your server\n");
    }
    set_string_to_null(stat1, strlen(outfile)+strlen(suff1)+1);
    set_string_to_null(stat2, strlen(outfile)+strlen(suff2)+1);
    set_string_to_null(stat3, strlen(outfile)+strlen(suff3)+1);
    strcpy(stat1, outfile);
    strcat(stat1, suff1);
    strcat(stat2, outfile);
    strcat(stat2, suff2);
    strcat(stat3, outfile);
    strcat(stat3, suff3);

    FILE* out_stat1 = fopen(stat1, "w");
    FILE* out_stat2 = fopen(stat2, "w");
    FILE* out_stat3 = fopen(stat3, "w");
    if ( (out_stat1==NULL)|| (out_stat2==NULL) || (out_stat3==NULL) )
    {
        die("Unable to open %s or %s or %s to write to - permissions issue?\n", stat1, stat2, stat3);
    }

    SeqFile *sf = seq_file_open(fastq_file);
    if(sf == NULL)
    {
        // Error opening file
        fprintf(stderr, "Error: cannot read seq file '%s'\n", fastq_file);
        exit(EXIT_FAILURE);
    }
    char is_fastq = seq_has_quality_scores(sf);
    if (is_fastq==0)
    {
        die("Error correction is only meant to work on FASTQ and this file: %s is not\n", fastq_file);
    }

    StrBuf* buf_seq  = strbuf_new();
    StrBuf* buf_qual = strbuf_new();
    StrBuf* working_buf=strbuf_new();
    dBNodeEc* last_node_in_read;
    Orientation last_or_in_read;
    int num_original_reads=0, num_final_reads=0, num_corrected_reads=0, num_discarded_reads=0;
    int num_discarded_undefined = 0;
    int num_discarded_uncorrectable = 0;
    int num_discarded_short_read = 0;
    int num_print_uncorrected_lowqual = 0;
    int num_print_uncorrected_goodqual = 0;
    int num_print_corrected = 0;

    StrBuf* buf_dashes = strbuf_create("---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------");

    while(seq_next_read(sf))
    {
        int count_corrected_bases=0;
        //NOTE - uses a modified version of Isaac's code - new func
        seq_read_all_bases_and_quals(sf, buf_seq, buf_qual);
        StrBuf* buf_seq_debug  = strbuf_clone(buf_seq);
        StrBuf* buf_seq_origin  = strbuf_clone(buf_seq);
        StrBuf* buf_seq_fixed = strbuf_clone(buf_dashes);
        strbuf_resize(buf_seq_fixed, strbuf_len(buf_seq)+1);

        int read_len = seq_get_length(sf);
        int num_kmers = read_len-kmer_size+1;
        int quality_good[read_len];
        set_int_array(quality_good, read_len, 1);

        int first_good=0;//index of first kmer in graph

        //populate the qual array showing which bases have qual >threshold
        //if all quals are high, will Print uncorrected
        //else, if all kmers NOT in graph, will discard or print uncorrected depending on policy
        //else print corrected.
        Orientation strand_first_good_kmer;
        ReadCorrectionDecison dec =
            get_first_good_kmer_and_populate_qual_array(seq_get_read_name(sf), buf_seq, buf_qual, num_kmers, read_len,
                    quality_good, quality_cutoff,
                    &first_good, &strand_first_good_kmer,
                    db_graph, policy, rev_comp_read_if_on_reverse_strand);


        //*** start of local functions

        //if going right, keep going to right hand end. if going left, keep going to left hand end
        //Loop guard used when walking along the kmers of the current read.
        //Returns true while pos is still a position to visit in the given direction:
        //  - Right: pos must be below num_kmers
        //  - Left : pos must be non-negative
        //Any other combination returns false.
        boolean condition(WhichEndOfKmer direction, int pos)
        {
            int more_to_the_right = (direction==Right) && (pos<num_kmers);
            int more_to_the_left  = (direction==Left)  && (pos>=0);

            return (more_to_the_right || more_to_the_left) ? true : false;
        }
        //Returns true if the kmer string (kmer_size characters, taken from the
        //enclosing scope) has a node in the hash table db_g, false otherwise.
        //A kmer that cannot be converted to a BinaryKmer (the conversion returns
        //NULL, e.g. because it contains an N) is reported as not in the graph.
        boolean kmer_is_in_graph(char* kmer, dBGraphEc* db_g)
        {
            BinaryKmer bkmer;
            BinaryKmer lookup_key;

            if (seq_to_binary_kmer(kmer, kmer_size, &bkmer)==NULL)
            {
                //is an N
                return false;
            }

            //the hash table is searched using the key derived from the kmer
            element_ec_get_key(&bkmer, kmer_size, &lookup_key);

            return (hash_table_ec_find(&lookup_key, db_g)!=NULL) ? true : false;
        }
        //Step a kmer index one place in the direction of travel:
        //i+1 when going Right, i-1 for any other direction.
        int increment(int i, WhichEndOfKmer direction)
        {
            return (direction==Right) ? i+1 : i-1;
        }
        char working_str[kmer_size+1];

        // start_pos is in kmer units
        //Walk kmer by kmer from start_pos (in kmer units) to the end of the read in
        //the given direction, trying to fix each low-quality base whose kmer is not
        //in the graph.
        //
        //  start_pos  - kmer index to start from; if outside [0,num_kmers) nothing is done
        //  decision   - the walk continues only while *decision==PrintCorrected; it is set to
        //               DiscardUncorrectable when a fix fails and the policy is
        //               DiscardReadIfLowQualBaseUnCorrectable
        //  direction  - Right or Left
        //  num_corrected_bases_in_this_read_debug - incremented once per base fixed
        //
        //Also updates count_corrected_bases and posn_modified_count_array from the
        //enclosing scope. Returns true if at least one base was fixed.
        boolean check_bases_to_end_of_read(int start_pos, ReadCorrectionDecison* decision,
                                           WhichEndOfKmer direction,
                                           int* num_corrected_bases_in_this_read_debug)

        {
            boolean corrected_something=false;

            if ((start_pos<0) || (start_pos>=num_kmers))
            {
                return corrected_something;
            }

            //index, relative to the kmer start, of the base whose quality is tested:
            //the last base of the kmer when going Right, the first base otherwise
            const int base_offset = (direction==Right) ? kmer_size-1 : 0;

            char kmer_str[kmer_size+1];
            kmer_str[kmer_size]='\0';

            int kpos;
            for (kpos=start_pos;
                    (*decision==PrintCorrected) && (condition(direction,kpos)==true);
                    kpos=increment(kpos, direction))
            {
                strncpy(kmer_str, buf_seq->buff+kpos, kmer_size);

                //good quality base - leave it alone
                if (quality_good[kpos+base_offset]==1)
                {
                    continue;
                }

                //don't correct if kmer is in graph
                if (kmer_is_in_graph(kmer_str, db_graph)==true)
                {
                    continue;
                }

                //kmer not in graph and quality bad
                boolean fixed = fix_end_if_unambiguous(direction, buf_seq, buf_seq_fixed, buf_qual, quality_cutoff, kpos,
                                                       working_buf, working_str, db_graph);
                if (fixed==true)
                {
                    corrected_something=true;
                    count_corrected_bases++;
                    (*num_corrected_bases_in_this_read_debug)++;

                    //positions beyond the end of the stats array are all counted in its last slot
                    int slot = base_offset+kpos;
                    if (slot>=bases_modified_count_array_size)
                    {
                        slot = bases_modified_count_array_size-1;
                    }
                    posn_modified_count_array[slot]++;
                }
                else if ( (policy==DiscardReadIfLowQualBaseUnCorrectable) && (fixed==false) )
                {
                    *decision=DiscardUncorrectable;
                }
            }
            return corrected_something;
        }
Esempio n. 2
0
// Convert a sequence file to the format implied by the output path.
//
// Arguments: <in_path> <out_path> [line_wrap]
//   in_path   - file to read
//   out_path  - file to write; its format (and whether it is zipped) is guessed
//               from the path by seq_guess_filetype_from_path()
//   line_wrap - optional decimal number passed on to seq_file_open_write()
//
// Prints the entry, base and byte counts on success and returns EXIT_SUCCESS.
// Exits with EXIT_FAILURE if the output format cannot be identified, a file
// cannot be opened, or a base / quality score cannot be written.
int main(int argc, char** argv)
{
  if(argc < 3 || argc > 4)
  {
    // NOTE(review): the code below reads argv[1] and argv[2] unconditionally,
    // so print_usage() is presumably expected not to return - confirm
    print_usage();
  }

  char* in_path = argv[1];
  char* out_path = argv[2];

  unsigned long line_wrap = 0;
  
  if(argc == 4)
  {
    char *line_wrap_str = argv[3];
    char *endptr;
    line_wrap = strtoul(line_wrap_str, &endptr, 10);

    // Reject an empty argument (nothing parsed) as well as trailing junk
    if(endptr == line_wrap_str || *endptr != '\0')
    {
      print_usage();
    }
  }

  SeqFileType out_file_type = SEQ_UNKNOWN;
  char out_zipped = 0;

  seq_guess_filetype_from_path(out_path, &out_file_type, &out_zipped);

  if(out_file_type == SEQ_UNKNOWN)
  {
    fprintf(stderr, "%s:%i: Sorry, I cannot identify the output file's format "
                    "from its path [file: %s]\n", __FILE__, __LINE__, out_path);
    exit(EXIT_FAILURE);
  }

  // Check the input before opening the output, so that a bad input path
  // does not leave behind a freshly created (or overwritten) output file
  SeqFile* in_file = seq_file_open(in_path);

  if(in_file == NULL)
  {
    fprintf(stderr, "%s:%i: Couldn't open input file: %s\n",
            __FILE__, __LINE__, in_path);
    exit(EXIT_FAILURE);
  }

  SeqFile* out_file = seq_file_open_write(out_path, out_file_type,
                                          out_zipped, line_wrap);

  if(out_file == NULL)
  {
    fprintf(stderr, "%s:%i: Couldn't open output file: %s\n",
            __FILE__, __LINE__, out_path);
    exit(EXIT_FAILURE);
  }

  printf(" In : %s [%s]\n", in_path, seq_file_get_type_str(in_file));
  printf(" Out: %s [%s]\n", out_path, seq_file_get_type_str(out_file));

  // Start converting
  size_t bytes_written = 0;
  char c[2] = ".";

  if(out_file_type == SEQ_PLAIN)
  {
    // Example reading in an entire entry at a time using seq_read_all_bases()
    // NOTE(review): `bases` is never freed; the process exits shortly after
    StrBuf *bases = strbuf_new();

    while(seq_next_read(in_file))
    {
      while(seq_read_all_bases(in_file, bases))
      {
        bytes_written += seq_file_write_seq(out_file, bases->buff);
      }
    }
  }
  else
  {
    // Example reading in a char at a time using seq_read_base()
    while(seq_next_read(in_file))
    {
      const char* read_name = seq_get_read_name(in_file);
      bytes_written += seq_file_write_name(out_file, read_name);

      unsigned long seq_length = 0;

      while(seq_read_base(in_file, c))
      {
        seq_length++;

        // Test the result of this write on its own: the running total is
        // already non-zero once the name has been written, so testing the
        // total could never detect a failed write
        size_t base_bytes = seq_file_write_seq(out_file, c);

        if(base_bytes == 0)
        {
          fprintf(stderr, "%s:%i: Couldn't write base to file "
                          "[file: %s; line: %lu]\n",
                  __FILE__, __LINE__,
                  seq_get_path(out_file), seq_curr_line_number(in_file));

          exit(EXIT_FAILURE);
        }

        bytes_written += base_bytes;
      }
    
      if(seq_has_quality_scores(out_file))
      {
        size_t bytes_written_before_qual = bytes_written;

        while(seq_read_qual(in_file, c))
        {
          // As above: check this write, not the cumulative byte count
          size_t qual_bytes = seq_file_write_qual(out_file, c);

          if(qual_bytes == 0)
          {
            fprintf(stderr, "%s:%i: Couldn't write quality score to file "
                            "[file: %s; line: %lu]\n",
                    __FILE__, __LINE__, seq_get_path(out_file),
                    seq_curr_line_number(in_file));

            exit(EXIT_FAILURE);
          }

          bytes_written += qual_bytes;
        }

        if(bytes_written == bytes_written_before_qual)
        {
          // No quality scores were read - fill in with one '?' per base
          unsigned long i;
          *c = '?';
          for(i = 0; i < seq_length; i++)
            bytes_written += seq_file_write_qual(out_file, c);
        }
      }
    }
  }

  // Collect the totals before the input file is closed
  unsigned long seq_total_bases_read = seq_total_bases_passed(in_file);
  unsigned long total_entries = seq_get_read_index(in_file);

  seq_file_close(in_file);
  bytes_written += seq_file_close(out_file);

  printf("%lu entries read\n", total_entries);
  printf("%lu bases read\n", seq_total_bases_read);
  // bytes_written is a size_t, so it needs %zu rather than %lu
  printf("%zu bytes written\n", bytes_written);
  printf("Done. \n");

  return EXIT_SUCCESS;
}
Esempio n. 3
0
// Read sequences from file(s) and hand them to `align` two at a time.
//
// If path2 is NULL, consecutive pairs of entries are read from path1.
// Otherwise one entry is read from each of path1 and path2 per pair.
//
// `align` is called as align(seq1, seq2, title1, title2). A title is the read
// name of the most recent entry from a file whose type is not SEQ_PLAIN; it is
// NULL if no such entry has been read yet.
//
// Errors (a file that cannot be opened, a missing second entry of a pair) and
// an empty-input warning are reported on stderr; nothing is returned.
void align_from_file(const char *path1, const char *path2,
                     void (align)(StrBuf*, StrBuf*, const char*, const char*))
{
  SeqFile *sf1 = seq_file_open(path1);
  SeqFile *sf2;

  if(sf1 == NULL)
  {
    fprintf(stderr, "Alignment Error: couldn't open file %s\n", path1);
    fflush(stderr);
    return;
  }

  if(path2 != NULL)
  {
    sf2 = seq_file_open(path2);

    if(sf2 == NULL)
    {
      // Report the file that actually failed, and release the first file
      // which is already open
      fprintf(stderr, "Alignment Error: couldn't open file %s\n", path2);
      fflush(stderr);
      seq_file_close(sf1);
      return;
    }
  }
  else
  {
    // Single-file mode: both entries of each pair come from the same file
    sf2 = sf1;
  }

  StrBuf *entry1_title = strbuf_new();
  StrBuf *entry2_title = strbuf_new();
  StrBuf *entry1_seq = strbuf_new();
  StrBuf *entry2_seq = strbuf_new();

  char *title1 = NULL, *title2 = NULL;

  // Loop while we can read a sequence from the first file
  while(seq_next_read(sf1))
  {
    seq_read_all_bases(sf1, entry1_seq);

    if(seq_file_get_type(sf1) != SEQ_PLAIN)
    {
      strbuf_set(entry1_title, seq_get_read_name(sf1));
      title1 = entry1_title->buff;
    }

    if(!seq_next_read(sf2))
    {
      fprintf(stderr, "Alignment Error: Odd number of sequences - "
                      "I read in pairs!\n");
      fflush(stderr);
      break;
    }

    seq_read_all_bases(sf2, entry2_seq);

    if(seq_file_get_type(sf2) != SEQ_PLAIN)
    {
      strbuf_set(entry2_title, seq_get_read_name(sf2));
      title2 = entry2_title->buff;
    }

    (align)(entry1_seq, entry2_seq, title1, title2);
  }

  // warn if no bases read
  if(seq_total_bases_passed(sf1) == 0)
  {
    fprintf(stderr, "Alignment Warning: empty input\n");
    fflush(stderr);
  }

  // Close files (sf2 is only a separate handle when path2 was given)
  seq_file_close(sf1);

  if(path2 != NULL)
    seq_file_close(sf2);

  // Free memory
  strbuf_free(entry1_title);
  strbuf_free(entry2_title);
  strbuf_free(entry1_seq);
  strbuf_free(entry2_seq);
}