Пример #1
0
// Merge the metadata held for one colour of src into the same colour of target.
//
// This is designed to be used when loading multiple binaries into a
// single colour, which may have different metadata. In this case
// we accumulate covg, get the mean read length, and for each type of error
// cleaning we say yes if ANY of them did, taking the last threshold for any
// of them (this is delegated to error_cleaning_assign_with_OR).
//
//   target                  - graph info being updated in place
//   src                     - graph info whose metadata is merged in
//   colour                  - index of the colour to merge, used for both target and src
//   dont_set_pool_cleaning  - passed straight through to error_cleaning_assign_with_OR
//
// Sample id rules:
//   - src id is "undefined"                        -> target id is left alone
//   - both ids are defined and they differ         -> target id becomes "pool"
//   - otherwise (same id, or target is "undefined") -> target id becomes the src id
// Whenever the target id is rewritten, the cached length sample_id_lens[colour]
// is refreshed as well, so it stays consistent with the string (as it is in the
// initialisation functions).
void graph_info_set_all_metadata(GraphInfo* target, GraphInfo* src, int colour, boolean dont_set_pool_cleaning)
{
  char* target_id = target->sample_ids[colour];
  const char* src_id = src->sample_ids[colour];
  const char* new_id;

  // The mean must be updated BEFORE the total is incremented, because the
  // update uses the target's pre-merge total sequence.
  graph_info_update_mean_readlen(target, colour, 
				 target->mean_read_length[colour], 
				 target->total_sequence[colour],
				 src->mean_read_length[colour],
				 src->total_sequence[colour]);
  graph_info_increment_seq(target, colour, src->total_sequence[colour]);

  // The sequencing error rate is simply overwritten by the source's value.
  target->seq_err[colour]=src->seq_err[colour];
  error_cleaning_assign_with_OR(target->cleaning[colour], src->cleaning[colour], dont_set_pool_cleaning);

  if (strcmp(src_id, "undefined")==0)
    {
      // source has only the trivial sample id: nothing to merge
      return;
    }

  if ( (strcmp(src_id, target_id)!=0) //src and target have different sample id
       &&
       (strcmp(target_id, "undefined")!=0) //target has non trivial sample id
       )
    {
      // two different real samples end up in one colour
      new_id = "pool";
    }
  else
    {
      //either it is the same, or target is undefined. Either way, this is ok:
      new_id = src_id;
    }

  set_string_to_null(target_id, MAX_LEN_SAMPLE_NAME);
  strcat(target_id, new_id);
  // keep the cached length in step with the string we have just rewritten
  target->sample_id_lens[colour]=strlen(target_id);
}
Пример #2
0
// Reset the metadata stored for a single colour of ginfo to its defaults:
//   - sample id becomes the string "undefined" (and its cached length is set to match)
//   - 0 is passed to graph_info_set_seq and graph_info_set_mean_readlen
//   - sequencing error rate becomes 0.01
//   - error cleaning info is reset via error_cleaning_initialise_except_pool_cleaning,
//     i.e. the "cleaned against another graph" information is not touched here.
void graph_info_initialise_one_colour_except_pool_cleaning(GraphInfo* ginfo, int colour)
{
  char* sample_id = ginfo->sample_ids[colour];

  // blank the name buffer, then write the placeholder name into it
  set_string_to_null(sample_id, MAX_LEN_SAMPLE_NAME);
  strcat(sample_id, "undefined");
  ginfo->sample_id_lens[colour] = strlen(sample_id);

  graph_info_set_seq(ginfo, colour, 0);
  graph_info_set_mean_readlen(ginfo, colour, 0);
  ginfo->seq_err[colour] = 0.01;

  error_cleaning_initialise_except_pool_cleaning(ginfo->cleaning[colour]);
}
Пример #3
0
// Fully reset an ErrorCleaning record.
// First performs everything error_cleaning_initialise_except_pool_cleaning does,
// then also resets the "cleaned against another graph" information:
// the flag is cleared and the graph name becomes the string "undefined"
// (with the stored name length set to match).
void error_cleaning_initialise(ErrorCleaning* cl)
{
  char* graph_name = cl->name_of_graph_against_which_was_cleaned;

  error_cleaning_initialise_except_pool_cleaning(cl);

  cl->cleaned_against_another_graph = false;

  // blank the filename buffer, then write the placeholder name into it
  set_string_to_null(graph_name, MAX_FILENAME_LENGTH);
  strcat(graph_name, "undefined");
  cl->len_name_of_graph_against_which_was_cleaned = strlen(graph_name);
}
Пример #4
0
// Reset the metadata of every colour (0 .. NUMBER_OF_COLOURS-1) of ginfo:
// sample id "undefined" with matching cached length, 0 passed to
// graph_info_set_seq and graph_info_set_mean_readlen, sequencing error
// rate 0.01, and a full error_cleaning_initialise of the colour's cleaning record.
void graph_info_initialise(GraphInfo* ginfo)
{
  int colour = 0;

  while (colour < NUMBER_OF_COLOURS)
    {
      char* sample_id = ginfo->sample_ids[colour];

      // blank the name buffer, then write the placeholder name into it
      set_string_to_null(sample_id, MAX_LEN_SAMPLE_NAME);
      strcat(sample_id, "undefined");
      ginfo->sample_id_lens[colour] = strlen(sample_id);

      graph_info_set_seq(ginfo, colour, 0);
      graph_info_set_mean_readlen(ginfo, colour, 0);
      ginfo->seq_err[colour] = 0.01;
      error_cleaning_initialise(ginfo->cleaning[colour]);

      colour++;
    }
}
Пример #5
0
//outputs fastQ unless add_greedy_bases_for_better_bwt_compression==true, in which case is for 1000genomes, and they want fastA
inline void error_correct_file_against_graph(char* fastq_file, char quality_cutoff, char ascii_qual_offset,
        dBGraphEc *db_graph, char* outfile,
        uint64_t *bases_modified_count_array,//distribution across reads; how many of the read_length bases are fixed
        uint64_t *posn_modified_count_array,//where in the read are we making corrections?
        int bases_modified_count_array_size,
        int min_read_len,
        HandleLowQualUncorrectable policy,
        boolean add_greedy_bases_for_better_bwt_compression,
        int num_greedy_bases,
        boolean rev_comp_read_if_on_reverse_strand)
{
    int max_read_len = bases_modified_count_array_size - 2;
    int read_len_upper_bound = max_read_len + num_greedy_bases;
    int read_len_lower_bound = min_read_len + num_greedy_bases;
    int read_len_final = 0;

    //reset the stats arrays, we get stats per input file
    set_uint64_t_array(bases_modified_count_array,bases_modified_count_array_size, (uint64_t) 0);
    set_uint64_t_array(posn_modified_count_array, bases_modified_count_array_size, (uint64_t) 0);


    //set some variables, quality etc
    quality_cutoff+=ascii_qual_offset;
    short kmer_size = db_graph->kmer_size;

    StrBuf* uncorrectedGoodQual_file = strbuf_create(outfile);
    StrBuf* uncorrectedLowQual_file = strbuf_create(outfile);
    StrBuf* corrected_file = strbuf_create(outfile);
    StrBuf* discarded_undefined_file = strbuf_create(outfile);
    StrBuf* discarded_uncorrectable_file = strbuf_create(outfile);
    StrBuf* discarded_shortread_file = strbuf_create(outfile);


    strbuf_append_str(uncorrectedGoodQual_file, ".printuncorrectedgoodqual");
    strbuf_append_str(uncorrectedLowQual_file, ".printuncorrectedlowqual");
    strbuf_append_str(corrected_file, ".printcorrected");
    strbuf_append_str(discarded_undefined_file, ".discardundefinedbase");
    strbuf_append_str(discarded_uncorrectable_file, ".discarduncorrectable");
    strbuf_append_str(discarded_shortread_file, ".discardshortread");

    FILE* uncorrectedGoodQual_fp = fopen(uncorrectedGoodQual_file->buff, "w");
    FILE* uncorrectedLowQual_fp = fopen(uncorrectedLowQual_file->buff, "w");
    FILE* corrected_fp = fopen(corrected_file->buff, "w");
    FILE* discarded_undefined_fp = fopen(discarded_undefined_file->buff, "w");
    FILE* discarded_uncorrectable_fp = fopen(discarded_uncorrectable_file->buff, "w");
    FILE* discarded_shortread_fp = fopen(discarded_shortread_file->buff, "w");

    char* suff1 = ".distrib_num_modified_bases";
    char* suff2 = ".distrib_posn_modified_bases";
    char* suff3 =".read_stats";
    char* stat1 = (char*) malloc(sizeof(char)*(strlen(outfile)+strlen(suff1)+1));
    char* stat2 = (char*) malloc(sizeof(char)*(strlen(outfile)+strlen(suff2)+1));
    char* stat3 = (char*) malloc(sizeof(char)*(strlen(outfile)+strlen(suff3)+1));

    if ( (stat1==NULL) || (stat2==NULL) || (stat3==NULL) )
    {
        die("Unable to malloc FILENAME strings. Something badly wrong with your server\n");
    }
    set_string_to_null(stat1, strlen(outfile)+strlen(suff1)+1);
    set_string_to_null(stat2, strlen(outfile)+strlen(suff2)+1);
    set_string_to_null(stat3, strlen(outfile)+strlen(suff3)+1);
    strcpy(stat1, outfile);
    strcat(stat1, suff1);
    strcat(stat2, outfile);
    strcat(stat2, suff2);
    strcat(stat3, outfile);
    strcat(stat3, suff3);

    FILE* out_stat1 = fopen(stat1, "w");
    FILE* out_stat2 = fopen(stat2, "w");
    FILE* out_stat3 = fopen(stat3, "w");
    if ( (out_stat1==NULL)|| (out_stat2==NULL) || (out_stat3==NULL) )
    {
        die("Unable to open %s or %s or %s to write to - permissions issue?\n", stat1, stat2, stat3);
    }

    SeqFile *sf = seq_file_open(fastq_file);
    if(sf == NULL)
    {
        // Error opening file
        fprintf(stderr, "Error: cannot read seq file '%s'\n", fastq_file);
        exit(EXIT_FAILURE);
    }
    char is_fastq = seq_has_quality_scores(sf);
    if (is_fastq==0)
    {
        die("Error correction is only meant to work on FASTQ and this file: %s is not\n", fastq_file);
    }

    StrBuf* buf_seq  = strbuf_new();
    StrBuf* buf_qual = strbuf_new();
    StrBuf* working_buf=strbuf_new();
    dBNodeEc* last_node_in_read;
    Orientation last_or_in_read;
    int num_original_reads=0, num_final_reads=0, num_corrected_reads=0, num_discarded_reads=0;
    int num_discarded_undefined = 0;
    int num_discarded_uncorrectable = 0;
    int num_discarded_short_read = 0;
    int num_print_uncorrected_lowqual = 0;
    int num_print_uncorrected_goodqual = 0;
    int num_print_corrected = 0;

    StrBuf* buf_dashes = strbuf_create("---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------");

    while(seq_next_read(sf))
    {
        int count_corrected_bases=0;
        //NOTE - uses modified version fo Isaacs code - new func
        seq_read_all_bases_and_quals(sf, buf_seq, buf_qual);
        StrBuf* buf_seq_debug  = strbuf_clone(buf_seq);
        StrBuf* buf_seq_origin  = strbuf_clone(buf_seq);
        StrBuf* buf_seq_fixed = strbuf_clone(buf_dashes);
        strbuf_resize(buf_seq_fixed, strbuf_len(buf_seq)+1);

        int read_len = seq_get_length(sf);
        int num_kmers = read_len-kmer_size+1;
        int quality_good[read_len];
        set_int_array(quality_good, read_len, 1);

        int first_good=0;//index of first kmer in graph

        //populate the qual array showing which bases have qual >threshold
        //if all quals are high, will Print uncorrected
        //else, if all kmers NOT in graph, will discard or print uncorrected depending on policy
        //else print corrected.
        Orientation strand_first_good_kmer;
        ReadCorrectionDecison dec =
            get_first_good_kmer_and_populate_qual_array(seq_get_read_name(sf), buf_seq, buf_qual, num_kmers, read_len,
                    quality_good, quality_cutoff,
                    &first_good, &strand_first_good_kmer,
                    db_graph, policy, rev_comp_read_if_on_reverse_strand);


        //*** start of local functions

        //if going right, keep going to right hand end. if going left, keep going to left hand end
        // Loop-continuation test for a walk along the read in kmer units:
        // true while pos is still a valid kmer index for the given direction
        // (below num_kmers when heading Right, at least 0 when heading Left).
        boolean condition(WhichEndOfKmer direction, int pos)
        {
            int more_to_the_right = (direction==Right) && (pos<num_kmers);
            int more_to_the_left  = (direction==Left) && (pos>=0);

            if (more_to_the_right || more_to_the_left)
            {
                return true;
            }
            return false;
        }
        // Returns true if the kmer given as a string is found in db_g's hash table.
        // Returns false if the string cannot be converted to a binary kmer
        // (the conversion fails when the kmer contains an N) or if the lookup finds no node.
        boolean kmer_is_in_graph(char* kmer, dBGraphEc* db_g)
        {
            BinaryKmer parsed_kmer;
            BinaryKmer lookup_key;

            if (seq_to_binary_kmer(kmer, kmer_size, &parsed_kmer)==NULL)
            {
                //is an N
                return false;
            }

            // the hash table is searched by key, not by the raw kmer
            element_ec_get_key(&parsed_kmer, kmer_size, &lookup_key);

            if (hash_table_ec_find(&lookup_key, db_g)!=NULL)
            {
                return true;
            }
            return false;
        }
        // Step a kmer index one place along the read:
        // +1 when heading Right, -1 for any other direction.
        int increment(int i, WhichEndOfKmer direction)
        {
            return (direction==Right) ? (i+1) : (i-1);
        }
        char working_str[kmer_size+1];

        // Walk along the read from kmer index start_pos towards one end of the read,
        // trying to correct low-quality bases whose kmer is not in the graph.
        //
        //   start_pos  - kmer index (NOT base index) at which to start; if it lies
        //                outside [0, num_kmers) nothing is done and false is returned
        //   decision   - in/out: the walk only proceeds while *decision==PrintCorrected;
        //                it is set to DiscardUncorrectable when a base cannot be fixed and
        //                policy is DiscardReadIfLowQualBaseUnCorrectable
        //   direction  - Right walks towards higher kmer indices, otherwise towards lower ones
        //   num_corrected_bases_in_this_read_debug - in/out: incremented once per fixed base
        //
        // Returns true if at least one base was fixed during this walk.
        //
        // This is a nested function: it reads and updates variables of the enclosing
        // function (num_kmers, kmer_size, buf_seq, buf_seq_fixed, buf_qual, quality_good,
        // quality_cutoff, working_buf, working_str, db_graph, policy,
        // count_corrected_bases, posn_modified_count_array, bases_modified_count_array_size).
        boolean check_bases_to_end_of_read(int start_pos, ReadCorrectionDecison* decision,
                                           WhichEndOfKmer direction,
                                           int* num_corrected_bases_in_this_read_debug)

        {
            boolean any_correction_done=false;
            if ((start_pos<0) || (start_pos>=num_kmers))
            {
                return any_correction_done;
            }
            int pos=start_pos;
            // offset turns a kmer index into the index of the base under inspection:
            // the last base of the kmer when walking Right, its first base otherwise.
            int offset=0;
            if (direction==Right)
            {
                offset= kmer_size-1;
            }
            char local_kmer[kmer_size+1];
            // terminator is set once here; strncpy below only ever writes the first kmer_size chars
            local_kmer[kmer_size]='\0';

            while ( (*decision==PrintCorrected) && (condition(direction,pos)==true) )
            {
                // copy the kmer that starts at base pos of the (possibly already corrected) read
                strncpy(local_kmer, buf_seq->buff+pos, kmer_size);

                if (quality_good[pos+offset]==1)
                {
                    //nothing to do - the base under inspection has good quality
                }
                else if (kmer_is_in_graph(local_kmer, db_graph)==true)
                {
                    //nothing to do - don't correct if kmer is in graph
                }
                else//kmer not in graph and quality bad
                {
                    boolean fixed = fix_end_if_unambiguous(direction, buf_seq, buf_seq_fixed, buf_qual, quality_cutoff, pos,
                                                           working_buf, working_str, db_graph);
                    if ( (policy==DiscardReadIfLowQualBaseUnCorrectable)
                            &&
                            (fixed==false) )
                    {
                        // giving up on this read; the loop condition stops the walk on the next test
                        *decision=DiscardUncorrectable;
                    }
                    else if (fixed==true)
                    {
                        any_correction_done=true;
                        count_corrected_bases++;
                        *num_corrected_bases_in_this_read_debug=*num_corrected_bases_in_this_read_debug+1;
                        // record where in the read the correction happened; positions beyond
                        // the end of the stats array are all counted in its last slot
                        if (offset+pos<bases_modified_count_array_size)
                        {
                            posn_modified_count_array[offset+pos]++;
                        }
                        else
                        {
                            posn_modified_count_array[bases_modified_count_array_size-1]++;
                        }
                    }
                    // if the base was not fixed under any other policy, it is left as is and the walk continues
                }
                pos = increment(pos, direction);
            }
            return any_correction_done;
        }