/*
 * Setter for the "map" integer array of the wrapped struct (self->x->map).
 *
 * self    - wrapper object; self->x->map and self->x->M are read here.
 * value   - Python object carrying the new contents; forwarded to
 *           set_int_array unchanged.
 * closure - unused.
 *
 * Returns -1 when is_null() flags the map pointer, otherwise whatever
 * set_int_array() returns.
 * NOTE(review): is_null presumably also sets a Python exception, and
 * set_int_array presumably validates value against the 1-D length M --
 * neither helper is visible here; confirm.
 */
static int
PyTabprm_set_map(
    PyTabprm* self,
    PyObject* value,
    /*@unused@*/ void* closure) {

  npy_intp map_len = 0;

  /* Nothing to write into if the underlying array is absent. */
  if (is_null(self->x->map)) {
    return -1;
  }

  /* The array is one-dimensional with M entries. */
  map_len = (Py_ssize_t)self->x->M;

  /* The change is flagged before the copy is attempted. */
  note_change(self);

  return set_int_array("map", value, 1, &map_len, self->x->map);
}
Exemple #2
0
//Outputs FASTQ unless add_greedy_bases_for_better_bwt_compression==true; that mode is for 1000 Genomes, who want FASTA.
inline void error_correct_file_against_graph(char* fastq_file, char quality_cutoff, char ascii_qual_offset,
        dBGraphEc *db_graph, char* outfile,
        uint64_t *bases_modified_count_array,//distribution across reads; how many of the read_length bases are fixed
        uint64_t *posn_modified_count_array,//where in the read are we making corrections?
        int bases_modified_count_array_size,
        int min_read_len,
        HandleLowQualUncorrectable policy,
        boolean add_greedy_bases_for_better_bwt_compression,
        int num_greedy_bases,
        boolean rev_comp_read_if_on_reverse_strand)
{
    int max_read_len = bases_modified_count_array_size - 2;
    int read_len_upper_bound = max_read_len + num_greedy_bases;
    int read_len_lower_bound = min_read_len + num_greedy_bases;
    int read_len_final = 0;

    //reset the stats arrays, we get stats per input file
    set_uint64_t_array(bases_modified_count_array,bases_modified_count_array_size, (uint64_t) 0);
    set_uint64_t_array(posn_modified_count_array, bases_modified_count_array_size, (uint64_t) 0);


    //set some variables, quality etc
    quality_cutoff+=ascii_qual_offset;
    short kmer_size = db_graph->kmer_size;

    StrBuf* uncorrectedGoodQual_file = strbuf_create(outfile);
    StrBuf* uncorrectedLowQual_file = strbuf_create(outfile);
    StrBuf* corrected_file = strbuf_create(outfile);
    StrBuf* discarded_undefined_file = strbuf_create(outfile);
    StrBuf* discarded_uncorrectable_file = strbuf_create(outfile);
    StrBuf* discarded_shortread_file = strbuf_create(outfile);


    strbuf_append_str(uncorrectedGoodQual_file, ".printuncorrectedgoodqual");
    strbuf_append_str(uncorrectedLowQual_file, ".printuncorrectedlowqual");
    strbuf_append_str(corrected_file, ".printcorrected");
    strbuf_append_str(discarded_undefined_file, ".discardundefinedbase");
    strbuf_append_str(discarded_uncorrectable_file, ".discarduncorrectable");
    strbuf_append_str(discarded_shortread_file, ".discardshortread");

    FILE* uncorrectedGoodQual_fp = fopen(uncorrectedGoodQual_file->buff, "w");
    FILE* uncorrectedLowQual_fp = fopen(uncorrectedLowQual_file->buff, "w");
    FILE* corrected_fp = fopen(corrected_file->buff, "w");
    FILE* discarded_undefined_fp = fopen(discarded_undefined_file->buff, "w");
    FILE* discarded_uncorrectable_fp = fopen(discarded_uncorrectable_file->buff, "w");
    FILE* discarded_shortread_fp = fopen(discarded_shortread_file->buff, "w");

    char* suff1 = ".distrib_num_modified_bases";
    char* suff2 = ".distrib_posn_modified_bases";
    char* suff3 =".read_stats";
    char* stat1 = (char*) malloc(sizeof(char)*(strlen(outfile)+strlen(suff1)+1));
    char* stat2 = (char*) malloc(sizeof(char)*(strlen(outfile)+strlen(suff2)+1));
    char* stat3 = (char*) malloc(sizeof(char)*(strlen(outfile)+strlen(suff3)+1));

    if ( (stat1==NULL) || (stat2==NULL) || (stat3==NULL) )
    {
        die("Unable to malloc FILENAME strings. Something badly wrong with your server\n");
    }
    set_string_to_null(stat1, strlen(outfile)+strlen(suff1)+1);
    set_string_to_null(stat2, strlen(outfile)+strlen(suff2)+1);
    set_string_to_null(stat3, strlen(outfile)+strlen(suff3)+1);
    strcpy(stat1, outfile);
    strcat(stat1, suff1);
    strcat(stat2, outfile);
    strcat(stat2, suff2);
    strcat(stat3, outfile);
    strcat(stat3, suff3);

    FILE* out_stat1 = fopen(stat1, "w");
    FILE* out_stat2 = fopen(stat2, "w");
    FILE* out_stat3 = fopen(stat3, "w");
    if ( (out_stat1==NULL)|| (out_stat2==NULL) || (out_stat3==NULL) )
    {
        die("Unable to open %s or %s or %s to write to - permissions issue?\n", stat1, stat2, stat3);
    }

    SeqFile *sf = seq_file_open(fastq_file);
    if(sf == NULL)
    {
        // Error opening file
        fprintf(stderr, "Error: cannot read seq file '%s'\n", fastq_file);
        exit(EXIT_FAILURE);
    }
    char is_fastq = seq_has_quality_scores(sf);
    if (is_fastq==0)
    {
        die("Error correction is only meant to work on FASTQ and this file: %s is not\n", fastq_file);
    }

    StrBuf* buf_seq  = strbuf_new();
    StrBuf* buf_qual = strbuf_new();
    StrBuf* working_buf=strbuf_new();
    dBNodeEc* last_node_in_read;
    Orientation last_or_in_read;
    int num_original_reads=0, num_final_reads=0, num_corrected_reads=0, num_discarded_reads=0;
    int num_discarded_undefined = 0;
    int num_discarded_uncorrectable = 0;
    int num_discarded_short_read = 0;
    int num_print_uncorrected_lowqual = 0;
    int num_print_uncorrected_goodqual = 0;
    int num_print_corrected = 0;

    StrBuf* buf_dashes = strbuf_create("---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------");

    while(seq_next_read(sf))
    {
        int count_corrected_bases=0;
        //NOTE - uses a modified version of Isaac's code - new func
        seq_read_all_bases_and_quals(sf, buf_seq, buf_qual);
        StrBuf* buf_seq_debug  = strbuf_clone(buf_seq);
        StrBuf* buf_seq_origin  = strbuf_clone(buf_seq);
        StrBuf* buf_seq_fixed = strbuf_clone(buf_dashes);
        strbuf_resize(buf_seq_fixed, strbuf_len(buf_seq)+1);

        int read_len = seq_get_length(sf);
        int num_kmers = read_len-kmer_size+1;
        int quality_good[read_len];
        set_int_array(quality_good, read_len, 1);

        int first_good=0;//index of first kmer in graph

        //populate the qual array showing which bases have qual >threshold
        //if all quals are high, will Print uncorrected
        //else, if all kmers NOT in graph, will discard or print uncorrected depending on policy
        //else print corrected.
        Orientation strand_first_good_kmer;
        ReadCorrectionDecison dec =
            get_first_good_kmer_and_populate_qual_array(seq_get_read_name(sf), buf_seq, buf_qual, num_kmers, read_len,
                    quality_good, quality_cutoff,
                    &first_good, &strand_first_good_kmer,
                    db_graph, policy, rev_comp_read_if_on_reverse_strand);


        //*** start of local functions

        //Loop guard for walking along the read in kmer units.
        //Going Right: carry on while pos is still below num_kmers (right-hand end not yet passed).
        //Going Left: carry on while pos has not dropped below 0 (left-hand end not yet passed).
        //Any other combination yields false.
        boolean condition(WhichEndOfKmer direction, int pos)
        {
            return ( ((direction==Right) && (pos<num_kmers)) ||
                     ((direction==Left)  && (pos>=0)) ) ? true : false;
        }
        //Reports whether the kmer_size-character string kmer has a node in db_g.
        //Returns false if seq_to_binary_kmer returns NULL (per the original author: the kmer
        //contains an N) or if the hash table lookup finds no node; returns true otherwise.
        boolean kmer_is_in_graph(char* kmer, dBGraphEc* db_g)
        {
            BinaryKmer bkmer;
            if (seq_to_binary_kmer(kmer, kmer_size, &bkmer)==NULL)
            {
                //could not be converted - is an N
                return false;
            }

            //derive the lookup key from the binary kmer, then probe the hash table
            //NOTE(review): presumably the key is the canonical form of the kmer - confirm in element_ec_get_key
            BinaryKmer lookup_key;
            element_ec_get_key(&bkmer, kmer_size, &lookup_key);

            return (hash_table_ec_find(&lookup_key, db_g)==NULL) ? false : true;
        }
        //Steps a kmer index one place in the walking direction:
        //+1 when direction is Right, -1 for anything else.
        int increment(int i, WhichEndOfKmer direction)
        {
            return (direction==Right) ? i+1 : i-1;
        }
        char working_str[kmer_size+1];

        // Walks kmer by kmer from start_pos (in kmer units) towards the end of the read in the
        // given direction, attempting a fix wherever the base under scrutiny has low quality
        // AND its kmer is absent from the graph.
        //
        //  start_pos - kmer index to start from; if outside [0, num_kmers) nothing is done
        //  decision  - in/out; the walk runs only while *decision==PrintCorrected, and it is set to
        //              DiscardUncorrectable when a fix fails under policy
        //              DiscardReadIfLowQualBaseUnCorrectable
        //  direction - Right or Left
        //  num_corrected_bases_in_this_read_debug - incremented once per successful fix
        //
        // Also updates, from the enclosing scope, count_corrected_bases and
        // posn_modified_count_array (positions past the end of the array are pooled in its last bin).
        // Returns true if at least one fix was made.
        boolean check_bases_to_end_of_read(int start_pos, ReadCorrectionDecison* decision,
                                           WhichEndOfKmer direction,
                                           int* num_corrected_bases_in_this_read_debug)

        {
            boolean corrected_something=false;
            if ((start_pos<0) || (start_pos>=num_kmers))
            {
                return corrected_something;
            }

            //the base being judged is the last base of the kmer going Right, the first base otherwise
            int base_offset = (direction==Right) ? kmer_size-1 : 0;

            char kmer_str[kmer_size+1];
            kmer_str[kmer_size]='\0';//strncpy below never writes the terminator, so set it once here

            int kmer_pos;
            for (kmer_pos=start_pos;
                 (*decision==PrintCorrected) && (condition(direction,kmer_pos)==true);
                 kmer_pos=increment(kmer_pos, direction))
            {
                strncpy(kmer_str, buf_seq->buff+kmer_pos, kmer_size);
                int base_index = kmer_pos+base_offset;

                if (quality_good[base_index]==1)
                {
                    continue;//good quality - nothing to do
                }
                if (kmer_is_in_graph(kmer_str, db_graph)==true)
                {
                    continue;//don't correct if kmer is in graph
                }

                //kmer not in graph and quality bad
                boolean fixed = fix_end_if_unambiguous(direction, buf_seq, buf_seq_fixed, buf_qual, quality_cutoff, kmer_pos,
                                                       working_buf, working_str, db_graph);
                if (fixed==true)
                {
                    corrected_something=true;
                    count_corrected_bases++;
                    (*num_corrected_bases_in_this_read_debug)++;

                    //record where in the read the fix happened, clamping to the final bin
                    int bin = (base_index<bases_modified_count_array_size)
                              ? base_index
                              : bases_modified_count_array_size-1;
                    posn_modified_count_array[bin]++;
                }
                else if ( (fixed==false)
                          &&
                          (policy==DiscardReadIfLowQualBaseUnCorrectable) )
                {
                    //this ends the walk: the loop condition requires PrintCorrected
                    *decision=DiscardUncorrectable;
                }
            }
            return corrected_something;
        }