/*
 * junos_strbuf_vsprintf - append printf-style formatted text to a buffer.
 *
 * strbuf: buffer to append to; a NULL buffer is rejected.
 * fmt:    printf-style format string.
 * ap:     arguments for fmt. Each formatting attempt works on a private
 *         va_copy of ap, so the retry after a resize never reads a
 *         va_list that vsnprintf() has already consumed.
 *
 * Returns the updated strbuf->pos on success, -1 if strbuf is NULL or
 * strbuf_resize() reports failure, or the negative vsnprintf() result if
 * formatting itself fails.
 */
ssize_t junos_strbuf_vsprintf(junos_strbuf_t *strbuf, const char *fmt, va_list ap)
{
	if (! strbuf)
		return -1;

	assert(strbuf->string[strbuf->pos] == '\0');

	for (;;) {
		va_list aq;
		size_t avail;
		int status;

		/* No room at all: grow before formatting. */
		if (strbuf->pos >= strbuf->size)
			if (strbuf_resize(strbuf))
				return -1;

		avail = strbuf->size - strbuf->pos;

		/* ap is indeterminate after vsnprintf(); format from a copy. */
		va_copy(aq, ap);
		status = vsnprintf(strbuf->string + strbuf->pos, avail, fmt, aq);
		va_end(aq);

		if (status < 0)
			return status;

		if ((size_t)status < avail) {
			strbuf->pos += (size_t)status;
			return (ssize_t)strbuf->pos;
		}

		/*
		 * Output was truncated. Discard the partial text so the buffer
		 * still ends at pos, then grow and try again. A failed resize is
		 * reported instead of retrying forever.
		 */
		if (strbuf->pos < strbuf->size)
			strbuf->string[strbuf->pos] = '\0';
		if (strbuf_resize(strbuf))
			return -1;
	}
} /* junos_strbuf_vsprintf */
/* strbuf_append_fmt_retry() can be used when there is no known
 * upper bound for the output string.
 *
 * Formats fmt and its variadic arguments directly into the unused tail of
 * s. If the first attempt does not fit, the buffer is resized to hold the
 * length reported by vsnprintf() and the formatting is repeated once.
 * s->length is advanced by the formatted length on success. */
void strbuf_append_fmt_retry(strbuf_t *s, const char *fmt, ...)
{
    va_list arg;
    int fmt_len, try1;
    int empty_len;

    /* If the first attempt to append fails, resize the buffer appropriately
     * and try again */
    for (try1 = 0; ; try1++) {
        va_start(arg, fmt);
        /* Append the new formatted string */
        /* fmt_len is the length of the string required, excluding the
         * trailing NULL */
        empty_len = strbuf_empty_length(s);
        /* Add 1 since there is also space to store the terminating NULL. */
        fmt_len = vsnprintf(s->buf + s->length, empty_len + 1, fmt, arg);
        va_end(arg);

        if (fmt_len < 0) {
            /* vsnprintf() reports an output error with a negative value.
             * Treating that as success would subtract from s->length. */
            die("Unable to format string: vsnprintf() failed");
            /* NOTE(review): die() presumably does not return; the return
             * below keeps s->length untouched if it ever does. */
            return;
        }

        if (fmt_len <= empty_len)
            break;  /* SUCCESS */
        if (try1 > 0)
            die("BUG: length of formatted string changed");

        strbuf_resize(s, s->length + fmt_len);
    }

    s->length += fmt_len;
}
/*
 * Append one wide character to strbuf and re-terminate the contents.
 *
 * When length has reached capacity, strbuf_resize() is first asked for
 * twice the current capacity. The character is stored at index length,
 * length is incremented, and L'\0' is written at the new length.
 *
 * NOTE(review): the terminator can land at index == capacity, so the
 * storage presumably holds capacity + 1 elements -- confirm against
 * strbuf_resize() and the buffer's constructor.
 */
static void strbuf_append(StringBuffer *strbuf, wchar_t c)
{
    if (strbuf->length == strbuf->capacity)
        strbuf_resize(strbuf, strbuf->capacity * 2);

    /* Store the character, then advance past it. */
    strbuf->chars[strbuf->length++] = c;

    /* Keep the contents terminated after every append. */
    strbuf->chars[strbuf->length] = L'\0';
}
/*
 * Append the characters of the NUL-terminated string str to s, one at a
 * time. The terminating NUL of str is not copied.
 *
 * The free space reported by strbuf_empty_length() is tracked locally;
 * whenever it runs out, strbuf_resize() is asked for s->length + 1 and the
 * free space is queried again.
 */
void strbuf_append_string(strbuf_t *s, const char *str)
{
    const char *src;
    int room = strbuf_empty_length(s);

    for (src = str; *src != '\0'; src++) {
        if (room < 1) {
            /* Out of space: grow, then re-read how much is available. */
            strbuf_resize(s, s->length + 1);
            room = strbuf_empty_length(s);
        }

        s->buf[s->length++] = *src;
        room--;
    }
}
//outputs fastQ unless add_greedy_bases_for_better_bwt_compression==true, in which case is for 1000genomes, and they want fastA inline void error_correct_file_against_graph(char* fastq_file, char quality_cutoff, char ascii_qual_offset, dBGraphEc *db_graph, char* outfile, uint64_t *bases_modified_count_array,//distribution across reads; how many of the read_length bases are fixed uint64_t *posn_modified_count_array,//where in the read are we making corrections? int bases_modified_count_array_size, int min_read_len, HandleLowQualUncorrectable policy, boolean add_greedy_bases_for_better_bwt_compression, int num_greedy_bases, boolean rev_comp_read_if_on_reverse_strand) { int max_read_len = bases_modified_count_array_size - 2; int read_len_upper_bound = max_read_len + num_greedy_bases; int read_len_lower_bound = min_read_len + num_greedy_bases; int read_len_final = 0; //reset the stats arrays, we get stats per input file set_uint64_t_array(bases_modified_count_array,bases_modified_count_array_size, (uint64_t) 0); set_uint64_t_array(posn_modified_count_array, bases_modified_count_array_size, (uint64_t) 0); //set some variables, quality etc quality_cutoff+=ascii_qual_offset; short kmer_size = db_graph->kmer_size; StrBuf* uncorrectedGoodQual_file = strbuf_create(outfile); StrBuf* uncorrectedLowQual_file = strbuf_create(outfile); StrBuf* corrected_file = strbuf_create(outfile); StrBuf* discarded_undefined_file = strbuf_create(outfile); StrBuf* discarded_uncorrectable_file = strbuf_create(outfile); StrBuf* discarded_shortread_file = strbuf_create(outfile); strbuf_append_str(uncorrectedGoodQual_file, ".printuncorrectedgoodqual"); strbuf_append_str(uncorrectedLowQual_file, ".printuncorrectedlowqual"); strbuf_append_str(corrected_file, ".printcorrected"); strbuf_append_str(discarded_undefined_file, ".discardundefinedbase"); strbuf_append_str(discarded_uncorrectable_file, ".discarduncorrectable"); strbuf_append_str(discarded_shortread_file, ".discardshortread"); FILE* 
uncorrectedGoodQual_fp = fopen(uncorrectedGoodQual_file->buff, "w"); FILE* uncorrectedLowQual_fp = fopen(uncorrectedLowQual_file->buff, "w"); FILE* corrected_fp = fopen(corrected_file->buff, "w"); FILE* discarded_undefined_fp = fopen(discarded_undefined_file->buff, "w"); FILE* discarded_uncorrectable_fp = fopen(discarded_uncorrectable_file->buff, "w"); FILE* discarded_shortread_fp = fopen(discarded_shortread_file->buff, "w"); char* suff1 = ".distrib_num_modified_bases"; char* suff2 = ".distrib_posn_modified_bases"; char* suff3 =".read_stats"; char* stat1 = (char*) malloc(sizeof(char)*(strlen(outfile)+strlen(suff1)+1)); char* stat2 = (char*) malloc(sizeof(char)*(strlen(outfile)+strlen(suff2)+1)); char* stat3 = (char*) malloc(sizeof(char)*(strlen(outfile)+strlen(suff3)+1)); if ( (stat1==NULL) || (stat2==NULL) || (stat3==NULL) ) { die("Unable to malloc FILENAME strings. Something badly wrong with your server\n"); } set_string_to_null(stat1, strlen(outfile)+strlen(suff1)+1); set_string_to_null(stat2, strlen(outfile)+strlen(suff2)+1); set_string_to_null(stat3, strlen(outfile)+strlen(suff3)+1); strcpy(stat1, outfile); strcat(stat1, suff1); strcat(stat2, outfile); strcat(stat2, suff2); strcat(stat3, outfile); strcat(stat3, suff3); FILE* out_stat1 = fopen(stat1, "w"); FILE* out_stat2 = fopen(stat2, "w"); FILE* out_stat3 = fopen(stat3, "w"); if ( (out_stat1==NULL)|| (out_stat2==NULL) || (out_stat3==NULL) ) { die("Unable to open %s or %s or %s to write to - permissions issue?\n", stat1, stat2, stat3); } SeqFile *sf = seq_file_open(fastq_file); if(sf == NULL) { // Error opening file fprintf(stderr, "Error: cannot read seq file '%s'\n", fastq_file); exit(EXIT_FAILURE); } char is_fastq = seq_has_quality_scores(sf); if (is_fastq==0) { die("Error correction is only meant to work on FASTQ and this file: %s is not\n", fastq_file); } StrBuf* buf_seq = strbuf_new(); StrBuf* buf_qual = strbuf_new(); StrBuf* working_buf=strbuf_new(); dBNodeEc* last_node_in_read; Orientation 
last_or_in_read; int num_original_reads=0, num_final_reads=0, num_corrected_reads=0, num_discarded_reads=0; int num_discarded_undefined = 0; int num_discarded_uncorrectable = 0; int num_discarded_short_read = 0; int num_print_uncorrected_lowqual = 0; int num_print_uncorrected_goodqual = 0; int num_print_corrected = 0; StrBuf* buf_dashes = strbuf_create("---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"); while(seq_next_read(sf)) { int count_corrected_bases=0; //NOTE - uses modified version fo Isaacs code - new func seq_read_all_bases_and_quals(sf, buf_seq, buf_qual); StrBuf* buf_seq_debug = strbuf_clone(buf_seq); StrBuf* buf_seq_origin = strbuf_clone(buf_seq); StrBuf* buf_seq_fixed = strbuf_clone(buf_dashes); strbuf_resize(buf_seq_fixed, strbuf_len(buf_seq)+1); int read_len = seq_get_length(sf); int num_kmers = read_len-kmer_size+1; int quality_good[read_len]; set_int_array(quality_good, read_len, 1); int first_good=0;//index of first kmer in graph //populate the qual array showing which bases have qual >threshold //if all quals are high, will Print uncorrected //else, if all kmers NOT in graph, will discard or print uncorrected depending on policy //else print corrected. Orientation strand_first_good_kmer; ReadCorrectionDecison dec = get_first_good_kmer_and_populate_qual_array(seq_get_read_name(sf), buf_seq, buf_qual, num_kmers, read_len, quality_good, quality_cutoff, &first_good, &strand_first_good_kmer, db_graph, policy, rev_comp_read_if_on_reverse_strand); //*** start of local functions //if going right, keep going to right hand end. 
if going left, keep going to left hand end boolean condition(WhichEndOfKmer direction, int pos) { if ((direction==Right) && (pos<num_kmers)) { return true; } if ((direction==Left) && (pos>=0)) { return true; } return false; } boolean kmer_is_in_graph(char* kmer, dBGraphEc* db_g) { BinaryKmer curr_kmer; if (seq_to_binary_kmer(kmer, kmer_size, &curr_kmer)==NULL) { //is an N return false; } BinaryKmer temp_key; element_ec_get_key(&curr_kmer, kmer_size, &temp_key); dBNodeEc* node = hash_table_ec_find(&temp_key, db_g); if (node==NULL) { return false; } else { return true; } } int increment(int i, WhichEndOfKmer direction) { if (direction==Right) { return i+1; } else { return i-1; } } char working_str[kmer_size+1]; // start_pos is in kmer units boolean check_bases_to_end_of_read(int start_pos, ReadCorrectionDecison* decision, WhichEndOfKmer direction, int* num_corrected_bases_in_this_read_debug) { boolean any_correction_done=false; if ((start_pos<0) || (start_pos>=num_kmers)) { return any_correction_done; } int pos=start_pos; int offset=0; if (direction==Right) { offset= kmer_size-1; } char local_kmer[kmer_size+1]; local_kmer[kmer_size]='\0'; while ( (*decision==PrintCorrected) && (condition(direction,pos)==true) ) { strncpy(local_kmer, buf_seq->buff+pos, kmer_size); if (quality_good[pos+offset]==1) { //nothing to do } else if (kmer_is_in_graph(local_kmer, db_graph)==true) { //nothing to do - don't correct if kmer is in graph } else//kmer not in graph and quality bad { boolean fixed = fix_end_if_unambiguous(direction, buf_seq, buf_seq_fixed, buf_qual, quality_cutoff, pos, working_buf, working_str, db_graph); if ( (policy==DiscardReadIfLowQualBaseUnCorrectable) && (fixed==false) ) { *decision=DiscardUncorrectable; } else if (fixed==true) { any_correction_done=true; count_corrected_bases++; *num_corrected_bases_in_this_read_debug=*num_corrected_bases_in_this_read_debug+1; if (offset+pos<bases_modified_count_array_size) { posn_modified_count_array[offset+pos]++; } else { 
posn_modified_count_array[bases_modified_count_array_size-1]++; } } } pos = increment(pos, direction); } return any_correction_done; }