//outputs fastQ unless add_greedy_bases_for_better_bwt_compression==true, in which case is for 1000genomes, and they want fastA inline void error_correct_file_against_graph(char* fastq_file, char quality_cutoff, char ascii_qual_offset, dBGraphEc *db_graph, char* outfile, uint64_t *bases_modified_count_array,//distribution across reads; how many of the read_length bases are fixed uint64_t *posn_modified_count_array,//where in the read are we making corrections? int bases_modified_count_array_size, int min_read_len, HandleLowQualUncorrectable policy, boolean add_greedy_bases_for_better_bwt_compression, int num_greedy_bases, boolean rev_comp_read_if_on_reverse_strand) { int max_read_len = bases_modified_count_array_size - 2; int read_len_upper_bound = max_read_len + num_greedy_bases; int read_len_lower_bound = min_read_len + num_greedy_bases; int read_len_final = 0; //reset the stats arrays, we get stats per input file set_uint64_t_array(bases_modified_count_array,bases_modified_count_array_size, (uint64_t) 0); set_uint64_t_array(posn_modified_count_array, bases_modified_count_array_size, (uint64_t) 0); //set some variables, quality etc quality_cutoff+=ascii_qual_offset; short kmer_size = db_graph->kmer_size; StrBuf* uncorrectedGoodQual_file = strbuf_create(outfile); StrBuf* uncorrectedLowQual_file = strbuf_create(outfile); StrBuf* corrected_file = strbuf_create(outfile); StrBuf* discarded_undefined_file = strbuf_create(outfile); StrBuf* discarded_uncorrectable_file = strbuf_create(outfile); StrBuf* discarded_shortread_file = strbuf_create(outfile); strbuf_append_str(uncorrectedGoodQual_file, ".printuncorrectedgoodqual"); strbuf_append_str(uncorrectedLowQual_file, ".printuncorrectedlowqual"); strbuf_append_str(corrected_file, ".printcorrected"); strbuf_append_str(discarded_undefined_file, ".discardundefinedbase"); strbuf_append_str(discarded_uncorrectable_file, ".discarduncorrectable"); strbuf_append_str(discarded_shortread_file, ".discardshortread"); FILE* uncorrectedGoodQual_fp = fopen(uncorrectedGoodQual_file->buff, "w"); FILE* uncorrectedLowQual_fp = fopen(uncorrectedLowQual_file->buff, "w"); FILE* corrected_fp = fopen(corrected_file->buff, "w"); FILE* discarded_undefined_fp = fopen(discarded_undefined_file->buff, "w"); FILE* discarded_uncorrectable_fp = fopen(discarded_uncorrectable_file->buff, "w"); FILE* discarded_shortread_fp = fopen(discarded_shortread_file->buff, "w"); char* suff1 = ".distrib_num_modified_bases"; char* suff2 = ".distrib_posn_modified_bases"; char* suff3 =".read_stats"; char* stat1 = (char*) malloc(sizeof(char)*(strlen(outfile)+strlen(suff1)+1)); char* stat2 = (char*) malloc(sizeof(char)*(strlen(outfile)+strlen(suff2)+1)); char* stat3 = (char*) malloc(sizeof(char)*(strlen(outfile)+strlen(suff3)+1)); if ( (stat1==NULL) || (stat2==NULL) || (stat3==NULL) ) { die("Unable to malloc FILENAME strings. Something badly wrong with your server\n"); } set_string_to_null(stat1, strlen(outfile)+strlen(suff1)+1); set_string_to_null(stat2, strlen(outfile)+strlen(suff2)+1); set_string_to_null(stat3, strlen(outfile)+strlen(suff3)+1); strcpy(stat1, outfile); strcat(stat1, suff1); strcat(stat2, outfile); strcat(stat2, suff2); strcat(stat3, outfile); strcat(stat3, suff3); FILE* out_stat1 = fopen(stat1, "w"); FILE* out_stat2 = fopen(stat2, "w"); FILE* out_stat3 = fopen(stat3, "w"); if ( (out_stat1==NULL)|| (out_stat2==NULL) || (out_stat3==NULL) ) { die("Unable to open %s or %s or %s to write to - permissions issue?\n", stat1, stat2, stat3); } SeqFile *sf = seq_file_open(fastq_file); if(sf == NULL) { // Error opening file fprintf(stderr, "Error: cannot read seq file '%s'\n", fastq_file); exit(EXIT_FAILURE); } char is_fastq = seq_has_quality_scores(sf); if (is_fastq==0) { die("Error correction is only meant to work on FASTQ and this file: %s is not\n", fastq_file); } StrBuf* buf_seq = strbuf_new(); StrBuf* buf_qual = strbuf_new(); StrBuf* working_buf=strbuf_new(); dBNodeEc* last_node_in_read; Orientation last_or_in_read; int num_original_reads=0, num_final_reads=0, num_corrected_reads=0, num_discarded_reads=0; int num_discarded_undefined = 0; int num_discarded_uncorrectable = 0; int num_discarded_short_read = 0; int num_print_uncorrected_lowqual = 0; int num_print_uncorrected_goodqual = 0; int num_print_corrected = 0; StrBuf* buf_dashes = strbuf_create("---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"); while(seq_next_read(sf)) { int count_corrected_bases=0; //NOTE - uses modified version fo Isaacs code - new func seq_read_all_bases_and_quals(sf, buf_seq, buf_qual); StrBuf* buf_seq_debug = strbuf_clone(buf_seq); StrBuf* buf_seq_origin = strbuf_clone(buf_seq); StrBuf* buf_seq_fixed = strbuf_clone(buf_dashes); strbuf_resize(buf_seq_fixed, strbuf_len(buf_seq)+1); int read_len = seq_get_length(sf); int num_kmers = read_len-kmer_size+1; int quality_good[read_len]; set_int_array(quality_good, read_len, 1); int first_good=0;//index of first kmer in graph //populate the qual array showing which bases have qual >threshold //if all quals are high, will Print uncorrected //else, if all kmers NOT in graph, will discard or print uncorrected depending on policy //else print corrected. Orientation strand_first_good_kmer; ReadCorrectionDecison dec = get_first_good_kmer_and_populate_qual_array(seq_get_read_name(sf), buf_seq, buf_qual, num_kmers, read_len, quality_good, quality_cutoff, &first_good, &strand_first_good_kmer, db_graph, policy, rev_comp_read_if_on_reverse_strand); //*** start of local functions //if going right, keep going to right hand end. if going left, keep going to left hand end boolean condition(WhichEndOfKmer direction, int pos) { if ((direction==Right) && (pos<num_kmers)) { return true; } if ((direction==Left) && (pos>=0)) { return true; } return false; } boolean kmer_is_in_graph(char* kmer, dBGraphEc* db_g) { BinaryKmer curr_kmer; if (seq_to_binary_kmer(kmer, kmer_size, &curr_kmer)==NULL) { //is an N return false; } BinaryKmer temp_key; element_ec_get_key(&curr_kmer, kmer_size, &temp_key); dBNodeEc* node = hash_table_ec_find(&temp_key, db_g); if (node==NULL) { return false; } else { return true; } } int increment(int i, WhichEndOfKmer direction) { if (direction==Right) { return i+1; } else { return i-1; } } char working_str[kmer_size+1]; // start_pos is in kmer units boolean check_bases_to_end_of_read(int start_pos, ReadCorrectionDecison* decision, WhichEndOfKmer direction, int* num_corrected_bases_in_this_read_debug) { boolean any_correction_done=false; if ((start_pos<0) || (start_pos>=num_kmers)) { return any_correction_done; } int pos=start_pos; int offset=0; if (direction==Right) { offset= kmer_size-1; } char local_kmer[kmer_size+1]; local_kmer[kmer_size]='\0'; while ( (*decision==PrintCorrected) && (condition(direction,pos)==true) ) { strncpy(local_kmer, buf_seq->buff+pos, kmer_size); if (quality_good[pos+offset]==1) { //nothing to do } else if (kmer_is_in_graph(local_kmer, db_graph)==true) { //nothing to do - don't correct if kmer is in graph } else//kmer not in graph and quality bad { boolean fixed = fix_end_if_unambiguous(direction, buf_seq, buf_seq_fixed, buf_qual, quality_cutoff, pos, working_buf, working_str, db_graph); if ( (policy==DiscardReadIfLowQualBaseUnCorrectable) && (fixed==false) ) { *decision=DiscardUncorrectable; } else if (fixed==true) { any_correction_done=true; count_corrected_bases++; *num_corrected_bases_in_this_read_debug=*num_corrected_bases_in_this_read_debug+1; if (offset+pos<bases_modified_count_array_size) { posn_modified_count_array[offset+pos]++; } else { posn_modified_count_array[bases_modified_count_array_size-1]++; } } } pos = increment(pos, direction); } return any_correction_done; }
int main(int argc, char** argv) { if(argc < 3 || argc > 4) { print_usage(); } char* in_path = argv[1]; char* out_path = argv[2]; unsigned long line_wrap = 0; if(argc == 4) { char *line_wrap_str = argv[3]; char *endptr; line_wrap = strtoul(line_wrap_str, &endptr, 10); if((unsigned)(endptr-line_wrap_str) != strlen(line_wrap_str)) { print_usage(); } } SeqFileType out_file_type = SEQ_UNKNOWN; char out_zipped = 0; seq_guess_filetype_from_path(out_path, &out_file_type, &out_zipped); if(out_file_type == SEQ_UNKNOWN) { fprintf(stderr, "%s:%i: Sorry, I cannot identify the output file's format " "from its path [file: %s]\n", __FILE__, __LINE__, out_path); exit(EXIT_FAILURE); } SeqFile* in_file = seq_file_open(in_path); SeqFile* out_file = seq_file_open_write(out_path, out_file_type, out_zipped, line_wrap); if(in_file == NULL) { fprintf(stderr, "%s:%i: Couldn't open input file: %s\n", __FILE__, __LINE__, in_path); exit(EXIT_FAILURE); } if(out_file == NULL) { fprintf(stderr, "%s:%i: Couldn't open output file: %s\n", __FILE__, __LINE__, out_path); exit(EXIT_FAILURE); } printf(" In : %s [%s]\n", in_path, seq_file_get_type_str(in_file)); printf(" Out: %s [%s]\n", out_path, seq_file_get_type_str(out_file)); // Start converting size_t bytes_written = 0; char c[2] = "."; if(out_file_type == SEQ_PLAIN) { // Example reading in an entire entry at a time using seq_read_all_bases() StrBuf *bases = strbuf_new(); while(seq_next_read(in_file)) { while(seq_read_all_bases(in_file, bases)) { bytes_written += seq_file_write_seq(out_file, bases->buff); } } } else { // Example reading in a char at a time using seq_read_base() while(seq_next_read(in_file)) { const char* read_name = seq_get_read_name(in_file); bytes_written += seq_file_write_name(out_file, read_name); unsigned long seq_length = 0; while(seq_read_base(in_file, c)) { seq_length++; if(!(bytes_written += seq_file_write_seq(out_file, c))) { fprintf(stderr, "%s:%i: Couldn't write base to file " "[file: %s; line: %lu]\n", __FILE__, __LINE__, seq_get_path(out_file), seq_curr_line_number(in_file)); exit(EXIT_FAILURE); } } if(seq_has_quality_scores(out_file)) { size_t bytes_written_before_qual = bytes_written; while(seq_read_qual(in_file, c)) { if(!(bytes_written += seq_file_write_qual(out_file, c))) { fprintf(stderr, "%s:%i: Couldn't write quality score to file " "[file: %s; line: %lu]\n", __FILE__, __LINE__, seq_get_path(out_file), seq_curr_line_number(in_file)); exit(EXIT_FAILURE); } } if(bytes_written == bytes_written_before_qual) { // No quality scores were read - fill in unsigned long i; *c = '?'; for(i = 0; i < seq_length; i++) bytes_written += seq_file_write_qual(out_file, c); } } } } unsigned long seq_total_bases_read = seq_total_bases_passed(in_file); unsigned long total_entries = seq_get_read_index(in_file); seq_file_close(in_file); bytes_written += seq_file_close(out_file); printf("%lu entries read\n", total_entries); printf("%lu bases read\n", seq_total_bases_read); printf("%lu bytes written\n", bytes_written); printf("Done. \n"); return EXIT_SUCCESS; }
// If seq2 is NULL, read pair of entries from first file // Otherwise read an entry from each void align_from_file(const char *path1, const char *path2, void (align)(StrBuf*, StrBuf*, const char*, const char*)) { SeqFile *sf1 = seq_file_open(path1); SeqFile *sf2; if(sf1 == NULL) { fprintf(stderr, "Alignment Error: couldn't open file %s\n", path1); fflush(stderr); return; } if(path2 != NULL) { sf2 = seq_file_open(path2); if(sf2 == NULL) { fprintf(stderr, "Alignment Error: couldn't open file %s\n", path1); fflush(stderr); return; } } else { sf2 = sf1; } StrBuf *entry1_title = strbuf_new(); StrBuf *entry2_title = strbuf_new(); StrBuf *entry1_seq = strbuf_new(); StrBuf *entry2_seq = strbuf_new(); char *title1 = NULL, *title2 = NULL; // Loop while we can read a sequence from the first file while(seq_next_read(sf1)) { seq_read_all_bases(sf1, entry1_seq); if(seq_file_get_type(sf1) != SEQ_PLAIN) { strbuf_set(entry1_title, seq_get_read_name(sf1)); title1 = entry1_title->buff; } if(!seq_next_read(sf2)) { fprintf(stderr, "Alignment Error: Odd number of sequences - " "I read in pairs!\n"); fflush(stderr); break; } seq_read_all_bases(sf2, entry2_seq); if(seq_file_get_type(sf2) != SEQ_PLAIN) { strbuf_set(entry2_title, seq_get_read_name(sf2)); title2 = entry2_title->buff; } (align)(entry1_seq, entry2_seq, title1, title2); } // warn if no bases read if(seq_total_bases_passed(sf1) == 0) { fprintf(stderr, "Alignment Warning: empty input\n"); fflush(stderr); } // Close files seq_file_close(sf1); if(path2 != NULL) seq_file_close(sf2); // Free memory strbuf_free(entry1_title); strbuf_free(entry2_title); strbuf_free(entry1_seq); strbuf_free(entry2_seq); }