/*
 * Reads everything that remains in the global stream `_f` and returns it as
 * a newly allocated, NUL-terminated string with every '\n' removed.
 *
 * path: must be non-NULL. It is only asserted here; the stream actually read
 *       is the global `_f`, which setup() opens from its own `path` argument.
 *
 * Returns a strdup()'d string; the caller owns it and must free() it.
 */
char* read_file(char *path)
{
  HEX_ASSERT(path);

  Strbuf strbuf = strbuf_create();
  HEX_ASSERT(strbuf);

  /*
   * fgetc() returns an int so that EOF can be told apart from every valid
   * byte. Storing the result in a plain char would either mistake a 0xFF
   * byte for EOF (signed char) or never match EOF at all (unsigned char).
   */
  int c;
  while ((c = fgetc(_f)) != EOF) {
    if (c == '\n') {
      continue;  /* newlines are dropped from the returned text */
    }

    /* strbuf_append() takes a C string, so wrap the single byte in one. */
    char s[2] = { (char)c, '\0' };
    strbuf_append(strbuf, s);
  }

  /* Copy the contents out before the buffer itself is destroyed. */
  char *str = strdup(strbuf_cstr(strbuf));
  HEX_ASSERT(str);

  strbuf_free(&strbuf);

  return str;
}
/*
 * Builds the mingled name of a vtable entry:
 *
 *   mingled name = name + '_' + type + '_' + indent_level
 *
 * where `type` and `indent_level` are rendered as unsigned decimal numbers.
 *
 * entry: must be non-NULL and have a non-NULL name.
 *
 * Returns a strdup()'d string; the caller owns it and must free() it.
 */
char* vtable_mingle_name(VtableEntry entry)
{
  HEX_ASSERT(entry);
  HEX_ASSERT(entry->name);

  Strbuf strbuf = strbuf_create();
  HEX_ASSERT(strbuf);

  /*
   * Sized for any 32-bit unsigned value (10 digits + NUL) with room to
   * spare. Smaller buffers make snprintf() silently truncate multi-digit
   * values, which would let distinct entries share a mingled name.
   */
  char type_str[16];
  char indent_level_str[16];

  snprintf(type_str, sizeof(type_str), "%u", (unsigned int)entry->type);
  snprintf(indent_level_str, sizeof(indent_level_str), "%u", entry->indent_level);

  strbuf_append(strbuf, (const char*)entry->name);
  strbuf_append(strbuf, "_");
  strbuf_append(strbuf, type_str);
  strbuf_append(strbuf, "_");
  strbuf_append(strbuf, indent_level_str);

  /*
   * Hand the caller an independent copy and release the buffer through
   * strbuf_free(), the same way read_file() does, instead of returning a
   * pointer into the Strbuf and freeing only its handle.
   */
  char *mingled_name = strdup(strbuf_cstr(strbuf));
  HEX_ASSERT(mingled_name);

  strbuf_free(&strbuf);

  return mingled_name;
}
/*
 * Resets the module-level state and opens `path` for reading.
 *
 * Globals written: `_root`, `_root_type`, `_res`, `_strbuf` (a fresh Strbuf)
 * and `_f` (the stream opened on `path` in text-read mode).
 *
 * path: must be non-NULL and name a readable file; both the Strbuf creation
 *       and the fopen() result are asserted.
 */
void setup(char *path)
{
  /* fopen(NULL, ...) is undefined; assert up front like read_file() does. */
  HEX_ASSERT(path);

  _root = NULL;
  _root_type = 0;
  _res = 0;

  _strbuf = strbuf_create();
  HEX_ASSERT(_strbuf);

  _f = fopen(path, "r");
  HEX_ASSERT(_f);
}
/*
 * Builds the mingled name of a function-table entry:
 *
 *   mingled name = name { '_' + parameter_name + '-' + type }*
 *
 * with one '_' + parameter_name + '-' + type group per parameter, in list
 * order. `type` is the decimal value of `type_specifier` when that is
 * non-zero, otherwise the parameter's `custom_type` string.
 *
 * entry: must be non-NULL and have a non-NULL name; every parameter in its
 *        list must be non-NULL and have a non-NULL parameter_name.
 *
 * Returns a strdup()'d string; the caller owns it and must free() it.
 */
char* ftable_mingle_name(FtableEntry entry)
{
  HEX_ASSERT(entry);
  HEX_ASSERT(entry->name);

  Strbuf strbuf = strbuf_create();
  HEX_ASSERT(strbuf);

  strbuf_append(strbuf, (const char*)entry->name);

  ParameterList paramlist = (ParameterList)entry->paramlist;
  for (; paramlist; paramlist = paramlist->next) {
    Parameter param = paramlist->parameter;
    HEX_ASSERT(param);
    HEX_ASSERT(param->parameter_name);

    strbuf_append(strbuf, "_");
    strbuf_append(strbuf, (const char*)param->parameter_name);
    strbuf_append(strbuf, "-");

    if (param->type_specifier) {
      /*
       * A 32-bit int needs up to 11 characters ("-2147483648") plus the
       * NUL; a 10-byte buffer would silently truncate large values.
       */
      char buf[16];
      snprintf(buf, sizeof(buf), "%d", param->type_specifier);
      strbuf_append(strbuf, buf);
    } else {
      /* NOTE(review): custom_type is not checked for NULL here — confirm
       * that every parameter without a type_specifier carries one. */
      strbuf_append(strbuf, (const char*)param->custom_type);
    }
  }

  /* Copy the contents out before the buffer itself is destroyed. */
  char *mingled_name = strdup(strbuf_cstr(strbuf));
  HEX_ASSERT(mingled_name);

  /*
   * Release through strbuf_free(), as read_file() does, rather than
   * HEX_FREE() on the handle, so the Strbuf's own cleanup runs.
   */
  strbuf_free(&strbuf);

  return mingled_name;
}
//outputs fastQ unless add_greedy_bases_for_better_bwt_compression==true, in which case is for 1000genomes, and they want fastA inline void error_correct_file_against_graph(char* fastq_file, char quality_cutoff, char ascii_qual_offset, dBGraphEc *db_graph, char* outfile, uint64_t *bases_modified_count_array,//distribution across reads; how many of the read_length bases are fixed uint64_t *posn_modified_count_array,//where in the read are we making corrections? int bases_modified_count_array_size, int min_read_len, HandleLowQualUncorrectable policy, boolean add_greedy_bases_for_better_bwt_compression, int num_greedy_bases, boolean rev_comp_read_if_on_reverse_strand) { int max_read_len = bases_modified_count_array_size - 2; int read_len_upper_bound = max_read_len + num_greedy_bases; int read_len_lower_bound = min_read_len + num_greedy_bases; int read_len_final = 0; //reset the stats arrays, we get stats per input file set_uint64_t_array(bases_modified_count_array,bases_modified_count_array_size, (uint64_t) 0); set_uint64_t_array(posn_modified_count_array, bases_modified_count_array_size, (uint64_t) 0); //set some variables, quality etc quality_cutoff+=ascii_qual_offset; short kmer_size = db_graph->kmer_size; StrBuf* uncorrectedGoodQual_file = strbuf_create(outfile); StrBuf* uncorrectedLowQual_file = strbuf_create(outfile); StrBuf* corrected_file = strbuf_create(outfile); StrBuf* discarded_undefined_file = strbuf_create(outfile); StrBuf* discarded_uncorrectable_file = strbuf_create(outfile); StrBuf* discarded_shortread_file = strbuf_create(outfile); strbuf_append_str(uncorrectedGoodQual_file, ".printuncorrectedgoodqual"); strbuf_append_str(uncorrectedLowQual_file, ".printuncorrectedlowqual"); strbuf_append_str(corrected_file, ".printcorrected"); strbuf_append_str(discarded_undefined_file, ".discardundefinedbase"); strbuf_append_str(discarded_uncorrectable_file, ".discarduncorrectable"); strbuf_append_str(discarded_shortread_file, ".discardshortread"); FILE* 
uncorrectedGoodQual_fp = fopen(uncorrectedGoodQual_file->buff, "w"); FILE* uncorrectedLowQual_fp = fopen(uncorrectedLowQual_file->buff, "w"); FILE* corrected_fp = fopen(corrected_file->buff, "w"); FILE* discarded_undefined_fp = fopen(discarded_undefined_file->buff, "w"); FILE* discarded_uncorrectable_fp = fopen(discarded_uncorrectable_file->buff, "w"); FILE* discarded_shortread_fp = fopen(discarded_shortread_file->buff, "w"); char* suff1 = ".distrib_num_modified_bases"; char* suff2 = ".distrib_posn_modified_bases"; char* suff3 =".read_stats"; char* stat1 = (char*) malloc(sizeof(char)*(strlen(outfile)+strlen(suff1)+1)); char* stat2 = (char*) malloc(sizeof(char)*(strlen(outfile)+strlen(suff2)+1)); char* stat3 = (char*) malloc(sizeof(char)*(strlen(outfile)+strlen(suff3)+1)); if ( (stat1==NULL) || (stat2==NULL) || (stat3==NULL) ) { die("Unable to malloc FILENAME strings. Something badly wrong with your server\n"); } set_string_to_null(stat1, strlen(outfile)+strlen(suff1)+1); set_string_to_null(stat2, strlen(outfile)+strlen(suff2)+1); set_string_to_null(stat3, strlen(outfile)+strlen(suff3)+1); strcpy(stat1, outfile); strcat(stat1, suff1); strcat(stat2, outfile); strcat(stat2, suff2); strcat(stat3, outfile); strcat(stat3, suff3); FILE* out_stat1 = fopen(stat1, "w"); FILE* out_stat2 = fopen(stat2, "w"); FILE* out_stat3 = fopen(stat3, "w"); if ( (out_stat1==NULL)|| (out_stat2==NULL) || (out_stat3==NULL) ) { die("Unable to open %s or %s or %s to write to - permissions issue?\n", stat1, stat2, stat3); } SeqFile *sf = seq_file_open(fastq_file); if(sf == NULL) { // Error opening file fprintf(stderr, "Error: cannot read seq file '%s'\n", fastq_file); exit(EXIT_FAILURE); } char is_fastq = seq_has_quality_scores(sf); if (is_fastq==0) { die("Error correction is only meant to work on FASTQ and this file: %s is not\n", fastq_file); } StrBuf* buf_seq = strbuf_new(); StrBuf* buf_qual = strbuf_new(); StrBuf* working_buf=strbuf_new(); dBNodeEc* last_node_in_read; Orientation 
last_or_in_read; int num_original_reads=0, num_final_reads=0, num_corrected_reads=0, num_discarded_reads=0; int num_discarded_undefined = 0; int num_discarded_uncorrectable = 0; int num_discarded_short_read = 0; int num_print_uncorrected_lowqual = 0; int num_print_uncorrected_goodqual = 0; int num_print_corrected = 0; StrBuf* buf_dashes = strbuf_create("---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"); while(seq_next_read(sf)) { int count_corrected_bases=0; //NOTE - uses modified version fo Isaacs code - new func seq_read_all_bases_and_quals(sf, buf_seq, buf_qual); StrBuf* buf_seq_debug = strbuf_clone(buf_seq); StrBuf* buf_seq_origin = strbuf_clone(buf_seq); StrBuf* buf_seq_fixed = strbuf_clone(buf_dashes); strbuf_resize(buf_seq_fixed, strbuf_len(buf_seq)+1); int read_len = seq_get_length(sf); int num_kmers = read_len-kmer_size+1; int quality_good[read_len]; set_int_array(quality_good, read_len, 1); int first_good=0;//index of first kmer in graph //populate the qual array showing which bases have qual >threshold //if all quals are high, will Print uncorrected //else, if all kmers NOT in graph, will discard or print uncorrected depending on policy //else print corrected. Orientation strand_first_good_kmer; ReadCorrectionDecison dec = get_first_good_kmer_and_populate_qual_array(seq_get_read_name(sf), buf_seq, buf_qual, num_kmers, read_len, quality_good, quality_cutoff, &first_good, &strand_first_good_kmer, db_graph, policy, rev_comp_read_if_on_reverse_strand); //*** start of local functions //if going right, keep going to right hand end. 
if going left, keep going to left hand end boolean condition(WhichEndOfKmer direction, int pos) { if ((direction==Right) && (pos<num_kmers)) { return true; } if ((direction==Left) && (pos>=0)) { return true; } return false; } boolean kmer_is_in_graph(char* kmer, dBGraphEc* db_g) { BinaryKmer curr_kmer; if (seq_to_binary_kmer(kmer, kmer_size, &curr_kmer)==NULL) { //is an N return false; } BinaryKmer temp_key; element_ec_get_key(&curr_kmer, kmer_size, &temp_key); dBNodeEc* node = hash_table_ec_find(&temp_key, db_g); if (node==NULL) { return false; } else { return true; } } int increment(int i, WhichEndOfKmer direction) { if (direction==Right) { return i+1; } else { return i-1; } } char working_str[kmer_size+1]; // start_pos is in kmer units boolean check_bases_to_end_of_read(int start_pos, ReadCorrectionDecison* decision, WhichEndOfKmer direction, int* num_corrected_bases_in_this_read_debug) { boolean any_correction_done=false; if ((start_pos<0) || (start_pos>=num_kmers)) { return any_correction_done; } int pos=start_pos; int offset=0; if (direction==Right) { offset= kmer_size-1; } char local_kmer[kmer_size+1]; local_kmer[kmer_size]='\0'; while ( (*decision==PrintCorrected) && (condition(direction,pos)==true) ) { strncpy(local_kmer, buf_seq->buff+pos, kmer_size); if (quality_good[pos+offset]==1) { //nothing to do } else if (kmer_is_in_graph(local_kmer, db_graph)==true) { //nothing to do - don't correct if kmer is in graph } else//kmer not in graph and quality bad { boolean fixed = fix_end_if_unambiguous(direction, buf_seq, buf_seq_fixed, buf_qual, quality_cutoff, pos, working_buf, working_str, db_graph); if ( (policy==DiscardReadIfLowQualBaseUnCorrectable) && (fixed==false) ) { *decision=DiscardUncorrectable; } else if (fixed==true) { any_correction_done=true; count_corrected_bases++; *num_corrected_bases_in_this_read_debug=*num_corrected_bases_in_this_read_debug+1; if (offset+pos<bases_modified_count_array_size) { posn_modified_count_array[offset+pos]++; } else { 
posn_modified_count_array[bases_modified_count_array_size-1]++; } } } pos = increment(pos, direction); } return any_correction_done; }
void test_get_next_gene_info() { uint16_t kmer_size = 31; int number_of_bits = 10; int bucket_size = 100; int max_retries = 10; dBGraph *db_graph= hash_table_new(number_of_bits, bucket_size, max_retries, kmer_size); int max_gene_len = 1500; uint64_t* kmer_covg_array = calloc(150, sizeof(uint64_t)); uint64_t* readlen_array = calloc(max_gene_len, sizeof(uint64_t)); StrBuf* list = strbuf_create("../data/test/myKrobe/predictor/gene_presence/sample1.fa.list"); unsigned long long num_bases = build_unclean_graph(db_graph, list, true, kmer_size, readlen_array, max_gene_len, kmer_covg_array, 150, false, 0); FILE* fp = fopen("../data/test/myKrobe/predictor/gene_presence/panel1.fasta", "r"); if (fp==NULL) { die("Cannot open this file: ../data/test/myKrobe/predictor/gene_presence/panel1.fasta"); } GeneInfo* gi = alloc_and_init_gene_info(); //---------------------------------- // allocate the memory used to read the sequences //---------------------------------- Sequence * seq = malloc(sizeof(Sequence)); if (seq == NULL){ die("Out of memory trying to allocate Sequence"); } alloc_sequence(seq,max_gene_len,LINE_MAX); //We are going to load all the bases into a single sliding window KmerSlidingWindow* kmer_window = malloc(sizeof(KmerSlidingWindow)); if (kmer_window==NULL) { die("Failed to malloc kmer sliding window"); } kmer_window->kmer = (BinaryKmer*) malloc(sizeof(BinaryKmer)*(max_gene_len-db_graph->kmer_size+1)); if (kmer_window->kmer==NULL) { die("Failed to malloc kmer_window->kmer"); } kmer_window->nkmers=0; // int max_gene_len = 5000; CovgArray* working_ca = alloc_and_init_covg_array(max_gene_len); //end of intialisation //create file readers int file_reader_fasta(FILE * fp, Sequence * seq, int max_read_length, boolean new_entry, boolean * full_entry){ long long ret; int offset = 0; if (new_entry == false){ offset = db_graph->kmer_size; } ret = read_sequence_from_fasta(fp,seq,max_read_length,new_entry,full_entry,offset); return ret; }