// Load a substitution score matrix from `file` into `scoring`.
//
// Layout, as parsed below:
//  * Lines that are empty, all whitespace, or start with '#' are skipped.
//  * The first remaining line is the header. Its first character is the
//    separator; the rest of the line names one column character per field.
//  * Each following row starts with the row ("from") character, followed by
//    one integer score per header column.
// If the separator is a whitespace character, any run of whitespace separates
// fields; otherwise fields are separated by exactly that one character.
//
// file:           gz stream to read lines from
// file_path:      only passed on to _loading_error() for its messages
// scoring:        receives one scoring_add_mutation() call per matrix cell
// case_sensitive: if zero, row/column characters are lower-cased first
//
// Problems are reported through _loading_error(). NOTE(review): this code
// assumes _loading_error() does not return - confirm against its definition.
// `line_num` passed to it is the 0-based index of the line being parsed.
void align_scoring_load_matrix(gzFile file, const char* file_path,
                               scoring_t* scoring, char case_sensitive)
{
  StrBuf* sbuf = strbuf_new(500);
  size_t read_length;
  int line_num = 0;
  char found_header = 0;

  // Read first line (column headings)
  while((read_length = strbuf_reset_gzreadline(sbuf, file)) > 0)
  {
    strbuf_chomp(sbuf);

    if(sbuf->end > 0 && sbuf->b[0] != '#' && // line is not empty, not comment
       !string_is_all_whitespace(sbuf->b))   // and not whitespace
    {
      if(sbuf->end < 2)
      {
        _loading_error("Too few column headings", file_path, line_num, 1);
      }

      found_header = 1;
      break;
    }

    line_num++;
  }

  // Covers both a zero-length file and one holding only comments/blank lines
  if(!found_header)
  {
    _loading_error("Empty file", file_path, -1, 0);
    strbuf_free(sbuf);
    return;
  }

  // If the separator character is whitespace,
  // the set of whitespace characters is used
  char sep = sbuf->b[0];

  if((sep >= '0' && sep <= '9') || sep == '-')
  {
    _loading_error("Numbers (0-9) and dashes (-) do not make good separators",
                   file_path, line_num, 0);
  }

  // The header cannot name more columns than it has characters
  char* characters = malloc(sbuf->end);
  int num_of_chars = 0;

  if(characters == NULL)
  {
    _loading_error("Out of memory", file_path, line_num, 1);
    strbuf_free(sbuf);
    return;
  }

  // <ctype.h> functions need an unsigned char value, hence the casts below
  if(isspace((unsigned char)sep))
  {
    // Every non-whitespace character after the first is a column heading
    char* next = sbuf->b;

    while((next = string_next_nonwhitespace(next+1)) != NULL)
    {
      characters[num_of_chars++]
        = case_sensitive ? *next : (char)tolower((unsigned char)*next);
    }

    // Now read lines below
    while((read_length = strbuf_reset_gzreadline(sbuf, file)) > 0)
    {
      strbuf_chomp(sbuf);
      line_num++; // count every line, including skipped ones

      char* from_char_pos = string_next_nonwhitespace(sbuf->b);

      if(from_char_pos == NULL || sbuf->b[0] == '#')
      {
        // skip this line
        continue;
      }

      char from_char = case_sensitive
                         ? *from_char_pos
                         : (char)tolower((unsigned char)*from_char_pos);

      // Scores start right after the row character, which need not be the
      // first character of the line
      char* score_txt = from_char_pos + 1;
      int i;

      for(i = 0; i < num_of_chars; i++)
      {
        char to_char = characters[i];

        if(!isspace((unsigned char)*score_txt))
        {
          _loading_error("Expected whitespace between elements - found character",
                         file_path, line_num, 1);
        }

        score_txt = string_next_nonwhitespace(score_txt+1);

        if(score_txt == NULL)
        {
          // Row ended before every column got a value
          _loading_error("Missing number value on line", file_path, line_num, 1);
          break;
        }

        char* strtol_last_char_ptr = score_txt;
        int score = (int)strtol(score_txt, &strtol_last_char_ptr, 10);

        // If pointer to end of number string hasn't moved -> error
        if(strtol_last_char_ptr == score_txt)
        {
          _loading_error("Missing number value on line", file_path, line_num, 1);
        }

        scoring_add_mutation(scoring, from_char, to_char, score);
        score_txt = strtol_last_char_ptr;
      }

      if(score_txt != NULL && *score_txt != '\0' &&
         !string_is_all_whitespace(score_txt))
      {
        _loading_error("Too many columns on row", file_path, line_num, 1);
      }
    }
  }
  else
  {
    // Header is a sequence of <sep><char> pairs
    size_t i;

    for(i = 0; i < sbuf->end; i += 2)
    {
      if(sbuf->b[i] != sep)
      {
        _loading_error("Separator missing from line", file_path, line_num, 1);
      }

      char c = case_sensitive ? sbuf->b[i+1]
                              : (char)tolower((unsigned char)sbuf->b[i+1]);

      characters[num_of_chars++] = c;
    }

    // Read rows
    while((read_length = strbuf_reset_gzreadline(sbuf, file)) > 0)
    {
      strbuf_chomp(sbuf);
      line_num++; // count every line, including skipped ones

      char from_char = case_sensitive
                         ? sbuf->b[0]
                         : (char)tolower((unsigned char)sbuf->b[0]);

      if(from_char == '#' || string_is_all_whitespace(sbuf->b))
      {
        // skip this line
        continue;
      }

      // Row is <from_char> followed by <sep><score> pairs, so parsing starts
      // after the row character
      char* str_pos = sbuf->b + 1;
      int to_char_index = 0;

      while(*str_pos != '\0')
      {
        // Check the column bound before indexing characters[]
        if(to_char_index >= num_of_chars)
        {
          _loading_error("Too many columns on row", file_path, line_num, 1);
          break;
        }

        if(*str_pos != sep)
        {
          _loading_error("Separator missing from line", file_path, line_num, 1);
        }

        // Move past separator
        str_pos++;

        char* after_num_str = str_pos;
        int score = (int)strtol(str_pos, &after_num_str, 10);

        // If pointer to end of number string hasn't moved -> error
        if(str_pos == after_num_str)
        {
          _loading_error("Missing number value on line", file_path, line_num, 1);
          break;
        }

        scoring_add_mutation(scoring, from_char, characters[to_char_index],
                             score);
        to_char_index++;

        str_pos = after_num_str;
      }
    }
  }

  free(characters);
  strbuf_free(sbuf);
}
int main(int argc, char **argv) { // compiler complains about unused function without these linese (void)kh_clear_ghash; (void)kh_del_ghash; if(argc < 2) print_usage(usage, NULL); char swap_alleles = 0; int c; while((c = getopt(argc, argv, "s")) >= 0) { switch (c) { case 's': swap_alleles = 1; break; default: die("Unknown option: %c", c); } } if(optind == argc) print_usage(usage, "Not enough arguments"); char *inputpath = argv[optind]; char **refpaths = argv + optind + 1; size_t num_refs = argc - optind - 1; gzFile gzin = gzopen(inputpath, "r"); if(gzin == NULL) die("Cannot read file: %s", inputpath); size_t i, nchroms = 0, capacity = 1024; khash_t(ghash) *genome = kh_init(ghash); read_t *reads = malloc(capacity * sizeof(read_t)), *r; int hret; khiter_t k; for(i = 0; i < num_refs; i++) { fprintf(stderr, "Loading %s\n", refpaths[i]); load_reads(refpaths[i], &reads, &capacity, &nchroms); } if(num_refs == 0) { fprintf(stderr, "Loading from stdin\n"); load_reads("-", &reads, &capacity, &nchroms); } if(nchroms == 0) die("No chromosomes loaded"); for(i = 0; i < nchroms; i++) { r = reads + i; fprintf(stderr, "Loaded: '%s'\n", r->name.b); k = kh_put(ghash, genome, r->name.b, &hret); if(hret == 0) warn("Duplicate read name (taking first): %s", r->name.b); else kh_value(genome, k) = r; } // Now read VCF StrBuf line; strbuf_alloc(&line, 1024); char *fields[9]; char *chr; int pos, reflen, altlen; while(strbuf_reset_gzreadline(&line, gzin) > 0) { if(line.b[0] == '#') fputs(line.b, stdout); else { strbuf_chomp(&line); vcf_columns(line.b, fields); fields[1][-1] = fields[2][-1] = '\0'; chr = line.b; pos = atoi(fields[1])-1; k = kh_get(ghash, genome, chr); r = kh_value(genome, k); fields[1][-1] = fields[2][-1] = '\t'; reflen = fields[4] - fields[3] - 1; altlen = fields[5] - fields[4] - 1; if(k == kh_end(genome)) warn("Cannot find chrom: %s", chr); else if(pos < 0) warn("Bad line: %s\n", line.b); else if((reflen == 1 && altlen == 1) || fields[3][0] == fields[4][0]) { 
if((unsigned)pos + reflen <= r->seq.end && strncasecmp(r->seq.b+pos,fields[3],reflen) == 0) { fputs(line.b, stdout); fputc('\n', stdout); } else if(swap_alleles && (unsigned)pos + altlen <= r->seq.end && strncasecmp(r->seq.b+pos,fields[4],altlen) == 0) { // swap alleles char tmp[altlen], *ref = fields[3], *alt = fields[4]; memcpy(tmp, alt, altlen); memmove(ref+altlen+1, ref, reflen); memcpy(ref, tmp, altlen); ref[altlen] = '\t'; fputs(line.b, stdout); fputc('\n', stdout); } // else printf("FAIL0\n"); } // else printf("FAIL1\n"); } } kh_destroy(ghash, genome); strbuf_dealloc(&line); gzclose(gzin); for(i = 0; i < nchroms; i++) seq_read_dealloc(reads+i); free(reads); fprintf(stderr, " Done.\n"); return 0; }
// Load pairwise substitution scores from `file` into `scoring`.
//
// Lines that are empty, all whitespace, or start with '#' are skipped. Every
// other line holds one pair, in one of two forms:
//  * whitespace separated: <a><whitespace...><b><whitespace><score>
//  * single-char separated: <a><sep><b><sep><score>, same <sep> both times
// where <a> and <b> are single characters and <score> is an integer accepted
// by parse_entire_int().
//
// file:           gz stream to read lines from
// file_path:      only passed on to _loading_error() for its messages
// scoring:        receives one scoring_add_mutation() call per pair
// case_sensitive: if zero, both characters are lower-cased first
//
// Problems, including a file that yields no pairs, are reported through
// _loading_error(). NOTE(review): this code assumes _loading_error() does not
// return - confirm against its definition.
void align_scoring_load_pairwise(gzFile file, const char* file_path,
                                 scoring_t* scoring, char case_sensitive)
{
  StrBuf* sbuf = strbuf_new(200);
  size_t read_length;
  int line_num = 0;

  char a, b;
  int score;

  int num_pairs_added = 0;

  while((read_length = strbuf_reset_gzreadline(sbuf, file)) > 0)
  {
    strbuf_chomp(sbuf);

    if(sbuf->end > 0 && sbuf->b[0] != '#' && // line is not empty, not comment
       !string_is_all_whitespace(sbuf->b))   // and not whitespace
    {
      // Shortest valid line is 5 characters, e.g. "a b 1". Test the chomped
      // length (not the raw read length, which still counts the line
      // terminator) so the fixed offsets used below stay inside the string.
      if(sbuf->end < 5)
      {
        _loading_error("Too few column headings", file_path, line_num, 0);
      }

      // <ctype.h> functions need an unsigned char value, hence the casts
      if(isspace((unsigned char)sbuf->b[1]))
      {
        // split by whitespace
        a = sbuf->b[0];

        // Skip the whitespace run to find the second character
        size_t char2_pos;

        for(char2_pos = 1;
            sbuf->b[char2_pos] != '\0' &&
            isspace((unsigned char)sbuf->b[char2_pos]);
            char2_pos++);

        if(char2_pos+2 >= sbuf->end ||
           !isspace((unsigned char)sbuf->b[char2_pos+1]))
        {
          _loading_error("Line too short", file_path, line_num, 0);
        }

        b = sbuf->b[char2_pos];

        if(!parse_entire_int(sbuf->b+char2_pos+2, &score))
        {
          _loading_error("Invalid number", file_path, line_num, 0);
        }
      }
      else
      {
        // Single-character separator: it must be the same at both positions
        if(sbuf->b[1] != sbuf->b[3])
        {
          _loading_error("Inconsistent separators used", file_path, line_num, 0);
        }

        a = sbuf->b[0];
        b = sbuf->b[2];

        if(!parse_entire_int(sbuf->b + 4, &score))
        {
          _loading_error("Invalid number", file_path, line_num, 0);
        }
      }

      if(!case_sensitive)
      {
        a = (char)tolower((unsigned char)a);
        b = (char)tolower((unsigned char)b);
      }

      scoring_add_mutation(scoring, a, b, score);
      num_pairs_added++;
    }

    line_num++;
  }

  strbuf_free(sbuf);

  if(num_pairs_added == 0)
  {
    _loading_error("No pairs added from file (file empty?)",
                   file_path, line_num, 0);
  }
}