/**
 * Load pairwise substitution scores from a text file read through zlib.
 *
 * Empty lines, lines starting with '#', and all-whitespace lines are skipped.
 * Every other line must hold two single characters and an integer score in
 * one of two layouts:
 *   - whitespace separated: first character, a run of whitespace, second
 *     character, one whitespace character, then the score ("a  b 3");
 *   - separator character:  "a,b,3" -- the characters at offsets 1 and 3
 *     must be identical, the score starts at offset 4.
 *
 * Each parsed triple is handed to scoring_add_mutation().  Malformed input is
 * reported through _loading_error() together with file_path and the number of
 * lines read before the offending one.
 * NOTE(review): the code assumes _loading_error() does not return -- confirm.
 *
 * @param file            handle to read lines from (not closed here)
 * @param file_path       path, used for error reporting only
 * @param scoring         scoring table that receives the pairs
 * @param case_sensitive  when zero, both characters are lower-cased before
 *                        being added
 */
void align_scoring_load_pairwise(gzFile file, const char* file_path,
                                 scoring_t* scoring, char case_sensitive)
{
  StrBuf* sbuf = strbuf_new(200);
  size_t read_length;
  int line_num = 0;

  char a, b;
  int score;

  int num_pairs_added = 0;

  while((read_length = strbuf_reset_gzreadline(sbuf, file)) > 0)
  {
    strbuf_chomp(sbuf);

    if(sbuf->end > 0 && sbuf->b[0] != '#' && // line is not empty, not comment
       !string_is_all_whitespace(sbuf->b))   // and not whitespace
    {
      // Shortest valid line is 5 characters, e.g. "a b 1".  Test the chomped
      // length (not the raw read length, which still counts the line
      // terminator) so the fixed-offset reads below stay inside the line.
      if(sbuf->end < 5)
      {
        _loading_error("Too few column headings", file_path, line_num, 0);
      }

      // <ctype.h> functions need a value representable as unsigned char,
      // hence the casts (plain char may be negative for non-ASCII bytes).
      if(isspace((unsigned char)sbuf->b[1]))
      {
        // split by whitespace
        a = sbuf->b[0];

        // Skip the run of whitespace to find the second character
        size_t char2_pos;

        for(char2_pos = 1;
            sbuf->b[char2_pos] != '\0' &&
            isspace((unsigned char)sbuf->b[char2_pos]);
            char2_pos++);

        // Need: second char, one whitespace char, at least one score char
        if(char2_pos+2 >= sbuf->end ||
           !isspace((unsigned char)sbuf->b[char2_pos+1]))
        {
          _loading_error("Line too short", file_path, line_num, 0);
        }

        b = sbuf->b[char2_pos];

        if(!parse_entire_int(sbuf->b+char2_pos+2, &score))
        {
          _loading_error("Invalid number", file_path, line_num, 0);
        }
      }
      else
      {
        // split by a single separator character, e.g. "a,b,3"
        if(sbuf->b[1] != sbuf->b[3])
        {
          _loading_error("Inconsistent separators used", file_path, line_num, 0);
        }

        a = sbuf->b[0];
        b = sbuf->b[2];

        if(!parse_entire_int(sbuf->b + 4, &score))
        {
          _loading_error("Invalid number", file_path, line_num, 0);
        }
      }

      if(!case_sensitive)
      {
        a = (char)tolower((unsigned char)a);
        b = (char)tolower((unsigned char)b);
      }

      scoring_add_mutation(scoring, a, b, score);
      num_pairs_added++;
    }

    line_num++;
  }

  strbuf_free(sbuf);

  if(num_pairs_added == 0)
  {
    _loading_error("No pairs added from file (file empty?)",
                   file_path, line_num, 0);
  }
}
/**
 * Parse the command line of the alignment tools into a new cmdline_t.
 *
 * Arguments are scanned twice.  The first pass only handles --help,
 * --case_sensitive and --scoring, because the case-sensitivity flag and the
 * base scoring system must be known before substitution tables are loaded.
 * The second pass handles every other option, updating `scoring` and the
 * returned structure.  If the first non-option argument is reached, exactly
 * two arguments must remain; they are stored as seq1 / seq2 (pointers into
 * argv, not copies).
 *
 * Invalid input is reported through usage().
 * NOTE(review): this function relies on usage() not returning -- confirm.
 *
 * @param argc      argument count as passed to main()
 * @param argv      argument vector as passed to main()
 * @param scoring   scoring system that is modified according to the options
 * @param cmd_type  which aligner is running; some options are only accepted
 *                  for SEQ_ALIGN_NW_CMD or SEQ_ALIGN_SW_CMD
 * @return newly allocated cmdline_t (calloc'd; its file_paths1/file_paths2
 *         arrays are malloc'd)
 */
cmdline_t* cmdline_new(int argc, char **argv, scoring_t *scoring,
                       enum SeqAlignCmdType cmd_type)
{
  // calloc => all values initially 0 / NULL
  cmdline_t* cmd = calloc(1, sizeof(cmdline_t));
  if(cmd == NULL) usage("Out of memory");

  cmd->file_list_length = 0;
  cmd->file_list_capacity = 256;
  cmd->file_paths1 = malloc(sizeof(char*) * cmd->file_list_capacity);
  cmd->file_paths2 = malloc(sizeof(char*) * cmd->file_list_capacity);
  if(cmd->file_paths1 == NULL || cmd->file_paths2 == NULL)
    usage("Out of memory");

  cmd->seq1 = cmd->seq2 = NULL;

  if(argc == 1) usage(NULL);

  // First run through arguments to set up case_sensitive and scoring system
  // case sensitive needs to be dealt with first
  // (it is used to construct hash table for swap_scores)
  char scoring_set = 0, substitutions_set = 0, match_set = 0, mismatch_set = 0;

  int argi;
  for(argi = 1; argi < argc; argi++)
  {
    if(strcasecmp(argv[argi], "--help") == 0 ||
       strcasecmp(argv[argi], "-help") == 0 ||
       strcasecmp(argv[argi], "-h") == 0)
    {
      usage(NULL);
    }
    else if(strcasecmp(argv[argi], "--case_sensitive") == 0)
    {
      cmd->case_sensitive = 1;
    }
    else if(strcasecmp(argv[argi], "--scoring") == 0)
    {
      if(scoring_set)
      {
        usage("More than one scoring system specified - not permitted");
      }

      // argv[argc] is NULL: make sure a value follows before comparing it
      if(argi == argc-1)
      {
        usage("--scoring requires an argument");
      }

      if(strcasecmp(argv[argi+1], "PAM30") == 0)
      {
        scoring_system_PAM30(scoring);
      }
      else if(strcasecmp(argv[argi+1], "PAM70") == 0)
      {
        scoring_system_PAM70(scoring);
      }
      else if(strcasecmp(argv[argi+1], "BLOSUM80") == 0)
      {
        scoring_system_BLOSUM80(scoring);
      }
      else if(strcasecmp(argv[argi+1], "BLOSUM62") == 0)
      {
        scoring_system_BLOSUM62(scoring);
      }
      else if(strcasecmp(argv[argi+1], "DNA_HYBRIDIZATION") == 0)
      {
        scoring_system_DNA_hybridization(scoring);
      }
      else
      {
        usage("Unknown --scoring choice, not one of "
              "PAM30|PAM70|BLOSUM80|BLOSUM62|DNA_HYBRIDIZATION");
      }

      scoring_set = 1;
      argi++; // took an argument
    }
  }

  // Second pass: all remaining options
  for(argi = 1; argi < argc; argi++)
  {
    if(argv[argi][0] == '-')
    {
      // strcasecmp does case insensitive comparison
      if(strcasecmp(argv[argi], "--freestartgap") == 0)
      {
        if(cmd_type != SEQ_ALIGN_NW_CMD)
          usage("--freestartgap only valid with Needleman-Wunsch");
        scoring->no_start_gap_penalty = true;
      }
      else if(strcasecmp(argv[argi], "--freeendgap") == 0)
      {
        if(cmd_type != SEQ_ALIGN_NW_CMD)
          usage("--freeendgap only valid with Needleman-Wunsch");
        scoring->no_end_gap_penalty = true;
      }
      else if(strcasecmp(argv[argi], "--nogaps") == 0)
      {
        scoring->no_gaps_in_a = true;
        scoring->no_gaps_in_b = true;
      }
      else if(strcasecmp(argv[argi], "--nogapsin1") == 0)
      {
        scoring->no_gaps_in_a = true;
      }
      else if(strcasecmp(argv[argi], "--nogapsin2") == 0)
      {
        scoring->no_gaps_in_b = true;
      }
      else if(strcasecmp(argv[argi], "--nomismatches") == 0)
      {
        scoring->no_mismatches = true;
      }
      else if(strcasecmp(argv[argi], "--case_sensitive") == 0)
      {
        // Already dealt with in the first pass
      }
      else if(strcasecmp(argv[argi], "--printseq") == 0)
      {
        if(cmd_type != SEQ_ALIGN_SW_CMD)
          usage("--printseq only valid with Smith-Waterman");
        cmd->print_seq = true;
      }
      else if(strcasecmp(argv[argi], "--printmatrices") == 0)
      {
        cmd->print_matrices = true;
      }
      else if(strcasecmp(argv[argi], "--printscores") == 0)
      {
        if(cmd_type != SEQ_ALIGN_NW_CMD)
          usage("--printscores only valid with Needleman-Wunsch");
        cmd->print_scores = true;
      }
      else if(strcasecmp(argv[argi], "--printfasta") == 0)
      {
        cmd->print_fasta = true;
      }
      else if(strcasecmp(argv[argi], "--pretty") == 0)
      {
        cmd->print_pretty = true;
      }
      else if(strcasecmp(argv[argi], "--colour") == 0)
      {
        cmd->print_colour = true;
      }
      else if(strcasecmp(argv[argi], "--zam") == 0)
      {
        if(cmd_type != SEQ_ALIGN_NW_CMD)
          usage("--zam only valid with Needleman-Wunsch");
        cmd->zam_stle_output = true;
      }
      else if(strcasecmp(argv[argi], "--stdin") == 0)
      {
        // Similar to --file argument below
        cmdline_add_files(cmd, "", NULL);
        cmd->interactive = true;
      }
      else if(argi == argc-1)
      {
        // All the remaining options take an extra argument
        usage("Unknown argument without parameter: %s", argv[argi]);
      }
      else if(strcasecmp(argv[argi], "--scoring") == 0)
      {
        // This handled above
        argi++; // took an argument
      }
      else if(strcasecmp(argv[argi], "--substitution_matrix") == 0)
      {
        gzFile sub_matrix_file = gzopen(argv[argi+1], "r");
        if(sub_matrix_file == NULL) usage("Couldn't read: %s", argv[argi+1]);

        align_scoring_load_matrix(sub_matrix_file, argv[argi+1],
                                  scoring, cmd->case_sensitive);

        gzclose(sub_matrix_file);
        substitutions_set = true;

        argi++; // took an argument
      }
      else if(strcasecmp(argv[argi], "--substitution_pairs") == 0)
      {
        gzFile sub_pairs_file = gzopen(argv[argi+1], "r");
        if(sub_pairs_file == NULL) usage("Couldn't read: %s", argv[argi+1]);

        align_scoring_load_pairwise(sub_pairs_file, argv[argi+1],
                                    scoring, cmd->case_sensitive);

        gzclose(sub_pairs_file);
        substitutions_set = true;

        argi++; // took an argument
      }
      else if(strcasecmp(argv[argi], "--minscore") == 0)
      {
        if(cmd_type != SEQ_ALIGN_SW_CMD)
          usage("--minscore only valid with Smith-Waterman");

        if(!parse_entire_int(argv[argi+1], &cmd->min_score))
          usage("Invalid --minscore <score> argument (must be a +ve int)");

        cmd->min_score_set = true;

        argi++;
      }
      else if(strcasecmp(argv[argi], "--maxhits") == 0)
      {
        if(cmd_type != SEQ_ALIGN_SW_CMD)
          usage("--maxhits only valid with Smith-Waterman");

        if(!parse_entire_uint(argv[argi+1], &cmd->max_hits_per_alignment))
          usage("Invalid --maxhits <hits> argument (must be a +ve int)");

        cmd->max_hits_per_alignment_set = true;

        argi++;
      }
      else if(strcasecmp(argv[argi], "--context") == 0)
      {
        if(cmd_type != SEQ_ALIGN_SW_CMD)
          usage("--context only valid with Smith-Waterman");

        if(!parse_entire_uint(argv[argi+1], &cmd->print_context))
          usage("Invalid --context <c> argument (must be >= 0)");

        argi++;
      }
      else if(strcasecmp(argv[argi], "--match") == 0)
      {
        if(!parse_entire_int(argv[argi+1], &scoring->match))
        {
          usage("Invalid --match argument ('%s') must be an int",
                argv[argi+1]);
        }

        match_set = true;
        argi++; // took an argument
      }
      else if(strcasecmp(argv[argi], "--mismatch") == 0)
      {
        if(!parse_entire_int(argv[argi+1], &scoring->mismatch))
        {
          usage("Invalid --mismatch argument ('%s') must be an int",
                argv[argi+1]);
        }

        mismatch_set = true;
        argi++; // took an argument
      }
      else if(strcasecmp(argv[argi], "--gapopen") == 0)
      {
        if(!parse_entire_int(argv[argi+1], &scoring->gap_open))
        {
          usage("Invalid --gapopen argument ('%s') must be an int",
                argv[argi+1]);
        }

        argi++; // took an argument
      }
      else if(strcasecmp(argv[argi], "--gapextend") == 0)
      {
        if(!parse_entire_int(argv[argi+1], &scoring->gap_extend))
        {
          usage("Invalid --gapextend argument ('%s') must be an int",
                argv[argi+1]);
        }

        argi++; // took an argument
      }
      else if(strcasecmp(argv[argi], "--file") == 0)
      {
        cmdline_add_files(cmd, argv[argi+1], NULL);
        argi++; // took an argument
      }
      // Remaining options take two arguments but check themselves
      else if(strcasecmp(argv[argi], "--files") == 0)
      {
        if(argi >= argc-2)
        {
          usage("--files option takes 2 arguments");
        }
        else if(strcmp(argv[argi+1], "-") == 0 && strcmp(argv[argi+2], "-") == 0)
        {
          // Read both from stdin
          cmdline_add_files(cmd, argv[argi+1], NULL);
        }
        else
        {
          cmdline_add_files(cmd, argv[argi+1], argv[argi+2]);
        }

        argi += 2; // took two arguments
      }
      else if(strcasecmp(argv[argi], "--wildcard") == 0)
      {
        int wildscore = 0;

        // argi == argc-1 was rejected above, so argv[argi+1] exists here
        if(argi == argc-2 || strlen(argv[argi+1]) != 1 ||
           !parse_entire_int(argv[argi+2], &wildscore))
        {
          usage("--wildcard <w> <s> takes a single character and a number");
        }

        scoring_add_wildcard(scoring, argv[argi+1][0], wildscore);

        argi += 2; // took two arguments
      }
      else usage("Unknown argument '%s'", argv[argi]);
    }
    else
    {
      // First non-option argument: must be followed by exactly one more
      if(argc - argi != 2) usage("Unknown options: '%s'", argv[argi]);
      break;
    }
  }

  if((match_set && !mismatch_set && !scoring->no_mismatches) ||
     (!match_set && mismatch_set))
  {
    usage("--match --mismatch must both be set or neither set");
  }
  else if(substitutions_set && !match_set)
  {
    // if substitution table set and not match/mismatch
    scoring->use_match_mismatch = 0;
  }

  if(scoring->use_match_mismatch && scoring->match < scoring->mismatch)
  {
    usage("Match value should not be less than mismatch penalty");
  }

  // Cannot guarantee that we can perform a global alignment if nomismatches
  // and nogaps is true
  if(cmd_type == SEQ_ALIGN_NW_CMD && scoring->no_mismatches &&
     (scoring->no_gaps_in_a || scoring->no_gaps_in_b))
  {
    usage("--nogaps.. --nomismatches cannot be used at together");
  }

  // Set seq1 and seq2 if they have been passed.  argi < argc only happens
  // after the `break` above, where exactly two arguments remain.
  if(argi < argc)
  {
    cmd->seq1 = argv[argi];
    cmd->seq2 = argv[argi+1];
  }

  if(cmd->seq1 == NULL && cmd->file_list_length == 0)
  {
    usage("No input specified");
  }

  if(cmd->zam_stle_output &&
     (cmd->print_pretty || cmd->print_scores ||
      cmd->print_colour || cmd->print_fasta))
  {
    usage("Cannot use --printscore, --printfasta, --pretty or --colour with "
          "--zam");
  }

  return cmd;
}
void read_thread_args_parse(struct ReadThreadCmdArgs *args, int argc, char **argv, const struct option *longopts, bool correct_cmd) { size_t i; int tmp_thresh; // 0 => no calling, -1 => auto CorrectAlnInput task = CORRECT_ALN_INPUT_INIT; uint8_t fq_offset = 0; size_t dump_seq_n = 0, dump_mp_n = 0; // how many times are -g -G specified PathFileReader tmp_pfile; CorrectAlnInputBuffer *inputs = &args->inputs; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int used = 1, c; char *tmp_path; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': if(args->out_ctp_path != NULL) cmd_print_usage(NULL); args->out_ctp_path = optarg; break; case 'p': tmp_pfile = INIT_PATH_READER; path_file_open(&tmp_pfile, optarg, true); pfile_buf_add(&args->pfiles, tmp_pfile); break; case 't': if(args->num_of_threads != 0) die("%s set twice", cmd); args->num_of_threads = cmd_parse_arg_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&args->memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&args->memargs, optarg); break; case 'c': args->colour = cmd_parse_arg_uint32(cmd, optarg); break; case '1': case '2': case 'i': used = 1; correct_aln_input_buf_add(inputs, task); asyncio_task_parse(&inputs->data[inputs->len-1].files, c, optarg, fq_offset, correct_cmd ? 
&tmp_path : NULL); if(correct_cmd) inputs->data[inputs->len-1].out_base = tmp_path; break; case 'f': task.matedir = READPAIR_FR; used = 0; break; case 'F': task.matedir = READPAIR_FF; used = 0; break; case 'r': task.matedir = READPAIR_RF; used = 0; break; case 'R': task.matedir = READPAIR_RR; used = 0; break; case 'w': task.crt_params.one_way_gap_traverse = true; used = 0; break; case 'W': task.crt_params.one_way_gap_traverse = false; used = 0; break; case 'q': fq_offset = cmd_parse_arg_uint8(cmd, optarg); used = 0; break; case 'Q': task.fq_cutoff = cmd_parse_arg_uint8(cmd, optarg); used = 0; break; case 'H': task.hp_cutoff = cmd_parse_arg_uint8(cmd, optarg); used = 0; break; case 'e': task.crt_params.use_end_check = true; used = 0; break; case 'E': task.crt_params.use_end_check = false; used = 0; break; case 'g': task.crt_params.ins_gap_min = cmd_parse_arg_uint32(cmd, optarg); used = 0; break; case 'G': task.crt_params.ins_gap_max = cmd_parse_arg_uint32(cmd, optarg); used = 0; break; case 'S': args->dump_seq_sizes = optarg; dump_seq_n++; break; case 'M': args->dump_mp_sizes = optarg; dump_mp_n++; break; case 'u': args->use_new_paths = true; break; case 'C': if(optarg == NULL || strcmp(optarg,"auto")) args->clean_threshold = -1; else if(parse_entire_int(optarg,&tmp_thresh) && tmp_thresh >= -1) { if(tmp_thresh != -1 && tmp_thresh < 2) warn("Ignoring --clean %u (too small < 2)", tmp_thresh); else if(tmp_thresh > 255) warn("Ignoring --clean %u (too big > 255)", tmp_thresh); else args->clean_threshold = tmp_thresh; } else die("Bad argument for %s <auto|N> where N > 1", cmd); args->clean_paths = (args->clean_threshold != 0); break; case 'X': gen_paths_print_contigs = true; break; case 'Y': gen_paths_print_paths = true; break; case 'Z': gen_paths_print_reads = true; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" thread -h` for help. 
Bad option: %s", argv[optind-1]); default: abort(); } } if(args->num_of_threads == 0) args->num_of_threads = DEFAULT_NTHREADS; // Check that optind+1 == argc if(optind+1 > argc) cmd_print_usage("Expected exactly one graph file"); else if(optind+1 < argc) cmd_print_usage("Expected only one graph file. What is this: '%s'", argv[optind]); char *graph_path = argv[optind]; status("Reading graph: %s", graph_path); if(!used) cmd_print_usage("Ignored arguments after last --seq"); if(dump_seq_n > 1) die("Cannot specify --seq-gaps <out> more than once"); if(dump_mp_n > 1) die("Cannot specify --mp-gaps <out> more than once"); // // Open graph graph file // GraphFileReader *gfile = &args->gfile; graph_file_open(gfile, graph_path, true); file_filter_update_intocol(&gfile->fltr, 0); if(!correct_cmd && graph_file_usedcols(gfile) > 1) die("Please specify a single colour e.g. %s:0", gfile->fltr.file_path.buff); // // Open path files // size_t path_max_usedcols = 0; for(i = 0; i < args->pfiles.len; i++) { // file_filter_update_intocol(&args->pfiles.data[i].fltr, 0); if(!correct_cmd && path_file_usedcols(&args->pfiles.data[i]) > 1) { die("Please specify a single colour e.g. %s:0", args->pfiles.data[i].fltr.file_path.buff); } path_max_usedcols = MAX2(path_max_usedcols, path_file_usedcols(&args->pfiles.data[i])); } args->path_max_usedcols = path_max_usedcols; // Check for compatibility between graph files and path files graphs_paths_compatible(gfile, 1, args->pfiles.data, args->pfiles.len); // Check ins_gap_min < ins_gap_max for(i = 0; i < inputs->len; i++) { CorrectAlnInput *t = &inputs->data[i]; t->files.ptr = t; if(t->crt_params.ins_gap_min > t->crt_params.ins_gap_max) { die("--min-ins %u is greater than --max-ins %u", t->crt_params.ins_gap_min, t->crt_params.ins_gap_max); } correct_aln_input_print(&inputs->data[i]); args->max_gap_limit = MAX2(args->max_gap_limit, t->crt_params.ins_gap_max); } }