static void parse_args(int argc, char **argv) { seq_format fmt = SEQ_FMT_FASTQ; bool invert = false; size_t i; aln_reads_buf_alloc(&inputs, 8); asyncio_buf_alloc(&files, 8); AlignReadsData input; AsyncIOInput seqfiles; // Arg parsing char cmd[100], shortopts[100]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 't': cmd_check(!nthreads,cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'F': cmd_check(fmt==SEQ_FMT_FASTQ, cmd); fmt = cmd_parse_format(cmd, optarg); break; case 'v': cmd_check(!invert,cmd); invert = true; break; case '1': case '2': case 'i': memset(&input, 0, sizeof(input)); memset(&seqfiles, 0, sizeof(seqfiles)); asyncio_task_parse(&seqfiles, c, optarg, 0, &input.out_base); aln_reads_buf_push(&inputs, &input, 1); asyncio_buf_push(&files, &seqfiles, 1); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" reads -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } ctx_assert(inputs.len == files.len); // Defaults if(!nthreads) nthreads = DEFAULT_NTHREADS; if(inputs.len == 0) cmd_print_usage("Please specify at least one sequence file (-1, -2 or -i)"); if(optind >= argc) cmd_print_usage("Please specify input graph file(s)"); num_gfiles = (size_t)(argc - optind); gfile_paths = argv + optind; for(i = 0; i < inputs.len; i++) { inputs.b[i].invert = invert; inputs.b[i].fmt = fmt; files.b[i].ptr = &inputs.b[i]; } }
int ctx_rmsubstr(int argc, char **argv) { struct MemArgs memargs = MEM_ARGS_INIT; size_t kmer_size = 0, nthreads = 0; const char *output_file = NULL; seq_format fmt = SEQ_FMT_FASTA; bool invert = false; // Arg parsing char cmd[100], shortopts[100]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'o': cmd_check(!output_file, cmd); output_file = optarg; break; case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'k': cmd_check(!kmer_size,cmd); kmer_size = cmd_uint32(cmd, optarg); break; case 'F': cmd_check(fmt==SEQ_FMT_FASTA, cmd); fmt = cmd_parse_format(cmd, optarg); break; case 'v': cmd_check(!invert,cmd); invert = true; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); cmd_print_usage("`"CMD" rmsubstr -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } // Defaults if(!nthreads) nthreads = DEFAULT_NTHREADS; if(!kmer_size) kmer_size = DEFAULT_KMER; if(!(kmer_size&1)) cmd_print_usage("Kmer size must be odd"); if(kmer_size < MIN_KMER_SIZE) cmd_print_usage("Kmer size too small (recompile)"); if(kmer_size > MAX_KMER_SIZE) cmd_print_usage("Kmer size too large (recompile?)"); if(optind >= argc) cmd_print_usage("Please specify at least one input sequence file (.fq, .fq etc.)"); size_t i, num_seq_files = argc - optind; char **seq_paths = argv + optind; seq_file_t **seq_files = ctx_calloc(num_seq_files, sizeof(seq_file_t*)); for(i = 0; i < num_seq_files; i++) if((seq_files[i] = seq_open(seq_paths[i])) == NULL) die("Cannot read sequence file %s", seq_paths[i]); // Estimate number of bases // set to -1 if we cannot calc int64_t est_num_bases = seq_est_seq_bases(seq_files, num_seq_files); if(est_num_bases < 0) { warn("Cannot get file sizes, using pipes"); est_num_bases = memargs.num_kmers * IDEAL_OCCUPANCY; } status("[memory] Estimated number of bases: %li", (long)est_num_bases); // Use file sizes to decide on memory // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem; bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(KONodeList) + sizeof(KOccur) + // see kmer_occur.h 8; // 1 byte per kmer for each base to load sequence files kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, est_num_bases, est_num_bases, false, &graph_mem); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); // // Open output file // if(output_file == NULL) output_file = "-"; FILE *fout = futil_fopen_create(output_file, "w"); // // Set up memory // dBGraph db_graph; db_graph_alloc(&db_graph, kmer_size, 1, 0, kmers_in_hash, DBG_ALLOC_BKTLOCKS); // // Load reference sequence into a read buffer // ReadBuffer rbuf; read_buf_alloc(&rbuf, 1024); seq_load_all_reads(seq_files, num_seq_files, &rbuf); // Check for reads too short for(i = 0; i < rbuf.len && rbuf.b[i].seq.end >= kmer_size; i++) {} if(i < rbuf.len) warn("Reads shorter than kmer size (%zu) will not be filtered", kmer_size); KOGraph kograph = kograph_create(rbuf.b, rbuf.len, true, 0, nthreads, &db_graph); size_t num_reads = rbuf.len, num_reads_printed = 0, num_bad_reads = 0; // Loop over reads printing those that are not substrings int ret; for(i = 0; i < rbuf.len; i++) { ret = _is_substr(&rbuf, i, &kograph, &db_graph); if(ret == -1) num_bad_reads++; else if((ret && invert) || (!ret && !invert)) { seqout_print_read(&rbuf.b[i], fmt, fout); num_reads_printed++; } } char num_reads_str[100], num_reads_printed_str[100], num_bad_reads_str[100]; ulong_to_str(num_reads, num_reads_str); ulong_to_str(num_reads_printed, num_reads_printed_str); ulong_to_str(num_bad_reads, num_bad_reads_str); status("Printed %s / %s (%.1f%%) to %s", num_reads_printed_str, num_reads_str, !num_reads ? 0.0 : (100.0 * num_reads_printed) / num_reads, futil_outpath_str(output_file)); if(num_bad_reads > 0) { status("Bad reads: %s / %s (%.1f%%) - no kmer {ACGT} of length %zu", num_bad_reads_str, num_reads_str, (100.0 * num_bad_reads) / num_reads, kmer_size); } fclose(fout); kograph_dealloc(&kograph); // Free sequence memory for(i = 0; i < rbuf.len; i++) seq_read_dealloc(&rbuf.b[i]); read_buf_dealloc(&rbuf); ctx_free(seq_files); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
void read_thread_args_parse(struct ReadThreadCmdArgs *args, int argc, char **argv, const struct option *longopts, bool correct_cmd) { size_t i; CorrectAlnInput task = CORRECT_ALN_INPUT_INIT; uint8_t fq_offset = 0; GPathReader tmp_gpfile; CorrectAlnInputBuffer *inputs = &args->inputs; args->memargs = (struct MemArgs)MEM_ARGS_INIT; args->fmt = SEQ_FMT_FASTQ; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int used = 1, c; char *tmp_path; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'o': cmd_check(!args->out_ctp_path,cmd); args->out_ctp_path = optarg; break; case 'p': memset(&tmp_gpfile, 0, sizeof(GPathReader)); gpath_reader_open(&tmp_gpfile, optarg); gpfile_buf_push(&args->gpfiles, &tmp_gpfile, 1); break; case 't': cmd_check(!args->nthreads, cmd); args->nthreads = cmd_uint32_nonzero(cmd, optarg); break; case 'm': cmd_mem_args_set_memory(&args->memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&args->memargs, optarg); break; case 'c': args->colour = cmd_uint32(cmd, optarg); break; case 'F': cmd_check(args->fmt == SEQ_FMT_FASTQ, cmd); args->fmt = cmd_parse_format(cmd, optarg); break; case '1': case '2': case 'i': used = 1; correct_aln_input_buf_push(inputs, &task, 1); asyncio_task_parse(&inputs->b[inputs->len-1].files, c, optarg, fq_offset, correct_cmd ? &tmp_path : NULL); if(correct_cmd) inputs->b[inputs->len-1].out_base = tmp_path; break; case 'M': if(!strcmp(optarg,"FF")) task.matedir = READPAIR_FF; else if(!strcmp(optarg,"FR")) task.matedir = READPAIR_FR; else if(!strcmp(optarg,"RF")) task.matedir = READPAIR_RF; else if(!strcmp(optarg,"RR")) task.matedir = READPAIR_RR; else die("-M,--matepair <orient> must be one of: FF,FR,RF,RR"); used = 0; break; case 'O': fq_offset = cmd_uint8(cmd, optarg); used = 0; break; case 'Q': task.fq_cutoff = cmd_uint8(cmd, optarg); used = 0; break; case 'H': task.hp_cutoff = cmd_uint8(cmd, optarg); used = 0; break; case 'l': task.crt_params.frag_len_min = cmd_uint32(cmd, optarg); used = 0; break; case 'L': task.crt_params.frag_len_max = cmd_uint32(cmd, optarg); used = 0; break; case 'w': task.crt_params.one_way_gap_traverse = true; used = 0; break; case 'W': task.crt_params.one_way_gap_traverse = false; used = 0; break; case 'd': task.crt_params.gap_wiggle = cmd_udouble(cmd, optarg); used = 0; break; case 'D': task.crt_params.gap_variance = cmd_udouble(cmd, optarg); used = 0; break; case 'X': task.crt_params.max_context = cmd_uint32(cmd, optarg); used = 0; break; case 'e': task.crt_params.use_end_check = true; used = 0; break; case 'E': task.crt_params.use_end_check = false; used = 0; break; case 'g': cmd_check(!args->dump_seq_sizes, cmd); args->dump_seq_sizes = optarg; break; case 'G': cmd_check(!args->dump_frag_sizes, cmd); args->dump_frag_sizes = optarg; break; case 'u': args->use_new_paths = true; break; case 'x': gen_paths_print_contigs = true; break; case 'y': gen_paths_print_paths = true; break; case 'z': gen_paths_print_reads = true; break; case 'Z': cmd_check(!args->fq_zero, cmd); if(strlen(optarg) != 1) cmd_print_usage("--fq-zero <c> requires a single char"); args->fq_zero = optarg[0]; break; case 'P': cmd_check(!args->append_orig_seq,cmd); args->append_orig_seq = true; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" thread/correct -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } if(args->nthreads == 0) args->nthreads = DEFAULT_NTHREADS; // Check that optind+1 == argc if(optind+1 > argc) cmd_print_usage("Expected exactly one graph file"); else if(optind+1 < argc) cmd_print_usage("Expected only one graph file. What is this: '%s'", argv[optind]); char *graph_path = argv[optind]; status("Reading graph: %s", graph_path); if(!used) cmd_print_usage("Ignored arguments after last --seq"); // ctx_thread requires output file if(!correct_cmd && !args->out_ctp_path) cmd_print_usage("--out <out.ctp> is required"); // // Open graph graph file // GraphFileReader *gfile = &args->gfile; graph_file_open(gfile, graph_path); if(!correct_cmd && file_filter_into_ncols(&gfile->fltr) > 1) die("Please specify a single colour e.g. %s:0", file_filter_path(&gfile->fltr)); // // Open path files // size_t path_max_usedcols = 0; for(i = 0; i < args->gpfiles.len; i++) { // file_filter_update_intocol(&args->pfiles.b[i].fltr, 0); if(!correct_cmd && file_filter_into_ncols(&args->gpfiles.b[i].fltr) > 1) { die("Please specify a single colour e.g. %s:0", file_filter_path(&args->gpfiles.b[i].fltr)); } path_max_usedcols = MAX2(path_max_usedcols, file_filter_into_ncols(&args->gpfiles.b[i].fltr)); } args->path_max_usedcols = path_max_usedcols; // Check for compatibility between graph files and path files graphs_gpaths_compatible(gfile, 1, args->gpfiles.b, args->gpfiles.len, -1); // if no paths loaded, set all max_context values to 1, since >1 kmer only // useful if can pickup paths if(args->gpfiles.len == 0) { for(i = 0; i < inputs->len; i++) inputs->b[i].crt_params.max_context = 1; } // Check frag_len_min < frag_len_max for(i = 0; i < inputs->len; i++) { CorrectAlnInput *t = &inputs->b[i]; t->files.ptr = t; if(t->crt_params.frag_len_min > t->crt_params.frag_len_max) { die("--min-ins %u is greater than --max-ins %u", t->crt_params.frag_len_min, t->crt_params.frag_len_max); } correct_aln_input_print(&inputs->b[i]); args->max_gap_limit = MAX2(args->max_gap_limit, t->crt_params.frag_len_max); } futil_create_output(args->dump_seq_sizes); futil_create_output(args->dump_frag_sizes); }