コード例 #1
0
ファイル: alignment_scoring_load.c プロジェクト: xujl12/BLAST
/**
 * Load pairwise substitution scores from an open gzip stream into `scoring`.
 *
 * Lines that are empty, start with '#', or contain only whitespace are
 * skipped. Every other line must hold two single characters and an integer
 * score in one of two layouts:
 *   - whitespace separated, e.g.            "a b 3"
 *   - one repeated separator character, e.g. "a,b,3"
 * Each parsed triple is handed to scoring_add_mutation().
 *
 * @param file           stream to read lines from (it is not closed here)
 * @param file_path      path forwarded to _loading_error() for reporting
 * @param scoring        scoring scheme that receives the pairs
 * @param case_sensitive if zero, both characters are lower-cased before
 *                       being added
 *
 * Malformed lines, and a file that yields no pairs at all, are reported
 * through _loading_error().
 * NOTE(review): the code after each check assumes _loading_error() does not
 * return -- confirm against its definition.
 */
void align_scoring_load_pairwise(gzFile file, const char* file_path,
                                 scoring_t* scoring, char case_sensitive)
{
    StrBuf* sbuf = strbuf_new(200);
    size_t read_length;
    int line_num = 0;

    char a, b;
    int score;

    int num_pairs_added = 0;

    while((read_length = strbuf_reset_gzreadline(sbuf, file)) > 0)
    {
        strbuf_chomp(sbuf);

        if(sbuf->end > 0 && sbuf->b[0] != '#' && // line is not empty, not comment
                !string_is_all_whitespace(sbuf->b)) // and not whitespace
        {
            // read_length is the length before strbuf_chomp(); requiring at
            // least 5 characters also keeps the fixed indices b[1]..b[4]
            // used below inside the data that was read
            if(read_length < 5)
            {
                _loading_error("Too few column headings", file_path, line_num, 0);
            }

            // <ctype.h> functions take an int that must be representable as
            // unsigned char (or EOF); plain char may be negative for bytes
            // above 0x7F, so cast before every call
            if(isspace((unsigned char)sbuf->b[1]))
            {
                // split by whitespace
                a = sbuf->b[0];

                size_t char2_pos;

                // skip the run of whitespace that follows the first character
                for(char2_pos = 1;
                        sbuf->b[char2_pos] != '\0' &&
                        isspace((unsigned char)sbuf->b[char2_pos]);
                        char2_pos++);

                // need: second character, one whitespace, then >= 1 more char
                if(char2_pos+2 >= sbuf->end ||
                        !isspace((unsigned char)sbuf->b[char2_pos+1]))
                {
                    _loading_error("Line too short", file_path, line_num, 0);
                }

                b = sbuf->b[char2_pos];

                if(!parse_entire_int(sbuf->b+char2_pos+2, &score))
                {
                    _loading_error("Invalid number", file_path, line_num, 0);
                }
            }
            else
            {
                // fixed layout: <a><sep><b><sep><score>, same separator twice
                if(sbuf->b[1] != sbuf->b[3])
                {
                    _loading_error("Inconsistent separators used", file_path, line_num, 0);
                }

                a = sbuf->b[0];
                b = sbuf->b[2];

                if(!parse_entire_int(sbuf->b + 4, &score))
                {
                    _loading_error("Invalid number", file_path, line_num, 0);
                }
            }

            if(!case_sensitive)
            {
                a = (char)tolower((unsigned char)a);
                b = (char)tolower((unsigned char)b);
            }

            scoring_add_mutation(scoring, a, b, score);
            num_pairs_added++;
        }

        line_num++;
    }

    strbuf_free(sbuf);

    if(num_pairs_added == 0)
    {
        _loading_error("No pairs added from file (file empty?)",
                       file_path, line_num, 0);
    }
}
コード例 #2
0
ファイル: alignment_cmdline.c プロジェクト: fw1121/seq-align
/**
 * Parse the command line into a newly allocated cmdline_t and update
 * `scoring` in place according to the options given.
 *
 * argv is scanned twice:
 *   1. A first pass handles --help, --case_sensitive and --scoring, because
 *      the case-sensitivity flag and base scoring system must be known
 *      before substitution files are loaded.
 *   2. A second pass handles every other option. It stops at the first
 *      argument that does not start with '-'; exactly two arguments must
 *      remain at that point and they become seq1 and seq2.
 *
 * @param argc     argument count as passed to main()
 * @param argv     argument vector as passed to main(); seq1/seq2 point into
 *                 it (the strings are not copied)
 * @param scoring  scoring scheme to modify
 * @param cmd_type which aligner is being run; some options are rejected
 *                 unless it is SEQ_ALIGN_NW_CMD or SEQ_ALIGN_SW_CMD
 * @return the populated cmdline_t, allocated here with calloc()/malloc()
 *         and not freed by this function
 *
 * All invalid input is reported through usage().
 * NOTE(review): the code after each usage() call assumes usage() does not
 * return -- confirm against its definition.
 */
cmdline_t* cmdline_new(int argc, char **argv, scoring_t *scoring,
                       enum SeqAlignCmdType cmd_type)
{
  cmdline_t* cmd = calloc(1, sizeof(cmdline_t));
  if(cmd == NULL) usage("Out of memory");

  cmd->file_list_length = 0;
  cmd->file_list_capacity = 256;
  cmd->file_paths1 = malloc(sizeof(char*) * cmd->file_list_capacity);
  cmd->file_paths2 = malloc(sizeof(char*) * cmd->file_list_capacity);
  if(cmd->file_paths1 == NULL || cmd->file_paths2 == NULL)
    usage("Out of memory");

  cmd->seq1 = cmd->seq2 = NULL;
  // All values initially 0

  if(argc == 1) usage(NULL);

  // First run through arguments to set up case_sensitive and scoring system

  // case sensitive needs to be dealt with first
  // (it is used to construct hash table for swap_scores)
  char scoring_set = 0, substitutions_set = 0, match_set = 0, mismatch_set = 0;

  int argi;
  for(argi = 1; argi < argc; argi++)
  {
    if(strcasecmp(argv[argi], "--help") == 0 ||
       strcasecmp(argv[argi], "-help") == 0 ||
       strcasecmp(argv[argi], "-h") == 0)
    {
      usage(NULL);
    }
    else if(strcasecmp(argv[argi], "--case_sensitive") == 0)
    {
      cmd->case_sensitive = 1;
    }
    else if(strcasecmp(argv[argi], "--scoring") == 0)
    {
      if(scoring_set)
      {
        usage("More than one scoring system specified - not permitted");
      }

      // argv[argc] is a null pointer, so the value must be checked for
      // before it is handed to strcasecmp()
      if(argi == argc-1)
      {
        usage("Unknown argument without parameter: %s", argv[argi]);
      }

      if(strcasecmp(argv[argi+1], "PAM30") == 0)
      {
        scoring_system_PAM30(scoring);
      }
      else if(strcasecmp(argv[argi+1], "PAM70") == 0)
      {
        scoring_system_PAM70(scoring);
      }
      else if(strcasecmp(argv[argi+1], "BLOSUM80") == 0)
      {
        scoring_system_BLOSUM80(scoring);
      }
      else if(strcasecmp(argv[argi+1], "BLOSUM62") == 0)
      {
        scoring_system_BLOSUM62(scoring);
      }
      else if(strcasecmp(argv[argi+1], "DNA_HYBRIDIZATION") == 0)
      {
        scoring_system_DNA_hybridization(scoring);
      }
      else {
        usage("Unknown --scoring choice, not one of "
              "PAM30|PAM70|BLOSUM80|BLOSUM62|DNA_HYBRIDIZATION");
      }

      scoring_set = 1;
      argi++; // took an argument
    }
  }

  // Second pass: everything else. Options are matched case-insensitively.
  for(argi = 1; argi < argc; argi++)
  {
    if(argv[argi][0] == '-')
    {
      // strcasecmp does case insensitive comparison
      if(strcasecmp(argv[argi], "--freestartgap") == 0)
      {
        if(cmd_type != SEQ_ALIGN_NW_CMD)
          usage("--freestartgap only valid with Needleman-Wunsch");
        scoring->no_start_gap_penalty = true;
      }
      else if(strcasecmp(argv[argi], "--freeendgap") == 0)
      {
        if(cmd_type != SEQ_ALIGN_NW_CMD)
          usage("--freeendgap only valid with Needleman-Wunsch");
        scoring->no_end_gap_penalty = true;
      }
      else if(strcasecmp(argv[argi], "--nogaps") == 0)
      {
        scoring->no_gaps_in_a = true;
        scoring->no_gaps_in_b = true;
      }
      else if(strcasecmp(argv[argi], "--nogapsin1") == 0)
      {
        scoring->no_gaps_in_a = true;
      }
      else if(strcasecmp(argv[argi], "--nogapsin2") == 0)
      {
        scoring->no_gaps_in_b = true;
      }
      else if(strcasecmp(argv[argi], "--nomismatches") == 0)
      {
        scoring->no_mismatches = true;
      }
      else if(strcasecmp(argv[argi], "--case_sensitive") == 0)
      {
        // Already dealt with in the first pass
      }
      else if(strcasecmp(argv[argi], "--printseq") == 0)
      {
        if(cmd_type != SEQ_ALIGN_SW_CMD)
          usage("--printseq only valid with Smith-Waterman");
        cmd->print_seq = true;
      }
      else if(strcasecmp(argv[argi], "--printmatrices") == 0)
      {
        cmd->print_matrices = true;
      }
      else if(strcasecmp(argv[argi], "--printscores") == 0)
      {
        if(cmd_type != SEQ_ALIGN_NW_CMD)
          usage("--printscores only valid with Needleman-Wunsch");
        cmd->print_scores = true;
      }
      else if(strcasecmp(argv[argi], "--printfasta") == 0)
      {
        cmd->print_fasta = true;
      }
      else if(strcasecmp(argv[argi], "--pretty") == 0)
      {
        cmd->print_pretty = true;
      }
      else if(strcasecmp(argv[argi], "--colour") == 0)
      {
        cmd->print_colour = true;
      }
      else if(strcasecmp(argv[argi], "--zam") == 0)
      {
        if(cmd_type != SEQ_ALIGN_NW_CMD)
          usage("--zam only valid with Needleman-Wunsch");
        cmd->zam_stle_output = true;
      }
      else if(strcasecmp(argv[argi], "--stdin") == 0)
      {
        // Similar to --file argument below
        cmdline_add_files(cmd, "", NULL);
        cmd->interactive = true;
      }
      else if(argi == argc-1)
      {
        // All the remaining options take an extra argument, so from here on
        // argv[argi+1] is known to exist
        usage("Unknown argument without parameter: %s", argv[argi]);
      }
      else if(strcasecmp(argv[argi], "--scoring") == 0)
      {
        // This handled above
        argi++; // took an argument
      }
      else if(strcasecmp(argv[argi], "--substitution_matrix") == 0)
      {
        gzFile sub_matrix_file = gzopen(argv[argi+1], "r");
        if(sub_matrix_file == NULL) usage("Couldn't read: %s", argv[argi+1]);

        align_scoring_load_matrix(sub_matrix_file, argv[argi+1],
                                  scoring, cmd->case_sensitive);

        gzclose(sub_matrix_file);
        substitutions_set = true;

        argi++; // took an argument
      }
      else if(strcasecmp(argv[argi], "--substitution_pairs") == 0)
      {
        gzFile sub_pairs_file = gzopen(argv[argi+1], "r");
        if(sub_pairs_file == NULL) usage("Couldn't read: %s", argv[argi+1]);

        align_scoring_load_pairwise(sub_pairs_file, argv[argi+1],
                                    scoring, cmd->case_sensitive);

        gzclose(sub_pairs_file);
        substitutions_set = true;

        argi++; // took an argument
      }
      else if(strcasecmp(argv[argi], "--minscore") == 0)
      {
        if(cmd_type != SEQ_ALIGN_SW_CMD)
          usage("--minscore only valid with Smith-Waterman");

        if(!parse_entire_int(argv[argi+1], &cmd->min_score))
          usage("Invalid --minscore <score> argument (must be a +ve int)");

        cmd->min_score_set = true;

        argi++;
      }
      else if(strcasecmp(argv[argi], "--maxhits") == 0)
      {
        if(cmd_type != SEQ_ALIGN_SW_CMD)
          usage("--maxhits only valid with Smith-Waterman");

        if(!parse_entire_uint(argv[argi+1], &cmd->max_hits_per_alignment))
          usage("Invalid --maxhits <hits> argument (must be a +ve int)");

        cmd->max_hits_per_alignment_set = true;

        argi++;
      }
      else if(strcasecmp(argv[argi], "--context") == 0)
      {
        if(cmd_type != SEQ_ALIGN_SW_CMD)
          usage("--context only valid with Smith-Waterman");

        if(!parse_entire_uint(argv[argi+1], &cmd->print_context))
          usage("Invalid --context <c> argument (must be >= 0)");

        argi++;
      }
      else if(strcasecmp(argv[argi], "--match") == 0)
      {
        if(!parse_entire_int(argv[argi+1], &scoring->match))
        {
          usage("Invalid --match argument ('%s') must be an int", argv[argi+1]);
        }

        match_set = true;
        argi++; // took an argument
      }
      else if(strcasecmp(argv[argi], "--mismatch") == 0)
      {
        if(!parse_entire_int(argv[argi+1], &scoring->mismatch))
        {
          usage("Invalid --mismatch argument ('%s') must be an int", argv[argi+1]);
        }

        mismatch_set = true;
        argi++; // took an argument
      }
      else if(strcasecmp(argv[argi], "--gapopen") == 0)
      {
        if(!parse_entire_int(argv[argi+1], &scoring->gap_open))
        {
          usage("Invalid --gapopen argument ('%s') must be an int", argv[argi+1]);
        }

        argi++; // took an argument
      }
      else if(strcasecmp(argv[argi], "--gapextend") == 0)
      {
        if(!parse_entire_int(argv[argi+1], &scoring->gap_extend))
        {
          usage("Invalid --gapextend argument ('%s') must be an int",
                argv[argi+1]);
        }

        argi++; // took an argument
      }
      else if(strcasecmp(argv[argi], "--file") == 0)
      {
        cmdline_add_files(cmd, argv[argi+1], NULL);
        argi++; // took an argument
      }
      // Remaining options take two arguments but check themselves
      else if(strcasecmp(argv[argi], "--files") == 0)
      {
        if(argi >= argc-2)
        {
          usage("--files option takes 2 arguments");
        }
        else if(strcmp(argv[argi+1], "-") == 0 && strcmp(argv[argi+2], "-") == 0)
        {
          // Read both from stdin
          cmdline_add_files(cmd, argv[argi+1], NULL);
        }
        else
        {
          cmdline_add_files(cmd, argv[argi+1], argv[argi+2]);
        }

        argi += 2; // took two arguments
      }
      else if(strcasecmp(argv[argi], "--wildcard") == 0)
      {
        int wildscore = 0;

        // argi == argc-1 was rejected above, so only argc-2 needs checking
        // here; short-circuiting keeps argv[argi+2] from being read then
        if(argi == argc-2 || strlen(argv[argi+1]) != 1 ||
           !parse_entire_int(argv[argi+2], &wildscore))
        {
          usage("--wildcard <w> <s> takes a single character and a number");
        }

        scoring_add_wildcard(scoring, argv[argi+1][0], wildscore);

        argi += 2; // took two arguments
      }
      else usage("Unknown argument '%s'", argv[argi]);
    }
    else
    {
      // First non-option argument: exactly two sequences must remain
      if(argc - argi != 2) usage("Unknown options: '%s'", argv[argi]);
      break;
    }
  }

  if((match_set && !mismatch_set && !scoring->no_mismatches) ||
     (!match_set && mismatch_set))
  {
    usage("--match --mismatch must both be set or neither set");
  }
  else if(substitutions_set && !match_set)
  {
    // if substitution table set and not match/mismatch
    scoring->use_match_mismatch = 0;
  }

  if(scoring->use_match_mismatch && scoring->match < scoring->mismatch) {
    usage("Match value should not be less than mismatch penalty");
  }

  // Cannot guarantee that we can perform a global alignment if nomismatches
  // and nogaps is true
  if(cmd_type == SEQ_ALIGN_NW_CMD && scoring->no_mismatches &&
     (scoring->no_gaps_in_a || scoring->no_gaps_in_b))
  {
    usage("--nogaps.. --nomismatches cannot be used at together");
  }

  // Set seq1 and seq2 if they have been passed: argi < argc only when the
  // loop above stopped early with exactly two arguments left
  if(argi < argc)
  {
    cmd->seq1 = argv[argi];
    cmd->seq2 = argv[argi+1];
  }

  if(cmd->seq1 == NULL && cmd->file_list_length == 0)
  {
    usage("No input specified");
  }

  if(cmd->zam_stle_output &&
     (cmd->print_pretty || cmd->print_scores ||
      cmd->print_colour || cmd->print_fasta))
  {
    usage("Cannot use --printscore, --printfasta, --pretty or --colour with "
          "--zam");
  }

  return cmd;
}
コード例 #3
0
/**
 * Parse the command line options into `args`, then open and validate the
 * graph file and check the previously opened path files.
 *
 * Per-input settings (mate direction, quality cutoffs, gap limits, ...) are
 * accumulated in a local template `task`; each input option ('1', '2', 'i')
 * appends a copy of the current template to args->inputs. Settings given
 * after the last input would never be applied, so that case is rejected.
 *
 * @param args        destination for all parsed settings and opened files
 * @param argc        argument count
 * @param argv        argument vector; exactly one non-option argument (the
 *                    graph file path) must remain after option parsing
 * @param longopts    long option table passed to getopt_long_only(); the
 *                    short option string is derived from it
 * @param correct_cmd when true, asyncio_task_parse() is asked for an output
 *                    path which is stored as the input's out_base, and the
 *                    single-colour restriction on graph/path files is skipped
 *
 * Errors are reported through cmd_print_usage() / die().
 * NOTE(review): the code assumes neither of those returns -- confirm.
 */
void read_thread_args_parse(struct ReadThreadCmdArgs *args,
                            int argc, char **argv,
                            const struct option *longopts, bool correct_cmd)
{
  size_t i;
  int tmp_thresh; // parsed value of the clean threshold; -1 => auto
  CorrectAlnInput task = CORRECT_ALN_INPUT_INIT;
  uint8_t fq_offset = 0;
  size_t dump_seq_n = 0, dump_mp_n = 0; // how many times are -S -M specified
  PathFileReader tmp_pfile;

  CorrectAlnInputBuffer *inputs = &args->inputs;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  // used == 0 while there are per-input settings that no input has consumed
  int used = 1, c;
  char *tmp_path;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o':
        // output path may only be given once
        if(args->out_ctp_path != NULL) cmd_print_usage(NULL);
        args->out_ctp_path = optarg;
        break;
      case 'p':
        tmp_pfile = INIT_PATH_READER;
        path_file_open(&tmp_pfile, optarg, true);
        pfile_buf_add(&args->pfiles, tmp_pfile);
        break;
      case 't':
        if(args->num_of_threads != 0) die("%s set twice", cmd);
        args->num_of_threads = cmd_parse_arg_uint32_nonzero(cmd, optarg);
        break;
      case 'm': cmd_mem_args_set_memory(&args->memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&args->memargs, optarg); break;
      case 'c': args->colour = cmd_parse_arg_uint32(cmd, optarg); break;
      case '1':
      case '2':
      case 'i':
        // new input: snapshot the current per-input settings
        used = 1;
        correct_aln_input_buf_add(inputs, task);
        asyncio_task_parse(&inputs->data[inputs->len-1].files, c, optarg,
                           fq_offset, correct_cmd ? &tmp_path : NULL);
        if(correct_cmd) inputs->data[inputs->len-1].out_base = tmp_path;
        break;
      // The options below only modify the template for subsequent inputs
      case 'f': task.matedir = READPAIR_FR; used = 0; break;
      case 'F': task.matedir = READPAIR_FF; used = 0; break;
      case 'r': task.matedir = READPAIR_RF; used = 0; break;
      case 'R': task.matedir = READPAIR_RR; used = 0; break;
      case 'w': task.crt_params.one_way_gap_traverse = true; used = 0; break;
      case 'W': task.crt_params.one_way_gap_traverse = false; used = 0; break;
      case 'q': fq_offset = cmd_parse_arg_uint8(cmd, optarg); used = 0; break;
      case 'Q': task.fq_cutoff = cmd_parse_arg_uint8(cmd, optarg); used = 0; break;
      case 'H': task.hp_cutoff = cmd_parse_arg_uint8(cmd, optarg); used = 0; break;
      case 'e': task.crt_params.use_end_check = true; used = 0; break;
      case 'E': task.crt_params.use_end_check = false; used = 0; break;
      case 'g': task.crt_params.ins_gap_min = cmd_parse_arg_uint32(cmd, optarg); used = 0; break;
      case 'G': task.crt_params.ins_gap_max = cmd_parse_arg_uint32(cmd, optarg); used = 0; break;
      case 'S': args->dump_seq_sizes = optarg; dump_seq_n++; break;
      case 'M': args->dump_mp_sizes = optarg; dump_mp_n++; break;
      case 'u': args->use_new_paths = true; break;
      case 'C':
        // strcmp() returns 0 on a match: no argument or the literal "auto"
        // selects the automatic threshold (-1), anything else must parse as
        // an integer
        if(optarg == NULL || strcmp(optarg,"auto") == 0) args->clean_threshold = -1;
        else if(parse_entire_int(optarg,&tmp_thresh) && tmp_thresh >= -1) {
          // out-of-range values are ignored with a warning and leave the
          // previous clean_threshold in place
          if(tmp_thresh != -1 && tmp_thresh < 2)
            warn("Ignoring --clean %d (too small < 2)", tmp_thresh);
          else if(tmp_thresh > 255)
            warn("Ignoring --clean %d (too big > 255)", tmp_thresh);
          else
            args->clean_threshold = tmp_thresh;
        }
        else die("Bad argument for %s <auto|N> where N > 1", cmd);
        args->clean_paths = (args->clean_threshold != 0);
        break;
      case 'X': gen_paths_print_contigs = true; break;
      case 'Y': gen_paths_print_paths = true; break;
      case 'Z': gen_paths_print_reads = true; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" thread -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  if(args->num_of_threads == 0) args->num_of_threads = DEFAULT_NTHREADS;

  // Check that optind+1 == argc
  if(optind+1 > argc)
    cmd_print_usage("Expected exactly one graph file");
  else if(optind+1 < argc)
    cmd_print_usage("Expected only one graph file. What is this: '%s'", argv[optind]);

  char *graph_path = argv[optind];
  status("Reading graph: %s", graph_path);

  // per-input settings were changed but no input followed them
  if(!used) cmd_print_usage("Ignored arguments after last --seq");

  if(dump_seq_n > 1) die("Cannot specify --seq-gaps <out> more than once");
  if(dump_mp_n > 1) die("Cannot specify --mp-gaps <out> more than once");

  //
  // Open graph file
  //
  GraphFileReader *gfile = &args->gfile;
  graph_file_open(gfile, graph_path, true);
  file_filter_update_intocol(&gfile->fltr, 0);
  if(!correct_cmd && graph_file_usedcols(gfile) > 1)
    die("Please specify a single colour e.g. %s:0", gfile->fltr.file_path.buff);

  //
  // Check path files (already opened while parsing 'p' options) and record
  // the largest number of colours any of them uses
  //
  size_t path_max_usedcols = 0;
  for(i = 0; i < args->pfiles.len; i++) {
    // file_filter_update_intocol(&args->pfiles.data[i].fltr, 0);
    if(!correct_cmd && path_file_usedcols(&args->pfiles.data[i]) > 1) {
      die("Please specify a single colour e.g. %s:0",
          args->pfiles.data[i].fltr.file_path.buff);
    }
    path_max_usedcols = MAX2(path_max_usedcols,
                             path_file_usedcols(&args->pfiles.data[i]));
  }
  args->path_max_usedcols = path_max_usedcols;

  // Check for compatibility between graph files and path files
  graphs_paths_compatible(gfile, 1, args->pfiles.data, args->pfiles.len);

  // Check ins_gap_min <= ins_gap_max for every input, print each input and
  // track the largest ins_gap_max seen
  for(i = 0; i < inputs->len; i++)
  {
    CorrectAlnInput *t = &inputs->data[i];
    // back-pointer from the file task to its owning input; set here rather
    // than when the input was added
    // NOTE(review): presumably because the buffer may be reallocated while
    // inputs are appended -- confirm
    t->files.ptr = t;
    if(t->crt_params.ins_gap_min > t->crt_params.ins_gap_max) {
      die("--min-ins %u is greater than --max-ins %u",
          t->crt_params.ins_gap_min, t->crt_params.ins_gap_max);
    }
    correct_aln_input_print(&inputs->data[i]);
    args->max_gap_limit = MAX2(args->max_gap_limit, t->crt_params.ins_gap_max);
  }
}