/*****************************************************************************
 * MEME > training_set > alphabet > letter
 * Read in an identifier and symbol pair for a letter.
 *
 * If an alphabet is already set on the context the symbol is only checked
 * against it. Otherwise the letter is handed to the alphabet reader, as a
 * core symbol when "equals" is NULL and as an ambiguous symbol when it is
 * not. Finally the id is recorded against the symbol in the lookup tree.
 ****************************************************************************/
void mxml_alphabet_letter(void *ctx, char *id, char symbol, char* aliases, char complement, char *equals, char *name, int colour) {
  CTX_T *state = (CTX_T*)ctx;
  // one-character, NUL-terminated copy of the symbol to store in the tree
  char symbol_str[2] = {symbol, '\0'};

  if (state->alph == NULL) {
    // no alphabet yet: feed the definition to the alphabet reader
    if (equals != NULL) {
      alph_reader_ambig(state->alph_rdr, symbol, aliases, name, colour, equals);
    } else {
      alph_reader_core(state->alph_rdr, symbol, aliases, name, colour, complement);
    }
  } else if (!alph_is_concrete(state->alph, symbol)) {
    // the symbol must exist in the alphabet that is already in use
    // (the complement is not checked because it will not be set)
    local_error(state, "The symbol %c does not exist in the built-in alphabet %s.\n",
        symbol, alph_name(state->alph));
    return;
  }

  // create a mapping from the id to the symbol
  if (rbtree_make(state->letter_lookup, id, symbol_str)) return;
  local_error(state, "The letter identifier %s has been used before.\n", id);
}
/*****************************************************************************
 * MEME > training_set > ambigs > letter
 * Not used in modern MEME outputs.
 * Records the symbol, as a one-character string, under the letter's id.
 ****************************************************************************/
void mxml_ambigs_letter(void *ctx, char *id, char symbol) {
  CTX_T *state = (CTX_T*)ctx;
  // one-character, NUL-terminated copy of the symbol to store in the tree
  char symbol_str[2] = {symbol, '\0'};

  // create a mapping from the id to the symbol
  if (rbtree_make(state->letter_lookup, id, symbol_str)) return;
  local_error(state, "The letter identifier %s has been used before.\n", id);
}
/* Example no. 3 */
/* 0 */
/*****************************************************************************
 * Parse the AMA command line into options.
 *
 *  argc, argv      the program arguments as received by main
 *  options         filled in by this function; fields are first set to
 *                  their defaults and then overridden by the long options
 *
 * After the options, the positional arguments are read in order: the motif
 * file, the FASTA file and an optional background file. All problems are
 * reported through usage().
 * NOTE(review): the code assumes usage() does not return - confirm.
 * Also sets the global verbosity when the verbosity option is given.
 ****************************************************************************/
static void process_command_line(int argc, char **argv, AMA_OPTIONS_T *options) {
  struct option ama_options[] = {
    {"max-seq-length",  required_argument,  NULL, OPT_MAX_SEQ_LENGTH},
    {"motif",           required_argument,  NULL, OPT_MOTIF},
    {"motif-pseudo",    required_argument,  NULL, OPT_MOTIF_PSEUDO},
    {"rma",             no_argument,        NULL, OPT_RMA},
    {"pvalues",         no_argument,        NULL, OPT_PVALUES},
    {"sdbg",            required_argument,  NULL, OPT_SDBG},
    {"norc",            no_argument,        NULL, OPT_NORC},
    {"cs",              no_argument,        NULL, OPT_CS},
    {"o-format",        required_argument,  NULL, OPT_O_FORMAT},
    {"o",               required_argument,  NULL, OPT_O},
    {"oc",              required_argument,  NULL, OPT_OC},
    {"scoring",         required_argument,  NULL, OPT_SCORING},
    {"verbosity",       required_argument,  NULL, OPT_VERBOSITY},
    {"gcbins",          required_argument,  NULL, OPT_GCBINS},
    {"last",            required_argument,  NULL, OPT_LAST},
    {"version",         no_argument,        NULL, OPT_VERSION},
    {NULL, 0, NULL, 0} //boundary indicator
  };
  // track which of the mutually exclusive output options has been seen
  bool out_set = false;
  bool format_set = false;
  // set option defaults
  options->max_seq_length = MAX_SEQ;
  options->scan_both_strands = true;
  options->combine_duplicates = false;
  options->selected_motifs = rbtree_create(rbtree_strcmp, NULL, NULL, NULL, NULL);
  options->pseudocount = DEFAULT_PSEUDOCOUNT;
  options->output_format = CISML_FORMAT;
  options->clobber = false;
  options->out_dir = NULL;
  options->scoring = AVG_ODDS;
  options->pvalues = false;
  options->normalize_scores = false;
  options->num_gc_bins = 1;
  options->sdbg_order = -1;
  options->last = 0;
  options->motif_filename = NULL;
  options->fasta_filename = NULL;
  options->bg_filename = NULL;

  // parse command line
  while (1) {
    int opt = getopt_long_only(argc, argv, "", ama_options, NULL);
    if (opt == -1) break;
    switch (opt) {
      case OPT_MAX_SEQ_LENGTH:
        options->max_seq_length = atoi(optarg);
        break;
      case OPT_MOTIF:
        rbtree_make(options->selected_motifs, optarg, NULL);
        break;
      case OPT_MOTIF_PSEUDO:
        options->pseudocount = atof(optarg);
        break;
      case OPT_RMA:
        options->normalize_scores = true;
        break;
      case OPT_PVALUES:
        options->pvalues = true;
        break;
      case OPT_SDBG:
        options->sdbg_order = atoi(optarg); // >=0 means use sequence bkg
        break;
      case OPT_NORC:
        options->scan_both_strands = false;
        break;
      case OPT_CS:
        options->combine_duplicates = true;
        break;
      case OPT_O_FORMAT:
        if (out_set) {
          usage("Option -o-format is incompatible with option -o/-oc");
        } else {
          format_set = true;
          if (strcmp(optarg, "gff") == 0) {
            options->output_format = GFF_FORMAT;
          } else if (strcmp(optarg, "cisml") == 0) {
            options->output_format = CISML_FORMAT;
          } else {
            usage("Output format \"%s\" is not recognised. "
                "Expected \"gff\" or \"cisml\".", optarg);
          }
        }
        break;
      case OPT_OC:
        // -oc is -o plus permission to overwrite an existing directory
        options->clobber = true;
        /* fallthrough */
      case OPT_O:
        if (format_set) {
          usage("Option -o/-oc is incompatible with option -o-format");
        } else {
          out_set = true;
          options->out_dir = optarg;
          options->output_format = DIRECTORY_FORMAT;
        }
        break;
      case OPT_SCORING:
        if (strcmp(optarg, "max-odds") == 0) {
          options->scoring = MAX_ODDS;
        } else if (strcmp(optarg, "avg-odds") == 0) {
          options->scoring = AVG_ODDS;
        } else if (strcmp(optarg, "sum-odds") == 0) {
          options->scoring = SUM_ODDS;
        } else {
          usage("Scoring method \"%s\" is not recognised. "
              "Expected \"max-odds\", \"avg-odds\" or \"sum-odds\".", optarg);
        }
        break;
      case OPT_VERBOSITY:
        verbosity = atoi(optarg);
        break;
      case OPT_GCBINS:
        // asking for GC bins also turns on p-values
        options->num_gc_bins = atoi(optarg);
        options->pvalues = true;
        if (options->num_gc_bins <= 1)
          usage("Number of bins in --gcbins must be greater than 1.");
        break;
      case OPT_LAST:
        options->last = atoi(optarg);
        if (options->last < 0) usage("Option --last must not be negative.");
        break;
      case OPT_VERSION:
        fprintf(stdout, VERSION "\n");
        exit(EXIT_SUCCESS);
        break;
      case '?':           //unrecognised or ambiguous argument
        usage("Unrecognized or ambiguous option.");
        break;
      default:            // any other value is ignored, as before
        break;
    }
  }

  // --sdbg overrides --pvalues and --gcbins and --rma
  if (options->sdbg_order >= 0) {
    options->pvalues = false;
    options->normalize_scores = false;
    options->num_gc_bins = 1;
  }

  // positional arguments: <motif file> <fasta file> [<background file>]
  if (argc <= optind) usage("Expected motif file.");
  options->motif_filename = argv[optind++];
  if (argc <= optind) usage("Expected fasta file.");
  options->fasta_filename = argv[optind++];
  if (argc > optind) options->bg_filename = argv[optind++];

  // exactly one background source is allowed: a file or --sdbg
  if (options->sdbg_order >= 0) {
    if (options->bg_filename) usage("A background file can not be used together with --sdbg.");
  } else {
    if (!options->bg_filename) usage("You must provide a background file unless you specify --sdbg.");
  }
  if (argc > optind) usage("Too many parameters");
  // for now, use uniform to mimic old implementation. I will probably remove this later
  if (!options->bg_filename) options->bg_filename = "--uniform--";

  // Record the command line.
  options->command_line = get_command_line(argc, argv);
}
/* Example no. 4 */
/* 0 */
/*****************************************************************************
 * dreme > model > background
 *
 *  type            is the alphabet DNA or RNA (optional when custom alphabet specified)
 *  <symbol>        frequency of <symbol> from core alphabet
 *  from            from the negative dataset or a background file
 *  file            the background file (optional)
 *  last_mod_date   the last modified date of the background file (optional)
 *
 * When no alphabet element has been seen, the core symbols A, C, G and
 * T (DNA) or U (RNA) are registered here from the type attribute before the
 * per-symbol frequencies are read.
 ****************************************************************************/
static void start_ele_background(PS_T *ps, const xmlChar **attrs) {
  int type, from;
  char *file, *lastmod;

  // set reasonable defaults; each variable gets a constant of its own kind
  // (type holds a DREME_ALPH_* value, from holds a DREME_BG_FROM_* value)
  type = DREME_ALPH_DNA;
  from = DREME_BG_FROM_DATASET;
  file = NULL;
  lastmod = NULL;

  // mapping of the "type" attribute text onto DREME_ALPH_* values
  char* type_options[2] = {"dna", "rna"};
  int type_values[2] = {DREME_ALPH_DNA, DREME_ALPH_RNA};
  MULTI_T type_multi = {.count = 2, .options = type_options,
    .outputs = type_values, .target = &(type)};

  // mapping of the "from" attribute text onto DREME_BG_FROM_* values
  char* from_options[2] = {"dataset", "file"};
  int from_values[2] = {DREME_BG_FROM_DATASET, DREME_BG_FROM_FILE};
  MULTI_T from_multi = {.count = 2, .options = from_options,
    .outputs = from_values, .target = &(from)};

  // parallel tables, one entry per attribute, kept in alphabetical order
  // NOTE(review): the ordering is presumably required by parse_attributes - confirm
  char* names[4] = {"file", "from", "last_mod_date", "type"};
  int (*parsers[4])(char*, void*) = {ld_str, ld_multi, ld_str, ld_multi};
  void *data[4] = {&file, &from_multi, &lastmod, &type_multi};
  BOOLEAN_T required[4] = {FALSE, TRUE, FALSE, FALSE};
  BOOLEAN_T done[4];

  // the type is only needed when it has to define the alphabet
  required[3] = !ps->seen_alphabet;
  parse_attributes(dreme_attr_parse_error, ps, "background", attrs, 4, names, parsers, data, required, done);

  // a background read from a file must also say which file and when it was modified
  if (from == DREME_BG_FROM_FILE) {
    if (!done[0]) dreme_attr_parse_error(ps, PARSE_ATTR_MISSING, "background", "file", NULL);
    if (!done[2]) dreme_attr_parse_error(ps, PARSE_ATTR_MISSING, "background", "last_mod_date", NULL);
  }

  // if we haven't seen the alphabet then we must define it from the type
  if (!ps->seen_alphabet) {
    int idx = 0;
    rbtree_make(ps->alph_ids, "A", &idx); idx++;
    rbtree_make(ps->alph_ids, "C", &idx); idx++;
    rbtree_make(ps->alph_ids, "G", &idx); idx++;
    rbtree_make(ps->alph_ids, (type == DREME_ALPH_DNA ? "T" : "U"), &idx);
  }
  parse_freq_attrs(ps, "background", attrs);

  if (ps->callbacks->handle_background && ps->state != PS_ERROR) {
    ps->callbacks->handle_background(ps->user_data, rbtree_size(ps->alph_ids), ps->freqs, from, file, lastmod);
  }
}

/*****************************************************************************
 * dreme > model > stop
 *
 *  evalue          the stopping evalue (returned as log10).
 *  count           the stopping count.
 *  time            the stopping time.
 *
 * All three attributes are optional. The callback always receives pointers
 * to the three values, so each one is given a defined value (zero) up front
 * in case its attribute is absent.
 * NOTE(review): the callback cannot tell an absent attribute from a zero;
 * done[] could be used to pass NULL instead if the callback contract allows
 * it - confirm before changing.
 ****************************************************************************/
static void start_ele_stop(PS_T *ps, const xmlChar **attrs) {
  // never hand indeterminate values to the callback
  int count = 0, time = 0;
  double log10evalue = 0.0;

  // parallel tables, one entry per attribute, kept in alphabetical order
  char* names[3] = {"count", "evalue", "time"};
  int (*parsers[3])(char*, void*) = {ld_int, ld_log10_ev, ld_int};
  void *data[3] = {&count, &log10evalue, &time};
  BOOLEAN_T required[3] = {FALSE, FALSE, FALSE};
  BOOLEAN_T done[3];

  parse_attributes(dreme_attr_parse_error, ps, "stop", attrs, 3, names, parsers, data, required, done);

  if (ps->callbacks->handle_stop && ps->state != PS_ERROR) {
    ps->callbacks->handle_stop(ps->user_data, &log10evalue, &count, &time);
  }
}

/*****************************************************************************
 * dreme > model > /ngen
 * Parses the element's character data as an integer (the number of
 * generations to check, or something like that) and passes it to the ngen
 * callback unless the parser is in the error state.
 ****************************************************************************/
static void end_ele_ngen(PS_T *ps) {
  int generations;
  // ld_int returns non-zero when the text is not a usable integer
  const int bad_value = ld_int(ps->characters.buffer, &generations);

  if (bad_value) {
    error(ps, "Bad value \"%s\" for ngen.\n", ps->characters.buffer);
  }

  // nothing to report without a callback or once an error has been recorded
  if (!ps->callbacks->handle_ngen || ps->state == PS_ERROR) return;
  ps->callbacks->handle_ngen(ps->user_data, generations);
}
/* Example no. 5 */
/* 0 */
/*****************************************************************************
 * dreme > model > negatives
 *
 *  name            the name of the negative dataset
 *  count           the number of sequences in the negative dataset
 *  from            the source of the negative dataset (eg shuffled positives)
 *  file            the file containing the negative dataset (optional)
 *  last_mod_date   the last modified date of the file (optional)
 *
 * file and last_mod_date become mandatory when from is "file".
 ****************************************************************************/
static void start_ele_negatives(PS_T *ps, const xmlChar **attrs) {
  enum { ATTR_COUNT = 5 };
  // required attributes (set by the parsers)
  char *name;
  long count;
  int from;
  // optional attributes, NULL when absent
  char *file = NULL;
  char *lastmod = NULL;

  // mapping of the "from" attribute text onto DREME_NEG_* values
  char* from_options[2] = {"file", "shuffled"};
  int from_values[2] = {DREME_NEG_FILE, DREME_NEG_SHUFFLED};
  MULTI_T from_multi = {
    .count = 2,
    .options = from_options,
    .outputs = from_values,
    .target = &(from)
  };

  // parallel tables, one entry per attribute, kept in alphabetical order
  char* names[ATTR_COUNT] = {"count", "file", "from", "last_mod_date", "name"};
  int (*parsers[ATTR_COUNT])(char*, void*) = {ld_long, ld_str, ld_multi, ld_str, ld_str};
  void *data[ATTR_COUNT] = {&count, &file, &from_multi, &lastmod, &name};
  BOOLEAN_T required[ATTR_COUNT] = {TRUE, FALSE, TRUE, FALSE, TRUE};
  BOOLEAN_T done[ATTR_COUNT];

  parse_attributes(dreme_attr_parse_error, ps, "negatives", attrs, ATTR_COUNT,
      names, parsers, data, required, done);

  // a dataset that comes from a file must name the file and its date
  if (ps->state != PS_ERROR) {
    if (from == DREME_NEG_FILE) {
      if (!file) {
        dreme_attr_parse_error(ps, PARSE_ATTR_MISSING, "negatives", "file", NULL);
      }
      if (!lastmod) {
        dreme_attr_parse_error(ps, PARSE_ATTR_MISSING, "negatives", "last_mod_date", NULL);
      }
    }
  }

  // nothing to report without a callback or once an error has been recorded
  if (!ps->callbacks->handle_negatives || ps->state == PS_ERROR) return;
  ps->callbacks->handle_negatives(ps->user_data, name, count, (DREME_NEG_EN)from, file, lastmod);
}

/*****************************************************************************
 * DREME > model > alphabet
 *
 *  name            the name of the alphabet (optional)
 *  like            the standard alphabet it extends: dna, protein or rna
 *                  (optional)
 *
 * Marks the alphabet as seen, reports it to the start_alphabet callback and
 * then expects one or more letter elements.
 ****************************************************************************/
static void start_ele_alphabet(PS_T *ps, const xmlChar **attrs) {
  enum { ATTR_COUNT = 2, LIKE_COUNT = 3 };
  // defaults used when the attributes are absent
  char *name = NULL;
  int extends = 0;

  // mapping of the "like" attribute text onto ALPH_FLAG_EXTENDS_* values
  char* extends_options[LIKE_COUNT] = {"dna", "protein", "rna"};
  int extends_values[LIKE_COUNT] = {
    ALPH_FLAG_EXTENDS_DNA, ALPH_FLAG_EXTENDS_PROTEIN, ALPH_FLAG_EXTENDS_RNA
  };
  MULTI_T extends_multi = {
    .count = LIKE_COUNT,
    .options = extends_options,
    .outputs = extends_values,
    .target = &(extends)
  };

  // parallel tables, one entry per attribute, kept in alphabetical order
  char* names[ATTR_COUNT] = {"like", "name"};
  int (*parsers[ATTR_COUNT])(char*, void*) = {ld_multi, ld_str};
  void *data[ATTR_COUNT] = {&extends_multi, &name};
  BOOLEAN_T required[ATTR_COUNT] = {FALSE, FALSE};
  BOOLEAN_T done[ATTR_COUNT];

  // remembered so the background element knows the alphabet was given explicitly
  ps->seen_alphabet = true;

  parse_attributes(dreme_attr_parse_error, ps, "alphabet", attrs, ATTR_COUNT,
      names, parsers, data, required, done);

  if (ps->state != PS_ERROR) {
    if (ps->callbacks->start_alphabet) {
      ps->callbacks->start_alphabet(ps->user_data, name, extends);
    }
  }
  // the alphabet must contain at least one letter element
  dreme_push_es(ps, PS_IN_ALPHABET_LETTER, ES_ONE_OR_MORE);
}

/*****************************************************************************
 * DREME > model > /alphabet
 * Tells the end_alphabet callback, if there is one, that the alphabet is
 * complete. Nothing is reported once the parser is in the error state.
 ****************************************************************************/
static void end_ele_alphabet(PS_T *ps) {
  if (!ps->callbacks->end_alphabet) return;
  if (ps->state == PS_ERROR) return;
  ps->callbacks->end_alphabet(ps->user_data);
}

/*****************************************************************************
 * DREME > model > alphabet > letter
 *
 *  id              the identifier of the letter (required)
 *  symbol          the symbol of the letter (required)
 *  aliases         alternative symbols (optional)
 *  complement      the complementary symbol (optional)
 *  equals          the core symbols an ambiguous letter stands for (optional)
 *  name            the name of the letter (optional)
 *  colour          the colour of the letter as hex (optional)
 *
 * A letter without "equals" is a core symbol: its id is registered in
 * ps->alph_ids with the next free index. Core symbols must all come before
 * the first ambiguous symbol.
 ****************************************************************************/
static void start_ele_alphabet_letter(PS_T *ps, const xmlChar **attrs) {
  char *aliases, *id, *name, *equals, symbol, complement;
  int colour, idx;

  // parallel tables, one entry per attribute, kept in alphabetical order
  char* names[7] = {"aliases", "colour", "complement", "equals", "id", "name", "symbol"};
  int (*parsers[7])(char*, void*) = {ld_str, ld_hex, ld_char, ld_str, ld_str, ld_str, ld_char};
  void *data[7] = {&aliases, &colour, &complement, &equals, &id, &name, &symbol};
  BOOLEAN_T required[7] = {FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, TRUE};
  BOOLEAN_T done[7];

  // defaults; the required attributes get one too so that a malformed
  // element can never leave them indeterminate
  aliases = NULL;
  id = NULL;
  name = NULL;
  equals = NULL;
  symbol = '\0';
  complement = '\0';
  colour = -1;
  parse_attributes(dreme_attr_parse_error, ps, "letter", attrs, 7, names, parsers, data, required, done);

  if (ps->seen_ambig) {
    if (equals == NULL) {
      error(ps, "All core symbols must appear before any ambigous symbols.\n");
    }
  } else if (equals == NULL) {
    // core symbol: register its id, unless the id attribute was missing
    // (the missing attribute has already been reported by parse_attributes)
    if (id != NULL) {
      idx = rbtree_size(ps->alph_ids);
      rbtree_make(ps->alph_ids, id, &idx);
    }
  } else {
    // first ambiguous symbol: no more core symbols are allowed after this
    ps->seen_ambig = true;
  }

  if (ps->callbacks->handle_alphabet_letter && ps->state != PS_ERROR) {
    ps->callbacks->handle_alphabet_letter(ps->user_data, id, symbol, aliases, complement, equals, name, colour);
  }
}