Exemple #1
0
/* Entry point.  Reads one or more trees (plain tree files, or tree
   model files when --mod is given), computes the distance between
   every pair of leaves in each tree, and prints the result to stdout:
   raw distances when a single tree is given, otherwise summary
   statistics (mean, stdev, quantiles) across trees.  A final "(total)"
   row reports tr_total_len() for the tree(s).

   Options:
     --mod, -m        treat input files as tree models
     --tree, -t ARG   tree whose leaves supply the printed names; ARG is
                      a tree string if it starts with '(', else a file
     --help, -h       print usage

   Returns 0 on success; all error paths go through die(). */
int main(int argc, char *argv[]) {
  int c;   /* must be int: getopt_long() returns int, and a plain char
              cannot hold -1 on platforms where char is unsigned */
  int i, j, t, opt_idx, ntrees, nleaves = -1;
  TreeNode *n, *node_i, *node_j, *lca, *nametree = NULL;
  TreeNode **tree;
  List *leaves, ***distance, *tree_fnames, *tot_dist;
  int mod = FALSE;
  char **leaf_name;
  String *trees_arg;
  FILE *F;

  struct option long_opts[] = {
    {"mod", 0, 0, 'm'},
    {"tree", 1, 0, 't'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  while ((c = getopt_long(argc, argv, "mt:h", long_opts, &opt_idx)) != -1) {
    switch (c) {
    case 'm':
      mod = TRUE;
      break;
    case 't':
      if (optarg[0] == '(')
        nametree = tr_new_from_string(optarg);
      else {
        /* keep the handle so the file can be closed once parsed */
        F = phast_fopen(optarg, "r");
        nametree = tr_new_from_file(F);
        phast_fclose(F);
      }
      break;
    case 'h':
      usage(argv[0]);
      /* NOTE(review): usage() presumably does not return; if it does,
         control falls through to the error exit below, as before */
    case '?':
      die("Bad argument.  Try '%s -h'.\n", argv[0]);
    }
  }

  if (optind > argc - 1) 
    die("Input filename required.  Try '%s -h'.\n", argv[0]);

  set_seed(-1);

  /* build a comma-delimited list and pass to get_arg_list; allows
     possibility of reading from file via '*' operator */
  trees_arg = str_new(1000);
  for (i = optind; i < argc; i++) {
    str_append_charstr(trees_arg, argv[i]);
    if (i < argc - 1) str_append_char(trees_arg, ',');
  }
  tree_fnames = get_arg_list(trees_arg->chars);

  ntrees = lst_size(tree_fnames);
  /* tree[0] is dereferenced below, so an empty list must be rejected */
  if (ntrees < 1)
    die("Input filename required.  Try '%s -h'.\n", argv[0]);
  tree = smalloc(ntrees * sizeof(*tree));

  /* read trees; every file is closed as soon as it has been parsed so
     that long file lists do not exhaust file descriptors */
  for (t = 0; t < ntrees; t++) {
    String *fname = lst_get_ptr(tree_fnames, t);
    F = phast_fopen(fname->chars, "r");
    if (mod) {
      TreeModel *m = tm_new_from_file(F, 1);
      tree[t] = tr_create_copy(m->tree);
      tm_free(m);
    }
    else
      tree[t] = tr_new_from_file(F);
    phast_fclose(F);
  }

  /* initialization; the leaf count is derived from the node count of
     the first tree and every tree is checked against it below */
  nleaves = (tree[0]->nnodes + 1)/2;
  leaves = lst_new_ptr(nleaves);    
  distance = smalloc(nleaves * sizeof(*distance));
  leaf_name = smalloc(nleaves * sizeof(*leaf_name));
  for (i = 0; i < nleaves; i++) {
    /* only the upper triangle (j > i) is ever allocated or read */
    distance[i] = smalloc(nleaves * sizeof(*distance[i]));
    for (j = i+1; j < nleaves; j++) 
      distance[i][j] = lst_new_dbl(ntrees);
  }
  if (nametree == NULL) nametree = tree[0];
  for (i = 0, j = 0; i < lst_size(nametree->nodes); i++) {
    n = lst_get_ptr(nametree->nodes, i);
    if (n->lchild == NULL && n->rchild == NULL) {
      /* guard leaf_name[]: the name tree may have more leaves than
         the input trees */
      if (j >= nleaves)
        die("ERROR: tree used for leaf names has a different number of leaves than the input trees.\n");
      leaf_name[j++] = n->name;
    }
  }
  /* fewer names than leaves would leave leaf_name[] entries unset */
  if (j != nleaves)
    die("ERROR: tree used for leaf names has a different number of leaves than the input trees.\n");
  tot_dist = lst_new_dbl(ntrees);

  /* now compute distances */
  for (t = 0; t < ntrees; t++) {
    /* obtain list of leaves */
    lst_clear(leaves);
    for (i = 0; i < lst_size(tree[t]->nodes); i++) {
      n = lst_get_ptr(tree[t]->nodes, i);
      if (n->lchild == NULL && n->rchild == NULL)
        lst_push_ptr(leaves, n);
    }

    if (lst_size(leaves) != nleaves)
      die("ERROR: trees have different numbers of leaves.\n");

    /* look at all pairs */
    for (i = 0; i < nleaves; i++) {
      node_i = lst_get_ptr(leaves, i);
      for (j = i+1; j < nleaves; j++) {
        double dist = 0;
        node_j = lst_get_ptr(leaves, j);
        /* because ids are assigned in pre-order, the first ancestor of
           node j that has an id less than i is the LCA of i and j; we
           seek the sum of distances from both i and j to this node */
        for (n = node_j; n->id >= node_i->id; n = n->parent)
          dist += n->dparent;      
        lca = n;
        for (n = node_i; n != lca; n = n->parent)
          dist += n->dparent;            
        lst_push_dbl(distance[i][j], dist);
      }
    }
    lst_push_dbl(tot_dist, tr_total_len(tree[t]));
  }


  /* print distances and (optionally) stats */
  if (ntrees == 1) {
    for (i = 0; i < nleaves; i++) {
      for (j = i+1; j < nleaves; j++) {
        printf ("%s\t%s\t%f\n", leaf_name[i], leaf_name[j], 
                lst_get_dbl(distance[i][j], 0));
      }
    }
    printf ("%s\t%s\t%f\n", "(total)", "-", lst_get_dbl(tot_dist, 0));
  }
  else {
    double mean, stdev;
    /* indices: 0=min, 1=2.5%, 2=5%, 3=median, 4=95%, 5=97.5%, 6=max */
    double quantiles[] = {0, 0.025, 0.05, 0.5, 0.95, 0.975, 1};
    double quantile_vals[7]; 

    printf("%-15s %-15s %9s %9s %9s %9s %9s %9s %9s %9s %9s\n", "leaf1", 
           "leaf2", "mean", "stdev", "median", "min", "max", "95%_min", 
           "95%_max", "90%_min", "90%_max");

    for (i = 0; i < nleaves; i++) {
      for (j = i+1; j < nleaves; j++) {
        mean = lst_dbl_mean(distance[i][j]);
        stdev = lst_dbl_stdev(distance[i][j]);
        /* the list is sorted before the quantiles are taken */
        lst_qsort_dbl(distance[i][j], ASCENDING);
        lst_dbl_quantiles(distance[i][j], quantiles, 7, quantile_vals);

        printf("%-15s %-15s %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f\n", 
               leaf_name[i], leaf_name[j], mean, stdev, quantile_vals[3], quantile_vals[0], 
               quantile_vals[6], quantile_vals[1], quantile_vals[5], quantile_vals[2], 
               quantile_vals[4]);
      }
    }

    /* also do total branch len */
    mean = lst_dbl_mean(tot_dist);
    stdev = lst_dbl_stdev(tot_dist);
    lst_qsort_dbl(tot_dist, ASCENDING);
    lst_dbl_quantiles(tot_dist, quantiles, 7, quantile_vals);
    
    printf("%-15s %-15s %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f\n", 
           "(total)", "-", mean, stdev, quantile_vals[3], quantile_vals[0], 
           quantile_vals[6], quantile_vals[1], quantile_vals[5], quantile_vals[2], 
           quantile_vals[4]);
  }

  return 0;
}
Exemple #2
0
/* Entry point for dless.  Takes two positional arguments, an alignment
   file followed by a tree model file, builds a BDPhyloHmm from the
   model, predicts elements with the Viterbi algorithm, scores them and
   writes the resulting GFF to stdout.  Progress messages go to stderr.

   Returns 0 on success; all error paths go through die(). */
int main(int argc, char *argv[]) {
  int c;   /* must be int: getopt_long() returns int, and a plain char
              cannot hold -1 on platforms where char is unsigned */
  char *msa_fname = NULL, *mod_fname = NULL;
  int opt_idx, i, old_nnodes;
  MSA *msa;
  List *pruned_names = lst_new_ptr(5), *tmpl;
  BDPhyloHmm *bdphmm;
  GFF_Set *predictions;
  int found = FALSE;
  List *ignore_types = lst_new_ptr(1);

  struct option long_opts[] = {
    {"refseq", 1, 0, 'M'},
    {"msa-format", 1, 0, 'i'},
    {"refidx", 1, 0, 'r'},
    {"rho", 1, 0, 'R'},
    {"phi", 1, 0, 'p'},
    {"transitions", 1, 0, 't'},    
    {"expected-length", 1, 0, 'E'},
    {"target-coverage", 1, 0, 'C'},
    {"seqname", 1, 0, 'N'},
    {"idpref", 1, 0, 'P'},
    {"indel-model", 1, 0, 'I'},
    {"indel-history", 1, 0, 'H'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  /* arguments and defaults for options; -1 marks "not given" for
     gamma, omega and the indel parameters */
  FILE *refseq_f = NULL, *msa_f = NULL;
  msa_format_type msa_format = UNKNOWN_FORMAT;
  TreeModel *source_mod;
  double rho = DEFAULT_RHO, mu = DEFAULT_MU, nu = DEFAULT_NU, 
    phi = DEFAULT_PHI, gamma = -1, omega = -1, 
    alpha_c = -1, beta_c = -1, tau_c = -1,
    alpha_n = -1, beta_n = -1, tau_n = -1;
  int set_transitions = FALSE, refidx = 1, estim_phi = TRUE, 
    estim_gamma = TRUE, estim_omega = TRUE;
  char *seqname = NULL, *idpref = NULL;
  IndelHistory *ih = NULL;

  /* For -t, -p, -E and -C a leading '~' on the argument keeps the
     corresponding estim_* flag set (the value is a starting point);
     without it the flag is cleared and the value is held fixed. */
  while ((c = getopt_long(argc, argv, "R:t:p:E:C:r:M:i:N:P:I:H:h", long_opts, &opt_idx)) != -1) {
    switch (c) {
    case 'R':
      rho = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 't':
      if (optarg[0] != '~') estim_gamma = estim_omega = FALSE;
      else optarg = &optarg[1];
      set_transitions = TRUE;
      tmpl = get_arg_list_dbl(optarg);
      if (lst_size(tmpl) != 2) 
        die("ERROR: bad argument to --transitions.\n");
      mu = lst_get_dbl(tmpl, 0);
      nu = lst_get_dbl(tmpl, 1);
      if (mu <= 0 || mu >= 1 || nu <= 0 || nu >= 1)
        die("ERROR: bad argument to --transitions.\n");
      lst_free(tmpl);
      break;
    case 'p':
      if (optarg[0] != '~') estim_phi = FALSE;
      else optarg = &optarg[1];
      phi = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 'E':
      if (optarg[0] != '~') estim_omega = FALSE;
      else optarg = &optarg[1];
      omega = get_arg_dbl_bounds(optarg, 1, INFTY);
      mu = 1/omega;
      break;
    case 'C':
      if (optarg[0] != '~') estim_gamma = FALSE;
      else optarg = &optarg[1];
      gamma = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 'r':
      refidx = get_arg_int_bounds(optarg, 0, INFTY);
      break;
    case 'M':
      refseq_f = phast_fopen(optarg, "r");
      break;
    case 'i':
      msa_format = msa_str_to_format(optarg);
      if (msa_format == UNKNOWN_FORMAT)
        die("ERROR: unrecognized alignment format.\n");
      break;
    case 'N':
      seqname = optarg;
      break;
    case 'P':
      idpref = optarg;
      break;
    case 'I':
      /* three values set both parameter groups; six values give the
         *_n group first and the *_c group second */
      tmpl = get_arg_list_dbl(optarg);
      if (lst_size(tmpl) != 3 && lst_size(tmpl) != 6)
        die("ERROR: bad argument to --indel-model.\n");
      alpha_n = lst_get_dbl(tmpl, 0);
      beta_n = lst_get_dbl(tmpl, 1);
      tau_n = lst_get_dbl(tmpl, 2);
      if (lst_size(tmpl) == 6) {
        alpha_c = lst_get_dbl(tmpl, 3);
        beta_c = lst_get_dbl(tmpl, 4);
        tau_c = lst_get_dbl(tmpl, 5);
      }
      else {
        alpha_c = alpha_n; beta_c = beta_n; tau_c = tau_n;
      }
      if (alpha_c <= 0 || alpha_c >= 1 || beta_c <= 0 || beta_c >= 1 || 
          tau_c <= 0 || tau_c >= 1 || alpha_n <= 0 || alpha_n >= 1 || 
          beta_n <= 0 || beta_n >= 1 || tau_n <= 0 || tau_n >= 1)
        die("ERROR: bad argument to --indel-model.\n");
      lst_free(tmpl);   /* release the parsed list, as the 't' case does */
      break;
    case 'H':
      fprintf(stderr, "Reading indel history from %s...\n", optarg);
      ih = ih_new_from_file(phast_fopen(optarg, "r"));
      break;
    case 'h':
      printf("%s", HELP);
      exit(0);
    case '?':
      die("Bad argument.  Try 'dless -h'.\n");
    }
  }

  /* exactly two positional arguments: alignment, then tree model */
  if (optind != argc - 2)
    die("Missing alignment file or model file.  Try 'dless -h'.\n");
  msa_fname = argv[optind];
  mod_fname = argv[optind + 1];

  if (set_transitions && (gamma != -1 || omega != -1))
    die("ERROR: --transitions and --target-coverage/--expected-length cannot be used together.\n");

  if ((gamma != -1 && omega == -1) || (gamma == -1 && omega != -1))
    die("ERROR: --target-coverage and --expected-length must be used together.\n");

  set_seed(-1);

  /* mu was already set to 1/omega by --expected-length */
  if (gamma != -1)
    nu = gamma/(1-gamma) * mu;

  fprintf(stderr, "Reading tree model from %s...\n", mod_fname);
  source_mod = tm_new_from_file(phast_fopen(mod_fname, "r"), 1);

  if (source_mod->nratecats > 1) 
    die("ERROR: rate variation not currently supported.\n");

  if (source_mod->order > 0)
    die("ERROR: only single nucleotide models are currently supported.\n");

  if (!tm_is_reversible(source_mod))
    phast_warning("WARNING: p-value computation assumes reversibility and your model is non-reversible.\n");

  /* read alignment */
  msa_f = phast_fopen(msa_fname, "r");

  fprintf(stderr, "Reading alignment from %s...\n", msa_fname);
  if (msa_format == UNKNOWN_FORMAT) 
    msa_format = msa_format_for_content(msa_f, 1);

  if (msa_format == MAF) {
    msa = maf_read(msa_f, refseq_f, 1, NULL, NULL, NULL, -1, TRUE, NULL, 
                   NO_STRIP, FALSE); 
  }
  else 
    msa = msa_new_from_file_define_format(msa_f, msa_format, NULL);

  if (msa_alph_has_lowercase(msa)) msa_toupper(msa); 
  msa_remove_N_from_alph(msa);

  if (msa->ss == NULL) {
    fprintf(stderr, "Extracting sufficient statistics...\n");
    ss_from_msas(msa, 1, TRUE, NULL, NULL, NULL, -1, 0);
  }
  else if (msa->ss->tuple_idx == NULL)
    die("ERROR: ordered representation of alignment required unless --suff-stats.\n");

  /* prune tree, if necessary */
  old_nnodes = source_mod->tree->nnodes;
  tm_prune(source_mod, msa, pruned_names);

  /* (old_nnodes + 1) / 2 is the original leaf count, so equality
     means every leaf was pruned */
  if (lst_size(pruned_names) == (old_nnodes + 1) / 2)
    die("ERROR: no match for leaves of tree in alignment (leaf names must match alignment names).\n");
  if (lst_size(pruned_names) > 0) {
    fprintf(stderr, "WARNING: pruned away leaves of tree with no match in alignment (");
    for (i = 0; i < lst_size(pruned_names); i++)
      fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, i))->chars, 
              i < lst_size(pruned_names) - 1 ? ", " : ").\n");
  }

  /* this has to be done after pruning tree */
  tr_name_ancestors(source_mod->tree);

  /* also make sure match for reference sequence in tree */
  if (refidx > 0) {
    for (i = 0, found = FALSE; !found && i < source_mod->tree->nnodes; i++) {
      TreeNode *n = lst_get_ptr(source_mod->tree->nodes, i);
      if (!strcmp(n->name, msa->names[refidx-1]))
        found = TRUE;
    }
    if (!found) die("ERROR: no match for reference sequence in tree.\n");
  }

  /* checks for indel model (alpha_c > 0 only if --indel-model given) */
  if (alpha_c > 0) {
    if (ih == NULL) {
      fprintf(stderr, "Reconstructing indel history by parsimony...\n");
      ih = ih_reconstruct(msa, source_mod->tree);
    }
    else {
      if (ih->ncols != msa->length)
        die("ERROR: indel history doesn't seem to match alignment.\n");
      if (ih->tree->nnodes != source_mod->tree->nnodes)
        die("ERROR: indel history doesn't seem to match tree model.\n");
    }
  }

  bdphmm = bd_new(source_mod, rho, mu, nu, phi, alpha_c, beta_c, tau_c, 
                  alpha_n, beta_n, tau_n, estim_gamma, estim_omega, 
                  estim_phi);

  /* compute emissions */
  phmm_compute_emissions(bdphmm->phmm, msa, FALSE);

  /* add emissions for indel model, if necessary */
  if (alpha_c > 0) {
    fprintf(stderr, "Adjusting emissions for indels...\n");
    bd_add_indel_emissions(bdphmm, ih);
  }

  /* postprocess for missing data (requires special handling) */
  fprintf(stderr, "Adjusting emissions for missing data...\n");
  bd_handle_missing_data(bdphmm, msa);

  if (estim_gamma || estim_omega || estim_phi) {
    fprintf(stderr, "Estimating free parameters...\n");
    bd_estimate_transitions(bdphmm, msa);
  }

  /* set seqname and idpref, if necessary */
  if (seqname == NULL || idpref == NULL) {
    /* derive default from file name root */
    String *tmp = str_new_charstr(msa_fname);
    if (!str_equals_charstr(tmp, "-")) {
      str_remove_path(tmp);
      str_root(tmp, '.');
      if (idpref == NULL) idpref = copy_charstr(tmp->chars);
      str_root(tmp, '.');         /* apply one more time for double suffix */
      /* seqname aliases tmp's buffer, so tmp is deliberately not freed */
      if (seqname == NULL) seqname = tmp->chars;    
    }
    else if (seqname == NULL) seqname = "refseq";
  }

  /* obtain predictions */
  fprintf(stderr, "Running Viterbi algorithm...\n");
  predictions = phmm_predict_viterbi(bdphmm->phmm, seqname, NULL, idpref, NULL);
  lst_push_ptr(ignore_types, str_new_charstr("nonconserved"));
  gff_filter_by_type(predictions, ignore_types, TRUE, NULL);

  /* score predictions */
  fprintf(stderr, "Scoring predictions...\n");
  bd_score_predictions(bdphmm, predictions);
  
  /* can free emissions now */
  for (i = 0; i < bdphmm->phmm->hmm->nstates; i++)
    sfree(bdphmm->phmm->emissions[i]);
  sfree(bdphmm->phmm->emissions);
  bdphmm->phmm->emissions = NULL;

  /* convert GFF to coord frame of reference sequence and adjust
     coords by idx_offset, if necessary  */
  if (refidx != 0 || msa->idx_offset != 0)
    msa_map_gff_coords(msa, predictions, 0, refidx, msa->idx_offset);

  if (refidx != 0) 
    gff_flatten(predictions);	
  /* necessary because coord conversion might create overlapping
     features (can happen in deletions in reference sequence) */

  /* now output predictions */
  fprintf(stderr, "Writing GFF to stdout...\n");
  gff_print_set(stdout, predictions);

  fprintf(stderr, "Done.\n");
  
  return 0;
}
Exemple #3
0
/* Entry point for phastCons (this listing is truncated: the function
   body continues beyond the portion shown here).  The visible part
   parses command-line options into a phastCons_struct, validates the
   number of positional arguments, resolves the optional extrapolation
   tree, and fills in defaults for --coding-potential mode. */
int main(int argc, char *argv[]) {
    struct phastCons_struct *p = phastCons_struct_new(0);
    /* long option table; several long names map to the same short
       letter so that older spellings keep working */
    struct option long_opts[] = {
        {"states", 1, 0, 'S'},
        {"hmm", 1, 0, 'H'},
        {"viterbi", 1, 0, 'V'},
        {"most-conserved", 1, 0, 'V'}, /* same as --viterbi */
        {"no-post-probs", 0, 0, 'n'},
        {"msa-format", 1, 0, 'i'},
        {"FC", 0, 0, 'X'},
        {"lambda", 1, 0, 'l'},
        {"target-coverage", 1, 0, 'C'},
        {"transitions", 1, 0, 't'},
        {"expected-length", 1, 0, 'E'},
        {"expected-lengths", 1, 0, 'E'}, /* for backward compatibility */
        {"estimate-trees", 1, 0, 'T'},
        {"estimate-rho", 1, 0, 'O'},
        {"rho", 1, 0, 'R'},
        {"gc", 1, 0, 'G'},
        {"ignore-missing", 0, 0, 'z'},
        {"nrates", 1, 0, 'k'},
        {"log", 1, 0, 'g'},
        {"refidx", 1, 0, 'r'},
        {"suppress-missing", 0, 0, 'x'}, /* for backward compatibility */
        {"reflect-strand", 1, 0, 'U'},
        {"catmap", 1, 0, 'c'},
        {"extrapolate", 1, 0, 'e'},
        {"indels", 0, 0, 'I'},
        {"max-micro-indel", 1, 0, 'Y'},
        {"indel-params", 1, 0, 'D'},
        {"min-informative-types", 1, 0, 'M'}, /* for backward compatibility */
        {"require-informative", 1, 0, 'M'},
        {"not-informative", 1, 0, 'F'},
        {"lnl", 1, 0, 'L'},
        {"seqname", 1, 0, 'N'},
        {"idpref", 1, 0, 'P'},
        {"score", 0, 0, 's'},
        {"coding-potential", 0, 0, 'p'},
        {"indels-only", 0, 0, 'J'},
        {"alias", 1, 0, 'A'},
        {"quiet", 0, 0, 'q'},
        {"help", 0, 0, 'h'},
        {0, 0, 0, 0}
    };

    /* other vars */
    FILE *infile;
    char *msa_fname;
    /* NOTE(review): getopt_long() returns int; storing it in a char
       makes the "!= -1" test below never fail where char is unsigned.
       Declaring c as int would fix this. */
    char c;
    int opt_idx, i, coding_potential=FALSE;
    List *tmpl = NULL;
    String *tmpstr;
    char *mods_fname = NULL;
    List *mod_fname_list;
    msa_format_type msa_format = UNKNOWN_FORMAT;

    /* For -l, -t, -E and -D a leading '~' on the argument leaves the
       corresponding estim_* flag untouched; without it the flag is
       set to FALSE. */
    while ((c = getopt_long(argc, argv,
                            "S:H:V:ni:k:l:C:G:zt:E:R:T:O:r:xL:sN:P:g:U:c:e:IY:D:JM:F:pA:Xqh",
                            long_opts, &opt_idx)) != -1) {
        switch (c) {
        case 'S':
            p->states = get_arg_list(optarg);
            break;
        case 'H':
            p->hmm = hmm_new_from_file(phast_fopen(optarg, "r"));
            p->two_state = FALSE;
            break;
        case 'V':
            /* a ".gff" suffix on the output name sets p->gff */
            p->viterbi_f = phast_fopen(optarg, "w+");
            tmpstr = str_new_charstr(optarg);
            if (str_ends_with_charstr(tmpstr, ".gff"))
                p->gff = TRUE;
            str_free(tmpstr);
            break;
        case 'n':
            p->post_probs = FALSE;
            break;
        case 'i':
            msa_format = msa_str_to_format(optarg);
            if (msa_format == UNKNOWN_FORMAT)
                die("ERROR: bad argument to --msa-format\n");
            break;
        case 'X':
            p->FC = TRUE;
            p->two_state = FALSE;
            break;
        case 'l':
            if (optarg[0] != '~')
                p->estim_lambda = FALSE;
            else optarg = &optarg[1];
            p->lambda = get_arg_dbl_bounds(optarg, 0, 1);
            break;
        case 'C':
            p->gamma = get_arg_dbl_bounds(optarg, 0, 1);
            break;
        case 'G':
            p->gc = get_arg_dbl_bounds(optarg, 0, 1);
            break;
        case 't':
            /* expects exactly two values, each strictly inside (0, 1) */
            p->set_transitions = TRUE;
            if (optarg[0] != '~')
                p->estim_transitions = FALSE;
            else optarg = &optarg[1];
            tmpl = get_arg_list_dbl(optarg);
            if (lst_size(tmpl) != 2)
                die("ERROR: bad argument to --transitions.\n");
            p->mu = lst_get_dbl(tmpl, 0);
            p->nu = lst_get_dbl(tmpl, 1);
            if (p->mu <= 0 || p->mu >= 1 || p->nu <= 0 || p->nu >= 1)
                die("ERROR: bad argument to --transitions.\n");
            lst_free(tmpl);
            break;
        case 'E':
            /* mu is derived as the reciprocal of the given omega */
            if (optarg[0] != '~')
                p->estim_transitions = FALSE;
            else optarg = &optarg[1];
            p->omega = get_arg_dbl_bounds(optarg, 1, INFTY);
            p->mu = 1/p->omega;
            break;
        case 'T':
            p->estim_trees = TRUE;
            p->estim_trees_fname_root = optarg;
            break;
        case 'O':
            /* shares estim_trees_fname_root with --estimate-trees */
            p->estim_rho = TRUE;
            p->estim_trees_fname_root = optarg;
            break;
        case 'z':
            p->ignore_missing = TRUE;
            break;
        case 'k':
            /* one or two positive integers */
            tmpl = get_arg_list_int(optarg);
            if (lst_size(tmpl) > 2)
                die("ERROR: too many arguments with --nrates.\n");
            p->nrates = lst_get_int(tmpl, 0);
            if (p->nrates <= 0)
                die("ERROR: bad argument to --nrates (%d).\n", p->nrates);
            if (lst_size(tmpl) == 2) {
                p->nrates2 = lst_get_int(tmpl, 1);
                if (p->nrates2 <= 0)
                    die("ERROR: bad argument to --nrates (%d).\n", p->nrates2);
            }
            lst_free(tmpl);
            break;
        case 'R':
            p->rho = get_arg_dbl_bounds(optarg, 0, 1);
            break;
        case 'g':
            /* "-" sends the log to stderr instead of a file */
            if (!strcmp(optarg, "-"))
                p->log_f = stderr;
            else p->log_f = phast_fopen(optarg, "w+");
            break;
        case 'r':
            p->refidx = get_arg_int_bounds(optarg, 0, INFTY);
            break;
        case 'x':
            /* do nothing; left in for backward compatibility */
            break;
        case 'U':
            p->pivot_states = get_arg_list(optarg); /* we want strings not ints
						 for phmm_new */
            break;
        case 'e':
            p->extrapolate_tree_fname = optarg;
            break;
        case 'I':
            p->indels = TRUE;
            break;
        case 'Y':
            p->max_micro_indel = get_arg_int_bounds(optarg, 1, INFTY);
            break;
        case 'D':
            /* expects exactly six non-negative values */
            if (optarg[0] != '~')
                p->estim_indels = FALSE;
            else optarg = &optarg[1];
            tmpl = get_arg_list_dbl(optarg);
            if (lst_size(tmpl) != 6) die("ERROR: bad argument to --indel-params.\n");
            p->alpha_0 = lst_get_dbl(tmpl, 0);
            p->beta_0 = lst_get_dbl(tmpl, 1);
            p->tau_0 = lst_get_dbl(tmpl, 2);
            p->alpha_1 = lst_get_dbl(tmpl, 3);
            p->beta_1 = lst_get_dbl(tmpl, 4);
            p->tau_1 = lst_get_dbl(tmpl, 5);
            if (p->alpha_0 < 0 || p->beta_0 < 0 || p->tau_0 < 0 ||
                    p->alpha_1 < 0 || p->beta_1 < 0 || p->tau_1 < 0)
                die("ERROR: bad argument to --indel-params.\n");
            lst_free(tmpl);
            break;
        case 'J':
            /* --indels-only also turns on indels and turns off
               two_state and post_probs */
            p->indels_only = TRUE;
            p->two_state = FALSE;
            p->indels = TRUE;
            p->post_probs = FALSE;
            break;
        case 'M':
            p->inform_reqd = get_arg_list(optarg);
            break;
        case 'F':
            p->not_informative = get_arg_list(optarg);
            break;
        case 'c':
            p->cm = cm_new_string_or_file(optarg);
            break;
        case 'L':
            p->lnl_f = phast_fopen(optarg, "w+");
            break;
        case 'N':
            p->seqname = optarg;
            break;
        case 'P':
            p->idpref = optarg;
            break;
        case 's':
            p->score = TRUE;
            break;
        case 'p':
            coding_potential = TRUE;
            break;
        case 'A':
            p->alias_hash = make_name_hash(optarg);
            break;
        case 'q':
            /* --quiet: a NULL results_f suppresses the progress
               message guarded by "results_f != NULL" below */
            p->results_f = NULL;
            break;
        case 'h':
            printf("%s", HELP);
            exit(0);
        case '?':
            die("Bad argument.  Try '%s -h'.\n", argv[0]);
        }
    }

    /* two positional arguments are required, except that
       --coding-potential also accepts just one */
    if ((!coding_potential && optind != argc - 2) ||
            (coding_potential && optind != argc - 2 && optind != argc - 1))
        die("ERROR: extra or missing arguments.  Try '%s -h'.\n", argv[0]);

    set_seed(-1);

    /* "--extrapolate default" is replaced by a tree file path under
       PHAST_HOME; the buffer has 100 bytes of room beyond PHAST_HOME */
    if (p->extrapolate_tree_fname != NULL &&
            !strcmp(p->extrapolate_tree_fname, "default")) {
        p->extrapolate_tree_fname = smalloc((strlen(PHAST_HOME)+100)*sizeof(char));
#if defined(__MINGW32__)
        sprintf(p->extrapolate_tree_fname,
                "%s\\data\\exoniphy\\mammals\\cftr25_hybrid.nh", PHAST_HOME);
#else
        sprintf(p->extrapolate_tree_fname,
                "%s/data/exoniphy/mammals/cftr25_hybrid.nh", PHAST_HOME);
#endif
    }
    if (p->extrapolate_tree_fname != NULL)
        p->extrapolate_tree = tr_new_from_file(phast_fopen(p->extrapolate_tree_fname, "r"));

    mods_fname = (optind == argc - 2 ? argv[argc - 1] : NULL);
    /* if there are two args, mods are the second one; otherwise will
       use default mods for coding potential (see below) */

    /* set defaults for coding-potential mode */
    if (coding_potential) {
        /* NOTE(review): tmp is filled with unbounded sprintf calls, so
           a very long PHAST_HOME could overflow it.  mods_fname may
           end up pointing at tmp, which lives only for this block. */
        char tmp[5000];
        p->two_state = FALSE;
        if (p->cm == NULL)
            p->cm = cm_new_string_or_file("NCATS=4; CNS 1; CDS 2-4");
        if (p->hmm == NULL) {
#if defined(__MINGW32__)
            sprintf(tmp, "%s\\data\\phastCons\\%s", PHAST_HOME,
                    p->indels ? "simple-coding-indels.hmm" : "simple-coding.hmm");
#else
            sprintf(tmp, "%s/data/phastCons/%s", PHAST_HOME,
                    p->indels ? "simple-coding-indels.hmm" : "simple-coding.hmm");
#endif
            if (p->results_f!=NULL)
                fprintf(p->results_f, "Reading HMM from %s...\n", tmp);
            p->hmm = hmm_new_from_file(phast_fopen(tmp, "r"));
        }
        if (mods_fname == NULL) {
            /* default: comma-separated list of five model files */
#if defined(__MINGW32__)
            sprintf(tmp, "%s\\data\\exoniphy\\mammals\\r3.ncns.mod, %s\\data\\exoniphy\\mammals\\r3.cns.mod, %s\\data\\exoniphy\\mammals\\r3.cds-1.mod, %s\\data\\exoniphy\\mammals\\r3.cds-2.mod, %s\\data\\exoniphy\\mammals\\r3.cds-3.mod",  PHAST_HOME, PHAST_HOME, PHAST_HOME, PHAST_HOME, PHAST_HOME);
#else
            sprintf(tmp, "\
%s/data/exoniphy/mammals/r3.ncns.mod,\
%s/data/exoniphy/mammals/r3.cns.mod,\
%s/data/exoniphy/mammals/r3.cds-1.mod,\
%s/data/exoniphy/mammals/r3.cds-2.mod,\
%s/data/exoniphy/mammals/r3.cds-3.mod",
                    PHAST_HOME, PHAST_HOME, PHAST_HOME, PHAST_HOME, PHAST_HOME);
#endif
            mods_fname = tmp;
        }
        if (p->states == NULL)
            p->states = get_arg_list("CDS");
        if (p->pivot_states == NULL)
            p->pivot_states = get_arg_list("background,CNS");
    }
/* Free every non-NULL String held in a pointer list, then empty the
   list.  NOTE(review): assumes str_split() and str_re_match() hand
   ownership of the Strings they push to the list -- confirm. */
static void pwm_release_strings(List *strs) {
  int k;
  for (k = 0; k < lst_size(strs); k++) {
    String *s = lst_get_ptr(strs, k);
    if (s != NULL) str_free(s);
  }
  lst_clear(strs);
}

/* Read all position weight matrices from a MEME-format text file.

   Each "letter-probability matrix: alength= A w= W" header starts a
   new W x A matrix; the following W lines are parsed as rows of
   probabilities and stored as natural logs.  Cells are initialised to
   -1, so a row with fewer than A entries leaves its remaining cells
   at -1.

   filename: path of the file to read.
   Returns a newly allocated list of Matrix pointers in file order
   (empty if the file holds no matrices); the caller owns the list and
   the matrices.  Calls die() on malformed headers, on a row with more
   than A entries, and on a matrix that is cut short. */
List *pwm_read(const char *filename) {
  List *result;
  Matrix *pwm = NULL;     /* matrix being filled; NULL between matrices */
  int i, currBase, nBases;
  FILE * F;
  String *line = str_new(STR_MED_LEN);
  List *l = lst_new_ptr(3);
  List *probabilitiesStr = lst_new_ptr(4);
  List *probabilitiesDbl;
  Regex *pssm_re = NULL;
  Regex *motif_name_re = NULL;
  int alphabetLength = 0;

  result = lst_new_ptr(1);
  /* example header:
     letter-probability matrix: alength= 4 w= 8 nsites= 2 E= 1.5e+004 */
  pssm_re = str_re_new("^letter-probability matrix: alength= ([0-9]+) w= ([0-9]+)");
  motif_name_re = str_re_new("^MOTIF[[:space:]]+(.+?)[[:space:]].*");

  F = phast_fopen(filename, "r");
  currBase = 0;
  nBases = -1;   /* no row line is accepted until a header is seen */

  while ((str_readline(line, F)) != EOF) {
    if (str_re_match(line, motif_name_re, l, 1) > 0) {
      /* motif name line: the name is currently not used */
      pwm_release_strings(l);
    }
    else if (str_re_match(line, pssm_re, l, 2) > 0) {
      /* a header while a matrix is still being filled means that
         matrix is missing rows; fail rather than drop it silently */
      if (pwm != NULL)
        die("Premature end of PWM file\n");

      /* extract the alphabet size and the number of rows */
      if (str_as_int((String*)lst_get_ptr(l, 1), &alphabetLength) != 0)
        die("ERROR: Unable to parse 'alength=' from MEME file, expected integer, read %s", ((String*)lst_get_ptr(l, 1))->chars);
      if (str_as_int((String*)lst_get_ptr(l, 2), &nBases) != 0)
        die("ERROR: Unable to parse 'w=' from MEME file, expected integer, read %s ", ((String*)lst_get_ptr(l, 2))->chars);
      pwm_release_strings(l);
      currBase = 0;
      if (nBases <= 0) /* we must have at least one base in the PWM */
        die("ERROR: No Position Weight Matrices were detected in the provided PWM file");
      if (alphabetLength <= 0) /* we must have a positive alphabet length */
        die("ERROR: Alphabet lengh specified in PWM file must be greater than zero");
      pwm = mat_new(nBases, alphabetLength);
      mat_set_all(pwm, -1);
    }
    else if (currBase < nBases) {
      /* row of probabilities for position currBase */
      str_double_trim(line);
      str_split(line, NULL, probabilitiesStr);
      probabilitiesDbl = str_list_as_dbl(probabilitiesStr);
      /* never write past the columns allocated for this matrix */
      if (lst_size(probabilitiesDbl) > alphabetLength)
        die("ERROR: PWM row has %d entries but alength= %d\n",
            lst_size(probabilitiesDbl), alphabetLength);
      for (i = 0; i < lst_size(probabilitiesDbl); i++)
        mat_set(pwm, currBase, i, log(lst_get_dbl(probabilitiesDbl, i)));
      lst_free(probabilitiesDbl);
      pwm_release_strings(probabilitiesStr);
      currBase++;

      /* push as soon as the last row is in, so a matrix followed
         directly by the next MOTIF/header line is not lost */
      if (currBase == nBases) {
        lst_push_ptr(result, pwm);
        pwm = NULL;
      }
    }
  }

  /* completed matrices were pushed above; anything left is truncated */
  if (pwm != NULL) 
    die("Premature end of PWM file\n");

  str_re_free(motif_name_re);
  str_re_free(pssm_re);
  phast_fclose(F);
  lst_free(probabilitiesStr);
  lst_free(l);
  str_free(line);
  return result;
}