Exemple #1
0
/* Return a newly allocated deep copy of src.  String members (seq,
   src, specName, quality) are duplicated rather than shared; a NULL
   member in the source stays NULL in the copy.  Of the per-line
   annotation fields, only those whose line type ('i' or 'e') is
   present in src->lineType are copied. */
MafSubBlock *mafSubBlock_copy(MafSubBlock *src) {
  MafSubBlock *copy = smalloc(sizeof(MafSubBlock));
  int line, k;

  /* owned strings */
  copy->seq = (src->seq != NULL) ? str_new_charstr(src->seq->chars) : NULL;
  copy->src = (src->src != NULL) ? str_new_charstr(src->src->chars) : NULL;
  copy->specName = (src->specName != NULL) ?
    str_new_charstr(src->specName->chars) : NULL;

  /* plain scalar members */
  copy->start = src->start;
  copy->size = src->size;
  copy->strand = src->strand;
  copy->srcSize = src->srcSize;
  copy->numLine = src->numLine;

  /* line types, plus the data that belongs to 'i' and 'e' lines */
  for (line = 0; line < src->numLine; line++) {
    copy->lineType[line] = src->lineType[line];
    switch (src->lineType[line]) {
    case 'i':
      for (k = 0; k < 2; k++) {
        copy->iStatus[k] = src->iStatus[k];
        copy->iCount[k] = src->iCount[k];
      }
      break;
    case 'e':
      copy->eStatus = src->eStatus;
      break;
    default:
      break;
    }
  }

  copy->quality = (src->quality != NULL) ?
    str_new_charstr(src->quality->chars) : NULL;
  return copy;
}
Exemple #2
0
/* Create a new category map from a string that can
    either be a filename or a brief "inlined" category map, e.g.,
    "NCATS = 3 ; CDS 1-3".  Useful for command-line arguments.

    The argument is treated as an inlined map when, after trimming, it
    starts with "NCATS"; semicolons are then turned into newlines and
    the text is round-tripped through a temporary file named
    "cm.tmp.<pid>" in the current directory so that cm_read can parse
    it.  Otherwise the argument is opened as a file name.  Returns
    whatever cm_read returns. */
CategoryMap* cm_new_string_or_file(const char *optarg) {
  int i;
  char fname[STR_SHORT_LEN];
  String *str = str_new_charstr(optarg);
  FILE *F;
  CategoryMap *retval = NULL;

  str_double_trim(str);
  if (str_starts_with_charstr(str, "NCATS")) {
    /* replace semicolons with carriage returns */
    for (i = 0; i < str->length; i++)
      if (str->chars[i] == ';') str->chars[i] = '\n';

    /* we'll just dump a little tmp file and read it with cm_read.
       The name is formatted with a bounded write, and the pid is cast
       because pid_t is not guaranteed to be an int. */
    snprintf(fname, sizeof(fname), "cm.tmp.%d", (int)getpid());
    F = phast_fopen(fname, "w+");
    fprintf(F, "%s", str->chars);
    phast_fclose(F);
    F = phast_fopen(fname, "r");
    retval = cm_read(F);
    phast_fclose(F);
    unlink(fname);
  }
  else {
    F = phast_fopen(str->chars, "r");
    retval = cm_read(F);
    phast_fclose(F);
  }

  str_free(str);
  return retval;
}
Exemple #3
0
/* Build a category map that has one category per distinct feature
    type found in a GFF_Set.  Category numbers follow the order in
    which the types first appear; category 0 is the background
    category. */
CategoryMap* cm_new_from_features(GFF_Set *feats) {
  int idx, cat;
  CategoryMap *map;
  Hashtable *seen = hsh_new(10);
  List *distinct_types = lst_new_ptr(10);

  /* pass 1: collect each type once, in order of first appearance.
     The list borrows the String pointers owned by the features. */
  for (idx = 0; idx < lst_size(feats->features); idx++) {
    GFF_Feature *feat = lst_get_ptr(feats->features, idx);
    checkInterruptN(idx, 10000);
    if (hsh_get(seen, feat->feature->chars) != (void*)-1)
      continue;                 /* type already recorded */
    lst_push_ptr(distinct_types, feat->feature);
    hsh_put_int(seen, feat->feature->chars, 1);
  }
  hsh_free(seen);

  /* pass 2: one single-category range per type, after the background */
  map = cm_new(lst_size(distinct_types));
  map->ranges[0] =
    cm_new_category_range(str_new_charstr(BACKGD_CAT_NAME), 0, 0);
  for (cat = 1; cat <= map->ncats; cat++) {
    String *type_name = str_dup(lst_get_ptr(distinct_types, cat - 1));
    map->ranges[cat] = cm_new_category_range(type_name, cat, cat);
  }

  lst_free(distinct_types);
  return map;
}
Exemple #4
0
/* R entry point: prune a tree given as a string.
   treeStr  - tree description, parsed by rph_tree_new
   seqsP    - character vector of sequence names handed to tr_prune
   allButP  - integer flag forwarded to tr_prune as its third argument
   Returns a length-1 character vector holding tr_to_string(tr, 1) of
   the pruned tree. */
SEXP rph_tree_prune(SEXP treeStr, SEXP seqsP, SEXP allButP) {
  TreeNode *tree = rph_tree_new(treeStr);
  int nnames = LENGTH(seqsP);
  List *name_list = lst_new_ptr(nnames);
  char *tree_text;
  SEXP rv;
  int k;

  /* copy the R strings into a list of String objects */
  for (k = 0; k < nnames; k++)
    lst_push_ptr(name_list, str_new_charstr(CHAR(STRING_ELT(seqsP, k))));

  tr_prune(&tree, name_list, INTEGER_VALUE(allButP), NULL);
  tree_text = tr_to_string(tree, 1);

  /* wrap the textual tree in an R character vector */
  PROTECT(rv = NEW_CHARACTER(1));
  SET_STRING_ELT(rv, 0, mkChar(tree_text));
  UNPROTECT(1);
  return rv;
}
Exemple #5
0
/* Return a newly allocated copy of a MafBlock.  The 'a' line is
   duplicated, the species map is copied with hsh_copy, and every
   sub-block in src->data is copied with mafSubBlock_copy.  Members
   that are NULL in src are NULL in the copy. */
MafBlock* mafBlock_copy(MafBlock *src) {
  MafBlock *copy = smalloc(sizeof(MafBlock));
  int idx, nsub;

  copy->aLine = (src->aLine != NULL) ?
    str_new_charstr(src->aLine->chars) : NULL;
  copy->specMap = (src->specMap != NULL) ? hsh_copy(src->specMap) : NULL;
  copy->seqlen = src->seqlen;

  /* no sub-block list to copy */
  if (src->data == NULL) {
    copy->data = NULL;
    return copy;
  }

  nsub = lst_size(src->data);
  copy->data = lst_new_ptr(nsub);
  for (idx = 0; idx < nsub; idx++) {
    MafSubBlock *orig_sub = (MafSubBlock*)lst_get_ptr(src->data, idx);
    lst_push_ptr(copy->data, (void*)mafSubBlock_copy(orig_sub));
  }
  return copy;
}
Exemple #6
0
/* Read a CategoryMap from a file.

   F is read line by line until EOF; lines that are empty after
   trimming are skipped.  Each remaining line is tried against the
   following forms, in this order:
     NCATS = <n>                        allocates the map; must come first
     LABELLING_PRECEDENCE = a,b,...     category a gets rank 0, b rank 1, ...
     FILL_PRECEDENCE = a,b,...          same, for fill precedence
     FEATURE_EXTEND : src,... (target)  only validated, nothing is stored
     <type> <cat>[-<cat2>] [<c>,...]    category range, with an optional
                                        "conditioned on" list

   Returns the new map.  Any syntax or consistency error, including
   input that contains no NCATS line at all, aborts through die().
   F is left open for the caller to close. */
CategoryMap *cm_read(FILE *F) {
  String *line, *name;
  List *l;
  int cat, cat2, lineno, i, cm_read_error;
  CategoryMap *cm = NULL;
  CategoryRange *existing_range;
  static Regex *cat_range_re = NULL;
  static Regex *ncats_re = NULL;
  static Regex *fill_re = NULL;
  static Regex *label_re = NULL;
  static Regex *extend_re = NULL;
  int has_dependencies = 0;

  line = str_new(STR_SHORT_LEN);
  l = lst_new_ptr(3);

  /* the regexes are compiled once and kept for later calls */
  if (cat_range_re == NULL) {
    cat_range_re = str_re_new("^[[:space:]]*([^[:space:]]+)[[:space:]]+([[:digit:]]+)(-([[:digit:]]+))?([[:space:]]+([[:digit:]].*))?"); 
    ncats_re = str_re_new("^[[:space:]]*NCATS[[:space:]]*=[[:space:]]*([[:digit:]]+)");
    fill_re = str_re_new("^[[:space:]]*FILL_PRECEDENCE[[:space:]]*=[[:space:]]*(.*)$");
    label_re = str_re_new("^[[:space:]]*LABELLING_PRECEDENCE[[:space:]]*=[[:space:]]*(.*)$");
    extend_re = str_re_new("^[[:space:]]*FEATURE_EXTEND[[:space:]]*:[[:space:]]*(.+)[[:space:]]*\\((.+)\\)$");
  }

  lineno = 0;
  while ((str_readline(line, F)) != EOF) {
    lineno++;
    str_trim(line);
    if (str_equals_charstr(line, ""))
      continue;

    if (str_re_match(line, ncats_re, l, 1) >= 0) { 
                                /* NCATS line */
      int ncats;
      str_as_int(lst_get_ptr(l, 1), &ncats);
      cm = cm_new(ncats);

      /* 0th category is "background" */
      cm->ranges[0] = 
        cm_new_category_range(str_new_charstr(BACKGD_CAT_NAME), 0, 0);
    }

    /* every other kind of line needs the map to exist already */
    else if (cm == NULL || cm->ncats == 0) 
      die("ERROR: NCATS line must appear first, and must specify a positive number of categories.\n");

    else if (str_re_match(line, label_re, l, 1) >= 0) {               
                                /* LABELLING_PRECEDENCE line */
      List *tmpl = lst_new_ptr(cm->ncats);
      int tmpi;
      str_split((String*)lst_get_ptr(l, 1), " ,", tmpl);
      for (i = 0; i < lst_size(tmpl); i++) {
        String *s = (String*)lst_get_ptr(tmpl, i);
        if (str_as_int(s, &tmpi) != 0 || tmpi < 0 || tmpi > cm->ncats) 
          die("ERROR: bad integer in LABELLING_PRECEDENCE.\n");
        cm->labelling_precedence[tmpi] = i;
        str_free(s);
      }
      lst_free(tmpl);
    }

    else if (str_re_match(line, fill_re, l, 1) >= 0) {               
                                /* FILL_PRECEDENCE line */
      List *tmpl = lst_new_ptr(cm->ncats);
      int tmpi;
      str_split(lst_get_ptr(l, 1), " ,", tmpl);
      for (i = 0; i < lst_size(tmpl); i++) {
        String *s = lst_get_ptr(tmpl, i);
        if (str_as_int(s, &tmpi) != 0 || tmpi < 0 || tmpi > cm->ncats) 
          die("ERROR: bad integer in FILL_PRECEDENCE.\n");
        cm->fill_precedence[tmpi] = i;
        str_free(s);
      }
      lst_free(tmpl);
    }

    else if (str_re_match(line, extend_re, l, 2) >= 0) {
                                /* FEATURE_EXTEND line */
      String *target = lst_get_ptr(l, 2);
      List *sources = lst_new_ptr(2);
      str_split(lst_get_ptr(l, 1), " ,", sources);

      /* cm is known to be non-NULL here (checked by the branch above) */
      cat = cm_get_category(cm, target);
      if (cat == 0)
        die("ERROR: FEATURE_EXTEND target must be a previously-defined non-background feature type.\n");

      for (i = 0; i < lst_size(sources); i++) {
        if (cm_get_category(cm, lst_get_ptr(sources, i)) == 0) 
          die("ERROR: FEATURE_EXTEND source list must consist of previously-defined non-background feature types.\n");
      }

      /* the line is only validated; release the split-out names and
         the list holding them so they are not leaked */
      for (i = 0; i < lst_size(sources); i++)
        str_free((String*)lst_get_ptr(sources, i));
      lst_free(sources);
    }

    else {                      /* 'range' line */
      if (str_re_match(line, cat_range_re, l, 6) < 0) 
        die("ERROR at line %d: '%s'\n", lineno, line->chars);

      name = str_dup((String*)lst_get_ptr(l, 1));
      str_as_int((String*)lst_get_ptr(l, 2), &cat);

      /* a missing "-<cat2>" part means a single-category range */
      cat2 = cat;
      if (lst_get_ptr(l, 4) != NULL) 
        str_as_int((String*)lst_get_ptr(l, 4), &cat2);

      if (cat < 0 || cat2 < cat || cat2 > cm->ncats)
        die("ERROR: Illegal category range.\n");

      /* check for existing definitions of the specified category
         range.  Either no such definition must exist, or one must
         exist that spans exactly the same category numbers */
      existing_range = NULL;
      cm_read_error = 0;
      for (i = cat; !cm_read_error && i <= cat2; i++) {
        if (cm->ranges[i] != NULL && existing_range == NULL) 
          existing_range = cm->ranges[i];
        else if (cm->ranges[i] != existing_range)
          cm_read_error = 1;
      }
      if (existing_range != NULL && (existing_range->start_cat_no != cat || 
                                     existing_range->end_cat_no != cat2)) 
        cm_read_error = 1;

      if (cm_read_error) 
        die("ERROR: Overlapping category ranges.\n");

      /* either add new category range, or add new type to existing one */
      if (existing_range != NULL) {
        lst_push_ptr(existing_range->feature_types, name);
      }
      else {
        CategoryRange *cr = cm_new_category_range(name, cat, cat2);
        for (i = cat; i <= cat2; i++) 
          cm->ranges[i] = cr;
      }

      /* now address "conditioned_on" dependencies, if they have been
         specified */
      if (lst_get_ptr(l, 6) != NULL) {
        if (existing_range != NULL) 
          fprintf(stderr, "WARNING: ignoring 'conditioned on' list for type '%s'\n", 
                  name->chars);
        else {
          List *tmpl = lst_new_ptr(cm->ncats);
          int tmpi;
          if (cm->conditioned_on[cat] != NULL)
            die("ERROR cm_read: cm->conditioned_on[%i] should be NULL\n",
                cat);

          str_split((String*)lst_get_ptr(l, 6), " ,", tmpl);
          cm->conditioned_on[cat] = lst_new_int(lst_size(tmpl));
          for (i = cat + 1; i <= cat2; i++)
            cm->conditioned_on[i] = cm->conditioned_on[cat];
          /* all categories in range point to
             same "conditioned on" list */

          for (i = 0; i < lst_size(tmpl); i++) {
            String *s = (String*)lst_get_ptr(tmpl, i);
            if (str_as_int(s, &tmpi) != 0 || tmpi < 0 || tmpi > cm->ncats) 
              die("ERROR: bad integer in 'conditioned on' list for type '%s'.\n", 
                      name->chars);
            lst_push_int(cm->conditioned_on[cat], tmpi);
            str_free(s);
          }
          lst_free(tmpl);
          
          has_dependencies = 1;
        }
      }
    }

    /* release the regex substrings captured for this line */
    for (i = 0; i < lst_size(l); i++)
      if (lst_get_ptr(l, i) != NULL)
        str_free((String*)lst_get_ptr(l, i));
  }

  /* input with no NCATS line (e.g. an empty file) never creates the
     map; report that instead of dereferencing a NULL pointer below */
  if (cm == NULL)
    die("ERROR: no NCATS line found in category map.\n");

  /* make sure every category has been specified */
  for (i = 0; i <= cm->ncats; i++) 
    if (cm->ranges[i] == 0) 
      die("ERROR: category %d has not been specified.\n", i);

  /* build unspooler, if necessary */
  if (has_dependencies)
    cm->unspooler = cm_create_unspooler(cm->ncats + 1, cm->conditioned_on);

  str_free(line);
  lst_free(l);
  return cm;
}
Exemple #7
0
/* phastCons entry point (only the first part of the function is
   visible here: option parsing, argument-count validation, loading of
   the optional extrapolation tree, and the defaults applied in
   coding-potential mode).  Options are stored in the phastCons_struct
   created by phastCons_struct_new(0). */
int main(int argc, char *argv[]) {
    struct phastCons_struct *p = phastCons_struct_new(0);
    /* long option table; each entry maps to the short option letter
       handled in the switch below */
    struct option long_opts[] = {
        {"states", 1, 0, 'S'},
        {"hmm", 1, 0, 'H'},
        {"viterbi", 1, 0, 'V'},
        {"most-conserved", 1, 0, 'V'}, /* same as --viterbi */
        {"no-post-probs", 0, 0, 'n'},
        {"msa-format", 1, 0, 'i'},
        {"FC", 0, 0, 'X'},
        {"lambda", 1, 0, 'l'},
        {"target-coverage", 1, 0, 'C'},
        {"transitions", 1, 0, 't'},
        {"expected-length", 1, 0, 'E'},
        {"expected-lengths", 1, 0, 'E'}, /* for backward compatibility */
        {"estimate-trees", 1, 0, 'T'},
        {"estimate-rho", 1, 0, 'O'},
        {"rho", 1, 0, 'R'},
        {"gc", 1, 0, 'G'},
        {"ignore-missing", 0, 0, 'z'},
        {"nrates", 1, 0, 'k'},
        {"log", 1, 0, 'g'},
        {"refidx", 1, 0, 'r'},
        {"suppress-missing", 0, 0, 'x'}, /* for backward compatibility */
        {"reflect-strand", 1, 0, 'U'},
        {"catmap", 1, 0, 'c'},
        {"extrapolate", 1, 0, 'e'},
        {"indels", 0, 0, 'I'},
        {"max-micro-indel", 1, 0, 'Y'},
        {"indel-params", 1, 0, 'D'},
        {"min-informative-types", 1, 0, 'M'}, /* for backward compatibility */
        {"require-informative", 1, 0, 'M'},
        {"not-informative", 1, 0, 'F'},
        {"lnl", 1, 0, 'L'},
        {"seqname", 1, 0, 'N'},
        {"idpref", 1, 0, 'P'},
        {"score", 0, 0, 's'},
        {"coding-potential", 0, 0, 'p'},
        {"indels-only", 0, 0, 'J'},
        {"alias", 1, 0, 'A'},
        {"quiet", 0, 0, 'q'},
        {"help", 0, 0, 'h'},
        {0, 0, 0, 0}
    };

    /* other vars */
    FILE *infile;
    char *msa_fname;
    /* NOTE(review): getopt_long returns int; holding it in a plain char
       means the "!= -1" test below can never match where char is
       unsigned -- consider declaring c as int */
    char c;
    int opt_idx, i, coding_potential=FALSE;
    List *tmpl = NULL;
    String *tmpstr;
    char *mods_fname = NULL;
    List *mod_fname_list;
    msa_format_type msa_format = UNKNOWN_FORMAT;

    /* ---- option parsing ---- */
    while ((c = getopt_long(argc, argv,
                            "S:H:V:ni:k:l:C:G:zt:E:R:T:O:r:xL:sN:P:g:U:c:e:IY:D:JM:F:pA:Xqh",
                            long_opts, &opt_idx)) != -1) {
        switch (c) {
        case 'S':
            p->states = get_arg_list(optarg);
            break;
        case 'H':
            /* a user-supplied HMM turns off two-state mode */
            p->hmm = hmm_new_from_file(phast_fopen(optarg, "r"));
            p->two_state = FALSE;
            break;
        case 'V':
            /* output file for the Viterbi path; a ".gff" suffix on the
               name sets the gff flag */
            p->viterbi_f = phast_fopen(optarg, "w+");
            tmpstr = str_new_charstr(optarg);
            if (str_ends_with_charstr(tmpstr, ".gff"))
                p->gff = TRUE;
            str_free(tmpstr);
            break;
        case 'n':
            p->post_probs = FALSE;
            break;
        case 'i':
            msa_format = msa_str_to_format(optarg);
            if (msa_format == UNKNOWN_FORMAT)
                die("ERROR: bad argument to --msa-format\n");
            break;
        case 'X':
            p->FC = TRUE;
            p->two_state = FALSE;
            break;
        case 'l':
            /* without a leading '~' the estimation flag is cleared; with
               one, the '~' is skipped and the flag is left as it was.
               The same convention is used for -t, -E and -D below. */
            if (optarg[0] != '~')
                p->estim_lambda = FALSE;
            else optarg = &optarg[1];
            p->lambda = get_arg_dbl_bounds(optarg, 0, 1);
            break;
        case 'C':
            p->gamma = get_arg_dbl_bounds(optarg, 0, 1);
            break;
        case 'G':
            p->gc = get_arg_dbl_bounds(optarg, 0, 1);
            break;
        case 't':
            /* exactly two values, mu and nu, each strictly inside (0,1) */
            p->set_transitions = TRUE;
            if (optarg[0] != '~')
                p->estim_transitions = FALSE;
            else optarg = &optarg[1];
            tmpl = get_arg_list_dbl(optarg);
            if (lst_size(tmpl) != 2)
                die("ERROR: bad argument to --transitions.\n");
            p->mu = lst_get_dbl(tmpl, 0);
            p->nu = lst_get_dbl(tmpl, 1);
            if (p->mu <= 0 || p->mu >= 1 || p->nu <= 0 || p->nu >= 1)
                die("ERROR: bad argument to --transitions.\n");
            lst_free(tmpl);
            break;
        case 'E':
            /* expected length omega (at least 1); mu is set to 1/omega */
            if (optarg[0] != '~')
                p->estim_transitions = FALSE;
            else optarg = &optarg[1];
            p->omega = get_arg_dbl_bounds(optarg, 1, INFTY);
            p->mu = 1/p->omega;
            break;
        case 'T':
            p->estim_trees = TRUE;
            p->estim_trees_fname_root = optarg;
            break;
        case 'O':
            /* stores its argument in the same field as -T */
            p->estim_rho = TRUE;
            p->estim_trees_fname_root = optarg;
            break;
        case 'z':
            p->ignore_missing = TRUE;
            break;
        case 'k':
            /* one or two positive integers: nrates and, optionally,
               nrates2 */
            tmpl = get_arg_list_int(optarg);
            if (lst_size(tmpl) > 2)
                die("ERROR: too many arguments with --nrates.\n");
            p->nrates = lst_get_int(tmpl, 0);
            if (p->nrates <= 0)
                die("ERROR: bad argument to --nrates (%d).\n", p->nrates);
            if (lst_size(tmpl) == 2) {
                p->nrates2 = lst_get_int(tmpl, 1);
                if (p->nrates2 <= 0)
                    die("ERROR: bad argument to --nrates (%d).\n", p->nrates2);
            }
            lst_free(tmpl);
            break;
        case 'R':
            p->rho = get_arg_dbl_bounds(optarg, 0, 1);
            break;
        case 'g':
            /* "-" sends the log to stderr instead of a file */
            if (!strcmp(optarg, "-"))
                p->log_f = stderr;
            else p->log_f = phast_fopen(optarg, "w+");
            break;
        case 'r':
            p->refidx = get_arg_int_bounds(optarg, 0, INFTY);
            break;
        case 'x':
            /* do nothing; left in for backward compatibility */
            break;
        case 'U':
            p->pivot_states = get_arg_list(optarg); /* we want strings not ints
                                                 for phmm_new */
            break;
        case 'e':
            p->extrapolate_tree_fname = optarg;
            break;
        case 'I':
            p->indels = TRUE;
            break;
        case 'Y':
            p->max_micro_indel = get_arg_int_bounds(optarg, 1, INFTY);
            break;
        case 'D':
            /* exactly six non-negative values, in the order alpha_0,
               beta_0, tau_0, alpha_1, beta_1, tau_1 */
            if (optarg[0] != '~')
                p->estim_indels = FALSE;
            else optarg = &optarg[1];
            tmpl = get_arg_list_dbl(optarg);
            if (lst_size(tmpl) != 6) die("ERROR: bad argument to --indel-params.\n");
            p->alpha_0 = lst_get_dbl(tmpl, 0);
            p->beta_0 = lst_get_dbl(tmpl, 1);
            p->tau_0 = lst_get_dbl(tmpl, 2);
            p->alpha_1 = lst_get_dbl(tmpl, 3);
            p->beta_1 = lst_get_dbl(tmpl, 4);
            p->tau_1 = lst_get_dbl(tmpl, 5);
            if (p->alpha_0 < 0 || p->beta_0 < 0 || p->tau_0 < 0 ||
                    p->alpha_1 < 0 || p->beta_1 < 0 || p->tau_1 < 0)
                die("ERROR: bad argument to --indel-params.\n");
            lst_free(tmpl);
            break;
        case 'J':
            /* indels-only mode also sets indels and clears two_state and
               post_probs */
            p->indels_only = TRUE;
            p->two_state = FALSE;
            p->indels = TRUE;
            p->post_probs = FALSE;
            break;
        case 'M':
            p->inform_reqd = get_arg_list(optarg);
            break;
        case 'F':
            p->not_informative = get_arg_list(optarg);
            break;
        case 'c':
            p->cm = cm_new_string_or_file(optarg);
            break;
        case 'L':
            p->lnl_f = phast_fopen(optarg, "w+");
            break;
        case 'N':
            p->seqname = optarg;
            break;
        case 'P':
            p->idpref = optarg;
            break;
        case 's':
            p->score = TRUE;
            break;
        case 'p':
            coding_potential = TRUE;
            break;
        case 'A':
            p->alias_hash = make_name_hash(optarg);
            break;
        case 'q':
            /* quiet: later progress messages are guarded by
               p->results_f != NULL */
            p->results_f = NULL;
            break;
        case 'h':
            printf("%s", HELP);
            exit(0);
        case '?':
            die("Bad argument.  Try '%s -h'.\n", argv[0]);
        }
    }

    /* exactly two positional arguments are required, except in
       coding-potential mode, where one or two are accepted */
    if ((!coding_potential && optind != argc - 2) ||
            (coding_potential && optind != argc - 2 && optind != argc - 1))
        die("ERROR: extra or missing arguments.  Try '%s -h'.\n", argv[0]);

    set_seed(-1);

    /* the special name "default" is replaced by a tree file located
       under PHAST_HOME */
    if (p->extrapolate_tree_fname != NULL &&
            !strcmp(p->extrapolate_tree_fname, "default")) {
        p->extrapolate_tree_fname = smalloc((strlen(PHAST_HOME)+100)*sizeof(char));
#if defined(__MINGW32__)
        sprintf(p->extrapolate_tree_fname,
                "%s\\data\\exoniphy\\mammals\\cftr25_hybrid.nh", PHAST_HOME);
#else
        sprintf(p->extrapolate_tree_fname,
                "%s/data/exoniphy/mammals/cftr25_hybrid.nh", PHAST_HOME);
#endif
    }
    if (p->extrapolate_tree_fname != NULL)
        p->extrapolate_tree = tr_new_from_file(phast_fopen(p->extrapolate_tree_fname, "r"));

    mods_fname = (optind == argc - 2 ? argv[argc - 1] : NULL);
    /* if there are two args, mods are the second one; otherwise will
       use default mods for coding potential (see below) */

    /* set defaults for coding-potential mode */
    if (coding_potential) {
        /* scratch buffer for the default file names built below;
           NOTE(review): the sprintf calls are unbounded, so this relies
           on PHAST_HOME being short enough for five copies to fit */
        char tmp[5000];
        p->two_state = FALSE;
        if (p->cm == NULL)
            p->cm = cm_new_string_or_file("NCATS=4; CNS 1; CDS 2-4");
        if (p->hmm == NULL) {
#if defined(__MINGW32__)
            sprintf(tmp, "%s\\data\\phastCons\\%s", PHAST_HOME,
                    p->indels ? "simple-coding-indels.hmm" : "simple-coding.hmm");
#else
            sprintf(tmp, "%s/data/phastCons/%s", PHAST_HOME,
                    p->indels ? "simple-coding-indels.hmm" : "simple-coding.hmm");
#endif
            if (p->results_f!=NULL)
                fprintf(p->results_f, "Reading HMM from %s...\n", tmp);
            p->hmm = hmm_new_from_file(phast_fopen(tmp, "r"));
        }
        /* no model files on the command line: fall back to a
           comma-separated list of five default .mod files; mods_fname
           then points into tmp */
        if (mods_fname == NULL) {
#if defined(__MINGW32__)
            sprintf(tmp, "%s\\data\\exoniphy\\mammals\\r3.ncns.mod, %s\\data\\exoniphy\\mammals\\r3.cns.mod, %s\\data\\exoniphy\\mammals\\r3.cds-1.mod, %s\\data\\exoniphy\\mammals\\r3.cds-2.mod, %s\\data\\exoniphy\\mammals\\r3.cds-3.mod",  PHAST_HOME, PHAST_HOME, PHAST_HOME, PHAST_HOME, PHAST_HOME);
#else
            sprintf(tmp, "\
%s/data/exoniphy/mammals/r3.ncns.mod,\
%s/data/exoniphy/mammals/r3.cns.mod,\
%s/data/exoniphy/mammals/r3.cds-1.mod,\
%s/data/exoniphy/mammals/r3.cds-2.mod,\
%s/data/exoniphy/mammals/r3.cds-3.mod",
                    PHAST_HOME, PHAST_HOME, PHAST_HOME, PHAST_HOME, PHAST_HOME);
#endif
            mods_fname = tmp;
        }
        if (p->states == NULL)
            p->states = get_arg_list("CDS");
        if (p->pivot_states == NULL)
            p->pivot_states = get_arg_list("background,CNS");
    }
Exemple #8
0
//Read the next block in mfile and return a MafBlock object, or NULL
//if EOF is reached before any block starts.
//
//specHash and numSpec are optional.  If specHash is not NULL it must
//already be initialized, and numSpec must not be NULL: any species
//name not yet in the hash is added with the current value of *numSpec,
//which is then incremented.  If specHash is NULL, numSpec is neither
//read nor modified.
//
//A block starts with an 'a' line and ends at the next blank line or at
//EOF; '#' lines are ignored.  Malformed input (block not starting with
//'a', duplicate source name, 'i'/'q' line with no preceding 's'/'e'
//line, unknown line type, disagreeing sequence lengths) aborts through
//die().
MafBlock *mafBlock_read_next(FILE *mfile, Hashtable *specHash, int *numSpec) {
  int i;
  char firstchar;
  String *currLine = str_new(1000);
  MafBlock *block=NULL;
  MafSubBlock *sub=NULL;   //most recent 's'/'e' entry of this block

  if (specHash != NULL && numSpec==NULL) 
    die("ERROR: mafBlock_read_next: numSpec cannot be NULL "
        "if specHash is not NULL\n");

  while (EOF != str_readline(currLine, mfile)) {
    str_trim(currLine);
    if (currLine->length==0) {  //if blank line, it is either first or last line
      if (block == NULL) continue;
      else break;
    }
    firstchar = currLine->chars[0];
    if (firstchar == '#') continue;  //ignore comments
    if (block == NULL) {
      if (firstchar != 'a') 
        die("ERROR: first line of MAF block should start with 'a'\n");
      block = mafBlock_new();
      block->aLine = str_new_charstr(currLine->chars);
    }
    //if 's' or 'e', then this is first line of data for this species
    else if (firstchar == 's' || firstchar == 'e') {
      sub = mafBlock_get_subBlock(currLine);
      if (hsh_get_int(block->specMap, sub->src->chars) != -1) 
        die("ERROR: mafBlock has two alignments with same srcName (%s)\n", 
            sub->src->chars);
      //both the full source name and the species name map to the index
      //this sub-block is about to get in block->data
      hsh_put_int(block->specMap, sub->src->chars, lst_size(block->data));
      hsh_put_int(block->specMap, sub->specName->chars, lst_size(block->data));
      lst_push_ptr(block->data, (void*)sub);
      if (specHash != NULL) {
        if (-1 == hsh_get_int(specHash, sub->specName->chars)) {
          hsh_put_int(specHash, sub->specName->chars, *numSpec);
          (*numSpec)++;
        }
      }
    }
    else {
      if (firstchar != 'i' && firstchar != 'q')
        die("ERROR: found line in MAF block starting with '%c'\n", firstchar);
      //'i' and 'q' lines are attached to the most recent 's'/'e' entry,
      //so one must already have been read in this block; otherwise sub
      //is still NULL and must not be handed to the helpers
      if (sub == NULL)
        die("ERROR: '%c' line in MAF block is not preceded by an 's' or 'e' line\n",
            firstchar);
      if (firstchar == 'i')
        mafBlock_add_iLine(currLine, sub);
      else
        mafBlock_add_qLine(currLine, sub);
    }
  }
  str_free(currLine);
  if (block == NULL) return NULL;

  //set seqlen and make sure all seq arrays agree; entries whose first
  //line is an 'e' line are skipped.  The first remaining entry sets
  //seqlen if it is still -1 (presumably the value mafBlock_new leaves
  //it at -- confirm)
  for (i=0; i<lst_size(block->data); i++) {
    sub = (MafSubBlock*)lst_get_ptr(block->data, i);
    if (sub->lineType[0]=='e') continue;
    if (block->seqlen == -1) block->seqlen = sub->seq->length;
    else if (sub->seq->length != block->seqlen) {
      die("ERROR: lengths of sequences in MAF block do not agree (%i, %i)\n",
          block->seqlen, sub->seq->length);
    }
  }
  return block;
}
Exemple #9
0
int main(int argc, char* argv[]) {
    FILE* F;
    MSA *msa;
    int *msa_gap_patterns = NULL;
    HMM *hmm = NULL;
    TreeNode *tree = NULL;
    int i, input_format = SS, msa_idx, quiet_mode = FALSE,
           ncats, nmsas, ncats_unspooled, indel_nseqs = -1;
    String *msa_fname, *gff_fname;
    List *gff_fname_list = NULL, *msa_fname_list = NULL,
          *msa_length_list = NULL, *model_indels_str = NULL;
    Matrix *traincounts = NULL;
    Vector *begcounts = NULL, *statecounts = NULL;
    CategoryMap *cm = NULL;
    char c;
    GapPatternMap *gpm = NULL;
    GFF_Set *gff;
    char *reverse_groups_tag = NULL;

    while ((c = getopt(argc, argv, "i:g:c:m:M:R:I:n:t:P:G:qh")) != -1) {
        switch(c) {
        case 'i':
            input_format = msa_str_to_format(optarg);
            if (input_format == -1)
                die("ERROR: bad alignment format.\n");
            break;
        case 'g':
            gff_fname_list = get_arg_list(optarg);
            break;
        case 'c':
            cm = cm_new_string_or_file(optarg);
            break;
        case 'm':
            msa_fname_list = get_arg_list(optarg);
            break;
        case 'M':
            msa_length_list = str_list_as_int(get_arg_list(optarg));
            break;
        case 'R':
            reverse_groups_tag = optarg;
            break;
        case 'I':
            model_indels_str = get_arg_list(optarg);
            break;
        case 'n':
            indel_nseqs = get_arg_int(optarg);
            break;
        case 't':
            if (optarg[0] == '(')     /* in this case, assume topology given
                                   at command line */
                tree = tr_new_from_string(optarg);
            else
                tree = tr_new_from_file(phast_fopen(optarg, "r"));
            break;
        case 'q':
            quiet_mode = TRUE;
            break;
        case 'h':
            print_usage();
            exit(0);
        case '?':
            die("ERROR: unrecognized option.\n\nType 'hmm_train -h' for usage.\n");
        }
    }

    if (msa_fname_list == NULL)
        die("ERROR: -m required.  Type 'hmm_train -h' for usage.\n");
    if (gff_fname_list == NULL)
        die("ERROR: -g required in training mode.  Type 'hmm_train -h' for usage.\n");
    if (msa_length_list != NULL && msa_fname_list != NULL)
        die("ERROR: -m and -M are mutually exclusive.  Type 'hmm_train -h' for usage.\n");
    if (model_indels_str != NULL && tree == NULL)
        die("ERROR: -I requires -t.  Type 'hmm_train -h' for usage.\n");
    if (cm == NULL)
        die("ERROR: category map required.\n");

    set_seed(-1);

    ncats = cm->ncats + 1;
    ncats_unspooled = cm->unspooler != NULL ? cm->unspooler->nstates_unspooled :
                      ncats;
    nmsas = (msa_length_list != NULL ? lst_size(msa_length_list) :
             lst_size(msa_fname_list));

    if (model_indels_str != NULL) {
        if (tree == NULL)
            die("ERROR: tree is NULL\n");  /*FIXME: indel_ncats broken */
        gpm = gp_create_gapcats(cm, model_indels_str, tree, FALSE);
        ncats = cm->ncats + 1;    /* numbers will change */
        ncats_unspooled = cm->unspooler == NULL ? ncats :
                          cm->unspooler->nstates_unspooled;
    }

    /* allocate memory for storage of "training paths" */
    traincounts = mat_new(ncats_unspooled, ncats_unspooled);
    statecounts = vec_new(ncats_unspooled);
    begcounts = vec_new(ncats_unspooled);
    mat_zero(traincounts);
    vec_zero(statecounts);
    vec_zero(begcounts);


    /* create skeleton of new HMM. */
    hmm = hmm_new_nstates(ncats_unspooled, 0, 0);

    /* Main loop: consider each MSA in turn */
    for (msa_idx = 0; msa_idx < nmsas; msa_idx++) {
        if (msa_fname_list != NULL) {
            msa_fname = (String*)lst_get_ptr(msa_fname_list, msa_idx);
            F = phast_fopen(msa_fname->chars, "r");
            if (!quiet_mode)
                fprintf(stderr, "Reading alignment from %s ...\n",
                        F == stdin ? "stdin" : msa_fname->chars);
            msa = msa_new_from_file(F, NULL);
            phast_fclose(F);

        }
        else {                      /* only lengths of alignments specified */
            msa = msa_new(NULL, NULL, 0, lst_get_int(msa_length_list, msa_idx), NULL);
            /* just a shell in this case */
        }

        gff_fname = (String*)lst_get_ptr(gff_fname_list, msa_idx);
        if (!quiet_mode)
            fprintf(stderr, "Reading annotations from %s ...\n", gff_fname->chars);
        gff = gff_read_set(phast_fopen(gff_fname->chars, "r"));

        /* convert GFF to coordinate frame of alignment */
        if (msa_length_list == NULL) {
            if (!quiet_mode)
                fprintf(stderr, "Mapping annotations to alignment ...\n");
            msa_map_gff_coords(msa, gff, 1, 0, 0); /* assume seq 1 is ref */
        }

        if (model_indels_str != NULL) {
            if (!quiet_mode)
                fprintf(stderr, "Obtaining gap patterns ...\n");
            msa_gap_patterns = smalloc(msa->length * sizeof(int));
            gp_set_phylo_patterns(gpm, msa_gap_patterns, msa);
        }

        /* at this point, we don't actually need the alignment anymore;
           if using ordered suff stats (likely with large data sets),
           can free them now, to avoid running out of memory */
        if (msa->ss != NULL) {
            ss_free(msa->ss);
            msa->ss = NULL;
        }

        if (reverse_groups_tag != NULL) {
            if (!quiet_mode)
                fprintf(stderr, "Reverse complementing features on negative strand (group by '%s') ...\n",
                        reverse_groups_tag);
            /* we don't need to reverse complement the whole alignment --
               just the gff and possibly the gap pattern array (pass a
               NULL msa) */
            gff_group(gff, reverse_groups_tag);
            msa_reverse_compl_feats(NULL, gff, msa_gap_patterns);
        }

        if (!quiet_mode)
            fprintf(stderr, "Labeling sites by category ...\n");
        msa_label_categories(msa, gff, cm);

        gff_free_set(gff);

        if (model_indels_str != NULL) {
            if (!quiet_mode)
                fprintf(stderr, "Remapping categories according to gap patterns ...\n");

            if (indel_nseqs > 0 && indel_nseqs != msa->nseqs) {
                /* in this case, we'll simply reassign non-trivial gap
                   patterns randomly.  This will achieve the desired
                   effect with minimal coding, as long as the number of
                   sites is not too small (the indel model is probably
                   useless anyway if the number is small) */
                int pat, newpat;
                int npatterns = 4 * indel_nseqs - 5;
                int complex_allowed[cm->ncats+1];
                List *no_complex_names, *no_complex_nums;

                if (!quiet_mode)
                    fprintf(stderr, "(target number of sequences: %d)\n", indel_nseqs);

                /* set up index indicating by cat no. whether complex gaps
                   are allowed */
                for (i = 0; i < ncats; i++) complex_allowed[i] = 1;
                no_complex_names = lst_new_ptr(10);
                str_split(str_new_charstr(NO_COMPLEX), ",", no_complex_names);
                no_complex_nums = cm_get_category_list(cm, no_complex_names, 1);
                for (i = 0; i < lst_size(no_complex_nums); i++)
                    complex_allowed[lst_get_int(no_complex_nums, i)] = 0;
                lst_free(no_complex_nums);
                lst_free_strings(no_complex_names);
                lst_free(no_complex_names);

                /* now reassign all non-null numbers */
                for (i = 0; i < msa->length; ) {
                    if ((pat = msa_gap_patterns[i]) != 0) {
                        if (complex_allowed[msa->categories[i]])
                            newpat = 1 + ((double)npatterns * unif_rand());
                        /* random number in interval [1, npatterns] */
                        else
                            newpat = 1 + ((double)(npatterns-1) * unif_rand());
                        /* random number in interval [1,npatterns-1]
                           (excludes complex gap pattern) */
                        for (; i < msa->length && msa_gap_patterns[i] == pat; i++)
                            msa_gap_patterns[i] = newpat; /* change for whole sequence */
                    }
                    else i++;
                }
            }

            /* obtain gapped category number for each site */
            for (i = 0; i < msa->length; i++)
                if (gpm->cat_x_pattern_to_gapcat[msa->categories[i]] != NULL)
                    msa->categories[i] = gpm->cat_x_pattern_to_gapcat[msa->categories[i]][msa_gap_patterns[i]];
        }

        if (!quiet_mode)
            fprintf(stderr, "Unspooling categories ...\n");
        cm_spooled_to_unspooled(cm, msa->categories, msa->length);

        if (!quiet_mode)
            fprintf(stderr, "Collecting training data ...\n");
        hmm_train_update_counts(traincounts, statecounts, begcounts,
                                msa->categories, msa->length,
                                ncats_unspooled);

        if (msa_gap_patterns != NULL) sfree(msa_gap_patterns);
        msa_free(msa);
    }

    /* now train HMM, using cumulative data */
    hmm_train_from_counts(hmm, traincounts, NULL, statecounts, NULL,
                          begcounts, NULL);

    /* if modeling indels, adjust begin transitions so probability is
       distributed among different "gap pattern" states that all
       correspond to the same ungapped state (category); this helps
       avoid problems that occur when training on a few large sequences
       (e.g., whole chromosomes) and then testing on many shorter ones */
    if (model_indels_str != NULL) {
        double tprob[gpm->ncats];
        int nst[gpm->ncats];  /* total prob and number of states per
                             spooled, ungapped category */
        for (i = 0; i < gpm->ncats; i++) tprob[i] = nst[i] = 0;
        for (i = 0; i < hmm->nstates; i++) {
            if (vec_get(hmm->begin_transitions, i) > 0)
                /* have to go from unspooled space to spooled space, then to
                   ungapped space (HMM states correspond to unspooled,
                   gapped categories).  Note that states with nonzero begin
                   probs shouldn't be conditioned on other states. */
                tprob[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]] +=
                    vec_get(hmm->begin_transitions, i);
            nst[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]]++;
        }
        for (i = 0; i < hmm->nstates; i++)
            if (tprob[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]] > 0)
                vec_set(hmm->begin_transitions, i,
                        tprob[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]] /
                        nst[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]]);
        /* (uniform prior) */
    }

    /* write trained HMM */
    hmm_print(stdout, hmm);

    if (!quiet_mode) fprintf(stderr, "Done.\n");

    return 0;
}
/* Checks to see if the reference sequence (as read by
   ss_get_char_pos(msa, pos, 0, 0)) looks okay wrt a given list of
   features.  For each feature the gap-free reference bases are
   collected (and reverse complemented for '-' strand features) and
   tested according to the feature type:
     - GFF_START_TYPE: bases must read "ATG";
     - GFF_STOP_TYPE: frame must be 0 and is_stop_codon must accept it;
     - SPLICE_5 / SPLICE_3 prefixed types: must pass is_valid_5splice /
       is_valid_3splice (the latter is applied starting at 'offset3');
     - GFF_CDS_TYPE: no in-frame stop codon.
   If 'indel_strict' is nonzero, features named in SIGNALS are also
   rejected when they contain gaps in critical positions or are
   separated from the previous signal only by gaps, and CDS features
   with 6 or fewer ungapped bases are rejected.
   Every failure is recorded in 'problems' with problem_add.  Returns
   TRUE if no problem was recorded, FALSE otherwise. */
int ref_seq_okay(List *features, MSA *msa, int offset3, 
                 int indel_strict, int splice_strict, List *problems) {
  List *signals = NULL;
  char *seq = NULL;
  int seqalloc = 0;
  int idx, retval = TRUE;
  GFF_Feature *feat, *lastfeat_helper = NULL;

  /* the list of signal names is the same for every feature, so build
     it once here and release it after the loop */
  if (indel_strict) {
    String *signal_names = str_new_charstr(SIGNALS);
    signals = lst_new_ptr(10);
    str_split(signal_names, ",", signals);
    str_free(signal_names);
  }

  for (idx = 0; idx < lst_size(features); idx++) {
    int i, j, len, has_gaps = 0; 
    int needed;

    feat = lst_get_ptr(features, idx);

    /* room for every base of the feature plus the terminator; never
       less than one byte so the terminator always fits */
    needed = feat->end - feat->start + 2;
    if (needed < 1) needed = 1;
    if (seqalloc <= needed) {
      seqalloc = needed * 2; 
      seq = srealloc(seq, seqalloc * sizeof(char));
    }

    /* copy the non-gap reference characters of the feature */
    for (i = feat->start - 1, len = 0; i < feat->end; i++) {
      if (ss_get_char_pos(msa, i, 0, 0) != GAP_CHAR)
        seq[len++] = ss_get_char_pos(msa, i, 0, 0);
      else if (!has_gaps) has_gaps = 1;
    }
    seq[len] = '\0';
    if (feat->strand == '-') msa_reverse_compl_seq(seq, len);

    if (str_equals_charstr(feat->feature, GFF_START_TYPE) && strcmp(seq, "ATG") != 0) {
      problem_add(problems, feat, BAD_REF_START, -1, -1);
      retval = FALSE;
    }
    else if (str_equals_charstr(feat->feature, GFF_STOP_TYPE) && 
             (feat->frame != 0 || !is_stop_codon(seq))) {
      problem_add(problems, feat, BAD_REF_STOP, -1, -1);
      retval = FALSE;
    }
    else if (str_starts_with_charstr(feat->feature, SPLICE_5) && 
             !is_valid_5splice(seq, splice_strict)) {
      problem_add(problems, feat, BAD_REF_5_SPLICE, -1, -1);
      retval = FALSE;
    }
    else if (str_starts_with_charstr(feat->feature, SPLICE_3) &&
             !is_valid_3splice(&seq[offset3], splice_strict)) {
      problem_add(problems, feat, BAD_REF_3_SPLICE, -1, -1);
      retval = FALSE;
    }
    else if (str_equals_charstr(feat->feature, GFF_CDS_TYPE)) {
      /* scan whole codons, starting at the first one implied by frame */
      for (i = (3 - feat->frame) % 3; i <= len - 3; i += 3) {
        if (is_stop_codon(&seq[i])) {
          problem_add(problems, feat, BAD_REF_ORF, -1, -1);
          retval = FALSE;
          break;
        }
      }
    }

    if (indel_strict) {
      int strict_okay = TRUE;

      if (str_in_list(feat->feature, signals)) {
        /* reject any signal feature with gaps in the ref seq, unless they
           appear in a non-critical part of a splice site or in a
           "prestart" feature  */
        if (has_gaps) {          
          if (str_starts_with_charstr(feat->feature, SPLICE_5)) {
            if (ss_get_char_pos(msa, feat->start-1, 0, 0) == GAP_CHAR ||
                ss_get_char_pos(msa, feat->start, 0, 0) == GAP_CHAR)
              strict_okay = FALSE;
          }
          else if (str_starts_with_charstr(feat->feature, SPLICE_3)) {
            if (ss_get_char_pos(msa, feat->end-1, 0, 0) == GAP_CHAR ||
                ss_get_char_pos(msa, feat->end-2, 0, 0) == GAP_CHAR)
              strict_okay = FALSE;
          }
          else if (!str_equals_charstr(feat->feature, "prestart"))
            strict_okay = FALSE;
        }
        /* in addition, if two signals occur consec. with gaps and
           only gaps between them, assume a violation of
           --indel-strict */
        if (lastfeat_helper != NULL && lastfeat_helper->end < feat->start-1) {
          int allgaps = 1;
          for (j = lastfeat_helper->end; allgaps && j < feat->start-1; j++) 
                                /* note indexing: -1+1 for end and -1
                                   for start  */
            if (ss_get_char_pos(msa, j, 0, 0) != GAP_CHAR) allgaps = 0;
          if (allgaps) 
            strict_okay = FALSE;
        }
        lastfeat_helper = feat;
      }
      else lastfeat_helper = NULL;
    
      /* also exclude CDS exons with 6 or fewer ungapped bases in the
         indel_strict case -- these cause problems in exoniphy training
         because start_codon is adjacent to cds5ss */
      if (str_equals_charstr(feat->feature, GFF_CDS_TYPE) && len <= 6)
        strict_okay = FALSE;

      if (!strict_okay) {
        problem_add(problems, feat, BAD_REF_INDEL_STRICT_FAIL, -1, -1);
        retval = FALSE;
      }
    }
  }

  if (signals != NULL) {
    lst_free_strings(signals);
    lst_free(signals);
  }
  if (seq != NULL) sfree(seq);
  return retval;
}
Exemple #11
0
//Take the elements of a GFF in R and make a GFF object in C; return pointer.
//Assume lengths of vectors are all equal (except optional elements can be NULL);
//the number of features is taken from the length of seqnameP.
//  seqnameP, srcP, featureP: coerced to character vectors
//  startP, endP:             coerced to integer vectors
//  scoreP:     optional numeric vector; if R_NilValue the score is 0 and the
//              last argument of gff_new_feature is set to 1
//  strandP:    optional character vector; if R_NilValue the strand is '.'
//  frameP:     optional integer vector; if R_NilValue the frame is
//              GFF_NULL_FRAME
//  attributeP: optional character vector; if R_NilValue the attribute is ""
//Dies if a strand is not one of '+', '-', '.' or a frame is out of range.
//Returns the value of rph_gff_new_extptr() applied to the new GFF_Set.
SEXP rph_gff_new(SEXP seqnameP, SEXP srcP, SEXP featureP, SEXP startP, SEXP endP,
		 SEXP scoreP, SEXP strandP, SEXP frameP, SEXP attributeP) {
  GFF_Set *gff;
  GFF_Feature *feat;
  int gfflen, i;
  int haveScore=0, haveStrand=0, haveFrame=0, haveAttribute=0, numProtect=5;
  String *seqname, *source, *feature, *attribute;
  int *start, *end, frame=GFF_NULL_FRAME, *frameVec=NULL;
  double *scoreVec=NULL, score;
  char strand;

  //the five mandatory columns are always protected (numProtect starts at 5)
  PROTECT(seqnameP = AS_CHARACTER(seqnameP));
  PROTECT(srcP = AS_CHARACTER(srcP));
  PROTECT(featureP = AS_CHARACTER(featureP));
  PROTECT(startP = AS_INTEGER(startP));
  start = INTEGER_POINTER(startP);
  PROTECT(endP = AS_INTEGER(endP));
  end = INTEGER_POINTER(endP);
  if (scoreP != R_NilValue) {
    PROTECT(scoreP = AS_NUMERIC(scoreP));
    haveScore = 1;
    scoreVec = NUMERIC_POINTER(scoreP);
  } else score=0;
  if (strandP != R_NilValue) {
    PROTECT(strandP = AS_CHARACTER(strandP));
    haveStrand=1;
  } else strand='.';
  if (frameP != R_NilValue) {
    PROTECT(frameP = AS_INTEGER(frameP));
    haveFrame=1;
    frameVec = INTEGER_POINTER(frameP);
  }
  if (attributeP != R_NilValue) {
    PROTECT(attributeP = AS_CHARACTER(attributeP));
    haveAttribute=1;
  }

  //one extra protection for each optional column that was supplied
  numProtect += (haveScore + haveStrand + haveFrame + haveAttribute);

  gfflen = LENGTH(seqnameP);
  gff = gff_new_set_len(gfflen);

  for (i=0; i<gfflen; i++) {
    checkInterruptN(i, 1000);
    seqname = str_new_charstr(CHAR(STRING_ELT(seqnameP, i)));
    source = str_new_charstr(CHAR(STRING_ELT(srcP, i)));
    feature = str_new_charstr(CHAR(STRING_ELT(featureP, i)));
    if (haveScore) score = scoreVec[i];
    if (haveStrand) strand = (CHAR(STRING_ELT(strandP, i)))[0];
    if (haveFrame) {
      //values 1 and 2 are swapped when stored (presumably converting
      //between the R-side and internal frame conventions -- TODO confirm)
      if (frameVec[i] == 0) frame = 0;
      else if (frameVec[i] == 1) frame = 2;
      else if (frameVec[i] == 2) frame = 1;
      //any other value (e.g. NA) means "no frame"; without this reset the
      //feature would silently inherit the frame of the previous feature
      else frame = GFF_NULL_FRAME;
    }
    if (haveAttribute) attribute = str_new_charstr(CHAR(STRING_ELT(attributeP, i)));
    else attribute = str_new_charstr("");

    if (seqname == NULL) die("seqname is NULL\n");
    if (source == NULL) die ("source is NULL\n");
    if (feature ==  NULL) die("feature is NULL\n");
    if (attribute == NULL) die("attribute is NULL\n");
    if (strand != '+' && strand != '-' && strand!='.') die("strand is %c\n", strand);
    if (frame != GFF_NULL_FRAME && (frame<0 || frame>2)) die("frame is %i\n", frame);

    feat = gff_new_feature(seqname, source, feature, start[i], end[i], score, strand,
			   frame, attribute, haveScore==0);
    lst_push_ptr(gff->features, feat);
  }

  UNPROTECT(numProtect);
  return rph_gff_new_extptr(gff);
}
Exemple #12
0
/* Entry point: parses the command-line options, reads a tree model and
   an alignment, builds a BDPhyloHmm with bd_new, computes and adjusts
   emissions, optionally estimates free parameters, runs Viterbi, scores
   the predictions and prints them as GFF on stdout.  Progress messages
   go to stderr.  Returns 0 on success; errors terminate through die(). */
int main(int argc, char *argv[]) {
  int c;                        /* int, not char: getopt_long returns int
                                   and -1 must compare equal even where
                                   plain char is unsigned */
  char *msa_fname = NULL;
  int opt_idx, i, old_nnodes;
  MSA *msa;
  List *pruned_names = lst_new_ptr(5), *tmpl;
  BDPhyloHmm *bdphmm;
  GFF_Set *predictions;
  int found = FALSE;
  List *ignore_types = lst_new_ptr(1);

  struct option long_opts[] = {
    {"refseq", 1, 0, 'M'},
    {"msa-format", 1, 0, 'i'},
    {"refidx", 1, 0, 'r'},
    {"rho", 1, 0, 'R'},
    {"phi", 1, 0, 'p'},
    {"transitions", 1, 0, 't'},    
    {"expected-length", 1, 0, 'E'},
    {"target-coverage", 1, 0, 'C'},
    {"seqname", 1, 0, 'N'},
    {"idpref", 1, 0, 'P'},
    {"indel-model", 1, 0, 'I'},
    {"indel-history", 1, 0, 'H'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  /* arguments and defaults for options */
  FILE *refseq_f = NULL, *msa_f = NULL;
  msa_format_type msa_format = UNKNOWN_FORMAT;
  TreeModel *source_mod;
  double rho = DEFAULT_RHO, mu = DEFAULT_MU, nu = DEFAULT_NU, 
    phi = DEFAULT_PHI, gamma = -1, omega = -1, 
    alpha_c = -1, beta_c = -1, tau_c = -1,
    alpha_n = -1, beta_n = -1, tau_n = -1;
  int set_transitions = FALSE, refidx = 1, estim_phi = TRUE, 
    estim_gamma = TRUE, estim_omega = TRUE;
  char *seqname = NULL, *idpref = NULL;
  IndelHistory *ih = NULL;

  /* for -t, -p, -E and -C a leading '~' on the argument keeps the
     corresponding parameter(s) flagged for estimation; without it the
     estimation flag is cleared */
  while ((c = getopt_long(argc, argv, "R:t:p:E:C:r:M:i:N:P:I:H:h", long_opts, &opt_idx)) != -1) {
    switch (c) {
    case 'R':
      rho = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 't':
      if (optarg[0] != '~') estim_gamma = estim_omega = FALSE;
      else optarg = &optarg[1];
      set_transitions = TRUE;
      tmpl = get_arg_list_dbl(optarg);
      if (lst_size(tmpl) != 2) 
        die("ERROR: bad argument to --transitions.\n");
      mu = lst_get_dbl(tmpl, 0);
      nu = lst_get_dbl(tmpl, 1);
      if (mu <= 0 || mu >= 1 || nu <= 0 || nu >= 1)
        die("ERROR: bad argument to --transitions.\n");
      lst_free(tmpl);
      break;
    case 'p':
      if (optarg[0] != '~') estim_phi = FALSE;
      else optarg = &optarg[1];
      phi = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 'E':
      if (optarg[0] != '~') estim_omega = FALSE;
      else optarg = &optarg[1];
      omega = get_arg_dbl_bounds(optarg, 1, INFTY);
      mu = 1/omega;
      break;
    case 'C':
      if (optarg[0] != '~') estim_gamma = FALSE;
      else optarg = &optarg[1];
      gamma = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 'r':
      refidx = get_arg_int_bounds(optarg, 0, INFTY);
      break;
    case 'M':
      refseq_f = phast_fopen(optarg, "r");
      break;
    case 'i':
      msa_format = msa_str_to_format(optarg);
      if (msa_format == UNKNOWN_FORMAT)
        die("ERROR: unrecognized alignment format.\n");
      break;
    case 'N':
      seqname = optarg;
      break;
    case 'P':
      idpref = optarg;
      break;
    case 'I':
      /* three values set both parameter triples; six values give the
         *_n triple first and the *_c triple second */
      tmpl = get_arg_list_dbl(optarg);
      if (lst_size(tmpl) != 3 && lst_size(tmpl) != 6)
        die("ERROR: bad argument to --indel-model.\n");
      alpha_n = lst_get_dbl(tmpl, 0);
      beta_n = lst_get_dbl(tmpl, 1);
      tau_n = lst_get_dbl(tmpl, 2);
      if (lst_size(tmpl) == 6) {
        alpha_c = lst_get_dbl(tmpl, 3);
        beta_c = lst_get_dbl(tmpl, 4);
        tau_c = lst_get_dbl(tmpl, 5);
      }
      else {
        alpha_c = alpha_n; beta_c = beta_n; tau_c = tau_n;
      }
      if (alpha_c <= 0 || alpha_c >= 1 || beta_c <= 0 || beta_c >= 1 || 
          tau_c <= 0 || tau_c >= 1 || alpha_n <= 0 || alpha_n >= 1 || 
          beta_n <= 0 || beta_n >= 1 || tau_n <= 0 || tau_n >= 1)
        die("ERROR: bad argument to --indel-model.\n");
      lst_free(tmpl);           /* values have been copied out */
      break;
    case 'H':
      fprintf(stderr, "Reading indel history from %s...\n", optarg);
      ih = ih_new_from_file(phast_fopen(optarg, "r"));
      break;
    case 'h':
      printf("%s", HELP);
      exit(0);
    case '?':
      die("Bad argument.  Try 'dless -h'.\n");
    }
  }

  if (optind != argc - 1)
    die("Missing alignment file or model file.  Try 'dless -h'.\n");

  /* NOTE(review): exactly one positional argument is accepted and it is
     opened below both as the tree model and as the alignment, although
     the message above mentions two files -- confirm the intended
     command-line layout */
  msa_fname = argv[optind];

  if (set_transitions && (gamma != -1 || omega != -1))
    die("ERROR: --transitions and --target-coverage/--expected-length cannot be used together.\n");

  if ((gamma != -1 && omega == -1) || (gamma == -1 && omega != -1))
    die("ERROR: --target-coverage and --expected-length must be used together.\n");

  set_seed(-1);

  if (gamma != -1)
    nu = gamma/(1-gamma) * mu;

  fprintf(stderr, "Reading tree model from %s...\n", argv[optind]);
  source_mod = tm_new_from_file(phast_fopen(argv[optind], "r"), 1);

  if (source_mod->nratecats > 1) 
    die("ERROR: rate variation not currently supported.\n");

  if (source_mod->order > 0)
    die("ERROR: only single nucleotide models are currently supported.\n");

  if (!tm_is_reversible(source_mod))
    phast_warning("WARNING: p-value computation assumes reversibility and your model is non-reversible.\n");

  /* read alignment */
  msa_f = phast_fopen(msa_fname, "r");

  fprintf(stderr, "Reading alignment from %s...\n", msa_fname);
  if (msa_format == UNKNOWN_FORMAT) 
    msa_format = msa_format_for_content(msa_f, 1);

  if (msa_format == MAF) {
    msa = maf_read(msa_f, refseq_f, 1, NULL, NULL, NULL, -1, TRUE, NULL, 
                   NO_STRIP, FALSE); 
  }
  else 
    msa = msa_new_from_file_define_format(msa_f, msa_format, NULL);

  if (msa_alph_has_lowercase(msa)) msa_toupper(msa); 
  msa_remove_N_from_alph(msa);

  if (msa->ss == NULL) {
    fprintf(stderr, "Extracting sufficient statistics...\n");
    ss_from_msas(msa, 1, TRUE, NULL, NULL, NULL, -1, 0);
  }
  else if (msa->ss->tuple_idx == NULL)
    die("ERROR: ordered representation of alignment required unless --suff-stats.\n");

  /* prune tree, if necessary */
  old_nnodes = source_mod->tree->nnodes;
  tm_prune(source_mod, msa, pruned_names);

  /* (old_nnodes + 1) / 2 pruned names means every leaf was removed */
  if (lst_size(pruned_names) == (old_nnodes + 1) / 2)
    die("ERROR: no match for leaves of tree in alignment (leaf names must match alignment names).\n");
  if (lst_size(pruned_names) > 0) {
    fprintf(stderr, "WARNING: pruned away leaves of tree with no match in alignment (");
    for (i = 0; i < lst_size(pruned_names); i++)
      fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, i))->chars, 
              i < lst_size(pruned_names) - 1 ? ", " : ").\n");
  }

  /* this has to be done after pruning tree */
  tr_name_ancestors(source_mod->tree);

  /* also make sure match for reference sequence in tree */
  if (refidx > 0) {
    for (i = 0, found = FALSE; !found && i < source_mod->tree->nnodes; i++) {
      TreeNode *n = lst_get_ptr(source_mod->tree->nodes, i);
      if (!strcmp(n->name, msa->names[refidx-1]))
        found = TRUE;
    }
    if (!found) die("ERROR: no match for reference sequence in tree.\n");
  }

  /* checks for indel model (alpha_c stays -1 unless --indel-model given) */
  if (alpha_c > 0) {
    if (ih == NULL) {
      fprintf(stderr, "Reconstructing indel history by parsimony...\n");
      ih = ih_reconstruct(msa, source_mod->tree);
    }
    else {
      if (ih->ncols != msa->length)
        die("ERROR: indel history doesn't seem to match alignment.\n");
      if (ih->tree->nnodes != source_mod->tree->nnodes)
        die("ERROR: indel history doesn't seem to match tree model.\n");
    }
  }

  bdphmm = bd_new(source_mod, rho, mu, nu, phi, alpha_c, beta_c, tau_c, 
                  alpha_n, beta_n, tau_n, estim_gamma, estim_omega, 
                  estim_phi);

  /* compute emissions */
  phmm_compute_emissions(bdphmm->phmm, msa, FALSE);

  /* add emissions for indel model, if necessary */
  if (alpha_c > 0) {
    fprintf(stderr, "Adjusting emissions for indels...\n");
    bd_add_indel_emissions(bdphmm, ih);
  }

  /* postprocess for missing data (requires special handling) */
  fprintf(stderr, "Adjusting emissions for missing data...\n");
  bd_handle_missing_data(bdphmm, msa);

  if (estim_gamma || estim_omega || estim_phi) {
    fprintf(stderr, "Estimating free parameters...\n");
    bd_estimate_transitions(bdphmm, msa);
  }

  /* set seqname and idpref, if necessary */
  if (seqname == NULL || idpref == NULL) {
    /* derive default from file name root */
    String *tmp = str_new_charstr(msa_fname);
    if (!str_equals_charstr(tmp, "-")) {
      str_remove_path(tmp);
      str_root(tmp, '.');
      if (idpref == NULL) idpref = copy_charstr(tmp->chars);
      str_root(tmp, '.');         /* apply one more time for double suffix */
      if (seqname == NULL) seqname = tmp->chars;    
    }
    else if (seqname == NULL) seqname = "refseq";
    /* tmp is deliberately not freed: seqname may point into it */
  }

  /* obtain predictions */
  fprintf(stderr, "Running Viterbi algorithm...\n");
  predictions = phmm_predict_viterbi(bdphmm->phmm, seqname, NULL, idpref, NULL);
  lst_push_ptr(ignore_types, str_new_charstr("nonconserved"));
  gff_filter_by_type(predictions, ignore_types, TRUE, NULL);

  /* score predictions */
  fprintf(stderr, "Scoring predictions...\n");
  bd_score_predictions(bdphmm, predictions);
  
  /* can free emissions now */
  for (i = 0; i < bdphmm->phmm->hmm->nstates; i++)
    sfree(bdphmm->phmm->emissions[i]);
  sfree(bdphmm->phmm->emissions);
  bdphmm->phmm->emissions = NULL;

  /* convert GFF to coord frame of reference sequence and adjust
     coords by idx_offset, if necessary  */
  if (refidx != 0 || msa->idx_offset != 0)
    msa_map_gff_coords(msa, predictions, 0, refidx, msa->idx_offset);

  if (refidx != 0) 
    gff_flatten(predictions);	
  /* necessary because coord conversion might create overlapping
     features (can happen in deletions in reference sequence) */

  /* now output predictions */
  fprintf(stderr, "Writing GFF to stdout...\n");
  gff_print_set(stdout, predictions);

  fprintf(stderr, "Done.\n");
  
  return 0;
}
/* Slides a window of pwm->nrows bases along seqData and, for each window
   start, sums the entries of 'pwm' and 'reverseCmpPWM' selected by the
   bases together with the per-site scores returned by calcMMscore.  A
   feature is added to the returned GFF_Set whenever (PWM sum - MM sum)
   exceeds 'threshold' and the strand selection allows it:
     strand "+"    : forward hits only
     strand "-"    : reverse hits only
     strand "both" : both kinds of hits
     strand "best" : whichever of the two scores higher (forward on ties)
   Feature coordinates are seqIdxOff+i+1 .. seqIdxOff+i+pwm->nrows for
   window start i.  When a base has no column (basetocol < 0): if
   'conservative' is 1 both sums become log(0) and the window is
   abandoned; if 0 both PWM sums are reset to 0 and scanning continues.
   'conservative' must be 0 or 1.  If seqLen < pwm->nrows an empty set is
   returned.  seqAlphLen is not used here. */
GFF_Set *ms_score(char *seqName, char *seqData, int seqLen, int seqIdxOff, int seqAlphLen, List *MarkovMatrices, Matrix *pwm, Matrix *reverseCmpPWM, int conservative, double threshold, char *strand) { 
  int i, k,j,l,col;
  double MMprob, PWMprob=0, ReversePWMprob=0;
  GFF_Set *scores = gff_new_set();
  double *MMprobs;              //Sliding window of MM probabilities (nrows+1 slots)
		
  if ((conservative != 0) && (conservative != 1))
    die("ERROR: Conserverative (boolean) value must be 0 or 1");
	
  if (seqLen < pwm->nrows)  //Check to see if the sequence is shorter than the pwm
    return scores;

  //Allocate only after the early exit above so that path does not leak
  MMprobs = (double*)smalloc((pwm->nrows+1) * sizeof(double));

  for (i = 0; i <= pwm->nrows; i++)							//Calculate MM scores from sites 0 to pwm->nrows
    if (i < seqLen)
      MMprobs[i] = calcMMscore(seqData, i, MarkovMatrices, conservative);
		
  for (i = 0; i <= seqLen-(pwm->nrows); i++) {				//For each base in the sequence
    PWMprob = 0; MMprob = 0; ReversePWMprob = 0;
		
    for (k = 0, j = i; k < pwm->nrows; k++, j++) {		//Sum PWM, ReversePWM, MM probabilities for score calculation
      col = basetocol(seqData[j]);
      if (col >= 0)
        {
          PWMprob += mat_get(pwm, k, col);
          ReversePWMprob += mat_get(reverseCmpPWM, k, col);
          MMprob += MMprobs[k];
        }
      else {		
        if (conservative)	
          {		
            PWMprob = log(0);			//If we get something other than the expected language (i.e. A,C,T,G) i.e. N, then our probability is -Inf
            ReversePWMprob = log(0);
            break;
          }
        else
          {
            PWMprob = 0;										
            ReversePWMprob = 0;
          }
      }
    }
	
    if (i < (seqLen - pwm->nrows)) { //Only if there are more bases in this sequence to test
      for (l = 0; l < pwm->nrows; l++)		//Shift probs left to make room for next
	MMprobs[l] = MMprobs[l + 1];

      MMprobs[pwm->nrows-1] = calcMMscore(seqData, i+pwm->nrows,  //Calculate MM probability for site at (i+pwm->nrows)
                                          MarkovMatrices, conservative);
    }

    if (((PWMprob - MMprob) > threshold) && ((strcmp(strand, "+") == 0) || (strcmp(strand, "both") == 0) || ((strcmp(strand, "best") == 0) && ((PWMprob - MMprob) >= (ReversePWMprob - MMprob))))) {			//If we have a positive score add it to the list of scores
      GFF_Feature *feat = gff_new_feature(str_new_charstr(seqName), str_new_charstr(""), 
                                          str_new_charstr(""), seqIdxOff+i+1, 
                                          seqIdxOff+i+pwm->nrows, (PWMprob - MMprob), '+', 
                                          0, str_new_charstr(""), 0);
      lst_push_ptr(scores->features, feat);
    }

    if (((ReversePWMprob - MMprob) > threshold) && ((strcmp(strand, "-") == 0) || (strcmp(strand, "both") == 0) || ((strcmp(strand, "best") == 0) && ((ReversePWMprob - MMprob) > (PWMprob - MMprob))))) {
      GFF_Feature *feat = gff_new_feature(str_new_charstr(seqName), str_new_charstr(""), 
                                          str_new_charstr(""), seqIdxOff+i+1, 
                                          seqIdxOff+i+pwm->nrows, (ReversePWMprob - MMprob), '-', 
                                          0, str_new_charstr(""), 0);
      lst_push_ptr(scores->features, feat);
    }
  }
  sfree(MMprobs);
  return scores; 
}
Exemple #14
0
int main(int argc, char *argv[]) {
  /* variables for options, with defaults */
  TreeNode *tree = NULL, *merge_tree = NULL, *extrapolate_tree = NULL;
  Hashtable *rename_hash = NULL;
  double scale_factor = 1;
  List *prune_names = NULL, *label = NULL, *labelType = NULL;
  int prune_all_but = FALSE, tree_only = FALSE, dissect = FALSE,
    name_ancestors = FALSE, with_branch = FALSE, print_branchlen=FALSE,
    inNewick=FALSE, no_branchlen = FALSE, print_distance_to_root = FALSE;
  TreeModel *mod = NULL, *merge_mod = NULL;
  char *reroot_name = NULL, *subtree_name =NULL, *get_subtree_name = NULL,
    *node_distance_name = NULL;
  
  /* other variables */
  String *suffix,  *optstr;
  char c;
  int i, opt_idx;
  TreeNode *n;

  struct option long_opts[] = {
    {"scale", 1, 0, 's'},
    {"extrapolate", 1, 0, 'e'},
    {"prune", 1, 0, 'p'},
    {"prune-all-but", 1, 0, 'P'},
    {"get-subtree", 1, 0, 'g'},
    {"merge", 1, 0, 'm'},
    {"rename", 1, 0, 'r'},
    {"tree-only", 0, 0, 't'},
    {"no-branchlen", 0, 0, 'N'},
    {"dissect", 0, 0, 'd'},
    {"name-ancestors", 0, 0, 'a'},
    {"reroot", 1, 0, 'R'},
    {"with-branch", 1, 0, 'B'},
    {"subtree", 1, 0, 'S'},
    {"branchlen", 0, 0, 'b'},
    {"newick", 0, 0, 'n'},
    {"label-subtree", 1, 0, 'L'},
    {"label-branches", 1, 0, 'l'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  while ((c = getopt_long(argc, argv, "s:p:P:g:m:r:R:B:S:D:l:L:adtNbnh", 
                          long_opts, &opt_idx)) != -1) {
    switch (c) {
    case 's':
      scale_factor = get_arg_dbl_bounds(optarg, 0, INFTY);
      break;
    case 'e':
      if (!strcmp(optarg, "default")) {
        optarg = smalloc(1000 * sizeof(char));
        #if defined(__MINGW32__)
          sprintf(optarg, "%s\\data\\exoniphy\\mammals\\cftr25_hybrid.nh",
		  PHAST_HOME);
        #else
          sprintf(optarg, "%s/data/exoniphy/mammals/cftr25_hybrid.nh", 
                  PHAST_HOME);
        #endif
      }
      extrapolate_tree = tr_new_from_file(phast_fopen(optarg, "r"));
      break;
    case 'p':
      prune_names = get_arg_list(optarg);
      break;
    case 'P':
      prune_names = get_arg_list(optarg);
      prune_all_but = TRUE;
      break;
    case 'g':
      get_subtree_name = optarg;
      break;
    case 'm':
      suffix = str_new_charstr(optarg);
      str_suffix(suffix, '.');
      if (str_equals_charstr(suffix, "nh"))
        merge_tree = tr_new_from_file(phast_fopen(optarg, "r"));
      else {
        merge_mod = tm_new_from_file(phast_fopen(optarg, "r"), 1);
        merge_tree = merge_mod->tree;
      }
      break;
    case 'r':
      rename_hash = make_name_hash(optarg);
      break;
    case 't':
      tree_only = TRUE;
      break;
    case 'N':
      no_branchlen = TRUE;
      tree_only = TRUE;
      break;
    case 'd':
      dissect = TRUE;
      break;
    case 'b':
      print_branchlen = TRUE;
      break;
    case 'D':
      print_distance_to_root = TRUE;
      node_distance_name = optarg;
      break;
    case 'R':
      reroot_name = optarg;
      break;
    case 'B':
      with_branch = TRUE;
      break;
    case 'a':
      name_ancestors = TRUE;
      break;
    case 'S':
      subtree_name = optarg;
      break;
    case 'n':
      inNewick=TRUE;
      break;
    case 'L':  //do the same for --label--subtree and --label-branches
    case 'l':
      if (label == NULL) {
	label = lst_new_ptr(1);
	labelType = lst_new_int(1);
      }
      optstr = str_new_charstr(optarg);
      lst_push_ptr(label, optstr);
      lst_push_int(labelType, (int)c);
      break;
    case 'h':
      usage(argv[0]);
    case '?':
      die("Bad argument.  Try '%s -h'.\n", argv[0]);
    }
  }

  if (optind != argc - 1) 
    die("Input filename required.  Try '%s -h'.\n", argv[0]);

  if (merge_tree != NULL && extrapolate_tree != NULL)
    die("ERROR: Can't use --merge and --extrapolate together");

  set_seed(-1);
    
  suffix = str_new_charstr(argv[optind]);
  str_suffix(suffix, '.');
  if (inNewick || str_equals_charstr(suffix, "nh")) {
    tree = tr_new_from_file(phast_fopen(argv[optind], "r"));
    tree_only = TRUE;           /* can't output tree model in this case */
  }
  else {
    mod = tm_new_from_file(phast_fopen(argv[optind], "r"), 1);
    tree = mod->tree;
  }

  if (prune_names != NULL) {
    tr_prune(&tree, prune_names, prune_all_but, NULL);
    if (mod != NULL) mod->tree = tree; /* root may have changed */
  }

  if (get_subtree_name != NULL) {
    n = tr_get_node(tree, get_subtree_name);
    if (n == NULL) {
      tr_name_ancestors(tree);
      n = tr_get_node(tree, get_subtree_name);
      if (n == NULL) {
	die("ERROR: no node named '%s'.\n", subtree_name);
      }
    }
    tr_prune_supertree(&tree, n);
    if (mod != NULL) mod->tree = tree;
  }

  if (merge_tree != NULL) {
    tree = tr_hybrid(tree, merge_tree);
    if (mod != NULL) mod->tree = tree;
  }

  else if (extrapolate_tree != NULL) {
    tr_scale_by_subtree(extrapolate_tree, tree);
    tree = extrapolate_tree;
    if (mod != NULL) mod->tree = tree;
  }

  if (scale_factor != 1) {
    if (subtree_name == NULL)
      tr_scale(tree, scale_factor);
    else {
      n = tr_get_node(tree, subtree_name);
      if (n == NULL) die("ERROR: no node named '%s'.\n", subtree_name);
      tr_scale_subtree(tree, n, scale_factor, with_branch);
    }
  }

  if (name_ancestors)
    tr_name_ancestors(tree);

  if (rename_hash != NULL) {
    char *newname;
    for (i = 0; i < tree->nnodes; i++) {
      n = lst_get_ptr(tree->nodes, i);
      if (n->name != NULL && n->name[0] != '\0' && 
          (newname = hsh_get(rename_hash, n->name)) != (char*)-1) {
        strcpy(n->name, newname);
      }
    }
  }

  if (reroot_name != NULL) {
    n = tr_get_node(tree, reroot_name);
    if (n == NULL) die("ERROR: no node named '%s'.\n", reroot_name);
    tr_reroot(tree, n, with_branch);
    if (mod != NULL) mod->tree = with_branch ? n->parent : n;
    tree = with_branch ? n->parent : n;
  }

  if (label != NULL) {
    for (i=0; i < lst_size(label); i++) {
      String *currstr = (String*)lst_get_ptr(label, i), *arg1, *labelVal;
      List *tmplst = lst_new_ptr(10);
      String *nodename;
      int j;
      str_split(currstr, ":", tmplst);
      if (lst_size(tmplst) != 2) 
	die("ERROR: bad argument to --label-branches or --label-subtree.\n");
      arg1 = lst_get_ptr(tmplst, 0);
      labelVal = lst_get_ptr(tmplst, 1);
      lst_clear(tmplst);
      if (lst_get_int(labelType, i) == (int)'l') {
	str_split(arg1, ",", tmplst);
	for (j=0; j < lst_size(tmplst); j++) {
	  nodename = (String*)lst_get_ptr(tmplst, j);
	  tr_label_node(tree, nodename->chars, labelVal->chars);
	}
	lst_free_strings(tmplst);
      } else if (lst_get_int(labelType, i) == (int)'L') {
	int include_leading_branch = FALSE;
	TreeNode *node;
	nodename = arg1;
	node = tr_get_node(tree, nodename->chars);
	if (node == NULL && nodename->chars[nodename->length-1] == '+') {
	  nodename->chars[--nodename->length] = '\0';
	  node = tr_get_node(tree, nodename->chars);
	  include_leading_branch = TRUE;
	}
	tr_label_subtree(tree, nodename->chars, include_leading_branch, 
			 labelVal->chars);
      } else die("ERROR got label_type %c\n", lst_get_int(labelType, (char)i));
      str_free(arg1);
      str_free(labelVal);
      lst_free(tmplst);
      str_free(currstr);
    }
    lst_free(label);
    lst_free(labelType);
  }

  if (dissect) 
    tr_print_nodes(stdout, tree);
  if (print_branchlen) 
    printf("TOTAL_TREE_LEN: %f\n", tr_total_len(tree));
  if (print_distance_to_root) {
    TreeNode *node = tr_get_node(tree, node_distance_name);
    if (node == NULL) 
      die("ERROR: no node named '%s'.\n", node_distance_name);
    printf("length(root-%s): %f\n", node_distance_name, 
	   tr_distance_to_root(node));
  }

  if (dissect==0 && print_branchlen==0 && print_distance_to_root==0) {
    if (tree_only)
      tr_print(stdout, tree, no_branchlen==FALSE);
    else
      tm_print(stdout, mod);
  }
  return 0;
}
Exemple #15
0
//Builds a new MafSubBlock from one MAF block line whose first field is 's'
//or 'e'.  The line must split into exactly seven fields:
//  0: line type, 1: src, 2: start, 3: size, 4: strand, 5: srcSize,
//  6: sequence (s-line) or one-character status (e-line).
//The src field (and, for s-lines, the sequence field) is stored in the
//returned object directly; every other field string is freed here.
//Any malformed field ends in die().
MafSubBlock *mafBlock_get_subBlock(String *line) {
  List *fields = lst_new_ptr(7);
  String *tok;
  MafSubBlock *sub;
  int nfields, k, isSeqLine;

  nfields = str_split(line, NULL, fields);
  if (nfields != 7)
    die("Error: mafBlock_get_subBlock expected seven fields in MAF line starting "
	"with %s\n",
	((String*)lst_get_ptr(fields, 0))->chars);

  sub = mafBlock_new_subBlock();

  //field 0: line type, 's' or 'e'
  tok = (String*)lst_get_ptr(fields, 0);
  if (str_compare_charstr(tok, "s")==0) {
    sub->lineType[0]='s';
  }
  else if (str_compare_charstr(tok, "e")==0) {
    sub->lineType[0]='e';
  }
  else {
    die("ERROR: mafBlock_get_subBlock expected first field 's' or 'e' (got %s)\n",
	tok->chars);
  }

  //field 1: src, kept as is; specName is a copy reduced by str_shortest_root
  sub->src = (String*)lst_get_ptr(fields, 1);
  sub->specName = str_new_charstr(sub->src->chars);
  str_shortest_root(sub->specName, '.');

  //fields 2 and 3: start and size
  tok = (String*)lst_get_ptr(fields, 2);
  sub->start = atol(tok->chars);
  tok = (String*)lst_get_ptr(fields, 3);
  sub->size = atoi(tok->chars);

  //field 4: strand, '+' or '-'
  tok = (String*)lst_get_ptr(fields, 4);
  if (str_compare_charstr(tok, "+")==0) {
    sub->strand = '+';
  }
  else if (str_compare_charstr(tok, "-")==0) {
    sub->strand = '-';
  }
  else {
    die("ERROR: got strand %s\n", tok->chars);
  }

  //field 5: srcSize
  tok = (String*)lst_get_ptr(fields, 5);
  sub->srcSize = atol(tok->chars);

  //field 6: sequence for an s-line, status character for an e-line
  tok = (String*)lst_get_ptr(fields, 6);
  isSeqLine = (sub->lineType[0]=='s');
  if (!isSeqLine) {
    if (sub->lineType[0] != 'e')
      die("ERROR mafBlock_get_subBlock: bad lineType (expected 'e', got %c)\n",
	  sub->lineType[0]);
    if (tok->length != 1)
      die("ERROR: e-Line with status %s in MAF block\n", tok->chars);
    sub->eStatus = tok->chars[0];
    //status 'T' is accepted although its meaning is not known (it is not in
    //the MAF documentation, but it occurs in the 44-way MAFs)
    switch (sub->eStatus) {
    case 'C':
    case 'I':
    case 'M':
    case 'n':
    case 'T':
      break;
    default:
      die("ERROR: e-Line has illegal status %c\n", sub->eStatus);
    }
  }
  else {
    sub->seq = tok;
  }
  sub->numLine = 1;

  //release every field string that was not handed over to sub
  for (k=0; k<7; k++) {
    if (k==1) continue;                 //src is owned by sub
    if (k==6 && isSeqLine) continue;    //sequence is owned by sub
    str_free((String*)lst_get_ptr(fields, k));
  }
  lst_free(fields);
  return sub;
}
Exemple #16
0
/* Write the group attribute for group number 'groupno' into 'buf'
   (never more than 'buflen' bytes, always terminated): the tag
   ('grouptag', or "id" if NULL) followed by the quoted group number,
   prefixed with "idpref." when 'idpref' is not NULL. */
static void cm_format_groupstr(char *buf, size_t buflen, const char *grouptag,
                               const char *idpref, int groupno) {
  const char *tag = grouptag != NULL ? grouptag : "id";
  if (idpref != NULL)
    snprintf(buf, buflen, "%s \"%s.%d\"", tag, idpref, groupno);
  else
    snprintf(buf, buflen, "%s \"%d\"", tag, groupno);
}

/* Create a GFF_Set from a sequence of category/state numbers, using
   a specified category map and mapping from raw state numbers to
   category numbers.
     path:          state number for each of the 'length' positions
     path_to_cat:   category number for each state
     reverse_compl: nonzero for states reported on the '-' strand
     seqname, source: copied into every feature
     frame_cats:    names of categories for which a frame is reported
                    (may be NULL); the frame is the offset of the
                    position's category from the start of its range
     grouptag, idpref: used to build the group attribute; the group
                    number is incremented after each run of category 0
                    that does not start at the first position
   Runs of positions whose categories share the same range start become
   one feature (1-based, inclusive coordinates).  Category 0 runs are
   omitted when category 0 is named BACKGD_CAT_NAME.  Returns the new
   set, which is empty when length <= 0. */
GFF_Set *cm_labeling_as_gff(CategoryMap *cm, int *path, 
                            int length, int *path_to_cat, 
                            int *reverse_compl, char *seqname, 
                            char *source, List *frame_cats, 
                            char *grouptag,  char *idpref
                            ) {
  int beg, end, i, cat, frame, groupno;
  GFF_Set *gff = gff_new_set_init("PHAST", PHAST_VERSION);
  int do_frame[cm->ncats+1];
  char strand;
  char groupstr[STR_SHORT_LEN];
  int ignore_0 = str_equals_charstr(cm_get_feature(cm, 0), BACKGD_CAT_NAME);
                                /* ignore category 0 if background  */

  if (length <= 0) return gff;

  for (i = 0; i <= cm->ncats; i++) do_frame[i] = 0;
  if (frame_cats != NULL)
    for (i = 0; i < lst_size(frame_cats); i++) {
      int frame_cat = cm_get_category(cm, lst_get_ptr(frame_cats, i));
      if (frame_cat != 0)       /* ignore background or unrecognized name */
        do_frame[frame_cat] = 1;
    }

  groupno = 1;
  cm_format_groupstr(groupstr, sizeof(groupstr), grouptag, idpref, groupno);

  i = 0;
  while (i < length) {
    checkInterruptN(i, 10000);
    cat = cm->ranges[path_to_cat[path[i]]]->start_cat_no;
    strand = reverse_compl[path[i]] ? '-' : '+';
    frame = do_frame[cat] ? path_to_cat[path[i]] - cat : GFF_NULL_FRAME;

    /* scan ahead until enter new category range (or reach end of seq) */
    beg = i + 1;                /* begin of feature (GFF coords) */
    for (i++; i < length && 
           cm->ranges[path_to_cat[path[i]]]->start_cat_no == cat; i++);
    end = i;                    /* end of feature (GFF coords) */

    /* if minus strand, adjust frame to reflect end */
    if (strand == '-' && do_frame[cat]) 
      frame = path_to_cat[path[i-1]] - cat;

    /* if legitimate feature (non-background), then incorp into GFF_Set */
    if (cat != 0 || !ignore_0)  /* create new feature and add */
      lst_push_ptr(gff->features, 
                   gff_new_feature(str_new_charstr(seqname), 
                                   str_new_charstr(source), 
                                   str_dup(cm_get_feature(cm, cat)), 
                                   beg, end, 0, strand, frame, 
                                   str_new_charstr(groupstr), TRUE));

    if (cat == 0 && beg > 1) {
      groupno++;                /* increment group number each time a
                                   sequence of 0s is encountered  */
      cm_format_groupstr(groupstr, sizeof(groupstr), grouptag, idpref, 
                         groupno);
    }
  }

  return gff;
}
/* Entry point for eval_predictions: compares predicted exons against
   real (annotated) exons, one pair of GFF files at a time, and prints
   one row of nucleotide- and exon-level statistics per pair to stdout,
   followed by a TOTAL row when more than one pair is given.

   Options (see the getopt string below):
     -r <list>  real (annotation) GFF filenames                [required]
     -p <list>  predicted GFF filenames, same count as -r      [required]
     -l <list>  sequence lengths; a single value is reused for
                every file pair                                [required]
     -f <list>  feature types treated as exons (default GFF_CDS_TYPE)
     -d <pref>  dump per-exon details, passing <pref> to dump()
     -n <int>   also count "nearly correct" exons, using this threshold
     -h         print usage and exit

   Returns 0 on success; argument and I/O errors go through die(). */
int main(int argc, char* argv[]) {
  FILE* F;
  GFF_Set *gff_real=NULL, *gff_pred=NULL;
  int c;                        /* must be int: getopt returns int, and
                                   where plain char is unsigned a char
                                   can never compare equal to -1 */
  List *real_fname_list = NULL, *pred_fname_list = NULL, 
    *feat_list = NULL, *seq_len_list = NULL, *l = NULL;
  int nfile, i, j;
  char *prefix = NULL;
  int tot_tp = 0, tot_fp = 0, tot_nreal_pos = 0, tot_npred_pos = 0, 
    tot_seqlen = 0, tot_ncr = 0, tot_npca = 0, tot_nola = 0, tot_nme = 0, 
    tot_npcp = 0, tot_nolp = 0, tot_nwe = 0, tot_nexons_real = 0, 
    tot_nexons_pred = 0, dump_exons = 0, nnc = -1, tot_nnc = -1, 
    nc_threshold = 0;

  while ((c = getopt(argc, argv, "r:p:f:l:d:n:h")) != -1) {
    switch(c) {
    case 'r':
      real_fname_list = get_arg_list(optarg);
      break;
    case 'p':
      pred_fname_list = get_arg_list(optarg);
      break;
    case 'l':
      l = get_arg_list(optarg);
      /* convert to ints */
      seq_len_list = lst_new_int(lst_size(l));
      for (i = 0; i < lst_size(l); i++) {
        int tmp;
        if (str_as_int((String*)lst_get_ptr(l, i), 
                       &tmp) != 0) {
          die("ERROR: Bad integer in <seq_len_list>.\n"); 
        }
        lst_push_int(seq_len_list, tmp);
      }
      break;
    case 'f':
      feat_list = get_arg_list(optarg);
      break;
    case 'd':
      dump_exons = 1;
      prefix = optarg;
      break;
    case 'n':
      /* -1 means "nearly correct" counting is disabled; switching to 0
         enables the extra NC columns and counters */
      nnc = tot_nnc = 0;
      nc_threshold = get_arg_int(optarg);
      break;
    case 'h':
      print_usage();
      exit(0);
    case '?':
      die("Unrecognized option.  Try \"eval_predictions -h\" for help.\n");
    }
  }

  set_seed(-1);

  /* default: only CDS features are treated as exons */
  if (feat_list == NULL) {
    feat_list = lst_new_ptr(1);
    lst_push_ptr(feat_list, str_new_charstr(GFF_CDS_TYPE));
  }
  
  if (real_fname_list == NULL || pred_fname_list == NULL || 
      seq_len_list == NULL) {
    die("ERROR: Must specify -r, -p, and -l.  Try \"eval_predictions -h\" for help.\n");
  }

  if (lst_size(real_fname_list) != lst_size(pred_fname_list)) {
    die("ERROR: Must specify lists of equal length for real and predicted filenames.\n\n.");
  }

  /* a single sequence length is replicated for every file pair */
  if (lst_size(seq_len_list) == 1 && lst_size(real_fname_list) > 1)
    for (i = 1; i < lst_size(real_fname_list); i++)
      lst_push_int(seq_len_list, lst_get_int(seq_len_list, 0));
  else if (lst_size(seq_len_list) != lst_size(real_fname_list))
    die("ERROR: List of sequence lengths does not match lists of real and predicted filenames.\n");

  /* print header */
  printf("%-25s %-25s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s", "Real_fname", "Pred_fname", "Sn", "Sp", "AC", "CC", "ESn", "ESp", "CRa", "PCa", "OLa", "ME", "CRp", "PCp", "OLp", "WE");
  if (nnc != -1) printf(" %7s %7s %7s %7s", "NCa", "NCp", "CR+NCa", "CR+NCp");
  printf("\n");

  for (nfile = 0; nfile < lst_size(real_fname_list); nfile++) {
    int tp, fp, nexons_real, nexons_pred, nwe, nme, ncr, npca, 
      npcp, nola, nolp, nreal_pos, npred_pos, len_real, len_pred, seqlen,
      already_counted_real;
    String *real_fname, *pred_fname;
    GFF_Feature *feat_real, *feat_pred=NULL;

    real_fname = (String*)lst_get_ptr(real_fname_list, nfile);
    F = phast_fopen(real_fname->chars, "r");
    if ((gff_real = gff_read_set(F)) == NULL) {
      die("ERROR: Unable to read file \"%s\".\n", 
          real_fname->chars);
    }
    phast_fclose(F);

    pred_fname = (String*)lst_get_ptr(pred_fname_list, nfile);
    F = phast_fopen(pred_fname->chars, "r");
    if ((gff_pred = gff_read_set(F)) == NULL) {
      die("ERROR: Unable to read file \"%s\".\n", 
          pred_fname->chars);
    }
    phast_fclose(F);

    seqlen = lst_get_int(seq_len_list, nfile);

    /* sort ungrouped -- only cds exons will be considered, and each
       one will be considered individually */
    gff_ungroup(gff_real); 
    gff_ungroup(gff_pred);
    gff_sort(gff_real);
    gff_sort(gff_pred);

    nexons_real = nexons_pred = nwe = nme = ncr = npca = npcp = nola = 
      nolp = tp = fp = nreal_pos = npred_pos = 0;
    if (nnc != -1) nnc = 0;
    i = j = 0;                  /* i indexes real features, j predicted */
    already_counted_real = 0;
    while (i < lst_size(gff_real->features)) {
      feat_real = (GFF_Feature*)lst_get_ptr(gff_real->features, i);
      if (!is_exon(feat_real, feat_list)) { i++; continue; }

      len_real = feat_real->end - feat_real->start + 1;

      if (!already_counted_real) {
        nexons_real++;
        nreal_pos += len_real;
      }

      /* look at all predicted exons up to and overlapping this real exon */
      while (j < lst_size(gff_pred->features)) {
        feat_pred = (GFF_Feature*)lst_get_ptr(gff_pred->features, j);
        if (!is_exon(feat_pred, feat_list)) {
          j++;
          continue;
        }
        else if (feat_pred->start > feat_real->end) {
          /* prediction lies entirely past the real exon: missing exon,
             unless an earlier prediction already accounted for it */
          if (!already_counted_real) {
            nme++;
            if (dump_exons) dump(prefix, feat_real, NULL, ME, -1);
          }
          break;
        }

        /* otherwise we have a predicted exon to count (start of pred
           <= end of real) */
        nexons_pred++;
        len_pred = feat_pred->end - feat_pred->start + 1;
        npred_pos += len_pred;
        j++;                    /* we'll be done with this prediction
                                   one way or another; next time
                                   through look at a new one */

        if (feat_pred->end < feat_real->start) { /* WE */
          nwe++;
          fp += len_pred;
          if (dump_exons) dump(prefix, NULL, feat_pred, WE, 0);
        }
        else if (feat_pred->start == feat_real->start && /* CR */
                 feat_pred->end == feat_real->end) {
          ncr++;
          tp += len_pred;
          if (dump_exons) dump(prefix, feat_real, feat_pred, CR, 1);
          break;
        }
        else if (feat_pred->start == feat_real->start || /* PC */
                 feat_pred->end == feat_real->end) {
          pred_type type;
          npca++;
          npcp++;
          if (nnc != -1 && 
              max(abs(feat_pred->start - feat_real->start), 
                  abs(feat_pred->end - feat_real->end)) <= nc_threshold) {
            nnc++; 
            type = NC;
          }
          else type = PC;
          /* one boundary is shared, so the shorter of the two exons
             lies inside the longer one */
          if (len_pred < len_real) 
            tp += len_pred;
          else {
            tp += len_real;
            fp += (len_pred - len_real);
          }
          if (dump_exons) dump(prefix, feat_real, feat_pred, type, 
                               min(1, (double)len_real/len_pred));
          break;
        }
        else {                  /* OL */
          int overlap_size;
          pred_type type;
          nola++;
          nolp++;
          if (nnc != -1 && 
              max(abs(feat_pred->start - feat_real->start), 
                  abs(feat_pred->end - feat_real->end)) <= nc_threshold) {
            nnc++; 
            type = NC;
          }
          /* NOTE(review): this is the OL branch but the dump type falls
             back to PC -- confirm whether an OL type was intended */
          else type = PC;

          overlap_size = min(feat_pred->end, feat_real->end) - 
            max(feat_pred->start, feat_real->start) + 1;
          tp += overlap_size;
          fp += len_pred - overlap_size;
          if (dump_exons) dump(prefix, feat_real, feat_pred, type,
                               (double)overlap_size/len_pred);
          break;
        }
        /* NOTE: I'm ignoring the possibility that a predicted exon
           could be a PC and/or OL with respect to multiple real
           exons.  The effect on the exon-level stats will be fairly
           minor (at worst a predicted exon is scored as an OL when it
           should be scored as an PC, and a real exon is erroneously
           counted as a ME), but the effect on the nucleotide-level Sn
           and Sp could conceivably be significant.  */
      }

      /* if we have counted at least one prediction (and thus failed
         to reach the end of the list), but the last prediction did
         not extend as far as the end of the real exon, then delay
         moving on to the next real exon */
      if (j < lst_size(gff_pred->features) && feat_pred->end < feat_real->end) 
          already_counted_real = 1;
      else {
        /* if we reached the end of the list of predictions, then it
           must not have contained any exons, and the real exon in
           question is a ME (if it hasn't already been counted) */
        if (j == lst_size(gff_pred->features) && !already_counted_real) 
          nme++; 

        i++;
        already_counted_real = 0;
      }
    }
    
    /* any remaining predictions must be wrong */
    /* NOTE(review): only the exon counters are updated here; fp and
       npred_pos are left untouched for these leftover predictions --
       confirm that this is intended */
    for (; j < lst_size(gff_pred->features); j++) {
      if (is_exon((GFF_Feature*)lst_get_ptr(gff_pred->features, j), 
                  feat_list)) {
        nexons_pred++;
        nwe++;
      }
    }

    compute_and_print_stats(stdout, real_fname, pred_fname, 
                            tp, fp, nreal_pos, npred_pos, seqlen, ncr, 
                            npca, nola, nme, npcp, nolp, nwe, 
                            nexons_real, nexons_pred, nnc);

    /* accumulate per-file counts for the TOTAL row */
    tot_tp += tp;
    tot_fp += fp;
    tot_nreal_pos += nreal_pos;
    tot_npred_pos += npred_pos;
    tot_seqlen += seqlen;
    tot_ncr += ncr;
    tot_npca += npca;
    tot_nola += nola;
    tot_nme += nme;
    tot_npcp += npcp;
    tot_nolp += nolp;
    tot_nwe += nwe;
    tot_nexons_real += nexons_real;
    tot_nexons_pred += nexons_pred;
    if (nnc != -1) tot_nnc += nnc;

    if (dump_exons && SUMF != NULL)
      fprintf(SUMF, "# Total number of bases in real exons: %d\n", nreal_pos);

    gff_free_set(gff_real);
    gff_free_set(gff_pred);
  }

  if (lst_size(real_fname_list) > 1)
    compute_and_print_stats(stdout, str_new_charstr("TOTAL"), str_new_charstr(""), 
                            tot_tp, tot_fp, tot_nreal_pos, tot_npred_pos, 
                            tot_seqlen, tot_ncr, tot_npca, tot_nola, tot_nme, 
                            tot_npcp, tot_nolp, tot_nwe, tot_nexons_real, 
                            tot_nexons_pred, tot_nnc);

  return 0;
}
Exemple #18
0
/* Extract the values of one attribute tag from every feature of a GFF
   set (R interface).

   gffP: external pointer wrapping a GFF_Set
   tagP: character value naming the tag to look up (trimmed before use)

   Returns R_NilValue if the set has no features.  Otherwise returns the
   result of rph_listOfLists_to_SEXP on a list holding one character
   vector per feature; a feature in which the tag is not found
   contributes a single empty string.

   Each ';'-separated attribute is first tried as GFF version 3
   ("tag=val1,val2,...") and otherwise parsed as GFF version 2
   ("tag val1 val2 ..."). */
SEXP rph_gff_one_attribute(SEXP gffP, SEXP tagP) {
  GFF_Set *gff = (GFF_Set*)EXTPTR_PTR(gffP);
  GFF_Feature *f;
  ListOfLists *lol;
  List *l1, *l2;
  int numtag, numval, i, j, k, resultLen, maxResultLen=10;
  String *currStr, *tag, *currTag;
  char **result;
  SEXP rv;
  SEXP rph_listOfLists_to_SEXP(ListOfLists *lol);


  if (lst_size(gff->features) == 0) return R_NilValue;
  gff_register_protect(gff);
  result = smalloc(maxResultLen*sizeof(char*));
  tag = str_new_charstr(CHARACTER_VALUE(tagP));
  str_double_trim(tag);
  lol = lol_new(lst_size(gff->features));
  l1 = lst_new_ptr(10);
  l2 = lst_new_ptr(10);
  for (i=0; i < lst_size(gff->features); i++) {
    checkInterruptN(i, 1000);
    resultLen=0;
    f = (GFF_Feature*) lst_get_ptr(gff->features, i);
    numtag = str_split_with_quotes(f->attribute, ";", l1);  //split tags
    for (j=0; j < numtag; j++) {
      currStr = (String*)lst_get_ptr(l1, j);
      str_double_trim(currStr);

      //first try gff version 3, see if we have tag=val format
      numval = str_split_with_quotes(currStr, "=", l2);
      if (numval == 2) {
        currTag = (String*)lst_get_ptr(l2, 0);
        str_double_trim(currTag);
        if (str_equals(tag, currTag)) {  // tag matches target, add all values to list
          // copy the value part so l2 can be reused for the ',' split
          currStr = str_new_charstr(((String*)lst_get_ptr(l2, 1))->chars);
          lst_free_strings(l2);
          numval = str_split_with_quotes(currStr, ",", l2);
          str_free(currStr);
          for (k=0; k < numval; k++) {
            currStr = lst_get_ptr(l2, k);
            str_double_trim(currStr);
            str_remove_quotes(currStr);
            // grow before writing slot resultLen; the test must be >=
            // because valid indices are 0 .. maxResultLen-1
            if (resultLen >= maxResultLen) {
              maxResultLen += 100;
              result = srealloc(result, maxResultLen*sizeof(char*));
            }
            result[resultLen++] = copy_charstr(currStr->chars);
          }
        }
      } else {
        lst_free_strings(l2);

        //gff version 2
        //split into tag val val ... by whitespace unless enclosed in quotes
        numval =  str_split_with_quotes(currStr, NULL, l2);
        if (numval > 1) {
          currStr = (String*)lst_get_ptr(l2, 0);
          str_double_trim(currStr);
          if (str_equals(tag, currStr)) {  //tag matches target, add all values to list
            for (k=1; k < numval; k++) {
              currStr = (String*)lst_get_ptr(l2, k);
              str_double_trim(currStr);
              str_remove_quotes(currStr);
              // same bounds rule as above: grow when the array is full
              if (resultLen >= maxResultLen) {
                maxResultLen += 100;
                result = srealloc(result, maxResultLen*sizeof(char*));
              }
              result[resultLen++] = copy_charstr(currStr->chars);
            }
          }
        }
        lst_free_strings(l2);
      }
    }
    if (resultLen == 0)
      result[resultLen++] = copy_charstr("");  //empty string will be converted to NA later
    lol_push_charvec(lol, result, resultLen, NULL);
    // the copies made with copy_charstr are released once pushed
    for (j=0; j < resultLen; j++) sfree(result[j]);
  }
  PROTECT(rv = rph_listOfLists_to_SEXP(lol));
  UNPROTECT(1);
  return rv;
}
Exemple #19
0
int main(int argc, char* argv[]) {
  char *maf_fname = NULL, *out_root_fname = "maf_parse", *masked_fn = NULL;
  String *refseq = NULL, *currRefseq;
  int opt_idx, startcol = 1, endcol = -1, include = 1, splitInterval = -1;
  char c, outfilename[1000], splitFormat[100]="%s%.1i.maf", *group_tag = NULL;
  List *order_list = NULL, *seqlist_str = NULL, *cats_to_do_str=NULL, *cats_to_do=NULL;
  MafBlock *block;
  FILE *mfile, *outfile=NULL, *masked_file=NULL;
  int useRefseq=TRUE, currLen=-1, blockIdx=0, currSize, sortWarned=0;
  int lastIdx = 0, currStart=0, by_category = FALSE, i, pretty_print = FALSE;
  int lastStart = -1, gffSearchIdx=0;
  GFF_Set *gff = NULL, *gffSub;
  GFF_Feature *feat;
  CategoryMap *cm = NULL;
  int base_mask_cutoff = -1, stripILines=FALSE, stripELines=FALSE;//, numspec=0;
  List *outfileList=NULL;
  Hashtable *outfileHash=NULL;//, *specNameHash=NULL;
  msa_format_type output_format = MAF;
  MSA *msa = NULL;//, **catMsa;
  char *mask_features_spec_arg=NULL;
  List *mask_features_spec=NULL;
  

  struct option long_opts[] = {
    {"start", 1, 0, 's'},
    {"end", 1, 0, 'e'},
    {"seqs", 1, 0, 'l'},
    {"exclude", 0, 0, 'x'},
    {"order", 1, 0, 'O'},
    {"split", 1, 0, 'S'},
    {"out-root", 1, 0, 'r'},
    {"out-root-digits", 1, 0, 'd'},
    {"no-refseq", 0, 0, 'n'},
    {"features", 1, 0, 'g'},
    {"by-category", 0, 0, 'L'},
    {"do-cats", 1, 0, 'C'},
    {"catmap", 1, 0, 'c'},
    {"by-group", 1, 0, 'P'},
    {"mask-bases", 1, 0, 'b'},
    {"masked-file", 1, 0, 'm'},
    {"strip-i-lines", 0, 0, 'I'},
    {"strip-e-lines", 0, 0, 'E'},
    {"mask-features", 1, 0, 'M'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };


  while ((c = getopt_long(argc, argv, "s:e:l:O:r:S:d:g:c:P:b:o:m:M:pLnxEIh", long_opts, &opt_idx)) != -1) {
    switch(c) {
    case 's':
      startcol = get_arg_int(optarg);
      break;
    case 'e':
      endcol = get_arg_int(optarg);
      break;
    case 'l':
      seqlist_str = get_arg_list(optarg);
      break;
    case 'O':
      order_list = get_arg_list(optarg);
      break;
    case 'x':
      include = FALSE;
      break;
    case 'S':
      splitInterval = atoi(optarg);
      break;
    case 'r':
      out_root_fname = optarg;
      break;
    case 'd':
      sprintf(splitFormat, "%%s%%.%si.%%s", optarg);
      break;
    case 'n':
      useRefseq = FALSE;
      break;
    case 'g':
      gff = gff_read_set(phast_fopen(optarg, "r"));
      gff_sort(gff);
      stripILines=TRUE;
      stripELines=TRUE;
      break;
    case 'c':
      cm = cm_new_string_or_file(optarg);
      break;
    case 'C':
      cats_to_do_str = get_arg_list(optarg);
      break;
    case 'L':
      by_category = TRUE;
      break;
    case 'P':
      group_tag = optarg;
      break;
    case 'b':
      base_mask_cutoff = atoi(optarg);
      break;
    case 'm':
      masked_fn = optarg;
      break;
    case 'M':
      mask_features_spec_arg = optarg;
      break;
    case 'E':
      stripELines=TRUE;
      break;
    case 'I':
      stripILines=TRUE;
      break;
    case 'o':
      output_format = msa_str_to_format(optarg);
      if (output_format == UNKNOWN_FORMAT) 
	die("ERROR: bad output format.  Try \"maf_parse -h\" for help.\n");
      if (output_format != MAF)
	die("Sorry, only MAF format output has been implemented right now.\n");
      break;
    case 'p':
      pretty_print = TRUE;
      break;
    case 'h':
      print_usage();
      exit(0);
    case '?':
      die("Bad argument.  Try 'maf_parse -h' for help.\n");
    }
  }

  if (optind >= argc) 
    die("Missing alignment filename.  Try 'maf_parse -h' for help.\n");
  else if (optind == argc - 1) 
    maf_fname = argv[optind];
  else 
    die("ERROR: Too many arguments.  Try 'maf_parse -h' for help.\n");
  
  set_seed(-1);

  if (startcol < 1 || (endcol != -1 && endcol < startcol))
    die("ERROR: must have 1 <= start <= end <= [msa_length]\n");

  if ((group_tag != NULL || by_category) && gff == NULL)
    die("ERROR: --by-category and --by-group require --features.  Try \"maf_parse -h\""
	" for help.\n");

  if (group_tag != NULL && by_category) 
    die("ERROR: --by-category and --by-group cannot be used together.  Try \"maf_parse -h\""
	" for help.\n");
  
  if (splitInterval != -1 && gff != NULL)
    die("ERROR: can't use --split and --features together.  Try \"maf_parse -h\""
	"for help\n");

  if (group_tag != NULL || by_category) {
    outfileList = lst_new_ptr(10);
    outfileHash = hsh_new(100);
  }

  if (gff != NULL && cm == NULL) 
    cm = cm_new_from_features(gff);

  if (cats_to_do_str != NULL) {
    cats_to_do = cm_get_category_str_list(cm, cats_to_do_str, FALSE);
    if (gff != NULL) 
      gff_filter_by_type(gff, cats_to_do, 0, NULL);
  }

  if (masked_fn != NULL) {
    if (base_mask_cutoff == -1)
      die("ERROR: need to use --mask-bases with --masked-file");
    masked_file = phast_fopen(masked_fn, "w");
  }

  if (mask_features_spec_arg != NULL) {
    if (gff==NULL)
      die("ERROR: need --features with --mask-features");
    mask_features_spec = lst_new_ptr(10);
    str_split(str_new_charstr(mask_features_spec_arg), ",", mask_features_spec);
    for (i=0; i < lst_size(mask_features_spec); i++) {
      fprintf(stderr, "masking species %s within features\n", 
	      ((String*)lst_get_ptr(mask_features_spec, i))->chars);
    }
  }

  /* Check to see if --do-cats names a feature which is length 1. 
     If so, set output_format to SS ? or FASTA ? */
  
  mfile = phast_fopen(maf_fname, "r");
  block = mafBlock_read_next(mfile, NULL, NULL);

  if (splitInterval == -1 && gff==NULL) {
    //TODO: do we want to copy header from original MAF in this case?
    mafBlock_open_outfile(NULL, argc, argv);
  }

  while (block != NULL) {
    if (order_list != NULL)
      mafBlock_reorder(block, order_list);
    if (seqlist_str != NULL)
      mafBlock_subSpec(block, seqlist_str, include);
    if (mafBlock_numSpec(block)==0 || mafBlock_all_gaps(block)) 
      goto get_next_block;
    if (stripILines)
      mafBlock_strip_iLines(block);
    if (stripELines)
      mafBlock_strip_eLines(block);
    if (base_mask_cutoff != -1)
      mafBlock_mask_bases(block, base_mask_cutoff, masked_file);
    //TODO: still need to implement (either here or elsewhere)
    //    if (indel_mask_cutoff != -1) 
    //      mafBlock_mask_indels(block, indel_mask_cutoff, mfile);

    if (useRefseq) {  //get refseq and check that it is consistent in MAF file
      currRefseq = mafBlock_get_refSpec(block);
      if (refseq == NULL) 
	refseq = str_new_charstr(currRefseq->chars);
      else if (str_compare(refseq, currRefseq)!=0)
	die("Error: refseq not consistent in MAF (got %s, %s)\n",
	    refseq->chars, currRefseq->chars);
    }
    
    if (startcol != 1 || endcol != -1) 
      if (0 == mafBlock_trim(block, startcol, endcol, refseq, useRefseq ? 0 : lastIdx))
	goto get_next_block;

    currSize = mafBlock_get_size(block, refseq);
    if (useRefseq) {
      currStart = mafBlock_get_start(block, refseq);
      if (currStart < lastIdx && sortWarned == 0) {
	fprintf(stderr, "Warning: input MAF not sorted with respect to refseq.  Output files may not represent contiguous alignments. (%i, %i)\n", lastIdx, currStart);
	sortWarned = 1;
      }
    }
    else currStart = lastIdx;

    if (currStart < lastStart) gffSearchIdx = 0;
    lastStart = currStart;
    
    lastIdx = currStart + currSize;

    //split by length
    if (splitInterval != -1) {
      if (currLen == -1 || currLen+currSize > splitInterval) {
	sprintf(outfilename, splitFormat, out_root_fname, ++blockIdx,
		msa_suffix_for_format(output_format));
	if (output_format == MAF) {
	  if (outfile != NULL) mafBlock_close_outfile(outfile);
	  outfile = mafBlock_open_outfile(outfilename, argc, argv);
	}
	else if (output_format != MAF && msa != NULL) {
	  //	  msa_print_to_filename(msa, outfilename, output_format, pretty_print);
	  msa_free(msa);
	  msa = NULL;
	}
	currLen = 0;
      }
      currLen += currSize;
    }
    else outfile = stdout;
    if (gff != NULL && mask_features_spec != NULL) {
      gffSub = gff_subset_range_overlap_sorted(gff, currStart+1, lastIdx,
					       &gffSearchIdx);
      if (gffSub != NULL) {
	mafBlock_mask_region(block, gffSub, mask_features_spec);
	gff_free_set(gffSub);
      }
      mafBlock_print(outfile, block, pretty_print);


    } else if (gff != NULL) {
      gffSub = gff_subset_range_overlap_sorted(gff, currStart+1, lastIdx, 
					       &gffSearchIdx);
      if (gffSub != NULL) {
	if (by_category) gff_group_by_feature(gffSub);
	else if (group_tag != NULL) gff_group(gffSub, group_tag);
	gff_sort(gffSub);
	gff_flatten_within_groups(gffSub);
	for (i=0; i<lst_size(gffSub->features); i++) {
	  feat = (GFF_Feature*)lst_get_ptr(gffSub->features, i);
	  MafBlock *subBlock = mafBlock_copy(block);
	  mafBlock_trim(subBlock, feat->start, feat->end, refseq, 0);
	  if (by_category) 
	    outfile = get_outfile(outfileList, outfileHash, feat->feature, out_root_fname,
				  argc, argv);
	  else if (group_tag != NULL) 
	    outfile = get_outfile(outfileList, outfileHash, 
				  gff_group_name(gffSub, feat), out_root_fname,
				  argc, argv);
	  else outfile = stdout;
	  if (output_format == MAF)
	    mafBlock_print(outfile, subBlock, pretty_print);
	  //	  else msa_add_mafBlock(msa);
	  mafBlock_free(subBlock);
	}
	gff_free_set(gffSub);
      }
    }
    else {
      if (output_format == MAF) 
	mafBlock_print(outfile, block, pretty_print);
      //      else msa = msa_add_mafBlock(mafBlock, msa, );
    }
    
  get_next_block:
    mafBlock_free(block);
    block = mafBlock_read_next(mfile, NULL, NULL);
  }

  if (masked_file != NULL) fclose(masked_file);

  if (output_format == MAF) {
    if (by_category || group_tag != NULL)
      close_outfiles(outfileList, outfileHash);
    else if (outfile!=NULL) mafBlock_close_outfile(outfile);
  } else {
    msa_print(stdout, msa, output_format, pretty_print);
    msa_free(msa);
  }
  if (gff != NULL) gff_free_set(gff);
  phast_fclose(mfile);
  return 0;
}
/* Entry point for phyloFit: parses the command line into a
   phyloFit_struct, reads the alignment (from --msa or the first
   positional argument), and hands everything to run_phyloFit().

   Returns 0 on success; argument errors are reported through die(). */
int main(int argc, char *argv[]) {
  char *msa_fname = NULL, *alph = "ACGT";
  msa_format_type input_format = UNKNOWN_FORMAT;
  int c;   /* must be int: getopt_long returns int, and where plain char
              is unsigned a char can never compare equal to -1 */
  int opt_idx, seed=-1;
  String *optstr;
  List *tmplist = NULL; 
  struct phyloFit_struct *pf;
  FILE *infile;
  
  struct option long_opts[] = {
    {"msa", 1, 0, 'm'},
    {"tree", 1, 0, 't'},
    {"subst-mod", 1, 0, 's'},
    {"msa-format", 1, 0, 'i'},
    {"nrates", 1, 0, 'k'},
    {"alpha", 1, 0, 'a'},
    {"features", 1, 0, 'g'},
    {"catmap", 1, 0, 'c'},
    {"log", 1, 0, 'l'},
    {"out-root", 1, 0, 'o'},
    {"EM", 0, 0, 'E'},
    {"error", 1, 0, 'e'},
    {"precision", 1, 0, 'p'},
    {"do-cats", 1, 0, 'C'},
    {"non-overlapping", 0, 0, 'V'},
    {"markov", 0, 0, 'N'},
    {"reverse-groups", 1, 0, 'R'},
    {"init-model", 1, 0, 'M'},
    {"init-random", 0, 0, 'r'},
    {"init-parsimony", 0, 0, 'y'},
    {"print-parsimony", 1, 0, 'Y'},
    {"lnl", 0, 0, 'L'},
    {"scale-only", 0, 0, 'B'},
    {"scale-subtree", 1, 0, 'S'},
    {"estimate-freqs", 0, 0, 'F'},
    {"sym-freqs", 0, 0, 'W'},
    {"no-freqs", 0, 0, 'f'},
    {"no-rates", 0, 0, 'n'},
    {"no-opt", 1, 0, 'O'},
    {"min-informative", 1, 0, 'I'},
    {"gaps-as-bases", 0, 0, 'G'},     
    {"quiet", 0, 0, 'q'},
    {"help", 0, 0, 'h'},
    {"windows", 1, 0, 'w'},
    {"windows-explicit", 1, 0, 'v'},
    {"ancestor", 1, 0, 'A'},
    {"post-probs", 0, 0, 'P'},
    {"expected-subs", 0, 0, 'X'},
    {"expected-total-subs", 0, 0, 'Z'},
    {"expected-subs-col", 0, 0, 'J'},
    {"column-probs", 0, 0, 'U'},
    {"rate-constants", 1, 0, 'K'},
    {"ignore-branches", 1, 0, 'b'},
    {"clock", 0, 0, 'z'},
    {"alt-model", 1, 0, 'd'},
    {"label-branches", 1, 0, 0},
    {"label-subtree", 1, 0, 0},
    {"selection", 1, 0, 0},
    {"bound", 1, 0, 'u'},
    {"seed", 1, 0, 'D'},
    {0, 0, 0, 0}
  };

  // NOTE: remaining shortcuts left: HjQx

  pf = phyloFit_struct_new(0);

  /* NOTE(review): 'R' has no ':' in the short-option string although
     --reverse-groups takes an argument, so the short form -R leaves
     optarg NULL -- confirm whether "R:" was intended */
  while ((c = getopt_long(argc, argv, "m:t:s:g:c:C:i:o:k:a:l:w:v:M:p:A:I:K:S:b:d:O:u:Y:e:D:GVENRqLPXZUBFfnrzhWyJ", long_opts, &opt_idx)) != -1) {
    switch(c) {
    case 'm':
      msa_fname = optarg;
      break;
    case 't':
      if (optarg[0] == '(')        /* in this case, assume topology given
                                   at command line */
        pf->tree = tr_new_from_string(optarg);
      else 
        pf->tree = tr_new_from_file(phast_fopen(optarg, "r"));
      break;
    case 's':
      pf->subst_mod = tm_get_subst_mod_type(optarg);
      if (pf->subst_mod == UNDEF_MOD) 
        die("ERROR: illegal substitution model.     Type \"phyloFit -h\" for usage.\n");
      break;
    case 'g':
      pf->gff = gff_read_set(phast_fopen(optarg, "r"));
      break;
    case 'c':
      pf->cm = cm_new_string_or_file(optarg);
      break;
    case 'C':
      pf->cats_to_do_str = get_arg_list(optarg);
      break;
    case 'V':
      pf->nonoverlapping = TRUE;
      break;
    case 'o':
      pf->output_fname_root = optarg;
      break;
    case 'k':
      pf->nratecats = get_arg_int_bounds(optarg, 0, INFTY);
      break;
    case 'a':
      pf->alpha = get_arg_dbl(optarg);
      break;
    case 'R':
      pf->reverse_group_tag = optarg;
      break;
    case 'i':
      input_format = msa_str_to_format(optarg);
      if (input_format == UNKNOWN_FORMAT)
        die("ERROR: unrecognized alignment format.    Type 'phyloFit -h' for usage.\n");
      break;
    case 'l':
      /* "-" sends the log to stderr instead of a file */
      if (!strcmp(optarg, "-"))
        pf->logf = stderr;
      else pf->logf = phast_fopen(optarg, "w+");
      break;
    case 'N':
      pf->use_conditionals = 1;
      break;
    case 'w':
      /* expects exactly two integers: window size and window shift */
      tmplist = get_arg_list(optarg);
      if (lst_size(tmplist) != 2 ||
          str_as_int(lst_get_ptr(tmplist, 0), &(pf->window_size)) != 0 ||
          str_as_int(lst_get_ptr(tmplist, 1), &(pf->window_shift)) != 0) 
        die("ERROR: illegal arguments to --windows.\n");
      lst_free_strings(tmplist);
      lst_free(tmplist);
      break;
    case 'v':
      tmplist = get_arg_list(optarg);
      if (lst_size(tmplist) % 2 != 0) 
        die("ERROR: argument to --windows-explicit must be a list of even length.\n");
      pf->window_coords = str_list_as_int(tmplist);
      /* release the temporary strings as well as the list itself, as
         done for --windows and --rate-constants */
      lst_free_strings(tmplist);
      lst_free(tmplist);
      break;
    case 'E':
      pf->use_em = TRUE;
      break;
    case 'e':
      pf->error_fname=optarg;
      break;
    case 'p':
      if (!strcmp(optarg, "LOW")) pf->precision = OPT_LOW_PREC;
      else if (!strcmp(optarg, "MED")) pf->precision = OPT_MED_PREC;
      else if (!strcmp(optarg, "HIGH")) pf->precision = OPT_HIGH_PREC;
      else if (!strcmp(optarg, "VERY_HIGH")) pf->precision = OPT_VERY_HIGH_PREC;
      else die("ERROR: --precision must be LOW, MED, HIGH, or VERY_HIGH.\n\n");
      break;
    case 'M':
      pf->input_mod = tm_new_from_file(phast_fopen(optarg, "r"), 1);
      break;
    case 'r':
      pf->random_init = TRUE;
      break;
    case 'y':
      pf->init_parsimony = TRUE;
      break;
    case 'Y':
      pf->init_parsimony = TRUE;
      pf->parsimony_cost_fname = optarg;
      pf->parsimony_only=TRUE;
      break; 
    case 'L':
      pf->likelihood_only = TRUE;
      break;
    case 'q':
      pf->quiet = TRUE;
      break;
    case 'P':
      pf->do_bases = TRUE;
      break;
    case 'X':
      pf->do_expected_nsubst = TRUE;
      break;
    case 'Z':
      pf->do_expected_nsubst_tot = TRUE;
      break;
    case 'J':
      pf->do_expected_nsubst_col = TRUE;
      break;
    case 'U':
      pf->likelihood_only = TRUE;        /* force -L */
      pf->nsites_threshold = 0;        /* also force this; typical use is
                                   with small number of tuples, no
                                   tuple_idx */
      pf->do_column_probs = TRUE;
      break;
    case 'A':
      pf->root_seqname = optarg;
      break;
    case 'I':
      pf->nsites_threshold = get_arg_int(optarg);
      break;
    case 'G':
      pf->gaps_as_bases = TRUE;
      alph = "ACGT-";
      break;
    case 'B':
      pf->estimate_scale_only = TRUE;
      break;
    case 'S':
      pf->subtree_name = optarg;
      break;       
    case 'F':
      pf->estimate_backgd = TRUE;
      break;
    case 'W':
      pf->estimate_backgd = TRUE;
      pf->symfreq = TRUE;
      break;
    case 'f':
      pf->no_freqs = TRUE;
      break;
    case 'n':
      pf->no_rates = TRUE;
      break;
    case 'K':
      /* explicit rate constants also fix the number of rate categories
         and switch on EM */
      tmplist = get_arg_list(optarg);
      pf->rate_consts = str_list_as_dbl(tmplist);
      pf->nratecats = lst_size(pf->rate_consts);
      pf->use_em = 1;
      lst_free_strings(tmplist); lst_free(tmplist);
      break;
    case 'b':
      pf->ignore_branches = get_arg_list(optarg);
      break;
    case 'z':
      pf->assume_clock = TRUE;
      break;
    case 'O':
      if (pf->nooptstr == NULL) 
        pf->nooptstr = str_new_charstr(optarg);
      else die("ERROR: no-opt argument can only be used once!  parameters can be comma-separated list.");
      break;
    case 'd':
      /* may be given several times; each argument is appended */
      if (pf->alt_mod_str == NULL) {
        pf->alt_mod_str = lst_new_ptr(1);
      }
      optstr = str_new_charstr(optarg);
      lst_push_ptr(pf->alt_mod_str, optstr);
      break;
    case 0:
      /* long options without a short equivalent are told apart by name */
      if (strcmp(long_opts[opt_idx].name, "label-branches") == 0 ||
          strcmp(long_opts[opt_idx].name, "label-subtree") == 0) {
        optstr = str_new_charstr(optarg);
        if (pf->label_str == NULL) {
          pf->label_str = lst_new_ptr(3);
          pf->label_type = lst_new_int(3);
        }
        lst_push_ptr(pf->label_str, optstr);
        lst_push_int(pf->label_type, 
                     strcmp(long_opts[opt_idx].name, "label-branches") == 0 ? 
                     BRANCH_TYPE : SUBTREE_TYPE);
      }
      else if (strcmp(long_opts[opt_idx].name, "selection") == 0) {
        pf->selection = get_arg_dbl(optarg);
        pf->use_selection = TRUE;
      }
      else {
        die("ERROR: unknown option.  Type 'phyloFit -h' for usage.\n");
      }
      break;
    case 'u':
      if (pf->bound_arg == NULL) 
        pf->bound_arg = lst_new_ptr(1);
      optstr = str_new_charstr(optarg);
      lst_push_ptr(pf->bound_arg, optstr);
      break;
    case 'D':
      seed = get_arg_int_bounds(optarg, 1, INFTY);
      break;
    case 'h':
      printf("%s", HELP);
      exit(0);
    case '?':
      die("ERROR: illegal argument.     Type 'phyloFit -h' for usage.\n");
    }
  }

  set_seed(seed);

  /* without --msa, the alignment is the first positional argument */
  if (msa_fname == NULL) {
    if (optind >= argc) 
      die("ERROR: missing alignment filename.  Type 'phyloFit -h' for usage.\n");
    msa_fname = argv[optind];
    pf->msa_fname = msa_fname;
  }

  infile = phast_fopen(msa_fname, "r");

  if (input_format == UNKNOWN_FORMAT)
    input_format = msa_format_for_content(infile, 1);

  if (pf->nonoverlapping && (pf->use_conditionals || pf->gff != NULL || 
                             pf->cats_to_do_str || input_format == SS))
    die("ERROR: cannot use --non-overlapping with --markov, --features,\n--msa-format SS, or --do-cats.\n");


  /* read alignment */
  if (!pf->quiet) fprintf(stderr, "Reading alignment from %s ...\n", msa_fname);
  if (input_format == MAF) {
    pf->msa = maf_read(infile, NULL, 
                       tm_order(pf->subst_mod) + 1, 
                       NULL, pf->gff, pf->cm, 
                       pf->nonoverlapping ? tm_order(pf->subst_mod) + 1 : -1, 
                       FALSE, pf->reverse_group_tag, NO_STRIP, FALSE);
    if (pf->gaps_as_bases) 
      msa_reset_alphabet(pf->msa, alph);
  }
  else 
    pf->msa = msa_new_from_file_define_format(infile, 
                                input_format, alph);

  /* set up for categories */
  /* first label sites, if necessary */
  pf->label_categories = (input_format != MAF);

  run_phyloFit(pf);

  /* only close a log file this function opened itself */
  if (pf->logf != NULL && pf->logf != stderr && pf->logf != stdout)
    phast_fclose(pf->logf);
  if (!pf->quiet) fprintf(stderr, "Done.\n");
  sfree(pf);
  
  return 0;
}
/* R-callable wrapper around run_phyloFit().

   Builds a phyloFit_struct from the R arguments, calls run_phyloFit(),
   and returns pf->results converted with rph_listOfLists_to_SEXP().
   Arguments compared against R_NilValue below are optional; the others
   are read unconditionally.

   msaP                 external pointer to the MSA to fit
   treeStrP             optional; passed to rph_tree_new()
   substModP            passed to rph_get_subst_mod()
   scaleOnlyP           logical -> pf->estimate_scale_only
   scaleSubtreeP        optional string, copied into pf->subtree_name
   nratesP              optional integer number of rate categories
   alphaP               numeric alpha (optional only when initModP is nil)
   rateConstantsP       optional numeric vector of rate constants
   initModP             optional external pointer to an initial TreeModel;
                        when given it is re-initialized with tm_reinit()
   initBackgdFromDataP  logical -> pf->init_backgd_from_data
   initRandomP          logical -> pf->random_init
   initParsimonyP       logical -> pf->init_parsimony
   clockP               logical -> pf->assume_clock
   emP                  logical -> pf->use_em
   maxEmItsP            optional integer -> pf->max_em_its
   precisionP           string parsed by get_precision()
   gffP                 optional external pointer to a GFF_Set
   ninfSitesP           optional integer -> pf->nsites_threshold
   quietP               logical -> pf->quiet
   noOptP               optional character vector, joined with ',' into
                        pf->nooptstr
   boundP               optional character vector, one String per element
                        pushed onto pf->bound_arg
   logFileP             optional; a string is opened as a log file, a TRUE
                        logical selects stdout
   selectionP           optional numeric; enables pf->use_selection

   Returns R_NilValue if run_phyloFit() is never reached.  An unrecognized
   precision string is reported through die() after the RNG state has been
   written back. */
SEXP rph_phyloFit(SEXP msaP,
                  SEXP treeStrP,
                  SEXP substModP,
                  SEXP scaleOnlyP,
                  SEXP scaleSubtreeP,
                  SEXP nratesP,
                  SEXP alphaP,
                  SEXP rateConstantsP,
                  SEXP initModP,
                  SEXP initBackgdFromDataP,
                  SEXP initRandomP,
                  SEXP initParsimonyP,
                  SEXP clockP,
                  SEXP emP,
                  SEXP maxEmItsP,
                  SEXP precisionP,
                  SEXP gffP,
                  SEXP ninfSitesP,
                  SEXP quietP,
                  SEXP noOptP,
                  SEXP boundP,
                  SEXP logFileP,
                  SEXP selectionP) {
  struct phyloFit_struct *pf;
  int numProtect=0, i;
  double *doubleP;
  const char *die_message=NULL;
  SEXP rv=R_NilValue;
  List *new_rate_consts = NULL;
  List *new_rate_weights = NULL;

  GetRNGstate(); /* paired with PutRNGstate() at rph_phyloFit_end */
  pf = phyloFit_struct_new(1);  /* sets appropriate defaults for RPHAST mode */

  pf->msa = (MSA*)EXTPTR_PTR(msaP);

  if (treeStrP != R_NilValue)
    pf->tree = rph_tree_new(treeStrP);

  pf->use_em = LOGICAL_VALUE(emP);

  /* copy user-supplied rate constants into a PHAST list */
  if (rateConstantsP != R_NilValue) {
    PROTECT(rateConstantsP = AS_NUMERIC(rateConstantsP));
    numProtect++;
    doubleP = NUMERIC_POINTER(rateConstantsP);
    new_rate_consts = lst_new_dbl(LENGTH(rateConstantsP));
    for (i=0; i < LENGTH(rateConstantsP); i++)
      lst_push_dbl(new_rate_consts, doubleP[i]);
  }

  if (initModP != R_NilValue) {
    pf->input_mod = (TreeModel*)EXTPTR_PTR(initModP);
    /* note: pf->subst_mod is assigned again, unconditionally, after this
       if/else */
    pf->subst_mod = pf->input_mod->subst_mod;
    tm_register_protect(pf->input_mod);

    /* no explicit rate constants given: reuse those of the initial model */
    if (new_rate_consts == NULL && pf->input_mod->rK != NULL && pf->input_mod->nratecats > 1) {
      new_rate_consts = lst_new_dbl(pf->input_mod->nratecats);
      for (i=0; i < pf->input_mod->nratecats; i++)
        lst_push_dbl(new_rate_consts, pf->input_mod->rK[i]);
    }

    /* carry over the rate weights of a model flagged empirical_rates */
    if (pf->input_mod->empirical_rates && pf->input_mod->freqK != NULL && pf->input_mod->nratecats > 1) {
      new_rate_weights = lst_new_dbl(pf->input_mod->nratecats);
      for (i=0; i < pf->input_mod->nratecats; i++)
        lst_push_dbl(new_rate_weights, pf->input_mod->freqK[i]);
    }

    /* NOTE(review): alphaP is read here without a nil check, unlike in
       the else branch -- confirm callers always supply it with initModP */
    tm_reinit(pf->input_mod,
              rph_get_subst_mod(substModP),
              nratesP == R_NilValue ? pf->input_mod->nratecats : INTEGER_VALUE(nratesP),
              NUMERIC_VALUE(alphaP),
              new_rate_consts,
              new_rate_weights);
  } else {
    if (nratesP != R_NilValue)
      pf->nratecats = INTEGER_VALUE(nratesP);
    if (alphaP != R_NilValue)
      pf->alpha = NUMERIC_VALUE(alphaP);
    if (rateConstantsP != R_NilValue) {
      pf->rate_consts = new_rate_consts;
      /* number of categories defaults to the number of constants given;
         an explicit nrates must agree with it */
      if (nratesP == R_NilValue)
        pf->nratecats = lst_size(new_rate_consts);
      else if (lst_size(new_rate_consts) != pf->nratecats)
        die("length of new_rate_consts does not match nratecats\n");
    }
  }
  pf->subst_mod = rph_get_subst_mod(substModP);

  pf->estimate_scale_only = LOGICAL_VALUE(scaleOnlyP);

  if (scaleSubtreeP != R_NilValue) {
    pf->subtree_name = smalloc((1+strlen(CHARACTER_VALUE(scaleSubtreeP)))*sizeof(char));
    strcpy(pf->subtree_name, CHARACTER_VALUE(scaleSubtreeP));
  }

  pf->random_init = LOGICAL_VALUE(initRandomP);

  pf->init_backgd_from_data = LOGICAL_VALUE(initBackgdFromDataP);

  pf->init_parsimony = LOGICAL_VALUE(initParsimonyP);

  pf->assume_clock = LOGICAL_VALUE(clockP);

  if (maxEmItsP != R_NilValue)
    pf->max_em_its = INTEGER_VALUE(maxEmItsP);

  pf->precision = get_precision(CHARACTER_VALUE(precisionP));
  if (pf->precision == OPT_UNKNOWN_PREC) {
    /* defer the die() so the RNG state is written back first */
    die_message = "invalid precision";
    goto rph_phyloFit_end;
  }

  if (gffP != R_NilValue) {
    pf->gff = (GFF_Set*)EXTPTR_PTR(gffP);
    gff_register_protect(pf->gff);
  }

  if (ninfSitesP != R_NilValue)
    pf->nsites_threshold = INTEGER_VALUE(ninfSitesP);

  pf->quiet = LOGICAL_VALUE(quietP);

  /* join the elements of noOptP into one comma-separated string */
  if (noOptP != R_NilValue) {
    /* len starts at the element count: that covers the (count - 1)
       separating commas plus the terminating NUL */
    int len=LENGTH(noOptP), pos=0;
    char *temp;
    for (i=0; i < LENGTH(noOptP); i++)
      len += strlen(CHARACTER_VALUE(STRING_ELT(noOptP, i)));
    temp = smalloc(len*sizeof(char));
    for (i=0; i < LENGTH(noOptP); i++) {
      if (i != 0) temp[pos++] = ',';
      sprintf(&temp[pos], "%s", CHARACTER_VALUE(STRING_ELT(noOptP, i)));
      pos += strlen(CHARACTER_VALUE(STRING_ELT(noOptP, i)));
    }
    /* sanity check; also trips for a zero-length vector (len=0, pos=0) */
    if (pos != len-1) die("ERROR parsing noOpt len=%i pos=%i\n", len, pos);
    temp[pos] = '\0';
    pf->nooptstr = str_new_charstr(temp);
  }

  if (boundP != R_NilValue) {
    pf->bound_arg = lst_new_ptr(LENGTH(boundP));
    for (i=0; i < LENGTH(boundP); i++) {
      String *temp = str_new_charstr(CHARACTER_VALUE(STRING_ELT(boundP, i)));
      lst_push_ptr(pf->bound_arg, temp);
    }
  }

  /* a character value names a log file; a TRUE logical logs to stdout */
  if (logFileP != R_NilValue) {
    if (IS_CHARACTER(logFileP))
      pf->logf = phast_fopen(CHARACTER_VALUE(logFileP), "w+");
    else if (IS_LOGICAL(logFileP) &&
             LOGICAL_VALUE(logFileP)) {
      pf->logf = stdout;
    }
  }

  if (selectionP != R_NilValue) {
    pf->use_selection = TRUE;
    pf->selection = NUMERIC_VALUE(selectionP);
  }

  msa_register_protect(pf->msa);

  run_phyloFit(pf);
  rv = PROTECT(rph_listOfLists_to_SEXP(pf->results));
  numProtect++;

 rph_phyloFit_end:
  /* only close a log stream opened above, never stdout/stderr */
  if (pf->logf != NULL && pf->logf != stdout && pf->logf != stderr)
    phast_fclose(pf->logf);
  PutRNGstate();
  /* pass the message as an argument, not as the format string */
  if (die_message != NULL) die("%s", die_message);
  if (numProtect > 0)
    UNPROTECT(numProtect);
  return rv;
}
Exemple #22
0
/* Entry point of the motif-finding program.

   Parses the command line, reads the alignments named by the single
   positional argument (a list of file names), loads or fits a background
   model, calls mtf_find(), and writes the resulting motifs to stdout and,
   on request, to HTML and BED files.

   Options, as handled by the switch below:
     -t FILE   tree file (tr_new_from_file)
     -i FMT    alignment format; dies if unrecognized
     -b FILE   background tree model (tm_new_from_file)
     -s        accepted, has no effect
     -k N      value for `size`
     -m        meme_mode
     -d LIST   names of positive-example alignments
     -p        profile_mode
     -n N      number of restarts
     -I LIST   initialization strings
     -P LIST   two integers: nmostprevalent, tuple_size (both > 0)
     -R LIST   two integers: nsamples, tuple_size (both > 0)
     -c N      number of pseudocounts
     -w N      keep only the N best start strings (mtf_winnow_starts)
     -S        sample_parms
     -B N      number of motifs
     -o STR    output file prefix ('.' is appended; default "phastm.")
     -H        write HTML output
     -D        write BED output
     -x        suppress motif output on stdout
     -h        print usage
   -I, -P and -R are mutually exclusive.

   Returns 0 on completion; errors are reported through die(). */
int main(int argc, char *argv[]) {
  TreeNode *tree = NULL;
  TreeModel *backgd_mod = NULL;
  int i, j,
    size = DEFAULT_SIZE, meme_mode = 0, profile_mode = 0,
    nrestarts = 10, npseudocounts = 5, nsamples = -1,
    nmostprevalent = -1, tuple_size = -1, nbest = -1, sample_parms = 0,
    nmotifs = DEFAULT_NUMBER, nseqs = -1, do_html = 0, do_bed = 0,
    suppress_stdout = 0;
  List *msa_name_list = NULL, *pos_examples = NULL, *init_list = NULL, *tmpl;
  List *msas, *motifs;
  SeqSet *seqset = NULL;
  PooledMSA *pmsa = NULL;
  msa_format_type msa_format = UNKNOWN_FORMAT;
  Vector *backgd_mnmod = NULL;
  Hashtable *hash=NULL;
  String *output_prefix = str_new_charstr("phastm.");
  double *has_motif = NULL;
  double prior = PRIOR;
  /* getopt() returns int; holding it in a plain char makes the != -1
     test always true on platforms where char is unsigned */
  int c;
  GFF_Set *bedfeats = NULL;

  /* NOTE(review): streams passed inline to tr_new_from_file /
     tm_new_from_file below are not closed in this function -- confirm
     whether the callee closes them */
  while ((c = getopt(argc, argv, "t:i:b:sk:md:pn:I:R:P:w:c:SB:o:HDxh")) != -1) {
    switch (c) {
    case 't':
      tree = tr_new_from_file(phast_fopen(optarg, "r"));
      break;
    case 'i':
      msa_format = msa_str_to_format(optarg);
      if (msa_format == UNKNOWN_FORMAT)
        die("ERROR: bad input format.\n");
      break;
    case 'b':
      backgd_mod = tm_new_from_file(phast_fopen(optarg, "r"), 1);
      break;
    case 's':
      break;
    case 'k':
      size = get_arg_int(optarg);
      break;
    case 'm':
      meme_mode = 1;
      break;
    case 'd':
      pos_examples = get_arg_list(optarg);
      break;
    case 'p':
      profile_mode = 1;
      break;
    case 'n':
      nrestarts = get_arg_int(optarg);
      break;
    case 'I':
      init_list = get_arg_list(optarg);
      break;
    case 'P':
      tmpl = str_list_as_int(get_arg_list(optarg));
      if (lst_size(tmpl) != 2) die("ERROR: bad argument to -P.\n");
      nmostprevalent = lst_get_int(tmpl, 0);
      tuple_size = lst_get_int(tmpl, 1);
      if (!(nmostprevalent > 0 && tuple_size > 0))
        die("ERROR: bad argument nmostprevalent=%i tuple_size=%i\n",
            nmostprevalent, tuple_size);
      lst_free(tmpl);
      break;
    case 'R':
      tmpl = str_list_as_int(get_arg_list(optarg));
      if (lst_size(tmpl) != 2) die("ERROR: bad argument to -R.\n");
      nsamples = lst_get_int(tmpl, 0);
      tuple_size = lst_get_int(tmpl, 1);
      if (!(nsamples > 0 && tuple_size > 0))
        die("ERROR nsamples=%i tuple_sizse=%i\n", nsamples, tuple_size);
      lst_free(tmpl);
      break;
    case 'c':
      npseudocounts = get_arg_int(optarg);
      break;
    case 'w':
      nbest = get_arg_int(optarg);
      break;
    case 'S':
      sample_parms = 1;
      break;
    case 'B':
      nmotifs = get_arg_int(optarg);
      break;
    case 'o':
      str_free(output_prefix);
      output_prefix = str_new_charstr(optarg);
      str_append_char(output_prefix, '.');
      break;
    case 'H':
      do_html = 1;
      break;
    case 'D':
      do_bed = 1;
      break;
    case 'x':
      suppress_stdout = 1;
      break;
    case 'h':
      usage(argv[0]);
      /* fall through -- presumably usage() does not return; verify */
    case '?':
      die("Bad argument.  Try '%s -h'.\n", argv[0]);
    }
  }

  /* exactly one positional argument is expected: the alignment list */
  if (optind != argc - 1)
    die("ERROR: List of alignment files required.  Try '%s -h'.\n", argv[0]);

  if ((nsamples > 0 && nmostprevalent > 0) ||
      (nsamples > 0 && init_list != NULL) ||
      (nmostprevalent > 0 && init_list != NULL))
    die("ERROR: -I, -P, and -R are mutually exclusive.");

  set_seed(-1);

  msa_name_list = get_arg_list(argv[optind]);

  /* fall back on the tree stored in the background model */
  if (backgd_mod != NULL && tree == NULL) tree = backgd_mod->tree;

  if (tree == NULL && !meme_mode && !profile_mode)
    die("ERROR: Must specify -t, -m, or -p.\n");

  /* with explicit start strings, use a single restart unless -S is given */
  if ((init_list != NULL || nsamples > 0 || nmostprevalent > 0) &&
      !sample_parms)
    nrestarts = 1;

  /* record the names of the positive examples for lookup while reading */
  if (pos_examples != NULL) {
    hash = hsh_new(lst_size(pos_examples));
    for (i = 0; i < lst_size(pos_examples); i++)
      hsh_put_int(hash, ((String*)lst_get_ptr(pos_examples, i))->chars, 1);
    has_motif = smalloc(lst_size(msa_name_list) * sizeof(double));
  }

  /* open all MSAs */
  msas = lst_new_ptr(lst_size(msa_name_list));
  fprintf(stderr, "Reading alignment(s) ...\n");
  /* i indexes input files; j indexes entries written to has_motif */
  for (i = 0, j = 0; i < lst_size(msa_name_list); i++) {
    String *name = lst_get_ptr(msa_name_list, i);
    FILE *mfile = phast_fopen(name->chars, "r");
    msa_format_type temp_format;
    MSA *msa;
    if (msa_format == UNKNOWN_FORMAT)
      temp_format = msa_format_for_content(mfile, 1);
    else temp_format = msa_format;
    msa = msa_new_from_file_define_format(mfile, temp_format, NULL);
    phast_fclose(mfile);
    /* the first alignment read fixes the expected number of sequences */
    if (nseqs == -1) nseqs = msa->nseqs;
    /* outside meme mode, skip alignments with fewer than 300 ungapped
       columns or with a different number of sequences (the same warning
       text is printed for both reasons) */
    if (!meme_mode &&
        (msa->length - msa_num_gapped_cols(msa, STRIP_ANY_GAPS, -1, -1) < 300 ||
        msa->nseqs != nseqs)) {
      fprintf(stderr, "WARNING: ignoring alignment '%s' -- too few informative sites.\n", name->chars);
      msa_free(msa);
      continue;
    }

    if (msa_alph_has_lowercase(msa)) msa_toupper(msa);
    msa_remove_N_from_alph(msa); /* Ns can be a problem */
    lst_push_ptr(msas, msa);
    if (has_motif != NULL) {
      /* hm is 1 if this alignment was listed with -d, else 0 */
      int k, hm = (hsh_get_int(hash, name->chars) == 1);
      if (meme_mode) {          /* here need to record at individ seq level */
        has_motif = srealloc(has_motif,
                             (j + msa->nseqs + 1) * sizeof(double)); /* FIXME */
        for (k = 0; k < msa->nseqs; k++) has_motif[j++] = hm;
      }
      else has_motif[j++] = hm;
    }
  }
  if (!meme_mode) {
    fprintf(stderr, "Extracting and pooling sufficient statistics ...\n");
    pmsa = ss_pooled_from_msas(msas, 1, size, NULL, 0);
    msa_remove_N_from_alph(pmsa->pooled_msa);
  }

  /* obtain individual sequences, if necessary */
  if (nmostprevalent > 0 || nsamples > 0 || meme_mode) {
    if (meme_mode) fprintf(stderr, "Converting to individual sequences ...\n");
    else fprintf(stderr, "Obtaining reference sequences for pre-processing ...\n");
    seqset = mtf_get_seqset(msas, meme_mode ? -1 : 1, 10 * size);
                                /* for now, assume 1st seq is reference */
    msa_remove_N_from_alph(seqset->set);
  }

  /* build the list of start strings requested with -P or -R */
  if (nmostprevalent > 0) {
    fprintf(stderr, "Obtaining %d most prevalent %d-tuples ...\n",
            nmostprevalent, tuple_size);
    init_list = lst_new_ptr(nmostprevalent);
    mtf_get_common_ntuples(seqset, init_list, tuple_size, nmostprevalent);
  }
  else if (nsamples > 0) {
    fprintf(stderr, "Sampling %d %d-tuples ...\n", nsamples, tuple_size);
    init_list = lst_new_ptr(nsamples);
    mtf_sample_ntuples(seqset, init_list, tuple_size, nsamples);
  }

  /* in meme_mode, backgd model can be specified as eq freqs in a .mod file */
  if (meme_mode && backgd_mod != NULL && has_motif == NULL)
    backgd_mnmod = backgd_mod->backgd_freqs;

  /* estimate background model, if necessary */
  else if (backgd_mod == NULL && (!meme_mode || has_motif == NULL)) {
    fprintf(stderr, "Fitting background model%s ...\n",
            has_motif == NULL ? "" : " (for use in initialization)");
                                /* if discriminative, be clear
                                   backgd isn't really part of the
                                   estimation procedure */
    if (meme_mode) {
      backgd_mnmod = vec_new(strlen(seqset->set->alphabet));
      mtf_estim_backgd_mn(seqset, backgd_mnmod);
    }
    else {
      backgd_mod = tm_new(tr_create_copy(tree), NULL, NULL, F81,
                          pmsa->pooled_msa->alphabet, 1, 0, NULL, -1);
      tm_fit(backgd_mod, pmsa->pooled_msa,
             tm_params_init(backgd_mod, .1, 5, 0),
             -1, OPT_MED_PREC, NULL, 0, NULL);
    }
  }

  /* select subset of init strings, if necessary */
  if (nbest > 0 && init_list != NULL) {
    fprintf(stderr, "Winnowing candidate start strings ...\n");
    tmpl = lst_new_ptr(nbest);
    mtf_winnow_starts(meme_mode ? (void*)seqset : (void*)pmsa,
                      init_list, nbest, tmpl, !meme_mode, size, tree,
                      meme_mode ? (void*)backgd_mnmod : (void*)backgd_mod,
                      has_motif);
    lst_free(init_list);
    init_list = tmpl;
  }

  /* Now find motifs */
  motifs = mtf_find(meme_mode ? (void*)seqset : (void*)pmsa,
                    !meme_mode, size, nmotifs, tree,
                    meme_mode ? (void*)backgd_mnmod : (void*)backgd_mod,
                    has_motif, prior, nrestarts, init_list, sample_parms,
                    npseudocounts);

  fprintf(stderr, "\n\n");
  if (do_bed)
    bedfeats = gff_new_set_init("phast_motif", "0.1b");

  /* generate output */
  /* NOTE(review): the output streams opened inline below are not closed in
     this function -- confirm whether the print routines close them */
  for (i = 0; i < lst_size(motifs); i++) {
    Motif *m = lst_get_ptr(motifs, i);

    if (!suppress_stdout) {
      if (lst_size(motifs) > 1)
        printf("\n**********\nMOTIF #%d\n**********\n\n", i+1);

      mtf_print(stdout, m);
    }

    /* one HTML page per motif: <prefix><motif number>.html */
    if (do_html) {
      String *fname = str_dup(output_prefix);
      str_append_int(fname, i+1);
      str_append_charstr(fname, ".html");
      mtf_print_html(phast_fopen(fname->chars, "w+"), m);
      str_free(fname);
    }

    if (do_bed)
      mtf_add_features(m, bedfeats);
  }
  /* summary page: <prefix>index.html */
  if (do_html) {
    String *fname = str_dup(output_prefix);
    str_append_charstr(fname, "index.html");
    mtf_print_summary_html(phast_fopen(fname->chars, "w+"),
                           motifs, output_prefix);
    str_free(fname);
  }
  /* all motif features in one file: <prefix>bed */
  if (do_bed) {
    String *fname = str_dup(output_prefix);
    str_append_charstr(fname, "bed");
    gff_print_bed(phast_fopen(fname->chars, "w+"),
                  bedfeats, FALSE);
    str_free(fname);
  }

  return 0;
}