Esempio n. 1
0
SEXP rph_tree_scale(SEXP treeStr, SEXP scaleP, SEXP nodeStr,
		    SEXP includeLeadingP) {
  TreeNode *tr = rph_tree_new(treeStr);
  double scale = NUMERIC_VALUE(scaleP);
  char *newTreeStr;
  SEXP result;

  if (nodeStr != R_NilValue) {
    TreeNode *n;
    int includeLeading=LOGICAL_VALUE(includeLeadingP);
    n = tr_get_node(tr, CHARACTER_VALUE(nodeStr));
    if (n == NULL) {
      tr_name_ancestors(tr);
      n = tr_get_node(tr, CHARACTER_VALUE(nodeStr));
      if (n == NULL)
	die("No node named %s in %s\n", CHARACTER_VALUE(nodeStr),
	    CHARACTER_VALUE(treeStr));
    }
    tr_scale_subtree(tr, n, scale, includeLeading);
  }
  else tr_scale(tr, scale);
  newTreeStr = tr_to_string(tr, 1);
  PROTECT(result = NEW_CHARACTER(1));
  SET_STRING_ELT(result, 0, mkChar(newTreeStr));
  UNPROTECT(1);
  return result;
}
Esempio n. 2
0
SEXP rph_tree_subtree(SEXP treeStr, SEXP nodeStr) {
  TreeNode *tr = rph_tree_new(treeStr);
  TreeNode *n;
  char *newTreeStr;
  SEXP result;
  n = tr_get_node(tr, CHARACTER_VALUE(nodeStr));
  if (n == NULL) {
    tr_name_ancestors(tr);
    n = tr_get_node(tr, CHARACTER_VALUE(nodeStr));
    if (n == NULL)
      die("No node named %s", CHARACTER_VALUE(nodeStr));
  }
  tr_prune_supertree(&tr, n);
  newTreeStr = tr_to_string(tr, 1);
  PROTECT(result = NEW_CHARACTER(1));
  SET_STRING_ELT(result, 0, mkChar(newTreeStr));
  UNPROTECT(1);
  return result;
}
Esempio n. 3
0
SEXP rph_tree_depth(SEXP treeP, SEXP nodeP) {
  TreeNode *tr = rph_tree_new(treeP), *node;
  SEXP rv;

  node = tr_get_node(tr, CHARACTER_VALUE(nodeP));
  if (node == NULL)
    die("no node named %s", CHARACTER_VALUE(nodeP));
  PROTECT(rv = NEW_NUMERIC(1));
  REAL(rv)[0] = tr_distance_to_root(node);
  UNPROTECT(1);
  return rv;
}
/* reconstruct indels by parsimony and assign all base probs to -1
   where ancestral bases are inferred not to have been present */
void do_indels(MSA *msa, TreeModel *mod) {
  int s, tup, i, j;
  TreeNode *n, *lca;
  char c;
  typedef enum {IGNORE, GAP, BASE, MISSING, AMBIG} label_type;
  List *postorder;

  label_type *label = smalloc(mod->tree->nnodes * sizeof(label_type));
  List *inside = lst_new_ptr(mod->tree->nnodes), 
    *outside = lst_new_ptr(mod->tree->nnodes),
    *ambig_cases = lst_new_ptr(mod->tree->nnodes);
  int *seq_to_leaf = smalloc(msa->nseqs * sizeof(int));

  /* build mapping from seqs to leaf indices in tree */
  for (s = 0; s < msa->nseqs; s++) {
    TreeNode *n = tr_get_node(mod->tree, msa->names[s]);
    if (n == NULL)
      die("ERROR: no match for sequence \"%s\" in tree.\n", msa->names[s]);
    seq_to_leaf[s] = n->id;
  }    

  if (mod->msa_seq_idx == NULL)
    tm_build_seq_idx(mod, msa);

  postorder = tr_postorder(mod->tree);

  for (tup = 0; tup < msa->ss->ntuples; tup++) {
    int min = mod->tree->nnodes, max = -1, ngaps = 0, skip_root = FALSE;

    /* find min and max ids of seqs that actually have bases (non-gaps) */
    for (s = 0; s < msa->nseqs; s++) {
      if (ss_get_char_tuple(msa, tup, s, 0) == GAP_CHAR) {
        ngaps++;
        continue;
      }
      if (seq_to_leaf[s] < min) min = seq_to_leaf[s];
      if (seq_to_leaf[s] > max) max = seq_to_leaf[s];

      /* NOTE: missing data being handled like bases here; in some
         cases, a base may be inferred at an ancestral node, when the
         only evidence for it is missing data in the leaves.  There
         are ambiguous cases; we'll err on the side of predicting
         bases rather than indels */
    }

    if (ngaps <= 1) continue;	/* short cut -- impossible to infer
                                   gaps in ancestors */

    else if (ngaps >= msa->nseqs - 1) {
      /* in this case, all ancestors must be gaps */
      for (i = 0; i < mod->tree->nnodes; i++) {
        n = lst_get_ptr(mod->tree->nodes, i);
        if (n->lchild == NULL || n->rchild == NULL) 
          continue;               /* ignore leaves */
        for (j = 0; j < mod->rate_matrix->size; j++)
          mod->tree_posteriors->base_probs[0][j][n->id][tup] = -1;
	/* mark as gap */
      }
      continue;
    }

    if (min < 0) die("prequel.c: min = %e < 0\n", min);
    if (max < min) die("prequel.c: max (%e) < min (%e)", max, min);

    /* the LCA of all leaves with non-gaps must be the first ancestor of
       the node with the max id that has an id smaller than the min
       id.  This is based on the assumption that node ids are assigned
       sequentially in a preorder traversal of the tree, which will be
       true as long as the tree is read from a Newick file by the code
       in trees.c */
    for (lca = lst_get_ptr(mod->tree->nodes, max); lca->id > min; 
         lca = lca->parent);

    /* by parsimony, the base was inserted on the branch to the LCA,
       and all ancestral nodes outside the subtree rooted at the LCA
       did not have bases */

    if (lca == mod->tree->lchild || lca == mod->tree->rchild)
      skip_root = TRUE;        /* don't mark root as gap in this case:
                                  can't distinguish insertion from
                                  deletion so assume deletion */

    /* mark ancestral bases outside subtree beneath LCA as gaps */
    tr_partition_nodes(mod->tree, lca, inside, outside);
    for (i = 0; i < mod->tree->nnodes; i++) label[i] = BASE;
    for (i = 0; i < lst_size(outside); i++) {
      n = lst_get_ptr(outside, i);
      label[n->id] = IGNORE;
      if (n->lchild == NULL || n->rchild == NULL) 
        continue;               /* skip leaves */
      if (n == mod->tree && skip_root) 
        continue;               /* skip root if condition above */
      for (j = 0; j < mod->rate_matrix->size; j++)
        mod->tree_posteriors->base_probs[0][j][n->id][tup] = -1;
      /* mark as gap */
    }

    /* check for gaps in subtree; if there's at most one, we can go
       on; otherwise have to use parsimony to infer history in subtree */
    ngaps = 0;
    for (i = 0; i < lst_size(inside); i++) {
      n = lst_get_ptr(inside, i);
      if (n->lchild == NULL &&
          ss_get_char_tuple(msa, tup, mod->msa_seq_idx[n->id], 0) == GAP_CHAR)
        ngaps++;
    }
    if (ngaps <= 1) continue;

    /* use Dollo parsimony to infer the indel history of the subtree
       beneath the LCA.  Use the fact that every base must have a
       chain of bases to the LCA, because, assuming the alignment is
       correct, no insertions are possible beneath the LCA */
    lst_clear(ambig_cases);
    for (i = 0; i < lst_size(postorder); i++) {
      n = lst_get_ptr(postorder, i);
      if (label[n->id] == IGNORE) continue; /* outside subtree */

      /* MISSING means all leaves beneath node have missing data */
      /* AMBIG means combination of gaps and missing data beneath node */

      else if (n->lchild == NULL) {  /* leaf in subtree */
        c = ss_get_char_tuple(msa, tup, mod->msa_seq_idx[n->id], 0);
        if (c == GAP_CHAR)
          label[n->id] = GAP;
        else if (msa->is_missing[(int)c]) 
          label[n->id] = MISSING;
        else
          label[n->id] = BASE;
      }
      else {                    /* internal node in subtree */
        if (label[n->lchild->id] == BASE || label[n->rchild->id] == BASE)
          label[n->id] = BASE;  /* by Dollo parsimony */
        else if ((label[n->lchild->id] == GAP || label[n->lchild->id] == AMBIG) &&
                 (label[n->rchild->id] == GAP || label[n->rchild->id] == AMBIG))
          label[n->id] = GAP;   /* gaps from both sides and no bases -- must be gap */
        else if (label[n->lchild->id] == MISSING && label[n->rchild->id] == MISSING)
          label[n->id] = MISSING;
        else {              /* must be GAP/MISSING or AMBIG/MISSING */
          label[n->id] = AMBIG;
          lst_push_ptr(ambig_cases, n);
        }
      }
    }

    /* now resolve any ambiguities, by giving each ambiguous node the same
       label as its parent; traversing ambig_cases in reverse order
       ensures that parents are visited before children  */

    /* first make sure root of subtree has a base */
    if (label[lca->id] == MISSING || label[lca->id] == AMBIG)
      label[lca->id] = BASE;
    /* in this case there is all missing data and gaps beneath the LCA;
       hard to know what is right, but let's force a base and err on
       the side of bases rather than gaps */

    for (i = lst_size(ambig_cases) - 1; i >= 0; i--) {
      n = lst_get_ptr(ambig_cases, i);
      if (n == lca) continue;
      else label[n->id] = label[n->parent->id];
    }

    /* now mark gaps inside subtree, as needed */
    for (i = 0; i < lst_size(inside); i++) {
      n = lst_get_ptr(inside, i);
      if (n->lchild == NULL || n->rchild == NULL) continue;
      if (label[n->id] == GAP) 
        for (j = 0; j < mod->rate_matrix->size; j++)
          mod->tree_posteriors->base_probs[0][j][n->id][tup] = -1;
    }
  }

  lst_free(inside);
  lst_free(outside);
  lst_free(ambig_cases);
  sfree(seq_to_leaf);
  sfree(label);
}
Esempio n. 5
0
int main(int argc, char *argv[]) {
  /* variables for options, with defaults */
  TreeNode *tree = NULL, *merge_tree = NULL, *extrapolate_tree = NULL;
  Hashtable *rename_hash = NULL;
  double scale_factor = 1;
  List *prune_names = NULL, *label = NULL, *labelType = NULL;
  int prune_all_but = FALSE, tree_only = FALSE, dissect = FALSE,
    name_ancestors = FALSE, with_branch = FALSE, print_branchlen=FALSE,
    inNewick=FALSE, no_branchlen = FALSE, print_distance_to_root = FALSE;
  TreeModel *mod = NULL, *merge_mod = NULL;
  char *reroot_name = NULL, *subtree_name =NULL, *get_subtree_name = NULL,
    *node_distance_name = NULL;
  
  /* other variables */
  String *suffix,  *optstr;
  char c;
  int i, opt_idx;
  TreeNode *n;

  struct option long_opts[] = {
    {"scale", 1, 0, 's'},
    {"extrapolate", 1, 0, 'e'},
    {"prune", 1, 0, 'p'},
    {"prune-all-but", 1, 0, 'P'},
    {"get-subtree", 1, 0, 'g'},
    {"merge", 1, 0, 'm'},
    {"rename", 1, 0, 'r'},
    {"tree-only", 0, 0, 't'},
    {"no-branchlen", 0, 0, 'N'},
    {"dissect", 0, 0, 'd'},
    {"name-ancestors", 0, 0, 'a'},
    {"reroot", 1, 0, 'R'},
    {"with-branch", 1, 0, 'B'},
    {"subtree", 1, 0, 'S'},
    {"branchlen", 0, 0, 'b'},
    {"newick", 0, 0, 'n'},
    {"label-subtree", 1, 0, 'L'},
    {"label-branches", 1, 0, 'l'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  while ((c = getopt_long(argc, argv, "s:p:P:g:m:r:R:B:S:D:l:L:adtNbnh", 
                          long_opts, &opt_idx)) != -1) {
    switch (c) {
    case 's':
      scale_factor = get_arg_dbl_bounds(optarg, 0, INFTY);
      break;
    case 'e':
      if (!strcmp(optarg, "default")) {
        optarg = smalloc(1000 * sizeof(char));
        #if defined(__MINGW32__)
          sprintf(optarg, "%s\\data\\exoniphy\\mammals\\cftr25_hybrid.nh",
		  PHAST_HOME);
        #else
          sprintf(optarg, "%s/data/exoniphy/mammals/cftr25_hybrid.nh", 
                  PHAST_HOME);
        #endif
      }
      extrapolate_tree = tr_new_from_file(phast_fopen(optarg, "r"));
      break;
    case 'p':
      prune_names = get_arg_list(optarg);
      break;
    case 'P':
      prune_names = get_arg_list(optarg);
      prune_all_but = TRUE;
      break;
    case 'g':
      get_subtree_name = optarg;
      break;
    case 'm':
      suffix = str_new_charstr(optarg);
      str_suffix(suffix, '.');
      if (str_equals_charstr(suffix, "nh"))
        merge_tree = tr_new_from_file(phast_fopen(optarg, "r"));
      else {
        merge_mod = tm_new_from_file(phast_fopen(optarg, "r"), 1);
        merge_tree = merge_mod->tree;
      }
      break;
    case 'r':
      rename_hash = make_name_hash(optarg);
      break;
    case 't':
      tree_only = TRUE;
      break;
    case 'N':
      no_branchlen = TRUE;
      tree_only = TRUE;
      break;
    case 'd':
      dissect = TRUE;
      break;
    case 'b':
      print_branchlen = TRUE;
      break;
    case 'D':
      print_distance_to_root = TRUE;
      node_distance_name = optarg;
      break;
    case 'R':
      reroot_name = optarg;
      break;
    case 'B':
      with_branch = TRUE;
      break;
    case 'a':
      name_ancestors = TRUE;
      break;
    case 'S':
      subtree_name = optarg;
      break;
    case 'n':
      inNewick=TRUE;
      break;
    case 'L':  //do the same for --label--subtree and --label-branches
    case 'l':
      if (label == NULL) {
	label = lst_new_ptr(1);
	labelType = lst_new_int(1);
      }
      optstr = str_new_charstr(optarg);
      lst_push_ptr(label, optstr);
      lst_push_int(labelType, (int)c);
      break;
    case 'h':
      usage(argv[0]);
    case '?':
      die("Bad argument.  Try '%s -h'.\n", argv[0]);
    }
  }

  if (optind != argc - 1) 
    die("Input filename required.  Try '%s -h'.\n", argv[0]);

  if (merge_tree != NULL && extrapolate_tree != NULL)
    die("ERROR: Can't use --merge and --extrapolate together");

  set_seed(-1);
    
  suffix = str_new_charstr(argv[optind]);
  str_suffix(suffix, '.');
  if (inNewick || str_equals_charstr(suffix, "nh")) {
    tree = tr_new_from_file(phast_fopen(argv[optind], "r"));
    tree_only = TRUE;           /* can't output tree model in this case */
  }
  else {
    mod = tm_new_from_file(phast_fopen(argv[optind], "r"), 1);
    tree = mod->tree;
  }

  if (prune_names != NULL) {
    tr_prune(&tree, prune_names, prune_all_but, NULL);
    if (mod != NULL) mod->tree = tree; /* root may have changed */
  }

  if (get_subtree_name != NULL) {
    n = tr_get_node(tree, get_subtree_name);
    if (n == NULL) {
      tr_name_ancestors(tree);
      n = tr_get_node(tree, get_subtree_name);
      if (n == NULL) {
	die("ERROR: no node named '%s'.\n", subtree_name);
      }
    }
    tr_prune_supertree(&tree, n);
    if (mod != NULL) mod->tree = tree;
  }

  if (merge_tree != NULL) {
    tree = tr_hybrid(tree, merge_tree);
    if (mod != NULL) mod->tree = tree;
  }

  else if (extrapolate_tree != NULL) {
    tr_scale_by_subtree(extrapolate_tree, tree);
    tree = extrapolate_tree;
    if (mod != NULL) mod->tree = tree;
  }

  if (scale_factor != 1) {
    if (subtree_name == NULL)
      tr_scale(tree, scale_factor);
    else {
      n = tr_get_node(tree, subtree_name);
      if (n == NULL) die("ERROR: no node named '%s'.\n", subtree_name);
      tr_scale_subtree(tree, n, scale_factor, with_branch);
    }
  }

  if (name_ancestors)
    tr_name_ancestors(tree);

  if (rename_hash != NULL) {
    char *newname;
    for (i = 0; i < tree->nnodes; i++) {
      n = lst_get_ptr(tree->nodes, i);
      if (n->name != NULL && n->name[0] != '\0' && 
          (newname = hsh_get(rename_hash, n->name)) != (char*)-1) {
        strcpy(n->name, newname);
      }
    }
  }

  if (reroot_name != NULL) {
    n = tr_get_node(tree, reroot_name);
    if (n == NULL) die("ERROR: no node named '%s'.\n", reroot_name);
    tr_reroot(tree, n, with_branch);
    if (mod != NULL) mod->tree = with_branch ? n->parent : n;
    tree = with_branch ? n->parent : n;
  }

  if (label != NULL) {
    for (i=0; i < lst_size(label); i++) {
      String *currstr = (String*)lst_get_ptr(label, i), *arg1, *labelVal;
      List *tmplst = lst_new_ptr(10);
      String *nodename;
      int j;
      str_split(currstr, ":", tmplst);
      if (lst_size(tmplst) != 2) 
	die("ERROR: bad argument to --label-branches or --label-subtree.\n");
      arg1 = lst_get_ptr(tmplst, 0);
      labelVal = lst_get_ptr(tmplst, 1);
      lst_clear(tmplst);
      if (lst_get_int(labelType, i) == (int)'l') {
	str_split(arg1, ",", tmplst);
	for (j=0; j < lst_size(tmplst); j++) {
	  nodename = (String*)lst_get_ptr(tmplst, j);
	  tr_label_node(tree, nodename->chars, labelVal->chars);
	}
	lst_free_strings(tmplst);
      } else if (lst_get_int(labelType, i) == (int)'L') {
	int include_leading_branch = FALSE;
	TreeNode *node;
	nodename = arg1;
	node = tr_get_node(tree, nodename->chars);
	if (node == NULL && nodename->chars[nodename->length-1] == '+') {
	  nodename->chars[--nodename->length] = '\0';
	  node = tr_get_node(tree, nodename->chars);
	  include_leading_branch = TRUE;
	}
	tr_label_subtree(tree, nodename->chars, include_leading_branch, 
			 labelVal->chars);
      } else die("ERROR got label_type %c\n", lst_get_int(labelType, (char)i));
      str_free(arg1);
      str_free(labelVal);
      lst_free(tmplst);
      str_free(currstr);
    }
    lst_free(label);
    lst_free(labelType);
  }

  if (dissect) 
    tr_print_nodes(stdout, tree);
  if (print_branchlen) 
    printf("TOTAL_TREE_LEN: %f\n", tr_total_len(tree));
  if (print_distance_to_root) {
    TreeNode *node = tr_get_node(tree, node_distance_name);
    if (node == NULL) 
      die("ERROR: no node named '%s'.\n", node_distance_name);
    printf("length(root-%s): %f\n", node_distance_name, 
	   tr_distance_to_root(node));
  }

  if (dissect==0 && print_branchlen==0 && print_distance_to_root==0) {
    if (tree_only)
      tr_print(stdout, tree, no_branchlen==FALSE);
    else
      tm_print(stdout, mod);
  }
  return 0;
}