Esempio n. 1
0
int main(int argc, char *argv[]) {
  char c;
  int i, j, t, opt_idx, ntrees, nleaves = -1;
  TreeNode *n, *node_i, *node_j, *lca, *nametree = NULL;
  TreeNode **tree;
  List *leaves, ***distance, *tree_fnames, *tot_dist;
  int mod = FALSE;
  char **leaf_name;
  String *trees_arg;
  FILE *F;

  struct option long_opts[] = {
    {"mod", 0, 0, 'm'},
    {"tree", 1, 0, 't'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  while ((c = getopt_long(argc, argv, "mt:h", long_opts, &opt_idx)) != -1) {
    switch (c) {
    case 'm':
      mod = TRUE;
      break;
    case 't':
      if (optarg[0] == '(')
        nametree = tr_new_from_string(optarg);
      else 
        nametree = tr_new_from_file(phast_fopen(optarg, "r"));
      break;
    case 'h':
      usage(argv[0]);
    case '?':
      die("Bad argument.  Try '%s -h'.\n", argv[0]);
    }
  }

  if (optind > argc - 1) 
    die("Input filename required.  Try '%s -h'.\n", argv[0]);

  set_seed(-1);

  /* build a comma-delimited list and pass to get_arg_list; allows
     possibility of reading from file via '*' operator */
  trees_arg = str_new(1000);
  for (i = optind; i < argc; i++) {
    str_append_charstr(trees_arg, argv[i]);
    if (i < argc - 1) str_append_char(trees_arg, ',');
  }
  tree_fnames = get_arg_list(trees_arg->chars);

  ntrees = lst_size(tree_fnames);
  tree = smalloc(ntrees * sizeof(void*));

  /* read trees */
  for (t = 0; t < ntrees; t++) {
    String *fname = lst_get_ptr(tree_fnames, t);
    if (mod) {
      TreeModel *m = tm_new_from_file(F = phast_fopen(fname->chars, "r"), 1);
      tree[t] = tr_create_copy(m->tree);
      tm_free(m);
      phast_fclose(F);
    }
    else
      tree[t] = tr_new_from_file(phast_fopen(fname->chars, "r"));
  }

  /* initialization */
  nleaves = (tree[0]->nnodes + 1)/2;
  leaves = lst_new_ptr(nleaves);    
  distance = smalloc(nleaves * sizeof(void*));
  leaf_name = smalloc(nleaves * sizeof(void*));
  for (i = 0; i < nleaves; i++) {
    distance[i] = smalloc(nleaves * sizeof(void*));
    for (j = i+1; j < nleaves; j++) 
      distance[i][j] = lst_new_dbl(ntrees);
  }
  if (nametree == NULL) nametree = tree[0];
  for (i = 0, j = 0; i < lst_size(nametree->nodes); i++) {
    n = lst_get_ptr(nametree->nodes, i);
    if (n->lchild == NULL && n->rchild == NULL)
      leaf_name[j++] = n->name;
  }
  tot_dist = lst_new_dbl(ntrees);

  /* now compute distances */
  for (t = 0; t < ntrees; t++) {
    /* obtain list of leaves */
    lst_clear(leaves);
    for (i = 0; i < lst_size(tree[t]->nodes); i++) {
      n = lst_get_ptr(tree[t]->nodes, i);
      if (n->lchild == NULL && n->rchild == NULL)
        lst_push_ptr(leaves, n);
    }

    if (lst_size(leaves) != nleaves)
      die("ERROR: trees have different numbers of leaves.\n");

    /* look at all pairs */
    for (i = 0; i < nleaves; i++) {
      node_i = lst_get_ptr(leaves, i);
      for (j = i+1; j < nleaves; j++) {
        double dist = 0;
        node_j = lst_get_ptr(leaves, j);
        /* because ids are assigned in pre-order, the first ancestor of
           node j that has an id less than i is the LCA of i and j; we
           seek the sum of distances from both i and j to this node */
        for (n = node_j; n->id >= node_i->id; n = n->parent)
          dist += n->dparent;      
        lca = n;
        for (n = node_i; n != lca; n = n->parent)
          dist += n->dparent;            
        lst_push_dbl(distance[i][j], dist);
      }
    }
    lst_push_dbl(tot_dist, tr_total_len(tree[t]));
  }


  /* print distances and (optionally) stats */
  if (ntrees == 1) {
    for (i = 0; i < nleaves; i++) {
      for (j = i+1; j < nleaves; j++) {
        printf ("%s\t%s\t%f\n", leaf_name[i], leaf_name[j], 
                lst_get_dbl(distance[i][j], 0));
      }
    }
    printf ("%s\t%s\t%f\n", "(total)", "-", lst_get_dbl(tot_dist, 0));
  }
  else {
    double mean, stdev;
    double quantiles[] = {0, 0.025, 0.05, 0.5, 0.95, 0.975, 1};
    double quantile_vals[7]; 

    printf("%-15s %-15s %9s %9s %9s %9s %9s %9s %9s %9s %9s\n", "leaf1", 
           "leaf2", "mean", "stdev", "median", "min", "max", "95%_min", 
           "95%_max", "90%_min", "90%_max");

    for (i = 0; i < nleaves; i++) {
      for (j = i+1; j < nleaves; j++) {
        mean = lst_dbl_mean(distance[i][j]);
        stdev = lst_dbl_stdev(distance[i][j]);
        lst_qsort_dbl(distance[i][j], ASCENDING);
        lst_dbl_quantiles(distance[i][j], quantiles, 7, quantile_vals);

        printf("%-15s %-15s %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f\n", 
               leaf_name[i], leaf_name[j], mean, stdev, quantile_vals[3], quantile_vals[0], 
               quantile_vals[6], quantile_vals[1], quantile_vals[5], quantile_vals[2], 
               quantile_vals[4]);
      }
    }

    /* also do total branch len */
    mean = lst_dbl_mean(tot_dist);
    stdev = lst_dbl_stdev(tot_dist);
    lst_qsort_dbl(tot_dist, ASCENDING);
    lst_dbl_quantiles(tot_dist, quantiles, 7, quantile_vals);
    
    printf("%-15s %-15s %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f\n", 
	   "(total)", "-", mean, stdev, quantile_vals[3], quantile_vals[0], 
	   quantile_vals[6], quantile_vals[1], quantile_vals[5], quantile_vals[2], 
	   quantile_vals[4]);
  }

  return 0;
}
SEXP rph_phyloFit(SEXP msaP, 
		  SEXP treeStrP, 
		  SEXP substModP,
		  SEXP scaleOnlyP,
		  SEXP scaleSubtreeP,
		  SEXP nratesP,
		  SEXP alphaP,
		  SEXP rateConstantsP,
		  SEXP initModP,
		  SEXP initBackgdFromDataP,
		  SEXP initRandomP,
		  SEXP initParsimonyP,
		  SEXP clockP,
		  SEXP emP,
		  SEXP maxEmItsP,
		  SEXP precisionP,
		  SEXP gffP,
		  SEXP ninfSitesP,
		  SEXP quietP,
		  SEXP noOptP,
		  SEXP boundP,
		  SEXP logFileP,
		  SEXP selectionP) {
  struct phyloFit_struct *pf;
  int numProtect=0, i;
  double *doubleP;
  char *die_message=NULL;
  SEXP rv=R_NilValue;
  List *new_rate_consts = NULL;
  List *new_rate_weights = NULL;

  GetRNGstate(); //seed R's random number generator
  pf = phyloFit_struct_new(1);  //sets appropriate defaults for RPHAST mode

  pf->msa = (MSA*)EXTPTR_PTR(msaP);

  if (treeStrP != R_NilValue) 
    pf->tree = rph_tree_new(treeStrP);

  pf->use_em = LOGICAL_VALUE(emP);

  if (rateConstantsP != R_NilValue) {
    PROTECT(rateConstantsP = AS_NUMERIC(rateConstantsP));
    numProtect++;
    doubleP = NUMERIC_POINTER(rateConstantsP);
    new_rate_consts = lst_new_dbl(LENGTH(rateConstantsP));
    for (i=0; i < LENGTH(rateConstantsP); i++)
      lst_push_dbl(new_rate_consts, doubleP[i]);
//    pf->use_em = 1;
  }

  if (initModP != R_NilValue) {
    pf->input_mod = (TreeModel*)EXTPTR_PTR(initModP);
    pf->subst_mod = pf->input_mod->subst_mod;
    tm_register_protect(pf->input_mod);
    
    if (new_rate_consts == NULL && pf->input_mod->rK != NULL && pf->input_mod->nratecats > 1) {
      new_rate_consts = lst_new_dbl(pf->input_mod->nratecats);
      for (i=0; i < pf->input_mod->nratecats; i++) 
	lst_push_dbl(new_rate_consts, pf->input_mod->rK[i]);
//      pf-> = 1;
    }

    if (pf->input_mod->empirical_rates && pf->input_mod->freqK != NULL && pf->input_mod->nratecats > 1) {
      new_rate_weights = lst_new_dbl(pf->input_mod->nratecats);
      for (i=0; i < pf->input_mod->nratecats; i++)
	lst_push_dbl(new_rate_weights, pf->input_mod->freqK[i]);
    }

    tm_reinit(pf->input_mod, 
	      rph_get_subst_mod(substModP),
	      nratesP == R_NilValue ? pf->input_mod->nratecats : INTEGER_VALUE(nratesP),
	      NUMERIC_VALUE(alphaP),
	      new_rate_consts,
	      new_rate_weights);
  } else {
    if (nratesP != R_NilValue)
      pf->nratecats = INTEGER_VALUE(nratesP);
    if (alphaP != R_NilValue)
      pf->alpha = NUMERIC_VALUE(alphaP);
    if (rateConstantsP != R_NilValue) {
      pf->rate_consts = new_rate_consts;
      if (nratesP == R_NilValue)
	pf->nratecats = lst_size(new_rate_consts);
      else if (lst_size(new_rate_consts) != pf->nratecats) 
	die("length of new_rate_consts does not match nratecats\n");
    }
  }
  pf->subst_mod = rph_get_subst_mod(substModP);
  
  pf->estimate_scale_only = LOGICAL_VALUE(scaleOnlyP);
  
  if (scaleSubtreeP != R_NilValue) {
    pf->subtree_name = smalloc((1+strlen(CHARACTER_VALUE(scaleSubtreeP)))*sizeof(char));
    strcpy(pf->subtree_name, CHARACTER_VALUE(scaleSubtreeP));
  }
  
  pf->random_init = LOGICAL_VALUE(initRandomP);

  pf->init_backgd_from_data = LOGICAL_VALUE(initBackgdFromDataP);
  
  pf->init_parsimony = LOGICAL_VALUE(initParsimonyP);
  
  pf->assume_clock = LOGICAL_VALUE(clockP);

  if (maxEmItsP != R_NilValue)
    pf->max_em_its = INTEGER_VALUE(maxEmItsP);

  pf->precision = get_precision(CHARACTER_VALUE(precisionP));
  if (pf->precision == OPT_UNKNOWN_PREC) {
    die_message = "invalid precision";
    goto rph_phyloFit_end;
  }

  if (gffP != R_NilValue) {
    pf->gff = (GFF_Set*)EXTPTR_PTR(gffP);
    gff_register_protect(pf->gff);
  }

  if (ninfSitesP != R_NilValue)
    pf->nsites_threshold = INTEGER_VALUE(ninfSitesP);
  
  pf->quiet = LOGICAL_VALUE(quietP);

  if (noOptP != R_NilValue) {
    int len=LENGTH(noOptP), pos=0;
    char *temp;
    for (i=0; i < LENGTH(noOptP); i++) 
      len += strlen(CHARACTER_VALUE(STRING_ELT(noOptP, i)));
    temp = smalloc(len*sizeof(char));
    for (i=0; i < LENGTH(noOptP); i++) {
      if (i != 0) temp[pos++] = ',';
      sprintf(&temp[pos], "%s", CHARACTER_VALUE(STRING_ELT(noOptP, i)));
      pos += strlen(CHARACTER_VALUE(STRING_ELT(noOptP, i)));
    }
    if (pos != len-1) die("ERROR parsing noOpt len=%i pos=%i\n", len, pos);
    temp[pos] = '\0';
    pf->nooptstr = str_new_charstr(temp);
  }

  if (boundP != R_NilValue) {
    pf->bound_arg = lst_new_ptr(LENGTH(boundP));
    for (i=0; i < LENGTH(boundP); i++) {
      String *temp = str_new_charstr(CHARACTER_VALUE(STRING_ELT(boundP, i)));
      lst_push_ptr(pf->bound_arg, temp);
    }
  }

  if (logFileP != R_NilValue) {
    if (IS_CHARACTER(logFileP)) 
      pf->logf = phast_fopen(CHARACTER_VALUE(logFileP), "w+");
    else if (IS_LOGICAL(logFileP) &&
	     LOGICAL_VALUE(logFileP)) {
      pf->logf = stdout;
    }
  }

  if (selectionP != R_NilValue) {
    pf->use_selection = TRUE;
    pf->selection = NUMERIC_VALUE(selectionP);
  }

  msa_register_protect(pf->msa);

  run_phyloFit(pf);
  rv = PROTECT(rph_listOfLists_to_SEXP(pf->results));
  numProtect++;

 rph_phyloFit_end:
  if (pf->logf != NULL && pf->logf != stdout && pf->logf != stderr)
    phast_fclose(pf->logf);
  PutRNGstate();
  if (die_message != NULL) die(die_message);
  if (numProtect > 0) 
    UNPROTECT(numProtect);
  return rv;
}