Beispiel #1
0
SEXP rph_tree_label_subtree(SEXP treeP, SEXP nodeP,
			    SEXP includeLeadingBranchP,
			    SEXP labelP) {
  int i, numtree = LENGTH(treeP), include_leading_branch=0;
  TreeNode *tr;
  SEXP result;
  char *node, *label;
  label = copy_charstr(CHARACTER_VALUE(labelP));
  node = copy_charstr(CHARACTER_VALUE(nodeP));
  include_leading_branch = LOGICAL_VALUE(includeLeadingBranchP);
  PROTECT(result = NEW_CHARACTER(numtree));
  for (i=0; i < numtree; i++) {
    tr = rph_tree_new(STRING_ELT(treeP, i));
    tr_label_subtree(tr, node, include_leading_branch, label);
    SET_STRING_ELT(result, i, mkChar(tr_to_string(tr, 1)));
  }
  UNPROTECT(1);
  return result;
}
Beispiel #2
0
SEXP rph_tree_label_branches(SEXP treeP, SEXP nodep, SEXP labelP) {
  int i, j, numtree = LENGTH(treeP);
  TreeNode *tr;
  SEXP result;
  char *label;

  label = copy_charstr(CHARACTER_VALUE(labelP));
  PROTECT(result = NEW_CHARACTER(numtree));
  for (i=0; i < numtree; i++) {
    tr = rph_tree_new(STRING_ELT(treeP, i));
    for (j = 0; j < LENGTH(nodep); j++)
      tr_label_node(tr, CHARACTER_VALUE(STRING_ELT(nodep, j)), label);
    SET_STRING_ELT(result, i, mkChar(tr_to_string(tr, 1)));
  }
  UNPROTECT(1);
  return result;
}
Beispiel #3
0
SEXP rph_gff_one_attribute(SEXP gffP, SEXP tagP) {
  GFF_Set *gff = (GFF_Set*)EXTPTR_PTR(gffP);
  GFF_Feature *f;
  ListOfLists *lol;
  List *l1, *l2;
  int numtag, numval, i, j, k, resultLen, maxResultLen=10;
  String *currStr, *tag, *currTag;
  char **result;
  SEXP rv;
  SEXP rph_listOfLists_to_SEXP(ListOfLists *lol);


  if (lst_size(gff->features) == 0) return R_NilValue;
  gff_register_protect(gff);
  result = smalloc(maxResultLen*sizeof(char*));
  tag = str_new_charstr(CHARACTER_VALUE(tagP));
  str_double_trim(tag);
  lol = lol_new(lst_size(gff->features));
  l1 = lst_new_ptr(10);
  l2 = lst_new_ptr(10);
  for (i=0; i < lst_size(gff->features); i++) {
    checkInterruptN(i, 1000);
    resultLen=0;
    f = (GFF_Feature*) lst_get_ptr(gff->features, i);
    numtag = str_split_with_quotes(f->attribute, ";", l1);  //split tags
    for (j=0; j < numtag; j++) {
      currStr = (String*)lst_get_ptr(l1, j);
      str_double_trim(currStr);

      //first try gff version 3, see if we have tag=val format
      numval = str_split_with_quotes(currStr, "=", l2);
      if (numval == 2) {
	currTag = (String*)lst_get_ptr(l2, 0);
	str_double_trim(currTag);
	if (str_equals(tag, currTag)) {  // tag matches target, add all values to list
	  currStr = str_new_charstr(((String*)lst_get_ptr(l2, 1))->chars);
	  lst_free_strings(l2);
	  numval = str_split_with_quotes(currStr, ",", l2);
	  str_free(currStr);
	  for (k=0; k < numval; k++) {
	    currStr = lst_get_ptr(l2, k);
	    str_double_trim(currStr);
	    str_remove_quotes(currStr);
	    if (resultLen > maxResultLen) {
	      maxResultLen += 100;
	      result = srealloc(result, maxResultLen*sizeof(char*));
	    }
	    result[resultLen++] = copy_charstr(currStr->chars);
	  }
	}
      } else {
	lst_free_strings(l2);

	//gff version 2
	//split into tag val val ... by whitespace unless enclosed in quotes
	numval =  str_split_with_quotes(currStr, NULL, l2);
	if (numval > 1) {
	  currStr = (String*)lst_get_ptr(l2, 0);
	  str_double_trim(currStr);
	  if (str_equals(tag, currStr)) {  //tag matches target, add all values to list
	    for (k=1; k < numval; k++) {
	      currStr = (String*)lst_get_ptr(l2, k);
	      str_double_trim(currStr);
	      str_remove_quotes(currStr);
	      if (resultLen > maxResultLen) {
		maxResultLen += 100;
		result = srealloc(result, maxResultLen*sizeof(char*));
	      }
	      result[resultLen++] = copy_charstr(currStr->chars);
	    }
	  }
	}
	lst_free_strings(l2);
      }
    }
    if (resultLen == 0)
      result[resultLen++] = copy_charstr("");  //empty string will be converted to NA later
    lol_push_charvec(lol, result, resultLen, NULL);
    for (j=0; j < resultLen; j++) sfree(result[j]);
  }
  PROTECT(rv = rph_listOfLists_to_SEXP(lol));
  UNPROTECT(1);
  return rv;
}
/* if query_seq and target_seq are NULL, they will attempt to be read
   from the filenames contained in the LocalPwAlignment object (FASTA
   format is assumed) */
MSA* la_to_msa(LocalPwAlignment *lpwa, int force_global) {
  int i, j, k, len;
  char **names = (char**)smalloc(2 * sizeof(char*));
  char **seqs = (char**)smalloc(2 * sizeof(char*));
  String *query_seq = lpwa->query_seq, *target_seq = lpwa->target_seq;
  String *qseq = str_new(query_seq->length);
  String *tseq = str_new(target_seq->length);
  GaplessAlignment *lga = NULL;

  names[0] = copy_charstr(lpwa->query_name->chars);
  names[1] = copy_charstr(lpwa->target_name->chars);

  for (i = 0; i < lst_size(lpwa->alignment_blocks); i++) {
    AlignmentBlock* b = lst_get_ptr(lpwa->alignment_blocks, i);
    checkInterrupt();
    for (j = 0; j < lst_size(b->gapless_alns); j++) {
      GaplessAlignment *ga = lst_get_ptr(b->gapless_alns, j);

      if (lga == NULL) {
        for (k = 0; k < ga->query_beg-1; k++) {
          str_append_char(qseq, query_seq->chars[k]);
          str_append_char(tseq, GAP_CHAR);
        }
        if (force_global) {
          for (k = 0; k < ga->target_beg-1; k++) {
            str_append_char(qseq, GAP_CHAR);
            str_append_char(tseq, target_seq->chars[k]);
          }
        }
      }
      else {
        if (lga->query_end >= ga->query_beg ||
            (force_global && lga->target_end >= ga->target_beg)) {
          die("ERROR: overlapping alignment segments.\n");
        }
        if (j > 0 && lga->query_end == ga->query_beg-1) {
                                /* gap in query seq */
          for (k = lga->target_end; k < ga->target_beg-1; k++) {
            str_append_char(qseq, GAP_CHAR);
            str_append_char(tseq, target_seq->chars[k]);
          }
        }
        else {                  /* gap in target seq */
          for (k = lga->query_end; k < ga->query_beg-1; k++) {
            str_append_char(qseq, query_seq->chars[k]);
            str_append_char(tseq, GAP_CHAR);
          }
          if (force_global) {
            for (k = lga->target_end; k < ga->target_beg-1; k++) {
              str_append_char(qseq, GAP_CHAR);
              str_append_char(tseq, target_seq->chars[k]);
            }
          }
        }
      }

      for (k = 0; k < ga->query_end - ga->query_beg + 1; k++) {
        str_append_char(qseq, query_seq->chars[ga->query_beg + k - 1]);
        str_append_char(tseq, target_seq->chars[ga->target_beg + k - 1]);
      }

      lga = ga;
    }
  }

  for (k = lga->query_end; k < query_seq->length; k++) {
    str_append_char(qseq, query_seq->chars[k]);
    str_append_char(tseq, GAP_CHAR);
  }

  if (force_global) {
    for (k = lga->target_end; k < target_seq->length; k++) {
      str_append_char(qseq, GAP_CHAR);
      str_append_char(tseq, target_seq->chars[k]);
    }
  }

  seqs[0] = qseq->chars;
  seqs[1] = tseq->chars;
  qseq->chars = NULL; tseq->chars = NULL;
  len = qseq->length;
  str_free(qseq); str_free(tseq);
  return msa_new(seqs, names, 2, len, NULL);  
}
Beispiel #5
0
int main(int argc, char *argv[]) {
  char c;
  char *msa_fname = NULL;
  int opt_idx, i, old_nnodes;
  MSA *msa;
  List *pruned_names = lst_new_ptr(5), *tmpl;
  BDPhyloHmm *bdphmm;
  GFF_Set *predictions;
  int found = FALSE;
  List *ignore_types = lst_new_ptr(1);

  struct option long_opts[] = {
    {"refseq", 1, 0, 'M'},
    {"msa-format", 1, 0, 'i'},
    {"refidx", 1, 0, 'r'},
    {"rho", 1, 0, 'R'},
    {"phi", 1, 0, 'p'},
    {"transitions", 1, 0, 't'},    
    {"expected-length", 1, 0, 'E'},
    {"target-coverage", 1, 0, 'C'},
    {"seqname", 1, 0, 'N'},
    {"idpref", 1, 0, 'P'},
    {"indel-model", 1, 0, 'I'},
    {"indel-history", 1, 0, 'H'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  /* arguments and defaults for options */
  FILE *refseq_f = NULL, *msa_f = NULL;
  msa_format_type msa_format = UNKNOWN_FORMAT;
  TreeModel *source_mod;
  double rho = DEFAULT_RHO, mu = DEFAULT_MU, nu = DEFAULT_NU, 
    phi = DEFAULT_PHI, gamma = -1, omega = -1, 
    alpha_c = -1, beta_c = -1, tau_c = -1,
    alpha_n = -1, beta_n = -1, tau_n = -1;
  int set_transitions = FALSE, refidx = 1, estim_phi = TRUE, 
    estim_gamma = TRUE, estim_omega = TRUE;
  char *seqname = NULL, *idpref = NULL;
  IndelHistory *ih = NULL;

  while ((c = getopt_long(argc, argv, "R:t:p:E:C:r:M:i:N:P:I:H:h", long_opts, &opt_idx)) != -1) {
    switch (c) {
    case 'R':
      rho = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 't':
      if (optarg[0] != '~') estim_gamma = estim_omega = FALSE;
      else optarg = &optarg[1];
      set_transitions = TRUE;
      tmpl = get_arg_list_dbl(optarg);
      if (lst_size(tmpl) != 2) 
        die("ERROR: bad argument to --transitions.\n");
      mu = lst_get_dbl(tmpl, 0);
      nu = lst_get_dbl(tmpl, 1);
      if (mu <= 0 || mu >= 1 || nu <= 0 || nu >= 1)
        die("ERROR: bad argument to --transitions.\n");
      lst_free(tmpl);
      break;
    case 'p':
      if (optarg[0] != '~') estim_phi = FALSE;
      else optarg = &optarg[1];
      phi = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 'E':
      if (optarg[0] != '~') estim_omega = FALSE;
      else optarg = &optarg[1];
      omega = get_arg_dbl_bounds(optarg, 1, INFTY);
      mu = 1/omega;
      break;
    case 'C':
      if (optarg[0] != '~') estim_gamma = FALSE;
      else optarg = &optarg[1];
      gamma = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 'r':
      refidx = get_arg_int_bounds(optarg, 0, INFTY);
      break;
    case 'M':
      refseq_f = phast_fopen(optarg, "r");
      break;
    case 'i':
      msa_format = msa_str_to_format(optarg);
      if (msa_format == UNKNOWN_FORMAT)
        die("ERROR: unrecognized alignment format.\n");
      break;
    case 'N':
      seqname = optarg;
      break;
    case 'P':
      idpref = optarg;
      break;
    case 'I':
      tmpl = get_arg_list_dbl(optarg);
      if (lst_size(tmpl) != 3 && lst_size(tmpl) != 6)
        die("ERROR: bad argument to --indel-model.\n");
      alpha_n = lst_get_dbl(tmpl, 0);
      beta_n = lst_get_dbl(tmpl, 1);
      tau_n = lst_get_dbl(tmpl, 2);
      if (lst_size(tmpl) == 6) {
        alpha_c = lst_get_dbl(tmpl, 3);
        beta_c = lst_get_dbl(tmpl, 4);
        tau_c = lst_get_dbl(tmpl, 5);
      }
      else {
        alpha_c = alpha_n; beta_c = beta_n; tau_c = tau_n;
      }
      if (alpha_c <= 0 || alpha_c >= 1 || beta_c <= 0 || beta_c >= 1 || 
          tau_c <= 0 || tau_c >= 1 || alpha_n <= 0 || alpha_n >= 1 || 
          beta_n <= 0 || beta_n >= 1 || tau_n <= 0 || tau_n >= 1)
        die("ERROR: bad argument to --indel-model.\n");
      break;
    case 'H':
      fprintf(stderr, "Reading indel history from %s...\n", optarg);
      ih = ih_new_from_file(phast_fopen(optarg, "r"));
      break;
    case 'h':
      printf("%s", HELP);
      exit(0);
    case '?':
      die("Bad argument.  Try 'dless -h'.\n");
    }
  }

  if (optind != argc - 1)
    die("Missing alignment file or model file.  Try 'dless -h'.\n");

  if (set_transitions && (gamma != -1 || omega != -1))
    die("ERROR: --transitions and --target-coverage/--expected-length cannot be used together.\n");

  if ((gamma != -1 && omega == -1) || (gamma == -1 && omega != -1))
    die("ERROR: --target-coverage and --expecteed-length must be used together.\n");

  set_seed(-1);

  if (gamma != -1)
    nu = gamma/(1-gamma) * mu;

  fprintf(stderr, "Reading tree model from %s...\n", argv[optind]);
  source_mod = tm_new_from_file(phast_fopen(argv[optind], "r"), 1);

  if (source_mod->nratecats > 1) 
    die("ERROR: rate variation not currently supported.\n");

  if (source_mod->order > 0)
    die("ERROR: only single nucleotide models are currently supported.\n");

  if (!tm_is_reversible(source_mod))
    phast_warning("WARNING: p-value computation assumes reversibility and your model is non-reversible.\n");

  /* read alignment */
  msa_f = phast_fopen(argv[optind], "r");

  fprintf(stderr, "Reading alignment from %s...\n", argv[optind]);
  if (msa_format == UNKNOWN_FORMAT) 
    msa_format = msa_format_for_content(msa_f, 1);

  if (msa_format == MAF) {
    msa = maf_read(msa_f, refseq_f, 1, NULL, NULL, NULL, -1, TRUE, NULL, 
                   NO_STRIP, FALSE); 
  }
  else 
    msa = msa_new_from_file_define_format(msa_f, msa_format, NULL);

  if (msa_alph_has_lowercase(msa)) msa_toupper(msa); 
  msa_remove_N_from_alph(msa);

  if (msa->ss == NULL) {
    fprintf(stderr, "Extracting sufficient statistics...\n");
    ss_from_msas(msa, 1, TRUE, NULL, NULL, NULL, -1, 0);
  }
  else if (msa->ss->tuple_idx == NULL)
    die("ERROR: ordered representation of alignment required unless --suff-stats.\n");

  /* prune tree, if necessary */
  old_nnodes = source_mod->tree->nnodes;
  tm_prune(source_mod, msa, pruned_names);

  if (lst_size(pruned_names) == (old_nnodes + 1) / 2)
    die("ERROR: no match for leaves of tree in alignment (leaf names must match alignment names).\n");
  if (lst_size(pruned_names) > 0) {
    fprintf(stderr, "WARNING: pruned away leaves of tree with no match in alignment (");
    for (i = 0; i < lst_size(pruned_names); i++)
      fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, i))->chars, 
              i < lst_size(pruned_names) - 1 ? ", " : ").\n");
  }

  /* this has to be done after pruning tree */
  tr_name_ancestors(source_mod->tree);

  /* also make sure match for reference sequence in tree */
  if (refidx > 0) {
    for (i = 0, found = FALSE; !found && i < source_mod->tree->nnodes; i++) {
      TreeNode *n = lst_get_ptr(source_mod->tree->nodes, i);
      if (!strcmp(n->name, msa->names[refidx-1]))
        found = TRUE;
    }
    if (!found) die("ERROR: no match for reference sequence in tree.\n");
  }

  /* checks for indel model */
  if (alpha_c > 0) {
    if (ih == NULL) {
      fprintf(stderr, "Reconstructing indel history by parsimony...\n");
      ih = ih_reconstruct(msa, source_mod->tree);
    }
    else {
      if (ih->ncols != msa->length)
        die("ERROR: indel history doesn't seem to match alignment.\n");
      if (ih->tree->nnodes != source_mod->tree->nnodes)
        die("ERROR: indel history doesn't seem to match tree model.\n");
    }
  }

  bdphmm = bd_new(source_mod, rho, mu, nu, phi, alpha_c, beta_c, tau_c, 
                  alpha_n, beta_n, tau_n, estim_gamma, estim_omega, 
                  estim_phi);

  /* compute emissions */
  phmm_compute_emissions(bdphmm->phmm, msa, FALSE);

  /* add emissions for indel model, if necessary */
  if (alpha_c > 0) {
    fprintf(stderr, "Adjusting emissions for indels...\n");
    bd_add_indel_emissions(bdphmm, ih);
  }

  /* postprocess for missing data (requires special handling) */
  fprintf(stderr, "Adjusting emissions for missing data...\n");
  bd_handle_missing_data(bdphmm, msa);

  if (estim_gamma || estim_omega || estim_phi) {
    fprintf(stderr, "Estimating free parameters...\n");
    bd_estimate_transitions(bdphmm, msa);
  }

  /* set seqname and idpref, if necessary */
  if (seqname == NULL || idpref == NULL) {
    /* derive default from file name root */
    String *tmp = str_new_charstr(msa_fname);
    if (!str_equals_charstr(tmp, "-")) {
      str_remove_path(tmp);
      str_root(tmp, '.');
      if (idpref == NULL) idpref = copy_charstr(tmp->chars);
      str_root(tmp, '.');         /* apply one more time for double suffix */
      if (seqname == NULL) seqname = tmp->chars;    
    }
    else if (seqname == NULL) seqname = "refseq";
  }

  /* obtain predictions */
  fprintf(stderr, "Running Viterbi algorithm...\n");
  predictions = phmm_predict_viterbi(bdphmm->phmm, seqname, NULL, idpref, NULL);
  lst_push_ptr(ignore_types, str_new_charstr("nonconserved"));
  gff_filter_by_type(predictions, ignore_types, TRUE, NULL);

  /* score predictions */
  fprintf(stderr, "Scoring predictions...\n");
  bd_score_predictions(bdphmm, predictions);
  
  /* can free emissions now */
  for (i = 0; i < bdphmm->phmm->hmm->nstates; i++)
    sfree(bdphmm->phmm->emissions[i]);
  sfree(bdphmm->phmm->emissions);
  bdphmm->phmm->emissions = NULL;

  /* convert GFF to coord frame of reference sequence and adjust
     coords by idx_offset, if necessary  */
  if (refidx != 0 || msa->idx_offset != 0)
    msa_map_gff_coords(msa, predictions, 0, refidx, msa->idx_offset);

  if (refidx != 0) 
    gff_flatten(predictions);	
  /* necessary because coord conversion might create overlapping
     features (can happen in deletions in reference sequence) */

  /* now output predictions */
  fprintf(stderr, "Writing GFF to stdout...\n");
  gff_print_set(stdout, predictions);

  fprintf(stderr, "Done.\n");
  
  return 0;
}