/* R-callable wrapper (SEXP in, SEXP out) that rescales a tree.
 *
 * treeStr          tree description handed to rph_tree_new().
 * scaleP           numeric scale factor, read with NUMERIC_VALUE().
 * nodeStr          R_NilValue to scale the whole tree with tr_scale(), or a
 *                  node name whose subtree is scaled with tr_scale_subtree().
 * includeLeadingP  logical forwarded to tr_scale_subtree(); only read when
 *                  nodeStr is not R_NilValue.
 *
 * Returns a length-1 character vector holding tr_to_string(tree, 1).
 * If the named node cannot be found even after tr_name_ancestors() has been
 * applied, die() is called (presumably does not return -- TODO confirm).
 */
SEXP rph_tree_scale(SEXP treeStr, SEXP scaleP, SEXP nodeStr, SEXP includeLeadingP) {
  TreeNode *tree = rph_tree_new(treeStr);
  double factor = NUMERIC_VALUE(scaleP);
  char *scaled_str;
  SEXP rv;

  if (nodeStr == R_NilValue) {
    tr_scale(tree, factor);
  }
  else {
    int with_leading = LOGICAL_VALUE(includeLeadingP);
    TreeNode *target = tr_get_node(tree, CHARACTER_VALUE(nodeStr));

    if (target == NULL) {
      /* the requested name may only exist once ancestors have been named */
      tr_name_ancestors(tree);
      target = tr_get_node(tree, CHARACTER_VALUE(nodeStr));
    }
    if (target == NULL)
      die("No node named %s in %s\n", CHARACTER_VALUE(nodeStr),
          CHARACTER_VALUE(treeStr));

    tr_scale_subtree(tree, target, factor, with_leading);
  }

  scaled_str = tr_to_string(tree, 1);

  rv = NEW_CHARACTER(1);
  PROTECT(rv);
  SET_STRING_ELT(rv, 0, mkChar(scaled_str));
  UNPROTECT(1);
  return rv;
}
/* R-callable wrapper (SEXP in, SEXP out): builds a tree from treeStr with
 * rph_tree_new(), applies tr_name_ancestors() to it, and returns the result
 * of tr_to_string(tree, 1) as a length-1 character vector.
 */
SEXP rph_tree_name_ancestors(SEXP treeStr) {
  TreeNode *tree = rph_tree_new(treeStr);
  char *named_str;
  SEXP rv;

  tr_name_ancestors(tree);
  named_str = tr_to_string(tree, 1);

  rv = NEW_CHARACTER(1);
  PROTECT(rv);
  SET_STRING_ELT(rv, 0, mkChar(named_str));
  UNPROTECT(1);
  return rv;
}
/* R-callable wrapper (SEXP in, SEXP out) that extracts a subtree.
 *
 * treeStr  tree description handed to rph_tree_new().
 * nodeStr  name of the node passed to tr_prune_supertree().
 *
 * The node is looked up by name; if that fails the lookup is retried after
 * tr_name_ancestors(), and die() is called if it still cannot be found
 * (presumably does not return -- TODO confirm).  Returns a length-1
 * character vector holding tr_to_string(tree, 1) for the pruned tree.
 */
SEXP rph_tree_subtree(SEXP treeStr, SEXP nodeStr) {
  TreeNode *tree = rph_tree_new(treeStr);
  TreeNode *target = tr_get_node(tree, CHARACTER_VALUE(nodeStr));
  char *subtree_str;
  SEXP rv;

  if (target == NULL) {
    /* the requested name may only exist once ancestors have been named */
    tr_name_ancestors(tree);
    target = tr_get_node(tree, CHARACTER_VALUE(nodeStr));
  }
  if (target == NULL)
    die("No node named %s", CHARACTER_VALUE(nodeStr));

  /* tree is passed by address: the callee may replace the root pointer */
  tr_prune_supertree(&tree, target);
  subtree_str = tr_to_string(tree, 1);

  rv = NEW_CHARACTER(1);
  PROTECT(rv);
  SET_STRING_ELT(rv, 0, mkChar(subtree_str));
  UNPROTECT(1);
  return rv;
}
int main(int argc, char *argv[]) { char c; int opt_idx, node; FILE *out_f = NULL, *msa_f, *mod_f; char *out_root; TreeModel *mod; MSA *msa; char out_fname[STR_MED_LEN]; struct option long_opts[] = { {"refseq", 1, 0, 'r'}, {"msa-format", 1, 0, 'i'}, {"seqs", 1, 0, 's'}, {"exclude", 0, 0, 'x'}, {"no-probs", 0, 0, 'n'}, {"suff-stats", 0, 0, 'S'}, {"encode", 1, 0, 'e'}, {"keep-gaps", 0, 0, 'k'}, {"gibbs", 1, 0, 'G'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; /* arguments and defaults for options */ FILE *refseq_f = NULL; msa_format_type msa_format = UNKNOWN_FORMAT; int suff_stats = FALSE, exclude = FALSE, keep_gaps = FALSE, do_probs = TRUE; List *seqlist = NULL; PbsCode *code = NULL; int gibbs_nsamples = -1; while ((c = (char)getopt_long(argc, argv, "r:i:s:e:knxSh", long_opts, &opt_idx)) != -1) { switch (c) { case 'r': refseq_f = phast_fopen(optarg, "r"); break; case 'i': msa_format = msa_str_to_format(optarg); if (msa_format == UNKNOWN_FORMAT) die("ERROR: unrecognized alignment format.\n"); break; case 'S': suff_stats = TRUE; break; case 'e': code = pbs_new_from_file(phast_fopen(optarg, "r")); break; case 's': seqlist = get_arg_list(optarg); break; case 'x': exclude = TRUE; break; case 'n': do_probs = FALSE; break; case 'k': keep_gaps = TRUE; break; case 'G': gibbs_nsamples = get_arg_int_bounds(optarg, 1, INFTY); break; case 'h': printf("%s", HELP); exit(0); case '?': die("Bad argument. Try 'prequel -h'.\n"); } } if (optind != argc - 3) die("Three arguments required. 
Try 'prequel -h'.\n"); set_seed(-1); if (!do_probs && (suff_stats || code != NULL)) die("ERROR: --no-probs can't be used with --suff-stats or --encode.\n"); msa_f = phast_fopen(argv[optind], "r"); if (msa_format == UNKNOWN_FORMAT) msa_format = msa_format_for_content(msa_f, 1); fprintf(stderr, "Reading alignment from %s...\n", argv[optind]); if (msa_format == MAF) { msa = maf_read(msa_f, refseq_f, 1, NULL, NULL, NULL, -1, !suff_stats, NULL, NO_STRIP, FALSE); /* (no need to store order if suff_stats mode) */ } else msa = msa_new_from_file_define_format(msa_f, msa_format, NULL); if (msa->ss == NULL) { fprintf(stderr, "Extracting sufficient statistics...\n"); ss_from_msas(msa, 1, TRUE, NULL, NULL, NULL, -1, 0); } else if (msa->ss->tuple_idx == NULL && !suff_stats) die("ERROR: ordered representation of alignment required unless --suff-stats.\n"); mod_f = phast_fopen(argv[optind+1], "r"); out_root = argv[optind+2]; mod = tm_new_from_file(mod_f, 1); /* MH prune just like in phastcons */ int old_nnodes = mod->tree->nnodes; List *pruned_names = lst_new_ptr(msa->nseqs); tm_prune(mod, msa, pruned_names); if (lst_size(pruned_names) == (old_nnodes + 1) / 2) die("ERROR: no match for leaves of tree in alignment (leaf names must match alignment names).\n"); if (lst_size(pruned_names) > 0) { fprintf(stderr, "WARNING: pruned away leaves of tree with no match in alignment ("); int j; for (j = 0; j < lst_size(pruned_names); j++) fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, j))->chars, j < lst_size(pruned_names) - 1 ? 
", " : ").\n"); } lst_free_strings(pruned_names); tr_name_ancestors(mod->tree); if (mod->order != 0) die("ERROR: Only single nucleotide models are supported.\n"); if (mod->nratecats > 1) die("ERROR: Rate variation not supported.\n"); mod->tree_posteriors = tl_new_tree_posteriors(mod, msa, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE); fprintf(stderr, "Computing posterior probabilities...\n"); if (gibbs_nsamples > 0) die("ERROR: --gibbs not implemented yet."); /* gb_sample_ancestral_seqs(mod, msa, mod->tree_posteriors, gibbs_nsamples); */ else tl_compute_log_likelihood(mod, msa, NULL, NULL, -1, mod->tree_posteriors); fprintf(stderr, "Reconstructing indels by parsimony...\n"); do_indels(msa, mod); for (node = 0; node < mod->tree->nnodes; node++) { int i, j; TreeNode *n = lst_get_ptr(mod->tree->nodes, node); if (n->lchild == NULL || n->rchild == NULL) continue; if (seqlist != NULL) { int in_list = str_in_list_charstr(n->name, seqlist); if ((in_list && exclude) || (!in_list && !exclude)) continue; } fprintf(stderr, "Writing output for ancestral node '%s'...\n", n->name); if (suff_stats) { if (out_f == NULL) { sprintf(out_fname, "%s.stats", out_root); out_f = phast_fopen(out_fname, "w+"); fprintf(out_f, "#count\t"); for (j = 0; j < mod->rate_matrix->size; j++) fprintf(out_f, "p(%c)%c", mod->rate_matrix->states[j], j == mod->rate_matrix->size - 1 ? '\n' : '\t'); } for (i = 0; i < msa->ss->ntuples; i++) { if (mod->tree_posteriors->base_probs[0][0][node][i] == -1) continue; /* no base this node */ fprintf(out_f, "%.0f\t", msa->ss->counts[i]); for (j = 0; j < mod->rate_matrix->size; j++) { fprintf(out_f, "%f%c", mod->tree_posteriors->base_probs[0][j][node][i], j == mod->rate_matrix->size - 1 ? 
'\n' : '\t'); } } } else if (code == NULL && do_probs) { /* ordinary sequence-by-sequence output */ sprintf(out_fname, "%s.%s.probs", out_root, n->name); out_f = phast_fopen(out_fname, "w+"); fprintf(out_f, "#"); for (j = 0; j < mod->rate_matrix->size; j++) fprintf(out_f, "p(%c)%c", mod->rate_matrix->states[j], j == mod->rate_matrix->size - 1 ? '\n' : '\t'); for (i = 0; i < msa->length; i++) { if (mod->tree_posteriors->base_probs[0][0][node][msa->ss->tuple_idx[i]] == -1) { /* no base */ if (keep_gaps) fprintf(out_f, "-\n"); /* otherwise do nothing */ } else for (j = 0; j < mod->rate_matrix->size; j++) fprintf(out_f, "%f%c", mod->tree_posteriors->base_probs[0][j][node][msa->ss->tuple_idx[i]], j == mod->rate_matrix->size - 1 ? '\n' : '\t'); } phast_fclose(out_f); } else if (code == NULL && !do_probs) { /* write point estimates to FASTA file */ char *outseq = smalloc((msa->length + 1) * sizeof(char)); int len = 0; for (i = 0; i < msa->length; i++) { if (mod->tree_posteriors->base_probs[0][0][node][msa->ss->tuple_idx[i]] == -1) { /* no base */ if (keep_gaps) outseq[len++] = GAP_CHAR; /* otherwise do nothing */ } else { double maxprob = 0; int maxidx = -1; for (j = 0; j < mod->rate_matrix->size; j++) { if (mod->tree_posteriors->base_probs[0][j][node][msa->ss->tuple_idx[i]] > maxprob) { maxprob = mod->tree_posteriors->base_probs[0][j][node][msa->ss->tuple_idx[i]]; maxidx = j; } } outseq[len++] = mod->rate_matrix->states[maxidx]; } } outseq[len] = '\0'; /* print in FASTA format */ sprintf(out_fname, "%s.%s.fa", out_root, n->name); out_f = phast_fopen(out_fname, "w+"); print_seq_fasta(out_f, outseq, n->name, len); phast_fclose(out_f); sfree(outseq); } else { /* encoded sequence-by-sequence output */ double error, tot_error = 0; int ngaps = 0; Vector *v; unsigned *encoded; /* first encode tuple by tuple */ v = vec_new(mod->rate_matrix->size); encoded = smalloc(msa->ss->ntuples * sizeof(unsigned)); for (i = 0; i < msa->ss->ntuples; i++) { if 
(mod->tree_posteriors->base_probs[0][0][node][i] == -1) { encoded[i] = code->gap_code; ngaps += msa->ss->counts[i]; } else { for (j = 0; j < mod->rate_matrix->size; j++) vec_set(v, j, mod->tree_posteriors->base_probs[0][j][node][i]); encoded[i] = pbs_get_index(code, v, &error); tot_error += error * msa->ss->counts[i]; } } vec_free(v); /* now write site by site */ sprintf(out_fname, "%s.%s.bin", out_root, n->name); out_f = phast_fopen(out_fname, "w+"); for (i = 0; i < msa->length; i++) { if (keep_gaps || encoded[msa->ss->tuple_idx[i]] != code->gap_code) pbs_write_binary(code, encoded[msa->ss->tuple_idx[i]], out_f); } fprintf(stderr, "Average approximation error ('%s'): %f bits\n", n->name, tot_error/(msa->length - ngaps)); sfree(encoded); } } fprintf(stderr, "Done.\n"); return 0; }
int main(int argc, char *argv[]) { List *pruned_names = lst_new_ptr(5); TreeModel *source_mod; MSA *msa = NULL, *out_msa; IndelHistory *ih; char *read_hist_fname = NULL; char c; int opt_idx, old_nnodes, i; msa_format_type msa_format = UNKNOWN_FORMAT; int output_alignment = FALSE, ia_names = FALSE; struct option long_opts[] = { {"msa-format", 1, 0, 'i'}, {"output-alignment", 0, 0, 'A'}, {"read-history", 1, 0, 'H'}, {"ia-names", 0, 0, 'I'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; while ((c = getopt_long(argc, argv, "i:H:AIh", long_opts, &opt_idx)) != -1) { switch (c) { case 'i': msa_format = msa_str_to_format(optarg); if (msa_format == -1) die("ERROR: unrecognized alignment format.\n"); break; case 'A': output_alignment = TRUE; break; case 'H': read_hist_fname = optarg; break; case 'I': ia_names = TRUE; break; case 'h': printf("%s", HELP); exit(0); case '?': die("Bad argument. Try 'indelHistory -h'.\n"); } } set_seed(-1); if (read_hist_fname != NULL) { fprintf(stderr, "Reading indel history from %s...\n", read_hist_fname); ih = ih_new_from_file(phast_fopen(read_hist_fname, "r")); } else { FILE *mfile; if (optind != argc - 2) die("Two arguments required. 
Try 'indelHistory -h'.\n"); fprintf(stderr, "Reading alignment from %s...\n", argv[optind]); mfile = phast_fopen(argv[optind], "r"); if (msa_format == UNKNOWN_FORMAT) msa_format = msa_format_for_content(mfile, 1); msa = msa_new_from_file_define_format(mfile, msa_format, "ACGTNB^.-"); phast_fclose(mfile); if (msa->seqs == NULL && (msa->ss == NULL || msa->ss->tuple_idx == NULL)) die("ERROR: ordered representation of alignment required.\n"); fprintf(stderr, "Reading tree from %s...\n", argv[optind+1]); source_mod = tm_new_from_file(phast_fopen(argv[optind+1], "r"), 1); /* prune tree, if necessary */ old_nnodes = source_mod->tree->nnodes; tm_prune(source_mod, msa, pruned_names); if (lst_size(pruned_names) == (old_nnodes + 1) / 2) die("ERROR: no match for leaves of tree in alignment (leaf names must match alignment names).\n"); if (lst_size(pruned_names) > 0) { fprintf(stderr, "WARNING: pruned away leaves of tree with no match in alignment ("); for (i = 0; i < lst_size(pruned_names); i++) fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, i))->chars, i < lst_size(pruned_names) - 1 ? ", " : ").\n"); } lst_free(pruned_names); tr_name_ancestors(source_mod->tree); if (msa->nseqs > (source_mod->tree->nnodes + 1) / 2) { /* assume ancestral seqs specified in this case */ if (ia_names) { fprintf(stderr, "Converting sequence names...\n"); ih_convert_ia_names(msa, source_mod->tree); } fprintf(stderr, "Extracting indel history from alignment...\n"); ih = ih_extract_from_alignment(msa, source_mod->tree); } else { /* infer by parsimony */ if (msa->ss == NULL) { fprintf(stderr, "Extracting sufficient statistics...\n"); ss_from_msas(msa, 1, TRUE, NULL, NULL, NULL, -1, 0); } fprintf(stderr, "Inferring indel history by parsimony...\n"); ih = ih_reconstruct(msa, source_mod->tree); } } if (output_alignment) { out_msa = ih_as_alignment(ih, msa); msa_print(stdout, out_msa, FASTA, FALSE); } else ih_print(ih, stdout, read_hist_fname != NULL ? 
read_hist_fname : argv[optind], "indelHistory"); fprintf(stderr, "Done.\n"); return 0; }
int main(int argc, char *argv[]) { char c; char *msa_fname = NULL; int opt_idx, i, old_nnodes; MSA *msa; List *pruned_names = lst_new_ptr(5), *tmpl; BDPhyloHmm *bdphmm; GFF_Set *predictions; int found = FALSE; List *ignore_types = lst_new_ptr(1); struct option long_opts[] = { {"refseq", 1, 0, 'M'}, {"msa-format", 1, 0, 'i'}, {"refidx", 1, 0, 'r'}, {"rho", 1, 0, 'R'}, {"phi", 1, 0, 'p'}, {"transitions", 1, 0, 't'}, {"expected-length", 1, 0, 'E'}, {"target-coverage", 1, 0, 'C'}, {"seqname", 1, 0, 'N'}, {"idpref", 1, 0, 'P'}, {"indel-model", 1, 0, 'I'}, {"indel-history", 1, 0, 'H'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; /* arguments and defaults for options */ FILE *refseq_f = NULL, *msa_f = NULL; msa_format_type msa_format = UNKNOWN_FORMAT; TreeModel *source_mod; double rho = DEFAULT_RHO, mu = DEFAULT_MU, nu = DEFAULT_NU, phi = DEFAULT_PHI, gamma = -1, omega = -1, alpha_c = -1, beta_c = -1, tau_c = -1, alpha_n = -1, beta_n = -1, tau_n = -1; int set_transitions = FALSE, refidx = 1, estim_phi = TRUE, estim_gamma = TRUE, estim_omega = TRUE; char *seqname = NULL, *idpref = NULL; IndelHistory *ih = NULL; while ((c = getopt_long(argc, argv, "R:t:p:E:C:r:M:i:N:P:I:H:h", long_opts, &opt_idx)) != -1) { switch (c) { case 'R': rho = get_arg_dbl_bounds(optarg, 0, 1); break; case 't': if (optarg[0] != '~') estim_gamma = estim_omega = FALSE; else optarg = &optarg[1]; set_transitions = TRUE; tmpl = get_arg_list_dbl(optarg); if (lst_size(tmpl) != 2) die("ERROR: bad argument to --transitions.\n"); mu = lst_get_dbl(tmpl, 0); nu = lst_get_dbl(tmpl, 1); if (mu <= 0 || mu >= 1 || nu <= 0 || nu >= 1) die("ERROR: bad argument to --transitions.\n"); lst_free(tmpl); break; case 'p': if (optarg[0] != '~') estim_phi = FALSE; else optarg = &optarg[1]; phi = get_arg_dbl_bounds(optarg, 0, 1); break; case 'E': if (optarg[0] != '~') estim_omega = FALSE; else optarg = &optarg[1]; omega = get_arg_dbl_bounds(optarg, 1, INFTY); mu = 1/omega; break; case 'C': if (optarg[0] != '~') estim_gamma = FALSE; 
else optarg = &optarg[1]; gamma = get_arg_dbl_bounds(optarg, 0, 1); break; case 'r': refidx = get_arg_int_bounds(optarg, 0, INFTY); break; case 'M': refseq_f = phast_fopen(optarg, "r"); break; case 'i': msa_format = msa_str_to_format(optarg); if (msa_format == UNKNOWN_FORMAT) die("ERROR: unrecognized alignment format.\n"); break; case 'N': seqname = optarg; break; case 'P': idpref = optarg; break; case 'I': tmpl = get_arg_list_dbl(optarg); if (lst_size(tmpl) != 3 && lst_size(tmpl) != 6) die("ERROR: bad argument to --indel-model.\n"); alpha_n = lst_get_dbl(tmpl, 0); beta_n = lst_get_dbl(tmpl, 1); tau_n = lst_get_dbl(tmpl, 2); if (lst_size(tmpl) == 6) { alpha_c = lst_get_dbl(tmpl, 3); beta_c = lst_get_dbl(tmpl, 4); tau_c = lst_get_dbl(tmpl, 5); } else { alpha_c = alpha_n; beta_c = beta_n; tau_c = tau_n; } if (alpha_c <= 0 || alpha_c >= 1 || beta_c <= 0 || beta_c >= 1 || tau_c <= 0 || tau_c >= 1 || alpha_n <= 0 || alpha_n >= 1 || beta_n <= 0 || beta_n >= 1 || tau_n <= 0 || tau_n >= 1) die("ERROR: bad argument to --indel-model.\n"); break; case 'H': fprintf(stderr, "Reading indel history from %s...\n", optarg); ih = ih_new_from_file(phast_fopen(optarg, "r")); break; case 'h': printf("%s", HELP); exit(0); case '?': die("Bad argument. Try 'dless -h'.\n"); } } if (optind != argc - 1) die("Missing alignment file or model file. 
Try 'dless -h'.\n"); if (set_transitions && (gamma != -1 || omega != -1)) die("ERROR: --transitions and --target-coverage/--expected-length cannot be used together.\n"); if ((gamma != -1 && omega == -1) || (gamma == -1 && omega != -1)) die("ERROR: --target-coverage and --expecteed-length must be used together.\n"); set_seed(-1); if (gamma != -1) nu = gamma/(1-gamma) * mu; fprintf(stderr, "Reading tree model from %s...\n", argv[optind]); source_mod = tm_new_from_file(phast_fopen(argv[optind], "r"), 1); if (source_mod->nratecats > 1) die("ERROR: rate variation not currently supported.\n"); if (source_mod->order > 0) die("ERROR: only single nucleotide models are currently supported.\n"); if (!tm_is_reversible(source_mod)) phast_warning("WARNING: p-value computation assumes reversibility and your model is non-reversible.\n"); /* read alignment */ msa_f = phast_fopen(argv[optind], "r"); fprintf(stderr, "Reading alignment from %s...\n", argv[optind]); if (msa_format == UNKNOWN_FORMAT) msa_format = msa_format_for_content(msa_f, 1); if (msa_format == MAF) { msa = maf_read(msa_f, refseq_f, 1, NULL, NULL, NULL, -1, TRUE, NULL, NO_STRIP, FALSE); } else msa = msa_new_from_file_define_format(msa_f, msa_format, NULL); if (msa_alph_has_lowercase(msa)) msa_toupper(msa); msa_remove_N_from_alph(msa); if (msa->ss == NULL) { fprintf(stderr, "Extracting sufficient statistics...\n"); ss_from_msas(msa, 1, TRUE, NULL, NULL, NULL, -1, 0); } else if (msa->ss->tuple_idx == NULL) die("ERROR: ordered representation of alignment required unless --suff-stats.\n"); /* prune tree, if necessary */ old_nnodes = source_mod->tree->nnodes; tm_prune(source_mod, msa, pruned_names); if (lst_size(pruned_names) == (old_nnodes + 1) / 2) die("ERROR: no match for leaves of tree in alignment (leaf names must match alignment names).\n"); if (lst_size(pruned_names) > 0) { fprintf(stderr, "WARNING: pruned away leaves of tree with no match in alignment ("); for (i = 0; i < lst_size(pruned_names); i++) 
fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, i))->chars, i < lst_size(pruned_names) - 1 ? ", " : ").\n"); } /* this has to be done after pruning tree */ tr_name_ancestors(source_mod->tree); /* also make sure match for reference sequence in tree */ if (refidx > 0) { for (i = 0, found = FALSE; !found && i < source_mod->tree->nnodes; i++) { TreeNode *n = lst_get_ptr(source_mod->tree->nodes, i); if (!strcmp(n->name, msa->names[refidx-1])) found = TRUE; } if (!found) die("ERROR: no match for reference sequence in tree.\n"); } /* checks for indel model */ if (alpha_c > 0) { if (ih == NULL) { fprintf(stderr, "Reconstructing indel history by parsimony...\n"); ih = ih_reconstruct(msa, source_mod->tree); } else { if (ih->ncols != msa->length) die("ERROR: indel history doesn't seem to match alignment.\n"); if (ih->tree->nnodes != source_mod->tree->nnodes) die("ERROR: indel history doesn't seem to match tree model.\n"); } } bdphmm = bd_new(source_mod, rho, mu, nu, phi, alpha_c, beta_c, tau_c, alpha_n, beta_n, tau_n, estim_gamma, estim_omega, estim_phi); /* compute emissions */ phmm_compute_emissions(bdphmm->phmm, msa, FALSE); /* add emissions for indel model, if necessary */ if (alpha_c > 0) { fprintf(stderr, "Adjusting emissions for indels...\n"); bd_add_indel_emissions(bdphmm, ih); } /* postprocess for missing data (requires special handling) */ fprintf(stderr, "Adjusting emissions for missing data...\n"); bd_handle_missing_data(bdphmm, msa); if (estim_gamma || estim_omega || estim_phi) { fprintf(stderr, "Estimating free parameters...\n"); bd_estimate_transitions(bdphmm, msa); } /* set seqname and idpref, if necessary */ if (seqname == NULL || idpref == NULL) { /* derive default from file name root */ String *tmp = str_new_charstr(msa_fname); if (!str_equals_charstr(tmp, "-")) { str_remove_path(tmp); str_root(tmp, '.'); if (idpref == NULL) idpref = copy_charstr(tmp->chars); str_root(tmp, '.'); /* apply one more time for double suffix */ if (seqname == NULL) 
seqname = tmp->chars; } else if (seqname == NULL) seqname = "refseq"; } /* obtain predictions */ fprintf(stderr, "Running Viterbi algorithm...\n"); predictions = phmm_predict_viterbi(bdphmm->phmm, seqname, NULL, idpref, NULL); lst_push_ptr(ignore_types, str_new_charstr("nonconserved")); gff_filter_by_type(predictions, ignore_types, TRUE, NULL); /* score predictions */ fprintf(stderr, "Scoring predictions...\n"); bd_score_predictions(bdphmm, predictions); /* can free emissions now */ for (i = 0; i < bdphmm->phmm->hmm->nstates; i++) sfree(bdphmm->phmm->emissions[i]); sfree(bdphmm->phmm->emissions); bdphmm->phmm->emissions = NULL; /* convert GFF to coord frame of reference sequence and adjust coords by idx_offset, if necessary */ if (refidx != 0 || msa->idx_offset != 0) msa_map_gff_coords(msa, predictions, 0, refidx, msa->idx_offset); if (refidx != 0) gff_flatten(predictions); /* necessary because coord conversion might create overlapping features (can happen in deletions in reference sequence) */ /* now output predictions */ fprintf(stderr, "Writing GFF to stdout...\n"); gff_print_set(stdout, predictions); fprintf(stderr, "Done.\n"); return 0; }
int main(int argc, char *argv[]) { /* variables for options, with defaults */ TreeNode *tree = NULL, *merge_tree = NULL, *extrapolate_tree = NULL; Hashtable *rename_hash = NULL; double scale_factor = 1; List *prune_names = NULL, *label = NULL, *labelType = NULL; int prune_all_but = FALSE, tree_only = FALSE, dissect = FALSE, name_ancestors = FALSE, with_branch = FALSE, print_branchlen=FALSE, inNewick=FALSE, no_branchlen = FALSE, print_distance_to_root = FALSE; TreeModel *mod = NULL, *merge_mod = NULL; char *reroot_name = NULL, *subtree_name =NULL, *get_subtree_name = NULL, *node_distance_name = NULL; /* other variables */ String *suffix, *optstr; char c; int i, opt_idx; TreeNode *n; struct option long_opts[] = { {"scale", 1, 0, 's'}, {"extrapolate", 1, 0, 'e'}, {"prune", 1, 0, 'p'}, {"prune-all-but", 1, 0, 'P'}, {"get-subtree", 1, 0, 'g'}, {"merge", 1, 0, 'm'}, {"rename", 1, 0, 'r'}, {"tree-only", 0, 0, 't'}, {"no-branchlen", 0, 0, 'N'}, {"dissect", 0, 0, 'd'}, {"name-ancestors", 0, 0, 'a'}, {"reroot", 1, 0, 'R'}, {"with-branch", 1, 0, 'B'}, {"subtree", 1, 0, 'S'}, {"branchlen", 0, 0, 'b'}, {"newick", 0, 0, 'n'}, {"label-subtree", 1, 0, 'L'}, {"label-branches", 1, 0, 'l'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; while ((c = getopt_long(argc, argv, "s:p:P:g:m:r:R:B:S:D:l:L:adtNbnh", long_opts, &opt_idx)) != -1) { switch (c) { case 's': scale_factor = get_arg_dbl_bounds(optarg, 0, INFTY); break; case 'e': if (!strcmp(optarg, "default")) { optarg = smalloc(1000 * sizeof(char)); #if defined(__MINGW32__) sprintf(optarg, "%s\\data\\exoniphy\\mammals\\cftr25_hybrid.nh", PHAST_HOME); #else sprintf(optarg, "%s/data/exoniphy/mammals/cftr25_hybrid.nh", PHAST_HOME); #endif } extrapolate_tree = tr_new_from_file(phast_fopen(optarg, "r")); break; case 'p': prune_names = get_arg_list(optarg); break; case 'P': prune_names = get_arg_list(optarg); prune_all_but = TRUE; break; case 'g': get_subtree_name = optarg; break; case 'm': suffix = str_new_charstr(optarg); str_suffix(suffix, '.'); 
if (str_equals_charstr(suffix, "nh")) merge_tree = tr_new_from_file(phast_fopen(optarg, "r")); else { merge_mod = tm_new_from_file(phast_fopen(optarg, "r"), 1); merge_tree = merge_mod->tree; } break; case 'r': rename_hash = make_name_hash(optarg); break; case 't': tree_only = TRUE; break; case 'N': no_branchlen = TRUE; tree_only = TRUE; break; case 'd': dissect = TRUE; break; case 'b': print_branchlen = TRUE; break; case 'D': print_distance_to_root = TRUE; node_distance_name = optarg; break; case 'R': reroot_name = optarg; break; case 'B': with_branch = TRUE; break; case 'a': name_ancestors = TRUE; break; case 'S': subtree_name = optarg; break; case 'n': inNewick=TRUE; break; case 'L': //do the same for --label--subtree and --label-branches case 'l': if (label == NULL) { label = lst_new_ptr(1); labelType = lst_new_int(1); } optstr = str_new_charstr(optarg); lst_push_ptr(label, optstr); lst_push_int(labelType, (int)c); break; case 'h': usage(argv[0]); case '?': die("Bad argument. Try '%s -h'.\n", argv[0]); } } if (optind != argc - 1) die("Input filename required. 
Try '%s -h'.\n", argv[0]); if (merge_tree != NULL && extrapolate_tree != NULL) die("ERROR: Can't use --merge and --extrapolate together"); set_seed(-1); suffix = str_new_charstr(argv[optind]); str_suffix(suffix, '.'); if (inNewick || str_equals_charstr(suffix, "nh")) { tree = tr_new_from_file(phast_fopen(argv[optind], "r")); tree_only = TRUE; /* can't output tree model in this case */ } else { mod = tm_new_from_file(phast_fopen(argv[optind], "r"), 1); tree = mod->tree; } if (prune_names != NULL) { tr_prune(&tree, prune_names, prune_all_but, NULL); if (mod != NULL) mod->tree = tree; /* root may have changed */ } if (get_subtree_name != NULL) { n = tr_get_node(tree, get_subtree_name); if (n == NULL) { tr_name_ancestors(tree); n = tr_get_node(tree, get_subtree_name); if (n == NULL) { die("ERROR: no node named '%s'.\n", subtree_name); } } tr_prune_supertree(&tree, n); if (mod != NULL) mod->tree = tree; } if (merge_tree != NULL) { tree = tr_hybrid(tree, merge_tree); if (mod != NULL) mod->tree = tree; } else if (extrapolate_tree != NULL) { tr_scale_by_subtree(extrapolate_tree, tree); tree = extrapolate_tree; if (mod != NULL) mod->tree = tree; } if (scale_factor != 1) { if (subtree_name == NULL) tr_scale(tree, scale_factor); else { n = tr_get_node(tree, subtree_name); if (n == NULL) die("ERROR: no node named '%s'.\n", subtree_name); tr_scale_subtree(tree, n, scale_factor, with_branch); } } if (name_ancestors) tr_name_ancestors(tree); if (rename_hash != NULL) { char *newname; for (i = 0; i < tree->nnodes; i++) { n = lst_get_ptr(tree->nodes, i); if (n->name != NULL && n->name[0] != '\0' && (newname = hsh_get(rename_hash, n->name)) != (char*)-1) { strcpy(n->name, newname); } } } if (reroot_name != NULL) { n = tr_get_node(tree, reroot_name); if (n == NULL) die("ERROR: no node named '%s'.\n", reroot_name); tr_reroot(tree, n, with_branch); if (mod != NULL) mod->tree = with_branch ? n->parent : n; tree = with_branch ? 
n->parent : n; } if (label != NULL) { for (i=0; i < lst_size(label); i++) { String *currstr = (String*)lst_get_ptr(label, i), *arg1, *labelVal; List *tmplst = lst_new_ptr(10); String *nodename; int j; str_split(currstr, ":", tmplst); if (lst_size(tmplst) != 2) die("ERROR: bad argument to --label-branches or --label-subtree.\n"); arg1 = lst_get_ptr(tmplst, 0); labelVal = lst_get_ptr(tmplst, 1); lst_clear(tmplst); if (lst_get_int(labelType, i) == (int)'l') { str_split(arg1, ",", tmplst); for (j=0; j < lst_size(tmplst); j++) { nodename = (String*)lst_get_ptr(tmplst, j); tr_label_node(tree, nodename->chars, labelVal->chars); } lst_free_strings(tmplst); } else if (lst_get_int(labelType, i) == (int)'L') { int include_leading_branch = FALSE; TreeNode *node; nodename = arg1; node = tr_get_node(tree, nodename->chars); if (node == NULL && nodename->chars[nodename->length-1] == '+') { nodename->chars[--nodename->length] = '\0'; node = tr_get_node(tree, nodename->chars); include_leading_branch = TRUE; } tr_label_subtree(tree, nodename->chars, include_leading_branch, labelVal->chars); } else die("ERROR got label_type %c\n", lst_get_int(labelType, (char)i)); str_free(arg1); str_free(labelVal); lst_free(tmplst); str_free(currstr); } lst_free(label); lst_free(labelType); } if (dissect) tr_print_nodes(stdout, tree); if (print_branchlen) printf("TOTAL_TREE_LEN: %f\n", tr_total_len(tree)); if (print_distance_to_root) { TreeNode *node = tr_get_node(tree, node_distance_name); if (node == NULL) die("ERROR: no node named '%s'.\n", node_distance_name); printf("length(root-%s): %f\n", node_distance_name, tr_distance_to_root(node)); } if (dissect==0 && print_branchlen==0 && print_distance_to_root==0) { if (tree_only) tr_print(stdout, tree, no_branchlen==FALSE); else tm_print(stdout, mod); } return 0; }