int main(int argc, char *argv[]) { char c; int i, j, t, opt_idx, ntrees, nleaves = -1; TreeNode *n, *node_i, *node_j, *lca, *nametree = NULL; TreeNode **tree; List *leaves, ***distance, *tree_fnames, *tot_dist; int mod = FALSE; char **leaf_name; String *trees_arg; FILE *F; struct option long_opts[] = { {"mod", 0, 0, 'm'}, {"tree", 1, 0, 't'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; while ((c = getopt_long(argc, argv, "mt:h", long_opts, &opt_idx)) != -1) { switch (c) { case 'm': mod = TRUE; break; case 't': if (optarg[0] == '(') nametree = tr_new_from_string(optarg); else nametree = tr_new_from_file(phast_fopen(optarg, "r")); break; case 'h': usage(argv[0]); case '?': die("Bad argument. Try '%s -h'.\n", argv[0]); } } if (optind > argc - 1) die("Input filename required. Try '%s -h'.\n", argv[0]); set_seed(-1); /* build a comma-delimited list and pass to get_arg_list; allows possibility of reading from file via '*' operator */ trees_arg = str_new(1000); for (i = optind; i < argc; i++) { str_append_charstr(trees_arg, argv[i]); if (i < argc - 1) str_append_char(trees_arg, ','); } tree_fnames = get_arg_list(trees_arg->chars); ntrees = lst_size(tree_fnames); tree = smalloc(ntrees * sizeof(void*)); /* read trees */ for (t = 0; t < ntrees; t++) { String *fname = lst_get_ptr(tree_fnames, t); if (mod) { TreeModel *m = tm_new_from_file(F = phast_fopen(fname->chars, "r"), 1); tree[t] = tr_create_copy(m->tree); tm_free(m); phast_fclose(F); } else tree[t] = tr_new_from_file(phast_fopen(fname->chars, "r")); } /* initialization */ nleaves = (tree[0]->nnodes + 1)/2; leaves = lst_new_ptr(nleaves); distance = smalloc(nleaves * sizeof(void*)); leaf_name = smalloc(nleaves * sizeof(void*)); for (i = 0; i < nleaves; i++) { distance[i] = smalloc(nleaves * sizeof(void*)); for (j = i+1; j < nleaves; j++) distance[i][j] = lst_new_dbl(ntrees); } if (nametree == NULL) nametree = tree[0]; for (i = 0, j = 0; i < lst_size(nametree->nodes); i++) { n = lst_get_ptr(nametree->nodes, i); if (n->lchild == NULL && n->rchild == NULL) leaf_name[j++] = n->name; } tot_dist = lst_new_dbl(ntrees); /* now compute distances */ for (t = 0; t < ntrees; t++) { /* obtain list of leaves */ lst_clear(leaves); for (i = 0; i < lst_size(tree[t]->nodes); i++) { n = lst_get_ptr(tree[t]->nodes, i); if (n->lchild == NULL && n->rchild == NULL) lst_push_ptr(leaves, n); } if (lst_size(leaves) != nleaves) die("ERROR: trees have different numbers of leaves.\n"); /* look at all pairs */ for (i = 0; i < nleaves; i++) { node_i = lst_get_ptr(leaves, i); for (j = i+1; j < nleaves; j++) { double dist = 0; node_j = lst_get_ptr(leaves, j); /* because ids are assigned in pre-order, the first ancestor of node j that has an id less than i is the LCA of i and j; we seek the sum of distances from both i and j to this node */ for (n = node_j; n->id >= node_i->id; n = n->parent) dist += n->dparent; lca = n; for (n = node_i; n != lca; n = n->parent) dist += n->dparent; lst_push_dbl(distance[i][j], dist); } } lst_push_dbl(tot_dist, tr_total_len(tree[t])); } /* print distances and (optionally) stats */ if (ntrees == 1) { for (i = 0; i < nleaves; i++) { for (j = i+1; j < nleaves; j++) { printf ("%s\t%s\t%f\n", leaf_name[i], leaf_name[j], lst_get_dbl(distance[i][j], 0)); } } printf ("%s\t%s\t%f\n", "(total)", "-", lst_get_dbl(tot_dist, 0)); } else { double mean, stdev; double quantiles[] = {0, 0.025, 0.05, 0.5, 0.95, 0.975, 1}; double quantile_vals[7]; printf("%-15s %-15s %9s %9s %9s %9s %9s %9s %9s %9s %9s\n", "leaf1", "leaf2", "mean", "stdev", "median", "min", "max", "95%_min", "95%_max", "90%_min", "90%_max"); for (i = 0; i < nleaves; i++) { for (j = i+1; j < nleaves; j++) { mean = lst_dbl_mean(distance[i][j]); stdev = lst_dbl_stdev(distance[i][j]); lst_qsort_dbl(distance[i][j], ASCENDING); lst_dbl_quantiles(distance[i][j], quantiles, 7, quantile_vals); printf("%-15s %-15s %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f\n", leaf_name[i], leaf_name[j], mean, stdev, quantile_vals[3], quantile_vals[0], quantile_vals[6], quantile_vals[1], quantile_vals[5], quantile_vals[2], quantile_vals[4]); } } /* also do total branch len */ mean = lst_dbl_mean(tot_dist); stdev = lst_dbl_stdev(tot_dist); lst_qsort_dbl(tot_dist, ASCENDING); lst_dbl_quantiles(tot_dist, quantiles, 7, quantile_vals); printf("%-15s %-15s %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f\n", "(total)", "-", mean, stdev, quantile_vals[3], quantile_vals[0], quantile_vals[6], quantile_vals[1], quantile_vals[5], quantile_vals[2], quantile_vals[4]); } return 0; }
int main(int argc, char *argv[]) { char c; char *msa_fname = NULL; int opt_idx, i, old_nnodes; MSA *msa; List *pruned_names = lst_new_ptr(5), *tmpl; BDPhyloHmm *bdphmm; GFF_Set *predictions; int found = FALSE; List *ignore_types = lst_new_ptr(1); struct option long_opts[] = { {"refseq", 1, 0, 'M'}, {"msa-format", 1, 0, 'i'}, {"refidx", 1, 0, 'r'}, {"rho", 1, 0, 'R'}, {"phi", 1, 0, 'p'}, {"transitions", 1, 0, 't'}, {"expected-length", 1, 0, 'E'}, {"target-coverage", 1, 0, 'C'}, {"seqname", 1, 0, 'N'}, {"idpref", 1, 0, 'P'}, {"indel-model", 1, 0, 'I'}, {"indel-history", 1, 0, 'H'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; /* arguments and defaults for options */ FILE *refseq_f = NULL, *msa_f = NULL; msa_format_type msa_format = UNKNOWN_FORMAT; TreeModel *source_mod; double rho = DEFAULT_RHO, mu = DEFAULT_MU, nu = DEFAULT_NU, phi = DEFAULT_PHI, gamma = -1, omega = -1, alpha_c = -1, beta_c = -1, tau_c = -1, alpha_n = -1, beta_n = -1, tau_n = -1; int set_transitions = FALSE, refidx = 1, estim_phi = TRUE, estim_gamma = TRUE, estim_omega = TRUE; char *seqname = NULL, *idpref = NULL; IndelHistory *ih = NULL; while ((c = getopt_long(argc, argv, "R:t:p:E:C:r:M:i:N:P:I:H:h", long_opts, &opt_idx)) != -1) { switch (c) { case 'R': rho = get_arg_dbl_bounds(optarg, 0, 1); break; case 't': if (optarg[0] != '~') estim_gamma = estim_omega = FALSE; else optarg = &optarg[1]; set_transitions = TRUE; tmpl = get_arg_list_dbl(optarg); if (lst_size(tmpl) != 2) die("ERROR: bad argument to --transitions.\n"); mu = lst_get_dbl(tmpl, 0); nu = lst_get_dbl(tmpl, 1); if (mu <= 0 || mu >= 1 || nu <= 0 || nu >= 1) die("ERROR: bad argument to --transitions.\n"); lst_free(tmpl); break; case 'p': if (optarg[0] != '~') estim_phi = FALSE; else optarg = &optarg[1]; phi = get_arg_dbl_bounds(optarg, 0, 1); break; case 'E': if (optarg[0] != '~') estim_omega = FALSE; else optarg = &optarg[1]; omega = get_arg_dbl_bounds(optarg, 1, INFTY); mu = 1/omega; break; case 'C': if (optarg[0] != '~') estim_gamma = FALSE; else optarg = &optarg[1]; gamma = get_arg_dbl_bounds(optarg, 0, 1); break; case 'r': refidx = get_arg_int_bounds(optarg, 0, INFTY); break; case 'M': refseq_f = phast_fopen(optarg, "r"); break; case 'i': msa_format = msa_str_to_format(optarg); if (msa_format == UNKNOWN_FORMAT) die("ERROR: unrecognized alignment format.\n"); break; case 'N': seqname = optarg; break; case 'P': idpref = optarg; break; case 'I': tmpl = get_arg_list_dbl(optarg); if (lst_size(tmpl) != 3 && lst_size(tmpl) != 6) die("ERROR: bad argument to --indel-model.\n"); alpha_n = lst_get_dbl(tmpl, 0); beta_n = lst_get_dbl(tmpl, 1); tau_n = lst_get_dbl(tmpl, 2); if (lst_size(tmpl) == 6) { alpha_c = lst_get_dbl(tmpl, 3); beta_c = lst_get_dbl(tmpl, 4); tau_c = lst_get_dbl(tmpl, 5); } else { alpha_c = alpha_n; beta_c = beta_n; tau_c = tau_n; } if (alpha_c <= 0 || alpha_c >= 1 || beta_c <= 0 || beta_c >= 1 || tau_c <= 0 || tau_c >= 1 || alpha_n <= 0 || alpha_n >= 1 || beta_n <= 0 || beta_n >= 1 || tau_n <= 0 || tau_n >= 1) die("ERROR: bad argument to --indel-model.\n"); break; case 'H': fprintf(stderr, "Reading indel history from %s...\n", optarg); ih = ih_new_from_file(phast_fopen(optarg, "r")); break; case 'h': printf("%s", HELP); exit(0); case '?': die("Bad argument. Try 'dless -h'.\n"); } } if (optind != argc - 1) die("Missing alignment file or model file. Try 'dless -h'.\n"); if (set_transitions && (gamma != -1 || omega != -1)) die("ERROR: --transitions and --target-coverage/--expected-length cannot be used together.\n"); if ((gamma != -1 && omega == -1) || (gamma == -1 && omega != -1)) die("ERROR: --target-coverage and --expecteed-length must be used together.\n"); set_seed(-1); if (gamma != -1) nu = gamma/(1-gamma) * mu; fprintf(stderr, "Reading tree model from %s...\n", argv[optind]); source_mod = tm_new_from_file(phast_fopen(argv[optind], "r"), 1); if (source_mod->nratecats > 1) die("ERROR: rate variation not currently supported.\n"); if (source_mod->order > 0) die("ERROR: only single nucleotide models are currently supported.\n"); if (!tm_is_reversible(source_mod)) phast_warning("WARNING: p-value computation assumes reversibility and your model is non-reversible.\n"); /* read alignment */ msa_f = phast_fopen(argv[optind], "r"); fprintf(stderr, "Reading alignment from %s...\n", argv[optind]); if (msa_format == UNKNOWN_FORMAT) msa_format = msa_format_for_content(msa_f, 1); if (msa_format == MAF) { msa = maf_read(msa_f, refseq_f, 1, NULL, NULL, NULL, -1, TRUE, NULL, NO_STRIP, FALSE); } else msa = msa_new_from_file_define_format(msa_f, msa_format, NULL); if (msa_alph_has_lowercase(msa)) msa_toupper(msa); msa_remove_N_from_alph(msa); if (msa->ss == NULL) { fprintf(stderr, "Extracting sufficient statistics...\n"); ss_from_msas(msa, 1, TRUE, NULL, NULL, NULL, -1, 0); } else if (msa->ss->tuple_idx == NULL) die("ERROR: ordered representation of alignment required unless --suff-stats.\n"); /* prune tree, if necessary */ old_nnodes = source_mod->tree->nnodes; tm_prune(source_mod, msa, pruned_names); if (lst_size(pruned_names) == (old_nnodes + 1) / 2) die("ERROR: no match for leaves of tree in alignment (leaf names must match alignment names).\n"); if (lst_size(pruned_names) > 0) { fprintf(stderr, "WARNING: pruned away leaves of tree with no match in alignment ("); for (i = 0; i < lst_size(pruned_names); i++) fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, i))->chars, i < lst_size(pruned_names) - 1 ? ", " : ").\n"); } /* this has to be done after pruning tree */ tr_name_ancestors(source_mod->tree); /* also make sure match for reference sequence in tree */ if (refidx > 0) { for (i = 0, found = FALSE; !found && i < source_mod->tree->nnodes; i++) { TreeNode *n = lst_get_ptr(source_mod->tree->nodes, i); if (!strcmp(n->name, msa->names[refidx-1])) found = TRUE; } if (!found) die("ERROR: no match for reference sequence in tree.\n"); } /* checks for indel model */ if (alpha_c > 0) { if (ih == NULL) { fprintf(stderr, "Reconstructing indel history by parsimony...\n"); ih = ih_reconstruct(msa, source_mod->tree); } else { if (ih->ncols != msa->length) die("ERROR: indel history doesn't seem to match alignment.\n"); if (ih->tree->nnodes != source_mod->tree->nnodes) die("ERROR: indel history doesn't seem to match tree model.\n"); } } bdphmm = bd_new(source_mod, rho, mu, nu, phi, alpha_c, beta_c, tau_c, alpha_n, beta_n, tau_n, estim_gamma, estim_omega, estim_phi); /* compute emissions */ phmm_compute_emissions(bdphmm->phmm, msa, FALSE); /* add emissions for indel model, if necessary */ if (alpha_c > 0) { fprintf(stderr, "Adjusting emissions for indels...\n"); bd_add_indel_emissions(bdphmm, ih); } /* postprocess for missing data (requires special handling) */ fprintf(stderr, "Adjusting emissions for missing data...\n"); bd_handle_missing_data(bdphmm, msa); if (estim_gamma || estim_omega || estim_phi) { fprintf(stderr, "Estimating free parameters...\n"); bd_estimate_transitions(bdphmm, msa); } /* set seqname and idpref, if necessary */ if (seqname == NULL || idpref == NULL) { /* derive default from file name root */ String *tmp = str_new_charstr(msa_fname); if (!str_equals_charstr(tmp, "-")) { str_remove_path(tmp); str_root(tmp, '.'); if (idpref == NULL) idpref = copy_charstr(tmp->chars); str_root(tmp, '.'); /* apply one more time for double suffix */ if (seqname == NULL) seqname = tmp->chars; } else if (seqname == NULL) seqname = "refseq"; } /* obtain predictions */ fprintf(stderr, "Running Viterbi algorithm...\n"); predictions = phmm_predict_viterbi(bdphmm->phmm, seqname, NULL, idpref, NULL); lst_push_ptr(ignore_types, str_new_charstr("nonconserved")); gff_filter_by_type(predictions, ignore_types, TRUE, NULL); /* score predictions */ fprintf(stderr, "Scoring predictions...\n"); bd_score_predictions(bdphmm, predictions); /* can free emissions now */ for (i = 0; i < bdphmm->phmm->hmm->nstates; i++) sfree(bdphmm->phmm->emissions[i]); sfree(bdphmm->phmm->emissions); bdphmm->phmm->emissions = NULL; /* convert GFF to coord frame of reference sequence and adjust coords by idx_offset, if necessary */ if (refidx != 0 || msa->idx_offset != 0) msa_map_gff_coords(msa, predictions, 0, refidx, msa->idx_offset); if (refidx != 0) gff_flatten(predictions); /* necessary because coord conversion might create overlapping features (can happen in deletions in reference sequence) */ /* now output predictions */ fprintf(stderr, "Writing GFF to stdout...\n"); gff_print_set(stdout, predictions); fprintf(stderr, "Done.\n"); return 0; }
int main(int argc, char *argv[]) { struct phastCons_struct *p = phastCons_struct_new(0); struct option long_opts[] = { {"states", 1, 0, 'S'}, {"hmm", 1, 0, 'H'}, {"viterbi", 1, 0, 'V'}, {"most-conserved", 1, 0, 'V'}, /* same as --viterbi */ {"no-post-probs", 0, 0, 'n'}, {"msa-format", 1, 0, 'i'}, {"FC", 0, 0, 'X'}, {"lambda", 1, 0, 'l'}, {"target-coverage", 1, 0, 'C'}, {"transitions", 1, 0, 't'}, {"expected-length", 1, 0, 'E'}, {"expected-lengths", 1, 0, 'E'}, /* for backward compatibility */ {"estimate-trees", 1, 0, 'T'}, {"estimate-rho", 1, 0, 'O'}, {"rho", 1, 0, 'R'}, {"gc", 1, 0, 'G'}, {"ignore-missing", 0, 0, 'z'}, {"nrates", 1, 0, 'k'}, {"log", 1, 0, 'g'}, {"refidx", 1, 0, 'r'}, {"suppress-missing", 0, 0, 'x'}, /* for backward compatibility */ {"reflect-strand", 1, 0, 'U'}, {"catmap", 1, 0, 'c'}, {"extrapolate", 1, 0, 'e'}, {"indels", 0, 0, 'I'}, {"max-micro-indel", 1, 0, 'Y'}, {"indel-params", 1, 0, 'D'}, {"min-informative-types", 1, 0, 'M'}, /* for backward compatibility */ {"require-informative", 1, 0, 'M'}, {"not-informative", 1, 0, 'F'}, {"lnl", 1, 0, 'L'}, {"seqname", 1, 0, 'N'}, {"idpref", 1, 0, 'P'}, {"score", 0, 0, 's'}, {"coding-potential", 0, 0, 'p'}, {"indels-only", 0, 0, 'J'}, {"alias", 1, 0, 'A'}, {"quiet", 0, 0, 'q'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; /* other vars */ FILE *infile; char *msa_fname; char c; int opt_idx, i, coding_potential=FALSE; List *tmpl = NULL; String *tmpstr; char *mods_fname = NULL; List *mod_fname_list; msa_format_type msa_format = UNKNOWN_FORMAT; while ((c = getopt_long(argc, argv, "S:H:V:ni:k:l:C:G:zt:E:R:T:O:r:xL:sN:P:g:U:c:e:IY:D:JM:F:pA:Xqh", long_opts, &opt_idx)) != -1) { switch (c) { case 'S': p->states = get_arg_list(optarg); break; case 'H': p->hmm = hmm_new_from_file(phast_fopen(optarg, "r")); p->two_state = FALSE; break; case 'V': p->viterbi_f = phast_fopen(optarg, "w+"); tmpstr = str_new_charstr(optarg); if (str_ends_with_charstr(tmpstr, ".gff")) p->gff = TRUE; str_free(tmpstr); break; case 'n': p->post_probs = FALSE; break; case 'i': msa_format = msa_str_to_format(optarg); if (msa_format == UNKNOWN_FORMAT) die("ERROR: bad argument to --msa-format\n"); break; case 'X': p->FC = TRUE; p->two_state = FALSE; break; case 'l': if (optarg[0] != '~') p->estim_lambda = FALSE; else optarg = &optarg[1]; p->lambda = get_arg_dbl_bounds(optarg, 0, 1); break; case 'C': p->gamma = get_arg_dbl_bounds(optarg, 0, 1); break; case 'G': p->gc = get_arg_dbl_bounds(optarg, 0, 1); break; case 't': p->set_transitions = TRUE; if (optarg[0] != '~') p->estim_transitions = FALSE; else optarg = &optarg[1]; tmpl = get_arg_list_dbl(optarg); if (lst_size(tmpl) != 2) die("ERROR: bad argument to --transitions.\n"); p->mu = lst_get_dbl(tmpl, 0); p->nu = lst_get_dbl(tmpl, 1); if (p->mu <= 0 || p->mu >= 1 || p->nu <= 0 || p->nu >= 1) die("ERROR: bad argument to --transitions.\n"); lst_free(tmpl); break; case 'E': if (optarg[0] != '~') p->estim_transitions = FALSE; else optarg = &optarg[1]; p->omega = get_arg_dbl_bounds(optarg, 1, INFTY); p->mu = 1/p->omega; break; case 'T': p->estim_trees = TRUE; p->estim_trees_fname_root = optarg; break; case 'O': p->estim_rho = TRUE; p->estim_trees_fname_root = optarg; break; case 'z': p->ignore_missing = TRUE; break; case 'k': tmpl = get_arg_list_int(optarg); if (lst_size(tmpl) > 2) die("ERROR: too many arguments with --nrates.\n"); p->nrates = lst_get_int(tmpl, 0); if (p->nrates <= 0) die("ERROR: bad argument to --nrates (%d).\n", p->nrates); if (lst_size(tmpl) == 2) { p->nrates2 = lst_get_int(tmpl, 1); if (p->nrates2 <= 0) die("ERROR: bad argument to --nrates (%d).\n", p->nrates2); } lst_free(tmpl); break; case 'R': p->rho = get_arg_dbl_bounds(optarg, 0, 1); break; case 'g': if (!strcmp(optarg, "-")) p->log_f = stderr; else p->log_f = phast_fopen(optarg, "w+"); break; case 'r': p->refidx = get_arg_int_bounds(optarg, 0, INFTY); break; case 'x': /* do nothing; left in for backward compatibility */ break; case 'U': p->pivot_states = get_arg_list(optarg); /* we want strings not ints for phmm_new */ break; case 'e': p->extrapolate_tree_fname = optarg; break; case 'I': p->indels = TRUE; break; case 'Y': p->max_micro_indel = get_arg_int_bounds(optarg, 1, INFTY); break; case 'D': if (optarg[0] != '~') p->estim_indels = FALSE; else optarg = &optarg[1]; tmpl = get_arg_list_dbl(optarg); if (lst_size(tmpl) != 6) die("ERROR: bad argument to --indel-params.\n"); p->alpha_0 = lst_get_dbl(tmpl, 0); p->beta_0 = lst_get_dbl(tmpl, 1); p->tau_0 = lst_get_dbl(tmpl, 2); p->alpha_1 = lst_get_dbl(tmpl, 3); p->beta_1 = lst_get_dbl(tmpl, 4); p->tau_1 = lst_get_dbl(tmpl, 5); if (p->alpha_0 < 0 || p->beta_0 < 0 || p->tau_0 < 0 || p->alpha_1 < 0 || p->beta_1 < 0 || p->tau_1 < 0) die("ERROR: bad argument to --indel-params.\n"); lst_free(tmpl); break; case 'J': p->indels_only = TRUE; p->two_state = FALSE; p->indels = TRUE; p->post_probs = FALSE; break; case 'M': p->inform_reqd = get_arg_list(optarg); break; case 'F': p->not_informative = get_arg_list(optarg); break; case 'c': p->cm = cm_new_string_or_file(optarg); break; case 'L': p->lnl_f = phast_fopen(optarg, "w+"); break; case 'N': p->seqname = optarg; break; case 'P': p->idpref = optarg; break; case 's': p->score = TRUE; break; case 'p': coding_potential = TRUE; break; case 'A': p->alias_hash = make_name_hash(optarg); break; case 'q': p->results_f = NULL; break; case 'h': printf("%s", HELP); exit(0); case '?': die("Bad argument. Try '%s -h'.\n", argv[0]); } } if ((!coding_potential && optind != argc - 2) || (coding_potential && optind != argc - 2 && optind != argc - 1)) die("ERROR: extra or missing arguments. Try '%s -h'.\n", argv[0]); set_seed(-1); if (p->extrapolate_tree_fname != NULL && !strcmp(p->extrapolate_tree_fname, "default")) { p->extrapolate_tree_fname = smalloc((strlen(PHAST_HOME)+100)*sizeof(char)); #if defined(__MINGW32__) sprintf(p->extrapolate_tree_fname, "%s\\data\\exoniphy\\mammals\\cftr25_hybrid.nh", PHAST_HOME); #else sprintf(p->extrapolate_tree_fname, "%s/data/exoniphy/mammals/cftr25_hybrid.nh", PHAST_HOME); #endif } if (p->extrapolate_tree_fname != NULL) p->extrapolate_tree = tr_new_from_file(phast_fopen(p->extrapolate_tree_fname, "r")); mods_fname = (optind == argc - 2 ? argv[argc - 1] : NULL); /* if there are two args, mods are the second one; otherwise will use default mods for coding potential (see below) */ /* set defaults for coding-potential mode */ if (coding_potential) { char tmp[5000]; p->two_state = FALSE; if (p->cm == NULL) p->cm = cm_new_string_or_file("NCATS=4; CNS 1; CDS 2-4"); if (p->hmm == NULL) { #if defined(__MINGW32__) sprintf(tmp, "%s\\data\\phastCons\\%s", PHAST_HOME, p->indels ? "simple-coding-indels.hmm" : "simple-coding.hmm"); #else sprintf(tmp, "%s/data/phastCons/%s", PHAST_HOME, p->indels ? "simple-coding-indels.hmm" : "simple-coding.hmm"); #endif if (p->results_f!=NULL) fprintf(p->results_f, "Reading HMM from %s...\n", tmp); p->hmm = hmm_new_from_file(phast_fopen(tmp, "r")); } if (mods_fname == NULL) { #if defined(__MINGW32__) sprintf(tmp, "%s\\data\\exoniphy\\mammals\\r3.ncns.mod, %s\\data\\exoniphy\\mammals\\r3.cns.mod, %s\\data\\exoniphy\\mammals\\r3.cds-1.mod, %s\\data\\exoniphy\\mammals\\r3.cds-2.mod, %s\\data\\exoniphy\\mammals\\r3.cds-3.mod", PHAST_HOME, PHAST_HOME, PHAST_HOME, PHAST_HOME, PHAST_HOME); #else sprintf(tmp, "\ %s/data/exoniphy/mammals/r3.ncns.mod,\ %s/data/exoniphy/mammals/r3.cns.mod,\ %s/data/exoniphy/mammals/r3.cds-1.mod,\ %s/data/exoniphy/mammals/r3.cds-2.mod,\ %s/data/exoniphy/mammals/r3.cds-3.mod", PHAST_HOME, PHAST_HOME, PHAST_HOME, PHAST_HOME, PHAST_HOME); #endif mods_fname = tmp; } if (p->states == NULL) p->states = get_arg_list("CDS"); if (p->pivot_states == NULL) p->pivot_states = get_arg_list("background,CNS"); }
List *pwm_read(const char *filename) { List *result; Matrix *pwm = NULL; int i, currBase, nBases = 0; FILE * F; // char *motifName; String *line = str_new(STR_MED_LEN); List *l = lst_new_ptr(3); List *probabilitiesStr = lst_new_ptr(4); List *probabilitiesDbl; Regex *pssm_re = NULL; Regex *motif_name_re = NULL; int alphabetLength; result = lst_new_ptr(1); //letter-probability matrix: alength= 4 w= 8 nsites= 2 E= 1.5e+004 pssm_re = str_re_new("^letter-probability matrix: alength= ([0-9]+) w= ([0-9]+)"); motif_name_re = str_re_new("^MOTIF[[:space:]]+(.+?)[[:space:]].*"); //open PWM file F = phast_fopen(filename, "r"); currBase = 0; nBases = -1; //For each line in the MEME file while ((str_readline(line, F)) != EOF) { //If line matches Motif name if (str_re_match(line, motif_name_re, l, 1) > 0) { // motifName = copy_charstr(((String*)lst_get_ptr(l, 1))->chars); //printf("motifName=%s\n", motifName); } //If line matches beginning of a probability matrix else if (str_re_match(line, pssm_re, l, 2) > 0) { //Extract the alphabet size & number of bases in matrix if (str_as_int((String*)lst_get_ptr(l, 1), &alphabetLength) != 0) die("ERROR: Unable to parse 'alength=' from MEME file, expected integer, read %s", ((String*)lst_get_ptr(l, 1))->chars); if (str_as_int((String*)lst_get_ptr(l, 2), &nBases) != 0) die("ERROR: Unable to parse 'w=' from MEME file, expected integer, read %s ", ((String*)lst_get_ptr(l, 2))->chars); currBase = 0; if (nBases <= 0) //We must have at least one base in the PWM die("ERROR: No Position Weight Matrices were detected in the provided PWM file"); if (alphabetLength <= 0) //We must have a positive alphabet length die("ERROR: Alphabet lengh specified in PWM file must be greater than zero"); pwm = mat_new(nBases, alphabetLength); mat_set_all(pwm, -1); continue; //If this row contains matrix data } else if (currBase < nBases) { //Parse row of probabilities str_double_trim(line); str_split(line, NULL, probabilitiesStr); probabilitiesDbl = str_list_as_dbl(probabilitiesStr); for (i = 0; i < lst_size(probabilitiesDbl); i++) mat_set(pwm, currBase, i, log(lst_get_dbl(probabilitiesDbl, i))); currBase++; } else if ((currBase == nBases) && (pwm != NULL)) { //Push full matrix lst_push_ptr(result, pwm); pwm = NULL; } } if (currBase == nBases && pwm != NULL) lst_push_ptr(result, pwm); else if (pwm != NULL) die("Premature end of PWM file\n"); str_re_free(motif_name_re); str_re_free(pssm_re); phast_fclose(F); return result; }