/**
* Validate the certificate named by the "subject" argument against the
* certificates listed in the "ca_certs" argument and print the outcome.
*/
void go() override
   {
   Botan::X509_Certificate subject_cert(get_arg("subject"));

   // Every file listed under "ca_certs" becomes a trusted certificate
   Botan::Certificate_Store_In_Memory trusted;
   for(const auto& ca_file : get_arg_list("ca_certs"))
      {
      trusted.add_certificate(Botan::X509_Certificate(ca_file));
      }

   // Default-constructed restrictions are used for path validation
   Botan::Path_Validation_Restrictions restrictions;
   Botan::Path_Validation_Result result =
      Botan::x509_path_validate(subject_cert, restrictions, trusted);

   // Report the failure first; success is the fall-through case
   if(!result.successful_validation())
      {
      output() << "Certificate did not validate - " << result.result_string() << "\n";
      return;
      }

   output() << "Certificate passes validation checks\n";
   }
/**
* Print the public-key fingerprint of every key file listed in the "keys"
* argument, using the hash named by the "algo" argument.
*
* Output is one line per file: "<file>: <fingerprint>".
*/
void go() override
   {
   const std::string hash_algo = get_arg("algo");

   // Bind each file name by const reference: the loop only reads it, so
   // copying every std::string per iteration is unnecessary.
   for(const std::string& key_file : get_arg_list("keys"))
      {
      std::unique_ptr<Botan::Public_Key> key(Botan::X509::load_key(key_file));
      output() << key_file << ": " << key->fingerprint_public(hash_algo) << "\n";
      }
   }
/*
 * Return the command line of a process as a Python list of arguments.
 *
 * Expects a single integer argument (the PID). Returns NULL when the
 * argument tuple cannot be parsed; otherwise returns whatever
 * get_arg_list() produced for that PID.
 */
static PyObject*
get_process_cmdline(PyObject* self, PyObject* args)
{
    long pid;

    if (PyArg_ParseTuple(args, "l", &pid)) {
        /* get_arg_list() is defined in arch/osx/process_info.c */
        return get_arg_list(pid);
    }

    return NULL;
}
/*
 * Return the command line of a process as a Python list of arguments.
 *
 * Expects a single integer argument (the PID). Returns NULL when the
 * argument tuple cannot be parsed, raises OSError (built from errno) when
 * get_arg_list() yields NULL, and otherwise returns the list.
 */
static PyObject*
get_process_cmdline(PyObject* self, PyObject* args)
{
    long pid;
    PyObject* cmdline;

    if (! PyArg_ParseTuple(args, "l", &pid)) {
        return NULL;
    }

    /* get_arg_list() is defined in arch/bsd/process_info.c */
    cmdline = get_arg_list(pid);

    /* Per the original note: get_arg_list() returns NULL only if
       getcmdargs failed with ESRCH (no process with that PID). */
    if (cmdline != NULL) {
        return Py_BuildValue("N", cmdline);
    }

    return PyErr_SetFromErrno(PyExc_OSError);
}
int main(int argc, char* argv[]) { FILE* F; MSA *msa; int *msa_gap_patterns = NULL; HMM *hmm = NULL; TreeNode *tree = NULL; int i, input_format = SS, msa_idx, quiet_mode = FALSE, ncats, nmsas, ncats_unspooled, indel_nseqs = -1; String *msa_fname, *gff_fname; List *gff_fname_list = NULL, *msa_fname_list = NULL, *msa_length_list = NULL, *model_indels_str = NULL; Matrix *traincounts = NULL; Vector *begcounts = NULL, *statecounts = NULL; CategoryMap *cm = NULL; char c; GapPatternMap *gpm = NULL; GFF_Set *gff; char *reverse_groups_tag = NULL; while ((c = getopt(argc, argv, "i:g:c:m:M:R:I:n:t:P:G:qh")) != -1) { switch(c) { case 'i': input_format = msa_str_to_format(optarg); if (input_format == -1) die("ERROR: bad alignment format.\n"); break; case 'g': gff_fname_list = get_arg_list(optarg); break; case 'c': cm = cm_new_string_or_file(optarg); break; case 'm': msa_fname_list = get_arg_list(optarg); break; case 'M': msa_length_list = str_list_as_int(get_arg_list(optarg)); break; case 'R': reverse_groups_tag = optarg; break; case 'I': model_indels_str = get_arg_list(optarg); break; case 'n': indel_nseqs = get_arg_int(optarg); break; case 't': if (optarg[0] == '(') /* in this case, assume topology given at command line */ tree = tr_new_from_string(optarg); else tree = tr_new_from_file(phast_fopen(optarg, "r")); break; case 'q': quiet_mode = TRUE; break; case 'h': print_usage(); exit(0); case '?': die("ERROR: unrecognized option.\n\nType 'hmm_train -h' for usage.\n"); } } if (msa_fname_list == NULL) die("ERROR: -m required. Type 'hmm_train -h' for usage.\n"); if (gff_fname_list == NULL) die("ERROR: -g required in training mode. Type 'hmm_train -h' for usage.\n"); if (msa_length_list != NULL && msa_fname_list != NULL) die("ERROR: -m and -M are mutually exclusive. Type 'hmm_train -h' for usage.\n"); if (model_indels_str != NULL && tree == NULL) die("ERROR: -I requires -t. 
Type 'hmm_train -h' for usage.\n"); if (cm == NULL) die("ERROR: category map required.\n"); set_seed(-1); ncats = cm->ncats + 1; ncats_unspooled = cm->unspooler != NULL ? cm->unspooler->nstates_unspooled : ncats; nmsas = (msa_length_list != NULL ? lst_size(msa_length_list) : lst_size(msa_fname_list)); if (model_indels_str != NULL) { if (tree == NULL) die("ERROR: tree is NULL\n"); /*FIXME: indel_ncats broken */ gpm = gp_create_gapcats(cm, model_indels_str, tree, FALSE); ncats = cm->ncats + 1; /* numbers will change */ ncats_unspooled = cm->unspooler == NULL ? ncats : cm->unspooler->nstates_unspooled; } /* allocate memory for storage of "training paths" */ traincounts = mat_new(ncats_unspooled, ncats_unspooled); statecounts = vec_new(ncats_unspooled); begcounts = vec_new(ncats_unspooled); mat_zero(traincounts); vec_zero(statecounts); vec_zero(begcounts); /* create skeleton of new HMM. */ hmm = hmm_new_nstates(ncats_unspooled, 0, 0); /* Main loop: consider each MSA in turn */ for (msa_idx = 0; msa_idx < nmsas; msa_idx++) { if (msa_fname_list != NULL) { msa_fname = (String*)lst_get_ptr(msa_fname_list, msa_idx); F = phast_fopen(msa_fname->chars, "r"); if (!quiet_mode) fprintf(stderr, "Reading alignment from %s ...\n", F == stdin ? 
"stdin" : msa_fname->chars); msa = msa_new_from_file(F, NULL); phast_fclose(F); } else { /* only lengths of alignments specified */ msa = msa_new(NULL, NULL, 0, lst_get_int(msa_length_list, msa_idx), NULL); /* just a shell in this case */ } gff_fname = (String*)lst_get_ptr(gff_fname_list, msa_idx); if (!quiet_mode) fprintf(stderr, "Reading annotations from %s ...\n", gff_fname->chars); gff = gff_read_set(phast_fopen(gff_fname->chars, "r")); /* convert GFF to coordinate frame of alignment */ if (msa_length_list == NULL) { if (!quiet_mode) fprintf(stderr, "Mapping annotations to alignment ...\n"); msa_map_gff_coords(msa, gff, 1, 0, 0); /* assume seq 1 is ref */ } if (model_indels_str != NULL) { if (!quiet_mode) fprintf(stderr, "Obtaining gap patterns ...\n"); msa_gap_patterns = smalloc(msa->length * sizeof(int)); gp_set_phylo_patterns(gpm, msa_gap_patterns, msa); } /* at this point, we don't actually need the alignment anymore; if using ordered suff stats (likely with large data sets), can free them now, to avoid running out of memory */ if (msa->ss != NULL) { ss_free(msa->ss); msa->ss = NULL; } if (reverse_groups_tag != NULL) { if (!quiet_mode) fprintf(stderr, "Reverse complementing features on negative strand (group by '%s') ...\n", reverse_groups_tag); /* we don't need to reverse complement the whole alignment -- just the gff and possibly the gap pattern array (pass a NULL msa) */ gff_group(gff, reverse_groups_tag); msa_reverse_compl_feats(NULL, gff, msa_gap_patterns); } if (!quiet_mode) fprintf(stderr, "Labeling sites by category ...\n"); msa_label_categories(msa, gff, cm); gff_free_set(gff); if (model_indels_str != NULL) { if (!quiet_mode) fprintf(stderr, "Remapping categories according to gap patterns ...\n"); if (indel_nseqs > 0 && indel_nseqs != msa->nseqs) { /* in this case, we'll simply reassign non-trivial gap patterns randomly. 
This will achieve the desired effect with minimal coding, as long as the number of sites is not too small (the indel model is probably useless anyway if the number is small) */ int pat, newpat; int npatterns = 4 * indel_nseqs - 5; int complex_allowed[cm->ncats+1]; List *no_complex_names, *no_complex_nums; if (!quiet_mode) fprintf(stderr, "(target number of sequences: %d)\n", indel_nseqs); /* set up index indicating by cat no. whether complex gaps are allowed */ for (i = 0; i < ncats; i++) complex_allowed[i] = 1; no_complex_names = lst_new_ptr(10); str_split(str_new_charstr(NO_COMPLEX), ",", no_complex_names); no_complex_nums = cm_get_category_list(cm, no_complex_names, 1); for (i = 0; i < lst_size(no_complex_nums); i++) complex_allowed[lst_get_int(no_complex_nums, i)] = 0; lst_free(no_complex_nums); lst_free_strings(no_complex_names); lst_free(no_complex_names); /* now reassign all non-null numbers */ for (i = 0; i < msa->length; ) { if ((pat = msa_gap_patterns[i]) != 0) { if (complex_allowed[msa->categories[i]]) newpat = 1 + ((double)npatterns * unif_rand()); /* random number in interval [1, npatterns] */ else newpat = 1 + ((double)(npatterns-1) * unif_rand()); /* random number in interval [1,npatterns-1] (excludes complex gap pattern) */ for (; i < msa->length && msa_gap_patterns[i] == pat; i++) msa_gap_patterns[i] = newpat; /* change for whole sequence */ } else i++; } } /* obtain gapped category number for each site */ for (i = 0; i < msa->length; i++) if (gpm->cat_x_pattern_to_gapcat[msa->categories[i]] != NULL) msa->categories[i] = gpm->cat_x_pattern_to_gapcat[msa->categories[i]][msa_gap_patterns[i]]; } if (!quiet_mode) fprintf(stderr, "Unspooling categories ...\n"); cm_spooled_to_unspooled(cm, msa->categories, msa->length); if (!quiet_mode) fprintf(stderr, "Collecting training data ...\n"); hmm_train_update_counts(traincounts, statecounts, begcounts, msa->categories, msa->length, ncats_unspooled); if (msa_gap_patterns != NULL) sfree(msa_gap_patterns); 
msa_free(msa); } /* now train HMM, using cumulative data */ hmm_train_from_counts(hmm, traincounts, NULL, statecounts, NULL, begcounts, NULL); /* if modeling indels, adjust begin transitions so probability is distributed among different "gap pattern" states that all correspond to the same ungapped state (category); this helps avoid problems that occur when training on a few large sequences (e.g., whole chromosomes) and then testing on many shorter ones */ if (model_indels_str != NULL) { double tprob[gpm->ncats]; int nst[gpm->ncats]; /* total prob and number of states per spooled, ungapped category */ for (i = 0; i < gpm->ncats; i++) tprob[i] = nst[i] = 0; for (i = 0; i < hmm->nstates; i++) { if (vec_get(hmm->begin_transitions, i) > 0) /* have to go from unspooled space to spooled space, then to ungapped space (HMM states correspond to unspooled, gapped categories). Note that states with nonzero begin probs shouldn't be conditioned on other states. */ tprob[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]] += vec_get(hmm->begin_transitions, i); nst[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]]++; } for (i = 0; i < hmm->nstates; i++) if (tprob[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]] > 0) vec_set(hmm->begin_transitions, i, tprob[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]] / nst[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]]); /* (uniform prior) */ } /* write trained HMM */ hmm_print(stdout, hmm); if (!quiet_mode) fprintf(stderr, "Done.\n"); return 0; }
int main(int argc, char *argv[]) { char c; int opt_idx; GFF_Set *gff; List *include = NULL; char *groupby = "transcript_id", *exongroup_tag = NULL; int unique = FALSE, sort = FALSE, simplebed = FALSE, fix_start_stop = FALSE, add_utrs = FALSE, add_introns = FALSE, add_signals = FALSE; enum {GFF, BED, GENEPRED, WIG} output_format = GFF; FILE *discards_f = NULL, *groups_f = NULL; struct option long_opts[] = { {"output", 1, 0, 'o'}, {"include-only", 1, 0, 'i'}, {"include-groups", 1, 0, 'l'}, {"groupby", 1, 0, 'g'}, {"exongroup", 1, 0, 'e'}, {"add-utrs", 0, 0, 'U'}, {"add-introns", 0, 0, 'I'}, {"add-signals", 0, 0, 'S'}, {"fix-start-stop", 0, 0, 'f'}, {"unique", 0, 0, 'u'}, {"sort", 0, 0, 's'}, {"simplebed", 0, 0, 'b'}, {"discards", 1, 0, 'd'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; while ((c = (char)getopt_long(argc, argv, "o:i:l:g:e:d:UISfusbh", long_opts, &opt_idx)) != -1) { switch (c) { case 'o': if (!strcmp("bed", optarg)) output_format = BED; else if (!strcmp("genepred", optarg)) output_format = GENEPRED; else if (!strcmp("wig", optarg)) output_format = WIG; else if (strcmp("gff", optarg)) die("ERROR: bad output format.\n"); break; case 'i': include = get_arg_list(optarg); break; case 'l': groups_f = phast_fopen(optarg, "r"); break; case 'g': groupby = optarg; break; case 'e': exongroup_tag = optarg; break; case 'U': add_utrs = TRUE; break; case 'I': add_introns = TRUE; break; case 'S': add_signals = TRUE; break; case 'f': fix_start_stop = TRUE; break; case 'u': unique = TRUE; break; case 'b': simplebed = TRUE; output_format = BED; break; case 'd': discards_f = phast_fopen(optarg, "w+"); break; case 's': sort = TRUE; break; case 'h': usage(argv[0]); case '?': die("Bad argument. Try '%s -h'.\n", argv[0]); } } if (optind != argc - 1) die("Input filename required. 
Try '%s -h'.\n", argv[0]); set_seed(-1); gff = gff_read_set(phast_fopen(argv[optind], "r")); if (lst_size(gff->features) == 0) exit(0); /* helps avoid unexpected behavior below */ /* filter by type */ if (include != NULL) gff_filter_by_type(gff, include, FALSE, discards_f); /* group */ gff_group(gff, groupby); /* utrs, introns, & signals */ if (add_utrs) gff_create_utrs(gff); if (add_introns) gff_create_introns(gff); if (add_signals) gff_create_signals(gff); /* subgroup */ if (exongroup_tag != NULL) gff_exon_group(gff, exongroup_tag); /* filter by group */ if (groups_f != NULL) { String *s = str_new(STR_LONG_LEN); List *groups = lst_new_ptr(10000); str_slurp(s, groups_f); str_split(s, NULL, groups); gff_filter_by_group(gff, groups); lst_free_strings(groups); lst_free(groups); str_free(s); } /* sort */ if (sort) gff_sort(gff); /* make unique */ if (unique) gff_remove_overlaps(gff, discards_f); if (fix_start_stop) gff_fix_start_stop(gff); if (output_format == BED) gff_print_bed(stdout, gff, !simplebed); else if (output_format == GENEPRED) gff_print_genepred(stdout, gff); else if (output_format == WIG) wig_print(stdout, gff); else gff_print_set(stdout, gff); gff_free_set(gff); return 0; }
int main(int argc, char *argv[]) { char c; int opt_idx, node; FILE *out_f = NULL, *msa_f, *mod_f; char *out_root; TreeModel *mod; MSA *msa; char out_fname[STR_MED_LEN]; struct option long_opts[] = { {"refseq", 1, 0, 'r'}, {"msa-format", 1, 0, 'i'}, {"seqs", 1, 0, 's'}, {"exclude", 0, 0, 'x'}, {"no-probs", 0, 0, 'n'}, {"suff-stats", 0, 0, 'S'}, {"encode", 1, 0, 'e'}, {"keep-gaps", 0, 0, 'k'}, {"gibbs", 1, 0, 'G'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; /* arguments and defaults for options */ FILE *refseq_f = NULL; msa_format_type msa_format = UNKNOWN_FORMAT; int suff_stats = FALSE, exclude = FALSE, keep_gaps = FALSE, do_probs = TRUE; List *seqlist = NULL; PbsCode *code = NULL; int gibbs_nsamples = -1; while ((c = (char)getopt_long(argc, argv, "r:i:s:e:knxSh", long_opts, &opt_idx)) != -1) { switch (c) { case 'r': refseq_f = phast_fopen(optarg, "r"); break; case 'i': msa_format = msa_str_to_format(optarg); if (msa_format == UNKNOWN_FORMAT) die("ERROR: unrecognized alignment format.\n"); break; case 'S': suff_stats = TRUE; break; case 'e': code = pbs_new_from_file(phast_fopen(optarg, "r")); break; case 's': seqlist = get_arg_list(optarg); break; case 'x': exclude = TRUE; break; case 'n': do_probs = FALSE; break; case 'k': keep_gaps = TRUE; break; case 'G': gibbs_nsamples = get_arg_int_bounds(optarg, 1, INFTY); break; case 'h': printf("%s", HELP); exit(0); case '?': die("Bad argument. Try 'prequel -h'.\n"); } } if (optind != argc - 3) die("Three arguments required. 
Try 'prequel -h'.\n"); set_seed(-1); if (!do_probs && (suff_stats || code != NULL)) die("ERROR: --no-probs can't be used with --suff-stats or --encode.\n"); msa_f = phast_fopen(argv[optind], "r"); if (msa_format == UNKNOWN_FORMAT) msa_format = msa_format_for_content(msa_f, 1); fprintf(stderr, "Reading alignment from %s...\n", argv[optind]); if (msa_format == MAF) { msa = maf_read(msa_f, refseq_f, 1, NULL, NULL, NULL, -1, !suff_stats, NULL, NO_STRIP, FALSE); /* (no need to store order if suff_stats mode) */ } else msa = msa_new_from_file_define_format(msa_f, msa_format, NULL); if (msa->ss == NULL) { fprintf(stderr, "Extracting sufficient statistics...\n"); ss_from_msas(msa, 1, TRUE, NULL, NULL, NULL, -1, 0); } else if (msa->ss->tuple_idx == NULL && !suff_stats) die("ERROR: ordered representation of alignment required unless --suff-stats.\n"); mod_f = phast_fopen(argv[optind+1], "r"); out_root = argv[optind+2]; mod = tm_new_from_file(mod_f, 1); /* MH prune just like in phastcons */ int old_nnodes = mod->tree->nnodes; List *pruned_names = lst_new_ptr(msa->nseqs); tm_prune(mod, msa, pruned_names); if (lst_size(pruned_names) == (old_nnodes + 1) / 2) die("ERROR: no match for leaves of tree in alignment (leaf names must match alignment names).\n"); if (lst_size(pruned_names) > 0) { fprintf(stderr, "WARNING: pruned away leaves of tree with no match in alignment ("); int j; for (j = 0; j < lst_size(pruned_names); j++) fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, j))->chars, j < lst_size(pruned_names) - 1 ? 
", " : ").\n"); } lst_free_strings(pruned_names); tr_name_ancestors(mod->tree); if (mod->order != 0) die("ERROR: Only single nucleotide models are supported.\n"); if (mod->nratecats > 1) die("ERROR: Rate variation not supported.\n"); mod->tree_posteriors = tl_new_tree_posteriors(mod, msa, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE); fprintf(stderr, "Computing posterior probabilities...\n"); if (gibbs_nsamples > 0) die("ERROR: --gibbs not implemented yet."); /* gb_sample_ancestral_seqs(mod, msa, mod->tree_posteriors, gibbs_nsamples); */ else tl_compute_log_likelihood(mod, msa, NULL, NULL, -1, mod->tree_posteriors); fprintf(stderr, "Reconstructing indels by parsimony...\n"); do_indels(msa, mod); for (node = 0; node < mod->tree->nnodes; node++) { int i, j; TreeNode *n = lst_get_ptr(mod->tree->nodes, node); if (n->lchild == NULL || n->rchild == NULL) continue; if (seqlist != NULL) { int in_list = str_in_list_charstr(n->name, seqlist); if ((in_list && exclude) || (!in_list && !exclude)) continue; } fprintf(stderr, "Writing output for ancestral node '%s'...\n", n->name); if (suff_stats) { if (out_f == NULL) { sprintf(out_fname, "%s.stats", out_root); out_f = phast_fopen(out_fname, "w+"); fprintf(out_f, "#count\t"); for (j = 0; j < mod->rate_matrix->size; j++) fprintf(out_f, "p(%c)%c", mod->rate_matrix->states[j], j == mod->rate_matrix->size - 1 ? '\n' : '\t'); } for (i = 0; i < msa->ss->ntuples; i++) { if (mod->tree_posteriors->base_probs[0][0][node][i] == -1) continue; /* no base this node */ fprintf(out_f, "%.0f\t", msa->ss->counts[i]); for (j = 0; j < mod->rate_matrix->size; j++) { fprintf(out_f, "%f%c", mod->tree_posteriors->base_probs[0][j][node][i], j == mod->rate_matrix->size - 1 ? 
'\n' : '\t'); } } } else if (code == NULL && do_probs) { /* ordinary sequence-by-sequence output */ sprintf(out_fname, "%s.%s.probs", out_root, n->name); out_f = phast_fopen(out_fname, "w+"); fprintf(out_f, "#"); for (j = 0; j < mod->rate_matrix->size; j++) fprintf(out_f, "p(%c)%c", mod->rate_matrix->states[j], j == mod->rate_matrix->size - 1 ? '\n' : '\t'); for (i = 0; i < msa->length; i++) { if (mod->tree_posteriors->base_probs[0][0][node][msa->ss->tuple_idx[i]] == -1) { /* no base */ if (keep_gaps) fprintf(out_f, "-\n"); /* otherwise do nothing */ } else for (j = 0; j < mod->rate_matrix->size; j++) fprintf(out_f, "%f%c", mod->tree_posteriors->base_probs[0][j][node][msa->ss->tuple_idx[i]], j == mod->rate_matrix->size - 1 ? '\n' : '\t'); } phast_fclose(out_f); } else if (code == NULL && !do_probs) { /* write point estimates to FASTA file */ char *outseq = smalloc((msa->length + 1) * sizeof(char)); int len = 0; for (i = 0; i < msa->length; i++) { if (mod->tree_posteriors->base_probs[0][0][node][msa->ss->tuple_idx[i]] == -1) { /* no base */ if (keep_gaps) outseq[len++] = GAP_CHAR; /* otherwise do nothing */ } else { double maxprob = 0; int maxidx = -1; for (j = 0; j < mod->rate_matrix->size; j++) { if (mod->tree_posteriors->base_probs[0][j][node][msa->ss->tuple_idx[i]] > maxprob) { maxprob = mod->tree_posteriors->base_probs[0][j][node][msa->ss->tuple_idx[i]]; maxidx = j; } } outseq[len++] = mod->rate_matrix->states[maxidx]; } } outseq[len] = '\0'; /* print in FASTA format */ sprintf(out_fname, "%s.%s.fa", out_root, n->name); out_f = phast_fopen(out_fname, "w+"); print_seq_fasta(out_f, outseq, n->name, len); phast_fclose(out_f); sfree(outseq); } else { /* encoded sequence-by-sequence output */ double error, tot_error = 0; int ngaps = 0; Vector *v; unsigned *encoded; /* first encode tuple by tuple */ v = vec_new(mod->rate_matrix->size); encoded = smalloc(msa->ss->ntuples * sizeof(unsigned)); for (i = 0; i < msa->ss->ntuples; i++) { if 
(mod->tree_posteriors->base_probs[0][0][node][i] == -1) { encoded[i] = code->gap_code; ngaps += msa->ss->counts[i]; } else { for (j = 0; j < mod->rate_matrix->size; j++) vec_set(v, j, mod->tree_posteriors->base_probs[0][j][node][i]); encoded[i] = pbs_get_index(code, v, &error); tot_error += error * msa->ss->counts[i]; } } vec_free(v); /* now write site by site */ sprintf(out_fname, "%s.%s.bin", out_root, n->name); out_f = phast_fopen(out_fname, "w+"); for (i = 0; i < msa->length; i++) { if (keep_gaps || encoded[msa->ss->tuple_idx[i]] != code->gap_code) pbs_write_binary(code, encoded[msa->ss->tuple_idx[i]], out_f); } fprintf(stderr, "Average approximation error ('%s'): %f bits\n", n->name, tot_error/(msa->length - ngaps)); sfree(encoded); } } fprintf(stderr, "Done.\n"); return 0; }
int main(int argc, char* argv[]) { FILE* F; GFF_Set *gff_real=NULL, *gff_pred=NULL; char c; List *real_fname_list = NULL, *pred_fname_list = NULL, *feat_list = NULL, *seq_len_list = NULL, *l = NULL; int nfile, i, j; char *prefix = NULL; int tot_tp = 0, tot_fp = 0, tot_nreal_pos = 0, tot_npred_pos = 0, tot_seqlen = 0, tot_ncr = 0, tot_npca = 0, tot_nola = 0, tot_nme = 0, tot_npcp = 0, tot_nolp = 0, tot_nwe = 0, tot_nexons_real = 0, tot_nexons_pred = 0, dump_exons = 0, nnc = -1, tot_nnc = -1, nc_threshold = 0; while ((c = (char)getopt(argc, argv, "r:p:f:l:d:n:h")) != -1) { switch(c) { case 'r': real_fname_list = get_arg_list(optarg); break; case 'p': pred_fname_list = get_arg_list(optarg); break; case 'l': l = get_arg_list(optarg); /* convert to ints */ seq_len_list = lst_new_int(lst_size(l)); for (i = 0; i < lst_size(l); i++) { int tmp; if (str_as_int((String*)lst_get_ptr(l, i), &tmp) != 0) { die("ERROR: Bad integer in <seq_len_list>.\n"); } lst_push_int(seq_len_list, tmp); } break; case 'f': feat_list = get_arg_list(optarg); break; case 'd': dump_exons = 1; prefix = optarg; break; case 'n': nnc = tot_nnc = 0; nc_threshold = get_arg_int(optarg); break; case 'h': print_usage(); exit(0); case '?': die("Unrecognized option. Try \"eval_predictions -h\" for help.\n"); } } set_seed(-1); if (feat_list == NULL) { feat_list = lst_new_ptr(1); lst_push_ptr(feat_list, str_new_charstr(GFF_CDS_TYPE)); } if (real_fname_list == NULL || pred_fname_list == NULL || seq_len_list == NULL) { die("ERROR: Must specify -r, -p, and -l. 
Try \"eval_predictions -h\" for help.\n"); } if (lst_size(real_fname_list) != lst_size(pred_fname_list)) { die("ERROR: Must specify lists of equal length for real and predicted filenames.\n\n."); } if (lst_size(seq_len_list) == 1 && lst_size(real_fname_list) > 1) for (i = 1; i < lst_size(real_fname_list); i++) lst_push_int(seq_len_list, lst_get_int(seq_len_list, 0)); else if (lst_size(seq_len_list) != lst_size(real_fname_list)) die("ERROR: List of sequence lengths does not match lists of real and predicted filenames.\n"); /* print header */ printf("%-25s %-25s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s", "Real_fname", "Pred_fname", "Sn", "Sp", "AC", "CC", "ESn", "ESp", "CRa", "PCa", "OLa", "ME", "CRp", "PCp", "OLp", "WE"); if (nnc != -1) printf(" %7s %7s %7s %7s", "NCa", "NCp", "CR+NCa", "CR+NCp"); printf("\n"); for (nfile = 0; nfile < lst_size(real_fname_list); nfile++) { int tp, fp, nexons_real, nexons_pred, nwe, nme, ncr, npca, npcp, nola, nolp, nreal_pos, npred_pos, len_real, len_pred, seqlen, already_counted_real; String *real_fname, *pred_fname; GFF_Feature *feat_real, *feat_pred=NULL; real_fname = (String*)lst_get_ptr(real_fname_list, nfile); F = phast_fopen(real_fname->chars, "r"); if ((gff_real = gff_read_set(F)) == NULL) { die("ERROR: Unable to read file \"%s\".\n", real_fname->chars); } phast_fclose(F); pred_fname = (String*)lst_get_ptr(pred_fname_list, nfile); F = phast_fopen(pred_fname->chars, "r"); if ((gff_pred = gff_read_set(F)) == NULL) { die("ERROR: Unable to read file \"%s\".\n", pred_fname->chars); } phast_fclose(F); seqlen = lst_get_int(seq_len_list, nfile); /* sort ungrouped -- only cds exons will be considered, and each one will be considered individually */ gff_ungroup(gff_real); gff_ungroup(gff_pred); gff_sort(gff_real); gff_sort(gff_pred); nexons_real = nexons_pred = nwe = nme = ncr = npca = npcp = nola = nolp = tp = fp = nreal_pos = npred_pos = 0; if (nnc != -1) nnc = 0; i = j = 0; already_counted_real = 0; while (i < 
lst_size(gff_real->features)) { feat_real = (GFF_Feature*)lst_get_ptr(gff_real->features, i); if (!is_exon(feat_real, feat_list)) { i++; continue; } len_real = feat_real->end - feat_real->start + 1; if (!already_counted_real) { nexons_real++; nreal_pos += len_real; } /* look at all predicted exons up to and overlapping this real exon */ while (j < lst_size(gff_pred->features)) { feat_pred = (GFF_Feature*)lst_get_ptr(gff_pred->features, j); if (!is_exon(feat_pred, feat_list)) { j++; continue; } else if (feat_pred->start > feat_real->end) { if (!already_counted_real) { nme++; if (dump_exons) dump(prefix, feat_real, NULL, ME, -1); } break; } /* otherwise we have a predicted exon to count (start of pred <= end of real) */ nexons_pred++; len_pred = feat_pred->end - feat_pred->start + 1; npred_pos += len_pred; j++; /* we'll be done with this prediction one way or another; next time through look at a new one */ if (feat_pred->end < feat_real->start) { /* WE */ nwe++; fp += len_pred; if (dump_exons) dump(prefix, NULL, feat_pred, WE, 0); } else if (feat_pred->start == feat_real->start && /* CR */ feat_pred->end == feat_real->end) { ncr++; tp += len_pred; if (dump_exons) dump(prefix, feat_real, feat_pred, CR, 1); break; } else if (feat_pred->start == feat_real->start || /* PC */ feat_pred->end == feat_real->end) { pred_type type; npca++; npcp++; if (nnc != -1 && max(abs(feat_pred->start - feat_real->start), abs(feat_pred->end - feat_real->end)) <= nc_threshold) { nnc++; type = NC; } else type = PC; if (len_pred < len_real) tp += len_pred; else { tp += len_real; fp += (len_pred - len_real); } if (dump_exons) dump(prefix, feat_real, feat_pred, type, min(1, (double)len_real/len_pred)); break; } else { /* OL */ int overlap_size; pred_type type; nola++; nolp++; if (nnc != -1 && max(abs(feat_pred->start - feat_real->start), abs(feat_pred->end - feat_real->end)) <= nc_threshold) { nnc++; type = NC; } else type = PC; overlap_size = min(feat_pred->end, feat_real->end) - 
max(feat_pred->start, feat_real->start) + 1; tp += overlap_size; fp += len_pred - overlap_size; if (dump_exons) dump(prefix, feat_real, feat_pred, type, (double)overlap_size/len_pred); break; } /* NOTE: I'm ignoring the possibility that a predicted exon could be a PC and/or OL with respect to multiple real exons. The effect on the exon-level stats will be fairly minor (at worst a predicted exon is scored as an OL when it should be scored as an PC, and a real exon is erroneously counted as a ME), but the effect on the nucleotide-level Sn and Sp could conceivably be significant. */ } /* if we have counted at least one prediction (and thus failed to reach the end of the list), but the last prediction did not extend as far as the end of the real exon, then delay moving on to the next real exon */ if (j < lst_size(gff_pred->features) && feat_pred->end < feat_real->end) already_counted_real = 1; else { /* if we reached the end of the list of predictions, then it must not have contained any exons, and the real exon in question is a ME (if it hasn't already been counted) */ if (j == lst_size(gff_pred->features) && !already_counted_real) nme++; i++; already_counted_real = 0; } } /* any remaining predictions must be wrong */ for (; j < lst_size(gff_pred->features); j++) { if (is_exon((GFF_Feature*)lst_get_ptr(gff_pred->features, j), feat_list)) { nexons_pred++; nwe++; } } compute_and_print_stats(stdout, real_fname, pred_fname, tp, fp, nreal_pos, npred_pos, seqlen, ncr, npca, nola, nme, npcp, nolp, nwe, nexons_real, nexons_pred, nnc); tot_tp += tp; tot_fp += fp; tot_nreal_pos += nreal_pos; tot_npred_pos += npred_pos; tot_seqlen += seqlen; tot_ncr += ncr; tot_npca += npca; tot_nola += nola; tot_nme += nme; tot_npcp += npcp; tot_nolp += nolp; tot_nwe += nwe; tot_nexons_real += nexons_real; tot_nexons_pred += nexons_pred; if (nnc != -1) tot_nnc += nnc; if (dump_exons && SUMF != NULL) fprintf(SUMF, "# Total number of bases in real exons: %d\n", nreal_pos); gff_free_set(gff_real); 
gff_free_set(gff_pred); } if (lst_size(real_fname_list) > 1) compute_and_print_stats(stdout, str_new_charstr("TOTAL"), str_new_charstr(""), tot_tp, tot_fp, tot_nreal_pos, tot_npred_pos, tot_seqlen, tot_ncr, tot_npca, tot_nola, tot_nme, tot_npcp, tot_nolp, tot_nwe, tot_nexons_real, tot_nexons_pred, tot_nnc); return 0; }
int main(int argc, char *argv[]) { /* variables for options, with defaults */ TreeNode *tree = NULL, *merge_tree = NULL, *extrapolate_tree = NULL; Hashtable *rename_hash = NULL; double scale_factor = 1; List *prune_names = NULL, *label = NULL, *labelType = NULL; int prune_all_but = FALSE, tree_only = FALSE, dissect = FALSE, name_ancestors = FALSE, with_branch = FALSE, print_branchlen=FALSE, inNewick=FALSE, no_branchlen = FALSE, print_distance_to_root = FALSE; TreeModel *mod = NULL, *merge_mod = NULL; char *reroot_name = NULL, *subtree_name =NULL, *get_subtree_name = NULL, *node_distance_name = NULL; /* other variables */ String *suffix, *optstr; char c; int i, opt_idx; TreeNode *n; struct option long_opts[] = { {"scale", 1, 0, 's'}, {"extrapolate", 1, 0, 'e'}, {"prune", 1, 0, 'p'}, {"prune-all-but", 1, 0, 'P'}, {"get-subtree", 1, 0, 'g'}, {"merge", 1, 0, 'm'}, {"rename", 1, 0, 'r'}, {"tree-only", 0, 0, 't'}, {"no-branchlen", 0, 0, 'N'}, {"dissect", 0, 0, 'd'}, {"name-ancestors", 0, 0, 'a'}, {"reroot", 1, 0, 'R'}, {"with-branch", 1, 0, 'B'}, {"subtree", 1, 0, 'S'}, {"branchlen", 0, 0, 'b'}, {"newick", 0, 0, 'n'}, {"label-subtree", 1, 0, 'L'}, {"label-branches", 1, 0, 'l'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; while ((c = getopt_long(argc, argv, "s:p:P:g:m:r:R:B:S:D:l:L:adtNbnh", long_opts, &opt_idx)) != -1) { switch (c) { case 's': scale_factor = get_arg_dbl_bounds(optarg, 0, INFTY); break; case 'e': if (!strcmp(optarg, "default")) { optarg = smalloc(1000 * sizeof(char)); #if defined(__MINGW32__) sprintf(optarg, "%s\\data\\exoniphy\\mammals\\cftr25_hybrid.nh", PHAST_HOME); #else sprintf(optarg, "%s/data/exoniphy/mammals/cftr25_hybrid.nh", PHAST_HOME); #endif } extrapolate_tree = tr_new_from_file(phast_fopen(optarg, "r")); break; case 'p': prune_names = get_arg_list(optarg); break; case 'P': prune_names = get_arg_list(optarg); prune_all_but = TRUE; break; case 'g': get_subtree_name = optarg; break; case 'm': suffix = str_new_charstr(optarg); str_suffix(suffix, '.'); 
if (str_equals_charstr(suffix, "nh")) merge_tree = tr_new_from_file(phast_fopen(optarg, "r")); else { merge_mod = tm_new_from_file(phast_fopen(optarg, "r"), 1); merge_tree = merge_mod->tree; } break; case 'r': rename_hash = make_name_hash(optarg); break; case 't': tree_only = TRUE; break; case 'N': no_branchlen = TRUE; tree_only = TRUE; break; case 'd': dissect = TRUE; break; case 'b': print_branchlen = TRUE; break; case 'D': print_distance_to_root = TRUE; node_distance_name = optarg; break; case 'R': reroot_name = optarg; break; case 'B': with_branch = TRUE; break; case 'a': name_ancestors = TRUE; break; case 'S': subtree_name = optarg; break; case 'n': inNewick=TRUE; break; case 'L': //do the same for --label--subtree and --label-branches case 'l': if (label == NULL) { label = lst_new_ptr(1); labelType = lst_new_int(1); } optstr = str_new_charstr(optarg); lst_push_ptr(label, optstr); lst_push_int(labelType, (int)c); break; case 'h': usage(argv[0]); case '?': die("Bad argument. Try '%s -h'.\n", argv[0]); } } if (optind != argc - 1) die("Input filename required. 
Try '%s -h'.\n", argv[0]); if (merge_tree != NULL && extrapolate_tree != NULL) die("ERROR: Can't use --merge and --extrapolate together"); set_seed(-1); suffix = str_new_charstr(argv[optind]); str_suffix(suffix, '.'); if (inNewick || str_equals_charstr(suffix, "nh")) { tree = tr_new_from_file(phast_fopen(argv[optind], "r")); tree_only = TRUE; /* can't output tree model in this case */ } else { mod = tm_new_from_file(phast_fopen(argv[optind], "r"), 1); tree = mod->tree; } if (prune_names != NULL) { tr_prune(&tree, prune_names, prune_all_but, NULL); if (mod != NULL) mod->tree = tree; /* root may have changed */ } if (get_subtree_name != NULL) { n = tr_get_node(tree, get_subtree_name); if (n == NULL) { tr_name_ancestors(tree); n = tr_get_node(tree, get_subtree_name); if (n == NULL) { die("ERROR: no node named '%s'.\n", subtree_name); } } tr_prune_supertree(&tree, n); if (mod != NULL) mod->tree = tree; } if (merge_tree != NULL) { tree = tr_hybrid(tree, merge_tree); if (mod != NULL) mod->tree = tree; } else if (extrapolate_tree != NULL) { tr_scale_by_subtree(extrapolate_tree, tree); tree = extrapolate_tree; if (mod != NULL) mod->tree = tree; } if (scale_factor != 1) { if (subtree_name == NULL) tr_scale(tree, scale_factor); else { n = tr_get_node(tree, subtree_name); if (n == NULL) die("ERROR: no node named '%s'.\n", subtree_name); tr_scale_subtree(tree, n, scale_factor, with_branch); } } if (name_ancestors) tr_name_ancestors(tree); if (rename_hash != NULL) { char *newname; for (i = 0; i < tree->nnodes; i++) { n = lst_get_ptr(tree->nodes, i); if (n->name != NULL && n->name[0] != '\0' && (newname = hsh_get(rename_hash, n->name)) != (char*)-1) { strcpy(n->name, newname); } } } if (reroot_name != NULL) { n = tr_get_node(tree, reroot_name); if (n == NULL) die("ERROR: no node named '%s'.\n", reroot_name); tr_reroot(tree, n, with_branch); if (mod != NULL) mod->tree = with_branch ? n->parent : n; tree = with_branch ? 
n->parent : n; } if (label != NULL) { for (i=0; i < lst_size(label); i++) { String *currstr = (String*)lst_get_ptr(label, i), *arg1, *labelVal; List *tmplst = lst_new_ptr(10); String *nodename; int j; str_split(currstr, ":", tmplst); if (lst_size(tmplst) != 2) die("ERROR: bad argument to --label-branches or --label-subtree.\n"); arg1 = lst_get_ptr(tmplst, 0); labelVal = lst_get_ptr(tmplst, 1); lst_clear(tmplst); if (lst_get_int(labelType, i) == (int)'l') { str_split(arg1, ",", tmplst); for (j=0; j < lst_size(tmplst); j++) { nodename = (String*)lst_get_ptr(tmplst, j); tr_label_node(tree, nodename->chars, labelVal->chars); } lst_free_strings(tmplst); } else if (lst_get_int(labelType, i) == (int)'L') { int include_leading_branch = FALSE; TreeNode *node; nodename = arg1; node = tr_get_node(tree, nodename->chars); if (node == NULL && nodename->chars[nodename->length-1] == '+') { nodename->chars[--nodename->length] = '\0'; node = tr_get_node(tree, nodename->chars); include_leading_branch = TRUE; } tr_label_subtree(tree, nodename->chars, include_leading_branch, labelVal->chars); } else die("ERROR got label_type %c\n", lst_get_int(labelType, (char)i)); str_free(arg1); str_free(labelVal); lst_free(tmplst); str_free(currstr); } lst_free(label); lst_free(labelType); } if (dissect) tr_print_nodes(stdout, tree); if (print_branchlen) printf("TOTAL_TREE_LEN: %f\n", tr_total_len(tree)); if (print_distance_to_root) { TreeNode *node = tr_get_node(tree, node_distance_name); if (node == NULL) die("ERROR: no node named '%s'.\n", node_distance_name); printf("length(root-%s): %f\n", node_distance_name, tr_distance_to_root(node)); } if (dissect==0 && print_branchlen==0 && print_distance_to_root==0) { if (tree_only) tr_print(stdout, tree, no_branchlen==FALSE); else tm_print(stdout, mod); } return 0; }
%s/data/exoniphy/mammals/r3.cns.mod,\ %s/data/exoniphy/mammals/r3.cds-1.mod,\ %s/data/exoniphy/mammals/r3.cds-2.mod,\ %s/data/exoniphy/mammals/r3.cds-3.mod", PHAST_HOME, PHAST_HOME, PHAST_HOME, PHAST_HOME, PHAST_HOME); #endif mods_fname = tmp; } if (p->states == NULL) p->states = get_arg_list("CDS"); if (p->pivot_states == NULL) p->pivot_states = get_arg_list("background,CNS"); } /* read tree models */ mod_fname_list = get_arg_list(mods_fname); p->nummod = lst_size(mod_fname_list); p->mod = (TreeModel**)smalloc(sizeof(TreeModel*) * p->nummod); for (i = 0; i < p->nummod; i++) { String *fname = lst_get_ptr(mod_fname_list, i); if (p->results_f != NULL) fprintf(p->results_f, "Reading tree model from %s...\n", fname->chars); p->mod[i] = tm_new_from_file(phast_fopen(fname->chars, "r"), 1); p->mod[i]->use_conditionals = 1; } /* read alignment */ msa_fname = argv[optind]; infile = phast_fopen(msa_fname, "r");
int main(int argc, char *argv[]) { char *msa_fname = NULL, *alph = "ACGT"; msa_format_type input_format = UNKNOWN_FORMAT; char c; int opt_idx, seed=-1; String *optstr; List *tmplist = NULL; struct phyloFit_struct *pf; FILE *infile; struct option long_opts[] = { {"msa", 1, 0, 'm'}, {"tree", 1, 0, 't'}, {"subst-mod", 1, 0, 's'}, {"msa-format", 1, 0, 'i'}, {"nrates", 1, 0, 'k'}, {"alpha", 1, 0, 'a'}, {"features", 1, 0, 'g'}, {"catmap", 1, 0, 'c'}, {"log", 1, 0, 'l'}, {"out-root", 1, 0, 'o'}, {"EM", 0, 0, 'E'}, {"error", 1, 0, 'e'}, {"precision", 1, 0, 'p'}, {"do-cats", 1, 0, 'C'}, {"non-overlapping", 0, 0, 'V'}, {"markov", 0, 0, 'N'}, {"reverse-groups", 1, 0, 'R'}, {"init-model", 1, 0, 'M'}, {"init-random", 0, 0, 'r'}, {"init-parsimony", 0, 0, 'y'}, {"print-parsimony", 1, 0, 'Y'}, {"lnl", 0, 0, 'L'}, {"scale-only", 0, 0, 'B'}, {"scale-subtree", 1, 0, 'S'}, {"estimate-freqs", 0, 0, 'F'}, {"sym-freqs", 0, 0, 'W'}, {"no-freqs", 0, 0, 'f'}, {"no-rates", 0, 0, 'n'}, {"no-opt", 1, 0, 'O'}, {"min-informative", 1, 0, 'I'}, {"gaps-as-bases", 0, 0, 'G'}, {"quiet", 0, 0, 'q'}, {"help", 0, 0, 'h'}, {"windows", 1, 0, 'w'}, {"windows-explicit", 1, 0, 'v'}, {"ancestor", 1, 0, 'A'}, {"post-probs", 0, 0, 'P'}, {"expected-subs", 0, 0, 'X'}, {"expected-total-subs", 0, 0, 'Z'}, {"expected-subs-col", 0, 0, 'J'}, {"column-probs", 0, 0, 'U'}, {"rate-constants", 1, 0, 'K'}, {"ignore-branches", 1, 0, 'b'}, {"clock", 0, 0, 'z'}, {"alt-model", 1, 0, 'd'}, {"label-branches", 1, 0, 0}, {"label-subtree", 1, 0, 0}, {"selection", 1, 0, 0}, {"bound", 1, 0, 'u'}, {"seed", 1, 0, 'D'}, {0, 0, 0, 0} }; // NOTE: remaining shortcuts left: HjQx pf = phyloFit_struct_new(0); while ((c = (char)getopt_long(argc, argv, "m:t:s:g:c:C:i:o:k:a:l:w:v:M:p:A:I:K:S:b:d:O:u:Y:e:D:GVENRqLPXZUBFfnrzhWyJ", long_opts, &opt_idx)) != -1) { switch(c) { case 'm': msa_fname = optarg; break; case 't': if (optarg[0] == '(') /* in this case, assume topology given at command line */ pf->tree = tr_new_from_string(optarg); else 
pf->tree = tr_new_from_file(phast_fopen(optarg, "r")); break; case 's': pf->subst_mod = tm_get_subst_mod_type(optarg); if (pf->subst_mod == UNDEF_MOD) die("ERROR: illegal substitution model. Type \"phyloFit -h\" for usage.\n"); break; case 'g': pf->gff = gff_read_set(phast_fopen(optarg, "r")); break; case 'c': pf->cm = cm_new_string_or_file(optarg); break; case 'C': pf->cats_to_do_str = get_arg_list(optarg); break; case 'V': pf->nonoverlapping = TRUE; break; case 'o': pf->output_fname_root = optarg; break; case 'k': pf->nratecats = get_arg_int_bounds(optarg, 0, INFTY); break; case 'a': pf->alpha = get_arg_dbl(optarg); break; case 'R': pf->reverse_group_tag = optarg; break; case 'i': input_format = msa_str_to_format(optarg); if (input_format == UNKNOWN_FORMAT) die("ERROR: unrecognized alignment format. Type 'phyloFit -h' for usage.\n"); break; case 'l': if (!strcmp(optarg, "-")) pf->logf = stderr; else pf->logf = phast_fopen(optarg, "w+"); break; case 'N': pf->use_conditionals = 1; break; case 'w': tmplist = get_arg_list(optarg); if (lst_size(tmplist) != 2 || str_as_int(lst_get_ptr(tmplist, 0), &(pf->window_size)) != 0 || str_as_int(lst_get_ptr(tmplist, 1), &(pf->window_shift)) != 0) die("ERROR: illegal arguments to --windows.\n"); lst_free_strings(tmplist); lst_free(tmplist); break; case 'v': tmplist = get_arg_list(optarg); if (lst_size(tmplist) % 2 != 0) die("ERROR: argument to --windows-explicit must be a list of even length.\n"); pf->window_coords = str_list_as_int(tmplist); lst_free(tmplist); break; case 'E': pf->use_em = TRUE; break; case 'e': pf->error_fname=optarg; break; case 'p': if (!strcmp(optarg, "LOW")) pf->precision = OPT_LOW_PREC; else if (!strcmp(optarg, "MED")) pf->precision = OPT_MED_PREC; else if (!strcmp(optarg, "HIGH")) pf->precision = OPT_HIGH_PREC; else if (!strcmp(optarg, "VERY_HIGH")) pf->precision = OPT_VERY_HIGH_PREC; else die("ERROR: --precision must be LOW, MED, or HIGH.\n\n"); break; case 'M': pf->input_mod = 
tm_new_from_file(phast_fopen(optarg, "r"), 1); break; case 'r': pf->random_init = TRUE; break; case 'y': pf->init_parsimony = TRUE; break; case 'Y': pf->init_parsimony = TRUE; pf->parsimony_cost_fname = optarg; pf->parsimony_only=TRUE; break; case 'L': pf->likelihood_only = TRUE; break; case 'q': pf->quiet = TRUE; break; case 'P': pf->do_bases = TRUE; break; case 'X': pf->do_expected_nsubst = TRUE; break; case 'Z': pf->do_expected_nsubst_tot = TRUE; break; case 'J': pf->do_expected_nsubst_col = TRUE; break; case 'U': pf->likelihood_only = TRUE; /* force -L */ pf->nsites_threshold = 0; /* also force this; typical use is with small number of tuples, no tuple_idx */ pf->do_column_probs = TRUE; break; case 'A': pf->root_seqname = optarg; break; case 'I': pf->nsites_threshold = get_arg_int(optarg); break; case 'G': pf->gaps_as_bases = TRUE; alph = "ACGT-"; break; case 'B': pf->estimate_scale_only = TRUE; break; case 'S': pf->subtree_name = optarg; break; case 'F': pf->estimate_backgd = TRUE; break; case 'W': pf->estimate_backgd = TRUE; pf->symfreq = TRUE; break; case 'f': pf->no_freqs = TRUE; break; case 'n': pf->no_rates = TRUE; break; case 'K': tmplist = get_arg_list(optarg); pf->rate_consts = str_list_as_dbl(tmplist); pf->nratecats = lst_size(pf->rate_consts); pf->use_em = 1; lst_free_strings(tmplist); lst_free(tmplist); break; case 'b': pf->ignore_branches = get_arg_list(optarg); break; case 'z': pf->assume_clock = TRUE; break; case 'O': if (pf->nooptstr == NULL) pf->nooptstr = str_new_charstr(optarg); else die("ERROR: no-opt argument can only be used once! 
parameters can be comma-separated list."); break; case 'd': if (pf->alt_mod_str == NULL) { pf->alt_mod_str = lst_new_ptr(1); } optstr = str_new_charstr(optarg); lst_push_ptr(pf->alt_mod_str, optstr); break; case 0: if (strcmp(long_opts[opt_idx].name, "label-branches") == 0 || strcmp(long_opts[opt_idx].name, "label-subtree") == 0) { optstr = str_new_charstr(optarg); if (pf->label_str == NULL) { pf->label_str = lst_new_ptr(3); pf->label_type = lst_new_int(3); } lst_push_ptr(pf->label_str, optstr); lst_push_int(pf->label_type, strcmp(long_opts[opt_idx].name, "label-branches") == 0 ? BRANCH_TYPE : SUBTREE_TYPE); } else if (strcmp(long_opts[opt_idx].name, "selection") == 0) { pf->selection = get_arg_dbl(optarg); pf->use_selection = TRUE; } else { die("ERROR: unknown option. Type 'phyloFit -h' for usage.\n"); } break; case 'u': if (pf->bound_arg == NULL) pf->bound_arg = lst_new_ptr(1); optstr = str_new_charstr(optarg); lst_push_ptr(pf->bound_arg, optstr); break; case 'D': seed = get_arg_int_bounds(optarg, 1, INFTY); break; case 'h': printf("%s", HELP); exit(0); case '?': die("ERROR: illegal argument. Type 'phyloFit -h' for usage.\n"); } } set_seed(seed); if (msa_fname == NULL) { if (optind >= argc) die("ERROR: missing alignment filename. Type 'phyloFit -h' for usage.\n"); msa_fname = argv[optind]; pf->msa_fname = msa_fname; } infile = phast_fopen(msa_fname, "r"); if (input_format == UNKNOWN_FORMAT) input_format = msa_format_for_content(infile, 1); if (pf->nonoverlapping && (pf->use_conditionals || pf->gff != NULL || pf->cats_to_do_str || input_format == SS)) die("ERROR: cannot use --non-overlapping with --markov, --features,\n--msa-format SS, or --do-cats.\n"); /* read alignment */ if (!pf->quiet) fprintf(stderr, "Reading alignment from %s ...\n", msa_fname); if (input_format == MAF) { pf->msa = maf_read(infile, NULL, tm_order(pf->subst_mod) + 1, NULL, pf->gff, pf->cm, pf->nonoverlapping ? 
tm_order(pf->subst_mod) + 1 : -1, FALSE, pf->reverse_group_tag, NO_STRIP, FALSE); if (pf->gaps_as_bases) msa_reset_alphabet(pf->msa, alph); } else pf->msa = msa_new_from_file_define_format(infile, input_format, alph); /* set up for categories */ /* first label sites, if necessary */ pf->label_categories = (input_format != MAF); run_phyloFit(pf); if (pf->logf != NULL && pf->logf != stderr && pf->logf != stdout) phast_fclose(pf->logf); if (!pf->quiet) fprintf(stderr, "Done.\n"); sfree(pf); return 0; }
int main(int argc, char* argv[]) { char *maf_fname = NULL, *out_root_fname = "maf_parse", *masked_fn = NULL; String *refseq = NULL, *currRefseq; int opt_idx, startcol = 1, endcol = -1, include = 1, splitInterval = -1; char c, outfilename[1000], splitFormat[100]="%s%.1i.maf", *group_tag = NULL; List *order_list = NULL, *seqlist_str = NULL, *cats_to_do_str=NULL, *cats_to_do=NULL; MafBlock *block; FILE *mfile, *outfile=NULL, *masked_file=NULL; int useRefseq=TRUE, currLen=-1, blockIdx=0, currSize, sortWarned=0; int lastIdx = 0, currStart=0, by_category = FALSE, i, pretty_print = FALSE; int lastStart = -1, gffSearchIdx=0; GFF_Set *gff = NULL, *gffSub; GFF_Feature *feat; CategoryMap *cm = NULL; int base_mask_cutoff = -1, stripILines=FALSE, stripELines=FALSE;//, numspec=0; List *outfileList=NULL; Hashtable *outfileHash=NULL;//, *specNameHash=NULL; msa_format_type output_format = MAF; MSA *msa = NULL;//, **catMsa; char *mask_features_spec_arg=NULL; List *mask_features_spec=NULL; struct option long_opts[] = { {"start", 1, 0, 's'}, {"end", 1, 0, 'e'}, {"seqs", 1, 0, 'l'}, {"exclude", 0, 0, 'x'}, {"order", 1, 0, 'O'}, {"split", 1, 0, 'S'}, {"out-root", 1, 0, 'r'}, {"out-root-digits", 1, 0, 'd'}, {"no-refseq", 0, 0, 'n'}, {"features", 1, 0, 'g'}, {"by-category", 0, 0, 'L'}, {"do-cats", 1, 0, 'C'}, {"catmap", 1, 0, 'c'}, {"by-group", 1, 0, 'P'}, {"mask-bases", 1, 0, 'b'}, {"masked-file", 1, 0, 'm'}, {"strip-i-lines", 0, 0, 'I'}, {"strip-e-lines", 0, 0, 'E'}, {"mask-features", 1, 0, 'M'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; while ((c = getopt_long(argc, argv, "s:e:l:O:r:S:d:g:c:P:b:o:m:M:pLnxEIh", long_opts, &opt_idx)) != -1) { switch(c) { case 's': startcol = get_arg_int(optarg); break; case 'e': endcol = get_arg_int(optarg); break; case 'l': seqlist_str = get_arg_list(optarg); break; case 'O': order_list = get_arg_list(optarg); break; case 'x': include = FALSE; break; case 'S': splitInterval = atoi(optarg); break; case 'r': out_root_fname = optarg; break; case 'd': 
sprintf(splitFormat, "%%s%%.%si.%%s", optarg); break; case 'n': useRefseq = FALSE; break; case 'g': gff = gff_read_set(phast_fopen(optarg, "r")); gff_sort(gff); stripILines=TRUE; stripELines=TRUE; break; case 'c': cm = cm_new_string_or_file(optarg); break; case 'C': cats_to_do_str = get_arg_list(optarg); break; case 'L': by_category = TRUE; break; case 'P': group_tag = optarg; break; case 'b': base_mask_cutoff = atoi(optarg); break; case 'm': masked_fn = optarg; break; case 'M': mask_features_spec_arg = optarg; break; case 'E': stripELines=TRUE; break; case 'I': stripILines=TRUE; break; case 'o': output_format = msa_str_to_format(optarg); if (output_format == UNKNOWN_FORMAT) die("ERROR: bad output format. Try \"maf_parse -h\" for help.\n"); if (output_format != MAF) die("Sorry, only MAF format output has been implemented right now.\n"); break; case 'p': pretty_print = TRUE; break; case 'h': print_usage(); exit(0); case '?': die("Bad argument. Try 'maf_parse -h' for help.\n"); } } if (optind >= argc) die("Missing alignment filename. Try 'maf_parse -h' for help.\n"); else if (optind == argc - 1) maf_fname = argv[optind]; else die("ERROR: Too many arguments. Try 'maf_parse -h' for help.\n"); set_seed(-1); if (startcol < 1 || (endcol != -1 && endcol < startcol)) die("ERROR: must have 1 <= start <= end <= [msa_length]\n"); if ((group_tag != NULL || by_category) && gff == NULL) die("ERROR: --by-category and --by-group require --features. Try \"maf_parse -h\"" " for help.\n"); if (group_tag != NULL && by_category) die("ERROR: --by-category and --by-group cannot be used together. Try \"maf_parse -h\"" " for help.\n"); if (splitInterval != -1 && gff != NULL) die("ERROR: can't use --split and --features together. 
Try \"maf_parse -h\"" "for help\n"); if (group_tag != NULL || by_category) { outfileList = lst_new_ptr(10); outfileHash = hsh_new(100); } if (gff != NULL && cm == NULL) cm = cm_new_from_features(gff); if (cats_to_do_str != NULL) { cats_to_do = cm_get_category_str_list(cm, cats_to_do_str, FALSE); if (gff != NULL) gff_filter_by_type(gff, cats_to_do, 0, NULL); } if (masked_fn != NULL) { if (base_mask_cutoff == -1) die("ERROR: need to use --mask-bases with --masked-file"); masked_file = phast_fopen(masked_fn, "w"); } if (mask_features_spec_arg != NULL) { if (gff==NULL) die("ERROR: need --features with --mask-features"); mask_features_spec = lst_new_ptr(10); str_split(str_new_charstr(mask_features_spec_arg), ",", mask_features_spec); for (i=0; i < lst_size(mask_features_spec); i++) { fprintf(stderr, "masking species %s within features\n", ((String*)lst_get_ptr(mask_features_spec, i))->chars); } } /* Check to see if --do-cats names a feature which is length 1. If so, set output_format to SS ? or FASTA ? */ mfile = phast_fopen(maf_fname, "r"); block = mafBlock_read_next(mfile, NULL, NULL); if (splitInterval == -1 && gff==NULL) { //TODO: do we want to copy header from original MAF in this case? 
mafBlock_open_outfile(NULL, argc, argv); } while (block != NULL) { if (order_list != NULL) mafBlock_reorder(block, order_list); if (seqlist_str != NULL) mafBlock_subSpec(block, seqlist_str, include); if (mafBlock_numSpec(block)==0 || mafBlock_all_gaps(block)) goto get_next_block; if (stripILines) mafBlock_strip_iLines(block); if (stripELines) mafBlock_strip_eLines(block); if (base_mask_cutoff != -1) mafBlock_mask_bases(block, base_mask_cutoff, masked_file); //TODO: still need to implement (either here or elsewhere) // if (indel_mask_cutoff != -1) // mafBlock_mask_indels(block, indel_mask_cutoff, mfile); if (useRefseq) { //get refseq and check that it is consistent in MAF file currRefseq = mafBlock_get_refSpec(block); if (refseq == NULL) refseq = str_new_charstr(currRefseq->chars); else if (str_compare(refseq, currRefseq)!=0) die("Error: refseq not consistent in MAF (got %s, %s)\n", refseq->chars, currRefseq->chars); } if (startcol != 1 || endcol != -1) if (0 == mafBlock_trim(block, startcol, endcol, refseq, useRefseq ? 0 : lastIdx)) goto get_next_block; currSize = mafBlock_get_size(block, refseq); if (useRefseq) { currStart = mafBlock_get_start(block, refseq); if (currStart < lastIdx && sortWarned == 0) { fprintf(stderr, "Warning: input MAF not sorted with respect to refseq. Output files may not represent contiguous alignments. 
(%i, %i)\n", lastIdx, currStart); sortWarned = 1; } } else currStart = lastIdx; if (currStart < lastStart) gffSearchIdx = 0; lastStart = currStart; lastIdx = currStart + currSize; //split by length if (splitInterval != -1) { if (currLen == -1 || currLen+currSize > splitInterval) { sprintf(outfilename, splitFormat, out_root_fname, ++blockIdx, msa_suffix_for_format(output_format)); if (output_format == MAF) { if (outfile != NULL) mafBlock_close_outfile(outfile); outfile = mafBlock_open_outfile(outfilename, argc, argv); } else if (output_format != MAF && msa != NULL) { // msa_print_to_filename(msa, outfilename, output_format, pretty_print); msa_free(msa); msa = NULL; } currLen = 0; } currLen += currSize; } else outfile = stdout; if (gff != NULL && mask_features_spec != NULL) { gffSub = gff_subset_range_overlap_sorted(gff, currStart+1, lastIdx, &gffSearchIdx); if (gffSub != NULL) { mafBlock_mask_region(block, gffSub, mask_features_spec); gff_free_set(gffSub); } mafBlock_print(outfile, block, pretty_print); } else if (gff != NULL) { gffSub = gff_subset_range_overlap_sorted(gff, currStart+1, lastIdx, &gffSearchIdx); if (gffSub != NULL) { if (by_category) gff_group_by_feature(gffSub); else if (group_tag != NULL) gff_group(gffSub, group_tag); gff_sort(gffSub); gff_flatten_within_groups(gffSub); for (i=0; i<lst_size(gffSub->features); i++) { feat = (GFF_Feature*)lst_get_ptr(gffSub->features, i); MafBlock *subBlock = mafBlock_copy(block); mafBlock_trim(subBlock, feat->start, feat->end, refseq, 0); if (by_category) outfile = get_outfile(outfileList, outfileHash, feat->feature, out_root_fname, argc, argv); else if (group_tag != NULL) outfile = get_outfile(outfileList, outfileHash, gff_group_name(gffSub, feat), out_root_fname, argc, argv); else outfile = stdout; if (output_format == MAF) mafBlock_print(outfile, subBlock, pretty_print); // else msa_add_mafBlock(msa); mafBlock_free(subBlock); } gff_free_set(gffSub); } } else { if (output_format == MAF) mafBlock_print(outfile, 
block, pretty_print); // else msa = msa_add_mafBlock(mafBlock, msa, ); } get_next_block: mafBlock_free(block); block = mafBlock_read_next(mfile, NULL, NULL); } if (masked_file != NULL) fclose(masked_file); if (output_format == MAF) { if (by_category || group_tag != NULL) close_outfiles(outfileList, outfileHash); else if (outfile!=NULL) mafBlock_close_outfile(outfile); } else { msa_print(stdout, msa, output_format, pretty_print); msa_free(msa); } if (gff != NULL) gff_free_set(gff); phast_fclose(mfile); return 0; }
int main(int argc, char *argv[]) { char c; int opt_idx, i, j, k, N, nleaves; List *names, *treelist, *newlist, *tmpl, *groups = NULL; TreeNode *t, *tnew; int *used=NULL; struct option long_opts[] = { {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; while ((c = getopt_long(argc, argv, "h", long_opts, &opt_idx)) != -1) { switch (c) { case 'h': printf("%s", HELP); exit(0); case '?': die("Bad argument. Try 'treeGen -h'.\n"); } } if (optind < argc - 2 || optind > argc - 1) die("ERROR: Wrong number of arguments. Try 'treeGen -h'.\n"); set_seed(-1); names = get_arg_list(argv[optind]); if (lst_size(names) <= 1) die("ERROR: must specify at least two species names.\n"); if (optind == argc - 2) { groups = get_arg_list_int(argv[optind+1]); if (lst_size(names) != lst_size(groups)) die("ERROR: name list and group list must be equal in length.\n"); } nleaves = lst_size(names) - 1; /* excluding outgroup */ N = num_rooted_topologies(nleaves); if (groups != NULL) { int maxgroup = 0; for (i = 0; i < lst_size(groups); i++) if (lst_get_int(groups, i) > maxgroup) maxgroup = lst_get_int(groups, i); used = smalloc((maxgroup+1) * sizeof(int)); for (i = 0; i <= maxgroup; i++) used[i] = FALSE; } /* FIXME: eventually need to consider constraints here */ if (N > 1e9) fprintf(stderr, "WARNING: very large number of topologies expected (%d). 
Program may not finish.\n", N); /* start with tree consisting of first two names */ t = tr_new_trivial(((String*)lst_get_ptr(names, 0))->chars, ((String*)lst_get_ptr(names, 1))->chars); treelist = lst_new_ptr(1000); newlist = lst_new_ptr(1000); lst_push_ptr(treelist, t); if (groups != NULL) { /* use branch lengths to encode group membership -- sort of an ugly hack but should be okay here */ t->lchild->dparent = lst_get_int(groups, 0); t->rchild->dparent = lst_get_int(groups, 1); if (t->lchild->dparent == t->rchild->dparent) t->dparent = t->lchild->dparent; used[lst_get_int(groups, 0)] = TRUE; used[lst_get_int(groups, 1)] = TRUE; } for (i = 2; i < nleaves; i++) { char *nextname = ((String*)lst_get_ptr(names, i))->chars; int nextgroup = groups != NULL ? lst_get_int(groups, i) : -1; lst_clear(newlist); for (j = 0; j < lst_size(treelist); j++) { t = lst_get_ptr(treelist, j); /* create copies and add leaf to each internal branch */ for (k = 1; k < t->nnodes; k++) { TreeNode *n = lst_get_ptr(t->nodes, k); /* decide whether adding leaf to this branch is consistent with monophyletic groups */ if (groups != NULL) { int branchgroup = n->dparent; int ancgroup = n->parent->dparent; if (nextgroup > 0 && used[nextgroup]) { /* group is represented in the tree */ if (nextgroup != branchgroup) { continue; /* can only add to the designated subtree */ } } else { /* group is zero (background) or not yet represented in the tree */ if (branchgroup != 0 && nextgroup != branchgroup && branchgroup == ancgroup) { continue; /* only prohibit adding inside another designated subtree (adding to leading branch is okay) */ } } } tnew = tr_create_copy(t); tr_add_leaf_internal(tnew, k, nextname, nextgroup); lst_push_ptr(newlist, tnew); } /* now add leaf at root; this time reuse the original copy to avoid unnecessary memory reallocation */ if (nextgroup <= 0 || !used[nextgroup] || t->dparent == nextgroup) { tr_add_leaf_at_root(t, nextname, nextgroup); lst_push_ptr(newlist, t); } else tr_free(t); } 
/* swap treelist and newlist */ tmpl = treelist; treelist = newlist; newlist = tmpl; if (groups != NULL) used[nextgroup] = TRUE; } /* traverse list and add outgroup at root of each tree */ if (nleaves > 1) { for (j = 0; j < lst_size(treelist); j++) { t = lst_get_ptr(treelist, j); tr_add_leaf_at_root(t, ((String*)lst_get_ptr(names, nleaves))->chars, 0); } } /* print trees */ for (j = 0; j < lst_size(treelist); j++) { t = lst_get_ptr(treelist, j); tr_print(stdout, t, FALSE); } return 0; }
int main(int argc, char *argv[]) { char c; List *l; int i, j, strand, bed_output = 0, backgd_nmods = -1, feat_nmods = -1, winsize = -1, verbose = 0, max_nmods, memblocksize, old_nleaves, refidx = 1, base_by_base = FALSE, windowWig = FALSE; TreeModel **backgd_mods = NULL, **feat_mods = NULL; HMM *backgd_hmm = NULL, *feat_hmm = NULL; msa_format_type inform = UNKNOWN_FORMAT; GFF_Set *features = NULL; MSA *msa, *msa_compl=NULL; double **backgd_emissions, **feat_emissions, **mem, **dummy_emissions, *winscore_pos=NULL, *winscore_neg=NULL; int *no_alignment=NULL; List *pruned_names; char *msa_fname; FILE *infile; int opt_idx; struct option long_opts[] = { {"background-mods", 1, 0, 'b'}, {"background-hmm", 1, 0, 'B'}, {"feature-mods", 1, 0, 'f'}, {"feature-hmm", 1, 0, 'F'}, {"features", 1, 0, 'g'}, {"window", 1, 0, 'w'}, {"window-wig", 1, 0, 'W'}, {"base-by-base", 0, 0, 'y'}, {"msa-format", 1, 0, 'i'}, {"refidx", 1, 0, 'r'}, {"output-bed", 0, 0, 'd'}, {"verbose", 0, 0, 'v'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; while ((c = getopt_long(argc, argv, "B:b:F:f:r:g:w:W:i:ydvh", long_opts, &opt_idx)) != -1) { switch (c) { case 'B': backgd_hmm = hmm_new_from_file(phast_fopen(optarg, "r")); break; case 'b': l = get_arg_list(optarg); backgd_nmods = lst_size(l); backgd_mods = smalloc(backgd_nmods * sizeof(void*)); for (i = 0; i < backgd_nmods; i++) backgd_mods[i] = tm_new_from_file(phast_fopen(((String*)lst_get_ptr(l, i))->chars, "r"), 1); lst_free_strings(l); lst_free(l); break; case 'F': feat_hmm = hmm_new_from_file(phast_fopen(optarg, "r")); break; case 'f': l = get_arg_list(optarg); feat_nmods = lst_size(l); feat_mods = smalloc(feat_nmods * sizeof(void*)); for (i = 0; i < feat_nmods; i++) feat_mods[i] = tm_new_from_file(phast_fopen(((String*)lst_get_ptr(l, i))->chars, "r"), 1); lst_free_strings(l); lst_free(l); break; case 'g': features = gff_read_set(phast_fopen(optarg, "r")); break; case 'w': winsize = get_arg_int(optarg); if (winsize <= 0) die("ERROR: window size must be 
positive.\n"); break; case 'W': winsize = get_arg_int(optarg); if (winsize <= 0) die("ERROR: window size must be positive.\n"); windowWig = TRUE; break; case 'y': base_by_base = TRUE; break; case 'i': inform = msa_str_to_format(optarg); if (inform == UNKNOWN_FORMAT) die("Bad argument to -i.\n"); break; case 'r': refidx = get_arg_int_bounds(optarg, 0, INFTY); break; case 'd': bed_output = 1; break; case 'h': printf("%s", HELP); exit(0); case 'v': verbose = 1; break; case '?': die("Bad argument. Try '%s -h'.\n", argv[0]); } } set_seed(-1); if (backgd_mods == NULL || feat_mods == NULL) die("ERROR: -b and -f required. Try '%s -h'.\n", argv[0]); if (backgd_nmods == 1 && backgd_hmm == NULL) backgd_hmm = hmm_create_trivial(); else if (backgd_hmm == NULL) die("ERROR: -B required. Try '%s -h'.\n", argv[0]); if (feat_nmods == 1 && feat_hmm == NULL) feat_hmm = hmm_create_trivial(); else if (feat_hmm == NULL) die("ERROR: -F required. Try '%s -h'.\n", argv[0]); if ((winsize == -1 && features == NULL && !base_by_base) || (winsize != -1 && features != NULL) || (winsize != -1 && base_by_base) || (features != NULL && base_by_base)) die("ERROR: must specify exactly one of -g, -w, and -y. Try '%s -h'.\n", argv[0]); if (backgd_hmm->nstates != backgd_nmods) die("ERROR: number of states must equal number of tree models for background.\n"); if (feat_hmm->nstates != feat_nmods) die("ERROR: number of states must equal number of tree models for features.\n"); if (features != NULL && lst_size(features->features) == 0) die("ERROR: empty features file.\n"); if (base_by_base && (backgd_nmods > 1 || feat_nmods > 1)) die("ERROR: only single phylogenetic models (not HMMs) are supported with --base-by-base.\n"); if (optind != argc - 1) die("ERROR: too few arguments. 
Try '%s -h'.\n", argv[0]); if (verbose) fprintf(stderr, "Reading alignment ...\n"); msa_fname = argv[optind]; infile = phast_fopen(msa_fname, "r"); if (inform == UNKNOWN_FORMAT) inform = msa_format_for_content(infile, 1); if (inform == MAF) msa = maf_read(infile, NULL, 1, NULL, NULL, NULL, -1, TRUE, NULL, NO_STRIP, FALSE); else msa = msa_new_from_file_define_format(infile, inform, NULL); if (msa_alph_has_lowercase(msa)) msa_toupper(msa); msa_remove_N_from_alph(msa); /* need ordered representation of alignment */ if (msa->seqs == NULL && (msa->ss == NULL || msa->ss->tuple_idx == NULL) ) die("ERROR: ordered sufficient statistics are required.\n"); pruned_names = lst_new_ptr(msa->nseqs); for (i = 0; i < backgd_nmods; i++) { old_nleaves = (backgd_mods[i]->tree->nnodes + 1) / 2; tm_prune(backgd_mods[i], msa, pruned_names); if (lst_size(pruned_names) >= old_nleaves) die("ERROR: no match for leaves of tree in alignment (background model #%d)\n", i+1); else if (lst_size(pruned_names) > 0) { fprintf(stderr, "WARNING: pruned away leaves in background model (#%d) with no match in alignment (", i+1); for (j = 0; j < lst_size(pruned_names); j++) fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, j))->chars, j < lst_size(pruned_names) - 1 ? ", " : ").\n"); } lst_free_strings(pruned_names); } for (i = 0; i < feat_nmods; i++) { old_nleaves = (feat_mods[i]->tree->nnodes + 1) / 2; tm_prune(feat_mods[i], msa, pruned_names); if (lst_size(pruned_names) >= old_nleaves) die("ERROR: no match for leaves of tree in alignment (features model #%d)\n", i+1); else if (lst_size(pruned_names) > 0) { fprintf(stderr, "WARNING: pruned away leaves in features model (#%d) with no match in alignment (", i+1); for (j = 0; j < lst_size(pruned_names); j++) fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, j))->chars, j < lst_size(pruned_names) - 1 ? 
", " : ").\n"); } lst_free_strings(pruned_names); } lst_free(pruned_names); /* first have to subtract offset from features, if necessary */ if (msa->idx_offset != 0 && features != NULL) { for (i = 0; i < lst_size(features->features); i++) { GFF_Feature *f = lst_get_ptr(features->features, i); f->start -= msa->idx_offset; f->end -= msa->idx_offset; } } /* convert to coord frame of alignment */ if (features != NULL && refidx != 0) { if (verbose) fprintf(stderr, "Mapping coordinates ...\n"); msa_map_gff_coords(msa, features, refidx, 0, 0); if (lst_size(features->features) == 0) die("ERROR: no features within coordinate range of alignment.\n"); } /* Make a reverse complemented copy of the alignment. The two strands will be processed separately, to avoid problems with overlapping features, etc. */ if (!base_by_base) { /* skip in base by base case */ if (verbose) fprintf(stderr, "Creating reverse complemented alignment ...\n"); msa_compl = msa_create_copy(msa, 0); /* temporary workaround: make sure reverse complement not based on sufficient stats */ if (msa_compl->seqs == NULL) ss_to_msa(msa_compl); if (msa_compl->ss != NULL) { ss_free(msa_compl->ss); msa_compl->ss = NULL; } msa_reverse_compl(msa_compl); } /* allocate memory for computing scores */ backgd_emissions = smalloc(backgd_nmods * sizeof(void*)); for (i = 0; i < backgd_nmods; i++) backgd_emissions[i] = smalloc(msa->length * sizeof(double)); feat_emissions = smalloc(feat_nmods * sizeof(void*)); for (i = 0; i < feat_nmods; i++) feat_emissions[i] = smalloc(msa->length * sizeof(double)); max_nmods = max(backgd_nmods, feat_nmods); dummy_emissions = smalloc(max_nmods * sizeof(void*)); mem = smalloc(max_nmods * sizeof(void*)); /* memory for forward algorithm -- each block must be as large as the largest feature */ if (features != NULL) { for (i = 0, memblocksize = -1; i < lst_size(features->features); i++) { GFF_Feature *f = lst_get_ptr(features->features, i); if (f->end - f->start + 1 > memblocksize) memblocksize = 
f->end - f->start + 1; } } else memblocksize = winsize; /* -1 if base-by-base mode */ if (memblocksize > 0) for (i = 0; i < max_nmods; i++) mem[i] = smalloc(memblocksize * sizeof(double)); if (winsize != -1) { winscore_pos = smalloc(msa->length * sizeof(double)); winscore_neg = smalloc(msa->length * sizeof(double)); no_alignment = smalloc(msa->length * sizeof(int)); for (i = 0; i < msa->length; i++) { winscore_pos[i] = winscore_neg[i] = NEGINFTY; if (refidx == 0) no_alignment[i] = FALSE; else no_alignment[i] = msa_missing_col(msa, refidx, i); } } /* the rest will be repeated for each strand */ for (strand = 1; strand <= 2; strand++) { MSA *thismsa = strand == 1 ? msa : msa_compl; double *winscore = strand == 1 ? winscore_pos : winscore_neg; if (base_by_base && strand == 2) break; /* don't do second pass in base_by_base case */ if (verbose) fprintf(stderr, "Processing %c strand ...\n", strand == 1 ? '+' : '-'); /* set up dummy categories array, so that emissions are only computed where needed */ thismsa->categories = smalloc(thismsa->length * sizeof(int)); thismsa->ncats = 1; if (winsize != -1) { if (strand == 1) for (i = 0; i < thismsa->length; i++) thismsa->categories[i] = no_alignment[i] ? 0 : 1; else for (i = 0; i < thismsa->length; i++) thismsa->categories[i] = no_alignment[thismsa->length - i - 1] ? 
0 : 1; } else if (features != NULL) { for (i = 0; i < thismsa->length; i++) thismsa->categories[i] = 0; for (i = 0; i < lst_size(features->features); i++) { GFF_Feature *f = lst_get_ptr(features->features, i); if (f->start <= 0 || f->end <= 0) { fprintf(stderr, "WARNING: feature out of range ('"); gff_print_feat(stderr, f); fprintf(stderr, "')\n"); continue; } if (strand == 1 && f->strand != '-') for (j = f->start - 1; j < f->end; j++) thismsa->categories[j] = 1; else if (strand == 2 && f->strand == '-') for (j = thismsa->length - f->end; j < thismsa->length - f->start + 1; j++) thismsa->categories[j] = 1; } } else { /* base-by-base scores */ for (i = 0; i < thismsa->length; i++) thismsa->categories[i] = 1; } if (thismsa->ss != NULL) ss_update_categories(thismsa); /* compute emissions */ for (i = 0; i < backgd_nmods; i++) { if (verbose) fprintf(stderr, "Computing emissions for background model #%d ...\n", i+1); tl_compute_log_likelihood(backgd_mods[i], thismsa, backgd_emissions[i], NULL, 1, NULL); } for (i = 0; i < feat_nmods; i++) { if (verbose) fprintf(stderr, "Computing emissions for features model #%d ...\n", i+1); tl_compute_log_likelihood(feat_mods[i], thismsa, feat_emissions[i], NULL, 1, NULL); } /* now compute scores */ if (winsize != -1) { /* windows case */ int winstart; if (verbose) fprintf(stderr, "Computing scores ...\n"); for (winstart = 0; winstart <= thismsa->length - winsize; winstart++) { int centeridx = winstart + winsize/2; if (strand == 2) centeridx = thismsa->length - centeridx - 1; if (no_alignment[centeridx]) continue; for (j = 0; j < feat_nmods; j++) dummy_emissions[j] = &(feat_emissions[j][winstart]); winscore[centeridx] = hmm_forward(feat_hmm, dummy_emissions, winsize, mem); if (winscore[centeridx] <= NEGINFTY) { winscore[centeridx] = NEGINFTY; continue; } for (j = 0; j < backgd_nmods; j++) dummy_emissions[j] = &(backgd_emissions[j][winstart]); winscore[centeridx] -= hmm_forward(backgd_hmm, dummy_emissions, winsize, mem); if 
(winscore[centeridx] < NEGINFTY) winscore[centeridx] = NEGINFTY; } } else if (features != NULL) { /* features case */ if (verbose) fprintf(stderr, "Computing scores ...\n"); for (i = 0; i < lst_size(features->features); i++) { GFF_Feature *f = lst_get_ptr(features->features, i); int s, e; if ((strand == 1 && f->strand == '-') || (strand == 2 && f->strand != '-') || f->start <= 0 || f->end <= 0 || f->end - f->start < 0) continue; /* effective coords */ if (f->strand == '-') { s = thismsa->length - f->end + 1; e = thismsa->length - f->start + 1; } else { s = f->start; e = f->end; } f->score_is_null = 0; for (j = 0; j < feat_nmods; j++) dummy_emissions[j] = &(feat_emissions[j][s-1]); f->score = hmm_forward(feat_hmm, dummy_emissions, e - s + 1, mem); if (f->score <= NEGINFTY) { f->score = NEGINFTY; continue; } for (j = 0; j < backgd_nmods; j++) dummy_emissions[j] = &(backgd_emissions[j][s-1]); f->score -= hmm_forward(backgd_hmm, dummy_emissions, e - s + 1, mem); if (f->score < NEGINFTY) f->score = NEGINFTY; } } } if (verbose) fprintf(stderr, "Generating output ...\n"); if (winsize != -1 && windowWig == FALSE) { /* standard windows output */ for (i = 0, j = 0; i < msa->length; i++) { if (no_alignment[i] == FALSE) printf("%d\t%.3f\t%.3f\n", j + msa->idx_offset + 1, winscore_pos[i], winscore_neg[i]); if (ss_get_char_pos(msa, i, 0, 0) != GAP_CHAR) j++; } } else if (windowWig == TRUE) { /* windows with wig output */ int last = NEGINFTY; for (i = 0, j = 0; i < msa->length; i++) { if (refidx == 0 || msa_get_char(msa, refidx-1, i) != GAP_CHAR) { if (no_alignment[i] == FALSE && winscore_pos[i] > NEGINFTY) { if (j > last + 1) printf("fixedStep chrom=%s start=%d step=1\n", refidx > 0 ? 
msa->names[refidx-1] : "alignment", j + msa->idx_offset + 1); printf("%.3f\n", winscore_pos[i]); last = j; } j++; } } } else if (features != NULL) { /* features output */ /* return to coord frame of reference seq (also, replace offset) */ if (refidx != 0) msa_map_gff_coords(msa, features, 0, refidx, msa->idx_offset); else if (msa->idx_offset != 0) { for (i = 0; i < lst_size(features->features); i++) { GFF_Feature *f = lst_get_ptr(features->features, i); f->start += msa->idx_offset; f->end += msa->idx_offset; } } if (bed_output) gff_print_bed(stdout, features, FALSE); else gff_print_set(stdout, features); } else { /* base-by-base scores */ /* in this case, we can just output the difference between the emissions */ printf("fixedStep chrom=%s start=%d step=1\n", refidx > 0 ? msa->names[refidx-1] : "alignment", msa->idx_offset + 1); for (i = 0, j = 0; i < msa->length; i++) { if (refidx == 0 || msa_get_char(msa, refidx-1, i) != GAP_CHAR) { printf("%.3f\n", feat_emissions[0][i] - backgd_emissions[0][i]); j++; } } } if (verbose) fprintf(stderr, "\nDone.\n"); return 0; }
int main(int argc, char *argv[]) { struct phastCons_struct *p = phastCons_struct_new(0); struct option long_opts[] = { {"states", 1, 0, 'S'}, {"hmm", 1, 0, 'H'}, {"viterbi", 1, 0, 'V'}, {"most-conserved", 1, 0, 'V'}, /* same as --viterbi */ {"no-post-probs", 0, 0, 'n'}, {"msa-format", 1, 0, 'i'}, {"FC", 0, 0, 'X'}, {"lambda", 1, 0, 'l'}, {"target-coverage", 1, 0, 'C'}, {"transitions", 1, 0, 't'}, {"expected-length", 1, 0, 'E'}, {"expected-lengths", 1, 0, 'E'}, /* for backward compatibility */ {"estimate-trees", 1, 0, 'T'}, {"estimate-rho", 1, 0, 'O'}, {"rho", 1, 0, 'R'}, {"gc", 1, 0, 'G'}, {"ignore-missing", 0, 0, 'z'}, {"nrates", 1, 0, 'k'}, {"log", 1, 0, 'g'}, {"refidx", 1, 0, 'r'}, {"suppress-missing", 0, 0, 'x'}, /* for backward compatibility */ {"reflect-strand", 1, 0, 'U'}, {"catmap", 1, 0, 'c'}, {"extrapolate", 1, 0, 'e'}, {"indels", 0, 0, 'I'}, {"max-micro-indel", 1, 0, 'Y'}, {"indel-params", 1, 0, 'D'}, {"min-informative-types", 1, 0, 'M'}, /* for backward compatibility */ {"require-informative", 1, 0, 'M'}, {"not-informative", 1, 0, 'F'}, {"lnl", 1, 0, 'L'}, {"seqname", 1, 0, 'N'}, {"idpref", 1, 0, 'P'}, {"score", 0, 0, 's'}, {"coding-potential", 0, 0, 'p'}, {"indels-only", 0, 0, 'J'}, {"alias", 1, 0, 'A'}, {"quiet", 0, 0, 'q'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; /* other vars */ FILE *infile; char *msa_fname; char c; int opt_idx, i, coding_potential=FALSE; List *tmpl = NULL; String *tmpstr; char *mods_fname = NULL; List *mod_fname_list; msa_format_type msa_format = UNKNOWN_FORMAT; while ((c = getopt_long(argc, argv, "S:H:V:ni:k:l:C:G:zt:E:R:T:O:r:xL:sN:P:g:U:c:e:IY:D:JM:F:pA:Xqh", long_opts, &opt_idx)) != -1) { switch (c) { case 'S': p->states = get_arg_list(optarg); break; case 'H': p->hmm = hmm_new_from_file(phast_fopen(optarg, "r")); p->two_state = FALSE; break; case 'V': p->viterbi_f = phast_fopen(optarg, "w+"); tmpstr = str_new_charstr(optarg); if (str_ends_with_charstr(tmpstr, ".gff")) p->gff = TRUE; str_free(tmpstr); break; case 'n': 
p->post_probs = FALSE; break; case 'i': msa_format = msa_str_to_format(optarg); if (msa_format == UNKNOWN_FORMAT) die("ERROR: bad argument to --msa-format\n"); break; case 'X': p->FC = TRUE; p->two_state = FALSE; break; case 'l': if (optarg[0] != '~') p->estim_lambda = FALSE; else optarg = &optarg[1]; p->lambda = get_arg_dbl_bounds(optarg, 0, 1); break; case 'C': p->gamma = get_arg_dbl_bounds(optarg, 0, 1); break; case 'G': p->gc = get_arg_dbl_bounds(optarg, 0, 1); break; case 't': p->set_transitions = TRUE; if (optarg[0] != '~') p->estim_transitions = FALSE; else optarg = &optarg[1]; tmpl = get_arg_list_dbl(optarg); if (lst_size(tmpl) != 2) die("ERROR: bad argument to --transitions.\n"); p->mu = lst_get_dbl(tmpl, 0); p->nu = lst_get_dbl(tmpl, 1); if (p->mu <= 0 || p->mu >= 1 || p->nu <= 0 || p->nu >= 1) die("ERROR: bad argument to --transitions.\n"); lst_free(tmpl); break; case 'E': if (optarg[0] != '~') p->estim_transitions = FALSE; else optarg = &optarg[1]; p->omega = get_arg_dbl_bounds(optarg, 1, INFTY); p->mu = 1/p->omega; break; case 'T': p->estim_trees = TRUE; p->estim_trees_fname_root = optarg; break; case 'O': p->estim_rho = TRUE; p->estim_trees_fname_root = optarg; break; case 'z': p->ignore_missing = TRUE; break; case 'k': tmpl = get_arg_list_int(optarg); if (lst_size(tmpl) > 2) die("ERROR: too many arguments with --nrates.\n"); p->nrates = lst_get_int(tmpl, 0); if (p->nrates <= 0) die("ERROR: bad argument to --nrates (%d).\n", p->nrates); if (lst_size(tmpl) == 2) { p->nrates2 = lst_get_int(tmpl, 1); if (p->nrates2 <= 0) die("ERROR: bad argument to --nrates (%d).\n", p->nrates2); } lst_free(tmpl); break; case 'R': p->rho = get_arg_dbl_bounds(optarg, 0, 1); break; case 'g': if (!strcmp(optarg, "-")) p->log_f = stderr; else p->log_f = phast_fopen(optarg, "w+"); break; case 'r': p->refidx = get_arg_int_bounds(optarg, 0, INFTY); break; case 'x': /* do nothing; left in for backward compatibility */ break; case 'U': p->pivot_states = get_arg_list(optarg); /* 
we want strings not ints for phmm_new */ break; case 'e': p->extrapolate_tree_fname = optarg; break; case 'I': p->indels = TRUE; break; case 'Y': p->max_micro_indel = get_arg_int_bounds(optarg, 1, INFTY); break; case 'D': if (optarg[0] != '~') p->estim_indels = FALSE; else optarg = &optarg[1]; tmpl = get_arg_list_dbl(optarg); if (lst_size(tmpl) != 6) die("ERROR: bad argument to --indel-params.\n"); p->alpha_0 = lst_get_dbl(tmpl, 0); p->beta_0 = lst_get_dbl(tmpl, 1); p->tau_0 = lst_get_dbl(tmpl, 2); p->alpha_1 = lst_get_dbl(tmpl, 3); p->beta_1 = lst_get_dbl(tmpl, 4); p->tau_1 = lst_get_dbl(tmpl, 5); if (p->alpha_0 < 0 || p->beta_0 < 0 || p->tau_0 < 0 || p->alpha_1 < 0 || p->beta_1 < 0 || p->tau_1 < 0) die("ERROR: bad argument to --indel-params.\n"); lst_free(tmpl); break; case 'J': p->indels_only = TRUE; p->two_state = FALSE; p->indels = TRUE; p->post_probs = FALSE; break; case 'M': p->inform_reqd = get_arg_list(optarg); break; case 'F': p->not_informative = get_arg_list(optarg); break; case 'c': p->cm = cm_new_string_or_file(optarg); break; case 'L': p->lnl_f = phast_fopen(optarg, "w+"); break; case 'N': p->seqname = optarg; break; case 'P': p->idpref = optarg; break; case 's': p->score = TRUE; break; case 'p': coding_potential = TRUE; break; case 'A': p->alias_hash = make_name_hash(optarg); break; case 'q': p->results_f = NULL; break; case 'h': printf("%s", HELP); exit(0); case '?': die("Bad argument. Try '%s -h'.\n", argv[0]); } } if ((!coding_potential && optind != argc - 2) || (coding_potential && optind != argc - 2 && optind != argc - 1)) die("ERROR: extra or missing arguments. 
Try '%s -h'.\n", argv[0]); set_seed(-1); if (p->extrapolate_tree_fname != NULL && !strcmp(p->extrapolate_tree_fname, "default")) { p->extrapolate_tree_fname = smalloc((strlen(PHAST_HOME)+100)*sizeof(char)); #if defined(__MINGW32__) sprintf(p->extrapolate_tree_fname, "%s\\data\\exoniphy\\mammals\\cftr25_hybrid.nh", PHAST_HOME); #else sprintf(p->extrapolate_tree_fname, "%s/data/exoniphy/mammals/cftr25_hybrid.nh", PHAST_HOME); #endif } if (p->extrapolate_tree_fname != NULL) p->extrapolate_tree = tr_new_from_file(phast_fopen(p->extrapolate_tree_fname, "r")); mods_fname = (optind == argc - 2 ? argv[argc - 1] : NULL); /* if there are two args, mods are the second one; otherwise will use default mods for coding potential (see below) */ /* set defaults for coding-potential mode */ if (coding_potential) { char tmp[5000]; p->two_state = FALSE; if (p->cm == NULL) p->cm = cm_new_string_or_file("NCATS=4; CNS 1; CDS 2-4"); if (p->hmm == NULL) { #if defined(__MINGW32__) sprintf(tmp, "%s\\data\\phastCons\\%s", PHAST_HOME, p->indels ? "simple-coding-indels.hmm" : "simple-coding.hmm"); #else sprintf(tmp, "%s/data/phastCons/%s", PHAST_HOME, p->indels ? 
"simple-coding-indels.hmm" : "simple-coding.hmm"); #endif if (p->results_f!=NULL) fprintf(p->results_f, "Reading HMM from %s...\n", tmp); p->hmm = hmm_new_from_file(phast_fopen(tmp, "r")); } if (mods_fname == NULL) { #if defined(__MINGW32__) sprintf(tmp, "%s\\data\\exoniphy\\mammals\\r3.ncns.mod, %s\\data\\exoniphy\\mammals\\r3.cns.mod, %s\\data\\exoniphy\\mammals\\r3.cds-1.mod, %s\\data\\exoniphy\\mammals\\r3.cds-2.mod, %s\\data\\exoniphy\\mammals\\r3.cds-3.mod", PHAST_HOME, PHAST_HOME, PHAST_HOME, PHAST_HOME, PHAST_HOME); #else sprintf(tmp, "\ %s/data/exoniphy/mammals/r3.ncns.mod,\ %s/data/exoniphy/mammals/r3.cns.mod,\ %s/data/exoniphy/mammals/r3.cds-1.mod,\ %s/data/exoniphy/mammals/r3.cds-2.mod,\ %s/data/exoniphy/mammals/r3.cds-3.mod", PHAST_HOME, PHAST_HOME, PHAST_HOME, PHAST_HOME, PHAST_HOME); #endif mods_fname = tmp; } if (p->states == NULL) p->states = get_arg_list("CDS"); if (p->pivot_states == NULL) p->pivot_states = get_arg_list("background,CNS"); }
int main(int argc, char *argv[]) { char c; int i, j, t, opt_idx, ntrees, nleaves = -1; TreeNode *n, *node_i, *node_j, *lca, *nametree = NULL; TreeNode **tree; List *leaves, ***distance, *tree_fnames, *tot_dist; int mod = FALSE; char **leaf_name; String *trees_arg; FILE *F; struct option long_opts[] = { {"mod", 0, 0, 'm'}, {"tree", 1, 0, 't'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; while ((c = getopt_long(argc, argv, "mt:h", long_opts, &opt_idx)) != -1) { switch (c) { case 'm': mod = TRUE; break; case 't': if (optarg[0] == '(') nametree = tr_new_from_string(optarg); else nametree = tr_new_from_file(phast_fopen(optarg, "r")); break; case 'h': usage(argv[0]); case '?': die("Bad argument. Try '%s -h'.\n", argv[0]); } } if (optind > argc - 1) die("Input filename required. Try '%s -h'.\n", argv[0]); set_seed(-1); /* build a comma-delimited list and pass to get_arg_list; allows possibility of reading from file via '*' operator */ trees_arg = str_new(1000); for (i = optind; i < argc; i++) { str_append_charstr(trees_arg, argv[i]); if (i < argc - 1) str_append_char(trees_arg, ','); } tree_fnames = get_arg_list(trees_arg->chars); ntrees = lst_size(tree_fnames); tree = smalloc(ntrees * sizeof(void*)); /* read trees */ for (t = 0; t < ntrees; t++) { String *fname = lst_get_ptr(tree_fnames, t); if (mod) { TreeModel *m = tm_new_from_file(F = phast_fopen(fname->chars, "r"), 1); tree[t] = tr_create_copy(m->tree); tm_free(m); phast_fclose(F); } else tree[t] = tr_new_from_file(phast_fopen(fname->chars, "r")); } /* initialization */ nleaves = (tree[0]->nnodes + 1)/2; leaves = lst_new_ptr(nleaves); distance = smalloc(nleaves * sizeof(void*)); leaf_name = smalloc(nleaves * sizeof(void*)); for (i = 0; i < nleaves; i++) { distance[i] = smalloc(nleaves * sizeof(void*)); for (j = i+1; j < nleaves; j++) distance[i][j] = lst_new_dbl(ntrees); } if (nametree == NULL) nametree = tree[0]; for (i = 0, j = 0; i < lst_size(nametree->nodes); i++) { n = lst_get_ptr(nametree->nodes, i); if (n->lchild == 
NULL && n->rchild == NULL) leaf_name[j++] = n->name; } tot_dist = lst_new_dbl(ntrees); /* now compute distances */ for (t = 0; t < ntrees; t++) { /* obtain list of leaves */ lst_clear(leaves); for (i = 0; i < lst_size(tree[t]->nodes); i++) { n = lst_get_ptr(tree[t]->nodes, i); if (n->lchild == NULL && n->rchild == NULL) lst_push_ptr(leaves, n); } if (lst_size(leaves) != nleaves) die("ERROR: trees have different numbers of leaves.\n"); /* look at all pairs */ for (i = 0; i < nleaves; i++) { node_i = lst_get_ptr(leaves, i); for (j = i+1; j < nleaves; j++) { double dist = 0; node_j = lst_get_ptr(leaves, j); /* because ids are assigned in pre-order, the first ancestor of node j that has an id less than i is the LCA of i and j; we seek the sum of distances from both i and j to this node */ for (n = node_j; n->id >= node_i->id; n = n->parent) dist += n->dparent; lca = n; for (n = node_i; n != lca; n = n->parent) dist += n->dparent; lst_push_dbl(distance[i][j], dist); } } lst_push_dbl(tot_dist, tr_total_len(tree[t])); } /* print distances and (optionally) stats */ if (ntrees == 1) { for (i = 0; i < nleaves; i++) { for (j = i+1; j < nleaves; j++) { printf ("%s\t%s\t%f\n", leaf_name[i], leaf_name[j], lst_get_dbl(distance[i][j], 0)); } } printf ("%s\t%s\t%f\n", "(total)", "-", lst_get_dbl(tot_dist, 0)); } else { double mean, stdev; double quantiles[] = {0, 0.025, 0.05, 0.5, 0.95, 0.975, 1}; double quantile_vals[7]; printf("%-15s %-15s %9s %9s %9s %9s %9s %9s %9s %9s %9s\n", "leaf1", "leaf2", "mean", "stdev", "median", "min", "max", "95%_min", "95%_max", "90%_min", "90%_max"); for (i = 0; i < nleaves; i++) { for (j = i+1; j < nleaves; j++) { mean = lst_dbl_mean(distance[i][j]); stdev = lst_dbl_stdev(distance[i][j]); lst_qsort_dbl(distance[i][j], ASCENDING); lst_dbl_quantiles(distance[i][j], quantiles, 7, quantile_vals); printf("%-15s %-15s %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f\n", leaf_name[i], leaf_name[j], mean, stdev, quantile_vals[3], quantile_vals[0], 
quantile_vals[6], quantile_vals[1], quantile_vals[5], quantile_vals[2], quantile_vals[4]); } } /* also do total branch len */ mean = lst_dbl_mean(tot_dist); stdev = lst_dbl_stdev(tot_dist); lst_qsort_dbl(tot_dist, ASCENDING); lst_dbl_quantiles(tot_dist, quantiles, 7, quantile_vals); printf("%-15s %-15s %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f\n", "(total)", "-", mean, stdev, quantile_vals[3], quantile_vals[0], quantile_vals[6], quantile_vals[1], quantile_vals[5], quantile_vals[2], quantile_vals[4]); } return 0; }
int main(int argc, char *argv[]) { TreeNode *tree = NULL; TreeModel *backgd_mod = NULL; int i, j, size = DEFAULT_SIZE, meme_mode = 0, profile_mode = 0, nrestarts = 10, npseudocounts = 5, nsamples = -1, nmostprevalent = -1, tuple_size = -1, nbest = -1, sample_parms = 0, nmotifs = DEFAULT_NUMBER, nseqs = -1, do_html = 0, do_bed = 0, suppress_stdout = 0; List *msa_name_list = NULL, *pos_examples = NULL, *init_list = NULL, *tmpl; List *msas, *motifs; SeqSet *seqset = NULL; PooledMSA *pmsa = NULL; msa_format_type msa_format = UNKNOWN_FORMAT; Vector *backgd_mnmod = NULL; Hashtable *hash=NULL; String *output_prefix = str_new_charstr("phastm."); double *has_motif = NULL; double prior = PRIOR; char c; GFF_Set *bedfeats = NULL; while ((c = getopt(argc, argv, "t:i:b:sk:md:pn:I:R:P:w:c:SB:o:HDxh")) != -1) { switch (c) { case 't': tree = tr_new_from_file(phast_fopen(optarg, "r")); break; case 'i': msa_format = msa_str_to_format(optarg); if (msa_format == UNKNOWN_FORMAT) die("ERROR: bad input format.\n"); break; case 'b': backgd_mod = tm_new_from_file(phast_fopen(optarg, "r"), 1); break; case 's': break; case 'k': size = get_arg_int(optarg); break; case 'm': meme_mode = 1; break; case 'd': pos_examples = get_arg_list(optarg); break; case 'p': profile_mode = 1; break; case 'n': nrestarts = get_arg_int(optarg); break; case 'I': init_list = get_arg_list(optarg); break; case 'P': tmpl = str_list_as_int(get_arg_list(optarg)); if (lst_size(tmpl) != 2) die("ERROR: bad argument to -P.\n"); nmostprevalent = lst_get_int(tmpl, 0); tuple_size = lst_get_int(tmpl, 1); if (!(nmostprevalent > 0 && tuple_size > 0)) die("ERROR: bad argument nmostprevalent=%i tuple_size=%i\n", nmostprevalent, tuple_size); lst_free(tmpl); break; case 'R': tmpl = str_list_as_int(get_arg_list(optarg)); if (lst_size(tmpl) != 2) die("ERROR: bad argument to -R.\n"); nsamples = lst_get_int(tmpl, 0); tuple_size = lst_get_int(tmpl, 1); if (!(nsamples > 0 && tuple_size > 0)) die("ERROR nsamples=%i tuple_sizse=%i\n", 
nsamples, tuple_size); lst_free(tmpl); break; case 'c': npseudocounts = get_arg_int(optarg); break; case 'w': nbest = get_arg_int(optarg); break; case 'S': sample_parms = 1; break; case 'B': nmotifs = get_arg_int(optarg); break; case 'o': str_free(output_prefix); output_prefix = str_new_charstr(optarg); str_append_char(output_prefix, '.'); break; case 'H': do_html = 1; break; case 'D': do_bed = 1; break; case 'x': suppress_stdout = 1; break; case 'h': usage(argv[0]); case '?': die("Bad argument. Try '%s -h'.\n", argv[0]); } } if (optind != argc - 1) die("ERROR: List of alignment files required. Try '%s -h'.\n", argv[0]); if ((nsamples > 0 && nmostprevalent > 0) || (nsamples > 0 && init_list != NULL) || (nmostprevalent > 0 && init_list != NULL)) die("ERROR: -I, -P, and -R are mutually exclusive."); set_seed(-1); msa_name_list = get_arg_list(argv[optind]); if (backgd_mod != NULL && tree == NULL) tree = backgd_mod->tree; if (tree == NULL && !meme_mode && !profile_mode) die("ERROR: Must specify -t, -m, or -p.\n"); if ((init_list != NULL || nsamples > 0 || nmostprevalent > 0) && !sample_parms) nrestarts = 1; if (pos_examples != NULL) { hash = hsh_new(lst_size(pos_examples)); for (i = 0; i < lst_size(pos_examples); i++) hsh_put_int(hash, ((String*)lst_get_ptr(pos_examples, i))->chars, 1); has_motif = smalloc(lst_size(msa_name_list) * sizeof(double)); } /* open all MSAs */ msas = lst_new_ptr(lst_size(msa_name_list)); fprintf(stderr, "Reading alignment(s) ...\n"); for (i = 0, j = 0; i < lst_size(msa_name_list); i++) { String *name = lst_get_ptr(msa_name_list, i); FILE *mfile = phast_fopen(name->chars, "r"); msa_format_type temp_format; MSA *msa; if (msa_format == UNKNOWN_FORMAT) temp_format = msa_format_for_content(mfile, 1); else temp_format = msa_format; msa = msa_new_from_file_define_format(mfile, temp_format, NULL); phast_fclose(mfile); if (nseqs == -1) nseqs = msa->nseqs; if (!meme_mode && (msa->length - msa_num_gapped_cols(msa, STRIP_ANY_GAPS, -1, -1) < 300 || 
msa->nseqs != nseqs)) { fprintf(stderr, "WARNING: ignoring alignment '%s' -- too few informative sites.\n", name->chars); msa_free(msa); continue; } if (msa_alph_has_lowercase(msa)) msa_toupper(msa); msa_remove_N_from_alph(msa); /* Ns can be a problem */ lst_push_ptr(msas, msa); if (has_motif != NULL) { int k, hm = (hsh_get_int(hash, name->chars) == 1); if (meme_mode) { /* here need to record at individ seq level */ has_motif = srealloc(has_motif, (j + msa->nseqs + 1) * sizeof(double)); /* FIXME */ for (k = 0; k < msa->nseqs; k++) has_motif[j++] = hm; } else has_motif[j++] = hm; } } if (!meme_mode) { fprintf(stderr, "Extracting and pooling sufficient statistics ...\n"); pmsa = ss_pooled_from_msas(msas, 1, size, NULL, 0); msa_remove_N_from_alph(pmsa->pooled_msa); } /* obtain individual sequences, if necessary */ if (nmostprevalent > 0 || nsamples > 0 || meme_mode) { if (meme_mode) fprintf(stderr, "Converting to individual sequences ...\n"); else fprintf(stderr, "Obtaining reference sequences for pre-processing ...\n"); seqset = mtf_get_seqset(msas, meme_mode ? -1 : 1, 10 * size); /* for now, assume 1st seq is reference */ msa_remove_N_from_alph(seqset->set); } if (nmostprevalent > 0) { fprintf(stderr, "Obtaining %d most prevalent %d-tuples ...\n", nmostprevalent, tuple_size); init_list = lst_new_ptr(nmostprevalent); mtf_get_common_ntuples(seqset, init_list, tuple_size, nmostprevalent); } else if (nsamples > 0) { fprintf(stderr, "Sampling %d %d-tuples ...\n", nsamples, tuple_size); init_list = lst_new_ptr(nsamples); mtf_sample_ntuples(seqset, init_list, tuple_size, nsamples); } /* in meme_mode, backgd model can be specified as eq freqs in a .mod file */ if (meme_mode && backgd_mod != NULL && has_motif == NULL) backgd_mnmod = backgd_mod->backgd_freqs; /* estimate background model, if necessary */ else if (backgd_mod == NULL && (!meme_mode || has_motif == NULL)) { fprintf(stderr, "Fitting background model%s ...\n", has_motif == NULL ? 
"" : " (for use in initialization)"); /* if discriminative, be clear backgd isn't really part of the estimation procedure */ if (meme_mode) { backgd_mnmod = vec_new(strlen(seqset->set->alphabet)); mtf_estim_backgd_mn(seqset, backgd_mnmod); } else { backgd_mod = tm_new(tr_create_copy(tree), NULL, NULL, F81, pmsa->pooled_msa->alphabet, 1, 0, NULL, -1); tm_fit(backgd_mod, pmsa->pooled_msa, tm_params_init(backgd_mod, .1, 5, 0), -1, OPT_MED_PREC, NULL, 0, NULL); } } /* select subset of init strings, if necessary */ if (nbest > 0 && init_list != NULL) { fprintf(stderr, "Winnowing candidate start strings ...\n"); tmpl = lst_new_ptr(nbest); mtf_winnow_starts(meme_mode ? (void*)seqset : (void*)pmsa, init_list, nbest, tmpl, !meme_mode, size, tree, meme_mode ? (void*)backgd_mnmod : (void*)backgd_mod, has_motif); lst_free(init_list); init_list = tmpl; } /* Now find motifs */ motifs = mtf_find(meme_mode ? (void*)seqset : (void*)pmsa, !meme_mode, size, nmotifs, tree, meme_mode ? (void*)backgd_mnmod : (void*)backgd_mod, has_motif, prior, nrestarts, init_list, sample_parms, npseudocounts); fprintf(stderr, "\n\n"); if (do_bed) bedfeats = gff_new_set_init("phast_motif", "0.1b"); /* generate output */ for (i = 0; i < lst_size(motifs); i++) { Motif *m = lst_get_ptr(motifs, i); if (!suppress_stdout) { if (lst_size(motifs) > 1) printf("\n**********\nMOTIF #%d\n**********\n\n", i+1); mtf_print(stdout, m); } if (do_html) { String *fname = str_dup(output_prefix); str_append_int(fname, i+1); str_append_charstr(fname, ".html"); mtf_print_html(phast_fopen(fname->chars, "w+"), m); str_free(fname); } if (do_bed) mtf_add_features(m, bedfeats); } if (do_html) { String *fname = str_dup(output_prefix); str_append_charstr(fname, "index.html"); mtf_print_summary_html(phast_fopen(fname->chars, "w+"), motifs, output_prefix); str_free(fname); } if (do_bed) { String *fname = str_dup(output_prefix); str_append_charstr(fname, "bed"); gff_print_bed(phast_fopen(fname->chars, "w+"), bedfeats, FALSE); 
str_free(fname); } return 0; }