int main(int argc, char *argv[]) { int check_start = 0, check_stop = 0, check_splice = 0, check_nonsense = 0, offset5 = 0, offset3 = 0, opt_idx, i, j, indel_strict = 0, no_output = 0, check_alignment = 0, splice_strict = 0; int ncons_tested, nkept, nconserved_exons; int nce_gap_type[NGAP_TYPES], nconsid[NTYPES], nfail[NTYPES]; double Nfrac = 0.05; char c; MSA *msa; GFF_Set *gff; msa_format_type msa_format = UNKNOWN_FORMAT; List *keepers, *problems = lst_new_ptr(10), *ends_adjusted = lst_new_ptr(1), *starts_adjusted = lst_new_ptr(1), *discards=NULL, *intron_splice = lst_new_ptr(10); char *rseq_fname = NULL; FILE *logf = NULL, *mlogf = NULL, *statsf = NULL, *discardf = NULL; cds_gap_type fshift_mode = FSHIFT_BAD; char *groupby = "transcript_id"; msa_coord_map *map; int *countNs, *countCDSs; FILE *infile; char *msa_fname; struct option long_opts[] = { {"start", 0, 0, 's'}, {"stop", 0, 0, 't'}, {"splice", 0, 0, 'l'}, {"nonsense", 0, 0, 'n'}, {"fshift", 0, 0, 'f'}, {"conserved", 0, 0, 'c'}, {"N-limit", 1, 0, 'N'}, {"clean-gaps", 0, 0, 'e'}, {"indel-strict", 0, 0, 'I'}, {"splice-strict", 0, 0, 'C'}, {"groupby", 1, 0, 'g'}, {"msa-format", 1, 0, 'i'}, {"refseq", 1, 0, 'r'}, {"offset5", 1, 0, 'o'}, {"offset3", 1, 0, 'p'}, {"no-output", 0, 0, 'x'}, {"discards", 1, 0, 'd'}, {"log", 1, 0, 'L'}, {"machine-log", 1, 0, 'M'}, {"stats", 1, 0, 'S'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; while ((c = (char)getopt_long(argc, argv, "N:i:r:L:M:S:g:d:stlnfceICxh", long_opts, &opt_idx)) != -1) { switch(c) { case 's': check_alignment = check_start = 1; break; case 't': check_alignment = check_stop = 1; break; case 'l': check_alignment = check_splice = 1; break; case 'n': check_alignment = check_nonsense = 1; break; case 'f': check_alignment = 1; fshift_mode = FSHIFT_OK; break; case 'c': check_alignment = check_start = check_stop = check_splice = check_nonsense = 1; if (fshift_mode < FSHIFT_OK) fshift_mode = FSHIFT_OK; break; case 'N': Nfrac = get_arg_dbl_bounds(optarg, 0, 1); break; case 'e': check_alignment = 1; if (fshift_mode < CLN_GAPS) fshift_mode = CLN_GAPS; break; case 'I': check_alignment = 1; fshift_mode = NOVRLP_CLN_GAPS; indel_strict = 1; break; case 'C': check_alignment = check_splice = splice_strict = 1; break; case 'g': groupby = optarg; break; case 'i': msa_format = msa_str_to_format(optarg); if (msa_format == UNKNOWN_FORMAT) die("Bad alignment format.\n"); break; case 'r': rseq_fname = optarg; break; case 'o': offset5 = get_arg_int(optarg); break; case 'p': offset3 = get_arg_int(optarg); break; case 'L': logf = phast_fopen(optarg, "w+"); break; case 'M': mlogf = phast_fopen(optarg, "w+"); break; case 'S': statsf = phast_fopen(optarg, "w+"); break; case 'd': discardf = phast_fopen(optarg, "w+"); break; case 'x': no_output = 1; break; case 'h': printf("%s", HELP); exit(0); case '?': die("ERROR: Bad argument. Try the --help option.\n"); } } if (optind + 1 >= argc ) { die("ERROR: Missing required arguments. Try the --help option.\n"); } set_seed(-1); gff = gff_read_set(phast_fopen(argv[optind], "r")); msa_fname = argv[optind+1]; infile = phast_fopen(msa_fname, "r"); if (msa_format == UNKNOWN_FORMAT) msa_format = msa_format_for_content(infile, 1); if (msa_format == MAF) { msa = maf_read(infile, rseq_fname == NULL ? NULL : phast_fopen(rseq_fname, "r"), 1, NULL, NULL, NULL, -1, TRUE, NULL, NO_STRIP, FALSE); } else { msa = msa_new_from_file_define_format(infile, msa_format, NULL); if (msa->ss == NULL) ss_from_msas(msa, 1, 1, NULL, NULL, NULL, -1, 0); } if (!msa->ss->tuple_idx) die("ERROR: need ordered tuples\n"); msa_remove_N_from_alph(msa); /* for backward compatibility (old SS files) */ if (msa->idx_offset != 0) { /* avoids offset problem */ for (i = 0; i < lst_size(gff->features); i++) { GFF_Feature *f = lst_get_ptr(gff->features, i); f->start -= msa->idx_offset; f->end -= msa->idx_offset; } } /* set up coordinate map; assume GFF is for sequence 1 */ map = msa_build_coord_map(msa, 1); /* convert all features */ for (i = 0; i < lst_size(gff->features); i++) { GFF_Feature *f = lst_get_ptr(gff->features, i); int newstart, newend; if (f->start < 0 || f->end < f->start) die("ERROR: bad feature in GFF (start=%d, end=%d).\n", f->start, f->end); newstart = msa_map_seq_to_msa(map, f->start); newend = msa_map_seq_to_msa(map, f->end); if (newstart < 0 || newend < newstart) die("ERROR: unable to map coordinates for feature (start=%d, end=%d).\n", f->start, f->end); f->start = newstart; f->end = newend; } gff_group(gff, groupby); /* do this after coord conversion, or group coords and feature coords will be out of sync */ keepers = lst_new_ptr(lst_size(gff->features)); if (discardf != NULL) discards = lst_new_ptr(lst_size(gff->features)); ncons_tested = nkept = nconserved_exons = 0; for (i = 0; i < NTYPES; i++) nconsid[i] = 0; for (i = 0; i < NTYPES; i++) nfail[i] = 0; for (i = 0; i < NGAP_TYPES; i++) nce_gap_type[i] = 0; countNs = smalloc(msa->nseqs * sizeof(int)); countCDSs = smalloc(msa->nseqs * sizeof(int)); for (i = 0; i < lst_size(gff->groups); i++) { GFF_FeatureGroup *group = lst_get_ptr(gff->groups, i); List *gfeatures = group->features; GFF_Feature *feat; status_type status = OKAY; cds_gap_type gt = FSHIFT_BAD; problems_clear(problems); /* make sure have frame info for CDSs */ for (j = 0; j < lst_size(gfeatures); j++) { feat = lst_get_ptr(gfeatures, j); if (str_equals_charstr(feat->feature, GFF_CDS_TYPE) && feat->frame == GFF_NULL_FRAME) die("ERROR: Missing frame info for CDS.\n"); } /* First, exclude stop codons from cds's, if necessary (simplifies the detection of nonsense mutations). */ exclude_stops(group, starts_adjusted, ends_adjusted); /* In all cases, discard any group for which the reference sequence doesn't have valid splice sites or start/stop codons, or has a premature stop codon */ if (!ref_seq_okay(gfeatures, msa, offset3, indel_strict, splice_strict, problems)) { status = BAD_REF; nfail[BAD_REF]++; } else /* Everything else counts as a potentially valid group */ ncons_tested++; if (status == OKAY && check_alignment) { /* only bother with below if interested in cross-species conservation */ /* Check first to make sure there's alignment across species in the cds; if not, there's no need to look at individual features. */ for (j = 0; j < lst_size(gfeatures); j++) { feat = lst_get_ptr(gfeatures, j); if (str_equals_charstr(feat->feature, GFF_CDS_TYPE) && is_incomplete_alignment(feat, msa)) { status = NO_ALN; nfail[NO_ALN]++; problem_add(problems, feat, NO_ALN, -1, -1); break; } } if (status == OKAY) { /* we have alignment and agreement with the ref seq; now check feature by feature */ lst_clear(intron_splice); for (j = 0; j < msa->nseqs; j++) countNs[j] = countCDSs[j] = 0; for (j = 0; j < lst_size(gfeatures); j++) { feat = lst_get_ptr(gfeatures, j); if (feat->end - 1 >= msa->length) die("ERROR: feature extends beyond alignment (%d >= %d).\n", feat->end - 1, msa->length); if (check_start && str_equals_charstr(feat->feature, GFF_START_TYPE)) { nconsid[BAD_START]++; if (!is_conserved_start(feat, msa)) { status = BAD_START; problem_add(problems, feat, BAD_START, -1, -1); } } else if (check_stop && str_equals_charstr(feat->feature, GFF_STOP_TYPE)) { nconsid[BAD_STOP]++; if (!is_conserved_stop(feat, msa)) { status = BAD_STOP; problem_add(problems, feat, BAD_STOP, -1, -1); } } else if (check_splice && str_equals_charstr(feat->feature, SPLICE_5)) { nconsid[BAD_5_SPLICE]++; if (!is_conserved_5splice(feat, msa, offset5, splice_strict)) { status = BAD_5_SPLICE; problem_add(problems, feat, BAD_5_SPLICE, -1, -1); } else lst_push_ptr(intron_splice, feat); } else if (check_splice && str_equals_charstr(feat->feature, SPLICE_5_UTR)) { nconsid[BAD_5_SPLICE_UTR]++; if (!is_conserved_5splice(feat, msa, offset5, splice_strict)) { status = BAD_5_SPLICE_UTR; problem_add(problems, feat, BAD_5_SPLICE_UTR, -1, -1); } else lst_push_ptr(intron_splice, feat); } else if (check_splice && str_equals_charstr(feat->feature, SPLICE_3)) { nconsid[BAD_3_SPLICE]++; if (!is_conserved_3splice(feat, msa, offset3, splice_strict)) { status = BAD_3_SPLICE; problem_add(problems, feat, BAD_3_SPLICE, -1, -1); } else lst_push_ptr(intron_splice, feat); } else if (check_splice && str_equals_charstr(feat->feature, SPLICE_3)) { nconsid[BAD_3_SPLICE_UTR]++; if (!is_conserved_3splice(feat, msa, offset3, splice_strict)) { status = BAD_3_SPLICE_UTR; problem_add(problems, feat, BAD_3_SPLICE_UTR, -1, -1); } else lst_push_ptr(intron_splice, feat); } else if (str_equals_charstr(feat->feature, GFF_CDS_TYPE)) { if (fshift_mode > FSHIFT_BAD && (gt = get_cds_gap_type(feat, msa, problems)) < fshift_mode) { if (status == OKAY || status == NONSENSE) status = FSHIFT; } if (check_nonsense && !is_nonsense_clean(feat, msa, problems)) { if (status == OKAY) status = NONSENSE; } if (Nfrac < 1) get_N_counts(countNs, countCDSs, feat, msa); } } /* end loop through features in group */ /* still have to make sure splice sites are paired correctly (GT-AG, GC-AG, AT-AC) */ if (status == OKAY && !splice_strict && lst_size(intron_splice) >= 2 && !are_introns_okay(intron_splice, msa, problems, offset5, offset3)) status = BAD_INTRON; /* also check fraction of Ns */ if (Nfrac < 1) { enum {MY_OKAY, MY_FAIL, MY_WARN} Nstatus = MY_OKAY; for (j = 0; j < msa->nseqs; j++) { if ((double)countNs[j] / countCDSs[j] > Nfrac) Nstatus = MY_FAIL; if (Nstatus == MY_OKAY && countNs[j] > 0) Nstatus = MY_WARN; } if (Nstatus == MY_FAIL) { problem_add(problems, NULL, TOO_MANY_Ns, -1, -1); if (status == OKAY) status = TOO_MANY_Ns; } else if (Nstatus == MY_WARN) problem_add(problems, NULL, WARN_Ns, -1, -1); } /* if collecting stats, record counts for failures */ if (statsf != NULL) { if (status != OKAY) { for (j = 0; j < lst_size(problems); j++) { struct Problem *problem = lst_get_ptr(problems, j); status_type ftype = problem->status; if ((ftype == FSHIFT || ftype == NONSENSE) && status != FSHIFT && status != NONSENSE) continue; /* don't count secondary frame shifts and nonsense mutations */ if (ftype == BAD_INTRON && j % 2 == 0) continue; /* only count one of every pair of these */ nfail[ftype]++; } } /* also keep track of the total number of "conserved exons", and the number having each kind of gap */ if ((status == OKAY || (status == FSHIFT && gt >= FSHIFT_OK))) { nconserved_exons++; nce_gap_type[gt]++; /* number of conserved exons having given type of gaps */ } } } /* end if (status == OKAY) [checks for conserved features] */ } /* end if (status == OKAY && check_alignment) [all cross-species checks] */ /* now we have looked at the whole group; we just need to do some final accounting and logging */ if (status == OKAY) { nkept++; if (!no_output) { restore_stops(group, starts_adjusted, ends_adjusted); for (j = 0; j < lst_size(gfeatures); j++) lst_push_ptr(keepers, lst_get_ptr(gfeatures, j)); } if (logf != NULL && lst_size(problems) > 0) /* warnings only */ write_log(logf, group, status, problems, msa, map); if (mlogf != NULL) { /* no problem, need to add an okay status to log */ problem_add(problems, NULL, OKAY, -1, -1); write_machine_log(mlogf, group, problems, map); /* may include warnings */ } } else { if (discardf != NULL) { restore_stops(group, starts_adjusted, ends_adjusted); for (j = 0; j < lst_size(gfeatures); j++) lst_push_ptr(discards, lst_get_ptr(gfeatures, j)); } if (logf != NULL) write_log(logf, group, status, problems, msa, map); if (mlogf != NULL) write_machine_log(mlogf, group, problems, map); } } /* end loop over groups */ /* write main output and discards */ if (!no_output || discardf != NULL) { /* first map features back to coord frame of reference seq. */ for (i = 0; i < lst_size(gff->features); i++) { GFF_Feature *f = lst_get_ptr(gff->features, i); f->start = msa_map_msa_to_seq(map, f->start) + msa->idx_offset; f->end = msa_map_msa_to_seq(map, f->end) + msa->idx_offset; } if (!no_output) { gff->features = keepers; gff_print_set(stdout, gff); } if (discardf != NULL) { gff->features = discards; gff_print_set(discardf, gff); } } /* dump counts to stats file */ if (statsf != NULL) { fprintf(statsf, "#%11s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s\n", "total", "nbad_ref", "nconsid", "nkept", "nno_aln", "nbad_starts", "(out of)", "nbad_stops", "(out of)", "nbad_5spl", "(out of)", "nbad_3spl", "(out of)", "nbad_5utr", "(out of)", "nbad_3utr", "(out of)", "nbad_intron", "nnons", "nfshifts", "nNs", "ncons_exons", "nce_ngaps", "nce_nov_cln", "nce_clean", "nce_fshftok"); fprintf(statsf, "%12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d\n", nfail[BAD_REF]+ncons_tested, nfail[BAD_REF], ncons_tested, nkept, nfail[NO_ALN], nfail[BAD_START], nconsid[BAD_START], nfail[BAD_STOP], nconsid[BAD_STOP], nfail[BAD_5_SPLICE], nconsid[BAD_5_SPLICE], nfail[BAD_3_SPLICE], nconsid[BAD_3_SPLICE], nfail[BAD_5_SPLICE_UTR], nconsid[BAD_5_SPLICE_UTR], nfail[BAD_3_SPLICE_UTR], nconsid[BAD_3_SPLICE_UTR], nfail[BAD_INTRON], nfail[NONSENSE], nfail[FSHIFT], nfail[TOO_MANY_Ns], nconserved_exons, nce_gap_type[NGAPS], nce_gap_type[NOVRLP_CLN_GAPS], nce_gap_type[CLN_GAPS], nce_gap_type[FSHIFT_OK]); fprintf(statsf, "%s", STATS_DESCRIPTION); } if (logf != NULL) phast_fclose(logf); if (mlogf != NULL) phast_fclose(mlogf); if (statsf != NULL) phast_fclose(statsf); if (discardf != NULL) phast_fclose(discardf); return 0; }
int main(int argc, char *argv[]) { char c; char *msa_fname = NULL; int opt_idx, i, old_nnodes; MSA *msa; List *pruned_names = lst_new_ptr(5), *tmpl; BDPhyloHmm *bdphmm; GFF_Set *predictions; int found = FALSE; List *ignore_types = lst_new_ptr(1); struct option long_opts[] = { {"refseq", 1, 0, 'M'}, {"msa-format", 1, 0, 'i'}, {"refidx", 1, 0, 'r'}, {"rho", 1, 0, 'R'}, {"phi", 1, 0, 'p'}, {"transitions", 1, 0, 't'}, {"expected-length", 1, 0, 'E'}, {"target-coverage", 1, 0, 'C'}, {"seqname", 1, 0, 'N'}, {"idpref", 1, 0, 'P'}, {"indel-model", 1, 0, 'I'}, {"indel-history", 1, 0, 'H'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; /* arguments and defaults for options */ FILE *refseq_f = NULL, *msa_f = NULL; msa_format_type msa_format = UNKNOWN_FORMAT; TreeModel *source_mod; double rho = DEFAULT_RHO, mu = DEFAULT_MU, nu = DEFAULT_NU, phi = DEFAULT_PHI, gamma = -1, omega = -1, alpha_c = -1, beta_c = -1, tau_c = -1, alpha_n = -1, beta_n = -1, tau_n = -1; int set_transitions = FALSE, refidx = 1, estim_phi = TRUE, estim_gamma = TRUE, estim_omega = TRUE; char *seqname = NULL, *idpref = NULL; IndelHistory *ih = NULL; while ((c = getopt_long(argc, argv, "R:t:p:E:C:r:M:i:N:P:I:H:h", long_opts, &opt_idx)) != -1) { switch (c) { case 'R': rho = get_arg_dbl_bounds(optarg, 0, 1); break; case 't': if (optarg[0] != '~') estim_gamma = estim_omega = FALSE; else optarg = &optarg[1]; set_transitions = TRUE; tmpl = get_arg_list_dbl(optarg); if (lst_size(tmpl) != 2) die("ERROR: bad argument to --transitions.\n"); mu = lst_get_dbl(tmpl, 0); nu = lst_get_dbl(tmpl, 1); if (mu <= 0 || mu >= 1 || nu <= 0 || nu >= 1) die("ERROR: bad argument to --transitions.\n"); lst_free(tmpl); break; case 'p': if (optarg[0] != '~') estim_phi = FALSE; else optarg = &optarg[1]; phi = get_arg_dbl_bounds(optarg, 0, 1); break; case 'E': if (optarg[0] != '~') estim_omega = FALSE; else optarg = &optarg[1]; omega = get_arg_dbl_bounds(optarg, 1, INFTY); mu = 1/omega; break; case 'C': if (optarg[0] != '~') estim_gamma = FALSE; else optarg = &optarg[1]; gamma = get_arg_dbl_bounds(optarg, 0, 1); break; case 'r': refidx = get_arg_int_bounds(optarg, 0, INFTY); break; case 'M': refseq_f = phast_fopen(optarg, "r"); break; case 'i': msa_format = msa_str_to_format(optarg); if (msa_format == UNKNOWN_FORMAT) die("ERROR: unrecognized alignment format.\n"); break; case 'N': seqname = optarg; break; case 'P': idpref = optarg; break; case 'I': tmpl = get_arg_list_dbl(optarg); if (lst_size(tmpl) != 3 && lst_size(tmpl) != 6) die("ERROR: bad argument to --indel-model.\n"); alpha_n = lst_get_dbl(tmpl, 0); beta_n = lst_get_dbl(tmpl, 1); tau_n = lst_get_dbl(tmpl, 2); if (lst_size(tmpl) == 6) { alpha_c = lst_get_dbl(tmpl, 3); beta_c = lst_get_dbl(tmpl, 4); tau_c = lst_get_dbl(tmpl, 5); } else { alpha_c = alpha_n; beta_c = beta_n; tau_c = tau_n; } if (alpha_c <= 0 || alpha_c >= 1 || beta_c <= 0 || beta_c >= 1 || tau_c <= 0 || tau_c >= 1 || alpha_n <= 0 || alpha_n >= 1 || beta_n <= 0 || beta_n >= 1 || tau_n <= 0 || tau_n >= 1) die("ERROR: bad argument to --indel-model.\n"); break; case 'H': fprintf(stderr, "Reading indel history from %s...\n", optarg); ih = ih_new_from_file(phast_fopen(optarg, "r")); break; case 'h': printf("%s", HELP); exit(0); case '?': die("Bad argument. Try 'dless -h'.\n"); } } if (optind != argc - 1) die("Missing alignment file or model file. Try 'dless -h'.\n"); if (set_transitions && (gamma != -1 || omega != -1)) die("ERROR: --transitions and --target-coverage/--expected-length cannot be used together.\n"); if ((gamma != -1 && omega == -1) || (gamma == -1 && omega != -1)) die("ERROR: --target-coverage and --expecteed-length must be used together.\n"); set_seed(-1); if (gamma != -1) nu = gamma/(1-gamma) * mu; fprintf(stderr, "Reading tree model from %s...\n", argv[optind]); source_mod = tm_new_from_file(phast_fopen(argv[optind], "r"), 1); if (source_mod->nratecats > 1) die("ERROR: rate variation not currently supported.\n"); if (source_mod->order > 0) die("ERROR: only single nucleotide models are currently supported.\n"); if (!tm_is_reversible(source_mod)) phast_warning("WARNING: p-value computation assumes reversibility and your model is non-reversible.\n"); /* read alignment */ msa_f = phast_fopen(argv[optind], "r"); fprintf(stderr, "Reading alignment from %s...\n", argv[optind]); if (msa_format == UNKNOWN_FORMAT) msa_format = msa_format_for_content(msa_f, 1); if (msa_format == MAF) { msa = maf_read(msa_f, refseq_f, 1, NULL, NULL, NULL, -1, TRUE, NULL, NO_STRIP, FALSE); } else msa = msa_new_from_file_define_format(msa_f, msa_format, NULL); if (msa_alph_has_lowercase(msa)) msa_toupper(msa); msa_remove_N_from_alph(msa); if (msa->ss == NULL) { fprintf(stderr, "Extracting sufficient statistics...\n"); ss_from_msas(msa, 1, TRUE, NULL, NULL, NULL, -1, 0); } else if (msa->ss->tuple_idx == NULL) die("ERROR: ordered representation of alignment required unless --suff-stats.\n"); /* prune tree, if necessary */ old_nnodes = source_mod->tree->nnodes; tm_prune(source_mod, msa, pruned_names); if (lst_size(pruned_names) == (old_nnodes + 1) / 2) die("ERROR: no match for leaves of tree in alignment (leaf names must match alignment names).\n"); if (lst_size(pruned_names) > 0) { fprintf(stderr, "WARNING: pruned away leaves of tree with no match in alignment ("); for (i = 0; i < lst_size(pruned_names); i++) fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, i))->chars, i < lst_size(pruned_names) - 1 ? ", " : ").\n"); } /* this has to be done after pruning tree */ tr_name_ancestors(source_mod->tree); /* also make sure match for reference sequence in tree */ if (refidx > 0) { for (i = 0, found = FALSE; !found && i < source_mod->tree->nnodes; i++) { TreeNode *n = lst_get_ptr(source_mod->tree->nodes, i); if (!strcmp(n->name, msa->names[refidx-1])) found = TRUE; } if (!found) die("ERROR: no match for reference sequence in tree.\n"); } /* checks for indel model */ if (alpha_c > 0) { if (ih == NULL) { fprintf(stderr, "Reconstructing indel history by parsimony...\n"); ih = ih_reconstruct(msa, source_mod->tree); } else { if (ih->ncols != msa->length) die("ERROR: indel history doesn't seem to match alignment.\n"); if (ih->tree->nnodes != source_mod->tree->nnodes) die("ERROR: indel history doesn't seem to match tree model.\n"); } } bdphmm = bd_new(source_mod, rho, mu, nu, phi, alpha_c, beta_c, tau_c, alpha_n, beta_n, tau_n, estim_gamma, estim_omega, estim_phi); /* compute emissions */ phmm_compute_emissions(bdphmm->phmm, msa, FALSE); /* add emissions for indel model, if necessary */ if (alpha_c > 0) { fprintf(stderr, "Adjusting emissions for indels...\n"); bd_add_indel_emissions(bdphmm, ih); } /* postprocess for missing data (requires special handling) */ fprintf(stderr, "Adjusting emissions for missing data...\n"); bd_handle_missing_data(bdphmm, msa); if (estim_gamma || estim_omega || estim_phi) { fprintf(stderr, "Estimating free parameters...\n"); bd_estimate_transitions(bdphmm, msa); } /* set seqname and idpref, if necessary */ if (seqname == NULL || idpref == NULL) { /* derive default from file name root */ String *tmp = str_new_charstr(msa_fname); if (!str_equals_charstr(tmp, "-")) { str_remove_path(tmp); str_root(tmp, '.'); if (idpref == NULL) idpref = copy_charstr(tmp->chars); str_root(tmp, '.'); /* apply one more time for double suffix */ if (seqname == NULL) seqname = tmp->chars; } else if (seqname == NULL) seqname = "refseq"; } /* obtain predictions */ fprintf(stderr, "Running Viterbi algorithm...\n"); predictions = phmm_predict_viterbi(bdphmm->phmm, seqname, NULL, idpref, NULL); lst_push_ptr(ignore_types, str_new_charstr("nonconserved")); gff_filter_by_type(predictions, ignore_types, TRUE, NULL); /* score predictions */ fprintf(stderr, "Scoring predictions...\n"); bd_score_predictions(bdphmm, predictions); /* can free emissions now */ for (i = 0; i < bdphmm->phmm->hmm->nstates; i++) sfree(bdphmm->phmm->emissions[i]); sfree(bdphmm->phmm->emissions); bdphmm->phmm->emissions = NULL; /* convert GFF to coord frame of reference sequence and adjust coords by idx_offset, if necessary */ if (refidx != 0 || msa->idx_offset != 0) msa_map_gff_coords(msa, predictions, 0, refidx, msa->idx_offset); if (refidx != 0) gff_flatten(predictions); /* necessary because coord conversion might create overlapping features (can happen in deletions in reference sequence) */ /* now output predictions */ fprintf(stderr, "Writing GFF to stdout...\n"); gff_print_set(stdout, predictions); fprintf(stderr, "Done.\n"); return 0; }
int main(int argc, char *argv[]) { char c; List *l; int i, j, strand, bed_output = 0, backgd_nmods = -1, feat_nmods = -1, winsize = -1, verbose = 0, max_nmods, memblocksize, old_nleaves, refidx = 1, base_by_base = FALSE, windowWig = FALSE; TreeModel **backgd_mods = NULL, **feat_mods = NULL; HMM *backgd_hmm = NULL, *feat_hmm = NULL; msa_format_type inform = UNKNOWN_FORMAT; GFF_Set *features = NULL; MSA *msa, *msa_compl=NULL; double **backgd_emissions, **feat_emissions, **mem, **dummy_emissions, *winscore_pos=NULL, *winscore_neg=NULL; int *no_alignment=NULL; List *pruned_names; char *msa_fname; FILE *infile; int opt_idx; struct option long_opts[] = { {"background-mods", 1, 0, 'b'}, {"background-hmm", 1, 0, 'B'}, {"feature-mods", 1, 0, 'f'}, {"feature-hmm", 1, 0, 'F'}, {"features", 1, 0, 'g'}, {"window", 1, 0, 'w'}, {"window-wig", 1, 0, 'W'}, {"base-by-base", 0, 0, 'y'}, {"msa-format", 1, 0, 'i'}, {"refidx", 1, 0, 'r'}, {"output-bed", 0, 0, 'd'}, {"verbose", 0, 0, 'v'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; while ((c = getopt_long(argc, argv, "B:b:F:f:r:g:w:W:i:ydvh", long_opts, &opt_idx)) != -1) { switch (c) { case 'B': backgd_hmm = hmm_new_from_file(phast_fopen(optarg, "r")); break; case 'b': l = get_arg_list(optarg); backgd_nmods = lst_size(l); backgd_mods = smalloc(backgd_nmods * sizeof(void*)); for (i = 0; i < backgd_nmods; i++) backgd_mods[i] = tm_new_from_file(phast_fopen(((String*)lst_get_ptr(l, i))->chars, "r"), 1); lst_free_strings(l); lst_free(l); break; case 'F': feat_hmm = hmm_new_from_file(phast_fopen(optarg, "r")); break; case 'f': l = get_arg_list(optarg); feat_nmods = lst_size(l); feat_mods = smalloc(feat_nmods * sizeof(void*)); for (i = 0; i < feat_nmods; i++) feat_mods[i] = tm_new_from_file(phast_fopen(((String*)lst_get_ptr(l, i))->chars, "r"), 1); lst_free_strings(l); lst_free(l); break; case 'g': features = gff_read_set(phast_fopen(optarg, "r")); break; case 'w': winsize = get_arg_int(optarg); if (winsize <= 0) die("ERROR: window size must be positive.\n"); break; case 'W': winsize = get_arg_int(optarg); if (winsize <= 0) die("ERROR: window size must be positive.\n"); windowWig = TRUE; break; case 'y': base_by_base = TRUE; break; case 'i': inform = msa_str_to_format(optarg); if (inform == UNKNOWN_FORMAT) die("Bad argument to -i.\n"); break; case 'r': refidx = get_arg_int_bounds(optarg, 0, INFTY); break; case 'd': bed_output = 1; break; case 'h': printf("%s", HELP); exit(0); case 'v': verbose = 1; break; case '?': die("Bad argument. Try '%s -h'.\n", argv[0]); } } set_seed(-1); if (backgd_mods == NULL || feat_mods == NULL) die("ERROR: -b and -f required. Try '%s -h'.\n", argv[0]); if (backgd_nmods == 1 && backgd_hmm == NULL) backgd_hmm = hmm_create_trivial(); else if (backgd_hmm == NULL) die("ERROR: -B required. Try '%s -h'.\n", argv[0]); if (feat_nmods == 1 && feat_hmm == NULL) feat_hmm = hmm_create_trivial(); else if (feat_hmm == NULL) die("ERROR: -F required. Try '%s -h'.\n", argv[0]); if ((winsize == -1 && features == NULL && !base_by_base) || (winsize != -1 && features != NULL) || (winsize != -1 && base_by_base) || (features != NULL && base_by_base)) die("ERROR: must specify exactly one of -g, -w, and -y. Try '%s -h'.\n", argv[0]); if (backgd_hmm->nstates != backgd_nmods) die("ERROR: number of states must equal number of tree models for background.\n"); if (feat_hmm->nstates != feat_nmods) die("ERROR: number of states must equal number of tree models for features.\n"); if (features != NULL && lst_size(features->features) == 0) die("ERROR: empty features file.\n"); if (base_by_base && (backgd_nmods > 1 || feat_nmods > 1)) die("ERROR: only single phylogenetic models (not HMMs) are supported with --base-by-base.\n"); if (optind != argc - 1) die("ERROR: too few arguments. Try '%s -h'.\n", argv[0]); if (verbose) fprintf(stderr, "Reading alignment ...\n"); msa_fname = argv[optind]; infile = phast_fopen(msa_fname, "r"); if (inform == UNKNOWN_FORMAT) inform = msa_format_for_content(infile, 1); if (inform == MAF) msa = maf_read(infile, NULL, 1, NULL, NULL, NULL, -1, TRUE, NULL, NO_STRIP, FALSE); else msa = msa_new_from_file_define_format(infile, inform, NULL); if (msa_alph_has_lowercase(msa)) msa_toupper(msa); msa_remove_N_from_alph(msa); /* need ordered representation of alignment */ if (msa->seqs == NULL && (msa->ss == NULL || msa->ss->tuple_idx == NULL) ) die("ERROR: ordered sufficient statistics are required.\n"); pruned_names = lst_new_ptr(msa->nseqs); for (i = 0; i < backgd_nmods; i++) { old_nleaves = (backgd_mods[i]->tree->nnodes + 1) / 2; tm_prune(backgd_mods[i], msa, pruned_names); if (lst_size(pruned_names) >= old_nleaves) die("ERROR: no match for leaves of tree in alignment (background model #%d)\n", i+1); else if (lst_size(pruned_names) > 0) { fprintf(stderr, "WARNING: pruned away leaves in background model (#%d) with no match in alignment (", i+1); for (j = 0; j < lst_size(pruned_names); j++) fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, j))->chars, j < lst_size(pruned_names) - 1 ? ", " : ").\n"); } lst_free_strings(pruned_names); } for (i = 0; i < feat_nmods; i++) { old_nleaves = (feat_mods[i]->tree->nnodes + 1) / 2; tm_prune(feat_mods[i], msa, pruned_names); if (lst_size(pruned_names) >= old_nleaves) die("ERROR: no match for leaves of tree in alignment (features model #%d)\n", i+1); else if (lst_size(pruned_names) > 0) { fprintf(stderr, "WARNING: pruned away leaves in features model (#%d) with no match in alignment (", i+1); for (j = 0; j < lst_size(pruned_names); j++) fprintf(stderr, "%s%s", ((String*)lst_get_ptr(pruned_names, j))->chars, j < lst_size(pruned_names) - 1 ? ", " : ").\n"); } lst_free_strings(pruned_names); } lst_free(pruned_names); /* first have to subtract offset from features, if necessary */ if (msa->idx_offset != 0 && features != NULL) { for (i = 0; i < lst_size(features->features); i++) { GFF_Feature *f = lst_get_ptr(features->features, i); f->start -= msa->idx_offset; f->end -= msa->idx_offset; } } /* convert to coord frame of alignment */ if (features != NULL && refidx != 0) { if (verbose) fprintf(stderr, "Mapping coordinates ...\n"); msa_map_gff_coords(msa, features, refidx, 0, 0); if (lst_size(features->features) == 0) die("ERROR: no features within coordinate range of alignment.\n"); } /* Make a reverse complemented copy of the alignment. The two strands will be processed separately, to avoid problems with overlapping features, etc. */ if (!base_by_base) { /* skip in base by base case */ if (verbose) fprintf(stderr, "Creating reverse complemented alignment ...\n"); msa_compl = msa_create_copy(msa, 0); /* temporary workaround: make sure reverse complement not based on sufficient stats */ if (msa_compl->seqs == NULL) ss_to_msa(msa_compl); if (msa_compl->ss != NULL) { ss_free(msa_compl->ss); msa_compl->ss = NULL; } msa_reverse_compl(msa_compl); } /* allocate memory for computing scores */ backgd_emissions = smalloc(backgd_nmods * sizeof(void*)); for (i = 0; i < backgd_nmods; i++) backgd_emissions[i] = smalloc(msa->length * sizeof(double)); feat_emissions = smalloc(feat_nmods * sizeof(void*)); for (i = 0; i < feat_nmods; i++) feat_emissions[i] = smalloc(msa->length * sizeof(double)); max_nmods = max(backgd_nmods, feat_nmods); dummy_emissions = smalloc(max_nmods * sizeof(void*)); mem = smalloc(max_nmods * sizeof(void*)); /* memory for forward algorithm -- each block must be as large as the largest feature */ if (features != NULL) { for (i = 0, memblocksize = -1; i < lst_size(features->features); i++) { GFF_Feature *f = lst_get_ptr(features->features, i); if (f->end - f->start + 1 > memblocksize) memblocksize = f->end - f->start + 1; } } else memblocksize = winsize; /* -1 if base-by-base mode */ if (memblocksize > 0) for (i = 0; i < max_nmods; i++) mem[i] = smalloc(memblocksize * sizeof(double)); if (winsize != -1) { winscore_pos = smalloc(msa->length * sizeof(double)); winscore_neg = smalloc(msa->length * sizeof(double)); no_alignment = smalloc(msa->length * sizeof(int)); for (i = 0; i < msa->length; i++) { winscore_pos[i] = winscore_neg[i] = NEGINFTY; if (refidx == 0) no_alignment[i] = FALSE; else no_alignment[i] = msa_missing_col(msa, refidx, i); } } /* the rest will be repeated for each strand */ for (strand = 1; strand <= 2; strand++) { MSA *thismsa = strand == 1 ? msa : msa_compl; double *winscore = strand == 1 ? winscore_pos : winscore_neg; if (base_by_base && strand == 2) break; /* don't do second pass in base_by_base case */ if (verbose) fprintf(stderr, "Processing %c strand ...\n", strand == 1 ? '+' : '-'); /* set up dummy categories array, so that emissions are only computed where needed */ thismsa->categories = smalloc(thismsa->length * sizeof(int)); thismsa->ncats = 1; if (winsize != -1) { if (strand == 1) for (i = 0; i < thismsa->length; i++) thismsa->categories[i] = no_alignment[i] ? 0 : 1; else for (i = 0; i < thismsa->length; i++) thismsa->categories[i] = no_alignment[thismsa->length - i - 1] ? 0 : 1; } else if (features != NULL) { for (i = 0; i < thismsa->length; i++) thismsa->categories[i] = 0; for (i = 0; i < lst_size(features->features); i++) { GFF_Feature *f = lst_get_ptr(features->features, i); if (f->start <= 0 || f->end <= 0) { fprintf(stderr, "WARNING: feature out of range ('"); gff_print_feat(stderr, f); fprintf(stderr, "')\n"); continue; } if (strand == 1 && f->strand != '-') for (j = f->start - 1; j < f->end; j++) thismsa->categories[j] = 1; else if (strand == 2 && f->strand == '-') for (j = thismsa->length - f->end; j < thismsa->length - f->start + 1; j++) thismsa->categories[j] = 1; } } else { /* base-by-base scores */ for (i = 0; i < thismsa->length; i++) thismsa->categories[i] = 1; } if (thismsa->ss != NULL) ss_update_categories(thismsa); /* compute emissions */ for (i = 0; i < backgd_nmods; i++) { if (verbose) fprintf(stderr, "Computing emissions for background model #%d ...\n", i+1); tl_compute_log_likelihood(backgd_mods[i], thismsa, backgd_emissions[i], NULL, 1, NULL); } for (i = 0; i < feat_nmods; i++) { if (verbose) fprintf(stderr, "Computing emissions for features model #%d ...\n", i+1); tl_compute_log_likelihood(feat_mods[i], thismsa, feat_emissions[i], NULL, 1, NULL); } /* now compute scores */ if (winsize != -1) { /* windows case */ int winstart; if (verbose) fprintf(stderr, "Computing scores ...\n"); for (winstart = 0; winstart <= thismsa->length - winsize; winstart++) { int centeridx = winstart + winsize/2; if (strand == 2) centeridx = thismsa->length - centeridx - 1; if (no_alignment[centeridx]) continue; for (j = 0; j < feat_nmods; j++) dummy_emissions[j] = &(feat_emissions[j][winstart]); winscore[centeridx] = hmm_forward(feat_hmm, dummy_emissions, winsize, mem); if (winscore[centeridx] <= NEGINFTY) { winscore[centeridx] = NEGINFTY; continue; } for (j = 0; j < backgd_nmods; j++) dummy_emissions[j] = &(backgd_emissions[j][winstart]); winscore[centeridx] -= hmm_forward(backgd_hmm, dummy_emissions, winsize, mem); if (winscore[centeridx] < NEGINFTY) winscore[centeridx] = NEGINFTY; } } else if (features != NULL) { /* features case */ if (verbose) fprintf(stderr, "Computing scores ...\n"); for (i = 0; i < lst_size(features->features); i++) { GFF_Feature *f = lst_get_ptr(features->features, i); int s, e; if ((strand == 1 && f->strand == '-') || (strand == 2 && f->strand != '-') || f->start <= 0 || f->end <= 0 || f->end - f->start < 0) continue; /* effective coords */ if (f->strand == '-') { s = thismsa->length - f->end + 1; e = thismsa->length - f->start + 1; } else { s = f->start; e = f->end; } f->score_is_null = 0; for (j = 0; j < feat_nmods; j++) dummy_emissions[j] = &(feat_emissions[j][s-1]); f->score = hmm_forward(feat_hmm, dummy_emissions, e - s + 1, mem); if (f->score <= NEGINFTY) { f->score = NEGINFTY; continue; } for (j = 0; j < backgd_nmods; j++) dummy_emissions[j] = &(backgd_emissions[j][s-1]); f->score -= hmm_forward(backgd_hmm, dummy_emissions, e - s + 1, mem); if (f->score < NEGINFTY) f->score = NEGINFTY; } } } if (verbose) fprintf(stderr, "Generating output ...\n"); if (winsize != -1 && windowWig == FALSE) { /* standard windows output */ for (i = 0, j = 0; i < msa->length; i++) { if (no_alignment[i] == FALSE) printf("%d\t%.3f\t%.3f\n", j + msa->idx_offset + 1, winscore_pos[i], winscore_neg[i]); if (ss_get_char_pos(msa, i, 0, 0) != GAP_CHAR) j++; } } else if (windowWig == TRUE) { /* windows with wig output */ int last = NEGINFTY; for (i = 0, j = 0; i < msa->length; i++) { if (refidx == 0 || msa_get_char(msa, refidx-1, i) != GAP_CHAR) { if (no_alignment[i] == FALSE && winscore_pos[i] > NEGINFTY) { if (j > last + 1) printf("fixedStep chrom=%s start=%d step=1\n", refidx > 0 ? msa->names[refidx-1] : "alignment", j + msa->idx_offset + 1); printf("%.3f\n", winscore_pos[i]); last = j; } j++; } } } else if (features != NULL) { /* features output */ /* return to coord frame of reference seq (also, replace offset) */ if (refidx != 0) msa_map_gff_coords(msa, features, 0, refidx, msa->idx_offset); else if (msa->idx_offset != 0) { for (i = 0; i < lst_size(features->features); i++) { GFF_Feature *f = lst_get_ptr(features->features, i); f->start += msa->idx_offset; f->end += msa->idx_offset; } } if (bed_output) gff_print_bed(stdout, features, FALSE); else gff_print_set(stdout, features); } else { /* base-by-base scores */ /* in this case, we can just output the difference between the emissions */ printf("fixedStep chrom=%s start=%d step=1\n", refidx > 0 ? msa->names[refidx-1] : "alignment", msa->idx_offset + 1); for (i = 0, j = 0; i < msa->length; i++) { if (refidx == 0 || msa_get_char(msa, refidx-1, i) != GAP_CHAR) { printf("%.3f\n", feat_emissions[0][i] - backgd_emissions[0][i]); j++; } } } if (verbose) fprintf(stderr, "\nDone.\n"); return 0; }
int main(int argc, char *argv[]) { TreeNode *tree = NULL; TreeModel *backgd_mod = NULL; int i, j, size = DEFAULT_SIZE, meme_mode = 0, profile_mode = 0, nrestarts = 10, npseudocounts = 5, nsamples = -1, nmostprevalent = -1, tuple_size = -1, nbest = -1, sample_parms = 0, nmotifs = DEFAULT_NUMBER, nseqs = -1, do_html = 0, do_bed = 0, suppress_stdout = 0; List *msa_name_list = NULL, *pos_examples = NULL, *init_list = NULL, *tmpl; List *msas, *motifs; SeqSet *seqset = NULL; PooledMSA *pmsa = NULL; msa_format_type msa_format = UNKNOWN_FORMAT; Vector *backgd_mnmod = NULL; Hashtable *hash=NULL; String *output_prefix = str_new_charstr("phastm."); double *has_motif = NULL; double prior = PRIOR; char c; GFF_Set *bedfeats = NULL; while ((c = getopt(argc, argv, "t:i:b:sk:md:pn:I:R:P:w:c:SB:o:HDxh")) != -1) { switch (c) { case 't': tree = tr_new_from_file(phast_fopen(optarg, "r")); break; case 'i': msa_format = msa_str_to_format(optarg); if (msa_format == UNKNOWN_FORMAT) die("ERROR: bad input format.\n"); break; case 'b': backgd_mod = tm_new_from_file(phast_fopen(optarg, "r"), 1); break; case 's': break; case 'k': size = get_arg_int(optarg); break; case 'm': meme_mode = 1; break; case 'd': pos_examples = get_arg_list(optarg); break; case 'p': profile_mode = 1; break; case 'n': nrestarts = get_arg_int(optarg); break; case 'I': init_list = get_arg_list(optarg); break; case 'P': tmpl = str_list_as_int(get_arg_list(optarg)); if (lst_size(tmpl) != 2) die("ERROR: bad argument to -P.\n"); nmostprevalent = lst_get_int(tmpl, 0); tuple_size = lst_get_int(tmpl, 1); if (!(nmostprevalent > 0 && tuple_size > 0)) die("ERROR: bad argument nmostprevalent=%i tuple_size=%i\n", nmostprevalent, tuple_size); lst_free(tmpl); break; case 'R': tmpl = str_list_as_int(get_arg_list(optarg)); if (lst_size(tmpl) != 2) die("ERROR: bad argument to -R.\n"); nsamples = lst_get_int(tmpl, 0); tuple_size = lst_get_int(tmpl, 1); if (!(nsamples > 0 && tuple_size > 0)) die("ERROR nsamples=%i tuple_sizse=%i\n", nsamples, tuple_size); lst_free(tmpl); break; case 'c': npseudocounts = get_arg_int(optarg); break; case 'w': nbest = get_arg_int(optarg); break; case 'S': sample_parms = 1; break; case 'B': nmotifs = get_arg_int(optarg); break; case 'o': str_free(output_prefix); output_prefix = str_new_charstr(optarg); str_append_char(output_prefix, '.'); break; case 'H': do_html = 1; break; case 'D': do_bed = 1; break; case 'x': suppress_stdout = 1; break; case 'h': usage(argv[0]); case '?': die("Bad argument. Try '%s -h'.\n", argv[0]); } } if (optind != argc - 1) die("ERROR: List of alignment files required. Try '%s -h'.\n", argv[0]); if ((nsamples > 0 && nmostprevalent > 0) || (nsamples > 0 && init_list != NULL) || (nmostprevalent > 0 && init_list != NULL)) die("ERROR: -I, -P, and -R are mutually exclusive."); set_seed(-1); msa_name_list = get_arg_list(argv[optind]); if (backgd_mod != NULL && tree == NULL) tree = backgd_mod->tree; if (tree == NULL && !meme_mode && !profile_mode) die("ERROR: Must specify -t, -m, or -p.\n"); if ((init_list != NULL || nsamples > 0 || nmostprevalent > 0) && !sample_parms) nrestarts = 1; if (pos_examples != NULL) { hash = hsh_new(lst_size(pos_examples)); for (i = 0; i < lst_size(pos_examples); i++) hsh_put_int(hash, ((String*)lst_get_ptr(pos_examples, i))->chars, 1); has_motif = smalloc(lst_size(msa_name_list) * sizeof(double)); } /* open all MSAs */ msas = lst_new_ptr(lst_size(msa_name_list)); fprintf(stderr, "Reading alignment(s) ...\n"); for (i = 0, j = 0; i < lst_size(msa_name_list); i++) { String *name = lst_get_ptr(msa_name_list, i); FILE *mfile = phast_fopen(name->chars, "r"); msa_format_type temp_format; MSA *msa; if (msa_format == UNKNOWN_FORMAT) temp_format = msa_format_for_content(mfile, 1); else temp_format = msa_format; msa = msa_new_from_file_define_format(mfile, temp_format, NULL); phast_fclose(mfile); if (nseqs == -1) nseqs = msa->nseqs; if (!meme_mode && (msa->length - msa_num_gapped_cols(msa, STRIP_ANY_GAPS, -1, -1) < 300 || msa->nseqs != nseqs)) { fprintf(stderr, "WARNING: ignoring alignment '%s' -- too few informative sites.\n", name->chars); msa_free(msa); continue; } if (msa_alph_has_lowercase(msa)) msa_toupper(msa); msa_remove_N_from_alph(msa); /* Ns can be a problem */ lst_push_ptr(msas, msa); if (has_motif != NULL) { int k, hm = (hsh_get_int(hash, name->chars) == 1); if (meme_mode) { /* here need to record at individ seq level */ has_motif = srealloc(has_motif, (j + msa->nseqs + 1) * sizeof(double)); /* FIXME */ for (k = 0; k < msa->nseqs; k++) has_motif[j++] = hm; } else has_motif[j++] = hm; } } if (!meme_mode) { fprintf(stderr, "Extracting and pooling sufficient statistics ...\n"); pmsa = ss_pooled_from_msas(msas, 1, size, NULL, 0); msa_remove_N_from_alph(pmsa->pooled_msa); } /* obtain individual sequences, if necessary */ if (nmostprevalent > 0 || nsamples > 0 || meme_mode) { if (meme_mode) fprintf(stderr, "Converting to individual sequences ...\n"); else fprintf(stderr, "Obtaining reference sequences for pre-processing ...\n"); seqset = mtf_get_seqset(msas, meme_mode ? -1 : 1, 10 * size); /* for now, assume 1st seq is reference */ msa_remove_N_from_alph(seqset->set); } if (nmostprevalent > 0) { fprintf(stderr, "Obtaining %d most prevalent %d-tuples ...\n", nmostprevalent, tuple_size); init_list = lst_new_ptr(nmostprevalent); mtf_get_common_ntuples(seqset, init_list, tuple_size, nmostprevalent); } else if (nsamples > 0) { fprintf(stderr, "Sampling %d %d-tuples ...\n", nsamples, tuple_size); init_list = lst_new_ptr(nsamples); mtf_sample_ntuples(seqset, init_list, tuple_size, nsamples); } /* in meme_mode, backgd model can be specified as eq freqs in a .mod file */ if (meme_mode && backgd_mod != NULL && has_motif == NULL) backgd_mnmod = backgd_mod->backgd_freqs; /* estimate background model, if necessary */ else if (backgd_mod == NULL && (!meme_mode || has_motif == NULL)) { fprintf(stderr, "Fitting background model%s ...\n", has_motif == NULL ? "" : " (for use in initialization)"); /* if discriminative, be clear backgd isn't really part of the estimation procedure */ if (meme_mode) { backgd_mnmod = vec_new(strlen(seqset->set->alphabet)); mtf_estim_backgd_mn(seqset, backgd_mnmod); } else { backgd_mod = tm_new(tr_create_copy(tree), NULL, NULL, F81, pmsa->pooled_msa->alphabet, 1, 0, NULL, -1); tm_fit(backgd_mod, pmsa->pooled_msa, tm_params_init(backgd_mod, .1, 5, 0), -1, OPT_MED_PREC, NULL, 0, NULL); } } /* select subset of init strings, if necessary */ if (nbest > 0 && init_list != NULL) { fprintf(stderr, "Winnowing candidate start strings ...\n"); tmpl = lst_new_ptr(nbest); mtf_winnow_starts(meme_mode ? (void*)seqset : (void*)pmsa, init_list, nbest, tmpl, !meme_mode, size, tree, meme_mode ? (void*)backgd_mnmod : (void*)backgd_mod, has_motif); lst_free(init_list); init_list = tmpl; } /* Now find motifs */ motifs = mtf_find(meme_mode ? (void*)seqset : (void*)pmsa, !meme_mode, size, nmotifs, tree, meme_mode ? (void*)backgd_mnmod : (void*)backgd_mod, has_motif, prior, nrestarts, init_list, sample_parms, npseudocounts); fprintf(stderr, "\n\n"); if (do_bed) bedfeats = gff_new_set_init("phast_motif", "0.1b"); /* generate output */ for (i = 0; i < lst_size(motifs); i++) { Motif *m = lst_get_ptr(motifs, i); if (!suppress_stdout) { if (lst_size(motifs) > 1) printf("\n**********\nMOTIF #%d\n**********\n\n", i+1); mtf_print(stdout, m); } if (do_html) { String *fname = str_dup(output_prefix); str_append_int(fname, i+1); str_append_charstr(fname, ".html"); mtf_print_html(phast_fopen(fname->chars, "w+"), m); str_free(fname); } if (do_bed) mtf_add_features(m, bedfeats); } if (do_html) { String *fname = str_dup(output_prefix); str_append_charstr(fname, "index.html"); mtf_print_summary_html(phast_fopen(fname->chars, "w+"), motifs, output_prefix); str_free(fname); } if (do_bed) { String *fname = str_dup(output_prefix); str_append_charstr(fname, "bed"); gff_print_bed(phast_fopen(fname->chars, "w+"), bedfeats, FALSE); str_free(fname); } return 0; }