/* Trim stop codons off the CDS features of a group.  A '+'-strand CDS
   whose end equals the end of a '+'-strand stop has its end moved back
   by 3; a '-'-strand CDS whose start equals the start of a '-'-strand
   stop has its start moved forward by 3.  Every altered feature is
   pushed onto ends_adjusted or starts_adjusted respectively (both lists
   are emptied on entry) so the change can be reverted before data is
   output. */
void exclude_stops(GFF_FeatureGroup *group, List *starts_adjusted,
                   List *ends_adjusted) {
  List *feats = group->features;
  List *stop_feats = lst_new_ptr(1);
  int fi, si;

  lst_clear(stop_feats);
  lst_clear(ends_adjusted);
  lst_clear(starts_adjusted);

  /* pass 1: gather the stop features; one is typical, several are
     tolerated */
  for (fi = 0; fi < lst_size(feats); fi++) {
    GFF_Feature *cand = lst_get_ptr(feats, fi);
    if (str_equals_charstr(cand->feature, GFF_STOP_TYPE))
      lst_push_ptr(stop_feats, cand);
  }

  /* pass 2: compare each CDS against every stop collected above */
  for (fi = 0; fi < lst_size(feats); fi++) {
    GFF_Feature *cds = lst_get_ptr(feats, fi);

    if (!str_equals_charstr(cds->feature, GFF_CDS_TYPE))
      continue;

    for (si = 0; si < lst_size(stop_feats); si++) {
      GFF_Feature *stop = lst_get_ptr(stop_feats, si);

      if (cds->strand == '+' && stop->strand == '+' &&
          cds->end == stop->end) {
        cds->end -= 3;
        lst_push_ptr(ends_adjusted, cds);
      }
      else if (cds->strand == '-' && stop->strand == '-' &&
               cds->start == stop->start) {
        cds->start += 3;
        lst_push_ptr(starts_adjusted, cds);
      }
    }
  }

  lst_free(stop_feats);
}
/* Return a problem list to the empty state: each element is handed to
   problem_free, then the list itself is reset with lst_clear. */
void problems_clear(List *problems) {
  int idx = 0;

  while (idx < lst_size(problems)) {
    problem_free(lst_get_ptr(problems, idx));
    idx++;
  }
  lst_clear(problems);
}
/** Translate an array of category numbers from the spooled space to the
    unspooled space using cm's current unspooler.  The array is rewritten
    in place; nothing happens when cm has no unspooler.  Dies if an entry
    lies outside [0, nstates_spooled] or if cm_get_unspooled_state
    reports -1 for a position. */
void cm_spooled_to_unspooled(CategoryMap *cm, int *path, int pathlen) {
  int pos, cur, last = -1;
  List *history;                /* spooled states passed to
                                   cm_get_unspooled_state as context */

  if (cm->unspooler == NULL) return;

  history = lst_new_int(cm->unspooler->nstates_spooled);

  for (pos = 0; pos < pathlen; pos++) {
    /* NOTE(review): the upper bound is inclusive; whether index
       nstates_spooled is valid for spooled_to_unspooled[] cannot be
       confirmed from this function alone */
    if (path[pos] < 0 || path[pos] > cm->unspooler->nstates_spooled)
      die("ERROR cm_spooled_to_unspooled: path[%i]=%i, should be in [0, %i]\n",
          pos, path[pos], cm->unspooler->nstates_spooled);

    cur = path[pos];
    path[pos] = cm_get_unspooled_state(cm, cur, history);
    if (path[pos] == -1)
      die("ERROR: failure mapping to uspooled state at position %d.\n", pos);

    if (cur != last) {
      /* a spooled state with no children is not conditioned on any
         other state, so earlier history is irrelevant and is dropped */
      if (lst_size(cm->unspooler->spooled_to_unspooled[cur]->children) == 0)
        lst_clear(history);
      lst_push_int(history, cur);
    }
    last = cur;
  }

  lst_free(history);
}
/* Tear down the bucket array of a table.  For every non-NULL bucket the
   bucket's list is passed to lst_clear and the bucket itself is freed;
   then the bucket array is freed.  ct->map is reset to NULL afterwards
   so that a repeated call (or a later access that checks for NULL) does
   not touch freed memory.  A NULL table or a table whose map is already
   NULL is a no-op.
   NOTE(review): ct itself and any other fields (e.g. max_size) are left
   untouched, as before; confirm against the allocator of table_t. */
void tbl_clear(table_t *ct) {
  int i;

  if (ct == NULL || ct->map == NULL)
    return;

  for (i = 0; i < ct->max_size; i++) {
    if (ct->map[i] == NULL)
      continue;
    lst_clear(ct->map[i]->head);
    free(ct->map[i]);
  }

  free(ct->map);
  ct->map = NULL;               /* avoid dangling pointer / double free */
}
/* Write a CategoryMap to stream F.  Output consists of an "NCATS = n"
   header, one line per feature type of each category range (type name,
   category number or number range and, when present, a tab followed by
   the comma-separated categories the range is conditioned on), and the
   LABELLING_PRECEDENCE and FILL_PRECEDENCE lists.  Categories whose
   precedence value is -1 are omitted from the respective list. */
void cm_print(CategoryMap *cm, FILE *F) {
  int cat_idx, type_idx, cond_idx, pos;
  List *order;

  fprintf(F, "NCATS = %d\n\n", cm->ncats);

  cat_idx = 1;
  while (cat_idx <= cm->ncats) {
    CategoryRange *range = cm->ranges[cat_idx];

    for (type_idx = 0; type_idx < lst_size(range->feature_types);
         type_idx++) {
      String *type_name = (String*)lst_get_ptr(range->feature_types,
                                               type_idx);

      fprintf(F, "%-15s %d", type_name->chars, range->start_cat_no);
      if (range->end_cat_no > range->start_cat_no)
        fprintf(F, "-%d", range->end_cat_no);

      if (cm->conditioned_on[cat_idx] != NULL) {
        fprintf(F, "\t");
        for (cond_idx = 0; cond_idx < lst_size(cm->conditioned_on[cat_idx]);
             cond_idx++) {
          if (cond_idx > 0) fprintf(F, ",");
          fprintf(F, "%d", lst_get_int(cm->conditioned_on[cat_idx],
                                       cond_idx));
        }
      }
      fprintf(F, "\n");
    }

    /* jump past the end of this range so it is visited only once */
    cat_idx = range->end_cat_no + 1;
  }

  /* rebuild the precedence lists: sort category numbers 0..ncats with
     compare_prec.  prec is not declared here; presumably it is a
     file-level variable that compare_prec consults. */
  order = lst_new_int(cm->ncats + 1);

  for (pos = 0; pos <= cm->ncats; pos++)
    lst_push_int(order, pos);
  prec = cm->labelling_precedence;
  lst_qsort(order, compare_prec);
  fprintf(F, "\nLABELLING_PRECEDENCE = ");
  for (pos = 0; pos <= cm->ncats; pos++) {
    int cat = lst_get_int(order, pos);
    if (cm->labelling_precedence[cat] == -1)
      continue;
    fprintf(F, "%d%s", cat, pos < cm->ncats ? "," : "");
  }
  fprintf(F, "\n");

  lst_clear(order);
  for (pos = 0; pos <= cm->ncats; pos++)
    lst_push_int(order, pos);
  prec = cm->fill_precedence;
  lst_qsort(order, compare_prec);
  fprintf(F, "FILL_PRECEDENCE = ");
  for (pos = 0; pos <= cm->ncats; pos++) {
    int cat = lst_get_int(order, pos);
    if (cm->fill_precedence[cat] == -1)
      continue;
    fprintf(F, "%d%s", cat, pos < cm->ncats ? "," : "");
  }
  fprintf(F, "\n");

  lst_free(order);
}
int main(int argc, char *argv[]) { char c; int i, j, t, opt_idx, ntrees, nleaves = -1; TreeNode *n, *node_i, *node_j, *lca, *nametree = NULL; TreeNode **tree; List *leaves, ***distance, *tree_fnames, *tot_dist; int mod = FALSE; char **leaf_name; String *trees_arg; FILE *F; struct option long_opts[] = { {"mod", 0, 0, 'm'}, {"tree", 1, 0, 't'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; while ((c = getopt_long(argc, argv, "mt:h", long_opts, &opt_idx)) != -1) { switch (c) { case 'm': mod = TRUE; break; case 't': if (optarg[0] == '(') nametree = tr_new_from_string(optarg); else nametree = tr_new_from_file(phast_fopen(optarg, "r")); break; case 'h': usage(argv[0]); case '?': die("Bad argument. Try '%s -h'.\n", argv[0]); } } if (optind > argc - 1) die("Input filename required. Try '%s -h'.\n", argv[0]); set_seed(-1); /* build a comma-delimited list and pass to get_arg_list; allows possibility of reading from file via '*' operator */ trees_arg = str_new(1000); for (i = optind; i < argc; i++) { str_append_charstr(trees_arg, argv[i]); if (i < argc - 1) str_append_char(trees_arg, ','); } tree_fnames = get_arg_list(trees_arg->chars); ntrees = lst_size(tree_fnames); tree = smalloc(ntrees * sizeof(void*)); /* read trees */ for (t = 0; t < ntrees; t++) { String *fname = lst_get_ptr(tree_fnames, t); if (mod) { TreeModel *m = tm_new_from_file(F = phast_fopen(fname->chars, "r"), 1); tree[t] = tr_create_copy(m->tree); tm_free(m); phast_fclose(F); } else tree[t] = tr_new_from_file(phast_fopen(fname->chars, "r")); } /* initialization */ nleaves = (tree[0]->nnodes + 1)/2; leaves = lst_new_ptr(nleaves); distance = smalloc(nleaves * sizeof(void*)); leaf_name = smalloc(nleaves * sizeof(void*)); for (i = 0; i < nleaves; i++) { distance[i] = smalloc(nleaves * sizeof(void*)); for (j = i+1; j < nleaves; j++) distance[i][j] = lst_new_dbl(ntrees); } if (nametree == NULL) nametree = tree[0]; for (i = 0, j = 0; i < lst_size(nametree->nodes); i++) { n = lst_get_ptr(nametree->nodes, i); if (n->lchild == 
NULL && n->rchild == NULL) leaf_name[j++] = n->name; } tot_dist = lst_new_dbl(ntrees); /* now compute distances */ for (t = 0; t < ntrees; t++) { /* obtain list of leaves */ lst_clear(leaves); for (i = 0; i < lst_size(tree[t]->nodes); i++) { n = lst_get_ptr(tree[t]->nodes, i); if (n->lchild == NULL && n->rchild == NULL) lst_push_ptr(leaves, n); } if (lst_size(leaves) != nleaves) die("ERROR: trees have different numbers of leaves.\n"); /* look at all pairs */ for (i = 0; i < nleaves; i++) { node_i = lst_get_ptr(leaves, i); for (j = i+1; j < nleaves; j++) { double dist = 0; node_j = lst_get_ptr(leaves, j); /* because ids are assigned in pre-order, the first ancestor of node j that has an id less than i is the LCA of i and j; we seek the sum of distances from both i and j to this node */ for (n = node_j; n->id >= node_i->id; n = n->parent) dist += n->dparent; lca = n; for (n = node_i; n != lca; n = n->parent) dist += n->dparent; lst_push_dbl(distance[i][j], dist); } } lst_push_dbl(tot_dist, tr_total_len(tree[t])); } /* print distances and (optionally) stats */ if (ntrees == 1) { for (i = 0; i < nleaves; i++) { for (j = i+1; j < nleaves; j++) { printf ("%s\t%s\t%f\n", leaf_name[i], leaf_name[j], lst_get_dbl(distance[i][j], 0)); } } printf ("%s\t%s\t%f\n", "(total)", "-", lst_get_dbl(tot_dist, 0)); } else { double mean, stdev; double quantiles[] = {0, 0.025, 0.05, 0.5, 0.95, 0.975, 1}; double quantile_vals[7]; printf("%-15s %-15s %9s %9s %9s %9s %9s %9s %9s %9s %9s\n", "leaf1", "leaf2", "mean", "stdev", "median", "min", "max", "95%_min", "95%_max", "90%_min", "90%_max"); for (i = 0; i < nleaves; i++) { for (j = i+1; j < nleaves; j++) { mean = lst_dbl_mean(distance[i][j]); stdev = lst_dbl_stdev(distance[i][j]); lst_qsort_dbl(distance[i][j], ASCENDING); lst_dbl_quantiles(distance[i][j], quantiles, 7, quantile_vals); printf("%-15s %-15s %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f\n", leaf_name[i], leaf_name[j], mean, stdev, quantile_vals[3], quantile_vals[0], 
quantile_vals[6], quantile_vals[1], quantile_vals[5], quantile_vals[2], quantile_vals[4]); } } /* also do total branch len */ mean = lst_dbl_mean(tot_dist); stdev = lst_dbl_stdev(tot_dist); lst_qsort_dbl(tot_dist, ASCENDING); lst_dbl_quantiles(tot_dist, quantiles, 7, quantile_vals); printf("%-15s %-15s %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f\n", "(total)", "-", mean, stdev, quantile_vals[3], quantile_vals[0], quantile_vals[6], quantile_vals[1], quantile_vals[5], quantile_vals[2], quantile_vals[4]); } return 0; }
int main(int argc, char *argv[]) { int check_start = 0, check_stop = 0, check_splice = 0, check_nonsense = 0, offset5 = 0, offset3 = 0, opt_idx, i, j, indel_strict = 0, no_output = 0, check_alignment = 0, splice_strict = 0; int ncons_tested, nkept, nconserved_exons; int nce_gap_type[NGAP_TYPES], nconsid[NTYPES], nfail[NTYPES]; double Nfrac = 0.05; char c; MSA *msa; GFF_Set *gff; msa_format_type msa_format = UNKNOWN_FORMAT; List *keepers, *problems = lst_new_ptr(10), *ends_adjusted = lst_new_ptr(1), *starts_adjusted = lst_new_ptr(1), *discards=NULL, *intron_splice = lst_new_ptr(10); char *rseq_fname = NULL; FILE *logf = NULL, *mlogf = NULL, *statsf = NULL, *discardf = NULL; cds_gap_type fshift_mode = FSHIFT_BAD; char *groupby = "transcript_id"; msa_coord_map *map; int *countNs, *countCDSs; FILE *infile; char *msa_fname; struct option long_opts[] = { {"start", 0, 0, 's'}, {"stop", 0, 0, 't'}, {"splice", 0, 0, 'l'}, {"nonsense", 0, 0, 'n'}, {"fshift", 0, 0, 'f'}, {"conserved", 0, 0, 'c'}, {"N-limit", 1, 0, 'N'}, {"clean-gaps", 0, 0, 'e'}, {"indel-strict", 0, 0, 'I'}, {"splice-strict", 0, 0, 'C'}, {"groupby", 1, 0, 'g'}, {"msa-format", 1, 0, 'i'}, {"refseq", 1, 0, 'r'}, {"offset5", 1, 0, 'o'}, {"offset3", 1, 0, 'p'}, {"no-output", 0, 0, 'x'}, {"discards", 1, 0, 'd'}, {"log", 1, 0, 'L'}, {"machine-log", 1, 0, 'M'}, {"stats", 1, 0, 'S'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; while ((c = (char)getopt_long(argc, argv, "N:i:r:L:M:S:g:d:stlnfceICxh", long_opts, &opt_idx)) != -1) { switch(c) { case 's': check_alignment = check_start = 1; break; case 't': check_alignment = check_stop = 1; break; case 'l': check_alignment = check_splice = 1; break; case 'n': check_alignment = check_nonsense = 1; break; case 'f': check_alignment = 1; fshift_mode = FSHIFT_OK; break; case 'c': check_alignment = check_start = check_stop = check_splice = check_nonsense = 1; if (fshift_mode < FSHIFT_OK) fshift_mode = FSHIFT_OK; break; case 'N': Nfrac = get_arg_dbl_bounds(optarg, 0, 1); break; case 
'e': check_alignment = 1; if (fshift_mode < CLN_GAPS) fshift_mode = CLN_GAPS; break; case 'I': check_alignment = 1; fshift_mode = NOVRLP_CLN_GAPS; indel_strict = 1; break; case 'C': check_alignment = check_splice = splice_strict = 1; break; case 'g': groupby = optarg; break; case 'i': msa_format = msa_str_to_format(optarg); if (msa_format == UNKNOWN_FORMAT) die("Bad alignment format.\n"); break; case 'r': rseq_fname = optarg; break; case 'o': offset5 = get_arg_int(optarg); break; case 'p': offset3 = get_arg_int(optarg); break; case 'L': logf = phast_fopen(optarg, "w+"); break; case 'M': mlogf = phast_fopen(optarg, "w+"); break; case 'S': statsf = phast_fopen(optarg, "w+"); break; case 'd': discardf = phast_fopen(optarg, "w+"); break; case 'x': no_output = 1; break; case 'h': printf("%s", HELP); exit(0); case '?': die("ERROR: Bad argument. Try the --help option.\n"); } } if (optind + 1 >= argc ) { die("ERROR: Missing required arguments. Try the --help option.\n"); } set_seed(-1); gff = gff_read_set(phast_fopen(argv[optind], "r")); msa_fname = argv[optind+1]; infile = phast_fopen(msa_fname, "r"); if (msa_format == UNKNOWN_FORMAT) msa_format = msa_format_for_content(infile, 1); if (msa_format == MAF) { msa = maf_read(infile, rseq_fname == NULL ? 
NULL : phast_fopen(rseq_fname, "r"), 1, NULL, NULL, NULL, -1, TRUE, NULL, NO_STRIP, FALSE); } else { msa = msa_new_from_file_define_format(infile, msa_format, NULL); if (msa->ss == NULL) ss_from_msas(msa, 1, 1, NULL, NULL, NULL, -1, 0); } if (!msa->ss->tuple_idx) die("ERROR: need ordered tuples\n"); msa_remove_N_from_alph(msa); /* for backward compatibility (old SS files) */ if (msa->idx_offset != 0) { /* avoids offset problem */ for (i = 0; i < lst_size(gff->features); i++) { GFF_Feature *f = lst_get_ptr(gff->features, i); f->start -= msa->idx_offset; f->end -= msa->idx_offset; } } /* set up coordinate map; assume GFF is for sequence 1 */ map = msa_build_coord_map(msa, 1); /* convert all features */ for (i = 0; i < lst_size(gff->features); i++) { GFF_Feature *f = lst_get_ptr(gff->features, i); int newstart, newend; if (f->start < 0 || f->end < f->start) die("ERROR: bad feature in GFF (start=%d, end=%d).\n", f->start, f->end); newstart = msa_map_seq_to_msa(map, f->start); newend = msa_map_seq_to_msa(map, f->end); if (newstart < 0 || newend < newstart) die("ERROR: unable to map coordinates for feature (start=%d, end=%d).\n", f->start, f->end); f->start = newstart; f->end = newend; } gff_group(gff, groupby); /* do this after coord conversion, or group coords and feature coords will be out of sync */ keepers = lst_new_ptr(lst_size(gff->features)); if (discardf != NULL) discards = lst_new_ptr(lst_size(gff->features)); ncons_tested = nkept = nconserved_exons = 0; for (i = 0; i < NTYPES; i++) nconsid[i] = 0; for (i = 0; i < NTYPES; i++) nfail[i] = 0; for (i = 0; i < NGAP_TYPES; i++) nce_gap_type[i] = 0; countNs = smalloc(msa->nseqs * sizeof(int)); countCDSs = smalloc(msa->nseqs * sizeof(int)); for (i = 0; i < lst_size(gff->groups); i++) { GFF_FeatureGroup *group = lst_get_ptr(gff->groups, i); List *gfeatures = group->features; GFF_Feature *feat; status_type status = OKAY; cds_gap_type gt = FSHIFT_BAD; problems_clear(problems); /* make sure have frame info for CDSs */ for 
(j = 0; j < lst_size(gfeatures); j++) { feat = lst_get_ptr(gfeatures, j); if (str_equals_charstr(feat->feature, GFF_CDS_TYPE) && feat->frame == GFF_NULL_FRAME) die("ERROR: Missing frame info for CDS.\n"); } /* First, exclude stop codons from cds's, if necessary (simplifies the detection of nonsense mutations). */ exclude_stops(group, starts_adjusted, ends_adjusted); /* In all cases, discard any group for which the reference sequence doesn't have valid splice sites or start/stop codons, or has a premature stop codon */ if (!ref_seq_okay(gfeatures, msa, offset3, indel_strict, splice_strict, problems)) { status = BAD_REF; nfail[BAD_REF]++; } else /* Everything else counts as a potentially valid group */ ncons_tested++; if (status == OKAY && check_alignment) { /* only bother with below if interested in cross-species conservation */ /* Check first to make sure there's alignment across species in the cds; if not, there's no need to look at individual features. */ for (j = 0; j < lst_size(gfeatures); j++) { feat = lst_get_ptr(gfeatures, j); if (str_equals_charstr(feat->feature, GFF_CDS_TYPE) && is_incomplete_alignment(feat, msa)) { status = NO_ALN; nfail[NO_ALN]++; problem_add(problems, feat, NO_ALN, -1, -1); break; } } if (status == OKAY) { /* we have alignment and agreement with the ref seq; now check feature by feature */ lst_clear(intron_splice); for (j = 0; j < msa->nseqs; j++) countNs[j] = countCDSs[j] = 0; for (j = 0; j < lst_size(gfeatures); j++) { feat = lst_get_ptr(gfeatures, j); if (feat->end - 1 >= msa->length) die("ERROR: feature extends beyond alignment (%d >= %d).\n", feat->end - 1, msa->length); if (check_start && str_equals_charstr(feat->feature, GFF_START_TYPE)) { nconsid[BAD_START]++; if (!is_conserved_start(feat, msa)) { status = BAD_START; problem_add(problems, feat, BAD_START, -1, -1); } } else if (check_stop && str_equals_charstr(feat->feature, GFF_STOP_TYPE)) { nconsid[BAD_STOP]++; if (!is_conserved_stop(feat, msa)) { status = BAD_STOP; 
problem_add(problems, feat, BAD_STOP, -1, -1); } } else if (check_splice && str_equals_charstr(feat->feature, SPLICE_5)) { nconsid[BAD_5_SPLICE]++; if (!is_conserved_5splice(feat, msa, offset5, splice_strict)) { status = BAD_5_SPLICE; problem_add(problems, feat, BAD_5_SPLICE, -1, -1); } else lst_push_ptr(intron_splice, feat); } else if (check_splice && str_equals_charstr(feat->feature, SPLICE_5_UTR)) { nconsid[BAD_5_SPLICE_UTR]++; if (!is_conserved_5splice(feat, msa, offset5, splice_strict)) { status = BAD_5_SPLICE_UTR; problem_add(problems, feat, BAD_5_SPLICE_UTR, -1, -1); } else lst_push_ptr(intron_splice, feat); } else if (check_splice && str_equals_charstr(feat->feature, SPLICE_3)) { nconsid[BAD_3_SPLICE]++; if (!is_conserved_3splice(feat, msa, offset3, splice_strict)) { status = BAD_3_SPLICE; problem_add(problems, feat, BAD_3_SPLICE, -1, -1); } else lst_push_ptr(intron_splice, feat); } else if (check_splice && str_equals_charstr(feat->feature, SPLICE_3)) { nconsid[BAD_3_SPLICE_UTR]++; if (!is_conserved_3splice(feat, msa, offset3, splice_strict)) { status = BAD_3_SPLICE_UTR; problem_add(problems, feat, BAD_3_SPLICE_UTR, -1, -1); } else lst_push_ptr(intron_splice, feat); } else if (str_equals_charstr(feat->feature, GFF_CDS_TYPE)) { if (fshift_mode > FSHIFT_BAD && (gt = get_cds_gap_type(feat, msa, problems)) < fshift_mode) { if (status == OKAY || status == NONSENSE) status = FSHIFT; } if (check_nonsense && !is_nonsense_clean(feat, msa, problems)) { if (status == OKAY) status = NONSENSE; } if (Nfrac < 1) get_N_counts(countNs, countCDSs, feat, msa); } } /* end loop through features in group */ /* still have to make sure splice sites are paired correctly (GT-AG, GC-AG, AT-AC) */ if (status == OKAY && !splice_strict && lst_size(intron_splice) >= 2 && !are_introns_okay(intron_splice, msa, problems, offset5, offset3)) status = BAD_INTRON; /* also check fraction of Ns */ if (Nfrac < 1) { enum {MY_OKAY, MY_FAIL, MY_WARN} Nstatus = MY_OKAY; for (j = 0; j < msa->nseqs; 
j++) { if ((double)countNs[j] / countCDSs[j] > Nfrac) Nstatus = MY_FAIL; if (Nstatus == MY_OKAY && countNs[j] > 0) Nstatus = MY_WARN; } if (Nstatus == MY_FAIL) { problem_add(problems, NULL, TOO_MANY_Ns, -1, -1); if (status == OKAY) status = TOO_MANY_Ns; } else if (Nstatus == MY_WARN) problem_add(problems, NULL, WARN_Ns, -1, -1); } /* if collecting stats, record counts for failures */ if (statsf != NULL) { if (status != OKAY) { for (j = 0; j < lst_size(problems); j++) { struct Problem *problem = lst_get_ptr(problems, j); status_type ftype = problem->status; if ((ftype == FSHIFT || ftype == NONSENSE) && status != FSHIFT && status != NONSENSE) continue; /* don't count secondary frame shifts and nonsense mutations */ if (ftype == BAD_INTRON && j % 2 == 0) continue; /* only count one of every pair of these */ nfail[ftype]++; } } /* also keep track of the total number of "conserved exons", and the number having each kind of gap */ if ((status == OKAY || (status == FSHIFT && gt >= FSHIFT_OK))) { nconserved_exons++; nce_gap_type[gt]++; /* number of conserved exons having given type of gaps */ } } } /* end if (status == OKAY) [checks for conserved features] */ } /* end if (status == OKAY && check_alignment) [all cross-species checks] */ /* now we have looked at the whole group; we just need to do some final accounting and logging */ if (status == OKAY) { nkept++; if (!no_output) { restore_stops(group, starts_adjusted, ends_adjusted); for (j = 0; j < lst_size(gfeatures); j++) lst_push_ptr(keepers, lst_get_ptr(gfeatures, j)); } if (logf != NULL && lst_size(problems) > 0) /* warnings only */ write_log(logf, group, status, problems, msa, map); if (mlogf != NULL) { /* no problem, need to add an okay status to log */ problem_add(problems, NULL, OKAY, -1, -1); write_machine_log(mlogf, group, problems, map); /* may include warnings */ } } else { if (discardf != NULL) { restore_stops(group, starts_adjusted, ends_adjusted); for (j = 0; j < lst_size(gfeatures); j++) 
lst_push_ptr(discards, lst_get_ptr(gfeatures, j)); } if (logf != NULL) write_log(logf, group, status, problems, msa, map); if (mlogf != NULL) write_machine_log(mlogf, group, problems, map); } } /* end loop over groups */ /* write main output and discards */ if (!no_output || discardf != NULL) { /* first map features back to coord frame of reference seq. */ for (i = 0; i < lst_size(gff->features); i++) { GFF_Feature *f = lst_get_ptr(gff->features, i); f->start = msa_map_msa_to_seq(map, f->start) + msa->idx_offset; f->end = msa_map_msa_to_seq(map, f->end) + msa->idx_offset; } if (!no_output) { gff->features = keepers; gff_print_set(stdout, gff); } if (discardf != NULL) { gff->features = discards; gff_print_set(discardf, gff); } } /* dump counts to stats file */ if (statsf != NULL) { fprintf(statsf, "#%11s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s\n", "total", "nbad_ref", "nconsid", "nkept", "nno_aln", "nbad_starts", "(out of)", "nbad_stops", "(out of)", "nbad_5spl", "(out of)", "nbad_3spl", "(out of)", "nbad_5utr", "(out of)", "nbad_3utr", "(out of)", "nbad_intron", "nnons", "nfshifts", "nNs", "ncons_exons", "nce_ngaps", "nce_nov_cln", "nce_clean", "nce_fshftok"); fprintf(statsf, "%12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d\n", nfail[BAD_REF]+ncons_tested, nfail[BAD_REF], ncons_tested, nkept, nfail[NO_ALN], nfail[BAD_START], nconsid[BAD_START], nfail[BAD_STOP], nconsid[BAD_STOP], nfail[BAD_5_SPLICE], nconsid[BAD_5_SPLICE], nfail[BAD_3_SPLICE], nconsid[BAD_3_SPLICE], nfail[BAD_5_SPLICE_UTR], nconsid[BAD_5_SPLICE_UTR], nfail[BAD_3_SPLICE_UTR], nconsid[BAD_3_SPLICE_UTR], nfail[BAD_INTRON], nfail[NONSENSE], nfail[FSHIFT], nfail[TOO_MANY_Ns], nconserved_exons, nce_gap_type[NGAPS], nce_gap_type[NOVRLP_CLN_GAPS], nce_gap_type[CLN_GAPS], nce_gap_type[FSHIFT_OK]); fprintf(statsf, "%s", STATS_DESCRIPTION); } if 
(logf != NULL) phast_fclose(logf); if (mlogf != NULL) phast_fclose(mlogf); if (statsf != NULL) phast_fclose(statsf); if (discardf != NULL) phast_fclose(discardf); return 0; }
/* Reconstruct indels by parsimony and assign all base probs to -1 where
   ancestral bases are inferred not to have been present.

   msa  alignment; its sufficient-statistics tuples are scanned one by
        one and every sequence name must match a node of the tree
        (otherwise the function dies).
   mod  tree model; mod->tree_posteriors->base_probs[0][..][node][tup]
        is overwritten with -1 for each internal node inferred to be a
        gap at tuple tup.  mod->msa_seq_idx is built here if it is NULL.

   Relies on node ids having been assigned in preorder (see the comment
   at the LCA search). */
void do_indels(MSA *msa, TreeModel *mod) {
  int s, tup, i, j;
  TreeNode *n, *lca;
  char c;
  typedef enum {IGNORE, GAP, BASE, MISSING, AMBIG} label_type;
  List *postorder;
  label_type *label = smalloc(mod->tree->nnodes * sizeof(label_type));
  List *inside = lst_new_ptr(mod->tree->nnodes),
    *outside = lst_new_ptr(mod->tree->nnodes),
    *ambig_cases = lst_new_ptr(mod->tree->nnodes);
  int *seq_to_leaf = smalloc(msa->nseqs * sizeof(int));

  /* build mapping from seqs to leaf indices in tree */
  for (s = 0; s < msa->nseqs; s++) {
    TreeNode *leaf = tr_get_node(mod->tree, msa->names[s]);
    if (leaf == NULL)
      die("ERROR: no match for sequence \"%s\" in tree.\n", msa->names[s]);
    seq_to_leaf[s] = leaf->id;
  }

  if (mod->msa_seq_idx == NULL)
    tm_build_seq_idx(mod, msa);

  postorder = tr_postorder(mod->tree);

  for (tup = 0; tup < msa->ss->ntuples; tup++) {
    int min = mod->tree->nnodes, max = -1, ngaps = 0, skip_root = FALSE;

    /* find min and max ids of seqs that actually have bases (non-gaps) */
    for (s = 0; s < msa->nseqs; s++) {
      if (ss_get_char_tuple(msa, tup, s, 0) == GAP_CHAR) {
        ngaps++;
        continue;
      }
      if (seq_to_leaf[s] < min) min = seq_to_leaf[s];
      if (seq_to_leaf[s] > max) max = seq_to_leaf[s];
      /* NOTE: missing data being handled like bases here; in some
         cases, a base may be inferred at an ancestral node, when the
         only evidence for it is missing data in the leaves.  There are
         ambiguous cases; we'll err on the side of predicting bases
         rather than indels */
    }

    if (ngaps <= 1)
      continue;                 /* short cut -- impossible to infer
                                   gaps in ancestors */

    else if (ngaps >= msa->nseqs - 1) {
      /* in this case, all ancestors must be gaps */
      for (i = 0; i < mod->tree->nnodes; i++) {
        n = lst_get_ptr(mod->tree->nodes, i);
        if (n->lchild == NULL || n->rchild == NULL)
          continue;             /* ignore leaves */
        for (j = 0; j < mod->rate_matrix->size; j++)
          mod->tree_posteriors->base_probs[0][j][n->id][tup] = -1;
                                /* mark as gap */
      }
      continue;
    }

    /* min and max are ints, so they are printed with %d (the previous
       %e conversion did not match the argument type) */
    if (min < 0)
      die("prequel.c: min = %d < 0\n", min);
    if (max < min)
      die("prequel.c: max (%d) < min (%d)", max, min);

    /* the LCA of all leaves with non-gaps must be the first ancestor of
       the node with the max id that has an id smaller than the min
       id.  This is based on the assumption that node ids are assigned
       sequentially in a preorder traversal of the tree, which will be
       true as long as the tree is read from a Newick file by the code
       in trees.c */
    for (lca = lst_get_ptr(mod->tree->nodes, max); lca->id > min;
         lca = lca->parent);

    /* by parsimony, the base was inserted on the branch to the LCA,
       and all ancestral nodes outside the subtree rooted at the LCA
       did not have bases */

    if (lca == mod->tree->lchild || lca == mod->tree->rchild)
      skip_root = TRUE;         /* don't mark root as gap in this case:
                                   can't distinguish insertion from
                                   deletion so assume deletion */

    /* mark ancestral bases outside subtree beneath LCA as gaps */
    tr_partition_nodes(mod->tree, lca, inside, outside);
    for (i = 0; i < mod->tree->nnodes; i++) label[i] = BASE;
    for (i = 0; i < lst_size(outside); i++) {
      n = lst_get_ptr(outside, i);
      label[n->id] = IGNORE;
      if (n->lchild == NULL || n->rchild == NULL)
        continue;               /* skip leaves */
      if (n == mod->tree && skip_root)
        continue;               /* skip root if condition above */
      for (j = 0; j < mod->rate_matrix->size; j++)
        mod->tree_posteriors->base_probs[0][j][n->id][tup] = -1;
                                /* mark as gap */
    }

    /* check for gaps in subtree; if there's at most one, we can go on;
       otherwise have to use parsimony to infer history in subtree */
    ngaps = 0;
    for (i = 0; i < lst_size(inside); i++) {
      n = lst_get_ptr(inside, i);
      if (n->lchild == NULL &&
          ss_get_char_tuple(msa, tup, mod->msa_seq_idx[n->id], 0) == GAP_CHAR)
        ngaps++;
    }
    if (ngaps <= 1) continue;

    /* use Dollo parsimony to infer the indel history of the subtree
       beneath the LCA.  Use the fact that every base must have a chain
       of bases to the LCA, because, assuming the alignment is correct,
       no insertions are possible beneath the LCA */
    lst_clear(ambig_cases);
    for (i = 0; i < lst_size(postorder); i++) {
      n = lst_get_ptr(postorder, i);
      if (label[n->id] == IGNORE) continue; /* outside subtree */

      /* MISSING means all leaves beneath node have missing data */
      /* AMBIG means combination of gaps and missing data beneath node */

      else if (n->lchild == NULL) {  /* leaf in subtree */
        c = ss_get_char_tuple(msa, tup, mod->msa_seq_idx[n->id], 0);
        if (c == GAP_CHAR)
          label[n->id] = GAP;
        else if (msa->is_missing[(int)c])
          label[n->id] = MISSING;
        else
          label[n->id] = BASE;
      }
      else {                    /* internal node in subtree */
        if (label[n->lchild->id] == BASE || label[n->rchild->id] == BASE)
          label[n->id] = BASE;  /* by Dollo parsimony */
        else if ((label[n->lchild->id] == GAP ||
                  label[n->lchild->id] == AMBIG) &&
                 (label[n->rchild->id] == GAP ||
                  label[n->rchild->id] == AMBIG))
          label[n->id] = GAP;   /* gaps from both sides and no bases --
                                   must be gap */
        else if (label[n->lchild->id] == MISSING &&
                 label[n->rchild->id] == MISSING)
          label[n->id] = MISSING;
        else {                  /* must be GAP/MISSING or AMBIG/MISSING */
          label[n->id] = AMBIG;
          lst_push_ptr(ambig_cases, n);
        }
      }
    }

    /* now resolve any ambiguities, by giving each ambiguous node the
       same label as its parent; traversing ambig_cases in reverse
       order ensures that parents are visited before children */

    /* first make sure root of subtree has a base */
    if (label[lca->id] == MISSING || label[lca->id] == AMBIG)
      label[lca->id] = BASE;
    /* in this case there is all missing data and gaps beneath the LCA;
       hard to know what is right, but let's force a base and err on
       the side of bases rather than gaps */

    for (i = lst_size(ambig_cases) - 1; i >= 0; i--) {
      n = lst_get_ptr(ambig_cases, i);
      if (n == lca) continue;
      else label[n->id] = label[n->parent->id];
    }

    /* now mark gaps inside subtree, as needed */
    for (i = 0; i < lst_size(inside); i++) {
      n = lst_get_ptr(inside, i);
      if (n->lchild == NULL || n->rchild == NULL) continue;
      if (label[n->id] == GAP)
        for (j = 0; j < mod->rate_matrix->size; j++)
          mod->tree_posteriors->base_probs[0][j][n->id][tup] = -1;
    }
  }

  lst_free(inside);
  lst_free(outside);
  lst_free(ambig_cases);
  sfree(seq_to_leaf);
  sfree(label);
}
int main(int argc, char *argv[]) { char c; int opt_idx, i, j, k, N, nleaves; List *names, *treelist, *newlist, *tmpl, *groups = NULL; TreeNode *t, *tnew; int *used=NULL; struct option long_opts[] = { {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; while ((c = getopt_long(argc, argv, "h", long_opts, &opt_idx)) != -1) { switch (c) { case 'h': printf("%s", HELP); exit(0); case '?': die("Bad argument. Try 'treeGen -h'.\n"); } } if (optind < argc - 2 || optind > argc - 1) die("ERROR: Wrong number of arguments. Try 'treeGen -h'.\n"); set_seed(-1); names = get_arg_list(argv[optind]); if (lst_size(names) <= 1) die("ERROR: must specify at least two species names.\n"); if (optind == argc - 2) { groups = get_arg_list_int(argv[optind+1]); if (lst_size(names) != lst_size(groups)) die("ERROR: name list and group list must be equal in length.\n"); } nleaves = lst_size(names) - 1; /* excluding outgroup */ N = num_rooted_topologies(nleaves); if (groups != NULL) { int maxgroup = 0; for (i = 0; i < lst_size(groups); i++) if (lst_get_int(groups, i) > maxgroup) maxgroup = lst_get_int(groups, i); used = smalloc((maxgroup+1) * sizeof(int)); for (i = 0; i <= maxgroup; i++) used[i] = FALSE; } /* FIXME: eventually need to consider constraints here */ if (N > 1e9) fprintf(stderr, "WARNING: very large number of topologies expected (%d). 
Program may not finish.\n", N); /* start with tree consisting of first two names */ t = tr_new_trivial(((String*)lst_get_ptr(names, 0))->chars, ((String*)lst_get_ptr(names, 1))->chars); treelist = lst_new_ptr(1000); newlist = lst_new_ptr(1000); lst_push_ptr(treelist, t); if (groups != NULL) { /* use branch lengths to encode group membership -- sort of an ugly hack but should be okay here */ t->lchild->dparent = lst_get_int(groups, 0); t->rchild->dparent = lst_get_int(groups, 1); if (t->lchild->dparent == t->rchild->dparent) t->dparent = t->lchild->dparent; used[lst_get_int(groups, 0)] = TRUE; used[lst_get_int(groups, 1)] = TRUE; } for (i = 2; i < nleaves; i++) { char *nextname = ((String*)lst_get_ptr(names, i))->chars; int nextgroup = groups != NULL ? lst_get_int(groups, i) : -1; lst_clear(newlist); for (j = 0; j < lst_size(treelist); j++) { t = lst_get_ptr(treelist, j); /* create copies and add leaf to each internal branch */ for (k = 1; k < t->nnodes; k++) { TreeNode *n = lst_get_ptr(t->nodes, k); /* decide whether adding leaf to this branch is consistent with monophyletic groups */ if (groups != NULL) { int branchgroup = n->dparent; int ancgroup = n->parent->dparent; if (nextgroup > 0 && used[nextgroup]) { /* group is represented in the tree */ if (nextgroup != branchgroup) { continue; /* can only add to the designated subtree */ } } else { /* group is zero (background) or not yet represented in the tree */ if (branchgroup != 0 && nextgroup != branchgroup && branchgroup == ancgroup) { continue; /* only prohibit adding inside another designated subtree (adding to leading branch is okay) */ } } } tnew = tr_create_copy(t); tr_add_leaf_internal(tnew, k, nextname, nextgroup); lst_push_ptr(newlist, tnew); } /* now add leaf at root; this time reuse the original copy to avoid unnecessary memory reallocation */ if (nextgroup <= 0 || !used[nextgroup] || t->dparent == nextgroup) { tr_add_leaf_at_root(t, nextname, nextgroup); lst_push_ptr(newlist, t); } else tr_free(t); } 
/* swap treelist and newlist */ tmpl = treelist; treelist = newlist; newlist = tmpl; if (groups != NULL) used[nextgroup] = TRUE; } /* traverse list and add outgroup at root of each tree */ if (nleaves > 1) { for (j = 0; j < lst_size(treelist); j++) { t = lst_get_ptr(treelist, j); tr_add_leaf_at_root(t, ((String*)lst_get_ptr(names, nleaves))->chars, 0); } } /* print trees */ for (j = 0; j < lst_size(treelist); j++) { t = lst_get_ptr(treelist, j); tr_print(stdout, t, FALSE); } return 0; }
int main(int argc, char *argv[]) { /* variables for options, with defaults */ TreeNode *tree = NULL, *merge_tree = NULL, *extrapolate_tree = NULL; Hashtable *rename_hash = NULL; double scale_factor = 1; List *prune_names = NULL, *label = NULL, *labelType = NULL; int prune_all_but = FALSE, tree_only = FALSE, dissect = FALSE, name_ancestors = FALSE, with_branch = FALSE, print_branchlen=FALSE, inNewick=FALSE, no_branchlen = FALSE, print_distance_to_root = FALSE; TreeModel *mod = NULL, *merge_mod = NULL; char *reroot_name = NULL, *subtree_name =NULL, *get_subtree_name = NULL, *node_distance_name = NULL; /* other variables */ String *suffix, *optstr; char c; int i, opt_idx; TreeNode *n; struct option long_opts[] = { {"scale", 1, 0, 's'}, {"extrapolate", 1, 0, 'e'}, {"prune", 1, 0, 'p'}, {"prune-all-but", 1, 0, 'P'}, {"get-subtree", 1, 0, 'g'}, {"merge", 1, 0, 'm'}, {"rename", 1, 0, 'r'}, {"tree-only", 0, 0, 't'}, {"no-branchlen", 0, 0, 'N'}, {"dissect", 0, 0, 'd'}, {"name-ancestors", 0, 0, 'a'}, {"reroot", 1, 0, 'R'}, {"with-branch", 1, 0, 'B'}, {"subtree", 1, 0, 'S'}, {"branchlen", 0, 0, 'b'}, {"newick", 0, 0, 'n'}, {"label-subtree", 1, 0, 'L'}, {"label-branches", 1, 0, 'l'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; while ((c = getopt_long(argc, argv, "s:p:P:g:m:r:R:B:S:D:l:L:adtNbnh", long_opts, &opt_idx)) != -1) { switch (c) { case 's': scale_factor = get_arg_dbl_bounds(optarg, 0, INFTY); break; case 'e': if (!strcmp(optarg, "default")) { optarg = smalloc(1000 * sizeof(char)); #if defined(__MINGW32__) sprintf(optarg, "%s\\data\\exoniphy\\mammals\\cftr25_hybrid.nh", PHAST_HOME); #else sprintf(optarg, "%s/data/exoniphy/mammals/cftr25_hybrid.nh", PHAST_HOME); #endif } extrapolate_tree = tr_new_from_file(phast_fopen(optarg, "r")); break; case 'p': prune_names = get_arg_list(optarg); break; case 'P': prune_names = get_arg_list(optarg); prune_all_but = TRUE; break; case 'g': get_subtree_name = optarg; break; case 'm': suffix = str_new_charstr(optarg); str_suffix(suffix, '.'); 
if (str_equals_charstr(suffix, "nh")) merge_tree = tr_new_from_file(phast_fopen(optarg, "r")); else { merge_mod = tm_new_from_file(phast_fopen(optarg, "r"), 1); merge_tree = merge_mod->tree; } break; case 'r': rename_hash = make_name_hash(optarg); break; case 't': tree_only = TRUE; break; case 'N': no_branchlen = TRUE; tree_only = TRUE; break; case 'd': dissect = TRUE; break; case 'b': print_branchlen = TRUE; break; case 'D': print_distance_to_root = TRUE; node_distance_name = optarg; break; case 'R': reroot_name = optarg; break; case 'B': with_branch = TRUE; break; case 'a': name_ancestors = TRUE; break; case 'S': subtree_name = optarg; break; case 'n': inNewick=TRUE; break; case 'L': //do the same for --label--subtree and --label-branches case 'l': if (label == NULL) { label = lst_new_ptr(1); labelType = lst_new_int(1); } optstr = str_new_charstr(optarg); lst_push_ptr(label, optstr); lst_push_int(labelType, (int)c); break; case 'h': usage(argv[0]); case '?': die("Bad argument. Try '%s -h'.\n", argv[0]); } } if (optind != argc - 1) die("Input filename required. 
Try '%s -h'.\n", argv[0]); if (merge_tree != NULL && extrapolate_tree != NULL) die("ERROR: Can't use --merge and --extrapolate together"); set_seed(-1); suffix = str_new_charstr(argv[optind]); str_suffix(suffix, '.'); if (inNewick || str_equals_charstr(suffix, "nh")) { tree = tr_new_from_file(phast_fopen(argv[optind], "r")); tree_only = TRUE; /* can't output tree model in this case */ } else { mod = tm_new_from_file(phast_fopen(argv[optind], "r"), 1); tree = mod->tree; } if (prune_names != NULL) { tr_prune(&tree, prune_names, prune_all_but, NULL); if (mod != NULL) mod->tree = tree; /* root may have changed */ } if (get_subtree_name != NULL) { n = tr_get_node(tree, get_subtree_name); if (n == NULL) { tr_name_ancestors(tree); n = tr_get_node(tree, get_subtree_name); if (n == NULL) { die("ERROR: no node named '%s'.\n", subtree_name); } } tr_prune_supertree(&tree, n); if (mod != NULL) mod->tree = tree; } if (merge_tree != NULL) { tree = tr_hybrid(tree, merge_tree); if (mod != NULL) mod->tree = tree; } else if (extrapolate_tree != NULL) { tr_scale_by_subtree(extrapolate_tree, tree); tree = extrapolate_tree; if (mod != NULL) mod->tree = tree; } if (scale_factor != 1) { if (subtree_name == NULL) tr_scale(tree, scale_factor); else { n = tr_get_node(tree, subtree_name); if (n == NULL) die("ERROR: no node named '%s'.\n", subtree_name); tr_scale_subtree(tree, n, scale_factor, with_branch); } } if (name_ancestors) tr_name_ancestors(tree); if (rename_hash != NULL) { char *newname; for (i = 0; i < tree->nnodes; i++) { n = lst_get_ptr(tree->nodes, i); if (n->name != NULL && n->name[0] != '\0' && (newname = hsh_get(rename_hash, n->name)) != (char*)-1) { strcpy(n->name, newname); } } } if (reroot_name != NULL) { n = tr_get_node(tree, reroot_name); if (n == NULL) die("ERROR: no node named '%s'.\n", reroot_name); tr_reroot(tree, n, with_branch); if (mod != NULL) mod->tree = with_branch ? n->parent : n; tree = with_branch ? 
n->parent : n; } if (label != NULL) { for (i=0; i < lst_size(label); i++) { String *currstr = (String*)lst_get_ptr(label, i), *arg1, *labelVal; List *tmplst = lst_new_ptr(10); String *nodename; int j; str_split(currstr, ":", tmplst); if (lst_size(tmplst) != 2) die("ERROR: bad argument to --label-branches or --label-subtree.\n"); arg1 = lst_get_ptr(tmplst, 0); labelVal = lst_get_ptr(tmplst, 1); lst_clear(tmplst); if (lst_get_int(labelType, i) == (int)'l') { str_split(arg1, ",", tmplst); for (j=0; j < lst_size(tmplst); j++) { nodename = (String*)lst_get_ptr(tmplst, j); tr_label_node(tree, nodename->chars, labelVal->chars); } lst_free_strings(tmplst); } else if (lst_get_int(labelType, i) == (int)'L') { int include_leading_branch = FALSE; TreeNode *node; nodename = arg1; node = tr_get_node(tree, nodename->chars); if (node == NULL && nodename->chars[nodename->length-1] == '+') { nodename->chars[--nodename->length] = '\0'; node = tr_get_node(tree, nodename->chars); include_leading_branch = TRUE; } tr_label_subtree(tree, nodename->chars, include_leading_branch, labelVal->chars); } else die("ERROR got label_type %c\n", lst_get_int(labelType, (char)i)); str_free(arg1); str_free(labelVal); lst_free(tmplst); str_free(currstr); } lst_free(label); lst_free(labelType); } if (dissect) tr_print_nodes(stdout, tree); if (print_branchlen) printf("TOTAL_TREE_LEN: %f\n", tr_total_len(tree)); if (print_distance_to_root) { TreeNode *node = tr_get_node(tree, node_distance_name); if (node == NULL) die("ERROR: no node named '%s'.\n", node_distance_name); printf("length(root-%s): %f\n", node_distance_name, tr_distance_to_root(node)); } if (dissect==0 && print_branchlen==0 && print_distance_to_root==0) { if (tree_only) tr_print(stdout, tree, no_branchlen==FALSE); else tm_print(stdout, mod); } return 0; }