/* Exclude stop codons from all CDSs in a group, as necessary.  A '+'
   strand CDS whose end equals the end of a '+' strand stop feature has
   its end reduced by 3; a '-' strand CDS whose start equals the start
   of a '-' strand stop feature has its start increased by 3.  Every
   feature that is changed is pushed onto 'ends_adjusted' or
   'starts_adjusted' (both are cleared on entry), so the changes can be
   reversed before data is output. */
void exclude_stops(GFF_FeatureGroup *group, List *starts_adjusted,
                   List *ends_adjusted) {
  List *members = group->features;
  List *stop_feats = lst_new_ptr(1);
  int f, s;

  lst_clear(stop_feats);
  lst_clear(ends_adjusted);
  lst_clear(starts_adjusted);

  /* pass 1: collect the stop features.  At most one is expected, but
     more are possible */
  for (f = 0; f < lst_size(members); f++) {
    GFF_Feature *cand = lst_get_ptr(members, f);
    if (str_equals_charstr(cand->feature, GFF_STOP_TYPE))
      lst_push_ptr(stop_feats, cand);
  }

  /* pass 2: compare every CDS against every stop */
  for (f = 0; f < lst_size(members); f++) {
    GFF_Feature *cds = lst_get_ptr(members, f);

    if (!str_equals_charstr(cds->feature, GFF_CDS_TYPE))
      continue;

    for (s = 0; s < lst_size(stop_feats); s++) {
      GFF_Feature *stop = lst_get_ptr(stop_feats, s);
      int plus_hit = cds->strand == '+' && stop->strand == '+' &&
        cds->end == stop->end;

      if (plus_hit) {
        cds->end -= 3;
        lst_push_ptr(ends_adjusted, cds);
      }
      else if (cds->strand == '-' && stop->strand == '-' &&
               cds->start == stop->start) {
        cds->start += 3;
        lst_push_ptr(starts_adjusted, cds);
      }
    }
  }

  lst_free(stop_feats);
}
/* Add a leaf with the specified name to the specified internal branch.
   'branch' is an index into t->nodes identifying the node beneath the
   branch to be split; it must not be the root t itself (the function
   dies otherwise).  A new ancestral node takes that node's place under
   its former parent, with the existing node as left child and the new
   leaf as right child.  The new leaf's dparent is set to lgroup; the
   new ancestor's dparent is set to lgroup only when lgroup is positive
   and equal to the existing node's dparent.  Both new nodes are
   appended to t->nodes, with ids equal to their positions in that
   list, and t->nnodes grows by 2. */
void tr_add_leaf_internal(TreeNode *t, int branch, char *lname, int lgroup) {
  TreeNode *below = lst_get_ptr(t->nodes, branch); /* node beneath branch */
  TreeNode *above, *anc, *leaf;

  if (below == t)
    die("ERROR tr_add_leaf_internal: oldnode == t\n");

  anc = tr_new_node();
  leaf = tr_new_node();

  /* set up the new leaf */
  strcpy(leaf->name, lname);
  leaf->dparent = lgroup;
  leaf->parent = anc;

  /* splice the new ancestor in between 'below' and its former parent */
  above = below->parent;
  anc->rchild = leaf;
  anc->lchild = below;
  anc->parent = above;
  if (above->lchild == below)
    above->lchild = anc;
  else
    above->rchild = anc;
  below->parent = anc;

  if (lgroup > 0 && lgroup == below->dparent)
    anc->dparent = lgroup;

  /* fix up ids and nodes list, circumventing normal id assignment */
  lst_push_ptr(t->nodes, anc);
  anc->id = lst_size(t->nodes) - 1;
  lst_push_ptr(t->nodes, leaf);
  leaf->id = lst_size(t->nodes) - 1;
  t->nnodes += 2;
}
/* Return a list of category names corresponding to a given list of
   category names and/or numbers.  No new names are allocated: the
   returned list holds pointers to Strings obtained from the CategoryMap
   (via cm_get_feature) or, when cm is NULL, to the Strings in the
   provided list.  Numeric entries require a category map and must lie
   in [0, cm->ncats].  When a category map is present, a name for which
   cm_get_category returns 0 and which is not the name of category 0 is
   an error unless ignore_missing is nonzero. */
List *cm_get_category_str_list(CategoryMap *cm, List *names, int ignore_missing) {
  int idx, catno;
  List *result = lst_new_ptr(lst_size(names));

  for (idx = 0; idx < lst_size(names); idx++) {
    String *entry = lst_get_ptr(names, idx);

    /* entry given as a number */
    if (str_as_int(entry, &catno) == 0) {
      if (cm == NULL)
        die("ERROR: if categories are specified by number, a category map is required\n");
      if (catno < 0 || (cm != NULL && catno > cm->ncats))
        die("ERROR: category number %d is out of bounds.\n", catno);
      lst_push_ptr(result, cm_get_feature(cm, catno));
      continue;
    }

    /* entry given as a name, but no map: return the caller's String */
    if (cm == NULL) {
      lst_push_ptr(result, entry);
      continue;
    }

    /* entry given as a name: return the map's own String if possible */
    catno = cm_get_category(cm, entry);
    if (catno == 0 && !ignore_missing &&
        !str_equals(entry, cm_get_feature(cm, 0))) {
      die("ERROR: illegal category name (\"%s\")\n", entry->chars);
    }
    lst_push_ptr(result, cm_get_feature(cm, catno));
  }

  return result;
}
/* Add a leaf with the specified name to the root branch.  The identity
   of the root node t is preserved: a new internal node is inserted
   below it and adopts the root's two former children, while the new
   leaf becomes the root's right child.  The new internal node inherits
   the root's dparent; the root's dparent then becomes lgroup if that
   matches the inherited value, and 0 otherwise.  Both new nodes are
   appended to t->nodes, with ids equal to their positions in that
   list, and t->nnodes grows by 2. */
void tr_add_leaf_at_root(TreeNode *t, char *lname, int lgroup) {
  TreeNode *inner = tr_new_node();
  TreeNode *leaf = tr_new_node();

  strcpy(leaf->name, lname);
  leaf->dparent = lgroup;

  /* the new internal node takes over the root's children */
  inner->lchild = t->lchild;
  inner->rchild = t->rchild;
  t->lchild->parent = inner;
  t->rchild->parent = inner;

  /* the root now sits above the new internal node and the new leaf */
  t->lchild = inner;
  t->rchild = leaf;
  inner->parent = t;
  leaf->parent = t;

  inner->dparent = t->dparent;
  t->dparent = (lgroup == inner->dparent) ? lgroup : 0;

  /* fix up ids and nodes list, circumventing normal id assignment */
  lst_push_ptr(t->nodes, inner);
  inner->id = lst_size(t->nodes) - 1;
  lst_push_ptr(t->nodes, leaf);
  leaf->id = lst_size(t->nodes) - 1;
  t->nnodes += 2;
}
/* R entry point: combine a list of GFF_Set external pointers and
   flatten the result with gff_flatten_mergeAll.
     gffListP    list of external pointers to GFF_Set objects
     orP         logical; if true (or if fewer than two sets are given),
                 all features of all sets are copied into one new set
                 before flattening; otherwise the sets are combined
                 successively with gff_overlap_gff, flattening after
                 each step
     returnGffP  logical; if true, return an external pointer to the
                 resulting GFF_Set
   Otherwise the value returned by the last call to
   gff_flatten_mergeAll is returned as a length-1 vector, REALSXP if it
   exceeds INT_MAX and INTSXP if not. */
SEXP rph_gff_featureBits(SEXP gffListP, SEXP orP, SEXP returnGffP) {
  int ngff = length(gffListP);
  int g, f, take_union, want_gff;
  long nbits = 0;
  List *sets = lst_new_ptr(ngff);
  GFF_Set *merged = NULL;
  SEXP rv;

  for (g = 0; g < ngff; g++) {
    GFF_Set *cur = (GFF_Set*)EXTPTR_PTR(VECTOR_ELT(gffListP, g));
    lst_push_ptr(sets, cur);
    gff_register_protect(cur);
  }

  take_union = LOGICAL_VALUE(orP);
  want_gff = LOGICAL_VALUE(returnGffP);

  if (take_union || ngff < 2) {
    /* pool copies of every feature, then flatten once */
    merged = gff_new_set();
    for (g = 0; g < ngff; g++) {
      GFF_Set *cur = (GFF_Set*)lst_get_ptr(sets, g);
      for (f = 0; f < lst_size(cur->features); f++) {
        GFF_Feature *orig_feat;
        checkInterruptN(f, 1000);
        orig_feat = lst_get_ptr(cur->features, f);
        lst_push_ptr(merged->features, gff_new_feature_copy(orig_feat));
      }
    }
    nbits = gff_flatten_mergeAll(merged);
  }
  else {
    /* combine the first two sets, then fold in the rest one by one */
    merged = gff_overlap_gff(lst_get_ptr(sets, 0), lst_get_ptr(sets, 1),
                             1, -1.0, FALSE, TRUE, NULL);
    nbits = gff_flatten_mergeAll(merged);
    for (g = 2; g < ngff; g++) {
      GFF_Set *next;
      checkInterrupt();
      next = gff_overlap_gff(merged, lst_get_ptr(sets, g),
                             1, -1.0, FALSE, TRUE, NULL);
      nbits = gff_flatten_mergeAll(next);
      gff_free_set(merged);
      merged = next;
    }
  }

  if (want_gff)
    return rph_gff_new_extptr(merged);

  if (nbits > INT_MAX) {
    PROTECT(rv = allocVector(REALSXP, 1));
    REAL(rv)[0] = nbits;
  }
  else {
    PROTECT(rv = allocVector(INTSXP, 1));
    INTEGER(rv)[0] = nbits;
  }
  UNPROTECT(1);
  return rv;
}
/* Create a category map with a category for each feature type in a
   GFF_Set.  Category numbers are assigned in order of first appearance
   of the types, starting at 1; category 0 is given the name
   BACKGD_CAT_NAME.  The type names stored in the map are fresh copies
   of the feature-type Strings. */
CategoryMap* cm_new_from_features(GFF_Set *feats) {
  int idx, cat;
  Hashtable *seen = hsh_new(10);
  List *types = lst_new_ptr(10);
  CategoryMap *cm;

  /* first scan the features, collecting each distinct type once */
  for (idx = 0; idx < lst_size(feats->features); idx++) {
    GFF_Feature *f = lst_get_ptr(feats->features, idx);
    checkInterruptN(idx, 10000);
    if (hsh_get(seen, f->feature->chars) != (void*)-1)
      continue;                 /* type already recorded */
    lst_push_ptr(types, f->feature);
    hsh_put_int(seen, f->feature->chars, 1);
  }
  hsh_free(seen);

  /* now create a simple category map: one single-category range per
     type, preceded by the background category */
  cm = cm_new(lst_size(types));
  cm->ranges[0] = cm_new_category_range(str_new_charstr(BACKGD_CAT_NAME), 0, 0);
  for (cat = 1; cat <= cm->ncats; cat++) {
    String *type = str_dup(lst_get_ptr(types, cat-1));
    cm->ranges[cat] = cm_new_category_range(type, cat, cat);
  }

  lst_free(types);
  return cm;
}
/* Build an Unspooler for nstates_spooled states.  conditioned_on must
   be an array of integer lists; specifically, the ith element must be
   the list of state numbers on which the ith state is conditioned (a
   NULL or empty list means the state is not conditioned on anything).
   For each spooled state i a tree of UnspoolNodes is grown from a root
   node for i: a node whose state has no conditioning states is
   assigned the next unspooled state number, which is mapped back to i
   in unspooled_to_spooled; any other node receives one child per
   conditioning state.  Dies if a state number is encountered twice as
   a conditioning state while expanding the tree for a single i. */
Unspooler *cm_create_unspooler(int nstates_spooled, List **conditioned_on) {
  Unspooler *unsp = (Unspooler*)smalloc(sizeof(Unspooler));
  int capacity = nstates_spooled * nstates_spooled;
  int *visited;
  Stack *pending;
  int root, k;

  unsp->nstates_spooled = nstates_spooled;
  unsp->nstates_unspooled = 0;
  unsp->spooled_to_unspooled =
    (UnspoolNode**)smalloc(nstates_spooled * sizeof(UnspoolNode*));
  unsp->unspooled_to_spooled = (int*)smalloc(capacity * sizeof(int));

  visited = (int*)smalloc(nstates_spooled * sizeof(int));
  pending = stk_new_ptr(nstates_spooled);

  for (root = 0; root < nstates_spooled; root++) {
    UnspoolNode *node;

    /* erase marks (used to detect cycles) */
    for (k = 0; k < nstates_spooled; k++)
      visited[k] = 0;

    unsp->spooled_to_unspooled[root] = cm_new_unspool_node(root);
    stk_push_ptr(pending, unsp->spooled_to_unspooled[root]);

    while ((node = (UnspoolNode*)stk_pop_ptr(pending)) != NULL) {
      List *deps = conditioned_on[node->oldstate];

      if (deps == NULL || lst_size(deps) == 0) {
        /* terminal node: allocate a new unspooled state and record the
           mapping back to spooled space, growing the table if needed */
        node->newstate = unsp->nstates_unspooled++;
        if (node->newstate >= capacity) {
          capacity *= 2;
          unsp->unspooled_to_spooled =
            (int*)srealloc(unsp->unspooled_to_spooled, capacity * sizeof(int));
        }
        unsp->unspooled_to_spooled[node->newstate] = root;
        continue;
      }

      /* otherwise expand: one child per conditioning state */
      for (k = 0; k < lst_size(deps); k++) {
        int dep_state = lst_get_int(deps, k);
        UnspoolNode *child;

        if (visited[dep_state] == 1)
          die("ERROR: cycle in 'conditioned_on' dependencies.\n");
        visited[dep_state] = 1;

        child = cm_new_unspool_node(dep_state);
        lst_push_ptr(node->children, child);
        stk_push_ptr(pending, child);
      }
    }
  }

  stk_free(pending);
  sfree(visited);
  return unsp;
}
/* Allocate a new CategoryRange spanning category numbers start_cat_no
   through end_cat_no, whose list of feature types initially holds the
   single String 'type'.  The pointer itself is stored; the String is
   not copied. */
CategoryRange* cm_new_category_range(String *type, int start_cat_no,
                                     int end_cat_no) {
  CategoryRange *range = (CategoryRange*)smalloc(sizeof(CategoryRange));
  List *types = lst_new_ptr(1);

  lst_push_ptr(types, type);
  range->feature_types = types;
  range->start_cat_no = start_cat_no;
  range->end_cat_no = end_cat_no;
  return range;
}
/* Create a copy of a CategoryRange.  The copy has the same start and
   end category numbers as 'src' and its own duplicates (str_dup) of
   every String in src->feature_types, in the same order.  The first
   feature type of 'src' is accessed unconditionally. */
CategoryRange* cm_category_range_create_copy(CategoryRange *src) {
  List *src_types = src->feature_types;
  String *first = str_dup(lst_get_ptr(src_types, 0));
  CategoryRange *copy =
    cm_new_category_range(first, src->start_cat_no, src->end_cat_no);
  int k;

  /* the first type went in via the constructor; append the rest */
  for (k = 1; k < lst_size(src_types); k++) {
    String *dup = str_dup(lst_get_ptr(src_types, k));
    lst_push_ptr(copy->feature_types, dup);
  }
  return copy;
}
/* create a trivial, two-leaf tree */ TreeNode *tr_new_trivial(char *name1, char *name2) { TreeNode *root; root = tr_new_node(); root->lchild = tr_new_node(); strcpy(root->lchild->name, name1); root->lchild->parent = root; root->rchild = tr_new_node(); strcpy(root->rchild->name, name2); root->rchild->parent = root; /* bypass default handling of ids and nodes list */ root->nnodes = 3; root->id = 0; root->lchild->id = 1; root->rchild->id = 2; root->nodes = lst_new_ptr(root->nnodes); lst_push_ptr(root->nodes, root); lst_push_ptr(root->nodes, root->lchild); lst_push_ptr(root->nodes, root->rchild); return root; }
/* Return the list of category names corresponding to a list of
   category numbers.  Names are obtained with cm_get_feature.  Category
   numbers whose ranges share a start_cat_no contribute only one name
   (the one for the first such number encountered).  Every entry of
   catnos is used directly as an index into cm->ranges. */
List *cm_get_features(CategoryMap *cm, List *catnos) {
  int seen[cm->ncats+1];        /* indexed by a range's start_cat_no */
  List *names = lst_new_ptr(lst_size(catnos));
  int k;

  for (k = 0; k <= cm->ncats; k++)
    seen[k] = 0;

  for (k = 0; k < lst_size(catnos); k++) {
    int catno = lst_get_int(catnos, k);

    if (seen[cm->ranges[catno]->start_cat_no])
      continue;                 /* this range is already represented */

    lst_push_ptr(names, cm_get_feature(cm, catno));
    seen[cm->ranges[catno]->start_cat_no] = 1;
  }

  return names;
}
/* R entry point: build a new GFF_Set holding copies of all features of
   every GFF_Set in gffListP (a list of external pointers), in list
   order, and return an external pointer to it.  Each input set is
   passed to gff_register_protect before its features are copied. */
SEXP rph_gff_append(SEXP gffListP) {
  GFF_Set *combined = gff_new_set();
  int s, f;

  for (s = 0; s < length(gffListP); s++) {
    GFF_Set *cur = (GFF_Set*)EXTPTR_PTR(VECTOR_ELT(gffListP, s));
    gff_register_protect(cur);

    for (f = 0; f < lst_size(cur->features); f++) {
      GFF_Feature *copy;
      checkInterruptN(f, 1000);
      copy = gff_new_feature_copy(lst_get_ptr(cur->features, f));
      lst_push_ptr(combined->features, copy);
    }
  }

  return rph_gff_new_extptr(combined);
}
/* Helper for get_outfile: close the first file handle in outfileList
   that is still open and mark its slot NULL so that it can be reopened
   later.  Dies if no handle in the list is open. */
static void close_one_outfile(List *outfileList) {
  int i;
  for (i = 0; i < lst_size(outfileList); i++) {
    FILE *open_file = (FILE*)lst_get_ptr(outfileList, i);
    if (open_file != NULL) {
      phast_fclose(open_file);
      lst_set_ptr(outfileList, i, NULL);
      return;
    }
  }
  die("ERROR: too many files open in maf_parse\n");
}

/* Open a file with name out_root.name.maf, or return it if it is
   already open.

   This is a bit messy because in some cases (splitting by feature)
   there may be more output files than the OS can handle.  But it would
   be computationally expensive to check which files are finished,
   assuming that the MAF is sorted.  So, if an attempt to open a file
   fails, the list of file handles is searched for an open one, which is
   closed, and the open is attempted again; this is repeated until it
   succeeds.  If a file handle needs to be re-opened later, it is
   opened in append mode, using the same close-and-retry strategy.  If
   no open handle can be found to close, the program reports an error
   and dies.  Finally, close_outfiles checks and makes sure that all
   files are closed with mafBlock_close_file in the end, so that they
   get the #eof closer.

   outfileHash maps each file name to its index in outfileList; a slot
   in outfileList is NULL while its file is temporarily closed.  argc
   and argv are passed through to mafBlock_open_outfile. */
FILE *get_outfile(List *outfileList, Hashtable *outfileHash, String *name,
                  char *out_root, int argc, char *argv[]) {
  int idx;
  FILE *outfile;
  /* room for out_root + '.' + name + ".maf" + terminating NUL */
  char *fname = smalloc((strlen(out_root)+name->length+7)*sizeof(char));

  sprintf(fname, "%s.%s.maf", out_root, name->chars);
  idx = ptr_to_int(hsh_get(outfileHash, fname));

  if (idx == -1) {
    /* never opened before: register its (future) list index and create it */
    hsh_put(outfileHash, fname, int_to_ptr(lst_size(outfileList)));
    while ((outfile = mafBlock_open_outfile(fname, argc, argv)) == NULL)
      close_one_outfile(outfileList);   /* too many files open; close one */
    lst_push_ptr(outfileList, (void*)outfile);
  }
  else {
    outfile = (FILE*)lst_get_ptr(outfileList, idx);
    if (outfile == NULL) {
      /* has already been opened but then closed; reopen for appending */
      while ((outfile = phast_fopen_no_exit(fname, "a")) == NULL)
        close_one_outfile(outfileList);
      lst_set_ptr(outfileList, idx, (void*)outfile);
    }
  }

  sfree(fname);
  return outfile;
}
/* R entry point: prune a tree and return it as a string.
     treeStr  tree, in the form accepted by rph_tree_new
     seqsP    character vector of names, passed to tr_prune as a list
              of Strings
     allButP  value passed (via INTEGER_VALUE) as tr_prune's third
              argument
   Returns a length-1 character vector holding the result of
   tr_to_string on the pruned tree. */
SEXP rph_tree_prune(SEXP treeStr, SEXP seqsP, SEXP allButP) {
  TreeNode *tree = rph_tree_new(treeStr);
  List *seq_names = lst_new_ptr(LENGTH(seqsP));
  char *tree_str;
  SEXP rv;
  int k;

  for (k = 0; k < LENGTH(seqsP); k++) {
    String *nm = str_new_charstr(CHAR(STRING_ELT(seqsP, k)));
    lst_push_ptr(seq_names, nm);
  }

  /* tr_prune takes the address of the tree pointer, so 'tree' may be
     replaced by the call */
  tr_prune(&tree, seq_names, INTEGER_VALUE(allButP), NULL);
  tree_str = tr_to_string(tree, 1);

  PROTECT(rv = NEW_CHARACTER(1));
  SET_STRING_ELT(rv, 0, mkChar(tree_str));
  UNPROTECT(1);
  return rv;
}
/* Create a copy of a MafBlock.  The aLine, specMap and data members
   are duplicated (each sub-block via mafSubBlock_copy) when present in
   'src' and set to NULL when absent; seqlen is copied by value.  Only
   these four members are assigned here. */
MafBlock* mafBlock_copy(MafBlock *src) {
  MafBlock *copy = smalloc(sizeof(MafBlock));
  int k;

  copy->aLine = (src->aLine == NULL) ? NULL :
    str_new_charstr(src->aLine->chars);

  copy->specMap = (src->specMap == NULL) ? NULL : hsh_copy(src->specMap);

  copy->seqlen = src->seqlen;

  if (src->data == NULL) {
    copy->data = NULL;
    return copy;
  }

  copy->data = lst_new_ptr(lst_size(src->data));
  for (k = 0; k < lst_size(src->data); k++) {
    MafSubBlock *sub_copy =
      mafSubBlock_copy((MafSubBlock*)lst_get_ptr(src->data, k));
    lst_push_ptr(copy->data, (void*)sub_copy);
  }
  return copy;
}
/* Build a Markov model from inputMS: a list of norder+1 matrices, the
   ith of which (i = 0..norder) is the result of mm_build_helper for
   order i.  norder must be zero or greater.  pseudoCount and
   considerReverse are passed through to mm_build_helper unchanged. */
List *mm_build(MS *inputMS, int norder, int pseudoCount, int considerReverse) {
  List *matrices;
  int order;

  /* a negative order makes no sense for a Markov model */
  if (norder < 0)
    die("ERROR: Order of Markov Models must be zero or greater");

  matrices = lst_new_ptr(norder+1);

  /* one matrix per order, lowest order first */
  for (order = 0; order <= norder; order++) {
    Matrix *m = mm_build_helper(inputMS, order, pseudoCount, considerReverse);
    lst_push_ptr(matrices, m);
  }

  return matrices;
}
/* Given a cds feature, determine whether it has no gaps (NGAPS),
   "clean" gaps (all multiples of 3 in length; CLEAN_GAPS),
   non-overlapping clean gaps (NOVRLP_CLN_GAPS), "okay" gaps (only
   temporary frame shifts, corrected by compensatory indels;
   FSHIFT_OK), or real frame-shift gaps (FSHIFT_BAD).

   The classification comes from scan_for_gaps; a result of FSHIFT_BAD
   is downgraded to FSHIFT_OK if is_fshift_okay accepts the feature.
   If scan_for_gaps reported a Problem, it is appended to 'problems'
   (with status WARN_FSHIFT and cds_gap FSHIFT_OK in the downgraded
   case). */
cds_gap_type get_cds_gap_type(GFF_Feature *feat, MSA *msa, List *problems) {
  Problem *problem = NULL;
  cds_gap_type retval = scan_for_gaps(feat, msa, &problem);

  /* most of the time the call to is_fshift_okay won't be necessary */
  if (retval == FSHIFT_BAD && is_fshift_okay(feat, msa)) {
    retval = FSHIFT_OK;
    /* guard: do not dereference 'problem' unless scan_for_gaps
       actually supplied one */
    if (problem != NULL) {
      problem->status = WARN_FSHIFT;
      problem->cds_gap = FSHIFT_OK;
    }
  }

  if (problem != NULL) {
    lst_push_ptr(problems, problem);
    /* FIXME: It's possible that the single problem identified in
       scan_for_gaps is actually okay, but there's a frameshift without
       compensation downstream.  In this case, the status will be
       correct but the problem will point to the wrong place */
  }

  return retval;
}
/* Reorder the sub-blocks of a MafBlock to follow specNameOrder, a list
   of Strings that are looked up in block->specMap.  Names not found in
   the map are ignored; a name that resolves to an already-selected
   sub-block is an error.  Sub-blocks that are not selected are freed.
   block->data and block->specMap are replaced by a new list and a new
   hashtable, the latter mapping both the 'src' and the 'specName'
   strings of each retained sub-block to its new index. */
void mafBlock_reorder(MafBlock *block, List *specNameOrder) {
  int nold = lst_size(block->data);
  int nnew = lst_size(specNameOrder);
  int *kept = smalloc(nold*sizeof(int));
  List *reordered;
  Hashtable *remap;
  int k;

  for (k = 0; k < nold; k++)
    kept[k] = 0;

  reordered = lst_new_ptr(nold);
  remap = hsh_new(100);

  /* pick out the requested sub-blocks, in the requested order */
  for (k = 0; k < nnew; k++) {
    String *spec = (String*)lst_get_ptr(specNameOrder, k);
    int pos = hsh_get_int(block->specMap, spec->chars);
    MafSubBlock *chosen;

    if (pos == -1)
      continue;                 /* name not present in this block */

    if (kept[pos] == 1)
      die("ERROR: species %s appears twice in reorder list\n", spec->chars);

    chosen = (MafSubBlock*)lst_get_ptr(block->data, pos);
    hsh_put_int(remap, chosen->src->chars, lst_size(reordered));
    hsh_put_int(remap, chosen->specName->chars, lst_size(reordered));
    lst_push_ptr(reordered, (void*)chosen);
    kept[pos] = 1;
  }

  /* discard everything that was not requested */
  for (k = 0; k < nold; k++) {
    if (kept[k] == 0)
      mafSubBlock_free((MafSubBlock*)lst_get_ptr(block->data, k));
  }

  hsh_free(block->specMap);
  lst_free(block->data);
  block->specMap = remap;
  block->data = reordered;
  sfree(kept);
}
int main(int argc, char *argv[]) { char c; int i, j, t, opt_idx, ntrees, nleaves = -1; TreeNode *n, *node_i, *node_j, *lca, *nametree = NULL; TreeNode **tree; List *leaves, ***distance, *tree_fnames, *tot_dist; int mod = FALSE; char **leaf_name; String *trees_arg; FILE *F; struct option long_opts[] = { {"mod", 0, 0, 'm'}, {"tree", 1, 0, 't'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; while ((c = getopt_long(argc, argv, "mt:h", long_opts, &opt_idx)) != -1) { switch (c) { case 'm': mod = TRUE; break; case 't': if (optarg[0] == '(') nametree = tr_new_from_string(optarg); else nametree = tr_new_from_file(phast_fopen(optarg, "r")); break; case 'h': usage(argv[0]); case '?': die("Bad argument. Try '%s -h'.\n", argv[0]); } } if (optind > argc - 1) die("Input filename required. Try '%s -h'.\n", argv[0]); set_seed(-1); /* build a comma-delimited list and pass to get_arg_list; allows possibility of reading from file via '*' operator */ trees_arg = str_new(1000); for (i = optind; i < argc; i++) { str_append_charstr(trees_arg, argv[i]); if (i < argc - 1) str_append_char(trees_arg, ','); } tree_fnames = get_arg_list(trees_arg->chars); ntrees = lst_size(tree_fnames); tree = smalloc(ntrees * sizeof(void*)); /* read trees */ for (t = 0; t < ntrees; t++) { String *fname = lst_get_ptr(tree_fnames, t); if (mod) { TreeModel *m = tm_new_from_file(F = phast_fopen(fname->chars, "r"), 1); tree[t] = tr_create_copy(m->tree); tm_free(m); phast_fclose(F); } else tree[t] = tr_new_from_file(phast_fopen(fname->chars, "r")); } /* initialization */ nleaves = (tree[0]->nnodes + 1)/2; leaves = lst_new_ptr(nleaves); distance = smalloc(nleaves * sizeof(void*)); leaf_name = smalloc(nleaves * sizeof(void*)); for (i = 0; i < nleaves; i++) { distance[i] = smalloc(nleaves * sizeof(void*)); for (j = i+1; j < nleaves; j++) distance[i][j] = lst_new_dbl(ntrees); } if (nametree == NULL) nametree = tree[0]; for (i = 0, j = 0; i < lst_size(nametree->nodes); i++) { n = lst_get_ptr(nametree->nodes, i); if (n->lchild == 
NULL && n->rchild == NULL) leaf_name[j++] = n->name; } tot_dist = lst_new_dbl(ntrees); /* now compute distances */ for (t = 0; t < ntrees; t++) { /* obtain list of leaves */ lst_clear(leaves); for (i = 0; i < lst_size(tree[t]->nodes); i++) { n = lst_get_ptr(tree[t]->nodes, i); if (n->lchild == NULL && n->rchild == NULL) lst_push_ptr(leaves, n); } if (lst_size(leaves) != nleaves) die("ERROR: trees have different numbers of leaves.\n"); /* look at all pairs */ for (i = 0; i < nleaves; i++) { node_i = lst_get_ptr(leaves, i); for (j = i+1; j < nleaves; j++) { double dist = 0; node_j = lst_get_ptr(leaves, j); /* because ids are assigned in pre-order, the first ancestor of node j that has an id less than i is the LCA of i and j; we seek the sum of distances from both i and j to this node */ for (n = node_j; n->id >= node_i->id; n = n->parent) dist += n->dparent; lca = n; for (n = node_i; n != lca; n = n->parent) dist += n->dparent; lst_push_dbl(distance[i][j], dist); } } lst_push_dbl(tot_dist, tr_total_len(tree[t])); } /* print distances and (optionally) stats */ if (ntrees == 1) { for (i = 0; i < nleaves; i++) { for (j = i+1; j < nleaves; j++) { printf ("%s\t%s\t%f\n", leaf_name[i], leaf_name[j], lst_get_dbl(distance[i][j], 0)); } } printf ("%s\t%s\t%f\n", "(total)", "-", lst_get_dbl(tot_dist, 0)); } else { double mean, stdev; double quantiles[] = {0, 0.025, 0.05, 0.5, 0.95, 0.975, 1}; double quantile_vals[7]; printf("%-15s %-15s %9s %9s %9s %9s %9s %9s %9s %9s %9s\n", "leaf1", "leaf2", "mean", "stdev", "median", "min", "max", "95%_min", "95%_max", "90%_min", "90%_max"); for (i = 0; i < nleaves; i++) { for (j = i+1; j < nleaves; j++) { mean = lst_dbl_mean(distance[i][j]); stdev = lst_dbl_stdev(distance[i][j]); lst_qsort_dbl(distance[i][j], ASCENDING); lst_dbl_quantiles(distance[i][j], quantiles, 7, quantile_vals); printf("%-15s %-15s %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f\n", leaf_name[i], leaf_name[j], mean, stdev, quantile_vals[3], quantile_vals[0], 
quantile_vals[6], quantile_vals[1], quantile_vals[5], quantile_vals[2], quantile_vals[4]); } } /* also do total branch len */ mean = lst_dbl_mean(tot_dist); stdev = lst_dbl_stdev(tot_dist); lst_qsort_dbl(tot_dist, ASCENDING); lst_dbl_quantiles(tot_dist, quantiles, 7, quantile_vals); printf("%-15s %-15s %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f\n", "(total)", "-", mean, stdev, quantile_vals[3], quantile_vals[0], quantile_vals[6], quantile_vals[1], quantile_vals[5], quantile_vals[2], quantile_vals[4]); } return 0; }
LocalPwAlignment *la_read_lav(FILE *F, int read_seqs) { String *line = str_new(STR_MED_LEN); int line_no=0; LocalPwAlignment *lpwa = la_new(); List *fields = lst_new_ptr(6); Regex *stanza_start_re = str_re_new("^([dshaxm])[[:space:]]*{"); AlignmentBlock *aln_block = NULL; char stanza_type = '\0'; int i; int done_with[256]; done_with[(int)'d'] = done_with[(int)'s'] = done_with[(int)'h'] = done_with[(int)'x'] = done_with[(int)'m'] = 0; while (str_readline(line, F) != EOF) { str_trim(line); if (line->length == 0) continue; checkInterruptN(line_no, 1000); line_no++; if (line_no == 1) { if (!str_equals_charstr(line, "#:lav")) { die("ERROR: lav file missing header.\n"); } } else if (str_re_match(line, stanza_start_re, fields, 1) >= 0) { String *tmpstr = lst_get_ptr(fields, 1); stanza_type = tmpstr->chars[0]; str_free(tmpstr); str_free(lst_get_ptr(fields, 0)); if (stanza_type != 'a' && done_with[(int)stanza_type]) { die("ERROR: multiple '%c' stanzas in lav file.\n", stanza_type); } if (stanza_type == 'a') { aln_block = la_new_alignment_block(-1, -1, -1, -1, -1, NULL); lst_push_ptr(lpwa->alignment_blocks, aln_block); } } /* end current stanza */ else if (str_equals_charstr(line, "}")) { if (stanza_type == '\0') { die("ERROR: end stanza without matching begin.\n"); } done_with[(int)stanza_type] = 1; stanza_type = '\0'; } else if (stanza_type == 'd') { ; /* do nothing for now */ } else if (stanza_type == 's') { int beg, end; String *tmpstr, *fname, *seq=NULL; FILE *F2; str_double_trim(line); str_split(line, NULL, fields); if (lst_size(fields) != 3 || str_as_int(lst_get_ptr(fields, 1), &beg) != 0 || str_as_int(lst_get_ptr(fields, 2), &end) != 0) { die("ERROR: bad line in 's' stanza in lav file.\n"); } tmpstr = lst_get_ptr(fields, 0); fname = str_new(tmpstr->length-2); /* remove quotes */ str_substring(fname, tmpstr, 1, tmpstr->length-2); if (read_seqs) { F2 = phast_fopen(fname->chars, "r"); seq = msa_read_seq_fasta(F2); phast_fclose(F2); } for (i = 0; i < lst_size(fields); 
i++) str_free(lst_get_ptr(fields, i)); if (beg != 1) { die("ERROR: unexpected begin index in 's' stanza of lav file (begin index currently must be 1).\n"); } if (lpwa->query_len == -1) { lpwa->query_len = end; if (read_seqs) lpwa->query_seq = seq; } else if (lpwa->target_len == -1) { lpwa->target_len = end; if (read_seqs) lpwa->target_seq = seq; } else { die("ERROR: too many sequences listed in 's' stanza of lav file.\n"); } str_free(fname); } else if (stanza_type == 'h') { String *name; str_double_trim(line); name = str_new(line->length-3); /* get rid of quotes and leading '>' */ str_substring(name, line, 2, line->length-3); if (lpwa->query_name == NULL) lpwa->query_name = name; else if (lpwa->target_name == NULL) lpwa->target_name = name; else { die("ERROR: too many entries in 'h' stanza of lav file.\n"); } } else if (stanza_type == 'a') { String *type; int val[6]; if (!done_with[(int)'s'] || !done_with[(int)'d'] || !done_with[(int)'h']) { die("ERROR: 'a' stanza appears in lav file before 'd', 's', or 'h' stanza.\n"); } str_double_trim(line); str_split(line, NULL, fields); type = lst_get_ptr(fields, 0); if (lst_size(fields) > 6) { die("ERROR: illegal line in 'a' stanza.\n"); } for (i = 1; i < lst_size(fields); i++) { str_as_int(lst_get_ptr(fields, i), &val[i]); str_free(lst_get_ptr(fields, i)); } if (type->chars[0] == 's') aln_block->score = val[1]; else if (type->chars[0] == 'b') { aln_block->query_beg = val[1]; aln_block->target_beg = val[2]; } else if (type->chars[0] == 'e') { aln_block->query_end = val[1]; aln_block->target_end = val[2]; } else if (type->chars[0] == 'l') lst_push_ptr(aln_block->gapless_alns, la_new_gapless_aln(val[1], val[3], val[2], val[4])); str_free(type); } } str_free(line); lst_free(fields); str_re_free(stanza_start_re); return lpwa; }
/* R entry point for phyloFit.  Unpacks the R arguments into a
   phyloFit_struct (created with RPHAST defaults), calls run_phyloFit,
   and returns pf->results converted with rph_listOfLists_to_SEXP.
   Arguments equal to R_NilValue are treated as "not supplied" wherever
   the code below tests for it.  If an initial model is supplied it is
   re-initialized in place with tm_reinit, using rate constants taken
   from rateConstantsP or, failing that, from the model itself.  An
   unrecognized precision string is reported through die() after the
   log file has been closed and the RNG state restored. */
SEXP rph_phyloFit(SEXP msaP,
                  SEXP treeStrP,
                  SEXP substModP,
                  SEXP scaleOnlyP,
                  SEXP scaleSubtreeP,
                  SEXP nratesP,
                  SEXP alphaP,
                  SEXP rateConstantsP,
                  SEXP initModP,
                  SEXP initBackgdFromDataP,
                  SEXP initRandomP,
                  SEXP initParsimonyP,
                  SEXP clockP,
                  SEXP emP,
                  SEXP maxEmItsP,
                  SEXP precisionP,
                  SEXP gffP,
                  SEXP ninfSitesP,
                  SEXP quietP,
                  SEXP noOptP,
                  SEXP boundP,
                  SEXP logFileP,
                  SEXP selectionP) {
  struct phyloFit_struct *pf;
  int numProtect = 0, i;
  double *doubleP;
  char *die_message = NULL;
  SEXP rv = R_NilValue;
  List *new_rate_consts = NULL;
  List *new_rate_weights = NULL;

  GetRNGstate(); /* seed R's random number generator */
  pf = phyloFit_struct_new(1);  /* sets appropriate defaults for RPHAST mode */

  pf->msa = (MSA*)EXTPTR_PTR(msaP);

  if (treeStrP != R_NilValue)
    pf->tree = rph_tree_new(treeStrP);

  pf->use_em = LOGICAL_VALUE(emP);

  if (rateConstantsP != R_NilValue) {
    PROTECT(rateConstantsP = AS_NUMERIC(rateConstantsP));
    numProtect++;
    doubleP = NUMERIC_POINTER(rateConstantsP);
    new_rate_consts = lst_new_dbl(LENGTH(rateConstantsP));
    for (i = 0; i < LENGTH(rateConstantsP); i++)
      lst_push_dbl(new_rate_consts, doubleP[i]);
  }

  if (initModP != R_NilValue) {
    pf->input_mod = (TreeModel*)EXTPTR_PTR(initModP);
    pf->subst_mod = pf->input_mod->subst_mod;
    tm_register_protect(pf->input_mod);

    /* fall back on the model's own rate constants if none were given */
    if (new_rate_consts == NULL && pf->input_mod->rK != NULL &&
        pf->input_mod->nratecats > 1) {
      new_rate_consts = lst_new_dbl(pf->input_mod->nratecats);
      for (i = 0; i < pf->input_mod->nratecats; i++)
        lst_push_dbl(new_rate_consts, pf->input_mod->rK[i]);
    }

    if (pf->input_mod->empirical_rates && pf->input_mod->freqK != NULL &&
        pf->input_mod->nratecats > 1) {
      new_rate_weights = lst_new_dbl(pf->input_mod->nratecats);
      for (i = 0; i < pf->input_mod->nratecats; i++)
        lst_push_dbl(new_rate_weights, pf->input_mod->freqK[i]);
    }

    tm_reinit(pf->input_mod,
              rph_get_subst_mod(substModP),
              nratesP == R_NilValue ? pf->input_mod->nratecats : INTEGER_VALUE(nratesP),
              NUMERIC_VALUE(alphaP),
              new_rate_consts,
              new_rate_weights);
  } else {
    if (nratesP != R_NilValue)
      pf->nratecats = INTEGER_VALUE(nratesP);
    if (alphaP != R_NilValue)
      pf->alpha = NUMERIC_VALUE(alphaP);
    if (rateConstantsP != R_NilValue) {
      pf->rate_consts = new_rate_consts;
      if (nratesP == R_NilValue)
        pf->nratecats = lst_size(new_rate_consts);
      else if (lst_size(new_rate_consts) != pf->nratecats)
        die("length of new_rate_consts does not match nratecats\n");
    }
  }

  /* note: this overrides the subst_mod taken from the initial model
     above, if any */
  pf->subst_mod = rph_get_subst_mod(substModP);

  pf->estimate_scale_only = LOGICAL_VALUE(scaleOnlyP);

  if (scaleSubtreeP != R_NilValue) {
    pf->subtree_name = smalloc((1+strlen(CHARACTER_VALUE(scaleSubtreeP)))*sizeof(char));
    strcpy(pf->subtree_name, CHARACTER_VALUE(scaleSubtreeP));
  }

  pf->random_init = LOGICAL_VALUE(initRandomP);

  pf->init_backgd_from_data = LOGICAL_VALUE(initBackgdFromDataP);

  pf->init_parsimony = LOGICAL_VALUE(initParsimonyP);

  pf->assume_clock = LOGICAL_VALUE(clockP);

  if (maxEmItsP != R_NilValue)
    pf->max_em_its = INTEGER_VALUE(maxEmItsP);

  pf->precision = get_precision(CHARACTER_VALUE(precisionP));
  if (pf->precision == OPT_UNKNOWN_PREC) {
    die_message = "invalid precision";
    goto rph_phyloFit_end;
  }

  if (gffP != R_NilValue) {
    pf->gff = (GFF_Set*)EXTPTR_PTR(gffP);
    gff_register_protect(pf->gff);
  }

  if (ninfSitesP != R_NilValue)
    pf->nsites_threshold = INTEGER_VALUE(ninfSitesP);

  pf->quiet = LOGICAL_VALUE(quietP);

  if (noOptP != R_NilValue) {
    /* join the elements of noOptP with commas.  len starts at the
       number of elements, which covers the separators plus the
       terminating NUL */
    int len = LENGTH(noOptP), pos = 0;
    char *temp;
    for (i = 0; i < LENGTH(noOptP); i++)
      len += strlen(CHARACTER_VALUE(STRING_ELT(noOptP, i)));
    temp = smalloc(len*sizeof(char));
    for (i = 0; i < LENGTH(noOptP); i++) {
      if (i != 0) temp[pos++] = ',';
      sprintf(&temp[pos], "%s", CHARACTER_VALUE(STRING_ELT(noOptP, i)));
      pos += strlen(CHARACTER_VALUE(STRING_ELT(noOptP, i)));
    }
    if (pos != len-1) die("ERROR parsing noOpt len=%i pos=%i\n", len, pos);
    temp[pos] = '\0';
    pf->nooptstr = str_new_charstr(temp);
  }

  if (boundP != R_NilValue) {
    pf->bound_arg = lst_new_ptr(LENGTH(boundP));
    for (i = 0; i < LENGTH(boundP); i++) {
      String *temp = str_new_charstr(CHARACTER_VALUE(STRING_ELT(boundP, i)));
      lst_push_ptr(pf->bound_arg, temp);
    }
  }

  if (logFileP != R_NilValue) {
    if (IS_CHARACTER(logFileP))
      pf->logf = phast_fopen(CHARACTER_VALUE(logFileP), "w+");
    else if (IS_LOGICAL(logFileP) &&
             LOGICAL_VALUE(logFileP)) {
      pf->logf = stdout;
    }
  }

  if (selectionP != R_NilValue) {
    pf->use_selection = TRUE;
    pf->selection = NUMERIC_VALUE(selectionP);
  }

  msa_register_protect(pf->msa);

  run_phyloFit(pf);
  rv = PROTECT(rph_listOfLists_to_SEXP(pf->results));
  numProtect++;

 rph_phyloFit_end:
  /* only close a log file that was opened here */
  if (pf->logf != NULL && pf->logf != stdout && pf->logf != stderr)
    phast_fclose(pf->logf);
  PutRNGstate();
  /* never pass a variable as the format string itself */
  if (die_message != NULL) die("%s", die_message);
  if (numProtect > 0)
    UNPROTECT(numProtect);
  return rv;
}
/* Transform the coordinates of all features in a GFF according to a
   local alignment.  Each feature in the original GFF will be replaced
   by zero or more features with transformed begin and end coordinates.
   The original features are "projected" onto the aligned (target)
   sequence via the alignment, in such a way that if a feature contains
   no aligned bases, then it will not be represented, and if a feature
   contains bases that align to multiple "blocks", then it will be
   split into several features, one for each block.  The general idea
   is that the new features should cover only those bases in the target
   sequence that align to bases in the query sequence.  Currently,
   however, insertions in the target sequence between gapless
   alignments of the same block are ignored, so that a transformed
   feature may contain some bases that do not directly align to the
   query sequence.  The rationale is that these insertions should
   generally be small, and should reflect small-scale events that do
   not radically disrupt the local properties of the sequence.

   A feature that overlaps a block but lies entirely between two of its
   gapless alignments has no aligned bases in that block; its
   projection would have end < start, so it is skipped for that block.

   The features of 'gff' are freed and replaced by the new features,
   and the set is re-sorted with gff_sort. */
void la_gff_transform(LocalPwAlignment *lpwa, GFF_Set *gff) {
  int i, j, k;
  int new_beg, new_end;
  List *new_features = lst_new_ptr(lst_size(gff->features));
  GFF_Feature *feat, *new_feat;

  for (i = 0; i < lst_size(gff->features); i++) {
    checkInterruptN(i, 1000);
    feat = lst_get_ptr(gff->features, i);

    /* this is a somewhat inefficient way to proceed, but the number of
       features and the number of alignment blocks is usually pretty
       small; will adjust strategy as needed */
    for (j = 0; j < lst_size(lpwa->alignment_blocks); j++) {
      AlignmentBlock *ab = lst_get_ptr(lpwa->alignment_blocks, j);
      new_beg = new_end = -1;

      if (!((ab->query_beg >= feat->start && ab->query_beg <= feat->end) ||
            (ab->query_end >= feat->start && ab->query_end <= feat->end) ||
            (feat->start >= ab->query_beg && feat->end <= ab->query_end)))
        continue;               /* block and feature do not overlap */

      if (feat->start <= ab->query_beg)
        /* feature extends to the left of the alignment block; use beg
           of block */
        new_beg = ab->target_beg;
      else {                    /* ab->query_beg < feat->start */
        /* find first corresponding base within a gapless alignment */
        for (k = 0; k < lst_size(ab->gapless_alns); k++) {
          GaplessAlignment *ga = lst_get_ptr(ab->gapless_alns, k);
          if (ga->query_beg >= feat->start) {
            /* gapless alignment overlaps the feature and the feature
               extends to the left (equal to or) beyond the ga; use the
               start of the ga */
            new_beg = ga->target_beg;
            break;
          }
          else if (ga->query_end >= feat->start) {
            /* gapless alignment overlaps the feature and the ga
               extends to the left beyond the feature; use the aligned
               base within the ga */
            new_beg = ga->target_beg + (feat->start - ga->query_beg);
            break;
          }
        }
      }

      if (feat->end >= ab->query_end)
        /* feature extends to the right of the alignment block; use end
           of block */
        new_end = ab->target_end;
      else {
        /* find last corresponding base within a gapless alignment */
        for (k = lst_size(ab->gapless_alns)-1; k >= 0; k--) {
          GaplessAlignment *ga = lst_get_ptr(ab->gapless_alns, k);
          if (ga->query_end <= feat->end) {
            /* gapless alignment overlaps the feature and the feature
               extends to the right (equal to or) beyond the ga; use
               the end of the ga */
            new_end = ga->target_end;
            break;
          }
          else if (ga->query_beg <= feat->end) {
            /* gapless alignment overlaps the feature and the ga
               extends to the right beyond the feature; use the aligned
               base within the ga */
            new_end = ga->target_beg + (feat->end - ga->query_beg);
            break;
          }
        }
      }

      if (!(new_beg != -1 && new_end != -1))
        die("ERROR: la_gff_transform: new_beg=%i new_end=%i\n",
            new_beg, new_end);

      /* the feature sits wholly in a query gap between two gapless
         alignments: new_beg came from the ga after it and new_end from
         the ga before it.  It has no aligned bases in this block, so
         emit nothing rather than an inverted feature */
      if (new_end < new_beg)
        continue;

      new_feat = gff_new_feature_copy(feat);
      new_feat->start = new_beg;
      new_feat->end = new_end;
      lst_push_ptr(new_features, new_feat);
    }
  }

  /* replace the old features with the projected ones */
  for (i = 0; i < lst_size(gff->features); i++)
    gff_free_feature(lst_get_ptr(gff->features, i));
  lst_free(gff->features);
  gff->features = new_features;
  gff_sort(gff);
}
int main(int argc, char *argv[]) { int check_start = 0, check_stop = 0, check_splice = 0, check_nonsense = 0, offset5 = 0, offset3 = 0, opt_idx, i, j, indel_strict = 0, no_output = 0, check_alignment = 0, splice_strict = 0; int ncons_tested, nkept, nconserved_exons; int nce_gap_type[NGAP_TYPES], nconsid[NTYPES], nfail[NTYPES]; double Nfrac = 0.05; char c; MSA *msa; GFF_Set *gff; msa_format_type msa_format = UNKNOWN_FORMAT; List *keepers, *problems = lst_new_ptr(10), *ends_adjusted = lst_new_ptr(1), *starts_adjusted = lst_new_ptr(1), *discards=NULL, *intron_splice = lst_new_ptr(10); char *rseq_fname = NULL; FILE *logf = NULL, *mlogf = NULL, *statsf = NULL, *discardf = NULL; cds_gap_type fshift_mode = FSHIFT_BAD; char *groupby = "transcript_id"; msa_coord_map *map; int *countNs, *countCDSs; FILE *infile; char *msa_fname; struct option long_opts[] = { {"start", 0, 0, 's'}, {"stop", 0, 0, 't'}, {"splice", 0, 0, 'l'}, {"nonsense", 0, 0, 'n'}, {"fshift", 0, 0, 'f'}, {"conserved", 0, 0, 'c'}, {"N-limit", 1, 0, 'N'}, {"clean-gaps", 0, 0, 'e'}, {"indel-strict", 0, 0, 'I'}, {"splice-strict", 0, 0, 'C'}, {"groupby", 1, 0, 'g'}, {"msa-format", 1, 0, 'i'}, {"refseq", 1, 0, 'r'}, {"offset5", 1, 0, 'o'}, {"offset3", 1, 0, 'p'}, {"no-output", 0, 0, 'x'}, {"discards", 1, 0, 'd'}, {"log", 1, 0, 'L'}, {"machine-log", 1, 0, 'M'}, {"stats", 1, 0, 'S'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; while ((c = (char)getopt_long(argc, argv, "N:i:r:L:M:S:g:d:stlnfceICxh", long_opts, &opt_idx)) != -1) { switch(c) { case 's': check_alignment = check_start = 1; break; case 't': check_alignment = check_stop = 1; break; case 'l': check_alignment = check_splice = 1; break; case 'n': check_alignment = check_nonsense = 1; break; case 'f': check_alignment = 1; fshift_mode = FSHIFT_OK; break; case 'c': check_alignment = check_start = check_stop = check_splice = check_nonsense = 1; if (fshift_mode < FSHIFT_OK) fshift_mode = FSHIFT_OK; break; case 'N': Nfrac = get_arg_dbl_bounds(optarg, 0, 1); break; case 
'e': check_alignment = 1; if (fshift_mode < CLN_GAPS) fshift_mode = CLN_GAPS; break; case 'I': check_alignment = 1; fshift_mode = NOVRLP_CLN_GAPS; indel_strict = 1; break; case 'C': check_alignment = check_splice = splice_strict = 1; break; case 'g': groupby = optarg; break; case 'i': msa_format = msa_str_to_format(optarg); if (msa_format == UNKNOWN_FORMAT) die("Bad alignment format.\n"); break; case 'r': rseq_fname = optarg; break; case 'o': offset5 = get_arg_int(optarg); break; case 'p': offset3 = get_arg_int(optarg); break; case 'L': logf = phast_fopen(optarg, "w+"); break; case 'M': mlogf = phast_fopen(optarg, "w+"); break; case 'S': statsf = phast_fopen(optarg, "w+"); break; case 'd': discardf = phast_fopen(optarg, "w+"); break; case 'x': no_output = 1; break; case 'h': printf("%s", HELP); exit(0); case '?': die("ERROR: Bad argument. Try the --help option.\n"); } } if (optind + 1 >= argc ) { die("ERROR: Missing required arguments. Try the --help option.\n"); } set_seed(-1); gff = gff_read_set(phast_fopen(argv[optind], "r")); msa_fname = argv[optind+1]; infile = phast_fopen(msa_fname, "r"); if (msa_format == UNKNOWN_FORMAT) msa_format = msa_format_for_content(infile, 1); if (msa_format == MAF) { msa = maf_read(infile, rseq_fname == NULL ? 
NULL : phast_fopen(rseq_fname, "r"), 1, NULL, NULL, NULL, -1, TRUE, NULL, NO_STRIP, FALSE); } else { msa = msa_new_from_file_define_format(infile, msa_format, NULL); if (msa->ss == NULL) ss_from_msas(msa, 1, 1, NULL, NULL, NULL, -1, 0); } if (!msa->ss->tuple_idx) die("ERROR: need ordered tuples\n"); msa_remove_N_from_alph(msa); /* for backward compatibility (old SS files) */ if (msa->idx_offset != 0) { /* avoids offset problem */ for (i = 0; i < lst_size(gff->features); i++) { GFF_Feature *f = lst_get_ptr(gff->features, i); f->start -= msa->idx_offset; f->end -= msa->idx_offset; } } /* set up coordinate map; assume GFF is for sequence 1 */ map = msa_build_coord_map(msa, 1); /* convert all features */ for (i = 0; i < lst_size(gff->features); i++) { GFF_Feature *f = lst_get_ptr(gff->features, i); int newstart, newend; if (f->start < 0 || f->end < f->start) die("ERROR: bad feature in GFF (start=%d, end=%d).\n", f->start, f->end); newstart = msa_map_seq_to_msa(map, f->start); newend = msa_map_seq_to_msa(map, f->end); if (newstart < 0 || newend < newstart) die("ERROR: unable to map coordinates for feature (start=%d, end=%d).\n", f->start, f->end); f->start = newstart; f->end = newend; } gff_group(gff, groupby); /* do this after coord conversion, or group coords and feature coords will be out of sync */ keepers = lst_new_ptr(lst_size(gff->features)); if (discardf != NULL) discards = lst_new_ptr(lst_size(gff->features)); ncons_tested = nkept = nconserved_exons = 0; for (i = 0; i < NTYPES; i++) nconsid[i] = 0; for (i = 0; i < NTYPES; i++) nfail[i] = 0; for (i = 0; i < NGAP_TYPES; i++) nce_gap_type[i] = 0; countNs = smalloc(msa->nseqs * sizeof(int)); countCDSs = smalloc(msa->nseqs * sizeof(int)); for (i = 0; i < lst_size(gff->groups); i++) { GFF_FeatureGroup *group = lst_get_ptr(gff->groups, i); List *gfeatures = group->features; GFF_Feature *feat; status_type status = OKAY; cds_gap_type gt = FSHIFT_BAD; problems_clear(problems); /* make sure have frame info for CDSs */ for 
(j = 0; j < lst_size(gfeatures); j++) { feat = lst_get_ptr(gfeatures, j); if (str_equals_charstr(feat->feature, GFF_CDS_TYPE) && feat->frame == GFF_NULL_FRAME) die("ERROR: Missing frame info for CDS.\n"); } /* First, exclude stop codons from cds's, if necessary (simplifies the detection of nonsense mutations). */ exclude_stops(group, starts_adjusted, ends_adjusted); /* In all cases, discard any group for which the reference sequence doesn't have valid splice sites or start/stop codons, or has a premature stop codon */ if (!ref_seq_okay(gfeatures, msa, offset3, indel_strict, splice_strict, problems)) { status = BAD_REF; nfail[BAD_REF]++; } else /* Everything else counts as a potentially valid group */ ncons_tested++; if (status == OKAY && check_alignment) { /* only bother with below if interested in cross-species conservation */ /* Check first to make sure there's alignment across species in the cds; if not, there's no need to look at individual features. */ for (j = 0; j < lst_size(gfeatures); j++) { feat = lst_get_ptr(gfeatures, j); if (str_equals_charstr(feat->feature, GFF_CDS_TYPE) && is_incomplete_alignment(feat, msa)) { status = NO_ALN; nfail[NO_ALN]++; problem_add(problems, feat, NO_ALN, -1, -1); break; } } if (status == OKAY) { /* we have alignment and agreement with the ref seq; now check feature by feature */ lst_clear(intron_splice); for (j = 0; j < msa->nseqs; j++) countNs[j] = countCDSs[j] = 0; for (j = 0; j < lst_size(gfeatures); j++) { feat = lst_get_ptr(gfeatures, j); if (feat->end - 1 >= msa->length) die("ERROR: feature extends beyond alignment (%d >= %d).\n", feat->end - 1, msa->length); if (check_start && str_equals_charstr(feat->feature, GFF_START_TYPE)) { nconsid[BAD_START]++; if (!is_conserved_start(feat, msa)) { status = BAD_START; problem_add(problems, feat, BAD_START, -1, -1); } } else if (check_stop && str_equals_charstr(feat->feature, GFF_STOP_TYPE)) { nconsid[BAD_STOP]++; if (!is_conserved_stop(feat, msa)) { status = BAD_STOP; 
problem_add(problems, feat, BAD_STOP, -1, -1); } } else if (check_splice && str_equals_charstr(feat->feature, SPLICE_5)) { nconsid[BAD_5_SPLICE]++; if (!is_conserved_5splice(feat, msa, offset5, splice_strict)) { status = BAD_5_SPLICE; problem_add(problems, feat, BAD_5_SPLICE, -1, -1); } else lst_push_ptr(intron_splice, feat); } else if (check_splice && str_equals_charstr(feat->feature, SPLICE_5_UTR)) { nconsid[BAD_5_SPLICE_UTR]++; if (!is_conserved_5splice(feat, msa, offset5, splice_strict)) { status = BAD_5_SPLICE_UTR; problem_add(problems, feat, BAD_5_SPLICE_UTR, -1, -1); } else lst_push_ptr(intron_splice, feat); } else if (check_splice && str_equals_charstr(feat->feature, SPLICE_3)) { nconsid[BAD_3_SPLICE]++; if (!is_conserved_3splice(feat, msa, offset3, splice_strict)) { status = BAD_3_SPLICE; problem_add(problems, feat, BAD_3_SPLICE, -1, -1); } else lst_push_ptr(intron_splice, feat); } else if (check_splice && str_equals_charstr(feat->feature, SPLICE_3)) { nconsid[BAD_3_SPLICE_UTR]++; if (!is_conserved_3splice(feat, msa, offset3, splice_strict)) { status = BAD_3_SPLICE_UTR; problem_add(problems, feat, BAD_3_SPLICE_UTR, -1, -1); } else lst_push_ptr(intron_splice, feat); } else if (str_equals_charstr(feat->feature, GFF_CDS_TYPE)) { if (fshift_mode > FSHIFT_BAD && (gt = get_cds_gap_type(feat, msa, problems)) < fshift_mode) { if (status == OKAY || status == NONSENSE) status = FSHIFT; } if (check_nonsense && !is_nonsense_clean(feat, msa, problems)) { if (status == OKAY) status = NONSENSE; } if (Nfrac < 1) get_N_counts(countNs, countCDSs, feat, msa); } } /* end loop through features in group */ /* still have to make sure splice sites are paired correctly (GT-AG, GC-AG, AT-AC) */ if (status == OKAY && !splice_strict && lst_size(intron_splice) >= 2 && !are_introns_okay(intron_splice, msa, problems, offset5, offset3)) status = BAD_INTRON; /* also check fraction of Ns */ if (Nfrac < 1) { enum {MY_OKAY, MY_FAIL, MY_WARN} Nstatus = MY_OKAY; for (j = 0; j < msa->nseqs; 
j++) { if ((double)countNs[j] / countCDSs[j] > Nfrac) Nstatus = MY_FAIL; if (Nstatus == MY_OKAY && countNs[j] > 0) Nstatus = MY_WARN; } if (Nstatus == MY_FAIL) { problem_add(problems, NULL, TOO_MANY_Ns, -1, -1); if (status == OKAY) status = TOO_MANY_Ns; } else if (Nstatus == MY_WARN) problem_add(problems, NULL, WARN_Ns, -1, -1); } /* if collecting stats, record counts for failures */ if (statsf != NULL) { if (status != OKAY) { for (j = 0; j < lst_size(problems); j++) { struct Problem *problem = lst_get_ptr(problems, j); status_type ftype = problem->status; if ((ftype == FSHIFT || ftype == NONSENSE) && status != FSHIFT && status != NONSENSE) continue; /* don't count secondary frame shifts and nonsense mutations */ if (ftype == BAD_INTRON && j % 2 == 0) continue; /* only count one of every pair of these */ nfail[ftype]++; } } /* also keep track of the total number of "conserved exons", and the number having each kind of gap */ if ((status == OKAY || (status == FSHIFT && gt >= FSHIFT_OK))) { nconserved_exons++; nce_gap_type[gt]++; /* number of conserved exons having given type of gaps */ } } } /* end if (status == OKAY) [checks for conserved features] */ } /* end if (status == OKAY && check_alignment) [all cross-species checks] */ /* now we have looked at the whole group; we just need to do some final accounting and logging */ if (status == OKAY) { nkept++; if (!no_output) { restore_stops(group, starts_adjusted, ends_adjusted); for (j = 0; j < lst_size(gfeatures); j++) lst_push_ptr(keepers, lst_get_ptr(gfeatures, j)); } if (logf != NULL && lst_size(problems) > 0) /* warnings only */ write_log(logf, group, status, problems, msa, map); if (mlogf != NULL) { /* no problem, need to add an okay status to log */ problem_add(problems, NULL, OKAY, -1, -1); write_machine_log(mlogf, group, problems, map); /* may include warnings */ } } else { if (discardf != NULL) { restore_stops(group, starts_adjusted, ends_adjusted); for (j = 0; j < lst_size(gfeatures); j++) 
lst_push_ptr(discards, lst_get_ptr(gfeatures, j)); } if (logf != NULL) write_log(logf, group, status, problems, msa, map); if (mlogf != NULL) write_machine_log(mlogf, group, problems, map); } } /* end loop over groups */ /* write main output and discards */ if (!no_output || discardf != NULL) { /* first map features back to coord frame of reference seq. */ for (i = 0; i < lst_size(gff->features); i++) { GFF_Feature *f = lst_get_ptr(gff->features, i); f->start = msa_map_msa_to_seq(map, f->start) + msa->idx_offset; f->end = msa_map_msa_to_seq(map, f->end) + msa->idx_offset; } if (!no_output) { gff->features = keepers; gff_print_set(stdout, gff); } if (discardf != NULL) { gff->features = discards; gff_print_set(discardf, gff); } } /* dump counts to stats file */ if (statsf != NULL) { fprintf(statsf, "#%11s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s\n", "total", "nbad_ref", "nconsid", "nkept", "nno_aln", "nbad_starts", "(out of)", "nbad_stops", "(out of)", "nbad_5spl", "(out of)", "nbad_3spl", "(out of)", "nbad_5utr", "(out of)", "nbad_3utr", "(out of)", "nbad_intron", "nnons", "nfshifts", "nNs", "ncons_exons", "nce_ngaps", "nce_nov_cln", "nce_clean", "nce_fshftok"); fprintf(statsf, "%12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d\n", nfail[BAD_REF]+ncons_tested, nfail[BAD_REF], ncons_tested, nkept, nfail[NO_ALN], nfail[BAD_START], nconsid[BAD_START], nfail[BAD_STOP], nconsid[BAD_STOP], nfail[BAD_5_SPLICE], nconsid[BAD_5_SPLICE], nfail[BAD_3_SPLICE], nconsid[BAD_3_SPLICE], nfail[BAD_5_SPLICE_UTR], nconsid[BAD_5_SPLICE_UTR], nfail[BAD_3_SPLICE_UTR], nconsid[BAD_3_SPLICE_UTR], nfail[BAD_INTRON], nfail[NONSENSE], nfail[FSHIFT], nfail[TOO_MANY_Ns], nconserved_exons, nce_gap_type[NGAPS], nce_gap_type[NOVRLP_CLN_GAPS], nce_gap_type[CLN_GAPS], nce_gap_type[FSHIFT_OK]); fprintf(statsf, "%s", STATS_DESCRIPTION); } if 
(logf != NULL) phast_fclose(logf); if (mlogf != NULL) phast_fclose(mlogf); if (statsf != NULL) phast_fclose(statsf); if (discardf != NULL) phast_fclose(discardf); return 0; }
/* Examine the alignment columns spanned by a CDS feature, sequence by
   sequence, and classify the gaps found there.

   Return value (a cds_gap_type, returned as int):
     NGAPS            no gap characters anywhere in the span
     FSHIFT_BAD       some run of gaps has a length that is not a multiple
                      of 3; *problem is set to a newly created FSHIFT
                      problem covering that run and scanning stops
     NOVRLP_CLN_GAPS  all runs have length divisible by 3, no two distinct
                      runs overlap, and none lies within 3 sites of either
                      end of the feature
     CLN_GAPS         all runs have length divisible by 3, but the
                      conditions for NOVRLP_CLN_GAPS do not hold

   Compensatory indels are not considered here.  *problem is written only
   in the FSHIFT_BAD case. */
int scan_for_gaps(GFF_Feature *feat, MSA *msa, Problem **problem) {
  const int first = feat->start - 1;   /* span converted to 0-based indices */
  const int last = feat->end - 1;
  int seq, pos, idx;
  int touches_boundary = 0;
  cds_gap_type result = NGAPS;
  List *gap_list = lst_new_ptr(10);

  for (seq = 0; result != FSHIFT_BAD && seq < msa->nseqs; seq++) {
    for (pos = first; pos <= last; pos++) {
      int lo, hi;
      struct gap *run;

      if (ss_get_char_pos(msa, pos, seq, 0) != GAP_CHAR)
        continue;

      /* extend to the full run of gap characters around pos, clipped to
         the feature; lo and hi are both inclusive */
      lo = pos;
      while (lo - 1 >= first &&
             ss_get_char_pos(msa, lo - 1, seq, 0) == GAP_CHAR)
        lo--;
      hi = pos;
      while (hi + 1 <= last &&
             ss_get_char_pos(msa, hi + 1, seq, 0) == GAP_CHAR)
        hi++;

      if ((hi - lo + 1) % 3 != 0) {
        /* run length not a multiple of three: report and give up (the
           outer loop terminates through its own condition) */
        result = FSHIFT_BAD;
        *problem = problem_new(feat, FSHIFT, lo, hi);
        (*problem)->cds_gap = FSHIFT_BAD;
        break;
      }

      /* remember whether any run lies within 3 sites of a cds boundary */
      if (lo <= first + 3 || hi >= last - 3)
        touches_boundary = 1;

      if (result == NGAPS)
        result = CLN_GAPS;

      run = smalloc(sizeof(struct gap));
      run->start = lo;
      run->end = hi;
      lst_push_ptr(gap_list, run);

      pos = hi;                 /* resume scanning after this run */
    }
  }

  if (result == CLN_GAPS) {
    /* look for overlaps between runs after sorting with gap_compare; two
       runs with identical start and end are not treated as overlapping */
    lst_qsort(gap_list, gap_compare);
    result = NOVRLP_CLN_GAPS;
    for (idx = 1; idx < lst_size(gap_list); idx++) {
      struct gap *prev = lst_get_ptr(gap_list, idx - 1);
      struct gap *cur = lst_get_ptr(gap_list, idx);
      int identical = (cur->start == prev->start && cur->end == prev->end);

      if (cur->start <= prev->end && !identical) {
        result = CLN_GAPS;
        break;
      }
    }

    /* note: the boundary criterion is deliberately folded into the overlap
       criterion here, so a gap near the boundary downgrades the result */
    if (result == NOVRLP_CLN_GAPS && touches_boundary)
      result = CLN_GAPS;
  }

  for (idx = 0; idx < lst_size(gap_list); idx++)
    sfree(lst_get_ptr(gap_list, idx));
  lst_free(gap_list);

  return result;
}
int main(int argc, char *argv[]) { char *msa_fname = NULL, *alph = "ACGT"; msa_format_type input_format = UNKNOWN_FORMAT; char c; int opt_idx, seed=-1; String *optstr; List *tmplist = NULL; struct phyloFit_struct *pf; FILE *infile; struct option long_opts[] = { {"msa", 1, 0, 'm'}, {"tree", 1, 0, 't'}, {"subst-mod", 1, 0, 's'}, {"msa-format", 1, 0, 'i'}, {"nrates", 1, 0, 'k'}, {"alpha", 1, 0, 'a'}, {"features", 1, 0, 'g'}, {"catmap", 1, 0, 'c'}, {"log", 1, 0, 'l'}, {"out-root", 1, 0, 'o'}, {"EM", 0, 0, 'E'}, {"error", 1, 0, 'e'}, {"precision", 1, 0, 'p'}, {"do-cats", 1, 0, 'C'}, {"non-overlapping", 0, 0, 'V'}, {"markov", 0, 0, 'N'}, {"reverse-groups", 1, 0, 'R'}, {"init-model", 1, 0, 'M'}, {"init-random", 0, 0, 'r'}, {"init-parsimony", 0, 0, 'y'}, {"print-parsimony", 1, 0, 'Y'}, {"lnl", 0, 0, 'L'}, {"scale-only", 0, 0, 'B'}, {"scale-subtree", 1, 0, 'S'}, {"estimate-freqs", 0, 0, 'F'}, {"sym-freqs", 0, 0, 'W'}, {"no-freqs", 0, 0, 'f'}, {"no-rates", 0, 0, 'n'}, {"no-opt", 1, 0, 'O'}, {"min-informative", 1, 0, 'I'}, {"gaps-as-bases", 0, 0, 'G'}, {"quiet", 0, 0, 'q'}, {"help", 0, 0, 'h'}, {"windows", 1, 0, 'w'}, {"windows-explicit", 1, 0, 'v'}, {"ancestor", 1, 0, 'A'}, {"post-probs", 0, 0, 'P'}, {"expected-subs", 0, 0, 'X'}, {"expected-total-subs", 0, 0, 'Z'}, {"expected-subs-col", 0, 0, 'J'}, {"column-probs", 0, 0, 'U'}, {"rate-constants", 1, 0, 'K'}, {"ignore-branches", 1, 0, 'b'}, {"clock", 0, 0, 'z'}, {"alt-model", 1, 0, 'd'}, {"label-branches", 1, 0, 0}, {"label-subtree", 1, 0, 0}, {"selection", 1, 0, 0}, {"bound", 1, 0, 'u'}, {"seed", 1, 0, 'D'}, {0, 0, 0, 0} }; // NOTE: remaining shortcuts left: HjQx pf = phyloFit_struct_new(0); while ((c = (char)getopt_long(argc, argv, "m:t:s:g:c:C:i:o:k:a:l:w:v:M:p:A:I:K:S:b:d:O:u:Y:e:D:GVENRqLPXZUBFfnrzhWyJ", long_opts, &opt_idx)) != -1) { switch(c) { case 'm': msa_fname = optarg; break; case 't': if (optarg[0] == '(') /* in this case, assume topology given at command line */ pf->tree = tr_new_from_string(optarg); else 
pf->tree = tr_new_from_file(phast_fopen(optarg, "r")); break; case 's': pf->subst_mod = tm_get_subst_mod_type(optarg); if (pf->subst_mod == UNDEF_MOD) die("ERROR: illegal substitution model. Type \"phyloFit -h\" for usage.\n"); break; case 'g': pf->gff = gff_read_set(phast_fopen(optarg, "r")); break; case 'c': pf->cm = cm_new_string_or_file(optarg); break; case 'C': pf->cats_to_do_str = get_arg_list(optarg); break; case 'V': pf->nonoverlapping = TRUE; break; case 'o': pf->output_fname_root = optarg; break; case 'k': pf->nratecats = get_arg_int_bounds(optarg, 0, INFTY); break; case 'a': pf->alpha = get_arg_dbl(optarg); break; case 'R': pf->reverse_group_tag = optarg; break; case 'i': input_format = msa_str_to_format(optarg); if (input_format == UNKNOWN_FORMAT) die("ERROR: unrecognized alignment format. Type 'phyloFit -h' for usage.\n"); break; case 'l': if (!strcmp(optarg, "-")) pf->logf = stderr; else pf->logf = phast_fopen(optarg, "w+"); break; case 'N': pf->use_conditionals = 1; break; case 'w': tmplist = get_arg_list(optarg); if (lst_size(tmplist) != 2 || str_as_int(lst_get_ptr(tmplist, 0), &(pf->window_size)) != 0 || str_as_int(lst_get_ptr(tmplist, 1), &(pf->window_shift)) != 0) die("ERROR: illegal arguments to --windows.\n"); lst_free_strings(tmplist); lst_free(tmplist); break; case 'v': tmplist = get_arg_list(optarg); if (lst_size(tmplist) % 2 != 0) die("ERROR: argument to --windows-explicit must be a list of even length.\n"); pf->window_coords = str_list_as_int(tmplist); lst_free(tmplist); break; case 'E': pf->use_em = TRUE; break; case 'e': pf->error_fname=optarg; break; case 'p': if (!strcmp(optarg, "LOW")) pf->precision = OPT_LOW_PREC; else if (!strcmp(optarg, "MED")) pf->precision = OPT_MED_PREC; else if (!strcmp(optarg, "HIGH")) pf->precision = OPT_HIGH_PREC; else if (!strcmp(optarg, "VERY_HIGH")) pf->precision = OPT_VERY_HIGH_PREC; else die("ERROR: --precision must be LOW, MED, or HIGH.\n\n"); break; case 'M': pf->input_mod = 
tm_new_from_file(phast_fopen(optarg, "r"), 1); break; case 'r': pf->random_init = TRUE; break; case 'y': pf->init_parsimony = TRUE; break; case 'Y': pf->init_parsimony = TRUE; pf->parsimony_cost_fname = optarg; pf->parsimony_only=TRUE; break; case 'L': pf->likelihood_only = TRUE; break; case 'q': pf->quiet = TRUE; break; case 'P': pf->do_bases = TRUE; break; case 'X': pf->do_expected_nsubst = TRUE; break; case 'Z': pf->do_expected_nsubst_tot = TRUE; break; case 'J': pf->do_expected_nsubst_col = TRUE; break; case 'U': pf->likelihood_only = TRUE; /* force -L */ pf->nsites_threshold = 0; /* also force this; typical use is with small number of tuples, no tuple_idx */ pf->do_column_probs = TRUE; break; case 'A': pf->root_seqname = optarg; break; case 'I': pf->nsites_threshold = get_arg_int(optarg); break; case 'G': pf->gaps_as_bases = TRUE; alph = "ACGT-"; break; case 'B': pf->estimate_scale_only = TRUE; break; case 'S': pf->subtree_name = optarg; break; case 'F': pf->estimate_backgd = TRUE; break; case 'W': pf->estimate_backgd = TRUE; pf->symfreq = TRUE; break; case 'f': pf->no_freqs = TRUE; break; case 'n': pf->no_rates = TRUE; break; case 'K': tmplist = get_arg_list(optarg); pf->rate_consts = str_list_as_dbl(tmplist); pf->nratecats = lst_size(pf->rate_consts); pf->use_em = 1; lst_free_strings(tmplist); lst_free(tmplist); break; case 'b': pf->ignore_branches = get_arg_list(optarg); break; case 'z': pf->assume_clock = TRUE; break; case 'O': if (pf->nooptstr == NULL) pf->nooptstr = str_new_charstr(optarg); else die("ERROR: no-opt argument can only be used once! 
parameters can be comma-separated list."); break; case 'd': if (pf->alt_mod_str == NULL) { pf->alt_mod_str = lst_new_ptr(1); } optstr = str_new_charstr(optarg); lst_push_ptr(pf->alt_mod_str, optstr); break; case 0: if (strcmp(long_opts[opt_idx].name, "label-branches") == 0 || strcmp(long_opts[opt_idx].name, "label-subtree") == 0) { optstr = str_new_charstr(optarg); if (pf->label_str == NULL) { pf->label_str = lst_new_ptr(3); pf->label_type = lst_new_int(3); } lst_push_ptr(pf->label_str, optstr); lst_push_int(pf->label_type, strcmp(long_opts[opt_idx].name, "label-branches") == 0 ? BRANCH_TYPE : SUBTREE_TYPE); } else if (strcmp(long_opts[opt_idx].name, "selection") == 0) { pf->selection = get_arg_dbl(optarg); pf->use_selection = TRUE; } else { die("ERROR: unknown option. Type 'phyloFit -h' for usage.\n"); } break; case 'u': if (pf->bound_arg == NULL) pf->bound_arg = lst_new_ptr(1); optstr = str_new_charstr(optarg); lst_push_ptr(pf->bound_arg, optstr); break; case 'D': seed = get_arg_int_bounds(optarg, 1, INFTY); break; case 'h': printf("%s", HELP); exit(0); case '?': die("ERROR: illegal argument. Type 'phyloFit -h' for usage.\n"); } } set_seed(seed); if (msa_fname == NULL) { if (optind >= argc) die("ERROR: missing alignment filename. Type 'phyloFit -h' for usage.\n"); msa_fname = argv[optind]; pf->msa_fname = msa_fname; } infile = phast_fopen(msa_fname, "r"); if (input_format == UNKNOWN_FORMAT) input_format = msa_format_for_content(infile, 1); if (pf->nonoverlapping && (pf->use_conditionals || pf->gff != NULL || pf->cats_to_do_str || input_format == SS)) die("ERROR: cannot use --non-overlapping with --markov, --features,\n--msa-format SS, or --do-cats.\n"); /* read alignment */ if (!pf->quiet) fprintf(stderr, "Reading alignment from %s ...\n", msa_fname); if (input_format == MAF) { pf->msa = maf_read(infile, NULL, tm_order(pf->subst_mod) + 1, NULL, pf->gff, pf->cm, pf->nonoverlapping ? 
tm_order(pf->subst_mod) + 1 : -1, FALSE, pf->reverse_group_tag, NO_STRIP, FALSE); if (pf->gaps_as_bases) msa_reset_alphabet(pf->msa, alph); } else pf->msa = msa_new_from_file_define_format(infile, input_format, alph); /* set up for categories */ /* first label sites, if necessary */ pf->label_categories = (input_format != MAF); run_phyloFit(pf); if (pf->logf != NULL && pf->logf != stderr && pf->logf != stdout) phast_fclose(pf->logf); if (!pf->quiet) fprintf(stderr, "Done.\n"); sfree(pf); return 0; }
/* Convenience wrapper: build a Problem record with problem_new() from the
   given feature, status and coordinate pair, append it to the `problems`
   list, and return the newly created record to the caller. */
Problem *problem_add(List *problems, GFF_Feature *feat, status_type status,
                     int start, int end) {
  Problem *created;

  created = problem_new(feat, status, start, end);
  lst_push_ptr(problems, created);

  return created;
}
/* Reconstruct indels by parsimony and assign all base probs to -1 where
   ancestral bases are inferred not to have been present.

   For every column tuple of msa->ss, the entries
   mod->tree_posteriors->base_probs[0][j][node->id][tup] (for all j below
   mod->rate_matrix->size) are overwritten with -1 at each internal node
   that is inferred to have been a gap.  Leaves are never marked.

   Per tuple:
     - with at most one gapped sequence nothing is marked;
     - with at most one non-gapped sequence every internal node is marked;
     - otherwise the LCA of the non-gapped leaves is located, internal
       nodes outside its subtree are marked (the root is spared when the
       LCA is a child of the root), and, if the subtree holds more than one
       gapped leaf, Dollo parsimony decides which internal nodes inside the
       subtree are marked.

   Dies if a sequence name has no matching node in the tree, or if the
   min/max leaf-id sanity checks fail. */
void do_indels(MSA *msa, TreeModel *mod) {
  int s, tup, i, j;
  TreeNode *n, *lca;
  char c;
  typedef enum {IGNORE, GAP, BASE, MISSING, AMBIG} label_type;
  List *postorder;
  label_type *label = smalloc(mod->tree->nnodes * sizeof(label_type));
  List *inside = lst_new_ptr(mod->tree->nnodes),
    *outside = lst_new_ptr(mod->tree->nnodes),
    *ambig_cases = lst_new_ptr(mod->tree->nnodes);
  int *seq_to_leaf = smalloc(msa->nseqs * sizeof(int));

  /* build mapping from seqs to leaf indices in tree */
  for (s = 0; s < msa->nseqs; s++) {
    TreeNode *leaf = tr_get_node(mod->tree, msa->names[s]);
    if (leaf == NULL)
      die("ERROR: no match for sequence \"%s\" in tree.\n", msa->names[s]);
    seq_to_leaf[s] = leaf->id;
  }

  if (mod->msa_seq_idx == NULL)
    tm_build_seq_idx(mod, msa);

  postorder = tr_postorder(mod->tree);

  for (tup = 0; tup < msa->ss->ntuples; tup++) {
    int min = mod->tree->nnodes, max = -1, ngaps = 0, skip_root = FALSE;

    /* find min and max ids of seqs that actually have bases (non-gaps) */
    for (s = 0; s < msa->nseqs; s++) {
      if (ss_get_char_tuple(msa, tup, s, 0) == GAP_CHAR) {
        ngaps++;
        continue;
      }
      if (seq_to_leaf[s] < min) min = seq_to_leaf[s];
      if (seq_to_leaf[s] > max) max = seq_to_leaf[s];
      /* NOTE: missing data being handled like bases here; in some
         cases, a base may be inferred at an ancestral node, when
         the only evidence for it is missing data in the leaves.
         There are ambiguous cases; we'll err on the side of
         predicting bases rather than indels */
    }

    if (ngaps <= 1)
      continue;                 /* short cut -- impossible to infer gaps
                                   in ancestors */

    else if (ngaps >= msa->nseqs - 1) {
      /* in this case, all ancestors must be gaps */
      for (i = 0; i < mod->tree->nnodes; i++) {
        n = lst_get_ptr(mod->tree->nodes, i);
        if (n->lchild == NULL || n->rchild == NULL)
          continue;             /* ignore leaves */
        for (j = 0; j < mod->rate_matrix->size; j++)
          mod->tree_posteriors->base_probs[0][j][n->id][tup] = -1;
                                /* mark as gap */
      }
      continue;
    }

    /* sanity checks; min and max are ints, so they are printed with %d */
    if (min < 0)
      die("prequel.c: min = %d < 0\n", min);
    if (max < min)
      die("prequel.c: max (%d) < min (%d)", max, min);

    /* the LCA of all leaves with non-gaps must be the first ancestor of
       the node with the max id that has an id smaller than the min
       id.  This is based on the assumption that node ids are assigned
       sequentially in a preorder traversal of the tree, which will be
       true as long as the tree is read from a Newick file by the code
       in trees.c */
    for (lca = lst_get_ptr(mod->tree->nodes, max); lca->id > min;
         lca = lca->parent);

    /* by parsimony, the base was inserted on the branch to the LCA,
       and all ancestral nodes outside the subtree rooted at the LCA
       did not have bases */

    if (lca == mod->tree->lchild || lca == mod->tree->rchild)
      skip_root = TRUE;         /* don't mark root as gap in this case:
                                   can't distinguish insertion from
                                   deletion so assume deletion */

    /* mark ancestral bases outside subtree beneath LCA as gaps */
    tr_partition_nodes(mod->tree, lca, inside, outside);
    for (i = 0; i < mod->tree->nnodes; i++) label[i] = BASE;
    for (i = 0; i < lst_size(outside); i++) {
      n = lst_get_ptr(outside, i);
      label[n->id] = IGNORE;
      if (n->lchild == NULL || n->rchild == NULL)
        continue;               /* skip leaves */
      if (n == mod->tree && skip_root)
        continue;               /* skip root if condition above */
      for (j = 0; j < mod->rate_matrix->size; j++)
        mod->tree_posteriors->base_probs[0][j][n->id][tup] = -1;
                                /* mark as gap */
    }

    /* check for gaps in subtree; if there's at most one, we can go on;
       otherwise have to use parsimony to infer history in subtree */
    ngaps = 0;
    for (i = 0; i < lst_size(inside); i++) {
      n = lst_get_ptr(inside, i);
      if (n->lchild == NULL &&
          ss_get_char_tuple(msa, tup, mod->msa_seq_idx[n->id], 0) == GAP_CHAR)
        ngaps++;
    }
    if (ngaps <= 1) continue;

    /* use Dollo parsimony to infer the indel history of the subtree
       beneath the LCA.  Use the fact that every base must have a
       chain of bases to the LCA, because, assuming the alignment is
       correct, no insertions are possible beneath the LCA */
    lst_clear(ambig_cases);
    for (i = 0; i < lst_size(postorder); i++) {
      n = lst_get_ptr(postorder, i);
      if (label[n->id] == IGNORE)
        continue;               /* outside subtree */

      /* MISSING means all leaves beneath node have missing data */
      /* AMBIG means combination of gaps and missing data beneath node */

      else if (n->lchild == NULL) { /* leaf in subtree */
        c = ss_get_char_tuple(msa, tup, mod->msa_seq_idx[n->id], 0);
        if (c == GAP_CHAR)
          label[n->id] = GAP;
        else if (msa->is_missing[(int)c])
          label[n->id] = MISSING;
        else
          label[n->id] = BASE;
      }
      else {                    /* internal node in subtree */
        if (label[n->lchild->id] == BASE || label[n->rchild->id] == BASE)
          label[n->id] = BASE;  /* by Dollo parsimony */
        else if ((label[n->lchild->id] == GAP || label[n->lchild->id] == AMBIG) &&
                 (label[n->rchild->id] == GAP || label[n->rchild->id] == AMBIG))
          label[n->id] = GAP;   /* gaps from both sides and no bases --
                                   must be gap */
        else if (label[n->lchild->id] == MISSING && label[n->rchild->id] == MISSING)
          label[n->id] = MISSING;
        else {                  /* must be GAP/MISSING or AMBIG/MISSING */
          label[n->id] = AMBIG;
          lst_push_ptr(ambig_cases, n);
        }
      }
    }

    /* now resolve any ambiguities, by giving each ambiguous node the same
       label as its parent; traversing ambig_cases in reverse order
       ensures that parents are visited before children */

    /* first make sure root of subtree has a base */
    if (label[lca->id] == MISSING || label[lca->id] == AMBIG)
      label[lca->id] = BASE;
    /* in this case there is all missing data and gaps beneath the LCA;
       hard to know what is right, but let's force a base and err on
       the side of bases rather than gaps */

    for (i = lst_size(ambig_cases) - 1; i >= 0; i--) {
      n = lst_get_ptr(ambig_cases, i);
      if (n == lca) continue;
      else label[n->id] = label[n->parent->id];
    }

    /* now mark gaps inside subtree, as needed */
    for (i = 0; i < lst_size(inside); i++) {
      n = lst_get_ptr(inside, i);
      if (n->lchild == NULL || n->rchild == NULL) continue;
      if (label[n->id] == GAP)
        for (j = 0; j < mod->rate_matrix->size; j++)
          mod->tree_posteriors->base_probs[0][j][n->id][tup] = -1;
    }
  }

  lst_free(inside);
  lst_free(outside);
  lst_free(ambig_cases);
  sfree(seq_to_leaf);
  sfree(label);
}
int main(int argc, char* argv[]) { FILE* F; GFF_Set *gff_real=NULL, *gff_pred=NULL; char c; List *real_fname_list = NULL, *pred_fname_list = NULL, *feat_list = NULL, *seq_len_list = NULL, *l = NULL; int nfile, i, j; char *prefix = NULL; int tot_tp = 0, tot_fp = 0, tot_nreal_pos = 0, tot_npred_pos = 0, tot_seqlen = 0, tot_ncr = 0, tot_npca = 0, tot_nola = 0, tot_nme = 0, tot_npcp = 0, tot_nolp = 0, tot_nwe = 0, tot_nexons_real = 0, tot_nexons_pred = 0, dump_exons = 0, nnc = -1, tot_nnc = -1, nc_threshold = 0; while ((c = (char)getopt(argc, argv, "r:p:f:l:d:n:h")) != -1) { switch(c) { case 'r': real_fname_list = get_arg_list(optarg); break; case 'p': pred_fname_list = get_arg_list(optarg); break; case 'l': l = get_arg_list(optarg); /* convert to ints */ seq_len_list = lst_new_int(lst_size(l)); for (i = 0; i < lst_size(l); i++) { int tmp; if (str_as_int((String*)lst_get_ptr(l, i), &tmp) != 0) { die("ERROR: Bad integer in <seq_len_list>.\n"); } lst_push_int(seq_len_list, tmp); } break; case 'f': feat_list = get_arg_list(optarg); break; case 'd': dump_exons = 1; prefix = optarg; break; case 'n': nnc = tot_nnc = 0; nc_threshold = get_arg_int(optarg); break; case 'h': print_usage(); exit(0); case '?': die("Unrecognized option. Try \"eval_predictions -h\" for help.\n"); } } set_seed(-1); if (feat_list == NULL) { feat_list = lst_new_ptr(1); lst_push_ptr(feat_list, str_new_charstr(GFF_CDS_TYPE)); } if (real_fname_list == NULL || pred_fname_list == NULL || seq_len_list == NULL) { die("ERROR: Must specify -r, -p, and -l. 
Try \"eval_predictions -h\" for help.\n"); } if (lst_size(real_fname_list) != lst_size(pred_fname_list)) { die("ERROR: Must specify lists of equal length for real and predicted filenames.\n\n."); } if (lst_size(seq_len_list) == 1 && lst_size(real_fname_list) > 1) for (i = 1; i < lst_size(real_fname_list); i++) lst_push_int(seq_len_list, lst_get_int(seq_len_list, 0)); else if (lst_size(seq_len_list) != lst_size(real_fname_list)) die("ERROR: List of sequence lengths does not match lists of real and predicted filenames.\n"); /* print header */ printf("%-25s %-25s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s", "Real_fname", "Pred_fname", "Sn", "Sp", "AC", "CC", "ESn", "ESp", "CRa", "PCa", "OLa", "ME", "CRp", "PCp", "OLp", "WE"); if (nnc != -1) printf(" %7s %7s %7s %7s", "NCa", "NCp", "CR+NCa", "CR+NCp"); printf("\n"); for (nfile = 0; nfile < lst_size(real_fname_list); nfile++) { int tp, fp, nexons_real, nexons_pred, nwe, nme, ncr, npca, npcp, nola, nolp, nreal_pos, npred_pos, len_real, len_pred, seqlen, already_counted_real; String *real_fname, *pred_fname; GFF_Feature *feat_real, *feat_pred=NULL; real_fname = (String*)lst_get_ptr(real_fname_list, nfile); F = phast_fopen(real_fname->chars, "r"); if ((gff_real = gff_read_set(F)) == NULL) { die("ERROR: Unable to read file \"%s\".\n", real_fname->chars); } phast_fclose(F); pred_fname = (String*)lst_get_ptr(pred_fname_list, nfile); F = phast_fopen(pred_fname->chars, "r"); if ((gff_pred = gff_read_set(F)) == NULL) { die("ERROR: Unable to read file \"%s\".\n", pred_fname->chars); } phast_fclose(F); seqlen = lst_get_int(seq_len_list, nfile); /* sort ungrouped -- only cds exons will be considered, and each one will be considered individually */ gff_ungroup(gff_real); gff_ungroup(gff_pred); gff_sort(gff_real); gff_sort(gff_pred); nexons_real = nexons_pred = nwe = nme = ncr = npca = npcp = nola = nolp = tp = fp = nreal_pos = npred_pos = 0; if (nnc != -1) nnc = 0; i = j = 0; already_counted_real = 0; while (i < 
lst_size(gff_real->features)) { feat_real = (GFF_Feature*)lst_get_ptr(gff_real->features, i); if (!is_exon(feat_real, feat_list)) { i++; continue; } len_real = feat_real->end - feat_real->start + 1; if (!already_counted_real) { nexons_real++; nreal_pos += len_real; } /* look at all predicted exons up to and overlapping this real exon */ while (j < lst_size(gff_pred->features)) { feat_pred = (GFF_Feature*)lst_get_ptr(gff_pred->features, j); if (!is_exon(feat_pred, feat_list)) { j++; continue; } else if (feat_pred->start > feat_real->end) { if (!already_counted_real) { nme++; if (dump_exons) dump(prefix, feat_real, NULL, ME, -1); } break; } /* otherwise we have a predicted exon to count (start of pred <= end of real) */ nexons_pred++; len_pred = feat_pred->end - feat_pred->start + 1; npred_pos += len_pred; j++; /* we'll be done with this prediction one way or another; next time through look at a new one */ if (feat_pred->end < feat_real->start) { /* WE */ nwe++; fp += len_pred; if (dump_exons) dump(prefix, NULL, feat_pred, WE, 0); } else if (feat_pred->start == feat_real->start && /* CR */ feat_pred->end == feat_real->end) { ncr++; tp += len_pred; if (dump_exons) dump(prefix, feat_real, feat_pred, CR, 1); break; } else if (feat_pred->start == feat_real->start || /* PC */ feat_pred->end == feat_real->end) { pred_type type; npca++; npcp++; if (nnc != -1 && max(abs(feat_pred->start - feat_real->start), abs(feat_pred->end - feat_real->end)) <= nc_threshold) { nnc++; type = NC; } else type = PC; if (len_pred < len_real) tp += len_pred; else { tp += len_real; fp += (len_pred - len_real); } if (dump_exons) dump(prefix, feat_real, feat_pred, type, min(1, (double)len_real/len_pred)); break; } else { /* OL */ int overlap_size; pred_type type; nola++; nolp++; if (nnc != -1 && max(abs(feat_pred->start - feat_real->start), abs(feat_pred->end - feat_real->end)) <= nc_threshold) { nnc++; type = NC; } else type = PC; overlap_size = min(feat_pred->end, feat_real->end) - 
max(feat_pred->start, feat_real->start) + 1; tp += overlap_size; fp += len_pred - overlap_size; if (dump_exons) dump(prefix, feat_real, feat_pred, type, (double)overlap_size/len_pred); break; } /* NOTE: I'm ignoring the possibility that a predicted exon could be a PC and/or OL with respect to multiple real exons. The effect on the exon-level stats will be fairly minor (at worst a predicted exon is scored as an OL when it should be scored as an PC, and a real exon is erroneously counted as a ME), but the effect on the nucleotide-level Sn and Sp could conceivably be significant. */ } /* if we have counted at least one prediction (and thus failed to reach the end of the list), but the last prediction did not extend as far as the end of the real exon, then delay moving on to the next real exon */ if (j < lst_size(gff_pred->features) && feat_pred->end < feat_real->end) already_counted_real = 1; else { /* if we reached the end of the list of predictions, then it must not have contained any exons, and the real exon in question is a ME (if it hasn't already been counted) */ if (j == lst_size(gff_pred->features) && !already_counted_real) nme++; i++; already_counted_real = 0; } } /* any remaining predictions must be wrong */ for (; j < lst_size(gff_pred->features); j++) { if (is_exon((GFF_Feature*)lst_get_ptr(gff_pred->features, j), feat_list)) { nexons_pred++; nwe++; } } compute_and_print_stats(stdout, real_fname, pred_fname, tp, fp, nreal_pos, npred_pos, seqlen, ncr, npca, nola, nme, npcp, nolp, nwe, nexons_real, nexons_pred, nnc); tot_tp += tp; tot_fp += fp; tot_nreal_pos += nreal_pos; tot_npred_pos += npred_pos; tot_seqlen += seqlen; tot_ncr += ncr; tot_npca += npca; tot_nola += nola; tot_nme += nme; tot_npcp += npcp; tot_nolp += nolp; tot_nwe += nwe; tot_nexons_real += nexons_real; tot_nexons_pred += nexons_pred; if (nnc != -1) tot_nnc += nnc; if (dump_exons && SUMF != NULL) fprintf(SUMF, "# Total number of bases in real exons: %d\n", nreal_pos); gff_free_set(gff_real); 
gff_free_set(gff_pred); } if (lst_size(real_fname_list) > 1) compute_and_print_stats(stdout, str_new_charstr("TOTAL"), str_new_charstr(""), tot_tp, tot_fp, tot_nreal_pos, tot_npred_pos, tot_seqlen, tot_ncr, tot_npca, tot_nola, tot_nme, tot_npcp, tot_nolp, tot_nwe, tot_nexons_real, tot_nexons_pred, tot_nnc); return 0; }
/* Read a CategoryMap from a file */ CategoryMap *cm_read(FILE *F) { String *line, *name; List *l; int cat, cat2, lineno, i, cm_read_error; CategoryMap *cm = NULL; CategoryRange *existing_range; static Regex *cat_range_re = NULL; static Regex *ncats_re = NULL; static Regex *fill_re = NULL; static Regex *label_re = NULL; static Regex *extend_re = NULL; int has_dependencies = 0; line = str_new(STR_SHORT_LEN); l = lst_new_ptr(3); if (cat_range_re == NULL) { cat_range_re = str_re_new("^[[:space:]]*([^[:space:]]+)[[:space:]]+([[:digit:]]+)(-([[:digit:]]+))?([[:space:]]+([[:digit:]].*))?"); ncats_re = str_re_new("^[[:space:]]*NCATS[[:space:]]*=[[:space:]]*([[:digit:]]+)"); fill_re = str_re_new("^[[:space:]]*FILL_PRECEDENCE[[:space:]]*=[[:space:]]*(.*)$"); label_re = str_re_new("^[[:space:]]*LABELLING_PRECEDENCE[[:space:]]*=[[:space:]]*(.*)$"); extend_re = str_re_new("^[[:space:]]*FEATURE_EXTEND[[:space:]]*:[[:space:]]*(.+)[[:space:]]*\\((.+)\\)$"); } lineno = 0; while ((str_readline(line, F)) != EOF) { lineno++; str_trim(line); if (str_equals_charstr(line, "")) continue; if (str_re_match(line, ncats_re, l, 1) >= 0) { /* NCATS line */ int ncats; str_as_int(lst_get_ptr(l, 1), &ncats); cm = cm_new(ncats); /* 0th category is "background" */ cm->ranges[0] = cm_new_category_range(str_new_charstr(BACKGD_CAT_NAME), 0, 0); } else if (cm == NULL || cm->ncats == 0) die("ERROR: NCATS line must appear first, and must specify a positive number of categories.\n"); else if (str_re_match(line, label_re, l, 1) >= 0) { /* LABELLING_PRECEDENCE line */ List *tmpl = lst_new_ptr(cm->ncats); int tmpi; str_split((String*)lst_get_ptr(l, 1), " ,", tmpl); for (i = 0; i < lst_size(tmpl); i++) { String *s = (String*)lst_get_ptr(tmpl, i); if (str_as_int(s, &tmpi) != 0 || tmpi < 0 || tmpi > cm->ncats) die("ERROR: bad integer in LABELLING_PRECEDENCE.\n"); cm->labelling_precedence[tmpi] = i; str_free(s); } lst_free(tmpl); } else if (str_re_match(line, fill_re, l, 1) >= 0) { /* FILL_PRECEDENCE line */ 
List *tmpl = lst_new_ptr(cm->ncats); int tmpi; str_split(lst_get_ptr(l, 1), " ,", tmpl); for (i = 0; i < lst_size(tmpl); i++) { String *s = lst_get_ptr(tmpl, i); if (str_as_int(s, &tmpi) != 0 || tmpi < 0 || tmpi > cm->ncats) die("ERROR: bad integer in FILL_PRECEDENCE.\n"); cm->fill_precedence[tmpi] = i; str_free(s); } lst_free(tmpl); } else if (str_re_match(line, extend_re, l, 2) >= 0) { /* FEATURE_EXTEND line */ String *target = lst_get_ptr(l, 2); List *sources = lst_new_ptr(2); str_split(lst_get_ptr(l, 1), " ,", sources); if (cm == NULL || (cat = cm_get_category(cm, target)) == 0) die("ERROR: FEATURE_EXTEND target must be a previously-defined non-background feature type.\n"); for (i = 0; i < lst_size(sources); i++) { if (cm_get_category(cm, lst_get_ptr(sources, i)) == 0) die("ERROR: FEATURE_EXTEND source list must consist of previously-defined non-background feature types.\n"); } } else { /* 'range' line */ if (str_re_match(line, cat_range_re, l, 6) < 0) die("ERROR at line %d: '%s'\n", lineno, line->chars); name = str_dup((String*)lst_get_ptr(l, 1)); str_as_int((String*)lst_get_ptr(l, 2), &cat); cat2 = cat; if (lst_get_ptr(l, 4) != NULL) str_as_int((String*)lst_get_ptr(l, 4), &cat2); if (cat < 0 || cat2 < cat || cat2 > cm->ncats) die("ERROR: Illegal category range.\n"); /* check for existing definitions of the specified category range. 
Either no such definition must exist, or one must exist that spans exactly the same category numbers */ existing_range = NULL; cm_read_error = 0; for (i = cat; !cm_read_error && i <= cat2; i++) { if (cm->ranges[i] != NULL && existing_range == NULL) existing_range = cm->ranges[i]; else if (cm->ranges[i] != existing_range) cm_read_error = 1; } if (existing_range != NULL && (existing_range->start_cat_no != cat || existing_range->end_cat_no != cat2)) cm_read_error = 1; if (cm_read_error) die("ERROR: Overlapping category ranges.\n"); /* either add new category range, or add new type to existing one */ if (existing_range != NULL) { lst_push_ptr(existing_range->feature_types, name); } else { CategoryRange *cr = cm_new_category_range(name, cat, cat2); for (i = cat; i <= cat2; i++) cm->ranges[i] = cr; } /* now address "conditioned_on" dependencies, if they have been specified */ if (lst_get_ptr(l, 6) != NULL) { if (existing_range != NULL) fprintf(stderr, "WARNING: ignoring 'conditioned on' list for type '%s'\n", name->chars); else { List *tmpl = lst_new_ptr(cm->ncats); int tmpi; if (cm->conditioned_on[cat] != NULL) die("ERROR cm_read: cm->conditioned_on[%i] should be NULL\n", cat); str_split((String*)lst_get_ptr(l, 6), " ,", tmpl); cm->conditioned_on[cat] = lst_new_int(lst_size(tmpl)); for (i = cat + 1; i <= cat2; i++) cm->conditioned_on[i] = cm->conditioned_on[cat]; /* all categories in range point to same "conditioned on" list */ for (i = 0; i < lst_size(tmpl); i++) { String *s = (String*)lst_get_ptr(tmpl, i); if (str_as_int(s, &tmpi) != 0 || tmpi < 0 || tmpi > cm->ncats) die("ERROR: bad integer in 'conditioned on' list for type '%s'.\n", name->chars); lst_push_int(cm->conditioned_on[cat], tmpi); str_free(s); } lst_free(tmpl); has_dependencies = 1; } } } for (i = 0; i < lst_size(l); i++) if (lst_get_ptr(l, i) != NULL) str_free((String*)lst_get_ptr(l, i)); } /* make sure every category has been specified */ for (i = 0; i <= cm->ncats; i++) if (cm->ranges[i] == 0) 
die("ERROR: category %d has not been specified.\n", i); /* build unspooler, if necessary */ if (has_dependencies) cm->unspooler = cm_create_unspooler(cm->ncats + 1, cm->conditioned_on); str_free(line); lst_free(l); return cm; }
/* Write the GFF group attribute for group number 'groupno' into
   'buf' (capacity 'buflen'), as <tag> "<idpref>.<groupno>" or, when
   'idpref' is NULL, <tag> "<groupno>".  The tag defaults to "id" when
   'grouptag' is NULL.  Output is truncated rather than overflowing
   the buffer. */
static void cm_format_group_str(char *buf, size_t buflen,
                                const char *grouptag, const char *idpref,
                                int groupno) {
  const char *tag = grouptag != NULL ? grouptag : "id";
  if (idpref != NULL)
    snprintf(buf, buflen, "%s \"%s.%d\"", tag, idpref, groupno);
  else
    snprintf(buf, buflen, "%s \"%d\"", tag, groupno);
}

/* Create a GFF_Set from a sequence of category/state numbers, using
   a specified category map and mapping from raw state numbers to
   category numbers.

   path          state number for each of the 'length' positions
   path_to_cat   maps a state number to a category number
   reverse_compl indexed by state number; nonzero means the feature is
                 reported on the '-' strand
   seqname, source
                 copied into every feature created
   frame_cats    optional list of feature-type names for which a frame
                 is reported (NULL for none); names that map to
                 category 0 are ignored
   grouptag      attribute tag for the group string ("id" if NULL)
   idpref        optional prefix for the group identifier (may be NULL)

   One feature is emitted per maximal run of positions whose
   categories belong to the same category range.  Runs of category 0
   are not emitted when category 0 is the background category.  The
   group number is incremented after each run of category 0 that does
   not start at the first position.  Returns the new set, which is
   empty if length <= 0. */
GFF_Set *cm_labeling_as_gff(CategoryMap *cm, int *path, int length,
                            int *path_to_cat, int *reverse_compl,
                            char *seqname, char *source, List *frame_cats,
                            char *grouptag, char *idpref ) {
  int beg, end, i, cat, frame, groupno;
  GFF_Set *gff = gff_new_set_init("PHAST", PHAST_VERSION);
  int do_frame[cm->ncats+1];
  char strand;
  char groupstr[STR_SHORT_LEN];
  int ignore_0 = str_equals_charstr(cm_get_feature(cm, 0), BACKGD_CAT_NAME);
                                /* ignore category 0 if background */

  if (length <= 0) return gff;

  /* flag the categories for which a frame should be reported */
  for (i = 0; i <= cm->ncats; i++) do_frame[i] = 0;
  if (frame_cats != NULL)
    for (i = 0; i < lst_size(frame_cats); i++) {
      int fcat = cm_get_category(cm, lst_get_ptr(frame_cats, i));
      if (fcat != 0)            /* ignore background or unrecognized name */
        do_frame[fcat] = 1;
    }

  groupno = 1;
  cm_format_group_str(groupstr, sizeof groupstr, grouptag, idpref, groupno);

  i = 0;
  while (i < length) {
    checkInterruptN(i, 10000);
    cat = cm->ranges[path_to_cat[path[i]]]->start_cat_no;
    strand = reverse_compl[path[i]] ? '-' : '+';
    /* frame is the offset of this position's category from the start
       of its category range */
    frame = do_frame[cat] ? path_to_cat[path[i]] - cat : GFF_NULL_FRAME;

    /* scan ahead until enter new category range (or reach end of seq) */
    beg = i + 1;                /* begin of feature (GFF coords) */
    i++;
    while (i < length &&
           cm->ranges[path_to_cat[path[i]]]->start_cat_no == cat)
      i++;
    end = i;                    /* end of feature (GFF coords) */

    /* if minus strand, adjust frame to reflect end */
    if (strand == '-' && do_frame[cat])
      frame = path_to_cat[path[i-1]] - cat;

    /* if legitimate feature (non-background), then incorp into GFF_Set */
    if (cat != 0 || !ignore_0)  /* create new feature and add */
      lst_push_ptr(gff->features,
                   gff_new_feature(str_new_charstr(seqname),
                                   str_new_charstr(source),
                                   str_dup(cm_get_feature(cm, cat)),
                                   beg, end, 0, strand, frame,
                                   str_new_charstr(groupstr), TRUE));

    if (cat == 0 && beg > 1) {
      groupno++;                /* increment group number each time a
                                   sequence of 0s is encountered */
      cm_format_group_str(groupstr, sizeof groupstr, grouptag, idpref,
                          groupno);
    }
  }

  return gff;
}