/* Map a spooled state to its unspooled state, given the spooled states
   that preceded it.

   The last item in predecessors is assumed to be the most recently
   visited.  Starting from the unspool-tree node registered for
   spooled_state, the predecessors are consumed from most recent to
   oldest; each time one matches the oldstate of a child of the current
   node, the walk descends to that child.  The walk stops at the first
   node whose newstate is not -1.

   Returns that node's newstate, or -1 (after printing a diagnostic to
   stderr) if the predecessors run out before such a node is reached. */
int cm_get_unspooled_state(CategoryMap *cm, int spooled_state,
                           List *predecessors) {
  UnspoolNode *n, *child;
  int p, pred_idx, i;

  pred_idx = lst_size(predecessors) - 1;
  n = cm->unspooler->spooled_to_unspooled[spooled_state];

  while (n->newstate == -1) {
    /* child == n signals "descended one level"; start out unmatched */
    child = NULL;
    while (n != child && pred_idx >= 0) {
      p = lst_get_int(predecessors, pred_idx--);
      for (i = 0; n != child && i < lst_size(n->children); i++) {
        child = (UnspoolNode*)lst_get_ptr(n->children, i);
        if (child->oldstate == p)
          n = child;
      }
    }

    if (n != child) {
      /* ran out of predecessors without finding a matching child */
      fprintf(stderr, "ERROR (cm_get_unspooled_state): no match for state %d preceded by state(s) ", spooled_state);
      for (i = 0; i < lst_size(predecessors); i++)
        fprintf(stderr, "%d ", lst_get_int(predecessors, i));
      fprintf(stderr, "\n");
      return -1;
    }
  }

  return n->newstate;
}
/* Translate a list of category names and/or numbers into a list of
   category names.

   No new Strings are allocated: each element of the returned list points
   either at a String obtained from cm_get_feature() or at the
   corresponding String in 'names'.

   - An entry that str_as_int() parses (return value 0) is treated as a
     category number; this requires a non-NULL cm and a number in
     [0, cm->ncats], otherwise die() is called.
   - Any other entry is looked up with cm_get_category() when cm is
     available.  A result of 0 for a name that is not the name of
     category 0 is fatal unless ignore_missing is set.
   - Without a cm, names are passed through unchanged. */
List *cm_get_category_str_list(CategoryMap *cm, List *names,
                               int ignore_missing) {
  int idx, catno;
  List *result = lst_new_ptr(lst_size(names));

  for (idx = 0; idx < lst_size(names); idx++) {
    String *name = lst_get_ptr(names, idx);

    if (str_as_int(name, &catno) == 0) {
      /* numeric specification */
      if (cm == NULL)
        die("ERROR: if categories are specified by number, a category map is required\n");
      if (catno < 0 || (cm != NULL && catno > cm->ncats))
        die("ERROR: category number %d is out of bounds.\n", catno);
      lst_push_ptr(result, cm_get_feature(cm, catno));
      continue;
    }

    if (cm == NULL) {
      /* no map to resolve against: hand back the caller's own String */
      lst_push_ptr(result, name);
      continue;
    }

    catno = cm_get_category(cm, name);
    if (catno == 0 && !ignore_missing &&
        !str_equals(name, cm_get_feature(cm, 0))) {
      die("ERROR: illegal category name (\"%s\")\n", name->chars);
    }
    /* prefer pointers into the category map when one is available */
    lst_push_ptr(result, cm_get_feature(cm, catno));
  }

  return result;
}
/* Finalize every output file registered in outfileHash, then release
   both containers.

   Pass 1: every handle that is still open (non-NULL in outfileList) is
   handed to mafBlock_close_outfile().
   Pass 2: every file whose handle was already NULL is reopened in append
   mode and handed to mafBlock_close_outfile() as well, so that it also
   receives the closer (see the comment at get_outfile).

   The two passes are kept separate so all open handles are released
   before any file is reopened. */
void close_outfiles(List *outfileList, Hashtable *outfileHash) {
  List *keys = hsh_keys(outfileHash);
  int nkeys = lst_size(keys);
  int *finished = smalloc(nkeys*sizeof(int));
  int k;

  for (k = 0; k < nkeys; k++) {
    char *fname = (char*)lst_get_ptr(keys, k);
    int slot = hsh_get_int(outfileHash, fname);
    FILE *fh = (FILE*)lst_get_ptr(outfileList, slot);

    finished[k] = (fh != NULL);
    if (fh != NULL)
      mafBlock_close_outfile(fh);
  }

  for (k = 0; k < nkeys; k++) {
    if (!finished[k]) {
      char *fname = (char*)lst_get_ptr(keys, k);
      FILE *fh = phast_fopen(fname, "a");
      mafBlock_close_outfile(fh);
    }
  }

  sfree(finished);
  lst_free(keys);
  lst_free(outfileList);
  hsh_free(outfileHash);
}
/* Add a leaf named lname on the root branch of tree t.

   The identity of the root node is preserved: a new internal node takes
   over the root's two current children and becomes the root's left
   child, while the new leaf becomes the root's right child.  Both new
   nodes are appended to t->nodes and their ids are set to their position
   in that list.

   lgroup is stored in the new leaf's dparent field.  The root's dparent
   is set to lgroup if it already equalled lgroup, and to 0 otherwise;
   the new internal node inherits the root's previous dparent. */
void tr_add_leaf_at_root(TreeNode *t, char *lname, int lgroup) {
  TreeNode *inner = tr_new_node();
  TreeNode *leaf = tr_new_node();
  TreeNode *old_left = t->lchild;
  TreeNode *old_right = t->rchild;

  strcpy(leaf->name, lname);
  leaf->dparent = lgroup;

  /* the new internal node adopts the root's former children */
  inner->lchild = old_left;
  inner->rchild = old_right;
  old_left->parent = inner;
  old_right->parent = inner;

  /* the root now has the new internal node and the new leaf as children */
  t->lchild = inner;
  t->rchild = leaf;
  inner->parent = t;
  leaf->parent = t;

  inner->dparent = t->dparent;
  if (lgroup == inner->dparent)
    t->dparent = lgroup;
  else
    t->dparent = 0;

  /* register both nodes; id == index in t->nodes (bypasses the normal
     id assignment) */
  lst_push_ptr(t->nodes, inner);
  inner->id = lst_size(t->nodes) - 1;
  lst_push_ptr(t->nodes, leaf);
  leaf->id = lst_size(t->nodes) - 1;

  t->nnodes += 2;
}
/* Trim stop codons off the CDS features of a group.

   Every CDS on the '+' strand whose end coincides with the end of a
   '+'-strand stop feature has its end moved back by 3; every CDS on the
   '-' strand whose start coincides with the start of a '-'-strand stop
   feature has its start moved forward by 3.  Each modified feature is
   recorded in ends_adjusted or starts_adjusted (both are cleared first)
   so the change can be undone before the data is output. */
void exclude_stops(GFF_FeatureGroup *group, List *starts_adjusted,
                   List *ends_adjusted) {
  List *stop_feats = lst_new_ptr(1);
  List *feats = group->features;
  GFF_Feature *f;
  int fi, si;

  lst_clear(stop_feats);
  lst_clear(ends_adjusted);
  lst_clear(starts_adjusted);

  /* collect the stop features; one is expected, but there may be more */
  for (fi = 0; fi < lst_size(feats); fi++) {
    f = lst_get_ptr(feats, fi);
    if (str_equals_charstr(f->feature, GFF_STOP_TYPE))
      lst_push_ptr(stop_feats, f);
  }

  /* compare every CDS against every stop */
  for (fi = 0; fi < lst_size(feats); fi++) {
    f = lst_get_ptr(feats, fi);
    if (!str_equals_charstr(f->feature, GFF_CDS_TYPE))
      continue;

    for (si = 0; si < lst_size(stop_feats); si++) {
      GFF_Feature *stop = lst_get_ptr(stop_feats, si);

      if (f->strand == '+' && stop->strand == '+' &&
          f->end == stop->end) {
        f->end -= 3;
        lst_push_ptr(ends_adjusted, f);
      }
      else if (f->strand == '-' && stop->strand == '-' &&
               f->start == stop->start) {
        f->start += 3;
        lst_push_ptr(starts_adjusted, f);
      }
    }
  }

  lst_free(stop_feats);
}
/* Add a leaf named lname on an internal branch of tree t.

   'branch' indexes t->nodes and identifies the node beneath the branch
   to be split; it must not be the root (die() is called if it is).  A
   new internal node is inserted between that node and its parent, with
   the existing node as its left child and the new leaf as its right
   child.  Both new nodes are appended to t->nodes and their ids are set
   to their position in that list.

   lgroup is stored in the new leaf's dparent field, and also in the new
   internal node's dparent when lgroup is positive and equals the
   dparent of the existing node. */
void tr_add_leaf_internal(TreeNode *t, int branch, char *lname, int lgroup) {
  TreeNode *below = lst_get_ptr(t->nodes, branch);
  TreeNode *above, *inner, *leaf;

  if (below == t)
    die("ERROR tr_add_leaf_internal: oldnode == t\n");

  inner = tr_new_node();
  leaf = tr_new_node();
  strcpy(leaf->name, lname);
  leaf->dparent = lgroup;

  /* hang the new leaf and the existing subtree under the new node */
  inner->rchild = leaf;
  leaf->parent = inner;
  inner->lchild = below;

  /* splice the new node into the slot the existing node occupied */
  above = below->parent;
  inner->parent = above;
  if (above->lchild == below)
    above->lchild = inner;
  else
    above->rchild = inner;
  below->parent = inner;

  if (lgroup > 0 && lgroup == below->dparent)
    inner->dparent = lgroup;

  /* register both nodes; id == index in t->nodes (bypasses the normal
     id assignment) */
  lst_push_ptr(t->nodes, inner);
  inner->id = lst_size(t->nodes) - 1;
  lst_push_ptr(t->nodes, leaf);
  leaf->id = lst_size(t->nodes) - 1;

  t->nnodes += 2;
}
/* Remove from a block every sub-block whose first line type is 'e'.
   Builds a keep/drop flag per sub-block and delegates the removal to
   mafBlock_remove_lines(). */
void mafBlock_strip_eLines(MafBlock *block) {
  int nsub = lst_size(block->data);
  int *keep = smalloc(nsub*sizeof(int));
  int idx;

  for (idx = 0; idx < nsub; idx++) {
    MafSubBlock *sub = (MafSubBlock*)lst_get_ptr(block->data, idx);
    keep[idx] = (sub->lineType[0] != 'e');
  }

  mafBlock_remove_lines(block, keep);
  sfree(keep);
}
/* Build a simple category map with one category per distinct feature
   type found in a GFF_Set.  Category numbers follow the order in which
   the types first appear; category 0 is named BACKGD_CAT_NAME.  The type
   names stored in the map are copies, not pointers into feats. */
CategoryMap* cm_new_from_features(GFF_Set *feats) {
  Hashtable *seen = hsh_new(10);
  List *types = lst_new_ptr(10);
  CategoryMap *map;
  int i;

  /* pass 1: collect each feature type once, in order of appearance */
  for (i = 0; i < lst_size(feats->features); i++) {
    GFF_Feature *f = lst_get_ptr(feats->features, i);
    checkInterruptN(i, 10000);
    if (hsh_get(seen, f->feature->chars) != (void*)-1)
      continue;                 /* type already recorded */
    lst_push_ptr(types, f->feature);
    hsh_put_int(seen, f->feature->chars, 1);
  }
  hsh_free(seen);

  /* pass 2: one single-number range per category */
  map = cm_new(lst_size(types));
  for (i = 0; i <= map->ncats; i++) {
    String *type;
    if (i == 0)
      type = str_new_charstr(BACKGD_CAT_NAME);
    else
      type = str_dup(lst_get_ptr(types, i-1));
    map->ranges[i] = cm_new_category_range(type, i, i);
  }

  lst_free(types);
  return map;
}
/* Build an Unspooler for nstates_spooled spooled states.

   conditioned_on must be an array of integer lists; the ith element is
   the list of state numbers on which the ith state is conditioned (NULL
   or empty if none).

   For every spooled state a tree of UnspoolNodes is grown depth-first
   using an explicit stack: a node whose state is conditioned on nothing
   becomes a leaf and receives the next unspooled state number; otherwise
   one child is created per state it is conditioned on.  For each leaf,
   unspooled_to_spooled records the spooled state at the root of its
   tree.  A state reached twice within one tree is reported as a cycle
   via die(). */
Unspooler *cm_create_unspooler(int nstates_spooled, List **conditioned_on) {
  Unspooler *unsp;
  UnspoolNode *node;
  Stack *pending;
  int *visited;
  int root, k, capacity;

  unsp = (Unspooler*)smalloc(sizeof(Unspooler));
  unsp->nstates_spooled = nstates_spooled;
  unsp->nstates_unspooled = 0;
  unsp->spooled_to_unspooled =
    (UnspoolNode**)smalloc(nstates_spooled * sizeof(UnspoolNode*));

  /* initial guess; grown on demand below */
  capacity = nstates_spooled * nstates_spooled;
  unsp->unspooled_to_spooled = (int*)smalloc(capacity * sizeof(int));

  visited = (int*)smalloc(nstates_spooled * sizeof(int));
  pending = stk_new_ptr(nstates_spooled);

  for (root = 0; root < nstates_spooled; root++) {
    /* marks are per-tree: reset before growing each one */
    for (k = 0; k < nstates_spooled; k++)
      visited[k] = 0;

    unsp->spooled_to_unspooled[root] = cm_new_unspool_node(root);
    stk_push_ptr(pending, unsp->spooled_to_unspooled[root]);

    while ((node = (UnspoolNode*)stk_pop_ptr(pending)) != NULL) {
      List *deps = conditioned_on[node->oldstate];

      if (deps == NULL || lst_size(deps) == 0) {
        /* leaf: allocate a new unspooled state and map it back */
        node->newstate = unsp->nstates_unspooled++;
        if (node->newstate >= capacity) {
          capacity *= 2;
          unsp->unspooled_to_spooled =
            (int*)srealloc(unsp->unspooled_to_spooled,
                           capacity * sizeof(int));
        }
        unsp->unspooled_to_spooled[node->newstate] = root;
        continue;
      }

      for (k = 0; k < lst_size(deps); k++) {
        int dep = lst_get_int(deps, k);
        UnspoolNode *kid;

        if (visited[dep] == 1)
          die("ERROR: cycle in 'conditioned_on' dependencies.\n");
        visited[dep] = 1;

        kid = cm_new_unspool_node(dep);
        lst_push_ptr(node->children, kid);
        stk_push_ptr(pending, kid);
      }
    }
  }

  stk_free(pending);
  sfree(visited);
  return unsp;
}
/* Write a MAF block to outfile.

   Nothing is written if the block has no sub-blocks or consists only of
   gaps.  Otherwise all-gap columns are removed, the 'a' line is written,
   and each line of each sub-block is written according to its type
   ('s', 'e', 'i' or 'q'; any other type is fatal), with columns padded
   to the widths reported by mafBlock_get_fieldSizes().  A blank line
   terminates the block.

   If pretty_print is non-zero, the first 's' sequence is written in full
   and every later 's' sequence has each character that matches the first
   sequence (case-insensitively) replaced by '.'. */
void mafBlock_print(FILE *outfile, MafBlock *block, int pretty_print) {
  int i, j, k, numSpace;
  int fieldSize[6];  /* maximum # of characters in the first 6 fields of block */
  MafSubBlock *sub;
  char firstChar, formatstr[1000];
  char *firstseq = NULL;

  /* if processing has reduced the number of species with data to zero,
     or has reduced the block to all gaps, don't print */
  if (lst_size(block->data) == 0 || mafBlock_all_gaps(block)) return;
  mafBlock_remove_gap_cols(block);
  mafBlock_get_fieldSizes(block, fieldSize);

  fprintf(outfile, "%s\n", block->aLine->chars);
  for (i = 0; i < lst_size(block->data); i++) {
    sub = (MafSubBlock*)lst_get_ptr(block->data, i);
    for (j = 0; j < sub->numLine; j++) {
      firstChar = sub->lineType[j];
      if (firstChar == 's' || firstChar == 'e') {
        /* build a format string with the per-block column widths */
        sprintf(formatstr, "%%c %%-%is %%%ii %%%ii %%c %%%ii ",
                fieldSize[1], fieldSize[2], fieldSize[3], fieldSize[5]);
        fprintf(outfile, formatstr, firstChar, sub->src->chars,
                sub->start, sub->size, sub->strand, sub->srcSize);
        if (firstChar == 's') {
          if (firstseq == NULL) {
            fprintf(outfile, "%s\n", sub->seq->chars);
            if (pretty_print) firstseq = sub->seq->chars;
          }
          else {
            for (k = 0; k < block->seqlen; k++)
              fputc(tolower((unsigned char)sub->seq->chars[k]) ==
                    tolower((unsigned char)firstseq[k]) ?
                    '.' : sub->seq->chars[k],
                    outfile);
            /* terminate the record: every other branch ends its line,
               and without this the next line would be appended to the
               dotted sequence */
            fputc('\n', outfile);
          }
        }
        else fprintf(outfile, "%c\n", sub->eStatus);
      }
      else if (firstChar == 'i') {
        sprintf(formatstr, "i %%-%is %%c %%i %%c %%i", fieldSize[1]);
        fprintf(outfile, formatstr, sub->src->chars,
                sub->iStatus[0], sub->iCount[0],
                sub->iStatus[1], sub->iCount[1]);
        fputc('\n', outfile);
      }
      else {
        if (firstChar != 'q')
          die("ERROR mafBlock_print: firstChar should be q, got %c\n", firstChar);
        sprintf(formatstr, "q %%-%is", fieldSize[1]);
        fprintf(outfile, formatstr, sub->src->chars);
        /* pad so the quality string lines up under the sequence column */
        numSpace = 6 + fieldSize[2] + fieldSize[3] + fieldSize[5];
        for (k = 0; k < numSpace; k++) fputc(' ', outfile);
        fprintf(outfile, "%s\n", sub->quality->chars);
      }
    }
  }
  fputc('\n', outfile);  /* blank line to mark end of block */
}
/* Write a CategoryMap to F.

   Output consists of an NCATS line, one line per feature type of every
   category range (type name, category number or number range, and, when
   present, the comma-separated list of states the category is
   conditioned on), followed by LABELLING_PRECEDENCE and FILL_PRECEDENCE
   lines.

   Each precedence line is reconstructed by sorting the category numbers
   0..ncats with compare_prec, which reads the file-level 'prec' pointer
   set just before each sort; categories whose precedence is -1 are
   omitted. */
void cm_print(CategoryMap *cm, FILE *F) {
  int catno, t, c, rank;
  List *order;

  fprintf(F, "NCATS = %d\n\n", cm->ncats);

  catno = 1;
  while (catno <= cm->ncats) {
    CategoryRange *range = cm->ranges[catno];

    for (t = 0; t < lst_size(range->feature_types); t++) {
      String *type = (String*)lst_get_ptr(range->feature_types, t);

      fprintf(F, "%-15s %d", type->chars, range->start_cat_no);
      if (range->end_cat_no > range->start_cat_no)
        fprintf(F, "-%d", range->end_cat_no);

      if (cm->conditioned_on[catno] != NULL) {
        fprintf(F, "\t");
        for (c = 0; c < lst_size(cm->conditioned_on[catno]); c++)
          fprintf(F, "%d%s",
                  lst_get_int(cm->conditioned_on[catno], c),
                  c + 1 == lst_size(cm->conditioned_on[catno]) ? "" : ",");
      }
      fprintf(F, "\n");
    }

    /* jump past the whole range so it is visited only once */
    catno = range->end_cat_no + 1;
  }

  /* reconstruct precedence lists */
  order = lst_new_int(cm->ncats + 1);

  for (rank = 0; rank <= cm->ncats; rank++)
    lst_push_int(order, rank);
  prec = cm->labelling_precedence;
  lst_qsort(order, compare_prec);
  fprintf(F, "\nLABELLING_PRECEDENCE = ");
  for (rank = 0; rank <= cm->ncats; rank++) {
    int cat = lst_get_int(order, rank);
    if (cm->labelling_precedence[cat] != -1)
      fprintf(F, "%d%s", cat, rank < cm->ncats ? "," : "");
  }
  fprintf(F, "\n");

  lst_clear(order);
  for (rank = 0; rank <= cm->ncats; rank++)
    lst_push_int(order, rank);
  prec = cm->fill_precedence;
  lst_qsort(order, compare_prec);
  fprintf(F, "FILL_PRECEDENCE = ");
  for (rank = 0; rank <= cm->ncats; rank++) {
    int cat = lst_get_int(order, rank);
    if (cm->fill_precedence[cat] != -1)
      fprintf(F, "%d%s", cat, rank < cm->ncats ? "," : "");
  }
  fprintf(F, "\n");

  lst_free(order);
}
/* Undo the coordinate changes recorded in starts_adjusted and
   ends_adjusted: every CDS feature of the group found in ends_adjusted
   gets 3 added back to its end; otherwise, if it is found in
   starts_adjusted, 3 is subtracted from its start.  Does nothing when
   both lists are empty. */
void restore_stops(GFF_FeatureGroup *group, List *starts_adjusted,
                   List *ends_adjusted) {
  int idx;

  if (lst_size(ends_adjusted) != 0 || lst_size(starts_adjusted) != 0) {
    for (idx = 0; idx < lst_size(group->features); idx++) {
      GFF_Feature *f = lst_get_ptr(group->features, idx);

      if (!str_equals_charstr(f->feature, GFF_CDS_TYPE))
        continue;

      if (lst_find_ptr(ends_adjusted, f) != -1)
        f->end += 3;
      else if (lst_find_ptr(starts_adjusted, f) != -1)
        f->start -= 3;
    }
  }
}
/* R entry point: return a numeric vector holding, for each node of the
   tree built from treeP (in tr_preorder order), the value of
   tr_distance_to_root() for that node. */
SEXP rph_tree_summary_depth(SEXP treeP) {
  TreeNode *tree = rph_tree_new(treeP);
  List *order = tr_preorder(tree);
  SEXP depths = PROTECT(NEW_NUMERIC(lst_size(order)));
  double *out = NUMERIC_POINTER(depths);
  int pos;

  for (pos = 0; pos < lst_size(order); pos++) {
    TreeNode *cur = (TreeNode*)lst_get_ptr(order, pos);
    out[pos] = tr_distance_to_root(cur);
  }

  UNPROTECT(1);
  return depths;
}
/* Given the 5' and 3' splice-site features extracted from a group,
   check that each intron has an acceptable pair of splice-site
   dinucleotides (GT-AG, GC-AG or AT-AC, as judged by is_signal) in every
   sequence of the alignment.

   The list is sorted in place along the strand of its first element
   (all features are assumed to share a strand).  Every 5' splice site
   immediately followed by a 3' splice site is treated as one intron.
   For the first sequence in which an intron fails, a BAD_INTRON problem
   is added for both features.

   Returns 1 if every intron passes (or there are fewer than two
   features), 0 otherwise. */
int are_introns_okay(List *intron_splice, MSA *msa, List *problems,
                     int offset5, int offset3) {
  char * splice_pairs[3] = {"GTAG", "GCAG", "ATAC"};
  char donor[3], acceptor[3], both[5];
  char strand;
  int idx, seq, pos5, pos3;
  int ok = 1;

  donor[2] = '\0';
  acceptor[2] = '\0';

  if (lst_size(intron_splice) < 2) return 1;

  /* assume all same strand */
  strand = ((GFF_Feature*)lst_get_ptr(intron_splice, 0))->strand;
  if (strand == '+')
    lst_qsort(intron_splice, feature_comparator_ascending);
  else
    lst_qsort(intron_splice, feature_comparator_descending);

  for (idx = 0; idx < lst_size(intron_splice) - 1; idx++) {
    GFF_Feature *site5 = lst_get_ptr(intron_splice, idx);
    GFF_Feature *site3 = lst_get_ptr(intron_splice, idx+1);

    /* only a 5' site directly followed by a 3' site forms a pair */
    if (!(str_starts_with_charstr(site5->feature, SPLICE_5) &&
          str_starts_with_charstr(site3->feature, SPLICE_3)))
      continue;

    pos5 = site5->start - 1 + (strand == '-' ? offset5 : 0);
    pos3 = site3->start - 1 + (strand == '+' ? offset3 : 0);

    for (seq = 0; seq < msa->nseqs; seq++) {
      donor[0] = ss_get_char_tuple(msa, msa->ss->tuple_idx[pos5], seq, 0);
      donor[1] = ss_get_char_tuple(msa, msa->ss->tuple_idx[pos5+1], seq, 0);
      acceptor[0] = ss_get_char_tuple(msa, msa->ss->tuple_idx[pos3], seq, 0);
      acceptor[1] = ss_get_char_tuple(msa, msa->ss->tuple_idx[pos3+1], seq, 0);

      if (strand == '-') {
        msa_reverse_compl_seq(donor, 2);
        msa_reverse_compl_seq(acceptor, 2);
      }

      strcpy(both, donor);
      strcat(both, acceptor);

      if (!is_signal(both, 3, splice_pairs, msa->is_missing)) {
        problem_add(problems, site5, BAD_INTRON, -1, -1);
        problem_add(problems, site3, BAD_INTRON, -1, -1);
        ok = 0;
        break;
      }
    }

    idx++;  /* the 3' site has been consumed; skip over it */
  }

  return ok;
}
/* R entry point: return a character vector with the feature-type string
   of every feature in the GFF_Set referenced by gffP, in list order. */
SEXP rph_gff_features(SEXP gffP) {
  GFF_Set *set = (GFF_Set*)EXTPTR_PTR(gffP);
  SEXP types;
  int pos;

  PROTECT(types = allocVector(STRSXP, lst_size(set->features)));
  for (pos = 0; pos < lst_size(set->features); pos++) {
    GFF_Feature *feat;
    checkInterruptN(pos, 1000);
    feat = (GFF_Feature*)lst_get_ptr(set->features, pos);
    SET_STRING_ELT(types, pos, mkChar(feat->feature->chars));
  }
  UNPROTECT(1);

  return types;
}
/* R entry point: return a numeric vector with the score of every
   feature in the GFF_Set referenced by gffP, in list order. */
SEXP rph_gff_scores(SEXP gffP) {
  GFF_Set *set = (GFF_Set*)EXTPTR_PTR(gffP);
  SEXP scores;
  int pos;

  PROTECT(scores = allocVector(REALSXP, lst_size(set->features)));
  for (pos = 0; pos < lst_size(set->features); pos++) {
    GFF_Feature *feat;
    checkInterruptN(pos, 1000);
    feat = (GFF_Feature*)lst_get_ptr(set->features, pos);
    REAL(scores)[pos] = feat->score;
  }
  UNPROTECT(1);

  return scores;
}
/* Return the list of category names corresponding to a list of category
   numbers.  Each category range contributes at most one name, however
   many of its numbers appear in catnos: ranges are tracked by their
   start_cat_no.  The names are whatever cm_get_feature() returns. */
List *cm_get_features(CategoryMap *cm, List *catnos) {
  int seen[cm->ncats+1];
  List *names = lst_new_ptr(lst_size(catnos));
  int k;

  for (k = 0; k <= cm->ncats; k++)
    seen[k] = 0;

  for (k = 0; k < lst_size(catnos); k++) {
    int catno = lst_get_int(catnos, k);
    int range_start = cm->ranges[catno]->start_cat_no;

    if (seen[range_start])
      continue;               /* this range is already represented */

    lst_push_ptr(names, cm_get_feature(cm, catno));
    seen[cm->ranges[catno]->start_cat_no] = 1;
  }

  return names;
}
/* R entry point: return a character vector with the name of each node
   of the tree built from treeP, in tr_preorder order.  A node whose name
   is empty or is ";" is reported as NA. */
SEXP rph_tree_summary_nodenames(SEXP treeP) {
  TreeNode *tree = rph_tree_new(treeP);
  List *order = tr_preorder(tree);
  SEXP names = PROTECT(NEW_CHARACTER(lst_size(order)));
  int pos;

  for (pos = 0; pos < lst_size(order); pos++) {
    TreeNode *cur = (TreeNode*)lst_get_ptr(order, pos);
    int unnamed = (strlen(cur->name) == 0 || strcmp(cur->name, ";") == 0);

    if (unnamed)
      SET_STRING_ELT(names, pos, NA_STRING);
    else
      SET_STRING_ELT(names, pos, mkChar(cur->name));
  }

  UNPROTECT(1);
  return names;
}
/* R entry point: return a numeric vector with the dparent value of each
   node of the tree built from treeP, in tr_preorder order.  A node
   without a parent is reported as -1. */
SEXP rph_tree_summary_len(SEXP treeP) {
  TreeNode *tree = rph_tree_new(treeP);
  List *order = tr_preorder(tree);
  SEXP lengths = PROTECT(NEW_NUMERIC(lst_size(order)));
  double *out = NUMERIC_POINTER(lengths);
  int pos;

  for (pos = 0; pos < lst_size(order); pos++) {
    TreeNode *cur = (TreeNode*)lst_get_ptr(order, pos);
    if (cur->parent != NULL)
      out[pos] = cur->dparent;
    else
      out[pos] = -1;
  }

  UNPROTECT(1);
  return lengths;
}
/* Return the output handle for the file <out_root>.<name>.maf, opening
   it if necessary.

   outfileHash maps each file name to its index in outfileList, and
   outfileList holds the handle at that index (NULL if the file has been
   closed in the meantime).

   There may be more output files than can be held open at once (e.g.
   when splitting by feature).  So whenever an open attempt fails, the
   first open handle found in outfileList is closed and its slot set to
   NULL, and the open is retried; this repeats until the open succeeds.
   If no open handle is left to close, the program dies.  A file seen for
   the first time is opened with mafBlock_open_outfile(); a file that was
   opened before and then closed is reopened in append mode.

   close_outfiles makes sure every file is finally passed to
   mafBlock_close_outfile so it gets its closer. */
FILE *get_outfile(List *outfileList, Hashtable *outfileHash, String *name,
                  char *out_root, int argc, char *argv[]) {
  int slot, victim;
  FILE *fh, *open_fh;
  /* +7 covers the two '.' separators, "maf" and the terminator */
  char *path = smalloc((strlen(out_root)+name->length+7)*sizeof(char));

  sprintf(path, "%s.%s.maf", out_root, name->chars);
  slot = ptr_to_int(hsh_get(outfileHash, path));

  if (slot == -1) {
    /* first request for this file: register it, then open it */
    hsh_put(outfileHash, path, int_to_ptr(lst_size(outfileList)));

    while ((fh = mafBlock_open_outfile(path, argc, argv)) == NULL) {
      /* too many files are open, close one first */
      open_fh = NULL;
      victim = 0;
      while (victim < lst_size(outfileList)) {
        open_fh = (FILE*)lst_get_ptr(outfileList, victim);
        if (open_fh != NULL) break;
        victim++;
      }
      if (victim == lst_size(outfileList)) {
        die("ERROR: too many files open in maf_parse\n");
      } else {
        phast_fclose(open_fh);
        lst_set_ptr(outfileList, victim, NULL);
      }
    }

    lst_push_ptr(outfileList, (void*)fh);
    sfree(path);
    return fh;
  }

  fh = (FILE*)lst_get_ptr(outfileList, slot);
  if (fh == NULL) {
    /* has already been opened but then closed: reopen for appending */
    while ((fh = phast_fopen_no_exit(path, "a")) == NULL) {
      open_fh = NULL;
      victim = 0;
      while (victim < lst_size(outfileList)) {
        open_fh = (FILE*)lst_get_ptr(outfileList, victim);
        if (open_fh != NULL) break;
        victim++;
      }
      if (victim == lst_size(outfileList)) {
        die("ERROR: too many files open in maf_parse\n");
      } else {
        phast_fclose(open_fh);
        lst_set_ptr(outfileList, victim, NULL);
      }
    }
    lst_set_ptr(outfileList, slot, (void*)fh);
  }

  sfree(path);
  return fh;
}
/* R entry point: return an integer vector giving, for each node of the
   tree built from treeP (in tr_preorder order), the 1-based preorder
   position of its right child, or -1 if it has no right child.

   Node ids are first mapped to preorder positions through a temporary
   lookup table; an id outside [0, nnode] is fatal. */
SEXP rph_tree_summary_rchild(SEXP treeP) {
  TreeNode *tr = rph_tree_new(treeP), *node;
  int i, *rchild, nnode, *idmap;
  List *nodes = tr_preorder(tr);
  SEXP result;

  nnode = lst_size(nodes);
  result = PROTECT(NEW_INTEGER(nnode));
  rchild = INTEGER_POINTER(result);

  /* idmap[id] = preorder position of the node with that id */
  idmap = smalloc((nnode+1)*sizeof(int));
  for (i = 0; i < nnode; i++) {
    node = (TreeNode*)lst_get_ptr(nodes, i);
    if (node->id > nnode || node->id < 0)
      die("invalid id (%i) in tree node\n", node->id);
    idmap[(int)node->id] = i;
  }

  for (i = 0; i < nnode; i++) {
    node = (TreeNode*)lst_get_ptr(nodes, i);
    if (node->rchild == NULL)
      rchild[idmap[node->id]] = -1;
    else
      rchild[idmap[node->id]] = idmap[node->rchild->id] + 1;  /* 1-based for R */
  }

  /* the lookup table is only needed while filling the result */
  sfree(idmap);

  UNPROTECT(1);
  return result;
}
/* Return the start coordinate of the sub-block for species specName, or
   of the first sub-block when specName is NULL.  Returns -1 if the
   species is not in the block's specMap or the index found there lies
   beyond the end of block->data. */
long mafBlock_get_start(MafBlock *block, String *specName) {
  MafSubBlock *sub;
  int pos;

  if (specName == NULL)
    pos = 0;
  else
    pos = hsh_get_int(block->specMap, specName->chars);

  if (pos == -1 || pos >= lst_size(block->data))
    return -1;

  sub = (MafSubBlock*)lst_get_ptr(block->data, pos);
  return sub->start;
}
/* Log-probability of the base at seqData[base] under a set of Markov
   models.

   MarkovMatrices holds one matrix per order: element k is the matrix
   used when k preceding bases are available.  The highest usable order
   is chosen: it is lowered when fewer than that many bases precede
   'base', and again when an unknown base (basetocol() < 0) is found
   among the preceding bases.

   When no score can be looked up (basesToRow() reports a negative row,
   or the current base itself is unknown), the result is log(0) if
   conservative == 1 (probability 0) and 0 otherwise (probability 1). */
double calcMMscore(char *seqData, int base, List *MarkovMatrices, int conservative) {
  int i, baseAsNum, j, col;
  double val;
  int mmOrder = lst_size(MarkovMatrices)-1;
  Matrix *mm;
  /* a variable-length array must have at least one element, so reserve
     one slot even for an order-0 model */
  int previousMMbases[mmOrder > 0 ? mmOrder : 1];

  /* If there aren't mmOrder previous bases @ base, then adjust mmOrder to
     take advantage of however many we have */
  if (base < mmOrder)
    mmOrder = base;

  /* If we run into any unknown "N" characters, adjust the mmOrder
     accordingly: only the bases after the unknown one are kept, and the
     array is refilled from index 0 */
  for (i = mmOrder; i > 0; i--) {
    baseAsNum = basetocol(seqData[base-i]);
    if (baseAsNum < 0)
      mmOrder = i-1;
    else
      previousMMbases[mmOrder-i] = baseAsNum;
  }

  /* Get score from Markov Matrix */
  mm = lst_get_ptr(MarkovMatrices, mmOrder);
  j = basesToRow(previousMMbases, mmOrder, mm->ncols);
  col = basetocol(seqData[base]);

  /* never index the matrix with a negative row or column */
  if (j >= 0 && col >= 0)
    val = log(mat_get(mm, j, col));
  else {
    if (conservative == 1)
      val = log(0);  /* unknown base: probability 0, i.e. -inf in log space */
    else
      val = 0;       /* unknown base: probability 1, i.e. log(1) = 0 */
  }
  return val;
}
/* Read substitution scores from file F and return them as a kind of
   pseudo substitution matrix with the dimensions of mod->rate_matrix.
   All elements not specified in the file are equal to NEGINFTY, which
   is to be interpreted as "NA".

   Each non-empty line that does not start with '#' must have at least
   three whitespace-separated columns: two tuples, which are converted to
   row and column indices with tuple_index(), and a numeric score.
   Additional columns are ignored.  A line with fewer than three columns
   or a non-numeric score is fatal. */
Matrix* read_subst_scores(TreeModel *mod, FILE *F) {
  Matrix *retval = mat_new(mod->rate_matrix->size, mod->rate_matrix->size);
  String *line = str_new(STR_MED_LEN), *tuple1, *tuple2;
  List *l = lst_new_ptr(3);
  int alph_size = (int)strlen(mod->rate_matrix->states);
  int *inv_alph = mod->rate_matrix->inv_states;
  double val;
  int i;

  mat_set_all(retval, NEGINFTY);

  while (str_readline(line, F) != EOF) {
    str_double_trim(line);
    if (str_starts_with_charstr(line, "#") || line->length == 0)
      continue;

    str_split(line, NULL, l);
    if (lst_size(l) < 3) {
      die("ERROR: wrong number of columns in subst. score file.\n");
    }

    tuple1 = lst_get_ptr(l, 0);
    tuple2 = lst_get_ptr(l, 1);
    if (str_as_dbl(lst_get_ptr(l, 2), &val) != 0) {
      die("ERROR: bad value in subst. score file.\n");
    }

    mat_set(retval, tuple_index(tuple1->chars, inv_alph, alph_size),
            tuple_index(tuple2->chars, inv_alph, alph_size), val);

    /* release every token produced by the split, including any columns
       beyond the third */
    for (i = 0; i < lst_size(l); i++)
      str_free(lst_get_ptr(l, i));
  }

  lst_free(l);
  str_free(line);
  return retval;
}
/* Write machine-readable log entries for a discarded feature group:
   one call to write_machine_problem() per entry of 'problems', in list
   order. */
void write_machine_log(FILE *mlogf, GFF_FeatureGroup *group, List *problems,
                       msa_coord_map *map) {
  int k = 0;

  while (k < lst_size(problems)) {
    write_machine_problem(mlogf, group, lst_get_ptr(problems, k), map);
    k++;
  }
}
/* Identify the branches with respect to which a feature is
   uninformative, i.e. those for which every leaf beneath has only
   missing data within the feature.

   On return, the (preallocated) array entry has_data[i] is TRUE if the
   branch above node i is informative and FALSE otherwise; the root's
   entry is always FALSE.  *nspec is set to the number of leaves that
   have data.

   A leaf has data if, in at least one alignment column of the feature,
   its character maps to a non-negative entry of the rate matrix's
   inv_states.  An internal node has data if either child does; the
   postorder traversal guarantees children are decided first.  Every
   node must have either zero or two children. */
void ff_find_missing_branches(TreeModel *mod, MSA *msa, GFF_Feature *feat,
                              int *has_data, int *nspec) {
  List *order = tr_postorder(mod->tree);
  int pos, col;

  *nspec = 0;

  for (pos = 0; pos < lst_size(order); pos++) {
    TreeNode *nd = lst_get_ptr(order, pos);

    if (!((nd->lchild == NULL && nd->rchild == NULL) ||
          (nd->lchild != NULL && nd->rchild != NULL)))
      die("ERROR ff_find_missing_branches: lchild and rchild should both be NULL or not NULL\n");

    if (nd->parent == NULL) {
      /* root */
      has_data[nd->id] = FALSE;
      continue;
    }

    if (nd->lchild != NULL) {
      /* non-root ancestral node */
      has_data[nd->id] =
        (has_data[nd->lchild->id] || has_data[nd->rchild->id]) ? TRUE : FALSE;
      continue;
    }

    /* leaf: FALSE until a base is seen in some column of the feature */
    has_data[nd->id] = FALSE;
    for (col = feat->start-1; col < feat->end; col++) {
      char ch = ss_get_char_tuple(msa, msa->ss->tuple_idx[col],
                                  mod->msa_seq_idx[nd->id], 0);
      if (mod->rate_matrix->inv_states[(int)ch] >= 0) {
        has_data[nd->id] = TRUE;
        (*nspec)++;
        break;
      }
    }
  }
}
/* Empty a problem list: every entry is released with problem_free()
   and the list itself is then cleared (but not freed). */
void problems_clear(List *problems) {
  int k = 0;

  while (k < lst_size(problems)) {
    problem_free(lst_get_ptr(problems, k));
    k++;
  }
  lst_clear(problems);
}
/* Return 1 if the type of feat equals, ignoring case, any of the
   Strings in list l; return 0 otherwise. */
int is_exon(GFF_Feature *feat, List *l) {
  int found = 0;
  int k;

  for (k = 0; !found && k < lst_size(l); k++) {
    String *candidate = (String*)lst_get_ptr(l, k);
    if (str_equals_nocase(feat->feature, candidate))
      found = 1;
  }
  return found;
}
//if exclude==0, removes all species not in list. //if exclude==1, removes all species in list void mafBlock_subSpec(MafBlock *block, List *specNameList, int include) { String *str; int i, idx, *keep, oldSize = lst_size(block->data); keep = smalloc(oldSize*sizeof(int)); for (i=0; i<oldSize; i++) keep[i]=(include==0); for (i=0; i<lst_size(specNameList); i++) { str = (String*)lst_get_ptr(specNameList, i); idx = hsh_get_int(block->specMap, str->chars); if (idx != -1) keep[idx] = !(include==0); } mafBlock_remove_lines(block, keep); sfree(keep); return; }
/** Map a sequence (array) of category numbers from the spooled space to
    the unspooled space, using the current unspooler.  The original
    sequence is overwritten.  Does nothing if cm has no unspooler.

    Each element of path must be a valid spooled state, i.e. lie in
    [0, nstates_spooled - 1]; anything else is fatal, as is a failure of
    cm_get_unspooled_state() to map a position.

    The history of distinct spooled states seen so far is kept as the
    predecessor list for cm_get_unspooled_state().  Consecutive repeats
    of a state are recorded once. */
void cm_spooled_to_unspooled(CategoryMap *cm, int *path, int pathlen) {
  int j, sp_state, prev_sp_state;
  List *pred;

  if (cm->unspooler == NULL) return;

  pred = lst_new_int(cm->unspooler->nstates_spooled);
  prev_sp_state = -1;
  for (j = 0; j < pathlen; j++) {
    /* spooled_to_unspooled is indexed by spooled state and holds
       nstates_spooled entries, so nstates_spooled itself is out of
       range */
    if (path[j] < 0 || path[j] >= cm->unspooler->nstates_spooled)
      die("ERROR cm_spooled_to_unspooled: path[%i]=%i, should be in [0, %i]\n",
          j, path[j], cm->unspooler->nstates_spooled - 1);

    sp_state = path[j];
    path[j] = cm_get_unspooled_state(cm, path[j], pred);

    if (path[j] == -1)
      die("ERROR: failure mapping to uspooled state at position %d.\n", j);

    if (sp_state != prev_sp_state) {
      /* if the current (spooled) state is not conditioned on any other
         state, then its predecessor cannot matter, so the list can be
         cleared */
      if (lst_size(cm->unspooler->spooled_to_unspooled[sp_state]->children) == 0)
        lst_clear(pred);

      lst_push_int(pred, sp_state);
    }

    prev_sp_state = sp_state;
  }
  lst_free(pred);
}