/* Create a category map with a category for each feature type in a GFF_Set. Category numbers are assigned in order of appearance of types */ CategoryMap* cm_new_from_features(GFF_Set *feats) { int i; CategoryMap *retval; Hashtable *hash; List *types; /* first scan features for all types */ hash = hsh_new(10); types = lst_new_ptr(10); for (i = 0; i < lst_size(feats->features); i++) { GFF_Feature *f = lst_get_ptr(feats->features, i); checkInterruptN(i, 10000); if (hsh_get(hash, f->feature->chars) == (void*)-1) { lst_push_ptr(types, f->feature); hsh_put_int(hash, f->feature->chars, 1); } } hsh_free(hash); /* now create a simple category map */ retval = cm_new(lst_size(types)); for (i = 0; i <= retval->ncats; i++) { String *type = i == 0 ? str_new_charstr(BACKGD_CAT_NAME) : str_dup(lst_get_ptr(types, i-1)); retval->ranges[i] = cm_new_category_range(type, i, i); } lst_free(types); return retval; }
SEXP rph_tree_summary_rchild(SEXP treeP) { TreeNode *tr = rph_tree_new(treeP), *node; int i, *rchild, nnode, *idmap; List *nodes = tr_preorder(tr); SEXP result; nnode = lst_size(nodes); result = PROTECT(NEW_INTEGER(nnode)); rchild = INTEGER_POINTER(result); idmap = smalloc((nnode+1)*sizeof(int)); for (i=0; i < nnode; i++) { node = (TreeNode*)lst_get_ptr(nodes, i); if (node->id > nnode || node->id < 0) die("invalid id (%i) in tree node\n", node->id); idmap[(int)node->id] = i; } for (i=0; i < nnode; i++) { node = (TreeNode*)lst_get_ptr(nodes, i); if (node->rchild == NULL) rchild[idmap[node->id]] = -1; else rchild[idmap[node->id]] = idmap[node->rchild->id] + 1; } UNPROTECT(1); return result; }
/* Read substitution scores from specified file and return as a kind of pseudo substitution matrix. All nonspecified elements in matrix will be equal to NEGINFTY, which is to be interpretted as "NA" */ Matrix* read_subst_scores(TreeModel *mod, FILE *F) { Matrix *retval = mat_new(mod->rate_matrix->size, mod->rate_matrix->size); String *line = str_new(STR_MED_LEN), *tuple1, *tuple2; List *l = lst_new_ptr(3); int alph_size = (int)strlen(mod->rate_matrix->states); int *inv_alph = mod->rate_matrix->inv_states; double val; mat_set_all(retval, NEGINFTY); while (str_readline(line, F) != EOF) { str_double_trim(line); if (str_starts_with_charstr(line, "#") || line->length == 0) continue; str_split(line, NULL, l); if (lst_size(l) < 3) { die("ERROR: wrong number of columns in subst. score file.\n"); } tuple1 = lst_get_ptr(l, 0); tuple2 = lst_get_ptr(l, 1); if (str_as_dbl(lst_get_ptr(l, 2), &val) != 0) { die("ERROR: bad value in subst. score file.\n"); } mat_set(retval, tuple_index(tuple1->chars, inv_alph, alph_size), tuple_index(tuple2->chars, inv_alph, alph_size), val); str_free(tuple1); str_free(tuple2); str_free(lst_get_ptr(l, 2)); } lst_free(l); str_free(line); return retval; }
/* Exclude stop codons from all CDS in a group, as necessary. Record any features that are changed, so they can be changed back before data is output */ void exclude_stops(GFF_FeatureGroup *group, List *starts_adjusted, List *ends_adjusted) { int j, k; List *stops = lst_new_ptr(1), *gfeatures = group->features; GFF_Feature *feat; lst_clear(stops); lst_clear(ends_adjusted); lst_clear(starts_adjusted); for (j = 0; j < lst_size(gfeatures); j++) { /* first grab all stops. We expect at most one, but more are possible */ feat = lst_get_ptr(gfeatures, j); if (str_equals_charstr(feat->feature, GFF_STOP_TYPE)) lst_push_ptr(stops, feat); } for (j = 0; j < lst_size(gfeatures); j++) { /* now look at CDSs */ feat = lst_get_ptr(gfeatures, j); if (str_equals_charstr(feat->feature, GFF_CDS_TYPE)) { for (k = 0; k < lst_size(stops); k++) { /* check stops */ GFF_Feature *stop = lst_get_ptr(stops, k); if (feat->strand == '+' && stop->strand == '+' && feat->end == stop->end) { feat->end -= 3; lst_push_ptr(ends_adjusted, feat); } else if (feat->strand == '-' && stop->strand == '-' && feat->start == stop->start) { feat->start += 3; lst_push_ptr(starts_adjusted, feat); } } } } lst_free(stops); }
void mafBlock_remove_lines(MafBlock *block, int *keep) { int i, oldSize = lst_size(block->data), newSize=0; MafSubBlock *sub, *testSub; for (i=0; i<oldSize; i++) { if (keep[i]) { if (i != newSize) { sub = (MafSubBlock*)lst_get_ptr(block->data, i); hsh_reset_int(block->specMap, sub->src->chars, newSize); hsh_reset_int(block->specMap, sub->specName->chars, newSize); testSub = (MafSubBlock*)lst_get_ptr(block->data, newSize); if (testSub != NULL) die("ERROR: mafBlock_remove_lines: testSub should be NULL\n"); lst_set_ptr(block->data, newSize, (void*)sub); lst_set_ptr(block->data, i, NULL); } newSize++; } else { sub = (MafSubBlock*)lst_get_ptr(block->data, i); hsh_reset_int(block->specMap, sub->src->chars, -1); hsh_reset_int(block->specMap, sub->specName->chars, -1); mafSubBlock_free(sub); lst_set_ptr(block->data, i, NULL); } } for (i=oldSize-1; i>=newSize; i--) lst_delete_idx(block->data, i); }
/* Closes all outfiles. If already closed, reopen with append, add #eof closer, and close again. see comment above at get_outfile */ void close_outfiles(List *outfileList, Hashtable *outfileHash) { List *keys = hsh_keys(outfileHash); int *done, idx, i; char *fname; FILE *outfile; done = smalloc(lst_size(keys)*sizeof(int)); for (i=0; i<lst_size(keys); i++) { done[i]=0; fname = (char*)lst_get_ptr(keys, i); idx = hsh_get_int(outfileHash, fname); outfile = (FILE*)lst_get_ptr(outfileList, idx); if (outfile != NULL) { mafBlock_close_outfile(outfile); done[i]=1; } } for (i=0; i<lst_size(keys); i++) { if (done[i]) continue; fname = (char*)lst_get_ptr(keys, i); outfile = phast_fopen(fname, "a"); mafBlock_close_outfile(outfile); } sfree(done); lst_free(keys); lst_free(outfileList); hsh_free(outfileHash); }
SEXP rph_gff_featureBits(SEXP gffListP, SEXP orP, SEXP returnGffP) { int numGff, i, j, or, returnGff; long numbit = 0; List *gfflist; GFF_Set *gff, *newgff=NULL; GFF_Feature *feat, *newfeat; SEXP rv; numGff = length(gffListP); gfflist = lst_new_ptr(numGff); // Rf_PrintValue(gffListP); for (i = 0; i < numGff; i++) { gff = (GFF_Set*)EXTPTR_PTR(VECTOR_ELT(gffListP, i)); lst_push_ptr(gfflist, gff); gff_register_protect(gff); } or = LOGICAL_VALUE(orP); returnGff = LOGICAL_VALUE(returnGffP); if (!or && numGff >= 2) { newgff = gff_overlap_gff(lst_get_ptr(gfflist, 0), lst_get_ptr(gfflist, 1), 1, -1.0, FALSE, TRUE, NULL); numbit = gff_flatten_mergeAll(newgff); for (i=2; i < numGff; i++) { checkInterrupt(); gff = gff_overlap_gff(newgff, lst_get_ptr(gfflist, i), 1, -1.0, FALSE, TRUE, NULL); numbit = gff_flatten_mergeAll(gff); gff_free_set(newgff); newgff = gff; } } else { newgff = gff_new_set(); for (i=0; i< numGff; i++) { gff = (GFF_Set*)lst_get_ptr(gfflist, i); for (j=0; j < lst_size(gff->features); j++) { checkInterruptN(j, 1000); feat = lst_get_ptr(gff->features, j); newfeat = gff_new_feature_copy(feat); lst_push_ptr(newgff->features, newfeat); } } numbit = gff_flatten_mergeAll(newgff); } if (returnGff) return rph_gff_new_extptr(newgff); if (numbit > INT_MAX) { PROTECT(rv = allocVector(REALSXP, 1)); REAL(rv)[0] = numbit; } else { PROTECT(rv = allocVector(INTSXP, 1)); INTEGER(rv)[0] = numbit; } UNPROTECT(1); return rv; }
CategoryRange* cm_category_range_create_copy(CategoryRange *src) { int i; CategoryRange *retval = cm_new_category_range(str_dup(lst_get_ptr(src->feature_types, 0)), src->start_cat_no, src->end_cat_no); for (i = 1; i < lst_size(src->feature_types); i++) lst_push_ptr(retval->feature_types, str_dup(lst_get_ptr(src->feature_types, i))); return retval; }
/* given a list of 5' and 3' splice sites extracted from a group, check whether they form valid pairs in all species */ int are_introns_okay(List *intron_splice, MSA *msa, List *problems, int offset5, int offset3) { int i, j, start1, start2; char str1[3], str2[3], str12[5]; char strand; int retval = 1; char * splice_pairs[3] = {"GTAG", "GCAG", "ATAC"}; str1[2] = '\0'; str2[2] = '\0'; if (lst_size(intron_splice) < 2) return 1; strand = ((GFF_Feature*)lst_get_ptr(intron_splice, 0))->strand; /* assume all same strand */ if (strand == '+') lst_qsort(intron_splice, feature_comparator_ascending); else lst_qsort(intron_splice, feature_comparator_descending); for (i = 0; i < lst_size(intron_splice) - 1; i++) { /* assume every 5' splice and immediately following 3' splice form a pair */ GFF_Feature *f1 = lst_get_ptr(intron_splice, i); GFF_Feature *f2 = lst_get_ptr(intron_splice, i+1); if (str_starts_with_charstr(f1->feature, SPLICE_5) && str_starts_with_charstr(f2->feature, SPLICE_3)) { start1 = f1->start - 1 + (strand == '-' ? offset5 : 0); start2 = f2->start - 1 + (strand == '+' ? offset3 : 0); for (j = 0; j < msa->nseqs; j++) { str1[0] = ss_get_char_tuple(msa, msa->ss->tuple_idx[start1], j, 0); str1[1] = ss_get_char_tuple(msa, msa->ss->tuple_idx[start1+1], j, 0); str2[0] = ss_get_char_tuple(msa, msa->ss->tuple_idx[start2], j, 0); str2[1] = ss_get_char_tuple(msa, msa->ss->tuple_idx[start2+1], j, 0); if (strand == '-') { msa_reverse_compl_seq(str1, 2); msa_reverse_compl_seq(str2, 2); } strcpy(str12, str1); strcat(str12, str2); if (!is_signal(str12, 3, splice_pairs, msa->is_missing)) { problem_add(problems, f1, BAD_INTRON, -1, -1); problem_add(problems, f2, BAD_INTRON, -1, -1); retval = 0; break; } } i++; /* no need to look at next feature */ } } return retval; }
/* open a file with name out_root.name.maf, or returns it if already open. This is a bit messy because in some cases (splitting by feature) there may be more output files than the OS can handle. But it would be computationally expensive to check and see which files are finished, assuming that the MAF is sorted. So, if it tries to open a file and fails, it the goes through the list of filehandles, finds an open one, closes it, and tries to open the new one again. Repeat until successful. Then, if a filehandle needs to be re-opened, it is opened with append. Again, if this is not successful, it looks for another file to close. If it can't find one the program reports an error and dies. Finally, close_outfiles below checks and makes sure that all files are closed with mafBlock_close_file in the end, so that they get the #eof closer. */ FILE *get_outfile(List *outfileList, Hashtable *outfileHash, String *name, char *out_root, int argc, char *argv[]) { int idx, i; FILE *outfile; char *fname = smalloc((strlen(out_root)+name->length+7)*sizeof(char)); sprintf(fname, "%s.%s.maf", out_root, name->chars); idx = ptr_to_int(hsh_get(outfileHash, fname)); if (idx == -1) { hsh_put(outfileHash, fname, int_to_ptr(lst_size(outfileList))); outfile = mafBlock_open_outfile(fname, argc, argv); while (outfile==NULL) { //too many files are open, close one first for (i=0; i<lst_size(outfileList); i++) { outfile = (FILE*)lst_get_ptr(outfileList, i); if (outfile != NULL) break; } if (i == lst_size(outfileList)) { die("ERROR: too many files open in maf_parse\n"); } else { phast_fclose(outfile); lst_set_ptr(outfileList, i, NULL); } outfile = mafBlock_open_outfile(fname, argc, argv); } lst_push_ptr(outfileList, (void*)outfile); sfree(fname); return outfile; } outfile = (FILE*)lst_get_ptr(outfileList, idx); if (outfile == NULL) { //has already been opened but then closed. outfile = phast_fopen_no_exit(fname, "a"); while (outfile == NULL) { for (i=0; i<lst_size(outfileList); i++) { outfile = (FILE*)lst_get_ptr(outfileList, i); if (outfile != NULL) break; } if (i == lst_size(outfileList)) { die("ERROR: too many files open in maf_parse\n"); } else { phast_fclose(outfile); lst_set_ptr(outfileList, i, NULL); } outfile = phast_fopen_no_exit(fname, "a"); } lst_set_ptr(outfileList, idx, (void*)outfile); } sfree(fname); return outfile; }
/* Identify branches wrt which a given feature is uninformative, in the sense that all leaves beneath these branches having only missing data. Will set (preallocated) array has_data[i] = I(branch above node i is informative). Will also set *nspec equal to number of leaves that have data. */ void ff_find_missing_branches(TreeModel *mod, MSA *msa, GFF_Feature *feat, int *has_data, int *nspec) { int i, j; List *traversal = tr_postorder(mod->tree); *nspec = 0; for (i = 0; i < lst_size(traversal); i++) { TreeNode *n = lst_get_ptr(traversal, i); if (!((n->lchild == NULL && n->rchild == NULL) || (n->lchild != NULL && n->rchild != NULL))) die("ERROR ff_find_missing_branches: lchild and rchild should both be NULL or not NULL\n"); if (n->parent == NULL) /* root */ has_data[n->id] = FALSE; else if (n->lchild == NULL) { /* leaf */ has_data[n->id] = FALSE; /* initialize to F, set to T if base in any col in feature */ for (j = feat->start-1; j < feat->end; j++) { if (mod->rate_matrix-> inv_states[(int)ss_get_char_tuple(msa, msa->ss->tuple_idx[j], mod->msa_seq_idx[n->id], 0)] >= 0) { has_data[n->id] = TRUE; (*nspec)++; break; } } } else { /* non-root ancestral node */ if (has_data[n->lchild->id] || has_data[n->rchild->id]) has_data[n->id] = TRUE; else has_data[n->id] = FALSE; } } }
/* add leaf with specified name to specified internal branch */ void tr_add_leaf_internal(TreeNode *t, int branch, char *lname, int lgroup) { TreeNode *oldnode, *newanc, *newleaf; oldnode = lst_get_ptr(t->nodes, branch); /* node beneath branch in question */ if (oldnode == t) die("ERROR tr_add_leaf_internal: oldnode == t\n"); newanc = tr_new_node(); newleaf = tr_new_node(); strcpy(newleaf->name, lname); newleaf->dparent = lgroup; newanc->rchild = newleaf; newleaf->parent = newanc; newanc->lchild = oldnode; newanc->parent = oldnode->parent; if (oldnode->parent->lchild == oldnode) oldnode->parent->lchild = newanc; else oldnode->parent->rchild = newanc; oldnode->parent = newanc; if (lgroup > 0 && lgroup == oldnode->dparent) newanc->dparent = lgroup; /* fix up ids and nodes list */ lst_push_ptr(t->nodes, newanc); newanc->id = lst_size(t->nodes) - 1; /* circumvent normal id assignment */ lst_push_ptr(t->nodes, newleaf); newleaf->id = lst_size(t->nodes) - 1; t->nnodes += 2; }
double calcMMscore(char *seqData, int base, List *MarkovMatrices, int conservative) { int i, baseAsNum, j; double val; int mmOrder = lst_size(MarkovMatrices)-1; Matrix *mm; int previousMMbases[mmOrder]; //If there aren't mmOrder previous bases @ base, then adjust mmOrder to take advantage of however many we have if (base < mmOrder) mmOrder = base; //If we run into any unknown "N" characters, adjust the mmOrder accordingly for(i=mmOrder; i>0; i--) { baseAsNum = basetocol(seqData[base-i]); if (baseAsNum < 0) mmOrder = i-1; else previousMMbases[mmOrder-i] = baseAsNum; } //Get score from Markov Matrix mm = lst_get_ptr(MarkovMatrices, mmOrder); j = basesToRow(previousMMbases, mmOrder, mm->ncols); if (j >= 0) val = log(mat_get(mm, j, basetocol(seqData[base]))); else { if (conservative == 1) val = log(0); //If it is an unknown base, probability is 0, in log space =inf else val = 0; //If it is an unknown base probability is 1, in log space log(1)=0 } return val; }
void mafBlock_strip_eLines(MafBlock *block) { int i, *keep = smalloc(lst_size(block->data)*sizeof(int)); for (i=0; i<lst_size(block->data); i++) keep[i] = (((MafSubBlock*)lst_get_ptr(block->data, i))->lineType[0] != 'e'); mafBlock_remove_lines(block, keep); sfree(keep); }
long mafBlock_get_start(MafBlock *block, String *specName) { int idx=0; if (specName != NULL) idx = hsh_get_int(block->specMap, specName->chars); if (idx == -1 || idx >= lst_size(block->data)) return -1; return ((MafSubBlock*)lst_get_ptr(block->data, idx))->start; }
//sets fieldSize[i] to maximum length of field i in MAF, so that block //can be printed with nice formatting void mafBlock_get_fieldSizes(MafBlock *block, int fieldSize[6]) { int i; MafSubBlock *sub; char tempstr[1000]; for (i=0; i<6; i++) fieldSize[i] = 0; fieldSize[0] = 1; //this is always one character fieldSize[4] = 1; //this is always one character (strand) for (i=0; i<lst_size(block->data); i++) { sub = (MafSubBlock*)lst_get_ptr(block->data, i); //field[1] is src if (sub->src->length > fieldSize[1]) fieldSize[1] = sub->src->length; //field[2] is start sprintf(tempstr, "%li", sub->start); if (strlen(tempstr) > fieldSize[2]) fieldSize[2] = (int)strlen(tempstr); //field[3] is size sprintf(tempstr, "%i", sub->size); if (strlen(tempstr) > fieldSize[3]) fieldSize[3] = (int)strlen(tempstr); //field[4] is strand... skip //field[5] is srcSize sprintf(tempstr, "%li", sub->srcSize); if (strlen(tempstr) > fieldSize[5]) fieldSize[5] = (int)strlen(tempstr); //don't worry about size of lastField since it just goes to end-of-line } }
SEXP rph_tree_rename(SEXP treeVec, SEXP oldNamesP, SEXP newNamesP) { int i, numtree = LENGTH(treeVec), treeIdx; TreeNode *tr, *n; SEXP result; Hashtable *hash = hsh_new(20); char *str; for (i=0; i<LENGTH(oldNamesP); i++) { str = smalloc((strlen(CHAR(STRING_ELT(newNamesP, i)))+1)*sizeof(char)); strcpy(str, CHAR(STRING_ELT(newNamesP, i))); hsh_put(hash, CHAR(STRING_ELT(oldNamesP, i)), str); } PROTECT(result = NEW_CHARACTER(numtree)); for (treeIdx=0; treeIdx < numtree; treeIdx++) { tr = rph_tree_new(STRING_ELT(treeVec, treeIdx)); // tr = tr_new_from_string(CHAR(STRING_ELT(treeVec, treeIdx))); for (i=0; i<tr->nnodes; i++) { n = lst_get_ptr(tr->nodes, i); if (n->name[0] != '\0' && (str = hsh_get(hash, n->name)) != (char*)-1) strcpy(n->name, str); } str = tr_to_string(tr, 1); SET_STRING_ELT(result, treeIdx, mkChar(str)); } UNPROTECT(1); return result; }
/* write machine-readable log entry for discarded feature */ void write_machine_log(FILE *mlogf, GFF_FeatureGroup *group, List *problems, msa_coord_map *map) { int i; for (i = 0; i < lst_size(problems); i++) { write_machine_problem(mlogf, group, lst_get_ptr(problems, i), map); } }
/* Reset a problem list to the empty state */ void problems_clear(List *problems) { int i; for (i = 0; i < lst_size(problems); i++) { problem_free(lst_get_ptr(problems, i)); } lst_clear(problems); }
/* last item in predecessors is assumed to be the most recently visited */ int cm_get_unspooled_state(CategoryMap *cm, int spooled_state, List *predecessors) { UnspoolNode *n, *child; int p, pred_idx, i; pred_idx = lst_size(predecessors) - 1; n = cm->unspooler->spooled_to_unspooled[spooled_state]; while (n->newstate == -1) { child = NULL; while (n != child && pred_idx >= 0) { p = lst_get_int(predecessors, pred_idx--); for (i = 0; n != child && i < lst_size(n->children); i++) { child = (UnspoolNode*)lst_get_ptr(n->children, i); if (child->oldstate == p) n = child; } } if (n != child) { fprintf(stderr, "ERROR (cm_get_unspooled_state): no match for state %d preceded by state(s) ", spooled_state); for (i = 0; i < lst_size(predecessors); i++) fprintf(stderr, "%d ", lst_get_int(predecessors, i)); fprintf(stderr, "\n"); return -1; } if (n != child) die("ERROR cm_get_unspooled_state: n != child\n"); } return n->newstate; }
int is_exon(GFF_Feature *feat, List *l) { int i; for (i = 0; i < lst_size(l); i++) if (str_equals_nocase(feat->feature, (String*)lst_get_ptr(l, i))) return 1; return 0; }
/* Return a list of category names corresponding to a given list of category names and or numbers. Doesn't allocate new names, just pointers to Strings in the CategoryMap object or the provided List */ List *cm_get_category_str_list(CategoryMap *cm, List *names, int ignore_missing) { int i, cat; List *retval = lst_new_ptr(lst_size(names)); for (i = 0; i < lst_size(names); i++) { String *n = lst_get_ptr(names, i); if (str_as_int(n, &cat) == 0) { if (cm == NULL) die("ERROR: if categories are specified by number, a category map is required\n"); if (cat < 0 || (cm != NULL && cat > cm->ncats)) die("ERROR: category number %d is out of bounds.\n", cat); lst_push_ptr(retval, cm_get_feature(cm, cat)); } else { if (cm != NULL) { cat = cm_get_category(cm, n); if (cat == 0 && !ignore_missing && !str_equals(n, cm_get_feature(cm, 0))) { die("ERROR: illegal category name (\"%s\")\n", n->chars); } //return pointers to cm if possible lst_push_ptr(retval, cm_get_feature(cm, cat)); } //otherwise return pointers to strings in list else lst_push_ptr(retval, n); } } return retval; }
void mafBlock_mask_region(MafBlock *block, GFF_Set *mask_feats, List *speclist) { MafSubBlock *refblock, *maskblock; int i, j, spec_idx; GFF_Set *feat; GFF_Feature *f, *prevf=NULL; int next_feat_idx = 1; char **maskseq; int num_mask_seq=0; long coord; if (mask_feats == NULL || lst_size(mask_feats->features) == 0L) return; maskseq = smalloc(lst_size(speclist)*sizeof(char*)); for (i=0; i < lst_size(speclist); i++) { spec_idx = hsh_get_int(block->specMap, ((String*)lst_get_ptr(speclist, i))->chars); if (spec_idx == -1) continue; maskblock = lst_get_ptr(block->data, spec_idx); if (maskblock->seq == NULL) continue; maskseq[num_mask_seq++] = maskblock->seq->chars; } if (num_mask_seq == 0) { sfree(maskseq); return; } feat = gff_copy_set_no_groups(mask_feats); gff_flatten_mergeAll(feat); f = lst_get_ptr(feat->features, 0); refblock = lst_get_ptr(block->data, 0); coord = refblock->start; for (i=0; i < block->seqlen; i++) { if (refblock->seq->chars[i] != '-') coord++; //this is 1-based coordinate if (coord > f->end) { if (next_feat_idx == lst_size(feat->features)) break; prevf = f; f = lst_get_ptr(feat->features, next_feat_idx++); if (f->start <= prevf->end) { die("Error: feats not sorted in mafBlock_mask_region"); //shouldn't happen } } if (coord >= f->start && coord <= f->end) { for (j=0; j < num_mask_seq; j++) if (maskseq[j][i] != '-') maskseq[j][i] = 'N'; } } gff_free_set(feat); sfree(maskseq); }
/* Read an amino acid rate matrix in the format used by PAML. Reorder the rows and columns to match 'alph'. Warning: the ordering in the file is assumed to match that used in the files in the PAML distribution (alphabetical order of 3-letter codes), which is also the order of AA_ALPHABET (therefore AA_ALPHABET may not be changed!). Equilibrium frequencies are ignored. */ Matrix *read_paml_matrix(FILE *F, char *alph) { char *paml_alph = "ARNDCQEGHILKMFPSTWYV$"; int size = (int)strlen(paml_alph); Matrix *retval = mat_new(size, size); List *fields = lst_new_ptr(100); String *line = str_new(STR_MED_LEN); int i, j; if (strcmp(alph, paml_alph) != 0) die("ERROR read_paml_matrix (alph (%s) != paml_alph (%s))\n", alph, paml_alph); mat_zero(retval); for (i = 1; i < size-1 && str_readline(line, F) != EOF; ) { /* NOTE: size of matrix allows for stop, but stop not included in file; therefore, only read size-1 lines */ str_double_trim(line); if (line->length == 0) continue; str_split(line, NULL, fields); if (lst_size(fields) != i) { die("ERROR: row %d of matrix must have %d columns.\n", i+1, i); } for (j = 0; j < lst_size(fields); j++) { double val; if (str_as_dbl(lst_get_ptr(fields, j), &val) != 0) { die("ERROR: non-numeric matrix element in subst. matrix ('%s')\n", ((String*)lst_get_ptr(fields, j+1))->chars); } str_free(lst_get_ptr(fields, j)); if (j >= size) die("ERROR read_paml_matrix j (%i) should be < size (%i)\n", j, size); mat_set(retval, i, j, val); mat_set(retval, j, i, val); } i++; } if (i != size - 1) { die("ERROR: too few rows in subst. matrix.\n"); } lst_free(fields); str_free(line); return retval; }
/* Perform a GERP-like computation for each feature. Computes expected number of subst. under neutrality (feat_nneut), expected number after rescaling by ML (feat_nobs), expected number of rejected substitutions (feat_nrejected), and number of species with data (feat_nspecies). If any arrays are NULL, values will not be retained. Gaps and missing data are handled by working with induced subtree. */ void ff_gerp(TreeModel *mod, MSA *msa, GFF_Set *gff, mode_type mode, double *feat_nneut, double *feat_nobs, double *feat_nrejected, double *feat_nspec, FILE *logf) { int i, j, nspec = 0; double nneut, scale, lnl; int *has_data = smalloc(mod->tree->nnodes * sizeof(int)); FeatFitData *d; /* init FeatFitData */ d = ff_init_fit_data(mod, msa, ALL, NNEUT, FALSE); /* iterate through features */ for (i = 0; i < lst_size(gff->features); i++) { GFF_Feature *f = lst_get_ptr(gff->features, i); checkInterrupt(); ff_find_missing_branches(mod, msa, f, has_data, &nspec); if (nspec < 3) nneut = scale = 0; else { vec_set(d->cdata->params, 0, d->cdata->init_scale); d->feat = f; opt_newton_1d(ff_likelihood_wrapper_1d, &d->cdata->params->data[0], d, &lnl, SIGFIGS, d->cdata->lb->data[0], d->cdata->ub->data[0], logf, NULL, NULL); /* turns out to be faster to use numerical rather than exact derivatives (judging by col case) */ scale = d->cdata->params->data[0]; for (j = 1, nneut = 0; j < mod->tree->nnodes; j++) /* node 0 is root */ if (has_data[j]) nneut += ((TreeNode*)lst_get_ptr(mod->tree->nodes, j))->dparent; } if (feat_nspec != NULL) feat_nspec[i] = (double)nspec; if (feat_nneut != NULL) feat_nneut[i] = nneut; if (feat_nobs != NULL) feat_nobs[i] = scale * nneut; if (feat_nrejected != NULL) { feat_nrejected[i] = nneut * (1 - scale); if (mode == ACC) feat_nrejected[i] *= -1; else if (mode == NNEUT) feat_nrejected[i] = fabs(feat_nrejected[i]); } } ff_free_fit_data(d); sfree(has_data); }
void mafBlock_strip_iLines(MafBlock *block) { MafSubBlock *sub; int i; for (i=0; i<lst_size(block->data); i++) { sub = (MafSubBlock*)lst_get_ptr(block->data, i); mafSubBlock_strip_iLine(sub); } }
void mafBlock_print(FILE *outfile, MafBlock *block, int pretty_print) { int i, j, k, numSpace; int fieldSize[6]; //maximum # of characters in the first 6 fields of block MafSubBlock *sub; char firstChar, formatstr[1000]; char *firstseq=NULL; //if processing has reduced the number of species with data to zero, or has //reduced the block to all gaps, don't print if (lst_size(block->data) == 0 || mafBlock_all_gaps(block)) return; mafBlock_remove_gap_cols(block); mafBlock_get_fieldSizes(block, fieldSize); fprintf(outfile, "%s\n", block->aLine->chars); for (i=0; i<lst_size(block->data); i++) { sub = (MafSubBlock*)lst_get_ptr(block->data, i); for (j=0; j<sub->numLine; j++) { firstChar = sub->lineType[j]; if (firstChar == 's' || firstChar == 'e') { sprintf(formatstr, "%%c %%-%is %%%ii %%%ii %%c %%%ii ", fieldSize[1], fieldSize[2], fieldSize[3], fieldSize[5]); fprintf(outfile, formatstr, firstChar, sub->src->chars, sub->start, sub->size, sub->strand, sub->srcSize); if (firstChar == 's') { if (firstseq == NULL) { fprintf(outfile, "%s\n", sub->seq->chars); if (pretty_print) firstseq = sub->seq->chars; } else { for (k=0; k<block->seqlen; k++) fputc(tolower(sub->seq->chars[k])==tolower(firstseq[k]) ? '.' : sub->seq->chars[k], outfile); } } else fprintf(outfile, "%c\n", sub->eStatus); } else if (firstChar=='i') { sprintf(formatstr, "i %%-%is %%c %%i %%c %%i", fieldSize[1]); fprintf(outfile, formatstr, sub->src->chars, sub->iStatus[0], sub->iCount[0], sub->iStatus[1], sub->iCount[1]); fputc('\n', outfile); } else { if (firstChar != 'q') die("ERROR mafBlock_print: firstChar should be q, got %c\n", firstChar); sprintf(formatstr, "q %%-%is", fieldSize[1]); fprintf(outfile, formatstr, sub->src->chars); numSpace = 6 + fieldSize[2] + fieldSize[3] + fieldSize[5]; for (k=0; k<numSpace; k++) fputc(' ', outfile); fprintf(outfile, "%s\n", sub->quality->chars); } } } fputc('\n', outfile); //blank line to mark end of block // fflush(outfile); }
void cm_free_category_range(CategoryRange *cr) { int i; for (i = 0; i < lst_size(cr->feature_types); i++) { String *s = (String*)lst_get_ptr(cr->feature_types, i); if (s != NULL) str_free(s); } lst_free(cr->feature_types); sfree(cr); }
//change all bases with quality score <= cutoff to N void mafBlock_mask_bases(MafBlock *block, int cutoff, FILE *outfile) { MafSubBlock *sub; int i, j, firstMasked; char *refseq=NULL, *refseqName; long firstCoord, lastCoord=-1, *coord; sub = (MafSubBlock*)lst_get_ptr(block->data, 0); refseq = sub->seq->chars; coord = smalloc(block->seqlen*sizeof(long)); firstCoord = sub->start; refseqName = sub->src->chars; for (i=0; i < block->seqlen; i++) { if (refseq[i] != '-') coord[i] = firstCoord++; else coord[i] = -1; } lastCoord = firstCoord; for (i=0; i<lst_size(block->data); i++) { sub = (MafSubBlock*)lst_get_ptr(block->data, i); if (sub->quality==NULL) continue; firstMasked=-1; for (j=0; j<block->seqlen; j++) { if (sub->quality->chars[j] == '-' && refseq[j]=='-') continue; if ((sub->quality->chars[j] != '-' && sub->quality->chars[j] != 'F') && sub->quality->chars[j] - '0' <= cutoff) { sub->seq->chars[j] = 'N'; if (firstMasked == -1 && refseq[j]!='-') firstMasked = j; } else if (firstMasked != -1) { if (outfile != NULL) { fprintf(outfile, "%s\t%li\t%li\t%s\n", refseqName, coord[firstMasked], coord[j], sub->src->chars); } firstMasked = -1; } } if (outfile != NULL && firstMasked != -1) fprintf(outfile, "%s\t%li\t%li\t%s\n", refseqName, coord[firstMasked], lastCoord, sub->src->chars); } sfree(coord); }
SEXP rph_tree_summary_label(SEXP treeP) { TreeNode *tr = rph_tree_new(treeP), *node; List *nodes = tr_preorder(tr); int i, nnodes = lst_size(nodes); SEXP result; for (i=0; i < nnodes; i++) { node = (TreeNode*)lst_get_ptr(nodes, i); if (node->label != NULL) break; } if (i == nnodes) return R_NilValue; PROTECT(result = NEW_CHARACTER(nnodes)); for (i=0; i < nnodes; i++) { node = (TreeNode*)lst_get_ptr(nodes, i); if (node->label == NULL) SET_STRING_ELT(result, i, NA_STRING); else SET_STRING_ELT(result, i, mkChar(node->label)); } UNPROTECT(1); return result; }