/* Finish and close every output file registered in outfileHash.
 *
 * outfileHash maps a file name to an index into outfileList, which holds
 * the FILE* for that name (NULL if the file is not currently open).
 * Files that are still open are finished first; files that were already
 * closed are then reopened in append mode so that
 * mafBlock_close_outfile() can be applied to them as well (see the
 * comment at get_outfile).  Both containers are freed before returning,
 * so neither may be used by the caller afterwards.
 */
void close_outfiles(List *outfileList, Hashtable *outfileHash) {
  List *names = hsh_keys(outfileHash);
  int *finished = smalloc(lst_size(names) * sizeof(int));
  int k;

  /* pass 1: finish every file that is still open */
  for (k = 0; k < lst_size(names); k++) {
    char *name = (char*)lst_get_ptr(names, k);
    int slot = hsh_get_int(outfileHash, name);
    FILE *fp = (FILE*)lst_get_ptr(outfileList, slot);

    finished[k] = (fp != NULL);
    if (finished[k])
      mafBlock_close_outfile(fp);
  }

  /* pass 2: reopen (append) and finish the files skipped above */
  for (k = 0; k < lst_size(names); k++) {
    if (!finished[k]) {
      char *name = (char*)lst_get_ptr(names, k);
      FILE *fp = phast_fopen(name, "a");
      mafBlock_close_outfile(fp);
    }
  }

  sfree(finished);
  lst_free(names);
  lst_free(outfileList);
  hsh_free(outfileHash);
}
/** Map a sequence of category numbers from the spooled space to the
 *  unspooled space, using the category map's current unspooler.
 *
 *  @param cm       category map; if it has no unspooler the path is left
 *                  untouched
 *  @param path     array of spooled category numbers, overwritten in
 *                  place with the corresponding unspooled states
 *  @param pathlen  number of entries in path
 *
 *  Dies if an entry lies outside [0, nstates_spooled] or if
 *  cm_get_unspooled_state() reports failure (-1).
 *  NOTE(review): the upper bound is inclusive although the value is also
 *  used to index spooled_to_unspooled[] -- confirm the array size.
 */
void cm_spooled_to_unspooled(CategoryMap *cm, int *path, int pathlen) {
  int pos, cur, last = -1;
  List *history;

  if (cm->unspooler == NULL)
    return;

  /* history of distinct spooled states seen so far (the predecessors) */
  history = lst_new_int(cm->unspooler->nstates_spooled);

  for (pos = 0; pos < pathlen; pos++) {
    if (path[pos] < 0 || path[pos] > cm->unspooler->nstates_spooled)
      die("ERROR cm_spooled_to_unspooled: path[%i]=%i, should be in [0, %i]\n",
          pos, path[pos], cm->unspooler->nstates_spooled);

    cur = path[pos];
    path[pos] = cm_get_unspooled_state(cm, cur, history);
    if (path[pos] == -1)
      die("ERROR: failure mapping to uspooled state at position %d.\n", pos);

    if (cur != last) {
      /* a spooled state whose node has no children is not conditioned
         on any other state, so earlier predecessors cannot matter and
         the history can be reset */
      if (lst_size(cm->unspooler->spooled_to_unspooled[cur]->children) == 0)
        lst_clear(history);
      lst_push_int(history, cur);
    }
    last = cur;
  }

  lst_free(history);
}
/*
 * Justify every leaf block reachable from `block` in direction `dir`.
 *
 * Each leaf returned by block_leaves_get() is justified against `block`;
 * when the previously visited leaf has a different id, the current leaf
 * is additionally justified against it towards NORTH.  The function then
 * recurses into the leaf.  The list of leaves is freed before returning.
 */
void block_justify( t_block *block, int dir)
{
	t_lst *leaves = block_leaves_get( block, dir);
	t_block *prev_leaf = NULL;
	t_link *link = leaves->first;

	while( link)
	{
		t_block *leaf = link->data;

		justify( block, leaf, dir);

		/* align siblings with each other (skip identical ids) */
		if( prev_leaf && prev_leaf->id.id != leaf->id.id)
			justify( prev_leaf, leaf, NORTH);

		/* descend into the leaf's own leaves */
		block_justify( leaf, dir);

		prev_leaf = leaf;
		link = link->next;
	}

	lst_free( leaves);
}
/* Build a category map containing one category per distinct feature
 * type found in a GFF_Set.  Category numbers follow the order in which
 * the types first appear; category 0 is the background category
 * (BACKGD_CAT_NAME).  The returned map is newly allocated.
 */
CategoryMap* cm_new_from_features(GFF_Set *feats) {
  Hashtable *seen = hsh_new(10);
  List *distinct = lst_new_ptr(10);
  CategoryMap *map;
  int idx, cat;

  /* pass over the features, remembering each type the first time it
     is encountered */
  for (idx = 0; idx < lst_size(feats->features); idx++) {
    GFF_Feature *feat = lst_get_ptr(feats->features, idx);
    checkInterruptN(idx, 10000);
    if (hsh_get(seen, feat->feature->chars) != (void*)-1)
      continue;                 /* type already recorded */
    lst_push_ptr(distinct, feat->feature);
    hsh_put_int(seen, feat->feature->chars, 1);
  }
  hsh_free(seen);

  /* one single-category range per type, plus the background range */
  map = cm_new(lst_size(distinct));
  for (cat = 0; cat <= map->ncats; cat++) {
    String *name;
    if (cat == 0)
      name = str_new_charstr(BACKGD_CAT_NAME);
    else
      name = str_dup(lst_get_ptr(distinct, cat - 1));
    map->ranges[cat] = cm_new_category_range(name, cat, cat);
  }

  lst_free(distinct);
  return map;
}
/*
 * Convert a Java array of className objects into a sqm_lst_t list.
 * Every element is translated from Java to C with the supplied j2c
 * callback and appended to the list in array order.
 *
 * Returns NULL when jarr is NULL or when appending an element fails;
 * otherwise returns the newly created list.  className is used for
 * tracing only.
 */
sqm_lst_t *
jarray2lst(JNIEnv *env,
    jobjectArray jarr,
    char *className,
    void * (*j2c)(JNIEnv *, jobject))
{
	sqm_lst_t *result;
	int count, pos;

	if (jarr == NULL) {
		PTRACE(1, "jni:NULL array passed to jarray2lst()");
		return (NULL);
	}

	count = (int)(*env)->GetArrayLength(env, jarr);
	PTRACE(2, "jni:jarray2lst(jarr[%d],%s)", count, className);

	result = lst_create();
	pos = 0;
	while (pos < count) {
		jobject elem = (*env)->GetObjectArrayElement(env, jarr, pos);

		if (lst_append(result, j2c(env, elem)) == -1) {
			/* give up on the first failed append */
			lst_free(result);
			result = NULL;
			break;
		}
		pos++;
	}

	PTRACE(2, "jni:jarray2lst() done");
	return (result);
}
/*
 * Convert a jintArray into a C list of heap-allocated ints.
 *
 * Each list element is a separately malloc'ed int holding the
 * corresponding array value, in array order.
 *
 * Returns NULL when jintArr is NULL, when memory cannot be allocated,
 * or when appending to the list fails; otherwise returns the new list.
 */
sqm_lst_t *
jintArray2lst(JNIEnv *env, jintArray jintArr)
{
	sqm_lst_t *lst;
	int idx, len, *i;
	jint *p;

	if (NULL == jintArr) {
		PTRACE(1, "jni:NULL array passed to jintArray2lst()");
		return (NULL);
	}

	len = (int)(*env)->GetArrayLength(env, jintArr);
	/*
	 * Always request at least one element: malloc(0) may return NULL,
	 * which must not be mistaken for an allocation failure.
	 */
	p = (jint *) malloc((size_t)(len > 0 ? len : 1) * sizeof (jint));
	PTRACE(2, "jni:jintArray2lst(jintArr[%d])", len);
	if (NULL == p) {
		PTRACE(1, "jni:jintArray2lst() out of memory");
		return (NULL);
	}

	lst = lst_create();
	if (NULL == lst) {
		free(p);
		return (NULL);
	}

	/* copy the whole Java array into the temporary buffer */
	(*env)->GetIntArrayRegion(env, jintArr, 0, len, p);

	for (idx = 0; idx < len; idx++) {
		i = (int *)malloc(sizeof (int));
		if (NULL != i)
			*i = (int)p[idx];
		if (NULL == i || -1 == lst_append(lst, i)) {
			/* the element never made it into the list */
			free(i);
			/*
			 * NOTE(review): lst_free() presumably releases only
			 * the list itself; elements appended earlier may
			 * still leak here -- confirm and use a deep free
			 * if one is available.
			 */
			lst_free(lst);
			lst = NULL;
			break;
		}
	}

	free(p);
	PTRACE(2, "jni:jintArray2lst() done");
	return (lst);
}
/* Read substitution scores from a file and return them as a kind of
 * pseudo substitution matrix.
 *
 * Each non-empty line that does not start with '#' must contain at least
 * three whitespace-separated columns: two tuples and a numeric score.
 * The score is stored at (tuple_index(tuple1), tuple_index(tuple2)).
 * All elements that are not specified remain NEGINFTY, which is to be
 * interpreted as "NA".
 *
 * mod   supplies the matrix dimension and the alphabet
 *       (rate_matrix->size, ->states, ->inv_states)
 * F     open input stream; not closed here
 *
 * Returns a newly allocated Matrix.  Dies on a line with fewer than
 * three columns or a non-numeric score.
 */
Matrix* read_subst_scores(TreeModel *mod, FILE *F) {
  Matrix *retval = mat_new(mod->rate_matrix->size, mod->rate_matrix->size);
  String *line = str_new(STR_MED_LEN), *tuple1, *tuple2;
  List *l = lst_new_ptr(3);
  int alph_size = (int)strlen(mod->rate_matrix->states);
  int *inv_alph = mod->rate_matrix->inv_states;
  double val;
  int i;

  mat_set_all(retval, NEGINFTY);

  while (str_readline(line, F) != EOF) {
    str_double_trim(line);
    if (str_starts_with_charstr(line, "#") || line->length == 0)
      continue;                 /* comment or blank line */

    str_split(line, NULL, l);
    if (lst_size(l) < 3) {
      die("ERROR: wrong number of columns in subst. score file.\n");
    }

    tuple1 = lst_get_ptr(l, 0);
    tuple2 = lst_get_ptr(l, 1);
    if (str_as_dbl(lst_get_ptr(l, 2), &val) != 0) {
      die("ERROR: bad value in subst. score file.\n");
    }

    mat_set(retval,
            tuple_index(tuple1->chars, inv_alph, alph_size),
            tuple_index(tuple2->chars, inv_alph, alph_size),
            val);

    /* release every field produced by str_split, including any extra
       columns beyond the third (previously these were leaked) */
    for (i = 0; i < lst_size(l); i++)
      str_free(lst_get_ptr(l, i));
  }

  lst_free(l);
  str_free(line);
  return retval;
}
/* Trim stop codons off the CDS features of a group where necessary.
 *
 * A '+' strand CDS whose end coincides with the end of a '+' strand stop
 * codon has its end moved back by 3; a '-' strand CDS whose start
 * coincides with the start of a '-' strand stop codon has its start
 * moved forward by 3.  Every feature that is changed is recorded in
 * ends_adjusted or starts_adjusted (both cleared on entry) so that the
 * change can be undone before the data is output.
 */
void exclude_stops(GFF_FeatureGroup *group, List *starts_adjusted,
                   List *ends_adjusted) {
  List *stop_feats = lst_new_ptr(1);
  List *members = group->features;
  int fi, si;

  lst_clear(stop_feats);
  lst_clear(ends_adjusted);
  lst_clear(starts_adjusted);

  /* collect the stop codons first; one is expected at most, but more
     are possible */
  for (fi = 0; fi < lst_size(members); fi++) {
    GFF_Feature *cand = lst_get_ptr(members, fi);
    if (str_equals_charstr(cand->feature, GFF_STOP_TYPE))
      lst_push_ptr(stop_feats, cand);
  }

  /* then compare every CDS against every stop */
  for (fi = 0; fi < lst_size(members); fi++) {
    GFF_Feature *cds = lst_get_ptr(members, fi);

    if (!str_equals_charstr(cds->feature, GFF_CDS_TYPE))
      continue;

    for (si = 0; si < lst_size(stop_feats); si++) {
      GFF_Feature *stop = lst_get_ptr(stop_feats, si);

      if (cds->strand == '+' && stop->strand == '+' &&
          cds->end == stop->end) {
        cds->end -= 3;
        lst_push_ptr(ends_adjusted, cds);
      }
      else if (cds->strand == '-' && stop->strand == '-' &&
               cds->start == stop->start) {
        cds->start += 3;
        lst_push_ptr(starts_adjusted, cds);
      }
    }
  }

  lst_free(stop_feats);
}
/*
 * Read values from standard input until end of input or the first token
 * that cannot be converted, store them in a list, and print the list
 * before and after sorting it with qsort1().
 */
int main(){
	ElemType value;
	int status;
	LIST* numbers;

	printf("Please enter a set of integers.\n");
	numbers = lst_create();

	/* stop on EOF or on the first item scanf fails to convert */
	while ((status = scanf("%i", &value)) != EOF && status != 0)
		lst_push_back(numbers, value);

	printf("The list before the quick sort: \n");
	lst_print(numbers);

	printf("The list after the quick sort: \n");
	qsort1(numbers);
	lst_print(numbers);

	lst_free(numbers);
	return 0;
}
/* Free a CategoryRange: every non-NULL feature-type string it holds,
   the list containing them, and the range object itself. */
void cm_free_category_range(CategoryRange *cr) {
  List *names = cr->feature_types;
  int k = 0;

  while (k < lst_size(names)) {
    String *name = (String*)lst_get_ptr(names, k);
    k++;
    if (name == NULL)
      continue;
    str_free(name);
  }

  lst_free(names);
  sfree(cr);
}
/*
 * Justify the branch of block_current against the branch of
 * block_previous in direction `dir`.
 *
 * The bounding boxes of both branches are computed with
 * get_branch_bounding_box() and handed to do_justify_tree().  Both
 * branch lists are freed before returning.
 */
void justify_tree( t_block *block_previous, t_block *block_current, int dir)
{
	/*
	 * Zero all eight floats of each box.  The previous bzero( box, 8)
	 * cleared only 8 bytes, leaving most of each box uninitialized.
	 */
	float box_previous[8] = {0};
	float box_current[8] = {0};

	t_lst *lst_previous = block_branch_get( block_previous, dir);
	t_lst *lst_current = block_branch_get( block_current, dir);

	get_branch_bounding_box( block_previous, lst_previous, box_previous, dir);
	get_branch_bounding_box( block_current, lst_current, box_current, dir);

	do_justify_tree( block_current, lst_current, box_previous, box_current, dir);

	lst_free( lst_previous);
	lst_free( lst_current);
}
/* Write a textual description of a CategoryMap to F.
 *
 * Output consists of the number of categories, one line per feature
 * type of each category range (name, category number or range and, if
 * present, the comma-separated list of categories it is conditioned
 * on), followed by the LABELLING_PRECEDENCE and FILL_PRECEDENCE lines.
 * The precedence lines list the categories, ordered by compare_prec,
 * whose precedence is not -1.  The order is obtained by pointing the
 * externally defined `prec` at the relevant array before sorting.
 */
void cm_print(CategoryMap *cm, FILE *F) {
  int cat, t, c, pos;
  List *order;

  fprintf(F, "NCATS = %d\n\n", cm->ncats);

  /* one pass per range; jump past the end of each range so that it is
     printed only once */
  cat = 1;
  while (cat <= cm->ncats) {
    CategoryRange *range = cm->ranges[cat];

    for (t = 0; t < lst_size(range->feature_types); t++) {
      String *type = (String*)lst_get_ptr(range->feature_types, t);

      fprintf(F, "%-15s %d", type->chars, range->start_cat_no);
      if (range->end_cat_no > range->start_cat_no)
        fprintf(F, "-%d", range->end_cat_no);

      if (cm->conditioned_on[cat] != NULL) {
        fprintf(F, "\t");
        for (c = 0; c < lst_size(cm->conditioned_on[cat]); c++) {
          const char *sep =
            (c + 1 == lst_size(cm->conditioned_on[cat])) ? "" : ",";
          fprintf(F, "%d%s", lst_get_int(cm->conditioned_on[cat], c), sep);
        }
      }
      fprintf(F, "\n");
    }

    cat = range->end_cat_no + 1;
  }

  /* reconstruct the labelling precedence list */
  order = lst_new_int(cm->ncats + 1);
  for (pos = 0; pos <= cm->ncats; pos++)
    lst_push_int(order, pos);
  prec = cm->labelling_precedence;
  lst_qsort(order, compare_prec);

  fprintf(F, "\nLABELLING_PRECEDENCE = ");
  for (pos = 0; pos <= cm->ncats; pos++) {
    int which = lst_get_int(order, pos);
    if (cm->labelling_precedence[which] == -1)
      continue;
    fprintf(F, "%d%s", which, pos < cm->ncats ? "," : "");
  }
  fprintf(F, "\n");

  /* same again for the fill precedence list */
  lst_clear(order);
  for (pos = 0; pos <= cm->ncats; pos++)
    lst_push_int(order, pos);
  prec = cm->fill_precedence;
  lst_qsort(order, compare_prec);

  fprintf(F, "FILL_PRECEDENCE = ");
  for (pos = 0; pos <= cm->ncats; pos++) {
    int which = lst_get_int(order, pos);
    if (cm->fill_precedence[which] == -1)
      continue;
    fprintf(F, "%d%s", which, pos < cm->ncats ? "," : "");
  }
  fprintf(F, "\n");

  lst_free(order);
}
/* Release the sub-blocks held by a MafBlock.
 *
 * If block->data is set, every MafSubBlock in it is freed, the list is
 * freed and the pointer is reset to NULL.  block->seqlen is reset to 0
 * in all cases.  The block itself is not freed.
 */
void mafBlock_free_data(MafBlock *block) {
  List *subs = block->data;

  if (subs != NULL) {
    int k = 0;
    while (k < lst_size(subs)) {
      mafSubBlock_free((MafSubBlock*)lst_get_ptr(subs, k));
      k++;
    }
    lst_free(subs);
    block->data = NULL;
  }

  block->seqlen = 0;
}
/*
 * Arrange the blocks reachable from `block` towards WEST.
 *
 * Every leaf returned by block_leaves_get() is passed to drive_away()
 * together with `block`, and is then arranged recursively itself.  The
 * list of leaves is freed before returning.
 */
void block_arrange( t_block *block)
{
	t_lst *leaves = block_leaves_get( block, WEST);
	t_link *link = leaves->first;

	while( link)
	{
		t_block *leaf = link->data;

		drive_away( block, leaf, WEST);
		block_arrange( leaf);

		link = link->next;
	}

	lst_free( leaves);
}
/*
 * Free the contents of a linked list of LE elements.
 *
 * For the head element `curr`, the string, clef and value buffers are
 * released; every element after the head is released completely
 * (buffers and node).  The head node itself is NOT freed -- as before,
 * that is left to the caller.  A NULL argument is a no-op.
 *
 * The list is walked iteratively, so arbitrarily long lists cannot
 * exhaust the stack, and the head's pointers are reset to NULL so that
 * it holds no dangling references afterwards.
 */
void lst_free(LE *curr)
{
	LE *node;
	LE *following;

	if (!curr)
		return;

	/* head: release the payload but keep the node */
	free(curr->string);
	free(curr->clef);
	free(curr->value);
	node = curr->next;

	curr->string = NULL;
	curr->clef = NULL;
	curr->value = NULL;
	curr->next = NULL;

	/* remaining elements: release payload and node */
	while (node) {
		following = node->next;
		free(node->string);
		free(node->clef);
		free(node->value);
		free(node);
		node = following;
	}
}
/* Read an amino acid rate matrix in the format used by PAML. Reorder the rows and columns to match 'alph'. Warning: the ordering in the file is assumed to match that used in the files in the PAML distribution (alphabetical order of 3-letter codes), which is also the order of AA_ALPHABET (therefore AA_ALPHABET may not be changed!). Equilibrium frequencies are ignored. */ Matrix *read_paml_matrix(FILE *F, char *alph) { char *paml_alph = "ARNDCQEGHILKMFPSTWYV$"; int size = (int)strlen(paml_alph); Matrix *retval = mat_new(size, size); List *fields = lst_new_ptr(100); String *line = str_new(STR_MED_LEN); int i, j; if (strcmp(alph, paml_alph) != 0) die("ERROR read_paml_matrix (alph (%s) != paml_alph (%s))\n", alph, paml_alph); mat_zero(retval); for (i = 1; i < size-1 && str_readline(line, F) != EOF; ) { /* NOTE: size of matrix allows for stop, but stop not included in file; therefore, only read size-1 lines */ str_double_trim(line); if (line->length == 0) continue; str_split(line, NULL, fields); if (lst_size(fields) != i) { die("ERROR: row %d of matrix must have %d columns.\n", i+1, i); } for (j = 0; j < lst_size(fields); j++) { double val; if (str_as_dbl(lst_get_ptr(fields, j), &val) != 0) { die("ERROR: non-numeric matrix element in subst. matrix ('%s')\n", ((String*)lst_get_ptr(fields, j+1))->chars); } str_free(lst_get_ptr(fields, j)); if (j >= size) die("ERROR read_paml_matrix j (%i) should be < size (%i)\n", j, size); mat_set(retval, i, j, val); mat_set(retval, j, i, val); } i++; } if (i != size - 1) { die("ERROR: too few rows in subst. matrix.\n"); } lst_free(fields); str_free(line); return retval; }
/* Parse an 'i' line of a MAF block and record it in a sub-block.
 *
 * The line must consist of six whitespace-separated fields:
 *   i  src  leftStatus  leftCount  rightStatus  rightCount
 * src has to match the source already stored in `sub` by the preceding
 * s-line.  Each status is a single character out of C, I, N, n, M, T.
 * The parsed values go to sub->iStatus[0..1] and sub->iCount[0..1], and
 * an 'i' entry is appended to sub->lineType.  Dies on malformed input.
 */
void mafBlock_add_iLine(String *line, MafSubBlock *sub) {
  List *fields = lst_new_ptr(6);
  int side, k;

  if (sub->numLine<1 || sub->lineType[0]!='s')
    die("ERROR: got i-Line without preceding s-Line in MAF block\n");

  if (str_split(line, NULL, fields) != 6)
    die("ERROR: expected six fields in MAF line starting with 'i' (got %i)\n",
        lst_size(fields));

  /* field 0: line type */
  if (str_compare_charstr((String*)lst_get_ptr(fields, 0), "i") != 0)
    die("ERROR: mafBlock_add_iLine: field[0] should be 'i', got %s\n",
        ((String*)lst_get_ptr(fields, 0))->chars);

  /* field 1: source name, must agree with the s-line */
  if (str_compare((String*)lst_get_ptr(fields, 1), sub->src) != 0)
    die("iLine sourceName does not match preceding s-Line (%s, %s)\n",
        ((String*)lst_get_ptr(fields, 1))->chars, sub->src->chars);

  /* fields 2/3: left status and count; fields 4/5: right status and
     count */
  for (side = 0; side < 2; side++) {
    String *status = (String*)lst_get_ptr(fields, 2 * side + 2);
    String *count;

    if (status->length != 1)
      die("ERROR: i-Line got illegal %sStatus = %s\n",
          side==0 ? "left": "right", status->chars);

    sub->iStatus[side] = status->chars[0];
    switch (sub->iStatus[side]) {
    case 'C':
    case 'I':
    case 'N':
    case 'n':
    case 'M':
    case 'T':
      break;
    default:
      die("ERROR: i-Line got illegal %sStatus = '%c'\n",
          side==0 ? "left" : "right", sub->iStatus[side]);
    }

    count = (String*)lst_get_ptr(fields, 2 * side + 3);
    sub->iCount[side] = atoi(count->chars);
  }

  for (k = 0; k < 6; k++)
    str_free((String*)lst_get_ptr(fields, k));
  lst_free(fields);

  sub->lineType[sub->numLine++] = 'i';
}
/* Parse a 'q' (quality) line of a MAF block and attach it to a
 * sub-block.
 *
 * The line must consist of three whitespace-separated fields:
 *   q  src  quality
 * src has to match the source stored in `sub` by the preceding s-line,
 * and the quality string must have the same length as sub->seq.  Where
 * the alignment has a gap ('-') the quality must be '-' as well;
 * everywhere else it must be a digit '0'-'9' or 'F'.
 *
 * On success the quality String becomes owned by `sub` (sub->quality),
 * which is why only the first two fields are freed here, and a 'q'
 * entry is appended to sub->lineType.  Dies on malformed input.
 */
void mafBlock_add_qLine(String *line, MafSubBlock *sub) {
  List *l = lst_new_ptr(3);
  String *str;
  int i;

  if (sub->numLine<1 || sub->lineType[0]!='s')
    die("ERROR: got q-Line without preceding s-Line in MAF block\n");

  if (3 != str_split(line, NULL, l))
    die("ERROR: expected three fields in q-Line of maf file, got %i\n",
        lst_size(l));

  /* field[0] should be 'q' */
  if (!(str_compare_charstr((String*)lst_get_ptr(l, 0), "q")==0))
    die("ERROR mafBlock_add_qLine expected 'q' got %s\n",
        ((String*)lst_get_ptr(l, 0))->chars);

  /* field[1] should be src, and should match src already set in sub */
  if (str_compare((String*)lst_get_ptr(l, 1), sub->src) != 0)
    die("qLine sourceName does not match preceding s-Line (%s, %s)\n",
        ((String*)lst_get_ptr(l, 1))->chars, sub->src->chars);

  /* field[2] should be quality */
  if (sub->seq == NULL)
    die("ERROR mafBlock_add_qLine: sub->seq is NULL\n");
  str = (String*)lst_get_ptr(l, 2);
  if (sub->seq->length != str->length)
    die("ERROR: length of q-line does not match sequence length\n");
  sub->quality = str;

  for (i=0; i<sub->quality->length; i++) {
    char q = sub->quality->chars[i];

    if (sub->seq->chars[i] == '-') {
      if (q != '-')
        die("ERROR: got quality score where alignment char is gap\n");
    }
    else {
      /* legal scores are 'F' or a digit; the earlier test joined the
         two range checks with && and therefore could never fire */
      if (q != 'F' && (q < '0' || q > '9'))
        die("ERROR: Illegal quality score '%c' in MAF block\n", q);
    }
  }

  /* fields 0 and 1 are no longer needed; field 2 lives on in sub */
  for (i=0; i<2; i++)
    str_free((String*)lst_get_ptr(l, i));
  lst_free(l);

  sub->lineType[sub->numLine++] = 'q';
}
/* Translate a list of spooled category names/numbers into the list of
 * corresponding unspooled category numbers.
 *
 * The spooled categories are first resolved with cm_get_category_list().
 * If the map has no unspooler that list is returned directly; otherwise
 * a new list is returned that contains every unspooled state whose
 * spooled counterpart was selected, in increasing order of state
 * number.
 */
List *cm_get_unspooled_list(CategoryMap *cm, List *spooled) {
  int selected[cm->ncats+1];    /* indexed by spooled category number */
  List *spooled_cats, *result;
  int k, state;

  spooled_cats = cm_get_category_list(cm, spooled, 0);
  if (cm->unspooler == NULL)
    return spooled_cats;

  result = lst_new_int(lst_size(spooled_cats) * 3);

  for (k = 0; k <= cm->ncats; k++)
    selected[k] = 0;
  for (k = 0; k < lst_size(spooled_cats); k++) {
    int catno = lst_get_int(spooled_cats, k);
    selected[catno] = 1;
  }

  for (state = 0; state < cm->unspooler->nstates_unspooled; state++) {
    int catno = cm->unspooler->unspooled_to_spooled[state];
    if (selected[catno])
      lst_push_int(result, state);
  }

  lst_free(spooled_cats);
  return result;
}
/*
 * JNI entry point for Archiver.activateCfg().
 *
 * Calls activate_archiver_cfg() and maps its result code:
 *   -1  internal error: an exception is thrown, NULL is returned
 *   -2  archiver.cmd errors: the messages are converted to a Java
 *       String array and thrown as a multi-message exception; NULL is
 *       returned
 *   -3  archiver.cmd warnings: the messages are converted to a Java
 *       String array, which is returned
 *   any other value is treated as success and NULL is returned
 */
JNIEXPORT jobjectArray
Java_com_sun_netstorage_samqfs_mgmt_arc_Archiver_activateCfg(JNIEnv *env,
    jclass cls /*ARGSUSED*/, jobject ctx)
{
	sqm_lst_t *err_warn_lst;
	jobjectArray warnArr, errArr;
	int res;

	PTRACE(1, "jni:Archiver_activateCfg() entry");
	res = activate_archiver_cfg(CTX, &err_warn_lst);
	PTRACE(1, "jni:activateCfg returned %d, lst[%d]", res,
	    (res == -2 || res == -3) ? err_warn_lst->length : -1);

	if (res == -1) {
		/* internal error */
		ThrowEx(env);
		return (NULL);
	}

	if (res == -2) {
		/* archiver.cmd errors */
		errArr = lst2jarray(env, err_warn_lst,
		    "java/lang/String", charr2String);
		lst_free_deep(err_warn_lst);
		ThrowMultiMsgEx(env, errArr);
		return (NULL);
	}

	if (res == -3) {
		/* archiver.cmd warnings */
		warnArr = lst2jarray(env, err_warn_lst,
		    "java/lang/String", charr2String);
		lst_free_deep(err_warn_lst);
	} else {
		/* success */
		lst_free(err_warn_lst);
		warnArr = NULL;
	}

	PTRACE(1, "jni:Archiver_activateCfg() done");
	return (warnArr);
}
/* Reorder the sub-blocks of a MAF block according to a list of species
 * names.
 *
 * Sub-blocks are placed in the order given by specNameOrder (a list of
 * String*).  Names that do not occur in the block are skipped;
 * sub-blocks whose name is not listed are freed and dropped from the
 * block.  block->data and block->specMap are replaced by newly built
 * versions, the new map being keyed by both the source name and the
 * species name.  Dies if a species is listed twice.
 */
void mafBlock_reorder(MafBlock *block, List *specNameOrder) {
  int nOld = lst_size(block->data);
  int nWanted = lst_size(specNameOrder);
  int *kept = smalloc(nOld * sizeof(int));
  List *ordered;
  Hashtable *freshMap;
  int k;

  for (k = 0; k < nOld; k++)
    kept[k] = 0;

  ordered = lst_new_ptr(nOld);
  freshMap = hsh_new(100);

  /* move the requested sub-blocks over, in the requested order */
  for (k = 0; k < nWanted; k++) {
    String *name = (String*)lst_get_ptr(specNameOrder, k);
    int oldIdx = hsh_get_int(block->specMap, name->chars);
    MafSubBlock *sub;

    if (oldIdx == -1)
      continue;                 /* species not present in this block */
    if (kept[oldIdx] == 1)
      die("ERROR: species %s appears twice in reorder list\n", name->chars);

    sub = (MafSubBlock*)lst_get_ptr(block->data, oldIdx);
    hsh_put_int(freshMap, sub->src->chars, lst_size(ordered));
    hsh_put_int(freshMap, sub->specName->chars, lst_size(ordered));
    lst_push_ptr(ordered, (void*)sub);
    kept[oldIdx] = 1;
  }

  /* discard whatever was not requested */
  for (k = 0; k < nOld; k++) {
    if (!kept[k])
      mafSubBlock_free((MafSubBlock*)lst_get_ptr(block->data, k));
  }

  hsh_free(block->specMap);
  lst_free(block->data);
  block->specMap = freshMap;
  block->data = ordered;
  sfree(kept);
}
/* Free all memory associated with a category map: its category ranges,
 * the conditioned_on lists, the precedence arrays, the unspooler (if
 * any) and the map itself.
 */
void cm_free(CategoryMap *cm) {
  int cat = 0;

  while (cat <= cm->ncats) {
    CategoryRange *range = cm->ranges[cat];
    int span = 0;

    if (range != NULL) {
      /* the remaining slots covered by this range are skipped below,
         so the range is freed only once */
      span = range->end_cat_no - range->start_cat_no;
      cm_free_category_range(range);
    }

    if (cm->conditioned_on[cat] != NULL)
      lst_free(cm->conditioned_on[cat]);

    cat += span + 1;
  }

  sfree(cm->ranges);
  sfree(cm->conditioned_on);
  sfree(cm->labelling_precedence);
  sfree(cm->fill_precedence);

  if (cm->unspooler != NULL)
    cm_free_unspooler(cm->unspooler);

  sfree(cm);
}
/* Free a single UnspoolNode together with its list of children.  Only
   the list is released here, not the nodes it refers to. */
void cm_free_unspool_node(UnspoolNode *n) {
  List *kids = n->children;

  lst_free(kids);
  sfree(n);
}
/* Dispose of a list of problem objects: the list is emptied with
   problems_clear() and then freed itself. */
void problems_free(List *problems)
{
  problems_clear(problems);     /* empty the list first */
  lst_free(problems);           /* then drop the list itself */
}
/* Reconstruct indels by parsimony and set all base probabilities to -1
 * at ancestral nodes where a base is inferred not to have been present.
 *
 * For every column tuple of the alignment, the leaves carrying
 * non-gap characters determine a last common ancestor (LCA).  Internal
 * nodes outside the subtree rooted at the LCA are marked as gaps; inside
 * the subtree Dollo parsimony is used when more than one leaf has a
 * gap.  Results are written to mod->tree_posteriors->base_probs.
 *
 * msa   alignment with sufficient statistics (msa->ss)
 * mod   tree model; mod->msa_seq_idx is built on demand
 *
 * Dies if a sequence name has no matching node in the tree.
 */
void do_indels(MSA *msa, TreeModel *mod) {
  int s, tup, i, j;
  TreeNode *n, *lca;
  char c;
  typedef enum {IGNORE, GAP, BASE, MISSING, AMBIG} label_type;
  List *postorder;
  label_type *label = smalloc(mod->tree->nnodes * sizeof(label_type));
  List *inside = lst_new_ptr(mod->tree->nnodes),
    *outside = lst_new_ptr(mod->tree->nnodes),
    *ambig_cases = lst_new_ptr(mod->tree->nnodes);
  int *seq_to_leaf = smalloc(msa->nseqs * sizeof(int));

  /* build mapping from seqs to leaf indices in tree */
  for (s = 0; s < msa->nseqs; s++) {
    TreeNode *leaf = tr_get_node(mod->tree, msa->names[s]);
    if (leaf == NULL)
      die("ERROR: no match for sequence \"%s\" in tree.\n", msa->names[s]);
    seq_to_leaf[s] = leaf->id;
  }

  if (mod->msa_seq_idx == NULL)
    tm_build_seq_idx(mod, msa);

  postorder = tr_postorder(mod->tree);

  for (tup = 0; tup < msa->ss->ntuples; tup++) {
    int min = mod->tree->nnodes, max = -1, ngaps = 0, skip_root = FALSE;

    /* find min and max ids of seqs that actually have bases (non-gaps) */
    for (s = 0; s < msa->nseqs; s++) {
      if (ss_get_char_tuple(msa, tup, s, 0) == GAP_CHAR) {
        ngaps++;
        continue;
      }
      if (seq_to_leaf[s] < min) min = seq_to_leaf[s];
      if (seq_to_leaf[s] > max) max = seq_to_leaf[s];
      /* NOTE: missing data being handled like bases here; in some
         cases, a base may be inferred at an ancestral node, when the
         only evidence for it is missing data in the leaves.  There are
         ambiguous cases; we'll err on the side of predicting bases
         rather than indels */
    }

    if (ngaps <= 1)
      continue;       /* short cut -- impossible to infer gaps in ancestors */

    else if (ngaps >= msa->nseqs - 1) {
      /* in this case, all ancestors must be gaps */
      for (i = 0; i < mod->tree->nnodes; i++) {
        n = lst_get_ptr(mod->tree->nodes, i);
        if (n->lchild == NULL || n->rchild == NULL)
          continue;             /* ignore leaves */
        for (j = 0; j < mod->rate_matrix->size; j++)
          mod->tree_posteriors->base_probs[0][j][n->id][tup] = -1;
                                /* mark as gap */
      }
      continue;
    }

    /* min and max are ints, so they are reported with %d (they used to
       be passed to %e, which is undefined behavior) */
    if (min < 0)
      die("prequel.c: min = %d < 0\n", min);
    if (max < min)
      die("prequel.c: max (%d) < min (%d)\n", max, min);

    /* the LCA of all leaves with non-gaps must be the first ancestor of
       the node with the max id that has an id smaller than the min
       id.  This is based on the assumption that node ids are assigned
       sequentially in a preorder traversal of the tree, which will be
       true as long as the tree is read from a Newick file by the code
       in trees.c */
    for (lca = lst_get_ptr(mod->tree->nodes, max); lca->id > min;
         lca = lca->parent);

    /* by parsimony, the base was inserted on the branch to the LCA,
       and all ancestral nodes outside the subtree rooted at the LCA
       did not have bases */

    if (lca == mod->tree->lchild || lca == mod->tree->rchild)
      skip_root = TRUE;       /* don't mark root as gap in this case:
                                 can't distinguish insertion from
                                 deletion so assume deletion */

    /* mark ancestral bases outside subtree beneath LCA as gaps */
    tr_partition_nodes(mod->tree, lca, inside, outside);
    for (i = 0; i < mod->tree->nnodes; i++)
      label[i] = BASE;
    for (i = 0; i < lst_size(outside); i++) {
      n = lst_get_ptr(outside, i);
      label[n->id] = IGNORE;
      if (n->lchild == NULL || n->rchild == NULL)
        continue;               /* skip leaves */
      if (n == mod->tree && skip_root)
        continue;               /* skip root if condition above */
      for (j = 0; j < mod->rate_matrix->size; j++)
        mod->tree_posteriors->base_probs[0][j][n->id][tup] = -1;
                                /* mark as gap */
    }

    /* check for gaps in subtree; if there's at most one, we can go
       on; otherwise have to use parsimony to infer history in
       subtree */
    ngaps = 0;
    for (i = 0; i < lst_size(inside); i++) {
      n = lst_get_ptr(inside, i);
      if (n->lchild == NULL &&
          ss_get_char_tuple(msa, tup, mod->msa_seq_idx[n->id], 0) == GAP_CHAR)
        ngaps++;
    }
    if (ngaps <= 1)
      continue;

    /* use Dollo parsimony to infer the indel history of the subtree
       beneath the LCA.  Use the fact that every base must have a
       chain of bases to the LCA, because, assuming the alignment is
       correct, no insertions are possible beneath the LCA */
    lst_clear(ambig_cases);
    for (i = 0; i < lst_size(postorder); i++) {
      n = lst_get_ptr(postorder, i);
      if (label[n->id] == IGNORE)
        continue;               /* outside subtree */

      /* MISSING means all leaves beneath node have missing data */
      /* AMBIG means combination of gaps and missing data beneath node */

      else if (n->lchild == NULL) {  /* leaf in subtree */
        c = ss_get_char_tuple(msa, tup, mod->msa_seq_idx[n->id], 0);
        if (c == GAP_CHAR)
          label[n->id] = GAP;
        else if (msa->is_missing[(int)c])
          label[n->id] = MISSING;
        else
          label[n->id] = BASE;
      }
      else {                    /* internal node in subtree */
        if (label[n->lchild->id] == BASE || label[n->rchild->id] == BASE)
          label[n->id] = BASE;  /* by Dollo parsimony */
        else if ((label[n->lchild->id] == GAP ||
                  label[n->lchild->id] == AMBIG) &&
                 (label[n->rchild->id] == GAP ||
                  label[n->rchild->id] == AMBIG))
          label[n->id] = GAP;   /* gaps from both sides and no bases --
                                   must be gap */
        else if (label[n->lchild->id] == MISSING &&
                 label[n->rchild->id] == MISSING)
          label[n->id] = MISSING;
        else {                  /* must be GAP/MISSING or AMBIG/MISSING */
          label[n->id] = AMBIG;
          lst_push_ptr(ambig_cases, n);
        }
      }
    }

    /* now resolve any ambiguities, by giving each ambiguous node the
       same label as its parent; traversing ambig_cases in reverse
       order ensures that parents are visited before children */

    /* first make sure root of subtree has a base */
    if (label[lca->id] == MISSING || label[lca->id] == AMBIG)
      label[lca->id] = BASE;
    /* in this case there is all missing data and gaps beneath the LCA;
       hard to know what is right, but let's force a base and err on
       the side of bases rather than gaps */

    for (i = lst_size(ambig_cases) - 1; i >= 0; i--) {
      n = lst_get_ptr(ambig_cases, i);
      if (n == lca)
        continue;
      else
        label[n->id] = label[n->parent->id];
    }

    /* now mark gaps inside subtree, as needed */
    for (i = 0; i < lst_size(inside); i++) {
      n = lst_get_ptr(inside, i);
      if (n->lchild == NULL || n->rchild == NULL)
        continue;
      if (label[n->id] == GAP)
        for (j = 0; j < mod->rate_matrix->size; j++)
          mod->tree_posteriors->base_probs[0][j][n->id][tup] = -1;
    }
  }

  lst_free(inside);
  lst_free(outside);
  lst_free(ambig_cases);
  sfree(seq_to_leaf);
  sfree(label);
}
int main(int argc, char* argv[]) { FILE* F; MSA *msa; int *msa_gap_patterns = NULL; HMM *hmm = NULL; TreeNode *tree = NULL; int i, input_format = SS, msa_idx, quiet_mode = FALSE, ncats, nmsas, ncats_unspooled, indel_nseqs = -1; String *msa_fname, *gff_fname; List *gff_fname_list = NULL, *msa_fname_list = NULL, *msa_length_list = NULL, *model_indels_str = NULL; Matrix *traincounts = NULL; Vector *begcounts = NULL, *statecounts = NULL; CategoryMap *cm = NULL; char c; GapPatternMap *gpm = NULL; GFF_Set *gff; char *reverse_groups_tag = NULL; while ((c = getopt(argc, argv, "i:g:c:m:M:R:I:n:t:P:G:qh")) != -1) { switch(c) { case 'i': input_format = msa_str_to_format(optarg); if (input_format == -1) die("ERROR: bad alignment format.\n"); break; case 'g': gff_fname_list = get_arg_list(optarg); break; case 'c': cm = cm_new_string_or_file(optarg); break; case 'm': msa_fname_list = get_arg_list(optarg); break; case 'M': msa_length_list = str_list_as_int(get_arg_list(optarg)); break; case 'R': reverse_groups_tag = optarg; break; case 'I': model_indels_str = get_arg_list(optarg); break; case 'n': indel_nseqs = get_arg_int(optarg); break; case 't': if (optarg[0] == '(') /* in this case, assume topology given at command line */ tree = tr_new_from_string(optarg); else tree = tr_new_from_file(phast_fopen(optarg, "r")); break; case 'q': quiet_mode = TRUE; break; case 'h': print_usage(); exit(0); case '?': die("ERROR: unrecognized option.\n\nType 'hmm_train -h' for usage.\n"); } } if (msa_fname_list == NULL) die("ERROR: -m required. Type 'hmm_train -h' for usage.\n"); if (gff_fname_list == NULL) die("ERROR: -g required in training mode. Type 'hmm_train -h' for usage.\n"); if (msa_length_list != NULL && msa_fname_list != NULL) die("ERROR: -m and -M are mutually exclusive. Type 'hmm_train -h' for usage.\n"); if (model_indels_str != NULL && tree == NULL) die("ERROR: -I requires -t. 
Type 'hmm_train -h' for usage.\n"); if (cm == NULL) die("ERROR: category map required.\n"); set_seed(-1); ncats = cm->ncats + 1; ncats_unspooled = cm->unspooler != NULL ? cm->unspooler->nstates_unspooled : ncats; nmsas = (msa_length_list != NULL ? lst_size(msa_length_list) : lst_size(msa_fname_list)); if (model_indels_str != NULL) { if (tree == NULL) die("ERROR: tree is NULL\n"); /*FIXME: indel_ncats broken */ gpm = gp_create_gapcats(cm, model_indels_str, tree, FALSE); ncats = cm->ncats + 1; /* numbers will change */ ncats_unspooled = cm->unspooler == NULL ? ncats : cm->unspooler->nstates_unspooled; } /* allocate memory for storage of "training paths" */ traincounts = mat_new(ncats_unspooled, ncats_unspooled); statecounts = vec_new(ncats_unspooled); begcounts = vec_new(ncats_unspooled); mat_zero(traincounts); vec_zero(statecounts); vec_zero(begcounts); /* create skeleton of new HMM. */ hmm = hmm_new_nstates(ncats_unspooled, 0, 0); /* Main loop: consider each MSA in turn */ for (msa_idx = 0; msa_idx < nmsas; msa_idx++) { if (msa_fname_list != NULL) { msa_fname = (String*)lst_get_ptr(msa_fname_list, msa_idx); F = phast_fopen(msa_fname->chars, "r"); if (!quiet_mode) fprintf(stderr, "Reading alignment from %s ...\n", F == stdin ? 
"stdin" : msa_fname->chars); msa = msa_new_from_file(F, NULL); phast_fclose(F); } else { /* only lengths of alignments specified */ msa = msa_new(NULL, NULL, 0, lst_get_int(msa_length_list, msa_idx), NULL); /* just a shell in this case */ } gff_fname = (String*)lst_get_ptr(gff_fname_list, msa_idx); if (!quiet_mode) fprintf(stderr, "Reading annotations from %s ...\n", gff_fname->chars); gff = gff_read_set(phast_fopen(gff_fname->chars, "r")); /* convert GFF to coordinate frame of alignment */ if (msa_length_list == NULL) { if (!quiet_mode) fprintf(stderr, "Mapping annotations to alignment ...\n"); msa_map_gff_coords(msa, gff, 1, 0, 0); /* assume seq 1 is ref */ } if (model_indels_str != NULL) { if (!quiet_mode) fprintf(stderr, "Obtaining gap patterns ...\n"); msa_gap_patterns = smalloc(msa->length * sizeof(int)); gp_set_phylo_patterns(gpm, msa_gap_patterns, msa); } /* at this point, we don't actually need the alignment anymore; if using ordered suff stats (likely with large data sets), can free them now, to avoid running out of memory */ if (msa->ss != NULL) { ss_free(msa->ss); msa->ss = NULL; } if (reverse_groups_tag != NULL) { if (!quiet_mode) fprintf(stderr, "Reverse complementing features on negative strand (group by '%s') ...\n", reverse_groups_tag); /* we don't need to reverse complement the whole alignment -- just the gff and possibly the gap pattern array (pass a NULL msa) */ gff_group(gff, reverse_groups_tag); msa_reverse_compl_feats(NULL, gff, msa_gap_patterns); } if (!quiet_mode) fprintf(stderr, "Labeling sites by category ...\n"); msa_label_categories(msa, gff, cm); gff_free_set(gff); if (model_indels_str != NULL) { if (!quiet_mode) fprintf(stderr, "Remapping categories according to gap patterns ...\n"); if (indel_nseqs > 0 && indel_nseqs != msa->nseqs) { /* in this case, we'll simply reassign non-trivial gap patterns randomly. 
This will achieve the desired effect with minimal coding, as long as the number of sites is not too small (the indel model is probably useless anyway if the number is small) */ int pat, newpat; int npatterns = 4 * indel_nseqs - 5; int complex_allowed[cm->ncats+1]; List *no_complex_names, *no_complex_nums; if (!quiet_mode) fprintf(stderr, "(target number of sequences: %d)\n", indel_nseqs); /* set up index indicating by cat no. whether complex gaps are allowed */ for (i = 0; i < ncats; i++) complex_allowed[i] = 1; no_complex_names = lst_new_ptr(10); str_split(str_new_charstr(NO_COMPLEX), ",", no_complex_names); no_complex_nums = cm_get_category_list(cm, no_complex_names, 1); for (i = 0; i < lst_size(no_complex_nums); i++) complex_allowed[lst_get_int(no_complex_nums, i)] = 0; lst_free(no_complex_nums); lst_free_strings(no_complex_names); lst_free(no_complex_names); /* now reassign all non-null numbers */ for (i = 0; i < msa->length; ) { if ((pat = msa_gap_patterns[i]) != 0) { if (complex_allowed[msa->categories[i]]) newpat = 1 + ((double)npatterns * unif_rand()); /* random number in interval [1, npatterns] */ else newpat = 1 + ((double)(npatterns-1) * unif_rand()); /* random number in interval [1,npatterns-1] (excludes complex gap pattern) */ for (; i < msa->length && msa_gap_patterns[i] == pat; i++) msa_gap_patterns[i] = newpat; /* change for whole sequence */ } else i++; } } /* obtain gapped category number for each site */ for (i = 0; i < msa->length; i++) if (gpm->cat_x_pattern_to_gapcat[msa->categories[i]] != NULL) msa->categories[i] = gpm->cat_x_pattern_to_gapcat[msa->categories[i]][msa_gap_patterns[i]]; } if (!quiet_mode) fprintf(stderr, "Unspooling categories ...\n"); cm_spooled_to_unspooled(cm, msa->categories, msa->length); if (!quiet_mode) fprintf(stderr, "Collecting training data ...\n"); hmm_train_update_counts(traincounts, statecounts, begcounts, msa->categories, msa->length, ncats_unspooled); if (msa_gap_patterns != NULL) sfree(msa_gap_patterns); 
msa_free(msa); } /* now train HMM, using cumulative data */ hmm_train_from_counts(hmm, traincounts, NULL, statecounts, NULL, begcounts, NULL); /* if modeling indels, adjust begin transitions so probability is distributed among different "gap pattern" states that all correspond to the same ungapped state (category); this helps avoid problems that occur when training on a few large sequences (e.g., whole chromosomes) and then testing on many shorter ones */ if (model_indels_str != NULL) { double tprob[gpm->ncats]; int nst[gpm->ncats]; /* total prob and number of states per spooled, ungapped category */ for (i = 0; i < gpm->ncats; i++) tprob[i] = nst[i] = 0; for (i = 0; i < hmm->nstates; i++) { if (vec_get(hmm->begin_transitions, i) > 0) /* have to go from unspooled space to spooled space, then to ungapped space (HMM states correspond to unspooled, gapped categories). Note that states with nonzero begin probs shouldn't be conditioned on other states. */ tprob[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]] += vec_get(hmm->begin_transitions, i); nst[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]]++; } for (i = 0; i < hmm->nstates; i++) if (tprob[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]] > 0) vec_set(hmm->begin_transitions, i, tprob[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]] / nst[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]]); /* (uniform prior) */ } /* write trained HMM */ hmm_print(stdout, hmm); if (!quiet_mode) fprintf(stderr, "Done.\n"); return 0; }
/** Create a list of surfaces. */ list356_t* get_surfaces() { list356_t* surfaces = make_list() ; // Table. point3_t vertices[8] = { {0, 0, 1}, {8, 0, 1}, {0, 8, 1}, {8, 8, 1}, {0, 0, -1}, {8, 0, -1}, {0, 8, -1}, {8, 8, -1}, } ; int indices[] = { 0, 1, 3, 0, 3, 2, // top 0, 2, 6, 0, 6, 4, // left 4, 6, 7, 4, 7, 5, // bottom 1, 5, 7, 1, 7, 3, // right 2, 3, 7, 2, 7, 6, // back 0, 4, 5, 0, 5, 1 // front } ; int top_offset = 6 ; int offset = 36 ; list356_t* table_surfaces = make_list() ; for (int i=0; i<top_offset/3; ++i) { lst_add(table_surfaces, make_triangle( vertices[indices[3*i]], vertices[indices[3*i+1]], vertices[indices[3*i+2]], &RED, &RED, &WHITE, 10.0f)) ; } for (int i=top_offset/3; i<offset/3; ++i) { lst_add(table_surfaces, make_triangle( vertices[indices[3*i]], vertices[indices[3*i+1]], vertices[indices[3*i+2]], &GREEN, &GREEN, &WHITE, 10.0f)) ; } // Two purple spheres. lst_add(table_surfaces, make_sphere(6, 6, 1.75+.01, .75, &PURPLE, &PURPLE, &WHITE, 100.0f)) ; lst_add(table_surfaces, make_sphere(5, 2, 1.75+.01, .75, &PURPLE, &PURPLE, &WHITE, 100.0f)) ; // Transparent cube. point3_t cube_vertices[] = { {4, 0, 3}, {5, 0, 3}, {4, 1, 3}, {5, 1, 3}, {4, 0, 1.01}, {5, 0, 1.01}, {4, 1, 1.01}, {5, 1, 1.01}, } ; for (int i=0; i<offset/3; ++i) { surface_t* t = make_triangle( cube_vertices[indices[3*i]], cube_vertices[indices[3*i+1]], cube_vertices[indices[3*i+2]], &BLACK, &BLACK, &WHITE, 10.0f) ; t->refr_index = 1.1f ; t->atten = &GREENISH ; lst_add(surfaces, t) ; } list356_itr_t* itr = lst_iterator(table_surfaces) ; while (lst_has_next(itr)) lst_add(surfaces, lst_next(itr)) ; lst_free(table_surfaces) ; // Plane at z=-1. surface_t* plane = make_plane( (point3_t){0, 0, -1}, (point3_t){1, 0, -1}, (point3_t){1, 1, -1}, &LIGHT_GREY, &LIGHT_GREY, &BLACK, 10.0f) ; plane->refl_color = &LIGHT_GREY ; lst_add(surfaces, plane) ; return surfaces ; }
int main(int argc, char *argv[]) { char c; int opt_idx; GFF_Set *gff; List *include = NULL; char *groupby = "transcript_id", *exongroup_tag = NULL; int unique = FALSE, sort = FALSE, simplebed = FALSE, fix_start_stop = FALSE, add_utrs = FALSE, add_introns = FALSE, add_signals = FALSE; enum {GFF, BED, GENEPRED, WIG} output_format = GFF; FILE *discards_f = NULL, *groups_f = NULL; struct option long_opts[] = { {"output", 1, 0, 'o'}, {"include-only", 1, 0, 'i'}, {"include-groups", 1, 0, 'l'}, {"groupby", 1, 0, 'g'}, {"exongroup", 1, 0, 'e'}, {"add-utrs", 0, 0, 'U'}, {"add-introns", 0, 0, 'I'}, {"add-signals", 0, 0, 'S'}, {"fix-start-stop", 0, 0, 'f'}, {"unique", 0, 0, 'u'}, {"sort", 0, 0, 's'}, {"simplebed", 0, 0, 'b'}, {"discards", 1, 0, 'd'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} }; while ((c = (char)getopt_long(argc, argv, "o:i:l:g:e:d:UISfusbh", long_opts, &opt_idx)) != -1) { switch (c) { case 'o': if (!strcmp("bed", optarg)) output_format = BED; else if (!strcmp("genepred", optarg)) output_format = GENEPRED; else if (!strcmp("wig", optarg)) output_format = WIG; else if (strcmp("gff", optarg)) die("ERROR: bad output format.\n"); break; case 'i': include = get_arg_list(optarg); break; case 'l': groups_f = phast_fopen(optarg, "r"); break; case 'g': groupby = optarg; break; case 'e': exongroup_tag = optarg; break; case 'U': add_utrs = TRUE; break; case 'I': add_introns = TRUE; break; case 'S': add_signals = TRUE; break; case 'f': fix_start_stop = TRUE; break; case 'u': unique = TRUE; break; case 'b': simplebed = TRUE; output_format = BED; break; case 'd': discards_f = phast_fopen(optarg, "w+"); break; case 's': sort = TRUE; break; case 'h': usage(argv[0]); case '?': die("Bad argument. Try '%s -h'.\n", argv[0]); } } if (optind != argc - 1) die("Input filename required. 
Try '%s -h'.\n", argv[0]); set_seed(-1); gff = gff_read_set(phast_fopen(argv[optind], "r")); if (lst_size(gff->features) == 0) exit(0); /* helps avoid unexpected behavior below */ /* filter by type */ if (include != NULL) gff_filter_by_type(gff, include, FALSE, discards_f); /* group */ gff_group(gff, groupby); /* utrs, introns, & signals */ if (add_utrs) gff_create_utrs(gff); if (add_introns) gff_create_introns(gff); if (add_signals) gff_create_signals(gff); /* subgroup */ if (exongroup_tag != NULL) gff_exon_group(gff, exongroup_tag); /* filter by group */ if (groups_f != NULL) { String *s = str_new(STR_LONG_LEN); List *groups = lst_new_ptr(10000); str_slurp(s, groups_f); str_split(s, NULL, groups); gff_filter_by_group(gff, groups); lst_free_strings(groups); lst_free(groups); str_free(s); } /* sort */ if (sort) gff_sort(gff); /* make unique */ if (unique) gff_remove_overlaps(gff, discards_f); if (fix_start_stop) gff_fix_start_stop(gff); if (output_format == BED) gff_print_bed(stdout, gff, !simplebed); else if (output_format == GENEPRED) gff_print_genepred(stdout, gff); else if (output_format == WIG) wig_print(stdout, gff); else gff_print_set(stdout, gff); gff_free_set(gff); return 0; }
/* Looks for alignment gaps inside a cds, one sequence at a time.  The
   result is one of:
     NGAPS            no gap characters anywhere in the feature;
     FSHIFT_BAD       some run of gaps has a length that is not a multiple
                      of three (a new Problem describing that run is stored
                      in *problem and scanning stops);
     NOVRLP_CLN_GAPS  all runs are multiples of three, no two distinct runs
                      overlap, and none lies within 3 sites of a boundary;
     CLN_GAPS         all runs are multiples of three, but some overlap or
                      lie near a boundary.
   Compensatory indels are not considered here; that is handled by a
   separate special-purpose function. */
int scan_for_gaps(GFF_Feature *feat, MSA *msa, Problem **problem) {
  const int first = feat->start - 1;   /* feature bounds as 0-based */
  const int last = feat->end - 1;      /* alignment columns, inclusive */
  int seq, pos, k;
  int touches_boundary = 0;
  cds_gap_type status = NGAPS;
  List *found = lst_new_ptr(10);

  for (seq = 0; status != FSHIFT_BAD && seq < msa->nseqs; seq++) {
    pos = first;
    while (pos <= last) {
      int lo, hi;
      struct gap *rec;

      if (ss_get_char_pos(msa, pos, seq, 0) != GAP_CHAR) {
        pos++;
        continue;
      }

      /* widen to the maximal run of gap characters around pos, clipped to
         the feature; lo and hi end up inclusive */
      lo = pos - 1;
      while (lo >= first && ss_get_char_pos(msa, lo, seq, 0) == GAP_CHAR)
        lo--;
      lo++;

      hi = pos + 1;
      while (hi <= last && ss_get_char_pos(msa, hi, seq, 0) == GAP_CHAR)
        hi++;
      hi--;

      if ((hi - lo + 1) % 3 != 0) {
        /* frame shift: report it and give up on this feature */
        status = FSHIFT_BAD;
        *problem = problem_new(feat, FSHIFT, lo, hi);
        (*problem)->cds_gap = FSHIFT_BAD;
        break;
      }

      /* remember whether any gap comes within 3 sites of a cds boundary */
      if (lo <= first + 3 || hi >= last - 3)
        touches_boundary = 1;

      if (status == NGAPS)
        status = CLN_GAPS;

      rec = smalloc(sizeof(struct gap));
      rec->start = lo;
      rec->end = hi;
      lst_push_ptr(found, rec);

      pos = hi + 1;             /* resume just past this run */
    }
  }

  if (status == CLN_GAPS) {
    /* Sort the runs and compare neighbours: two runs that intersect
       without being identical count as an overlap.  The boundary
       criterion is deliberately folded into the same result. */
    int overlap = 0;

    lst_qsort(found, gap_compare);
    for (k = 1; !overlap && k < lst_size(found); k++) {
      struct gap *prev = lst_get_ptr(found, k - 1);
      struct gap *cur = lst_get_ptr(found, k);
      int identical = (cur->start == prev->start && cur->end == prev->end);
      if (cur->start <= prev->end && !identical)
        overlap = 1;
    }

    status = (overlap || touches_boundary) ? CLN_GAPS : NOVRLP_CLN_GAPS;
  }

  for (k = 0; k < lst_size(found); k++)
    sfree(lst_get_ptr(found, k));
  lst_free(found);

  return status;
}
/* checks to see if reference sequence looks okay wrt a given list of
   features.  Sequence 0 of the alignment is used as the reference.  For
   each feature the ungapped reference bases are extracted (and reverse
   complemented for '-' strand features) and then tested according to the
   feature type: start codon, stop codon, 5' / 3' splice site, or CDS
   (no in-frame stop codons).

   features      list of GFF_Feature* to examine
   msa           the alignment
   offset3       index into the extracted sequence at which the 3' splice
                 site check begins
   indel_strict  if nonzero, additionally apply the gap-related checks on
                 signal features and reject CDS features with at most 6
                 ungapped bases
   splice_strict passed through to the splice site validators
   problems      list to which a problem is added for every failure

   Returns TRUE if no problem was found, FALSE otherwise. */
int ref_seq_okay(List *features, MSA *msa, int offset3, int indel_strict,
                 int splice_strict, List *problems) {
  List *signals = NULL;
  String *signals_str = NULL;
  char *seq = NULL;
  int seqalloc = 0;
  int idx, retval = TRUE;
  GFF_Feature *feat, *lastfeat_helper = NULL;

  /* the list of signal types is the same for every feature, so build it
     once here and release it after the loop */
  if (indel_strict) {
    signals = lst_new_ptr(10);
    signals_str = str_new_charstr(SIGNALS);
    str_split(signals_str, ",", signals);
  }

  for (idx = 0; idx < lst_size(features); idx++) {
    int i, j, len, has_gaps = 0;
    int needed;

    feat = lst_get_ptr(features, idx);

    /* room for every column of the feature plus the terminating '\0';
       always at least one byte so the terminator can be written even for
       a degenerate feature */
    needed = feat->end - feat->start + 2;
    if (needed < 1) needed = 1;
    if (seqalloc < needed) {
      seqalloc = needed * 2;
      seq = srealloc(seq, seqalloc * sizeof(char));
    }

    /* copy the reference bases, skipping gaps but noting their presence */
    for (i = feat->start - 1, len = 0; i < feat->end; i++) {
      if (ss_get_char_pos(msa, i, 0, 0) != GAP_CHAR)
        seq[len++] = ss_get_char_pos(msa, i, 0, 0);
      else if (!has_gaps) has_gaps = 1;
    }
    seq[len] = '\0';
    if (feat->strand == '-') msa_reverse_compl_seq(seq, len);

    if (str_equals_charstr(feat->feature, GFF_START_TYPE) &&
        strcmp(seq, "ATG") != 0) {
      problem_add(problems, feat, BAD_REF_START, -1, -1);
      retval = FALSE;
    }
    else if (str_equals_charstr(feat->feature, GFF_STOP_TYPE) &&
             (feat->frame != 0 || !is_stop_codon(seq))) {
      problem_add(problems, feat, BAD_REF_STOP, -1, -1);
      retval = FALSE;
    }
    else if (str_starts_with_charstr(feat->feature, SPLICE_5) &&
             !is_valid_5splice(seq, splice_strict)) {
      problem_add(problems, feat, BAD_REF_5_SPLICE, -1, -1);
      retval = FALSE;
    }
    else if (str_starts_with_charstr(feat->feature, SPLICE_3) &&
             !is_valid_3splice(&seq[offset3], splice_strict)) {
      problem_add(problems, feat, BAD_REF_3_SPLICE, -1, -1);
      retval = FALSE;
    }
    else if (str_equals_charstr(feat->feature, GFF_CDS_TYPE)) {
      /* walk the complete codons, starting at the first one in frame */
      for (i = (3 - feat->frame) % 3; i <= len - 3; i += 3) {
        if (is_stop_codon(&seq[i])) {
          problem_add(problems, feat, BAD_REF_ORF, -1, -1);
          retval = FALSE;
          break;
        }
      }
    }

    if (indel_strict) {
      int strict_okay = TRUE;

      if (str_in_list(feat->feature, signals)) {
        /* reject any signal feature with gaps in the ref seq, unless they
           appear in a non-critical part of a splice site or in a
           "prestart" feature */
        if (has_gaps) {
          if (str_starts_with_charstr(feat->feature, SPLICE_5)) {
            if (ss_get_char_pos(msa, feat->start-1, 0, 0) == GAP_CHAR ||
                ss_get_char_pos(msa, feat->start, 0, 0) == GAP_CHAR)
              strict_okay = FALSE;
          }
          else if (str_starts_with_charstr(feat->feature, SPLICE_3)) {
            if (ss_get_char_pos(msa, feat->end-1, 0, 0) == GAP_CHAR ||
                ss_get_char_pos(msa, feat->end-2, 0, 0) == GAP_CHAR)
              strict_okay = FALSE;
          }
          else if (!str_equals_charstr(feat->feature, "prestart"))
            strict_okay = FALSE;
        }

        /* in addition, if two signals occur consec. with gaps and only
           gaps between them, assume a violation of --indel-strict */
        if (lastfeat_helper != NULL &&
            lastfeat_helper->end < feat->start-1) {
          int allgaps = 1;
          for (j = lastfeat_helper->end; allgaps && j < feat->start-1; j++)
            /* note indexing: -1+1 for end and -1 for start */
            if (ss_get_char_pos(msa, j, 0, 0) != GAP_CHAR) allgaps = 0;
          if (allgaps) strict_okay = FALSE;
        }

        lastfeat_helper = feat;
      }
      else lastfeat_helper = NULL;

      /* also exclude short CDS exons (at most 6 ungapped bases) in the
         indel_strict case -- these cause problems in exoniphy training
         because start_codon is adjacent to cds5ss */
      if (str_equals_charstr(feat->feature, GFF_CDS_TYPE) && len <= 6)
        strict_okay = FALSE;

      if (!strict_okay) {
        problem_add(problems, feat, BAD_REF_INDEL_STRICT_FAIL, -1, -1);
        retval = FALSE;
      }
    }
  }

  if (seq != NULL) sfree(seq);
  if (signals != NULL) {
    /* same release order as used elsewhere in this file: the split-out
       strings, then the list, then the string that was split */
    lst_free_strings(signals);
    lst_free(signals);
    str_free(signals_str);
  }
  return retval;
}