/* Trim stop codons off the CDS features of a group.  A CDS on the '+'
   strand whose end equals the end of a '+' strand stop feature has its
   end moved back by 3 and is appended to ends_adjusted; a CDS on the
   '-' strand whose start equals the start of a '-' strand stop feature
   has its start moved forward by 3 and is appended to starts_adjusted.
   Both output lists are emptied on entry; they let the caller restore
   the original coordinates before data is output. */
void exclude_stops(GFF_FeatureGroup *group, List *starts_adjusted, 
                   List *ends_adjusted) {
  List *members = group->features;
  List *stop_codons = lst_new_ptr(1);
  int fi, si;

  lst_clear(stop_codons);
  lst_clear(ends_adjusted);
  lst_clear(starts_adjusted);

  /* pass 1: gather every stop feature (one is expected, more are
     tolerated) */
  for (fi = 0; fi < lst_size(members); fi++) {
    GFF_Feature *candidate = lst_get_ptr(members, fi);
    if (str_equals_charstr(candidate->feature, GFF_STOP_TYPE))
      lst_push_ptr(stop_codons, candidate);
  }

  /* pass 2: compare each CDS against each collected stop */
  for (fi = 0; fi < lst_size(members); fi++) {
    GFF_Feature *cds = lst_get_ptr(members, fi);
    if (!str_equals_charstr(cds->feature, GFF_CDS_TYPE))
      continue;

    for (si = 0; si < lst_size(stop_codons); si++) {
      GFF_Feature *stop = lst_get_ptr(stop_codons, si);
      int both_plus = (cds->strand == '+' && stop->strand == '+');
      int both_minus = (cds->strand == '-' && stop->strand == '-');

      if (both_plus && cds->end == stop->end) {
        cds->end -= 3;
        lst_push_ptr(ends_adjusted, cds);
      }
      else if (both_minus && cds->start == stop->start) {
        cds->start += 3;
        lst_push_ptr(starts_adjusted, cds);
      }
    }
  }

  lst_free(stop_codons);
}
Example #2
0
/* Attach a new leaf named lname to the branch above node number
   'branch' of tree t.  A new internal node is spliced in between that
   node and its parent; the existing node becomes its left child and the
   new leaf its right child.  The leaf's dparent is set to lgroup, and
   the new internal node's dparent is set to lgroup only when lgroup is
   positive and equal to the existing node's dparent.  Both new nodes
   are appended to t->nodes and take their list positions as ids.  Dies
   if 'branch' refers to t itself. */
void tr_add_leaf_internal(TreeNode *t, int branch, char *lname, int lgroup) {
  TreeNode *below, *above, *joint, *leaf;

  below = lst_get_ptr(t->nodes, branch); /* node hanging off the branch */
  if (below == t)
    die("ERROR tr_add_leaf_internal: oldnode == t\n");

  joint = tr_new_node();
  leaf = tr_new_node();
  above = below->parent;

  /* set up the new leaf */
  strcpy(leaf->name, lname);
  leaf->dparent = lgroup;
  leaf->parent = joint;

  /* the new internal node sits between 'above' and 'below' */
  joint->lchild = below;
  joint->rchild = leaf;
  joint->parent = above;

  /* replace 'below' by the new internal node in its former parent */
  if (above->lchild == below)
    above->lchild = joint;
  else 
    above->rchild = joint;
  below->parent = joint;

  if (lgroup > 0 && lgroup == below->dparent)
    joint->dparent = lgroup;

  /* register both nodes; ids are set directly from list positions,
     circumventing normal id assignment */
  lst_push_ptr(t->nodes, joint);
  joint->id = lst_size(t->nodes) - 1;
  lst_push_ptr(t->nodes, leaf);
  leaf->id = lst_size(t->nodes) - 1;
  t->nnodes += 2;
}
Example #3
0
/* Translate a list of category names and/or category numbers into a
   list of category names.  No Strings are allocated: the returned list
   holds pointers to Strings owned by the CategoryMap when one is given,
   and otherwise pointers to the Strings in 'names'.  Numeric entries
   require a category map and must lie in [0, cm->ncats].  A name for
   which cm_get_category returns 0 is an error unless ignore_missing is
   set or the name equals the name of category 0. */
List *cm_get_category_str_list(CategoryMap *cm, List *names, int ignore_missing) {
  List *result = lst_new_ptr(lst_size(names));
  int idx;

  for (idx = 0; idx < lst_size(names); idx++) {
    String *entry = lst_get_ptr(names, idx);
    int catno;

    if (str_as_int(entry, &catno) != 0) {
      /* entry is not a number, so treat it as a name */
      if (cm == NULL) {
        /* no map available: hand back the caller's own String */
        lst_push_ptr(result, entry);
        continue;
      }
      catno = cm_get_category(cm, entry);
      if (catno == 0 && !ignore_missing && !str_equals(entry, cm_get_feature(cm, 0))) {
        die("ERROR: illegal category name (\"%s\")\n", entry->chars);
      }
      /* prefer the map's own String */
      lst_push_ptr(result, cm_get_feature(cm, catno));
      continue;
    }

    /* entry is a category number */
    if (cm == NULL)
      die("ERROR: if categories are specified by number, a category map is required\n");
    if (catno < 0 || (cm != NULL && catno > cm->ncats))
      die("ERROR: category number %d is out of bounds.\n", catno);
    lst_push_ptr(result, cm_get_feature(cm, catno));
  }

  return result;
}
Example #4
0
/* Attach a new leaf named lname to the root branch of tree t.  The root
   node keeps its identity: a new internal node takes over the root's
   two current children and becomes the root's left child, while the new
   leaf becomes the root's right child.  The leaf's dparent is set to
   lgroup, the new internal node inherits the root's previous dparent,
   and the root's dparent becomes lgroup if that matches the inherited
   value and 0 otherwise.  Both new nodes are appended to t->nodes and
   take their list positions as ids. */
void tr_add_leaf_at_root(TreeNode *t, char *lname, int lgroup) {
  TreeNode *inner = tr_new_node();
  TreeNode *leaf = tr_new_node();

  strcpy(leaf->name, lname);
  leaf->dparent = lgroup;

  /* move the root's current children under the new internal node */
  inner->lchild = t->lchild;
  inner->rchild = t->rchild;
  inner->lchild->parent = inner;
  inner->rchild->parent = inner;

  /* the root now has the internal node and the new leaf as children */
  t->lchild = inner;
  t->rchild = leaf;
  inner->parent = t;
  leaf->parent = t;

  inner->dparent = t->dparent;
  t->dparent = (lgroup == inner->dparent) ? lgroup : 0;

  /* register both nodes; ids are set directly from list positions,
     circumventing normal id assignment */
  lst_push_ptr(t->nodes, inner);
  inner->id = lst_size(t->nodes) - 1;
  lst_push_ptr(t->nodes, leaf);
  leaf->id = lst_size(t->nodes) - 1;
  t->nnodes += 2;
}
Example #5
0
/* R entry point.  gffListP is a list of external pointers to GFF_Set
   objects; each one is registered with gff_register_protect.

   If orP is false and at least two sets are given, the sets are
   combined successively with gff_overlap_gff, flattening the result
   with gff_flatten_mergeAll after every step.  Otherwise copies of all
   features of all sets are pooled into one new set, which is then
   flattened.

   If returnGffP is true the resulting set is returned as an external
   pointer.  Otherwise the value last returned by gff_flatten_mergeAll
   is returned as a length-1 vector: integer when it fits in an int,
   numeric when it exceeds INT_MAX. */
SEXP rph_gff_featureBits(SEXP gffListP, SEXP orP, SEXP returnGffP) {
  int nsets = length(gffListP);
  int poolAll, wantGff, s, f;
  long covered = 0;
  List *sets = lst_new_ptr(nsets);
  GFF_Set *cur, *merged = NULL;
  SEXP answer;

  for (s = 0; s < nsets; s++) {
    cur = (GFF_Set*)EXTPTR_PTR(VECTOR_ELT(gffListP, s));
    lst_push_ptr(sets, cur);
    gff_register_protect(cur);
  }
  poolAll = LOGICAL_VALUE(orP);
  wantGff = LOGICAL_VALUE(returnGffP);

  if (poolAll || nsets < 2) {
    /* pool a copy of every feature, then flatten once */
    merged = gff_new_set();
    for (s = 0; s < nsets; s++) {
      cur = (GFF_Set*)lst_get_ptr(sets, s);
      for (f = 0; f < lst_size(cur->features); f++) {
        checkInterruptN(f, 1000);
        lst_push_ptr(merged->features,
                     gff_new_feature_copy(lst_get_ptr(cur->features, f)));
      }
    }
    covered = gff_flatten_mergeAll(merged);
  } else {
    /* overlap the first two sets, then fold in the remaining ones */
    merged = gff_overlap_gff(lst_get_ptr(sets, 0),
                             lst_get_ptr(sets, 1),
                             1, -1.0, FALSE, TRUE, NULL);
    covered = gff_flatten_mergeAll(merged);
    for (s = 2; s < nsets; s++) {
      checkInterrupt();
      cur = gff_overlap_gff(merged,
                            lst_get_ptr(sets, s),
                            1, -1.0, FALSE, TRUE, NULL);
      covered = gff_flatten_mergeAll(cur);
      gff_free_set(merged);   /* the intermediate result is no longer needed */
      merged = cur;
    }
  }

  if (wantGff)
    return rph_gff_new_extptr(merged);

  /* an R integer cannot hold more than INT_MAX; use a double beyond that */
  if (covered > INT_MAX) {
    PROTECT(answer = allocVector(REALSXP, 1));
    REAL(answer)[0] = covered;
  } else {
    PROTECT(answer = allocVector(INTSXP, 1));
    INTEGER(answer)[0] = covered;
  }
  UNPROTECT(1);
  return answer;
}
Example #6
0
/* Build a category map that has one category per distinct feature type
   found in a GFF_Set.  Types are numbered from 1 in order of first
   appearance; category 0 is named BACKGD_CAT_NAME.  The map receives
   its own copies of the type names. */
CategoryMap* cm_new_from_features(GFF_Set *feats) {
  Hashtable *seen = hsh_new(10);
  List *distinct = lst_new_ptr(10);
  CategoryMap *map;
  int k;

  /* collect each feature type the first time it is encountered */
  for (k = 0; k < lst_size(feats->features); k++) {
    GFF_Feature *f = lst_get_ptr(feats->features, k);
    checkInterruptN(k, 10000);
    if (hsh_get(seen, f->feature->chars) != (void*)-1)
      continue;                 /* type already recorded */
    lst_push_ptr(distinct, f->feature);
    hsh_put_int(seen, f->feature->chars, 1);
  }
  hsh_free(seen);

  /* one single-number range per category, including category 0 */
  map = cm_new(lst_size(distinct));
  for (k = 0; k <= map->ncats; k++) {
    String *label;
    if (k == 0)
      label = str_new_charstr(BACKGD_CAT_NAME);
    else
      label = str_dup(lst_get_ptr(distinct, k-1));
    map->ranges[k] = cm_new_category_range(label, k, k);
  }

  lst_free(distinct);
  return map;
}
Example #7
0
/* Build an Unspooler for nstates_spooled "spooled" states.

   conditioned_on must be an array of integer lists; specifically, the
   ith element must be the list of state numbers on which the ith
   state is conditioned.  An element may be NULL or an empty list, in
   which case that state is handled as a terminal (unconditioned) one.

   For every spooled state i a tree of UnspoolNodes is grown, rooted at
   unsp->spooled_to_unspooled[i].  Each node whose state is
   unconditioned is assigned the next free unspooled state number, and
   unsp->unspooled_to_spooled maps that number back to i.  Every other
   node gets one child per state in its conditioned_on list.  The
   function dies if the same state is reached twice while expanding a
   single root (reported as a cycle).  Returns the new Unspooler. */
Unspooler *cm_create_unspooler(int nstates_spooled, List **conditioned_on) {
  UnspoolNode *n;
  int i, j;
  Stack *s;
  Unspooler *unsp;
  int *mark;
  int capacity;

  unsp = (Unspooler*)smalloc(sizeof(Unspooler));
  unsp->nstates_spooled = nstates_spooled;
  unsp->nstates_unspooled = 0;
  unsp->spooled_to_unspooled = 
    (UnspoolNode**)smalloc(nstates_spooled * sizeof(UnspoolNode*));
  /* initial size of the reverse map; it is doubled below whenever the
     number of unspooled states outgrows it */
  capacity = nstates_spooled * nstates_spooled;
  unsp->unspooled_to_spooled = (int*)smalloc(capacity * sizeof(int));

  mark = (int*)smalloc(nstates_spooled * sizeof(int));
  s = stk_new_ptr(nstates_spooled);
  for (i = 0; i < nstates_spooled; i++) {
    /* erase marks (used to detect cycles) */
    for (j = 0; j < nstates_spooled; j++) mark[j] = 0;

    /* root of the tree for spooled state i */
    unsp->spooled_to_unspooled[i] = cm_new_unspool_node(i);
    stk_push_ptr(s, unsp->spooled_to_unspooled[i]);
    /* expand nodes until the stack is exhausted; the loop relies on
       stk_pop_ptr returning NULL at that point (presumably when the
       stack is empty -- confirm against the stack implementation) */
    while ((n = (UnspoolNode*)stk_pop_ptr(s)) != NULL) {
      if (conditioned_on[n->oldstate] == NULL ||
          lst_size(conditioned_on[n->oldstate]) == 0) {
        /* terminal node: allocate a new unspooled state number */
        n->newstate = unsp->nstates_unspooled++;

        /* mapping to spooled space */
        if (n->newstate >= capacity) {
          capacity *= 2;
          unsp->unspooled_to_spooled = 
            (int*)srealloc(unsp->unspooled_to_spooled, 
                          capacity * sizeof(int));          
        }
        unsp->unspooled_to_spooled[n->newstate] = i;
      }
      else {
        /* conditioned node: add one child per state it is conditioned
           on and queue each child for expansion */
        for (j = 0; j < lst_size(conditioned_on[n->oldstate]); j++) {
          int oldstate = lst_get_int(conditioned_on[n->oldstate], j);
          UnspoolNode *m;

          /* marks persist for the whole expansion of root i, so any
             state reached a second time under the same root is fatal */
          if (mark[oldstate] == 1)
            die("ERROR: cycle in 'conditioned_on' dependencies.\n");
          mark[oldstate] = 1;

          m = cm_new_unspool_node(oldstate);
          lst_push_ptr(n->children, m);
          stk_push_ptr(s, m);
        }
      }
    }
  }
  stk_free(s);
  sfree(mark);
  return unsp;
}
Example #8
0
/* Allocate a CategoryRange covering category numbers start_cat_no
   through end_cat_no, whose feature_types list initially holds the
   single String 'type'.  The String pointer is stored as given, not
   copied. */
CategoryRange* cm_new_category_range(String *type, int start_cat_no,
                                     int end_cat_no) {
  CategoryRange *range = (CategoryRange*)smalloc(sizeof(CategoryRange));

  range->start_cat_no = start_cat_no;
  range->end_cat_no = end_cat_no;

  range->feature_types = lst_new_ptr(1);
  lst_push_ptr(range->feature_types, type);

  return range;
}
Example #9
0
/* Return a new CategoryRange with the same start and end category
   numbers as src and with duplicates (str_dup) of all of src's feature
   type names, in the same order.  src must hold at least one feature
   type. */
CategoryRange* cm_category_range_create_copy(CategoryRange *src) {
  CategoryRange *dup;
  int k = 1;

  /* the constructor takes the first type name ... */
  dup = cm_new_category_range(str_dup(lst_get_ptr(src->feature_types, 0)),
                              src->start_cat_no, src->end_cat_no);

  /* ... and any further names are appended afterwards */
  while (k < lst_size(src->feature_types)) {
    lst_push_ptr(dup->feature_types, 
                 str_dup(lst_get_ptr(src->feature_types, k)));
    k++;
  }
  return dup;
}
Example #10
0
/* create a trivial, two-leaf tree */
TreeNode *tr_new_trivial(char *name1, char *name2) {
  TreeNode *root;
  root = tr_new_node();
  root->lchild = tr_new_node();
  strcpy(root->lchild->name, name1);
  root->lchild->parent = root;
  root->rchild = tr_new_node();
  strcpy(root->rchild->name, name2);
  root->rchild->parent = root;

  /* bypass default handling of ids and nodes list */
  root->nnodes = 3;
  root->id = 0;
  root->lchild->id = 1;
  root->rchild->id = 2;  
  root->nodes = lst_new_ptr(root->nnodes);
  lst_push_ptr(root->nodes, root);
  lst_push_ptr(root->nodes, root->lchild);
  lst_push_ptr(root->nodes, root->rchild);

  return root;
}
Example #11
0
/* Return the list of category names for a list of category numbers.
   Each category range contributes at most one name: once a number from
   a range has been seen, later numbers that fall in the same range
   (same start_cat_no) are skipped.  The returned list holds the
   pointers returned by cm_get_feature. */
List *cm_get_features(CategoryMap *cm, List *catnos) {
  int seen[cm->ncats+1];        /* indexed by a range's start_cat_no */
  List *names = lst_new_ptr(lst_size(catnos));
  int k;

  for (k = 0; k <= cm->ncats; k++)
    seen[k] = 0;

  for (k = 0; k < lst_size(catnos); k++) {
    int catno = lst_get_int(catnos, k);
    int range_start = cm->ranges[catno]->start_cat_no;

    if (seen[range_start])
      continue;
    lst_push_ptr(names, cm_get_feature(cm, catno));
    seen[range_start] = 1;
  }
  return names;
}
Example #12
0
/* R entry point.  gffListP is a list of external pointers to GFF_Set
   objects.  Returns an external pointer to a new GFF_Set that holds
   copies of all features of all the given sets, in list order.  Each
   input set is registered with gff_register_protect. */
SEXP rph_gff_append(SEXP gffListP) {
  GFF_Set *combined = gff_new_set();
  GFF_Set *src;
  int setIdx, featIdx;

  for (setIdx = 0; setIdx < length(gffListP); setIdx++) {
    src = (GFF_Set*)EXTPTR_PTR(VECTOR_ELT(gffListP, setIdx));
    gff_register_protect(src);

    featIdx = 0;
    while (featIdx < lst_size(src->features)) {
      checkInterruptN(featIdx, 1000);
      lst_push_ptr(combined->features,
                   gff_new_feature_copy(lst_get_ptr(src->features, featIdx)));
      featIdx++;
    }
  }

  return rph_gff_new_extptr(combined);
}
Example #13
0
/* Helper for get_outfile: close the first handle in outfileList that is
   still open and mark its slot as closed (NULL), freeing up a file
   descriptor.  Dies if the list contains no open handle. */
static void get_outfile_release_handle(List *outfileList) {
  FILE *victim = NULL;
  int pos;

  for (pos = 0; pos < lst_size(outfileList); pos++) {
    victim = (FILE*)lst_get_ptr(outfileList, pos);
    if (victim != NULL) break;
  }
  if (pos == lst_size(outfileList)) {
    die("ERROR: too many files open in maf_parse\n");
  } else {
    phast_fclose(victim);
    lst_set_ptr(outfileList, pos, NULL);
  }
}

/*
  Return the output file named out_root.name.maf, opening it if
  necessary.

  outfileHash maps each file name to its index in outfileList;
  outfileList holds the FILE handle for that index, or NULL if the file
  has been opened before but is currently closed.

  When splitting by feature there may be more output files than the OS
  allows to be open at once, and working out which files are finished
  would be expensive.  So whenever an open attempt fails, some other
  open handle from the list is closed and the open is retried, until it
  succeeds.  If no handle is left to close, the program reports an error
  and dies.

  A file seen for the first time is opened with mafBlock_open_outfile;
  a file that was closed to make room is re-opened in append mode.

  close_outfiles is expected to make sure that all files are eventually
  closed with mafBlock_close_file, so that they get the #eof closer.
 */
FILE *get_outfile(List *outfileList, Hashtable *outfileHash, String *name, char *out_root,
		  int argc, char *argv[]) {
  FILE *handle;
  int slot;
  char *path = smalloc((strlen(out_root)+name->length+7)*sizeof(char));

  sprintf(path, "%s.%s.maf", out_root, name->chars);
  slot = ptr_to_int(hsh_get(outfileHash, path));

  if (slot != -1) {
    /* known file: reuse the handle, or re-open it for appending */
    handle = (FILE*)lst_get_ptr(outfileList, slot);
    if (handle == NULL) {
      while ((handle = phast_fopen_no_exit(path, "a")) == NULL)
        get_outfile_release_handle(outfileList);
      lst_set_ptr(outfileList, slot, (void*)handle);
    }
    sfree(path);
    return handle;
  }

  /* new file: it will occupy the next slot of the list */
  hsh_put(outfileHash, path, int_to_ptr(lst_size(outfileList)));
  while ((handle = mafBlock_open_outfile(path, argc, argv)) == NULL)
    get_outfile_release_handle(outfileList);  /* too many files open */
  lst_push_ptr(outfileList, (void*)handle);
  sfree(path);
  return handle;
}
Example #14
0
/* R entry point.  Builds a tree from treeStr with rph_tree_new, calls
   tr_prune on it with the names in the character vector seqsP and the
   integer value of allButP, and returns the result of
   tr_to_string(tree, 1) as a length-1 character vector. */
SEXP rph_tree_prune(SEXP treeStr, SEXP seqsP, SEXP allButP) {
  TreeNode *tree = rph_tree_new(treeStr);
  List *seqNames = lst_new_ptr(LENGTH(seqsP));
  char *treeText;
  SEXP out;
  int k;

  /* convert the R character vector into a list of Strings */
  for (k = 0; k < LENGTH(seqsP); k++)
    lst_push_ptr(seqNames, str_new_charstr(CHAR(STRING_ELT(seqsP, k))));

  tr_prune(&tree, seqNames, INTEGER_VALUE(allButP), NULL);
  treeText = tr_to_string(tree, 1);

  PROTECT(out = NEW_CHARACTER(1));
  SET_STRING_ELT(out, 0, mkChar(treeText));
  UNPROTECT(1);
  return out;
}
Example #15
0
/* Return a newly allocated copy of a MafBlock.  aLine is copied with
   str_new_charstr, specMap with hsh_copy, and every sub-block in data
   with mafSubBlock_copy; any of these that is NULL in src is NULL in
   the copy.  seqlen is copied as is. */
MafBlock* mafBlock_copy(MafBlock *src) {
  MafBlock *dup = smalloc(sizeof(MafBlock));
  int k;

  dup->aLine = NULL;
  if (src->aLine != NULL)
    dup->aLine = str_new_charstr(src->aLine->chars);

  dup->specMap = NULL;
  if (src->specMap != NULL)
    dup->specMap = hsh_copy(src->specMap);

  dup->seqlen = src->seqlen;

  if (src->data == NULL) {
    dup->data = NULL;
    return dup;
  }

  /* copy the sub-blocks one by one, keeping their order */
  dup->data = lst_new_ptr(lst_size(src->data));
  for (k = 0; k < lst_size(src->data); k++) {
    MafSubBlock *subCopy = mafSubBlock_copy((MafSubBlock*)lst_get_ptr(src->data, k));
    lst_push_ptr(dup->data, (void*)subCopy);
  }
  return dup;
}
/* Build a Markov model as a list of matrices, one for each order from
   0 up to and including norder (element i is the result of
   mm_build_helper for order i).  inputMS, pseudoCount and
   considerReverse are passed through to mm_build_helper unchanged.
   Dies if norder is negative. */
List *mm_build(MS *inputMS, int norder, int pseudoCount, int considerReverse) {
  List *models;
  int order;

  if (norder < 0)
    die("ERROR: Order of Markov Models must be zero or greater");

  models = lst_new_ptr(norder+1);
  for (order = 0; order <= norder; order++)
    lst_push_ptr(models,
                 mm_build_helper(inputMS, order, pseudoCount, considerReverse));

  return models;
}
/* Classify the gaps of a cds feature: no gaps (NGAPS), "clean" gaps
   (all multiples of 3 in length; CLEAN_GAPS), non-overlapping clean
   gaps (NOVRLP_CLN_GAPS), "okay" gaps (only temporary frame shifts,
   corrected by compensatory indels; FSHIFT_OK), or real frame-shift
   gaps (FSHIFT_BAD).  If scan_for_gaps reports a problem, it is
   appended to 'problems'. */
cds_gap_type get_cds_gap_type(GFF_Feature *feat, MSA *msa, List *problems) {
  Problem *found = NULL;
  cds_gap_type kind = scan_for_gaps(feat, msa, &found);

  /* is_fshift_okay is consulted only when the scan reports a bad frame
     shift, so most of the time the call is not needed */
  if (kind == FSHIFT_BAD) {
    if (is_fshift_okay(feat, msa)) {
      /* downgrade to a warning
         NOTE(review): 'found' is dereferenced here without a NULL
         check; this presumably relies on scan_for_gaps always setting
         it when returning FSHIFT_BAD -- confirm */
      kind = FSHIFT_OK;
      found->status = WARN_FSHIFT;
      found->cds_gap = FSHIFT_OK;
    }
  }

  /* FIXME: It's possible that the single problem identified in
     scan_for_gaps is actually okay, but there's a frameshift without
     compensation downstream.  In this case, the status will be correct
     but the problem will point to the wrong place */
  if (found != NULL)
    lst_push_ptr(problems, found);

  return kind;
}
Example #18
0
/* Reorder the sub-blocks of a MafBlock in place so that they follow the
   order of the species names in specNameOrder.  Names that the block's
   specMap does not know are skipped; sub-blocks whose species is not
   named are freed with mafSubBlock_free.  The block's data list and
   specMap are replaced by new ones, the map being keyed by both the src
   and the specName of every kept sub-block.  Dies if a species is
   selected twice. */
void mafBlock_reorder(MafBlock *block, List *specNameOrder) {
  int nOld = lst_size(block->data), nWanted = lst_size(specNameOrder);
  int *kept;                    /* kept[i] == 1 once old sub-block i is reused */
  List *ordered;
  Hashtable *orderedMap;
  int k, oldPos;

  kept = smalloc(nOld*sizeof(int));
  for (k = 0; k < nOld; k++)
    kept[k] = 0;

  ordered = lst_new_ptr(nOld);
  orderedMap = hsh_new(100);

  /* move the requested sub-blocks over, in the requested order */
  for (k = 0; k < nWanted; k++) {
    String *wanted = (String*)lst_get_ptr(specNameOrder, k);
    MafSubBlock *sub;

    oldPos = hsh_get_int(block->specMap, wanted->chars);
    if (oldPos == -1)
      continue;                 /* species not present in this block */
    if (kept[oldPos] == 1) die("ERROR: species %s appears twice in reorder list\n", 
			       wanted->chars);

    sub = (MafSubBlock*)lst_get_ptr(block->data, oldPos);
    hsh_put_int(orderedMap, sub->src->chars, lst_size(ordered));
    hsh_put_int(orderedMap, sub->specName->chars, lst_size(ordered));
    lst_push_ptr(ordered, (void*)sub);
    kept[oldPos] = 1;
  }

  /* discard every sub-block that was not requested */
  for (k = 0; k < nOld; k++) {
    if (kept[k] != 0)
      continue;
    mafSubBlock_free((MafSubBlock*)lst_get_ptr(block->data, k));
  }

  /* swap in the new containers */
  hsh_free(block->specMap);
  lst_free(block->data);
  block->specMap = orderedMap;
  block->data = ordered;
  sfree(kept);
}
Example #19
0
int main(int argc, char *argv[]) {
  char c;
  int i, j, t, opt_idx, ntrees, nleaves = -1;
  TreeNode *n, *node_i, *node_j, *lca, *nametree = NULL;
  TreeNode **tree;
  List *leaves, ***distance, *tree_fnames, *tot_dist;
  int mod = FALSE;
  char **leaf_name;
  String *trees_arg;
  FILE *F;

  struct option long_opts[] = {
    {"mod", 0, 0, 'm'},
    {"tree", 1, 0, 't'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  while ((c = getopt_long(argc, argv, "mt:h", long_opts, &opt_idx)) != -1) {
    switch (c) {
    case 'm':
      mod = TRUE;
      break;
    case 't':
      if (optarg[0] == '(')
        nametree = tr_new_from_string(optarg);
      else 
        nametree = tr_new_from_file(phast_fopen(optarg, "r"));
      break;
    case 'h':
      usage(argv[0]);
    case '?':
      die("Bad argument.  Try '%s -h'.\n", argv[0]);
    }
  }

  if (optind > argc - 1) 
    die("Input filename required.  Try '%s -h'.\n", argv[0]);

  set_seed(-1);

  /* build a comma-delimited list and pass to get_arg_list; allows
     possibility of reading from file via '*' operator */
  trees_arg = str_new(1000);
  for (i = optind; i < argc; i++) {
    str_append_charstr(trees_arg, argv[i]);
    if (i < argc - 1) str_append_char(trees_arg, ',');
  }
  tree_fnames = get_arg_list(trees_arg->chars);

  ntrees = lst_size(tree_fnames);
  tree = smalloc(ntrees * sizeof(void*));

  /* read trees */
  for (t = 0; t < ntrees; t++) {
    String *fname = lst_get_ptr(tree_fnames, t);
    if (mod) {
      TreeModel *m = tm_new_from_file(F = phast_fopen(fname->chars, "r"), 1);
      tree[t] = tr_create_copy(m->tree);
      tm_free(m);
      phast_fclose(F);
    }
    else
      tree[t] = tr_new_from_file(phast_fopen(fname->chars, "r"));
  }

  /* initialization */
  nleaves = (tree[0]->nnodes + 1)/2;
  leaves = lst_new_ptr(nleaves);    
  distance = smalloc(nleaves * sizeof(void*));
  leaf_name = smalloc(nleaves * sizeof(void*));
  for (i = 0; i < nleaves; i++) {
    distance[i] = smalloc(nleaves * sizeof(void*));
    for (j = i+1; j < nleaves; j++) 
      distance[i][j] = lst_new_dbl(ntrees);
  }
  if (nametree == NULL) nametree = tree[0];
  for (i = 0, j = 0; i < lst_size(nametree->nodes); i++) {
    n = lst_get_ptr(nametree->nodes, i);
    if (n->lchild == NULL && n->rchild == NULL)
      leaf_name[j++] = n->name;
  }
  tot_dist = lst_new_dbl(ntrees);

  /* now compute distances */
  for (t = 0; t < ntrees; t++) {
    /* obtain list of leaves */
    lst_clear(leaves);
    for (i = 0; i < lst_size(tree[t]->nodes); i++) {
      n = lst_get_ptr(tree[t]->nodes, i);
      if (n->lchild == NULL && n->rchild == NULL)
        lst_push_ptr(leaves, n);
    }

    if (lst_size(leaves) != nleaves)
      die("ERROR: trees have different numbers of leaves.\n");

    /* look at all pairs */
    for (i = 0; i < nleaves; i++) {
      node_i = lst_get_ptr(leaves, i);
      for (j = i+1; j < nleaves; j++) {
        double dist = 0;
        node_j = lst_get_ptr(leaves, j);
        /* because ids are assigned in pre-order, the first ancestor of
           node j that has an id less than i is the LCA of i and j; we
           seek the sum of distances from both i and j to this node */
        for (n = node_j; n->id >= node_i->id; n = n->parent)
          dist += n->dparent;      
        lca = n;
        for (n = node_i; n != lca; n = n->parent)
          dist += n->dparent;            
        lst_push_dbl(distance[i][j], dist);
      }
    }
    lst_push_dbl(tot_dist, tr_total_len(tree[t]));
  }


  /* print distances and (optionally) stats */
  if (ntrees == 1) {
    for (i = 0; i < nleaves; i++) {
      for (j = i+1; j < nleaves; j++) {
        printf ("%s\t%s\t%f\n", leaf_name[i], leaf_name[j], 
                lst_get_dbl(distance[i][j], 0));
      }
    }
    printf ("%s\t%s\t%f\n", "(total)", "-", lst_get_dbl(tot_dist, 0));
  }
  else {
    double mean, stdev;
    double quantiles[] = {0, 0.025, 0.05, 0.5, 0.95, 0.975, 1};
    double quantile_vals[7]; 

    printf("%-15s %-15s %9s %9s %9s %9s %9s %9s %9s %9s %9s\n", "leaf1", 
           "leaf2", "mean", "stdev", "median", "min", "max", "95%_min", 
           "95%_max", "90%_min", "90%_max");

    for (i = 0; i < nleaves; i++) {
      for (j = i+1; j < nleaves; j++) {
        mean = lst_dbl_mean(distance[i][j]);
        stdev = lst_dbl_stdev(distance[i][j]);
        lst_qsort_dbl(distance[i][j], ASCENDING);
        lst_dbl_quantiles(distance[i][j], quantiles, 7, quantile_vals);

        printf("%-15s %-15s %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f\n", 
               leaf_name[i], leaf_name[j], mean, stdev, quantile_vals[3], quantile_vals[0], 
               quantile_vals[6], quantile_vals[1], quantile_vals[5], quantile_vals[2], 
               quantile_vals[4]);
      }
    }

    /* also do total branch len */
    mean = lst_dbl_mean(tot_dist);
    stdev = lst_dbl_stdev(tot_dist);
    lst_qsort_dbl(tot_dist, ASCENDING);
    lst_dbl_quantiles(tot_dist, quantiles, 7, quantile_vals);
    
    printf("%-15s %-15s %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f %9.5f\n", 
	   "(total)", "-", mean, stdev, quantile_vals[3], quantile_vals[0], 
	   quantile_vals[6], quantile_vals[1], quantile_vals[5], quantile_vals[2], 
	   quantile_vals[4]);
  }

  return 0;
}
Example #20
0
/* Read a local pairwise alignment in lav format from F and return it
   as a new LocalPwAlignment.

   The first non-empty line must be "#:lav".  The rest of the file
   consists of stanzas opened by a line "<type> {" (type one of d, s,
   h, a, x, m) and closed by a line "}".  Stanza contents are handled
   as follows:
     d  ignored
     s  one line per sequence: quoted file name, begin index (must be
        1) and end index; the first line describes the query and the
        second the target.  If read_seqs is nonzero the named file is
        opened and read with msa_read_seq_fasta.
     h  one line per sequence holding its quoted, '>'-prefixed name;
        query first, then target
     a  one alignment block, with lines of type s (score), b (begin
        coordinates), e (end coordinates) and l (gapless alignment)
     x, m  contents ignored
   Stanzas other than 'a' may appear only once, and 'a' stanzas must
   follow the d, s and h stanzas.  Malformed input is reported through
   die(). */
LocalPwAlignment *la_read_lav(FILE *F, int read_seqs) {
  String *line = str_new(STR_MED_LEN);
  int line_no=0;
  LocalPwAlignment *lpwa = la_new();
  List *fields = lst_new_ptr(6);
  Regex *stanza_start_re = str_re_new("^([dshaxm])[[:space:]]*{");
  AlignmentBlock *aln_block = NULL;
  char stanza_type = '\0';
  int i;
  int done_with[256] = {0};     /* done_with[c] is set once a stanza of
                                   type c has been closed; zeroed in
                                   full so no entry is indeterminate */

  while (str_readline(line, F) != EOF) {
    str_trim(line);
    if (line->length == 0) continue;

    checkInterruptN(line_no, 1000);
    line_no++;
    if (line_no == 1) {
      if (!str_equals_charstr(line, "#:lav")) {
        die("ERROR: lav file missing header.\n");
      }
    } 
    /* begin new stanza */
    else if (str_re_match(line, stanza_start_re, fields, 1) >= 0) {
      String *tmpstr = lst_get_ptr(fields, 1);
      stanza_type = tmpstr->chars[0];
      str_free(tmpstr);
      str_free(lst_get_ptr(fields, 0));

      if (stanza_type != 'a' && done_with[(int)stanza_type]) {
        die("ERROR: multiple '%c' stanzas in lav file.\n", 
                stanza_type);
      }

      if (stanza_type == 'a') {
        /* each 'a' stanza is one alignment block; its fields are
           filled in from the lines that follow */
        aln_block = la_new_alignment_block(-1, -1, -1, -1, -1, NULL);
        lst_push_ptr(lpwa->alignment_blocks, aln_block);
      }
    }

    /* end current stanza */
    else if (str_equals_charstr(line, "}")) {
      if (stanza_type == '\0') {
        die("ERROR: end stanza without matching begin.\n");
      }
      done_with[(int)stanza_type] = 1;
      stanza_type = '\0';
    }

    else if (stanza_type == 'd') {
      ; /* do nothing for now */
    }
    else if (stanza_type == 's') {
      int beg, end;
      String *tmpstr, *fname, *seq=NULL;
      FILE *F2;

      str_double_trim(line);
      str_split(line, NULL, fields);
      if (lst_size(fields) != 3 || 
          str_as_int(lst_get_ptr(fields, 1), &beg) != 0 || 
          str_as_int(lst_get_ptr(fields, 2), &end) != 0) {
        die("ERROR: bad line in 's' stanza in lav file.\n");
      }
      tmpstr = lst_get_ptr(fields, 0);
      /* the file name must be long enough to hold its two quotes;
         otherwise the lengths computed below would be negative */
      if (tmpstr->length < 2) {
        die("ERROR: bad line in 's' stanza in lav file.\n");
      }
      fname = str_new(tmpstr->length-2); /* remove quotes */
      str_substring(fname, tmpstr, 1, tmpstr->length-2);
      if (read_seqs) {
        F2 = phast_fopen(fname->chars, "r");
        seq = msa_read_seq_fasta(F2);
	phast_fclose(F2);
      }

      for (i = 0; i < lst_size(fields); i++) 
        str_free(lst_get_ptr(fields, i));
      
      if (beg != 1) {
        die("ERROR: unexpected begin index in 's' stanza of lav file (begin index currently must be 1).\n");
      }
      /* first line describes the query, second the target */
      if (lpwa->query_len == -1) {
        lpwa->query_len = end;
        if (read_seqs) lpwa->query_seq = seq;
      }
      else if (lpwa->target_len == -1) {
        lpwa->target_len = end;
        if (read_seqs) lpwa->target_seq = seq;
      }
      else {
        die("ERROR: too many sequences listed in 's' stanza of lav file.\n");
      }
      str_free(fname);
    }
    else if (stanza_type == 'h') {
      String *name;

      str_double_trim(line);
      /* need at least the opening quote, the '>' and the closing
         quote; otherwise the lengths computed below would be negative */
      if (line->length < 3) {
        die("ERROR: bad line in 'h' stanza in lav file.\n");
      }
      name = str_new(line->length-3); /* get rid of quotes and
                                         leading '>' */
      str_substring(name, line, 2, line->length-3);
      /* first line names the query, second the target */
      if (lpwa->query_name == NULL) lpwa->query_name = name;
      else if (lpwa->target_name == NULL) lpwa->target_name = name;
      else {
        die("ERROR: too many entries in 'h' stanza of lav file.\n");
      }
    }
    else if (stanza_type == 'a') {
      String *type;
      int val[6] = {0, 0, 0, 0, 0, 0}; /* val[i] is the i-th field as an
                                          int; stays 0 if str_as_int
                                          does not set it */
      int nfields, needed;
      if (!done_with[(int)'s'] || !done_with[(int)'d'] || 
          !done_with[(int)'h']) {
        die("ERROR: 'a' stanza appears in lav file before 'd', 's', or 'h' stanza.\n");
      }

      str_double_trim(line);
      str_split(line, NULL, fields);
      nfields = lst_size(fields);
      if (nfields < 1 || nfields > 6) {
        die("ERROR: illegal line in 'a' stanza.\n");
      }
      type = lst_get_ptr(fields, 0);
      for (i = 1; i < nfields; i++) {
        str_as_int(lst_get_ptr(fields, i), &val[i]);
        str_free(lst_get_ptr(fields, i));
      }

      /* make sure the line carries every value its record type is
         about to read (the type field itself counts as one) */
      switch (type->chars[0]) {
      case 's': needed = 2; break;
      case 'b': needed = 3; break;
      case 'e': needed = 3; break;
      case 'l': needed = 5; break;
      default:  needed = 1; break;
      }
      if (nfields < needed) {
        die("ERROR: illegal line in 'a' stanza.\n");
      }

      if (type->chars[0] == 's') 
        aln_block->score = val[1];
      else if (type->chars[0] == 'b') {
        aln_block->query_beg = val[1];
        aln_block->target_beg = val[2];      
      }
      else if (type->chars[0] == 'e') {
        aln_block->query_end = val[1];
        aln_block->target_end = val[2];      
      }
      else if (type->chars[0] == 'l') 
        lst_push_ptr(aln_block->gapless_alns, 
                     la_new_gapless_aln(val[1], val[3], val[2], val[4]));

      str_free(type);
    }     
  }
  str_free(line);
  lst_free(fields);
  str_re_free(stanza_start_re);

  return lpwa;
}
/* R entry point for phyloFit.  Unpacks the R-level arguments into a
   freshly allocated phyloFit_struct, calls run_phyloFit on it, and
   returns pf->results converted to an R object by
   rph_listOfLists_to_SEXP.

   Arguments (an argument equal to R_NilValue is treated as "not
   given" wherever the code below tests for it):
     msaP                 external pointer to the MSA to fit
     treeStrP             tree, handed to rph_tree_new
     substModP            substitution model, decoded by rph_get_subst_mod
     scaleOnlyP           logical -> pf->estimate_scale_only
     scaleSubtreeP        string, copied into pf->subtree_name
     nratesP              integer number of rate categories
     alphaP               numeric alpha for the rate-variation model
     rateConstantsP       numeric vector of rate constants
     initModP             external pointer to an initial TreeModel
     initBackgdFromDataP  logical -> pf->init_backgd_from_data
     initRandomP          logical -> pf->random_init
     initParsimonyP       logical -> pf->init_parsimony
     clockP               logical -> pf->assume_clock
     emP                  logical -> pf->use_em
     maxEmItsP            integer -> pf->max_em_its
     precisionP           string, decoded by get_precision
     gffP                 external pointer to a GFF_Set
     ninfSitesP           integer -> pf->nsites_threshold
     quietP               logical -> pf->quiet
     noOptP               character vector, joined with ',' into
                          pf->nooptstr
     boundP               character vector, one String per element
                          pushed onto pf->bound_arg
     logFileP             file name to open for logging, or logical
                          TRUE to log to stdout
     selectionP           numeric; when given, enables pf->use_selection

   Returns R_NilValue if the function bails out before run_phyloFit
   (currently only on an unrecognised precision string, in which case
   die() is also called). */
SEXP rph_phyloFit(SEXP msaP, 
		  SEXP treeStrP, 
		  SEXP substModP,
		  SEXP scaleOnlyP,
		  SEXP scaleSubtreeP,
		  SEXP nratesP,
		  SEXP alphaP,
		  SEXP rateConstantsP,
		  SEXP initModP,
		  SEXP initBackgdFromDataP,
		  SEXP initRandomP,
		  SEXP initParsimonyP,
		  SEXP clockP,
		  SEXP emP,
		  SEXP maxEmItsP,
		  SEXP precisionP,
		  SEXP gffP,
		  SEXP ninfSitesP,
		  SEXP quietP,
		  SEXP noOptP,
		  SEXP boundP,
		  SEXP logFileP,
		  SEXP selectionP) {
  struct phyloFit_struct *pf;
  int numProtect=0, i;      /* numProtect counts PROTECT calls to undo */
  double *doubleP;
  char *die_message=NULL;   /* deferred error; reported after cleanup */
  SEXP rv=R_NilValue;
  List *new_rate_consts = NULL;
  List *new_rate_weights = NULL;

  GetRNGstate(); //seed R's random number generator
  pf = phyloFit_struct_new(1);  //sets appropriate defaults for RPHAST mode

  pf->msa = (MSA*)EXTPTR_PTR(msaP);

  if (treeStrP != R_NilValue) 
    pf->tree = rph_tree_new(treeStrP);

  pf->use_em = LOGICAL_VALUE(emP);

  /* explicit rate constants from the caller take precedence over any
     stored in the initial model (see the new_rate_consts == NULL test
     below) */
  if (rateConstantsP != R_NilValue) {
    PROTECT(rateConstantsP = AS_NUMERIC(rateConstantsP));
    numProtect++;
    doubleP = NUMERIC_POINTER(rateConstantsP);
    new_rate_consts = lst_new_dbl(LENGTH(rateConstantsP));
    for (i=0; i < LENGTH(rateConstantsP); i++)
      lst_push_dbl(new_rate_consts, doubleP[i]);
  }

  if (initModP != R_NilValue) {
    /* start from an existing model: re-initialise it in place with the
       requested substitution model and rate-variation settings */
    pf->input_mod = (TreeModel*)EXTPTR_PTR(initModP);
    pf->subst_mod = pf->input_mod->subst_mod;
    tm_register_protect(pf->input_mod);
    
    /* no rate constants given: carry over the ones in the model */
    if (new_rate_consts == NULL && pf->input_mod->rK != NULL && pf->input_mod->nratecats > 1) {
      new_rate_consts = lst_new_dbl(pf->input_mod->nratecats);
      for (i=0; i < pf->input_mod->nratecats; i++) 
	lst_push_dbl(new_rate_consts, pf->input_mod->rK[i]);
    }

    /* likewise carry over the category weights of an empirical-rates
       model */
    if (pf->input_mod->empirical_rates && pf->input_mod->freqK != NULL && pf->input_mod->nratecats > 1) {
      new_rate_weights = lst_new_dbl(pf->input_mod->nratecats);
      for (i=0; i < pf->input_mod->nratecats; i++)
	lst_push_dbl(new_rate_weights, pf->input_mod->freqK[i]);
    }

    tm_reinit(pf->input_mod, 
	      rph_get_subst_mod(substModP),
	      nratesP == R_NilValue ? pf->input_mod->nratecats : INTEGER_VALUE(nratesP),
	      NUMERIC_VALUE(alphaP),
	      new_rate_consts,
	      new_rate_weights);
  } else {
    /* no initial model: record the rate-variation settings on pf */
    if (nratesP != R_NilValue)
      pf->nratecats = INTEGER_VALUE(nratesP);
    if (alphaP != R_NilValue)
      pf->alpha = NUMERIC_VALUE(alphaP);
    if (rateConstantsP != R_NilValue) {
      pf->rate_consts = new_rate_consts;
      /* the number of categories defaults to the number of constants;
         if both are given they must agree */
      if (nratesP == R_NilValue)
	pf->nratecats = lst_size(new_rate_consts);
      else if (lst_size(new_rate_consts) != pf->nratecats) 
	die("length of new_rate_consts does not match nratecats\n");
    }
  }
  pf->subst_mod = rph_get_subst_mod(substModP);
  
  pf->estimate_scale_only = LOGICAL_VALUE(scaleOnlyP);
  
  if (scaleSubtreeP != R_NilValue) {
    /* private copy of the subtree name (+1 for the terminating NUL) */
    pf->subtree_name = smalloc((1+strlen(CHARACTER_VALUE(scaleSubtreeP)))*sizeof(char));
    strcpy(pf->subtree_name, CHARACTER_VALUE(scaleSubtreeP));
  }
  
  pf->random_init = LOGICAL_VALUE(initRandomP);

  pf->init_backgd_from_data = LOGICAL_VALUE(initBackgdFromDataP);
  
  pf->init_parsimony = LOGICAL_VALUE(initParsimonyP);
  
  pf->assume_clock = LOGICAL_VALUE(clockP);

  if (maxEmItsP != R_NilValue)
    pf->max_em_its = INTEGER_VALUE(maxEmItsP);

  pf->precision = get_precision(CHARACTER_VALUE(precisionP));
  if (pf->precision == OPT_UNKNOWN_PREC) {
    die_message = "invalid precision";
    goto rph_phyloFit_end;
  }

  if (gffP != R_NilValue) {
    pf->gff = (GFF_Set*)EXTPTR_PTR(gffP);
    gff_register_protect(pf->gff);
  }

  if (ninfSitesP != R_NilValue)
    pf->nsites_threshold = INTEGER_VALUE(ninfSitesP);
  
  pf->quiet = LOGICAL_VALUE(quietP);

  if (noOptP != R_NilValue) {
    /* join the elements of noOptP into one comma-separated string.
       len starts at the element count, which covers the n-1 commas
       plus the terminating NUL; the string lengths are added on top */
    int len=LENGTH(noOptP), pos=0;
    char *temp;
    for (i=0; i < LENGTH(noOptP); i++) 
      len += strlen(CHARACTER_VALUE(STRING_ELT(noOptP, i)));
    temp = smalloc(len*sizeof(char));
    for (i=0; i < LENGTH(noOptP); i++) {
      if (i != 0) temp[pos++] = ',';
      sprintf(&temp[pos], "%s", CHARACTER_VALUE(STRING_ELT(noOptP, i)));
      pos += strlen(CHARACTER_VALUE(STRING_ELT(noOptP, i)));
    }
    /* sanity check: exactly one byte must remain for the NUL */
    if (pos != len-1) die("ERROR parsing noOpt len=%i pos=%i\n", len, pos);
    temp[pos] = '\0';
    pf->nooptstr = str_new_charstr(temp);
  }

  if (boundP != R_NilValue) {
    pf->bound_arg = lst_new_ptr(LENGTH(boundP));
    for (i=0; i < LENGTH(boundP); i++) {
      String *temp = str_new_charstr(CHARACTER_VALUE(STRING_ELT(boundP, i)));
      lst_push_ptr(pf->bound_arg, temp);
    }
  }

  if (logFileP != R_NilValue) {
    /* a string names a log file; logical TRUE means log to stdout */
    if (IS_CHARACTER(logFileP)) 
      pf->logf = phast_fopen(CHARACTER_VALUE(logFileP), "w+");
    else if (IS_LOGICAL(logFileP) &&
	     LOGICAL_VALUE(logFileP)) {
      pf->logf = stdout;
    }
  }

  if (selectionP != R_NilValue) {
    pf->use_selection = TRUE;
    pf->selection = NUMERIC_VALUE(selectionP);
  }

  msa_register_protect(pf->msa);

  run_phyloFit(pf);
  rv = PROTECT(rph_listOfLists_to_SEXP(pf->results));
  numProtect++;

 rph_phyloFit_end:
  /* close a log file we opened ourselves; never close stdout/stderr */
  if (pf->logf != NULL && pf->logf != stdout && pf->logf != stderr)
    phast_fclose(pf->logf);
  PutRNGstate();
  /* report the deferred error only now, after the log file is closed
     and the RNG state handed back.  The message is passed as an
     argument, never as the format string itself */
  if (die_message != NULL) die("%s", die_message);
  if (numProtect > 0) 
    UNPROTECT(numProtect);
  return rv;
}
Example #22
0
/* Transform the coordinates of all features in a GFF according to a
   local alignment.  Each feature in the original GFF will be replaced
   by zero or more features with transformed begin and end
   coordinates.  The original features are "projected" onto the
   aligned (target) sequence via the alignment, in such a way that if
   a feature contains no aligned bases, then it will not be
   represented, and if a feature contains bases that align to multiple
   "blocks", then it will be split into several features, one for each
   block.  

   The general idea is that the new features should cover only those
   bases in the target sequence that align to bases in the query
   sequence.  Currently, however, insertions in the target sequence
   between gapless alignments of the same block are ignored, so that a
   transformed feature may contain some bases that do not directly
   align to the query sequence.  The rationale is that these
   insertions should generally be small, and should reflect
   small-scale events that do not radically disrupt the local
   properties of the sequence.

   On return, the original features have been freed and gff->features
   points to a new, sorted list of projected features. */
void la_gff_transform(LocalPwAlignment *lpwa, GFF_Set *gff) {
  int i, j, k;
  int new_beg, new_end;
  List *new_features = lst_new_ptr(lst_size(gff->features));
  GFF_Feature *feat, *new_feat;

  for (i = 0; i < lst_size(gff->features); i++) {
    checkInterruptN(i, 1000);
    feat = lst_get_ptr(gff->features, i);
    for (j = 0; j < lst_size(lpwa->alignment_blocks); j++) {
                                /* this is a somewhat inefficient way
                                   to proceed, but the number of
                                   features and the number of
                                   alignment blocks is usually pretty
                                   small; will adjust strategy as
                                   needed */
      AlignmentBlock *ab = lst_get_ptr(lpwa->alignment_blocks, j);
      new_beg = new_end = -1;   /* -1 means "not yet determined" */
      if ((ab->query_beg >= feat->start && ab->query_beg <= feat->end) ||
          (ab->query_end >= feat->start && ab->query_end <= feat->end) ||
          (feat->start >= ab->query_beg && feat->end <= ab->query_end)) {
                                /* block and feature overlap */
        if (feat->start <= ab->query_beg)
                                /* feature extends to the left of the
                                   alignment block; use beg of
                                   block */
          new_beg = ab->target_beg;
        else {                  /* ab->query_beg < feat->start */
          /* find first corresponding base within a gapless alignment */
          for (k = 0; k < lst_size(ab->gapless_alns); k++) {
            GaplessAlignment *ga = lst_get_ptr(ab->gapless_alns, k);
            if (ga->query_beg >= feat->start) {
                                /* gapless alignment overlaps the
                                   feature and the feature extends to
                                   the left (equal to or) beyond the
                                   ga; use the start of the ga */
              new_beg = ga->target_beg;
              break;
            }
            else if (ga->query_end >= feat->start) {
                                /* gapless alignment overlaps the
                                   feature and the ga extends to the
                                   left beyond the feature; use the
                                   aligned base within the ga */
              new_beg = ga->target_beg + (feat->start - ga->query_beg);
              break;
            }
          }
        }
        if (feat->end >= ab->query_end) 
                                /* feature extends to the right of the
                                   alignment block; use end of
                                   block */
          new_end = ab->target_end;
        else {
          /* find last corresponding base within a gapless alignment */
          for (k = lst_size(ab->gapless_alns)-1; k >= 0; k--) {
            GaplessAlignment *ga = lst_get_ptr(ab->gapless_alns, k);
            if (ga->query_end <= feat->end) {
                                /* gapless alignment overlaps the
                                   feature and the feature extends to
                                   the right (equal to or) beyond the
                                   ga; use the end of the ga */
              new_end = ga->target_end;
              break;
            }
            else if (ga->query_beg <= feat->end) {
                                /* gapless alignment overlaps the
                                   feature and the ga extends to the
                                   right beyond the feature; use the
                                   aligned base within the ga */
              new_end = ga->target_beg + (feat->end - ga->query_beg);
              break;
            }
          }
        }
        
        if (new_beg == -1 || new_end == -1)
          die("ERROR: la_gff_transform: new_beg=%i new_end=%i\n",
              new_beg, new_end);

        if (new_end < new_beg)
                                /* the feature lies entirely between
                                   two gapless alignments of this
                                   block: new_beg came from the ga
                                   that follows it and new_end from
                                   the ga that precedes it.  It
                                   contains no aligned bases, so it
                                   is not represented (emitting it
                                   would yield end < start) */
          continue;

        new_feat = gff_new_feature_copy(feat);
        new_feat->start = new_beg;
        new_feat->end = new_end;
        lst_push_ptr(new_features, new_feat);
      }
    }
  }

  /* replace the original features with the projected ones */
  for (i = 0; i < lst_size(gff->features); i++)
    gff_free_feature(lst_get_ptr(gff->features, i));
  lst_free(gff->features);
  gff->features = new_features;
  gff_sort(gff);
}
/* Driver: reads a GFF and a multiple alignment (the two required
   positional arguments, in that order), checks every feature group
   against the reference sequence and, if any alignment-based check
   was requested, against the other sequences in the alignment.
   Groups that pass are written to stdout (unless --no-output);
   groups that fail can be written to a discards file.  Optional
   human-readable log, machine-readable log and summary statistics
   are written to the files given on the command line. */
int main(int argc, char *argv[]) {

  int check_start = 0, check_stop = 0, check_splice = 0, check_nonsense = 0,
    offset5 = 0, offset3 = 0, opt_idx, i, j, indel_strict = 0, no_output = 0,
    check_alignment = 0, splice_strict = 0;
  int ncons_tested, nkept, nconserved_exons;
  int nce_gap_type[NGAP_TYPES], nconsid[NTYPES], nfail[NTYPES];
  double Nfrac = 0.05;
  int c;                        /* must be int, not char: getopt_long
                                   returns int and signals the end of
                                   the options with -1, which an
                                   unsigned plain char can never
                                   equal */
  MSA *msa;
  GFF_Set *gff;
  msa_format_type msa_format = UNKNOWN_FORMAT;
  List *keepers, *problems = lst_new_ptr(10), 
    *ends_adjusted = lst_new_ptr(1), *starts_adjusted = lst_new_ptr(1), 
    *discards=NULL, *intron_splice = lst_new_ptr(10);
  char *rseq_fname = NULL;
  FILE *logf = NULL, *mlogf = NULL, *statsf = NULL, *discardf = NULL;
  cds_gap_type fshift_mode = FSHIFT_BAD;
  char *groupby = "transcript_id";
  msa_coord_map *map;
  int *countNs, *countCDSs;
  FILE *infile;
  char *msa_fname;

  struct option long_opts[] = {
    {"start", 0, 0, 's'},
    {"stop", 0, 0, 't'},
    {"splice", 0, 0, 'l'},
    {"nonsense", 0, 0, 'n'},
    {"fshift", 0, 0, 'f'},
    {"conserved", 0, 0, 'c'},
    {"N-limit", 1, 0, 'N'},
    {"clean-gaps", 0, 0, 'e'},
    {"indel-strict", 0, 0, 'I'},
    {"splice-strict", 0, 0, 'C'},
    {"groupby", 1, 0, 'g'},
    {"msa-format", 1, 0, 'i'},
    {"refseq", 1, 0, 'r'},
    {"offset5", 1, 0, 'o'},
    {"offset3", 1, 0, 'p'},
    {"no-output", 0, 0, 'x'},
    {"discards", 1, 0, 'd'},
    {"log", 1, 0, 'L'},
    {"machine-log", 1, 0, 'M'},
    {"stats", 1, 0, 'S'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  while ((c = getopt_long(argc, argv, "N:i:r:L:M:S:g:d:stlnfceICxh", 
                          long_opts, &opt_idx)) != -1) {
    switch(c) {
    case 's':
      check_alignment = check_start = 1;
      break;
    case 't':
      check_alignment = check_stop = 1;
      break;
    case 'l':
      check_alignment = check_splice = 1;
      break;
    case 'n':
      check_alignment = check_nonsense = 1;
      break;
    case 'f':
      check_alignment = 1;
      fshift_mode = FSHIFT_OK;
      break;
    case 'c':
      check_alignment = check_start = check_stop = check_splice = check_nonsense = 1;
      if (fshift_mode < FSHIFT_OK) fshift_mode = FSHIFT_OK;
      break;
    case 'N':
      Nfrac = get_arg_dbl_bounds(optarg, 0, 1);
      break;
    case 'e':
      check_alignment = 1;
      if (fshift_mode < CLN_GAPS) fshift_mode = CLN_GAPS;
      break;
    case 'I':
      check_alignment = 1;
      fshift_mode = NOVRLP_CLN_GAPS;
      indel_strict = 1;
      break;
    case 'C':
      check_alignment = check_splice = splice_strict = 1;
      break;
    case 'g':
      groupby = optarg;
      break;
    case 'i':
      msa_format = msa_str_to_format(optarg);
      if (msa_format == UNKNOWN_FORMAT) die("Bad alignment format.\n");
      break;
    case 'r':
      rseq_fname = optarg;
      break;
    case 'o':
      offset5 = get_arg_int(optarg);
      break;
    case 'p':
      offset3 = get_arg_int(optarg);
      break;
    case 'L':
      logf = phast_fopen(optarg, "w+");
      break;
    case 'M':
      mlogf = phast_fopen(optarg, "w+");
      break;
    case 'S':
      statsf = phast_fopen(optarg, "w+");
      break;
    case 'd':
      discardf = phast_fopen(optarg, "w+");
      break;
    case 'x':
      no_output = 1;
      break;
    case 'h':
      printf("%s", HELP);
      exit(0);
    case '?':
      die("ERROR: Bad argument.  Try the --help option.\n");
    }
  }

  /* two positional arguments are required: GFF file, alignment file */
  if (optind + 1 >= argc ) {
    die("ERROR:  Missing required arguments.  Try the --help option.\n");
  }
  
  set_seed(-1);

  gff = gff_read_set(phast_fopen(argv[optind], "r"));
  msa_fname = argv[optind+1];
  infile = phast_fopen(msa_fname, "r");
  if (msa_format == UNKNOWN_FORMAT)
    msa_format = msa_format_for_content(infile, 1);
  if (msa_format == MAF) {
    msa = maf_read(infile, 
                   rseq_fname == NULL ? NULL : phast_fopen(rseq_fname, "r"), 
                   1, NULL, NULL, NULL, -1, TRUE, NULL, NO_STRIP, FALSE); 
  }
  else {
    msa = msa_new_from_file_define_format(infile,
                            msa_format, NULL); 
    if (msa->ss == NULL) 
      ss_from_msas(msa, 1, 1, NULL, NULL, NULL, -1, 0);
  }
  if (!msa->ss->tuple_idx)
    die("ERROR: need ordered tuples\n");
  msa_remove_N_from_alph(msa);  /* for backward compatibility (old SS files) */

  if (msa->idx_offset != 0) {   /* avoids offset problem */
    for (i = 0; i < lst_size(gff->features); i++) {
      GFF_Feature *f = lst_get_ptr(gff->features, i);
      f->start -= msa->idx_offset;
      f->end -= msa->idx_offset;
    }
  }

  /* set up coordinate map; assume GFF is for sequence 1 */
  map = msa_build_coord_map(msa, 1);

  /* convert all features */
  for (i = 0; i < lst_size(gff->features); i++) {
    GFF_Feature *f = lst_get_ptr(gff->features, i);
    int newstart, newend;
 
    if (f->start < 0 || f->end < f->start)
      die("ERROR: bad feature in GFF (start=%d, end=%d).\n",
          f->start, f->end);

    newstart = msa_map_seq_to_msa(map, f->start);
    newend = msa_map_seq_to_msa(map, f->end);

    if (newstart < 0 || newend < newstart)
      die("ERROR: unable to map coordinates for feature (start=%d, end=%d).\n",
          f->start, f->end);

    f->start = newstart;
    f->end = newend;
  }

  gff_group(gff, groupby);      /* do this after coord conversion, or
                                   group coords and feature coords
                                   will be out of sync */

  keepers = lst_new_ptr(lst_size(gff->features));
  if (discardf != NULL) discards = lst_new_ptr(lst_size(gff->features));

  ncons_tested = nkept = nconserved_exons = 0;
  for (i = 0; i < NTYPES; i++) nconsid[i] = 0;
  for (i = 0; i < NTYPES; i++) nfail[i] = 0;
  for (i = 0; i < NGAP_TYPES; i++) nce_gap_type[i] = 0;  

  /* per-sequence counters, reset for every group that reaches the
     feature-by-feature checks */
  countNs = smalloc(msa->nseqs * sizeof(int));
  countCDSs = smalloc(msa->nseqs * sizeof(int));

  for (i = 0; i < lst_size(gff->groups); i++) {
    GFF_FeatureGroup *group = lst_get_ptr(gff->groups, i);
    List *gfeatures = group->features;
    GFF_Feature *feat;
    status_type status = OKAY;
    cds_gap_type gt = FSHIFT_BAD;
    problems_clear(problems);

    /* make sure have frame info for CDSs */
    for (j = 0; j < lst_size(gfeatures); j++) {
      feat = lst_get_ptr(gfeatures, j);
      if (str_equals_charstr(feat->feature, GFF_CDS_TYPE) && 
          feat->frame == GFF_NULL_FRAME)
        die("ERROR: Missing frame info for CDS.\n");
    }

    /* First, exclude stop codons from cds's, if necessary (simplifies
       the detection of nonsense mutations). */
    exclude_stops(group, starts_adjusted, ends_adjusted);

    /* In all cases, discard any group for which the reference sequence
       doesn't have valid splice sites or start/stop codons, or has a
       premature stop codon */
    if (!ref_seq_okay(gfeatures, msa, offset3, indel_strict, splice_strict,
                      problems)) {
      status = BAD_REF;
      nfail[BAD_REF]++;
    }
    else
      /* Everything else counts as a potentially valid group */
      ncons_tested++;

    if (status == OKAY && check_alignment) {      
                                /* only bother with below if
                                   interested in cross-species
                                   conservation */

      /* Check first to make sure there's alignment across species in
         the cds; if not, there's no need to look at individual
         features. */
      for (j = 0; j < lst_size(gfeatures); j++) { 
        feat = lst_get_ptr(gfeatures, j);
        if (str_equals_charstr(feat->feature, GFF_CDS_TYPE) &&
            is_incomplete_alignment(feat, msa)) {
          status = NO_ALN;
          nfail[NO_ALN]++;
          problem_add(problems, feat, NO_ALN, -1, -1);
          break;
        }
      }

      if (status == OKAY) {     /* we have alignment and agreement
                                   with the ref seq; now check feature
                                   by feature  */

        lst_clear(intron_splice);
        for (j = 0; j < msa->nseqs; j++) countNs[j] = countCDSs[j] = 0;

        for (j = 0; j < lst_size(gfeatures); j++) {
          feat = lst_get_ptr(gfeatures, j);

          if (feat->end - 1 >= msa->length) 
            die("ERROR: feature extends beyond alignment (%d >= %d).\n",
                feat->end - 1, msa->length);
        
          if (check_start && str_equals_charstr(feat->feature, GFF_START_TYPE)) {

            nconsid[BAD_START]++;

            if (!is_conserved_start(feat, msa)) {
              status = BAD_START;
              problem_add(problems, feat, BAD_START, -1, -1);
            }
          }

          else if (check_stop && str_equals_charstr(feat->feature, GFF_STOP_TYPE)) {

            nconsid[BAD_STOP]++;

            if (!is_conserved_stop(feat, msa)) {
              status = BAD_STOP;
              problem_add(problems, feat, BAD_STOP, -1, -1);
            }
          }

          else if (check_splice && 
                   str_equals_charstr(feat->feature, SPLICE_5)) {

            nconsid[BAD_5_SPLICE]++;

            if (!is_conserved_5splice(feat, msa, offset5, splice_strict)) {
              status = BAD_5_SPLICE;
              problem_add(problems, feat, BAD_5_SPLICE, -1, -1);
            }
            else lst_push_ptr(intron_splice, feat);
          }

          else if (check_splice && 
                   str_equals_charstr(feat->feature, SPLICE_5_UTR)) {

            nconsid[BAD_5_SPLICE_UTR]++;

            if (!is_conserved_5splice(feat, msa, offset5, splice_strict)) {
              status = BAD_5_SPLICE_UTR;
              problem_add(problems, feat, BAD_5_SPLICE_UTR, -1, -1);
            }
            else lst_push_ptr(intron_splice, feat);
          }

          else if (check_splice && str_equals_charstr(feat->feature, SPLICE_3)) {

            nconsid[BAD_3_SPLICE]++;

            if (!is_conserved_3splice(feat, msa, offset3, splice_strict)) {
              status = BAD_3_SPLICE;
              problem_add(problems, feat, BAD_3_SPLICE, -1, -1);
            }
            else lst_push_ptr(intron_splice, feat);
          }

          /* 3' UTR splice sites: must test SPLICE_3_UTR here; testing
             SPLICE_3 a second time would make this branch
             unreachable and leave the BAD_3_SPLICE_UTR counters at
             zero */
          else if (check_splice && 
                   str_equals_charstr(feat->feature, SPLICE_3_UTR)) {

            nconsid[BAD_3_SPLICE_UTR]++;

            if (!is_conserved_3splice(feat, msa, offset3, splice_strict)) {
              status = BAD_3_SPLICE_UTR;
              problem_add(problems, feat, BAD_3_SPLICE_UTR, -1, -1);
            }
            else lst_push_ptr(intron_splice, feat);
          }

          else if (str_equals_charstr(feat->feature, GFF_CDS_TYPE)) {
 
            if (fshift_mode > FSHIFT_BAD 
                && (gt = get_cds_gap_type(feat, msa, problems)) < fshift_mode) {
              if (status == OKAY || status == NONSENSE) status = FSHIFT;
            }

            if (check_nonsense && !is_nonsense_clean(feat, msa, problems)) {
              if (status == OKAY) status = NONSENSE;
            }

            if (Nfrac < 1) 
              get_N_counts(countNs, countCDSs, feat, msa);
          }
        } /* end loop through features in group */

        /* still have to make sure splice sites are paired correctly
           (GT-AG, GC-AG, AT-AC) */
        if (status == OKAY && !splice_strict && lst_size(intron_splice) >= 2 &&
            !are_introns_okay(intron_splice, msa, problems, offset5, offset3)) 
          status = BAD_INTRON;

        /* also check fraction of Ns */
        if (Nfrac < 1) {
          enum {MY_OKAY, MY_FAIL, MY_WARN} Nstatus = MY_OKAY;
          for (j = 0; j < msa->nseqs; j++) {
            if ((double)countNs[j] / countCDSs[j] > Nfrac) Nstatus = MY_FAIL;
            if (Nstatus == MY_OKAY && countNs[j] > 0) Nstatus = MY_WARN;
          }
          if (Nstatus == MY_FAIL) {
            problem_add(problems, NULL, TOO_MANY_Ns, -1, -1);
            if (status == OKAY) status = TOO_MANY_Ns;
          }
          else if (Nstatus == MY_WARN) 
            problem_add(problems, NULL, WARN_Ns, -1, -1);
        }

        /* if collecting stats, record counts for failures */
        if (statsf != NULL) {
          if (status != OKAY) {
            for (j = 0; j < lst_size(problems); j++) {
              struct Problem *problem = lst_get_ptr(problems, j);
              status_type ftype = problem->status;
              if ((ftype == FSHIFT || ftype == NONSENSE) && 
                  status != FSHIFT && status != NONSENSE)
                continue;       /* don't count secondary frame shifts
                                   and nonsense mutations */ 

              if (ftype == BAD_INTRON && j % 2 == 0)
                continue;       /* only count one of every pair of these */

              nfail[ftype]++;
            }
          }

          /* also keep track of the total number of "conserved exons", and
             the number having each kind of gap */
          if ((status == OKAY || (status == FSHIFT && gt >= FSHIFT_OK))) {
            nconserved_exons++;
            nce_gap_type[gt]++;     /* number of conserved exons having
                                       given type of gaps */
          }
        }
      } /* end if (status == OKAY) [checks for conserved features] */
    } /* end if (status == OKAY && check_alignment) [all cross-species
         checks] */

    /* now we have looked at the whole group; we just need to do some
       final accounting and logging */

    if (status == OKAY) {
      nkept++;
      if (!no_output) {
        restore_stops(group, starts_adjusted, ends_adjusted);
        for (j = 0; j < lst_size(gfeatures); j++)
          lst_push_ptr(keepers, lst_get_ptr(gfeatures, j));
      }
      if (logf != NULL && lst_size(problems) > 0) /* warnings only */
        write_log(logf, group, status, problems, msa, map);
      if (mlogf != NULL) {
        /* no problem, need to add an okay status to log */
        problem_add(problems, NULL, OKAY, -1, -1);
        write_machine_log(mlogf, group, problems, map); /* may include
                                                           warnings */
      }
    }
    else {
      if (discardf != NULL) {
        restore_stops(group, starts_adjusted, ends_adjusted);
        for (j = 0; j < lst_size(gfeatures); j++) 
          lst_push_ptr(discards, lst_get_ptr(gfeatures, j));
      }
      if (logf != NULL) 
        write_log(logf, group, status, problems, msa, map);
      if (mlogf != NULL)
        write_machine_log(mlogf, group, problems, map);
    }
  } /* end loop over groups */

  /* write main output and discards */
  if (!no_output || discardf != NULL) {
    /* first map features back to coord frame of reference seq. */
    for (i = 0; i < lst_size(gff->features); i++) {
      GFF_Feature *f = lst_get_ptr(gff->features, i);
      f->start = msa_map_msa_to_seq(map, f->start) + msa->idx_offset;
      f->end = msa_map_msa_to_seq(map, f->end) + msa->idx_offset;
    }

    /* the same GFF_Set is reused for printing: its feature list is
       swapped for the keepers, then for the discards */
    if (!no_output) {
      gff->features = keepers;
      gff_print_set(stdout, gff);
    }

    if (discardf != NULL) {
      gff->features = discards;
      gff_print_set(discardf, gff);
    }
  }


  /* dump counts to stats file */
  if (statsf != NULL) {
    fprintf(statsf, "#%11s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s %12s\n", 
            "total", "nbad_ref", "nconsid", "nkept", "nno_aln", 
            "nbad_starts", "(out of)", "nbad_stops", "(out of)", 
            "nbad_5spl", "(out of)", "nbad_3spl", "(out of)", 
            "nbad_5utr", "(out of)", "nbad_3utr", "(out of)", 
            "nbad_intron", "nnons", "nfshifts", "nNs", "ncons_exons", 
            "nce_ngaps", "nce_nov_cln", "nce_clean", "nce_fshftok");
    fprintf(statsf, "%12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d %12d\n", 
            nfail[BAD_REF]+ncons_tested, nfail[BAD_REF], ncons_tested, nkept, 
            nfail[NO_ALN], nfail[BAD_START], nconsid[BAD_START], 
            nfail[BAD_STOP], nconsid[BAD_STOP], nfail[BAD_5_SPLICE], 
            nconsid[BAD_5_SPLICE], nfail[BAD_3_SPLICE], nconsid[BAD_3_SPLICE],
            nfail[BAD_5_SPLICE_UTR], nconsid[BAD_5_SPLICE_UTR],
            nfail[BAD_3_SPLICE_UTR], nconsid[BAD_3_SPLICE_UTR], 
            nfail[BAD_INTRON], nfail[NONSENSE], nfail[FSHIFT], 
            nfail[TOO_MANY_Ns], nconserved_exons, nce_gap_type[NGAPS], 
            nce_gap_type[NOVRLP_CLN_GAPS], nce_gap_type[CLN_GAPS], 
            nce_gap_type[FSHIFT_OK]);
    fprintf(statsf, "%s", STATS_DESCRIPTION);
  }

  if (logf != NULL) phast_fclose(logf);
  if (mlogf != NULL) phast_fclose(mlogf);
  if (statsf != NULL) phast_fclose(statsf);
  if (discardf != NULL) phast_fclose(discardf);

  return 0;
}
/* Scan the alignment columns spanned by a CDS feature for gaps, one
   sequence at a time.  Returns:
     NGAPS            no gap characters found
     NOVRLP_CLN_GAPS  every gap has a length that is a multiple of 3,
                      no two distinct gaps overlap, and no gap comes
                      within 3 columns of either end of the feature
     CLN_GAPS         every gap has a length that is a multiple of 3,
                      but the conditions above do not all hold
     FSHIFT_BAD       some gap has a length that is not a multiple of
                      3; in this case only, *problem is set to a new
                      Problem describing that gap
   Compensatory indels are not considered here. */
int scan_for_gaps(GFF_Feature *feat, MSA *msa, Problem **problem) {
  const int first_col = feat->start - 1;  /* 0-based, inclusive */
  const int last_col = feat->end - 1;     /* 0-based, inclusive */
  int seq, col, idx;
  int touches_edge = 0;
  cds_gap_type result = NGAPS;
  List *gap_list = lst_new_ptr(10);

  for (seq = 0; result != FSHIFT_BAD && seq < msa->nseqs; seq++) {
    col = first_col;
    while (col <= last_col) {
      int lo, hi;
      struct gap *rec;

      if (ss_get_char_pos(msa, col, seq, 0) != GAP_CHAR) {
        col++;
        continue;
      }

      /* grow the run of gap characters in both directions, staying
         inside the feature; lo and hi end up inclusive */
      lo = col;
      while (lo - 1 >= first_col && 
             ss_get_char_pos(msa, lo - 1, seq, 0) == GAP_CHAR)
        lo--;
      hi = col;
      while (hi + 1 <= last_col && 
             ss_get_char_pos(msa, hi + 1, seq, 0) == GAP_CHAR)
        hi++;

      if ((hi - lo + 1) % 3 != 0) {
        /* frame-shifting gap: report it and stop scanning */
        result = FSHIFT_BAD;
        *problem = problem_new(feat, FSHIFT, lo, hi);
        (*problem)->cds_gap = FSHIFT_BAD;
        break;
      }

      /* remember whether any gap falls within 3 sites of a cds
         boundary */
      if (lo <= first_col + 3 || hi >= last_col - 3)
        touches_edge = 1;

      if (result == NGAPS) 
        result = CLN_GAPS;

      rec = smalloc(sizeof(struct gap));
      rec->start = lo;
      rec->end = hi;
      lst_push_ptr(gap_list, rec);

      col = hi + 1;             /* resume just past this gap */
    }
  }

  if (result == CLN_GAPS) {
    /* sort the gaps and compare neighbours: two gaps that intersect
       without being identical count as an overlap */
    int overlap_found = 0;

    lst_qsort(gap_list, gap_compare);
    for (idx = 0; !overlap_found && idx + 1 < lst_size(gap_list); idx++) {
      struct gap *prev = lst_get_ptr(gap_list, idx);
      struct gap *next = lst_get_ptr(gap_list, idx + 1);
      int identical = (next->start == prev->start && next->end == prev->end);

      if (next->start <= prev->end && !identical)
        overlap_found = 1;
    }

    /* the boundary criterion is folded into the overlap criterion:
       a gap near the boundary also rules out NOVRLP_CLN_GAPS */
    if (!overlap_found && !touches_edge)
      result = NOVRLP_CLN_GAPS;
  }

  for (idx = 0; idx < lst_size(gap_list); idx++) 
    sfree(lst_get_ptr(gap_list, idx));
  lst_free(gap_list);

  return result;
}
/* Entry point for phyloFit.  Parses the command line into a
   phyloFit_struct, opens and reads the alignment (detecting its format
   if not given), and hands off to run_phyloFit.  Returns 0 on success;
   usage and input errors are reported through die(). */
int main(int argc, char *argv[]) {
  char *msa_fname = NULL, *alph = "ACGT";
  msa_format_type input_format = UNKNOWN_FORMAT;
  int c;                        /* must be int: getopt_long returns an
                                   int, and its -1 end marker cannot be
                                   represented where plain char is
                                   unsigned */
  int opt_idx, seed=-1;
  String *optstr;
  List *tmplist = NULL; 
  struct phyloFit_struct *pf;
  FILE *infile;
  
  struct option long_opts[] = {
    {"msa", 1, 0, 'm'},
    {"tree", 1, 0, 't'},
    {"subst-mod", 1, 0, 's'},
    {"msa-format", 1, 0, 'i'},
    {"nrates", 1, 0, 'k'},
    {"alpha", 1, 0, 'a'},
    {"features", 1, 0, 'g'},
    {"catmap", 1, 0, 'c'},
    {"log", 1, 0, 'l'},
    {"out-root", 1, 0, 'o'},
    {"EM", 0, 0, 'E'},
    {"error", 1, 0, 'e'},
    {"precision", 1, 0, 'p'},
    {"do-cats", 1, 0, 'C'},
    {"non-overlapping", 0, 0, 'V'},
    {"markov", 0, 0, 'N'},
    {"reverse-groups", 1, 0, 'R'},
    {"init-model", 1, 0, 'M'},
    {"init-random", 0, 0, 'r'},
    {"init-parsimony", 0, 0, 'y'},
    {"print-parsimony", 1, 0, 'Y'},
    {"lnl", 0, 0, 'L'},
    {"scale-only", 0, 0, 'B'},
    {"scale-subtree", 1, 0, 'S'},
    {"estimate-freqs", 0, 0, 'F'},
    {"sym-freqs", 0, 0, 'W'},
    {"no-freqs", 0, 0, 'f'},
    {"no-rates", 0, 0, 'n'},
    {"no-opt", 1, 0, 'O'},
    {"min-informative", 1, 0, 'I'},
    {"gaps-as-bases", 0, 0, 'G'},     
    {"quiet", 0, 0, 'q'},
    {"help", 0, 0, 'h'},
    {"windows", 1, 0, 'w'},
    {"windows-explicit", 1, 0, 'v'},
    {"ancestor", 1, 0, 'A'},
    {"post-probs", 0, 0, 'P'},
    {"expected-subs", 0, 0, 'X'},
    {"expected-total-subs", 0, 0, 'Z'},
    {"expected-subs-col", 0, 0, 'J'},
    {"column-probs", 0, 0, 'U'},
    {"rate-constants", 1, 0, 'K'},
    {"ignore-branches", 1, 0, 'b'},
    {"clock", 0, 0, 'z'},
    {"alt-model", 1, 0, 'd'},
    {"label-branches", 1, 0, 0},
    {"label-subtree", 1, 0, 0},
    {"selection", 1, 0, 0},
    {"bound", 1, 0, 'u'},
    {"seed", 1, 0, 'D'},
    {0, 0, 0, 0}
  };

  // NOTE: remaining shortcuts left: HjQx

  pf = phyloFit_struct_new(0);

  while ((c = getopt_long(argc, argv, "m:t:s:g:c:C:i:o:k:a:l:w:v:M:p:A:I:K:S:b:d:O:u:Y:e:D:GVENRqLPXZUBFfnrzhWyJ", long_opts, &opt_idx)) != -1) {
    switch(c) {
    case 'm':
      msa_fname = optarg;
      break;
    case 't':
      if (optarg[0] == '(')        /* in this case, assume topology given
                                   at command line */
        pf->tree = tr_new_from_string(optarg);
      else 
        pf->tree = tr_new_from_file(phast_fopen(optarg, "r"));
      break;
    case 's':
      pf->subst_mod = tm_get_subst_mod_type(optarg);
      if (pf->subst_mod == UNDEF_MOD) 
        die("ERROR: illegal substitution model.     Type \"phyloFit -h\" for usage.\n");
      break;
    case 'g':
      pf->gff = gff_read_set(phast_fopen(optarg, "r"));
      break;
    case 'c':
      pf->cm = cm_new_string_or_file(optarg);
      break;
    case 'C':
      pf->cats_to_do_str = get_arg_list(optarg);
      break;
    case 'V':
      pf->nonoverlapping = TRUE;
      break;
    case 'o':
      pf->output_fname_root = optarg;
      break;
    case 'k':
      pf->nratecats = get_arg_int_bounds(optarg, 0, INFTY);
      break;
    case 'a':
      pf->alpha = get_arg_dbl(optarg);
      break;
    case 'R':
      pf->reverse_group_tag = optarg;
      break;
    case 'i':
      input_format = msa_str_to_format(optarg);
      if (input_format == UNKNOWN_FORMAT)
        die("ERROR: unrecognized alignment format.    Type 'phyloFit -h' for usage.\n");
      break;
    case 'l':
      if (!strcmp(optarg, "-"))
        pf->logf = stderr;
      else pf->logf = phast_fopen(optarg, "w+");
      break;
    case 'N':
      pf->use_conditionals = 1;
      break;
    case 'w':
      tmplist = get_arg_list(optarg);
      if (lst_size(tmplist) != 2 ||
          str_as_int(lst_get_ptr(tmplist, 0), &(pf->window_size)) != 0 ||
          str_as_int(lst_get_ptr(tmplist, 1), &(pf->window_shift)) != 0) 
        die("ERROR: illegal arguments to --windows.\n");
      lst_free_strings(tmplist);
      lst_free(tmplist);
      break;
    case 'v':
      tmplist = get_arg_list(optarg);
      if (lst_size(tmplist) % 2 != 0) 
        die("ERROR: argument to --windows-explicit must be a list of even length.\n");
      pf->window_coords = str_list_as_int(tmplist);
      /* release the parsed strings as well as the list itself, as the
         --windows and --rate-constants cases do */
      lst_free_strings(tmplist);
      lst_free(tmplist);
      break;
    case 'E':
      pf->use_em = TRUE;
      break;
    case 'e':
      pf->error_fname=optarg;
      break;
    case 'p':
      if (!strcmp(optarg, "LOW")) pf->precision = OPT_LOW_PREC;
      else if (!strcmp(optarg, "MED")) pf->precision = OPT_MED_PREC;
      else if (!strcmp(optarg, "HIGH")) pf->precision = OPT_HIGH_PREC;
      else if (!strcmp(optarg, "VERY_HIGH")) pf->precision = OPT_VERY_HIGH_PREC;
      else die("ERROR: --precision must be LOW, MED, HIGH, or VERY_HIGH.\n\n");
      break;
    case 'M':
      pf->input_mod = tm_new_from_file(phast_fopen(optarg, "r"), 1);
      break;
    case 'r':
      pf->random_init = TRUE;
      break;
    case 'y':
      pf->init_parsimony = TRUE;
      break;
    case 'Y':
      pf->init_parsimony = TRUE;
      pf->parsimony_cost_fname = optarg;
      pf->parsimony_only=TRUE;
      break; 
    case 'L':
      pf->likelihood_only = TRUE;
      break;
    case 'q':
      pf->quiet = TRUE;
      break;
    case 'P':
      pf->do_bases = TRUE;
      break;
    case 'X':
      pf->do_expected_nsubst = TRUE;
      break;
    case 'Z':
      pf->do_expected_nsubst_tot = TRUE;
      break;
    case 'J':
      pf->do_expected_nsubst_col = TRUE;
      break;
    case 'U':
      pf->likelihood_only = TRUE;        /* force -L */
      pf->nsites_threshold = 0;        /* also force this; typical use is
                                   with small number of tuples, no
                                   tuple_idx */
      pf->do_column_probs = TRUE;
      break;
    case 'A':
      pf->root_seqname = optarg;
      break;
    case 'I':
      pf->nsites_threshold = get_arg_int(optarg);
      break;
    case 'G':
      pf->gaps_as_bases = TRUE;
      alph = "ACGT-";
      break;
    case 'B':
      pf->estimate_scale_only = TRUE;
      break;
    case 'S':
      pf->subtree_name = optarg;
      break;       
    case 'F':
      pf->estimate_backgd = TRUE;
      break;
    case 'W':
      pf->estimate_backgd = TRUE;
      pf->symfreq = TRUE;
      break;
    case 'f':
      pf->no_freqs = TRUE;
      break;
    case 'n':
      pf->no_rates = TRUE;
      break;
    case 'K':
      tmplist = get_arg_list(optarg);
      pf->rate_consts = str_list_as_dbl(tmplist);
      pf->nratecats = lst_size(pf->rate_consts);
      pf->use_em = 1;
      lst_free_strings(tmplist); lst_free(tmplist);
      break;
    case 'b':
      pf->ignore_branches = get_arg_list(optarg);
      break;
    case 'z':
      pf->assume_clock = TRUE;
      break;
    case 'O':
      if (pf->nooptstr == NULL) 
        pf->nooptstr = str_new_charstr(optarg);
      else die("ERROR: no-opt argument can only be used once!  parameters can be comma-separated list.");
      break;
    case 'd':
      if (pf->alt_mod_str == NULL) {
        pf->alt_mod_str = lst_new_ptr(1);
      }
      optstr = str_new_charstr(optarg);
      lst_push_ptr(pf->alt_mod_str, optstr);
      break;
    case 0:
      /* long-only options (no short flag): identify by name */
      if (strcmp(long_opts[opt_idx].name, "label-branches") == 0 ||
          strcmp(long_opts[opt_idx].name, "label-subtree") == 0) {
        optstr = str_new_charstr(optarg);
        if (pf->label_str == NULL) {
          pf->label_str = lst_new_ptr(3);
          pf->label_type = lst_new_int(3);
        }
        lst_push_ptr(pf->label_str, optstr);
        lst_push_int(pf->label_type, 
                     strcmp(long_opts[opt_idx].name, "label-branches") == 0 ? 
                     BRANCH_TYPE : SUBTREE_TYPE);
      }
      else if (strcmp(long_opts[opt_idx].name, "selection") == 0) {
        pf->selection = get_arg_dbl(optarg);
        pf->use_selection = TRUE;
      }
      else {
        die("ERROR: unknown option.  Type 'phyloFit -h' for usage.\n");
      }
      break;
    case 'u':
      if (pf->bound_arg == NULL) 
        pf->bound_arg = lst_new_ptr(1);
      optstr = str_new_charstr(optarg);
      lst_push_ptr(pf->bound_arg, optstr);
      break;
    case 'D':
      seed = get_arg_int_bounds(optarg, 1, INFTY);
      break;
    case 'h':
      printf("%s", HELP);
      exit(0);
    case '?':
      die("ERROR: illegal argument.     Type 'phyloFit -h' for usage.\n");
    }
  }

  set_seed(seed);

  /* fall back to the first positional argument when --msa was not given */
  if (msa_fname == NULL) {
    if (optind >= argc) 
      die("ERROR: missing alignment filename.  Type 'phyloFit -h' for usage.\n");
    msa_fname = argv[optind];
    pf->msa_fname = msa_fname;
  }

  infile = phast_fopen(msa_fname, "r");

  if (input_format == UNKNOWN_FORMAT)
    input_format = msa_format_for_content(infile, 1);

  if (pf->nonoverlapping && (pf->use_conditionals || pf->gff != NULL || 
                             pf->cats_to_do_str || input_format == SS))
    die("ERROR: cannot use --non-overlapping with --markov, --features,\n--msa-format SS, or --do-cats.\n");


  /* read alignment */
  if (!pf->quiet) fprintf(stderr, "Reading alignment from %s ...\n", msa_fname);
  if (input_format == MAF) {
    pf->msa = maf_read(infile, NULL, 
                       tm_order(pf->subst_mod) + 1, 
                       NULL, pf->gff, pf->cm, 
                       pf->nonoverlapping ? tm_order(pf->subst_mod) + 1 : -1, 
                       FALSE, pf->reverse_group_tag, NO_STRIP, FALSE);
    if (pf->gaps_as_bases) 
      msa_reset_alphabet(pf->msa, alph);
  }
  else 
    pf->msa = msa_new_from_file_define_format(infile, 
                                input_format, alph);

  /* set up for categories */
  /* first label sites, if necessary */
  pf->label_categories = (input_format != MAF);

  run_phyloFit(pf);

  /* only close a log file we opened ourselves */
  if (pf->logf != NULL && pf->logf != stderr && pf->logf != stdout)
    phast_fclose(pf->logf);
  if (!pf->quiet) fprintf(stderr, "Done.\n");
  sfree(pf);
  
  return 0;
}
/* Build a Problem via problem_new() from the given feature, status
   and coordinate range, append it to 'problems', and return it to the
   caller */
Problem *problem_add(List *problems, GFF_Feature *feat, status_type status,
                     int start, int end) {
  Problem *new_problem;

  new_problem = problem_new(feat, status, start, end);
  lst_push_ptr(problems, new_problem);

  return new_problem;
}
/* reconstruct indels by parsimony and assign all base probs to -1
   where ancestral bases are inferred not to have been present.  For
   each column tuple of msa, the internal nodes of mod->tree that are
   inferred to be gaps get mod->tree_posteriors->base_probs[0][*][id][tup]
   set to -1; leaves are never modified.  Dies if a sequence name has
   no matching node in the tree. */
void do_indels(MSA *msa, TreeModel *mod) {
  int s, tup, i, j;
  TreeNode *n, *lca;
  char c;
  typedef enum {IGNORE, GAP, BASE, MISSING, AMBIG} label_type;
  List *postorder;

  label_type *label = smalloc(mod->tree->nnodes * sizeof(label_type));
  List *inside = lst_new_ptr(mod->tree->nnodes), 
    *outside = lst_new_ptr(mod->tree->nnodes),
    *ambig_cases = lst_new_ptr(mod->tree->nnodes);
  int *seq_to_leaf = smalloc(msa->nseqs * sizeof(int));

  /* build mapping from seqs to leaf indices in tree */
  for (s = 0; s < msa->nseqs; s++) {
    TreeNode *leaf = tr_get_node(mod->tree, msa->names[s]);
    if (leaf == NULL)
      die("ERROR: no match for sequence \"%s\" in tree.\n", msa->names[s]);
    seq_to_leaf[s] = leaf->id;
  }    

  if (mod->msa_seq_idx == NULL)
    tm_build_seq_idx(mod, msa);

  postorder = tr_postorder(mod->tree);

  for (tup = 0; tup < msa->ss->ntuples; tup++) {
    int min = mod->tree->nnodes, max = -1, ngaps = 0, skip_root = FALSE;

    /* find min and max ids of seqs that actually have bases (non-gaps) */
    for (s = 0; s < msa->nseqs; s++) {
      if (ss_get_char_tuple(msa, tup, s, 0) == GAP_CHAR) {
        ngaps++;
        continue;
      }
      if (seq_to_leaf[s] < min) min = seq_to_leaf[s];
      if (seq_to_leaf[s] > max) max = seq_to_leaf[s];

      /* NOTE: missing data being handled like bases here; in some
         cases, a base may be inferred at an ancestral node, when the
         only evidence for it is missing data in the leaves.  There
         are ambiguous cases; we'll err on the side of predicting
         bases rather than indels */
    }

    if (ngaps <= 1) continue;   /* short cut -- impossible to infer
                                   gaps in ancestors */

    else if (ngaps >= msa->nseqs - 1) {
      /* in this case, all ancestors must be gaps */
      for (i = 0; i < mod->tree->nnodes; i++) {
        n = lst_get_ptr(mod->tree->nodes, i);
        if (n->lchild == NULL || n->rchild == NULL) 
          continue;               /* ignore leaves */
        for (j = 0; j < mod->rate_matrix->size; j++)
          mod->tree_posteriors->base_probs[0][j][n->id][tup] = -1;
        /* mark as gap */
      }
      continue;
    }

    /* sanity checks; min and max are ints, so they must be printed
       with %d (an int passed for %e is undefined behavior) */
    if (min < 0) die("prequel.c: min = %d < 0\n", min);
    if (max < min) die("prequel.c: max (%d) < min (%d)", max, min);

    /* the LCA of all leaves with non-gaps must be the first ancestor of
       the node with the max id that has an id smaller than the min
       id.  This is based on the assumption that node ids are assigned
       sequentially in a preorder traversal of the tree, which will be
       true as long as the tree is read from a Newick file by the code
       in trees.c */
    for (lca = lst_get_ptr(mod->tree->nodes, max); lca->id > min; 
         lca = lca->parent);

    /* by parsimony, the base was inserted on the branch to the LCA,
       and all ancestral nodes outside the subtree rooted at the LCA
       did not have bases */

    if (lca == mod->tree->lchild || lca == mod->tree->rchild)
      skip_root = TRUE;        /* don't mark root as gap in this case:
                                  can't distinguish insertion from
                                  deletion so assume deletion */

    /* mark ancestral bases outside subtree beneath LCA as gaps */
    tr_partition_nodes(mod->tree, lca, inside, outside);
    for (i = 0; i < mod->tree->nnodes; i++) label[i] = BASE;
    for (i = 0; i < lst_size(outside); i++) {
      n = lst_get_ptr(outside, i);
      label[n->id] = IGNORE;
      if (n->lchild == NULL || n->rchild == NULL) 
        continue;               /* skip leaves */
      if (n == mod->tree && skip_root) 
        continue;               /* skip root if condition above */
      for (j = 0; j < mod->rate_matrix->size; j++)
        mod->tree_posteriors->base_probs[0][j][n->id][tup] = -1;
      /* mark as gap */
    }

    /* check for gaps in subtree; if there's at most one, we can go
       on; otherwise have to use parsimony to infer history in subtree */
    ngaps = 0;
    for (i = 0; i < lst_size(inside); i++) {
      n = lst_get_ptr(inside, i);
      if (n->lchild == NULL &&
          ss_get_char_tuple(msa, tup, mod->msa_seq_idx[n->id], 0) == GAP_CHAR)
        ngaps++;
    }
    if (ngaps <= 1) continue;

    /* use Dollo parsimony to infer the indel history of the subtree
       beneath the LCA.  Use the fact that every base must have a
       chain of bases to the LCA, because, assuming the alignment is
       correct, no insertions are possible beneath the LCA */
    lst_clear(ambig_cases);
    for (i = 0; i < lst_size(postorder); i++) {
      n = lst_get_ptr(postorder, i);
      if (label[n->id] == IGNORE) continue; /* outside subtree */

      /* MISSING means all leaves beneath node have missing data */
      /* AMBIG means combination of gaps and missing data beneath node */

      else if (n->lchild == NULL) {  /* leaf in subtree */
        c = ss_get_char_tuple(msa, tup, mod->msa_seq_idx[n->id], 0);
        if (c == GAP_CHAR)
          label[n->id] = GAP;
        else if (msa->is_missing[(int)c]) 
          label[n->id] = MISSING;
        else
          label[n->id] = BASE;
      }
      else {                    /* internal node in subtree */
        if (label[n->lchild->id] == BASE || label[n->rchild->id] == BASE)
          label[n->id] = BASE;  /* by Dollo parsimony */
        else if ((label[n->lchild->id] == GAP || label[n->lchild->id] == AMBIG) &&
                 (label[n->rchild->id] == GAP || label[n->rchild->id] == AMBIG))
          label[n->id] = GAP;   /* gaps from both sides and no bases -- must be gap */
        else if (label[n->lchild->id] == MISSING && label[n->rchild->id] == MISSING)
          label[n->id] = MISSING;
        else {              /* must be GAP/MISSING or AMBIG/MISSING */
          label[n->id] = AMBIG;
          lst_push_ptr(ambig_cases, n);
        }
      }
    }

    /* now resolve any ambiguities, by giving each ambiguous node the same
       label as its parent; traversing ambig_cases in reverse order
       ensures that parents are visited before children  */

    /* first make sure root of subtree has a base */
    if (label[lca->id] == MISSING || label[lca->id] == AMBIG)
      label[lca->id] = BASE;
    /* in this case there is all missing data and gaps beneath the LCA;
       hard to know what is right, but let's force a base and err on
       the side of bases rather than gaps */

    for (i = lst_size(ambig_cases) - 1; i >= 0; i--) {
      n = lst_get_ptr(ambig_cases, i);
      if (n == lca) continue;
      else label[n->id] = label[n->parent->id];
    }

    /* now mark gaps inside subtree, as needed */
    for (i = 0; i < lst_size(inside); i++) {
      n = lst_get_ptr(inside, i);
      if (n->lchild == NULL || n->rchild == NULL) continue;
      if (label[n->id] == GAP) 
        for (j = 0; j < mod->rate_matrix->size; j++)
          mod->tree_posteriors->base_probs[0][j][n->id][tup] = -1;
    }
  }

  /* NOTE(review): postorder is not freed here; presumably the list
     returned by tr_postorder is owned by the tree -- confirm */
  lst_free(inside);
  lst_free(outside);
  lst_free(ambig_cases);
  sfree(seq_to_leaf);
  sfree(label);
}
/* Entry point for eval_predictions.  Compares one or more files of
   predicted exons (-p) with the corresponding files of real exons (-r),
   given sequence lengths (-l), and prints one row of nucleotide- and
   exon-level statistics per file pair (plus a TOTAL row when more than
   one pair is given).  Returns 0 on success; errors go through die(). */
int main(int argc, char* argv[]) {
  FILE* F;
  GFF_Set *gff_real=NULL, *gff_pred=NULL;
  int c;                        /* must be int: getopt returns an int,
                                   and its -1 end marker cannot be
                                   represented where plain char is
                                   unsigned */
  List *real_fname_list = NULL, *pred_fname_list = NULL, 
    *feat_list = NULL, *seq_len_list = NULL, *l = NULL;
  int nfile, i, j;
  char *prefix = NULL;
  int tot_tp = 0, tot_fp = 0, tot_nreal_pos = 0, tot_npred_pos = 0, 
    tot_seqlen = 0, tot_ncr = 0, tot_npca = 0, tot_nola = 0, tot_nme = 0, 
    tot_npcp = 0, tot_nolp = 0, tot_nwe = 0, tot_nexons_real = 0, 
    tot_nexons_pred = 0, dump_exons = 0, nnc = -1, tot_nnc = -1, 
    nc_threshold = 0;

  while ((c = getopt(argc, argv, "r:p:f:l:d:n:h")) != -1) {
    switch(c) {
    case 'r':
      real_fname_list = get_arg_list(optarg);
      break;
    case 'p':
      pred_fname_list = get_arg_list(optarg);
      break;
    case 'l':
      l = get_arg_list(optarg);
      /* convert to ints */
      seq_len_list = lst_new_int(lst_size(l));
      for (i = 0; i < lst_size(l); i++) {
        int tmp;
        if (str_as_int((String*)lst_get_ptr(l, i), 
                       &tmp) != 0) {
          die("ERROR: Bad integer in <seq_len_list>.\n"); 
        }
        lst_push_int(seq_len_list, tmp);
      }
      break;
    case 'f':
      feat_list = get_arg_list(optarg);
      break;
    case 'd':
      dump_exons = 1;
      prefix = optarg;
      break;
    case 'n':
      /* switching from -1 to 0 enables the "nearly correct" columns */
      nnc = tot_nnc = 0;
      nc_threshold = get_arg_int(optarg);
      break;
    case 'h':
      print_usage();
      exit(0);
    case '?':
      die("Unrecognized option.  Try \"eval_predictions -h\" for help.\n");
    }
  }

  set_seed(-1);

  /* default to evaluating CDS features only */
  if (feat_list == NULL) {
    feat_list = lst_new_ptr(1);
    lst_push_ptr(feat_list, str_new_charstr(GFF_CDS_TYPE));
  }
  
  if (real_fname_list == NULL || pred_fname_list == NULL || 
      seq_len_list == NULL) {
    die("ERROR: Must specify -r, -p, and -l.  Try \"eval_predictions -h\" for help.\n");
  }

  if (lst_size(real_fname_list) != lst_size(pred_fname_list)) {
    die("ERROR: Must specify lists of equal length for real and predicted filenames.\n\n.");
  }

  /* a single sequence length is reused for every file pair */
  if (lst_size(seq_len_list) == 1 && lst_size(real_fname_list) > 1)
    for (i = 1; i < lst_size(real_fname_list); i++)
      lst_push_int(seq_len_list, lst_get_int(seq_len_list, 0));
  else if (lst_size(seq_len_list) != lst_size(real_fname_list))
    die("ERROR: List of sequence lengths does not match lists of real and predicted filenames.\n");

  /* print header */
  printf("%-25s %-25s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s", "Real_fname", "Pred_fname", "Sn", "Sp", "AC", "CC", "ESn", "ESp", "CRa", "PCa", "OLa", "ME", "CRp", "PCp", "OLp", "WE");
  if (nnc != -1) printf(" %7s %7s %7s %7s", "NCa", "NCp", "CR+NCa", "CR+NCp");
  printf("\n");

  for (nfile = 0; nfile < lst_size(real_fname_list); nfile++) {
    int tp, fp, nexons_real, nexons_pred, nwe, nme, ncr, npca, 
      npcp, nola, nolp, nreal_pos, npred_pos, len_real, len_pred, seqlen,
      already_counted_real, matched_real;
    String *real_fname, *pred_fname;
    GFF_Feature *feat_real, *feat_pred=NULL;

    real_fname = (String*)lst_get_ptr(real_fname_list, nfile);
    F = phast_fopen(real_fname->chars, "r");
    if ((gff_real = gff_read_set(F)) == NULL) {
      die("ERROR: Unable to read file \"%s\".\n", 
          real_fname->chars);
    }
    phast_fclose(F);

    pred_fname = (String*)lst_get_ptr(pred_fname_list, nfile);
    F = phast_fopen(pred_fname->chars, "r");
    if ((gff_pred = gff_read_set(F)) == NULL) {
      die("ERROR: Unable to read file \"%s\".\n", 
          pred_fname->chars);
    }
    phast_fclose(F);

    seqlen = lst_get_int(seq_len_list, nfile);

    /* sort ungrouped -- only cds exons will be considered, and each
       one will be considered individually */
    gff_ungroup(gff_real); 
    gff_ungroup(gff_pred);
    gff_sort(gff_real);
    gff_sort(gff_pred);

    nexons_real = nexons_pred = nwe = nme = ncr = npca = npcp = nola = 
      nolp = tp = fp = nreal_pos = npred_pos = 0;
    if (nnc != -1) nnc = 0;
    i = j = 0;
    already_counted_real = 0;
    while (i < lst_size(gff_real->features)) {
      feat_real = (GFF_Feature*)lst_get_ptr(gff_real->features, i);
      if (!is_exon(feat_real, feat_list)) { i++; continue; }

      len_real = feat_real->end - feat_real->start + 1;
      matched_real = 0;         /* becomes 1 if a prediction is scored
                                   CR, PC, or OL against this real exon
                                   during this pass */

      if (!already_counted_real) {
        nexons_real++;
        nreal_pos += len_real;
      }

      /* look at all predicted exons up to and overlapping this real exon */
      while (j < lst_size(gff_pred->features)) {
        feat_pred = (GFF_Feature*)lst_get_ptr(gff_pred->features, j);
        if (!is_exon(feat_pred, feat_list)) {
          j++;
          continue;
        }
        else if (feat_pred->start > feat_real->end) {
          if (!already_counted_real) {
            nme++;
            if (dump_exons) dump(prefix, feat_real, NULL, ME, -1);
          }
          break;
        }

        /* otherwise we have a predicted exon to count (start of pred
           <= end of real) */
        nexons_pred++;
        len_pred = feat_pred->end - feat_pred->start + 1;
        npred_pos += len_pred;
        j++;                    /* we'll be done with this prediction
                                   one way or another; next time
                                   through look at a new one */

        if (feat_pred->end < feat_real->start) { /* WE */
          nwe++;
          fp += len_pred;
          if (dump_exons) dump(prefix, NULL, feat_pred, WE, 0);
        }
        else if (feat_pred->start == feat_real->start && /* CR */
                 feat_pred->end == feat_real->end) {
          ncr++;
          tp += len_pred;
          if (dump_exons) dump(prefix, feat_real, feat_pred, CR, 1);
          matched_real = 1;
          break;
        }
        else if (feat_pred->start == feat_real->start || /* PC */
                 feat_pred->end == feat_real->end) {
          pred_type type;
          npca++;
          npcp++;
          if (nnc != -1 && 
              max(abs(feat_pred->start - feat_real->start), 
                  abs(feat_pred->end - feat_real->end)) <= nc_threshold) {
            nnc++; 
            type = NC;
          }
          else type = PC;
          if (len_pred < len_real) 
            tp += len_pred;
          else {
            tp += len_real;
            fp += (len_pred - len_real);
          }
          if (dump_exons) dump(prefix, feat_real, feat_pred, type, 
                               min(1, (double)len_real/len_pred));
          matched_real = 1;
          break;
        }
        else {                  /* OL */
          int overlap_size;
          pred_type type;
          nola++;
          nolp++;
          if (nnc != -1 && 
              max(abs(feat_pred->start - feat_real->start), 
                  abs(feat_pred->end - feat_real->end)) <= nc_threshold) {
            nnc++; 
            type = NC;
          }
          else type = PC;       /* NOTE(review): the overlap case is
                                   dumped with type PC; confirm whether
                                   a distinct overlap type was
                                   intended */

          overlap_size = min(feat_pred->end, feat_real->end) - 
            max(feat_pred->start, feat_real->start) + 1;
          tp += overlap_size;
          fp += len_pred - overlap_size;
          if (dump_exons) dump(prefix, feat_real, feat_pred, type,
                               (double)overlap_size/len_pred);
          matched_real = 1;
          break;
        }
        /* NOTE: I'm ignoring the possibility that a predicted exon
           could be a PC and/or OL with respect to multiple real
           exons.  The effect on the exon-level stats will be fairly
           minor (at worst a predicted exon is scored as an OL when it
           should be scored as an PC, and a real exon is erroneously
           counted as a ME), but the effect on the nucleotide-level Sn
           and Sp could conceivably be significant.  */
      }

      /* if we have counted at least one prediction (and thus failed
         to reach the end of the list), but the last prediction did
         not extend as far as the end of the real exon, then delay
         moving on to the next real exon */
      if (j < lst_size(gff_pred->features) && feat_pred->end < feat_real->end) 
          already_counted_real = 1;
      else {
        /* if we reached the end of the list of predictions without
           scoring any prediction against this real exon, then the
           real exon is a ME (if it hasn't already been counted).  The
           matched_real test matters when the very last prediction in
           the list was the one scored CR/PC/OL: j then equals the
           list size, but the exon was not missed */
        if (j == lst_size(gff_pred->features) && !already_counted_real &&
            !matched_real) 
          nme++; 

        i++;
        already_counted_real = 0;
      }
    }
    
    /* any remaining predictions must be wrong */
    /* NOTE(review): these are counted at the exon level only; their
       lengths are not added to fp or npred_pos -- confirm intended */
    for (; j < lst_size(gff_pred->features); j++) {
      if (is_exon((GFF_Feature*)lst_get_ptr(gff_pred->features, j), 
                  feat_list)) {
        nexons_pred++;
        nwe++;
      }
    }

    compute_and_print_stats(stdout, real_fname, pred_fname, 
                            tp, fp, nreal_pos, npred_pos, seqlen, ncr, 
                            npca, nola, nme, npcp, nolp, nwe, 
                            nexons_real, nexons_pred, nnc);

    tot_tp += tp;
    tot_fp += fp;
    tot_nreal_pos += nreal_pos;
    tot_npred_pos += npred_pos;
    tot_seqlen += seqlen;
    tot_ncr += ncr;
    tot_npca += npca;
    tot_nola += nola;
    tot_nme += nme;
    tot_npcp += npcp;
    tot_nolp += nolp;
    tot_nwe += nwe;
    tot_nexons_real += nexons_real;
    tot_nexons_pred += nexons_pred;
    if (nnc != -1) tot_nnc += nnc;

    if (dump_exons && SUMF != NULL)
      fprintf(SUMF, "# Total number of bases in real exons: %d\n", nreal_pos);

    gff_free_set(gff_real);
    gff_free_set(gff_pred);
  }

  if (lst_size(real_fname_list) > 1)
    compute_and_print_stats(stdout, str_new_charstr("TOTAL"), str_new_charstr(""), 
                            tot_tp, tot_fp, tot_nreal_pos, tot_npred_pos, 
                            tot_seqlen, tot_ncr, tot_npca, tot_nola, tot_nme, 
                            tot_npcp, tot_nolp, tot_nwe, tot_nexons_real, 
                            tot_nexons_pred, tot_nnc);

  return 0;
}
Example #29
0
/* Read a CategoryMap from a file.  The file must begin with an NCATS
   line; subsequent lines may be LABELLING_PRECEDENCE, FILL_PRECEDENCE,
   FEATURE_EXTEND, or category 'range' definitions (blank lines are
   skipped).  Returns the newly built map; dies with an error message on
   malformed input, on a missing NCATS line, or if any category
   0..ncats is left undefined. */
CategoryMap *cm_read(FILE *F) {
  String *line, *name;
  List *l;
  int cat, cat2, lineno, i, cm_read_error;
  CategoryMap *cm = NULL;
  CategoryRange *existing_range;
  static Regex *cat_range_re = NULL;
  static Regex *ncats_re = NULL;
  static Regex *fill_re = NULL;
  static Regex *label_re = NULL;
  static Regex *extend_re = NULL;
  int has_dependencies = 0;

  line = str_new(STR_SHORT_LEN);
  l = lst_new_ptr(3);
  /* regexes are compiled once and kept across calls */
  if (cat_range_re == NULL) {
    cat_range_re = str_re_new("^[[:space:]]*([^[:space:]]+)[[:space:]]+([[:digit:]]+)(-([[:digit:]]+))?([[:space:]]+([[:digit:]].*))?"); 
    ncats_re = str_re_new("^[[:space:]]*NCATS[[:space:]]*=[[:space:]]*([[:digit:]]+)");
    fill_re = str_re_new("^[[:space:]]*FILL_PRECEDENCE[[:space:]]*=[[:space:]]*(.*)$");
    label_re = str_re_new("^[[:space:]]*LABELLING_PRECEDENCE[[:space:]]*=[[:space:]]*(.*)$");
    extend_re = str_re_new("^[[:space:]]*FEATURE_EXTEND[[:space:]]*:[[:space:]]*(.+)[[:space:]]*\\((.+)\\)$");
  }

  lineno = 0;
  while ((str_readline(line, F)) != EOF) {
    lineno++;
    str_trim(line);
    if (str_equals_charstr(line, ""))
      continue;

    if (str_re_match(line, ncats_re, l, 1) >= 0) { 
                                /* NCATS line */
      int ncats;
      str_as_int(lst_get_ptr(l, 1), &ncats);
      cm = cm_new(ncats);

      /* 0th category is "background" */
      cm->ranges[0] = 
        cm_new_category_range(str_new_charstr(BACKGD_CAT_NAME), 0, 0);
    }

    else if (cm == NULL || cm->ncats == 0) 
      die("ERROR: NCATS line must appear first, and must specify a positive number of categories.\n");

    else if (str_re_match(line, label_re, l, 1) >= 0) {               
                                /* LABELLING_PRECEDENCE line */
      List *tmpl = lst_new_ptr(cm->ncats);
      int tmpi;
      str_split((String*)lst_get_ptr(l, 1), " ,", tmpl);
      for (i = 0; i < lst_size(tmpl); i++) {
        String *s = (String*)lst_get_ptr(tmpl, i);
        if (str_as_int(s, &tmpi) != 0 || tmpi < 0 || tmpi > cm->ncats) 
          die("ERROR: bad integer in LABELLING_PRECEDENCE.\n");
        cm->labelling_precedence[tmpi] = i;
        str_free(s);
      }
      lst_free(tmpl);
    }

    else if (str_re_match(line, fill_re, l, 1) >= 0) {               
                                /* FILL_PRECEDENCE line */
      List *tmpl = lst_new_ptr(cm->ncats);
      int tmpi;
      str_split(lst_get_ptr(l, 1), " ,", tmpl);
      for (i = 0; i < lst_size(tmpl); i++) {
        String *s = lst_get_ptr(tmpl, i);
        if (str_as_int(s, &tmpi) != 0 || tmpi < 0 || tmpi > cm->ncats) 
          die("ERROR: bad integer in FILL_PRECEDENCE.\n");
        cm->fill_precedence[tmpi] = i;
        str_free(s);
      }
      lst_free(tmpl);
    }

    else if (str_re_match(line, extend_re, l, 2) >= 0) {
                                /* FEATURE_EXTEND line */
      String *target = lst_get_ptr(l, 2);
      List *sources = lst_new_ptr(2);
      str_split(lst_get_ptr(l, 1), " ,", sources);

      if (cm == NULL || (cat = cm_get_category(cm, target)) == 0)
        die("ERROR: FEATURE_EXTEND target must be a previously-defined non-background feature type.\n");

      for (i = 0; i < lst_size(sources); i++) {
        if (cm_get_category(cm, lst_get_ptr(sources, i)) == 0) 
          die("ERROR: FEATURE_EXTEND source list must consist of previously-defined non-background feature types.\n");
      }

      /* the source names are only validated here; release the split
         strings and the list so they are not leaked on every
         FEATURE_EXTEND line */
      for (i = 0; i < lst_size(sources); i++)
        str_free((String*)lst_get_ptr(sources, i));
      lst_free(sources);
    }

    else {                      /* 'range' line */
      if (str_re_match(line, cat_range_re, l, 6) < 0) 
        die("ERROR at line %d: '%s'\n", lineno, line->chars);

      name = str_dup((String*)lst_get_ptr(l, 1));
      str_as_int((String*)lst_get_ptr(l, 2), &cat);

      /* a missing "-N" suffix means a single-category range */
      cat2 = cat;
      if (lst_get_ptr(l, 4) != NULL) 
        str_as_int((String*)lst_get_ptr(l, 4), &cat2);

      if (cat < 0 || cat2 < cat || cat2 > cm->ncats)
        die("ERROR: Illegal category range.\n");

      /* check for existing definitions of the specified category
         range.  Either no such definition must exist, or one must
         exist that spans exactly the same category numbers */
      existing_range = NULL;
      cm_read_error = 0;
      for (i = cat; !cm_read_error && i <= cat2; i++) {
        if (cm->ranges[i] != NULL && existing_range == NULL) 
          existing_range = cm->ranges[i];
        else if (cm->ranges[i] != existing_range)
          cm_read_error = 1;
      }
      if (existing_range != NULL && (existing_range->start_cat_no != cat || 
                                     existing_range->end_cat_no != cat2)) 
        cm_read_error = 1;

      if (cm_read_error) 
        die("ERROR: Overlapping category ranges.\n");

      /* either add new category range, or add new type to existing one */
      if (existing_range != NULL) {
        lst_push_ptr(existing_range->feature_types, name);
      }
      else {
        CategoryRange *cr = cm_new_category_range(name, cat, cat2);
        for (i = cat; i <= cat2; i++) 
          cm->ranges[i] = cr;
      }

      /* now address "conditioned_on" dependencies, if they have been
         specified */
      if (lst_get_ptr(l, 6) != NULL) {
        if (existing_range != NULL) 
          fprintf(stderr, "WARNING: ignoring 'conditioned on' list for type '%s'\n", 
                  name->chars);
        else {
          List *tmpl = lst_new_ptr(cm->ncats);
          int tmpi;
          if (cm->conditioned_on[cat] != NULL)
            die("ERROR cm_read: cm->conditioned_on[%i] should be NULL\n",
                cat);

          str_split((String*)lst_get_ptr(l, 6), " ,", tmpl);
          cm->conditioned_on[cat] = lst_new_int(lst_size(tmpl));
          for (i = cat + 1; i <= cat2; i++)
            cm->conditioned_on[i] = cm->conditioned_on[cat];
          /* all categories in range point to
             same "conditioned on" list */

          for (i = 0; i < lst_size(tmpl); i++) {
            String *s = (String*)lst_get_ptr(tmpl, i);
            if (str_as_int(s, &tmpi) != 0 || tmpi < 0 || tmpi > cm->ncats) 
              die("ERROR: bad integer in 'conditioned on' list for type '%s'.\n", 
                      name->chars);
            lst_push_int(cm->conditioned_on[cat], tmpi);
            str_free(s);
          }
          lst_free(tmpl);
          
          has_dependencies = 1;
        }
      }
    }

    /* release the regex capture strings from this line */
    for (i = 0; i < lst_size(l); i++)
      if (lst_get_ptr(l, i) != NULL)
        str_free((String*)lst_get_ptr(l, i));
  }

  /* an empty file (or one with only blank lines) never creates a map;
     report that rather than dereferencing NULL below */
  if (cm == NULL)
    die("ERROR: NCATS line must appear first, and must specify a positive number of categories.\n");

  /* make sure every category has been specified */
  for (i = 0; i <= cm->ncats; i++) 
    if (cm->ranges[i] == NULL) 
      die("ERROR: category %d has not been specified.\n", i);

  /* build unspooler, if necessary */
  if (has_dependencies)
    cm->unspooler = cm_create_unspooler(cm->ncats + 1, cm->conditioned_on);

  str_free(line);
  lst_free(l);
  return cm;
}
Example #30
0
/* Format the GFF group attribute for group number 'groupno' into
   'buf', which has room for 'buflen' bytes.  The result has the form
   TAG "PREFIX.N" when 'idpref' is non-NULL and TAG "N" otherwise,
   where TAG is 'grouptag' or "id" if 'grouptag' is NULL.  Output is
   truncated (and still NUL-terminated) rather than overflowing 'buf'
   when the tag or prefix is too long. */
static void cm_labeling_group_string(char *buf, size_t buflen,
                                     const char *grouptag,
                                     const char *idpref, int groupno) {
  const char *tag = grouptag != NULL ? grouptag : "id";

  if (idpref != NULL)
    snprintf(buf, buflen, "%s \"%s.%d\"", tag, idpref, groupno);
  else
    snprintf(buf, buflen, "%s \"%d\"", tag, groupno);
}

/* Create a GFF_Set from a sequence of category/state numbers, using
   a specified category map and mapping from raw state numbers to
   category numbers.

   cm            category map; its 'ranges' array is indexed by
                 category number
   path          array of 'length' raw state numbers
   length        number of elements in 'path'; if <= 0 an empty set is
                 returned
   path_to_cat   maps a raw state number to a category number
   reverse_compl indexed by raw state number; a nonzero entry puts the
                 feature on the '-' strand, zero on the '+' strand
   seqname       passed through to each new feature (copied)
   source        passed through to each new feature (copied)
   frame_cats    optional list of feature type names for which a frame
                 is reported; may be NULL.  Names that map to category
                 0 are ignored
   grouptag      tag used in the group attribute; "id" if NULL
   idpref        optional prefix for the group identifier; may be NULL

   Each maximal run of positions that falls in the same category range
   becomes one feature, with 1-based inclusive coordinates.  Runs of
   category 0 are omitted when category 0 is the background category,
   and every run of category 0 that does not start at the first
   position begins a new group number.  */
GFF_Set *cm_labeling_as_gff(CategoryMap *cm, int *path, 
                            int length, int *path_to_cat, 
                            int *reverse_compl, char *seqname, 
                            char *source, List *frame_cats, 
                            char *grouptag,  char *idpref
                            ) {
  int beg, end, i, cat, frame, groupno;
  GFF_Set *gff = gff_new_set_init("PHAST", PHAST_VERSION);
  int do_frame[cm->ncats+1];
  char strand;
  char groupstr[STR_SHORT_LEN];
  int ignore_0 = str_equals_charstr(cm_get_feature(cm, 0), BACKGD_CAT_NAME);
                                /* ignore category 0 if background  */

  if (length <= 0) return gff;

  /* flag the (base) categories for which frame should be reported */
  for (i = 0; i <= cm->ncats; i++) do_frame[i] = 0;
  if (frame_cats != NULL) {
    for (i = 0; i < lst_size(frame_cats); i++) {
      int fcat = cm_get_category(cm, lst_get_ptr(frame_cats, i));
      if (fcat != 0)            /* ignore background or unrecognized name */
        do_frame[fcat] = 1;
    }
  }

  groupno = 1;
  cm_labeling_group_string(groupstr, sizeof groupstr, grouptag, idpref, 
                           groupno);

  i = 0;
  while (i < length) {
    checkInterruptN(i, 10000);
    /* identify the feature by the first category of its range; the
       offset within the range gives the frame */
    cat = cm->ranges[path_to_cat[path[i]]]->start_cat_no;
    strand = reverse_compl[path[i]] ? '-' : '+';
    frame = do_frame[cat] ? path_to_cat[path[i]] - cat : GFF_NULL_FRAME;

    /* scan ahead until enter new category range (or reach end of seq) */
    beg = i + 1;                /* begin of feature (GFF coords) */
    for (i++; i < length && 
           cm->ranges[path_to_cat[path[i]]]->start_cat_no == cat; i++);
    end = i;                    /* end of feature (GFF coords) */

    /* if minus strand, adjust frame to reflect end */
    if (strand == '-' && do_frame[cat]) 
      frame = path_to_cat[path[i-1]] - cat;

    /* if legitimate feature (non-background), then incorp into GFF_Set */
    if (cat != 0 || !ignore_0)  /* create new feature and add */
      lst_push_ptr(gff->features, 
                   gff_new_feature(str_new_charstr(seqname), 
                                   str_new_charstr(source), 
                                   str_dup(cm_get_feature(cm, cat)), 
                                   beg, end, 0, strand, frame, 
                                   str_new_charstr(groupstr), TRUE));

    if (cat == 0 && beg > 1) {
      groupno++;                /* increment group number each time a
                                   sequence of 0s is encountered  */
      cm_labeling_group_string(groupstr, sizeof groupstr, grouptag, idpref, 
                               groupno);
    }
  }

  return gff;
}