Example #1
0
/* Finalize and release every output file known to outfileHash.
   Files whose handle in outfileList is still open are closed
   directly.  Files whose handle is NULL (already closed) are reopened
   in append mode and passed to mafBlock_close_outfile again.  All
   open handles are dealt with before any file is reopened.  Both
   outfileList and outfileHash are freed before returning. */
void close_outfiles(List *outfileList, Hashtable *outfileHash) {
  List *names = hsh_keys(outfileHash);
  int nfiles = lst_size(names);
  int *reopen = smalloc(nfiles * sizeof(int));
  int k;

  /* pass 1: close whatever is still open, remember the rest */
  for (k = 0; k < nfiles; k++) {
    char *name = (char*)lst_get_ptr(names, k);
    int slot = hsh_get_int(outfileHash, name);
    FILE *fp = (FILE*)lst_get_ptr(outfileList, slot);

    reopen[k] = (fp == NULL);
    if (fp != NULL)
      mafBlock_close_outfile(fp);
  }

  /* pass 2: reopen the previously closed files for append and close
     them through the same routine */
  for (k = 0; k < nfiles; k++) {
    if (reopen[k]) {
      char *name = (char*)lst_get_ptr(names, k);
      FILE *fp = phast_fopen(name, "a");
      mafBlock_close_outfile(fp);
    }
  }

  sfree(reopen);
  lst_free(names);
  lst_free(outfileList);
  hsh_free(outfileHash);
}
Example #2
0
/** Translate a path of spooled category numbers into unspooled state
   numbers, in place, using cm->unspooler.  Does nothing when the map
   has no unspooler.  Dies if an entry is outside the accepted range
   or cannot be mapped.

   NOTE(review): the range check accepts path[j] == nstates_spooled
   (inclusive upper bound) and that value is then used as an index
   into spooled_to_unspooled -- confirm against the allocation size. */
void cm_spooled_to_unspooled(CategoryMap *cm, int *path, int pathlen) {
  int pos, cur, last = -1;
  List *history;

  if (cm->unspooler == NULL) return;

  history = lst_new_int(cm->unspooler->nstates_spooled);
  for (pos = 0; pos < pathlen; pos++) {
    cur = path[pos];

    if (cur < 0 || cur > cm->unspooler->nstates_spooled)
      die("ERROR cm_spooled_to_unspooled: path[%i]=%i, should be in [0, %i]\n",
          pos, path[pos], cm->unspooler->nstates_spooled);

    path[pos] = cm_get_unspooled_state(cm, cur, history);
    if (path[pos] == -1) 
      die("ERROR: failure mapping to uspooled state at position %d.\n", pos);

    /* the predecessor list only changes when the spooled state changes */
    if (cur != last) {
      /* a state that is not conditioned on anything makes earlier
         history irrelevant, so start over */
      if (lst_size(cm->unspooler->spooled_to_unspooled[cur]->children) == 0)
        lst_clear(history);

      lst_push_int(history, cur);
      last = cur;
    }
  }

  lst_free(history);
}
Example #3
0
File: block.c Project: rvba/minuit
/*
 * Justify every leaf of `block` (as returned by block_leaves_get for
 * direction `dir`) against `block`, then against the leaf visited just
 * before it (direction NORTH) when the two have different ids, and
 * finally recurse into the leaf.  The leaf list is freed on exit.
 */
void block_justify( t_block *block, int dir)
{
	t_lst *leaves = block_leaves_get( block, dir);
	t_block *prev = NULL;
	t_link *link = leaves->first;

	while( link)
	{
		t_block *leaf = link->data;

		justify( block, leaf, dir);

		// Align with the sibling handled on the previous iteration
		if( prev && prev->id.id != leaf->id.id)
			justify( prev, leaf, NORTH);

		// Descend into the leaf's own leaves
		block_justify( leaf, dir);

		prev = leaf;
		link = link->next;
	}

	lst_free( leaves);
}
Example #4
0
/* Build a CategoryMap holding one category per distinct feature type
   found in a GFF_Set.  Category 0 is the background category; types
   receive numbers 1, 2, ... in the order in which they are first
   seen. */
CategoryMap* cm_new_from_features(GFF_Set *feats) {
  Hashtable *seen = hsh_new(10);
  List *uniq = lst_new_ptr(10);
  CategoryMap *map;
  int idx, cat;

  /* collect each feature type once, preserving first-seen order */
  for (idx = 0; idx < lst_size(feats->features); idx++) {
    GFF_Feature *feat = lst_get_ptr(feats->features, idx);
    checkInterruptN(idx, 10000);
    if (hsh_get(seen, feat->feature->chars) != (void*)-1)
      continue;                 /* already recorded */
    lst_push_ptr(uniq, feat->feature);
    hsh_put_int(seen, feat->feature->chars, 1);
  }
  hsh_free(seen);

  /* one single-number range per category, background first */
  map = cm_new(lst_size(uniq));
  map->ranges[0] = cm_new_category_range(str_new_charstr(BACKGD_CAT_NAME), 0, 0);
  for (cat = 1; cat <= map->ncats; cat++) {
    String *name = str_dup(lst_get_ptr(uniq, cat-1));
    map->ranges[cat] = cm_new_category_range(name, cat, cat);
  }

  lst_free(uniq);
  return map;
}
Example #5
0
/*
 * Build a sqm_lst_t from a Java object array.  Every element is passed
 * through the j2c converter and the result is appended to the list.
 * Returns NULL if jarr is NULL or if an append fails (the list itself
 * is released with lst_free in that case).
 */
sqm_lst_t *
jarray2lst(JNIEnv *env,
    jobjectArray jarr,
    char *className,
    void * (*j2c)(JNIEnv *, jobject)) {

	sqm_lst_t *result;
	int count, pos;

	if (jarr == NULL) {
		PTRACE(1, "jni:NULL array passed to jarray2lst()");
		return (NULL);
	}

	count = (int)(*env)->GetArrayLength(env, jarr);
	PTRACE(2, "jni:jarray2lst(jarr[%d],%s)", count, className);
	result = lst_create();

	pos = 0;
	while (pos < count) {
		jobject elem = (*env)->GetObjectArrayElement(env, jarr, pos);
		void *converted = j2c(env, elem);

		if (lst_append(result, converted) == -1) {
			/* give up on the first failed append */
			lst_free(result);
			result = NULL;
			break;
		}
		pos++;
	}

	PTRACE(2, "jni:jarray2lst() done");
	return (result);
}
Example #6
0
/*
 * convert a jintArray to a C list of int
 *
 * Each list element is a separately malloc'ed int holding one array
 * value.  Returns NULL if jintArr is NULL, if memory cannot be
 * allocated, or if an append fails.
 *
 * NOTE(review): on a mid-loop failure the list is released with
 * lst_free(), as before; the elements appended so far are presumably
 * not released by that call -- confirm whether a deep free is needed.
 */
sqm_lst_t *
jintArray2lst(JNIEnv *env,
    jintArray jintArr) {

	sqm_lst_t *lst;
	int idx, len, *i;
	jint *p;

	if (NULL == jintArr) {
		PTRACE(1, "jni:NULL array passed to jintArray2lst()");
		return (NULL);
	}
	len = (int)(*env)->GetArrayLength(env, jintArr);
	PTRACE(2, "jni:jintArray2lst(jintArr[%d])", len);

	/* scratch copy of the Java array; malloc(0) may legally be NULL */
	p = (jint *) malloc(len * sizeof (*p));
	if (p == NULL && len > 0)
		return (NULL);

	lst = lst_create();
	if (lst == NULL) {
		free(p);
		return (NULL);
	}

	if (len > 0)
		(*env)->GetIntArrayRegion(env, jintArr, 0, len, p);

	for (idx = 0; idx < len; idx++) {
		i = (int *)malloc(sizeof (*i));
		if (i == NULL) {
			/* out of memory: abandon the conversion */
			lst_free(lst);
			lst = NULL;
			break;
		}
		*i = (int)p[idx];
		if (-1 == lst_append(lst, i)) {
			/* the element never made it into the list */
			free(i);
			lst_free(lst);
			lst = NULL;
			break;
		}
	}
	free(p);

	PTRACE(2, "jni:jintArray2lst() done");
	return (lst);

}
/* Read substitution scores from specified file and return as a kind
   of pseudo substitution matrix.  All nonspecified elements in matrix
   will be equal to NEGINFTY, which is to be interpreted as "NA".

   Each non-blank, non-comment ('#') line must have at least three
   whitespace-separated columns: tuple1, tuple2 and a numeric score.
   Any further columns are ignored.  Dies on malformed input. */
Matrix* read_subst_scores(TreeModel *mod, FILE *F) {
  Matrix *retval = mat_new(mod->rate_matrix->size,
                                        mod->rate_matrix->size);
  String *line = str_new(STR_MED_LEN), *tuple1, *tuple2;
  List *l = lst_new_ptr(3);
  int alph_size = (int)strlen(mod->rate_matrix->states);
  int *inv_alph = mod->rate_matrix->inv_states;
  double val;
  int i;
  mat_set_all(retval, NEGINFTY);
  while (str_readline(line, F) != EOF) {
    str_double_trim(line);
    if (str_starts_with_charstr(line, "#") || line->length == 0) 
      continue;
    str_split(line, NULL, l);
    if (lst_size(l) < 3) {
      die("ERROR: wrong number of columns in subst. score file.\n");
    }
    tuple1 = lst_get_ptr(l, 0);
    tuple2 = lst_get_ptr(l, 1);
    if (str_as_dbl(lst_get_ptr(l, 2), &val) != 0) {
      die("ERROR: bad value in subst. score file.\n");
    }
    mat_set(retval, tuple_index(tuple1->chars, inv_alph, alph_size),
                   tuple_index(tuple2->chars, inv_alph, alph_size), val);
    /* release every field of this line, including any columns beyond
       the third, so that long lines do not leak */
    for (i = 0; i < lst_size(l); i++)
      str_free(lst_get_ptr(l, i));
  }
  lst_free(l);
  str_free(line);
  return retval;
}
/* Trim stop codons off the CDS features of a group.  A '+' strand CDS
   whose end coincides with the end of a '+' strand stop has its end
   moved back by 3; a '-' strand CDS whose start coincides with the
   start of a '-' strand stop has its start moved forward by 3.  Every
   adjusted feature is appended to ends_adjusted or starts_adjusted
   (both are cleared first) so that callers can undo the change
   later. */
void exclude_stops(GFF_FeatureGroup *group, List *starts_adjusted, 
                   List *ends_adjusted) {
  List *members = group->features;
  List *stop_feats = lst_new_ptr(1);
  int nmembers, nstops, m, s;

  lst_clear(stop_feats); lst_clear(ends_adjusted); lst_clear(starts_adjusted);

  /* gather the stop features; usually one, but allow several */
  nmembers = lst_size(members);
  for (m = 0; m < nmembers; m++) {
    GFF_Feature *cand = lst_get_ptr(members, m);
    if (str_equals_charstr(cand->feature, GFF_STOP_TYPE))
      lst_push_ptr(stop_feats, cand);
  }

  /* compare every CDS against every stop on the same strand */
  nstops = lst_size(stop_feats);
  for (m = 0; m < nmembers; m++) {
    GFF_Feature *cds = lst_get_ptr(members, m);
    if (!str_equals_charstr(cds->feature, GFF_CDS_TYPE))
      continue;

    for (s = 0; s < nstops; s++) {
      GFF_Feature *stop = lst_get_ptr(stop_feats, s);
      if (cds->strand != stop->strand)
        continue;

      if (cds->strand == '+') {
        if (cds->end == stop->end) {
          cds->end -= 3;
          lst_push_ptr(ends_adjusted, cds);
        }
      }
      else if (cds->strand == '-') {
        if (cds->start == stop->start) {
          cds->start += 3;
          lst_push_ptr(starts_adjusted, cds);
        }
      }
    }
  }

  lst_free(stop_feats);
}
Example #9
0
/* Read values from stdin until EOF or the first token that scanf
   cannot convert, then print the list before and after qsort1(). */
int main(){
    ElemType value;
    int nread;
    LIST* numbers;

    printf("Please enter a set of integers.\n");
    numbers = lst_create();

    /* stop on end of input or on the first unparsable token */
    while ((nread = scanf("%i", &value)) != EOF && nread != 0)
        lst_push_back(numbers, value);

    printf("The list before the quick sort: \n");
    lst_print(numbers);

    printf("The list after the quick sort: \n");
    qsort1(numbers);
    lst_print(numbers);

    lst_free(numbers);
    return 0;
}
Example #10
0
/* Free a CategoryRange: every non-NULL String in its feature_types
   list, the list itself, and finally the range structure. */
void cm_free_category_range(CategoryRange *cr) {
  List *names = cr->feature_types;
  int k;

  for (k = 0; k < lst_size(names); k++) {
    String *name = (String*)lst_get_ptr(names, k);
    if (name == NULL)
      continue;
    str_free(name);
  }

  lst_free(names);
  sfree(cr);
}
Example #11
0
File: block.c Project: rvba/minuit
/*
 * Justify the branch rooted at block_current against the branch rooted
 * at block_previous in direction `dir`.  The bounding boxes of both
 * branches are computed and handed to do_justify_tree(); the two
 * temporary branch lists are freed before returning.
 */
void justify_tree( t_block *block_previous, t_block *block_current, int dir)
{
	/*
	 * Zero all eight floats of each box.  The former bzero( box, 8)
	 * only cleared the first 8 bytes, leaving the rest uninitialized.
	 */
	float box_previous[8] = {0};
	float box_current[8] = {0};

	t_lst *lst_previous = block_branch_get( block_previous, dir);
	t_lst *lst_current = block_branch_get( block_current, dir);

	get_branch_bounding_box( block_previous, lst_previous, box_previous, dir);
	get_branch_bounding_box( block_current, lst_current, box_current, dir);

	do_justify_tree( block_current, lst_current, box_previous, box_current, dir);

	lst_free( lst_previous);
	lst_free( lst_current);
}
Example #12
0
/* Print a CategoryMap to a file.  Output consists of an "NCATS" line,
   one line per feature type of each category range (name, category
   number or number range, and optionally a tab plus the
   comma-separated conditioned_on list), followed by the
   LABELLING_PRECEDENCE and FILL_PRECEDENCE lists.  Uses the variable
   'prec' (declared outside this function) to pass the precedence
   array to compare_prec. */
void cm_print(CategoryMap *cm, FILE *F) {
  int i, j, k;
  List *tmpl;
  fprintf(F, "NCATS = %d\n\n", cm->ncats);

  /* category 0 is not printed here; loop starts at 1 */
  for (i = 1; i <= cm->ncats; i++) {
    CategoryRange *cr = cm->ranges[i];
    for (j = 0; j < lst_size(cr->feature_types); j++) {
      String *s = (String*)lst_get_ptr(cr->feature_types, j);
      fprintf(F, "%-15s %d", s->chars, cr->start_cat_no);
      /* ranges spanning several numbers are printed as "start-end" */
      if (cr->end_cat_no > cr->start_cat_no)
        fprintf(F, "-%d", cr->end_cat_no);
      if (cm->conditioned_on[i] != NULL) {
        fprintf(F, "\t");
        for (k = 0; k < lst_size(cm->conditioned_on[i]); k++)
          fprintf(F, "%d%s", lst_get_int(cm->conditioned_on[i], k),
                  k + 1 == lst_size(cm->conditioned_on[i]) ? "" : ",");
      }
      fprintf(F, "\n");
    }
    i = cr->end_cat_no;         /* avoid looking multiple times at the
                                   same range */
  }

  /* reconstruct precedence lists: sort the category numbers
     0..ncats with compare_prec, which reads the array in 'prec' */
  tmpl = lst_new_int(cm->ncats + 1);
  for (i = 0; i <= cm->ncats; i++) 
    lst_push_int(tmpl, i);
  prec = cm->labelling_precedence;
  lst_qsort(tmpl, compare_prec);
  fprintf(F, "\nLABELLING_PRECEDENCE = ");
  /* categories whose precedence is -1 are left out.
     NOTE(review): the separator depends only on the position in the
     sorted list, so a trailing comma appears if the last sorted entry
     is one of the omitted ones -- confirm compare_prec's ordering */
  for (i = 0; i <= cm->ncats; i++) {
    int cat = lst_get_int(tmpl, i);
    if (cm->labelling_precedence[cat] != -1)
      fprintf(F, "%d%s", cat, i < cm->ncats ? "," : "");
  }
  fprintf(F, "\n");

  /* same procedure for the fill precedence */
  lst_clear(tmpl);
  for (i = 0; i <= cm->ncats; i++) 
    lst_push_int(tmpl, i);
  prec = cm->fill_precedence;
  lst_qsort(tmpl, compare_prec);
  fprintf(F, "FILL_PRECEDENCE = ");
  for (i = 0; i <= cm->ncats; i++) {
    int cat = lst_get_int(tmpl, i);
    if (cm->fill_precedence[cat] != -1)
      fprintf(F, "%d%s", cat, i < cm->ncats ? "," : "");
  }
  fprintf(F, "\n");
  lst_free(tmpl);
}
Example #13
0
/* Release the sub-blocks held by a MafBlock.  Every MafSubBlock in
   block->data is freed, then the list itself, and block->data is set
   to NULL.  block->seqlen is reset to 0 whether or not there was any
   data. */
void mafBlock_free_data(MafBlock *block) {
  int k, nsub;

  block->seqlen = 0;
  if (block->data == NULL)
    return;

  nsub = lst_size(block->data);
  for (k = 0; k < nsub; k++)
    mafSubBlock_free((MafSubBlock*)lst_get_ptr(block->data, k));

  lst_free(block->data);
  block->data = NULL;
}
Example #14
0
File: block.c Project: rvba/minuit
/*
 * For each WEST leaf of `block` (from block_leaves_get), call
 * drive_away() on the pair and then arrange that leaf recursively.
 * The leaf list is freed afterwards.
 */
void block_arrange( t_block *block)
{
	t_lst *leaves = block_leaves_get( block, WEST);
	t_link *link = leaves->first;

	while( link)
	{
		t_block *leaf = link->data;

		drive_away( block, leaf, WEST);
		block_arrange( leaf);

		link = link->next;
	}

	lst_free( leaves);
}
Example #15
0
/*
** Release a linked list of LE elements starting at `curr`.
**
** The string, clef and value members of every element are freed, and
** every element after `curr` is freed as well.  As before, the head
** element `curr` itself is NOT freed; its members are reset to NULL
** so that it no longer holds dangling pointers.
**
** The list is walked iteratively, so stack usage does not grow with
** the length of the list.  Passing NULL is a no-op.
*/
void		lst_free(LE *curr)
{
	LE	*node;
	LE	*next;

	if (!curr)
		return ;
	node = curr->next;
	/* free(NULL) is a no-op, so no guards are needed */
	free(curr->string);
	free(curr->clef);
	free(curr->value);
	curr->string = NULL;
	curr->clef = NULL;
	curr->value = NULL;
	curr->next = NULL;
	while (node)
	{
		next = node->next;
		free(node->string);
		free(node->clef);
		free(node->value);
		free(node);
		node = next;
	}
}
/* Read an amino acid rate matrix in the format used by PAML.  Reorder
   the rows and columns to match 'alph'.  Warning: the ordering in the
   file is assumed to match that used in the files in the PAML
   distribution (alphabetical order of 3-letter codes), which is also
   the order of AA_ALPHABET (therefore AA_ALPHABET may not be
   changed!).  Equilibrium frequencies are ignored.

   The file supplies the lower triangle: the line read for matrix row
   i (i = 1 .. size-2) must contain exactly i values, which are stored
   symmetrically.  Blank lines are skipped.  Dies if 'alph' differs
   from the PAML alphabet, on a malformed row, or if there are too few
   rows.  Returns a newly allocated size x size matrix.  */ 
Matrix *read_paml_matrix(FILE *F, char *alph) {
  char *paml_alph = "ARNDCQEGHILKMFPSTWYV$";
  int size = (int)strlen(paml_alph);
  Matrix *retval = mat_new(size, size);
  List *fields = lst_new_ptr(100);
  String *line = str_new(STR_MED_LEN);
  int i, j;
  if (strcmp(alph, paml_alph) != 0)
    die("ERROR read_paml_matrix (alph (%s) != paml_alph (%s))\n",
	alph, paml_alph);
  mat_zero(retval);

  for (i = 1; i < size-1 && str_readline(line, F) != EOF; ) {
    /* NOTE: size of matrix allows for stop, but stop not included in
       file; therefore the last row/column is never filled in */
    str_double_trim(line);
    if (line->length == 0) continue;
    str_split(line, NULL, fields);
    if (lst_size(fields) != i) {
      die("ERROR: row %d of matrix must have %d columns.\n",
	  i+1, i);
    }
    for (j = 0; j < lst_size(fields); j++) {
      double val;

      if (str_as_dbl(lst_get_ptr(fields, j), &val) != 0) {
        /* report the offending field itself (index j, not j+1, which
           could lie past the end of the list) */
        die("ERROR: non-numeric matrix element in subst. matrix ('%s')\n", 
	    ((String*)lst_get_ptr(fields, j))->chars);
      }
      str_free(lst_get_ptr(fields, j));

      if (j >= size)
	die("ERROR read_paml_matrix j (%i) should be < size (%i)\n", j, size);
      mat_set(retval, i, j, val);
      mat_set(retval, j, i, val);
    }
    i++;
  }

  if (i != size - 1) {
    die("ERROR: too few rows in subst. matrix.\n");
  }
  
  lst_free(fields);
  str_free(line);
  return retval;
}
Example #17
0
/* Parse an 'i' line of a MAF block and record its contents in sub.
   The line must have six fields: "i", the source name (which must
   equal sub->src), leftStatus, leftCount, rightStatus, rightCount.
   The statuses go to sub->iStatus[0..1] and the counts to
   sub->iCount[0..1]; an 'i' entry is appended to sub->lineType.
   Dies on any format violation. */
void mafBlock_add_iLine(String *line, MafSubBlock *sub) {
  List *fields = lst_new_ptr(6);
  String *tag, *src, *status, *count;
  int side, k, nfields;

  if (sub->numLine<1 || sub->lineType[0]!='s') 
    die("ERROR: got i-Line without preceding s-Line in MAF block\n");

  nfields = str_split(line, NULL, fields);
  if (nfields != 6)
    die("ERROR: expected six fields in MAF line starting with 'i' (got %i)\n",
        lst_size(fields));

  /* field 0: the line-type tag */
  tag = (String*)lst_get_ptr(fields, 0);
  if (str_compare_charstr(tag, "i") != 0)
    die("ERROR: mafBlock_add_iLine: field[0] should be 'i', got %s\n",
        tag->chars);

  /* field 1: source name, must agree with the preceding s-line */
  src = (String*)lst_get_ptr(fields, 1);
  if (str_compare(src, sub->src) != 0)
    die("iLine sourceName does not match preceding s-Line (%s, %s)\n", 
        src->chars, sub->src->chars);

  /* fields 2,3 describe the left side; fields 4,5 the right side */
  for (side = 0; side < 2; side++) {
    const char *which = side == 0 ? "left" : "right";

    status = (String*)lst_get_ptr(fields, 2*side + 2);
    if (status->length != 1)
      die("ERROR: i-Line got illegal %sStatus = %s\n", which, status->chars);

    sub->iStatus[side] = status->chars[0];
    switch (sub->iStatus[side]) {
    case 'C': case 'I': case 'N': case 'n': case 'M': case 'T':
      break;
    default:
      die("ERROR: i-Line got illegal %sStatus = '%c'\n",
          which, sub->iStatus[side]);
    }

    count = (String*)lst_get_ptr(fields, 2*side + 3);
    sub->iCount[side] = atoi(count->chars);
  }

  for (k = 0; k < 6; k++)
    str_free((String*)lst_get_ptr(fields, k));
  lst_free(fields);

  sub->lineType[sub->numLine] = 'i';
  sub->numLine++;
}
Example #18
0
/* Parse a 'q' line of a MAF block and attach its quality string to
   sub.  The line must have three fields: "q", the source name (which
   must equal sub->src) and the quality string, whose length must
   equal that of sub->seq.  Where the sequence has a gap ('-') the
   quality must also be '-'; elsewhere it must be 'F' or a digit
   '0'-'9'.  The third field's String becomes sub->quality (ownership
   passes to sub); the other two fields are freed.  A 'q' entry is
   appended to sub->lineType.  Dies on any format violation. */
void mafBlock_add_qLine(String *line, MafSubBlock *sub) {
  List *l = lst_new_ptr(3);
  String *str;
  int i;

  if (sub->numLine<1 || sub->lineType[0]!='s') 
    die("ERROR: got q-Line without preceding s-Line in MAF block\n");

  if (3 != str_split(line, NULL, l))
    die("ERROR: expected three fields in q-Line of maf file, got %i\n", lst_size(l));
  
  //field[0] should be 'q'
  if (!(str_compare_charstr((String*)lst_get_ptr(l, 0), "q")==0))
    die("ERROR mafBlock_add_qLine expected 'q' got %s\n",
	((String*)lst_get_ptr(l, 0))->chars);
  
  //field[1] should be src, and should match src already set in sub
  if (str_compare((String*)lst_get_ptr(l, 1), sub->src) != 0)
    die("qLine sourceName does not match preceding s-Line (%s, %s)\n", 
	((String*)lst_get_ptr(l, 1))->chars, sub->src->chars);

  //field[2] should be quality
  if (sub->seq == NULL)
    die("ERROR mafBlock_add_qLine: sub->seq is NULL\n");
  str = (String*)lst_get_ptr(l, 2);
  if (sub->seq->length != str->length) 
    die("ERROR: length of q-line does not match sequence length\n");
  sub->quality = str;
  for (i=0; i<sub->quality->length; i++) {
    if (sub->seq->chars[i] == '-') {
      if (sub->quality->chars[i] != '-') 
	die("ERROR: got quality score where alignment char is gap\n");
    } else {
      /* legal scores are 'F' or a digit; the digit test must use ||
         (a character can never be both below '0' and above '9') */
      if (sub->quality->chars[i] != 'F' &&
	  (sub->quality->chars[i] < '0' || sub->quality->chars[i] > '9'))
	die("ERROR: Illegal quality score '%c' in MAF block\n", 
	    sub->quality->chars[i]);
    }
  }
   
  //only the first two fields are freed; the third is now sub->quality
  for (i=0; i<2; i++) str_free((String*)lst_get_ptr(l, i));
  lst_free(l);
  sub->lineType[sub->numLine++] = 'q';
}
Example #19
0
/* given list of spooled category names/numbers, return a list of
   corresponding unspooled category numbers */
/* Convert a list of spooled category names/numbers into the list of
   unspooled state numbers that map onto any of them.  When the map
   has no unspooler, the spooled category numbers are returned
   unchanged.  The caller owns the returned list. */
List *cm_get_unspooled_list(CategoryMap *cm, List *spooled) {
  int selected[cm->ncats+1];    /* selected[c] != 0 iff category c was requested */
  List *sp_cats = cm_get_category_list(cm, spooled, 0);
  List *result;
  int k, state;

  if (cm->unspooler == NULL)
    return sp_cats;

  result = lst_new_int(lst_size(sp_cats) * 3);

  for (k = 0; k <= cm->ncats; k++)
    selected[k] = 0;
  for (k = 0; k < lst_size(sp_cats); k++) {
    int cat = lst_get_int(sp_cats, k);
    selected[cat] = 1;
  }

  /* keep every unspooled state whose spooled counterpart was requested */
  for (state = 0; state < cm->unspooler->nstates_unspooled; state++) {
    int sp = cm->unspooler->unspooled_to_spooled[state];
    if (selected[sp])
      lst_push_int(result, state);
  }

  lst_free(sp_cats);
  return result;
}
Example #20
0
/*
 * JNI entry point: activate the archiver configuration.
 *
 * Result handling, keyed on the return value of activate_archiver_cfg():
 *   -1  internal error: throw via ThrowEx() and return NULL
 *   -2  archiver.cmd errors: convert the list to a String array, throw
 *       it via ThrowMultiMsgEx() and return NULL
 *   -3  archiver.cmd warnings: return them as a String array
 *   any other value: success, return NULL
 */
JNIEXPORT jobjectArray
Java_com_sun_netstorage_samqfs_mgmt_arc_Archiver_activateCfg(JNIEnv *env,
	jclass cls /*ARGSUSED*/, jobject ctx) {

	sqm_lst_t *err_warn_lst;
	jobjectArray warnArr, errArr;
	int res;

	PTRACE(1, "jni:Archiver_activateCfg() entry");
	res = activate_archiver_cfg(CTX, &err_warn_lst);
	/* the list length is only looked at for the two list-bearing codes */
	PTRACE(1, "jni:activateCfg returned %d, lst[%d]", res,
	    (res == -2 || res == -3) ? err_warn_lst->length : -1);

	if (res == -1) {
		/* internal error */
		ThrowEx(env);
		return (NULL);
	}

	if (res == -2) {
		/* archiver.cmd errors */
		errArr = lst2jarray(env,
			err_warn_lst, "java/lang/String", charr2String);
		lst_free_deep(err_warn_lst);
		ThrowMultiMsgEx(env, errArr);
		return (NULL);
	}

	if (res == -3) {
		/* archiver.cmd warnings */
		warnArr = lst2jarray(env,
			err_warn_lst, "java/lang/String", charr2String);
		lst_free_deep(err_warn_lst);
	} else {
		/* success */
		lst_free(err_warn_lst);
		warnArr = NULL;
	}

	PTRACE(1, "jni:Archiver_activateCfg() done");
	return (warnArr);
}
Example #21
0
/* Reorder the sub-blocks of a MafBlock to follow specNameOrder.
   Names in specNameOrder that are not in block->specMap are skipped;
   a name that resolves to an already placed sub-block is a fatal
   error.  Sub-blocks not named in specNameOrder are freed.
   block->data and block->specMap are replaced by newly built ones, in
   which both the src and the specName of each kept sub-block map to
   its new position. */
void mafBlock_reorder(MafBlock *block, List *specNameOrder) {
  int nOld = lst_size(block->data), nWanted = lst_size(specNameOrder);
  int *kept = smalloc(nOld*sizeof(int));
  List *reordered = lst_new_ptr(nOld);
  Hashtable *specIndex = hsh_new(100);
  int k;

  for (k = 0; k < nOld; k++)
    kept[k] = 0;

  /* place the requested species, in the requested order */
  for (k = 0; k < nWanted; k++) {
    String *name = (String*)lst_get_ptr(specNameOrder, k);
    int oldPos = hsh_get_int(block->specMap, name->chars);
    MafSubBlock *sub;
    int newPos;

    if (oldPos == -1)
      continue;                 /* species not present in this block */
    if (kept[oldPos]==1)
      die("ERROR: species %s appears twice in reorder list\n", 
          name->chars);

    sub = (MafSubBlock*)lst_get_ptr(block->data, oldPos);
    newPos = lst_size(reordered);
    hsh_put_int(specIndex, sub->src->chars, newPos);
    hsh_put_int(specIndex, sub->specName->chars, newPos);
    lst_push_ptr(reordered, (void*)sub);
    kept[oldPos] = 1;
  }

  /* discard whatever was not requested */
  for (k = 0; k < nOld; k++) {
    if (!kept[k])
      mafSubBlock_free((MafSubBlock*)lst_get_ptr(block->data, k));
  }

  hsh_free(block->specMap);
  lst_free(block->data);
  block->specMap = specIndex;
  block->data = reordered;
  sfree(kept);
}
Example #22
0
/* Release a CategoryMap and everything it holds: the category ranges,
   the conditioned_on lists, the two precedence arrays, the unspooler
   (if any) and the map structure itself. */
void cm_free(CategoryMap *cm) {
  int cat = 0;

  while (cat <= cm->ncats) {
    CategoryRange *range = cm->ranges[cat];
    int extra = 0;              /* further category numbers covered by
                                   this range, stepped over below */

    if (range != NULL) {
      extra = range->end_cat_no - range->start_cat_no;
      cm_free_category_range(range);
    }
    if (cm->conditioned_on[cat] != NULL)
      lst_free(cm->conditioned_on[cat]);

    cat += extra + 1;
  }

  sfree(cm->ranges);
  sfree(cm->conditioned_on);
  sfree(cm->labelling_precedence);
  sfree(cm->fill_precedence);

  if (cm->unspooler != NULL)
    cm_free_unspooler(cm->unspooler);

  sfree(cm);
}
Example #23
0
/* Free a single UnspoolNode: its children list, then the node itself.
   Only the list is freed here, not the nodes it refers to. */
void cm_free_unspool_node(UnspoolNode *n) {
  List *kids = n->children;

  lst_free(kids);
  sfree(n);
}
/* Dispose of a list of problem objects: problems_clear() is applied
   to the list first, then the list itself is freed. */
void problems_free(List *problems) {
  problems_clear(problems);

  lst_free(problems);
}
/* reconstruct indels by parsimony and assign all base probs to -1
   where ancestral bases are inferred not to have been present.

   For every column tuple of msa->ss, the entries
   mod->tree_posteriors->base_probs[0][*][node][tup] of internal nodes
   inferred to be gaps are set to -1.  Dies if a sequence name of the
   alignment has no matching node in the tree.  Builds
   mod->msa_seq_idx if it is not yet set. */
void do_indels(MSA *msa, TreeModel *mod) {
  int s, tup, i, j;
  TreeNode *n, *lca;
  char c;
  typedef enum {IGNORE, GAP, BASE, MISSING, AMBIG} label_type;
  List *postorder;

  label_type *label = smalloc(mod->tree->nnodes * sizeof(label_type));
  List *inside = lst_new_ptr(mod->tree->nnodes), 
    *outside = lst_new_ptr(mod->tree->nnodes),
    *ambig_cases = lst_new_ptr(mod->tree->nnodes);
  int *seq_to_leaf = smalloc(msa->nseqs * sizeof(int));

  /* build mapping from seqs to leaf indices in tree */
  for (s = 0; s < msa->nseqs; s++) {
    TreeNode *leaf = tr_get_node(mod->tree, msa->names[s]);
    if (leaf == NULL)
      die("ERROR: no match for sequence \"%s\" in tree.\n", msa->names[s]);
    seq_to_leaf[s] = leaf->id;
  }    

  if (mod->msa_seq_idx == NULL)
    tm_build_seq_idx(mod, msa);

  postorder = tr_postorder(mod->tree);

  for (tup = 0; tup < msa->ss->ntuples; tup++) {
    int min = mod->tree->nnodes, max = -1, ngaps = 0, skip_root = FALSE;

    /* find min and max ids of seqs that actually have bases (non-gaps) */
    for (s = 0; s < msa->nseqs; s++) {
      if (ss_get_char_tuple(msa, tup, s, 0) == GAP_CHAR) {
        ngaps++;
        continue;
      }
      if (seq_to_leaf[s] < min) min = seq_to_leaf[s];
      if (seq_to_leaf[s] > max) max = seq_to_leaf[s];

      /* NOTE: missing data being handled like bases here; in some
         cases, a base may be inferred at an ancestral node, when the
         only evidence for it is missing data in the leaves.  There
         are ambiguous cases; we'll err on the side of predicting
         bases rather than indels */
    }

    if (ngaps <= 1) continue;	/* short cut -- impossible to infer
                                   gaps in ancestors */

    else if (ngaps >= msa->nseqs - 1) {
      /* in this case, all ancestors must be gaps */
      for (i = 0; i < mod->tree->nnodes; i++) {
        n = lst_get_ptr(mod->tree->nodes, i);
        if (n->lchild == NULL || n->rchild == NULL) 
          continue;               /* ignore leaves */
        for (j = 0; j < mod->rate_matrix->size; j++)
          mod->tree_posteriors->base_probs[0][j][n->id][tup] = -1;
	/* mark as gap */
      }
      continue;
    }

    /* min and max are ints, so they must be printed with %d (the
       former %e expected a double) */
    if (min < 0) die("prequel.c: min = %d < 0\n", min);
    if (max < min) die("prequel.c: max (%d) < min (%d)", max, min);

    /* the LCA of all leaves with non-gaps must be the first ancestor of
       the node with the max id that has an id smaller than the min
       id.  This is based on the assumption that node ids are assigned
       sequentially in a preorder traversal of the tree, which will be
       true as long as the tree is read from a Newick file by the code
       in trees.c */
    for (lca = lst_get_ptr(mod->tree->nodes, max); lca->id > min; 
         lca = lca->parent);

    /* by parsimony, the base was inserted on the branch to the LCA,
       and all ancestral nodes outside the subtree rooted at the LCA
       did not have bases */

    if (lca == mod->tree->lchild || lca == mod->tree->rchild)
      skip_root = TRUE;        /* don't mark root as gap in this case:
                                  can't distinguish insertion from
                                  deletion so assume deletion */

    /* mark ancestral bases outside subtree beneath LCA as gaps */
    tr_partition_nodes(mod->tree, lca, inside, outside);
    for (i = 0; i < mod->tree->nnodes; i++) label[i] = BASE;
    for (i = 0; i < lst_size(outside); i++) {
      n = lst_get_ptr(outside, i);
      label[n->id] = IGNORE;
      if (n->lchild == NULL || n->rchild == NULL) 
        continue;               /* skip leaves */
      if (n == mod->tree && skip_root) 
        continue;               /* skip root if condition above */
      for (j = 0; j < mod->rate_matrix->size; j++)
        mod->tree_posteriors->base_probs[0][j][n->id][tup] = -1;
      /* mark as gap */
    }

    /* check for gaps in subtree; if there's at most one, we can go
       on; otherwise have to use parsimony to infer history in subtree */
    ngaps = 0;
    for (i = 0; i < lst_size(inside); i++) {
      n = lst_get_ptr(inside, i);
      if (n->lchild == NULL &&
          ss_get_char_tuple(msa, tup, mod->msa_seq_idx[n->id], 0) == GAP_CHAR)
        ngaps++;
    }
    if (ngaps <= 1) continue;

    /* use Dollo parsimony to infer the indel history of the subtree
       beneath the LCA.  Use the fact that every base must have a
       chain of bases to the LCA, because, assuming the alignment is
       correct, no insertions are possible beneath the LCA */
    lst_clear(ambig_cases);
    for (i = 0; i < lst_size(postorder); i++) {
      n = lst_get_ptr(postorder, i);
      if (label[n->id] == IGNORE) continue; /* outside subtree */

      /* MISSING means all leaves beneath node have missing data */
      /* AMBIG means combination of gaps and missing data beneath node */

      else if (n->lchild == NULL) {  /* leaf in subtree */
        c = ss_get_char_tuple(msa, tup, mod->msa_seq_idx[n->id], 0);
        if (c == GAP_CHAR)
          label[n->id] = GAP;
        else if (msa->is_missing[(int)c]) 
          label[n->id] = MISSING;
        else
          label[n->id] = BASE;
      }
      else {                    /* internal node in subtree */
        if (label[n->lchild->id] == BASE || label[n->rchild->id] == BASE)
          label[n->id] = BASE;  /* by Dollo parsimony */
        else if ((label[n->lchild->id] == GAP || label[n->lchild->id] == AMBIG) &&
                 (label[n->rchild->id] == GAP || label[n->rchild->id] == AMBIG))
          label[n->id] = GAP;   /* gaps from both sides and no bases -- must be gap */
        else if (label[n->lchild->id] == MISSING && label[n->rchild->id] == MISSING)
          label[n->id] = MISSING;
        else {              /* must be GAP/MISSING or AMBIG/MISSING */
          label[n->id] = AMBIG;
          lst_push_ptr(ambig_cases, n);
        }
      }
    }

    /* now resolve any ambiguities, by giving each ambiguous node the same
       label as its parent; traversing ambig_cases in reverse order
       ensures that parents are visited before children  */

    /* first make sure root of subtree has a base */
    if (label[lca->id] == MISSING || label[lca->id] == AMBIG)
      label[lca->id] = BASE;
    /* in this case there is all missing data and gaps beneath the LCA;
       hard to know what is right, but let's force a base and err on
       the side of bases rather than gaps */

    for (i = lst_size(ambig_cases) - 1; i >= 0; i--) {
      n = lst_get_ptr(ambig_cases, i);
      if (n == lca) continue;
      else label[n->id] = label[n->parent->id];
    }

    /* now mark gaps inside subtree, as needed */
    for (i = 0; i < lst_size(inside); i++) {
      n = lst_get_ptr(inside, i);
      if (n->lchild == NULL || n->rchild == NULL) continue;
      if (label[n->id] == GAP) 
        for (j = 0; j < mod->rate_matrix->size; j++)
          mod->tree_posteriors->base_probs[0][j][n->id][tup] = -1;
    }
  }

  lst_free(inside);
  lst_free(outside);
  lst_free(ambig_cases);
  sfree(seq_to_leaf);
  sfree(label);
}
Example #26
0
/* Entry point for hmm_train.  Parses the command line, then, for each
   alignment, reads the matching annotation file, labels every site
   with a category, optionally remaps categories by gap pattern, and
   accumulates transition, state, and begin counts.  An HMM is then
   estimated from the pooled counts and written to stdout.  Fatal
   conditions are reported through die(); returns 0 on success. */
int main(int argc, char* argv[]) {
    FILE* F;
    MSA *msa;
    int *msa_gap_patterns = NULL;
    HMM *hmm = NULL;
    TreeNode *tree = NULL;
    int i, input_format = SS, msa_idx, quiet_mode = FALSE,
           ncats, nmsas, ncats_unspooled, indel_nseqs = -1;
    String *msa_fname, *gff_fname;
    List *gff_fname_list = NULL, *msa_fname_list = NULL,
          *msa_length_list = NULL, *model_indels_str = NULL;
    Matrix *traincounts = NULL;
    Vector *begcounts = NULL, *statecounts = NULL;
    CategoryMap *cm = NULL;
    int c;                      /* must be int: getopt returns int, and
                                   comparing a plain char with -1 never
                                   matches where char is unsigned */
    GapPatternMap *gpm = NULL;
    GFF_Set *gff;
    char *reverse_groups_tag = NULL;

    /* NOTE(review): 'P' and 'G' are accepted by the option string but
       have no case below, so they are silently ignored */
    while ((c = getopt(argc, argv, "i:g:c:m:M:R:I:n:t:P:G:qh")) != -1) {
        switch(c) {
        case 'i':
            input_format = msa_str_to_format(optarg);
            if (input_format == -1)
                die("ERROR: bad alignment format.\n");
            break;
        case 'g':
            gff_fname_list = get_arg_list(optarg);
            break;
        case 'c':
            cm = cm_new_string_or_file(optarg);
            break;
        case 'm':
            msa_fname_list = get_arg_list(optarg);
            break;
        case 'M':
            msa_length_list = str_list_as_int(get_arg_list(optarg));
            break;
        case 'R':
            reverse_groups_tag = optarg;
            break;
        case 'I':
            model_indels_str = get_arg_list(optarg);
            break;
        case 'n':
            indel_nseqs = get_arg_int(optarg);
            break;
        case 't':
            if (optarg[0] == '(')     /* in this case, assume topology given
                                   at command line */
                tree = tr_new_from_string(optarg);
            else
                tree = tr_new_from_file(phast_fopen(optarg, "r"));
            break;
        case 'q':
            quiet_mode = TRUE;
            break;
        case 'h':
            print_usage();
            exit(0);
        case '?':
            die("ERROR: unrecognized option.\n\nType 'hmm_train -h' for usage.\n");
        }
    }

    /* NOTE(review): -m is required here, yet -m and -M are also declared
       mutually exclusive below, so as written -M can never be used and
       the "lengths only" paths further down are unreachable */
    if (msa_fname_list == NULL)
        die("ERROR: -m required.  Type 'hmm_train -h' for usage.\n");
    if (gff_fname_list == NULL)
        die("ERROR: -g required in training mode.  Type 'hmm_train -h' for usage.\n");
    if (msa_length_list != NULL && msa_fname_list != NULL)
        die("ERROR: -m and -M are mutually exclusive.  Type 'hmm_train -h' for usage.\n");
    if (model_indels_str != NULL && tree == NULL)
        die("ERROR: -I requires -t.  Type 'hmm_train -h' for usage.\n");
    if (cm == NULL)
        die("ERROR: category map required.\n");

    set_seed(-1);

    ncats = cm->ncats + 1;
    ncats_unspooled = cm->unspooler != NULL ? cm->unspooler->nstates_unspooled :
                      ncats;
    nmsas = (msa_length_list != NULL ? lst_size(msa_length_list) :
             lst_size(msa_fname_list));

    /* the main loop indexes gff_fname_list by alignment number, so there
       must be at least one annotation file per alignment */
    if (lst_size(gff_fname_list) < nmsas)
        die("ERROR: fewer annotation files (-g) than alignments.  Type 'hmm_train -h' for usage.\n");

    if (model_indels_str != NULL) {
        if (tree == NULL)
            die("ERROR: tree is NULL\n");  /*FIXME: indel_ncats broken */
        gpm = gp_create_gapcats(cm, model_indels_str, tree, FALSE);
        ncats = cm->ncats + 1;    /* numbers will change */
        ncats_unspooled = cm->unspooler == NULL ? ncats :
                          cm->unspooler->nstates_unspooled;
    }

    /* allocate memory for storage of "training paths" */
    traincounts = mat_new(ncats_unspooled, ncats_unspooled);
    statecounts = vec_new(ncats_unspooled);
    begcounts = vec_new(ncats_unspooled);
    mat_zero(traincounts);
    vec_zero(statecounts);
    vec_zero(begcounts);


    /* create skeleton of new HMM. */
    hmm = hmm_new_nstates(ncats_unspooled, 0, 0);

    /* Main loop: consider each MSA in turn */
    for (msa_idx = 0; msa_idx < nmsas; msa_idx++) {
        FILE *gff_f;

        if (msa_fname_list != NULL) {
            msa_fname = (String*)lst_get_ptr(msa_fname_list, msa_idx);
            F = phast_fopen(msa_fname->chars, "r");
            if (!quiet_mode)
                fprintf(stderr, "Reading alignment from %s ...\n",
                        F == stdin ? "stdin" : msa_fname->chars);
            msa = msa_new_from_file(F, NULL);
            phast_fclose(F);

        }
        else {                      /* only lengths of alignments specified */
            msa = msa_new(NULL, NULL, 0, lst_get_int(msa_length_list, msa_idx), NULL);
            /* just a shell in this case */
        }

        gff_fname = (String*)lst_get_ptr(gff_fname_list, msa_idx);
        if (!quiet_mode)
            fprintf(stderr, "Reading annotations from %s ...\n", gff_fname->chars);
        /* keep the handle so it can be closed; otherwise one descriptor
           is leaked per alignment */
        gff_f = phast_fopen(gff_fname->chars, "r");
        gff = gff_read_set(gff_f);
        phast_fclose(gff_f);

        /* convert GFF to coordinate frame of alignment */
        if (msa_length_list == NULL) {
            if (!quiet_mode)
                fprintf(stderr, "Mapping annotations to alignment ...\n");
            msa_map_gff_coords(msa, gff, 1, 0, 0); /* assume seq 1 is ref */
        }

        if (model_indels_str != NULL) {
            if (!quiet_mode)
                fprintf(stderr, "Obtaining gap patterns ...\n");
            msa_gap_patterns = smalloc(msa->length * sizeof(int));
            gp_set_phylo_patterns(gpm, msa_gap_patterns, msa);
        }

        /* at this point, we don't actually need the alignment anymore;
           if using ordered suff stats (likely with large data sets),
           can free them now, to avoid running out of memory */
        if (msa->ss != NULL) {
            ss_free(msa->ss);
            msa->ss = NULL;
        }

        if (reverse_groups_tag != NULL) {
            if (!quiet_mode)
                fprintf(stderr, "Reverse complementing features on negative strand (group by '%s') ...\n",
                        reverse_groups_tag);
            /* we don't need to reverse complement the whole alignment --
               just the gff and possibly the gap pattern array (pass a
               NULL msa) */
            gff_group(gff, reverse_groups_tag);
            msa_reverse_compl_feats(NULL, gff, msa_gap_patterns);
        }

        if (!quiet_mode)
            fprintf(stderr, "Labeling sites by category ...\n");
        msa_label_categories(msa, gff, cm);

        gff_free_set(gff);

        if (model_indels_str != NULL) {
            if (!quiet_mode)
                fprintf(stderr, "Remapping categories according to gap patterns ...\n");

            if (indel_nseqs > 0 && indel_nseqs != msa->nseqs) {
                /* in this case, we'll simply reassign non-trivial gap
                   patterns randomly.  This will achieve the desired
                   effect with minimal coding, as long as the number of
                   sites is not too small (the indel model is probably
                   useless anyway if the number is small) */
                int pat, newpat;
                int npatterns = 4 * indel_nseqs - 5;
                int complex_allowed[cm->ncats+1];
                List *no_complex_names, *no_complex_nums;

                if (!quiet_mode)
                    fprintf(stderr, "(target number of sequences: %d)\n", indel_nseqs);

                /* set up index indicating by cat no. whether complex gaps
                   are allowed */
                for (i = 0; i < ncats; i++) complex_allowed[i] = 1;
                no_complex_names = lst_new_ptr(10);
                str_split(str_new_charstr(NO_COMPLEX), ",", no_complex_names);
                no_complex_nums = cm_get_category_list(cm, no_complex_names, 1);
                for (i = 0; i < lst_size(no_complex_nums); i++)
                    complex_allowed[lst_get_int(no_complex_nums, i)] = 0;
                lst_free(no_complex_nums);
                lst_free_strings(no_complex_names);
                lst_free(no_complex_names);

                /* now reassign all non-null numbers; each run of equal
                   non-zero patterns receives one new random pattern */
                for (i = 0; i < msa->length; ) {
                    if ((pat = msa_gap_patterns[i]) != 0) {
                        if (complex_allowed[msa->categories[i]])
                            newpat = 1 + ((double)npatterns * unif_rand());
                        /* random number in interval [1, npatterns] */
                        else
                            newpat = 1 + ((double)(npatterns-1) * unif_rand());
                        /* random number in interval [1,npatterns-1]
                           (excludes complex gap pattern) */
                        for (; i < msa->length && msa_gap_patterns[i] == pat; i++)
                            msa_gap_patterns[i] = newpat; /* change for whole sequence */
                    }
                    else i++;
                }
            }

            /* obtain gapped category number for each site */
            for (i = 0; i < msa->length; i++)
                if (gpm->cat_x_pattern_to_gapcat[msa->categories[i]] != NULL)
                    msa->categories[i] = gpm->cat_x_pattern_to_gapcat[msa->categories[i]][msa_gap_patterns[i]];
        }

        if (!quiet_mode)
            fprintf(stderr, "Unspooling categories ...\n");
        cm_spooled_to_unspooled(cm, msa->categories, msa->length);

        if (!quiet_mode)
            fprintf(stderr, "Collecting training data ...\n");
        hmm_train_update_counts(traincounts, statecounts, begcounts,
                                msa->categories, msa->length,
                                ncats_unspooled);

        /* the array is reallocated on the next pass whenever indels are
           being modeled, so the stale pointer is never reused */
        if (msa_gap_patterns != NULL) sfree(msa_gap_patterns);
        msa_free(msa);
    }

    /* now train HMM, using cumulative data */
    hmm_train_from_counts(hmm, traincounts, NULL, statecounts, NULL,
                          begcounts, NULL);

    /* if modeling indels, adjust begin transitions so probability is
       distributed among different "gap pattern" states that all
       correspond to the same ungapped state (category); this helps
       avoid problems that occur when training on a few large sequences
       (e.g., whole chromosomes) and then testing on many shorter ones */
    if (model_indels_str != NULL) {
        double tprob[gpm->ncats];
        int nst[gpm->ncats];  /* total prob and number of states per
                             spooled, ungapped category */
        for (i = 0; i < gpm->ncats; i++) tprob[i] = nst[i] = 0;
        for (i = 0; i < hmm->nstates; i++) {
            if (vec_get(hmm->begin_transitions, i) > 0)
                /* have to go from unspooled space to spooled space, then to
                   ungapped space (HMM states correspond to unspooled,
                   gapped categories).  Note that states with nonzero begin
                   probs shouldn't be conditioned on other states. */
                tprob[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]] +=
                    vec_get(hmm->begin_transitions, i);
            /* every state is counted, whether or not its begin prob is
               positive */
            nst[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]]++;
        }
        for (i = 0; i < hmm->nstates; i++)
            if (tprob[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]] > 0)
                vec_set(hmm->begin_transitions, i,
                        tprob[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]] /
                        nst[gpm->gapcat_to_cat[cm_unspooled_to_spooled_cat(cm, i)]]);
        /* (uniform prior) */
    }

    /* write trained HMM */
    hmm_print(stdout, hmm);

    if (!quiet_mode) fprintf(stderr, "Done.\n");

    return 0;
}
Example #27
0
/** Build the scene and return it as a newly created list of surfaces.
 *  The returned list holds, in order: the cube triangles, the table
 *  triangles, the two spheres, and finally the ground plane.
 */
list356_t* get_surfaces() {

    list356_t* scene = make_list() ;

    // Corner points of the table slab.
    point3_t table_corners[8] = {
        {0, 0, 1}, {8, 0, 1}, {0, 8, 1}, {8, 8, 1},
        {0, 0, -1}, {8, 0, -1}, {0, 8, -1}, {8, 8, -1},
    } ;

    // Corner indices, three per triangle, two triangles per face.
    // The same index table is reused for the cube below.
    int corner_idx[] = {
        0, 1, 3,    0, 3, 2,        // top
        0, 2, 6,    0, 6, 4,        // left
        4, 6, 7,    4, 7, 5,        // bottom
        1, 5, 7,    1, 7, 3,        // right
        2, 3, 7,    2, 7, 6,        // back
        0, 4, 5,    0, 5, 1         // front
    } ;
    int top_end = 6 ;       // index entries belonging to the top face
    int box_end = 36 ;      // total number of index entries

    // Table: the top face is red, every other face is green.
    list356_t* table_parts = make_list() ;
    for (int k = 0; k < top_end; k += 3) {
        lst_add(table_parts, make_triangle(
                    table_corners[corner_idx[k]],
                    table_corners[corner_idx[k+1]],
                    table_corners[corner_idx[k+2]],
                    &RED, &RED, &WHITE, 10.0f)) ;
    }
    for (int k = top_end; k < box_end; k += 3) {
        lst_add(table_parts, make_triangle(
                    table_corners[corner_idx[k]],
                    table_corners[corner_idx[k+1]],
                    table_corners[corner_idx[k+2]],
                    &GREEN, &GREEN, &WHITE, 10.0f)) ;
    }

    // Two purple spheres, kept in the same sub-list as the table.
    lst_add(table_parts,
            make_sphere(6, 6, 1.75+.01, .75,
                &PURPLE, &PURPLE, &WHITE, 100.0f)) ;
    lst_add(table_parts,
            make_sphere(5, 2, 1.75+.01, .75,
                &PURPLE, &PURPLE, &WHITE, 100.0f)) ;

    // Transparent cube; its triangles go straight into the scene list,
    // ahead of the table parts.
    point3_t cube_corners[] = {
            {4, 0, 3}, {5, 0, 3}, {4, 1, 3}, {5, 1, 3},
            {4, 0, 1.01}, {5, 0, 1.01}, {4, 1, 1.01}, {5, 1, 1.01},
    } ;
    for (int k = 0; k < box_end; k += 3) {
        surface_t* face = make_triangle(
                    cube_corners[corner_idx[k]],
                    cube_corners[corner_idx[k+1]],
                    cube_corners[corner_idx[k+2]],
                    &BLACK, &BLACK, &WHITE, 10.0f) ;
        face->refr_index = 1.1f ;
        face->atten = &GREENISH ;
        lst_add(scene, face) ;
    }

    // Move the table parts into the scene, then drop the temporary list.
    for (list356_itr_t* it = lst_iterator(table_parts); lst_has_next(it); ) {
        lst_add(scene, lst_next(it)) ;
    }
    lst_free(table_parts) ;

    // Plane at z=-1, defined by three points.
    point3_t plane_a = {0, 0, -1} ;
    point3_t plane_b = {1, 0, -1} ;
    point3_t plane_c = {1, 1, -1} ;
    surface_t* ground = make_plane(
                plane_a, plane_b, plane_c,
                &LIGHT_GREY, &LIGHT_GREY, &BLACK, 10.0f) ;
    ground->refl_color = &LIGHT_GREY ;
    lst_add(scene, ground) ;

    return scene ;

}
/* Entry point: reads a feature set from the single input file named on
   the command line, applies the requested transformations in a fixed
   order (filter by type, group, add UTRs/introns/signals, exon
   subgrouping, filter by group, sort, remove overlaps, fix start/stop),
   and prints the result to stdout in the selected format.  Returns 0 on
   success; usage errors are reported through die(). */
int main(int argc, char *argv[]) {
  int c;                        /* must be int: getopt_long returns int,
                                   and a plain char never compares equal
                                   to -1 where char is unsigned */
  int opt_idx;
  GFF_Set *gff;
  List *include = NULL;
  char *groupby = "transcript_id", *exongroup_tag = NULL;
  int unique = FALSE, sort = FALSE, simplebed = FALSE, fix_start_stop = FALSE,
    add_utrs = FALSE, add_introns = FALSE, add_signals = FALSE;
  enum {GFF, BED, GENEPRED, WIG} output_format = GFF;
  FILE *discards_f = NULL, *groups_f = NULL;

  struct option long_opts[] = {
    {"output", 1, 0, 'o'},
    {"include-only", 1, 0, 'i'},
    {"include-groups", 1, 0, 'l'},
    {"groupby", 1, 0, 'g'},
    {"exongroup", 1, 0, 'e'},
    {"add-utrs", 0, 0, 'U'},
    {"add-introns", 0, 0, 'I'},
    {"add-signals", 0, 0, 'S'},
    {"fix-start-stop", 0, 0, 'f'},
    {"unique", 0, 0, 'u'},
    {"sort", 0, 0, 's'},
    {"simplebed", 0, 0, 'b'},
    {"discards", 1, 0, 'd'},
    {"help", 0, 0, 'h'},
    {0, 0, 0, 0}
  };

  while ((c = getopt_long(argc, argv, "o:i:l:g:e:d:UISfusbh", long_opts, &opt_idx)) != -1) {
    switch (c) {
    case 'o':
      /* "gff" is the default, so it only needs to be accepted */
      if (!strcmp("bed", optarg)) output_format = BED;
      else if (!strcmp("genepred", optarg)) output_format = GENEPRED;
      else if (!strcmp("wig", optarg)) output_format = WIG;
      else if (strcmp("gff", optarg)) die("ERROR: bad output format.\n");
      break;
    case 'i':
      include = get_arg_list(optarg);
      break;
    case 'l':
      groups_f = phast_fopen(optarg, "r");
      break;
    case 'g':
      groupby = optarg;
      break;
    case 'e':
      exongroup_tag = optarg;
      break;
    case 'U':
      add_utrs = TRUE;
      break;
    case 'I':
      add_introns = TRUE;
      break;
    case 'S':
      add_signals = TRUE;
      break;
    case 'f':
      fix_start_stop = TRUE;
      break;
    case 'u':
      unique = TRUE;
      break;
    case 'b':
      /* simple BED implies BED output */
      simplebed = TRUE;
      output_format = BED;
      break;
    case 'd':
      discards_f = phast_fopen(optarg, "w+");
      break;
    case 's':
      sort = TRUE;
      break;
    case 'h':
      usage(argv[0]);
      /* usage() is not expected to return; exit explicitly so that a
         help request can never fall through to the error case below */
      exit(0);
    case '?':
      die("Bad argument.  Try '%s -h'.\n", argv[0]);
    }
  }

  /* exactly one positional argument (the input file) is expected */
  if (optind != argc - 1) 
    die("Input filename required.  Try '%s -h'.\n", argv[0]);

  set_seed(-1);

  gff = gff_read_set(phast_fopen(argv[optind], "r"));

  if (lst_size(gff->features) == 0) exit(0); /* helps avoid unexpected
                                                behavior below */

  /* filter by type */
  if (include != NULL) gff_filter_by_type(gff, include, FALSE, discards_f);

  /* group */
  gff_group(gff, groupby);

  /* utrs, introns, & signals */
  if (add_utrs) gff_create_utrs(gff);
  if (add_introns) gff_create_introns(gff);
  if (add_signals) gff_create_signals(gff);

  /* subgroup */
  if (exongroup_tag != NULL) gff_exon_group(gff, exongroup_tag);

  /* filter by group: the group names are read from the -l file */
  if (groups_f != NULL) {
    String *s = str_new(STR_LONG_LEN);
    List *groups = lst_new_ptr(10000);
    str_slurp(s, groups_f);
    str_split(s, NULL, groups);
    gff_filter_by_group(gff, groups);
    lst_free_strings(groups); lst_free(groups);
    str_free(s);
  }

  /* sort */
  if (sort) gff_sort(gff);

  /* make unique */
  if (unique) gff_remove_overlaps(gff, discards_f);
  
  if (fix_start_stop) gff_fix_start_stop(gff);

  /* write the result in the requested format */
  if (output_format == BED)
    gff_print_bed(stdout, gff, !simplebed);
  else if (output_format == GENEPRED)
    gff_print_genepred(stdout, gff);
  else if (output_format == WIG)
    wig_print(stdout, gff);
  else 
    gff_print_set(stdout, gff);
  gff_free_set(gff);
  
  return 0;
}
/* Scans the alignment columns spanned by a cds feature, in every
   sequence, for runs of gap characters.  Returns one of:
     NGAPS            no gaps were found
     FSHIFT_BAD       some gap run has a length that is not a multiple
                      of 3; *problem is set to a new FSHIFT problem
                      covering that run and scanning stops
     NOVRLP_CLN_GAPS  all runs are multiples of 3, no two distinct runs
                      overlap, and none lies within 3 sites of a cds
                      boundary
     CLN_GAPS         all runs are multiples of 3, but some overlap or
                      lie near a boundary
   *problem is written only in the FSHIFT_BAD case.  Compensatory indels
   are not considered here. */
int scan_for_gaps(GFF_Feature *feat, MSA *msa, Problem **problem) {
  const int first = feat->start - 1; /* feature coords shifted down by 1 */
  const int last = feat->end - 1;
  int seq, pos, k;
  int touches_boundary = 0;
  cds_gap_type status = NGAPS;
  List *runs = lst_new_ptr(10);

  for (seq = 0; status != FSHIFT_BAD && seq < msa->nseqs; seq++) {
    pos = first;
    while (pos <= last) {
      int lo, hi;
      struct gap *rec;

      if (ss_get_char_pos(msa, pos, seq, 0) != GAP_CHAR) {
        pos++;
        continue;
      }

      /* extend the run in both directions; lo and hi end up inclusive */
      lo = pos - 1;
      while (lo >= first && ss_get_char_pos(msa, lo, seq, 0) == GAP_CHAR)
        lo--;
      lo++;
      hi = pos + 1;
      while (hi <= last && ss_get_char_pos(msa, hi, seq, 0) == GAP_CHAR)
        hi++;
      hi--;

      if ((hi - lo + 1) % 3 != 0) { /* frame-shifting gap: give up */
        status = FSHIFT_BAD;
        *problem = problem_new(feat, FSHIFT, lo, hi);
        (*problem)->cds_gap = FSHIFT_BAD;
        break;
      }

      /* remember whether any run lies within 3 sites of either end */
      if (lo <= first + 3 || hi >= last - 3)
        touches_boundary = 1;

      if (status == NGAPS)
        status = CLN_GAPS;

      rec = smalloc(sizeof(struct gap));
      rec->start = lo;
      rec->end = hi;
      lst_push_ptr(runs, rec);

      pos = hi + 1;             /* resume just past this run */
    }
  }

  if (status == CLN_GAPS) {
    /* compare neighbouring runs after sorting; identical runs (same
       start and end) do not count as an overlap.
       NOTE(review): relies on gap_compare ordering the runs so that
       overlapping ones end up adjacent -- confirm */
    int overlap = 0;

    lst_qsort(runs, gap_compare);
    for (k = 1; k < lst_size(runs); k++) {
      struct gap *prev = lst_get_ptr(runs, k-1);
      struct gap *cur = lst_get_ptr(runs, k);
      if (cur->start <= prev->end &&
          (cur->start != prev->start || cur->end != prev->end)) {
        overlap = 1;
        break;
      }
    }

    /* the boundary criterion is deliberately folded into the overlap
       criterion: either one keeps the result at CLN_GAPS */
    if (!overlap && !touches_boundary)
      status = NOVRLP_CLN_GAPS;
  }

  for (k = 0; k < lst_size(runs); k++) sfree(lst_get_ptr(runs, k));
  lst_free(runs);
  return status;
}
/* Checks whether the reference sequence (sequence 0 of the alignment)
   looks okay with respect to a given list of features.

   features       list of GFF_Feature* to examine, in order
   msa            alignment the feature coordinates refer to
   offset3        index into the extracted (gap-free) sequence of a 3'
                  splice feature at which the splice-site check starts
   indel_strict   if nonzero, also apply the stricter gap-related checks
   splice_strict  passed through to the splice-site validators
   problems       list to which a problem is added for each failure

   Returns TRUE if no feature failed any check, FALSE otherwise. */
int ref_seq_okay(List *features, MSA *msa, int offset3, 
                 int indel_strict, int splice_strict, List *problems) {
  List *signals = NULL;
  char *seq = NULL;
  int seqalloc = 0;
  int idx, retval = TRUE;
  GFF_Feature *feat, *lastfeat_helper = NULL;

  /* the list of signal feature names is the same for every feature, so
     parse it once here and release it at the end.
     NOTE(review): the temporary String from str_new_charstr is not
     released here */
  if (indel_strict) {
    signals = lst_new_ptr(10);
    str_split(str_new_charstr(SIGNALS), ",", signals);
  }

  for (idx = 0; idx < lst_size(features); idx++) {
    int i, j, len, has_gaps = 0; 
    int needed;

    feat = lst_get_ptr(features, idx);

    /* room for every column of the feature plus the terminator; always
       at least one byte so the terminator write below is valid.  The
       buffer only ever grows and is reused across features. */
    needed = feat->end - feat->start + 2;
    if (needed < 1) needed = 1;
    if (seqalloc < needed) {
      seqalloc = needed * 2; 
      seq = srealloc(seq, seqalloc * sizeof(char));
    }

    /* copy the reference bases, skipping gaps but noting their presence */
    for (i = feat->start - 1, len = 0; i < feat->end; i++) {
      if (ss_get_char_pos(msa, i, 0, 0) != GAP_CHAR)
        seq[len++] = ss_get_char_pos(msa, i, 0, 0);
      else if (!has_gaps) has_gaps = 1;
    }
    seq[len] = '\0';
    if (feat->strand == '-') msa_reverse_compl_seq(seq, len);

    /* type-specific checks; at most one problem is recorded here */
    if (str_equals_charstr(feat->feature, GFF_START_TYPE) && strcmp(seq, "ATG") != 0) {
      problem_add(problems, feat, BAD_REF_START, -1, -1);
      retval = FALSE;
    }
    else if (str_equals_charstr(feat->feature, GFF_STOP_TYPE) && 
             (feat->frame != 0 || !is_stop_codon(seq))) {
      problem_add(problems, feat, BAD_REF_STOP, -1, -1);
      retval = FALSE;
    }
    else if (str_starts_with_charstr(feat->feature, SPLICE_5) && 
             !is_valid_5splice(seq, splice_strict)) {
      problem_add(problems, feat, BAD_REF_5_SPLICE, -1, -1);
      retval = FALSE;
    }
    else if (str_starts_with_charstr(feat->feature, SPLICE_3) &&
             !is_valid_3splice(&seq[offset3], splice_strict)) {
      problem_add(problems, feat, BAD_REF_3_SPLICE, -1, -1);
      retval = FALSE;
    }
    else if (str_equals_charstr(feat->feature, GFF_CDS_TYPE)) {
      /* look for an in-frame stop codon, starting at the first
         complete codon as given by the feature's frame */
      for (i = (3 - feat->frame) % 3; i <= len - 3; i += 3) {
        if (is_stop_codon(&seq[i])) {
          problem_add(problems, feat, BAD_REF_ORF, -1, -1);
          retval = FALSE;
          break;
        }
      }
    }

    if (indel_strict) {
      int strict_okay = TRUE;

      if (str_in_list(feat->feature, signals)) {
        /* reject any signal feature with gaps in the ref seq, unless they
           appear in a non-critical part of a splice site or in a
           "prestart" feature  */
        if (has_gaps) {          
          if (str_starts_with_charstr(feat->feature, SPLICE_5)) {
            if (ss_get_char_pos(msa, feat->start-1, 0, 0) == GAP_CHAR ||
                ss_get_char_pos(msa, feat->start, 0, 0) == GAP_CHAR)
              strict_okay = FALSE;
          }
          else if (str_starts_with_charstr(feat->feature, SPLICE_3)) {
            if (ss_get_char_pos(msa, feat->end-1, 0, 0) == GAP_CHAR ||
                ss_get_char_pos(msa, feat->end-2, 0, 0) == GAP_CHAR)
              strict_okay = FALSE;
          }
          else if (!str_equals_charstr(feat->feature, "prestart"))
            strict_okay = FALSE;
        }
        /* in addition, if two signals occur consec. with gaps and
           only gaps between them, assume a violation of
           --indel-strict */
        if (lastfeat_helper != NULL && lastfeat_helper->end < feat->start-1) {
          int allgaps = 1;
          for (j = lastfeat_helper->end; allgaps && j < feat->start-1; j++) 
                                /* note indexing: -1+1 for end and -1
                                   for start  */
            if (ss_get_char_pos(msa, j, 0, 0) != GAP_CHAR) allgaps = 0;
          if (allgaps) 
            strict_okay = FALSE;
        }
        lastfeat_helper = feat;
      }
      else lastfeat_helper = NULL;
    
      /* also exclude CDS exons whose gap-free length is 6 or less in
         indel_strict case -- these cause problems in exoniphy training
         because start_codon is adjacent to cds5ss */
      if (str_equals_charstr(feat->feature, GFF_CDS_TYPE) && len <= 6)
        strict_okay = FALSE;

      if (!strict_okay) {
        problem_add(problems, feat, BAD_REF_INDEL_STRICT_FAIL, -1, -1);
        retval = FALSE;
      }
    }
  }

  if (signals != NULL) {
    lst_free_strings(signals);
    lst_free(signals);
  }
  if (seq != NULL) sfree(seq);
  return retval;
}