Example #1
0
/* Create a category map with a category for each feature type in a
    GFF_Set.  Category numbers are assigned in order of appearance of
    types */
CategoryMap* cm_new_from_features(GFF_Set *feats) {
  int i;
  CategoryMap *retval;
  Hashtable *hash;
  List *types;

  /* first scan features for all types */
  hash = hsh_new(10);
  types = lst_new_ptr(10);
  for (i = 0; i < lst_size(feats->features); i++) {
    GFF_Feature *f = lst_get_ptr(feats->features, i);
    checkInterruptN(i, 10000);
    if (hsh_get(hash, f->feature->chars) == (void*)-1) {
      lst_push_ptr(types, f->feature);
      hsh_put_int(hash, f->feature->chars, 1);
    }
  }
  hsh_free(hash);

  /* now create a simple category map */
  retval = cm_new(lst_size(types));
  for (i = 0; i <= retval->ncats; i++) {
    String *type = i == 0 ? str_new_charstr(BACKGD_CAT_NAME) : 
      str_dup(lst_get_ptr(types, i-1));
    retval->ranges[i] = cm_new_category_range(type, i, i);
  }
  lst_free(types);
  return retval;
}
Example #2
0
SEXP rph_tree_summary_rchild(SEXP treeP) {
  TreeNode *tr = rph_tree_new(treeP), *node;
  int i, *rchild, nnode, *idmap;
  List *nodes = tr_preorder(tr);
  SEXP result;

  nnode = lst_size(nodes);
  result = PROTECT(NEW_INTEGER(nnode));
  rchild = INTEGER_POINTER(result);
  idmap = smalloc((nnode+1)*sizeof(int));
  for (i=0; i < nnode; i++) {
    node = (TreeNode*)lst_get_ptr(nodes, i);
    if (node->id > nnode || node->id < 0)
      die("invalid id (%i) in tree node\n", node->id);
    idmap[(int)node->id] = i;
  }
  for (i=0; i < nnode; i++) {
    node = (TreeNode*)lst_get_ptr(nodes, i);
    if (node->rchild == NULL)
      rchild[idmap[node->id]] = -1;
    else rchild[idmap[node->id]] = idmap[node->rchild->id] + 1;
  }
  UNPROTECT(1);
  return result;
}
/* Read substitution scores from specified file and return as a kind
   of pseudo substitution matrix.  All nonspecified elements in matrix
   will be equal to NEGINFTY, which is to be interpretted as "NA" */
Matrix* read_subst_scores(TreeModel *mod, FILE *F) {
  Matrix *retval = mat_new(mod->rate_matrix->size,
                                        mod->rate_matrix->size);
  String *line = str_new(STR_MED_LEN), *tuple1, *tuple2;
  List *l = lst_new_ptr(3);
  int alph_size = (int)strlen(mod->rate_matrix->states);
  int *inv_alph = mod->rate_matrix->inv_states;
  double val;
  mat_set_all(retval, NEGINFTY);
  while (str_readline(line, F) != EOF) {
    str_double_trim(line);
    if (str_starts_with_charstr(line, "#") || line->length == 0) 
      continue;
    str_split(line, NULL, l);
    if (lst_size(l) < 3) {
      die("ERROR: wrong number of columns in subst. score file.\n");
    }
    tuple1 = lst_get_ptr(l, 0);
    tuple2 = lst_get_ptr(l, 1);
    if (str_as_dbl(lst_get_ptr(l, 2), &val) != 0) {
      die("ERROR: bad value in subst. score file.\n");
    }
    mat_set(retval, tuple_index(tuple1->chars, inv_alph, alph_size),
                   tuple_index(tuple2->chars, inv_alph, alph_size), val);
    str_free(tuple1); str_free(tuple2); str_free(lst_get_ptr(l, 2));
  }
  lst_free(l);
  str_free(line);
  return retval;
}
/* Exclude stop codons from all CDS in a group, as necessary.  Record
   any features that are changed, so they can be changed back before
   data is output */
void exclude_stops(GFF_FeatureGroup *group, List *starts_adjusted, 
                   List *ends_adjusted) {
  int j, k;
  List *stops = lst_new_ptr(1), *gfeatures = group->features;
  GFF_Feature *feat;
  lst_clear(stops); lst_clear(ends_adjusted); lst_clear(starts_adjusted);
  for (j = 0; j < lst_size(gfeatures); j++) { /* first grab all stops.  We 
                                                 expect at most one, but more 
                                                 are possible */
    feat = lst_get_ptr(gfeatures, j);
    if (str_equals_charstr(feat->feature, GFF_STOP_TYPE)) lst_push_ptr(stops, feat);
  }
  for (j = 0; j < lst_size(gfeatures); j++) { /* now look at CDSs */
    feat = lst_get_ptr(gfeatures, j);
    if (str_equals_charstr(feat->feature, GFF_CDS_TYPE)) {
      for (k = 0; k < lst_size(stops); k++) { /* check stops */
        GFF_Feature *stop = lst_get_ptr(stops, k);
        if (feat->strand == '+' && stop->strand == '+' && 
            feat->end == stop->end) {
          feat->end -= 3; 
          lst_push_ptr(ends_adjusted, feat);
        }
        else if (feat->strand == '-' && stop->strand == '-' && 
                 feat->start == stop->start) {
          feat->start += 3; 
          lst_push_ptr(starts_adjusted, feat);
        }
      }
    }
  }
  lst_free(stops);
}
Example #5
0
void mafBlock_remove_lines(MafBlock *block, int *keep) {
  int i, oldSize = lst_size(block->data), newSize=0;
  MafSubBlock *sub, *testSub;
  for (i=0; i<oldSize; i++) {
    if (keep[i]) {
      if (i != newSize) {
	sub = (MafSubBlock*)lst_get_ptr(block->data, i);
	hsh_reset_int(block->specMap, sub->src->chars, newSize);
	hsh_reset_int(block->specMap, sub->specName->chars, newSize);
	testSub = (MafSubBlock*)lst_get_ptr(block->data, newSize);
	if (testSub != NULL)
	  die("ERROR: mafBlock_remove_lines: testSub should be NULL\n");
	lst_set_ptr(block->data, newSize, (void*)sub);
	lst_set_ptr(block->data, i, NULL);
      }
      newSize++;
    } else {
      sub = (MafSubBlock*)lst_get_ptr(block->data, i);
      hsh_reset_int(block->specMap, sub->src->chars, -1);
      hsh_reset_int(block->specMap, sub->specName->chars, -1);
      mafSubBlock_free(sub);
      lst_set_ptr(block->data, i, NULL);
    }
  }
  for (i=oldSize-1; i>=newSize; i--)
    lst_delete_idx(block->data, i);
}
Example #6
0
/* Closes all outfiles.  If already closed, reopen with append, add #eof 
   closer, and close again.  see comment above at get_outfile */
void close_outfiles(List *outfileList, Hashtable *outfileHash) {
  List *keys = hsh_keys(outfileHash);
  int *done, idx, i;
  char *fname;
  FILE *outfile;
  done = smalloc(lst_size(keys)*sizeof(int));
  for (i=0; i<lst_size(keys); i++) {
    done[i]=0;
    fname = (char*)lst_get_ptr(keys, i);
    idx = hsh_get_int(outfileHash, fname);
    outfile = (FILE*)lst_get_ptr(outfileList, idx);
    if (outfile != NULL) {
      mafBlock_close_outfile(outfile);
      done[i]=1;
    }
  }
  for (i=0; i<lst_size(keys); i++) {
    if (done[i]) continue;
    fname = (char*)lst_get_ptr(keys, i);
    outfile = phast_fopen(fname, "a");
    mafBlock_close_outfile(outfile);
  }
  sfree(done);
  lst_free(keys);
  lst_free(outfileList);
  hsh_free(outfileHash);
}
Example #7
0
SEXP rph_gff_featureBits(SEXP gffListP, SEXP orP, SEXP returnGffP) {
  int numGff, i, j, or, returnGff;
  long numbit = 0;
  List *gfflist;
  GFF_Set *gff, *newgff=NULL;
  GFF_Feature *feat, *newfeat;
  SEXP rv;

  numGff = length(gffListP);
  gfflist = lst_new_ptr(numGff);
  //  Rf_PrintValue(gffListP);
  for (i = 0; i < numGff; i++) {
    gff = (GFF_Set*)EXTPTR_PTR(VECTOR_ELT(gffListP, i));
    lst_push_ptr(gfflist, gff);
    gff_register_protect(gff);
  }
  or = LOGICAL_VALUE(orP);
  returnGff = LOGICAL_VALUE(returnGffP);
  if (!or && numGff >= 2) {
    newgff = gff_overlap_gff(lst_get_ptr(gfflist, 0),
			     lst_get_ptr(gfflist, 1),
			     1, -1.0, FALSE, TRUE, NULL);
    numbit = gff_flatten_mergeAll(newgff);
    for (i=2; i < numGff; i++) {
      checkInterrupt();
      gff = gff_overlap_gff(newgff,
			    lst_get_ptr(gfflist, i),
			    1, -1.0, FALSE, TRUE, NULL);
      numbit = gff_flatten_mergeAll(gff);
      gff_free_set(newgff);
      newgff = gff;
    }
  } else {
    newgff = gff_new_set();
    for (i=0; i< numGff; i++) {
      gff = (GFF_Set*)lst_get_ptr(gfflist, i);
      for (j=0; j < lst_size(gff->features); j++) {
	checkInterruptN(j, 1000);
	feat = lst_get_ptr(gff->features, j);
	newfeat = gff_new_feature_copy(feat);
	lst_push_ptr(newgff->features, newfeat);
      }
    }
    numbit = gff_flatten_mergeAll(newgff);
  }
  if (returnGff)
    return rph_gff_new_extptr(newgff);

  if (numbit > INT_MAX) {
    PROTECT(rv = allocVector(REALSXP, 1));
    REAL(rv)[0] = numbit;
  } else {
    PROTECT(rv = allocVector(INTSXP, 1));
    INTEGER(rv)[0] = numbit;
  }
  UNPROTECT(1);
  return rv;
}
Example #8
0
CategoryRange* cm_category_range_create_copy(CategoryRange *src) {
  int i;
  CategoryRange *retval = 
    cm_new_category_range(str_dup(lst_get_ptr(src->feature_types, 0)),
                          src->start_cat_no, src->end_cat_no);
  for (i = 1; i < lst_size(src->feature_types); i++)
    lst_push_ptr(retval->feature_types, 
                 str_dup(lst_get_ptr(src->feature_types, i)));
  return retval;
}
/* given a list of 5' and 3' splice sites extracted from a group,
   check whether they form valid pairs in all species */
int are_introns_okay(List *intron_splice,  MSA *msa, List *problems,
                     int offset5, int offset3) {
  int i, j, start1, start2;
  char str1[3], str2[3], str12[5];
  char strand;
  int retval = 1;
  char * splice_pairs[3] = {"GTAG", "GCAG", "ATAC"};

  str1[2] = '\0'; str2[2] = '\0';

  if (lst_size(intron_splice) < 2) return 1;

  strand = ((GFF_Feature*)lst_get_ptr(intron_splice, 0))->strand;
                                /* assume all same strand */

  if (strand == '+')
    lst_qsort(intron_splice, feature_comparator_ascending); 
  else
    lst_qsort(intron_splice, feature_comparator_descending); 

  for (i = 0; i < lst_size(intron_splice) - 1; i++) {
    /* assume every 5' splice and immediately following 3' splice
       form a pair */
    GFF_Feature *f1 = lst_get_ptr(intron_splice, i);
    GFF_Feature *f2 = lst_get_ptr(intron_splice, i+1);
    if (str_starts_with_charstr(f1->feature, SPLICE_5) &&
        str_starts_with_charstr(f2->feature, SPLICE_3)) {
      start1 = f1->start - 1 + (strand == '-' ? offset5 : 0);
      start2 = f2->start - 1 + (strand == '+' ? offset3 : 0);
      for (j = 0; j < msa->nseqs; j++) {
        str1[0] = ss_get_char_tuple(msa, msa->ss->tuple_idx[start1], j, 0);
        str1[1] = ss_get_char_tuple(msa, msa->ss->tuple_idx[start1+1], j, 0);
        str2[0] = ss_get_char_tuple(msa, msa->ss->tuple_idx[start2], j, 0);
        str2[1] = ss_get_char_tuple(msa, msa->ss->tuple_idx[start2+1], j, 0);
        if (strand == '-') {
          msa_reverse_compl_seq(str1, 2);
          msa_reverse_compl_seq(str2, 2);
        }
	strcpy(str12, str1); strcat(str12, str2);
        if (!is_signal(str12, 3, splice_pairs, msa->is_missing)) {
          problem_add(problems, f1, BAD_INTRON, -1, -1);
          problem_add(problems, f2, BAD_INTRON, -1, -1);
          retval = 0;
          break;
        }
      }
      i++;                      /* no need to look at next feature */
    }
  }
  return retval;
}
Example #10
0
/*
  open a file with name out_root.name.maf, or returns it if already open.
  This is a bit messy because in some cases (splitting by feature) there may
  be more output files than the OS can handle.  But it would be computationally
  expensive to check and see which files are finished, assuming that the MAF is
  sorted.  

  So, if it tries to open a file and fails, it the goes through the list of
  filehandles, finds an open one, closes it, and tries to open the new one 
  again.  Repeat until successful.

  Then, if a filehandle needs to be re-opened, it is opened with append.  Again,
  if this is not successful, it looks for another file to close.  If it can't
  find one the program reports an error and dies.

  Finally, close_outfiles below checks and makes sure that all files
  are closed with mafBlock_close_file in the end, so that they get the #eof
  closer.
 */
FILE *get_outfile(List *outfileList, Hashtable *outfileHash, String *name, char *out_root,
		  int argc, char *argv[]) {
  int idx, i;
  FILE *outfile;
  char *fname = smalloc((strlen(out_root)+name->length+7)*sizeof(char));
  sprintf(fname, "%s.%s.maf", out_root, name->chars);
  idx = ptr_to_int(hsh_get(outfileHash, fname));
  if (idx == -1) {
    hsh_put(outfileHash, fname, int_to_ptr(lst_size(outfileList)));
    outfile = mafBlock_open_outfile(fname, argc, argv);
    while (outfile==NULL) {  //too many files are open, close one first
      for (i=0; i<lst_size(outfileList); i++) {
	outfile = (FILE*)lst_get_ptr(outfileList, i);
	if (outfile != NULL) break;
      }
      if (i == lst_size(outfileList)) {
	die("ERROR: too many files open in maf_parse\n");
      } else {
	phast_fclose(outfile);
	lst_set_ptr(outfileList, i, NULL);
      }
      outfile = mafBlock_open_outfile(fname, argc, argv);
    }
    lst_push_ptr(outfileList, (void*)outfile);
    sfree(fname);
    return outfile;
  }
  outfile = (FILE*)lst_get_ptr(outfileList, idx);
  if (outfile == NULL) { //has already been opened but then closed.
    outfile = phast_fopen_no_exit(fname, "a");
    while (outfile == NULL) {
      for (i=0; i<lst_size(outfileList); i++) {
	outfile = (FILE*)lst_get_ptr(outfileList, i);
	if (outfile != NULL) break;
      }
      if (i == lst_size(outfileList)) {
	die("ERROR: too many files open in maf_parse\n");
      } else {
	phast_fclose(outfile);
	lst_set_ptr(outfileList, i, NULL);
      }
      outfile = phast_fopen_no_exit(fname, "a");
    }
    lst_set_ptr(outfileList, idx, (void*)outfile);
  }
  sfree(fname);
  return outfile;
}
Example #11
0
/* Identify branches wrt which a given feature is uninformative,
   in the sense that all leaves beneath these branches having only missing
   data.  Will set (preallocated) array has_data[i] = I(branch above
   node i is informative).  Will also set *nspec equal to number of
   leaves that have data. */
void ff_find_missing_branches(TreeModel *mod, MSA *msa, GFF_Feature *feat,
                              int *has_data, int *nspec) {
    int i, j;
    List *traversal = tr_postorder(mod->tree);
    *nspec = 0;
    for (i = 0; i < lst_size(traversal); i++) {
        TreeNode *n = lst_get_ptr(traversal, i);
        if (!((n->lchild == NULL && n->rchild == NULL) ||
                (n->lchild != NULL && n->rchild != NULL)))
            die("ERROR ff_find_missing_branches: lchild and rchild should both be NULL or not NULL\n");
        if (n->parent == NULL)      /* root */
            has_data[n->id] = FALSE;
        else if (n->lchild == NULL) {    /* leaf */
            has_data[n->id] = FALSE;       /* initialize to F, set to T if
                                        base in any col in feature */
            for (j = feat->start-1; j < feat->end; j++) {
                if (mod->rate_matrix->
                        inv_states[(int)ss_get_char_tuple(msa, msa->ss->tuple_idx[j],
                                   mod->msa_seq_idx[n->id], 0)] >= 0) {
                    has_data[n->id] = TRUE;
                    (*nspec)++;
                    break;
                }
            }
        }
        else {                      /* non-root ancestral node */
            if (has_data[n->lchild->id] || has_data[n->rchild->id])
                has_data[n->id] = TRUE;
            else
                has_data[n->id] = FALSE;
        }
    }
}
Example #12
0
/* add leaf with specified name to specified internal branch */
void tr_add_leaf_internal(TreeNode *t, int branch, char *lname, int lgroup) {
  TreeNode *oldnode, *newanc, *newleaf;

  oldnode = lst_get_ptr(t->nodes, branch); /* node beneath branch in question */
  if (oldnode == t)
    die("ERROR tr_add_leaf_internal: oldnode == t\n");

  newanc = tr_new_node();
  newleaf = tr_new_node();
  strcpy(newleaf->name, lname);
  newleaf->dparent = lgroup;

  newanc->rchild = newleaf;
  newleaf->parent = newanc;
  newanc->lchild = oldnode;
  newanc->parent = oldnode->parent; 

  if (oldnode->parent->lchild == oldnode)
    oldnode->parent->lchild = newanc;
  else 
    oldnode->parent->rchild = newanc;

  oldnode->parent = newanc;

  if (lgroup > 0 && lgroup == oldnode->dparent)
    newanc->dparent = lgroup;

  /* fix up ids and nodes list */
  lst_push_ptr(t->nodes, newanc);
  newanc->id = lst_size(t->nodes) - 1; /* circumvent normal id assignment */
  lst_push_ptr(t->nodes, newleaf);
  newleaf->id = lst_size(t->nodes) - 1;
  t->nnodes += 2;
}
double calcMMscore(char *seqData, int base, List *MarkovMatrices, int conservative) {
  int i, baseAsNum, j;
  double val;
  int mmOrder = lst_size(MarkovMatrices)-1;
  Matrix *mm;
  int previousMMbases[mmOrder];
    
  //If there aren't mmOrder previous bases @ base, then adjust mmOrder to take advantage of however many we have
  if (base < mmOrder)
    mmOrder = base;
      
  //If we run into any unknown "N" characters, adjust the mmOrder accordingly
  for(i=mmOrder; i>0; i--)
    {
      baseAsNum = basetocol(seqData[base-i]);
      if (baseAsNum < 0)
        mmOrder = i-1;
      else
        previousMMbases[mmOrder-i] = baseAsNum;
    }
   	
  //Get score from Markov Matrix
  mm =  lst_get_ptr(MarkovMatrices, mmOrder);
  j = basesToRow(previousMMbases, mmOrder, mm->ncols);
  if (j >= 0)
    val = log(mat_get(mm, j, basetocol(seqData[base])));
  else
	{
      if (conservative == 1)
        val = log(0);	//If it is an unknown base, probability is 0, in log space =inf
      else
        val = 0; //If it is an unknown base probability is 1, in log space log(1)=0
	}
  return val;
}
Example #14
0
void mafBlock_strip_eLines(MafBlock *block) {
  int i, *keep = smalloc(lst_size(block->data)*sizeof(int));
  for (i=0; i<lst_size(block->data); i++) 
    keep[i] = (((MafSubBlock*)lst_get_ptr(block->data, i))->lineType[0] != 'e');
  mafBlock_remove_lines(block, keep);
  sfree(keep);
}
Example #15
0
long mafBlock_get_start(MafBlock *block, String *specName) {
  int idx=0;
  if (specName != NULL) 
    idx = hsh_get_int(block->specMap, specName->chars);
  if (idx == -1 || idx >= lst_size(block->data)) return -1;
  return ((MafSubBlock*)lst_get_ptr(block->data, idx))->start;
}
Example #16
0
//sets fieldSize[i] to maximum length of field i in MAF, so that block
//can be printed with nice formatting
void mafBlock_get_fieldSizes(MafBlock *block, int fieldSize[6]) {
  int i;
  MafSubBlock *sub;
  char tempstr[1000];
  for (i=0; i<6; i++) fieldSize[i] = 0;

  fieldSize[0] = 1;  //this is always one character
  fieldSize[4] = 1;  //this is always one character (strand)
  for (i=0; i<lst_size(block->data); i++) {
    sub = (MafSubBlock*)lst_get_ptr(block->data, i);

    //field[1] is src
    if (sub->src->length > fieldSize[1])
      fieldSize[1] = sub->src->length;

    //field[2] is start
    sprintf(tempstr, "%li", sub->start);
    if (strlen(tempstr) > fieldSize[2])
      fieldSize[2] = (int)strlen(tempstr);

    //field[3] is size
    sprintf(tempstr, "%i", sub->size);
    if (strlen(tempstr) > fieldSize[3])
      fieldSize[3] = (int)strlen(tempstr);
    
    //field[4] is strand... skip
    
    //field[5] is srcSize
    sprintf(tempstr, "%li", sub->srcSize);
    if (strlen(tempstr) > fieldSize[5])
      fieldSize[5] = (int)strlen(tempstr);

    //don't worry about size of lastField since it just goes to end-of-line
  }
}
Example #17
0
SEXP rph_tree_rename(SEXP treeVec, SEXP oldNamesP, SEXP newNamesP) {
  int i, numtree = LENGTH(treeVec), treeIdx;
  TreeNode *tr, *n;
  SEXP result;
  Hashtable *hash = hsh_new(20);
  char *str;

  for (i=0; i<LENGTH(oldNamesP); i++) {
    str = smalloc((strlen(CHAR(STRING_ELT(newNamesP, i)))+1)*sizeof(char));
    strcpy(str, CHAR(STRING_ELT(newNamesP, i)));
    hsh_put(hash, CHAR(STRING_ELT(oldNamesP, i)), str);
  }

  PROTECT(result = NEW_CHARACTER(numtree));
  for (treeIdx=0; treeIdx < numtree; treeIdx++) {
    tr = rph_tree_new(STRING_ELT(treeVec, treeIdx));
    //    tr = tr_new_from_string(CHAR(STRING_ELT(treeVec, treeIdx)));
    for (i=0; i<tr->nnodes; i++) {
      n = lst_get_ptr(tr->nodes, i);
      if (n->name[0] != '\0' &&
	  (str = hsh_get(hash, n->name)) != (char*)-1)
	strcpy(n->name, str);
    }
    str = tr_to_string(tr, 1);
    SET_STRING_ELT(result, treeIdx, mkChar(str));
  }
  UNPROTECT(1);
  return result;
}
/* write machine-readable log entry for discarded feature */
void write_machine_log(FILE *mlogf, GFF_FeatureGroup *group, List *problems,
                       msa_coord_map *map) {
  int i;
  for (i = 0; i < lst_size(problems); i++) {
    write_machine_problem(mlogf, group, lst_get_ptr(problems, i), map);
  }
}
/* Reset a problem list to the empty state */
void problems_clear(List *problems) {
  int i;
  for (i = 0; i < lst_size(problems); i++) {
    problem_free(lst_get_ptr(problems, i));
  }
  lst_clear(problems);
}
Example #20
0
/* last item in predecessors is assumed to be the most recently visited */
int cm_get_unspooled_state(CategoryMap *cm, int spooled_state, 
                           List *predecessors) {
  UnspoolNode *n, *child;
  int p, pred_idx, i;

  pred_idx = lst_size(predecessors) - 1;
  n = cm->unspooler->spooled_to_unspooled[spooled_state];
  while (n->newstate == -1) {
    child = NULL;
    while (n != child && pred_idx >= 0) {
      p = lst_get_int(predecessors, pred_idx--);
      for (i = 0; n != child && i < lst_size(n->children); i++) {
        child = (UnspoolNode*)lst_get_ptr(n->children, i);
        if (child->oldstate == p) 
          n = child;
      }
    }
    if (n != child) {
      fprintf(stderr, "ERROR (cm_get_unspooled_state): no match for state %d preceded by state(s) ", spooled_state);
      for (i = 0; i < lst_size(predecessors); i++) 
        fprintf(stderr, "%d ", lst_get_int(predecessors, i));
      fprintf(stderr, "\n");
      return -1;
    }
    if (n != child)
      die("ERROR cm_get_unspooled_state: n != child\n");
  }
  return n->newstate;
}
int is_exon(GFF_Feature *feat, List *l) {
  int i;
  for (i = 0; i < lst_size(l); i++) 
    if (str_equals_nocase(feat->feature, (String*)lst_get_ptr(l, i)))
      return 1;
  return 0;
}
Example #22
0
/* Return a list of category names corresponding to a given list of
   category names and or numbers.  Doesn't allocate new names,
   just pointers to Strings in the CategoryMap object or the
   provided List */
List *cm_get_category_str_list(CategoryMap *cm, List *names, int ignore_missing) {
  int i, cat;
  List *retval = lst_new_ptr(lst_size(names));
  for (i = 0; i < lst_size(names); i++) {
    String *n = lst_get_ptr(names, i);
    if (str_as_int(n, &cat) == 0) {
      if (cm == NULL)
	die("ERROR: if categories are specified by number, a category map is required\n");
      if (cat < 0 || (cm != NULL && cat > cm->ncats)) 
        die("ERROR: category number %d is out of bounds.\n", cat);
      lst_push_ptr(retval, cm_get_feature(cm, cat));
    }
    else {
      if (cm != NULL) {
	cat = cm_get_category(cm, n);
	if (cat == 0 && !ignore_missing && !str_equals(n, cm_get_feature(cm, 0))) {
	  die("ERROR: illegal category name (\"%s\")\n", n->chars);
	}
	//return pointers to cm if possible
	lst_push_ptr(retval, cm_get_feature(cm, cat));
      }
      //otherwise return pointers to strings in list
      else lst_push_ptr(retval, n);
    }
  }
  return retval;
}
Example #23
0
void mafBlock_mask_region(MafBlock *block, GFF_Set *mask_feats, List *speclist) {
  MafSubBlock *refblock, *maskblock;
  int i, j, spec_idx;
  GFF_Set *feat;
  GFF_Feature *f, *prevf=NULL;
  int next_feat_idx = 1;
  char **maskseq;
  int num_mask_seq=0;
  long coord;
  if (mask_feats == NULL || lst_size(mask_feats->features) == 0L) return;
  maskseq = smalloc(lst_size(speclist)*sizeof(char*));
  for (i=0; i < lst_size(speclist); i++) {
    spec_idx = hsh_get_int(block->specMap, ((String*)lst_get_ptr(speclist, i))->chars);
    if (spec_idx == -1) continue;
    maskblock = lst_get_ptr(block->data, spec_idx);
    if (maskblock->seq == NULL) continue;
    maskseq[num_mask_seq++] = maskblock->seq->chars;
  }
  if (num_mask_seq == 0) {
    sfree(maskseq);
    return;
  }
  feat = gff_copy_set_no_groups(mask_feats);
  gff_flatten_mergeAll(feat);
  f = lst_get_ptr(feat->features, 0);

  refblock = lst_get_ptr(block->data, 0);
  coord = refblock->start;
  for (i=0; i < block->seqlen; i++) {
    if (refblock->seq->chars[i] != '-') coord++;  //this is 1-based coordinate
    if (coord > f->end) {
      if (next_feat_idx == lst_size(feat->features))
	break;
      prevf = f;
      f = lst_get_ptr(feat->features, next_feat_idx++);
      if (f->start <= prevf->end) {
	die("Error: feats not sorted in mafBlock_mask_region");  //shouldn't happen
      }
    }
    if (coord >= f->start && coord <= f->end) {
      for (j=0; j < num_mask_seq; j++)
	if (maskseq[j][i] != '-') maskseq[j][i] = 'N';
    }
  }
  gff_free_set(feat);
  sfree(maskseq);
}
/* Read an amino acid rate matrix in the format used by PAML.  Reorder
   the rows and columns to match 'alph'.  Warning: the ordering in the
   file is assumed to match that used in the files in the PAML
   distribution (alphabetical order of 3-letter codes), which is also
   the order of AA_ALPHABET (therefore AA_ALPHABET may not be
   changed!).  Equilibrium frequencies are ignored.  */ 
Matrix *read_paml_matrix(FILE *F, char *alph) {
  char *paml_alph = "ARNDCQEGHILKMFPSTWYV$";
  int size = (int)strlen(paml_alph);
  Matrix *retval = mat_new(size, size);
  List *fields = lst_new_ptr(100);
  String *line = str_new(STR_MED_LEN);
  int i, j;
  if (strcmp(alph, paml_alph) != 0)
    die("ERROR read_paml_matrix (alph (%s) != paml_alph (%s))\n",
	alph, paml_alph);
  mat_zero(retval);

  for (i = 1; i < size-1 && str_readline(line, F) != EOF; ) {
    /* NOTE: size of matrix allows for stop, but stop not included in
       file; therefore, only read size-1 lines */
    str_double_trim(line);
    if (line->length == 0) continue;
    str_split(line, NULL, fields);
    if (lst_size(fields) != i) {
      die("ERROR: row %d of matrix must have %d columns.\n",
	  i+1, i);
    }
    for (j = 0; j < lst_size(fields); j++) {
      double val;

      if (str_as_dbl(lst_get_ptr(fields, j), &val) != 0) {
        die("ERROR: non-numeric matrix element in subst. matrix ('%s')\n", 
	    ((String*)lst_get_ptr(fields, j+1))->chars);
      }
      str_free(lst_get_ptr(fields, j));

      if (j >= size)
	die("ERROR read_paml_matrix j (%i) should be < size (%i)\n", j, size);
      mat_set(retval, i, j, val);
      mat_set(retval, j, i, val);
    }
    i++;
  }

  if (i != size - 1) {
    die("ERROR: too few rows in subst. matrix.\n");
  }
  
  lst_free(fields);
  str_free(line);
  return retval;
}
Example #25
0
/* Perform a GERP-like computation for each feature.  Computes expected
   number of subst. under neutrality (feat_nneut), expected number
   after rescaling by ML (feat_nobs), expected number of rejected
   substitutions (feat_nrejected), and number of species with data
   (feat_nspecies).  If any arrays are NULL, values will not be
   retained.  Gaps and missing data are handled by working with
   induced subtree.  */
void ff_gerp(TreeModel *mod, MSA *msa, GFF_Set *gff, mode_type mode,
             double *feat_nneut, double *feat_nobs, double *feat_nrejected,
             double *feat_nspec, FILE *logf) {
    int i, j, nspec = 0;
    double nneut, scale, lnl;
    int *has_data = smalloc(mod->tree->nnodes * sizeof(int));
    FeatFitData *d;

    /* init FeatFitData */
    d = ff_init_fit_data(mod, msa, ALL, NNEUT, FALSE);

    /* iterate through features  */
    for (i = 0; i < lst_size(gff->features); i++) {
        GFF_Feature *f = lst_get_ptr(gff->features, i);
        checkInterrupt();
        ff_find_missing_branches(mod, msa, f, has_data, &nspec);

        if (nspec < 3)
            nneut = scale = 0;
        else {
            vec_set(d->cdata->params, 0, d->cdata->init_scale);
            d->feat = f;

            opt_newton_1d(ff_likelihood_wrapper_1d, &d->cdata->params->data[0], d,
                          &lnl, SIGFIGS, d->cdata->lb->data[0], d->cdata->ub->data[0],
                          logf, NULL, NULL);
            /* turns out to be faster to use numerical rather than exact
               derivatives (judging by col case) */

            scale = d->cdata->params->data[0];
            for (j = 1, nneut = 0; j < mod->tree->nnodes; j++)  /* node 0 is root */
                if (has_data[j])
                    nneut += ((TreeNode*)lst_get_ptr(mod->tree->nodes, j))->dparent;
        }

        if (feat_nspec != NULL) feat_nspec[i] = (double)nspec;
        if (feat_nneut != NULL) feat_nneut[i] = nneut;
        if (feat_nobs != NULL) feat_nobs[i] = scale * nneut;
        if (feat_nrejected != NULL) {
            feat_nrejected[i] = nneut * (1 - scale);
            if (mode == ACC) feat_nrejected[i] *= -1;
            else if (mode == NNEUT) feat_nrejected[i] = fabs(feat_nrejected[i]);
        }
    }
    ff_free_fit_data(d);
    sfree(has_data);
}
Example #26
0
void mafBlock_strip_iLines(MafBlock *block) {
  MafSubBlock *sub;
  int i;
  for (i=0; i<lst_size(block->data); i++) {
    sub = (MafSubBlock*)lst_get_ptr(block->data, i);
    mafSubBlock_strip_iLine(sub);
  }
}
Example #27
0
void mafBlock_print(FILE *outfile, MafBlock *block, int pretty_print) {
  int i, j, k, numSpace;
  int fieldSize[6];  //maximum # of characters in the first 6 fields of block
  MafSubBlock *sub;
  char firstChar, formatstr[1000];
  char *firstseq=NULL;

  //if processing has reduced the number of species with data to zero, or has
  //reduced the block to all gaps, don't print
  if (lst_size(block->data) == 0 ||
      mafBlock_all_gaps(block)) return;
  mafBlock_remove_gap_cols(block);
  mafBlock_get_fieldSizes(block, fieldSize);

  fprintf(outfile, "%s\n", block->aLine->chars);
  for (i=0; i<lst_size(block->data); i++) {
    sub = (MafSubBlock*)lst_get_ptr(block->data, i);
    for (j=0; j<sub->numLine; j++) {
      firstChar = sub->lineType[j];
      if (firstChar == 's' || firstChar == 'e') {
	sprintf(formatstr, "%%c %%-%is %%%ii %%%ii %%c %%%ii ",
		fieldSize[1], fieldSize[2], fieldSize[3], fieldSize[5]);
	fprintf(outfile, formatstr, firstChar, sub->src->chars,
		sub->start, sub->size, sub->strand, sub->srcSize);
	if (firstChar == 's') {
	  if (firstseq == NULL) {
	    fprintf(outfile, "%s\n", sub->seq->chars);
	    if (pretty_print) firstseq = sub->seq->chars;
	  }
	  else {
	    for (k=0; k<block->seqlen; k++)
	      fputc(tolower(sub->seq->chars[k])==tolower(firstseq[k]) ? 
		    '.' : sub->seq->chars[k], 
		    outfile);
	  }
	}
	else fprintf(outfile, "%c\n", sub->eStatus);
      } else if (firstChar=='i') {
	sprintf(formatstr, "i %%-%is %%c %%i %%c %%i",
		fieldSize[1]);
	fprintf(outfile, formatstr, sub->src->chars,
		sub->iStatus[0], sub->iCount[0],
		sub->iStatus[1], sub->iCount[1]);
	fputc('\n', outfile);
      } else {
	if (firstChar != 'q')
	  die("ERROR mafBlock_print: firstChar should be q, got %c\n", firstChar);
	sprintf(formatstr, "q %%-%is", fieldSize[1]);
	fprintf(outfile, formatstr, sub->src->chars);
	numSpace = 6 + fieldSize[2] + fieldSize[3] + fieldSize[5];
	for (k=0; k<numSpace; k++) fputc(' ', outfile);
	fprintf(outfile, "%s\n", sub->quality->chars);
      }
    }
  }
  fputc('\n', outfile);  //blank line to mark end of block
  //  fflush(outfile);
}
Example #28
0
void cm_free_category_range(CategoryRange *cr) {
  int i;
  for (i = 0; i < lst_size(cr->feature_types); i++) {
    String *s = (String*)lst_get_ptr(cr->feature_types, i);
    if (s != NULL) str_free(s);
  }
  lst_free(cr->feature_types);
  sfree(cr);
}
Example #29
0
//change all bases with quality score <= cutoff to N
void mafBlock_mask_bases(MafBlock *block, int cutoff, FILE *outfile) {
  MafSubBlock *sub;
  int i, j, firstMasked;
  char *refseq=NULL, *refseqName;
  long firstCoord, lastCoord=-1, *coord;

  sub = (MafSubBlock*)lst_get_ptr(block->data, 0);
  refseq = sub->seq->chars;
  coord = smalloc(block->seqlen*sizeof(long));
  firstCoord = sub->start;
  refseqName = sub->src->chars;
  for (i=0; i < block->seqlen; i++) {
    if (refseq[i] != '-')
      coord[i] = firstCoord++;
    else coord[i] = -1;
  }
  lastCoord = firstCoord;

  for (i=0; i<lst_size(block->data); i++) {
    sub = (MafSubBlock*)lst_get_ptr(block->data, i);
    if (sub->quality==NULL) continue;
    firstMasked=-1;
    for (j=0; j<block->seqlen; j++) {
      if (sub->quality->chars[j] == '-' && refseq[j]=='-') continue;
      if ((sub->quality->chars[j] != '-' && sub->quality->chars[j] != 'F')
	  && sub->quality->chars[j] - '0' <= cutoff) {
	sub->seq->chars[j] = 'N';
	if (firstMasked == -1 && refseq[j]!='-')
	  firstMasked = j;
      }
      else if (firstMasked != -1) {
	if (outfile != NULL) {
	  fprintf(outfile, "%s\t%li\t%li\t%s\n", 
		  refseqName, coord[firstMasked], coord[j], sub->src->chars);
	}
	firstMasked = -1;
      }
    }
    if (outfile != NULL && firstMasked != -1) 
      fprintf(outfile, "%s\t%li\t%li\t%s\n", refseqName, coord[firstMasked], lastCoord, sub->src->chars);
  }
  sfree(coord);
}
Example #30
0
SEXP rph_tree_summary_label(SEXP treeP) {
  TreeNode *tr = rph_tree_new(treeP), *node;
  List *nodes = tr_preorder(tr);
  int i, nnodes = lst_size(nodes);
  SEXP result;
  for (i=0; i < nnodes; i++) {
    node = (TreeNode*)lst_get_ptr(nodes, i);
    if (node->label != NULL) break;
  }
  if (i == nnodes) return R_NilValue;

  PROTECT(result = NEW_CHARACTER(nnodes));
  for (i=0; i < nnodes; i++) {
    node = (TreeNode*)lst_get_ptr(nodes, i);
    if (node->label == NULL) SET_STRING_ELT(result, i, NA_STRING);
    else SET_STRING_ELT(result, i, mkChar(node->label));
  }
  UNPROTECT(1);
  return result;
}