/*
  Look up a sample by its name.

  The name is searched for in the global hash table of sequence names
  (ht_seq_names).  If an entry is found, the value stored in that entry
  is returned as a SAMPLE pointer; if no entry exists for the name,
  NULL is returned.

  NULL is also returned, without performing a lookup, when sample_name
  is NULL or when ht_seq_names has not been created yet.

  Note: read_seq_file() inserts every sequence name into the table but
  only attaches a sample to the entries of sequences it keeps, so an
  entry may exist without a sample; what is returned then is whatever
  hash_get_entry_value() yields for such an entry.
*/
SAMPLE *get_sample_by_name(
  char *sample_name		// name of the sample to look up
)
{
  HASH_TABLE_ENTRY *hash_entry;

  // ht_seq_names is created lazily by read_seq_file(), so it can still be
  // unset here.  How hash_lookup_str() treats a null key or table is not
  // visible from this code, so do not hand it either one.
  if (sample_name == NULL || !ht_seq_names) return NULL;

  hash_entry = hash_lookup_str(sample_name, ht_seq_names);
  if (hash_entry == NULL) return NULL;		// name not in table

  return (SAMPLE *) hash_get_entry_value(hash_entry);
} // get_sample_by_name
Example #2
0
/*
  Adds a new object to the heap.
  The heap is implemented using an array. Properties of the heap:
    The root node is at index position 1
    The root has the smallest score in the heap
    The score for a child node is >= the score of its parent
    The left child of a node at index p is at index position 2*p
    The right child of a node at index p is at index position 2*p+1
  
  If the heap is not full the object is added to the next available 
  index in the node list. The new object is then compared with its 
  parent. While the new object is < its parent object, the two nodes are
  swapped, until the heap order is re-established.

  If the heap is full the new object is compared to the root object. 
  If the new object is > the root object, the root is "bumped"
  from the heap and the new object replaces the root object.  
  The new object is then compared with its children nodes and is swapped
  with the smallest child node until the heap order is re-established.

  If the heap is not full when add_node_heap() is called, it returns NULL.
  Otherwise, if the new node is successfully added to the heap, it
  returns a pointer to the bumped node (the old root node); if the
  node cannot be added (it is smaller than the old root), it returns
  a pointer to the new node.

  If a get_key function was defined, no objects will be added
  to the heap that have duplicate keys; when a duplicate key is
  detected, a pointer to the new (rejected) node is returned.
*/
void *add_node_heap(
  HEAP *heap,           // heap
  void *node            // object to add to heap
)
{
  int i = heap->next_node; 	// index for the next node
  int max = heap->max_size; 	// node_list indexes from 0 to max_size
  void *parent;			// parent node objects
  int p_node; 	 		// index of parent
  void *left_node, *right_node;	// objects in the left and right child nodes
  int l_idx, r_idx;		// index of left and right child nodes
  int comp_n;			//
  void *bumped_node = NULL; 	//
  char *node_key = NULL;	// string used as key of node in hash table

  // get info for new node
  void *new = node;

  // Check if node with same key already is in heap if we would add this node.
  // Don't add node to heap if one does. 
  if (heap->ht && ((i<=max) || (heap->compare(node, heap->node_list[1]) > 0))) {
    node_key = heap->get_key(new);
    if (hash_lookup_str(node_key, heap->ht)) return new;
  }
DATASET *read_seq_file(
  char *file_name,		/* name of file to open */
  char *alpha,			/* alphabet used in sequences */
  BOOLEAN use_comp,		/* use complementary strands, too */
  double seqfrac 		/* fraction of input sequences to use */
)
{
  int i, j;
  FILE *data_file;		/* file with samples to read */
  FILE *prior_file=NULL;	/* file with positional priors to read */
  char *sample_name;		/* name of sample read */
  char *sample_de;		/* descriptor text for sample */
  char *sequence;		/* sequence read */
  long length;			/* length of sequence */
  BOOLEAN error=FALSE;		/* none yet */
  SAMPLE *sample;		/* sample created */
  DATASET *dataset;		/* dataset created */
  int n_samples=0;		/* number of samples read */
  double *seq_weights=NULL;	/* sequence weights */
  int n_wgts=0;			/* number of sequence weights given */

  /* create a hash table of sequence names */
  if (!ht_seq_names) ht_seq_names = hash_create(DATA_HASH_SIZE);

  /* create a dataset */
  dataset = (DATASET *) mymalloc(sizeof(DATASET));
  dataset->alength = strlen(alpha);
  dataset->alphabet = alpha;
  dataset->psp_w = 0;			// indicates no PSP was read
  dataset->log_psp_w = 0;		// so log_psp will get initialized

  /* open data file */
  if (file_name == NULL) {
    fprintf(stderr, "You must specify a data file or `stdin'.\n");
    exit(1);
  } else if (strcmp(file_name, "stdin")) {
    data_file = fopen(file_name, "r"); 
    if (data_file == NULL) {
      fprintf(stderr, "Cannot open file `%s'.\n", file_name);
      exit(1);
    }
  } else {
    data_file = stdin;
  }

  /* initialize maximum length of sequences */
  dataset->max_slength = 0;
  dataset->min_slength = 10000000;

  dataset->n_samples = 0;	/* no samples yet */
  dataset->samples = NULL;	/* no samples */

  while (read_sequence(data_file, &sample_name, &sample_de, &sequence, 
    &length)) {

    /* skip sequence if an error occurred */
    if (length < 0) continue;

    /* parse weights if given; make (more than enough) room in array */
    if (strcmp(sample_name, "WEIGHTS")==0) {
      double wgt; 
      char *wgt_str = sample_de;
      Resize(seq_weights, n_wgts+(int)strlen(wgt_str), double);
      while (sscanf(wgt_str, "%lf", &wgt) == 1) {
        if (wgt <= 0 || wgt > 1) {
	  fprintf(stderr, 
            "Weights must be larger than zero and no greater than 1.\n");
	  exit(1);
        }
        seq_weights[n_wgts++] = wgt;			/* save weight */
        wgt_str += strspn(wgt_str, "      ");		/* skip white */
        wgt_str += strcspn(wgt_str, "     ");		/* skip token */
      }
      myfree(sample_name);
      myfree(sample_de);
      myfree(sequence);
      continue;
    }

    /* ignore duplicate (same sample name) sequences */ 
    if (hash_lookup_str(sample_name, ht_seq_names) != NULL) {
      fprintf(stderr, "Skipping sequence '%s'.\n", sample_name);
      myfree(sample_name);
      myfree(sample_de);
      myfree(sequence);
      continue;
    }
    hash_insert_str(sample_name, ht_seq_names);  /* put name in hash table */

    n_samples++;

    /* see if sequence will be used in random sample; store it if yes */
    if (drand48() >= 1 - seqfrac) {

      HASH_TABLE_ENTRY *hash_entry; // needed to add pointer to sample

      /* create the sample */
      sample = create_sample(alpha, length, sample_name, sequence, sample_de, use_comp);
      if (sample == NULL) {error = TRUE; continue;}

      /* record maximum length of actual sequences */
      dataset->max_slength = MAX(sample->length, dataset->max_slength);
      dataset->min_slength = MIN(sample->length, dataset->min_slength);

      /* put the sample in the array of samples */
      if ((dataset->n_samples % RCHUNK) == 0) {
        Resize(dataset->samples, dataset->n_samples + RCHUNK, SAMPLE *);
      }
      HASH_TABLE_ENTRY *hash_entry; // needed to add pointer to sample

      /* create the sample */
      sample = create_sample(alpha, length, sample_name, sequence, sample_de, use_comp);
      if (sample == NULL) {error = TRUE; continue;}

      /* record maximum length of actual sequences */
      dataset->max_slength = MAX(sample->length, dataset->max_slength);
      dataset->min_slength = MIN(sample->length, dataset->min_slength);

      /* put the sample in the array of samples */
      if ((dataset->n_samples % RCHUNK) == 0) {
        Resize(dataset->samples, dataset->n_samples + RCHUNK, SAMPLE *);
      }
      dataset->samples[dataset->n_samples++] = sample;
      hash_entry = hash_lookup_str (sample_name, ht_seq_names);
      if (!hash_entry) {
	fprintf(stderr, "hash error: added sample ID then failed to find it in read_seq_file ()\n");
	error = TRUE;
	break;
      }
      hash_set_entry_value(sample, hash_entry);
    }
    /* cleanup sequence (create_sample copies it) */
    myfree(sequence);
    
  } /* sequences */
  if (length < 0) error = TRUE;			/* read_sequence error */
  
  /* resize the array of samples */
  if (dataset->n_samples) Resize(dataset->samples, dataset->n_samples, SAMPLE*);