Beispiel #1
0
/**************************************************************************
 * CISML callback for the opening pattern tag of a secondary motif database.
 *
 * The motif is looked up in the loader's secondary motif tree using the
 * loader's database id together with the pattern accession.
 *  - Unknown motif: the current secondary motif is cleared and nothing else
 *    happens.
 *  - Known motif already flagged as loaded: fatal error.
 *  - Known motif not yet loaded: it becomes the current secondary motif and
 *    one entry of the secondary match list is zeroed for every sequence.
 *    When the configured threshold lies in [-1, 0) it is treated as a
 *    multiplier: the score threshold becomes the best possible PSSM match
 *    score times the negated value.
 *
 * Only ctx and accession are read; the remaining parameters are unused.
 **************************************************************************/
void motif_secondary(void *ctx, char *accession, char *name, char *db, char *lsId, double *pvalue, double *score) {
  SECONDARY_LOADER_T *loader = (SECONDARY_LOADER_T*)ctx;
  SECONDARY_KEY_T lookup_key;
  RBNODE_T *found;
  PSSM_T *scorer;
  int slot, total;

  // the tree is keyed on the (database id, motif id) pair
  lookup_key.db_id = loader->db_id;
  lookup_key.motif_id = accession;
  found = rbtree_lookup(loader->secondary_motifs, &lookup_key, FALSE, NULL);

  // a motif we are not scoring
  if (found == NULL) {
    loader->secondary_motif = NULL;
    return;
  }

  loader->secondary_motif = (SECONDARY_MOTIF_T*)rbtree_value(found);
  if (loader->secondary_motif->loaded) {
    die("Already seen CISML data for this motif!");
  } else {
    // forget the matches recorded for the previous motif
    total = rbtree_size(loader->sequences);
    slot = 0;
    while (slot < total) {
      loader->secondary_matches[slot] = 0;
      ++slot;
    }
    // a value in [-1, 0) is a fraction of the best possible match score
    if (loader->score_threshold_or_multiplier >= -1 && loader->score_threshold_or_multiplier < 0) {
      scorer = build_motif_pssm(loader->secondary_motif->motif, loader->background, loader->background, NULL, 0, PSSM_RANGE, 0, FALSE);
      loader->calculated_score_threshold = pssm_best_match_score(scorer) * (-loader->score_threshold_or_multiplier);
      free_pssm(scorer);
    }
  }
}
Beispiel #2
0
/*****************************************************************************
 * Reads frequency attributes into the freqs array of the parser state.
 *
 *  ps     parser state. ps->alph_ids maps an attribute name to an index
 *         into ps->freqs; ps->freqs is allocated here on first use and is
 *         reset to -1 ("not seen") on every call.
 *  tag    name of the element being parsed, only used in error reports.
 *  attrs  NULL terminated list of alternating attribute names and values.
 *
 * Attributes whose name is not in ps->alph_ids are ignored. Duplicate,
 * unparsable, out of range (outside [0, 1]) and missing attributes are
 * reported through dreme_attr_parse_error. When every frequency was read
 * successfully the sum is checked to be 1 within a tolerance of 0.001 per
 * symbol.
 ****************************************************************************/
static void parse_freq_attrs(PS_T *ps, const char* tag, const xmlChar **attrs) {
  int i, ncore, seen, *idx;
  char *end_ptr;
  double value, sum;
  RBNODE_T *node;
  bool seen_bad;
  ncore = rbtree_size(ps->alph_ids);
  // initialize the freqs array
  if (ps->freqs == NULL) ps->freqs = mm_malloc(sizeof(double) * ncore);
  // reset the freqs array; -1 marks a frequency that has not been seen
  for (i = 0; i < ncore; i++) ps->freqs[i] = -1;
  seen = 0;
  seen_bad = false;
  sum = 0.0;
  // iterate over the attribute name/value pairs
  for (i = 0; attrs[i] != NULL; i += 2) {
    idx = (int*)rbtree_get(ps->alph_ids, attrs[i]);
    if (idx != NULL) {
      assert(*idx < ncore);
      if (ps->freqs[*idx] != -1) {
        dreme_attr_parse_error(ps, PARSE_ATTR_DUPLICATE, tag, (const char*)attrs[i], NULL);
        continue;
      }
      seen++;
      errno = 0; // reset because we're about to check it
      value = strtod((const char*)attrs[i+1], &end_ptr);
      // allow out of range values, mainly because freqs can be very close to zero
      // The range test is written in the negated form so that NaN (which
      // strtod accepts as "nan" and which compares false against everything)
      // is rejected instead of silently poisoning the sum.
      if (end_ptr == (const char*)attrs[i+1] || (errno && errno != ERANGE) || !(value >= 0 && value <= 1)) {
        dreme_attr_parse_error(ps, PARSE_ATTR_BAD_VALUE, tag, (const char*)attrs[i], (const char*)attrs[i+1]);
        ps->freqs[*idx] = 0; // mark the frequency as seen, even though it's bad
        seen_bad = true;
        continue;
      }
      ps->freqs[*idx] = value;
      sum += value;
    }
  }
  // check we got everything
  if (seen < ncore) {
    // identify what we're missing
    for (node = rbtree_first(ps->alph_ids); node != NULL; node = rbtree_next(node)) {
      idx = (int*)rbtree_value(node);
      if (ps->freqs[*idx] == -1) {
        dreme_attr_parse_error(ps, PARSE_ATTR_MISSING, tag, (char*)rbtree_key(node), NULL);
      }
    }
  } else if (!seen_bad) {
    // check the frequencies sum to 1
    double delta = sum - 1;
    delta = (delta < 0 ? -delta : delta);
    if (delta > (0.001 * ncore)) {
      // dreme writes background probabilities to 3 decimal places so assuming
      // the error on each is at maximum 0.001 then the total error for the
      // sum must be less than or equal to 0.001 per symbol (e.g. 0.004 for
      // an alphabet with 4 symbols)
      error(ps, "Probabilities of %s do not sum to 1, got %g .\n", tag, sum);
    }
  }
}
Beispiel #3
0
/**************************************************************************
 * Puts counts into the spacing bins of the secondary motif.
 *
 * For every sequence with a non-zero entry in matches (indexed by the
 * sequence's index) the entry is decoded as a signed position: the sign
 * gives the strand of the secondary match and the magnitude its position.
 * The gap between the primary and the secondary (counting from zero) and
 * the quadrant (side combined with same/opposite strand) are derived from
 * it and the corresponding bin, as well as the motif's total_spacings
 * counter, is incremented.
 *
 * The side is expressed relative to the primary's orientation, so it is
 * swapped when the primary match is on the reverse strand. A gap outside
 * [0, margin - secondary length] is a fatal error.
 **************************************************************************/
void bin_matches(int margin, int bin_size, RBTREE_T *sequences, MOTIF_T *primary_motif, SECONDARY_MOTIF_T *secondary_motif, int *matches) {
  const int plen = get_motif_trimmed_length(primary_motif);
  const int slen = get_motif_trimmed_length(secondary_motif->motif);
  // largest allowed gap, note that the gap counts from zero
  const int furthest = margin - slen;
  RBNODE_T *cur;

  for (cur = rbtree_first(sequences); cur != NULL; cur = rbtree_next(cur)) {
    SEQUENCE_T *seq = (SEQUENCE_T*)rbtree_value(cur);
    const int encoded = matches[seq->index];
    int p_rev, s_rev, pos, on_left, gap, quad;

    // zero means this sequence has no secondary match
    if (encoded == 0) continue;

    // decode strand and position
    p_rev = seq->primary_match < 0;
    s_rev = encoded < 0;
    pos = (s_rev ? -encoded : encoded);

    // gap to the primary and the side of the primary the secondary is on;
    // a reverse strand primary flips the reference direction
    on_left = (pos <= margin);
    gap = (on_left ? margin - pos - slen + 1 : pos - margin - plen - 1);
    quad = (on_left != p_rev ? LEFT : RIGHT);

    // check that we're within the acceptable range
    if (gap < 0 || gap > furthest) {
      die("Secondary motif match not within margin as it should be due to prior checks!");
    }

    // fold in the relative strand
    quad |= (s_rev == p_rev ? SAME : OPPO);

    // count the spacing
    secondary_motif->spacings[quad].bins[(int)(gap / bin_size)] += 1;
    secondary_motif->total_spacings += 1;
  }
}
Beispiel #4
0
/***********************************************************************
 * Flatten a tree of motifs into a newly allocated array of motifs.
 *
 *  motif_tree   tree whose values are MOTIF_T pointers.
 *  motif_array  receives the array allocated here with mm_malloc; each
 *               element is filled in by copy_motif in tree iteration order.
 *  num          receives the number of motifs in the array.
 *
 * Exists for backwards compatibility with older code that expects an
 * array and a count.
 ***********************************************************************/
void motif_tree_to_array(RBTREE_T *motif_tree, MOTIF_T **motif_array, int *num) {
  const int total = rbtree_size(motif_tree);
  MOTIF_T *copies = mm_malloc(sizeof(MOTIF_T) * total);
  MOTIF_T *slot = copies;
  RBNODE_T *cur = rbtree_first(motif_tree);

  while (cur != NULL) {
    copy_motif((MOTIF_T*)rbtree_value(cur), slot);
    slot++;
    cur = rbtree_next(cur);
  }

  *motif_array = copies;
  *num = total;
}
/*****************************************************************************
 * MEME > training_set > /alphabet
 * Read in the number of symbols in the alphabet and if it is nucleotide or 
 * amino-acid (RNA is apparently classed as nucleotide).
 ****************************************************************************/
void mxml_end_alphabet(void *ctx) {
  PARMSG_T *message;
  CTX_T *data;
  RBNODE_T *node;
  char *id, symbol;
  bool *exists;
  int i;

  data = (CTX_T*)ctx;
  if (data->alph == NULL) { // Custom alphabet
    alph_reader_done(data->alph_rdr);
    // report any errors that the alphabet reader found
    while (alph_reader_has_message(data->alph_rdr)) {
      message = alph_reader_next_message(data->alph_rdr);
      if (message->severity == SEVERITY_ERROR) {
        local_error(data, "Alphabet error: %s.\n", message->message);
      } else {
        local_warning(data, "Alphabet warning: %s.\n", message->message);
      }
      parmsg_destroy(message);
    }
    // try to get an alphabet
    data->alph = alph_reader_alphabet(data->alph_rdr);
    alph_reader_destroy(data->alph_rdr);
    data->alph_rdr = NULL;
  } else { // legacy alphabet
    exists = mm_malloc(sizeof(bool) * alph_size_core(data->alph));
    // set list to false
    for (i = 0; i < alph_size_core(data->alph); i++) exists[i] = false;
    // check that id's were defined for all the core alphabet symbols
    for (node = rbtree_first(data->letter_lookup); node != NULL; node = rbtree_next(node)) {
      id = (char*)rbtree_key(node);
      symbol = ((char*)rbtree_value(node))[0];
      if (exists[alph_indexc(data->alph, symbol)]) {
        // duplicate!
        local_error(data, "The letter identifier %s is not the first to refer to symbol %c.\n", id, symbol);
      }
      exists[alph_indexc(data->alph, symbol)] = true;
    }
    // now check for missing identifiers
    for (i = 0; i < alph_size_core(data->alph); i++) {
      if (!exists[i]) {
        // missing id for symbol
        local_error(data, "The symbol %c does not have an assigned identifier.\n", alph_char(data->alph, i));
      }
    }
    free(exists);
  }
}
Beispiel #6
0
/**************************************************************************
 * CISML callback for the opening scanned_sequence tag of the primary motif.
 *
 * Outside of a motif of interest, or when the sequence is not in the
 * loader's sequence tree, the current sequence is cleared. Otherwise the
 * sequence becomes the current sequence and the running score and hit
 * count are reset. Meeting a sequence that already has a primary match
 * recorded is a fatal error.
 *
 * Only ctx and name are read; the remaining parameters are unused.
 * NOTE(review): the lookup key here is the name attribute - confirm this
 * matches how loader->sequences is keyed.
 **************************************************************************/
void sequence_primary(void *ctx, char *accession, char *name, char *db, char *lsId, double *score, double *pvalue, long *length) {
  PRIMARY_LOADER_T *loader = (PRIMARY_LOADER_T*)ctx;
  RBNODE_T *found;

  // not inside a motif we care about
  if (!(loader->in_motif)) {
    loader->current_sequence = NULL;
    return;
  }

  found = rbtree_lookup(loader->sequences, name, FALSE, NULL);
  // not a sequence we are scoring
  if (!found) {
    loader->current_sequence = NULL;
    return;
  }

  loader->current_sequence = rbtree_value(found);
  if (loader->current_sequence->primary_match) {
    die("Already seen this sequence! We can't process this information "
        "because the scoring information from the previous sighting has already been discarded.\n");
  }
  // start afresh for this sequence
  loader->current_score = 0;
  loader->hit_count = 0;
}
Beispiel #7
0
/**************************************************************************
 * Calculate the total number of pvalue calculations that will be done
 * by the program. This number is used to correct the pvalues for multiple
 * tests using a Bonferroni correction.
 *
 * For each secondary motif the number of possible spacings in one quadrant
 * is (margin - trimmed motif length + 1). These are grouped into bins of
 * size bin, counting a partially filled last bin as a bin. At most
 * test_max bins are tested per quadrant and there are 4 quadrants.
 *
 * Returns the sum of the tested bins over all the secondary motifs.
 **************************************************************************/
int calculate_test_count(int margin, int bin, int test_max, RBTREE_T *secondary_motifs) {
  int tests = 0;
  RBNODE_T *cur = rbtree_first(secondary_motifs);

  while (cur != NULL) {
    SECONDARY_MOTIF_T *candidate = (SECONDARY_MOTIF_T*)rbtree_value(cur);
    // the number of possible values for spacings in one quadrant
    int spacings = margin - get_motif_trimmed_length(candidate->motif) + 1;
    // the number of bins in one quadrant, a leftover partial bin counts
    int bins = spacings / bin;
    if (spacings % bin) bins += 1;
    // no more than test_max bins are tested in each quadrant
    if (bins > test_max) bins = test_max;
    tests += bins * 4;
    cur = rbtree_next(cur);
  }
  return tests;
}
Beispiel #8
0
/**************************************************************************
 * Compute the list of sequence ids for the most significant spacing.
 *
 * Does nothing when the secondary motif has no significant spacings
 * (sig_count == 0). Otherwise every sequence with a non-zero entry in
 * matches (indexed by the sequence's index) is decoded into a quadrant and
 * a distance, and the index of each sequence whose quadrant and bin equal
 * those of the first entry of secondary_motif->sigs is appended to
 * secondary_motif->seqs (growing it with mm_realloc and incrementing
 * seq_count).
 *
 * The entries of matches are signed positions: the sign gives the strand
 * of the secondary match and the magnitude its position counting from one.
 *
 * The quadrant and distance are calculated exactly as in bin_matches, which
 * fills the bins the significant spacings refer to:
 *  - the distance counts from zero on both sides of the primary, and
 *  - the side is relative to the orientation of the primary, so it is
 *    swapped when the primary match is on the reverse strand.
 **************************************************************************/
void compute_idset(int margin, int bin_size, RBTREE_T *sequences, MOTIF_T *primary_motif, SECONDARY_MOTIF_T *secondary_motif, int *matches) {
  int primary_len, secondary_len, secondary, secondary_pos, primary_rc, secondary_rc, quad, distance;
  RBNODE_T *node;
  SEQUENCE_T *sequence;

  if (secondary_motif->sig_count == 0) return;

  primary_len = get_motif_trimmed_length(primary_motif);
  secondary_len = get_motif_trimmed_length(secondary_motif->motif);

  // for each sequence
  for (node = rbtree_first(sequences); node != NULL; node = rbtree_next(node)) {
    sequence = (SEQUENCE_T*)rbtree_value(node);
    secondary = matches[sequence->index];
    // check for a match
    if (!secondary) continue;
    // convert the encoded form into easier to use form
    primary_rc = sequence->primary_match < 0;
    secondary_rc = secondary < 0;
    secondary_pos = (secondary_rc ? -secondary : secondary);
    // calculate the distance and side
    // note that distance can be zero meaning the primary is next to the secondary
    if (secondary_pos <= margin) {
      distance = margin - secondary_pos - secondary_len + 1;
      // rotate the reference direction for a reverse strand primary
      quad = (primary_rc ? RIGHT : LEFT);
    } else {
      // The primary occupies positions margin+1 to margin+primary_len so a
      // secondary starting right after it must get a distance of zero.
      distance = secondary_pos - margin - primary_len - 1;
      // rotate the reference direction for a reverse strand primary
      quad = (primary_rc ? LEFT : RIGHT);
    }
    // calculate the strand
    if (secondary_rc == primary_rc) {
      quad |= SAME;
    } else {
      quad |= OPPO;
    }
    // add the sequence id to the set if the bin matches
    if (quad == secondary_motif->sigs->quad && (distance / bin_size) == secondary_motif->sigs->bin) {
      secondary_motif->seq_count += 1;
      secondary_motif->seqs = (int*)mm_realloc(secondary_motif->seqs, sizeof(int) * secondary_motif->seq_count);
      secondary_motif->seqs[secondary_motif->seq_count-1] = sequence->index;
    }
  }
}
Beispiel #9
0
/**************************************************************************
 * Callback invoked when matching an opening scanned_sequence tag for a
 * CISML file of a secondary motif database. It calcualtes and caches the
 * left and right bounds of the primary motif and stores the current 
 * sequence.
 **************************************************************************/
void sequence_secondary(void *ctx, char *accession, char *name, char *db, char *lsId, double *score, double *pvalue, long *length) {
  SECONDARY_LOADER_T *loader = (SECONDARY_LOADER_T*)ctx;
  RBNODE_T *node;
  int pmatch;
  if (loader->secondary_motif == NULL) return;
  node = rbtree_lookup(loader->sequences, accession, FALSE, NULL);
  if (node != NULL) {
    loader->current_sequence = (SEQUENCE_T*)rbtree_value(node);
    pmatch = loader->current_sequence->primary_match;
    loader->primary_lpos = (pmatch < 0 ? -pmatch : pmatch);
    loader->primary_rpos = loader->primary_lpos + get_motif_length(loader->primary_motif) - 1;
    if (loader->secondary_matches[loader->current_sequence->index] != 0) {
      die("Already seen this sequence!");
    }
    loader->secondary_score = 0;
    loader->hit_count = 0;
  } else {
    loader->current_sequence = NULL;
  }
}
/**************************************************************************
 * Dump the secondary motif matches of every sequence, visiting the
 * sequences in the iteration order of the sequences tree (the original
 * header describes this as sorted by the name of the sequence).
 *
 * Parameters:
 *   out             file the lines are written to.
 *   margin          number of positions on either side of the primary match.
 *   bin             size of a spacing bin, used to pick the bin p-value.
 *   sigthresh       p-value threshold used when sig_only is set.
 *   sig_only        when set, matches with a bin p-value above sigthresh
 *                   are skipped.
 *   sequences       tree whose values are the sequences.
 *   primary_motif   the primary motif.
 *   secondary_motif the secondary motif and its spacing statistics.
 *   matches         indexed by the index of the sequence; each non-NULL
 *                   entry lists the secondary matches of that sequence as
 *                   signed positions (negative means reverse strand, the
 *                   magnitude is the position counting from one).
 *
 * Outputs Columns:
 *   1) Trimmed lowercase sequence with uppercase matches.
 *   2) Position of the secondary match within the whole sequence.
 *   3) Sequence fragment that the primary matched.
 *   4) Strand of the primary match (+|-)
 *   5) Sequence fragment that the secondary matched.
 *   6) Strand of the secondary match (+|-)
 *   7) Is the primary match on the same strand as the secondary (s|o)
 *   8) Is the secondary match downstream or upstream (d|u)
 *   9) The gap between the primary and secondary matches
 *  10) The name of the sequence
 *  11) The p-value of the bin containing the match (adjusted for # of bins)
 *  ---if the FASTA input file sequence names are in Genome Browser format:
 *  12-14) Position of primary match in BED coordinates
 *  15) Position of primary match in Genome Browser coordinates
 *  16-18) Position of secondary match in BED coordinates
 *  19) Position of secondary match in Genome Browser coordinates
 *
 * The case changes of column 1 are only applied when the alphabet is case
 * insensitive.
 *
 * If you wish to sort based on the gap column:
 * Sort individual output:
 *  sort -n -k 9,9 -o seqs_primary_secondary.txt seqs_primary_secondary.txt
 * Or sort all outputs:
 *  for f in seqs_*.txt; do sort -n -k 9,9 -o $f $f; done
 * Or to get just locations of primary motif in BED coordinates
 * where the secondary is on the opposite strand, upstream with a gap of 118bp:
 *   awk '$7=="o" && $8=="u" && $9==118 {print $12"\t"$13"\t"$14;}' seqs_primary_secondary.txt 
 *
 **************************************************************************/
static void dump_sequence_matches(FILE *out, int margin, int bin, 
    double sigthresh, BOOLEAN_T sig_only, RBTREE_T *sequences,
    MOTIF_T *primary_motif, SECONDARY_MOTIF_T *secondary_motif,
    ARRAY_T **matches) {
  RBNODE_T *node;
  SEQUENCE_T *sequence;
  int idx, seqlen, i, j, start, end, secondary, secondary_pos, primary_len, secondary_len, distance;
  BOOLEAN_T primary_rc, secondary_rc, downstream; 
  char *buffer, *seq, *primary_match, *secondary_match;
  ARRAY_T *secondary_array;
  ALPH_T *alph;
  // get the alphabet
  alph = get_motif_alph(primary_motif);
  // allocate a buffer for copying the trimmed sequence into and modify it;
  // the trimmed sequence is the primary match with a margin on either side
  seqlen = margin * 2 + get_motif_trimmed_length(primary_motif);
  buffer = (char*)mm_malloc(sizeof(char) * (seqlen + 1));
  // get the lengths of the motifs
  primary_len = get_motif_trimmed_length(primary_motif);
  secondary_len = get_motif_trimmed_length(secondary_motif->motif); 
  // allocate some strings for storing the matches
  primary_match = (char*)mm_malloc(sizeof(char) * (primary_len + 1));
  secondary_match = (char*)mm_malloc(sizeof(char) * (secondary_len + 1));
  // add null byte at the end of the match strings
  primary_match[primary_len] = '\0';
  secondary_match[secondary_len] = '\0';

  // iterate over all the sequences
  for (node = rbtree_first(sequences); node != NULL; node = rbtree_next(node)) {
    sequence = (SEQUENCE_T*)rbtree_value(node);
    // only the first primary match is considered, its sign gives the strand
    primary_rc = get_array_item(0, sequence->primary_matches) < 0;

    // skip sequences that have no list of secondary matches
    secondary_array = matches[sequence->index];
    if (! secondary_array) continue;
    int n_secondary_matches = get_array_length(secondary_array);
    for (idx=0; idx<n_secondary_matches; idx++) {
      // decode the signed position into a strand and a position
      secondary = get_array_item(idx, secondary_array);
      secondary_rc = secondary < 0;
      secondary_pos = abs(secondary);

      // calculate the distance (zero when the matches are adjacent) and if
      // the secondary is downstream relative to the strand of the primary
      if (secondary_pos <= margin) {
        distance = margin - secondary_pos - secondary_len + 1;
        downstream = primary_rc;
      } else {
        distance = secondary_pos - margin - primary_len - 1;
        downstream = !primary_rc;
      }

      // copy the trimmed sequence; this is redone for every match because
      // the uppercasing below modifies the buffer
      seq = sequence->data;
      for (i = 0; i < seqlen; ++i) {
        buffer[i] = (alph_is_case_insensitive(alph) ? tolower(seq[i]) : seq[i]);
      }
      buffer[seqlen] = '\0';

      // uppercase primary, which starts right after the left margin
      start = margin;
      end = margin + primary_len;
      for (i = start, j = 0; i < end; ++i, ++j) {
        buffer[i] = (alph_is_case_insensitive(alph) ? toupper(buffer[i]) : buffer[i]);
        primary_match[j] = buffer[i];
      }

      // uppercase secondary
      // note origin was one, subtract 1 to make origin zero as required for arrays
      // NOTE(review): nothing here checks that the match lies within the
      // buffer - presumably guaranteed by the caller, confirm
      start = secondary_pos -1;
      end = start + secondary_len;
      for (i = start, j = 0; i < end; ++i, ++j) {
        buffer[i] = (alph_is_case_insensitive(alph) ? toupper(buffer[i]) : buffer[i]);
        secondary_match[j] = buffer[i];
      }

      // get the p-value of the secondary match from the bin of its quadrant,
      // downstream selects the RIGHT quadrants and upstream the LEFT ones
      SPACING_T *spacings;
      if (secondary_rc == primary_rc) {
        spacings = downstream ? secondary_motif->spacings+(SAME+RIGHT) : secondary_motif->spacings+(SAME+LEFT); 
      } else {
        spacings = downstream ? secondary_motif->spacings+(OPPO+RIGHT) : secondary_motif->spacings+(OPPO+LEFT); 
      }
      double p_value = spacings->pvalue[distance/bin];

      // skip match if not significant and only reporting significant matches
      if (sig_only && (p_value > sigthresh)) continue;

      // output columns 1 to 11; the line is ended further down
      fprintf(out, "%s    %3d    %s    %s    %s    %s    %s    %s    %3d    %s    %.1e", 
          buffer, 
          secondary_pos, 
          primary_match, 
          (primary_rc ? "-" : "+"), 
          secondary_match, 
          (secondary_rc ? "-" : "+"), 
          (secondary_rc == primary_rc ? "s" : "o"),
          (downstream ? "d" : "u"), 
          distance, 
          sequence->name,
          p_value
      );

      // Parse the sequence name to see if we can get genomic coordinates
      // and print additional columns with primary and secondary matches
      // in both BED and Genome Browser coordinates.
      // NOTE(review): chr_name is printed with %s although the helper also
      // reports its length - confirm it is null terminated and who owns it.
      char *chr_name;
      size_t chr_name_len;
      int start_pos, end_pos;
      if (parse_genomic_coordinates_helper(
          sequence->name,
          &chr_name,
          &chr_name_len,
          &start_pos,
          &end_pos))
      {
        // Get the start and end of the primary match in 
        // 0-relative, half-open genomic coordinates.
        int p_start = start_pos + fabs(get_array_item(0, sequence->primary_matches)) - 1;
        int p_end = p_start + primary_len;
        // Get the start and end of the secondary match in 
        // 0-relative, half-open genomic coordinates.
        // The first case is the secondary to the right of the primary in
        // sequence coordinates, the second case is it to the left.
        int s_start, s_end;
        if ( (!primary_rc && downstream) || (primary_rc && !downstream) ) {
          s_start = p_end + distance;
          s_end = s_start + secondary_len;
        } else {
          s_end = p_start - distance;
          s_start = s_end - secondary_len;
        }
        // BED columns use the 0-relative start, the Genome Browser form
        // uses start+1
        fprintf(out, "    %s    %d    %d    %s:%d-%d", 
          chr_name, p_start, p_end, chr_name, p_start+1, p_end);
        fprintf(out, "    %s    %d    %d    %s:%d-%d\n", 
          chr_name, s_start, s_end, chr_name, s_start+1, s_end);
      } else {
        fprintf(out, "\n");
      }

    } // secondary match
  } // primary match

  free(buffer);
  free(primary_match);
  free(secondary_match);
}
/*
 * Load background file frequencies into the array.
 *
 *  alph         in/out: the alphabet. When it is INVALID_ALPH on entry it
 *               is guessed from the number of distinct letters listed in
 *               the file (protein or DNA) and written back.
 *  bg_filename  name of the background file. Each line is stripped of
 *               anything following a '#' and is used when it matches
 *               BGFREQ_RE (a letter followed by a number); other lines are
 *               ignored. Unix, Windows and old Mac line endings are
 *               accepted.
 *  freqs        array to fill, or NULL to have one allocated. It must be
 *               able to hold the full alphabet (ALL_SIZE).
 *
 * Returns the filled array. The ambiguous letters are calculated from the
 * concrete ones; the frequencies are not normalized.
 *
 * Dies when the file can't be opened or read, when a frequency is outside
 * of (0, 1], when the alphabet can't be guessed, when a letter is not in
 * the alphabet or is a duplicate and when a letter is missing.
 */
ARRAY_T* get_file_frequencies(ALPH_T *alph, char *bg_filename, ARRAY_T *freqs) {
  regmatch_t matches[4];
  STR_T *line;
  char chunk[BG_CHUNK_SIZE+1], letter[2], *key;
  int size, terminate, offset, i;
  FILE *fp;
  regex_t bgfreq;
  double freq;
  RBTREE_T *letters;
  RBNODE_T *node;
  
  regcomp_or_die("bg freq", &bgfreq, BGFREQ_RE, REG_EXTENDED);
  letters = rbtree_create(rbtree_strcasecmp, rbtree_strcpy, free, rbtree_dblcpy, free);
  line = str_create(100);
  if (!(fp = fopen(bg_filename, "r"))) {
    die("Unable to open background file \"%s\" for reading.\n", bg_filename);
  }
  
  terminate = feof(fp);
  while (!terminate) {
    size = fread(chunk, sizeof(char), BG_CHUNK_SIZE, fp);
    // A read error never sets the end-of-file indicator so without this
    // check the loop would spin forever (e.g. when given a directory).
    if (ferror(fp)) {
      die("Error reading background file \"%s\".\n", bg_filename);
    }
    chunk[size] = '\0';
    terminate = feof(fp);
    offset = 0;
    // The second condition flushes a final line that has no new line when
    // the file ended exactly on a chunk boundary: the last read then returns
    // nothing but the line collected from the previous chunk is pending.
    while (offset < size || (terminate && str_len(line) > 0)) {
      // skip mac newline
      if (str_len(line) == 0 && chunk[offset] == '\r') {
        offset++;
        continue;
      }
      // find next new line
      for (i = offset; i < size; ++i) {
        if (chunk[i] == '\n') break;
      }
      // append portion up to the new line or end of chunk
      str_append(line, chunk+offset, i - offset);
      // read more if we didn't find a new line
      if (i == size && !terminate) break;
      // move the offset past the new line
      offset = i + 1;
      // handle windows new line
      if (str_char(line, -1) == '\r') str_truncate(line, -1);
      // remove everything to the right of a comment character
      for (i = 0; i < str_len(line); ++i) {
        if (str_char(line, i) == '#') {
          str_truncate(line, i);
          break;
        }
      }
      // check the line for a single letter followed by a number
      if (regexec_or_die("bg freq", &bgfreq, str_internal(line), 4, matches, 0)) {
        // parse the letter and frequency value
        regex_strncpy(matches+1, str_internal(line), letter, 2);
        freq = regex_dbl(matches+2, str_internal(line));
        // check the frequency is acceptable
        if (freq < 0 || freq > 1) {
          die("The background file lists the illegal probability %g for "
            "the letter %s.\n", freq, letter);
        } else if (freq == 0) {
          die("The background file lists a probability of zero for the "
            "letter %s\n", letter);
        }
        if (freq >= 0 && freq <= 1) rbtree_put(letters, letter, &freq);
      }
      str_clear(line);
    }
  }
  // finished with the file so clean up file parsing stuff
  fclose(fp);
  str_destroy(line, FALSE);
  regfree(&bgfreq);
  // guess the alphabet
  if (*alph == INVALID_ALPH) {
    switch (rbtree_size(letters)) {
      case PROTEIN_ASIZE:
        *alph = PROTEIN_ALPH;
        break;
      case DNA_ASIZE:
        *alph = DNA_ALPH;
        break;
      default:
        die("Number of single character entries in background does not match "
            "an alphabet.\n");
    }
  }
  // make the background; -1 marks a letter that has not been set
  if (freqs == NULL) freqs = allocate_array(alph_size(*alph, ALL_SIZE));
  assert(get_array_length(freqs) >= alph_size(*alph, ALL_SIZE));
  init_array(-1, freqs);
  for (node = rbtree_first(letters); node != NULL; node = rbtree_next(node)) {
    key = (char*)rbtree_key(node);
    i = alph_index(*alph, key[0]);
    freq = *((double*)rbtree_value(node));
    if (i == -1) {
      die("Background contains letter %s which is not in the %s alphabet.\n", 
          key, alph_name(*alph));
    }
    if (get_array_item(i, freqs) != -1) {
      die("Background contains letter %s which has the same meaning as an "
          "already listed letter.\n", key);
    }
    set_array_item(i, freq, freqs);
  }
  // check that all items were set
  for (i = 0; i < alph_size(*alph, ALPH_SIZE); i++) {
    if (get_array_item(i, freqs) == -1) {
      die("Background is missing letter %c.\n", alph_char(*alph, i));
    }
  }
  // normalization is disabled for backwards compatibility (AMA test was failing)
  // calculate the values of the ambiguous letters from the concrete ones
  calc_ambigs(*alph, FALSE, freqs);
  // cleanup
  rbtree_destroy(letters);
  // return result
  return freqs;
}
Beispiel #12
0
/*************************************************************************
 * Build a linear HMM.
 *
 *  background     background distribution; copied into the model and used
 *                 as the emissions of the spacer states.
 *  order_spacing  the motif occurrences, in order, and the spacer lengths
 *                 around them.
 *  spacer_states  number of states used for each spacer.
 *  motifs         tree of motifs, looked up with the motif key stored in
 *                 order_spacing.
 *  fim            when set the spacers are converted to FIMs.
 *  the_hmm        receives the newly allocated model.
 *
 * The states are laid out as: a start state, the first spacer, then for
 * each motif occurrence the states of the motif followed by a spacer, and
 * finally an end state. The alphabet is taken from the first motif of the
 * tree.
 *
 * NOTE(review): the motif lookups are not checked for NULL and the tree is
 * assumed not to be empty - confirm the callers guarantee this.
 *************************************************************************/
void build_linear_hmm
  (ARRAY_T*  background,
   ORDER_T*  order_spacing,
   int       spacer_states, 
   RBTREE_T* motifs, // motifs with key as in order_spacing
   BOOLEAN_T fim,
   MHMM_T**  the_hmm)
{
  ALPH_T    alph;
  int       model_length; // Total number of states in the model.
  int       i_state;      // Index of the current state.
  int       i_order;      // Index within the order and spacing.
  int       i_position;   // Index within the current motif or spacer.
  int       motif_i;      // motif key in order spacing
  MOTIF_T  *motif;        // motif
  RBNODE_T *node;

  // the alphabet of the model is that of the first motif in the tree
  alph = get_motif_alph((MOTIF_T*)rbtree_value(rbtree_first(motifs)));

  // Calculate the total length of the model:
  // 2 + the length of each motif occurrence + the states of each spacer
  model_length = 2; // start and end state
  for (i_order = 0; i_order < get_order_occurs(order_spacing); i_order++) {
    motif_i = get_order_motif(order_spacing, i_order);
    motif = (MOTIF_T*)rbtree_get(motifs, &motif_i);
    model_length += get_motif_length(motif);
  }
  // there is a spacer before the first motif and one after every motif
  model_length += (get_order_occurs(order_spacing) + 1) * spacer_states;


  // Allocate the model.
  // (the repeated check_sq_matrix calls from here on look like sanity
  // checks of the transition matrix - confirm against its definition)
  *the_hmm = allocate_mhmm(alph, model_length);
  check_sq_matrix((*the_hmm)->trans, model_length);

  // Record that this is a linear model.
  (*the_hmm)->type = LINEAR_HMM;

  // Record the number of motifs in the model. 
  // This is the number of occurrences, not the number of distinct motifs.
  (*the_hmm)->num_motifs = get_order_occurs(order_spacing);

  // Record the number of states in the model.
  (*the_hmm)->num_states = model_length;
  (*the_hmm)->num_spacers = get_order_occurs(order_spacing) + 1;
  (*the_hmm)->spacer_states = spacer_states;

  // Put the background distribution into the model.
  copy_array(background, (*the_hmm)->background);

  // Begin the model with a non-emitting state.
  i_state = 0;
  check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states);
  build_linear_state(
      alph,
      START_STATE,
      i_state,
      get_spacer_length(order_spacing, 0),
      NULL, // Emissions.
      0, // Number of sites.
      NON_MOTIF_INDEX,
      NON_MOTIF_POSITION, // position within state (not relevant to start state)
      NULL, // no motif
      &((*the_hmm)->states[i_state]));
  ++i_state;

  // Build the first spacer.
  for (i_position = 0; i_position < spacer_states; i_position++, i_state++) {
    check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states);
    build_linear_state(
        alph,
        SPACER_STATE,
        i_state, 
        get_spacer_length(order_spacing, 0),
        background, 
        SPACER_NUMSITES,
        NON_MOTIF_INDEX,
        i_position, // position within spacer
        NULL, // no motif
        &((*the_hmm)->states[i_state]));
  }

  // Build each motif and subsequent spacer.
  for (i_order = 0; i_order < get_order_occurs(order_spacing); i_order++) {
    STATE_T state;
    int spacer_len;
    motif_i = get_order_motif(order_spacing, i_order);
    motif = (MOTIF_T*)rbtree_get(motifs, &motif_i);

    // Build the motif, one state for each position.
    for (i_position = 0; i_position < get_motif_length(motif); i_position++, i_state++) {
      // The first and last position of the motif are given the length of
      // the spacer before and after the motif respectively. Note that the
      // first test wins for a motif of length one.
      if (i_position == 0) {
        state = START_MOTIF_STATE;
        spacer_len = get_spacer_length(order_spacing, i_order);
      } else if (i_position == (get_motif_length(motif) - 1)) {
        state = END_MOTIF_STATE;
        spacer_len = get_spacer_length(order_spacing, i_order+1);
      } else {
        state = MID_MOTIF_STATE;
        spacer_len = 0;
      }
      check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states);
      build_linear_state(
          alph, 
          state, 
          i_state, 
          spacer_len, // Expected spacer length.
          get_matrix_row(i_position, get_motif_freqs(motif)),
          get_motif_nsites(motif),
          i_order,
          i_position, // position within motif
          motif,
          &((*the_hmm)->states[i_state]));
    }

    // Build the following spacer.
    for (i_position = 0; i_position < spacer_states; i_position++, i_state++) {
      check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states);
      build_linear_state(
          alph, 
          SPACER_STATE, 
          i_state, 
          get_spacer_length(order_spacing, i_order+1),
          background,
          SPACER_NUMSITES,
          NON_MOTIF_INDEX, 
          i_position, // position within spacer
          NULL, // no motif
          &((*the_hmm)->states[i_state]));
    }
  }

  check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states);
  // Finish up the model with a non-emitting end state.
  // After the loop i_order equals the number of occurrences so this is the
  // length of the last spacer.
  build_linear_state(
      alph, 
      END_STATE, 
      i_state, 
      get_spacer_length(order_spacing, i_order),
      NULL, // Emissions.
      0, // Number of sites.
      NON_MOTIF_INDEX,
      NON_MOTIF_POSITION, // position within state (not relevant to end state)
      NULL, // no motif
      &((*the_hmm)->states[i_state]));
  ++i_state;
  // every state counted in model_length must have been built
  assert(i_state == model_length);

  check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states);
  // Convert spacers to FIMs if requested.
  if (fim) {
    convert_to_fims(*the_hmm);
  }

  // Fill in the transition matrix.
  build_transition_matrix(*the_hmm);
}