Exemple #1
0
/**************************************************************************
 * Tallies every secondary motif match into its spacing bin.
 *
 * Each non-zero entry of matches (indexed by sequence->index) encodes one
 * secondary match: the sign gives the strand (negative = reverse complement)
 * and the magnitude gives the position. From that the gap to the primary
 * match (counted from zero), the side and the relative strand are derived;
 * side and strand pick one of the four spacing tables and the gap divided
 * by bin_size picks the bin that is incremented.
 *
 * Dies if a gap lies outside the range [0, margin - secondary length].
 **************************************************************************/
void bin_matches(int margin, int bin_size, RBTREE_T *sequences, MOTIF_T *primary_motif, SECONDARY_MOTIF_T *secondary_motif, int *matches) {
  const int primary_len = get_motif_trimmed_length(primary_motif);
  const int secondary_len = get_motif_trimmed_length(secondary_motif->motif);
  // largest gap that still fits in the margin (gaps count from zero)
  const int gap_limit = margin - secondary_len;
  RBNODE_T *cursor;

  for (cursor = rbtree_first(sequences); cursor != NULL; cursor = rbtree_next(cursor)) {
    SEQUENCE_T *seq = (SEQUENCE_T*)rbtree_value(cursor);
    const int encoded = matches[seq->index];
    int primary_is_rc, secondary_is_rc, position, on_left, gap, table;

    // zero means this sequence has no secondary match
    if (encoded == 0) continue;

    // unpack the signed encoding
    primary_is_rc = seq->primary_match < 0;
    secondary_is_rc = encoded < 0;
    position = (secondary_is_rc ? -encoded : encoded);

    // gap between the two matches, measured on whichever side the secondary lies
    on_left = (position <= margin);
    gap = (on_left
        ? margin - position - secondary_len + 1
        : position - margin - primary_len - 1);

    // the side is mirrored when the primary is on the reverse strand
    table = (on_left != primary_is_rc ? LEFT : RIGHT);

    // earlier filtering should make an out of range gap impossible
    if (gap < 0 || gap > gap_limit) {
      die("Secondary motif match not within margin as it should be due to prior checks!");
    }

    // fold in the relative strand
    table |= (secondary_is_rc == primary_is_rc ? SAME : OPPO);

    // record the observation
    (secondary_motif->spacings + table)->bins[gap / bin_size] += 1;
    secondary_motif->total_spacings += 1;
  }
}
Exemple #2
0
/**************************************************************************
 * Compute the p-values for the frequencies of each spacing bin.
 *
 * For each of the four quadrants every full bin is evaluated with the prior
 * probability of a bin holding bin_size spacings, and the trailing partial
 * bin (if any) with the prior probability of the spacings left over.
 * Afterwards the significant spacings in smotif->sigs are sorted with
 * compare_sigs and smotif->passes_evalue_cutoff is set from
 * min_pvalue * n_secondary_motifs <= motif_evalue_cutoff.
 *
 * margin               the margin on either side of the primary motif
 * bin_size             the number of spacings grouped into one bin
 * n_secondary_motifs   multiplier applied to the minimum p-value
 * test_max             upper limit on tested bins, forwarded to
 *                      compute_spacing_pvalue
 * threshold            forwarded to compute_spacing_pvalue
 * motif_evalue_cutoff  cutoff the scaled minimum p-value is compared with
 * smotif               the secondary motif whose spacings are evaluated
 **************************************************************************/
void compute_spacing_pvalues(int margin, int bin_size, int n_secondary_motifs, int test_max, 
  double threshold, double motif_evalue_cutoff, SECONDARY_MOTIF_T *smotif) {
  int quad_opt_count, quad_bin_count, quad_leftover, total_opt_count, i, j;
  double general_prob, leftover_prob;
  //the number of possible values for spacings in one quadrant
  quad_opt_count = margin - get_motif_trimmed_length(smotif->motif) + 1;
  //the number of bins in one quadrant (excluding a possible leftover bin)
  quad_bin_count = (int)(quad_opt_count / bin_size);
  //the number of spacings that don't fit in the full bins (the number that would go into the leftover bin)
  quad_leftover = quad_opt_count % bin_size;
  //the total number of possible values for spacings
  total_opt_count = quad_opt_count * 4;
  //prior probability of a bin that has bin_size possible spacings that could go into it
  general_prob = (double)bin_size / total_opt_count;
  //prior probability of the final bin that has less than bin_size possible spacings that could go into it
  leftover_prob = (double)quad_leftover / total_opt_count;
  //calculate the number of independent tests
  int independent_tests = 4 * (min(quad_bin_count,test_max) + (quad_leftover == 0 ? 0 : 1));
  //calculate the significance of each bin
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < quad_bin_count; ++j) {
      compute_spacing_pvalue(independent_tests, threshold, i, j, test_max, general_prob, smotif);
    }
    //j now indexes the bin after the last full bin, which is the leftover bin
    if (quad_leftover) { //bin only exists if quad_leftover is non-zero
      compute_spacing_pvalue(independent_tests, threshold, i, j, test_max, leftover_prob, smotif);
    }
  }
  //sort the significant finds; skipped when there are none because the sigs
  //pointer may still be NULL and qsort requires a valid base pointer even
  //when asked to sort zero elements
  if (smotif->sig_count > 0) {
    qsort(smotif->sigs, smotif->sig_count, sizeof(SIGSPACE_T), compare_sigs);
  }
  smotif->passes_evalue_cutoff = ((smotif->min_pvalue * n_secondary_motifs) <= motif_evalue_cutoff);
}
Exemple #3
0
/**************************************************************************
 * Compute the list of sequence ids for the most significant spacing.
 *
 * Walks every sequence that has a secondary match, derives the quadrant and
 * the gap of that match with the same rules bin_matches uses to fill the
 * spacing tables, and appends sequence->index to secondary_motif->seqs when
 * the match lands in the quadrant and bin stored in the first entry of
 * secondary_motif->sigs. Returns immediately when sig_count is zero.
 *
 * margin           the margin on either side of the primary motif
 * bin_size         the number of spacings grouped into one bin
 * sequences        tree whose values are SEQUENCE_T pointers
 * primary_motif    the primary motif (only its trimmed length is used)
 * secondary_motif  the secondary motif; seqs and seq_count are updated
 * matches          encoded secondary match per sequence index; zero means
 *                  no match, a negative value means reverse complement
 **************************************************************************/
void compute_idset(int margin, int bin_size, RBTREE_T *sequences, MOTIF_T *primary_motif, SECONDARY_MOTIF_T *secondary_motif, int *matches) {
  int primary_len, secondary_len, secondary, secondary_pos, primary_rc, secondary_rc, quad, distance;
  RBNODE_T *node;
  SEQUENCE_T *sequence;

  // nothing to collect when no spacing was significant
  if (secondary_motif->sig_count == 0) return;

  primary_len = get_motif_trimmed_length(primary_motif);
  secondary_len = get_motif_trimmed_length(secondary_motif->motif);

  // for each sequence
  for (node = rbtree_first(sequences); node != NULL; node = rbtree_next(node)) {
    sequence = (SEQUENCE_T*)rbtree_value(node);
    secondary = matches[sequence->index];
    // check for a match
    if (!secondary) continue;
    // convert the encoded form into easier to use form
    primary_rc = sequence->primary_match < 0;
    secondary_rc = secondary < 0;
    secondary_pos = (secondary_rc ? -secondary : secondary);
    // calculate the distance (counts from zero) and side
    // note that distance can be zero meaning the primary is next to the secondary
    if (secondary_pos <= margin) {
      distance = margin - secondary_pos - secondary_len + 1;
      // rotate reference direction when the primary is reverse complemented
      quad = (primary_rc ? RIGHT : LEFT);
    } else {
      // positions count from one, so subtract one to make an adjacent
      // secondary have distance zero (keeps the bin in step with bin_matches)
      distance = secondary_pos - margin - primary_len - 1;
      // rotate reference direction when the primary is reverse complemented
      quad = (primary_rc ? LEFT : RIGHT);
    }
    // calculate the strand
    if (secondary_rc == primary_rc) {
      quad |= SAME;
    } else {
      quad |= OPPO;
    }
    // add the sequence id to the set if the bin matches
    if (quad == secondary_motif->sigs->quad && (distance / bin_size) == secondary_motif->sigs->bin) {
      secondary_motif->seq_count += 1;
      secondary_motif->seqs = (int*)mm_realloc(secondary_motif->seqs, sizeof(int) * secondary_motif->seq_count);
      secondary_motif->seqs[secondary_motif->seq_count-1] = sequence->index;
    }
  }
}
Exemple #4
0
/**************************************************************************
 * Sums, over all secondary motifs, the number of spacing bins that will
 * have a p-value calculated. Per motif this is four quadrants times the
 * smaller of test_max and the number of bins in one quadrant (a partially
 * filled trailing bin counts as a bin). The total is intended for a
 * Bonferroni style multiple testing correction.
 **************************************************************************/
int calculate_test_count(int margin, int bin, int test_max, RBTREE_T *secondary_motifs) {
  int tally = 0;
  RBNODE_T *cursor = rbtree_first(secondary_motifs);

  while (cursor != NULL) {
    SECONDARY_MOTIF_T *entry = (SECONDARY_MOTIF_T*)rbtree_value(cursor);
    // how many distinct spacings fit in one quadrant
    int spacing_options = margin - get_motif_trimmed_length(entry->motif) + 1;
    // whole bins in one quadrant, plus one more for any remainder
    int bins_per_quad = spacing_options / bin;
    if (spacing_options % bin != 0) {
      bins_per_quad += 1;
    }
    // never count more bins than will actually be tested
    if (bins_per_quad > test_max) {
      bins_per_quad = test_max;
    }
    tally += bins_per_quad * 4;
    cursor = rbtree_next(cursor);
  }
  return tally;
}
Exemple #5
0
/**************************************************************************
 * Allocates and initialises a secondary motif record for the given motif
 * and database. The four spacing tables are sized from the margin, the
 * trimmed motif length and the bin width. The significant spacing list and
 * the sequence id list start out empty (NULL with a count of zero), the
 * minimum p-value starts at 1 and the motif is flagged as not loaded.
 **************************************************************************/
SECONDARY_MOTIF_T* create_secondary_motif(int margin, int bin, 
    MOTIF_DB_T *db, MOTIF_T *motif) {
  SECONDARY_MOTIF_T *created = mm_malloc(sizeof(SECONDARY_MOTIF_T));
  int bins_needed, quad;

  // identity of the motif
  created->db = db;
  created->motif = motif;
  created->loaded = FALSE;

  // one table per quadrant, each with room for every bin of this motif
  bins_needed = (margin - get_motif_trimmed_length(motif) + 1) / bin + 1;
  for (quad = 0; quad < 4; ++quad) {
    init_spacings(created->spacings + quad, bins_needed);
  }

  // counters start empty
  created->total_spacings = 0;
  created->max_in_one_bin = 0;

  // filled in later, once the spacing tables have been populated and the
  // most significant spacings have been determined
  created->sigs = NULL;
  created->sig_count = 0;
  created->min_pvalue = 1;
  created->seqs = NULL;
  created->seq_count = 0;

  return created;
}
/**************************************************************************
 * Dump sequence matches sorted by the name of the sequence.
 *
 * Writes one line to out for every secondary match of every sequence. When
 * sig_only is set, matches whose bin p-value exceeds sigthresh are skipped.
 *
 * Outputs Columns:
 *   1) Trimmed lowercase sequence with uppercase matches.
 *   2) Position of the secondary match within the whole sequence.
 *   3) Sequence fragment that the primary matched.
 *   4) Strand of the primary match (+|-)
 *   5) Sequence fragment that the secondary matched.
 *   6) Strand of the secondary match (+|-)
 *   7) Is the primary match on the same strand as the secondary (s|o)
 *   8) Is the secondary match downstream or upstream (d|u)
 *   9) The gap between the primary and secondary matches
 *  10) The name of the sequence
 *  11) The p-value of the bin containing the match (adjusted for # of bins)
 *  ---if the FASTA input file sequence names are in Genome Browser format:
 *  12-14) Position of primary match in BED coordinates
 *  15) Position of primary match in Genome Browser coordinates
 *  16-18) Position of secondary match in BED coordinates
 *  19) Position of secondary match in Genome Browser coordinates
 *
 * If you wish to sort based on the gap column:
 * Sort individual output:
 *  sort -n -k 9,9 -o seqs_primary_secondary.txt seqs_primary_secondary.txt
 * Or sort all outputs:
 *  for f in seqs_*.txt; do sort -n -k 9,9 -o $f $f; done
 * Or to get just locations of primary motif in BED coordinates
 * where the secondary is on the opposite strand, upstream with a gap of 118bp:
 *   awk '$7=="o" && $8=="u" && $9==118 {print $12"\t"$13"\t"$14;}' seqs_primary_secondary.txt 
 *
 **************************************************************************/
static void dump_sequence_matches(FILE *out, int margin, int bin, 
    double sigthresh, BOOLEAN_T sig_only, RBTREE_T *sequences,
    MOTIF_T *primary_motif, SECONDARY_MOTIF_T *secondary_motif,
    ARRAY_T **matches) {
  RBNODE_T *node;
  SEQUENCE_T *sequence;
  int idx, seqlen, i, j, start, end, secondary, secondary_pos, primary_len, secondary_len, distance;
  int fold_case;
  BOOLEAN_T primary_rc, secondary_rc, downstream; 
  char *buffer, *seq, *primary_match, *secondary_match;
  ARRAY_T *secondary_array;
  ALPH_T *alph;
  // get the alphabet
  alph = get_motif_alph(primary_motif);
  // case is only changed for case insensitive alphabets; the alphabet does
  // not change while dumping so the check is done once
  fold_case = (alph_is_case_insensitive(alph) ? 1 : 0);
  // get the lengths of the motifs
  primary_len = get_motif_trimmed_length(primary_motif);
  secondary_len = get_motif_trimmed_length(secondary_motif->motif); 
  // allocate a buffer for copying the trimmed sequence into and modify it
  seqlen = margin * 2 + primary_len;
  buffer = (char*)mm_malloc(sizeof(char) * (seqlen + 1));
  // allocate some strings for storing the matches
  primary_match = (char*)mm_malloc(sizeof(char) * (primary_len + 1));
  secondary_match = (char*)mm_malloc(sizeof(char) * (secondary_len + 1));
  // add null byte at the end of the match strings
  primary_match[primary_len] = '\0';
  secondary_match[secondary_len] = '\0';

  // iterate over all the sequences
  for (node = rbtree_first(sequences); node != NULL; node = rbtree_next(node)) {
    sequence = (SEQUENCE_T*)rbtree_value(node);
    // a negative first primary match means the primary is reverse complemented
    primary_rc = get_array_item(0, sequence->primary_matches) < 0;

    // sequences without any secondary matches have no array
    secondary_array = matches[sequence->index];
    if (! secondary_array) continue;
    int n_secondary_matches = get_array_length(secondary_array);
    for (idx=0; idx<n_secondary_matches; idx++) {
      // sign encodes the strand, magnitude the position (origin one)
      secondary = get_array_item(idx, secondary_array);
      secondary_rc = secondary < 0;
      secondary_pos = abs(secondary);

      // calculate the distance; downstream is relative to the primary's strand
      if (secondary_pos <= margin) {
        distance = margin - secondary_pos - secondary_len + 1;
        downstream = primary_rc;
      } else {
        distance = secondary_pos - margin - primary_len - 1;
        downstream = !primary_rc;
      }

      // copy the trimmed sequence afresh so only this match is highlighted;
      // characters are cast to unsigned char as required by the ctype functions
      seq = sequence->data;
      for (i = 0; i < seqlen; ++i) {
        buffer[i] = (fold_case ? (char)tolower((unsigned char)seq[i]) : seq[i]);
      }
      buffer[seqlen] = '\0';

      // uppercase primary
      start = margin;
      end = margin + primary_len;
      for (i = start, j = 0; i < end; ++i, ++j) {
        buffer[i] = (fold_case ? (char)toupper((unsigned char)buffer[i]) : buffer[i]);
        primary_match[j] = buffer[i];
      }

      // uppercase secondary
      // note origin was one, subtract 1 to make origin zero as required for arrays
      start = secondary_pos -1;
      end = start + secondary_len;
      for (i = start, j = 0; i < end; ++i, ++j) {
        buffer[i] = (fold_case ? (char)toupper((unsigned char)buffer[i]) : buffer[i]);
        secondary_match[j] = buffer[i];
      }

      // get the p-value of the secondary match
      // NOTE(review): distance is not range checked here before it is used
      // as a bin index; presumably earlier filtering guarantees it - confirm
      SPACING_T *spacings;
      if (secondary_rc == primary_rc) {
        spacings = downstream ? secondary_motif->spacings+(SAME+RIGHT) : secondary_motif->spacings+(SAME+LEFT); 
      } else {
        spacings = downstream ? secondary_motif->spacings+(OPPO+RIGHT) : secondary_motif->spacings+(OPPO+LEFT); 
      }
      double p_value = spacings->pvalue[distance/bin];

      // skip match if not significant and only reporting significant matches
      if (sig_only && (p_value > sigthresh)) continue;

      // output line to file
      fprintf(out, "%s    %3d    %s    %s    %s    %s    %s    %s    %3d    %s    %.1e", 
          buffer, 
          secondary_pos, 
          primary_match, 
          (primary_rc ? "-" : "+"), 
          secondary_match, 
          (secondary_rc ? "-" : "+"), 
          (secondary_rc == primary_rc ? "s" : "o"),
          (downstream ? "d" : "u"), 
          distance, 
          sequence->name,
          p_value
      );

      // Parse the sequence name to see if we can get genomic coordinates
      // and print additional columns with primary and secondary matches
      // in both BED and Genome Browser coordinates.
      // NOTE(review): ownership of chr_name is not visible from here;
      // confirm whether the helper allocates it and it needs freeing.
      char *chr_name;
      size_t chr_name_len;
      int start_pos, end_pos;
      if (parse_genomic_coordinates_helper(
          sequence->name,
          &chr_name,
          &chr_name_len,
          &start_pos,
          &end_pos))
      {
        // Get the start and end of the primary match in 
        // 0-relative, half-open genomic coordinates.
        int p_start = start_pos + fabs(get_array_item(0, sequence->primary_matches)) - 1;
        int p_end = p_start + primary_len;
        // Get the start and end of the secondary match in 
        // 0-relative, half-open genomic coordinates.
        // The condition below is true exactly when the secondary lies after
        // the primary in sequence coordinates.
        int s_start, s_end;
        if ( (!primary_rc && downstream) || (primary_rc && !downstream) ) {
          s_start = p_end + distance;
          s_end = s_start + secondary_len;
        } else {
          s_end = p_start - distance;
          s_start = s_end - secondary_len;
        }
        fprintf(out, "    %s    %d    %d    %s:%d-%d", 
          chr_name, p_start, p_end, chr_name, p_start+1, p_end);
        fprintf(out, "    %s    %d    %d    %s:%d-%d\n", 
          chr_name, s_start, s_end, chr_name, s_start+1, s_end);
      } else {
        fprintf(out, "\n");
      }

    } // secondary match
  } // primary match

  free(buffer);
  free(primary_match);
  free(secondary_match);
}