/**************************************************************************
 * Tally every secondary motif match into the spacing bins of the
 * secondary motif.
 *
 * margin          - margin used in the position arithmetic below; the
 *                   left side is every position <= margin
 * bin_size        - number of consecutive distances sharing one bin
 * sequences       - tree whose values are SEQUENCE_T entries
 * primary_motif   - the primary motif (only its trimmed length is used)
 * secondary_motif - receives the counts (spacings[quad].bins[] and
 *                   total_spacings)
 * matches         - encoded secondary match per sequence, indexed by
 *                   sequence->index: 0 means no match, a negative value
 *                   means a reverse complement match, and the magnitude
 *                   is the position of the match
 *
 * Dies if a match yields a distance outside 0..(margin - secondary length).
 **************************************************************************/
void bin_matches(int margin, int bin_size, RBTREE_T *sequences,
    MOTIF_T *primary_motif, SECONDARY_MOTIF_T *secondary_motif, int *matches) {
  const int plen = get_motif_trimmed_length(primary_motif);
  const int slen = get_motif_trimmed_length(secondary_motif->motif);
  // distances count from zero, so this is the largest legal distance
  const int furthest = margin - slen;
  RBNODE_T *it;

  for (it = rbtree_first(sequences); it != NULL; it = rbtree_next(it)) {
    SEQUENCE_T *entry = (SEQUENCE_T*)rbtree_value(it);
    int hit = matches[entry->index];
    int p_rev, s_rev, pos, on_left, gap, quad;

    // zero encodes "no secondary match in this sequence"
    if (hit == 0) continue;

    // decode strand and position from the signed value
    p_rev = (entry->primary_match < 0);
    s_rev = (hit < 0);
    pos = (s_rev ? -hit : hit);

    // gap between the two motifs, counted from zero, on either side
    on_left = (pos <= margin);
    gap = (on_left ? margin - pos - slen + 1 : pos - margin - plen - 1);

    // the side is reported relative to the primary's orientation, so a
    // reverse complemented primary swaps left and right
    quad = (on_left != p_rev ? LEFT : RIGHT);

    // earlier filtering should make this impossible
    if (gap < 0 || gap > furthest) {
      die("Secondary motif match not within margin as it should be due to prior checks!");
    }

    // same or opposite strand as the primary
    quad |= (s_rev == p_rev ? SAME : OPPO);

    // record the observation
    secondary_motif->spacings[quad].bins[gap / bin_size] += 1;
    secondary_motif->total_spacings += 1;
  }
}
/**************************************************************************
 * Compute the p-values for the frequencies of each spacing of one
 * secondary motif.
 *
 * margin              - margin used to derive the number of possible
 *                       spacings in one quadrant
 * bin_size            - number of consecutive spacings sharing one bin
 * n_secondary_motifs  - multiplier applied to the motif's minimum p-value
 *                       before comparing it to motif_evalue_cutoff
 * test_max            - cap on the tested full bins per quadrant; also
 *                       forwarded to compute_spacing_pvalue()
 * threshold           - forwarded to compute_spacing_pvalue()
 * motif_evalue_cutoff - largest (min_pvalue * n_secondary_motifs) for
 *                       which passes_evalue_cutoff is set true
 * smotif              - the secondary motif; its sigs array is sorted
 *                       with compare_sigs and passes_evalue_cutoff is set
 *
 * NOTE(review): sigs, sig_count and min_pvalue are only read here;
 * presumably compute_spacing_pvalue() fills them in - confirm.
 **************************************************************************/
void compute_spacing_pvalues(int margin, int bin_size, int n_secondary_motifs,
    int test_max, double threshold, double motif_evalue_cutoff,
    SECONDARY_MOTIF_T *smotif) {
  int quad_opt_count, quad_bin_count, quad_leftover, total_opt_count,
      independent_tests, i, j;
  double general_prob, leftover_prob;

  // the number of possible values for spacings in one quadrant
  quad_opt_count = margin - get_motif_trimmed_length(smotif->motif) + 1;
  // the number of full bins in one quadrant (excluding a possible leftover bin)
  quad_bin_count = quad_opt_count / bin_size;
  // the number of spacings that don't fit in the full bins
  // (the number that would go into the leftover bin)
  quad_leftover = quad_opt_count % bin_size;
  // the total number of possible values for spacings over all 4 quadrants
  total_opt_count = quad_opt_count * 4;
  // prior probability of a bin that has bin_size possible spacings
  general_prob = (double)bin_size / total_opt_count;
  // prior probability of the final bin that has fewer than bin_size
  // possible spacings
  leftover_prob = (double)quad_leftover / total_opt_count;

  // calculate the number of independent tests
  // NOTE(review): the leftover bin is added after capping at test_max,
  // whereas calculate_test_count() caps after including the leftover bin;
  // confirm which of the two counts is intended.
  independent_tests = 4 * (min(quad_bin_count,test_max) + (quad_leftover == 0 ? 0 : 1));

  // calculate the significance of each bin
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < quad_bin_count; ++j) {
      compute_spacing_pvalue(independent_tests, threshold, i, j, test_max, general_prob, smotif);
    }
    // the leftover bin only exists if quad_leftover is non-zero; after the
    // loop j equals quad_bin_count, which is the index of that bin
    if (quad_leftover) {
      compute_spacing_pvalue(independent_tests, threshold, i, j, test_max, leftover_prob, smotif);
    }
  }

  // sort the significant finds; skipped when there are none because
  // passing a null base pointer to qsort is undefined even for zero elements
  if (smotif->sigs != NULL && smotif->sig_count > 0) {
    qsort(smotif->sigs, smotif->sig_count, sizeof(SIGSPACE_T), compare_sigs);
  }

  smotif->passes_evalue_cutoff = ((smotif->min_pvalue * n_secondary_motifs) <= motif_evalue_cutoff);
}
/**************************************************************************
 * Compute the list of ids for the most significant spacing.
 *
 * Appends sequence->index to secondary_motif->seqs (growing the array and
 * seq_count) for every sequence whose secondary match falls into the
 * quadrant and bin of the first entry of secondary_motif->sigs. Does
 * nothing when the motif has no significant spacings.
 *
 * margin          - margin used in the position arithmetic below; the
 *                   left side is every position <= margin
 * bin_size        - number of consecutive distances sharing one bin
 * sequences       - tree whose values are SEQUENCE_T entries
 * primary_motif   - the primary motif (only its trimmed length is used)
 * secondary_motif - the secondary motif whose id set is extended
 * matches         - encoded secondary match per sequence, indexed by
 *                   sequence->index: 0 means no match, a negative value
 *                   means a reverse complement match, and the magnitude
 *                   is the position of the match
 *
 * The distance and quadrant are derived exactly as in bin_matches() so
 * that the (quad, bin) pair compared here refers to the same bin that was
 * counted there.
 **************************************************************************/
void compute_idset(int margin, int bin_size, RBTREE_T *sequences,
    MOTIF_T *primary_motif, SECONDARY_MOTIF_T *secondary_motif, int *matches) {
  int primary_len, secondary_len, secondary, secondary_pos, primary_rc,
      secondary_rc, quad, distance;
  RBNODE_T *node;
  SEQUENCE_T *sequence;

  // without a significant spacing there is nothing to collect
  if (secondary_motif->sig_count == 0) return;

  primary_len = get_motif_trimmed_length(primary_motif);
  secondary_len = get_motif_trimmed_length(secondary_motif->motif);

  // for each sequence
  for (node = rbtree_first(sequences); node != NULL; node = rbtree_next(node)) {
    sequence = (SEQUENCE_T*)rbtree_value(node);
    secondary = matches[sequence->index];
    // check for a match
    if (!secondary) continue;

    // convert the encoded form into easier to use form
    primary_rc = sequence->primary_match < 0;
    secondary_rc = secondary < 0;
    secondary_pos = (secondary_rc ? -secondary : secondary);

    // calculate the distance (counts from zero, so zero means the primary
    // is directly next to the secondary) and the side; the side is relative
    // to the primary's orientation, so a reverse complemented primary
    // swaps left and right
    if (secondary_pos <= margin) {
      distance = margin - secondary_pos - secondary_len + 1;
      quad = (primary_rc ? RIGHT : LEFT);
    } else {
      distance = secondary_pos - margin - primary_len - 1;
      quad = (primary_rc ? LEFT : RIGHT);
    }

    // calculate the strand
    if (secondary_rc == primary_rc) {
      quad |= SAME;
    } else {
      quad |= OPPO;
    }

    // add the sequence id to the set if the bin matches
    if (quad == secondary_motif->sigs->quad &&
        (distance / bin_size) == secondary_motif->sigs->bin) {
      secondary_motif->seq_count += 1;
      secondary_motif->seqs = (int*)mm_realloc(secondary_motif->seqs,
          sizeof(int) * secondary_motif->seq_count);
      secondary_motif->seqs[secondary_motif->seq_count - 1] = sequence->index;
    }
  }
}
/**************************************************************************
 * Calculate the total number of p-value calculations that will be done
 * by the program. This number is used to correct the p-values for
 * multiple tests using a Bonferroni correction.
 *
 * margin           - margin used to derive the number of possible
 *                    spacings in one quadrant
 * bin              - number of consecutive spacings sharing one bin
 * test_max         - cap on the number of bins counted per quadrant
 * secondary_motifs - tree whose values are SECONDARY_MOTIF_T entries
 *
 * Returns the sum over all secondary motifs of 4 * min(test_max, bins),
 * where bins is the per-quadrant bin count including a partial last bin.
 **************************************************************************/
int calculate_test_count(int margin, int bin, int test_max, RBTREE_T *secondary_motifs) {
  int tally = 0;
  RBNODE_T *cursor = rbtree_first(secondary_motifs);

  while (cursor != NULL) {
    SECONDARY_MOTIF_T *entry = (SECONDARY_MOTIF_T*)rbtree_value(cursor);
    // the number of possible values for spacings in one quadrant
    int options = margin - get_motif_trimmed_length(entry->motif) + 1;
    // full bins in one quadrant ...
    int bins = options / bin;
    // ... plus one partially filled bin when the options don't divide evenly
    if (options % bin != 0) {
      bins += 1;
    }
    // never count more than test_max bins per quadrant
    if (bins > test_max) {
      bins = test_max;
    }
    // four quadrants per secondary motif
    tally += bins * 4;
    cursor = rbtree_next(cursor);
  }
  return tally;
}
/**************************************************************************
 * Create a secondary motif. As the number of sequences is unknown at this
 * point the sequence id list is left unallocated. The minimum p-value is
 * initialized to 1.
 *
 * margin - margin used to derive the number of possible spacings
 * bin    - number of consecutive spacings sharing one bin
 * db     - motif database stored in the new entry
 * motif  - motif stored in the new entry
 *
 * Returns the newly allocated secondary motif with four initialised
 * spacing tables, zeroed counters and empty (NULL) result lists.
 **************************************************************************/
SECONDARY_MOTIF_T* create_secondary_motif(int margin, int bin, MOTIF_DB_T *db, MOTIF_T *motif) {
  SECONDARY_MOTIF_T *created = mm_malloc(sizeof(SECONDARY_MOTIF_T));
  int spacing_options, bins_per_quad, q;

  created->db = db;
  created->motif = motif;
  // not loaded yet
  created->loaded = FALSE;

  // number of bins needed for this motif: the full bins plus one extra
  spacing_options = margin - get_motif_trimmed_length(motif) + 1;
  bins_per_quad = spacing_options / bin + 1;

  // one spacing table per quadrant
  for (q = 0; q < 4; ++q) {
    init_spacings(&created->spacings[q], bins_per_quad);
  }
  created->total_spacings = 0;
  created->max_in_one_bin = 0;

  // the remaining fields are filled in once the spacing tables have been
  // populated and the most significant spacings calculated
  created->sigs = NULL;
  created->sig_count = 0;
  created->min_pvalue = 1;
  created->seqs = NULL;
  created->seq_count = 0;

  return created;
}
/************************************************************************** * Dump sequence matches sorted by the name of the sequence. * * Outputs Columns: * 1) Trimmed lowercase sequence with uppercase matches. * 2) Position of the secondary match within the whole sequence. * 3) Sequence fragment that the primary matched. * 4) Strand of the primary match (+|-) * 5) Sequence fragment that the secondary matched. * 6) Strand of the secondary match (+|-) * 7) Is the primary match on the same strand as the secondary (s|o) * 8) Is the secondary match downstream or upstream (d|u) * 9) The gap between the primary and secondary matches * 10) The name of the sequence * 11) The p-value of the bin containing the match (adjusted for # of bins) * ---if the FASTA input file sequence names are in Genome Browser format: * 12-14) Position of primary match in BED coordinates * 15) Position of primary match in Genome Browser coordinates * 16-18) Position of secondary match in BED coordinates * 19) Position of secondary match in Genome Browser coordinates * * If you wish to sort based on the gap column: * Sort individual output: * sort -n -k 9,9 -o seqs_primary_secondary.txt seqs_primary_secondary.txt * Or sort all outputs: * for f in seqs_*.txt; do sort -n -k 9,9 -o $f $f; done * Or to get just locations of primary motif in BED coordinates * where the secondary is on the opposite strand, upstream with a gap of 118bp: * awk '$7=="o" && $8=="u" && $9==118 {print $12"\t"$13"\t"$14;}' seqs_primary_secondary.txt * **************************************************************************/ static void dump_sequence_matches(FILE *out, int margin, int bin, double sigthresh, BOOLEAN_T sig_only, RBTREE_T *sequences, MOTIF_T *primary_motif, SECONDARY_MOTIF_T *secondary_motif, ARRAY_T **matches) { RBNODE_T *node; SEQUENCE_T *sequence; int idx, seqlen, i, j, start, end, secondary, secondary_pos, primary_len, secondary_len, distance; BOOLEAN_T primary_rc, secondary_rc, downstream; char *buffer, 
*seq, *primary_match, *secondary_match; ARRAY_T *secondary_array; ALPH_T *alph; // get the alphabet alph = get_motif_alph(primary_motif); // allocate a buffer for copying the trimmed sequence into and modify it seqlen = margin * 2 + get_motif_trimmed_length(primary_motif); buffer = (char*)mm_malloc(sizeof(char) * (seqlen + 1)); // get the lengths of the motifs primary_len = get_motif_trimmed_length(primary_motif); secondary_len = get_motif_trimmed_length(secondary_motif->motif); // allocate some strings for storing the matches primary_match = (char*)mm_malloc(sizeof(char) * (primary_len + 1)); secondary_match = (char*)mm_malloc(sizeof(char) * (secondary_len + 1)); // add null byte at the end of the match strings primary_match[primary_len] = '\0'; secondary_match[secondary_len] = '\0'; // iterate over all the sequences for (node = rbtree_first(sequences); node != NULL; node = rbtree_next(node)) { sequence = (SEQUENCE_T*)rbtree_value(node); primary_rc = get_array_item(0, sequence->primary_matches) < 0; //secondary = matches[sequence->index]; secondary_array = matches[sequence->index]; if (! secondary_array) continue; int n_secondary_matches = get_array_length(secondary_array); for (idx=0; idx<n_secondary_matches; idx++) { secondary = get_array_item(idx, secondary_array); secondary_rc = secondary < 0; secondary_pos = abs(secondary); // calculate the distance if (secondary_pos <= margin) { distance = margin - secondary_pos - secondary_len + 1; downstream = primary_rc; } else { distance = secondary_pos - margin - primary_len - 1; downstream = !primary_rc; } // copy the trimmed sequence seq = sequence->data; for (i = 0; i < seqlen; ++i) { buffer[i] = (alph_is_case_insensitive(alph) ? tolower(seq[i]) : seq[i]); } buffer[seqlen] = '\0'; // uppercase primary start = margin; end = margin + primary_len; for (i = start, j = 0; i < end; ++i, ++j) { buffer[i] = (alph_is_case_insensitive(alph) ? 
toupper(buffer[i]) : buffer[i]); primary_match[j] = buffer[i]; } // uppercase secondary // note orign was one, subtract 1 to make origin zero as required for arrays start = secondary_pos -1; end = start + secondary_len; for (i = start, j = 0; i < end; ++i, ++j) { buffer[i] = (alph_is_case_insensitive(alph) ? toupper(buffer[i]) : buffer[i]); secondary_match[j] = buffer[i]; } // get the p-value of the seconndary match SPACING_T *spacings; if (secondary_rc == primary_rc) { spacings = downstream ? secondary_motif->spacings+(SAME+RIGHT) : secondary_motif->spacings+(SAME+LEFT); } else { spacings = downstream ? secondary_motif->spacings+(OPPO+RIGHT) : secondary_motif->spacings+(OPPO+LEFT); } double p_value = spacings->pvalue[distance/bin]; // skip match if not significant and only reporting significant matches if (sig_only && (p_value > sigthresh)) continue; // output line to file fprintf(out, "%s %3d %s %s %s %s %s %s %3d %s %.1e", buffer, secondary_pos, primary_match, (primary_rc ? "-" : "+"), secondary_match, (secondary_rc ? "-" : "+"), (secondary_rc == primary_rc ? "s" : "o"), (downstream ? "d" : "u"), distance, sequence->name, p_value ); // Parse the sequence name to see if we can get genomic coordinates // and print additional columns with primary and secondary matches // in both BED and Genome Browser coordinates. char *chr_name; size_t chr_name_len; int start_pos, end_pos; if (parse_genomic_coordinates_helper( sequence->name, &chr_name, &chr_name_len, &start_pos, &end_pos)) { // Get the start and end of the primary match in // 0-relative, half-open genomic coordinates. int p_start = start_pos + fabs(get_array_item(0, sequence->primary_matches)) - 1; int p_end = p_start + primary_len; // Get the start and end of the secondary match in // 0-relative, half-open genomic coordinates. 
int s_start, s_end; if ( (!primary_rc && downstream) || (primary_rc && !downstream) ) { s_start = p_end + distance; s_end = s_start + secondary_len; } else { s_end = p_start - distance; s_start = s_end - secondary_len; } fprintf(out, " %s %d %d %s:%d-%d", chr_name, p_start, p_end, chr_name, p_start+1, p_end); fprintf(out, " %s %d %d %s:%d-%d\n", chr_name, s_start, s_end, chr_name, s_start+1, s_end); } else { fprintf(out, "\n"); } } // secondary match } // primary match free(buffer); free(primary_match); free(secondary_match); }