/********************************************************
 * Look up the prior for a single sequence position.
 *
 * *prior is first set to the reader's default prior and is
 * only overwritten when a prior block covering seq_position
 * is found.
 *
 * If *prior_block is NULL, a prior block is allocated and
 * the first block of priors is read into it. Failing to
 * read that first block is fatal.
 *
 * If the seq. position is before the start of the current
 * prior block, the block is left where it is and *prior
 * keeps the default value.
 *
 * If the seq. position is within the extent of the current
 * prior block, *prior is set to the value from the block.
 *
 * If the seq. position is past the extent of the current
 * prior block, further blocks are read until one starts
 * after the seq. position, one covers it, or the reader
 * has no more blocks. In the last case a warning is
 * printed when verbosity is above NORMAL_VERBOSE.
 *
 * Parameters:
 *   prior_reader  reader supplying the prior blocks
 *   seq_name      sequence name, used only in messages
 *   seq_position  position whose prior is wanted
 *   prior_block   in/out: current block, allocated on
 *                 first use when *prior_block is NULL
 *   prior         out: the prior for seq_position
 ********************************************************/
void get_prior_from_reader(
  DATA_BLOCK_READER_T *prior_reader,
  const char *seq_name,
  size_t seq_position,
  DATA_BLOCK_T **prior_block, // Out variable
  double *prior // Out variable
) {

  *prior = get_default_prior_from_reader(prior_reader);

  if (*prior_block == NULL) {
    // First call: allocate the block and load the first prior into it.
    *prior_block = new_prior_block();
    if (prior_reader->get_next_block(prior_reader, *prior_block) == FALSE) {
      die("Failed to read first prior from sequence %s.", seq_name);
    }
  }

  // Examine the current block, then keep advancing the reader until the
  // block is ahead of, or covers, the requested position.
  for (;;) {
    size_t block_position = get_start_pos_for_data_block(*prior_block);
    size_t block_extent = get_num_read_into_data_block(*prior_block);

    if (block_position > seq_position) {
      // Sequence position is before the current prior position:
      // keep the default prior and leave the block untouched.
      return;
    }

    // Here block_position <= seq_position, so the subtraction cannot wrap.
    // Comparing the distance against the extent avoids computing
    // block_position + block_extent - 1, which wraps around for an empty
    // block at position 0 and would make every position look covered.
    if (seq_position - block_position < block_extent) {
      // Sequence position is contained in the current prior block.
      *prior = get_prior_from_data_block(*prior_block);
      return;
    }

    // Sequence position is after the current block: try the next one.
    if (prior_reader->get_next_block(prior_reader, *prior_block) == FALSE) {
      break;
    }
  }

  // Only reached when the reader ran out of blocks.
  if (verbosity > NORMAL_VERBOSE) {
    fprintf(stderr, "Warning: reached end of priors for sequence %s.\n", seq_name);
  }
}
/********************************************************
 * Fill an array with priors read from the priors file.
 *
 * Entries from buffer_offset up to (but not including)
 * num_priors are first set to the reader's default prior.
 * Prior blocks are then read one at a time:
 *
 *  - blocks that end before seq_start are skipped;
 *  - blocks that intersect the requested region have
 *    their prior copied into the array, indexed relative
 *    to seq_start;
 *  - the first block that ends after the requested region
 *    is pushed back onto the reader and reading stops.
 *
 * Reading also stops when the reader has no more blocks.
 *
 * The reader must currently be positioned on seq_name
 * (checked with assert).
 *
 * Sequences must occur in the same order as the FASTA file.
 * Positions in sequence must be in increasing order.
 *
 * NOTE(review): buffer_offset is only used as the starting
 * index of the default fill; copied priors are written at
 * (position - seq_start) without adding buffer_offset.
 * Confirm this is what callers expect.
 ********************************************************/
void get_prior_array_from_reader(
  DATA_BLOCK_READER_T *prior_reader,
  const char *seq_name,
  size_t seq_start,
  size_t num_priors,
  size_t buffer_offset,
  double *priors
) {

  // The reader has to be on the sequence the caller is asking about.
  char *prior_seq_name = NULL;
  prior_reader->get_seq_name(prior_reader, &prior_seq_name);
  assert(strcmp(seq_name, prior_seq_name) == 0);

  // One past the last requested position.
  const size_t seq_end = seq_start + num_priors;

  // Pre-fill with the default prior, beginning at the buffer offset.
  const double default_prior = get_default_prior_from_reader(prior_reader);
  size_t slot = buffer_offset;
  while (slot < num_priors) {
    priors[slot] = default_prior;
    ++slot;
  }

  DATA_BLOCK_T *prior_block = new_prior_block();

  // Consume prior blocks until the region is covered or the reader is
  // exhausted for this sequence.
  while (prior_reader->get_next_block(prior_reader, prior_block) != FALSE) {

    // The block start position is shifted down by one before use
    // (presumably 1-based file coordinates to 0-based array coordinates;
    // TODO confirm). All of this is unsigned arithmetic.
    const size_t block_first = get_start_pos_for_data_block(prior_block) - 1;
    const size_t block_count = get_num_read_into_data_block(prior_block);
    const size_t block_last = block_first + block_count - 1;

    if (block_last < seq_start) {
      // Entirely before the region of interest.
      continue;
    }

    // Intersection of the block with the requested region.
    const size_t copy_from = MAX(block_first, seq_start);
    const size_t copy_to = MIN(block_last, seq_end);
    BOOLEAN_T intersects = (copy_to >= copy_from);
    if (intersects == TRUE) {
      // NOTE(review): the original author flagged this count with a FIXME;
      // the alternative they left commented out was
      // copy_to - copy_from + 1. Behavior is kept as is.
      const size_t num_to_copy = copy_to - copy_from;
      const size_t first_slot = copy_from - seq_start;
      size_t k = 0;
      while (k < num_to_copy) {
        priors[first_slot + k] = get_prior_from_data_block(prior_block);
        ++k;
      }
    }

    if (block_last > seq_end) {
      // The array is filled but this block extends past the region:
      // rewind the reader to before the block just read.
      prior_reader->unget_block(prior_reader);
      break;
    }
  }

  free_data_block(prior_block);
}
/********************************************************
 * Read the next prior value from a PSP file into a
 * data block.
 *
 * Leading white space is skipped. If the next token is a
 * '>' at the start of a line it marks a new sequence: the
 * character is pushed back onto the stream and no prior is
 * read. Otherwise the characters up to the next white space
 * or EOF are parsed as a number in the range [0, 1].
 *
 * On return the data block's prior is the parsed value, or
 * NaN when no prior was read; its start position is the
 * reader's current position (incremented once per prior
 * read) and its count is 1 or 0.
 *
 * A token that is too long, is not a number, is NaN, or is
 * outside [0, 1] is fatal, as is a read error on the file.
 *
 * Returns TRUE if a prior was read, FALSE if a sequence
 * header or the end of the file was reached instead.
 ********************************************************/
BOOLEAN_T get_next_data_block_from_prior_reader_from_psp(
  DATA_BLOCK_READER_T *reader,
  DATA_BLOCK_T *data_block
) {

  BOOLEAN_T result = FALSE;
  // Fixed-size token buffer; no prior string should come close to this.
  char buffer[100];
  const int buffer_size = (int) sizeof(buffer);
  int num_read = 0;

  PSP_DATA_BLOCK_READER_T *psp_reader
    = (PSP_DATA_BLOCK_READER_T *) get_data_block_reader_data(reader);

  // Until a prior is successfully parsed the block holds NaN.
  double *output_prior = get_prior_from_data_block(data_block);
  *output_prior = NaN();

  // Skip over leading white space, tracking whether the next
  // non-space character sits at the start of a line.
  int c = 0;
  while ((c = fgetc(psp_reader->psp_file)) != EOF) {
    if (!isspace(c)) {
      break;
    }
    psp_reader->at_start_of_line = (c == '\n') ? TRUE : FALSE;
  }

  if (c == '>' && psp_reader->at_start_of_line == TRUE) {
    // We found the start of a new sequence while trying
    // to find a prior. Put the '>' back for whoever reads the header.
    // ungetc reports failure through its return value, so check that
    // as well as the stream error flag.
    // NOTE(review): strerror() is given the ferror() flag rather than an
    // errno value, so the reported message text is unreliable; switching
    // to strerror(errno) needs <errno.h>, which could not be confirmed
    // as included from this part of the file.
    if (ungetc(c, psp_reader->psp_file) == EOF
        || ferror(psp_reader->psp_file)) {
      die(
        "Error reading file:%s.\nError message: %s\n",
        psp_reader->filename,
        strerror(ferror(psp_reader->psp_file))
      );
    }
  }
  else {
    // We are at start of a prior.
    // Read prior string until next space or EOF.
    int buffer_index = 0;
    while (c != EOF && !isspace(c)) {
      buffer[buffer_index] = (char) c;
      ++buffer_index;
      if (buffer_index >= (buffer_size - 1)) {
        // No prior string should be this long
        buffer[buffer_size - 1] = 0;
        die("File %s contains invalid prior value: %s\n", psp_reader->filename, buffer);
      }
      c = fgetc(psp_reader->psp_file);
    }
    psp_reader->at_start_of_line = (c == '\n') ? TRUE : FALSE;
    buffer[buffer_index] = '\0';

    // If the buffer is not empty, it should contain a string
    // representing the prior. Convert it to a double.
    if (buffer_index != 0) {
      char *end_ptr = NULL;
      double prior = strtod(buffer, &end_ptr);
      // The range test is written in its negated form so that NaN,
      // for which every comparison is false, is rejected too.
      if (end_ptr == buffer
          || *end_ptr != '\0'
          || !(prior >= 0.0 && prior <= 1.0)
      ) {
        die("File %s contains invalid prior value: %s\n", psp_reader->filename, buffer);
      }
      *output_prior = prior;
      num_read = 1;
      ++psp_reader->current_position;
      result = TRUE;
    }
  }

  // EOF can mean end of data or a read failure; only the latter is fatal.
  // NOTE(review): same strerror(ferror()) caveat as above.
  if (c == EOF && ferror(psp_reader->psp_file)) {
    die(
      "Error while reading file:%s.\nError message: %s\n",
      psp_reader->filename,
      strerror(ferror(psp_reader->psp_file))
    );
  }

  set_start_pos_for_data_block(data_block, psp_reader->current_position);
  set_num_read_into_data_block(data_block, num_read);
  return result;
}
/******************************************************************** * This program reads a MEME PSP file and computes the binned * distribution of priors. The distribution is writen to stdout. ********************************************************************/ int main(int argc, char *argv[]) { char *usage = "compute-prior-dist <num-bins> <psp-file>"; if (argc != 3) { fprintf(stderr, "Usage: %s\n", usage); return -1; } int num_bins = atoi(argv[1]); if (num_bins <= 0) { fprintf(stderr, "Usage: %s\n", usage); return -1; } const char *filename = argv[2]; // Read each prior, find max and min of distribution. DATA_BLOCK_READER_T *psp_reader = NULL; psp_reader = new_prior_reader_from_psp(FALSE /* Don't try to parse genomic coord.*/, filename); DATA_BLOCK_T *psp_block = new_prior_block(); int prior_array_size = 100; ARRAY_T *raw_priors = allocate_array(prior_array_size); int num_priors = 0; while (psp_reader->go_to_next_sequence(psp_reader) != FALSE) { while (psp_reader->get_next_block(psp_reader, psp_block) != FALSE) { double prior = get_prior_from_data_block(psp_block); if (prior == 0.0) { // Skip priors that are exactly 0.0 continue; } if (num_priors == INT_MAX) { die("Number of priors exceeded maximum allowed value of %d", INT_MAX); } set_array_item(num_priors, prior, raw_priors); ++num_priors; if (num_priors >= prior_array_size) { resize_array(raw_priors, 2 * prior_array_size); prior_array_size = 2 * prior_array_size; } } } free_data_block(psp_block); free_data_block_reader(psp_reader); ARRAY_T *priors = extract_subarray(raw_priors, 0, num_priors); free_array(raw_priors); double median_prior = compute_median(priors); double min_prior = get_array_item(0, priors); double max_prior = get_array_item(num_priors - 1, priors); // Print min, max, and median printf("#min %6.5f\n", min_prior); printf("#max %6.5f\n", max_prior); printf("#median %6.5f\n", median_prior); // Special case if priors are exactly uniform. 
if (min_prior == max_prior) { printf("%6.5f\n", 1.0); return 0; } // Create the array of bins, intialized to 0. double *prior_dist = mm_calloc(num_bins, sizeof(double)); double scale = (num_bins - 1) / (max_prior - min_prior); double offset = min_prior; int dist_index = 0; int i; for (i = 0; i < num_priors; ++i) { double prior = get_array_item(i, priors); dist_index = raw_to_scaled(prior, 1, scale, offset); ++prior_dist[dist_index]; } for (dist_index = 0; dist_index < num_bins; ++dist_index) { // Print normalized bin counts prior_dist[dist_index] /= num_priors; printf("%6.5f\n", prior_dist[dist_index]); } return 0; }