/*********************************************************************** * Convert transition counts to transition probabilities, and compute * average spacer lengths. * * Each matrix is indexed 0 ... n+1, where n is the number of motifs. * The entry at [i,j] corresponds to the transition from motif i to * motif j. Hence, after normalization, each row in the transition * matrix should sum to 1. ***********************************************************************/ static void normalize_spacer_counts( double trans_pseudo, double spacer_pseudo, // Pseudocount for self-loop. BOOLEAN_T keep_unused, MATRIX_T* transp_freq, MATRIX_T* spacer_ave ) { int i_row; int i_col; int num_rows; double total_spacer; double num_transitions; double ave_spacer; /* Divide the spacer lengths by the number of occurrences. */ num_rows = get_num_rows(transp_freq); for (i_row = 0; i_row < num_rows; i_row++) { for (i_col = 0; i_col < num_rows; i_col++) { total_spacer = get_matrix_cell(i_row, i_col, spacer_ave) + spacer_pseudo; num_transitions = get_matrix_cell(i_row, i_col, transp_freq); if (spacer_pseudo > 0) num_transitions++; if (num_transitions != 0.0) { ave_spacer = total_spacer / num_transitions; set_matrix_cell(i_row, i_col, ave_spacer, spacer_ave); } } } // Add pseudocounts. for (i_row = 0; i_row < num_rows; i_row++) { for (i_col = 0; i_col < num_rows; i_col++) { // Force some transitions to zero. if (// No transitions to the start state. (i_col == 0) || // No transitions from the end state. (i_row == num_rows - 1) || // No transition from start to end. ((i_row == 0) && (i_col == num_rows - 1))) { set_matrix_cell(i_row, i_col, 0.0, transp_freq); } else { // Only increment the used transitions. if ((keep_unused) || (get_matrix_cell(i_row, i_col, transp_freq) > 0.0)) { incr_matrix_cell(i_row, i_col, trans_pseudo, transp_freq); } } } } // Normalize rows. 
for (i_row = 0; i_row < num_rows - 1; i_row++) { if (array_total(get_matrix_row(i_row, transp_freq)) > 0.0) { normalize(SLOP, get_matrix_row(i_row, transp_freq)); } } }
/***********************************************************************
 * gen_pam_matrix
 *
 * Build a PAM matrix for the DNA or protein alphabet at the requested
 * PAM distance.
 *
 *   alph     DNA_ALPH or PROTEIN_ALPH (asserted)
 *   dist     PAM distance; the PAM-1 matrix is multiplied by itself
 *            dist-1 times (values <= 1 leave it at PAM 1)
 *   logodds  true:  rounded log-odds scores
 *            false: target (joint) frequencies
 *
 * Returns the resulting alen x alen matrix.
 * NOTE(review): presumably the caller releases it with free_matrix().
 ***********************************************************************/
extern MATRIX_T* gen_pam_matrix(
  ALPH_T alph,                  /* alphabet */
  int dist,                     /* PAM distance */
  BOOLEAN_T logodds             /* true: generate log-odds matrix
                                   false: generate target frequency matrix */
)
{
  assert(alph == DNA_ALPH || alph == PROTEIN_ALPH);

  const BOOLEAN_T is_dna = (alph == DNA_ALPH);
  double *letter_freq = is_dna ? pam_dna_freq : pam_prot_freq; /* standard frequencies */
  const int alen = alph_size(alph, ALPH_SIZE);                 /* standard alphabet length */
  /* Scale factor for log-odds scores; same as in "pam" Version 1.0.6. */
  const double factor = dist < 170 ? 2/log(2) : 3/log(2);
  int row, col, step;

  MATRIX_T *matrix = allocate_matrix(alen, alen);
  MATRIX_T *pam1 = allocate_matrix(alen, alen);

  /*
   * PAM 1: because of round-off in the source tables, the joint frequency
   * of (row, col) is taken as the average of its two estimates; the
   * conditionals stored in the matrix are derived from that joint.
   */
  for (row = 0; row < alen; row++) {
    for (col = 0; col <= row; col++) {
      double v_rc;
      double v_cr;
      if (is_dna) {
        v_rc = trans[row][col];
        v_cr = trans[col][row];
      } else {
        v_rc = dayhoff[row][col];
        v_cr = dayhoff[col][row];
      }
      double joint = ((v_rc * letter_freq[col]) + (v_cr * letter_freq[row]))/20000;
      set_matrix_cell(row, col, joint/letter_freq[col], matrix);
      if (row != col) {
        set_matrix_cell(col, row, joint/letter_freq[row], matrix);
      }
    }
  }

  /* Raise the PAM-1 matrix to the requested power. */
  copy_matrix(matrix, pam1);
  for (step = 1; step < dist; step++) {
    MATRIX_T *next = matrix_multiply(matrix, pam1);
    free_matrix(matrix);        /* the previous power is no longer needed */
    matrix = next;
  }
  free_matrix(pam1);

  /*
   * Convert the conditionals M to the requested form:
   *   target:  J_ij = Pr(i,j) = M_ij Pr(j)
   *   logodds: L_ij = log(Pr(i,j) / (Pr(i) Pr(j)))
   *                 = log(M_ij Pr(j) / (Pr(i) Pr(j)))
   *                 = log(M_ij / Pr(i))
   */
  for (row = 0; row < alen; row++) {
    for (col = 0; col < alen; col++) {
      double value = get_matrix_cell(row, col, matrix);
      if (logodds) {
        value = nint(factor * log((value+EPSILON)/letter_freq[row]));
      } else {
        value = value * letter_freq[col];
      }
      set_matrix_cell(row, col, value, matrix);
    }
  }

  return matrix;
} /* gen_pam_matrix */
/***********************************************************************
 * reorder_matrix
 *
 * Build a new square matrix whose rows and columns follow the letter
 * order of alpha2, copying each cell from the corresponding letters'
 * position in in_matrix (which is indexed in alpha1 order).
 *
 *   alpha1     current alphabet (letter order of in_matrix)
 *   alpha2     new alphabet; every letter must occur in alpha1
 *   in_matrix  matrix to reorder; it is only read
 *
 * Dies if alpha2 is longer than alpha1 or contains a letter that is not
 * in alpha1.  Returns a newly allocated strlen(alpha2) x strlen(alpha2)
 * matrix.
 ***********************************************************************/
MATRIX_T *reorder_matrix(
  const char *alpha1,           /* current alphabet */
  const char *alpha2,           /* new alphabet; must be subset */
  MATRIX_T *in_matrix           /* matrix to reorder */
)
{
  int i, j;
  int alen1 = strlen(alpha1);
  int alen2 = strlen(alpha2);
  MATRIX_T *out_matrix;

  if (alen2 > alen1) {
    die("The new alphabet %s must be a subset of the old alphabet %s.\n", alpha2, alpha1);
  }

  out_matrix = allocate_matrix(alen2, alen2);
  for (i = 0; i < alen2; i++) {
    /* Check the lookup before subtracting: NULL - alpha1 is undefined. */
    const char *row_ptr = strchr(alpha1, alpha2[i]);
    if (!row_ptr) {
      die("The new alphabet %s must be a subset of the old alphabet %s\n", alpha2, alpha1);
    }
    int ii = (int)(row_ptr - alpha1);

    for (j = 0; j < alen2; j++) {
      const char *col_ptr = strchr(alpha1, alpha2[j]);
      if (!col_ptr) {
        die("The new alphabet %s must be a subset of the old alphabet %s\n", alpha2, alpha1);
      }
      int jj = (int)(col_ptr - alpha1);
      set_matrix_cell(i, j, get_matrix_cell(ii, jj, in_matrix), out_matrix);
    }
  }
  return(out_matrix);
} /* reorder_matrix */
/**************************************************************************
 * hash_pssm_matrix_pos
 *
 * Recursively fill in one row of a hashed PSSM.  Each hashed column stands
 * for a combination of n consecutive "letters", where a letter is either
 * one of the alen alphabet symbols or a blank (index alen).  Blanks, and
 * positions past the right edge of the PSSM, contribute a score of 0.
 *
 *   pssm         PSSM being hashed (rows = positions, columns = letters)
 *   hashed_pssm  hashed PSSM receiving the result
 *   pos          current position (row) in pssm
 *   hashed_pos   row of hashed_pssm being filled
 *   n            number of columns still to combine
 *   score        score accumulated so far; call with 0
 *   index        hashed column index accumulated so far; call with 0
 **************************************************************************/
static void hash_pssm_matrix_pos(
  MATRIX_T *pssm,               // pssm to hash
  MATRIX_T *hashed_pssm,        // hashed pssm
  int  pos,                     // position in pssm
  int  hashed_pos,              // position in hashed pssm
  int  n,                       // number of columns to hash together
  double score,                 // cumulative score; call with 0
  int index                     // cumulative index; call with 0
)
{
  const int n_letters = get_num_cols(pssm);   // alphabet length
  const int width = get_num_rows(pssm);       // pssm width
  const int radix = n_letters + 1;            // letters plus the blank
  int letter;

  // Nothing left to combine: record the accumulated score and stop.
  if (n == 0) {
    set_matrix_cell(hashed_pos, index, score, hashed_pssm);
    return;
  }

  // Extend the combination by every possible letter, blank included.
  for (letter = 0; letter < radix; letter++) {
    double contribution = 0;
    if (pos < width && letter != n_letters) {
      contribution = get_matrix_cell(pos, letter, pssm);
    }
    hash_pssm_matrix_pos(
      pssm,
      hashed_pssm,
      pos + 1,                  // next position in the original pssm
      hashed_pos,               // still working on the same hashed row
      n - 1,                    // one fewer position left to hash
      score + contribution,     // score so far
      index * radix + letter    // hashed alphabet index so far
    );
  }
} // hash_pssm_matrix_pos
/***********************************************************************
 * Mix a pseudocount, distributed according to a background, into the
 * frequency matrix of a motif.  The motif's freqs matrix is updated in
 * place.
 *
 *   motif        motif whose freqs are adjusted
 *   background   background letter frequencies; NULL selects a uniform
 *                background over the motif's alphabet
 *   pseudocount  total pseudocount; 0 is a no-op, negative values are
 *                rejected by an assertion
 *
 * When the motif has no positive site count, DEFAULT_SITE_COUNT is
 * used in its place.
 ***********************************************************************/
void apply_pseudocount_to_motif
  (MOTIF_T* motif, ARRAY_T *background, double pseudocount)
{
  // A zero pseudocount cannot change anything, so skip the work.
  if (pseudocount == 0) {
    return;
  }
  assert(pseudocount > 0);

  // Motif dimensions.
  const int n_letters = alph_size(motif->alph, ALPH_SIZE);
  const int width = motif->length;

  // Fall back to a uniform background owned by this function.
  ARRAY_T *uniform_bg = NULL;
  if (background == NULL) {
    uniform_bg = allocate_array(n_letters);
    get_uniform_frequencies(motif->alph, uniform_bg);
    background = uniform_bg;
  }

  // Convert each frequency to a count, add the pseudocount, renormalize.
  int sites = (motif->num_sites > 0 ? motif->num_sites : DEFAULT_SITE_COUNT);
  const double denominator = sites + pseudocount;
  int row;
  int col;
  for (row = 0; row < width; ++row) {
    for (col = 0; col < n_letters; ++col) {
      double observed = get_matrix_cell(row, col, motif->freqs);
      double adjusted = (observed * sites)
        + (pseudocount * get_array_item(col, background));
      set_matrix_cell(row, col, adjusted / denominator, motif->freqs);
    }
  }

  if (uniform_bg) {
    free_array(uniform_bg);
  }
}
/************************************************************************** * scale_pssm * * Scale and round the scores in a PSSM so that the score of a word * is in the range [0..w*range]. * * Returns the scaled PSSM. * **************************************************************************/ void scale_pssm( PSSM_T *pssm, // The PSSM. (IN/OUT) PRIOR_DIST_T *prior_dist, // Distribution of priors (IN) double alpha, // Fraction of all TFBS that are the TFBS of interest int range // The desired range. (IN) ) { int i, j; MATRIX_T* matrix = pssm->matrix; int r = pssm->w; int c = pssm->alphsize; double small = BIG; double large = -BIG; double scale, offset; // Get the largest and smallest scores in the PSSM. for (i=0; i<r; i++) { for (j=0; j<c; j++) { double x = get_matrix_cell(i, j, matrix); small = MIN(small, x); large = MAX(large, x); } } // Get the smallest and largest prior log-odds from the prior distribution // and use them to adjust small and large. if (prior_dist != NULL) { double min_lo_prior = get_min_lo_prior(prior_dist, alpha); double max_lo_prior = get_max_lo_prior(prior_dist, alpha); small = MIN(small, min_lo_prior); large = MAX(large, max_lo_prior); } // Find offset and scale factors so that PSSM scores for words is in the // range: [0..w*range] // To make LO=0 map back to 0, need offset*scale to be an integer. // So we make offset and scale integers. (TLB 31 May 2013) if (large == small) { small = large - 1; } // In case all motif entries are the same. offset = small = floor(small); // Make offset an integer. scale = floor(range/(large-small)); // Ensure scaled scores are <= range. // Scale and round the PSSM entries. for (i=0; i<r; i++) { for (j=0; j<c; j++) { double x = raw_to_scaled(get_matrix_cell(i, j, matrix), 1, scale, offset); set_matrix_cell(i, j, x, matrix); } } // return scale and offset of scores pssm->scale = scale; pssm->offset = offset; pssm->range = range; } // scale_pssm
/*************************************************************************
 * Rebuild an HMM's transition matrix from the outgoing-transition lists
 * stored in its states.
 *
 * The matrix is first cleared; then, for every state i and each of its
 * outgoing transitions to state j, cell [i,j] is set to that
 * transition's value.  Transitions into state 0 are rejected by an
 * assertion, and the finished matrix is checked with
 * verify_trans_matrix().
 *************************************************************************/
static void build_transition_matrix
  (MHMM_T *the_hmm)
{
  const int n_states = the_hmm->num_states;
  int from;
  int to;

  check_sq_matrix(the_hmm->trans, n_states);

  /* Start from an all-zero matrix. */
  for (from = 0; from < n_states; from++) {
    for (to = 0; to < n_states; to++) {
      set_matrix_cell(from, to, 0.0, the_hmm->trans);
    }
  }

  /* Copy each state's outgoing transitions into its matrix row. */
  for (from = 0; from < n_states; from++) {
    MHMM_STATE_T *state = &(the_hmm->states[from]);
    const int n_out = state->ntrans_out;
    int k;

    for (k = 0; k < n_out; k++) {
      /* Index of the state this transition leads to. */
      to = state->itrans_out[k];
      assert(to != 0);

      set_matrix_cell(from, to,
                      get_array_item(k, state->trans_out),
                      the_hmm->trans);
    }
  }

  assert(verify_trans_matrix(FALSE, n_states, the_hmm->trans));
}
/***********************************************************************
 * Convert a matrix of MEME scores into letter probabilities.
 *
 * A score s for a letter with background frequency bg corresponds to
 * the probability
 *   p = (2 ^ (s / 100)) * bg
 * from which the pseudocount contribution is then removed.  Each value
 * is clamped to [0, 1] and every row is normalized afterwards.
 *
 *   alph          alphabet (must not be INVALID_ALPH)
 *   scores        score matrix, one row per motif position
 *   bg            background letter frequencies
 *   site_count    number of sites behind the scores
 *   pseudo_count  pseudocount that was applied when scoring
 *
 * Returns a newly allocated frequency matrix.
 ***********************************************************************/
MATRIX_T* convert_scores_into_freqs
  (ALPH_T alph,
   MATRIX_T *scores,
   ARRAY_T *bg,
   int site_count,
   double pseudo_count)
{
  assert(alph != INVALID_ALPH);
  assert(scores != NULL);
  assert(bg != NULL);

  const int width = get_num_rows(scores);
  const int n_letters = alph_size(alph, ALPH_SIZE);
  const double total_count = site_count + pseudo_count;
  MATRIX_T *freqs = allocate_matrix(width, n_letters);
  int pos;
  int letter;

  for (pos = 0; pos < width; ++pos) {
    for (letter = 0; letter < n_letters; ++letter) {
      const double bg_freq = get_array_item(letter, bg);
      const double score = get_matrix_cell(pos, letter, scores);

      // Score -> probability.
      double freq = pow(2.0, score / 100.0) * bg_freq;
      // Take the pseudocount back out.
      freq = ((freq * total_count) - (bg_freq * pseudo_count)) / site_count;
      // Clamp to a valid probability.
      if (freq < 0) {
        freq = 0;
      } else if (freq > 1) {
        freq = 1;
      }
      set_matrix_cell(pos, letter, freq, freqs);
    }
    // The row is complete, so it can be normalized right away.
    normalize_subarray(0, n_letters, 0.0, get_matrix_row(pos, freqs));
  }

  return freqs;
}
/***********************************************************************
 * Build a MEME motif from a TRANSFAC motif.
 *
 * The TRANSFAC counts are turned into frequencies position by position;
 * each position uses its own count total because motif columns may have
 * different counts.  A pseudocount, distributed according to bg, is
 * added to every position.
 *
 *   id           identifier given to the new motif
 *   pseudocount  total pseudocount added at each position
 *   bg           background letter frequencies
 *   motif        TRANSFAC motif to convert
 *
 * Dies if the TRANSFAC motif has no counts matrix.
 * Caller is responsible for freeing the returned MOTIF_T.
 *
 * NOTE(review): neither `counts` nor `freqs` is freed here; confirm
 * whether get_transfac_counts() returns a copy and whether
 * allocate_motif() takes ownership of `freqs`.
 ***********************************************************************/
MOTIF_T *convert_transfac_motif_to_meme_motif(
  char *id,
  int pseudocount,
  ARRAY_T *bg,
  TRANSFAC_MOTIF_T *motif
)
{
  MATRIX_T *counts = get_transfac_counts(motif);
  if (counts == NULL) {
    die(
      "Unable to convert TRANSFAC motif %s to MEME motif: "
      "missing counts matrix.",
      id
    );
  }

  // Counts -> frequencies, one motif position at a time.
  const int n_bases = get_num_cols(counts);
  const int width = get_num_rows(counts);
  MATRIX_T *freqs = allocate_matrix(width, n_bases);
  int pos;

  for (pos = 0; pos < width; ++pos) {
    int base;

    // Integer total of the counts observed at this position.
    int column_total = 0;
    for (base = 0; base < n_bases; base++) {
      column_total += get_matrix_cell(pos, base, counts);
    }

    const int denominator = column_total + pseudocount;
    for (base = 0; base < n_bases; base++) {
      double numerator = get_matrix_cell(pos, base, counts)
        + (pseudocount * get_array_item(base, bg));
      set_matrix_cell(pos, base, numerator / denominator, freqs);
    }
  }

  MOTIF_T *meme_motif = allocate_motif(id, DNA_ALPH, NULL, freqs);
  calc_motif_ambigs(meme_motif);
  return meme_motif;
}
/***********************************************************************
 * convert_score_to_target
 *
 * Convert a score matrix into a target frequency matrix.  Lambda is
 * obtained from karlin() and each target frequency is
 *   f_ij = p_i * p_j * exp(lambda * s_ij)
 *
 *   score  square score matrix
 *   prob   letter frequencies
 *
 * Returns a newly allocated matrix with the same dimensions as score.
 ***********************************************************************/
MATRIX_T *convert_score_to_target(
  MATRIX_T *score,              /* score matrix */
  ARRAY_T *prob                 /* letter frequencies */
)
{
  const int n_letters = get_num_rows(score);    /* alphabet length */
  double lambda, K, H;
  int row;
  int col;

  /* Let karlin() work out lambda for this score matrix. */
  KARLIN_INPUT_T *karlin_in = make_karlin_input(score, prob);
  karlin(karlin_in->low, karlin_in->high, karlin_in->prob->items,
         &lambda, &K, &H);

  /* Fill in the target frequencies. */
  MATRIX_T *target = allocate_matrix(n_letters, n_letters);
  for (row = 0; row < n_letters; row++) {
    const double p_row = get_array_item(row, prob);
    for (col = 0; col < n_letters; col++) {
      const double p_col = get_array_item(col, prob);
      const double s = get_matrix_cell(row, col, score);
      set_matrix_cell(row, col, p_row * p_col * exp(lambda * s), target);
    }
  }

  /* Release the temporary karlin input. */
  free_array(karlin_in->prob);
  myfree(karlin_in);

  return(target);
} /* convert_score_to_target */
/***********************************************************************
 * Convert a matrix of letter probabilities into MEME scores.
 *
 * A pseudocount, distributed according to the background, is applied to
 * each probability first.  For a nonzero probability p and background
 * frequency bg the score is then
 *   s = log2(p / bg) * 100
 * A probability that is still not positive is replaced by a small
 * positive constant before the logarithm is taken.
 *
 *   alph          alphabet (must not be INVALID_ALPH)
 *   freqs         frequency matrix, one row per motif position
 *   bg            background letter frequencies
 *   site_count    number of sites behind the frequencies
 *   pseudo_count  pseudocount to apply
 *
 * Returns a newly allocated score matrix.
 ***********************************************************************/
MATRIX_T* convert_freqs_into_scores
  (ALPH_T alph,
   MATRIX_T *freqs,
   ARRAY_T *bg,
   int site_count,
   double pseudo_count)
{
  assert(alph != INVALID_ALPH);
  assert(freqs != NULL);
  assert(bg != NULL);

  const int width = get_num_rows(freqs);
  const int n_letters = alph_size(alph, ALPH_SIZE);
  const double total_count = site_count + pseudo_count;
  MATRIX_T *scores = allocate_matrix(width, n_letters);
  int pos;
  int letter;

  for (pos = 0; pos < width; ++pos) {
    for (letter = 0; letter < n_letters; ++letter) {
      const double bg_freq = get_array_item(letter, bg);
      double freq = get_matrix_cell(pos, letter, freqs);

      // Blend in the pseudocount.
      freq = ((pseudo_count * bg_freq) + (freq * site_count)) / total_count;
      // With a correct background this should not happen.
      if (freq <= 0) {
        freq = 0.0000005;
      }
      // Probability -> score.
      set_matrix_cell(pos, letter, (log(freq / bg_freq) / log(2)) * 100, scores);
    }
  }

  return scores;
}
/***************************************************************************** * MEME > motifs > motif > probabilities > alphabet_matrix > alphabet_array > /value * Lookup a letter and check it exists and does not have a probability. * Set the letter's score to the passed value. ****************************************************************************/ void mxml_probability_value(void *ctx, char *letter_id, double probability) { CTX_T *data; MATRIX_T *freqs; char *symbol; int index; data = (CTX_T*)ctx; freqs = data->mscope.motif->freqs; // lookup letter ID symbol = (char*)rbtree_get(data->letter_lookup, letter_id); if (symbol == NULL) { local_error(data, "Probability is not allowed for unknown letter identifier \"%s\".\n", letter_id); return; } index = alph_indexc(data->alph, symbol[0]); if (index < 0) { local_error(data, "Probability is not allowed for non-core letter %c.\n", symbol[0]); return; } if (get_matrix_cell(data->current_pos, index, freqs) != -1) { local_error(data, "Probability for letter %c in position %d has already been set.\n", symbol[0], data->current_pos + 1); return; } set_matrix_cell(data->current_pos, index, probability, freqs); }
void read_regexp_file( char* filename, // Name of MEME file IN int* num_motifs, // Number of motifs retrieved OUT MOTIF_T* motifs // The retrieved motifs - NOT ALLOCATED! ) { FILE* motif_file; // MEME file containing the motifs. char motif_name[MAX_MOTIF_ID_LENGTH+1]; char motif_regexp[MAX_MOTIF_WIDTH]; ARRAY_T* these_freqs; MOTIF_T* m; int i; //Set things to the defaults. *num_motifs = 0; // Open the given MEME file. if (open_file(filename, "r", TRUE, "motif", "motifs", &motif_file) == 0) exit(1); //Set alphabet - ONLY supports dna. set_alphabet(verbosity, "ACGT"); while (fscanf(motif_file, "%s\t%s", motif_name, motif_regexp) == 2) { /* * Now we: * 1. Fill in new motif (preallocated) * 2. Assign name * 3. Convert regexp into frequency table. */ m = &(motifs[*num_motifs]); set_motif_id(motif_name, m); m->length = strlen(motif_regexp); /* Store the alphabet size in the motif. */ m->alph_size = get_alph_size(ALPH_SIZE); m->ambigs = get_alph_size(AMBIG_SIZE); /* Allocate memory for the matrix. */ m->freqs = allocate_matrix(m->length, get_alph_size(ALL_SIZE)); //Set motif frequencies here. 
for (i=0;i<strlen(motif_regexp);i++) { switch(toupper(motif_regexp[i])) { case 'A': set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs); break; case 'C': set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs); break; case 'G': set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs); break; case 'T': set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs); break; case 'U': set_matrix_cell(i,alphabet_index('U',get_alphabet(TRUE)),1,m->freqs); break; case 'R': //purines set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs); break; case 'Y': //pyramidines set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs); break; case 'K': //keto set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs); break; case 'M': //amino set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs); break; case 'S': //strong set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs); break; case 'W': //weak set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs); break; case 'B': set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs); break; case 'D': set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs); break; case 'H': 
set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs); break; case 'V': set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs); break; case 'N': set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs); break; } } /* Compute values for ambiguous characters. */ for (i = 0; i < m->length; i++) { these_freqs = get_matrix_row(i, m->freqs); fill_in_ambiguous_chars(FALSE, these_freqs); } /* Compute and store the motif complexity. */ m->complexity = compute_motif_complexity(m); //Move our pointer along to do the next motif. (*num_motifs)++; } }
/*********************************************************************** * Read TRANSFAC motifs from a TRANSFAC file. * Returns an arraylist of pointers to TRANSFAC_MOTIF_T ***********************************************************************/ ARRAYLST_T *read_motifs_from_transfac_file ( const char* transfac_filename // Name of TRANSFAC file or '-' for stdin IN ) { // Create dynamic storage for motifs ARRAYLST_T *motif_list = arraylst_create(); // Open the TRANFAC file for reading. FILE *transfac_file = NULL; if (open_file( transfac_filename, "r", TRUE, // Allow '-' for stdin "transfac file", "", &transfac_file ) == FALSE) { exit(1); } // Read and parse the TRANFAC file. int num_bases = 4; char *line = NULL; while ((line = getline2(transfac_file)) != NULL) { // Split the line into an initial tag and everything else. char *this_accession = split(line, ' '); char *tag = line; // Have we reached a new matrix? if (strcmp(tag, "AC") == 0) { trim(this_accession); char *this_id = NULL; char *this_name = NULL; char *this_descr = NULL; char *this_species = NULL; char this_consensus[MAX_CONSENSUS_LENGTH]; STRING_LIST_T *species_list = new_string_list(); // Old versions of TRANSFAC use pee-zero; new use pee-oh. while (strcmp(tag, "PO") != 0 && strcmp(tag, "P0") != 0) { line = getline2(transfac_file); if (line == NULL) { die ("Can't find PO line for TRANSFAC matrix %s.\n", this_accession); } char *data = split(line, ' '); if (data != NULL) { trim(data); } tag = line; // Store the id line. if (strcmp(tag, "ID") == 0) { this_id = strdup(data); } // Store the species line. else if (strcmp(tag, "BF") == 0) { add_string(data, species_list); } // Store the name line. else if (strcmp(tag, "NA") == 0) { this_name = strdup(data); } // Store the description line. 
else if (strcmp(tag, "DE") == 0) { this_descr = strdup(data); } } // Check how many positions in the motif // Mark current position in file fpos_t file_position; errno = 0; int status = fgetpos(transfac_file, &file_position); if (status) { die("Error reading file %s: %s", transfac_filename, strerror(errno)); } int num_motif_positions = 0; while (TRUE) { // Read till we reach the end of the counts or the end of the motif line = getline2(transfac_file); if (line == NULL) { break; } char *data = split(line, ' '); if (data != NULL) { trim(data); } tag = line; // Read till we reach the end of the counts or the end of the motif if ((strcmp(tag, "XX\n") == 0) || (strcmp(tag, "//\n") == 0)) { break; } ++num_motif_positions; } // Rewind file errno = 0; status = fsetpos(transfac_file, &file_position); if (status) { die("Error reading file %s: %s", transfac_filename, strerror(errno)); } // Read the motif counts. int num_seqs = 0; this_consensus[0] = 0; MATRIX_T *motif_counts = allocate_matrix(num_motif_positions, 4); int position = 0; while (TRUE) { line = getline2(transfac_file); if (line == NULL) { break; } char *data = split(line, ' '); if (data != NULL) { trim(data); } tag = line; // Look for the end of the motif. if ((strcmp(tag, "XX\n") == 0) || (strcmp(tag, "//\n") == 0)) { break; } position = atoi(tag); if (position > num_motif_positions) { die( "Error reading motif counts at position %d of motif %s in file %s", position, this_accession, transfac_filename ); } // Store the contents of this row. 
int count[4]; char consensus; sscanf( data, "%d %d %d %d %c", &(count[0]), &(count[1]), &(count[2]), &(count[3]), &consensus ); int i_base; for (i_base = 0; i_base < num_bases; i_base++) { set_matrix_cell(position - 1, i_base, count[i_base], motif_counts); } this_consensus[position - 1] = consensus; } this_consensus[position] = 0; TRANSFAC_MOTIF_T *motif = new_transfac_motif( this_accession, this_id, this_name, this_descr, this_consensus, species_list, motif_counts ); arraylst_add(motif, motif_list); } } fclose(transfac_file); return motif_list; }