/****************************************************************************
 * Extract a small alignment out of the middle of a larger alignment.
 *
 * Copies up to `width` columns, beginning at column `start`, from every
 * sequence in `alignment` and from its consensus string, and builds a new
 * alignment from those pieces.  The name, description, and per-sequence
 * name/description/offset are carried over unchanged.
 *
 * start     - index of the first column to copy.
 * width     - number of columns to copy.
 * alignment - the alignment to extract from; it is not modified.
 *
 * Returns the alignment produced by allocate_alignment.
 *
 * NOTE(review): `start` and `width` are not validated here; the caller is
 * assumed to pass a range that lies inside the alignment -- confirm.
 ****************************************************************************/
ALIGNMENT_T* extract_subalignment
  (int start,
   int width,
   ALIGNMENT_T* alignment)
{
  int num_sequences = get_num_aligned_sequences(alignment);
  SEQ_T** sequences = get_alignment_sequences(alignment);
  SEQ_T** subsequences = (SEQ_T**)mm_malloc(num_sequences * sizeof(SEQ_T*));

  // Extract the specified columns into a new list of sequences.  A single
  // scratch buffer is reused for every sequence.
  int i_seq = 0;
  char* subsequence = mm_malloc((width + 1) * sizeof(char));
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    SEQ_T* this_seq = sequences[i_seq];
    char* raw_seq = get_raw_sequence(this_seq);
    strncpy(subsequence, raw_seq + start, width);
    subsequence[width] = '\0'; // strncpy does not terminate a full-width copy.
    subsequences[i_seq] = allocate_seq(get_seq_name(this_seq),
                                       get_seq_description(this_seq),
                                       get_seq_offset(this_seq),
                                       subsequence);
  }

  // Extract the consensus string in the specified columns.
  char* consensus = get_consensus_string(alignment);
  char* subconsensus = mm_malloc(sizeof(char) * (width + 1));
  strncpy(subconsensus, consensus + start, width);
  subconsensus[width] = '\0';

  // Allocate the new alignment.
  ALIGNMENT_T* subalignment =
    allocate_alignment(get_alignment_name(alignment),
                       get_alignment_description(alignment),
                       num_sequences,
                       subsequences,
                       subconsensus);

  // Free local dynamic memory.  allocate_alignment builds its own sequence
  // objects and stores the consensus via copy_string, so every temporary
  // created above -- including the consensus buffer -- is released here.
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    free_seq(subsequences[i_seq]);
  }
  myfree(subsequences);
  myfree(subsequence);
  myfree(subconsensus);

  return(subalignment);
}
void ramen_scan_sequences() { FILE* seq_file = NULL; MOTIF_T* motif = NULL; MOTIF_T* rev_motif = NULL; SEQ_T* sequence = NULL; SCANNED_SEQUENCE_T* scanned_seq = NULL; PATTERN_T* pattern; int i; int j; SEQ_T** seq_list; int num_seqs; int seq_len; //For the bdb_bg mode: ARRAY_T* seq_bg_freqs; double atcontent; double roundatcontent; double avg_seq_length = 0; //Open the file. if (open_file(args.sequence_filename, "r", FALSE, "FASTA", "sequences", &seq_file) == 0) { fprintf(stderr, "Couldn't open the file %s.\n", args.sequence_filename); ramen_terminate(1); } //Start reading in the sequences read_many_fastas(ramen_alph, seq_file, MAX_SEQ_LENGTH, &num_seqs, &seq_list); seq_ids = new_string_list(); seq_fscores = allocate_array(num_seqs); //Allocate the required space for results results = malloc(sizeof(double*) * motifs.num); for (i=0;i<motifs.num;i++) { results[i] = malloc(sizeof(double)*num_seqs); } for (j=0;j<num_seqs;j++) { fprintf(stderr, "\rScanning %i of %i sequences...", j+1, num_seqs); //copy the pointer into our current object for clarity sequence = seq_list[j]; //Read the fluorescence data from the description field. add_string(get_seq_name(sequence),seq_ids); seq_len = get_seq_length(sequence); set_array_item(j,atof(get_seq_description(sequence)),seq_fscores); //Scan with each motif. for (i=0;i<motifs.num;i++) { int motifindex = i*2; results[i][j] = ramen_sequence_scan(sequence, motif_at(motifs.motifs, motifindex), motif_at(motifs.motifs, motifindex+1), NULL, NULL, //No need to pass PSSM. 
AVG_ODDS, 0, TRUE, 0, motifs.bg_freqs); if (TRUE == args.linreg_normalise) { int k; double maxscore = 1; motif = motif_at(motifs.motifs,motifindex); for (k=0;k<get_motif_length(motif);k++) { double maxprob = 0; if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'A'), get_motif_freqs(motif))) maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'A'), get_motif_freqs(motif)); if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'C'), get_motif_freqs(motif))) maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'C'), get_motif_freqs(motif)); if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'G'), get_motif_freqs(motif))) maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'G'), get_motif_freqs(motif)); if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'T'), get_motif_freqs(motif))) maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'T'), get_motif_freqs(motif)); maxscore *= maxprob; } results[i][j] /= maxscore; } } } }
/****************************************************************************
 * Remove from the alignment all columns that contain gaps for the
 * specified species.
 *
 * A column is dropped when the named species has '-' or '.' at that
 * position; every other column is copied, for all sequences and for the
 * consensus string, into a new alignment.  The input alignment is only
 * read through its accessors.
 *
 * species   - name looked up in get_species_names(alignment); die() is
 *             called if it is not found.
 * alignment - the alignment to filter.
 *
 * Returns the alignment produced by allocate_alignment.
 ****************************************************************************/
ALIGNMENT_T* remove_alignment_gaps
  (char* species,
   ALIGNMENT_T* alignment)
{
  // Locate this species in the alignment.
  int species_index = get_index_in_string_list(species,
                                               get_species_names(alignment));
  if (species_index == -1) {
    die("Can't find %s in alignment.\n", species);
  }
  SEQ_T* this_seq = get_alignment_sequence(species_index, alignment);

  // Get the dimensions of the original matrix.
  int num_sequences = get_num_aligned_sequences(alignment);
  int alignment_length = get_alignment_length(alignment);

  // Allocate memory for raw sequences that will constitute the new
  // alignment.  The buffers are zero-filled and one char longer than the
  // longest possible result, so they stay terminated however many columns
  // end up being kept.
  char** raw_sequences = (char**)mm_malloc(sizeof(char*) * num_sequences);
  int i_seq = 0;
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    raw_sequences[i_seq] = (char*)mm_calloc(alignment_length + 1, sizeof(char));
  }
  char* consensus = get_consensus_string(alignment);
  char* new_consensus = (char*)mm_calloc(alignment_length + 1, sizeof(char));

  // Iterate over all columns.  i_raw is the next free column in the output.
  int i_column;
  int i_raw = 0;
  for (i_column = 0; i_column < alignment_length; i_column++) {

    // Is there a gap in the reference species?
    char ref_char = get_seq_char(i_column, this_seq);
    if ((ref_char != '-') && (ref_char != '.')) {

      // If no gap, then copy over this column.
      for (i_seq = 0; i_seq < num_sequences; i_seq++) {
        SEQ_T* this_sequence = get_alignment_sequence(i_seq, alignment);
        raw_sequences[i_seq][i_raw] = get_seq_char(i_column, this_sequence);
      }
      new_consensus[i_raw] = consensus[i_column];
      i_raw++;
    }
  }

  // Create new sequence objects.
  SEQ_T** new_sequences = (SEQ_T**)mm_malloc(num_sequences * sizeof(SEQ_T*));
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    SEQ_T* this_sequence = get_alignment_sequence(i_seq, alignment);
    new_sequences[i_seq] = allocate_seq(get_seq_name(this_sequence),
                                        get_seq_description(this_sequence),
                                        get_seq_offset(this_sequence),
                                        raw_sequences[i_seq]);
  }

  // Allocate the new alignment.
  ALIGNMENT_T* new_alignment =
    allocate_alignment(get_alignment_name(alignment),
                       get_alignment_description(alignment),
                       num_sequences,
                       new_sequences,
                       new_consensus);

  // Free local dynamic memory.
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    myfree(raw_sequences[i_seq]);
    free_seq(new_sequences[i_seq]);
  }
  myfree(raw_sequences);
  myfree(new_sequences);
  myfree(new_consensus);

  return(new_alignment);
}
/****************************************************************************
 * Allocate one alignment object.  Name and description may be NULL.
 *
 * name             - alignment name, or NULL for an empty name.  At most
 *                    MAX_ALIGNMENT_NAME characters are kept; a warning is
 *                    written to stderr if the name had to be shortened.
 * description      - alignment description, or NULL for an empty one.  At
 *                    most MAX_ALIGNMENT_COMMENT characters are kept.
 * num_sequences    - number of entries in `sequences`; must be > 0.
 * sequences        - the aligned sequences; must not be NULL.  A new
 *                    sequence object is created from each one with
 *                    allocate_seq, and all are expected to have the same
 *                    raw length as the first.
 * consensus_string - consensus for the alignment; must not be NULL.  It is
 *                    stored with copy_string.
 *
 * Returns a pointer to the newly created alignment.
 ****************************************************************************/
ALIGNMENT_T* allocate_alignment(
  char* name,
  char* description,
  int num_sequences,
  SEQ_T** sequences,
  char* consensus_string
)
{
  assert(num_sequences > 0);
  assert(sequences != NULL);
  assert(consensus_string != NULL);

  // Allocate the alignment object.
  ALIGNMENT_T* new_alignment = (ALIGNMENT_T*)mm_malloc(sizeof(ALIGNMENT_T));
  if (new_alignment == NULL) {
    die("Error allocating alignment\n");
  }

  // Store the name, truncating if necessary.
  // NOTE(review): writing index MAX_ALIGNMENT_NAME assumes the name field
  // holds MAX_ALIGNMENT_NAME + 1 chars -- confirm against the struct.
  if (name != NULL) {
    strncpy(new_alignment->name, name, MAX_ALIGNMENT_NAME);
    new_alignment->name[MAX_ALIGNMENT_NAME] = '\0';
    if (strlen(new_alignment->name) != strlen(name)) {
      fprintf(stderr, "Warning: truncating alignment program name %s to %s.\n",
              name, new_alignment->name);
    }
  } else {
    new_alignment->name[0] = '\0';
  }

  // Store the description, truncating if necessary.
  // NOTE(review): same buffer-size assumption as for the name -- confirm.
  if (description != NULL) {
    strncpy(new_alignment->desc, description, MAX_ALIGNMENT_COMMENT);
    new_alignment->desc[MAX_ALIGNMENT_COMMENT] = '\0';
  } else {
    new_alignment->desc[0] = '\0';
  }

  // Store the sequences.
  new_alignment->sequences = (SEQ_T**) mm_malloc(num_sequences * sizeof(SEQ_T*));
  if (new_alignment->sequences == NULL) {
    die("Error allocating sequences\n");
  }
  new_alignment->num_sequences = num_sequences;

  // The first sequence fixes the alignment length; all others must match.
  int seq_length = (int)strlen(get_raw_sequence(sequences[0]));
  int i;
  for (i = 0; i < num_sequences; i++) {
    char* raw_seq = get_raw_sequence(sequences[i]);

    // Lengths are narrowed to int so they match the %d conversions below
    // (strlen returns size_t), and the offending sequence is reported
    // 1-based to agree with the "Sequence #1" wording for the reference.
    int this_length = (int)strlen(raw_seq);
    myassert(TRUE,
             this_length == seq_length,
             "Sequence #1 (%s) is length=%d, but sequence #%d (%s) is length=%d.\n<%s>\n",
             get_seq_name(sequences[0]), seq_length,
             i + 1, get_seq_name(sequences[i]), this_length,
             raw_seq);

    new_alignment->sequences[i] =
      allocate_seq(get_seq_name(sequences[i]),
                   get_seq_description(sequences[i]),
                   get_seq_offset(sequences[i]),
                   raw_seq);
  }

  // Fill in the remaining fields.
  new_alignment->length = seq_length;
  copy_string(&(new_alignment->consensus_string), consensus_string);

  return(new_alignment);
}