/**************************************************************************
 * Copy a sequence object.  Memory must be freed by caller.
 *
 * The copy is created by allocate_seq() from the source's name,
 * description, offset and raw sequence.  The weight and the is_complete
 * flag are then carried over, and the intseq and gc arrays are duplicated
 * (one entry per sequence position) when the source has them.
 **************************************************************************/
SEQ_T* copy_sequence
  (SEQ_T* source_sequence)
{
  int pos;

  // Build the new object from the fields allocate_seq() accepts.
  SEQ_T* duplicate = allocate_seq(source_sequence->name,
                                  source_sequence->desc,
                                  source_sequence->offset,
                                  source_sequence->sequence);

  // Carry over the scalar fields that allocate_seq() does not take.
  duplicate->weight = source_sequence->weight;
  duplicate->is_complete = source_sequence->is_complete;

  // Duplicate the integer form of the sequence, if present.
  if (source_sequence->intseq != NULL) {
    duplicate->intseq =
      (int*)mm_malloc(sizeof(int) * source_sequence->length);
    pos = 0;
    while (pos < source_sequence->length) {
      duplicate->intseq[pos] = source_sequence->intseq[pos];
      pos++;
    }
  }

  // Duplicate the gc array, if present.
  if (source_sequence->gc != NULL) {
    duplicate->gc =
      (int*)mm_malloc(sizeof(int) * source_sequence->length);
    pos = 0;
    while (pos < source_sequence->length) {
      duplicate->gc[pos] = source_sequence->gc[pos];
      pos++;
    }
  }

  return(duplicate);
}
/**********************************************************************
  shuffle_sequence()

  Build a shuffled version of a sequence.

  A new sequence object with the original's name and offset and the
  description "shuffled" is stored in *target.  Its letters are drawn
  one at a time, without replacement, from a scratch copy of the
  original raw sequence.

  NOTE(review): the generator is seeded through my_srand() but letters
  are picked with rand(); confirm that both share the same state.
**********************************************************************/
void shuffle_sequence(
  SEQ_T* seq,         /* original sequence IN */
  unsigned int seed,  /* seed IN */
  SEQ_T** target      /* target sequence OUT */
){
  my_srand(seed);
  assert(*target==NULL);
  // When assertions are compiled out, a non-NULL target is released here.
  if (*target != NULL){
    free_seq(*target);
  }

  *target = allocate_seq(get_seq_name(seq),
                         "shuffled",
                         get_seq_offset(seq),
                         get_raw_sequence(seq));
  char *shuffled = get_raw_sequence(*target);

  /* Scratch copy of the original letters; it shrinks by one per draw. */
  char* pool = (char*)mm_calloc(get_seq_length(seq)+1,sizeof(char));
  strcpy(pool,get_raw_sequence(seq));
  pool[get_seq_length(seq)]='\0';

  int remaining = get_seq_length(seq);
  int filled = 0;
  while (remaining > 0) {
    // Pick one of the letters still in the pool.
    int pick = rand() % remaining;
    shuffled[filled] = pool[pick];
    filled++;
    // Close the gap left by the picked letter: slide the tail of the
    // pool (including its terminator) one position to the left.
    memmove(pool + pick, pool + pick + 1, (size_t)(remaining - pick));
    remaining--;
  }

  myfree(pool);
}
/******************************************************************************
 * This function allocates and initializes a SEQ_T object from a FASTA and
 * a prior reader. The prior reader is optional and may be null.
 *
 * The FASTA reader is advanced to its next sequence and the first segment
 * of raw sequence is read into the new object.  When a prior reader is
 * given it is advanced as well, its sequence name must match the FASTA
 * sequence name, and the first segment of priors is read into the same
 * object.  max_size is passed through to both segment readers.
 *
 * Calls die() if the prior reader has no further sequence or if the two
 * sequence names differ.
 *
 * Returns a pointer to a new SEQ_T object, or NULL if the FASTA reader
 * has no further sequence.
 *****************************************************************************/
SEQ_T *get_next_seq_from_readers(
  DATA_BLOCK_READER_T *fasta_reader,
  DATA_BLOCK_READER_T *prior_reader,
  size_t max_size
) {

  // Move to the next sequence in the fasta file.
  BOOLEAN_T got_seq = fasta_reader->go_to_next_sequence(fasta_reader);
  if (got_seq == FALSE) {
    // Reached EOF
    return NULL;
  }

  // The name buffer handed back by the reader is released below.
  char *fasta_seq_name = NULL;
  fasta_reader->get_seq_name(fasta_reader, &fasta_seq_name);
  size_t seq_offset = get_current_pos_from_seq_reader_from_fasta(fasta_reader);
  SEQ_T *sequence = allocate_seq(
    fasta_seq_name,
    NULL, // description
    seq_offset,
    NULL // raw sequence
  );

  // Read the first raw sequence segment into the sequence.
  read_one_fasta_segment_from_reader(
    fasta_reader,
    max_size,
    0, // No buffer offset on first segment
    sequence
  );

  // Move to the next sequence in the priors file.
  if (prior_reader) {

    BOOLEAN_T got_priors_seq = prior_reader->go_to_next_sequence(prior_reader);
    if (got_priors_seq == FALSE) {
      die("Unable to read sequence from priors file.");
    }

    // Check that the sequence name from the FASTA reader matches
    // the sequence name from the prior reader.
    char *prior_seq_name = NULL;
    prior_reader->get_seq_name(prior_reader, &prior_seq_name);
    if (strcmp(fasta_seq_name, prior_seq_name) != 0) {
      die(
        "Sequence named %s from prior reader did not "
        "match sequence name %s from fasta reader\n",
        prior_seq_name,
        fasta_seq_name
      );
    }

    // Read the first segment of priors data into the sequence.
    read_one_priors_segment_from_reader(
      prior_reader,
      max_size,
      0, // No buffer offset on first segment
      sequence
    );

    // The prior name was only needed for the comparison above; release it
    // the same way the FASTA name is released, so it is not leaked.
    myfree(prior_seq_name);
  }

  myfree(fasta_seq_name);

  return sequence;
}
/****************************************************************************
 * Extract a small alignment out of the middle of a larger alignment.
 *
 * start     - index of the first column to extract
 * width     - number of columns to extract
 * alignment - the alignment to extract from; it is not modified
 *
 * Returns a newly allocated alignment holding columns
 * [start, start + width) of every sequence and of the consensus string.
 * The name and description are taken from the original alignment.
 ****************************************************************************/
ALIGNMENT_T* extract_subalignment
  (int start,
   int width,
   ALIGNMENT_T* alignment)
{
  int num_sequences = get_num_aligned_sequences(alignment);
  SEQ_T** sequences = get_alignment_sequences(alignment);
  SEQ_T** subsequences = (SEQ_T**)mm_malloc(num_sequences * sizeof(SEQ_T*));

  // Extract the specified columns into a new list of sequences.
  // One scratch buffer is reused for every sequence.
  int i_seq = 0;
  char* subsequence = mm_malloc((width + 1) * sizeof(char));
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    SEQ_T* this_seq = sequences[i_seq];
    char* raw_seq = get_raw_sequence(this_seq);
    strncpy(subsequence, raw_seq + start, width);
    subsequence[width] = '\0';
    subsequences[i_seq] = allocate_seq(get_seq_name(this_seq),
                                       get_seq_description(this_seq),
                                       get_seq_offset(this_seq),
                                       subsequence);
  }

  // Extract the consensus string in the specified columns.
  char* consensus = get_consensus_string(alignment);
  char* subconsensus = mm_malloc(sizeof(char) * (width + 1));
  strncpy(subconsensus, consensus + start, width);
  subconsensus[width] = '\0';

  // Allocate and return the new alignment.
  ALIGNMENT_T* subalignment
    = allocate_alignment(get_alignment_name(alignment),
                         get_alignment_description(alignment),
                         num_sequences,
                         subsequences,
                         subconsensus);

  // Free local dynamic memory.  allocate_alignment() makes its own copies
  // of the sequences and of the consensus string, so every temporary
  // created here, including the sub-consensus buffer, must be released.
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    free_seq(subsequences[i_seq]);
  }
  myfree(subsequences);
  myfree(subsequence);
  myfree(subconsensus);

  return(subalignment);
}
/****************************************************************************
 * Build a consensus sequence for an alignment.
 *
 * For every column the most frequent character is determined (ties go to
 * the character with the lowest code).  If its frequency among the
 * aligned sequences is at least `threshold`, it becomes the consensus
 * character for that column; otherwise the column gets the gap
 * character '-'.
 *
 * Characters with a code of NUM_CHARS or above are not counted.
 *
 * Returns a newly allocated SEQ_T named "Consensus".
 ****************************************************************************/
SEQ_T* get_consensus_sequence(double threshold, ALIGNMENT_T* alignment) {

  char* seq_string = NULL;
  unsigned char c = 0;
  int most_freq_char = 0;
#define NUM_CHARS 127
  // Counts are held in int: a char counter would overflow as soon as a
  // column holds more than CHAR_MAX copies of one character.
  int char_counts[NUM_CHARS];
  int i = 0;
  int j = 0;
  int code = 0;
  double max_char_freq = 0.0;
  SEQ_T* consensus;

  assert(alignment != NULL);

  seq_string = mm_malloc(alignment->length * sizeof(char) + 1);
  if (seq_string == NULL) {
    die("Error allocating consensus sequence string\n");
  }

  // For each column in the alignment
  for (i = 0; i < alignment->length; i++) {

    most_freq_char = 0;
    memset(char_counts, 0, sizeof(char_counts));

    // Count character occurrences
    for (j = 0; j < alignment->num_sequences; j++) {
      c = get_seq_char(i, alignment->sequences[j]);
      // Guard the table: bytes outside its range must not be used as an
      // index.
      if (c < NUM_CHARS) {
        char_counts[c]++;
      }
    }

    // Find the index of the character that occurs the most frequently.
    // Only a strictly larger count replaces the current best, so the
    // lowest character code wins ties.
    for (code = 0; code < NUM_CHARS; code++) {
      if (char_counts[code] > char_counts[most_freq_char]) {
        most_freq_char = code;
      }
    }

    // If the most frequent character exceeds the threshold
    // it will be the consensus character.
    max_char_freq = (double) char_counts[most_freq_char]
      / (double) alignment->num_sequences;
    if (max_char_freq >= threshold) {
      seq_string[i] = (char) most_freq_char;
    }
    else {
      // Otherwise the consensus is the gap character
      seq_string[i] = '-';
    }
  }
  seq_string[i] = '\0';

  // allocate_seq() is handed the string and the local buffer is released
  // afterwards.
  consensus = allocate_seq("Consensus", "", 0, seq_string);
  if (seq_string != NULL) myfree(seq_string);

  return(consensus);
}
consensus_data * generate_consensus( char ** input_seq, unsigned int n_seq, unsigned min_cov, unsigned K, double min_idt) { unsigned int j; unsigned int seq_count; unsigned int aligned_seq_count; kmer_lookup * lk_ptr; seq_array sa_ptr; seq_addr_array sda_ptr; kmer_match * kmer_match_ptr; aln_range * arange; alignment * aln; align_tags_t ** tags_list; //char * consensus; consensus_data * consensus; double max_diff; max_diff = 1.0 - min_idt; seq_count = n_seq; //printf("XX n_seq %d\n", n_seq); //for (j=0; j < seq_count; j++) { // printf("seq_len: %u %u\n", j, strlen(input_seq[j])); //}; fflush(stdout); tags_list = calloc( seq_count, sizeof(align_tags_t *) ); lk_ptr = allocate_kmer_lookup( 1 << (K * 2) ); sa_ptr = allocate_seq( (seq_coor_t) strlen( input_seq[0]) ); sda_ptr = allocate_seq_addr( (seq_coor_t) strlen( input_seq[0]) ); add_sequence( 0, K, input_seq[0], strlen(input_seq[0]), sda_ptr, sa_ptr, lk_ptr); //mask_k_mer(1 << (K * 2), lk_ptr, 16); aligned_seq_count = 0; for (j=1; j < seq_count; j++) { //printf("seq_len: %ld %u\n", j, strlen(input_seq[j])); kmer_match_ptr = find_kmer_pos_for_seq(input_seq[j], strlen(input_seq[j]), K, sda_ptr, lk_ptr); #define INDEL_ALLOWENCE_0 6 arange = find_best_aln_range(kmer_match_ptr, K, K * INDEL_ALLOWENCE_0, 5); // narrow band to avoid aligning through big indels //printf("1:%ld %ld %ld %ld\n", arange_->s1, arange_->e1, arange_->s2, arange_->e2); //arange = find_best_aln_range2(kmer_match_ptr, K, K * INDEL_ALLOWENCE_0, 5); // narrow band to avoid aligning through big indels //printf("2:%ld %ld %ld %ld\n\n", arange->s1, arange->e1, arange->s2, arange->e2); #define INDEL_ALLOWENCE_1 0.10 if (arange->e1 - arange->s1 < 100 || arange->e2 - arange->s2 < 100 || abs( (arange->e1 - arange->s1 ) - (arange->e2 - arange->s2) ) > (int) (0.5 * INDEL_ALLOWENCE_1 * (arange->e1 - arange->s1 + arange->e2 - arange->s2))) { free_kmer_match( kmer_match_ptr); free_aln_range(arange); continue; } //printf("%ld %s\n", strlen(input_seq[j]), 
input_seq[j]); //printf("%ld %s\n\n", strlen(input_seq[0]), input_seq[0]); #define INDEL_ALLOWENCE_2 150 aln = align(input_seq[j]+arange->s1, arange->e1 - arange->s1 , input_seq[0]+arange->s2, arange->e2 - arange->s2 , INDEL_ALLOWENCE_2, 1); if (aln->aln_str_size > 500 && ((double) aln->dist / (double) aln->aln_str_size) < max_diff) { tags_list[aligned_seq_count] = get_align_tags( aln->q_aln_str, aln->t_aln_str, aln->aln_str_size, arange, j, 0); aligned_seq_count ++; } /*** for (k = 0; k < tags_list[j]->len; k++) { printf("%ld %d %c\n", tags_list[j]->align_tags[k].t_pos, tags_list[j]->align_tags[k].delta, tags_list[j]->align_tags[k].q_base); } ***/ free_aln_range(arange); free_alignment(aln); free_kmer_match( kmer_match_ptr); } if (aligned_seq_count > 0) { consensus = get_cns_from_align_tags( tags_list, aligned_seq_count, strlen(input_seq[0]), min_cov ); } else { // allocate an empty consensus sequence consensus = calloc( 1, sizeof(consensus_data) ); consensus->sequence = calloc( 1, sizeof(char) ); consensus->eqv = calloc( 1, sizeof(unsigned int) ); } //free(consensus); free_seq_addr_array(sda_ptr); free_seq_array(sa_ptr); free_kmer_lookup(lk_ptr); for (j=0; j < aligned_seq_count; j++) { free_align_tags(tags_list[j]); } free(tags_list); return consensus; }
SEQ_T *print_random_seqs ( FILE *out, // Stream to print on. int seed, // Random number seed. int nseqs, // Number of sequences to print. int min, // Minimum sequence length. int max, // Maximum sequence length. char **letters, // Array of letter strings. int r, // Number of letter strings. int c, // Length of letter strings. int order, // Order of Markov model. double *cum // Cumulative distribution(s) defining model. ) { int i, j; int n; // Length of sequence. char *buffer = NULL; // Buffer for sequences. char *id = NULL; // Sequence name. char first_letter = letters[0][0]; // First letter in alphabet. // Create the buffer for the string. mm_resize(buffer, max*(c-1)+1, char); /* set up random number generator */ if (seed != 0) srand48(seed); /* print random sequences */ for (i=0; i<nseqs; i++) { /* sequence */ // Decide length of sequence to print. n = (int) (min + drand48() * (max - min + 1)); // Print FASTA ID line. if (out != NULL) { fprintf(out, ">SEQ_%-d %d\n", i+1, n); } else { mm_resize(id, 50, char); sprintf(id, ">SEQ_%-d %d\n", i+1, n); } /* Generate letters by 1) random x ~ [0,1) 2) binary search of cum for cum[i-1]<x<=cum[i] 3) letter/codon is letters[i-1] */ for (j=0; j<n; j++) { // generate letters/codons double x = drand48(); // random number int lo = 0; int hi = r; int offset = 0; // Offset into cum array. if (order >= 1) { // Markov model. int start_ptr; // Find the offset into the cumulative prob array by looking // for the offset of the preceeding "order" characters. buffer[j] = first_letter; // Now contains index into array. buffer[j+1] = '\0'; start_ptr = j > order ? j-order : 0; // Start of index string. offset = s2i(buffer + start_ptr); //fprintf(stderr, "b: %s\n offset: %d x: %f\n", buffer+start_ptr, offset, x); } while (hi-lo > 1) { // binary search int mid = (lo+hi)/2; // midpoint if (x > cum[mid+offset]) { lo = mid; } else { hi = mid; } } //fprintf(stderr, "%11.8f %s r %d\n", x, letters[x<cum[lo+offset] ? 
lo : lo+1], r); //fprintf(stderr, "%s", letters[x<cum[lo+offset] ? lo : lo+1]); buffer[j] = letters[x<cum[lo+offset] ? lo : lo+1][0]; } /* generate letters/codons */ buffer[j] = '\0'; // Print the sequence. if (out != NULL) { for (j=0; j<n; j+=50) { fprintf(out, "%-50.50s\n", buffer+j); } } else { SEQ_T *seq = allocate_seq(id, "", 0, buffer); set_complete(TRUE, seq); myfree(buffer); return(seq); } } /* sequence */ myfree(buffer); return(NULL); } // print_random_seqs
/****************************************************************************
 * Remove from the alignment all columns that contain gaps for the
 * specified species.
 *
 * A column is dropped when the named species has '-' or '.' in it.
 * The remaining columns of every sequence and of the consensus string
 * are kept in their original order.
 *
 * Calls die() if the species is not part of the alignment.
 *
 * Returns a newly allocated alignment; the input alignment is not
 * modified.
 ****************************************************************************/
ALIGNMENT_T* remove_alignment_gaps
  (char*        species,
   ALIGNMENT_T* alignment)
{
  // Locate this species in the alignment.
  int species_index = get_index_in_string_list(species,
                                               get_species_names(alignment));
  if (species_index == -1) {
    die("Can't find %s in alignment.\n", species);
  }
  SEQ_T* this_seq = get_alignment_sequence(species_index, alignment);

  // Get the dimensions of the original matrix.
  int num_sequences = get_num_aligned_sequences(alignment);
  int alignment_length = get_alignment_length(alignment);

  // Allocate memory for raw sequences that will constitute the new alignment.
  // The buffers hold characters, so they are sized with sizeof(char);
  // being zero-filled, they stay terminated however many columns are kept.
  char** raw_sequences = (char**)mm_malloc(sizeof(char*) * num_sequences);
  int i_seq = 0;
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    raw_sequences[i_seq]
      = (char*)mm_calloc(alignment_length + 1, sizeof(char));
  }
  char* consensus = get_consensus_string(alignment);
  char* new_consensus
    = (char*)mm_calloc(alignment_length + 1, sizeof(char));

  // Iterate over all columns.
  int i_column;
  int i_raw = 0;   // Next free column in the output.
  for (i_column = 0; i_column < alignment_length; i_column++) {

    // Is there a gap?
    char this_char = get_seq_char(i_column, this_seq);
    if ((this_char != '-') && (this_char != '.')) {

      // If no gap, then copy over this column.
      for (i_seq = 0; i_seq < num_sequences; i_seq++) {
        SEQ_T* this_sequence = get_alignment_sequence(i_seq, alignment);
        char column_char = get_seq_char(i_column, this_sequence);

        raw_sequences[i_seq][i_raw] = column_char;
      }
      new_consensus[i_raw] = consensus[i_column];
      i_raw++;
    }
  }

  // Create new sequence objects.
  SEQ_T** new_sequences = (SEQ_T**)mm_malloc(num_sequences * sizeof(SEQ_T*));
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    SEQ_T* this_sequence = get_alignment_sequence(i_seq, alignment);
    new_sequences[i_seq] = allocate_seq(get_seq_name(this_sequence),
                                        get_seq_description(this_sequence),
                                        get_seq_offset(this_sequence),
                                        raw_sequences[i_seq]);
  }

  // Allocate and return the new alignment.
  ALIGNMENT_T* new_alignment
    = allocate_alignment(get_alignment_name(alignment),
                         get_alignment_description(alignment),
                         num_sequences,
                         new_sequences,
                         new_consensus);

  // Free local dynamic memory.
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    myfree(raw_sequences[i_seq]);
    free_seq(new_sequences[i_seq]);
  }
  myfree(raw_sequences);
  myfree(new_sequences);
  myfree(new_consensus);

  return(new_alignment);
}
/****************************************************************************
 * Allocate one alignment object. Name and description may be NULL.
 *
 * The name and description are copied into the object and truncated to
 * MAX_ALIGNMENT_NAME / MAX_ALIGNMENT_COMMENT characters (a warning is
 * printed when the name is truncated).  Every sequence is copied with
 * allocate_seq() and the consensus string is copied as well, so the
 * caller keeps ownership of everything passed in.
 *
 * All sequences are expected to have the same length as the first one;
 * this is checked with myassert().
 *
 * Returns a pointer to the newly created alignment.
 ****************************************************************************/
ALIGNMENT_T* allocate_alignment(
   char* name,
   char* description,
   int num_sequences,
   SEQ_T** sequences,
   char* consensus_string
)
{
  assert(num_sequences > 0);
  assert(sequences != NULL);
  assert(consensus_string != NULL);

  // Allocate the alignment object.
  ALIGNMENT_T* new_alignment = (ALIGNMENT_T*)mm_malloc(sizeof(ALIGNMENT_T));
  if (new_alignment == NULL) {
    die("Error allocating alignment\n");
  }

  // Store the name, truncating if necessary.
  if (name != NULL) {
    strncpy(new_alignment->name, name, MAX_ALIGNMENT_NAME);
    new_alignment->name[MAX_ALIGNMENT_NAME] = '\0';
    if (strlen(new_alignment->name) != strlen(name)) {
      fprintf(stderr, "Warning: truncating alignment program name %s to %s.\n",
              name, new_alignment->name);
    }
  } else {
    new_alignment->name[0] = '\0';
  }

  // Store the description, truncating if necessary.
  if (description != NULL) {
    strncpy(new_alignment->desc, description, MAX_ALIGNMENT_COMMENT);
    new_alignment->desc[MAX_ALIGNMENT_COMMENT] = '\0';
  } else {
    new_alignment->desc[0] = '\0';
  }

  // Store the sequences.
  new_alignment->sequences
    = (SEQ_T**) mm_malloc(num_sequences * sizeof(SEQ_T*));
  if (new_alignment->sequences == NULL) {
    die("Error allocating sequences\n");
  }
  new_alignment->num_sequences = num_sequences;
  int seq_length = strlen(get_raw_sequence(sequences[0]));
  int i;
  for (i = 0; i < num_sequences; i++) {
    // Take the length as an int once: the message below prints it with %d,
    // and passing the size_t result of strlen() there would not match the
    // format.
    int this_length = (int) strlen(get_raw_sequence(sequences[i]));
    // The message numbers sequences from 1, so the index is shifted too.
    myassert(TRUE,
             this_length == seq_length,
             "Sequence #1 (%s) is length=%d, but sequence #%d (%s) is length=%d.\n<%s>\n",
             get_seq_name(sequences[0]), seq_length, i + 1,
             get_seq_name(sequences[i]), this_length,
             get_raw_sequence(sequences[i]));
    new_alignment->sequences[i]
      = allocate_seq(get_seq_name(sequences[i]),
                     get_seq_description(sequences[i]),
                     get_seq_offset(sequences[i]),
                     get_raw_sequence(sequences[i]));
  }

  // Fill in the remaining fields.
  new_alignment->length = seq_length;
  copy_string(&(new_alignment->consensus_string), consensus_string);

  return(new_alignment);
}