int main (int argc, char * argv[]) { // Parse the command line. if (argc != 4) { fprintf(stderr, "Usage: reduce-alignment <start> <width> <alignment>\n"); exit(1); } int start_position = atoi(argv[1]); int width = atoi(argv[2]); char* alignment_filename = argv[3]; // Read the alignment. ALIGNMENT_T* big_alignment = read_alignment_from_file( alignment_filename, FALSE, FALSE, NULL // pointer to ref_seq_index, not used. ); fprintf(stderr, "Read alignment of %d sequences and %d columns.\n", get_num_aligned_sequences(big_alignment), get_alignment_length(big_alignment)); if (start_position + width > get_alignment_length(big_alignment)) { fprintf(stderr, "Invalid coordinates: %d + %d > %d.\n", start_position, width, get_alignment_length(big_alignment)); exit(1); } // Extract the smaller alignment. ALIGNMENT_T* small_alignment = extract_subalignment(start_position, width, big_alignment); fprintf(stderr, "Created alignment of %d sequences and %d columns.\n", get_num_aligned_sequences(small_alignment), get_alignment_length(small_alignment)); // Print the alignment. print_clustalw(stdout, FALSE, small_alignment); // Free locally allocated memory. free_alignment(big_alignment); free_alignment(small_alignment); return(0); }
/*
 * Measure the wall-clock time spent running one alignment routine
 * internal_iterations times in a row.
 *
 * type                 SW selects sw_align; any other value selects nw_align.
 * query, hit_count,
 * bit_width            forwarded unchanged to the selected routine.
 * internal_iterations  number of back-to-back runs to time; zero or a
 *                      negative value runs nothing.
 *
 * Each run is invoked with COMPUTE_SCORE and its result is released
 * immediately with free_alignment(). Returns the elapsed time in seconds
 * as measured by gettimeofday().
 */
double run_alignment_times( int type, p_query query, size_t hit_count, int bit_width, int internal_iterations ) {
    /* nw_align is used for every type other than SW. */
    p_alignment_list (*aligner)( p_query, size_t, int, int ) = &nw_align;
    if( type == SW ) {
        aligner = &sw_align;
    }

    struct timeval t_begin;
    struct timeval t_end;

    gettimeofday( &t_begin, NULL );

    int round = 0;
    while( round < internal_iterations ) {
        p_alignment_list result = aligner( query, hit_count, bit_width, COMPUTE_SCORE );
        free_alignment( result );
        ++round;
    }

    gettimeofday( &t_end, NULL );

    /* Whole seconds plus the microsecond remainder scaled to seconds. */
    double whole_seconds = (t_end.tv_sec - t_begin.tv_sec);
    double fraction = (t_end.tv_usec - t_begin.tv_usec) / 1000000.0;

    return whole_seconds + fraction;
}
/****************************************************************************
 * Read an alignment from a file. Sort the sequences by sequence name if
 * requested. Remove all gap sequences if requested.
 *
 * filename            Passed to read_alignment_from_clustalw_file().
 * sort                When true, the alignment's sequences are reordered to
 *                     follow the sorted list of species names.
 * remove_allgap_seqs  When true, the alignment is passed through
 *                     remove_allgap_sequences(); if that yields a different
 *                     alignment object the one read from file is freed.
 * ref_seq_index       IN/OUT, consulted only when sort is true. On entry the
 *                     index of the reference sequence before sorting; on
 *                     return its index in the sorted name list. May be NULL
 *                     when the caller does not track a reference sequence.
 *
 * Returns the resulting alignment; callers release it with
 * free_alignment().
 ****************************************************************************/
ALIGNMENT_T* read_alignment_from_file
  (char *filename,
   BOOLEAN_T sort,
   BOOLEAN_T remove_allgap_seqs,
   int* ref_seq_index
  )
{
  int i;

  // Read the sequences.
  ALIGNMENT_T* alignment = read_alignment_from_clustalw_file(filename);

  if (sort) {

    // Create a temporary array to hold sorted sequence pointers.
    int num_sequences = get_num_aligned_sequences(alignment);
    SEQ_T** sequences = (SEQ_T**) mm_malloc(num_sequences * sizeof(SEQ_T*));

    // Sort the sequences by name.
    // NOTE(review): alignment_species is not released in this function;
    // confirm whether get_species_names() hands ownership to the caller.
    STRING_LIST_T* alignment_species = get_species_names(alignment);

    // Store the name of the reference sequence. Callers that do not track
    // a reference sequence pass NULL, so only look it up when provided.
    char *ref_name = NULL;
    if (ref_seq_index != NULL) {
      ref_name = get_nth_string(*ref_seq_index, alignment_species);
    }

    sort_string_list(alignment_species); 	// keep species alphabetical
    for (i=0; i<num_sequences; i++) {
      char *name = get_nth_string(i, alignment_species);
      sequences[i] = get_alignment_sequence_by_name(name, alignment);
    }

    // Replace the alignment's sequence array with the sorted one.
    myfree(alignment->sequences);
    alignment->sequences = sequences;

    // Find the new index of the reference sequence.
    if (ref_seq_index != NULL) {
      *ref_seq_index = get_index_in_string_list(ref_name, alignment_species);
    }
  }

  if (remove_allgap_seqs) {
    ALIGNMENT_T* new_alignment = remove_allgap_sequences(alignment);
    // Only free the original when a distinct alignment came back.
    if (new_alignment != alignment) {
      free_alignment(alignment);
      alignment = new_alignment;
    }
  }

  return(alignment);
} // read_alignment_from_file
/*
 * Build a consensus for a reference sequence from reads whose placement on
 * the reference is already known.
 *
 * input_seq  Array of n_seq NUL-terminated sequences. input_seq[0] is the
 *            reference; input_seq[1..n_seq-1] are the reads.
 * offset     offset[j] is the placement of read j on the reference. A
 *            negative value means the read starts before the reference, so
 *            its first -offset[j] bases are skipped. NOTE: for such reads
 *            offset[j] is overwritten with 0 in the caller's array.
 * n_seq      Number of entries in input_seq (and offset).
 * min_cov    Unused here; get_cns_from_align_tags() is always called with 0.
 * K          Unused here.
 * min_idt    A read contributes only if dist / aln_str_size of its
 *            alignment is below 1.0 - min_idt.
 *
 * Reads overlapping the reference by fewer than 128 bases are skipped, as
 * are alignments whose aligned string is not longer than 500.
 *
 * Returns the consensus produced by get_cns_from_align_tags(). When n_seq
 * is 0 or the reference is missing, an empty consensus (zeroed struct with
 * one-element sequence/eqv buffers) is returned instead. Returns NULL if
 * the working buffers cannot be allocated.
 */
consensus_data * generate_utg_consensus( char ** input_seq,
                           seq_coor_t *offset,
                           unsigned int n_seq,
                           unsigned min_cov,
                           unsigned K,
                           double min_idt) {

    /* Minimum number of overlapping bases for a read to be aligned. */
    const int min_overlap = 128;
    /* Fifth argument of align(). NOTE(review): its exact meaning is defined
     * by align(), which is not visible here. */
    const int align_allowance = 500;
    /* Alignments must produce an aligned string longer than this. */
    const int min_aln_size = 500;

    unsigned int j;
    unsigned int aligned_seq_count = 0;
    aln_range * arange;
    alignment * aln;
    align_tags_t ** tags_list;
    consensus_data * consensus;
    double max_diff = 1.0 - min_idt;
    size_t ref_len;
    size_t read_len;
    seq_coor_t utg_len;
    seq_coor_t r_len;
    seq_coor_t overlap;
    seq_coor_t span;

    (void) min_cov;
    (void) K;

    /* Without a reference there is nothing to align against; reading
     * input_seq[0] would be out of bounds. */
    if (n_seq == 0 || input_seq == NULL || input_seq[0] == NULL) {
        consensus = calloc( 1, sizeof(consensus_data) );
        if (consensus != NULL) {
            consensus->sequence = calloc( 1, sizeof(char) );
            consensus->eqv = calloc( 1, sizeof(unsigned int) );
        }
        return consensus;
    }

    tags_list = calloc( (size_t) n_seq + 1, sizeof(align_tags_t *) );
    arange = calloc( 1, sizeof(aln_range) );
    if (tags_list == NULL || arange == NULL) {
        free(arange);
        free(tags_list);
        return NULL;
    }

    /* The reference length is needed repeatedly; compute it once. */
    ref_len = strlen(input_seq[0]);
    utg_len = ref_len;

    /* The reference is tagged against itself first, so at least one tag
     * set always reaches get_cns_from_align_tags(). */
    arange->s1 = 0;
    arange->e1 = ref_len;
    arange->s2 = 0;
    arange->e2 = ref_len;
    tags_list[aligned_seq_count] = get_align_tags( input_seq[0], input_seq[0],
                                                   ref_len, arange, 0, 0);
    aligned_seq_count += 1;

    for (j = 1; j < n_seq; j++) {
        read_len = strlen(input_seq[j]);
        arange->s1 = 0;
        arange->e1 = read_len - 1;
        arange->s2 = 0;
        arange->e2 = read_len - 1;
        r_len = read_len;

        if (offset[j] < 0) {
            /* Read starts before the reference: drop its leading
             * -offset[j] bases and align the rest from reference start. */
            overlap = r_len + offset[j];
            if (overlap < min_overlap) {
                continue;
            }
            span = (overlap < utg_len) ? overlap : utg_len;
            aln = align(input_seq[j] - offset[j], span,
                        input_seq[0], span,
                        align_allowance, 1);
            /* The trimmed read now starts at reference position 0. */
            offset[j] = 0;
        } else {
            /* Read starts inside the reference at offset[j]. */
            if (offset[j] > utg_len - min_overlap) {
                continue;
            }
            if (offset[j] + r_len > utg_len) {
                /* Read runs past the reference end: clip to the end. */
                span = utg_len - offset[j];
            } else {
                span = r_len;
            }
            aln = align(input_seq[j], span,
                        input_seq[0] + offset[j], span,
                        align_allowance, 1);
        }

        /* Keep only alignments that are long enough and similar enough. */
        if (aln->aln_str_size > min_aln_size &&
            ((double) aln->dist / (double) aln->aln_str_size) < max_diff) {
            tags_list[aligned_seq_count] = get_align_tags( aln->q_aln_str, aln->t_aln_str,
                                                           aln->aln_str_size, arange, j,
                                                           offset[j]);
            aligned_seq_count ++;
        }
        free_alignment(aln);
    }
    free_aln_range(arange);

    consensus = get_cns_from_align_tags( tags_list, aligned_seq_count, utg_len, 0 );

    for (j = 0; j < aligned_seq_count; j++) {
        free_align_tags(tags_list[j]);
    }
    free(tags_list);
    return consensus;
}
consensus_data * generate_consensus( char ** input_seq, unsigned int n_seq, unsigned min_cov, unsigned K, double min_idt) { unsigned int j; unsigned int seq_count; unsigned int aligned_seq_count; kmer_lookup * lk_ptr; seq_array sa_ptr; seq_addr_array sda_ptr; kmer_match * kmer_match_ptr; aln_range * arange; alignment * aln; align_tags_t ** tags_list; //char * consensus; consensus_data * consensus; double max_diff; max_diff = 1.0 - min_idt; seq_count = n_seq; //printf("XX n_seq %d\n", n_seq); //for (j=0; j < seq_count; j++) { // printf("seq_len: %u %u\n", j, strlen(input_seq[j])); //}; fflush(stdout); tags_list = calloc( seq_count, sizeof(align_tags_t *) ); lk_ptr = allocate_kmer_lookup( 1 << (K * 2) ); sa_ptr = allocate_seq( (seq_coor_t) strlen( input_seq[0]) ); sda_ptr = allocate_seq_addr( (seq_coor_t) strlen( input_seq[0]) ); add_sequence( 0, K, input_seq[0], strlen(input_seq[0]), sda_ptr, sa_ptr, lk_ptr); //mask_k_mer(1 << (K * 2), lk_ptr, 16); aligned_seq_count = 0; for (j=1; j < seq_count; j++) { //printf("seq_len: %ld %u\n", j, strlen(input_seq[j])); kmer_match_ptr = find_kmer_pos_for_seq(input_seq[j], strlen(input_seq[j]), K, sda_ptr, lk_ptr); #define INDEL_ALLOWENCE_0 6 arange = find_best_aln_range(kmer_match_ptr, K, K * INDEL_ALLOWENCE_0, 5); // narrow band to avoid aligning through big indels //printf("1:%ld %ld %ld %ld\n", arange_->s1, arange_->e1, arange_->s2, arange_->e2); //arange = find_best_aln_range2(kmer_match_ptr, K, K * INDEL_ALLOWENCE_0, 5); // narrow band to avoid aligning through big indels //printf("2:%ld %ld %ld %ld\n\n", arange->s1, arange->e1, arange->s2, arange->e2); #define INDEL_ALLOWENCE_1 0.10 if (arange->e1 - arange->s1 < 100 || arange->e2 - arange->s2 < 100 || abs( (arange->e1 - arange->s1 ) - (arange->e2 - arange->s2) ) > (int) (0.5 * INDEL_ALLOWENCE_1 * (arange->e1 - arange->s1 + arange->e2 - arange->s2))) { free_kmer_match( kmer_match_ptr); free_aln_range(arange); continue; } //printf("%ld %s\n", strlen(input_seq[j]), 
input_seq[j]); //printf("%ld %s\n\n", strlen(input_seq[0]), input_seq[0]); #define INDEL_ALLOWENCE_2 150 aln = align(input_seq[j]+arange->s1, arange->e1 - arange->s1 , input_seq[0]+arange->s2, arange->e2 - arange->s2 , INDEL_ALLOWENCE_2, 1); if (aln->aln_str_size > 500 && ((double) aln->dist / (double) aln->aln_str_size) < max_diff) { tags_list[aligned_seq_count] = get_align_tags( aln->q_aln_str, aln->t_aln_str, aln->aln_str_size, arange, j, 0); aligned_seq_count ++; } /*** for (k = 0; k < tags_list[j]->len; k++) { printf("%ld %d %c\n", tags_list[j]->align_tags[k].t_pos, tags_list[j]->align_tags[k].delta, tags_list[j]->align_tags[k].q_base); } ***/ free_aln_range(arange); free_alignment(aln); free_kmer_match( kmer_match_ptr); } if (aligned_seq_count > 0) { consensus = get_cns_from_align_tags( tags_list, aligned_seq_count, strlen(input_seq[0]), min_cov ); } else { // allocate an empty consensus sequence consensus = calloc( 1, sizeof(consensus_data) ); consensus->sequence = calloc( 1, sizeof(char) ); consensus->eqv = calloc( 1, sizeof(unsigned int) ); } //free(consensus); free_seq_addr_array(sda_ptr); free_seq_array(sa_ptr); free_kmer_lookup(lk_ptr); for (j=0; j < aligned_seq_count; j++) { free_align_tags(tags_list[j]); } free(tags_list); return consensus; }
/**************************************************************************** * Return a list containing the empirical column frequency distributions * for all alignments in the input. * * Each file in the list of filenames is read and the species list is * determined. The counts of each occurring column are tallied. * All files with the same species lists get their counts combined. * * The returned list contains one distribution per species list that * occurs in some alignment. ****************************************************************************/ OBJECT_LIST_T* get_alignment_column_freqs_list (ALPH_T alph, STRING_LIST_T* filenames, BOOLEAN_T remove_allgap_seqs) { int file_index; int num_filenames = get_num_strings(filenames); ARRAY_T* alignment_column_freqs = NULL; OBJECT_LIST_T* alignment_column_freqs_list = new_object_list(equal_string_lists, (void*)copy_string_list, free_string_list, free_array); // Consider each alignment in turn. for(file_index = 0; file_index < num_filenames; file_index++) { char* filename = get_nth_string(file_index, filenames); if (verbosity >= NORMAL_VERBOSE && !(file_index % 1)) { fprintf( stderr, "Computing column freqs: alignment file number %d of %d total files.\n", file_index+1, num_filenames ); } // Read the alignment int ref_seq_index = 0; ALIGNMENT_T* alignment = read_alignment_from_file(filename, TRUE, remove_allgap_seqs, &ref_seq_index); STRING_LIST_T* alignment_species = get_species_names(alignment); // Try to retrieve the counts so far for this list of species. alignment_column_freqs = (ARRAY_T*)retrieve_object( alignment_species, alignment_column_freqs_list ); // Found counts for current species list? if (alignment_column_freqs) { // Add counts from current alignment. (void) build_alignment_column_counts(alph, alignment, alignment_column_freqs); // Note: objects in lists are references, so no need to re-store // after modification. } // Didn't find counts for this species list, so create new array of counts. 
else { alignment_column_freqs = build_alignment_column_counts(alph, alignment, NULL); store_object( (void*)alignment_column_freqs, (void*)alignment_species, 0.0, // Score alignment_column_freqs_list ); } // free space used by alignment free_alignment(alignment); } // each filename fprintf(stderr, "\n"); // Convert counts to frequencies by retrieving each array of counts // and dividing by the total counts for that list of species. while ( (alignment_column_freqs = retrieve_next_object(alignment_column_freqs_list) ) != NULL ) { int i; int num_freqs = get_array_length(alignment_column_freqs); double total_counts; // Get total counts. for (i=total_counts=0; i<num_freqs; i++) { total_counts += get_array_item(i, alignment_column_freqs); } // Get frequencies. for (i=0; i<num_freqs; i++) { double f = get_array_item(i, alignment_column_freqs); set_array_item(i, f/total_counts, alignment_column_freqs); #ifdef DEBUG int asize = alph_size(alph, ALPH_SIZE); int num_leaves = NINT(log(num_freqs)/log(asize)); char* alignment_col = mm_malloc((num_leaves + 1) * sizeof(char)); unhash_alignment_col( alph, i, //col_index alignment_col, num_leaves ); printf("%s %g %g\n", alignment_col, f, f/total_counts); myfree(alignment_col); #endif } // get frequencies } // while more species lists return(alignment_column_freqs_list); } // get_alignment_column_freqs_list