/*
 * reduce-alignment <start> <width> <alignment>
 *
 * Reads the alignment stored in <alignment>, extracts the sub-alignment that
 * begins at column <start> and spans <width> columns, and prints it to stdout
 * with print_clustalw.  Progress and error messages go to stderr.
 *
 * Exits with status 1 on a usage error or when the requested window does not
 * fit inside the alignment; returns 0 on success.
 */
int main
  (int    argc,
   char * argv[])
{

  // Parse the command line.
  if (argc != 4) {
    fprintf(stderr, "Usage: reduce-alignment <start> <width> <alignment>\n");
    exit(1);
  }
  int start_position = atoi(argv[1]);
  int width = atoi(argv[2]);
  char* alignment_filename = argv[3];

  // Reject negative coordinates before doing any work.  Without this check
  // a negative start or width slips through the upper-bound test below and
  // is handed to extract_subalignment.
  if (start_position < 0 || width < 0) {
    fprintf(stderr,
            "Invalid coordinates: start (%d) and width (%d) must be non-negative.\n",
            start_position, width);
    exit(1);
  }

  // Read the alignment.
  ALIGNMENT_T* big_alignment = read_alignment_from_file(
    alignment_filename,
    FALSE,
    FALSE,
    NULL // pointer to ref_seq_index, not used.
  );
  int alignment_length = get_alignment_length(big_alignment);
  fprintf(stderr, "Read alignment of %d sequences and %d columns.\n",
          get_num_aligned_sequences(big_alignment),
          alignment_length);

  // Written as a subtraction so that the test cannot overflow: both operands
  // are known to be non-negative at this point.
  if (start_position > alignment_length - width) {
    fprintf(stderr, "Invalid coordinates: %d + %d > %d.\n",
            start_position, width, alignment_length);
    exit(1);
  }

  // Extract the smaller alignment.
  ALIGNMENT_T* small_alignment = extract_subalignment(start_position,
                                                      width,
                                                      big_alignment);
  fprintf(stderr, "Created alignment of %d sequences and %d columns.\n",
          get_num_aligned_sequences(small_alignment),
          get_alignment_length(small_alignment));

  // Print the alignment.
  print_clustalw(stdout, FALSE, small_alignment);

  // Free locally allocated memory.
  free_alignment(big_alignment);
  free_alignment(small_alignment);

  return(0);
}
예제 #2
0
/*
 * Time repeated runs of one alignment kernel.
 *
 * type selects the kernel: SW picks sw_align, any other value picks nw_align.
 * The selected function is invoked internal_iterations times with
 * (query, hit_count, bit_width, COMPUTE_SCORE); every returned list is passed
 * straight to free_alignment.
 *
 * Returns the time in seconds between two gettimeofday() samples taken
 * immediately before and after the loop.
 */
double run_alignment_times( int type, p_query query, size_t hit_count, int bit_width, int internal_iterations ) {
    p_alignment_list (*kernel)( p_query, size_t, int, int ) =
        ( type == SW ) ? &sw_align : &nw_align;

    struct timeval t_begin;
    struct timeval t_end;
    int remaining = internal_iterations;

    gettimeofday( &t_begin, NULL );

    while( remaining > 0 ) {
        free_alignment( kernel( query, hit_count, bit_width, COMPUTE_SCORE ) );
        --remaining;
    }

    gettimeofday( &t_end, NULL );

    /* Whole seconds plus the (possibly negative) microsecond remainder. */
    double whole_seconds = (t_end.tv_sec - t_begin.tv_sec);
    double micro_part = (t_end.tv_usec - t_begin.tv_usec) / 1000000.0;

    return whole_seconds + micro_part;
}
예제 #3
0
/****************************************************************************
 * Read an alignment from a file.  Sort the sequences by sequence name if
 * requested.  Remove all gap sequences if requested.
 *
 * filename            file passed to read_alignment_from_clustalw_file.
 * sort                if true, reorder the alignment's sequences to follow
 *                     the sorted list of species names.
 * remove_allgap_seqs  if true, pass the alignment through
 *                     remove_allgap_sequences; when that returns a different
 *                     alignment the one read from the file is freed.
 * ref_seq_index       in/out, used only when sort is true: on entry the
 *                     index of the reference sequence, on return its index
 *                     after sorting.  May be NULL when the caller does not
 *                     track a reference sequence.
 *
 * Returns the resulting alignment.
 *
 * NOTE(review): *ref_seq_index is not adjusted after all-gap sequences are
 * removed; confirm whether callers rely on it in that case.
 ****************************************************************************/
ALIGNMENT_T* read_alignment_from_file
  (char *filename, 
   BOOLEAN_T sort,
   BOOLEAN_T remove_allgap_seqs,
   int* ref_seq_index
  )
{
  int i;

  // Read the sequences.
  ALIGNMENT_T* alignment = read_alignment_from_clustalw_file(filename);

  if (sort) {
    // Create a temporary array to hold sorted sequence pointers.
    int num_sequences = get_num_aligned_sequences(alignment);
    SEQ_T** sequences = (SEQ_T**) mm_malloc(num_sequences * sizeof(SEQ_T*));

    // Sort the sequences by name.
    STRING_LIST_T* alignment_species = get_species_names(alignment);
    // Store the name of the reference sequence, if the caller tracks one.
    // Previously a NULL ref_seq_index was dereferenced here.
    char *ref_name = NULL;
    if (ref_seq_index != NULL) {
      ref_name = get_nth_string(*ref_seq_index, alignment_species);
    }
    sort_string_list(alignment_species); 	// keep species alphabetical
    for (i=0; i<num_sequences; i++) { 
      char *name = get_nth_string(i, alignment_species);
      sequences[i] = get_alignment_sequence_by_name(name, alignment);
    }
    myfree(alignment->sequences);
    alignment->sequences = sequences;

    // Find the new index of the reference sequence.
    if (ref_seq_index != NULL) {
      *ref_seq_index = get_index_in_string_list(ref_name, alignment_species);
    }
  }

  if (remove_allgap_seqs) {
    ALIGNMENT_T* new_alignment = remove_allgap_sequences(alignment);
    if (new_alignment != alignment) {
      free_alignment(alignment);
      alignment = new_alignment;
    }
  }

  return(alignment);
} // read_alignment_from_file
예제 #4
0
파일: falcon.c 프로젝트: abretaud/FALCON
/*
 * Build a consensus from a template sequence and reads placed at given
 * offsets.
 *
 * input_seq[0] is the template; input_seq[1..n_seq-1] are reads and
 * offset[j] is the placement of read j relative to the template.  For every
 * read the overlapping stretch is aligned against the template with align();
 * alignments whose string is longer than 500 and whose dist / aln_str_size
 * ratio is below (1 - min_idt) contribute alignment tags.  The template
 * itself always contributes the first set of tags.
 *
 * Side effect: offset[j] is overwritten with 0 for every read whose offset
 * was negative and that was long enough to be aligned.
 *
 * min_cov and K are accepted but not used by this function.
 *
 * Returns the consensus produced by get_cns_from_align_tags (called with a
 * final argument of 0), or a zero-filled consensus_data if no tags were
 * collected.
 */
consensus_data * generate_utg_consensus( char ** input_seq,
                           seq_coor_t *offset,
                           unsigned int n_seq,
                           unsigned min_cov,
                           unsigned K,
                           double min_idt) {

    const double max_diff = 1.0 - min_idt;
    const unsigned int seq_count = n_seq;
    unsigned int n_tagged = 0;
    unsigned int idx;
    size_t template_len;
    seq_coor_t utg_len;
    seq_coor_t r_len;
    seq_coor_t span;
    aln_range * arange;
    alignment * aln;
    align_tags_t ** tags_list;
    consensus_data * consensus;

    tags_list = calloc( seq_count+1, sizeof(align_tags_t *) );
    template_len = strlen(input_seq[0]);
    utg_len = template_len;
    arange = calloc( 1, sizeof(aln_range) );

    /* Seed the tag list with the template aligned against itself. */
    arange->s1 = 0;
    arange->e1 = template_len;
    arange->s2 = 0;
    arange->e2 = template_len;
    tags_list[n_tagged] = get_align_tags( input_seq[0], input_seq[0],
                                          template_len, arange, 0, 0);
    n_tagged += 1;

    for (idx = 1; idx < seq_count; idx++) {
        const size_t read_len = strlen(input_seq[idx]);

        arange->s1 = 0;
        arange->e1 = read_len - 1;
        arange->s2 = 0;
        arange->e2 = read_len - 1;
        r_len = read_len;

        if ( offset[idx] < 0 ) {
            /* Read starts before the template: skip its leading overhang. */
            span = r_len + offset[idx];
            if ( span < 128 ) {
                continue;
            }
            if ( !(span < utg_len) ) {
                span = utg_len;
            }
            aln = align(input_seq[idx] - offset[idx], span,
                        input_seq[0], span,
                        500, 1);
            offset[idx] = 0;
        } else {
            /* Read starts inside the template: clip at the template end. */
            if ( offset[idx] > utg_len - 128 ) {
                continue;
            }
            if ( offset[idx] + r_len > utg_len ) {
                span = utg_len - offset[idx];
            } else {
                span = r_len;
            }
            aln = align(input_seq[idx], span,
                        input_seq[0] + offset[idx], span,
                        500, 1);
        }

        if ( aln->aln_str_size > 500 ) {
            if ( ((double) aln->dist / (double) aln->aln_str_size) < max_diff ) {
                tags_list[n_tagged] = get_align_tags( aln->q_aln_str, aln->t_aln_str,
                                                      aln->aln_str_size, arange, idx,
                                                      offset[idx]);
                n_tagged ++;
            }
        }
        free_alignment(aln);
    }
    free_aln_range(arange);

    if ( n_tagged == 0 ) {
        /* allocate an empty consensus sequence */
        consensus = calloc( 1, sizeof(consensus_data) );
        consensus->sequence = calloc( 1, sizeof(char) );
        consensus->eqv = calloc( 1, sizeof(unsigned int) );
    } else {
        consensus = get_cns_from_align_tags( tags_list, n_tagged, utg_len, 0 );
    }

    for (idx = 0; idx < n_tagged; idx++) {
        free_align_tags(tags_list[idx]);
    }
    free(tags_list);
    return consensus;
}
예제 #5
0
파일: falcon.c 프로젝트: abretaud/FALCON
consensus_data * generate_consensus( char ** input_seq,
                           unsigned int n_seq,
                           unsigned min_cov,
                           unsigned K,
                           double min_idt) {
    unsigned int j;
    unsigned int seq_count;
    unsigned int aligned_seq_count;
    kmer_lookup * lk_ptr;
    seq_array sa_ptr;
    seq_addr_array sda_ptr;
    kmer_match * kmer_match_ptr;
    aln_range * arange;
    alignment * aln;
    align_tags_t ** tags_list;
    //char * consensus;
    consensus_data * consensus;
    double max_diff;
    max_diff = 1.0 - min_idt;

    seq_count = n_seq;
    //printf("XX n_seq %d\n", n_seq);
    //for (j=0; j < seq_count; j++) {
    //    printf("seq_len: %u %u\n", j, strlen(input_seq[j]));
    //};
    fflush(stdout);

    tags_list = calloc( seq_count, sizeof(align_tags_t *) );
    lk_ptr = allocate_kmer_lookup( 1 << (K * 2) );
    sa_ptr = allocate_seq( (seq_coor_t) strlen( input_seq[0]) );
    sda_ptr = allocate_seq_addr( (seq_coor_t) strlen( input_seq[0]) );
    add_sequence( 0, K, input_seq[0], strlen(input_seq[0]), sda_ptr, sa_ptr, lk_ptr);
    //mask_k_mer(1 << (K * 2), lk_ptr, 16);

    aligned_seq_count = 0;
    for (j=1; j < seq_count; j++) {

        //printf("seq_len: %ld %u\n", j, strlen(input_seq[j]));

        kmer_match_ptr = find_kmer_pos_for_seq(input_seq[j], strlen(input_seq[j]), K, sda_ptr, lk_ptr);
#define INDEL_ALLOWENCE_0 6

        arange = find_best_aln_range(kmer_match_ptr, K, K * INDEL_ALLOWENCE_0, 5);  // narrow band to avoid aligning through big indels

        //printf("1:%ld %ld %ld %ld\n", arange_->s1, arange_->e1, arange_->s2, arange_->e2);

        //arange = find_best_aln_range2(kmer_match_ptr, K, K * INDEL_ALLOWENCE_0, 5);  // narrow band to avoid aligning through big indels

        //printf("2:%ld %ld %ld %ld\n\n", arange->s1, arange->e1, arange->s2, arange->e2);

#define INDEL_ALLOWENCE_1 0.10
        if (arange->e1 - arange->s1 < 100 || arange->e2 - arange->s2 < 100 ||
            abs( (arange->e1 - arange->s1 ) - (arange->e2 - arange->s2) ) >
                   (int) (0.5 * INDEL_ALLOWENCE_1 * (arange->e1 - arange->s1 + arange->e2 - arange->s2))) {
            free_kmer_match( kmer_match_ptr);
            free_aln_range(arange);
            continue;
        }
        //printf("%ld %s\n", strlen(input_seq[j]), input_seq[j]);
        //printf("%ld %s\n\n", strlen(input_seq[0]), input_seq[0]);


#define INDEL_ALLOWENCE_2 150

        aln = align(input_seq[j]+arange->s1, arange->e1 - arange->s1 ,
                    input_seq[0]+arange->s2, arange->e2 - arange->s2 ,
                    INDEL_ALLOWENCE_2, 1);
        if (aln->aln_str_size > 500 && ((double) aln->dist / (double) aln->aln_str_size) < max_diff) {
            tags_list[aligned_seq_count] = get_align_tags( aln->q_aln_str,
                                                           aln->t_aln_str,
                                                           aln->aln_str_size,
                                                           arange, j,
                                                           0);
            aligned_seq_count ++;
        }
        /***
        for (k = 0; k < tags_list[j]->len; k++) {
            printf("%ld %d %c\n", tags_list[j]->align_tags[k].t_pos,
                                   tags_list[j]->align_tags[k].delta,
                                   tags_list[j]->align_tags[k].q_base);
        }
        ***/
        free_aln_range(arange);
        free_alignment(aln);
        free_kmer_match( kmer_match_ptr);
    }

    if (aligned_seq_count > 0) {
        consensus = get_cns_from_align_tags( tags_list, aligned_seq_count, strlen(input_seq[0]), min_cov );
    } else {
        // allocate an empty consensus sequence
        consensus = calloc( 1, sizeof(consensus_data) );
        consensus->sequence = calloc( 1, sizeof(char) );
        consensus->eqv = calloc( 1, sizeof(unsigned int) );
    }
    //free(consensus);
    free_seq_addr_array(sda_ptr);
    free_seq_array(sa_ptr);
    free_kmer_lookup(lk_ptr);
    for (j=0; j < aligned_seq_count; j++) {
        free_align_tags(tags_list[j]);
    }
    free(tags_list);
    return consensus;
}
예제 #6
0
/****************************************************************************
 *  Build the empirical column frequency distributions for a set of
 *  alignment files.
 *
 *  Every file named in filenames is read (with its sequences sorted by
 *  name, and with all-gap sequences removed when remove_allgap_seqs is
 *  set).  Column counts are accumulated per species list: alignments
 *  that share a species list add into the same array of counts.
 *
 *  After all files are read, each array of counts is divided by its own
 *  total so that it holds frequencies.
 *
 *  Returns an object list holding one frequency array per distinct
 *  species list, keyed by that species list.
 ****************************************************************************/
OBJECT_LIST_T* get_alignment_column_freqs_list
  (ALPH_T alph, 
   STRING_LIST_T* filenames,
  BOOLEAN_T remove_allgap_seqs) 
{
  int f;
  int total_files = get_num_strings(filenames);
  ARRAY_T* counts = NULL;
  OBJECT_LIST_T* freqs_by_species
    = new_object_list(equal_string_lists,
                      (void*)copy_string_list,
                      free_string_list,
                      free_array);

  // Pass 1: tally column counts, one alignment file at a time.
  for (f = 0; f < total_files; f++) {
    char* filename = get_nth_string(f, filenames);

    if (verbosity >= NORMAL_VERBOSE) {
      fprintf(
        stderr,
        "Computing column freqs: alignment file number %d of %d total files.\n",
        f+1, total_files
      );
    }

    // Load the alignment with its sequences sorted by name.
    int ref_seq_index = 0;
    ALIGNMENT_T* alignment =
      read_alignment_from_file(filename, TRUE, remove_allgap_seqs, &ref_seq_index);
    STRING_LIST_T* species = get_species_names(alignment);

    // Look up the counts accumulated so far for this species list.
    counts = (ARRAY_T*)retrieve_object(species, freqs_by_species);

    if (counts == NULL) {
      // First alignment with this species list: start a new array of
      // counts and register it under the species list.
      counts = build_alignment_column_counts(alph, alignment, NULL);
      store_object(
        (void*)counts,
        (void*)species,
        0.0, // Score
        freqs_by_species
      );
    }
    else {
      // The stored object is a reference, so adding into it in place is
      // enough; nothing has to be stored again.
      (void) build_alignment_column_counts(alph, alignment, counts);
    }

    // Done with this alignment.
    free_alignment(alignment);
  } // each filename
  fprintf(stderr, "\n");

  // Pass 2: turn every array of counts into frequencies by dividing
  // each entry by that array's total.
  for (counts = retrieve_next_object(freqs_by_species);
       counts != NULL;
       counts = retrieve_next_object(freqs_by_species))
  {
    int i;
    int num_freqs = get_array_length(counts);
    double total_counts;

    // Sum all counts for this species list.
    total_counts = 0;
    for (i = 0; i < num_freqs; i++) {
      total_counts += get_array_item(i, counts);
    }

    // Normalize.
    for (i = 0; i < num_freqs; i++) {
      double raw = get_array_item(i, counts);
      set_array_item(i, raw/total_counts, counts);

#ifdef DEBUG
      int asize = alph_size(alph, ALPH_SIZE);
      int num_leaves = NINT(log(num_freqs)/log(asize));
      char* alignment_col = mm_malloc((num_leaves + 1) * sizeof(char));
      unhash_alignment_col(
        alph,
        i, 				//col_index
        alignment_col,
        num_leaves
      );
      printf("%s %g %g\n", alignment_col, raw, raw/total_counts);
      myfree(alignment_col);
#endif
    } // normalize
  } // each species list

  return(freqs_by_species);
} // get_alignment_column_freqs_list