Exemplo n.º 1
0
/**
 * Globally align `a` against `b` and store the gapped alignment in `result`.
 *
 * aligner_align() fills the score matrices held by `nw`.  The best-scoring
 * path is then traced backwards from the bottom-right cell, writing alignment
 * characters from the end of the result buffers towards the front.  Finally
 * both strings are shifted to offset 0 and NUL-terminated.
 *
 * @param a        first sequence
 * @param b        second sequence
 * @param len_a    length of `a`, forwarded to aligner_align()
 * @param len_b    length of `b`, forwarded to aligner_align()
 * @param scoring  scoring scheme, forwarded to aligner_align()
 * @param nw       aligner whose score matrices are filled and traced
 * @param result   receives result_a, result_b, length and score
 */
void needleman_wunsch_align2(const char *a, const char *b,
                             size_t len_a, size_t len_b,
                             const scoring_t *scoring,
                             nw_aligner_t *nw, alignment_t *result)
{
  aligner_align(nw, a, b, len_a, len_b, scoring, 0);

  // work backwards re-tracing optimal alignment, then shift sequences into place

  // note: longest_alignment = strlen(seq_a) + strlen(seq_b)
  // NOTE(review): assumes alignment_ensure_capacity() also leaves room for the
  // terminating NUL written at the end of this function -- confirm.
  size_t longest_alignment = nw->score_width-1 + nw->score_height-1;
  alignment_ensure_capacity(result, longest_alignment);

  // Position of next alignment character in buffer (working backwards).
  // Wraps to SIZE_MAX when the buffer is full (or both sequences are empty);
  // the +1 below brings it back to 0.
  size_t next_char = longest_alignment-1;

  size_t arr_size = nw->score_width * nw->score_height;
  size_t last_cell = arr_size - 1;

  // Get max score (and therefore current matrix).
  // On ties GAP_B beats MATCH and GAP_A beats both.
  enum Matrix curr_matrix = MATCH;
  score_t curr_score = nw->match_scores[last_cell];

  if(nw->gap_b_scores[last_cell] >= curr_score)
  {
    curr_matrix = GAP_B;
    curr_score = nw->gap_b_scores[last_cell];
  }

  if(nw->gap_a_scores[last_cell] >= curr_score)
  {
    curr_matrix = GAP_A;
    curr_score = nw->gap_a_scores[last_cell];
  }

  #ifdef DEBUG
    alignment_print_matrices(nw);
  #endif

  result->score = curr_score;
  char *alignment_a = result->result_a, *alignment_b = result->result_b;

  // coords in score matrices
  size_t score_x = nw->score_width-1, score_y = nw->score_height-1;
  size_t arr_index = last_cell;

  for(; score_x > 0 && score_y > 0; next_char--)
  {
    #ifdef DEBUG
    printf("matrix: %s (%zu,%zu) score: %i\n",
           MATRIX_NAME(curr_matrix), score_x-1, score_y-1, curr_score);
    #endif

    switch(curr_matrix)
    {
      case MATCH:
        alignment_a[next_char] = nw->seq_a[score_x-1];
        alignment_b[next_char] = nw->seq_b[score_y-1];
        break;

      case GAP_A:
        alignment_a[next_char] = '-';
        alignment_b[next_char] = nw->seq_b[score_y-1];
        break;

      case GAP_B:
        alignment_a[next_char] = nw->seq_a[score_x-1];
        alignment_b[next_char] = '-';
        break;

      default:
        fprintf(stderr, "Program error: invalid matrix number\n");
        fprintf(stderr, "Please submit a bug report to: [email protected]\n");
        exit(EXIT_FAILURE);
    }

    // The loop condition guarantees both coordinates are still > 0 here
    alignment_reverse_move(&curr_matrix, &curr_score,
                           &score_x, &score_y, &arr_index, nw);
  }

  // Gap in A: the rest of seq_b is unmatched
  while(score_y > 0)
  {
    alignment_a[next_char] = '-';
    alignment_b[next_char] = nw->seq_b[score_y-1];
    next_char--;
    score_y--;
  }

  // Gap in B: the rest of seq_a is unmatched
  while(score_x > 0)
  {
    alignment_a[next_char] = nw->seq_a[score_x-1];
    alignment_b[next_char] = '-';
    next_char--;
    score_x--;
  }

  // Shift alignment strings back into 0th position in char arrays.
  // Kept as size_t so long alignments are not narrowed through int.
  size_t first_char = next_char+1;
  size_t alignment_len = longest_alignment - first_char;

  // Source and destination overlap, so memmove rather than memcpy
  memmove(alignment_a, alignment_a+first_char, alignment_len);
  memmove(alignment_b, alignment_b+first_char, alignment_len);

  alignment_a[alignment_len] = '\0';
  alignment_b[alignment_len] = '\0';

  result->length = alignment_len;
}
Exemplo n.º 2
0
// Step the traceback one cell backwards through the scoring matrices.
//
// On entry *curr_matrix / *curr_score describe the cell at (*score_x, *score_y)
// with flat index *arr_index.  The coordinates and index are moved to the
// predecessor cell implied by the current matrix, then *curr_matrix and
// *curr_score are set to whichever matrix at that cell reproduces the current
// score (tried in the order GAP_A, GAP_B, MATCH).  If none does, diagnostics
// are printed and the program exits.
//
// Both *score_x and *score_y are used minus one as sequence indices, so
// callers must pass values > 0.
void alignment_reverse_move(enum Matrix *curr_matrix, score_t *curr_score,
                            size_t *score_x, size_t *score_y,
                            size_t *arr_index, const aligner_t *aligner)
{
  const scoring_t *scoring = aligner->scoring;
  const size_t row_stride = aligner->score_width;
  const size_t last_x = row_stride - 1;
  const size_t last_y = aligner->score_height - 1;

  // Sequence positions belonging to the cell we are leaving
  const size_t seq_x = *score_x - 1;
  const size_t seq_y = *score_y - 1;

  bool is_match;
  int match_penalty;
  scoring_lookup(scoring, aligner->seq_a[seq_x], aligner->seq_b[seq_y],
                 &match_penalty, &is_match);

  // Cost of entering / continuing a gap, per gap matrix
  int open_a = scoring->gap_extend + scoring->gap_open;
  int open_b = open_a;
  int extend_a = scoring->gap_extend;
  int extend_b = extend_a;

  // Free gaps at the ends
  if(scoring->no_end_gap_penalty)
  {
    if(*score_x == last_x) { open_a = 0; extend_a = 0; }
    if(*score_y == last_y) { open_b = 0; extend_b = 0; }
  }
  if(scoring->no_start_gap_penalty)
  {
    if(*score_x == 0) { open_a = 0; extend_a = 0; }
    if(*score_y == 0) { open_b = 0; extend_b = 0; }
  }

  // Penalty that was paid to reach the current cell, depending on which
  // matrix the predecessor came from
  long from_match, from_gap_a, from_gap_b;

  switch(*curr_matrix)
  {
    case MATCH:
      // Diagonal step: one row up and one column left
      from_match = from_gap_a = from_gap_b = match_penalty;
      *score_x -= 1;
      *score_y -= 1;
      *arr_index -= row_stride + 1;
      break;

    case GAP_A:
      // Vertical step: one row up
      from_match = open_a;
      from_gap_a = extend_a;
      from_gap_b = open_a;
      *score_y -= 1;
      *arr_index -= row_stride;
      break;

    case GAP_B:
      // Horizontal step: one column left
      from_match = open_b;
      from_gap_a = open_b;
      from_gap_b = extend_b;
      *score_x -= 1;
      *arr_index -= 1;
      break;

    default:
      fprintf(stderr, "Program error: invalid matrix in get_reverse_move()\n");
      fprintf(stderr, "Please submit a bug report to: [email protected]\n");
      exit(EXIT_FAILURE);
  }

  const size_t idx = *arr_index;

  // When gaps are disabled for a sequence they are still accepted on the
  // first and last row/column
  const bool gap_a_allowed =
    !scoring->no_gaps_in_a || *score_x == 0 || *score_x == last_x;
  const bool gap_b_allowed =
    !scoring->no_gaps_in_b || *score_y == 0 || *score_y == last_y;

  if(gap_a_allowed &&
     (long)aligner->gap_a_scores[idx] + from_gap_a == *curr_score)
  {
    *curr_matrix = GAP_A;
    *curr_score = aligner->gap_a_scores[idx];
    return;
  }

  if(gap_b_allowed &&
     (long)aligner->gap_b_scores[idx] + from_gap_b == *curr_score)
  {
    *curr_matrix = GAP_B;
    *curr_score = aligner->gap_b_scores[idx];
    return;
  }

  if((long)aligner->match_scores[idx] + from_match == *curr_score)
  {
    *curr_matrix = MATCH;
    *curr_score = aligner->match_scores[idx];
    return;
  }

  // No predecessor reproduces the current score: dump state and abort
  alignment_print_matrices(aligner);

  fprintf(stderr, "[%s:%zu,%zu]: %i [ismatch: %i] '%c' '%c'\n",
          MATRIX_NAME(*curr_matrix), *score_x, *score_y, *curr_score,
          is_match, aligner->seq_a[seq_x], aligner->seq_b[seq_y]);
  fprintf(stderr, " Penalties match: %li gap_open: %li gap_extend: %li\n",
          from_match, from_gap_a, from_gap_b);
  fprintf(stderr, " Expected MATCH: %i GAP_A: %i GAP_B: %i\n",
          aligner->match_scores[idx],
          aligner->gap_a_scores[idx],
          aligner->gap_b_scores[idx]);

  fprintf(stderr,
"Program error: traceback fail (get_reverse_move)\n"
"This may be due to an integer overflow if your sequences are long or scores\n"
"are large. If this is the case using smaller scores or shorter sequences may\n"
"work around this problem.  \n"
"  If you think this is a bug, please report it to: [email protected]\n");
  exit(EXIT_FAILURE);
}
Exemplo n.º 3
0
// Align two sequences against each other to find local alignments between them
//
// Runs smith_waterman_align() on seq_a / seq_b, prints a header plus
// (depending on the cmd options) the score matrices, sequence names and
// sequences, then prints every hit fetched from the aligner while its score
// is at least cmd->min_score and the optional per-alignment hit limit has not
// been reached.  Increments the file-level alignment_index when done.
//
// seq_a, seq_b:           sequences to align; must not be NULL.  If either is
//                         empty an error is printed and nothing is aligned.
// seq_a_name, seq_b_name: optional names (may be NULL).  Supplying a name
//                         while wait_on_keystroke is set is a fatal error.
void align(const char *seq_a, const char *seq_b,
           const char *seq_a_name, const char *seq_b_name)
{
  if((seq_a_name != NULL || seq_b_name != NULL) && wait_on_keystroke)
  {
    // Only one of the names is guaranteed to be non-NULL here, and passing
    // NULL to %s is undefined behaviour, so substitute a placeholder.
    const char *name_a = (seq_a_name != NULL) ? seq_a_name : "(null)";
    const char *name_b = (seq_b_name != NULL) ? seq_b_name : "(null)";

    fprintf(stderr, "Error: Interactive input takes seq only "
                    "(no FASTA/FASTQ) '%s:%s'\n", name_a, name_b);
    fflush(stderr);
    exit(EXIT_FAILURE);
  }

  // Check both arguments have length > 0
  if(seq_a[0] == '\0' || seq_b[0] == '\0')
  {
    fprintf(stderr, "Error: Sequences must have length > 0\n");

    if(cmd->print_fasta && seq_a_name != NULL && seq_b_name != NULL)
    {
      fprintf(stderr, "%s\n%s\n", seq_a_name, seq_b_name);
    }

    fflush(stderr);

    return;
  }

  smith_waterman_align(seq_a, seq_b, &scoring, sw);

  aligner_t *aligner = smith_waterman_get_aligner(sw);
  size_t len_a = aligner->score_width-1, len_b = aligner->score_height-1;

  printf("== Alignment %zu lengths (%zu, %zu):\n", alignment_index, len_a, len_b);

  if(cmd->print_matrices)
  {
    alignment_print_matrices(aligner);
  }

  // seqA
  if(cmd->print_fasta && seq_a_name != NULL)
  {
    fputs(seq_a_name, stdout);
    putc('\n', stdout);
  }

  if(cmd->print_seq)
  {
    fputs(seq_a, stdout);
    putc('\n', stdout);
  }

  // seqB
  if(cmd->print_fasta && seq_b_name != NULL)
  {
    fputs(seq_b_name, stdout);
    putc('\n', stdout);
  }

  if(cmd->print_seq)
  {
    fputs(seq_b, stdout);
    putc('\n', stdout);
  }

  putc('\n', stdout);

  if(!cmd->min_score_set)
  {
    // If min_score hasn't been set, set a limit based on the lengths of seqs
    // or zero if we're running interactively
    cmd->min_score = wait_on_keystroke ? 0
                       : scoring.match * MAX2(0.2 * MIN2(len_a, len_b), 2);

    #ifdef SEQ_ALIGN_VERBOSE
    printf("min_score: %i\n", cmd->min_score);
    #endif
  }

  fflush(stdout);

  size_t hit_index = 0;

  // For print context.  These stay zero when cmd->print_context is off, so
  // the subtractions below never underflow.
  size_t context_left = 0, context_right = 0;
  size_t left_spaces_a = 0, left_spaces_b = 0;
  size_t right_spaces_a = 0, right_spaces_b = 0;

  // get_next_hit() is evaluated first, so it can stop the loop before another
  // hit is fetched
  while(get_next_hit() &&
        smith_waterman_fetch(sw, result) && result->score >= cmd->min_score &&
        (!cmd->max_hits_per_alignment_set ||
         hit_index < cmd->max_hits_per_alignment))
  {
    printf("hit %zu.%zu score: %i\n", alignment_index, hit_index++, result->score);

    if(cmd->print_context)
    {
      // Calculate number of characters of context to print either side
      context_left = MAX2(result->pos_a, result->pos_b);
      context_left = MIN2(context_left, cmd->print_context);

      // Characters remaining after the hit in each sequence
      size_t rem_a = len_a - (result->pos_a + result->len_a);
      size_t rem_b = len_b - (result->pos_b + result->len_b);

      context_right = MAX2(rem_a, rem_b);
      context_right = MIN2(context_right, cmd->print_context);

      // Padding for the sequence that has less context available than the
      // other, so both lines stay aligned
      left_spaces_a = (context_left > result->pos_a)
                      ? context_left - result->pos_a : 0;

      left_spaces_b = (context_left > result->pos_b)
                      ? context_left - result->pos_b : 0;

      right_spaces_a = (context_right > rem_a) ? context_right - rem_a : 0;
      right_spaces_b = (context_right > rem_b) ? context_right - rem_b : 0;
    }

    #ifdef SEQ_ALIGN_VERBOSE
    printf("context left = %zu; right = %zu spacing: [%zu,%zu] [%zu,%zu]\n",
           context_left, context_right,
           left_spaces_a, right_spaces_a,
           left_spaces_b, right_spaces_b);
    #endif

    // seq a
    print_alignment_part(result->result_a, result->result_b,
                         result->pos_a, result->len_a,
                         seq_a,
                         left_spaces_a, right_spaces_a,
                         context_left-left_spaces_a,
                         context_right-right_spaces_a);

    if(cmd->print_pretty)
    {
      fputs("  ", stdout);

      size_t max_left_spaces = MAX2(left_spaces_a, left_spaces_b);
      size_t max_right_spaces = MAX2(right_spaces_a, right_spaces_b);
      size_t spacer;

      // Print spaces for lefthand spacing
      for(spacer = 0; spacer < max_left_spaces; spacer++)
      {
        putc(' ', stdout);
      }

      // Print dots for lefthand context sequence
      for(spacer = 0; spacer < context_left-max_left_spaces; spacer++)
      {
        putc('.', stdout);
      }

      alignment_print_spacer(result->result_a, result->result_b, &scoring);

      // Print dots for righthand context sequence
      for(spacer = 0; spacer < context_right-max_right_spaces; spacer++)
      {
        putc('.', stdout);
      }

      // Print spaces for righthand spacing
      for(spacer = 0; spacer < max_right_spaces; spacer++)
      {
        putc(' ', stdout);
      }

      putc('\n', stdout);
    }

    // seq b
    print_alignment_part(result->result_b, result->result_a,
                         result->pos_b, result->len_b,
                         seq_b,
                         left_spaces_b, right_spaces_b,
                         context_left-left_spaces_b,
                         context_right-right_spaces_b);

    printf("\n");

    // Flush output here
    fflush(stdout);
  }

  fputs("==\n", stdout);
  fflush(stdout);

  // Increment sequence alignment counter
  alignment_index++;
}