Exemple #1
0
static inline void gap_push(gap_stack_t *stack, int i, bwtint_t k, bwtint_t l, int n_mm, int n_gapo, int n_gape,
                            int state, int is_diff, const gap_opt_t *opt)
{
    int score;
    gap_entry_t *p;
    gap_stack1_t *q;
    score = aln_score(n_mm, n_gapo, n_gape, opt);
    q = stack->stacks + score;
    if (q->n_entries == q->m_entries) {
        q->m_entries = q->m_entries? q->m_entries<<1 : 4;
        q->stack = (gap_entry_t*)realloc(q->stack, sizeof(gap_entry_t) * q->m_entries);
    }
    p = q->stack + q->n_entries;
    p->info = (u_int32_t)score<<21 | i;
    p->k = k;
    p->l = l;
    p->n_mm = n_mm;
    p->n_gapo = n_gapo;
    p->n_gape = n_gape;
    p->state = state;
    p->last_diff_pos = is_diff? i : 0;
    ++(q->n_entries);
    ++(stack->n_entries);
    if (stack->best > score) stack->best = score;
}
Exemple #2
0
/* Calculate one sequence's contribution to the alignment score */
double marginal_score(glam2_scorer *s, glam2_aln *aln,
		      int seq, const fasta *f) {
  double score = aln->score;
  unalign(aln, seq, f);
  score -= aln_score(s, aln);
  realign(aln, seq, f);
  return score;
}
Exemple #3
0
void update_aln(glam2_aln *aln, data *d, const double temperature) {
  assert(aln->seq_num > 0);
  if (rand_dbl(d->a.column_sample_rate + 1) < 1) {
    const int seq_pick = rand_int(aln->seq_num);
    if (d->a.profile)
      fprintf(d->out, "site sample: seq=%d\n", seq_pick);
    site_sample(aln, seq_pick, d, temperature);
  } else
    column_sample(aln, d, temperature);
  aln->score = aln_score(&d->scorer, aln);
}
Exemple #4
0
/* get a random starting alignment */
void start_aln(glam2_aln *aln, data *d) {
  int i;
#if 0
  aln->width = d->a.min_width;  /* ?? initial number of columns */
  aln->width = sqrt(d->a.max_width * d->a.min_width);  /* geometric mean */
#endif
  aln->width = d->a.init_width;
  aln_zero(aln);
  SHUFFLE(d->seq_order, aln->seq_num);
  for (i = 0; i < aln->seq_num; ++i)
    site_sample(aln, d->seq_order[i], d, 1);
  aln->score = aln_score(&d->scorer, aln);
}
Exemple #5
0
gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt)
{
	int i;
	gap_stack_t *stack;
	stack = (gap_stack_t*)calloc(1, sizeof(gap_stack_t));
	stack->n_stacks = aln_score(max_mm+1, max_gapo+1, max_gape+1, opt);
	stack->stacks = (gap_stack1_t*)calloc(stack->n_stacks, sizeof(gap_stack1_t));
	for (i = 0; i != stack->n_stacks; ++i) {
		gap_stack1_t *p = stack->stacks + i;
		p->m_entries = 4;
		p->stack = (gap_entry_t*)calloc(p->m_entries, sizeof(gap_entry_t));
	}
	return stack;
}
Exemple #6
0
static tmap_map1_aux_stack_t*
tmap_map1_aux_stack_reset(tmap_map1_aux_stack_t *stack,
                          int32_t max_mm, int32_t max_gapo, int32_t max_gape, 
                          const tmap_map_opt_t *opt)
{
  int32_t i;
  //int32_t i, j;
  int32_t n_bins_needed = 0;
  // move to the beginning of the memory pool
  stack->entry_pool_i = 0;
  stack->best_score = INT32_MAX;

  if(TMAP_MAP1_AUX_STACK_TOO_BIG < stack->entry_pool_length) {
      tmap_map1_aux_stack_destroy_helper(stack, 0);
      tmap_map1_aux_stack_init_helper(stack);
  }

  // clear the bins 
  for(i=0;i<stack->n_bins;i++) {
      /*
      for(j=0;j<stack->bins[i].n_entries;j++) {
          stack->bins[i].entries[j] = NULL;
      }
      */
      stack->bins[i].n_entries = 0;

  }
  // resize the bins if necessary
  n_bins_needed = aln_score(max_mm+1, max_gapo+1, max_gape+1, opt);
  if(stack->n_bins < n_bins_needed) {
      // realloc
      tmap_roundup32(n_bins_needed);
      stack->bins = tmap_realloc(stack->bins, sizeof(tmap_map1_aux_bin_t) * n_bins_needed, "stack->bins"); 
      // initialize
      for(i=stack->n_bins;i<n_bins_needed;i++) {
          stack->bins[i].n_entries = stack->bins[i].m_entries = 0;
          stack->bins[i].entries = NULL;
      }
      stack->n_bins = n_bins_needed;
  }
  stack->n_entries = 0;
  
  return stack;
}
Exemple #7
0
tmap_map_sams_t *
tmap_map1_aux_core(tmap_seq_t *seq, tmap_index_t *index,
                   tmap_bwt_match_hash_t *hash,
                   tmap_bwt_match_width_t *width, tmap_bwt_match_width_t *seed_width, tmap_map_opt_t *opt,
                   tmap_map1_aux_stack_t *stack, int32_t seed2_len)
{
  int32_t max_mm = opt->max_mm, max_gapo = opt->max_gapo, max_gape = opt->max_gape, seed_max_diff = opt->seed_max_diff;
  int32_t best_score, next_best_score;
  int32_t best_cnt = 0;
  int32_t i, j, num_n = 0;
  int32_t max_edit_score;
  tmap_bwt_match_occ_t match_sa_start;
  tmap_string_t *bases=NULL;
  tmap_map_sams_t *sams = NULL;
  int32_t max_diff, best_diff;
  tmap_bwt_int_t k, l;
  tmap_refseq_t *refseq = index->refseq;
  tmap_bwt_t *bwt = index->bwt;
  tmap_sa_t *sa = index->sa;
  tmap_map1_aux_occ_t *occs = NULL;


  max_edit_score = opt->pen_mm;
  //if(max_edit_score < opt->pen_gapo + opt->pen_gape) max_edit_score = opt->pen_gapo + opt->pen_gape;
  //if(max_edit_score < opt->pen_gape) max_edit_score = opt->pen_gape;

  bases = tmap_seq_get_bases(seq);
  /*
  fputc('\n', stderr);
  for(i=0;i<bases->l;i++) {
      fputc("ACGTN"[(int)bases->s[i]], stderr);
  }
  fputc('\n', stderr);
  */
  
  // the maximum # of differences
  if(bases->l <= TMAP_MAP_OPT_MAX_DIFF_READ_LENGTH) {
      best_diff = max_diff = opt->max_diff_table[bases->l];
  }
  else {
      best_diff = max_diff = opt->max_diff_table[TMAP_MAP_OPT_MAX_DIFF_READ_LENGTH];
  }
  
  // bound differenes by the maximum # of differences
  if(max_diff < max_mm) max_mm = max_diff;
  if(max_diff < max_gapo) max_gapo = max_diff;
  //if(max_diff < max_gape) max_gape = max_diff;
  
  best_score = next_best_score = aln_score(max_mm+1, max_gapo+1, max_gape+1, opt);

  // check whether there are too many N
  for(j=bases->l-seed2_len,num_n=0;j<bases->l;j++) {
      if(3 < bases->s[j]) {
          num_n++;
      }
  }
  if(max_mm < num_n || max_diff < num_n) {
      return tmap_map_sams_init(NULL);
  }

  // initialize
  sams = tmap_map_sams_init(NULL);
  occs = NULL;

  match_sa_start.offset = 0;
  match_sa_start.hi = 0;
  match_sa_start.k = 0;
  match_sa_start.l = bwt->seq_len;

  stack = tmap_map1_aux_stack_reset(stack, max_mm, max_gapo, max_gape, opt); // reset stack
  tmap_map1_aux_stack_push(stack, bases->l, &match_sa_start, 0, 0, 0, STATE_M, 0, NULL, opt);

  while(0 < tmap_map1_aux_stack_size(stack) && tmap_map1_aux_stack_size(stack) < opt->max_entries) {
      tmap_map1_aux_stack_entry_t *e = NULL;
      int32_t len=-1; 
      int32_t n_seed_mm=0, offset, width_cur_i;
      const uint8_t *str=NULL;
      int32_t sam_found, m;
      tmap_bwt_match_width_t *width_cur = NULL;
      const tmap_bwt_match_width_t *seed_width_cur = NULL;
      tmap_bwt_match_occ_t match_sa_cur, match_sa_next[4];
      
      // get the best entry
      e = tmap_map1_aux_stack_pop(stack); 

      // bound with best score
      if(best_score + max_edit_score < e->score) {
          break; // no need to continue
      }

      // some more information
      match_sa_cur = e->match_sa; 

      // check if we have too many edits
      m = max_diff - (e->n_mm + e->n_gapo + e->n_gape);
      if(m < 0) {
          continue; // too many edits
      }

      // get the rest of the information
      offset = e->offset; // zero-based
      str = (uint8_t*)bases->s;
      len = bases->l;
      width_cur = width;
      width_cur_i = seed2_len - (len - offset);

      if(NULL != seed_width) {
          seed_width_cur = seed_width;
          n_seed_mm = seed_max_diff - (e->n_mm + e->n_gapo + e->n_gape); // consider only mismatches in the seed
      }
      else {
          seed_width_cur = NULL;
      }
      if(0 < width_cur_i && m < width_cur[width_cur_i-1].bid) { // too many edits
          continue;
      }

      // check whether a sam is found
      sam_found = 0;
      if(len - seed2_len == offset) {
          sam_found = 1;
      }
      else if(max_mm == e->n_mm // no mismatches from any state
              && ((e->state == STATE_M && max_gapo == e->n_gapo) // in STATE_M but no more gap opens
                  || (e->state != STATE_M && max_gape == e->n_gape))) { // in STATE_I/STATE_D but no more extensions
          if(0 < tmap_bwt_match_hash_exact_alt_reverse(bwt, offset, str, &match_sa_cur, hash)) { // the alignment must match exactly to sam
              sam_found = 2;
          }
          else {
              continue; // no sam, skip
          }
      }

      if(0 < sam_found) { // alignment found
          // check for duplicates
          if(0 < sams->n) {
              for(i=0;i<sams->n;i++) {
                  // check contained
                  if(match_sa_cur.k <= occs[i].k
                     && occs[i].k <= match_sa_cur.l) { // MK <= SK <= ML
                      if(occs[i].l <= match_sa_cur.l) { // MK <= SK <= SL <= ML
                          // Want (SK - MK) + (ML - SL)
                          k = occs[i].k - match_sa_cur.k; // (SK - MK)
                          k += match_sa_cur.l - occs[i].l; // (ML - SL)
                          occs[i].l = match_sa_cur.l; // Make SL = ML
                      }
                      else { // MK <= SK <= ML <= SL
                          k = occs[i].k - match_sa_cur.k; // (SK - MK)
                      }
                      occs[i].k = match_sa_cur.k; // Make SK = MK
                      break;
                  }
                  else if(match_sa_cur.k <= occs[i].l
                          && occs[i].l <= match_sa_cur.l) { // MK <= SL <= ML
                      if(match_sa_cur.k <= occs[i].k) { // MK <= SK <= SL <= ML
                          // Want (SK - MK) + (ML - SL)
                          k = occs[i].k - match_sa_cur.k; // (SK - MK)
                          k += match_sa_cur.l - occs[i].l; // (ML - SL)
                          occs[i].k = match_sa_cur.k; // Make SK = MK
                      }
                      else { // SK <= MK <= SL <= ML
                          k = match_sa_cur.l - occs[i].l; // (ML - SL)
                      }
                      occs[i].l = match_sa_cur.l; // Make SL = ML
                      break;
                  }
              }
              if(i < sams->n) {
                  // shadow
                  if(0 < k) {
                      //tmap_map1_aux_stack_shadow(k, bwt->seq_len, e->last_diff_offset, width_cur);
                      width_cur_i = seed2_len - (len - e->last_diff_offset);
                      tmap_map1_aux_stack_shadow(k, seed2_len, width_cur_i, width_cur);
                  }
                  sam_found = 0;
                  continue;
              }
          }

          int32_t score = aln_score(e->n_mm, e->n_gapo, e->n_gape, opt);
          int32_t do_add = 1;
          if(sams->n == 0) {
              best_score = score;
              best_cnt = 0;
              best_diff = e->n_mm + e->n_gapo + e->n_gape;
          }
          if(score == best_score) {
              best_cnt += match_sa_cur.l - match_sa_cur.k + 1;
          }
          else {
              if(best_diff + 1 <= max_diff) {
                  max_diff = best_diff + 1;
              }
              if(score < next_best_score) {
                  next_best_score = score;
              }
              else if(next_best_score < score) {
                  // no need to examine further
                  break;
              }
          }
          if(do_add) { // append
              uint32_t op, op_len, cigar_i;
              tmap_map_sam_t *sam = NULL;
              tmap_map1_aux_stack_entry_t *cur = NULL;
  
              tmap_map_sams_realloc(sams, sams->n+1);
              occs = tmap_realloc(occs, sizeof(tmap_map1_aux_occ_t) * sams->n, "occs");

              sam = &sams->sams[sams->n-1];

              sam->algo_id = TMAP_MAP_ALGO_MAP1;
              sam->algo_stage = 0;
              sam->score = e->score;

              // aux data
              tmap_map_sam_malloc_aux(sam);
              k = occs[sams->n-1].k = match_sa_cur.k;
              l = occs[sams->n-1].l= match_sa_cur.l;
              sam->aux.map1_aux->n_mm = e->n_mm;
              sam->aux.map1_aux->n_gapo = e->n_gapo;
              sam->aux.map1_aux->n_gape = e->n_gape;

              // aux data: reference length
              cur = e;
              i = e->i;
              sam->aux.map1_aux->aln_ref = 0;
              cigar_i = 0;
              if(2 == sam_found) { // we used 'tmap_bwt_match_exact_alt_reverse' 
                  op = STATE_M;
                  op_len = offset;
              }
              else {
                  op = -1;
                  op_len = 0;
              }
              while(0 <= i) {
                  cur = stack->entry_pool[i];
                  if(len == cur->offset) break;
                  if(op != cur->state) {
                      if(STATE_M == op || STATE_D == op) {
                          sam->aux.map1_aux->aln_ref += op_len;
                      }
                      op = cur->state;
                      op_len = 1;
                  }
                  else {
                      op_len++;
                  }
                  //fprintf(stderr, "cur->state=%c op_len=%d cur->prev_i=%d k=%u l=%u\n", "MIDS"[cur->state], op_len, cur->prev_i, cur->match_sa.k, cur->match_sa.l);
                  i = cur->prev_i;
              }
              if(STATE_M == op || STATE_D == op) {
                  sam->aux.map1_aux->aln_ref += op_len;
              }

              /*
              fprintf(stderr, "shadow 2 k=%u l=%u len=%d offset=%d last_diff_offset=%d\n",
                      k, l, len, offset, e->last_diff_offset);
              fprintf(stderr, "e->n_mm=%d e->n_gapo=%d e->n_gape=%d\n",
                      e->n_mm, e->n_gapo, e->n_gape);
              */
              //tmap_map1_aux_stack_shadow(l - k + 1, bwt->seq_len, e->last_diff_offset, width_cur);
              width_cur_i = seed2_len - (len - e->last_diff_offset);
              tmap_map1_aux_stack_shadow(l - k + 1, seed2_len, width_cur_i, width_cur);
              if(opt->max_best_cals < best_cnt) {
                  // ignore if too many "best" have been found
                  occs[sams->n-1].l -= (best_cnt - opt->max_best_cals); // only save the maximum
                  break;
              }
          }
      }
      else {
          int32_t allow_diff = 1, allow_mm = (e->n_mm < max_mm) ? 1 : 0;

          // decrement the offset
          offset--;

          // use a bound for mismatches
          if(0 < offset) {
              int32_t seed_width_cur_i = offset - (len - opt->seed_length);
              width_cur_i = seed2_len - (len - offset);
              if(0 < width_cur_i) {
                  if(m-1 < width_cur[width_cur_i-1].bid) { 
                      allow_diff = 0;
                  }
                  else if(width_cur[width_cur_i-1].bid == m-1
                          && width_cur[width_cur_i].bid == m-1
                          && width_cur[width_cur_i-1].w == width_cur[width_cur_i].w) {
                      allow_mm = 0;
                  }
              }
              if(0 < seed_width_cur_i) {
                  if(NULL != seed_width_cur && 0 < seed_width_cur_i) {
                      if(n_seed_mm-1 < seed_width_cur[seed_width_cur_i-1].bid) {
                          allow_diff = 0;
                      }
                      else if(seed_width_cur[seed_width_cur_i-1].bid == n_seed_mm-1
                              && seed_width_cur[seed_width_cur_i].bid == n_seed_mm-1
                              && seed_width_cur[seed_width_cur_i-1].w == seed_width_cur[seed_width_cur_i].w) {
                          allow_mm = 0;
                      }
                  }
              }
          }

          // retrieve the next SA interval
          tmap_bwt_match_hash_2occ4(bwt, &e->match_sa, match_sa_next, hash); 

          // insertions/deletions
          if(allow_diff 
             && opt->indel_ends_bound + e->n_gapo + e->n_gape <= offset
             && opt->indel_ends_bound + e->n_gapo + e->n_gape <= len - offset) { // check to add gaps
              if(STATE_M == e->state) { // gap open
                  if(e->n_gapo < max_gapo) { // gap open is allowed
                      // insertion
                      tmap_map1_aux_stack_push(stack, offset, &match_sa_cur, e->n_mm, e->n_gapo + 1, e->n_gape, STATE_I, 1, e, opt);

                      // deletion
                      for(j = 0; j != 4; ++j) {
                          if(match_sa_next[j].k <= match_sa_next[j].l) {
                              //   remember that a gap deletion does not consume a
                              //   read base, so use 'offset+1'
                              tmap_map1_aux_stack_push(stack, offset+1, &match_sa_next[j], e->n_mm, e->n_gapo + 1, e->n_gape, STATE_D, 1, e, opt);
                          }
                      }
                  }
              }
              else if(STATE_I == e->state) { // extension of an insertion
                  if(e->n_gape < max_gape) { // gap extension is allowed
                      tmap_map1_aux_stack_push(stack, offset, &match_sa_cur, e->n_mm, e->n_gapo, e->n_gape + 1, STATE_I, 1, e, opt);
                  }
              }
              else if(STATE_D == e->state) { // extension of a deletion
                  if(e->n_gape < max_gape) {
                      if(e->n_gape + e->n_gapo < max_diff || e->match_sa.l - e->match_sa.k + 1 < opt->max_cals_del) { // gap extension is allowed
                          for(j = 0; j != 4; ++j) {
                              if(match_sa_next[j].k <= match_sa_next[j].l) {
                                  //   remember that a gap deletion does not consume a
                                  //   read base, so use 'offset+1'
                                  tmap_map1_aux_stack_push(stack, offset+1, &match_sa_next[j], e->n_mm, e->n_gapo, e->n_gape + 1, STATE_D, 1, e, opt);
                              }
                          }
                      }
                  }
              }
          }

          // mismatches
          if(1 == allow_mm && 1 == allow_diff) { // mismatches allowed
              for(j=0;j<4;j++) {
                  int32_t c = (str[offset] + j) & 3;
                  int32_t is_mm = (0 < j || 3 < str[offset]);
                  if(match_sa_next[c].k <= match_sa_next[c].l) {
                      tmap_map1_aux_stack_push(stack, offset, &match_sa_next[c], e->n_mm + is_mm, e->n_gapo, e->n_gape, STATE_M, is_mm, e, opt);
                  }
              }
          } 
          else if(str[offset] < 4) { // try exact match only
              int32_t c = str[offset] & 3;
              if(match_sa_next[c].k <= match_sa_next[c].l) {
                  tmap_map1_aux_stack_push(stack, offset, &match_sa_next[c], e->n_mm, e->n_gapo, e->n_gape, STATE_M, 0, e, opt);
              }
          }
      }
  }

  return tmap_map1_sam_to_real(sams, occs, bases, seed2_len, refseq, bwt, sa, hash, opt);
}
Exemple #8
0
static inline void
tmap_map1_aux_stack_push(tmap_map1_aux_stack_t *stack, 
                         int32_t offset,
                         tmap_bwt_match_occ_t *match_sa_prev,
                         int32_t n_mm, int32_t n_gapo, int32_t n_gape,
                         int32_t state, int32_t is_diff, 
                         tmap_map1_aux_stack_entry_t *prev_entry,
                         const tmap_map_opt_t *opt)
{
  int32_t i;
  int32_t n_bins_needed = 0;
  tmap_map1_aux_stack_entry_t *entry = NULL;
  tmap_map1_aux_bin_t *bin = NULL;

  // check to see if we need more memory
  if(stack->entry_pool_length <= stack->entry_pool_i) { 
      int32_t i = stack->entry_pool_length;
      stack->entry_pool_length <<= 2;
      stack->entry_pool = tmap_realloc(stack->entry_pool, 
                                       sizeof(tmap_map1_aux_stack_entry_t*)*stack->entry_pool_length, 
                                       "stack->entry_pool");
      while(i<stack->entry_pool_length) {
          stack->entry_pool[i] = tmap_malloc(sizeof(tmap_map1_aux_stack_entry_t), "stack->entry_pool[i]");
          i++;
      }
  }

  entry = stack->entry_pool[stack->entry_pool_i];
  entry->score = aln_score(n_mm, n_gapo, n_gape, opt);
  entry->n_mm = n_mm;
  entry->n_gapo = n_gapo;
  entry->n_gape = n_gape;
  entry->state = state;
  entry->match_sa = (*match_sa_prev); 
  entry->i = stack->entry_pool_i;
  entry->offset = offset;
  if(NULL == prev_entry) {
      entry->last_diff_offset = offset;
      entry->prev_i = -1;
  }
  else {
      entry->last_diff_offset = (1 == is_diff) ? (offset) : prev_entry->last_diff_offset; 
      entry->prev_i = prev_entry->i;
  }

  if(stack->n_bins <= entry->score) {
      //tmap_bug();
      // resize the bins if necessary
      n_bins_needed = entry->score + 1;
      // realloc
      tmap_roundup32(n_bins_needed);
      stack->bins = tmap_realloc(stack->bins, sizeof(tmap_map1_aux_bin_t) * n_bins_needed, "stack->bins"); 
      // initialize
      for(i=stack->n_bins;i<n_bins_needed;i++) {
          stack->bins[i].n_entries = stack->bins[i].m_entries = 0;
          stack->bins[i].entries = NULL;
      }
      stack->n_bins = n_bins_needed;
  }
  if(stack->n_bins <= entry->score) {
      tmap_bug();
  }
  bin = &stack->bins[entry->score];
  
  // - remove duplicates
  // - most likely formed by tandem repeats or indels
  // - too computationally expensive, and not necessary
  /*
  for(i=0;i<bin->n_entries;i++) {
      if(bin->entries[i]->match_sa.k == entry->match_sa.k 
         && bin->entries[i]->match_sa.l == entry->match_sa.l 
         && bin->entries[i]->offset == entry->offset
         && bin->entries[i]->state == entry->state) {
          return;
      }
  }
  */
  
  // update best score
  if(stack->best_score > entry->score) stack->best_score = entry->score;

  if(bin->m_entries <= bin->n_entries) {
      bin->m_entries++;
      tmap_roundup32(bin->m_entries);
      bin->entries = tmap_realloc(bin->entries, sizeof(tmap_map1_aux_bin_t) * bin->m_entries, "bin->entries");
  }
  bin->entries[bin->n_entries] = entry;
  bin->n_entries++;

  stack->entry_pool_i++;
  stack->n_entries++;
}
Exemple #9
0
gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt)
{
    return gap_init_stack2(aln_score(max_mm+1, max_gapo+1, max_gape+1, opt));
}
Exemple #10
0
sint malign(sint istart,char *phylip_name) /* full progressive alignment*/
{
   static 	sint *aligned;
   static 	sint *group;
   static 	sint ix;

   sint 	*maxid, max, sum;
   sint		*tree_weight;
   sint 		i,j,set,iseq=0;
   sint 		status,entries;
   lint		score = 0;


   info("Start of Multiple Alignment");

/* get the phylogenetic tree from *.ph */

   if (nseqs >= 2) 
     {
       status = read_tree(phylip_name, (sint)0, nseqs);
       if (status == 0) return((sint)0);
     }

/* calculate sequence weights according to branch lengths of the tree -
   weights in global variable seq_weight normalised to sum to 100 */

   calc_seq_weights((sint)0, nseqs, seq_weight);

/* recalculate tmat matrix as percent similarity matrix */

   status = calc_similarities(nseqs);
   if (status == 0) return((sint)0);

/* for each sequence, find the most closely related sequence */

   maxid = (sint *)ckalloc( (nseqs+1) * sizeof (sint));
   for (i=1;i<=nseqs;i++)
     {
         maxid[i] = -1;
         for (j=1;j<=nseqs;j++) 
           if (j!=i && maxid[i] < tmat[i][j]) maxid[i] = tmat[i][j];
     }

/* group the sequences according to their relative divergence */

   if (istart == 0)
     {
        sets = (sint **) ckalloc( (nseqs+1) * sizeof (sint *) );
        for(i=0;i<=nseqs;i++)
           sets[i] = (sint *)ckalloc( (nseqs+1) * sizeof (sint) );

        create_sets((sint)0,nseqs);
        info("There are %d groups",(pint)nsets);

/* clear the memory used for the phylogenetic tree */

        if (nseqs >= 2)
             clear_tree(NULL);

/* start the multiple alignments.........  */

        info("Aligning...");

/* first pass, align closely related sequences first.... */

        ix = 0;
        aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) );
        for (i=0;i<=nseqs;i++) aligned[i] = 0;

        for(set=1;set<=nsets;++set)
         {
          entries=0;
          for (i=1;i<=nseqs;i++)
            {
               if ((sets[set][i] != 0) && (maxid[i] > divergence_cutoff))
                 {
                    entries++;
                    if  (aligned[i] == 0)
                       {
                          if (output_order==INPUT)
                            {
                              ++ix;
                              output_index[i] = i;
                            }
                          else output_index[++ix] = i;
                          aligned[i] = 1;
                       }
                 }
            }

          if(entries > 0) score = prfalign(sets[set], aligned);
          else score=0.0;


/* negative score means fatal error... exit now!  */

          if (score < 0) 
             {
                return(-1);
             }
          if ((entries > 0) && (score > 0))
             info("Group %d: Sequences:%4d      Score:%d",
             (pint)set,(pint)entries,(pint)score);
          else
             info("Group %d:                     Delayed",
             (pint)set);
        }

        for (i=0;i<=nseqs;i++)
          sets[i]=ckfree((void *)sets[i]);
        sets=ckfree(sets);
     }
   else
     {
/* clear the memory used for the phylogenetic tree */

        if (nseqs >= 2)
             clear_tree(NULL);

        aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) );
        ix = 0;
        for (i=1;i<=istart+1;i++)
         {
           aligned[i] = 1;
           ++ix;
           output_index[i] = i;
         }
        for (i=istart+2;i<=nseqs;i++) aligned[i] = 0;
     }

/* second pass - align remaining, more divergent sequences..... */

/* if not all sequences were aligned, for each unaligned sequence,
   find it's closest pair amongst the aligned sequences.  */

    group = (sint *)ckalloc( (nseqs+1) * sizeof (sint));
    tree_weight = (sint *) ckalloc( (nseqs) * sizeof(sint) );
    for (i=0;i<nseqs;i++)
   		tree_weight[i] = seq_weight[i];

/* if we haven't aligned any sequences, in the first pass - align the
two most closely related sequences now */
   if(ix==0)
     {
        max = -1;
	iseq = 0;
        for (i=1;i<=nseqs;i++)
          {
             for (j=i+1;j<=nseqs;j++)
               {
                  if (max < tmat[i][j])
		  {
                     max = tmat[i][j];
                     iseq = i;
                  }
              }
          }
        aligned[iseq]=1;
        if (output_order == INPUT)
          {
            ++ix;
            output_index[iseq] = iseq;
          }
         else
            output_index[++ix] = iseq;
     }

    while (ix < nseqs)
      {
             for (i=1;i<=nseqs;i++) {
                if (aligned[i] == 0)
                  {
                     maxid[i] = -1;
                     for (j=1;j<=nseqs;j++) 
                        if ((maxid[i] < tmat[i][j]) && (aligned[j] != 0))
                            maxid[i] = tmat[i][j];
                  }
              }
/* find the most closely related sequence to those already aligned */

            max = -1;
	    iseq = 0;
            for (i=1;i<=nseqs;i++)
              {
                if ((aligned[i] == 0) && (maxid[i] > max))
                  {
                     max = maxid[i];
                     iseq = i;
                  }
              }


/* align this sequence to the existing alignment */
/* weight sequences with percent identity with profile*/
/* OR...., multiply sequence weights from tree by percent identity with new sequence */
   if(no_weights==FALSE) {
   for (j=0;j<nseqs;j++)
       if (aligned[j+1] != 0)
              seq_weight[j] = tree_weight[j] * tmat[j+1][iseq];
/*
  Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR
*/

         sum = 0;
         for (j=0;j<nseqs;j++)
           if (aligned[j+1] != 0)
            sum += seq_weight[j];
         if (sum == 0)
          {
           for (j=0;j<nseqs;j++)
                seq_weight[j] = 1;
                sum = j;
          }
         for (j=0;j<nseqs;j++)
           if (aligned[j+1] != 0)
             {
               seq_weight[j] = (seq_weight[j] * INT_SCALE_FACTOR) / sum;
               if (seq_weight[j] < 1) seq_weight[j] = 1;
             }
	}

         entries = 0;
         for (j=1;j<=nseqs;j++)
           if (aligned[j] != 0)
              {
                 group[j] = 1;
                 entries++;
              }
           else if (iseq==j)
              {
                 group[j] = 2;
                 entries++;
              }
         aligned[iseq] = 1;

         score = prfalign(group, aligned);
         info("Sequence:%d     Score:%d",(pint)iseq,(pint)score);
         if (output_order == INPUT)
          {
            ++ix;
            output_index[iseq] = iseq;
          }
         else
            output_index[++ix] = iseq;
      }

   group=ckfree((void *)group);
   aligned=ckfree((void *)aligned);
   maxid=ckfree((void *)maxid);
   tree_weight=ckfree((void *)tree_weight);

   aln_score();

/* make the rest (output stuff) into routine clustal_out in file amenu.c */

   return(nseqs);

}
Exemple #11
0
sint seqalign(sint istart,char *phylip_name) /* sequence alignment to existing profile */
{
   static 	sint *aligned, *tree_weight;
   static 	sint *group;
   static 	sint ix;

   sint 	*maxid, max;
   sint 		i,j,status,iseq=0;
   sint 		sum,entries;
   lint		score = 0;


   info("Start of Multiple Alignment");

/* get the phylogenetic tree from *.ph */

   if (nseqs >= 2) 
     {
       status = read_tree(phylip_name, (sint)0, nseqs);
       if (status == 0) return(0);
     }

/* calculate sequence weights according to branch lengths of the tree -
   weights in global variable seq_weight normalised to sum to 100 */

   calc_seq_weights((sint)0, nseqs, seq_weight);
   
   tree_weight = (sint *) ckalloc( (nseqs) * sizeof(sint) );
   for (i=0;i<nseqs;i++)
   		tree_weight[i] = seq_weight[i];

/* recalculate tmat matrix as percent similarity matrix */

   status = calc_similarities(nseqs);
   if (status == 0) return((sint)0);

/* for each sequence, find the most closely related sequence */

   maxid = (sint *)ckalloc( (nseqs+1) * sizeof (sint));
   for (i=1;i<=nseqs;i++)
     {
         maxid[i] = -1;
         for (j=1;j<=nseqs;j++) 
           if (maxid[i] < tmat[i][j]) maxid[i] = tmat[i][j];
     }

/* clear the memory used for the phylogenetic tree */

        if (nseqs >= 2)
             clear_tree(NULL);

        aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) );
        ix = 0;
        for (i=1;i<=istart+1;i++)
         {
           aligned[i] = 1;
           ++ix;
           output_index[i] = i;
         }
        for (i=istart+2;i<=nseqs;i++) aligned[i] = 0;

/* for each unaligned sequence, find it's closest pair amongst the
   aligned sequences.  */

    group = (sint *)ckalloc( (nseqs+1) * sizeof (sint));

    while (ix < nseqs)
      {
        if (ix > 0) 
          {
             for (i=1;i<=nseqs;i++) {
                if (aligned[i] == 0)
                  {
                     maxid[i] = -1;
                     for (j=1;j<=nseqs;j++) 
                        if ((maxid[i] < tmat[i][j]) && (aligned[j] != 0))
                            maxid[i] = tmat[i][j];
                  }
              }
          }

/* find the most closely related sequence to those already aligned */

         max = -1;
         for (i=1;i<=nseqs;i++)
           {
             if ((aligned[i] == 0) && (maxid[i] > max))
               {
                  max = maxid[i];
                  iseq = i;
               }
           }

/* align this sequence to the existing alignment */

         entries = 0;
         for (j=1;j<=nseqs;j++)
           if (aligned[j] != 0)
              {
                 group[j] = 1;
                 entries++;
              }
           else if (iseq==j)
              {
                 group[j] = 2;
                 entries++;
              }
         aligned[iseq] = 1;


/* EITHER....., set sequence weights equal to percent identity with new sequence */
/*
           for (j=0;j<nseqs;j++)
              seq_weight[j] = tmat[j+1][iseq];
*/
/* OR...., multiply sequence weights from tree by percent identity with new sequence */
           for (j=0;j<nseqs;j++)
              seq_weight[j] = tree_weight[j] * tmat[j+1][iseq];
if (debug>1)         
  for (j=0;j<nseqs;j++) if (group[j+1] == 1)fprintf (stdout,"sequence %d: %d\n", j+1,tree_weight[j]);
/*
  Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR
*/

         sum = 0;
         for (j=0;j<nseqs;j++)
           if (group[j+1] == 1) sum += seq_weight[j];
         if (sum == 0)
          {
           for (j=0;j<nseqs;j++)
                seq_weight[j] = 1;
                sum = j;
          }
         for (j=0;j<nseqs;j++)
             {
               seq_weight[j] = (seq_weight[j] * INT_SCALE_FACTOR) / sum;
               if (seq_weight[j] < 1) seq_weight[j] = 1;
             }

if (debug > 1) {
  fprintf(stdout,"new weights\n");
  for (j=0;j<nseqs;j++) if (group[j+1] == 1)fprintf( stdout,"sequence %d: %d\n", j+1,seq_weight[j]);
}

         score = prfalign(group, aligned);
         info("Sequence:%d     Score:%d",(pint)iseq,(pint)score);
         if (output_order == INPUT)
          {
            ++ix;
            output_index[iseq] = iseq;
          }
         else
            output_index[++ix] = iseq;
      }

   group=ckfree((void *)group);
   aligned=ckfree((void *)aligned);
   maxid=ckfree((void *)maxid);

   aln_score();

/* make the rest (output stuff) into routine clustal_out in file amenu.c */

   return(nseqs);

}