/*
 * Append one entry to the score-indexed stack.
 *
 * The score is computed by aln_score() from the mismatch / gap-open /
 * gap-extension counts and selects the sub-stack (stack->stacks[score]) the
 * entry is appended to.  The sub-stack doubles its capacity when full
 * (starting at 4 if it was empty).
 *
 *   stack    container holding one sub-stack per score
 *   i        position; stored in the low bits of the entry's `info` field
 *            (the score occupies the bits from bit 21 upwards)
 *   k, l     values copied into the entry's k and l fields
 *   n_mm, n_gapo, n_gape
 *            mismatch, gap-open and gap-extension counts copied into the entry
 *   state    state copied into the entry
 *   is_diff  if non-zero, last_diff_pos is set to i, otherwise to 0
 *   opt      options forwarded to aln_score()
 *
 * Also bumps the per-sub-stack and global entry counters and lowers
 * stack->best when this score is smaller than the current best.
 *
 * The process is aborted if the sub-stack cannot be grown; the previous code
 * overwrote q->stack with the realloc() result unchecked and would have
 * dereferenced NULL instead.
 */
static inline void gap_push(gap_stack_t *stack, int i, bwtint_t k, bwtint_t l, int n_mm, int n_gapo, int n_gape,
							int state, int is_diff, const gap_opt_t *opt)
{
	int score;
	gap_entry_t *p;
	gap_stack1_t *q;

	score = aln_score(n_mm, n_gapo, n_gape, opt);
	q = stack->stacks + score;

	if (q->n_entries == q->m_entries) {
		gap_entry_t *grown;

		q->m_entries = q->m_entries? q->m_entries<<1 : 4;
		/* go through a temporary so the old buffer is not lost on failure */
		grown = (gap_entry_t*)realloc(q->stack, sizeof(gap_entry_t) * q->m_entries);
		if (grown == NULL) abort();
		q->stack = grown;
	}

	p = q->stack + q->n_entries;
	p->info = (u_int32_t)score<<21 | i;
	p->k = k;
	p->l = l;
	p->n_mm = n_mm;
	p->n_gapo = n_gapo;
	p->n_gape = n_gape;
	p->state = state;
	p->last_diff_pos = is_diff? i : 0;

	++(q->n_entries);
	++(stack->n_entries);
	if (stack->best > score) stack->best = score;
}
/*
 * One sequence's contribution to the alignment score: the alignment's stored
 * score minus the score computed after unalign() has been applied to `seq`.
 * realign() is called on the same sequence before returning.
 */
double marginal_score(glam2_scorer *s, glam2_aln *aln, int seq, const fasta *f)
{
  const double score_with_seq = aln->score;
  double score_without_seq;

  unalign(aln, seq, f);
  score_without_seq = aln_score(s, aln);
  realign(aln, seq, f);

  return score_with_seq - score_without_seq;
}
/*
 * Perform one update of the alignment: either a site-sampling step on a
 * randomly picked sequence or a column-sampling step, chosen by comparing
 * rand_dbl(column_sample_rate + 1) with 1.  The alignment score is
 * recomputed afterwards.
 */
void update_aln(glam2_aln *aln, data *d, const double temperature)
{
  int do_site_sample;

  assert(aln->seq_num > 0);

  do_site_sample = rand_dbl(d->a.column_sample_rate + 1) < 1;

  if (!do_site_sample) {
    column_sample(aln, d, temperature);
  } else {
    const int seq_pick = rand_int(aln->seq_num);

    if (d->a.profile) {
      fprintf(d->out, "site sample: seq=%d\n", seq_pick);
    }
    site_sample(aln, seq_pick, d, temperature);
  }

  aln->score = aln_score(&d->scorer, aln);
}
/*
 * Build a random starting alignment: set the width to the configured initial
 * width, zero the alignment, shuffle the sequence order and site-sample every
 * sequence once in that order (last argument of site_sample() fixed to 1),
 * then store the resulting score.
 */
void start_aln(glam2_aln *aln, data *d)
{
  int n;

#if 0
  aln->width = d->a.min_width;  /* ?? initial number of columns */
  aln->width = sqrt(d->a.max_width * d->a.min_width);  /* geometric mean */
#endif
  aln->width = d->a.init_width;
  aln_zero(aln);
  SHUFFLE(d->seq_order, aln->seq_num);

  n = 0;
  while (n < aln->seq_num) {
    site_sample(aln, d->seq_order[n], d, 1);
    ++n;
  }

  aln->score = aln_score(&d->scorer, aln);
}
gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt) { int i; gap_stack_t *stack; stack = (gap_stack_t*)calloc(1, sizeof(gap_stack_t)); stack->n_stacks = aln_score(max_mm+1, max_gapo+1, max_gape+1, opt); stack->stacks = (gap_stack1_t*)calloc(stack->n_stacks, sizeof(gap_stack1_t)); for (i = 0; i != stack->n_stacks; ++i) { gap_stack1_t *p = stack->stacks + i; p->m_entries = 4; p->stack = (gap_entry_t*)calloc(p->m_entries, sizeof(gap_entry_t)); } return stack; }
/*
 * Prepare a stack for reuse.
 *
 * Rewinds the entry pool, resets the best score to INT32_MAX, rebuilds the
 * pool through the destroy/init helpers when it has grown beyond
 * TMAP_MAP1_AUX_STACK_TOO_BIG, empties every bin, and makes sure there are at
 * least aln_score(max_mm+1, max_gapo+1, max_gape+1, opt) bins (rounded up by
 * tmap_roundup32).  Returns the same stack pointer.
 */
static tmap_map1_aux_stack_t*
tmap_map1_aux_stack_reset(tmap_map1_aux_stack_t *stack,
                          int32_t max_mm, int32_t max_gapo, int32_t max_gape,
                          const tmap_map_opt_t *opt)
{
  int32_t bin_i;
  int32_t n_bins_needed = 0;

  /* start handing out entries from the beginning of the memory pool again */
  stack->entry_pool_i = 0;
  stack->best_score = INT32_MAX;

  if(TMAP_MAP1_AUX_STACK_TOO_BIG < stack->entry_pool_length) {
      tmap_map1_aux_stack_destroy_helper(stack, 0);
      tmap_map1_aux_stack_init_helper(stack);
  }

  /* empty the bins; their storage is kept */
  for(bin_i = 0; bin_i < stack->n_bins; bin_i++) {
      stack->bins[bin_i].n_entries = 0;
  }

  /* add bins when the score range requires more than are available */
  n_bins_needed = aln_score(max_mm+1, max_gapo+1, max_gape+1, opt);
  if(stack->n_bins < n_bins_needed) {
      tmap_roundup32(n_bins_needed);
      stack->bins = tmap_realloc(stack->bins, sizeof(tmap_map1_aux_bin_t) * n_bins_needed, "stack->bins");
      /* only the newly added bins need initialising */
      bin_i = stack->n_bins;
      while(bin_i < n_bins_needed) {
          stack->bins[bin_i].n_entries = stack->bins[bin_i].m_entries = 0;
          stack->bins[bin_i].entries = NULL;
          bin_i++;
      }
      stack->n_bins = n_bins_needed;
  }

  stack->n_entries = 0;

  return stack;
}
/*
 * Core search loop of the map1 algorithm.
 *
 * Starting from the full SA interval [0, bwt->seq_len] at offset bases->l,
 * entries are popped from the score-binned stack and extended towards lower
 * offsets with matches, mismatches, insertions and deletions, subject to the
 * limits taken from `opt` (max_mm, max_gapo, max_gape, the read-length
 * dependent max_diff table, indel_ends_bound, max_cals_del, max_entries).
 * A hit is recorded when an entry reaches offset (len - seed2_len), or when
 * no further edits are possible and the remaining bases match exactly.
 *
 *   seq         read to align; its bases are fetched with tmap_seq_get_bases()
 *   index       supplies refseq, bwt and sa
 *   hash        passed through to the BWT match helpers
 *   width       per-position bounds (fields `bid` and `w`) used to prune
 *               entries and, via tmap_map1_aux_stack_shadow(), updated on hits
 *   seed_width  optional (may be NULL) bounds consulted for the seed region
 *   opt         mapping options
 *   stack       reusable search stack; reset at the start of the search
 *   seed2_len   number of trailing bases considered by this search
 *
 * Returns the result of tmap_map1_sam_to_real() on the collected hits, or a
 * fresh tmap_map_sams_init(NULL) result when the last seed2_len bases contain
 * more codes greater than 3 than max_mm or max_diff allow.
 */
tmap_map_sams_t *
tmap_map1_aux_core(tmap_seq_t *seq, tmap_index_t *index, tmap_bwt_match_hash_t *hash, tmap_bwt_match_width_t *width, tmap_bwt_match_width_t *seed_width, tmap_map_opt_t *opt,
                   tmap_map1_aux_stack_t *stack, int32_t seed2_len)
{
  int32_t max_mm = opt->max_mm, max_gapo = opt->max_gapo, max_gape = opt->max_gape, seed_max_diff = opt->seed_max_diff;
  int32_t best_score, next_best_score;
  int32_t best_cnt = 0;
  int32_t i, j, num_n = 0;
  int32_t max_edit_score;
  tmap_bwt_match_occ_t match_sa_start;
  tmap_string_t *bases=NULL;
  tmap_map_sams_t *sams = NULL;
  int32_t max_diff, best_diff;
  tmap_bwt_int_t k, l;
  tmap_refseq_t *refseq = index->refseq;
  tmap_bwt_t *bwt = index->bwt;
  tmap_sa_t *sa = index->sa;
  tmap_map1_aux_occ_t *occs = NULL;

  // slack added to best_score when deciding whether a popped entry is still worth exploring
  max_edit_score = opt->pen_mm;
  //if(max_edit_score < opt->pen_gapo + opt->pen_gape) max_edit_score = opt->pen_gapo + opt->pen_gape;
  //if(max_edit_score < opt->pen_gape) max_edit_score = opt->pen_gape;

  bases = tmap_seq_get_bases(seq);

  /*
  fputc('\n', stderr);
  for(i=0;i<bases->l;i++) {
      fputc("ACGTN"[(int)bases->s[i]], stderr);
  }
  fputc('\n', stderr);
  */

  // the maximum # of differences (table lookup by read length, clamped to the table's last entry)
  if(bases->l <= TMAP_MAP_OPT_MAX_DIFF_READ_LENGTH) {
      best_diff = max_diff = opt->max_diff_table[bases->l];
  }
  else {
      best_diff = max_diff = opt->max_diff_table[TMAP_MAP_OPT_MAX_DIFF_READ_LENGTH];
  }

  // bound differences by the maximum # of differences
  if(max_diff < max_mm) max_mm = max_diff;
  if(max_diff < max_gapo) max_gapo = max_diff;
  //if(max_diff < max_gape) max_gape = max_diff;

  // start above any reachable score: one more than the maximum of each edit type
  best_score = next_best_score = aln_score(max_mm+1, max_gapo+1, max_gape+1, opt);

  // check whether there are too many N (base codes greater than 3) in the last seed2_len bases
  // NOTE(review): j starts negative if seed2_len exceeds bases->l -- confirm callers rule this out
  for(j=bases->l-seed2_len,num_n=0;j<bases->l;j++) {
      if(3 < bases->s[j]) {
          num_n++;
      }
  }
  if(max_mm < num_n || max_diff < num_n) {
      return tmap_map_sams_init(NULL);
  }

  // initialize
  sams = tmap_map_sams_init(NULL);
  occs = NULL;

  match_sa_start.offset = 0;
  match_sa_start.hi = 0;
  match_sa_start.k = 0;
  match_sa_start.l = bwt->seq_len;

  stack = tmap_map1_aux_stack_reset(stack, max_mm, max_gapo, max_gape, opt); // reset stack
  // seed the search with a single edit-free entry at the end of the read
  tmap_map1_aux_stack_push(stack, bases->l, &match_sa_start, 0, 0, 0, STATE_M, 0, NULL, opt);

  // stop when the stack is empty or has grown to opt->max_entries
  while(0 < tmap_map1_aux_stack_size(stack) && tmap_map1_aux_stack_size(stack) < opt->max_entries) {
      tmap_map1_aux_stack_entry_t *e = NULL;
      int32_t len=-1;
      int32_t n_seed_mm=0, offset, width_cur_i;
      const uint8_t *str=NULL;
      int32_t sam_found, m;
      tmap_bwt_match_width_t *width_cur = NULL;
      const tmap_bwt_match_width_t *seed_width_cur = NULL;
      tmap_bwt_match_occ_t match_sa_cur, match_sa_next[4];

      // get the best entry
      e = tmap_map1_aux_stack_pop(stack);

      // bound with best score
      if(best_score + max_edit_score < e->score) {
          break; // no need to continue
      }

      // some more information
      match_sa_cur = e->match_sa;

      // check if we have too many edits; m is the number of edits still available
      m = max_diff - (e->n_mm + e->n_gapo + e->n_gape);
      if(m < 0) {
          continue; // too many edits
      }

      // get the rest of the information
      offset = e->offset; // zero-based
      str = (uint8_t*)bases->s;
      len = bases->l;
      width_cur = width;
      width_cur_i = seed2_len - (len - offset);

      if(NULL != seed_width) {
          seed_width_cur = seed_width;
          n_seed_mm = seed_max_diff - (e->n_mm + e->n_gapo + e->n_gape); // consider only mismatches in the seed
      }
      else {
          seed_width_cur = NULL;
      }
      if(0 < width_cur_i && m < width_cur[width_cur_i-1].bid) { // too many edits
          continue;
      }

      // check whether a sam is found
      // sam_found: 0 = none, 1 = reached offset len - seed2_len, 2 = finished via exact matching
      sam_found = 0;
      if(len - seed2_len == offset) {
          sam_found = 1;
      }
      else if(max_mm == e->n_mm // no mismatches from any state
              && ((e->state == STATE_M && max_gapo == e->n_gapo) // in STATE_M but no more gap opens
                  || (e->state != STATE_M && max_gape == e->n_gape))) { // in STATE_I/STATE_D but no more extensions
          if(0 < tmap_bwt_match_hash_exact_alt_reverse(bwt, offset, str, &match_sa_cur, hash)) { // the alignment must match exactly to sam
              sam_found = 2;
          }
          else {
              continue; // no sam, skip
          }
      }

      if(0 < sam_found) { // alignment found
          // check for duplicates
          // (M = SA interval of the current hit, S = SA interval of a stored hit;
          //  on overlap the stored interval is widened and k receives the number of added positions)
          if(0 < sams->n) {
              for(i=0;i<sams->n;i++) {
                  // check contained
                  if(match_sa_cur.k <= occs[i].k
                     && occs[i].k <= match_sa_cur.l) { // MK <= SK <= ML
                      if(occs[i].l <= match_sa_cur.l) { // MK <= SK <= SL <= ML
                          // Want (SK - MK) + (ML - SL)
                          k = occs[i].k - match_sa_cur.k; // (SK - MK)
                          k += match_sa_cur.l - occs[i].l; // (ML - SL)
                          occs[i].l = match_sa_cur.l; // Make SL = ML
                      }
                      else { // MK <= SK <= ML <= SL
                          k = occs[i].k - match_sa_cur.k; // (SK - MK)
                      }
                      occs[i].k = match_sa_cur.k; // Make SK = MK
                      break;
                  }
                  else if(match_sa_cur.k <= occs[i].l
                          && occs[i].l <= match_sa_cur.l) { // MK <= SL <= ML
                      if(match_sa_cur.k <= occs[i].k) { // MK <= SK <= SL <= ML
                          // Want (SK - MK) + (ML - SL)
                          k = occs[i].k - match_sa_cur.k; // (SK - MK)
                          k += match_sa_cur.l - occs[i].l; // (ML - SL)
                          occs[i].k = match_sa_cur.k; // Make SK = MK
                      }
                      else { // SK <= MK <= SL <= ML
                          k = match_sa_cur.l - occs[i].l; // (ML - SL)
                      }
                      occs[i].l = match_sa_cur.l; // Make SL = ML
                      break;
                  }
              }
              // the loop exited early, so the hit overlapped stored hit i and was merged into it
              if(i < sams->n) {
                  // shadow
                  if(0 < k) {
                      //tmap_map1_aux_stack_shadow(k, bwt->seq_len, e->last_diff_offset, width_cur);
                      width_cur_i = seed2_len - (len - e->last_diff_offset);
                      tmap_map1_aux_stack_shadow(k, seed2_len, width_cur_i, width_cur);
                  }
                  sam_found = 0;
                  continue;
              }
          }

          int32_t score = aln_score(e->n_mm, e->n_gapo, e->n_gape, opt);
          int32_t do_add = 1; // NOTE(review): never cleared, so the append below always runs
          // the first hit defines the best score and best edit count
          if(sams->n == 0) {
              best_score = score;
              best_cnt = 0;
              best_diff = e->n_mm + e->n_gapo + e->n_gape;
          }
          if(score == best_score) {
              best_cnt += match_sa_cur.l - match_sa_cur.k + 1;
          }
          else {
              // a non-best hit: tighten the edit budget to one more than the best hit's edits
              if(best_diff + 1 <= max_diff) {
                  max_diff = best_diff + 1;
              }
              if(score < next_best_score) {
                  next_best_score = score;
              }
              else if(next_best_score < score) {
                  // no need to examine further
                  break;
              }
          }
          if(do_add) { // append
              uint32_t op, op_len, cigar_i;
              tmap_map_sam_t *sam = NULL;
              tmap_map1_aux_stack_entry_t *cur = NULL;

              // tmap_map_sams_realloc() is expected to have updated sams->n; occs is grown to match
              tmap_map_sams_realloc(sams, sams->n+1);
              occs = tmap_realloc(occs, sizeof(tmap_map1_aux_occ_t) * sams->n, "occs");

              sam = &sams->sams[sams->n-1];

              sam->algo_id = TMAP_MAP_ALGO_MAP1;
              sam->algo_stage = 0;
              sam->score = e->score;

              // aux data
              tmap_map_sam_malloc_aux(sam);
              k = occs[sams->n-1].k = match_sa_cur.k;
              l = occs[sams->n-1].l= match_sa_cur.l;
              sam->aux.map1_aux->n_mm = e->n_mm;
              sam->aux.map1_aux->n_gapo = e->n_gapo;
              sam->aux.map1_aux->n_gape = e->n_gape;

              // aux data: reference length
              // walk the chain of previous entries and sum the lengths of the M and D runs
              cur = e;
              i = e->i;
              sam->aux.map1_aux->aln_ref = 0;
              cigar_i = 0; // NOTE(review): assigned but not read in this function
              if(2 == sam_found) { // we used 'tmap_bwt_match_exact_alt_reverse'
                  op = STATE_M;
                  op_len = offset;
              }
              else {
                  op = -1;
                  op_len = 0;
              }
              while(0 <= i) {
                  cur = stack->entry_pool[i];
                  if(len == cur->offset) break; // reached the initial entry
                  if(op != cur->state) {
                      if(STATE_M == op || STATE_D == op) {
                          sam->aux.map1_aux->aln_ref += op_len;
                      }
                      op = cur->state;
                      op_len = 1;
                  }
                  else {
                      op_len++;
                  }
                  //fprintf(stderr, "cur->state=%c op_len=%d cur->prev_i=%d k=%u l=%u\n", "MIDS"[cur->state], op_len, cur->prev_i, cur->match_sa.k, cur->match_sa.l);
                  i = cur->prev_i;
              }
              // account for the final run
              if(STATE_M == op || STATE_D == op) {
                  sam->aux.map1_aux->aln_ref += op_len;
              }

              /*
              fprintf(stderr, "shadow 2 k=%u l=%u len=%d offset=%d last_diff_offset=%d\n",
                      k, l, len, offset, e->last_diff_offset);
              fprintf(stderr, "e->n_mm=%d e->n_gapo=%d e->n_gape=%d\n",
                      e->n_mm, e->n_gapo, e->n_gape);
              */
              //tmap_map1_aux_stack_shadow(l - k + 1, bwt->seq_len, e->last_diff_offset, width_cur);
              width_cur_i = seed2_len - (len - e->last_diff_offset);
              tmap_map1_aux_stack_shadow(l - k + 1, seed2_len, width_cur_i, width_cur);
              if(opt->max_best_cals < best_cnt) {
                  // ignore if too many "best" have been found
                  occs[sams->n-1].l -= (best_cnt - opt->max_best_cals); // only save the maximum
                  break;
              }
          }
      }
      else {
          // no hit yet: extend this entry by one step
          int32_t allow_diff = 1, allow_mm = (e->n_mm < max_mm) ? 1 : 0;

          // decrement the offset
          offset--;

          // use a bound for mismatches
          if(0 < offset) {
              int32_t seed_width_cur_i = offset - (len - opt->seed_length);
              width_cur_i = seed2_len - (len - offset);
              if(0 < width_cur_i) {
                  if(m-1 < width_cur[width_cur_i-1].bid) {
                      allow_diff = 0;
                  }
                  else if(width_cur[width_cur_i-1].bid == m-1
                          && width_cur[width_cur_i].bid == m-1
                          && width_cur[width_cur_i-1].w == width_cur[width_cur_i].w) {
                      allow_mm = 0;
                  }
              }
              if(0 < seed_width_cur_i) {
                  // NOTE(review): the second "0 < seed_width_cur_i" test repeats the enclosing condition
                  if(NULL != seed_width_cur && 0 < seed_width_cur_i) {
                      if(n_seed_mm-1 < seed_width_cur[seed_width_cur_i-1].bid) {
                          allow_diff = 0;
                      }
                      else if(seed_width_cur[seed_width_cur_i-1].bid == n_seed_mm-1
                              && seed_width_cur[seed_width_cur_i].bid == n_seed_mm-1
                              && seed_width_cur[seed_width_cur_i-1].w == seed_width_cur[seed_width_cur_i].w) {
                          allow_mm = 0;
                      }
                  }
              }
          }

          // retrieve the next SA interval (one per base, in match_sa_next[0..3])
          tmap_bwt_match_hash_2occ4(bwt, &e->match_sa, match_sa_next, hash);

          // insertions/deletions
          if(allow_diff
             && opt->indel_ends_bound + e->n_gapo + e->n_gape <= offset
             && opt->indel_ends_bound + e->n_gapo + e->n_gape <= len - offset) { // check to add gaps
              if(STATE_M == e->state) { // gap open
                  if(e->n_gapo < max_gapo) { // gap open is allowed
                      // insertion
                      tmap_map1_aux_stack_push(stack, offset, &match_sa_cur, e->n_mm, e->n_gapo + 1, e->n_gape, STATE_I, 1, e, opt);

                      // deletion
                      for(j = 0; j != 4; ++j) {
                          if(match_sa_next[j].k <= match_sa_next[j].l) {
                              // remember that a gap deletion does not consume a
                              // read base, so use 'offset+1'
                              tmap_map1_aux_stack_push(stack, offset+1, &match_sa_next[j], e->n_mm, e->n_gapo + 1, e->n_gape, STATE_D, 1, e, opt);
                          }
                      }
                  }
              }
              else if(STATE_I == e->state) { // extension of an insertion
                  if(e->n_gape < max_gape) { // gap extension is allowed
                      tmap_map1_aux_stack_push(stack, offset, &match_sa_cur, e->n_mm, e->n_gapo, e->n_gape + 1, STATE_I, 1, e, opt);
                  }
              }
              else if(STATE_D == e->state) { // extension of a deletion
                  if(e->n_gape < max_gape) {
                      if(e->n_gape + e->n_gapo < max_diff || e->match_sa.l - e->match_sa.k + 1 < opt->max_cals_del) { // gap extension is allowed
                          for(j = 0; j != 4; ++j) {
                              if(match_sa_next[j].k <= match_sa_next[j].l) {
                                  // remember that a gap deletion does not consume a
                                  // read base, so use 'offset+1'
                                  tmap_map1_aux_stack_push(stack, offset+1, &match_sa_next[j], e->n_mm, e->n_gapo, e->n_gape + 1, STATE_D, 1, e, opt);
                              }
                          }
                      }
                  }
              }
          }

          // mismatches
          if(1 == allow_mm && 1 == allow_diff) { // mismatches allowed
              // j == 0 tries the read base itself; j > 0 (or a base code above 3) counts as a mismatch
              for(j=0;j<4;j++) {
                  int32_t c = (str[offset] + j) & 3;
                  int32_t is_mm = (0 < j || 3 < str[offset]);
                  if(match_sa_next[c].k <= match_sa_next[c].l) {
                      tmap_map1_aux_stack_push(stack, offset, &match_sa_next[c], e->n_mm + is_mm, e->n_gapo, e->n_gape, STATE_M, is_mm, e, opt);
                  }
              }
          }
          else if(str[offset] < 4) { // try exact match only
              int32_t c = str[offset] & 3;
              if(match_sa_next[c].k <= match_sa_next[c].l) {
                  tmap_map1_aux_stack_push(stack, offset, &match_sa_next[c], e->n_mm, e->n_gapo, e->n_gape, STATE_M, 0, e, opt);
              }
          }
      }
  }

  return tmap_map1_sam_to_real(sams, occs, bases, seed2_len, refseq, bwt, sa, hash, opt);
}
/*
 * Push a new entry onto the score-binned search stack.
 *
 * The entry is taken from the stack's entry pool (which is grown four-fold
 * when exhausted), filled in, and appended to the bin indexed by its score
 * (aln_score() of the three edit counts).
 *
 *   stack          stack to push onto
 *   offset         offset stored in the entry
 *   match_sa_prev  SA interval copied into the entry
 *   n_mm, n_gapo, n_gape
 *                  mismatch, gap-open and gap-extension counts
 *   state          state stored in the entry
 *   is_diff        1 if this step introduced a difference; the entry's
 *                  last_diff_offset is then `offset`, otherwise it is
 *                  inherited from prev_entry
 *   prev_entry     entry this one extends, or NULL for the first entry (then
 *                  last_diff_offset is `offset` and prev_i is -1)
 *   opt            options forwarded to aln_score()
 *
 * Duplicate entries (same SA interval, offset and state) are deliberately not
 * filtered out: the check was judged too expensive and not necessary.
 */
static inline void
tmap_map1_aux_stack_push(tmap_map1_aux_stack_t *stack, int32_t offset, tmap_bwt_match_occ_t *match_sa_prev, int32_t n_mm, int32_t n_gapo, int32_t n_gape, int32_t state, int32_t is_diff, tmap_map1_aux_stack_entry_t *prev_entry, const tmap_map_opt_t *opt)
{
  int32_t i;
  int32_t n_bins_needed = 0;
  tmap_map1_aux_stack_entry_t *entry = NULL;
  tmap_map1_aux_bin_t *bin = NULL;

  // check to see if we need more memory
  if(stack->entry_pool_length <= stack->entry_pool_i) {
      int32_t first_new = stack->entry_pool_length;
      stack->entry_pool_length <<= 2;
      stack->entry_pool = tmap_realloc(stack->entry_pool, sizeof(tmap_map1_aux_stack_entry_t*)*stack->entry_pool_length, "stack->entry_pool");
      // allocate the entries for the newly added pool slots
      for(i=first_new;i<stack->entry_pool_length;i++) {
          stack->entry_pool[i] = tmap_malloc(sizeof(tmap_map1_aux_stack_entry_t), "stack->entry_pool[i]");
      }
  }

  entry = stack->entry_pool[stack->entry_pool_i];
  entry->score = aln_score(n_mm, n_gapo, n_gape, opt);
  entry->n_mm = n_mm;
  entry->n_gapo = n_gapo;
  entry->n_gape = n_gape;
  entry->state = state;
  entry->match_sa = (*match_sa_prev);
  entry->i = stack->entry_pool_i;
  entry->offset = offset;
  if(NULL == prev_entry) {
      entry->last_diff_offset = offset;
      entry->prev_i = -1;
  }
  else {
      entry->last_diff_offset = (1 == is_diff) ? (offset) : prev_entry->last_diff_offset;
      entry->prev_i = prev_entry->i;
  }

  if(stack->n_bins <= entry->score) {
      // resize the bins if necessary
      n_bins_needed = entry->score + 1;
      // realloc
      tmap_roundup32(n_bins_needed);
      stack->bins = tmap_realloc(stack->bins, sizeof(tmap_map1_aux_bin_t) * n_bins_needed, "stack->bins");
      // initialize the newly added bins
      for(i=stack->n_bins;i<n_bins_needed;i++) {
          stack->bins[i].n_entries = stack->bins[i].m_entries = 0;
          stack->bins[i].entries = NULL;
      }
      stack->n_bins = n_bins_needed;
  }
  // sanity check: the bin for this score must exist now
  if(stack->n_bins <= entry->score) {
      tmap_bug();
  }
  bin = &stack->bins[entry->score];

  // update best score
  if(stack->best_score > entry->score) stack->best_score = entry->score;

  if(bin->m_entries <= bin->n_entries) {
      bin->m_entries++;
      tmap_roundup32(bin->m_entries);
      // bin->entries holds entry pointers, so size it by its own element type
      // (it was previously sized with sizeof(tmap_map1_aux_bin_t))
      bin->entries = tmap_realloc(bin->entries, sizeof(*bin->entries) * bin->m_entries, "bin->entries");
  }
  bin->entries[bin->n_entries] = entry;
  bin->n_entries++;

  stack->entry_pool_i++;
  stack->n_entries++;
}
/*
 * Create a stack sized for the given edit limits: the number of sub-stacks
 * passed to gap_init_stack2() is aln_score(max_mm+1, max_gapo+1, max_gape+1, opt).
 */
gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt)
{
	const int n_stacks = aln_score(max_mm+1, max_gapo+1, max_gape+1, opt);

	return gap_init_stack2(n_stacks);
}
/*
 * Full progressive multiple alignment.
 *
 *   istart       0 to align all sequences from scratch; otherwise sequences
 *                1..istart+1 are treated as already aligned
 *   phylip_name  tree file name handed to read_tree()
 *
 * Works on file/global state (nseqs, tmat, seq_weight, sets, nsets,
 * output_index, output_order, divergence_cutoff, no_weights).
 *
 * Returns nseqs on success, 0 if read_tree() or calc_similarities() reports
 * failure, and -1 if prfalign() returns a negative score in the first pass.
 */
sint malign(sint istart,char *phylip_name) /* full progressive alignment*/
{
	static sint *aligned;
	static sint *group;
	static sint ix;

	sint *maxid, max, sum;
	sint *tree_weight;
	sint i,j,set,iseq=0;
	sint status,entries;
	lint score = 0;

	info("Start of Multiple Alignment");

	/* get the phylogenetic tree from *.ph */

	if (nseqs >= 2)
	{
		status = read_tree(phylip_name, (sint)0, nseqs);
		if (status == 0) return((sint)0);
	}

	/* calculate sequence weights according to branch lengths of the tree -
	   weights in global variable seq_weight normalised to sum to 100 */

	calc_seq_weights((sint)0, nseqs, seq_weight);

	/* recalculate tmat matrix as percent similarity matrix */

	status = calc_similarities(nseqs);
	if (status == 0) return((sint)0);

	/* for each sequence, find the most closely related sequence
	   (maxid[i] is the largest tmat[i][j] over all j != i; indices are 1-based) */

	maxid = (sint *)ckalloc( (nseqs+1) * sizeof (sint));
	for (i=1;i<=nseqs;i++)
	{
		maxid[i] = -1;
		for (j=1;j<=nseqs;j++)
			if (j!=i && maxid[i] < tmat[i][j]) maxid[i] = tmat[i][j];
	}

	/* group the sequences according to their relative divergence */

	if (istart == 0)
	{
		sets = (sint **) ckalloc( (nseqs+1) * sizeof (sint *) );
		for(i=0;i<=nseqs;i++)
			sets[i] = (sint *)ckalloc( (nseqs+1) * sizeof (sint) );

		create_sets((sint)0,nseqs);
		info("There are %d groups",(pint)nsets);

		/* clear the memory used for the phylogenetic tree */

		if (nseqs >= 2)
			clear_tree(NULL);

		/* start the multiple alignments......... */

		info("Aligning...");

		/* first pass, align closely related sequences first.... */

		ix = 0;
		aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) );
		for (i=0;i<=nseqs;i++) aligned[i] = 0;

		for(set=1;set<=nsets;++set)
		{
			/* count the members of this set whose best similarity exceeds the
			   divergence cutoff, marking them as aligned and recording their
			   output position */
			entries=0;
			for (i=1;i<=nseqs;i++)
			{
				if ((sets[set][i] != 0) && (maxid[i] > divergence_cutoff))
				{
					entries++;
					if (aligned[i] == 0)
					{
						if (output_order==INPUT)
						{
							++ix;
							output_index[i] = i;
						}
						else output_index[++ix] = i;
						aligned[i] = 1;
					}
				}
			}

			if(entries > 0) score = prfalign(sets[set], aligned);
			else score=0.0;

			/* negative score means fatal error... exit now! */
			/* NOTE(review): maxid, sets and aligned are not released on this path */

			if (score < 0)
			{
				return(-1);
			}
			if ((entries > 0) && (score > 0))
				info("Group %d: Sequences:%4d Score:%d",
				(pint)set,(pint)entries,(pint)score);
			else
				info("Group %d: Delayed",
				(pint)set);
		}

		for (i=0;i<=nseqs;i++)
			sets[i]=ckfree((void *)sets[i]);
		sets=ckfree(sets);
	}
	else
	{
		/* clear the memory used for the phylogenetic tree */

		if (nseqs >= 2)
			clear_tree(NULL);

		/* sequences 1..istart+1 count as already aligned, the rest as unaligned */
		aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) );
		ix = 0;
		for (i=1;i<=istart+1;i++)
		{
			aligned[i] = 1;
			++ix;
			output_index[i] = i;
		}
		for (i=istart+2;i<=nseqs;i++) aligned[i] = 0;
	}

	/* second pass - align remaining, more divergent sequences..... */

	/* if not all sequences were aligned, for each unaligned sequence,
	   find its closest pair amongst the aligned sequences. */

	group = (sint *)ckalloc( (nseqs+1) * sizeof (sint));
	/* keep a copy of the tree-derived weights; seq_weight is rescaled below */
	tree_weight = (sint *) ckalloc( (nseqs) * sizeof(sint) );
	for (i=0;i<nseqs;i++)
		tree_weight[i] = seq_weight[i];

	/* if we haven't aligned any sequences, in the first pass - align the
	   two most closely related sequences now */
	if(ix==0)
	{
		max = -1;
		iseq = 0;
		for (i=1;i<=nseqs;i++)
		{
			for (j=i+1;j<=nseqs;j++)
			{
				if (max < tmat[i][j])
				{
					max = tmat[i][j];
					iseq = i;
				}
			}
		}
		/* only the first member of the pair is marked here; the loop below
		   then adds the sequence closest to it */
		aligned[iseq]=1;
		if (output_order == INPUT)
		{
			++ix;
			output_index[iseq] = iseq;
		}
		else
			output_index[++ix] = iseq;
	}

	while (ix < nseqs)
	{
		/* for every unaligned sequence, best similarity to any aligned one */
		for (i=1;i<=nseqs;i++) {
			if (aligned[i] == 0)
			{
				maxid[i] = -1;
				for (j=1;j<=nseqs;j++)
					if ((maxid[i] < tmat[i][j]) && (aligned[j] != 0))
						maxid[i] = tmat[i][j];
			}
		}
		/* find the most closely related sequence to those already aligned */

		max = -1;
		iseq = 0;
		for (i=1;i<=nseqs;i++)
		{
			if ((aligned[i] == 0) && (maxid[i] > max))
			{
				max = maxid[i];
				iseq = i;
			}
		}

		/* align this sequence to the existing alignment */
		/* weight sequences with percent identity with profile*/
		/* OR...., multiply sequence weights from tree by percent identity with new sequence */
		if(no_weights==FALSE) {
			/* seq_weight/tree_weight are 0-based, aligned/tmat are 1-based */
			for (j=0;j<nseqs;j++)
				if (aligned[j+1] != 0)
					seq_weight[j] = tree_weight[j] * tmat[j+1][iseq];
			/*
			  Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR
			*/

			sum = 0;
			for (j=0;j<nseqs;j++)
				if (aligned[j+1] != 0)
					sum += seq_weight[j];
			if (sum == 0)
			{
				/* all weights vanished: fall back to equal weights; j == nseqs after the loop */
				for (j=0;j<nseqs;j++)
					seq_weight[j] = 1;
				sum = j;
			}
			for (j=0;j<nseqs;j++)
				if (aligned[j+1] != 0)
				{
					seq_weight[j] = (seq_weight[j] * INT_SCALE_FACTOR) / sum;
					if (seq_weight[j] < 1) seq_weight[j] = 1;
				}
		}

		/* group[j] == 1: already aligned, group[j] == 2: the sequence being added */
		entries = 0;
		for (j=1;j<=nseqs;j++)
			if (aligned[j] != 0)
			{
				group[j] = 1;
				entries++;
			}
			else if (iseq==j)
			{
				group[j] = 2;
				entries++;
			}
		aligned[iseq] = 1;

		score = prfalign(group, aligned);
		info("Sequence:%d Score:%d",(pint)iseq,(pint)score);
		if (output_order == INPUT)
		{
			++ix;
			output_index[iseq] = iseq;
		}
		else
			output_index[++ix] = iseq;
	}

	group=ckfree((void *)group);
	aligned=ckfree((void *)aligned);
	maxid=ckfree((void *)maxid);
	tree_weight=ckfree((void *)tree_weight);

	aln_score();

	/* make the rest (output stuff) into routine clustal_out in file amenu.c */

	return(nseqs);
}
/*
 * Align sequences one at a time to an existing profile.
 *
 *   istart       sequences 1..istart+1 are treated as already aligned (the
 *                profile); the remaining ones are added in order of
 *                decreasing similarity to the aligned set
 *   phylip_name  tree file name handed to read_tree()
 *
 * Works on file/global state (nseqs, tmat, seq_weight, output_index,
 * output_order, debug).
 *
 * Returns nseqs on success and 0 if read_tree() or calc_similarities()
 * reports failure.
 *
 * The copy of the tree weights (tree_weight) is now released on every exit
 * path taken after it has been allocated; previously it was never freed.
 */
sint seqalign(sint istart,char *phylip_name) /* sequence alignment to existing profile */
{
	static sint *aligned, *tree_weight;
	static sint *group;
	static sint ix;

	sint *maxid, max;
	sint i,j,status,iseq=0;
	sint sum,entries;
	lint score = 0;

	info("Start of Multiple Alignment");

	/* get the phylogenetic tree from *.ph */

	if (nseqs >= 2)
	{
		status = read_tree(phylip_name, (sint)0, nseqs);
		if (status == 0) return(0);
	}

	/* calculate sequence weights according to branch lengths of the tree -
	   weights in global variable seq_weight normalised to sum to 100 */

	calc_seq_weights((sint)0, nseqs, seq_weight);

	/* keep a copy of the tree-derived weights; seq_weight is rescaled below */
	tree_weight = (sint *) ckalloc( (nseqs) * sizeof(sint) );
	for (i=0;i<nseqs;i++)
		tree_weight[i] = seq_weight[i];

	/* recalculate tmat matrix as percent similarity matrix */

	status = calc_similarities(nseqs);
	if (status == 0)
	{
		/* do not leak the weight copy on the failure path */
		tree_weight=ckfree((void *)tree_weight);
		return((sint)0);
	}

	/* for each sequence, find the most closely related sequence */

	maxid = (sint *)ckalloc( (nseqs+1) * sizeof (sint));
	for (i=1;i<=nseqs;i++)
	{
		maxid[i] = -1;
		for (j=1;j<=nseqs;j++)
			if (maxid[i] < tmat[i][j]) maxid[i] = tmat[i][j];
	}

	/* clear the memory used for the phylogenetic tree */

	if (nseqs >= 2)
		clear_tree(NULL);

	/* sequences 1..istart+1 count as already aligned, the rest as unaligned */
	aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) );
	ix = 0;
	for (i=1;i<=istart+1;i++)
	{
		aligned[i] = 1;
		++ix;
		output_index[i] = i;
	}
	for (i=istart+2;i<=nseqs;i++) aligned[i] = 0;

	/* for each unaligned sequence, find its closest pair amongst the
	   aligned sequences. */

	group = (sint *)ckalloc( (nseqs+1) * sizeof (sint));

	while (ix < nseqs)
	{
		if (ix > 0)
		{
			/* for every unaligned sequence, best similarity to any aligned one */
			for (i=1;i<=nseqs;i++) {
				if (aligned[i] == 0)
				{
					maxid[i] = -1;
					for (j=1;j<=nseqs;j++)
						if ((maxid[i] < tmat[i][j]) && (aligned[j] != 0))
							maxid[i] = tmat[i][j];
				}
			}
		}

		/* find the most closely related sequence to those already aligned */

		max = -1;
		for (i=1;i<=nseqs;i++)
		{
			if ((aligned[i] == 0) && (maxid[i] > max))
			{
				max = maxid[i];
				iseq = i;
			}
		}

		/* align this sequence to the existing alignment */
		/* group[j] == 1: already aligned, group[j] == 2: the sequence being added */

		entries = 0;
		for (j=1;j<=nseqs;j++)
			if (aligned[j] != 0)
			{
				group[j] = 1;
				entries++;
			}
			else if (iseq==j)
			{
				group[j] = 2;
				entries++;
			}
		aligned[iseq] = 1;

		/* EITHER....., set sequence weights equal to percent identity with new sequence */
		/*
		for (j=0;j<nseqs;j++)
			seq_weight[j] = tmat[j+1][iseq];
		*/
		/* OR...., multiply sequence weights from tree by percent identity with new sequence */
		/* seq_weight/tree_weight are 0-based, group/tmat are 1-based */
		for (j=0;j<nseqs;j++)
			seq_weight[j] = tree_weight[j] * tmat[j+1][iseq];
		if (debug>1)
			for (j=0;j<nseqs;j++) if (group[j+1] == 1)fprintf (stdout,"sequence %d: %d\n", j+1,tree_weight[j]);
		/*
		  Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR
		*/

		sum = 0;
		for (j=0;j<nseqs;j++)
			if (group[j+1] == 1) sum += seq_weight[j];
		if (sum == 0)
		{
			/* all weights vanished: fall back to equal weights; j == nseqs after the loop */
			for (j=0;j<nseqs;j++)
				seq_weight[j] = 1;
			sum = j;
		}
		for (j=0;j<nseqs;j++)
		{
			seq_weight[j] = (seq_weight[j] * INT_SCALE_FACTOR) / sum;
			if (seq_weight[j] < 1) seq_weight[j] = 1;
		}

		if (debug > 1) {
			fprintf(stdout,"new weights\n");
			for (j=0;j<nseqs;j++) if (group[j+1] == 1)fprintf( stdout,"sequence %d: %d\n", j+1,seq_weight[j]);
		}

		score = prfalign(group, aligned);
		info("Sequence:%d Score:%d",(pint)iseq,(pint)score);
		if (output_order == INPUT)
		{
			++ix;
			output_index[iseq] = iseq;
		}
		else
			output_index[++ix] = iseq;
	}

	group=ckfree((void *)group);
	aligned=ckfree((void *)aligned);
	maxid=ckfree((void *)maxid);
	/* release the weight copy, as malign() does */
	tree_weight=ckfree((void *)tree_weight);

	aln_score();

	/* make the rest (output stuff) into routine clustal_out in file amenu.c */

	return(nseqs);
}