/*
 * shrink_aln
 *
 * Reduces alignment A, in place, to the nseq rows selected by list[]:
 * row list[k] of the incoming alignment ends up as row k of the result.
 * A full copy of A is taken first, so a source row is always read from
 * the untouched copy even when its slot in A has already been rewritten.
 *
 * Per selected row the sequence comment, alignment comment, aligned
 * string, the five order[] fields, score_seq and len are transferred.
 * A->name is not touched by this function.
 *
 * On exit A->nseq is nseq and A->len_aln is the length of row 0.
 * Returns A (the pointer that was passed in).
 */
Alignment* shrink_aln ( Alignment *A, int nseq, int *list)
{
    Alignment *snapshot = copy_aln (A, NULL);
    int dst;

    for (dst = 0; dst < nseq; dst++)
    {
        int src = list[dst];
        int field;

        sprintf (A->seq_comment[dst], "%s", snapshot->seq_comment[src]);
        sprintf (A->aln_comment[dst], "%s", snapshot->aln_comment[src]);
        sprintf (A->seq_al[dst], "%s", snapshot->seq_al[src]);

        for (field = 0; field < 5; field++)
        {
            A->order[dst][field] = snapshot->order[src][field];
        }

        A->score_seq[dst] = snapshot->score_seq[src];
        A->len[dst] = snapshot->len[src];
    }

    A->nseq = nseq;
    A->len_aln = strlen (A->seq_al[0]);

    free_aln (snapshot);
    return A;
}
/*
 * fasta_cdna_pair_wise
 *
 * Diagonal-guided pairwise cDNA alignment, single pass.
 *
 *   1. builds the frame alignment with dna_aln2_3frame_cdna_aln() and
 *      forces its nseq to 6;
 *   2. scores the diagonals with evaluate_diagonals_cdna();
 *   3. asks extract_N_diag() for 20 diagonals (mode 1);
 *   4. hands them to make_fasta_cdna_pair_wise().
 *
 * The amino-acid group list is built once, on the first call, from
 * CL->matrix_for_aa_group and cached in function-local statics.
 *
 * Returns the score reported by make_fasta_cdna_pair_wise().
 */
int fasta_cdna_pair_wise (Alignment *A,int*ns, int **l_s,Constraint_list *CL)
{
    /*TREATMENT OF THE TERMINAL GAP PENALTIES*/
    /*TG_MODE=0---> gop and gep*/
    /*TG_MODE=1---> ---     gep*/
    /*TG_MODE=2---> ---     ---*/

    static int aa_group_count;
    static char **aa_groups;

    int maximise = CL->maximise;
    int ktup = CL->ktup;
    int len_first, len_second;
    int **diag_scores;
    int *picked_diags;
    int result;
    Alignment *frames;

    if (aa_groups == NULL)
    {
        aa_groups = make_group_aa (&aa_group_count, CL->matrix_for_aa_group);
    }

    frames = dna_aln2_3frame_cdna_aln (A, ns, l_s);
    frames->nseq = 6;

    /* Rows 0 and 3 provide the two lengths handed to extract_N_diag(). */
    len_first = strlen (frames->seq_al[0]);
    len_second = strlen (frames->seq_al[3]);

    diag_scores = evaluate_diagonals_cdna (frames, ns, l_s, CL, maximise, aa_group_count, aa_groups, ktup);
    picked_diags = extract_N_diag (len_first, len_second, diag_scores, 20, 1);
    result = make_fasta_cdna_pair_wise (A, frames, ns, l_s, CL, picked_diags);

    free_aln (frames);
    free_int (diag_scores, -1);
    vfree (picked_diags);

    return result;
}
/*
 * extract_domain_with_coordinates
 *
 * Cuts the window [start, start+len) out of row 0 of a freshly built
 * alignment (add_seq2aln on CL->S) and then calls add_seq2aln() a second
 * time on the trimmed alignment.
 *
 * A negative len selects a window that extends to the left of start:
 * len is made positive and start is moved to start-len+1.
 *
 * Returns NULL when
 *   - start is negative,
 *   - start+len exceeds the length of the reference sequence
 *     (the last sequence of CL->S when CL->packed_seq_lu is set,
 *     the first one otherwise),
 *   - a position of the window is flagged UNDEFINED in
 *     (CL->moca)->forbiden_residues[0] (a "*" is written to stderr),
 *   - the extracted window contains an 'X'.
 *
 * The RESULT argument is not read by this function.
 */
Alignment *extract_domain_with_coordinates ( Alignment *RESULT,int start, int len, Constraint_list *CL)
{
    Alignment *domain = NULL;
    char *window;
    int pos;

    /*ADJUST THE DIRECTION OF THE DOMAIN: len<0:left and len>0:right*/
    if (len < 0)
    {
        len = len * -1;
        start = start - len + 1;
    }

    /*CHECK THAT THE BOUNDARY CONDITIONS*/
    if (start < 0)
    {
        return NULL;
    }
    if (CL->packed_seq_lu)
    {
        if ((start + len) > strlen ((CL->S)->seq[(CL->S)->nseq - 1])) return NULL;
    }
    else
    {
        if ((start + len) > strlen ((CL->S)->seq[0])) return NULL;
    }

    if ((CL->moca)->forbiden_residues)
    {
        for (pos = start; pos < start + len; pos++)
        {
            if ((CL->moca)->forbiden_residues[0][pos + 1] == UNDEFINED)
            {
                fprintf ( stderr, "*");
                return NULL;
            }
        }
    }

    /*EXTRACT THE DOMAIN*/
    domain = add_seq2aln (CL, domain, CL->S);
    window = extract_char (domain->seq_al[0], start, len);

    for (pos = 0; pos < len; pos++)
    {
        if (window[pos] == 'X')
        {
            free_aln (domain);
            return NULL;
        }
    }

    sprintf ( domain->seq_al[0], "%s", window);
    domain->order[0][1] = start;

    domain = add_seq2aln (CL, domain, CL->S);
    return domain;
}
/*
 * ktup_pair_wise
 *
 * Scores the two sequence groups described by ns[]/l_s[] and stores the
 * result in A->score and A->score_aln.
 *
 * Each group is first turned into a single string: a consensus obtained
 * with sub_aln2cons_seq_mat(..., "blosum62mt") when the group holds more
 * than one sequence, otherwise a private copy of the group's only row.
 *
 *   - If either string is shorter than 10 characters, both are ungapped,
 *     aligned with align_two_sequences() (blosum62mt, gop -10, gep -1,
 *     myers_miller_pair_wise) and scored with aln2sim(..., "idmat").
 *   - Otherwise both strings are recoded with string_convert() using the
 *     "vasiliky" group list (built once and cached in a static) and
 *     compared with ktup_comparison() using CL->ktup.
 *
 * Returns A->score.
 *
 * Fix: the short-sequence branch used to return before the two working
 * strings were released; both branches now share the same clean-up.
 */
int ktup_pair_wise (Alignment *A,int*ns, int **l_s,Constraint_list *CL)
{
    static char **gl;
    static int ng;
    char *seq1;
    char *seq2;
    int min_len=10;

    if ( !gl) gl=make_group_aa (&ng, "vasiliky");

    if ( ns[0]>1)seq1=sub_aln2cons_seq_mat (A, ns[0], l_s[0],"blosum62mt");
    else
    {
        seq1=(char*)vcalloc ( strlen (A->seq_al[l_s[0][0]])+1, sizeof (char));
        sprintf ( seq1, "%s",A->seq_al[l_s[0][0]]);
    }

    if ( ns[1]>1)seq2=sub_aln2cons_seq_mat (A, ns[1], l_s[1],"blosum62mt");
    else
    {
        seq2=(char*)vcalloc ( strlen (A->seq_al[l_s[1][0]])+1, sizeof (char));
        sprintf ( seq2, "%s",A->seq_al[l_s[1][0]]);
    }

    if ( strlen (seq1)<(size_t)min_len || strlen (seq2)<(size_t)min_len)
    {
        Alignment *B;

        ungap(seq1);
        ungap(seq2);
        B=align_two_sequences ( seq1, seq2, "blosum62mt",-10, -1, "myers_miller_pair_wise");
        A->score=A->score_aln=aln2sim(B, "idmat");
        free_aln (B);
    }
    else
    {
        string_convert (seq1, ng, gl);
        string_convert (seq2, ng, gl);
        A->score=A->score_aln=ktup_comparison (seq1,seq2, CL->ktup);
    }

    /* Single exit: the working strings are released on both paths. */
    vfree (seq1);
    vfree (seq2);
    return A->score;
}
void Alifold:: fold(const ALN& aln, const std::vector<Fasta>& fa, const std::string& str, BP& bp) const { const uint L=aln.front().second.size(); std::string p(str); std::replace(p.begin(), p.end(), '.', 'x'); std::replace(p.begin(), p.end(), '?', '.'); int bk = Vienna::fold_constrained; Vienna::fold_constrained = 1; char** seqs = alloc_aln(aln, fa); // scaling parameters to avoid overflow std::string res(p); #ifdef HAVE_VIENNA20 double min_en = Vienna::alifold((const char**)seqs, &res[0]); #else double min_en = Vienna::alifold(seqs, &res[0]); #endif double kT = (Vienna::temperature+273.15)*1.98717/1000.; /* in Kcal */ Vienna::pf_scale = exp(-(1.07*min_en)/kT/L); Vienna::free_alifold_arrays(); pair_info* pi; #ifdef HAVE_VIENNA20 Vienna::alipf_fold((const char**)seqs, &p[0], &pi); #else Vienna::alipf_fold(seqs, &p[0], &pi); #endif bp.resize(L); for (uint k=0; pi[k].i!=0; ++k) if (pi[k].p>th_) bp[pi[k].i-1].push_back(std::make_pair(pi[k].j-1, pi[k].p)); free(pi); Vienna::free_alipf_arrays(); free_aln(seqs); Vienna::fold_constrained = bk; }
float Alifold:: energy_of_struct(const ALN& aln, const std::vector<Fasta>& fa, const std::string& str, float& cv) const { const uint L=aln.front().second.size(); const uint N=aln.size(); std::string res(L+1, ' '); char** seqs = alloc_aln(aln, fa); #ifdef HAVE_VIENNA20 float min_en = Vienna::energy_of_alistruct((const char **)seqs, str.c_str(), N, &cv); #else std::vector<float> cv_temp(2); float min_en = Vienna::energy_of_alistruct(seqs, str.c_str(), N, &cv_temp[0]); cv = cv_temp[1]; #endif free_aln(seqs); return min_en; }
/*
 * make_fasta_cdna_pair_wise
 *
 * Runs the diagonal-restricted DP on A and replays its state traceback to
 * rewrite two rows of B as a gapped pairwise alignment.
 *
 * Argument naming: the first parameter is called B and the second A. The
 * callers visible in this file pass (their A, their B), so inside this
 * function "B" is the alignment that receives the result and "A" is the
 * one handed to make_fast_dp_pair_wise(). in_ns is not read here.
 *
 * Steps, in source order:
 *   1. l0/l1 are the lengths of rows l_s[0][0] and l_s[1][0] of B. Two
 *      scratch rows of l0+l1+1 characters are declared, B is reallocated
 *      to that width, and B->cdna_cache is replaced by a fresh
 *      1 x (l0+l1+1) int array.
 *   2. The DP model M is created once (static) with
 *      initialize_dna_dp_model(CL); M->diag is set to diag on every call.
 *   3. make_fast_dp_pair_wise() is called on A with one sequence per side,
 *      rows 0 and 3.
 *   4. Traceback replay: states are skipped up to M->START, then read
 *      until M->END. For every state of type M->CODING0:
 *        - residues still pending before the codon (deltaf0/deltaf1) are
 *          emitted column by column and tagged M->NON_CODING;
 *        - three columns are then emitted and tagged CODING0, CODING1,
 *          CODING2.
 *      A column in which both rows are gaps is dropped again (p--).
 *      aa0/aa1 advance by the state's LEN_I/LEN_J for every state.
 *   5. After M->END the same flush of pending residues is done once more.
 *   6. The scratch rows are terminated and copied back into B; B->len_aln
 *      and B->nseq (=2) are updated.
 *   7. Debug section, disabled because debug_cdna_fasta is a local set
 *      to 0.
 *      NOTE(review): inside it DA is released with free_aln(DA) and then
 *      still read (DA->len_aln, DA->cdna_cache), `e` can be read without
 *      having been assigned, both branches of the first if/else write the
 *      same value, and the "F_GOP/F_GEP" line prints M->gop/M->gep.
 *   8. B->cdna_cache[0] is collapsed to a 0/1 mask: entries below
 *      M->CODING0 become 0, all others 1.
 *
 * Returns DPR->score. DPR itself is not released in this function.
 */
int make_fasta_cdna_pair_wise (Alignment *B,Alignment *A,int*in_ns, int **l_s,Constraint_list *CL, int *diag) { int a,c,p,k; Dp_Result *DPR; static Dp_Model *M; int l0, l1; int len_i, len_j; int f0=0, f1=0; int deltaf0, deltaf1, delta; int nr1, nr2; int ala, alb, aa0, aa1; int type; char **al; int **tl_s; int *tns; /*DEBUG*/ int debug_cdna_fasta=0; Alignment *DA; int score; int state,prev_state; int t, e; int a1, a2; l0=strlen ( B->seq_al[l_s[0][0]]); l1=strlen ( B->seq_al[l_s[1][0]]); al=declare_char (2, l0+l1+1); B=realloc_aln2 (B,B->nseq,l0+l1+1); free_int (B->cdna_cache, -1); B->cdna_cache=declare_int(1, l0+l1+1); if ( !M)M=initialize_dna_dp_model (CL); M->diag=diag; tl_s=declare_int (2, 2);tns=vcalloc(2, sizeof(int));tl_s[0][0]=0;tl_s[1][0]=3;tns[0]=tns[1]=1; DPR=make_fast_dp_pair_wise (A,tns, tl_s,CL,M); vfree(tns);free_int(tl_s, -1); /*new_trace_back*/ a=p=0; aa0=aa1=ala=alb=0; while ( (k=DPR->traceback[a++])!=M->START); while ( (k=DPR->traceback[a++])!=M->END) { f0=M->model_properties[k][M->F0]; f1=M->model_properties[k][M->F1]; len_i=M->model_properties[k][M->LEN_I]; len_j=M->model_properties[k][M->LEN_J]; type=M->model_properties[k][M->TYPE]; if (type==M->CODING0) { deltaf0=(aa0*3+f0)-ala; deltaf1=(aa1*3+f1)-alb; delta=MAX(deltaf0, deltaf1); for (nr1=0, nr2=0,c=0; c<delta; c++, nr1++, nr2++,p++) { if (nr1<deltaf0 && ala<l0)al[0][p]=B->seq_al[l_s[0][0]][ala++]; else al[0][p]='-'; if (nr2<deltaf1 && alb<l1)al[1][p]=B->seq_al[l_s[1][0]][alb++]; else al[1][p]='-'; B->cdna_cache[0][p]=M->NON_CODING; if ( is_gap(al[1][p]) && is_gap(al[0][p]))p--; else if ( debug_cdna_fasta)fprintf (stderr, "\nUM: %c %c", al[0][p], al[1][p]); } for ( c=0; c< 3; c++, p++) { if ( c==0)B->cdna_cache[0][p]=M->CODING0; else if ( c==1)B->cdna_cache[0][p]=M->CODING1; else if ( c==2)B->cdna_cache[0][p]=M->CODING2; if (ala<l0)al[0][p]=B->seq_al[l_s[0][0]][ala++]; else al[0][p]='-'; if (alb<l1)al[1][p]=B->seq_al[l_s[1][0]][alb++]; else al[1][p]='-'; if ( is_gap(al[1][p]) && 
is_gap(al[0][p]))p--; else if ( debug_cdna_fasta)fprintf (stderr, "\n%d: %c %c",k, al[0][p], al[1][p]); } } aa0+=len_i; aa1+=len_j; } deltaf0=(aa0*3+f0)-ala; deltaf1=(aa1*3+f1)-alb; delta=MAX(deltaf0, deltaf1); for (nr1=0, nr2=0,c=0; c<delta; c++, nr1++, nr2++,p++) { if (nr1<deltaf0 && ala<l0)al[0][p]=B->seq_al[l_s[0][0]][ala++]; else al[0][p]='-'; if (nr2<deltaf1 && alb<l1)al[1][p]=B->seq_al[l_s[1][0]][alb++]; else al[1][p]='-'; B->cdna_cache[0][p]=M->NON_CODING; if ( is_gap(al[1][p]) && is_gap(al[0][p]))p--; else if ( debug_cdna_fasta)fprintf (stderr, "\nUM: %c %c", al[0][p], al[1][p]); } /*End New traceback*/ al[0][p]='\0'; al[1][p]='\0'; sprintf( B->seq_al[l_s[0][0]], "%s", al[0]); sprintf( B->seq_al[l_s[1][0]], "%s", al[1]); B->len_aln=strlen (al[0]); B->nseq=2; if ( debug_cdna_fasta) { fprintf ( stderr, "\nA-A=%d, %d", CL->M['a'-'A']['a'-'A'], CL->M['a'-'A']['a'-'A'] *SCORE_K); for ( a=1; a<diag[0]; a++) { fprintf ( stderr, "\nchosen diag: %d", diag[a]); } fprintf ( stderr, "\n GOP=%d GEP=%d TG_MODE=%d", M->gop, M->gep, M->TG_MODE); fprintf ( stderr, "\nF_GOP=%d F_GEP=%d F_TG_MODE=%d", M->gop, M->gep, M->F_TG_MODE); DA=copy_aln (B, NULL); DA=realloc_aln2 (DA,6,(DA->len_aln+1)); for ( a=0; a<B->len_aln; a++) { fprintf ( stderr, "\n%d", DA->cdna_cache[0][a]); if (DA->cdna_cache[0][a]>=M->CODING0)DA->seq_al[DA->nseq][a]=DA->cdna_cache[0][a]-M->nstate+'0'; else DA->seq_al[DA->nseq][a]=DA->cdna_cache[0][a]-M->nstate+'0'; if (DA->cdna_cache[0][a]==M->CODING0) { DA->seq_al[DA->nseq+1][a]=translate_dna_codon (DA->seq_al[0]+a,'*'); DA->seq_al[DA->nseq+2][a]=translate_dna_codon (DA->seq_al[1]+a,'*'); } else { DA->seq_al[DA->nseq+1][a]='-'; DA->seq_al[DA->nseq+2][a]='-'; } } DA->nseq+=3; print_aln (DA); free_aln(DA); score=0; for (prev_state=M->START,a=0; a< DA->len_aln;) { state=DA->cdna_cache[0][a]; t=M->model[prev_state][state]; if ( DA->cdna_cache[0][a]==M->CODING0) { a1=translate_dna_codon (A->seq_al[0]+a,'x'); a2=translate_dna_codon (A->seq_al[1]+a,'x'); if ( 
a1!='x' && a2!='x') { e=CL->M[a1-'A'][a2-'A']*SCORE_K; } } else if ( DA->cdna_cache[0][a]>M->CODING0); else { e=M->model_properties[B->cdna_cache[0][a]][M->EMISSION]; } if ( e==UNDEFINED || t==UNDEFINED) fprintf ( stderr, "\nPROBLEM %d\n", a); fprintf ( stderr, "\n[%c..%c: %d(e)+%d(t)=%d]", A->seq_al[0][a], A->seq_al[1][a], e,t,e+t); score+=e+t; prev_state=state; if (B->cdna_cache[0][a]==M->NON_CODING)a++; else a+=3; } } for ( a=0; a<B->len_aln; a++) { if ( B->cdna_cache[0][a]<M->CODING0)B->cdna_cache[0][a]=0; else B->cdna_cache[0][a]=1; } free_char ( al, -1); return DPR->score; }
/*
 * cfasta_cdna_pair_wise
 *
 * Iterative variant of the diagonal-guided cDNA pairwise alignment.
 *
 * The diagonals are scored once. A first alignment is made with 10
 * diagonals (extract_N_diag mode 2). The alignment is then redone with
 * 10 more diagonals each round (mode 3), after ungapping both groups of
 * A, until two consecutive rounds give the same score or 100 diagonals
 * have been used.
 *
 * Note that the comparison value is reset to 0 when the loop is first
 * entered, so the score of the initial 10-diagonal alignment only
 * decides whether the loop is entered at all.
 *
 * Returns the score of the last alignment made inside the loop, or 0 when
 * the loop body never ran.
 */
int cfasta_cdna_pair_wise (Alignment *A,int*ns, int **l_s,Constraint_list *CL)
{
    /*TREATMENT OF THE TERMINAL GAP PENALTIES*/
    /*TG_MODE=0---> gop and gep*/
    /*TG_MODE=1---> ---     gep*/
    /*TG_MODE=2---> ---     ---*/

    static int aa_group_count;
    static char **aa_groups;

    int maximise = CL->maximise;
    int ktup = CL->ktup;
    int diag_step = 10;
    int diag_cap = 100;
    int n_picked;
    int previous_score, current_score;
    int len_first, len_second;
    int **diag_scores;
    int *picked_diags;
    Alignment *frames;

    if (aa_groups == NULL)
    {
        aa_groups = make_group_aa (&aa_group_count, CL->matrix_for_aa_group);
    }

    frames = dna_aln2_3frame_cdna_aln (A, ns, l_s);
    len_first = strlen (frames->seq_al[0]);
    len_second = strlen (frames->seq_al[3]);

    diag_scores = evaluate_diagonals_cdna (frames, ns, l_s, CL, maximise, aa_group_count, aa_groups, ktup);

    /* First pass. */
    n_picked = diag_step;
    n_picked = MIN(n_picked, diag_cap);
    picked_diags = extract_N_diag (len_first, len_second, diag_scores, n_picked, 2);
    previous_score = make_fasta_cdna_pair_wise (A, frames, ns, l_s, CL, picked_diags);
    vfree (picked_diags);

    /* Refinement passes with a growing number of diagonals. */
    current_score = 0;
    while (current_score != previous_score && n_picked < diag_cap)
    {
        previous_score = current_score;

        ungap_sub_aln (A, ns[0], l_s[0]);
        ungap_sub_aln (A, ns[1], l_s[1]);

        n_picked += diag_step;
        n_picked = MIN(n_picked, diag_cap);

        picked_diags = extract_N_diag (len_first, len_second, diag_scores, n_picked, 3);
        current_score = make_fasta_cdna_pair_wise (A, frames, ns, l_s, CL, picked_diags);
        vfree (picked_diags);
    }

    free_int (diag_scores, -1);
    free_aln (frames);

    return current_score;
}
/*
 * detect_and_read_sequences
 *
 * Loads every input named in param->infile, detects its format, counts
 * its sequences and reads them into `aln`.
 *
 * Phases, in source order:
 *   1. Count the inputs. Slot 0 is always counted; further slots are
 *      counted while param->infile[i] is non-NULL. A NULL file name is
 *      reported as "reading from STDIN".
 *   2. Allocate three per-input arrays (text, type code, sequence count)
 *      with malloc; the results are not checked.
 *   3. For each input, starting at index 1 when param->quiet is set and at
 *      0 otherwise: load the text with get_input_into_string(), look for a
 *      format marker with byg_start() and count the sequences.
 *      Type codes: 0 fasta (fallback), 1 macsim, 2 uniprot xml,
 *      3 swissprot, 4 clustal / PileUp / MSF, 5 stockholm.
 *      An input with fewer than one sequence is freed and ignored.
 *      An input that could not be loaded is, when no output file is set
 *      and it is not the first slot, taken as the output file name, and
 *      param->format is guessed from that name if unset.
 *   4. Fewer than two sequences overall: print usage and a warning, free
 *      everything and exit(0).
 *   5. A profile alignment with fewer than two readable inputs falls back
 *      to alignment_type "default". A feature alignment without any
 *      macsim input frees everything and raises via throwKalignException.
 *   6. numprofiles in the kalign context is set to 2*numseq-1 and
 *      aln_alloc() is called.
 *   7. Profile mode: read each input as an alignment, record the size of
 *      each profile in aln->nsip[numseq+k], assign consecutive sequence
 *      indices to aln->sip[numseq+k][], and check that the members of a
 *      profile share the same length. Otherwise each input is read with
 *      the read_sequences_* function matching its type code.
 *   8. Guess param->format from param->outfile when unset, free the three
 *      arrays and return aln.
 *
 * NOTE(review): input[i] is set to 0 after reading without being freed
 *   here; presumably the read_* functions take ownership. Confirm.
 * NOTE(review): in the profile sanity check the inner loop is written
 *   `for (c = j+1; j < ...; j++)`, so it tests and advances j while c
 *   stays fixed. It looks like c was meant. Confirm before relying on
 *   this check.
 * NOTE(review): in the "Unaligned sequences" error path `i` is reused by
 *   the clean-up loop and free_param(param) runs before
 *   param->infile[i] is read for the message.
 * NOTE(review): the final "Output file" message prints param->format
 *   even when none of the name patterns matched, so it may still be
 *   unset at that point.
 */
struct alignment* detect_and_read_sequences(struct alignment* aln,struct parameters* param) { int feature = 0; char **input = 0; unsigned short int* input_type = 0; unsigned short int* input_numseq = 0; int num_input = 0; int i = 0; int j = 0; int c = 0; int a,b; int free_read = 1; unsigned int numseq = get_kalign_context()->numseq; while(free_read == 1 || param->infile[i]){ num_input++; i++; free_read = 0; } numseq = 0; input = malloc(sizeof(char*) * num_input); input_type = malloc(sizeof(unsigned short int) * num_input); input_numseq = malloc(sizeof(unsigned short int) * num_input); for (i = 0; i < num_input;i++){ input[i] = 0; input_type[i] = 0; input_numseq[i] = 0; } free_read = 0; if(param->quiet){ c = 1; }else{ c = 0; } for (i = c; i < num_input;i++){ if(!param->infile[i]){ k_printf("reading from STDIN: "); }else{ k_printf("reading from %s: ",param->infile[i]); } input[i] = get_input_into_string(input[i],param->infile[i]); if(input[i]){ free_read++; if (byg_start("<macsim>",input[i]) != -1){ input_numseq[i] = count_sequences_macsim(input[i]); feature = 1; input_type[i] = 1; }else if (byg_start("<uniprot",input[i]) != -1){ input_numseq[i] = count_sequences_uniprot(input[i]); input_type[i] = 2; }else if(byg_start("This SWISS-PROT",input[i]) != -1){ input_numseq[i] = count_sequences_swissprot(input[i]); input_type[i] = 3; }else if (byg_start("This Swiss-Prot",input[i]) != -1){ input_numseq[i] = count_sequences_swissprot(input[i]); input_type[i] = 3; }else if (byg_start("CLUSTAL W",input[i]) != -1){ input_numseq[i] = count_sequences_clustalw(input[i]); input_type[i] = 4; }else if (byg_start("PileUp",input[i]) != -1){ input_numseq[i] = count_sequences_clustalw(input[i]); input_type[i] = 4; }else if (byg_start("MSF:",input[i]) != -1){ input_numseq[i] = count_sequences_clustalw(input[i]); input_type[i] = 4; }else if (byg_start("STOCKHOLM",input[i]) != -1){ input_numseq[i] = count_sequences_stockholm(input[i]); input_type[i] = 5; }else{ input_numseq[i] = 
count_sequences_fasta(input[i]); input_type[i] = 0; } k_printf("found %d sequences\n",input_numseq[i]); if(input_numseq[i] < 1){ free(input[i]); input[i] = 0; }else{ numseq += input_numseq[i]; } }else{ k_printf("found no sequences.\n"); if(!param->outfile && i){ param->outfile = param->infile[i]; k_printf("-> output file, in "); //try to set format.... if(!param->format){ if (byg_start("msf",param->outfile) != -1){ param->format = "msf"; }else if (byg_start("clustal",param->outfile) != -1){ param->format = "clustal"; }else if (byg_start("aln",param->outfile) != -1){ param->format = "clustal"; }else if (byg_start("macsim",param->outfile) != -1){ param->format = "macsim"; }else{ param->format = "fasta"; } if(param->reformat){ k_printf("unaligned fasta format\n"); }else if(param->format){ k_printf("%s format\n",param->format); }else{ k_printf("fasta format\n"); } } } k_printf("\n"); } } if(numseq < 2){ k_printf("%s\n", usage); if(!numseq){ k_printf("\nWARNING: No sequences found.\n\n"); }else{ k_printf("\nWARNING: Only one sequence found.\n\n"); } for (i = 0; i < num_input;i++){ free(input[i]); } free(input_numseq); free(input_type); free(input); free_param(param); exit(0); } if(byg_start(param->alignment_type,"profPROFprofilePROFILE") != -1){ if( free_read < 2){ k_printf("\nWARNING: You are trying to perform a profile - profile alignment but ony one input file was detected.\n\n"); param->alignment_type = "default"; } } if (param->feature_type && !feature){ for (i = 0; i < num_input;i++){ free(input[i]); } free(input_numseq); free(input_type); free(input); free_param(param); throwKalignException(k_printf("\nWARNING: You are trying to perform a feature alignment but the input format(s) do not contain feature information.\n")); } get_kalign_context()->numprofiles = (numseq << 1) - 1; aln = aln_alloc(aln); //numseq = 0; if(byg_start(param->alignment_type,"profPROFprofilePROFILE") != -1){ j = 0; for (i = 0; i < num_input;i++){ if(input[i]){ switch(input_type[i]){ case 0: 
aln = read_alignment(aln,input[i]); break; case 1: aln = read_alignment_macsim_xml(aln,input[i]); break; case 2: aln = read_alignment_uniprot_xml(aln,input[i]); break; case 3: aln = read_alignment_from_swissprot(aln, input[i]); break; case 4: aln = read_alignment_clustal(aln,input[i]); break; case 5: aln = read_alignment_stockholm(aln,input[i]); break; default: aln = read_alignment(aln,input[i]); break; } input[i] = 0; //create partial profile.... aln->nsip[numseq+j] = input_numseq[i]; aln->sip[numseq+j] = malloc(sizeof(int)*aln->nsip[numseq+j]); //k_printf("%d %d\n",numseq+j,aln->sl[numseq+j]); j++; } } num_input = j; c = 0; for (i = 0;i < num_input;i++){ // for ( j = 0; j < aln->nsip[numseq+i];j++){ aln->sip[numseq+i][j] = c; c++; // k_printf("%d ",aln->sip[numseq+i][j]); } aln->sl[numseq+i] = aln->sl[aln->sip[numseq+i][0]]; // k_printf("PROFILE:%d contains: %d long:%d\n",i+numseq,aln->nsip[numseq+i],aln->sl[numseq+i]); // k_printf("\n"); } //sanity check -are all input for (i = 0;i < num_input;i++){ for ( j = 0; j < aln->nsip[numseq+i]-1;j++){ a = aln->sip[numseq+i][j]; a = aln->sl[a]; for (c = j+1; j < aln->nsip[numseq+i];j++){ b = aln->sip[numseq+i][c]; b = aln->sl[b]; if(a != b){ for (i = 0; i < num_input;i++){ free(input[i]); } free(input_numseq); free(input_type); free(input); free_aln(aln); free_param(param); throwKalignException(k_printf("Unaligned sequences in input %s.\n",param->infile[i])); } } } } //exit(0); /*for (i = 0; i < numseq;i++){ k_printf("len%d:%d\n",i,aln->sl[i]); for ( j =0 ; j < aln->sl[i];j++){ //if(aln->s[i][j]> 23 || aln->s[i][j] < 0){ // aln->s[i][j] = -1; //} k_printf("%d ",aln->s[i][j]); } // k_printf("\n"); } exit(0);*/ }else{ for (i = 0; i < num_input;i++){ if(input[i]){ switch(input_type[i]){ case 0: aln = read_sequences(aln,input[i]); break; case 1: aln = read_sequences_macsim_xml(aln,input[i]); break; case 2: aln = read_sequences_uniprot_xml(aln,input[i]); break; case 3: aln = read_sequences_from_swissprot(aln, input[i]); 
break; case 4: aln = read_sequences_clustal(aln,input[i]); break; case 5: aln = read_sequences_stockholm(aln,input[i]); break; default: aln = read_sequences(aln,input[i]); break; } /*if (byg_start("<macsim>",input[i]) != -1){ aln = read_sequences_macsim_xml(aln,input[i]); }else if (byg_start("<uniprot",input[i]) != -1){ aln = read_sequences_uniprot_xml(aln,input[i]); }else if(byg_start("This SWISS-PROT entry is copyright.",input[i]) != -1){ aln = read_sequences_from_swissprot(aln, input[i]); }else if (byg_start("This Swiss-Prot entry is copyright.",input[i]) != -1){ aln = read_sequences_from_swissprot(aln, input[i]); }else if (byg_start("CLUSTAL W",input[i]) != -1){ aln = read_sequences_clustal(aln,input[i]); }else if (byg_start("PileUp",input[i]) != -1){ aln = read_sequences_clustal(aln,input[i]); }else if (byg_start("MSF:",input[i]) != -1){ aln = read_sequences_clustal(aln,input[i]); }else if (byg_start("STOCKHOLM",input[i]) != -1){ aln = read_sequences_stockholm(aln,input[i]); }else{ aln = read_sequences(aln,input[i]); }*/ input[i] = 0; } } } if(numseq < 2){ free_param(param); throwKalignException(k_printf("\nNo sequences could be read.\n")); } if(!param->format && param->outfile){ if (byg_start("msf",param->outfile) != -1){ param->format = "msf"; }else if (byg_start("clustal",param->outfile) != -1){ param->format = "clustal"; }else if (byg_start("aln",param->outfile) != -1){ param->format = "clustal"; }else if (byg_start("macsim",param->outfile) != -1){ param->format = "macsim"; } k_printf("Output file: %s, in %s format.\n",param->outfile,param->format); } free(input); free(input_type); free(input_numseq); return aln; }
/*
 * measure_domain_length
 *
 * Scans candidate domain lengths b = min_len, min_len+step, ... (while
 * b < max_len and start+b stays inside the first sequence of CL->S),
 * scores each candidate with (CL->moca)->evaluate_domain and returns the
 * retained length.
 *
 * Candidates come from extract_domain_with_coordinates() when IN is NULL,
 * otherwise from a copy of IN truncated at column b and completed with
 * add_seq2aln().
 *
 * Selection: the best-scoring length is remembered; it is then extended
 * over the following candidates for as long as their score stays above
 * half of the best score. The scan stops early when a score drops below
 * -3000. One progress line per candidate is written to stderr.
 *
 * Returns 0 when no candidate improved on the score obtained for min_len,
 * or when the window of length min_len cannot be extracted at all.
 *
 * Fixes compared with the previous version:
 *   - extract_domain_with_coordinates() returns NULL for an illegal
 *     window; that result used to be dereferenced. All of its rejection
 *     conditions keep holding when the window grows to the right, so the
 *     scan now simply stops at the first rejected length.
 *   - each candidate obtained from extract_domain_with_coordinates() is
 *     released before the next one is requested (that function allocates
 *     a new alignment on every call and does not read its first argument).
 *   - a step smaller than 1 is raised to 1; it used to make the scan loop
 *     forever.
 *
 * NOTE(review): in the IN branch of the scan, C->len_aln is set to
 * min_len while the rows are cut at b. Left unchanged; confirm whether b
 * was intended.
 */
int measure_domain_length ( Constraint_list *CL,Alignment *IN, int start, int min_len, int max_len, int step)
{
    Alignment *C=NULL;
    int score, best_score,best_len,a, b, l;
    int *score_matrix, *len_matrix;
    int n_val, best_val;

    if ( step<1)step=1;

    score_matrix=(int*)vcalloc ( max_len, sizeof (int));
    len_matrix=(int*)vcalloc ( max_len, sizeof (int));

    l=strlen ( (CL->S)->seq[0]);

    min_len=MAX(0, min_len);
    min_len=MIN(l-start, min_len);

    /* Reference score: the shortest candidate. */
    if ( !IN)C=extract_domain_with_coordinates (NULL,start,min_len, CL);
    else
    {
        C=copy_aln (IN, C);
        C->len_aln=min_len;
        for ( a=0; a< C->nseq; a++)C->seq_al[a][min_len]='\0';
        C=add_seq2aln (CL,C, CL->S);
    }

    if ( C==NULL)
    {
        /* Even the shortest window is illegal: nothing to measure. */
        vfree ( score_matrix);
        vfree ( len_matrix);
        return 0;
    }

    best_score= score=((CL->moca)->evaluate_domain)(C, CL);

    min_len=MAX(0, min_len);
    for ( best_len=best_val=n_val=0,b=min_len; b<max_len && (start+b)<l; b+=step, n_val++)
    {
        if ( !IN)
        {
            /* Release the previous candidate before asking for a new one. */
            free_aln (C);
            C=extract_domain_with_coordinates (NULL,start, b, CL);
            if ( C==NULL)break;
        }
        else
        {
            C=copy_aln (IN, C);
            C->len_aln=min_len;
            for ( a=0; a< C->nseq; a++)C->seq_al[a][b]='\0';
            C=add_seq2aln (CL,C, CL->S);
        }

        if ( C->len_aln>0 )score=((CL->moca)->evaluate_domain)(C, CL);
        else score=-1;

        if ( score< -3000)break;

        fprintf ( stderr, "\n\t%d %d=>%d (%d, %d)[%d]",start, b, score, C->nseq, C->len_aln, step);
        score_matrix[n_val]=score;
        len_matrix [n_val]=b;
        if ( score>best_score)
        {
            best_score=score;
            best_len=b;
            best_val=n_val;
        }
    }

    if ( C)free_aln(C);

    /* Extend the best length while the score stays above half the best. */
    for ( a=best_val; a<n_val; a++)
    {
        if (score_matrix[a]>best_score/2)best_len=len_matrix[a];
        else break;
    }

    vfree ( score_matrix);
    vfree ( len_matrix);
    return best_len;
}
/*
 * interactive_domain_extraction
 *
 * Command loop for interactive domain extraction. Each command line read
 * from stdin is dispatched on its first character:
 *
 *   Q X q x   leave the loop
 *   b<n>      go back n+1 iterations
 *   ><n>      set the domain length
 *   |<n>      set the start position; |<name>:<n> sets it relative to
 *             the named sequence of CL->S
 *   C<n>      set the scale
 *   G<n>      set the gap opening penalty (also written to CL->gop)
 *   F<fmt>    set the output format
 *   S<file>   save the current result (default.domain_aln.<iteration>
 *             when no file name is given)
 *   <empty>   extract the domain with the current parameters and print it
 *
 * Every iteration keeps its own (START, LEN, SCALE, GOPP) row in
 * parameters[][], which is what makes `b` possible.
 *
 * Returns the last extracted alignment with output_res_num reset to 0.
 * When nothing was extracted the program is terminated through
 * myexit(EXIT_SUCCESS).
 *
 * Fixes compared with the previous version:
 *   - the status line and the prompt were part of the non-`b` branch
 *     only, so a `b` command was re-executed forever without ever
 *     reading new input; the prompt now runs after every command.
 *   - the command line is read with a length limit (the buffer holds 100
 *     bytes) and end-of-file on stdin ends the session instead of
 *     filling memory.
 *   - the default save name was written over the command letter and then
 *     used from its second character, which dropped the leading 'd'.
 *   - the command buffer and the parameter table are released on return.
 *
 * NOTE(review): `S` before any successful extraction passes a NULL RESULT
 * to the output functions; iteration is not checked against the 10000
 * rows of parameters[][]; an unknown sequence name `continue`s without
 * prompting, so the truncated command is parsed again. All three are
 * unchanged here.
 */
Alignment * interactive_domain_extraction ( Constraint_list *CL)
{
    int LEN=0;
    int START=1;
    int SCALE=2;
    int GOPP=3;
    int choice_size=100;

    int iteration=0;
    char *choice;
    int a,b, c;
    int index;
    char *s;
    char last_start[100];
    char out_format[100];
    Alignment *RESULT=NULL;
    Alignment *PREVIOUS=NULL;
    Alignment *C=NULL;
    Alignment *EA=NULL;

    int **parameters;

    choice=(char*)vcalloc ( choice_size, sizeof (char));
    parameters=declare_int (10000, 4);

    parameters[0][START]=(CL->moca)->moca_start;
    parameters[0][LEN]=  (CL->moca)->moca_len;
    parameters[0][SCALE]=(CL->moca)->moca_scale;
    parameters[0][GOPP]=CL->gop;
    iteration=0;
    sprintf ( last_start, "%d", (CL->moca)->moca_start);
    sprintf ( out_format, "mocca_aln");

    print_moca_interactive_choices ();
    while ( !strm4 (choice, "Q","X", "q", "x" ))
    {
        c=choice[0];

        if (c=='b' || c=='B')
        {
            iteration-=atoi(choice+1)+1;
            if (iteration<0)iteration=1;
        }
        else
        {
            /* Every other command starts a new iteration that inherits
               the parameters of the previous one. */
            iteration++;
            parameters[iteration][START]=parameters[iteration-1][START];
            parameters[iteration][LEN]=parameters[iteration-1][LEN];
            parameters[iteration][SCALE]=parameters[iteration-1][SCALE];
            parameters[iteration][GOPP]=parameters[iteration-1][GOPP];

            if ( c=='>')parameters[iteration][LEN]=atoi(choice+1);
            else if ( c=='|')
            {
                sprintf ( last_start, "%s", choice);
                parameters[iteration][START]=0;
                s=strrchr(choice, ':');

                if (s==NULL)
                {
                    parameters[iteration][START]=atoi(choice+1);
                }
                else
                {
                    s[0]='\0';
                    if((index=name_is_in_list (choice+1,(CL->S)->name,(CL->S)->nseq,100))==-1)
                    {
                        fprintf ( stderr, "\n\tERROR: %s NOT in Sequence Set",choice+1);
                        continue;
                    }

                    /* Offset of the named sequence: lengths of the
                       preceding sequences plus one per sequence. */
                    for ( a=0; a< index; a++)
                    {
                        parameters[iteration][START]+=(CL->S)->len[a]+1;
                    }
                    parameters[iteration][START]+=atoi(s+1)-1;
                }
            }
            else if ( c=='C'||c=='c')parameters[iteration][SCALE]=atoi(choice+1);
            else if ( c=='G'||c=='g')
            {
                parameters[iteration][GOPP]=atoi(choice+1);
                CL->gop=parameters[iteration][GOPP];
            }
            else if ( c=='F'||c=='f')
            {
                sprintf ( out_format, "%s", choice+1);
            }
            else if ( c=='S'||c=='s')
            {
                /* The file name is read from choice+1, so the default
                   name has to be written there as well. */
                if (choice[1]=='\0')sprintf ( choice+1, "default.domain_aln.%d", iteration);
                output_format_aln (out_format,RESULT,EA=fast_coffee_evaluate_output(RESULT, CL),choice+1);
                fprintf (stderr, "\tOutput file [%15s] in [%10s] format\n",choice+1,out_format);
                free_aln (EA);
            }
            else if (c=='\0')
            {
                if ( parameters[iteration][SCALE]>0)
                {
                    fprintf ( stderr, "\nWARNING: THRESHOLD RESET to 0");
                    parameters[iteration][SCALE]=0;
                }

                (CL->moca)->moca_scale=parameters[iteration][SCALE];
                CL->gop=parameters[iteration][GOPP];

                C=extract_domain_with_coordinates (C,parameters[iteration][START],parameters[iteration][LEN],CL);

                if ( C==NULL)
                {
                    fprintf ( stderr, "\nERROR: ILLEGAL COORDINATES! SEQUENCE BOUNDARY CROSSED\n");
                    for ( b=1,a=0; a< (CL->S)->nseq-1; a++)
                    {
                        fprintf ( stderr, "\n\t%15s=> Abs:[%d %d] Rel:[0 %d]", (CL->S)->name[a],b, b+(CL->S)->len[a]-1,(CL->S)->len[a]);
                        b+=(CL->S)->len[a];
                    }
                    fprintf ( stderr, "\n");
                }
                else if (parameters[iteration][START]==0 && parameters[iteration][LEN]==0)
                {
                    fprintf ( stderr, "\n\tEnter the following parameters:\n\n\t\tSTART value: |x [Return]\n\t\tLENgth value: >y [Return]\n\t\ttype [Return]\n\n");
                    fprintf ( stderr, "\n\n\tSTART is measured on the total length of the concatenated sequences\n\tx and y are positive integers\n\n");
                }
                else if ( C->nseq==0)
                {
                    fprintf ( stderr, "\nNO MATCH FOUND: LOWER THE SCALE (C)\n");
                }
                else
                {
                    RESULT=copy_aln ( C, RESULT);
                    unpack_seq_aln (RESULT, CL);
                    RESULT->output_res_num=1;

                    output_format_aln (out_format,RESULT,EA=fast_coffee_evaluate_output(RESULT, CL),"stdout");
                    free_aln(EA);
                    PREVIOUS=copy_aln ( RESULT, PREVIOUS);
                    free_aln (C);
                    print_moca_interactive_choices ();
                }
            }
        }

        /* Status line and prompt: shown after every command, including
           `b`, so that the next command is always read from stdin. */
        fprintf ( stderr, "\t[ITERATION %3d][START=%s][LEN=%3d][GOPP=%3d][SCALE=%4d]\t",iteration,last_start,parameters[iteration][LEN],parameters[iteration][GOPP],parameters[iteration][SCALE]);
        a=0;
        fprintf ( stderr, "Your Choice: ");
        while ( (c=fgetc(stdin))!='\n' && c!=EOF)
        {
            /* Characters beyond the buffer size are discarded. */
            if ( a<choice_size-1)choice[a++]=c;
        }
        choice[a]=0;

        /* End of input with nothing typed: behave as if the user quit. */
        if ( c==EOF && a==0)sprintf ( choice, "Q");
    }

    vfree ( choice);
    free_int ( parameters, -1);

    if (!RESULT)myexit(EXIT_SUCCESS);
    if ( RESULT)RESULT->output_res_num=0;
    return RESULT;
}
/*
 * extract_domain
 *
 * Entry point of the domain extraction. Three cases are handled, all of
 * which leave the function:
 *
 *   - (CL->moca)->moca_interactive set: delegate to
 *     interactive_domain_extraction() and return its result.
 *   - (CL->moca)->moca_len non-zero: extract the domain at
 *     (moca_start, moca_len). While the extraction yields no sequence,
 *     moca_scale is multiplied by 0.9 and the extraction is repeated.
 *     The result is unpacked, printed to stdout in "mocca_aln" format
 *     and returned.
 *   - otherwise: analyse_sequence(CL) is run and the program is ended
 *     with myexit(EXIT_FAILURE).
 *
 * The statements after these three cases ("CASE 2") are only reached if
 * myexit() returns; they are kept as they were.
 *
 * Notes taken over from the original in-function documentation:
 *   the local alignment is extracted with the dp function indicated by
 *   CL->dp_mode (gotoh_sw_pair_wise);
 *   Evaluation:   CL->get_dp_cost=slow_get_dp_cost;
 *                 CL->evaluate_residue_pair=sw_residue_pair_extended_list;
 *   Continuation: (CL->moca)->evaluate_domain=evaluate_moca_domain;
 *   Cache of CL:  (CL->moca)->cache_cl_with_domain=cache_cl_with_moca_domain;
 *   Domain post processing: (CL->moca)->make_nol_aln=make_moca_nol_aln;
 * (The original text said the function "returns a CL"; it returns an
 * Alignment.)
 *
 * Fixes compared with the previous version, fixed-coordinates case:
 *   - extract_domain_with_coordinates() returns NULL for illegal
 *     coordinates; that result used to be dereferenced. It is now
 *     reported with the same message the interactive mode uses, and the
 *     program stops.
 *   - rejected candidates and the final candidate (once copied into
 *     RESULT) are released instead of being leaked.
 *
 * NOTE(review): if moca_scale reaches 0 while the extraction still finds
 * no sequence, the retry loop does not terminate. Unchanged here.
 */
Alignment * extract_domain ( Constraint_list *CL)
{
    int min_start, max_start, start,min_len, max_len, len, score;
    int step;
    Alignment *C=NULL;
    Alignment *RESULT=NULL;
    Alignment *EA=NULL;

    /*CASE 1: Non Automatic Domain Extraction*/
    if ((CL->moca)->moca_interactive)
    {
        return interactive_domain_extraction (CL);
    }
    else if ((CL->moca)->moca_len)
    {
        for (;;)
        {
            C=extract_domain_with_coordinates (NULL,(CL->moca)->moca_start,(CL->moca)->moca_len,CL);
            if ( C==NULL)
            {
                /* Lowering the scale cannot repair illegal coordinates. */
                fprintf ( stderr, "\nERROR: ILLEGAL COORDINATES! SEQUENCE BOUNDARY CROSSED\n");
                myexit (EXIT_FAILURE);
                return NULL;
            }
            if ( C->nseq!=0)break;

            free_aln (C);
            (CL->moca)->moca_scale=(CL->moca)->moca_scale*0.9;
        }

        RESULT=copy_aln ( C, RESULT);
        unpack_seq_aln (RESULT, CL);
        output_format_aln ("mocca_aln",RESULT,EA=fast_coffee_evaluate_output(RESULT, CL),"stdout");
        free_aln(EA);
        free_aln (C);
        return RESULT;
    }
    else if ( !(CL->moca)->moca_len)
    {
        analyse_sequence (CL);
        myexit (EXIT_FAILURE);
    }

    /*CASE 2: Automatic Domain Extraction: Find Coordinates*/
    start=500;
    step=10;

    min_start=0;
    max_start=strlen ((CL->S)->seq[0]);

    min_len=20;
    max_len=strlen ((CL->S)->seq[0]);

    C=extract_domain_with_coordinates (C,13,30,CL);
    C->output_res_num=1;
    print_aln (C);

    (CL->moca)->moca_scale=-180;
    C=add_seq2aln (CL,C, CL->S);
    print_aln (C);

    (CL->moca)->moca_scale=-160;
    C=add_seq2aln (CL,C, CL->S);
    print_aln (C);

    myexit (EXIT_FAILURE);

    while ( step>0)
    {
        C=approximate_domain (min_start,max_start,step,min_len,max_len, step,&start, &len, &score, CL);
        min_start=start-step;
        max_start=start+step;
        min_len=len-step;
        max_len=len+step;
        step=step/2;
    }

    C=extract_domain_with_coordinates (C,start-10, len+20,CL);
    C->output_res_num=1;
    print_aln (C);

    myexit (EXIT_FAILURE);
    return C;
}
/*
 * copy_aln
 *
 * Copies alignment A into B and returns B.
 *
 *   - A == NULL: B is released with free_aln() and NULL is returned.
 *   - B == NULL: a new alignment is declared; otherwise B is reallocated.
 *     Either way it is sized for MAX(A->nseq, A->max_n_seq) rows of
 *     A->len_aln+1 columns.
 *
 * Duplicated into B: the generic comment, the len[] and score_seq[]
 * arrays (first nseq entries), the per-sequence string tables
 * (seq_comment, aln_comment, name, file, tree_order), the aligned rows,
 * order[], and, when present in A, seq_cache and cdna_cache. The profile
 * P is duplicated with copy_profile() and a nested alignment A->A is
 * copied recursively.
 *
 * Shared with A (pointer or value assignment only): S, expanded_order,
 * Dp_result, CL, and all scalar size/score/output fields.
 */
Alignment* copy_aln ( Alignment *A, Alignment *B)
{
    int row, col;
    int n_rows;
    int n_cols;

    if (A == NULL)
    {
        free_aln (B);
        return NULL;
    }

    n_rows = MAX(A->nseq, A->max_n_seq);
    n_cols = A->len_aln + 1;

    if (B)
    {
        B = realloc_alignment2 (B, n_rows, n_cols);
    }
    else
    {
        B = declare_aln2 (n_rows, n_cols);
    }
    B->S = A->S;

    /*SIZES*/
    B->max_len = A->max_len;
    B->min_len = A->min_len;
    B->declared_len = n_cols;
    B->max_n_seq = n_rows;
    B->nseq = A->nseq;
    B->len_aln = A->len_aln;

    /*sequence Information*/
    if (A->generic_comment)
    {
        vfree (B->generic_comment);
        B->generic_comment = vcalloc (strlen (A->generic_comment) + 1, sizeof (char));
        sprintf ( B->generic_comment, "%s", A->generic_comment);
    }

    if ((A->S) == NULL)
    {
        vfree (B->len);
        B->len = vcalloc (A->max_n_seq, sizeof (int));
    }
    ga_memcpy_int (A->len, B->len, B->nseq);

    B->seq_comment = copy_char (A->seq_comment, B->seq_comment, -1, -1);
    B->aln_comment = copy_char (A->aln_comment, B->aln_comment, -1, -1);
    B->name = copy_char (A->name, B->name, -1, -1);
    B->file = copy_char (A->file, B->file, -1, -1);
    B->tree_order = copy_char (A->tree_order, B->tree_order, -1, -1);
    B->expanded_order = A->expanded_order;

    /* The aligned rows are rebuilt from scratch and filled by hand. */
    free_char (B->seq_al, -1);
    B->seq_al = declare_char (B->max_n_seq, B->declared_len);
    for (row = 0; row < n_rows; row++)
    {
        if (!A->seq_al[row]) continue;
        for (col = 0; col < A->len_aln; col++)
        {
            B->seq_al[row][col] = A->seq_al[row][col];
        }
    }

    B->order = copy_int (A->order, B->order, -1, -1);
    B->S = A->S;

    if (A->seq_cache)
    {
        B->seq_cache = copy_int (A->seq_cache, B->seq_cache, -1, -1);
    }
    if (A->cdna_cache)
    {
        B->cdna_cache = copy_int (A->cdna_cache, B->cdna_cache, -1, -1);
    }

    B->P = copy_profile (A->P);
    B->Dp_result = A->Dp_result;

    /*Score*/
    if ((A->S) == NULL)
    {
        vfree (B->score_seq);
        B->score_seq = vcalloc (A->max_n_seq, sizeof (int));
    }
    ga_memcpy_int (A->score_seq, B->score_seq, B->nseq);

    B->score_res = A->score_res;
    B->score_aln = A->score_aln;
    B->score = A->score;
    B->ibit = A->ibit;
    B->cpu = A->cpu;
    B->finished = A->finished;

    /*Output Options*/
    B->output_res_num = A->output_res_num;
    B->residue_case = A->residue_case;
    B->expand = A->expand;

    B->CL = A->CL;
    B->random_tag = A->random_tag;

    /*Make the function Recursive */
    if (A->A)
    {
        B->A = copy_aln (A->A, NULL);
    }
    else
    {
        B->A = NULL;
    }

    return B;
}
void KalignAdapter::alignUnsafe(const MultipleSequenceAlignment& ma, MultipleSequenceAlignment& res, TaskStateInfo& ti) { ti.progress = 0; int* tree = 0; quint32 a, b, c; struct alignment* aln = 0; struct parameters* param = 0; struct aln_tree_node* tree2 = 0; param = (parameters*)malloc(sizeof(struct parameters)); param = interface(param,0,0); kalign_context *ctx = get_kalign_context(); unsigned int &numseq = ctx->numseq; unsigned int &numprofiles = ctx->numprofiles; if (ma->getNumRows() < 2){ if (!numseq){ k_printf("No sequences found.\n\n"); } else { k_printf("Only one sequence found.\n\n"); } free_param(param); throw KalignException("Can't align less then 2 sequences"); } if(ctx->gpo != -1) { param->gpo = ctx->gpo; } if(ctx->gpe != -1) { param->gpe = ctx->gpe; } if(ctx->tgpe != -1) { param->tgpe = ctx->tgpe; } if(ctx->secret != -1) { param->secret = ctx->secret; } /************************************************************************/ /* Convert MA to aln */ /************************************************************************/ k_printf("Prepare data"); numseq = ma->getNumRows(); numprofiles = (numseq << 1) - 1; aln = aln_alloc(aln); for(quint32 i = 0 ; i < numseq; i++) { const MultipleSequenceAlignmentRow row= ma->getMsaRow(i); aln->sl[i] = row->getUngappedLength(); aln->lsn[i] = row->getName().length(); } for (quint32 i = 0; i < numseq;i++) { try { aln->s[i] = (int*) malloc(sizeof(int)*(aln->sl[i]+1)); checkAllocatedMemory(aln->s[i]); aln->seq[i] = (char*)malloc(sizeof(char)*(aln->sl[i]+1)); checkAllocatedMemory(aln->seq[i]); aln->sn[i] = (char*)malloc(sizeof(char)*(aln->lsn[i]+1)); checkAllocatedMemory(aln->sn[i]); } catch (...) 
{ cleanupMemory(NULL, numseq, NULL, aln, param); throw; } } int aacode[26] = {0,1,2,3,4,5,6,7,8,-1,9,10,11,12,23,13,14,15,16,17,17,18,19,20,21,22}; for(quint32 i = 0; i < numseq; i++) { const MultipleSequenceAlignmentRow row= ma->getMsaRow(i); qstrncpy(aln->sn[i], row->getName().toLatin1(), row->getName().length() + 1); //+1 to include '\0' QString gapless = QString(row->getCore()).remove('-'); qstrncpy(aln->seq[i], gapless.toLatin1(), gapless.length() + 1); //+1 to include '\0' for (quint32 j = 0; j < aln->sl[i]; j++) { if (isalpha((int)aln->seq[i][j])){ aln->s[i][j] = aacode[toupper(aln->seq[i][j])-65]; } else { aln->s[i][j] = -1; } } aln->s[i][aln->sl[i]] = 0; aln->seq[i][aln->sl[i]] = 0; aln->sn[i][aln->lsn[i]] = 0; } /*for(int i=0;i<numseq;i++) { for(int j=0;j<aln->sl[i];j++) printf("%d ", aln->s[i][j]); }*/ //aln_dump(aln); //aln = detect_and_read_sequences(aln,param); if(param->ntree > (int)numseq){ param->ntree = (int)numseq; } //DETECT DNA if(param->dna == -1){ for (quint32 i = 0; i < numseq;i++){ param->dna = byg_detect(aln->s[i],aln->sl[i]); if(param->dna){ break; } } } //param->dna = 0; //k_printf("DNA:%d\n",param->dna); //exit(0); if(param->dna == 1){ //brief sanity check... for (quint32 i = 0; i < numseq;i++){ if(aln->sl[i] < 6){ //k_printf("Dna/Rna alignments are only supported for sequences longer than 6."); free(param); free_aln(aln); throw KalignException("Dna/Rna alignments are only supported for sequences longer than 6."); } } aln = make_dna(aln); } //int j; //fast distance calculation; float** submatrix = 0; submatrix = read_matrix(submatrix,param); // sets gap penalties as well..... 
//if(byg_start(param->alignment_type,"profPROFprofilePROFILE") != -1){ // profile_alignment_main(aln,param,submatrix); //} float** dm = 0; if(param->ntree > 1){ //if(byg_start(param->distance,"pairclustalPAIRCLUSTAL") != -1){ // if(byg_start(param->tree,"njNJ") != -1){ // dm = protein_pairwise_alignment_distance(aln,dm,param,submatrix,1); // }else{ // dm = protein_pairwise_alignment_distance(aln,dm,param,submatrix,0); // } //}else if(byg_start("wu",param->alignment_type) != -1){ // dm = protein_wu_distance2(aln,dm,param); // // param->feature_type = "wumanber"; if(param->dna == 1){ // if(byg_start(param->tree,"njNJ") != -1){ // dm = dna_distance(aln,dm,param,1); // }else{ dm = dna_distance(aln,dm,param,0); // } }else{ //if(byg_start(param->tree,"njNJ") != -1){ // dm = protein_wu_distance(aln,dm,param,1); //}else{ try { dm = protein_wu_distance(aln,dm,param,0); } catch (const KalignException &) { cleanupMemory(submatrix, numseq, dm, aln, param); throw; } //} } if(check_task_canceled(ctx)) { cleanupMemory(submatrix, numseq, dm, aln, param); throwCancellingException(); } /*int j; for (int i = 0; i< numseq;i++){ for (j = 0; j< numseq;j++){ k_printf("%f ",dm[i][j]); } k_printf("\n"); }*/ //if(byg_start(param->tree,"njNJ") != -1){ // tree2 = real_nj(dm,param->ntree); //}else{ tree2 = real_upgma(dm,param->ntree); //} //if(param->print_tree){ // print_tree(tree2,aln,param->print_tree); //} } tree = (int*) malloc(sizeof(int)*(numseq*3+1)); for (quint32 i = 1; i < (numseq*3)+1;i++){ tree[i] = 0; } tree[0] = 1; if(param->ntree < 2){ tree[0] = 0; tree[1] = 1; c = numseq; tree[2] = c; a = 2; for (quint32 i = 3; i < (numseq-1)*3;i+=3){ tree[i] = c; tree[i+1] = a; c++; tree[i+2] = c; a++; } }else if(param->ntree > 2){ ntreeify(tree2,param->ntree); }else{ tree = readtree(tree2,tree); for (quint32 i = 0; i < (numseq*3);i++){ tree[i] = tree[i+1]; } free(tree2->links); free(tree2->internal_lables); free(tree2); } //get matrices... 
//struct feature_matrix* fm = 0; //struct ntree_data* ntree_data = 0; int** map = 0; //if(param->ntree > 2){ // ntree_data = (struct ntree_data*)malloc(sizeof(struct ntree_data)); // ntree_data->realtree = tree2; // ntree_data->aln = aln; // ntree_data->profile = 0; // ntree_data->map = 0; // ntree_data->ntree = param->ntree; // ntree_data->submatrix = submatrix; // ntree_data->tree = tree; // ntree_data = ntree_alignment(ntree_data); // map = ntree_data->map; // tree = ntree_data->tree; // for (int i = 0; i < (numseq*3);i++){ // tree[i] = tree[i+1]; // } // free(ntree_data); //}else if (param->feature_type){ // fm = get_feature_matrix(fm,aln,param); // if(!fm){ // for (int i = 32;i--;){ // free(submatrix[i]); // } // free(submatrix); // free_param(param); // free(map); // free(tree); // throw KalignException("getting feature matrix error"); // } // map = feature_hirschberg_alignment(aln,tree,submatrix,map,fm); // //exit(0); // //map = feature_alignment(aln,tree,submatrix, map,fm); //}else if (byg_start("pairwise",param->alignment_type) != -1){ // if(param->dna == 1){ // map = dna_alignment_against_a(aln,tree,submatrix, map,param->gap_inc); // }else{ // map = hirschberg_alignment_against_a(aln,tree,submatrix, map,param->smooth_window,param->gap_inc); // } // //map = default_alignment(aln,tree,submatrix, map); //}else if (byg_start("fast",param->alignment_type) != -1){ // map = default_alignment(aln,tree,submatrix, map); if(param->dna == 1){ map = dna_alignment(aln,tree,submatrix, map,param->gap_inc); // /*}else if (byg_start("test",param->alignment_type) != -1){ // map = test_alignment(aln,tree,submatrix, map,param->internal_gap_weight,param->smooth_window,param->gap_inc); // }else if (param->aa){ // map = aa_alignment(aln,tree,submatrix, map,param->aa); // }else if (param->alter_gaps){ // map = alter_gaps_alignment(aln,tree,submatrix,map,param->alter_gaps,param->alter_range,param->alter_weight); // }else if (byg_start("altergaps",param->alignment_type) != -1){ // 
map = alter_gaps_alignment(aln,tree,submatrix,map,param->alter_gaps,param->alter_range,param->alter_weight); // }else if(byg_start("simple",param->alignment_type) != -1){ // map = simple_hirschberg_alignment(aln,tree,submatrix, map);*/ //}else if(byg_start("advanced",param->alignment_type) != -1){ // map = advanced_hirschberg_alignment(aln,tree,submatrix, map,param->smooth_window,param->gap_inc,param->internal_gap_weight); }else{ map = hirschberg_alignment(aln,tree,submatrix, map,param->smooth_window,param->gap_inc); } if (map == NULL) { throw KalignException("Failed to build alignment."); } if(check_task_canceled(ctx)) { free_param(param); free_aln(aln); free(map); free(tree); throwCancellingException(); } //clear up sequence array to be reused as gap array.... int *p = 0; for (quint32 i = 0; i < numseq;i++){ p = aln->s[i]; for (a = 0; a < aln->sl[i];a++){ p[a] = 0; } } //clear up for (quint32 i = 0; i < (numseq-1)*3;i +=3){ a = tree[i]; b = tree[i+1]; aln = make_seq(aln,a,b,map[tree[i+2]]); } //for (int i = 0; i < numseq;i++){ // k_printf("%s %d\n",aln->sn[i],aln->nsip[i]); //} for (quint32 i = 0; i < numseq;i++){ aln->nsip[i] = 0; } aln = sort_sequences(aln,tree,param->sort); //for (int i = 0; i < numseq;i++){ // k_printf("%d %d %d\n",i,aln->nsip[i],aln->sip[i][0]); //} /************************************************************************/ /* Convert aln to MA */ /************************************************************************/ res->setAlphabet(ma->getAlphabet()); for (quint32 i = 0; i < numseq;i++){ int f = aln->nsip[i]; QString seq; for(quint32 j=0;j<aln->sl[f];j++) { seq += QString(aln->s[f][j],'-') + aln->seq[f][j]; } seq += QString(aln->s[f][aln->sl[f]],'-'); res->addRow(QString(aln->sn[f]), seq.toLatin1()); } //output(aln,param); /* if(!param->format){ fasta_output(aln,param->outfile); }else{ if (byg_start("msf",param->format) != -1){ msf_output(aln,param->outfile); }else if (byg_start("clustal",param->format) != -1){ 
clustal_output(aln,param->outfile); }else if (byg_start("macsim",param->format) != -1){ macsim_output(aln,param->outfile,param->infile[0]); } } */ free_param(param); free_aln(aln); free(map); free(tree); //KalignContext* ctx = getKalignContext(); }