int main(int argc, char* argv[]) { double before, after; int M = atoi(argv[1]); int N = atoi(argv[2]); int P = atoi(argv[3]); int **A = Allocate2DArray< int >(M, P); int **B = Allocate2DArray< int >(P, N); int **C = Allocate2DArray< int >(M, N); int **C4 = Allocate2DArray< int >(M, N); int i, j; for (i = 0; i < M; i++) { for (j = 0; j < P; j++) { A[i][j] = ((int)(rand()%100) /10); } } for (i = 0; i < P; i++) { for (j = 0; j < N; j++) { B[i][j] = ((int)(rand()%100) / 10.0); } } printf("Execute Standard matmult\n\n"); before = omp_get_wtime(); seqMatMult(M, N, P, A, B, C); after = omp_get_wtime(); printf("Standard matrix function done in %10f secs\n\n\n",(after - before)); //printf("The number of cores is %d\n",omp_get_num_procs()); omp_set_num_threads(8); GRAIN = (long)(M*N*2); before = omp_get_wtime(); matmultS(M, N, P, A, B, C4); after = omp_get_wtime(); printf("Strassen matrix function done in %10f secs\n\n\n",(after - before)); if (CheckResults(M, N, C, C4)) printf("Error in matmultS\n\n"); else printf("OKAY\n\n"); Free2DArray(A); Free2DArray(B); Free2DArray(C); Free2DArray(C4); return 0; }
/* Function: PAMPrior() * * Purpose: Produces an ad hoc "Dirichlet mixture" prior for * match emissions, using a PAM matrix. * * Side effect notice: PAMPrior() replaces the match * emission section of an existing Dirichlet prior, * which is /expected/ to be a simple one-component * kind of prior. The insert emissions /must/ be a * one-component prior (because of details in how * PriorifyEmissionVector() is done). However, * the transitions /could/ be a mixture Dirichlet prior * without causing problems. In other words, the * -p and -P options of hmmb can coexist, but there * may be conflicts. PAMPrior() checks for these, * so there's no serious problem, except that the * error message from PAMPrior() might be confusing to * a user. */ void PAMPrior(char *pamfile, struct p7prior_s *pri, float wt) { FILE *fp; char *blastpamfile; /* BLAST looks in aa/ subdirectory of BLASTMAT */ int **pam; float scale; int xi, xj; int idx1, idx2; if (Alphabet_type != hmmAMINO) Die("PAM prior is only valid for protein sequences"); if (pri->strategy != PRI_DCHLET) Die("PAM prior may only be applied over an existing Dirichlet prior"); if (pri->inum != 1) Die("PAM prior requires that the insert emissions be a single Dirichlet"); if (MAXDCHLET < 20) Die("Whoa, code is misconfigured; MAXDCHLET must be >= 20 for PAM prior"); blastpamfile = FileConcat("aa", pamfile); if ((fp = fopen(pamfile, "r")) == NULL && (fp = EnvFileOpen(pamfile, "BLASTMAT", NULL)) == NULL && (fp = EnvFileOpen(blastpamfile, "BLASTMAT", NULL)) == NULL) Die("Failed to open PAM scoring matrix file %s", pamfile); if (! ParsePAMFile(fp, &pam, &scale)) Die("Failed to parse PAM scoring matrix file %s", pamfile); fclose(fp); free(blastpamfile); pri->strategy = PRI_PAM; pri->mnum = 20; /* Convert PAM entries back to conditional prob's P(xj | xi), * which we'll use as "pseudocounts" weighted by wt. 
*/ for (xi = 0; xi < Alphabet_size; xi++) for (xj = 0; xj < Alphabet_size; xj++) { idx1 = Alphabet[xi] - 'A'; idx2 = Alphabet[xj] - 'A'; pri->m[xi][xj] = aafq[xj] * exp((float) pam[idx1][idx2] * scale); } /* Normalize so that rows add up to wt. * i.e. Sum(xj) mat[xi][xj] = wt for every row xi */ for (xi = 0; xi < Alphabet_size; xi++) { pri->mq[xi] = 1. / Alphabet_size; FNorm(pri->m[xi], Alphabet_size); FScale(pri->m[xi], Alphabet_size, wt); } Free2DArray((void **)pam,27); }
/* Cluster() -- three-argument fragment.
 *
 * Scans the upper triangle of the working matrix mx for entries smaller
 * than min, remembers the row of the last such entry in i, and stores
 * coord[i] as the left child of tree node Np-2.
 *
 * NOTE(review): as written this function cannot run correctly:
 *   - mx, coord, min, Np and i are all read without ever being assigned;
 *   - dmx is never used;
 *   - the function is declared int but has no return statement;
 *   - the outer loop tests col against N while stepping Np.
 * It looks like a skeleton cut out of a fuller Cluster() implementation
 * (presumably one that copies dmx into mx and starts Np at N) -- confirm
 * against the complete source before relying on it.
 */
int Cluster(float **dmx, int N, struct phylo_s *tree)
{
  float **mx;                   /* working matrix; never allocated here */
  int *coord;                   /* matrix-row -> index map; never set here */
  int i;                        /* row of the most recent entry below min */
  int Np;                       /* working size; never initialised here */
  int row, col;                 /* loop variables */
  float min;                    /* comparison threshold; never initialised here */

  /* Outer loop steps Np down; inner loops walk the upper triangle. */
  for (col = 0; col < N; Np--)
    {
      for (row = 0; row < Np; row++)
        for (col = row+1; col < Np; col++)
          if (mx[row][col] < min)
            i = row;
      tree[Np-2].left = coord[i];
    }
  Free2DArray((void **) mx, N);
}
/*
 * rk8 -- advance an ODE state vector by one explicit Runge-Kutta step.
 *
 * The scheme has 11 stages; its nodes and weights come from the tables
 * c[] and a[][] defined elsewhere in this file.
 *
 *   y          - in: state at time t; out: state after the step (n entries)
 *   deriv_func - callback invoked as deriv_func(time, state, n, out, data);
 *                it must write n derivative values into `out`
 *   t          - current time
 *   n          - number of state components
 *   h          - step size
 *   data       - opaque pointer handed through to deriv_func
 *
 * For stage i the derivative is evaluated at t + h*c[i] using the working
 * state ym, then ym is rebuilt as y + h * sum_{k<=i} a[i][k] * kn[k].
 * Whatever ym holds after the last stage is copied back into y.
 */
void rk8(real* y, void (*deriv_func)(real, real*, int, real*, void*), real t, const int n, const real h, void* data)
{
  int i, j, k;
  int nrk = 11;   /* number of stages */

  real  *ym = (real* )Alloc1DArray( sizeof(real), n );
  /* Only rows 0..nrk-1 are ever read or written, and Free2DArray() below is
   * told there are nrk rows, so allocate exactly nrk (the previous nrk*2
   * over-allocated and disagreed with the row count given to the free). */
  real **kn = (real**)Alloc2DArray( sizeof(real), nrk, n );

  memcpy( ym, y, sizeof(real)*n );

  for (i = 0; i < nrk; i++) {
    /* Stage derivative at the current working state. */
    deriv_func( t+h*c[i], ym, n, kn[i], data );

    /* Working state for the next stage (or the final result). */
    for (j = 0; j < n; j++) {
      ym[j] = y[j];
      for (k = 0; k < i+1; k++) {
        ym[j] += a[i][k]*h*kn[k][j];
      }
    }
  }

  memcpy( y, ym, sizeof(real)*n );

  Free1DArray( (void*)ym );
  Free2DArray( (void**)kn, nrk);
}
main (int argc, char ** argv ) { char *seqfile; /* name of sequence file */ SQINFO sqinfo; /* extra info about sequence */ SQFILE *dbfp; /* open sequence file */ int fmt,ofmt=106; /* format of seqfile */ /* 106 is PHYLIP format in SQUID */ char *seq; /* sequence */ int type; /* kAmino, kDNA, kRNA, or kOtherSeq */ sequence * seqs, * cds_seqs; sequence tmp_seqs[2], tmp_cds_seqs[2]; char *optname; char *optarg, *t; int optind; int be_quiet; int seqct = 0,cdsct = 0; int min_aln_len = 0; int do_oneline = 0; char * output_filename = 0, *submat_file = 0; int showaln = 1; int showheader=1; FILE *ofd, *fd; alignment *cds_aln; alignment * opt_alignment = NULL; /* place for pairwise alignment */ int len,i,j, k, jk,ik,aln_count, rc; pairwise_distances pwMLdist, pwNGdist; int firsttime = 1; struct timeval tp; pwMLdist.N = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwMLdist.dN = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwMLdist.S = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwMLdist.dS = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwMLdist.dNdS = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwMLdist.SEdS = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwMLdist.SEdN = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwMLdist.t = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwMLdist.kappa= make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwNGdist.dN = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwNGdist.dS = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwNGdist.dNdS = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); /* pwMLdist.N = pwMLdist.dN = pwMLdist.S = 0; pwMLdist.dS = pwMLdist.dNdS = pwMLdist.SEdS = 0; pwMLdist.SEdN = pwMLdist.t = pwMLdist.kappa= 0; pwNGdist.dN = pwNGdist.dS = pwNGdist.dNdS = 0; */ Alntype = default_aln_type; /* Command line Parse */ fmt = SQFILE_UNKNOWN; /* default: autodetect format */ be_quiet = FALSE; type = kOtherSeq; /* for our purposes this is only pairwise alignments, but * would rather do it correctly in case we move to MSA case */ while (Getopt(argc, 
argv, OPTIONS, NOPTIONS, usage, &optind, &optname, &optarg)) { if (strcmp(optname, "--matrix") == 0) submat_file = optarg; else if (strcmp(optname, "--quiet") == 0) be_quiet = TRUE; else if (strcmp(optname, "--gapopen") == 0) { Gapopen = atoi(optarg); if( Gapopen < 0 ) Gapopen *= -1; } else if (strcmp(optname, "--gapext") == 0) { Gapext = atoi(optarg); if( Gapext < 0 ) Gapext *= -1; } else if (strcmp(optname, "--informat") == 0) { fmt = String2SeqfileFormat(optarg); if (fmt == SQFILE_UNKNOWN) Die("unrecognized sequence file format \"%s\"", optarg); } else if (strcmp(optname, "--outformat") == 0) { ofmt = String2SeqfileFormat(optarg); if (ofmt == SQFILE_UNKNOWN) Die("unrecognized sequence file format \"%s\"", optarg); } else if( strcmp(optname, "--global") == 0 ) { Alntype = global; } else if (strcmp(optname, "-h") == 0) { puts(usage); puts(experts); exit(EXIT_SUCCESS); } else if ( strcmp(optname, "-v") == 0 ) { Verbose = 1; } else if ( strcmp(optname, "--gapchar") == 0 ) { GapChar = optarg[0]; } else if( strcmp(optname, "--output") == 0 ) { output_filename = optarg; } else if( strcmp(optname, "--showtable" ) == 0 ) { showaln = 0; } else if( strcmp(optname, "--noheader" ) == 0 ) { showheader = 0; } } if (argc - optind < 1) Die("%s\n", usage); if( ! submat_file ) { if( (t = getenv("SUBOPTDIR")) != 0 || (t = getenv("SUBOPT_DIR")) != 0 ) { submat_file = calloc(strlen(t) + 24, sizeof(char)); sprintf(submat_file, "%s/%s",t,Default_submat); } else { submat_file = calloc(strlen((void *)Default_submat) + 24, sizeof(char)); sprintf(submat_file, "../%s",Default_submat); } } /* open matrix */ fd = fopen(submat_file, "r"); if( ! ParsePAMFile(fd,&ScoringMatrix, &MatrixScale) ) { fprintf(stderr, "Cannot parse or open matrix file %s\n",submat_file); free(submat_file); exit(EXIT_SUCCESS); } if( output_filename && strlen(output_filename) != 1 && output_filename[0] != '-') { ofd = fopen(output_filename,"w"); if( ! 
ofd ) { fprintf(stderr, "could not open file %s",output_filename); goto end; } } else ofd = stdout; while( optind < argc ) { seqfile = argv[optind++]; /* Try to work around inability to autodetect from a pipe or .gz: * assume FASTA format */ if (fmt == SQFILE_UNKNOWN && (Strparse("^.*\\.gz$", seqfile, 0) || strcmp(seqfile, "-") == 0)) fmt = SQFILE_FASTA; if ((dbfp = SeqfileOpen(seqfile, fmt, NULL)) == NULL) Die("Failed to open sequence file %s for reading", seqfile); while (ReadSeq(dbfp, dbfp->format, &seq, &sqinfo)) { FreeSequence(NULL, &sqinfo); seqct++; } cds_seqs = (sequence *)calloc(seqct, sizeof(sequence)); seqs = (sequence *)calloc(seqct, sizeof(sequence)); SeqfileRewind(dbfp); seqct=0; while (ReadSeq(dbfp, dbfp->format, &seq, &sqinfo)) { sqinfo.type = Seqtype(seq); if( sqinfo.type == kDNA || sqinfo.type == kRNA ) { seqs[seqct].seqstr = Translate(seq,stdcode1); /* Let's remove the last codon if it is a stop codon */ len = strlen(seqs[seqct].seqstr); if( Verbose ) fprintf(stderr,"seqct is %d length is %d\n",seqct, len); if( seqs[seqct].seqstr[len-1] == '*' ) { seqs[seqct].seqstr[len-1] = '\0'; seq[strlen(seq) - 3] = '\0'; } cds_seqs[cdsct].seqstr = seq; seqs[seqct].seqname = calloc(strlen(sqinfo.name)+1,sizeof(char)); cds_seqs[cdsct].seqname = calloc(strlen(sqinfo.name)+1,sizeof(char)); strcpy(seqs[seqct].seqname,sqinfo.name ); strcpy(cds_seqs[cdsct].seqname,sqinfo.name); cds_seqs[cdsct].length = sqinfo.len; cds_seqs[cdsct].alphabet = ( sqinfo.type == kDNA ) ? 
dna : rna; seqs[seqct].length = strlen(seqs[seqct].seqstr); seqs[seqct].alphabet = protein; cdsct++; seqct++; } else { fprintf(stderr,"Expect CDS sequences (DNA or RNA) not Protein\n"); goto end; } FreeSequence(NULL, &sqinfo); if( Verbose && seqct > 3 ) break; } if( seqct < 2 ) { fprintf(stderr,"Must have provided a valid file with at least 2 sequences in it"); goto end; } for( i=0; i < seqct; i++ ) { for(k=i+1; k < seqct; k++ ) { if( (opt_alignment = (alignment *)calloc(1,sizeof(alignment *))) == NULL) { fprintf(stderr,"Could not allocate memory\n"); goto end; } opt_alignment->msa = NULL; rc = optimal_align(&seqs[i],&seqs[k],opt_alignment); if( rc != 1 ) { fprintf(stderr,"Could not make an optimal alignment\n"); goto end; } else { tmp_cds_seqs[0] = cds_seqs[i]; tmp_cds_seqs[1] = cds_seqs[k]; rc = mrtrans(opt_alignment, tmp_cds_seqs, &cds_aln,0); if( rc != 0 ) { fprintf(stderr, "Could not map the coding sequence to the protein alignemnt for aln %d: %d\n",i,rc); goto end; } if( showaln ) { if( ofmt >= 100 ) { MSAFileWrite(ofd,cds_aln->msa, ofmt,do_oneline); } else { for(j=0; j < cds_aln->msa->nseq; j++ ) { WriteSeq(ofd, ofmt, cds_aln->msa->aseq[j], &(cds_aln->sqinfo[j]) ); } } } else { if( showheader && firsttime ) { fprintf(ofd,"SEQ1\tSEQ2\tSCORE\tdN\tdS\tOMEGA\tN\tS\tkappa\tt\tLENGTH\n"); firsttime = 0; } if( do_kaks_yn00(cds_aln->msa, &pwMLdist,&pwNGdist) < 0 ) { fprintf(stderr, "warning: problem with align for %s %s\n", cds_aln->msa->sqname[0], cds_aln->msa->sqname[1]); continue; } for(ik = 0; ik < NUM_PW_SEQS; ik++ ) { for( jk = ik+1; jk < NUM_PW_SEQS; jk++ ) { fprintf(ofd,"%s\t%s\t%d\t%f\t%f\t%f\t%f\t%f\t%f\t%f\t%d\n", cds_aln->sqinfo[ik].name, cds_aln->sqinfo[jk].name, opt_alignment->score, pwMLdist.dN[ik][jk],pwMLdist.dS[ik][jk], pwMLdist.dNdS[ik][jk], pwMLdist.N[ik][jk], pwMLdist.S[ik][jk], pwMLdist.kappa[ik][jk], pwMLdist.t[ik][jk], opt_alignment->msa->alen); } } } } cleanup_alignment(cds_aln); cleanup_alignment(opt_alignment); } } } if( ofd && ofd != 
stdout ) fclose(ofd); end: free(submat_file); Free2DArray((void **)ScoringMatrix,27); for(i =0; i< seqct; i++ ) { free(seqs[i].seqstr); free(seqs[i].seqname); seqs[i].seqstr = seqs[i].seqname = 0; } for(i = 0; i < cdsct; i++) { free(cds_seqs[i].seqstr); free(cds_seqs[i].seqname); cds_seqs[i].seqstr = cds_seqs[i].seqname = 0; } cleanup_matrix((void **)pwMLdist.N,NUM_PW_SEQS); cleanup_matrix((void **)pwMLdist.dN,NUM_PW_SEQS); cleanup_matrix((void **)pwMLdist.S,NUM_PW_SEQS); cleanup_matrix((void **)pwMLdist.dS,NUM_PW_SEQS); cleanup_matrix((void **)pwMLdist.SEdS,NUM_PW_SEQS); cleanup_matrix((void **)pwMLdist.SEdN,NUM_PW_SEQS); cleanup_matrix((void **)pwMLdist.t,NUM_PW_SEQS); cleanup_matrix((void **)pwMLdist.dNdS,NUM_PW_SEQS); cleanup_matrix((void **)pwMLdist.kappa,NUM_PW_SEQS); cleanup_matrix((void **)pwNGdist.dN,NUM_PW_SEQS); cleanup_matrix((void **)pwNGdist.dS,NUM_PW_SEQS); cleanup_matrix((void **)pwNGdist.dNdS,NUM_PW_SEQS); free(pwNGdist.dNdS); free(pwNGdist.dN); free(pwNGdist.dS); free(pwMLdist.dNdS); free(pwMLdist.dN); free(pwMLdist.dS); free(pwMLdist.N); free(pwMLdist.S); free(pwMLdist.SEdS); free(pwMLdist.SEdN); free(pwMLdist.t); free(pwMLdist.kappa); return 0; }
/* Function: ViterbiAlignAlignment()
 *
 * Purpose:  Align a multiple sequence alignment to an HMM without
 *           altering the multiple alignment. Every alignment column is
 *           treated as a unit: the dynamic programming grid is
 *           (alen+2) rows by (M+2) columns, and each cell's score is
 *           summed over all nseq sequences.
 *
 * Args:     shmm    - HMM in integer log-odds score form
 *           aseq    - alignment, [0..nseq-1][0..alen-1]
 *           alen    - length of aligned sequences
 *           nseq    - number of aligned sequences
 *           ret_tr  - RETURN: array of tracebacks. rpos field is
 *                     relative to aseq, not raw seq, similar to
 *                     Maxmodelmaker(); use DealignTrace() if you
 *                     want relative to raw sequence.
 *           ret_sc  - RETURN: sum of log odds scores
 *                     (final match score divided by INTSCALE).
 *
 * Return:   (void)
 *           ret_tr is alloced here. Individuals must be free'd by FreeTrace(),
 *           then tr itself free'd by free().
 *
 * NOTE(review): residues index the emission tables as (symbol - 'A'), so
 * this presumably expects upper-case letters in every non-gap position --
 * confirm against callers.
 * NOTE(review): the Free2DArray() calls at the end pass int**, char** and
 * struct fvit_s** without a cast, unlike other callers in this code base --
 * confirm against the prototype in use.
 */
void
ViterbiAlignAlignment(struct shmm_s *shmm, char **aseq, int alen, int nseq,
                      struct trace_s ***ret_tr, float *ret_sc)
{
  struct fvit_s **mx;           /* the viterbi calculation grid              */
  int    score;                 /* tmp variable for scores                   */
  int    i;                     /* counter for sequence position: 0,1..L     */
  int    k;                     /* counter for model position: 0,1..M        */
  int    idx;                   /* index for sequences                       */
  struct fvit_s *thisrow;       /* ptr to current row of mx                  */
  struct fvit_s *nextrow;       /* ptr to next row of mx                     */
  int  **matocc;                /* [0..alen+1][0..nseq-1], 1 for MATCH       */
  struct trace_s **tr;          /* array of tracebacks to return             */
  int   *tpos;                  /* index for position in indiv traces        */
  int    lastsub;               /* last state type in master trace           */

  /* A crucial extra component of this alignment algorithm:
   * at each matrix cell, we have to remember: for the best
   * path into the INSERT subcell, what state is each sequence in?
   * This is non-trivial because some gaps are assigned to
   * no states. When we calculate the score from an insert column,
   * where there are gaps we have to look up the previous state.
   *
   * Fortunately, we don't need to keep a full matrix of these,
   * or we'd be in serious memory problems. Use a rolling pointer
   * trick, keep two active rows "current" and "next".
   */
  char **cur_state;             /* [0..M+1][0..nseq-1]; MATCH, INSERT, or DELETE */
  char **nxt_state;             /* same, except keeps states for next row    */
  char **swap;                  /* used for swapping cur, nxt                */

  /********************************************
   * Initial setup and allocations
   ********************************************/
  /* allocate the calculation matrix, which is 0..alen+1 rows by 0..M+1 cols */
  mx        = (struct fvit_s **) MallocOrDie (sizeof(struct fvit_s *) * (alen+2));
  matocc    = (int **)           MallocOrDie (sizeof(int *) * (alen+2));
  cur_state = (char **)          MallocOrDie (sizeof(char *) * (shmm->M+2));
  nxt_state = (char **)          MallocOrDie (sizeof(char *) * (shmm->M+2));
  for (i = 0; i <= alen+1; i++)
    {
      mx[i]    = (struct fvit_s *) MallocOrDie (sizeof(struct fvit_s) * (shmm->M+2));
      matocc[i]= (int *)           MallocOrDie (sizeof(int) * nseq);
    }
  for (k = 0; k <= shmm->M+1; k++)
    {
      cur_state[k] = (char *) MallocOrDie (sizeof(char) * nseq);
      nxt_state[k] = (char *) MallocOrDie (sizeof(char) * nseq);
    }

  /********************************************
   * Initialization
   ********************************************/
  /* initialize the first cell 0,0; -99999999 stands in for -infinity */
  mx[0][0].score_m = 0;
  mx[0][0].score_d = -99999999;
  mx[0][0].score_i = -99999999;

  /* nxt_state becomes cur_state at the first swap inside the recursion */
  for (k = 0; k <= shmm->M+1; k++)
    for (idx = 0; idx < nseq; idx++)
      nxt_state[k][idx] = MATCH;

  /* initialize the top row */
  for (k = 1; k <= shmm->M+1; k++)
    {
      mx[0][k].score_m = -99999999;
      mx[0][k].score_i = -99999999;
    }

  /* Precalculate matocc (match occupancy).
   * 1 if symbol in column for this seq, 0 if not.
   * 1..alen, from 0..alen-1 alignments
   */
  for (idx = 0; idx < nseq; idx++)
    {
      matocc[0][idx] = matocc[alen+1][idx] = 1; /* dummies for BEGIN, END */
      for (i = 1; i <= alen; i++)
        matocc[i][idx] = isgap(aseq[idx][i-1]) ? 0 : 1;
    }

  /********************************************
   * Recursion: fill in the mx matrix
   ********************************************/
  /* Alignment is 0..alen-1, we index it here as 1..alen because of Viterbi matrix. */
  for (i = 0; i <= alen; i++)
    {
      /* get ptrs into current and next row. */
      thisrow = mx[i];
      nextrow = mx[i+1];

      /* initialize in the next row */
      nextrow[0].score_m = -99999999;
      nextrow[0].score_d = -99999999;

      /* roll the per-sequence state rows: last row's "next" is now "current" */
      swap = cur_state; cur_state = nxt_state; nxt_state = swap;

      for (k = 0; k <= shmm->M; k++)
        { /* begin inner loop... this is where all the time is spent. */

          /* add in emission scores to the current cell. */
          if (i > 0)
            for (idx = 0; idx < nseq; idx++)
              if (matocc[i][idx])
                {
                  thisrow[k].score_m += shmm->m_emit[aseq[idx][i-1] - 'A'][k];
                  thisrow[k].score_i += shmm->i_emit[aseq[idx][i-1] - 'A'][k];
                }

          /* initialize with transitions out of delete state */
          /* to delete: all nseq sequences take the D->D transition */
          thisrow[k+1].score_d = thisrow[k].score_d + shmm->t[9*k + Tdd] * nseq;
          thisrow[k+1].tback_d = DELETE;

          /* to insert */
          nextrow[k].score_i = thisrow[k].score_d;
          nextrow[k].tback_i = DELETE;
          for (idx = 0; idx < nseq; idx++)
            if (matocc[i+1][idx])
              {
                nextrow[k].score_i += shmm->t[9*k + Tdi];
                nxt_state[k][idx] = INSERT;
              }
            else
              nxt_state[k][idx] = DELETE;

          /* to match */
          nextrow[k+1].score_m = thisrow[k].score_d;
          nextrow[k+1].tback_m = DELETE;
          for (idx = 0; idx < nseq; idx++)
            if (matocc[i+1][idx])
              nextrow[k+1].score_m += shmm-> t[9*k + Tdm];
            else
              nextrow[k+1].score_m += shmm-> t[9*k + Tdd];

          /* deal with transitions out of insert state */
          /* to delete state. The transition used for each sequence depends
           * on the state remembered for it in cur_state. */
          score = thisrow[k].score_i;
          for (idx = 0; idx < nseq; idx++)
            switch (cur_state[k][idx]) {
            case MATCH:  score += shmm->t[9*k + Tmd]; break;
            case DELETE: score += shmm->t[9*k + Tdd]; break;
            case INSERT: score += shmm->t[9*k + Tid]; break;
            }
          if (score > thisrow[k+1].score_d)
            {
              thisrow[k+1].score_d = score;
              thisrow[k+1].tback_d = INSERT;
            }

          /* to insert state; gapped sequences contribute nothing */
          score = thisrow[k].score_i;
          for (idx = 0; idx < nseq; idx++)
            {
              if (matocc[i+1][idx])
                switch (cur_state[k][idx]) {
                case MATCH:  score += shmm->t[9*k + Tmi]; break;
                case DELETE: score += shmm->t[9*k + Tdi]; break;
                case INSERT: score += shmm->t[9*k + Tii]; break;
                }
            }
          if (score > nextrow[k].score_i)
            {
              nextrow[k].score_i = score;
              nextrow[k].tback_i = INSERT;
              /* gapped sequences carry their remembered state forward */
              for (idx = 0; idx < nseq; idx++)
                if (matocc[i+1][idx])
                  nxt_state[k][idx] = INSERT;
                else
                  nxt_state[k][idx] = cur_state[k][idx];
            }

          /* to match state */
          score = thisrow[k].score_i;
          for (idx = 0; idx < nseq; idx++)
            if (matocc[i+1][idx])
              switch (cur_state[k][idx]) {
              case MATCH:  score += shmm->t[9*k + Tmm]; break;
              case DELETE: score += shmm->t[9*k + Tdm]; break;
              case INSERT: score += shmm->t[9*k + Tim]; break;
              }
            else
              switch (cur_state[k][idx]) {
              case MATCH:  score += shmm->t[9*k + Tmd]; break;
              case DELETE: score += shmm->t[9*k + Tdd]; break;
              case INSERT: score += shmm->t[9*k + Tid]; break;
              }
          if (score > nextrow[k+1].score_m)
            {
              nextrow[k+1].score_m = score;
              nextrow[k+1].tback_m = INSERT;
            }

          /* Transitions out of match state. */
          /* to delete */
          score = thisrow[k].score_m;
          for (idx = 0; idx < nseq; idx++)
            if (matocc[i][idx])
              score += shmm->t[9*k + Tmd];
            else
              score += shmm->t[9*k + Tdd];
          if (score > thisrow[k+1].score_d)
            {
              thisrow[k+1].score_d = score;
              thisrow[k+1].tback_d = MATCH;
            }

          /* to insert */
          score = thisrow[k].score_m;
          for (idx = 0; idx < nseq; idx++)
            if (matocc[i+1][idx])
              {
                if (matocc[i][idx])
                  score += shmm->t[9*k + Tmi];
                else
                  score += shmm->t[9*k + Tdi];
              }
          if (score > nextrow[k].score_i)
            {
              nextrow[k].score_i = score;
              nextrow[k].tback_i = MATCH;
              for (idx = 0; idx < nseq; idx++)
                if (matocc[i+1][idx])
                  nxt_state[k][idx] = INSERT;
                else if (matocc[i][idx])
                  nxt_state[k][idx] = MATCH;
                else
                  nxt_state[k][idx] = DELETE;
            }

          /* to match */
          score = thisrow[k].score_m;
          for (idx = 0; idx < nseq; idx++)
            if (matocc[i][idx])
              {
                if (matocc[i+1][idx])
                  score += shmm->t[9*k + Tmm];
                else
                  score += shmm->t[9*k + Tmd];
              }
            else
              {
                if (matocc[i+1][idx])
                  score += shmm->t[9*k + Tdm];
                else
                  score += shmm->t[9*k + Tdd];
              }
          if (score > nextrow[k+1].score_m)
            {
              nextrow[k+1].score_m = score;
              nextrow[k+1].tback_m = MATCH;
            }
        } /* end loop over model positions k */
    } /* end loop over alignment positions i */

  /* PrintFragViterbiMatrix(mx, alen, shmm->M); */

  /* Fill stage finished.
   * mx now contains final score in mx[alen+1][M+1].
   * Trace back from there to get master alignment.
   */
  tr   = (struct trace_s **) MallocOrDie (sizeof(struct trace_s *) * nseq);
  tpos = (int *)             MallocOrDie (sizeof(int) * nseq);
  for (idx = 0; idx < nseq; idx++)
    {
      AllocTrace(alen + shmm->M + 3, &(tr[idx]));
      /* traces are built back to front; element 0 is the END position */
      tr[idx]->nodeidx[0]   = shmm->M+1;
      tr[idx]->statetype[0] = MATCH;
      tr[idx]->rpos[0]      = -1;
      tpos[idx]             = 1;
    }

  i = alen+1;
  k = shmm->M+1;
  lastsub= MATCH;
  while (i != 0 || k != 0)
    {
      /* step to the predecessor cell of the subcell we are in */
      switch (lastsub) {
      case MATCH:  lastsub = mx[i][k].tback_m; i--; k--; break;
      case DELETE: lastsub = mx[i][k].tback_d;      k--; break;
      case INSERT: lastsub = mx[i][k].tback_i; i--;      break;
      default: Die("trace failed!");
      }

      /* record that subcell in every sequence's trace */
      switch (lastsub) {
      case MATCH:
        /* a gap in a match column is recorded as a DELETE for that sequence */
        for (idx = 0; idx < nseq; idx++)
          if (matocc[i][idx])
            {
              tr[idx]->nodeidx[tpos[idx]]   = k;
              tr[idx]->statetype[tpos[idx]] = MATCH;
              tr[idx]->rpos[tpos[idx]]      = i-1;
              tpos[idx]++;
            }
          else
            {
              tr[idx]->nodeidx[tpos[idx]]   = k;
              tr[idx]->statetype[tpos[idx]] = DELETE;
              tr[idx]->rpos[tpos[idx]]      = -1;
              tpos[idx]++;
            }
        break;

      case INSERT:
        /* a gap in an insert column adds nothing to that sequence's trace */
        for (idx = 0; idx < nseq; idx++)
          if (matocc[i][idx])
            {
              tr[idx]->nodeidx[tpos[idx]]   = k;
              tr[idx]->statetype[tpos[idx]] = INSERT;
              tr[idx]->rpos[tpos[idx]]      = i-1;
              tpos[idx]++;
            }
        break;

      case DELETE:
        for (idx = 0; idx < nseq; idx++)
          {
            tr[idx]->nodeidx[tpos[idx]]   = k;
            tr[idx]->statetype[tpos[idx]] = DELETE;
            tr[idx]->rpos[tpos[idx]]      = -1;
            tpos[idx]++;
          }
        break;

      default:
        Die("trace failed!");
      } /* end switch across new subcell in traceback */
    } /* end traceback */

  /* traces were written end-to-start; put them in forward order */
  for (idx = 0; idx < nseq; idx++)
    ReverseTrace(tr[idx], tpos[idx]);

  *ret_tr = tr;
  *ret_sc = (float) mx[alen+1][shmm->M+1].score_m / INTSCALE;

  Free2DArray(matocc, alen+2);
  Free2DArray(cur_state, shmm->M+2);
  Free2DArray(nxt_state, shmm->M+2);
  Free2DArray(mx, alen+2);
  free(tpos);
}
/* Function: StrDPShuffle()
 * Date:     SRE, Fri Oct 29 09:15:17 1999 [St. Louis]
 *
 * Purpose:  Returns a shuffled version of s2, in s1.
 *           (s1 and s2 may be identical; i.e. a string
 *           may be shuffled in place.) The shuffle is a
 *           "doublet-preserving" (DP) shuffle. Both
 *           mono- and di-symbol composition are preserved.
 *
 *           Letters are handled case-insensitively and the
 *           result is written in upper case.
 *
 *           Done by searching for a random Eulerian
 *           walk on a directed multigraph.
 *           Reference: S.F. Altschul and B.W. Erickson, Mol. Biol.
 *           Evol. 2:526-538, 1985. Quoted bits in my comments
 *           are from Altschul's outline of the algorithm.
 *
 * Args:     s1   - RETURN: the string after it's been shuffled
 *                    (space for s1 allocated by caller)
 *           s2   - the string to be shuffled
 *
 * Returns:  0 if string can't be shuffled (it's not all [a-zA-Z]
 *             alphabetic).
 *           1 on success. Strings of length 0 or 1 have nothing to
 *             shuffle and are simply copied (upper-cased) to s1.
 */
int
StrDPShuffle(char *s1, char *s2)
{
  int    len;
  int    pos;   /* a position in s1 or s2 */
  int    x,y;   /* indices of two characters */
  char **E;     /* edge lists: E[0] is the edge list from vertex A */
  int   *nE;    /* lengths of edge lists */
  int   *iE;    /* positions in edge lists */
  int    n;     /* tmp: remaining length of an edge list to be shuffled */
  char   sf;    /* last character in s2 */
  char   Z[26]; /* connectivity in last edge graph Z */
  int    keep_connecting; /* flag used in Z connectivity algorithm */
  int    is_eulerian;     /* flag used for when we've got a good Z */

  /* First, verify that the string is entirely alphabetic.
   * <ctype.h> functions need an unsigned char value, not a plain char.
   */
  len = strlen(s2);
  for (pos = 0; pos < len; pos++)
    if (! isalpha((unsigned char) s2[pos])) return 0;

  /* Fewer than two symbols means no doublets and no edges. The graph
   * code below would index s2[0] / E[x][0] out of bounds, so finish here.
   */
  if (len < 2)
    {
      if (len == 1) s1[0] = toupper((unsigned char) s2[0]);
      s1[len] = '\0';
      return 1;
    }

  /* "(1) Construct the doublet graph G and edge ordering E
   *      corresponding to S."
   *
   * Note that these also imply the graph G; and note,
   * for any list x with nE[x] = 0, vertex x is not part
   * of G.
   */
  E  = MallocOrDie(sizeof(char *) * 26);
  nE = MallocOrDie(sizeof(int)    * 26);
  for (x = 0; x < 26; x++)
    {
      E[x]  = MallocOrDie(sizeof(char) * (len-1));
      nE[x] = 0;
    }

  x = toupper((unsigned char) s2[0]) - 'A';
  for (pos = 1; pos < len; pos++)
    {
      y = toupper((unsigned char) s2[pos]) - 'A';
      E[x][nE[x]] = y;
      nE[x]++;
      x = y;
    }

  /* Now we have to find a random Eulerian edge ordering.
   */
  sf = toupper((unsigned char) s2[len-1]) - 'A';
  is_eulerian = 0;
  while (! is_eulerian)
    {
      /* "(2) For each vertex s in G except s_f, randomly select
       *      one edge from the s edge list of E(S) to be the
       *      last edge of the s list in a new edge ordering."
       *
       * select random edges and move them to the end of each
       * edge list.
       */
      for (x = 0; x < 26; x++)
        {
          if (nE[x] == 0 || x == sf) continue;

          pos           = CHOOSE(nE[x]);
          y             = E[x][pos];
          E[x][pos]     = E[x][nE[x]-1];
          E[x][nE[x]-1] = y;
        }

      /* "(3) From this last set of edges, construct the last-edge
       *      graph Z and determine whether or not all of its
       *      vertices are connected to s_f."
       *
       * a probably stupid algorithm for looking at the
       * connectivity in Z: iteratively sweep through the
       * edges in Z, and build up an array (confusingly called Z[x])
       * whose elements are 1 if x is connected to sf, else 0.
       */
      for (x = 0; x < 26; x++) Z[x] = 0;
      Z[(int) sf] = keep_connecting = 1;

      while (keep_connecting) {
        keep_connecting = 0;
        for (x = 0; x < 26; x++)
          {
            /* A vertex with no outgoing edges has no last edge;
             * reading E[x][nE[x]-1] for it would be E[x][-1]. */
            if (nE[x] == 0) continue;

            y = E[x][nE[x]-1];            /* xy is an edge in Z */
            if (Z[x] == 0 && Z[y] == 1)   /* x is connected to sf in Z */
              {
                Z[x] = 1;
                keep_connecting = 1;
              }
          }
      }

      /* if any vertex in Z is tagged with a 0, it's
       * not connected to sf, and we won't have a Eulerian
       * walk.
       */
      is_eulerian = 1;
      for (x = 0; x < 26; x++)
        {
          if (nE[x] == 0 || x == sf) continue;
          if (Z[x] == 0) {
            is_eulerian = 0;
            break;
          }
        }

      /* "(4) If any vertex is not connected in Z to s_f, the
       *      new edge ordering will not be Eulerian, so return to
       *      (2). If all vertices are connected in Z to s_f,
       *      the new edge ordering will be Eulerian, so
       *      continue to (5)."
       *
       * e.g. note infinite loop while is_eulerian is FALSE.
       */
    }

  /* "(5) For each vertex s in G, randomly permute the remaining
   *      edges of the s edge list of E(S) to generate the s
   *      edge list of the new edge ordering E(S')."
   *
   * Essentially a StrShuffle() on the remaining nE[x]-1 elements
   * of each edge list; unfortunately our edge lists are arrays,
   * not strings, so we can't just call out to StrShuffle().
   */
  for (x = 0; x < 26; x++)
    for (n = nE[x] - 1; n > 1; n--)
      {
        pos       = CHOOSE(n);
        y         = E[x][pos];
        E[x][pos] = E[x][n-1];
        E[x][n-1] = y;
      }

  /* "(6) Construct sequence S', a random DP permutation of
   *      S, from E(S') as follows. Start at the s_1 edge list.
   *      At each s_i edge list, add s_i to S', delete the
   *      first edge s_i,s_j of the edge list, and move to
   *      the s_j edge list. Continue this process until
   *      all edge lists are exhausted."
   */
  iE = MallocOrDie(sizeof(int) * 26);
  for (x = 0; x < 26; x++) iE[x] = 0;

  pos = 0;
  x = toupper((unsigned char) s2[0]) - 'A';   /* read before s1[0] is written: s1 may alias s2 */
  while (1)
    {
      s1[pos++] = 'A' + x;      /* add s_i to S' */

      y = E[x][iE[x]];
      iE[x]++;                  /* "delete" s_i,s_j from edge list */

      x = y;                    /* move to s_j edge list. */

      if (iE[x] == nE[x])
        break;                  /* the edge list is exhausted. */
    }
  s1[pos++] = 'A' + sf;
  s1[pos]   = '\0';

  /* Reality checks.
   */
  if (x   != sf)  Die("hey, you didn't end on s_f.");
  if (pos != len) Die("hey, pos (%d) != len (%d).", pos, len);

  /* Free and return.
   */
  Free2DArray((void **) E, 26);
  free(nE);
  free(iE);
  return 1;
}
/* Function: MSAFree()
 * Date:     SRE, Tue May 18 11:20:16 1999 [St. Louis]
 *
 * Purpose:  Release a multiple sequence alignment structure and
 *           everything it owns: the per-sequence arrays, the
 *           per-alignment strings, the comment and markup tables,
 *           the four key indices, and finally the MSA itself.
 *
 * Args:     msa - the alignment; must not be NULL, since its fields
 *                 are read before anything is freed.
 *
 * Returns:  (void)
 */
void
MSAFree(MSA *msa)
{
  /* Per-sequence 2D arrays, each with msa->nseq rows. */
  Free2DArray((void **) msa->aseq,   msa->nseq);
  Free2DArray((void **) msa->sqname, msa->nseq);
  Free2DArray((void **) msa->sqacc,  msa->nseq);
  Free2DArray((void **) msa->sqdesc, msa->nseq);
  Free2DArray((void **) msa->ss,     msa->nseq);
  Free2DArray((void **) msa->sa,     msa->nseq);

  /* Flat arrays and strings. free(NULL) does nothing, so fields that
   * were never set need no explicit test. */
  free(msa->sqlen);
  free(msa->wgt);
  free(msa->name);
  free(msa->desc);
  free(msa->acc);
  free(msa->au);
  free(msa->ss_cons);
  free(msa->sa_cons);
  free(msa->rf);
  free(msa->sslen);
  free(msa->salen);

  /* Comments and tagged markup: tag/value tables sized by their own
   * counts; the per-sequence ones (gs, gr) are three-dimensional. */
  Free2DArray((void **) msa->comment, msa->ncomment);
  Free2DArray((void **) msa->gf_tag,  msa->ngf);
  Free2DArray((void **) msa->gf,      msa->ngf);
  Free2DArray((void **) msa->gs_tag,  msa->ngs);
  Free3DArray((void ***)msa->gs,      msa->ngs, msa->nseq);
  Free2DArray((void **) msa->gc_tag,  msa->ngc);
  Free2DArray((void **) msa->gc,      msa->ngc);
  Free2DArray((void **) msa->gr_tag,  msa->ngr);
  Free3DArray((void ***)msa->gr,      msa->ngr, msa->nseq);

  /* Key indices. */
  GKIFree(msa->index);
  GKIFree(msa->gs_idx);
  GKIFree(msa->gc_idx);
  GKIFree(msa->gr_idx);

  free(msa);
}
/* Function: WriteMSF()
 * Date:     SRE, Mon May 31 11:25:18 1999 [St. Louis]
 *
 * Purpose:  Write an alignment in MSF format to an open file.
 *
 * Args:     fp    - file that's open for writing.
 *           msa   - alignment to write.
 *
 *                   Note that msa->type, usually optional, must be
 *                   set for WriteMSF to work. If it is kOtherSeq it
 *                   is guessed from the alignment (and stored back
 *                   into msa->type); if it still cannot be
 *                   determined, a fatal error is generated.
 *
 * Returns:  (void)
 */
void
WriteMSF(FILE *fp, MSA *msa)
{
  time_t now;           /* current time as a time_t */
  char   date[64];      /* today's date in GCG's format "October 3, 1996 15:57" */
  char **gcg_aseq;      /* aligned sequences with gaps converted to GCG format */
  char **gcg_sqname;    /* sequence names with GCG-valid character sets */
  int    idx;           /* counter for sequences         */
  char  *s;             /* pointer into sqname or seq    */
  int    len;           /* tmp variable for name lengths */
  int    namelen;       /* maximum name length used      */
  int    pos;           /* position counter              */
  char   buffer[51];    /* buffer for writing seq        */
  int    i;             /* another position counter */

  /*****************************************************************
   * Make copies of sequence names and sequences.
   *   GCG recommends that name characters should only contain
   *   alphanumeric characters, -, or _
   *   Some GCG and GCG-compatible software is sensitive to this.
   *   We silently convert all other characters to '_'.
   *
   *   For sequences, GCG allows only ~ and . for gaps.
   *   Otherwise, everything is interpreted as a residue;
   *   so squid's IUPAC-restricted chars are fine. ~ means
   *   an external gap. . means an internal gap.
   *****************************************************************/

  /* make copies that we can edit */
  gcg_aseq   = MallocOrDie(sizeof(char *) * msa->nseq);
  gcg_sqname = MallocOrDie(sizeof(char *) * msa->nseq);
  for (idx = 0; idx < msa->nseq; idx++)
    {
      gcg_aseq[idx]   = sre_strdup(msa->aseq[idx],   msa->alen);
      gcg_sqname[idx] = sre_strdup(msa->sqname[idx], -1);
    }

  /* alter names as needed */
  for (idx = 0; idx < msa->nseq; idx++)
    for (s = gcg_sqname[idx]; *s != '\0'; s++)
      if (! isalnum((int) *s) && *s != '-' && *s != '_')
        *s = '_';

  /* alter gap chars in seq: leading run -> '~', everything after that
   * -> '.', then the trailing run is overwritten with '~' */
  for (idx = 0; idx < msa->nseq; idx++)
    {
      for (s = gcg_aseq[idx]; *s != '\0' && isgap(*s); s++)
        *s = '~';
      for (; *s != '\0'; s++)
        if (isgap(*s)) *s = '.';
      for (pos = msa->alen-1; pos > 0 && isgap(gcg_aseq[idx][pos]); pos--)
        gcg_aseq[idx][pos] = '~';
    }

  /* calculate max namelen used */
  namelen = 0;
  for (idx = 0; idx < msa->nseq; idx++)
    if ((len = strlen(msa->sqname[idx])) > namelen)
      namelen = len;

  /*****************************************************
   * Write the MSF header
   *****************************************************/
  /* required file type line */
  if (msa->type == kOtherSeq)
    msa->type = GuessAlignmentSeqtype(msa->aseq, msa->nseq);

  if      (msa->type == kRNA)   fprintf(fp, "!!NA_MULTIPLE_ALIGNMENT 1.0\n");
  else if (msa->type == kDNA)   fprintf(fp, "!!NA_MULTIPLE_ALIGNMENT 1.0\n");
  else if (msa->type == kAmino) fprintf(fp, "!!AA_MULTIPLE_ALIGNMENT 1.0\n");
  else if (msa->type == kOtherSeq)
    Die("WriteMSF(): couldn't guess whether that alignment is RNA or protein.\n");
  else
    Die("Invalid sequence type %d in WriteMSF()\n", msa->type);

  /* free text comments */
  if (msa->ncomment > 0)
    {
      for (idx = 0; idx < msa->ncomment; idx++)
        fprintf(fp, "%s\n", msa->comment[idx]);
      fprintf(fp, "\n");
    }

  /* required checksum line */
  now = time(NULL);
  if (strftime(date, 64, "%B %d, %Y %H:%M", localtime(&now)) == 0)
    Die("What time is it on earth? strftime() failed in WriteMSF().\n");
  /* Type must agree with the file type line above: both RNA and DNA are
   * nucleic acid ('N'); only protein is 'P'. DNA used to be written as 'P'. */
  fprintf(fp, " %s MSF: %d Type: %c %s Check: %d ..\n",
          msa->name != NULL ? msa->name : "squid.msf",
          msa->alen,
          (msa->type == kRNA || msa->type == kDNA) ? 'N' : 'P',
          date,
          GCGMultchecksum(gcg_aseq, msa->nseq));
  fprintf(fp, "\n");

  /*****************************************************
   * Names/weights section
   *****************************************************/

  for (idx = 0; idx < msa->nseq; idx++)
    {
      fprintf(fp, " Name: %-*.*s Len: %5d Check: %4d Weight: %.2f\n",
              namelen, namelen,
              gcg_sqname[idx],
              msa->alen,
              GCGchecksum(gcg_aseq[idx], msa->alen),
              msa->wgt[idx]);
    }
  fprintf(fp, "\n");
  fprintf(fp, "//\n");

  /*****************************************************
   * Write the sequences
   *****************************************************/

  for (pos = 0; pos < msa->alen; pos += 50)
    {
      fprintf(fp, "\n");        /* Blank line between sequence blocks */

      /* Coordinate line */
      len = (pos + 50) > msa->alen ? msa->alen - pos : 50;
      if (len > 10)
        fprintf(fp, "%*s %-6d%*s%6d\n", namelen, "",
                pos+1,
                len + ((len-1)/10) - 12, "",
                pos + len);
      else
        fprintf(fp, "%*s %-6d\n", namelen, "", pos+1);

      for (idx = 0; idx < msa->nseq; idx++)
        {
          fprintf(fp, "%-*s ", namelen, gcg_sqname[idx]);
          /* get next line's worth of 50 from seq; strncpy() does not
           * terminate when it copies a full 50, hence the explicit NUL */
          strncpy(buffer, gcg_aseq[idx] + pos, 50);
          buffer[50] = '\0';
          /* draw the sequence line in groups of ten */
          for (i = 0; i < len; i++)
            {
              if (! (i % 10)) fputc(' ', fp);
              fputc(buffer[i], fp);
            }
          fputc('\n', fp);
        }
    }

  Free2DArray((void **) gcg_aseq,   msa->nseq);
  Free2DArray((void **) gcg_sqname, msa->nseq);
  return;
}
/* Function: Cluster() * * Purpose: Cluster analysis on a distance matrix. Constructs a * phylogenetic tree which contains the topology * and info for each node: branch lengths, how many * sequences are included under the node, and which * sequences are included under the node. * * Args: dmx - the NxN distance matrix ( >= 0.0, larger means more diverged) * N - size of mx (number of sequences) * mode - CLUSTER_MEAN, CLUSTER_MAX, or CLUSTER_MIN * ret_tree- RETURN: the tree * * Return: 1 on success, 0 on failure. * The caller is responsible for freeing the tree's memory, * by calling FreePhylo(tree, N). */ int Cluster(float **dmx, int N, enum clust_strategy mode, struct phylo_s **ret_tree) { struct phylo_s *tree; /* (0..N-2) phylogenetic tree */ float **mx; /* copy of difference matrix */ int *coord; /* (0..N-1), indices for matrix coords */ int i=0, j=0; /* coords of minimum difference */ int idx; /* counter over seqs */ int Np; /* N', a working copy of N */ int row, col; /* loop variables */ float min; /* best minimum score found */ float *trow; /* tmp pointer for swapping rows */ float tcol; /* tmp storage for swapping cols */ float *diff=NULL; /* (0..N-2) difference scores at nodes */ int swapfoo; /* for SWAP() macro */ /************************** * Initializations. **************************/ /* We destroy the matrix we work on, so make a copy of dmx. 
*/ if ((mx = (float **) malloc (sizeof(float *) * N)) == NULL) Die("malloc failed"); for (i = 0; i < N; i++) { if ((mx[i] = (float *) malloc (sizeof(float) * N)) == NULL) Die("malloc failed"); for (j = 0; j < N; j++) mx[i][j] = dmx[i][j]; } /* coord array alloc, (0..N-1) */ if ((coord = (int *) malloc (N * sizeof(int))) == NULL || (diff = (float *) malloc ((N-1) * sizeof(float))) == NULL) Die("malloc failed"); /* init the coord array to 0..N-1 */ for (col = 0; col < N; col++) coord[col] = col; for (i = 0; i < N-1; i++) diff[i] = 0.0; /* tree array alloc, (0..N-2) */ if ((tree = AllocPhylo(N)) == NULL) Die("AllocPhylo() failed"); /********************************* * Process the difference matrix *********************************/ /* N-prime, for an NxN down to a 2x2 diffmx */ for (Np = N; Np >= 2; Np--) { /* find a minimum on the N'xN' matrix*/ min = 999999.; for (row = 0; row < Np; row++) for (col = row+1; col < Np; col++) if (mx[row][col] < min) { min = mx[row][col]; i = row; j = col; } /* We're clustering row i with col j. 
write necessary * data into a node on the tree */ /* topology info */ tree[Np-2].left = coord[i]; tree[Np-2].right = coord[j]; if (coord[i] >= N) tree[coord[i]-N].parent = N + Np - 2; if (coord[j] >= N) tree[coord[j]-N].parent = N + Np - 2; /* keep score info */ diff[Np-2] = tree[Np-2].diff = min; /* way-simple branch length estimation */ tree[Np-2].lblen = tree[Np-2].rblen = min; if (coord[i] >= N) tree[Np-2].lblen -= diff[coord[i]-N]; if (coord[j] >= N) tree[Np-2].rblen -= diff[coord[j]-N]; /* number seqs included at node */ if (coord[i] < N) { tree[Np-2].incnum ++; tree[Np-2].is_in[coord[i]] = 1; } else { tree[Np-2].incnum += tree[coord[i]-N].incnum; for (idx = 0; idx < N; idx++) tree[Np-2].is_in[idx] |= tree[coord[i]-N].is_in[idx]; } if (coord[j] < N) { tree[Np-2].incnum ++; tree[Np-2].is_in[coord[j]] = 1; } else { tree[Np-2].incnum += tree[coord[j]-N].incnum; for (idx = 0; idx < N; idx++) tree[Np-2].is_in[idx] |= tree[coord[j]-N].is_in[idx]; } /* Now build a new matrix, by merging row i with row j and * column i with column j; see Fitch and Margoliash */ /* Row and column swapping. 
*/ /* watch out for swapping i, j away: */ if (i == Np-1 || j == Np-2) SWAP(i,j); if (i != Np-2) { /* swap row i, row N'-2 */ trow = mx[Np-2]; mx[Np-2] = mx[i]; mx[i] = trow; /* swap col i, col N'-2 */ for (row = 0; row < Np; row++) { tcol = mx[row][Np-2]; mx[row][Np-2] = mx[row][i]; mx[row][i] = tcol; } /* swap coord i, coord N'-2 */ SWAP(coord[i], coord[Np-2]); } if (j != Np-1) { /* swap row j, row N'-1 */ trow = mx[Np-1]; mx[Np-1] = mx[j]; mx[j] = trow; /* swap col j, col N'-1 */ for (row = 0; row < Np; row++) { tcol = mx[row][Np-1]; mx[row][Np-1] = mx[row][j]; mx[row][j] = tcol; } /* swap coord j, coord N'-1 */ SWAP(coord[j], coord[Np-1]); } /* average i and j together; they're now at Np-2 and Np-1 though */ i = Np-2; j = Np-1; /* merge by saving avg of cols of row i and row j */ for (col = 0; col < Np; col++) { switch (mode) { case CLUSTER_MEAN: mx[i][col] =(mx[i][col]+ mx[j][col]) / 2.0; break; case CLUSTER_MIN: mx[i][col] = MIN(mx[i][col], mx[j][col]); break; case CLUSTER_MAX: mx[i][col] = MAX(mx[i][col], mx[j][col]); break; default: mx[i][col] =(mx[i][col]+ mx[j][col]) / 2.0; break; } } /* copy those rows to columns */ for (col = 0; col < Np; col++) mx[col][i] = mx[i][col]; /* store the node index in coords */ coord[Np-2] = Np+N-2; } /************************** * Garbage collection and return **************************/ Free2DArray(mx, N); free(coord); free(diff); *ret_tree = tree; return 1; }