/* Function: PAMPrior() * * Purpose: Produces an ad hoc "Dirichlet mixture" prior for * match emissions, using a PAM matrix. * * Side effect notice: PAMPrior() replaces the match * emission section of an existing Dirichlet prior, * which is /expected/ to be a simple one-component * kind of prior. The insert emissions /must/ be a * one-component prior (because of details in how * PriorifyEmissionVector() is done). However, * the transitions /could/ be a mixture Dirichlet prior * without causing problems. In other words, the * -p and -P options of hmmb can coexist, but there * may be conflicts. PAMPrior() checks for these, * so there's no serious problem, except that the * error message from PAMPrior() might be confusing to * a user. */ void PAMPrior(char *pamfile, struct p7prior_s *pri, float wt) { FILE *fp; char *blastpamfile; /* BLAST looks in aa/ subdirectory of BLASTMAT */ int **pam; float scale; int xi, xj; int idx1, idx2; if (Alphabet_type != hmmAMINO) Die("PAM prior is only valid for protein sequences"); if (pri->strategy != PRI_DCHLET) Die("PAM prior may only be applied over an existing Dirichlet prior"); if (pri->inum != 1) Die("PAM prior requires that the insert emissions be a single Dirichlet"); if (MAXDCHLET < 20) Die("Whoa, code is misconfigured; MAXDCHLET must be >= 20 for PAM prior"); blastpamfile = FileConcat("aa", pamfile); if ((fp = fopen(pamfile, "r")) == NULL && (fp = EnvFileOpen(pamfile, "BLASTMAT", NULL)) == NULL && (fp = EnvFileOpen(blastpamfile, "BLASTMAT", NULL)) == NULL) Die("Failed to open PAM scoring matrix file %s", pamfile); if (! ParsePAMFile(fp, &pam, &scale)) Die("Failed to parse PAM scoring matrix file %s", pamfile); fclose(fp); free(blastpamfile); pri->strategy = PRI_PAM; pri->mnum = 20; /* Convert PAM entries back to conditional prob's P(xj | xi), * which we'll use as "pseudocounts" weighted by wt. */ for (xi = 0; xi < Alphabet_size; xi++) for (xj = 0; xj < Alphabet_size; xj++) { idx1 = Alphabet[xi] - 'A'; idx2 = Alphabet[xj] - 'A'; pri->m[xi][xj] = aafq[xj] * exp((float) pam[idx1][idx2] * scale); } /* Normalize so that rows add up to wt. * i.e. Sum(xj) mat[xi][xj] = wt for every row xi */ for (xi = 0; xi < Alphabet_size; xi++) { pri->mq[xi] = 1. / Alphabet_size; FNorm(pri->m[xi], Alphabet_size); FScale(pri->m[xi], Alphabet_size, wt); } Free2DArray((void **)pam,27); }
main (int argc, char ** argv ) { char *seqfile; /* name of sequence file */ SQINFO sqinfo; /* extra info about sequence */ SQFILE *dbfp; /* open sequence file */ int fmt,ofmt=106; /* format of seqfile */ /* 106 is PHYLIP format in SQUID */ char *seq; /* sequence */ int type; /* kAmino, kDNA, kRNA, or kOtherSeq */ sequence * seqs, * cds_seqs; sequence tmp_seqs[2], tmp_cds_seqs[2]; char *optname; char *optarg, *t; int optind; int be_quiet; int seqct = 0,cdsct = 0; int min_aln_len = 0; int do_oneline = 0; char * output_filename = 0, *submat_file = 0; int showaln = 1; int showheader=1; FILE *ofd, *fd; alignment *cds_aln; alignment * opt_alignment = NULL; /* place for pairwise alignment */ int len,i,j, k, jk,ik,aln_count, rc; pairwise_distances pwMLdist, pwNGdist; int firsttime = 1; struct timeval tp; pwMLdist.N = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwMLdist.dN = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwMLdist.S = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwMLdist.dS = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwMLdist.dNdS = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwMLdist.SEdS = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwMLdist.SEdN = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwMLdist.t = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwMLdist.kappa= make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwNGdist.dN = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwNGdist.dS = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); pwNGdist.dNdS = make_double_matrix(NUM_PW_SEQS,NUM_PW_SEQS); /* pwMLdist.N = pwMLdist.dN = pwMLdist.S = 0; pwMLdist.dS = pwMLdist.dNdS = pwMLdist.SEdS = 0; pwMLdist.SEdN = pwMLdist.t = pwMLdist.kappa= 0; pwNGdist.dN = pwNGdist.dS = pwNGdist.dNdS = 0; */ Alntype = default_aln_type; /* Command line Parse */ fmt = SQFILE_UNKNOWN; /* default: autodetect format */ be_quiet = FALSE; type = kOtherSeq; /* for our purposes this is only pairwise alignments, but * would rather do it correctly in case we move to MSA case */ while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, &optind, &optname, &optarg)) { if (strcmp(optname, "--matrix") == 0) submat_file = optarg; else if (strcmp(optname, "--quiet") == 0) be_quiet = TRUE; else if (strcmp(optname, "--gapopen") == 0) { Gapopen = atoi(optarg); if( Gapopen < 0 ) Gapopen *= -1; } else if (strcmp(optname, "--gapext") == 0) { Gapext = atoi(optarg); if( Gapext < 0 ) Gapext *= -1; } else if (strcmp(optname, "--informat") == 0) { fmt = String2SeqfileFormat(optarg); if (fmt == SQFILE_UNKNOWN) Die("unrecognized sequence file format \"%s\"", optarg); } else if (strcmp(optname, "--outformat") == 0) { ofmt = String2SeqfileFormat(optarg); if (ofmt == SQFILE_UNKNOWN) Die("unrecognized sequence file format \"%s\"", optarg); } else if( strcmp(optname, "--global") == 0 ) { Alntype = global; } else if (strcmp(optname, "-h") == 0) { puts(usage); puts(experts); exit(EXIT_SUCCESS); } else if ( strcmp(optname, "-v") == 0 ) { Verbose = 1; } else if ( strcmp(optname, "--gapchar") == 0 ) { GapChar = optarg[0]; } else if( strcmp(optname, "--output") == 0 ) { output_filename = optarg; } else if( strcmp(optname, "--showtable" ) == 0 ) { showaln = 0; } else if( strcmp(optname, "--noheader" ) == 0 ) { showheader = 0; } } if (argc - optind < 1) Die("%s\n", usage); if( ! submat_file ) { if( (t = getenv("SUBOPTDIR")) != 0 || (t = getenv("SUBOPT_DIR")) != 0 ) { submat_file = calloc(strlen(t) + 24, sizeof(char)); sprintf(submat_file, "%s/%s",t,Default_submat); } else { submat_file = calloc(strlen((void *)Default_submat) + 24, sizeof(char)); sprintf(submat_file, "../%s",Default_submat); } } /* open matrix */ fd = fopen(submat_file, "r"); if( ! ParsePAMFile(fd,&ScoringMatrix, &MatrixScale) ) { fprintf(stderr, "Cannot parse or open matrix file %s\n",submat_file); free(submat_file); exit(EXIT_SUCCESS); } if( output_filename && strlen(output_filename) != 1 && output_filename[0] != '-') { ofd = fopen(output_filename,"w"); if( ! ofd ) { fprintf(stderr, "could not open file %s",output_filename); goto end; } } else ofd = stdout; while( optind < argc ) { seqfile = argv[optind++]; /* Try to work around inability to autodetect from a pipe or .gz: * assume FASTA format */ if (fmt == SQFILE_UNKNOWN && (Strparse("^.*\\.gz$", seqfile, 0) || strcmp(seqfile, "-") == 0)) fmt = SQFILE_FASTA; if ((dbfp = SeqfileOpen(seqfile, fmt, NULL)) == NULL) Die("Failed to open sequence file %s for reading", seqfile); while (ReadSeq(dbfp, dbfp->format, &seq, &sqinfo)) { FreeSequence(NULL, &sqinfo); seqct++; } cds_seqs = (sequence *)calloc(seqct, sizeof(sequence)); seqs = (sequence *)calloc(seqct, sizeof(sequence)); SeqfileRewind(dbfp); seqct=0; while (ReadSeq(dbfp, dbfp->format, &seq, &sqinfo)) { sqinfo.type = Seqtype(seq); if( sqinfo.type == kDNA || sqinfo.type == kRNA ) { seqs[seqct].seqstr = Translate(seq,stdcode1); /* Let's remove the last codon if it is a stop codon */ len = strlen(seqs[seqct].seqstr); if( Verbose ) fprintf(stderr,"seqct is %d length is %d\n",seqct, len); if( seqs[seqct].seqstr[len-1] == '*' ) { seqs[seqct].seqstr[len-1] = '\0'; seq[strlen(seq) - 3] = '\0'; } cds_seqs[cdsct].seqstr = seq; seqs[seqct].seqname = calloc(strlen(sqinfo.name)+1,sizeof(char)); cds_seqs[cdsct].seqname = calloc(strlen(sqinfo.name)+1,sizeof(char)); strcpy(seqs[seqct].seqname,sqinfo.name ); strcpy(cds_seqs[cdsct].seqname,sqinfo.name); cds_seqs[cdsct].length = sqinfo.len; cds_seqs[cdsct].alphabet = ( sqinfo.type == kDNA ) ? dna : rna; seqs[seqct].length = strlen(seqs[seqct].seqstr); seqs[seqct].alphabet = protein; cdsct++; seqct++; } else { fprintf(stderr,"Expect CDS sequences (DNA or RNA) not Protein\n"); goto end; } FreeSequence(NULL, &sqinfo); if( Verbose && seqct > 3 ) break; } if( seqct < 2 ) { fprintf(stderr,"Must have provided a valid file with at least 2 sequences in it"); goto end; } for( i=0; i < seqct; i++ ) { for(k=i+1; k < seqct; k++ ) { if( (opt_alignment = (alignment *)calloc(1,sizeof(alignment *))) == NULL) { fprintf(stderr,"Could not allocate memory\n"); goto end; } opt_alignment->msa = NULL; rc = optimal_align(&seqs[i],&seqs[k],opt_alignment); if( rc != 1 ) { fprintf(stderr,"Could not make an optimal alignment\n"); goto end; } else { tmp_cds_seqs[0] = cds_seqs[i]; tmp_cds_seqs[1] = cds_seqs[k]; rc = mrtrans(opt_alignment, tmp_cds_seqs, &cds_aln,0); if( rc != 0 ) { fprintf(stderr, "Could not map the coding sequence to the protein alignemnt for aln %d: %d\n",i,rc); goto end; } if( showaln ) { if( ofmt >= 100 ) { MSAFileWrite(ofd,cds_aln->msa, ofmt,do_oneline); } else { for(j=0; j < cds_aln->msa->nseq; j++ ) { WriteSeq(ofd, ofmt, cds_aln->msa->aseq[j], &(cds_aln->sqinfo[j]) ); } } } else { if( showheader && firsttime ) { fprintf(ofd,"SEQ1\tSEQ2\tSCORE\tdN\tdS\tOMEGA\tN\tS\tkappa\tt\tLENGTH\n"); firsttime = 0; } if( do_kaks_yn00(cds_aln->msa, &pwMLdist,&pwNGdist) < 0 ) { fprintf(stderr, "warning: problem with align for %s %s\n", cds_aln->msa->sqname[0], cds_aln->msa->sqname[1]); continue; } for(ik = 0; ik < NUM_PW_SEQS; ik++ ) { for( jk = ik+1; jk < NUM_PW_SEQS; jk++ ) { fprintf(ofd,"%s\t%s\t%d\t%f\t%f\t%f\t%f\t%f\t%f\t%f\t%d\n", cds_aln->sqinfo[ik].name, cds_aln->sqinfo[jk].name, opt_alignment->score, pwMLdist.dN[ik][jk],pwMLdist.dS[ik][jk], pwMLdist.dNdS[ik][jk], pwMLdist.N[ik][jk], pwMLdist.S[ik][jk], pwMLdist.kappa[ik][jk], pwMLdist.t[ik][jk], opt_alignment->msa->alen); } } } } cleanup_alignment(cds_aln); cleanup_alignment(opt_alignment); } } } if( ofd && ofd != stdout ) fclose(ofd); end: free(submat_file); Free2DArray((void **)ScoringMatrix,27); for(i =0; i< seqct; i++ ) { free(seqs[i].seqstr); free(seqs[i].seqname); seqs[i].seqstr = seqs[i].seqname = 0; } for(i = 0; i < cdsct; i++) { free(cds_seqs[i].seqstr); free(cds_seqs[i].seqname); cds_seqs[i].seqstr = cds_seqs[i].seqname = 0; } cleanup_matrix((void **)pwMLdist.N,NUM_PW_SEQS); cleanup_matrix((void **)pwMLdist.dN,NUM_PW_SEQS); cleanup_matrix((void **)pwMLdist.S,NUM_PW_SEQS); cleanup_matrix((void **)pwMLdist.dS,NUM_PW_SEQS); cleanup_matrix((void **)pwMLdist.SEdS,NUM_PW_SEQS); cleanup_matrix((void **)pwMLdist.SEdN,NUM_PW_SEQS); cleanup_matrix((void **)pwMLdist.t,NUM_PW_SEQS); cleanup_matrix((void **)pwMLdist.dNdS,NUM_PW_SEQS); cleanup_matrix((void **)pwMLdist.kappa,NUM_PW_SEQS); cleanup_matrix((void **)pwNGdist.dN,NUM_PW_SEQS); cleanup_matrix((void **)pwNGdist.dS,NUM_PW_SEQS); cleanup_matrix((void **)pwNGdist.dNdS,NUM_PW_SEQS); free(pwNGdist.dNdS); free(pwNGdist.dN); free(pwNGdist.dS); free(pwMLdist.dNdS); free(pwMLdist.dN); free(pwMLdist.dS); free(pwMLdist.N); free(pwMLdist.S); free(pwMLdist.SEdS); free(pwMLdist.SEdN); free(pwMLdist.t); free(pwMLdist.kappa); return 0; }