///////////////////////////////////////////////////////////////////////////////////// // Do precalculations for q and t to prepare comparison ///////////////////////////////////////////////////////////////////////////////////// void PrepareTemplate(HMM& q, HMM& t, int format) { if (format==0) // HHM format { // Add transition pseudocounts to template t.AddTransitionPseudocounts(); // Don't use CS-pseudocounts because of runtime!!! // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a] t.PreparePseudocounts(); // Add amino acid pseudocounts to query: p[i][a] = (1-tau)*f[i][a] + tau*g[i][a] t.AddAminoAcidPseudocounts(par.pcm, par.pca, par.pcb, par.pcc); t.CalculateAminoAcidBackground(); } else // HHMER format { // Don't add transition pseudocounts to template // t.AddTransitionPseudocounts(par.gapd, par.gape, par.gapf, par.gapg, par.gaph, par.gapi, 0.0); // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a] // t.PreparePseudocounts(); // DON'T ADD amino acid pseudocounts to temlate: pcm=0! t.p[i][a] = t.f[i][a] t.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc); t.CalculateAminoAcidBackground(); } if (par.forward>=1) t.Log2LinTransitionProbs(1.0); // Factor Null model into HMM t // ATTENTION! t.p[i][a] is divided by pnul[a] (for reasons of efficiency) => do not reuse t.p t.IncludeNullModelInHMM(q,t); // Can go BEFORE the loop if not dependent on template return; }
// Read input file (HMM, HHM, or alignment format), and add pseudocounts etc. void ReadAndPrepare(char* infile, HMM& q, Alignment* qali=NULL) { char path[NAMELEN]; // Open query file and determine file type char line[LINELEN]=""; // input line FILE* inf=NULL; if (strcmp(infile,"stdin")) { inf = fopen(infile, "r"); if (!inf) OpenFileError(infile); Pathname(path,infile); } else { inf = stdin; if (v>=2) printf("Reading HMM / multiple alignment from standard input ...\n(To get a help list instead, quit and type %s -h.)\n",program_name); *path='\0'; } fgetline(line,LINELEN-1,inf); // Is it an hhm file? if (!strncmp(line,"NAME",4) || !strncmp(line,"HH",2)) { if (v>=2) cout<<"Query file is in HHM format\n"; // Rewind to beginning of line and read query hhm file rewind(inf); q.Read(inf,path); if (v>=2 && q.Neff_HMM>11.0) fprintf(stderr,"WARNING: HMM %s looks too diverse (Neff=%.1f>11). Better check the underlying alignment... \n",q.name,q.Neff_HMM); // Add transition pseudocounts to query -> q.p[i][a] q.AddTransitionPseudocounts(); if (!*par.clusterfile) { //compute context-specific pseudocounts? // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a] q.PreparePseudocounts(); // Add amino acid pseudocounts to query: q.p[i][a] = (1-tau)*f[i][a] + tau*g[i][a] q.AddAminoAcidPseudocounts(par.pcm, par.pca, par.pcb, par.pcc);; } else { // Add context specific pseudocount to query q.AddContextSpecificPseudocounts(par.pcm); } q.CalculateAminoAcidBackground(); } // ... or is it an a2m/a3m alignment file else if (line[0]=='#' || line[0]=='>') { Alignment* pali; if (qali==NULL) pali=new(Alignment); else pali=qali; if (par.calibrate) { printf("\nError in %s: only HHM files can be calibrated.\n",program_name); printf("Build an HHM file from your alignment with 'hhmake -i %s' and rerun hhsearch with the hhm file\n\n",infile); exit(1); } if (v>=2 && strcmp(infile,"stdin")) cout<<infile<<" is in A2M, A3M or FASTA format\n"; // Read alignment from infile into matrix X[k][l] as ASCII (and supply first line as extra argument) pali->Read(inf,infile,line); // Convert ASCII to int (0-20),throw out all insert states, record their number in I[k][i] // and store marked sequences in name[k] and seq[k] pali->Compress(infile); // Sort out the nseqdis most dissimilar sequences for display in the output alignments pali->FilterForDisplay(par.max_seqid,par.coverage,par.qid,par.qsc,par.nseqdis); // Remove sequences with seq. identity larger than seqid percent (remove the shorter of two) pali->N_filtered = pali->Filter(par.max_seqid,par.coverage,par.qid,par.qsc,par.Ndiff); if (par.Neff>=0.999) pali->FilterNeff(); // Calculate pos-specific weights, AA frequencies and transitions -> f[i][a], tr[i][a] pali->FrequenciesAndTransitions(q); if (v>=2 && q.Neff_HMM>11.0) fprintf(stderr,"WARNING: alignment %s looks too diverse (Neff=%.1f>11). Better check it with an alignment viewer... \n",q.name,q.Neff_HMM); // Add transition pseudocounts to query -> p[i][a] q.AddTransitionPseudocounts(); if (!*par.clusterfile) { //compute context-specific pseudocounts? // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a] q.PreparePseudocounts(); // Add amino acid pseudocounts to query: p[i][a] = (1-tau)*f[i][a] + tau*g[i][a] q.AddAminoAcidPseudocounts(par.pcm, par.pca, par.pcb, par.pcc); } else { // Add context specific pseudocount to query q.AddContextSpecificPseudocounts(par.pcm); } q.CalculateAminoAcidBackground(); if (qali==NULL) delete(pali); } else if (!strncmp(line,"HMMER",5)) { /////////////////////////////////////////////////////////////////////////////////////// // Don't allow HMMER format as input due to the severe loss of sensitivity!!!! (only allowed in HHmake) if (strncmp(program_name,"hhmake",6)) { cerr<<endl<<"Error in "<<program_name<<": HMMER format not allowed as input due to the severe loss of sensitivity!\n"; exit(1); } // Is infile a HMMER3 file? if (!strncmp(line,"HMMER3",6)) { if (v>=2) cout<<"Query file is in HMMER3 format\n"; // Read 'query HMMER file rewind(inf); q.ReadHMMer3(inf,path); // Don't add transition pseudocounts to query!! // DON'T ADD amino acid pseudocounts to query: pcm=0! q.p[i][a] = f[i][a] q.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc); q.CalculateAminoAcidBackground(); } // ... or is infile an old HMMER file? else if (!strncmp(line,"HMMER",5)) { if (v>=2) cout<<"Query file is in HMMER format\n"; // Read 'query HMMER file rewind(inf); q.ReadHMMer(inf,path); // DON'T ADD amino acid pseudocounts to query: pcm=0! q.p[i][a] = f[i][a] q.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc); q.CalculateAminoAcidBackground(); } } else { cerr<<endl<<"Error in "<<program_name<<": unrecognized input file format in \'"<<infile<<"\'\n"; cerr<<"line = "<<line<<"\n"; exit(1); } fclose(inf); if (par.addss==1) CalculateSS(q); if (par.columnscore == 5 && !q.divided_by_local_bg_freqs) q.DivideBySqrtOfLocalBackgroundFreqs(par.half_window_size_local_aa_bg_freqs); if (par.forward>=1) q.Log2LinTransitionProbs(1.0); return; }
///////////////////////////////////////////////////////////////////////////////////// //// MAIN PROGRAM ///////////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { char* argv_conf[MAXOPT]; // Input arguments from .hhdefaults file (first=1: argv_conf[0] is not used) int argc_conf; // Number of arguments in argv_conf strcpy(par.infile, ""); strcpy(par.outfile, ""); strcpy(par.alnfile, ""); //Default parameter settings par.nseqdis = MAXSEQ - 1; // maximum number of sequences to be written par.showcons = 0; par.cons = 1; par.Ndiff = 0; par.max_seqid = 100; par.coverage = 0; par.pc_hhm_context_engine.pca = 0.0; // no amino acid pseudocounts par.pc_hhm_nocontext_a = 0.0; // no amino acid pseudocounts par.gapb = 0.0; // no transition pseudocounts // Make command line input globally available par.argv = argv; par.argc = argc; RemovePathAndExtension(program_name, argv[0]); // Enable changing verbose mode before defaults file and command line are processed int v = 2; for (int i = 1; i < argc; i++) { if (!strcmp(argv[i], "-def")) par.readdefaultsfile = 1; else if (strcmp(argv[i], "-v") == 0) { v = atoi(argv[i + 1]); } } par.v = Log::from_int(v); Log::reporting_level() = par.v; par.SetDefaultPaths(); // Read .hhdefaults file? if (par.readdefaultsfile) { // Process default otpions from .hhconfig file ReadDefaultsFile(argc_conf, argv_conf); ProcessArguments(argc_conf, argv_conf); } // Process command line options (they override defaults from .hhdefaults file) ProcessArguments(argc, argv); Alignment* qali = new Alignment(MAXSEQ, par.maxres); HMM* q = new HMM(MAXSEQDIS, par.maxres); //Create a HMM with maximum of par.maxres match states // q is only available after maxres is known, so we had to move this here for (int i = 1; i <= argc - 1; i++) { if (!strcmp(argv[i], "-name") && (i < argc - 1)) { strmcpy(q->name, argv[++i], NAMELEN - 1); //copy longname to name... strmcpy(q->longname, argv[i], DESCLEN - 1); //copy full name to longname } } // Check command line input and default values if (!*par.infile) { help(); HH_LOG(ERROR) << "Input file is missing!" << std::endl; exit(4); } // Get basename RemoveExtension(q->file, par.infile); //Get basename of infile (w/o extension): // Outfile not given? Name it basename.hhm if (!*par.outfile && !*par.alnfile) { RemoveExtension(par.outfile, par.infile); strcat(par.outfile, ".seq"); } // Prepare CS pseudocounts lib if (!par.nocontxt && *par.clusterfile) { InitializePseudocountsEngine(par, context_lib, crf, pc_hhm_context_engine, pc_hhm_context_mode, pc_prefilter_context_engine, pc_prefilter_context_mode); } // Set substitution matrix; adjust to query aa distribution if par.pcm==3 SetSubstitutionMatrix(par.matrix, pb, P, R, S, Sim); // Read input file (HMM, HHM, or alignment format), and add pseudocounts etc. char input_format = 0; ReadQueryFile(par, par.infile, input_format, par.wg, q, qali, pb, S, Sim); // Same code as in PrepareQueryHMM(par.infile,input_format,q,qali), except that we add SS prediction // Add Pseudocounts, if no HMMER input if (input_format == 0) { // Transform transition freqs to lin space if not already done q->AddTransitionPseudocounts(par.gapd, par.gape, par.gapf, par.gapg, par.gaph, par.gapi, par.gapb, par.gapb); // Comput substitution matrix pseudocounts if (par.nocontxt) { // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a] q->PreparePseudocounts(R); // Add amino acid pseudocounts to query: p[i][a] = (1-tau)*f[i][a] + tau*g[i][a] q->AddAminoAcidPseudocounts(par.pc_hhm_nocontext_mode, par.pc_hhm_nocontext_a, par.pc_hhm_nocontext_b, par.pc_hhm_nocontext_c); } else { // Add full context specific pseudocounts to query q->AddContextSpecificPseudocounts(pc_hhm_context_engine, pc_hhm_context_mode); } } else { q->AddAminoAcidPseudocounts(0, par.pc_hhm_nocontext_a, par.pc_hhm_nocontext_b, par.pc_hhm_nocontext_c); } q->CalculateAminoAcidBackground(pb); if (par.columnscore == 5 && !q->divided_by_local_bg_freqs) q->DivideBySqrtOfLocalBackgroundFreqs( par.half_window_size_local_aa_bg_freqs, pb); // Write consensus sequence to sequence file // Consensus sequence is calculated in hhalignment.C, Alignment::FrequenciesAndTransitions() if (*par.outfile) { FILE* outf = NULL; if (strcmp(par.outfile, "stdout")) { outf = fopen(par.outfile, "a"); if (!outf) OpenFileError(par.outfile, __FILE__, __LINE__, __func__); } else outf = stdout; // OLD //// ">name_consensus" -> ">name consensus" //strsubst(q->sname[q->nfirst],"_consensus"," consensus"); //fprintf(outf,">%s\n%s\n",q->sname[q->nfirst],q->seq[q->nfirst]+1); // NEW (long header needed for NR30cons database) fprintf(outf, ">%s\n%s\n", q->longname, q->seq[q->nfirst] + 1); fclose(outf); } // Print A3M/A2M/FASTA output alignment if (*par.alnfile) { HalfAlignment qa; int n = imin(q->n_display, par.nseqdis + (q->nss_dssp >= 0) + (q->nss_pred >= 0) + (q->nss_conf >= 0) + (q->ncons >= 0)); qa.Set(q->name, q->seq, q->sname, n, q->L, q->nss_dssp, q->nss_pred, q->nss_conf, q->nsa_dssp, q->ncons); if (par.outformat == 1) qa.BuildFASTA(); else if (par.outformat == 2) qa.BuildA2M(); else if (par.outformat == 3) qa.BuildA3M(); if (qali->readCommentLine) qa.Print(par.alnfile, par.append, qali->longname); // print alignment to outfile else qa.Print(par.alnfile, par.append); // print alignment to outfile } delete qali; delete q; DeletePseudocountsEngine(context_lib, crf, pc_hhm_context_engine, pc_hhm_context_mode, pc_prefilter_context_engine, pc_prefilter_context_mode); }