// Read input file (HMM, HHM, or alignment format), and add pseudocounts etc. void ReadAndPrepare(char* infile, HMM& q, Alignment* qali=NULL) { char path[NAMELEN]; // Open query file and determine file type char line[LINELEN]=""; // input line FILE* inf=NULL; if (strcmp(infile,"stdin")) { inf = fopen(infile, "r"); if (!inf) OpenFileError(infile); Pathname(path,infile); } else { inf = stdin; if (v>=2) printf("Reading HMM / multiple alignment from standard input ...\n(To get a help list instead, quit and type %s -h.)\n",program_name); *path='\0'; } fgetline(line,LINELEN-1,inf); // Is it an hhm file? if (!strncmp(line,"NAME",4) || !strncmp(line,"HH",2)) { if (v>=2) cout<<"Query file is in HHM format\n"; // Rewind to beginning of line and read query hhm file rewind(inf); q.Read(inf,path); if (v>=2 && q.Neff_HMM>11.0) fprintf(stderr,"WARNING: HMM %s looks too diverse (Neff=%.1f>11). Better check the underlying alignment... \n",q.name,q.Neff_HMM); // Add transition pseudocounts to query -> q.p[i][a] q.AddTransitionPseudocounts(); if (!*par.clusterfile) { //compute context-specific pseudocounts? // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a] q.PreparePseudocounts(); // Add amino acid pseudocounts to query: q.p[i][a] = (1-tau)*f[i][a] + tau*g[i][a] q.AddAminoAcidPseudocounts(par.pcm, par.pca, par.pcb, par.pcc);; } else { // Add context specific pseudocount to query q.AddContextSpecificPseudocounts(par.pcm); } q.CalculateAminoAcidBackground(); } // ... or is it an a2m/a3m alignment file else if (line[0]=='#' || line[0]=='>') { Alignment* pali; if (qali==NULL) pali=new(Alignment); else pali=qali; if (par.calibrate) { printf("\nError in %s: only HHM files can be calibrated.\n",program_name); printf("Build an HHM file from your alignment with 'hhmake -i %s' and rerun hhsearch with the hhm file\n\n",infile); exit(1); } if (v>=2 && strcmp(infile,"stdin")) cout<<infile<<" is in A2M, A3M or FASTA format\n"; // Read alignment from infile into matrix X[k][l] as ASCII (and supply first line as extra argument) pali->Read(inf,infile,line); // Convert ASCII to int (0-20),throw out all insert states, record their number in I[k][i] // and store marked sequences in name[k] and seq[k] pali->Compress(infile); // Sort out the nseqdis most dissimilar sequences for display in the output alignments pali->FilterForDisplay(par.max_seqid,par.coverage,par.qid,par.qsc,par.nseqdis); // Remove sequences with seq. identity larger than seqid percent (remove the shorter of two) pali->N_filtered = pali->Filter(par.max_seqid,par.coverage,par.qid,par.qsc,par.Ndiff); if (par.Neff>=0.999) pali->FilterNeff(); // Calculate pos-specific weights, AA frequencies and transitions -> f[i][a], tr[i][a] pali->FrequenciesAndTransitions(q); if (v>=2 && q.Neff_HMM>11.0) fprintf(stderr,"WARNING: alignment %s looks too diverse (Neff=%.1f>11). Better check it with an alignment viewer... \n",q.name,q.Neff_HMM); // Add transition pseudocounts to query -> p[i][a] q.AddTransitionPseudocounts(); if (!*par.clusterfile) { //compute context-specific pseudocounts? // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a] q.PreparePseudocounts(); // Add amino acid pseudocounts to query: p[i][a] = (1-tau)*f[i][a] + tau*g[i][a] q.AddAminoAcidPseudocounts(par.pcm, par.pca, par.pcb, par.pcc); } else { // Add context specific pseudocount to query q.AddContextSpecificPseudocounts(par.pcm); } q.CalculateAminoAcidBackground(); if (qali==NULL) delete(pali); } else if (!strncmp(line,"HMMER",5)) { /////////////////////////////////////////////////////////////////////////////////////// // Don't allow HMMER format as input due to the severe loss of sensitivity!!!! (only allowed in HHmake) if (strncmp(program_name,"hhmake",6)) { cerr<<endl<<"Error in "<<program_name<<": HMMER format not allowed as input due to the severe loss of sensitivity!\n"; exit(1); } // Is infile a HMMER3 file? if (!strncmp(line,"HMMER3",6)) { if (v>=2) cout<<"Query file is in HMMER3 format\n"; // Read 'query HMMER file rewind(inf); q.ReadHMMer3(inf,path); // Don't add transition pseudocounts to query!! // DON'T ADD amino acid pseudocounts to query: pcm=0! q.p[i][a] = f[i][a] q.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc); q.CalculateAminoAcidBackground(); } // ... or is infile an old HMMER file? else if (!strncmp(line,"HMMER",5)) { if (v>=2) cout<<"Query file is in HMMER format\n"; // Read 'query HMMER file rewind(inf); q.ReadHMMer(inf,path); // DON'T ADD amino acid pseudocounts to query: pcm=0! q.p[i][a] = f[i][a] q.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc); q.CalculateAminoAcidBackground(); } } else { cerr<<endl<<"Error in "<<program_name<<": unrecognized input file format in \'"<<infile<<"\'\n"; cerr<<"line = "<<line<<"\n"; exit(1); } fclose(inf); if (par.addss==1) CalculateSS(q); if (par.columnscore == 5 && !q.divided_by_local_bg_freqs) q.DivideBySqrtOfLocalBackgroundFreqs(par.half_window_size_local_aa_bg_freqs); if (par.forward>=1) q.Log2LinTransitionProbs(1.0); return; }
// Read input file (HMM, HHM, or alignment format), and add pseudocounts etc. void ReadInput(char* infile, HMM& q, Alignment* qali=NULL) { char path[NAMELEN]; // Open query file and determine file type char line[LINELEN]=""; // input line FILE* inf=NULL; if (strcmp(infile,"stdin")) { inf = fopen(infile, "r"); if (!inf) OpenFileError(infile); Pathname(path,infile); } else { inf = stdin; if (v>=2) printf("Reading HMM / multiple alignment from standard input ...\n(To get a help list instead, quit and type %s -h.)\n",program_name); *path='\0'; } fgetline(line,LINELEN-1,inf); // Is infile a HMMER3 file? if (!strncmp(line,"HMMER3",6)) { if (v>=2) cout<<"Query file is in HMMER3 format\n"; cerr<<"WARNING: Use of HMMER3 format as input will result in severe loss of sensitivity!\n"; // Read 'query HMMER file rewind(inf); q.ReadHMMer3(inf,path); } // ... or is infile an old HMMER file? else if (!strncmp(line,"HMMER",5)) { if (v>=2) cout<<"Query file is in HMMER format\n"; cerr<<"WARNING: Use of HMMER format as input will result in severe loss of sensitivity!\n"; // Read 'query HMMER file rewind(inf); q.ReadHMMer(inf,path); } // ... or is it an hhm file? else if (!strncmp(line,"NAME",4) || !strncmp(line,"HH",2)) { if (v>=2) cout<<"Query file is in HHM format\n"; // Rewind to beginning of line and read query hhm file rewind(inf); q.Read(inf,path); if (v>=2 && q.Neff_HMM>11.0) fprintf(stderr,"WARNING: HMM %s looks too diverse (Neff=%.1f>11). Better check the underlying alignment... \n",q.name,q.Neff_HMM); } // ... or is it an alignment file else { Alignment* pali; if (qali==NULL) pali=new(Alignment); else pali=qali; if (par.calibrate) { printf("\nError in %s: only HHM files can be calibrated.\n",program_name); printf("Build an HHM file from your alignment with 'hhmake -i %s' and rerun hhsearch with the hhm file\n\n",infile); exit(1); } if (v>=2 && strcmp(infile,"stdin")) cout<<infile<<" is in A2M, A3M or FASTA format\n"; // Read alignment from infile into matrix X[k][l] as ASCII (and supply first line as extra argument) pali->Read(inf,infile,line); // Convert ASCII to int (0-20),throw out all insert states, record their number in I[k][i] // and store marked sequences in name[k] and seq[k] pali->Compress(infile); // Sort out the nseqdis most dissimilar sequences for display in the output alignments pali->FilterForDisplay(par.max_seqid,par.coverage,par.qid,par.qsc,par.nseqdis); // Remove sequences with seq. identity larger than seqid percent (remove the shorter of two) pali->N_filtered = pali->Filter(par.max_seqid,par.coverage,par.qid,par.qsc,par.Ndiff); if (par.Neff>=0.999) pali->FilterNeff(); // Calculate pos-specific weights, AA frequencies and transitions -> f[i][a], tr[i][a] pali->FrequenciesAndTransitions(q); if (v>=2 && q.Neff_HMM>11.0) fprintf(stderr,"WARNING: alignment %s looks too diverse (Neff=%.1f>11). Better check it with an alignment viewer... \n",q.name,q.Neff_HMM); if (qali==NULL) delete(pali); } fclose(inf); return; }