Пример #1
0
// Read input file (HMM, HHM, or alignment format), and add pseudocounts etc.
void ReadAndPrepare(char* infile, HMM& q, Alignment* qali=NULL)
{
    char path[NAMELEN];

    // Open query file and determine file type
    char line[LINELEN]=""; // input line
    FILE* inf=NULL;
    if (strcmp(infile,"stdin"))
    {
        inf = fopen(infile, "r");
        if (!inf) OpenFileError(infile);
        Pathname(path,infile);
    }
    else
    {
        inf = stdin;
        if (v>=2) printf("Reading HMM / multiple alignment from standard input ...\n(To get a help list instead, quit and type %s -h.)\n",program_name);
        *path='\0';
    }

    fgetline(line,LINELEN-1,inf);

    // Is it an hhm file?
    if (!strncmp(line,"NAME",4) || !strncmp(line,"HH",2))
    {
        if (v>=2) cout<<"Query file is in HHM format\n";

        // Rewind to beginning of line and read query hhm file
        rewind(inf);
        q.Read(inf,path);
        if (v>=2 && q.Neff_HMM>11.0)
            fprintf(stderr,"WARNING: HMM %s looks too diverse (Neff=%.1f>11). Better check the underlying alignment... \n",q.name,q.Neff_HMM);

        // Add transition pseudocounts to query -> q.p[i][a]
        q.AddTransitionPseudocounts();

        if (!*par.clusterfile) { //compute context-specific pseudocounts?
	  // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
	  q.PreparePseudocounts();
	  // Add amino acid pseudocounts to query:  q.p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
	  q.AddAminoAcidPseudocounts(par.pcm, par.pca, par.pcb, par.pcc);;
        } else {
	  // Add context specific pseudocount to query
	  q.AddContextSpecificPseudocounts(par.pcm);
        }
        
        q.CalculateAminoAcidBackground();
    }

    // ... or is it an a2m/a3m alignment file
    else if (line[0]=='#' || line[0]=='>')
    {
        Alignment* pali;
        if (qali==NULL) pali=new(Alignment); else pali=qali;
        if (par.calibrate) {
            printf("\nError in %s: only HHM files can be calibrated.\n",program_name);
            printf("Build an HHM file from your alignment with 'hhmake -i %s' and rerun hhsearch with the hhm file\n\n",infile);
            exit(1);
        }

        if (v>=2 && strcmp(infile,"stdin")) cout<<infile<<" is in A2M, A3M or FASTA format\n";

        // Read alignment from infile into matrix X[k][l] as ASCII (and supply first line as extra argument)
        pali->Read(inf,infile,line);

        // Convert ASCII to int (0-20),throw out all insert states, record their number in I[k][i]
        // and store marked sequences in name[k] and seq[k]
        pali->Compress(infile);

        // Sort out the nseqdis most dissimilar sequences for display in the output alignments
        pali->FilterForDisplay(par.max_seqid,par.coverage,par.qid,par.qsc,par.nseqdis);

        // Remove sequences with seq. identity larger than seqid percent (remove the shorter of two)
        pali->N_filtered = pali->Filter(par.max_seqid,par.coverage,par.qid,par.qsc,par.Ndiff);

 	if (par.Neff>=0.999) 
	  pali->FilterNeff();

	// Calculate pos-specific weights, AA frequencies and transitions -> f[i][a], tr[i][a]
        pali->FrequenciesAndTransitions(q);
        if (v>=2 && q.Neff_HMM>11.0)
            fprintf(stderr,"WARNING: alignment %s looks too diverse (Neff=%.1f>11). Better check it with an alignment viewer... \n",q.name,q.Neff_HMM);

        // Add transition pseudocounts to query -> p[i][a]
        q.AddTransitionPseudocounts();

        if (!*par.clusterfile) { //compute context-specific pseudocounts?
	  // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
	  q.PreparePseudocounts();
	  // Add amino acid pseudocounts to query:  p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
	  q.AddAminoAcidPseudocounts(par.pcm, par.pca, par.pcb, par.pcc);
        } else {
	  // Add context specific pseudocount to query
	  q.AddContextSpecificPseudocounts(par.pcm);
        }

        q.CalculateAminoAcidBackground();

        if (qali==NULL) delete(pali);
    
    } else if (!strncmp(line,"HMMER",5)) {

        ///////////////////////////////////////////////////////////////////////////////////////
        // Don't allow HMMER format as input due to the severe loss of sensitivity!!!! (only allowed in HHmake)
        if (strncmp(program_name,"hhmake",6)) {
	  cerr<<endl<<"Error in "<<program_name<<": HMMER format not allowed as input due to the severe loss of sensitivity!\n";
	  exit(1);
        }
      
        // Is infile a HMMER3 file?
	if (!strncmp(line,"HMMER3",6))
	  {
	    if (v>=2) cout<<"Query file is in HMMER3 format\n";
	    
	    // Read 'query HMMER file
	    rewind(inf);
	    q.ReadHMMer3(inf,path);
	    
	    // Don't add transition pseudocounts to query!!
	    // DON'T ADD amino acid pseudocounts to query: pcm=0!  q.p[i][a] = f[i][a]
	    q.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc);
	    q.CalculateAminoAcidBackground();
	  }
	
	// ... or is infile an old HMMER file?
	else if (!strncmp(line,"HMMER",5))
	  {
	    if (v>=2) cout<<"Query file is in HMMER format\n";
	    
	    // Read 'query HMMER file
	    rewind(inf);
	    q.ReadHMMer(inf,path);
	    
	    // DON'T ADD amino acid pseudocounts to query: pcm=0!  q.p[i][a] = f[i][a]
	    q.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc);
	    q.CalculateAminoAcidBackground();
	  }
	
    } else {
      cerr<<endl<<"Error in "<<program_name<<": unrecognized input file format in \'"<<infile<<"\'\n";
      cerr<<"line = "<<line<<"\n";
      exit(1);
    }
    fclose(inf);

    if (par.addss==1)
      CalculateSS(q);

    if (par.columnscore == 5 && !q.divided_by_local_bg_freqs) q.DivideBySqrtOfLocalBackgroundFreqs(par.half_window_size_local_aa_bg_freqs);

    if (par.forward>=1) q.Log2LinTransitionProbs(1.0);
    return;
}
Пример #2
0
// Read input file (HMM, HHM, or alignment format), and add pseudocounts etc.
void ReadInput(char* infile, HMM& q, Alignment* qali=NULL)
{
    char path[NAMELEN];

    // Open query file and determine file type
    char line[LINELEN]=""; // input line
    FILE* inf=NULL;
    if (strcmp(infile,"stdin"))
    {
        inf = fopen(infile, "r");
        if (!inf) OpenFileError(infile);
        Pathname(path,infile);
    }
    else
    {
        inf = stdin;
        if (v>=2) printf("Reading HMM / multiple alignment from standard input ...\n(To get a help list instead, quit and type %s -h.)\n",program_name);
        *path='\0';
    }

    fgetline(line,LINELEN-1,inf);

    // Is infile a HMMER3 file?
    if (!strncmp(line,"HMMER3",6))
    {
        if (v>=2) cout<<"Query file is in HMMER3 format\n";
	cerr<<"WARNING: Use of HMMER3 format as input will result in severe loss of sensitivity!\n";

        // Read 'query HMMER file
        rewind(inf);
        q.ReadHMMer3(inf,path);
    }

    // ... or is infile an old HMMER file?
    else if (!strncmp(line,"HMMER",5))
    {
        if (v>=2) cout<<"Query file is in HMMER format\n";
	cerr<<"WARNING: Use of HMMER format as input will result in severe loss of sensitivity!\n";

        // Read 'query HMMER file
        rewind(inf);
        q.ReadHMMer(inf,path);
    }

    // ... or is it an hhm file?
    else if (!strncmp(line,"NAME",4) || !strncmp(line,"HH",2))
    {
        if (v>=2) cout<<"Query file is in HHM format\n";

        // Rewind to beginning of line and read query hhm file
        rewind(inf);
        q.Read(inf,path);
        if (v>=2 && q.Neff_HMM>11.0)
            fprintf(stderr,"WARNING: HMM %s looks too diverse (Neff=%.1f>11). Better check the underlying alignment... \n",q.name,q.Neff_HMM);

    }
    // ... or is it an alignment file
    else
    {
        Alignment* pali;
        if (qali==NULL) pali=new(Alignment); else pali=qali;
        if (par.calibrate) {
            printf("\nError in %s: only HHM files can be calibrated.\n",program_name);
            printf("Build an HHM file from your alignment with 'hhmake -i %s' and rerun hhsearch with the hhm file\n\n",infile);
            exit(1);
        }

        if (v>=2 && strcmp(infile,"stdin")) cout<<infile<<" is in A2M, A3M or FASTA format\n";

        // Read alignment from infile into matrix X[k][l] as ASCII (and supply first line as extra argument)
        pali->Read(inf,infile,line);

        // Convert ASCII to int (0-20),throw out all insert states, record their number in I[k][i]
        // and store marked sequences in name[k] and seq[k]
        pali->Compress(infile);

        // Sort out the nseqdis most dissimilar sequences for display in the output alignments
        pali->FilterForDisplay(par.max_seqid,par.coverage,par.qid,par.qsc,par.nseqdis);

        // Remove sequences with seq. identity larger than seqid percent (remove the shorter of two)
        pali->N_filtered = pali->Filter(par.max_seqid,par.coverage,par.qid,par.qsc,par.Ndiff);

	if (par.Neff>=0.999) 
	  pali->FilterNeff();

        // Calculate pos-specific weights, AA frequencies and transitions -> f[i][a], tr[i][a]
        pali->FrequenciesAndTransitions(q);
        if (v>=2 && q.Neff_HMM>11.0)
            fprintf(stderr,"WARNING: alignment %s looks too diverse (Neff=%.1f>11). Better check it with an alignment viewer... \n",q.name,q.Neff_HMM);

        if (qali==NULL) delete(pali);
    }
    fclose(inf);

    return;
}