Beispiel #1
0
/////////////////////////////////////////////////////////////////////////////////////
// Do precalculations for q and t to prepare comparison
/////////////////////////////////////////////////////////////////////////////////////
void PrepareTemplate(HMM& q, HMM& t, int format)
{
    if (format==0) // HHM format
    {
        // Add transition pseudocounts to template
        t.AddTransitionPseudocounts();

	// Don't use CS-pseudocounts because of runtime!!!
	// Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
	t.PreparePseudocounts();
	
	// Add amino acid pseudocounts to query:  p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
	t.AddAminoAcidPseudocounts(par.pcm, par.pca, par.pcb, par.pcc);

        t.CalculateAminoAcidBackground();
    }
    else // HHMER format
    {
        // Don't add transition pseudocounts to template
        // t.AddTransitionPseudocounts(par.gapd, par.gape, par.gapf, par.gapg, par.gaph, par.gapi, 0.0);

        // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
        // t.PreparePseudocounts();

        // DON'T ADD amino acid pseudocounts to temlate: pcm=0!  t.p[i][a] = t.f[i][a]
        t.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc);
        t.CalculateAminoAcidBackground();
    }

    if (par.forward>=1) t.Log2LinTransitionProbs(1.0);

    // Factor Null model into HMM t
    // ATTENTION! t.p[i][a] is divided by pnul[a] (for reasons of efficiency) => do not reuse t.p
    t.IncludeNullModelInHMM(q,t);  // Can go BEFORE the loop if not dependent on template

    return;
}
Beispiel #2
0
// Read input file (HMM, HHM, or alignment format), and add pseudocounts etc.
void ReadAndPrepare(char* infile, HMM& q, Alignment* qali=NULL)
{
    char path[NAMELEN];

    // Open query file and determine file type
    char line[LINELEN]=""; // input line
    FILE* inf=NULL;
    if (strcmp(infile,"stdin"))
    {
        inf = fopen(infile, "r");
        if (!inf) OpenFileError(infile);
        Pathname(path,infile);
    }
    else
    {
        inf = stdin;
        if (v>=2) printf("Reading HMM / multiple alignment from standard input ...\n(To get a help list instead, quit and type %s -h.)\n",program_name);
        *path='\0';
    }

    fgetline(line,LINELEN-1,inf);

    // Is it an hhm file?
    if (!strncmp(line,"NAME",4) || !strncmp(line,"HH",2))
    {
        if (v>=2) cout<<"Query file is in HHM format\n";

        // Rewind to beginning of line and read query hhm file
        rewind(inf);
        q.Read(inf,path);
        if (v>=2 && q.Neff_HMM>11.0)
            fprintf(stderr,"WARNING: HMM %s looks too diverse (Neff=%.1f>11). Better check the underlying alignment... \n",q.name,q.Neff_HMM);

        // Add transition pseudocounts to query -> q.p[i][a]
        q.AddTransitionPseudocounts();

        if (!*par.clusterfile) { //compute context-specific pseudocounts?
	  // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
	  q.PreparePseudocounts();
	  // Add amino acid pseudocounts to query:  q.p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
	  q.AddAminoAcidPseudocounts(par.pcm, par.pca, par.pcb, par.pcc);;
        } else {
	  // Add context specific pseudocount to query
	  q.AddContextSpecificPseudocounts(par.pcm);
        }
        
        q.CalculateAminoAcidBackground();
    }

    // ... or is it an a2m/a3m alignment file
    else if (line[0]=='#' || line[0]=='>')
    {
        Alignment* pali;
        if (qali==NULL) pali=new(Alignment); else pali=qali;
        if (par.calibrate) {
            printf("\nError in %s: only HHM files can be calibrated.\n",program_name);
            printf("Build an HHM file from your alignment with 'hhmake -i %s' and rerun hhsearch with the hhm file\n\n",infile);
            exit(1);
        }

        if (v>=2 && strcmp(infile,"stdin")) cout<<infile<<" is in A2M, A3M or FASTA format\n";

        // Read alignment from infile into matrix X[k][l] as ASCII (and supply first line as extra argument)
        pali->Read(inf,infile,line);

        // Convert ASCII to int (0-20),throw out all insert states, record their number in I[k][i]
        // and store marked sequences in name[k] and seq[k]
        pali->Compress(infile);

        // Sort out the nseqdis most dissimilar sequences for display in the output alignments
        pali->FilterForDisplay(par.max_seqid,par.coverage,par.qid,par.qsc,par.nseqdis);

        // Remove sequences with seq. identity larger than seqid percent (remove the shorter of two)
        pali->N_filtered = pali->Filter(par.max_seqid,par.coverage,par.qid,par.qsc,par.Ndiff);

 	if (par.Neff>=0.999) 
	  pali->FilterNeff();

	// Calculate pos-specific weights, AA frequencies and transitions -> f[i][a], tr[i][a]
        pali->FrequenciesAndTransitions(q);
        if (v>=2 && q.Neff_HMM>11.0)
            fprintf(stderr,"WARNING: alignment %s looks too diverse (Neff=%.1f>11). Better check it with an alignment viewer... \n",q.name,q.Neff_HMM);

        // Add transition pseudocounts to query -> p[i][a]
        q.AddTransitionPseudocounts();

        if (!*par.clusterfile) { //compute context-specific pseudocounts?
	  // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
	  q.PreparePseudocounts();
	  // Add amino acid pseudocounts to query:  p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
	  q.AddAminoAcidPseudocounts(par.pcm, par.pca, par.pcb, par.pcc);
        } else {
	  // Add context specific pseudocount to query
	  q.AddContextSpecificPseudocounts(par.pcm);
        }

        q.CalculateAminoAcidBackground();

        if (qali==NULL) delete(pali);
    
    } else if (!strncmp(line,"HMMER",5)) {

        ///////////////////////////////////////////////////////////////////////////////////////
        // Don't allow HMMER format as input due to the severe loss of sensitivity!!!! (only allowed in HHmake)
        if (strncmp(program_name,"hhmake",6)) {
	  cerr<<endl<<"Error in "<<program_name<<": HMMER format not allowed as input due to the severe loss of sensitivity!\n";
	  exit(1);
        }
      
        // Is infile a HMMER3 file?
	if (!strncmp(line,"HMMER3",6))
	  {
	    if (v>=2) cout<<"Query file is in HMMER3 format\n";
	    
	    // Read 'query HMMER file
	    rewind(inf);
	    q.ReadHMMer3(inf,path);
	    
	    // Don't add transition pseudocounts to query!!
	    // DON'T ADD amino acid pseudocounts to query: pcm=0!  q.p[i][a] = f[i][a]
	    q.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc);
	    q.CalculateAminoAcidBackground();
	  }
	
	// ... or is infile an old HMMER file?
	else if (!strncmp(line,"HMMER",5))
	  {
	    if (v>=2) cout<<"Query file is in HMMER format\n";
	    
	    // Read 'query HMMER file
	    rewind(inf);
	    q.ReadHMMer(inf,path);
	    
	    // DON'T ADD amino acid pseudocounts to query: pcm=0!  q.p[i][a] = f[i][a]
	    q.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc);
	    q.CalculateAminoAcidBackground();
	  }
	
    } else {
      cerr<<endl<<"Error in "<<program_name<<": unrecognized input file format in \'"<<infile<<"\'\n";
      cerr<<"line = "<<line<<"\n";
      exit(1);
    }
    fclose(inf);

    if (par.addss==1)
      CalculateSS(q);

    if (par.columnscore == 5 && !q.divided_by_local_bg_freqs) q.DivideBySqrtOfLocalBackgroundFreqs(par.half_window_size_local_aa_bg_freqs);

    if (par.forward>=1) q.Log2LinTransitionProbs(1.0);
    return;
}
Beispiel #3
0
/////////////////////////////////////////////////////////////////////////////////////
//// MAIN PROGRAM
/////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
    char* argv_conf[MAXOPT]; // Input arguments from .hhdefaults file (first=1: argv_conf[0] is not used)
    int argc_conf;               // Number of arguments in argv_conf

    strcpy(par.infile, "");
    strcpy(par.outfile, "");
    strcpy(par.alnfile, "");

    //Default parameter settings
    par.nseqdis = MAXSEQ - 1;        // maximum number of sequences to be written
    par.showcons = 0;
    par.cons = 1;
    par.Ndiff = 0;
    par.max_seqid = 100;
    par.coverage = 0;
    par.pc_hhm_context_engine.pca = 0.0;  // no amino acid pseudocounts
    par.pc_hhm_nocontext_a = 0.0;  // no amino acid pseudocounts
    par.gapb = 0.0; // no transition pseudocounts

    // Make command line input globally available
    par.argv = argv;
    par.argc = argc;
    RemovePathAndExtension(program_name, argv[0]);

    // Enable changing verbose mode before defaults file and command line are processed
    int v = 2;
    for (int i = 1; i < argc; i++) {
        if (!strcmp(argv[i], "-def"))
            par.readdefaultsfile = 1;
        else if (strcmp(argv[i], "-v") == 0) {
            v = atoi(argv[i + 1]);
        }
    }
    par.v = Log::from_int(v);
    Log::reporting_level() = par.v;

    par.SetDefaultPaths();

    // Read .hhdefaults file?
    if (par.readdefaultsfile) {
        // Process default otpions from .hhconfig file
        ReadDefaultsFile(argc_conf, argv_conf);
        ProcessArguments(argc_conf, argv_conf);
    }

    // Process command line options (they override defaults from .hhdefaults file)
    ProcessArguments(argc, argv);

    Alignment* qali = new Alignment(MAXSEQ, par.maxres);
    HMM* q = new HMM(MAXSEQDIS, par.maxres);        //Create a HMM with maximum of par.maxres match states

    // q is only available after maxres is known, so we had to move this here
    for (int i = 1; i <= argc - 1; i++) {
        if (!strcmp(argv[i], "-name") && (i < argc - 1)) {
            strmcpy(q->name, argv[++i], NAMELEN - 1); //copy longname to name...
            strmcpy(q->longname, argv[i], DESCLEN - 1);   //copy full name to longname
        }
    }

    // Check command line input and default values
    if (!*par.infile) {
        help();
        HH_LOG(ERROR) << "Input file is missing!" << std::endl;
        exit(4);
    }

    // Get basename
    RemoveExtension(q->file, par.infile); //Get basename of infile (w/o extension):

    // Outfile not given? Name it basename.hhm
    if (!*par.outfile && !*par.alnfile) {
        RemoveExtension(par.outfile, par.infile);
        strcat(par.outfile, ".seq");
    }

    // Prepare CS pseudocounts lib
    if (!par.nocontxt && *par.clusterfile) {
        InitializePseudocountsEngine(par, context_lib, crf, pc_hhm_context_engine,
                                     pc_hhm_context_mode, pc_prefilter_context_engine,
                                     pc_prefilter_context_mode);
    }

    // Set substitution matrix; adjust to query aa distribution if par.pcm==3
    SetSubstitutionMatrix(par.matrix, pb, P, R, S, Sim);

    // Read input file (HMM, HHM, or alignment format), and add pseudocounts etc.
    char input_format = 0;
    ReadQueryFile(par, par.infile, input_format, par.wg, q, qali, pb, S, Sim);

    // Same code as in PrepareQueryHMM(par.infile,input_format,q,qali), except that we add SS prediction
    // Add Pseudocounts, if no HMMER input
    if (input_format == 0) {
        // Transform transition freqs to lin space if not already done
        q->AddTransitionPseudocounts(par.gapd, par.gape, par.gapf, par.gapg,
                                     par.gaph, par.gapi, par.gapb, par.gapb);

        // Comput substitution matrix pseudocounts
        if (par.nocontxt) {
            // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
            q->PreparePseudocounts(R);
            // Add amino acid pseudocounts to query: p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
            q->AddAminoAcidPseudocounts(par.pc_hhm_nocontext_mode,
                                        par.pc_hhm_nocontext_a, par.pc_hhm_nocontext_b,
                                        par.pc_hhm_nocontext_c);
        }
        else {
            // Add full context specific pseudocounts to query
            q->AddContextSpecificPseudocounts(pc_hhm_context_engine,
                                              pc_hhm_context_mode);
        }
    }
    else {
        q->AddAminoAcidPseudocounts(0, par.pc_hhm_nocontext_a,
                                    par.pc_hhm_nocontext_b, par.pc_hhm_nocontext_c);
    }

    q->CalculateAminoAcidBackground(pb);

    if (par.columnscore == 5 && !q->divided_by_local_bg_freqs)
        q->DivideBySqrtOfLocalBackgroundFreqs(
            par.half_window_size_local_aa_bg_freqs, pb);

    // Write consensus sequence to sequence file
    // Consensus sequence is calculated in hhalignment.C, Alignment::FrequenciesAndTransitions()
    if (*par.outfile) {
        FILE* outf = NULL;
        if (strcmp(par.outfile, "stdout")) {
            outf = fopen(par.outfile, "a");
            if (!outf)
                OpenFileError(par.outfile, __FILE__, __LINE__, __func__);
        }
        else
            outf = stdout;
        // OLD
        //// ">name_consensus" -> ">name consensus"
        //strsubst(q->sname[q->nfirst],"_consensus"," consensus");
        //fprintf(outf,">%s\n%s\n",q->sname[q->nfirst],q->seq[q->nfirst]+1);
        // NEW (long header needed for NR30cons database)
        fprintf(outf, ">%s\n%s\n", q->longname, q->seq[q->nfirst] + 1);
        fclose(outf);
    }

    // Print A3M/A2M/FASTA output alignment
    if (*par.alnfile) {
        HalfAlignment qa;
        int n = imin(q->n_display,
                     par.nseqdis + (q->nss_dssp >= 0) + (q->nss_pred >= 0)
                     + (q->nss_conf >= 0) + (q->ncons >= 0));
        qa.Set(q->name, q->seq, q->sname, n, q->L, q->nss_dssp, q->nss_pred,
               q->nss_conf, q->nsa_dssp, q->ncons);

        if (par.outformat == 1)
            qa.BuildFASTA();
        else if (par.outformat == 2)
            qa.BuildA2M();
        else if (par.outformat == 3)
            qa.BuildA3M();
        if (qali->readCommentLine)
            qa.Print(par.alnfile, par.append, qali->longname); // print alignment to outfile
        else
            qa.Print(par.alnfile, par.append);   // print alignment to outfile
    }

    delete qali;
    delete q;

    DeletePseudocountsEngine(context_lib, crf, pc_hhm_context_engine,
                             pc_hhm_context_mode, pc_prefilter_context_engine,
                             pc_prefilter_context_mode);
}